package org.apache.mahout.vectorizer.tfidf;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.map.OpenIntLongHashMap;
import org.apache.mahout.vectorizer.TFIDF;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;

/* JADX WARN: Classes with same name are omitted:
  input_file:BOOT-INF/lib/mahout-core-0.9.jar:org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.class
 */
/* loaded from: input_file:BOOT-INF/lib/mahout-mr-0.12.2.jar:org/apache/mahout/vectorizer/tfidf/TFIDFPartialVectorReducer.class */
/**
 * Reducer that re-weights the first vector received for each key.
 *
 * <p>For every non-zero element whose index is present in the frequency dictionary loaded by
 * {@link #setup}, the element's value is replaced by the result of {@link TFIDF#calculate}.
 * Elements whose index is missing from the dictionary, or whose frequency exceeds the
 * configured maximum percentage, are left out of the output vector.
 */
public class TFIDFPartialVectorReducer extends Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>, VectorWritable> {
    /** Maps an element index to the count read from the cached frequency file. */
    private final OpenIntLongHashMap dictionary = new OpenIntLongHashMap();
    private final TFIDF tfidf = new TFIDF();
    /** Lower clamp applied to a dictionary count before weighting ("min.df"). */
    private int minDf = 1;
    /** Upper bound, as a percentage of {@code vectorCount}; values of -1 or below disable the check ("max.df"). */
    private long maxDf = -1;
    private long vectorCount = 1;
    private long featureCount;
    /** When set, the output vector is converted to a {@link SequentialAccessSparseVector}. */
    private boolean sequentialAccess;
    /** When set, the output vector is wrapped in a {@link NamedVector} named after the key. */
    private boolean namedVector;

    /**
     * Weights the first vector of {@code iterable} and writes it under the same key.
     * Any further vectors for the key are ignored; nothing is written when there are none.
     *
     * <p>Declared {@code public} here although {@code Reducer.reduce} is {@code protected}.
     *
     * @param writableComparable the key; its {@code toString()} names the vector when naming is enabled
     * @param iterable the vectors grouped under the key
     * @param context the context the resulting vector is written to
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override // org.apache.hadoop.mapreduce.Reducer
    public void reduce(WritableComparable<?> writableComparable, Iterable<VectorWritable> iterable, Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>, VectorWritable>.Context context) throws IOException, InterruptedException {
        Iterator<VectorWritable> values = iterable.iterator();
        if (!values.hasNext()) {
            return;
        }

        Vector input = values.next().get();
        Vector weighted = new RandomAccessSparseVector((int) this.featureCount, input.getNumNondefaultElements());
        for (Vector.Element element : input.nonZeroes()) {
            int index = element.index();
            if (!this.dictionary.containsKey(index)) {
                continue;
            }
            long df = this.dictionary.get(index);
            // The frequency is compared as a percentage of vectorCount. The comparison is kept
            // in its positive "<=" form so that a NaN ratio is rejected, not accepted.
            boolean withinMaxDf = this.maxDf <= -1 || (100.0d * df) / this.vectorCount <= this.maxDf;
            if (!withinMaxDf) {
                continue;
            }
            // Clamp small counts up to the configured minimum before weighting.
            if (df < this.minDf) {
                df = this.minDf;
            }
            // The element value is truncated to int before being handed to TFIDF.
            weighted.setQuick(index, this.tfidf.calculate((int) element.get(), (int) df, (int) this.featureCount, (int) this.vectorCount));
        }

        Vector result = weighted;
        if (this.sequentialAccess) {
            result = new SequentialAccessSparseVector(result);
        }
        if (this.namedVector) {
            result = new NamedVector(result, writableComparable.toString());
        }
        context.write(writableComparable, new VectorWritable(result));
    }

    /**
     * Reads the job settings and loads the frequency dictionary from the distributed cache.
     *
     * <p>Declared {@code public} here although {@code Reducer.setup} is {@code protected}.
     *
     * @param context the context supplying the job {@link Configuration}
     * @throws IOException if the cache files cannot be listed or the frequency file cannot be read
     * @throws InterruptedException propagated from {@code Reducer.setup}
     */
    @Override // org.apache.hadoop.mapreduce.Reducer
    public void setup(Reducer<WritableComparable<?>, VectorWritable, WritableComparable<?>, VectorWritable>.Context context) throws IOException, InterruptedException {
        super.setup(context);
        Configuration configuration = context.getConfiguration();
        this.vectorCount = configuration.getLong(TFIDFConverter.VECTOR_COUNT, 1L);
        this.featureCount = configuration.getLong(TFIDFConverter.FEATURE_COUNT, 1L);
        this.minDf = configuration.getInt("min.df", 1);
        this.maxDf = configuration.getLong("max.df", -1L);
        this.sequentialAccess = configuration.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
        this.namedVector = configuration.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

        // Typed iteration over the (index, count) records; no raw Iterator or manual casts.
        for (Pair<IntWritable, LongWritable> entry : new SequenceFileIterable<IntWritable, LongWritable>(
                HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, DistributedCache.getCacheFiles(configuration)),
                true,
                configuration)) {
            this.dictionary.put(entry.getFirst().get(), entry.getSecond().get());
        }
    }
}
