package org.apache.mahout.vectorizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.pruner.PrunedPartialVectorMergeReducer;
import org.apache.mahout.vectorizer.pruner.WordsPrunerReducer;

/* JADX WARN: Classes with same name are omitted:
  input_file:BOOT-INF/lib/mahout-core-0.9.jar:org/apache/mahout/vectorizer/HighDFWordsPruner.class
 */
/* loaded from: input_file:BOOT-INF/lib/mahout-mr-0.12.2.jar:org/apache/mahout/vectorizer/HighDFWordsPruner.class */
/**
 * Static helpers that drive the Hadoop jobs of the document-frequency pruning step.
 *
 * <p>The work is done in two phases: one "prune" job per dictionary chunk (reducer:
 * {@link WordsPrunerReducer}), each writing a partial result, followed by a single "merge" job
 * (reducer: {@link PrunedPartialVectorMergeReducer}) that combines the partial results.
 *
 * <p>The class is not instantiable.
 */
public final class HighDFWordsPruner {
    public static final String STD_CALC_DIR = "stdcalc";
    /** Configuration key under which the maximum document frequency is passed to the prune job. */
    public static final String MAX_DF = "max.df";
    /** Configuration key under which the minimum document frequency is passed to the prune job. */
    public static final String MIN_DF = "min.df";

    /** Serialization implementations configured for both the prune and the merge job. */
    private static final String IO_SERIALIZATIONS = "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization";

    private HighDFWordsPruner() {
    }

    /**
     * Runs one prune job per dictionary chunk and then merges the partial outputs.
     *
     * @param path input path handed to every prune job
     * @param path2 output path of the final merge job; it is deleted before the merge job runs
     * @param path3 working directory; each prune job writes to a {@code partial-N} child of it
     *     (N counts up from 0) and the whole directory is deleted once the merge has succeeded
     * @param j value stored under {@link #MAX_DF} in the prune job configuration
     * @param j2 value stored under {@link #MIN_DF} in the prune job configuration
     * @param configuration base configuration; only copies of it are modified
     * @param pair only the second element is read here: the list of paths that are each
     *     registered as a cache file for one prune job (the job name calls it the dictionary file)
     * @param f value stored under {@link PartialVectorMerger#NORMALIZATION_POWER} for the merge job
     * @param z value stored under {@link PartialVectorMerger#LOG_NORMALIZE} for the merge job
     * @param i number of reduce tasks of the merge job
     * @throws IllegalStateException if any of the submitted jobs reports failure
     */
    public static void pruneVectors(Path path, Path path2, Path path3, long j, long j2, Configuration configuration, Pair<Long[], List<Path>> pair, float f, boolean z, int i) throws IOException, InterruptedException, ClassNotFoundException {
        List<Path> partialOutputs = new ArrayList<>();
        for (Path dictionaryChunk : pair.getSecond()) {
            // The current list size doubles as the running index of the partial output.
            Path partialOutput = new Path(path3, "partial-" + partialOutputs.size());
            partialOutputs.add(partialOutput);
            pruneVectorsPartial(path, partialOutput, dictionaryChunk, j, j2, configuration);
        }
        mergePartialVectors(partialOutputs, path2, configuration, f, z, i);
        // The partial outputs are no longer needed once they have been merged.
        HadoopUtil.delete(new Configuration(configuration), path3);
    }

    /**
     * Runs a single prune job for one dictionary chunk.
     *
     * @param input input path of the job
     * @param output output path of the job; deleted before the job is started
     * @param dictionaryFile path registered as a distributed cache file for the job
     * @param maxDF value stored under {@link #MAX_DF}
     * @param minDF value stored under {@link #MIN_DF}
     * @param baseConf base configuration; a copy is modified, the argument is left untouched
     * @throws IllegalStateException if the job reports failure
     */
    private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFile, long maxDF, long minDF, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set(CommonConfigurationKeysPublic.IO_SERIALIZATIONS_KEY, IO_SERIALIZATIONS);
        conf.setLong(MAX_DF, maxDF);
        conf.setLong(MIN_DF, minDF);
        DistributedCache.addCacheFile(dictionaryFile.toUri(), conf);

        // Identity mapper (the base Mapper class); the reducer does the pruning work.
        Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, Mapper.class, null, null, WordsPrunerReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class, conf);
        job.setJobName(": Prune Vectors: input-folder: " + input + ", dictionary-file: " + dictionaryFile.toString());

        // Clear any stale output so the job does not start with a pre-existing output directory.
        HadoopUtil.delete(conf, output);
        runJob(job);
    }

    /**
     * Merges the given partial outputs into {@code path} with a single job.
     *
     * @param iterable paths of the partial outputs; joined with commas and used as the job input
     * @param path output path of the merge job; deleted before the job is started
     * @param configuration base configuration; a copy is modified, the argument is left untouched
     * @param f value stored under {@link PartialVectorMerger#NORMALIZATION_POWER}
     * @param z value stored under {@link PartialVectorMerger#LOG_NORMALIZE}
     * @param i number of reduce tasks
     * @throws IllegalStateException if the job reports failure
     */
    public static void mergePartialVectors(Iterable<Path> iterable, Path path, Configuration configuration, float f, boolean z, int i) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(configuration);
        conf.set(CommonConfigurationKeysPublic.IO_SERIALIZATIONS_KEY, IO_SERIALIZATIONS);
        conf.setFloat(PartialVectorMerger.NORMALIZATION_POWER, f);
        conf.setBoolean(PartialVectorMerger.LOG_NORMALIZE, z);

        Job job = new Job(conf);
        job.setJobName("PrunerPartialVectorMerger::MergePartialVectors");
        job.setJarByClass(PartialVectorMerger.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);

        FileInputFormat.setInputPaths(job, getCommaSeparatedPaths(iterable));
        FileOutputFormat.setOutputPath(job, path);

        // Identity mapper (the base Mapper class); merging happens in the reducer.
        job.setMapperClass(Mapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setReducerClass(PrunedPartialVectorMergeReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(i);

        // Clear any stale output so the job does not start with a pre-existing output directory.
        HadoopUtil.delete(conf, path);
        runJob(job);
    }

    /**
     * Submits the job, blocks until it finishes (with progress reporting enabled) and fails
     * loudly if it did not succeed.
     *
     * @throws IllegalStateException if the job reports failure
     */
    private static void runJob(Job job) throws IOException, InterruptedException, ClassNotFoundException {
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Joins the string forms of the given paths with {@code ","} and no surrounding whitespace.
     *
     * @return the joined paths, or an empty string when {@code paths} yields no elements
     */
    private static String getCommaSeparatedPaths(Iterable<Path> paths) {
        StringBuilder joined = new StringBuilder(100);
        String separator = "";
        for (Path current : paths) {
            joined.append(separator).append(current.toString());
            // Every element after the first is preceded by a comma.
            separator = ",";
        }
        return joined.toString();
    }
}
