package org.apache.mahout.utils.clustering;

import com.google.common.io.Closeables;
import com.google.common.io.Files;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.io.Charsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.clustering.cdbw.CDbwEvaluator;
import org.apache.mahout.clustering.classify.WeightedPropertyVectorWritable;
import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.fpm.pfpgrowth.PFPGrowth;
import org.apache.mahout.utils.vectors.VectorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:BOOT-INF/lib/mahout-integration-0.12.2.jar:org/apache/mahout/utils/clustering/ClusterDumper.class */
/**
 * Command-line job that reads clusters from sequence files and writes them out in one of the
 * {@link OUTPUT_FORMAT} formats, optionally together with the points assigned to each cluster
 * and a set of cluster-evaluation metrics.
 *
 * <p>The class can be driven either through {@link #run(String[])} (command-line options) or
 * programmatically via {@link #ClusterDumper(Path, Path)} followed by the setters and
 * {@link #printClusters(String[])}.
 */
public final class ClusterDumper extends AbstractJob {
    public static final String SAMPLE_POINTS = "samplePoints";
    public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
    public static final String DICTIONARY_OPTION = "dictionary";
    public static final String POINTS_DIR_OPTION = "pointsDir";
    public static final String NUM_WORDS_OPTION = "numWords";
    public static final String SUBSTRING_OPTION = "substring";
    public static final String EVALUATE_CLUSTERS = "evaluate";
    public static final String OUTPUT_FORMAT_OPT = "outputFormat";

    private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);

    /** URI scheme of output locations that are written through the Hadoop {@link FileSystem} API. */
    private static final String S3N_SCHEME = "s3n";
    /** Scratch directory handed to {@link RepresentativePointsDriver} when evaluation is requested. */
    private static final String REPRESENTATIVE_POINTS_DIR = "tmp/representative";
    /** Value passed as {@code --maxIter} to {@link RepresentativePointsDriver}. */
    private static final int REPRESENTATIVE_POINTS_ITERATIONS = 5;

    /** Distance measure handed to the cluster writers and the evaluators; only assigned by {@link #run(String[])}. */
    DistanceMeasure measure;

    private Path seqFileDir;
    private Path pointsDir;
    private String termDictionary;
    private String dictionaryFormat;
    private Map<Integer, List<WeightedPropertyVectorWritable>> clusterIdToPoints;
    private boolean runEvaluation;
    private long maxPointsPerCluster = Long.MAX_VALUE;
    private int subString = Integer.MAX_VALUE;
    private int numTopFeatures = 10;
    private OUTPUT_FORMAT outputFormat = OUTPUT_FORMAT.TEXT;

    /** Output formats understood by {@link #createClusterWriter(Writer, String[])}. */
    public enum OUTPUT_FORMAT {
        TEXT,
        CSV,
        GRAPH_ML,
        JSON
    }

    /**
     * Creates a dumper for programmatic use and immediately loads the clustered points.
     *
     * <p>The points are read during construction using the default per-cluster limit
     * ({@code Long.MAX_VALUE}); calling {@link #setMaxPointsPerCluster(long)} afterwards does not
     * re-read them.
     *
     * @param seqFileDir directory holding the cluster sequence files
     * @param pointsDir  directory holding the clustered points, or {@code null} to dump clusters
     *                   without their points
     */
    public ClusterDumper(Path seqFileDir, Path pointsDir) {
        this.seqFileDir = seqFileDir;
        this.pointsDir = pointsDir;
        init();
    }

    /** Creates a dumper with a fresh {@link Configuration}, to be driven through {@link #run(String[])}. */
    public ClusterDumper() {
        setConf(new Configuration());
    }

    /**
     * Command-line entry point; delegates to {@link #run(String[])}. The status code returned by
     * {@code run} is not propagated to the process exit code.
     *
     * @param args command-line arguments
     * @throws Exception whatever {@link #run(String[])} throws
     */
    public static void main(String[] args) throws Exception {
        new ClusterDumper().run(args);
    }

    /**
     * Registers the command-line options, parses {@code args} into this instance's settings,
     * loads the clustered points and prints the clusters.
     *
     * @param args command-line arguments
     * @return {@code -1} when argument parsing yields nothing to run, {@code 0} after a successful dump
     * @throws Exception on malformed numeric/enum option values or any failure while dumping
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] args) throws Exception {
        addInputOption();
        addOutputOption();
        addOption(OUTPUT_FORMAT_OPT, "of", "The optional output format for the results.  Options: TEXT, CSV, JSON or GRAPH_ML", "TEXT");
        // NOTE(review): WikipediaTokenizer.BOLD is used here only as the short flag name; it looks
        // like a decompiler constant substitution for a plain string literal - confirm.
        addOption(SUBSTRING_OPTION, WikipediaTokenizer.BOLD, "The number of chars of the asFormatString() to print");
        addOption(NUM_WORDS_OPTION, "n", "The number of top terms to print");
        addOption(POINTS_DIR_OPTION, "p", "The directory containing points sequence files mapping input vectors to their cluster.  If specified, then the program will output the points associated with a cluster");
        addOption(SAMPLE_POINTS, "sp", "Specifies the maximum number of points to include _per_ cluster.  The default is to include all points");
        addOption(DICTIONARY_OPTION, "d", "The dictionary file");
        addOption(DICTIONARY_TYPE_OPTION, "dt", "The dictionary file type (text|sequencefile)", "text");
        addOption(buildOption(EVALUATE_CLUSTERS, "e", "Run ClusterEvaluator and CDbwEvaluator over the input.  The output will be appended to the rest of the output at the end.", false, false, null));
        addOption(DefaultOptionCreator.distanceMeasureOption().create());
        if (parseArguments(args, false, true) == null) {
            return -1;
        }

        this.seqFileDir = getInputPath();
        if (hasOption(POINTS_DIR_OPTION)) {
            this.pointsDir = new Path(getOption(POINTS_DIR_OPTION));
        }
        this.outputFile = getOutputFile();
        if (hasOption(SUBSTRING_OPTION)) {
            int requestedLength = Integer.parseInt(getOption(SUBSTRING_OPTION));
            // Negative values are ignored and leave the current limit in place.
            if (requestedLength >= 0) {
                this.subString = requestedLength;
            }
        }
        this.termDictionary = getOption(DICTIONARY_OPTION);
        this.dictionaryFormat = getOption(DICTIONARY_TYPE_OPTION);
        if (hasOption(NUM_WORDS_OPTION)) {
            this.numTopFeatures = Integer.parseInt(getOption(NUM_WORDS_OPTION));
        }
        if (hasOption(OUTPUT_FORMAT_OPT)) {
            this.outputFormat = OUTPUT_FORMAT.valueOf(getOption(OUTPUT_FORMAT_OPT));
        }
        this.maxPointsPerCluster = hasOption(SAMPLE_POINTS)
                ? Long.parseLong(getOption(SAMPLE_POINTS))
                : Long.MAX_VALUE;
        this.runEvaluation = hasOption(EVALUATE_CLUSTERS);
        this.measure = ClassUtils.instantiateAs(getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), DistanceMeasure.class);

        init();
        printClusters(null);
        return 0;
    }

    /**
     * Writes all clusters found under the input directory to the configured destination
     * (standard output when no output file is set), followed by evaluation metrics when
     * evaluation was requested.
     *
     * @param dictionary term dictionary to use when no dictionary file has been configured;
     *                   may be {@code null}. It is ignored when a dictionary file is configured.
     * @throws IllegalArgumentException if the configured dictionary format is neither
     *                                  {@code "text"} nor {@code "sequencefile"}, or if evaluation
     *                                  was requested without a points directory
     * @throws Exception                on any I/O or evaluation failure
     */
    public void printClusters(String[] dictionary) throws Exception {
        if (this.runEvaluation && this.pointsDir == null) {
            // Fail before producing output instead of hitting a NullPointerException mid-run.
            throw new IllegalArgumentException(
                    "Option --" + EVALUATE_CLUSTERS + " requires --" + POINTS_DIR_OPTION + " to be specified");
        }

        Configuration conf = new Configuration();
        String[] terms = resolveDictionary(conf, dictionary);

        // Only a writer we opened on a file is ours to close; standard output is merely flushed.
        boolean shouldClose = this.outputFile != null;
        Writer writer = openWriter(conf);
        ClusterWriter clusterWriter = null;
        try {
            clusterWriter = createClusterWriter(writer, terms);
            // NOTE(review): raw type carried over from the original; parameterize once the element
            // type expected by ClusterWriter.write is confirmed.
            long numWritten = clusterWriter.write(
                    new SequenceFileDirValueIterable(new Path(this.seqFileDir, PFPGrowth.FILE_PATTERN), PathType.GLOB, conf));
            writer.flush();
            if (this.runEvaluation) {
                appendEvaluation(writer, conf);
            }
            log.info("Wrote {} clusters", numWritten);
        } finally {
            if (clusterWriter == null) {
                // The cluster writer could not be created: release the file handle ourselves and
                // swallow close errors so they do not mask the original failure.
                if (shouldClose) {
                    Closeables.close(writer, true);
                }
            } else if (shouldClose) {
                Closeables.close(clusterWriter, false);
            } else if (clusterWriter instanceof GraphMLClusterWriter) {
                // GraphML output is closed even on standard output.
                clusterWriter.close();
            }
        }
    }

    /** Returns the dictionary loaded from the configured dictionary file, or {@code fallback} when none is configured. */
    private String[] resolveDictionary(Configuration conf, String[] fallback) throws Exception {
        if (this.termDictionary == null) {
            return fallback;
        }
        if ("text".equals(this.dictionaryFormat)) {
            return VectorHelper.loadTermDictionary(new File(this.termDictionary));
        }
        if ("sequencefile".equals(this.dictionaryFormat)) {
            return VectorHelper.loadTermDictionary(conf, this.termDictionary);
        }
        throw new IllegalArgumentException("Invalid dictionary format");
    }

    /** Opens the destination writer: standard output, an {@code s3n} location via Hadoop, or a local file. */
    private Writer openWriter(Configuration conf) throws IOException {
        if (this.outputFile == null) {
            return new OutputStreamWriter(System.out, Charsets.UTF_8);
        }
        // The scheme must be read from the Path: File.getName() only yields the last path
        // component and therefore can never start with "s3n://".
        if (this.outputPath != null && S3N_SCHEME.equals(this.outputPath.toUri().getScheme())) {
            Path target = this.outputPath;
            FileSystem fs = FileSystem.get(target.toUri(), conf);
            return new OutputStreamWriter(fs.create(target), Charsets.UTF_8);
        }
        Files.createParentDirs(this.outputFile);
        return Files.newWriter(this.outputFile, Charsets.UTF_8);
    }

    /** Runs the representative-points job and appends the ClusterEvaluator and CDbwEvaluator metrics to {@code writer}. */
    private void appendEvaluation(Writer writer, Configuration conf) throws Exception {
        HadoopUtil.delete(conf, new Path(REPRESENTATIVE_POINTS_DIR));
        String measureClass = this.measure.getClass().getName();
        RepresentativePointsDriver.main(new String[] {
            "--input", this.seqFileDir.toString(),
            "--output", REPRESENTATIVE_POINTS_DIR,
            "--clusteredPoints", this.pointsDir.toString(),
            "--distanceMeasure", measureClass,
            "--maxIter", String.valueOf(REPRESENTATIVE_POINTS_ITERATIONS)
        });
        conf.set(RepresentativePointsDriver.DISTANCE_MEASURE_KEY, measureClass);
        conf.set(RepresentativePointsDriver.STATE_IN_KEY,
                REPRESENTATIVE_POINTS_DIR + "/representativePoints-" + REPRESENTATIVE_POINTS_ITERATIONS);

        ClusterEvaluator evaluator = new ClusterEvaluator(conf, this.seqFileDir);
        writer.append("\n");
        writer.append("Inter-Cluster Density: ").append(String.valueOf(evaluator.interClusterDensity())).append("\n");
        writer.append("Intra-Cluster Density: ").append(String.valueOf(evaluator.intraClusterDensity())).append("\n");

        CDbwEvaluator cdbwEvaluator = new CDbwEvaluator(conf, this.seqFileDir);
        writer.append("CDbw Inter-Cluster Density: ").append(String.valueOf(cdbwEvaluator.interClusterDensity())).append("\n");
        writer.append("CDbw Intra-Cluster Density: ").append(String.valueOf(cdbwEvaluator.intraClusterDensity())).append("\n");
        writer.append("CDbw Separation: ").append(String.valueOf(cdbwEvaluator.separation())).append("\n");
        writer.flush();
    }

    /**
     * Builds the {@link ClusterWriter} matching the configured output format.
     *
     * @param writer     destination the cluster writer emits to
     * @param dictionary term dictionary passed to the writers that accept one; may be {@code null}
     * @return a writer for the current {@link OUTPUT_FORMAT}
     * @throws IOException           declared for the writer constructors
     * @throws IllegalStateException if the output format has no associated writer
     */
    ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
        switch (this.outputFormat) {
            case TEXT:
                return new ClusterDumperWriter(writer, this.clusterIdToPoints, this.measure, this.numTopFeatures, dictionary, this.subString);
            case CSV:
                return new CSVClusterWriter(writer, this.clusterIdToPoints, this.measure);
            case GRAPH_ML:
                return new GraphMLClusterWriter(writer, this.clusterIdToPoints, this.measure, this.numTopFeatures, dictionary, this.subString);
            case JSON:
                return new JsonClusterWriter(writer, this.clusterIdToPoints, this.measure, this.numTopFeatures, dictionary);
            default:
                throw new IllegalStateException("Unknown outputformat: " + this.outputFormat);
        }
    }

    /**
     * Sets the format used by subsequent {@link #printClusters(String[])} calls.
     *
     * @param outputFormat the format to write
     */
    public void setOutputFormat(OUTPUT_FORMAT outputFormat) {
        this.outputFormat = outputFormat;
    }

    /** Loads the cluster-to-points map, or installs an empty map when no points directory is set. */
    private void init() {
        if (this.pointsDir == null) {
            this.clusterIdToPoints = Collections.emptyMap();
        } else {
            this.clusterIdToPoints = readPoints(this.pointsDir, this.maxPointsPerCluster, new Configuration());
        }
    }

    /** @return the substring length passed to the TEXT and GraphML writers */
    public int getSubString() {
        return this.subString;
    }

    /** @param subString the substring length passed to the TEXT and GraphML writers */
    public void setSubString(int subString) {
        this.subString = subString;
    }

    /** @return the map from cluster id to the points loaded for that cluster (the internal instance, not a copy) */
    public Map<Integer, List<WeightedPropertyVectorWritable>> getClusterIdToPoints() {
        return this.clusterIdToPoints;
    }

    /** @return the configured dictionary file location, or {@code null} if none is set */
    public String getTermDictionary() {
        return this.termDictionary;
    }

    /**
     * Configures the dictionary file used by {@link #printClusters(String[])}.
     *
     * @param termDictionary   location of the dictionary file
     * @param dictionaryFormat {@code "text"} or {@code "sequencefile"}; any other value makes
     *                         {@code printClusters} fail when a dictionary file is set
     */
    public void setTermDictionary(String termDictionary, String dictionaryFormat) {
        this.termDictionary = termDictionary;
        this.dictionaryFormat = dictionaryFormat;
    }

    /** @param numTopFeatures the top-feature count passed to the TEXT, GraphML and JSON writers */
    public void setNumTopFeatures(int numTopFeatures) {
        this.numTopFeatures = numTopFeatures;
    }

    /** @return the top-feature count passed to the TEXT, GraphML and JSON writers */
    public int getNumTopFeatures() {
        return this.numTopFeatures;
    }

    /** @return the maximum number of points kept per cluster when points are loaded */
    public long getMaxPointsPerCluster() {
        return this.maxPointsPerCluster;
    }

    /**
     * Sets the per-cluster point limit. The limit is applied only when points are (re)loaded;
     * points that were already read are not trimmed.
     *
     * @param maxPointsPerCluster the maximum number of points to keep per cluster
     */
    public void setMaxPointsPerCluster(long maxPointsPerCluster) {
        this.maxPointsPerCluster = maxPointsPerCluster;
    }

    /**
     * Reads clustered points and groups them by cluster id.
     *
     * @param pointsPathDir       directory containing the clustered-points sequence files
     * @param maxPointsPerCluster maximum number of points retained per cluster; further points for
     *                            a full cluster are skipped
     * @param conf                Hadoop configuration used to read the files
     * @return a map, sorted by cluster id, from cluster id to the points retained for that cluster
     */
    public static Map<Integer, List<WeightedPropertyVectorWritable>> readPoints(Path pointsPathDir,
                                                                                long maxPointsPerCluster,
                                                                                Configuration conf) {
        Map<Integer, List<WeightedPropertyVectorWritable>> pointsByCluster = new TreeMap<>();
        Iterable<Pair<IntWritable, WeightedPropertyVectorWritable>> records =
                new SequenceFileDirIterable<IntWritable, WeightedPropertyVectorWritable>(
                        pointsPathDir, PathType.LIST, PathFilters.logsCRCFilter(), conf);
        for (Pair<IntWritable, WeightedPropertyVectorWritable> record : records) {
            // The record key is the id of the cluster the point was assigned to.
            int clusterId = record.getFirst().get();
            List<WeightedPropertyVectorWritable> points = pointsByCluster.get(clusterId);
            if (points == null) {
                points = new ArrayList<>();
                pointsByCluster.put(clusterId, points);
            }
            if (points.size() < maxPointsPerCluster) {
                points.add(record.getSecond());
            }
        }
        return pointsByCluster;
    }
}
