package org.apache.mahout.text;

import com.sun.jersey.core.header.QualityFactor;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.regex.Pattern;
import org.apache.commons.io.DirectoryWalker;
import org.apache.commons.io.comparator.CompositeFileComparator;
import org.apache.commons.io.comparator.DirectoryFileComparator;
import org.apache.commons.io.comparator.PathFileComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;
import org.apache.mahout.utils.io.ChunkedWriter;
import org.apache.mahout.utils.regex.RegexMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:BOOT-INF/lib/mahout-integration-0.12.2.jar:org/apache/mahout/text/SequenceFilesFromMailArchives.class */
public final class SequenceFilesFromMailArchives extends AbstractJob {
    /** Class-wide SLF4J logger; also used by the nested directory walker. */
    private static final Logger log = LoggerFactory.getLogger((Class<?>) SequenceFilesFromMailArchives.class);

    // Command-line option descriptors. Each is a two-element array: run() passes element 0 and
    // element 1 as the first two arguments of addOption/addFlag and looks options up by element 0.
    // runMapReduce() uses element 1 as the job-configuration key for most options, but element 0
    // for the chunk size and the charset.
    public static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
    public static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
    // NOTE(review): the second element is borrowed from an unrelated Lucene constant; this looks
    // like a decompiler constant-folding artifact rather than an intentional dependency - confirm
    // the literal value before replacing it.
    public static final String[] CHARSET_OPTION = {"charset", WikipediaTokenizer.CATEGORY};
    public static final String[] SUBJECT_OPTION = {"subject", "s"};
    public static final String[] TO_OPTION = {"to", "to"};
    public static final String[] FROM_OPTION = {"from", "from"};
    public static final String[] REFERENCES_OPTION = {"references", "refs"};
    // NOTE(review): second element comes from an unrelated Lucene constant - confirm its value.
    public static final String[] BODY_OPTION = {"body", WikipediaTokenizer.BOLD};
    // NOTE(review): second element comes from a Jersey-internal constant - confirm its value.
    public static final String[] STRIP_QUOTED_OPTION = {"stripQuoted", QualityFactor.QUALITY_FACTOR};
    // NOTE(review): second element comes from RegexMapper - confirm its value.
    public static final String[] QUOTED_REGEX_OPTION = {"quotedRegex", RegexMapper.REGEX};
    public static final String[] SEPARATOR_OPTION = {"separator", "sep"};
    public static final String[] BODY_SEPARATOR_OPTION = {"bodySeparator", "bodySep"};

    /** Same text as the configuration key under which runMapReduce() stores the input path. */
    public static final String BASE_INPUT_PATH = "baseinputpath";

    /** Same value that runMapReduce() writes to MRConfig.MAX_BLOCK_LOCATIONS_KEY. */
    private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:BOOT-INF/lib/mahout-integration-0.12.2.jar:org/apache/mahout/text/SequenceFilesFromMailArchives$PrefixAdditionDirectoryWalker.class */
    /**
     * Walks a directory tree and parses every file it encounters with a {@link MailProcessor},
     * keeping one processor and one running message count per directory level.
     *
     * <p>For each sub-directory entered (depth &gt; 0) a new processor is pushed whose prefix is the
     * parent's prefix plus {@link File#separator} plus the directory name. When the directory has
     * been fully visited the processor is popped and its message count is added to the parent's.
     */
    public static class PrefixAdditionDirectoryWalker extends DirectoryWalker<Object> {
        /** Sort order for directory entries: DIRECTORY_REVERSE first, ties broken by path. */
        private static final Comparator<File> FILE_COMPARATOR = new CompositeFileComparator((Comparator<File>[]) new Comparator[]{DirectoryFileComparator.DIRECTORY_REVERSE, PathFileComparator.PATH_COMPARATOR});

        /** Shared output writer handed to every processor created during the walk. */
        private final ChunkedWriter writer;

        /** Stack of processors; the head belongs to the directory currently being visited. */
        private final Deque<MailProcessor> processors = new ArrayDeque<>();

        /** Stack of message counts, parallel to {@link #processors}. */
        private final Deque<Long> messageCounts = new ArrayDeque<>();

        /**
         * @param mailProcessor processor used for files directly under the walk root
         * @param chunkedWriter writer passed to the processors created for sub-directories
         */
        public PrefixAdditionDirectoryWalker(MailProcessor mailProcessor, ChunkedWriter chunkedWriter) {
            this.processors.addFirst(mailProcessor);
            this.writer = chunkedWriter;
            this.messageCounts.addFirst(0L);
        }

        /**
         * Starts the walk at the given directory.
         *
         * @param file root directory to traverse
         * @throws IOException if the underlying walker reports an I/O failure
         */
        public void walk(File file) throws IOException {
            super.walk(file, null);
        }

        /** @return the message count currently on top of the count stack */
        public long getMessageCount() {
            return messageCounts.getFirst();
        }

        @Override // org.apache.commons.io.DirectoryWalker
        protected void handleDirectoryStart(File directory, int depth, Collection<Object> results) throws IOException {
            // The root (depth 0) uses the processor and count supplied to the constructor.
            if (depth > 0) {
                SequenceFilesFromMailArchives.log.info("At {}", directory.getAbsolutePath());
                MailProcessor parent = processors.getFirst();
                String childPrefix = parent.getPrefix() + File.separator + directory.getName();
                processors.push(new MailProcessor(parent.getOptions(), childPrefix, writer));
                messageCounts.push(0L);
            }
        }

        @Override // org.apache.commons.io.DirectoryWalker
        protected File[] filterDirectoryContents(File directory, int depth, File[] files) throws IOException {
            // DirectoryWalker documents that the array may be null (File.listFiles() returns null
            // for unreadable directories); pass it through so the walker can handle that case.
            if (files != null) {
                Arrays.sort(files, FILE_COMPARATOR);
            }
            return files;
        }

        @Override // org.apache.commons.io.DirectoryWalker
        protected void handleFile(File file, int depth, Collection<Object> results) throws IOException {
            long parsed;
            try {
                parsed = processors.getFirst().parseMboxLineByLine(file);
            } catch (IOException e) {
                throw new IllegalStateException("Error processing " + file, e);
            }
            // Update the count only after a successful parse so the stack stays balanced on failure.
            messageCounts.push(messageCounts.pop() + parsed);
        }

        @Override // org.apache.commons.io.DirectoryWalker
        protected void handleDirectoryEnd(File directory, int depth, Collection<Object> results) throws IOException {
            if (depth > 0) {
                long directoryCount = messageCounts.pop();
                SequenceFilesFromMailArchives.log.info("Parsed {} messages from directory {}", directoryCount, directory.getAbsolutePath());
                processors.pop();
                // Fold this directory's total into its parent's running count.
                messageCounts.push(messageCounts.pop() + directoryCount);
            }
        }
    }

    /**
     * Converts the mail archive(s) named by the options into sequence-file chunks.
     *
     * <p>If the input is a directory it is traversed with a {@link PrefixAdditionDirectoryWalker};
     * otherwise the single input file is parsed directly. The writer is always closed, and a
     * failure while closing is attached to the primary exception as a suppressed exception
     * instead of replacing it.
     *
     * @param mailOptions input location, output directory, chunk size, prefix and parsing options
     * @throws IOException if reading the input or writing the output fails
     */
    public void createSequenceFiles(MailOptions mailOptions) throws IOException {
        try (ChunkedWriter chunkedWriter = new ChunkedWriter(getConf(), mailOptions.getChunkSize(), new Path(mailOptions.getOutputDir()))) {
            MailProcessor mailProcessor = new MailProcessor(mailOptions, mailOptions.getPrefix(), chunkedWriter);
            if (mailOptions.getInput().isDirectory()) {
                PrefixAdditionDirectoryWalker walker = new PrefixAdditionDirectoryWalker(mailProcessor, chunkedWriter);
                walker.walk(mailOptions.getInput());
                log.info("Parsed {} messages from {}", walker.getMessageCount(), mailOptions.getInput().getAbsolutePath());
            } else {
                long start = System.currentTimeMillis();
                long parsed = mailProcessor.parseMboxLineByLine(mailOptions.getInput());
                long elapsed = System.currentTimeMillis() - start;
                log.info("Parsed {} messages from {} in time: {}", parsed, mailOptions.getInput().getAbsolutePath(), elapsed);
            }
        }
    }

    /**
     * Command-line entry point: runs this job through Hadoop's {@link ToolRunner} with a fresh
     * {@link Configuration}.
     *
     * @param strArr command-line arguments forwarded to the job
     * @throws Exception if the job fails with an exception
     */
    public static void main(String[] strArr) throws Exception {
        Configuration configuration = new Configuration();
        SequenceFilesFromMailArchives job = new SequenceFilesFromMailArchives();
        ToolRunner.run(configuration, job, strArr);
    }

    /**
     * Registers the command-line options, parses the arguments, builds the {@link MailOptions}
     * and runs the conversion either sequentially or as a MapReduce job.
     *
     * @param strArr command-line arguments
     * @return -1 if the arguments could not be parsed; otherwise the status returned by the
     *         selected execution mode (0 on success, -1 if the MapReduce job fails)
     * @throws Exception if option parsing or the conversion fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        addInputOption();
        addOutputOption();
        addOption(DefaultOptionCreator.methodOption().create());
        addOption(CHUNK_SIZE_OPTION[0], CHUNK_SIZE_OPTION[1], "The chunkSize in MegaBytes. Defaults to 64", "64");
        addOption(KEY_PREFIX_OPTION[0], KEY_PREFIX_OPTION[1], "The prefix to be prepended to the key", "");
        addOption(CHARSET_OPTION[0], CHARSET_OPTION[1], "The name of the character encoding of the input files. Default to UTF-8", "UTF-8");
        addFlag(SUBJECT_OPTION[0], SUBJECT_OPTION[1], "Include the Mail subject as part of the text.  Default is false");
        addFlag(TO_OPTION[0], TO_OPTION[1], "Include the to field in the text.  Default is false");
        addFlag(FROM_OPTION[0], FROM_OPTION[1], "Include the from field in the text.  Default is false");
        addFlag(REFERENCES_OPTION[0], REFERENCES_OPTION[1], "Include the references field in the text.  Default is false");
        addFlag(BODY_OPTION[0], BODY_OPTION[1], "Include the body in the output.  Default is false");
        addFlag(STRIP_QUOTED_OPTION[0], STRIP_QUOTED_OPTION[1], "Strip (remove) quoted email text in the body.  Default is false");
        addOption(QUOTED_REGEX_OPTION[0], QUOTED_REGEX_OPTION[1], "Specify the regex that identifies quoted text.  Default is to look for > or | at the beginning of the line.");
        addOption(SEPARATOR_OPTION[0], SEPARATOR_OPTION[1], "The separator to use between metadata items (to, from, etc.).  Default is \\n", "\n");
        addOption(BODY_SEPARATOR_OPTION[0], BODY_SEPARATOR_OPTION[1], "The separator to use between lines in the body.  Default is \\n.  Useful to change if you wish to have the message be on one line", "\n");
        addOption(DefaultOptionCreator.helpOption());
        if (parseArguments(strArr) == null) {
            return -1;
        }

        File inputFile = getInputFile();
        String outputDir = getOutputPath().toString();
        int chunkSize = 64;
        if (hasOption(CHUNK_SIZE_OPTION[0])) {
            chunkSize = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
        }
        String prefix = hasOption(KEY_PREFIX_OPTION[0]) ? getOption(KEY_PREFIX_OPTION[0]) : "";
        Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));

        MailOptions mailOptions = new MailOptions();
        mailOptions.setInput(inputFile);
        mailOptions.setOutputDir(outputDir);
        mailOptions.setPrefix(prefix);
        mailOptions.setChunkSize(chunkSize);
        mailOptions.setCharset(charset);

        // Each selected header contributes one pattern; patternOrder records the index of that
        // pattern within the patterns list so both structures stay aligned.
        ArrayList<Pattern> patterns = new ArrayList<>(5);
        HashMap<String, Integer> patternOrder = new HashMap<>();
        int order = 0;
        if (hasOption(FROM_OPTION[0])) {
            patterns.add(MailProcessor.FROM_PREFIX);
            patternOrder.put(MailOptions.FROM, order++);
        }
        if (hasOption(TO_OPTION[0])) {
            patterns.add(MailProcessor.TO_PREFIX);
            patternOrder.put(MailOptions.TO, order++);
        }
        if (hasOption(REFERENCES_OPTION[0])) {
            patterns.add(MailProcessor.REFS_PREFIX);
            patternOrder.put(MailOptions.REFS, order++);
        }
        if (hasOption(SUBJECT_OPTION[0])) {
            patterns.add(MailProcessor.SUBJECT_PREFIX);
            // Use the current index like the other headers; the previous code stored order + 1,
            // which pointed one past the subject pattern's position.
            patternOrder.put(MailOptions.SUBJECT, order);
        }

        mailOptions.setStripQuotedText(hasOption(STRIP_QUOTED_OPTION[0]));
        mailOptions.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
        mailOptions.setPatternOrder(patternOrder);
        mailOptions.setIncludeBody(hasOption(BODY_OPTION[0]));
        if (hasOption(SEPARATOR_OPTION[0])) {
            mailOptions.setSeparator(getOption(SEPARATOR_OPTION[0]));
        } else {
            mailOptions.setSeparator("\n");
        }
        if (hasOption(BODY_SEPARATOR_OPTION[0])) {
            mailOptions.setBodySeparator(getOption(BODY_SEPARATOR_OPTION[0]));
        }
        if (hasOption(QUOTED_REGEX_OPTION[0])) {
            mailOptions.setQuotedTextPattern(Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])));
        }

        // Propagate the status of the chosen mode so a failed MapReduce job is not reported as
        // success.
        if ("sequential".equals(getOption("method", "mapreduce"))) {
            return runSequential(mailOptions);
        }
        return runMapReduce(getInputPath(), getOutputPath());
    }

    /**
     * Runs the conversion in-process and logs how long it took.
     *
     * @param mailOptions fully populated conversion options
     * @return always 0
     * @throws IOException if the conversion fails
     */
    private int runSequential(MailOptions mailOptions) throws IOException, InterruptedException, NoSuchMethodException {
        final long startedAt = System.currentTimeMillis();
        createSequenceFiles(mailOptions);
        final long elapsedMillis = System.currentTimeMillis() - startedAt;
        log.info("Conversion took {}ms", Long.valueOf(elapsedMillis));
        return 0;
    }

    /**
     * Configures and runs the MapReduce variant of the conversion.
     *
     * <p>The parsed command-line options are copied into the job configuration, every directory
     * under the job's input path is registered as an input path, and the maximum input split
     * size is capped at the chunk size in megabytes.
     *
     * @param path  input path handed to {@code prepareJob}
     * @param path2 output path handed to {@code prepareJob}
     * @return 0 if the job completes successfully, -1 otherwise
     * @throws IOException            if the file system or job submission fails
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    private int runMapReduce(Path path, Path path2) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = prepareJob(path, path2, MultipleTextFileInputFormat.class, SequenceFilesFromMailArchivesMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class, "SequentialFilesFromMailArchives");
        Configuration configuration = job.getConfiguration();

        if (hasOption(KEY_PREFIX_OPTION[0])) {
            configuration.set(KEY_PREFIX_OPTION[1], getOption(KEY_PREFIX_OPTION[0]));
        }

        // Fall back to the same 64 MB default that run() uses, so the split-size cap below is
        // never 0 when the option is absent.
        int chunkSizeMb = 64;
        if (hasOption(CHUNK_SIZE_OPTION[0])) {
            chunkSizeMb = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
            configuration.set(CHUNK_SIZE_OPTION[0], String.valueOf(chunkSizeMb));
        }

        if (hasOption(CHARSET_OPTION[0])) {
            // Store the canonical name, which Charset.forName() is guaranteed to resolve.
            configuration.set(CHARSET_OPTION[0], Charset.forName(getOption(CHARSET_OPTION[0])).name());
        }

        // Header flags are only written when present; absent flags leave the key unset.
        String[][] headerFlags = {FROM_OPTION, TO_OPTION, REFERENCES_OPTION, SUBJECT_OPTION};
        for (String[] flag : headerFlags) {
            if (hasOption(flag[0])) {
                configuration.set(flag[1], "true");
            }
        }

        if (hasOption(QUOTED_REGEX_OPTION[0])) {
            // Compiling first rejects an invalid regex before the job is submitted.
            configuration.set(QUOTED_REGEX_OPTION[1], Pattern.compile(getOption(QUOTED_REGEX_OPTION[0])).toString());
        }

        configuration.set(SEPARATOR_OPTION[1], hasOption(SEPARATOR_OPTION[0]) ? getOption(SEPARATOR_OPTION[0]) : "\n");
        configuration.set(BODY_OPTION[1], hasOption(BODY_OPTION[0]) ? "true" : "false");
        configuration.set(BODY_SEPARATOR_OPTION[1], hasOption(BODY_SEPARATOR_OPTION[0]) ? getOption(BODY_SEPARATOR_OPTION[0]) : "\n");

        FileSystem fileSystem = FileSystem.get(configuration);
        FileStatus inputStatus = fileSystem.getFileStatus(this.inputPath);
        configuration.set(BASE_INPUT_PATH, this.inputPath.toString());
        FileInputFormat.setInputPaths(job, HadoopUtil.buildDirList(fileSystem, inputStatus));

        // Multiply as long: int arithmetic overflows for chunk sizes of 2048 MB and above.
        FileInputFormat.setMaxInputSplitSize(job, chunkSizeMb * 1024L * 1024L);
        configuration.set(MRConfig.MAX_BLOCK_LOCATIONS_KEY, String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

        return job.waitForCompletion(true) ? 0 : -1;
    }
}
