package com.hankcs.hanlp.mining.word;

import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.utility.LexiconUtility;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.regex.Pattern;

/* loaded from: input_file:BOOT-INF/lib/hanlp-portable-1.7.1.jar:com/hankcs/hanlp/mining/word/NewWordDiscover.class */
/**
 * Discovers candidate new words in raw text.
 *
 * <p>Every substring of up to {@code max_word_len} characters that does not span a delimiter
 * is collected as a candidate together with its left/right neighbour characters. Candidates
 * are then scored via {@link WordInfo#computeProbabilityEntropy(int)} and
 * {@link WordInfo#computeAggregation(Map)}, filtered against the configured thresholds and
 * ranked by {@code WordInfo.p}.
 */
public class NewWordDiscover {
    /**
     * Sentinel that replaces every delimiter run and marks "no neighbour" at line boundaries.
     * A candidate containing this character spans a delimiter and is rejected.
     */
    private static final char SEPARATOR = '\0';

    /** Runs of whitespace, digits and ASCII/CJK punctuation that split a line into segments. */
    private static final Pattern DELIMITER = Pattern.compile("[\\s\\d,.<>/?:;'\"\\[\\]{}()\\|~!@#$%^&*\\-_=+，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+");

    private final int max_word_len;
    private final float min_freq;
    private final float min_entropy;
    private final float min_aggregation;
    private final boolean filter;

    /**
     * Creates a discoverer with the defaults: maximum word length 4, minimum frequency 5e-5,
     * minimum entropy 0.4, minimum aggregation 1.2 and lexicon filtering disabled.
     */
    public NewWordDiscover() {
        this(4, 5.0E-5f, 0.4f, 1.2f, false);
    }

    /**
     * Creates a discoverer with explicit thresholds.
     *
     * @param i  maximum candidate length in characters
     * @param f  minimum {@code WordInfo.p} a candidate must reach
     * @param f2 minimum {@code WordInfo.entropy} a candidate must reach
     * @param f3 minimum {@code WordInfo.aggregation} a candidate must reach
     * @param z  when {@code true}, candidates for which
     *           {@link LexiconUtility#getFrequency(String)} is positive are dropped
     *           (presumably words already known to the lexicon)
     */
    public NewWordDiscover(int i, float f, float f2, float f3, boolean z) {
        this.max_word_len = i;
        this.min_freq = f;
        this.min_entropy = f2;
        this.min_aggregation = f3;
        this.filter = z;
    }

    /**
     * Reads the corpus line by line and returns the discovered words.
     *
     * @param bufferedReader source of the corpus; it is read to the end but not closed here
     * @param i              size argument handed to the {@link MaxHeap} that ranks the
     *                       surviving candidates by {@code WordInfo.p}
     * @return the content of the ranking heap as a list
     * @throws IOException if reading from {@code bufferedReader} fails
     */
    public List<WordInfo> discover(BufferedReader bufferedReader, int i) throws IOException {
        Map<String, WordInfo> candidates = new TreeMap<>();
        int totalLength = collectCandidates(bufferedReader, candidates);

        // Aggregation is computed in a second pass because it looks other candidates up in
        // the map, which must already carry their probabilities.
        for (WordInfo info : candidates.values()) {
            info.computeProbabilityEntropy(totalLength);
        }
        for (WordInfo info : candidates.values()) {
            info.computeAggregation(candidates);
        }

        LinkedList<WordInfo> survivors = new LinkedList<WordInfo>(candidates.values());
        ListIterator<WordInfo> cursor = survivors.listIterator();
        while (cursor.hasNext()) {
            if (isRejected(cursor.next())) {
                cursor.remove();
            }
        }

        MaxHeap<WordInfo> ranking = new MaxHeap<WordInfo>(i, new Comparator<WordInfo>() {
            @Override
            public int compare(WordInfo left, WordInfo right) {
                return Float.compare(left.p, right.p);
            }
        });
        ranking.addAll(survivors);
        return ranking.toList();
    }

    /**
     * Convenience overload that treats {@code str} as the whole corpus.
     *
     * @param str corpus text, split into lines by {@link BufferedReader#readLine()}
     * @param i   size argument handed to the ranking {@link MaxHeap}
     * @return the content of the ranking heap as a list
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     */
    public List<WordInfo> discover(String str, int i) {
        try {
            return discover(new BufferedReader(new StringReader(str)), i);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Registers every delimiter-free substring of each line in {@code candidates} and feeds
     * its left and right neighbour characters to {@link WordInfo#update(char, char)}.
     *
     * @return the summed length of all lines after delimiter runs were collapsed
     */
    private int collectCandidates(BufferedReader reader, Map<String, WordInfo> candidates) throws IOException {
        int totalLength = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            // Collapse each delimiter run into a single NUL so that it can be detected below.
            String doc = DELIMITER.matcher(line).replaceAll(String.valueOf(SEPARATOR));
            int docLength = doc.length();
            for (int start = 0; start < docLength; start++) {
                int endLimit = Math.min(start + 1 + this.max_word_len, docLength + 1);
                for (int end = start + 1; end < endLimit; end++) {
                    String word = doc.substring(start, end);
                    if (word.indexOf(SEPARATOR) >= 0) {
                        // Every longer candidate from this start contains the separator too.
                        break;
                    }
                    WordInfo info = candidates.get(word);
                    if (info == null) {
                        info = new WordInfo(word);
                        candidates.put(word, info);
                    }
                    char left = start == 0 ? SEPARATOR : doc.charAt(start - 1);
                    char right = end < docLength ? doc.charAt(end) : SEPARATOR;
                    info.update(left, right);
                }
            }
            totalLength += docLength;
        }
        return totalLength;
    }

    /** Tells whether a scored candidate fails any of the configured thresholds. */
    private boolean isRejected(WordInfo info) {
        return info.text.trim().length() < 2
                || info.p < this.min_freq
                || info.entropy < this.min_entropy
                || info.aggregation < this.min_aggregation
                || (this.filter && LexiconUtility.getFrequency(info.text) > 0);
    }
}
