package com.hankcs.hanlp.mining.word;

import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.utility.LexiconUtility;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.TreeMap;
import java.util.regex.Pattern;

/* loaded from: classes2.dex */
/**
 * Discovers candidate new words in raw text.
 *
 * <p>Every substring of up to {@code max_word_len} characters that does not cross a
 * delimiter is collected as a candidate together with its left and right neighbour
 * characters. Candidates are then scored via {@link WordInfo} and those that fall below
 * the configured thresholds are discarded.
 *
 * <p>The configuration is fixed at construction time. Thread-safety of {@link #discover}
 * additionally depends on {@code WordInfo}, {@code MaxHeap} and {@code LexiconUtility},
 * which are not visible here.
 */
public class NewWordDiscover {
    /**
     * Runs of whitespace, digits and ASCII/CJK punctuation. Each run is collapsed into a
     * single {@link #BOUNDARY} character before candidates are extracted. Compiled once,
     * because the expression never changes between calls.
     */
    private static final Pattern DELIMITER = Pattern.compile("[\\s\\d,.<>/?:;'\"\\[\\]{}()\\|~!@#$%^&*\\-_=+，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+");

    /** Sentinel marking a delimiter run, and also "no neighbour" at either end of a line. */
    private static final char BOUNDARY = '\0';

    /** {@link #BOUNDARY} as a replacement string for {@code Matcher.replaceAll}. */
    private static final String BOUNDARY_REPLACEMENT = String.valueOf(BOUNDARY);

    /** When true, candidates for which {@code LexiconUtility.getFrequency} is positive are dropped. */
    private final boolean filter;
    /** Maximum length, in chars, of a candidate word. */
    private final int max_word_len;
    /** Candidates whose {@code aggregation} is below this value are dropped. */
    private final float min_aggregation;
    /** Candidates whose {@code entropy} is below this value are dropped. */
    private final float min_entropy;
    /** Candidates whose {@code p} value is below this value are dropped. */
    private final float min_freq;

    /**
     * Creates a discoverer with the defaults: max word length 4, minimum frequency 5e-5,
     * minimum entropy 0.4, minimum aggregation 1.2, lexicon filtering disabled.
     */
    public NewWordDiscover() {
        this(4, 5.0E-5f, 0.4f, 1.2f, false);
    }

    /**
     * Creates a discoverer with explicit thresholds.
     *
     * @param maxWordLen     maximum length, in chars, of a candidate word
     * @param minFreq        lower bound on a candidate's {@code p} value
     * @param minEntropy     lower bound on a candidate's {@code entropy}
     * @param minAggregation lower bound on a candidate's {@code aggregation}
     * @param filter         if true, drop candidates for which
     *                       {@code LexiconUtility.getFrequency} returns a positive value
     *                       (presumably words already in the lexicon; confirm against
     *                       {@code LexiconUtility})
     */
    public NewWordDiscover(int maxWordLen, float minFreq, float minEntropy, float minAggregation, boolean filter) {
        this.max_word_len = maxWordLen;
        this.min_freq = minFreq;
        this.min_entropy = minEntropy;
        this.min_aggregation = minAggregation;
        this.filter = filter;
    }

    /**
     * Reads the text line by line and returns the candidates that pass all thresholds.
     *
     * <p>The reader is consumed until {@code readLine()} returns null; it is not closed here.
     *
     * @param reader source of the text
     * @param size   forwarded as the first constructor argument of {@code MaxHeap}
     *               (presumably the maximum number of results; confirm against {@code MaxHeap})
     * @return the surviving candidates as produced by {@code MaxHeap.toList()}, with the heap
     *         ordered by each candidate's {@code p} value
     * @throws IOException if reading from {@code reader} fails
     */
    public List<WordInfo> discover(BufferedReader reader, int size) throws IOException {
        // TreeMap is kept as the declared type because it is handed to
        // WordInfo.computeAggregation, whose parameter type is not visible from here.
        TreeMap<String, WordInfo> candidates = new TreeMap<String, WordInfo>();
        int totalLength = 0;
        String line;
        while ((line = reader.readLine()) != null) {
            String text = DELIMITER.matcher(line).replaceAll(BOUNDARY_REPLACEMENT);
            int textLength = text.length();
            for (int start = 0; start < textLength; start++) {
                // Exclusive upper bound for "end": at most max_word_len chars, never past the line.
                int endLimit = Math.min(start + 1 + this.max_word_len, textLength + 1);
                char left = start == 0 ? BOUNDARY : text.charAt(start - 1);
                for (int end = start + 1; end < endLimit; end++) {
                    String word = text.substring(start, end);
                    if (word.indexOf(BOUNDARY) >= 0) {
                        // A candidate must not span a delimiter.
                        continue;
                    }
                    WordInfo info = candidates.get(word);
                    if (info == null) {
                        info = new WordInfo(word);
                        candidates.put(word, info);
                    }
                    char right = end < textLength ? text.charAt(end) : BOUNDARY;
                    info.update(left, right);
                }
            }
            // Counts the line after delimiter collapsing, so each delimiter run contributes 1.
            totalLength += textLength;
        }

        // Two separate passes: the second one receives the whole candidate map, so every
        // entry gets its probability/entropy computed before any aggregation is computed.
        for (WordInfo info : candidates.values()) {
            info.computeProbabilityEntropy(totalLength);
        }
        for (WordInfo info : candidates.values()) {
            info.computeAggregation(candidates);
        }

        // Copy into a LinkedList so rejected entries can be unlinked cheaply while iterating.
        List<WordInfo> survivors = new LinkedList<WordInfo>(candidates.values());
        Iterator<WordInfo> iterator = survivors.iterator();
        while (iterator.hasNext()) {
            if (isRejected(iterator.next())) {
                iterator.remove();
            }
        }

        MaxHeap<WordInfo> topWords = new MaxHeap<WordInfo>(size, new Comparator<WordInfo>() {
            @Override
            public int compare(WordInfo first, WordInfo second) {
                return Float.compare(first.f4541p, second.f4541p);
            }
        });
        topWords.addAll(survivors);
        return topWords.toList();
    }

    /**
     * Convenience overload that runs discovery over an in-memory string.
     *
     * @param str  the text to analyse
     * @param size see {@link #discover(BufferedReader, int)}
     * @return see {@link #discover(BufferedReader, int)}
     * @throws RuntimeException wrapping any {@link IOException} raised while reading
     */
    public List<WordInfo> discover(String str, int size) {
        try {
            return discover(new BufferedReader(new StringReader(str)), size);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Decides whether a scored candidate should be discarded. The lexicon lookup is
     * evaluated last, and only when filtering is enabled.
     */
    private boolean isRejected(WordInfo info) {
        if (info.text.trim().length() < 2) {
            return true;
        }
        if (info.f4541p < this.min_freq || info.entropy < this.min_entropy || info.aggregation < this.min_aggregation) {
            return true;
        }
        return this.filter && LexiconUtility.getFrequency(info.text) > 0;
    }
}
