package com.tingwen.widget.webcollector;

import com.github.mikephil.charting.utils.Utils;
import com.xiaomi.mipush.sdk.Constants;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.jsoup.select.NodeVisitor;

/* loaded from: classes.dex */
/**
 * Locates the element of an HTML document that most likely holds the main
 * article text and derives simple news metadata (title, time) around it.
 *
 * <p>Every element below {@code <body>} is given a score built from its text
 * density (non-link text per non-link tag), the variance of its text-leaf
 * lengths, the amount of non-link text and the number of {@code <p>}
 * descendants; the highest-scoring element is reported as the content.
 *
 * <p>An instance is bound to one {@link Document} and mutates it (see
 * {@link #clean()}). The class keeps per-element state in {@link #infoMap}
 * and performs no synchronization.
 */
public class ContentExtractor {
    /** Year, month and day separated by one to five non-digit characters. */
    private static final Pattern DATE_PATTERN = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})");

    /**
     * Date followed by hour, minute and second. The hour group accepts
     * {@code [0-2]?[0-9]}; the previous {@code [0-2]?[1-9]} could not match
     * the hours 00, 10 and 20, so such timestamps fell back to date-only.
     */
    private static final Pattern TIME_PATTERN = Pattern.compile("([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})[^0-9]{1,5}?([0-9]{1,2})");

    /** Tag names {@code h1} .. {@code h6}. */
    private static final Pattern HEADING_TAG = Pattern.compile("h[1-6]");

    /** Number of parent hops taken before the date/time search starts. */
    private static final int ANCESTOR_SKIP = 2;

    /** Maximum number of ancestors whose HTML is searched for a date/time. */
    private static final int ANCESTOR_SEARCH_DEPTH = 6;

    /** Document being analysed; modified in place by {@link #clean()}. */
    protected Document doc;

    /** Statistics for every element visited by {@link #computeInfo(Node)}. */
    protected HashMap<Element, CountInfo> infoMap = new HashMap<>();

    /** Per-node statistics accumulated bottom-up by {@link #computeInfo(Node)}. */
    public class CountInfo {
        /** Total characters of text in the subtree. */
        int textCount = 0;
        /** Characters of text that sit inside {@code <a>} elements. */
        int linkTextCount = 0;
        /** Number of elements in the subtree, including the node itself. */
        int tagCount = 0;
        /** Number of {@code <a>} elements in the subtree. */
        int linkTagCount = 0;
        /** Non-link text per non-link tag; 0 when either quantity is 0. */
        double density = 0.0d;
        /** Sum of the densities of the direct children. */
        double densitySum = 0.0d;
        /** Not written anywhere in this class. */
        double score = 0.0d;
        /** Number of {@code <p>} elements in the subtree. */
        int pCount = 0;
        /** Lengths of all text nodes in the subtree. */
        ArrayList<Integer> leafList = new ArrayList<>();

        CountInfo() {
        }
    }

    ContentExtractor(Document document) {
        this.doc = document;
    }

    /** Returns the text of the content element of {@code document}. */
    public static String getContentByDoc(Document document) throws Exception {
        return getContentElementByDoc(document).text();
    }

    /** Parses {@code str} as HTML and returns the text of its content element. */
    public static String getContentByHtml(String str) throws Exception {
        return getContentElementByDoc(Jsoup.parse(str)).text();
    }

    /** Like {@link #getContentByHtml(String)}, with {@code str2} as base URI. */
    public static String getContentByHtml(String str, String str2) throws Exception {
        return getContentElementByDoc(Jsoup.parse(str, str2)).text();
    }

    /** Requests {@code str} through {@code HttpRequest} and extracts the content text. */
    public static String getContentByUrl(String str) throws Exception {
        return getContentByHtml(new HttpRequest(str).response().decode(), str);
    }

    /** Returns the content element of {@code document}. */
    public static Element getContentElementByDoc(Document document) throws Exception {
        return new ContentExtractor(document).getContentElement();
    }

    /** Parses {@code str} as HTML and returns its content element. */
    public static Element getContentElementByHtml(String str) throws Exception {
        return getContentElementByDoc(Jsoup.parse(str));
    }

    /** Like {@link #getContentElementByHtml(String)}, with {@code str2} as base URI. */
    public static Element getContentElementByHtml(String str, String str2) throws Exception {
        return getContentElementByDoc(Jsoup.parse(str, str2));
    }

    /** Requests {@code str} through {@code HttpRequest} and returns the content element. */
    public static Element getContentElementByUrl(String str) throws Exception {
        return getContentElementByHtml(new HttpRequest(str).response().decode(), str);
    }

    /** Builds a {@code News} object from {@code document}. */
    public static News getNewsByDoc(Document document) throws Exception {
        return new ContentExtractor(document).getNews();
    }

    /** Parses {@code str} as HTML and builds a {@code News} object from it. */
    public static News getNewsByHtml(String str) throws Exception {
        return getNewsByDoc(Jsoup.parse(str));
    }

    /** Like {@link #getNewsByHtml(String)}, with {@code str2} as base URI. */
    public static News getNewsByHtml(String str, String str2) throws Exception {
        return getNewsByDoc(Jsoup.parse(str, str2));
    }

    /** Requests {@code str} through {@code HttpRequest} and builds a {@code News} object. */
    public static News getNewsByUrl(String str) throws Exception {
        return getNewsByHtml(new HttpRequest(str).response().decode(), str);
    }

    /** Removes script, noscript, style, iframe and br elements from the document. */
    protected void clean() {
        this.doc.select("script,noscript,style,iframe,br").remove();
    }

    /**
     * Recursively computes the statistics of {@code node} and its subtree and
     * records the result of every element in {@link #infoMap}.
     *
     * @param node node to analyse
     * @return statistics of {@code node}; all zero for nodes that are neither
     *         elements nor text nodes
     */
    protected CountInfo computeInfo(Node node) {
        CountInfo info = new CountInfo();
        if (node instanceof TextNode) {
            int length = ((TextNode) node).text().length();
            info.textCount = length;
            info.leafList.add(Integer.valueOf(length));
            return info;
        }
        if (!(node instanceof Element)) {
            return info;
        }

        Element element = (Element) node;
        for (Node child : element.childNodes()) {
            CountInfo childInfo = computeInfo(child);
            info.textCount += childInfo.textCount;
            info.linkTextCount += childInfo.linkTextCount;
            info.tagCount += childInfo.tagCount;
            info.linkTagCount += childInfo.linkTagCount;
            info.leafList.addAll(childInfo.leafList);
            info.densitySum += childInfo.density;
            info.pCount += childInfo.pCount;
        }

        // The element counts itself as one tag.
        info.tagCount++;
        String tagName = element.tagName();
        if (tagName.equals("a")) {
            // Everything inside a link is link text.
            info.linkTextCount = info.textCount;
            info.linkTagCount++;
        } else if (tagName.equals("p")) {
            info.pCount++;
        }

        int plainText = info.textCount - info.linkTextCount;
        int plainTags = info.tagCount - info.linkTagCount;
        if (plainText == 0 || plainTags == 0) {
            info.density = 0.0d;
        } else {
            info.density = ((double) plainText) / plainTags;
        }
        this.infoMap.put(element, info);
        return info;
    }

    /**
     * Scores an element that was previously visited by
     * {@link #computeInfo(Node)}.
     *
     * @param element element present in {@link #infoMap}
     * @return the content score; larger means more content-like
     */
    protected double computeScore(Element element) {
        CountInfo info = this.infoMap.get(element);
        double varianceTerm = Math.log(Math.sqrt(computeVar(info.leafList) + 1.0d));
        double textTerm = Math.log((info.textCount - info.linkTextCount) + 1);
        double paragraphTerm = Math.log10(info.pCount + 2);
        return varianceTerm * info.densitySum * textTerm * paragraphTerm;
    }

    /**
     * Computes the population variance of the given values.
     *
     * @param arrayList values to analyse
     * @return 0 for an empty list, half of the value (integer division) for a
     *         single value, otherwise the mean squared deviation from the mean
     */
    protected double computeVar(ArrayList<Integer> arrayList) {
        int count = arrayList.size();
        if (count == 0) {
            return 0.0d;
        }
        if (count == 1) {
            // Integer division is kept on purpose so existing scores do not shift.
            return arrayList.get(0).intValue() / 2;
        }
        double sum = 0.0d;
        for (Integer value : arrayList) {
            sum += value.intValue();
        }
        double mean = sum / count;
        double squaredDeviation = 0.0d;
        for (Integer value : arrayList) {
            double delta = value.intValue() - mean;
            squaredDeviation += delta * delta;
        }
        return squaredDeviation / count;
    }

    /**
     * Computes the Levenshtein distance between two strings, comparing
     * {@code char} by {@code char}.
     *
     * @return number of single-char insertions, deletions and substitutions
     *         needed to turn {@code str} into {@code str2}
     */
    protected int editDistance(String str, String str2) {
        int rows = str.length();
        int cols = str2.length();
        int[][] dist = new int[rows + 1][cols + 1];
        for (int r = 0; r <= rows; r++) {
            dist[r][0] = r;
        }
        for (int c = 0; c <= cols; c++) {
            dist[0][c] = c;
        }
        for (int r = 0; r < rows; r++) {
            char left = str.charAt(r);
            for (int c = 0; c < cols; c++) {
                if (left == str2.charAt(c)) {
                    dist[r + 1][c + 1] = dist[r][c];
                } else {
                    int substitute = dist[r][c] + 1;
                    int delete = dist[r][c + 1] + 1;
                    int insert = dist[r + 1][c] + 1;
                    dist[r + 1][c + 1] = Math.min(insert, Math.min(substitute, delete));
                }
            }
        }
        return dist[rows][cols];
    }

    /**
     * Cleans the document, scores every element below {@code <body>} and
     * returns the best one. {@code <a>} elements and the body itself are
     * never candidates.
     *
     * @return the highest-scoring element
     * @throws Exception if no element has a score greater than zero
     */
    public Element getContentElement() throws Exception {
        clean();
        Element body = this.doc.body();
        computeInfo(body);
        double bestScore = 0.0d;
        Element best = null;
        for (Element candidate : this.infoMap.keySet()) {
            if (candidate.tagName().equals("a") || candidate == body) {
                continue;
            }
            double score = computeScore(candidate);
            if (score > bestScore) {
                bestScore = score;
                best = candidate;
            }
        }
        if (best == null) {
            throw new Exception("抓取失败  extraction failed");
        }
        return best;
    }

    /**
     * Searches the outer HTML of the ancestors of {@code element} for
     * {@code pattern}. The search starts {@value #ANCESTOR_SKIP} levels above
     * {@code element} (never above the body) and examines at most
     * {@value #ANCESTOR_SEARCH_DEPTH} elements, stopping at the body.
     *
     * @return a matcher positioned on the first match, or {@code null}
     */
    private Matcher findInAncestors(Element element, Pattern pattern) {
        Element body = this.doc.body();
        Element current = element;
        for (int hop = 0; hop < ANCESTOR_SKIP; hop++) {
            if (current != null && current != body) {
                Element parent = current.parent();
                if (parent != null) {
                    current = parent;
                }
            }
        }
        for (int depth = 0; depth < ANCESTOR_SEARCH_DEPTH && current != null; depth++) {
            Matcher matcher = pattern.matcher(current.outerHtml());
            if (matcher.find()) {
                return matcher;
            }
            if (current == body) {
                // Climbing stops at the body; rescanning it cannot yield a new result.
                break;
            }
            current = current.parent();
        }
        return null;
    }

    /**
     * Finds a year/month/day date in the HTML surrounding {@code element}.
     *
     * @return the three date fields as matched, joined by
     *         {@code Constants.ACCEPT_TIME_SEPARATOR_SERVER}
     * @throws Exception if no date is found
     */
    protected String getDate(Element element) throws Exception {
        Matcher matcher = findInAncestors(element, DATE_PATTERN);
        if (matcher == null) {
            throw new Exception("date not found");
        }
        return matcher.group(1) + Constants.ACCEPT_TIME_SEPARATOR_SERVER + matcher.group(2) + Constants.ACCEPT_TIME_SEPARATOR_SERVER + matcher.group(3);
    }

    /**
     * Extracts content, URL, time and title into a {@code News} object. Time
     * and title are optional: failures to find them are ignored.
     *
     * @throws Exception wrapping any failure to determine the content element
     */
    public News getNews() throws Exception {
        News news = new News();
        try {
            Element contentElement = getContentElement();
            news.setContentElement(contentElement);
            if (this.doc.baseUri() != null) {
                news.setUrl(this.doc.baseUri());
            }
            try {
                news.setTime(getTime(contentElement));
            } catch (Exception ignored) {
                // Best effort: the news item is still useful without a time.
            }
            try {
                news.setTitle(getTitle(contentElement));
            } catch (Exception ignored) {
                // Best effort: the news item is still useful without a title.
            }
            return news;
        } catch (Exception e) {
            throw new Exception(e);
        }
    }

    /**
     * Finds a date with hour, minute and second in the HTML surrounding
     * {@code element}; falls back to {@link #getDate(Element)} when only a
     * date can be found.
     *
     * @return the matched fields; date fields joined by
     *         {@code Constants.ACCEPT_TIME_SEPARATOR_SERVER}, a space, then
     *         time fields joined by {@code Constants.COLON_SEPARATOR}
     * @throws Exception if neither a time nor a date is found
     */
    protected String getTime(Element element) throws Exception {
        Matcher matcher = findInAncestors(element, TIME_PATTERN);
        if (matcher != null) {
            return matcher.group(1) + Constants.ACCEPT_TIME_SEPARATOR_SERVER + matcher.group(2) + Constants.ACCEPT_TIME_SEPARATOR_SERVER + matcher.group(3) + " " + matcher.group(4) + Constants.COLON_SEPARATOR + matcher.group(5) + Constants.COLON_SEPARATOR + matcher.group(6);
        }
        try {
            return getDate(element);
        } catch (Exception e) {
            throw new Exception("time not found", e);
        }
    }

    /**
     * Determines the title belonging to the content element. Tried in order:
     * <ol>
     *   <li>the {@code h1}-{@code h6} element preceding {@code element} in
     *       document order whose text best resembles the document title,
     *       with later headings weighted higher;</li>
     *   <li>the first element whose id or class starts or ends with
     *       {@code title}, if its text is 6 to 39 characters long;</li>
     *   <li>{@link #getTitleByEditDistance(Element)}.</li>
     * </ol>
     *
     * @param element the content element
     * @throws Exception if none of the strategies yields a title
     */
    protected String getTitle(final Element element) throws Exception {
        final ArrayList<Element> headings = new ArrayList<>();
        final ArrayList<Double> similarities = new ArrayList<>();
        // Number of headings seen before the content element was reached.
        final AtomicInteger headingsBeforeContent = new AtomicInteger();
        final String docTitle = this.doc.title().trim();
        if (!docTitle.isEmpty()) {
            this.doc.body().traverse(new NodeVisitor() {
                @Override
                public void head(Node node, int depth) {
                    if (!(node instanceof Element)) {
                        return;
                    }
                    Element current = (Element) node;
                    if (current == element) {
                        headingsBeforeContent.set(headings.size());
                    } else if (HEADING_TAG.matcher(current.tagName()).matches()) {
                        similarities.add(Double.valueOf(ContentExtractor.this.strSim(current.text().trim(), docTitle)));
                        headings.add(current);
                    }
                }

                @Override
                public void tail(Node node, int depth) {
                }
            });
            int candidateCount = headingsBeforeContent.get();
            double bestWeighted = 0.0d;
            int bestIndex = -1;
            for (int idx = 0; idx < candidateCount; idx++) {
                // Headings closer to the content get a larger weight.
                double weighted = (idx + 1) * similarities.get(idx).doubleValue();
                if (weighted > bestWeighted) {
                    bestWeighted = weighted;
                    bestIndex = idx;
                }
            }
            if (bestIndex != -1) {
                return headings.get(bestIndex).text();
            }
        }
        Elements titled = this.doc.body().select("*[id^=title],*[id$=title],*[class^=title],*[class$=title]");
        if (titled.size() > 0) {
            String text = titled.first().text();
            if (text.length() > 5 && text.length() < 40) {
                return text;
            }
        }
        try {
            return getTitleByEditDistance(element);
        } catch (Exception e) {
            throw new Exception("title not found", e);
        }
    }

    /**
     * Returns the trimmed text of the text node under {@code <body>} that is
     * most similar to the document title according to
     * {@link #strSim(String, String)}; the first such node wins ties.
     *
     * @param element not used by this implementation
     * @throws Exception if no text node has a similarity greater than zero
     */
    protected String getTitleByEditDistance(Element element) throws Exception {
        final String docTitle = this.doc.title();
        // Single-element array: mutable holder writable from the visitor.
        final double[] bestSimilarity = {0.0d};
        final StringBuilder bestText = new StringBuilder();
        this.doc.body().traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int depth) {
                if (!(node instanceof TextNode)) {
                    return;
                }
                String text = ((TextNode) node).text().trim();
                double similarity = ContentExtractor.this.strSim(text, docTitle);
                if (similarity > 0.0d && similarity > bestSimilarity[0]) {
                    bestSimilarity[0] = similarity;
                    bestText.setLength(0);
                    bestText.append(text);
                }
            }

            @Override
            public void tail(Node node, int depth) {
            }
        });
        if (bestText.length() > 0) {
            return bestText.toString();
        }
        throw new Exception("no text node resembles the document title");
    }

    /**
     * Computes the length of the longest common subsequence of two strings,
     * comparing {@code char} by {@code char}.
     */
    protected int lcs(String str, String str2) {
        int rows = str.length();
        int cols = str2.length();
        if (rows == 0 || cols == 0) {
            return 0;
        }
        // table[r][c] = LCS length of str[r..] and str2[c..]; last row/column stay 0.
        int[][] table = new int[rows + 1][cols + 1];
        for (int r = rows - 1; r >= 0; r--) {
            for (int c = cols - 1; c >= 0; c--) {
                if (str.charAt(r) == str2.charAt(c)) {
                    table[r][c] = table[r + 1][c + 1] + 1;
                } else {
                    table[r][c] = Math.max(table[r + 1][c], table[r][c + 1]);
                }
            }
        }
        return table[0][0];
    }

    /**
     * Computes a similarity in [0, 1] between two strings: the length of
     * their longest common subsequence divided by the longer length.
     *
     * @return 0 if either string is empty or the longer string is at least
     *         three times as long as the shorter one
     */
    protected double strSim(String str, String str2) {
        int length = str.length();
        int length2 = str2.length();
        if (length == 0 || length2 == 0) {
            return 0.0d;
        }
        int longer = Math.max(length, length2);
        int shorter = Math.min(length, length2);
        double ratio = ((double) longer) / shorter;
        if (ratio >= 3.0d) {
            return 0.0d;
        }
        return ((double) lcs(str, str2)) / longer;
    }
}
