package weka.filters.unsupervised.attribute;

import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Vector;
import weka.classifiers.lazy.kstar.KStarConstants;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.Stopwords;
import weka.core.Tag;
import weka.core.TestInstances;
import weka.core.Utils;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

/* loaded from: classes.dex */
public class StringToWordVector extends Filter implements OptionHandler, UnsupervisedFilter {
    /** Normalization mode: word values are left as computed. */
    public static final int FILTER_NONE = 0;
    /** Normalization mode: the first batch and every later instance are normalized. */
    public static final int FILTER_NORMALIZE_ALL = 1;
    /** Normalization mode: only instances arriving after the first batch are normalized. */
    public static final int FILTER_NORMALIZE_TEST_ONLY = 2;
    /** Tags offered for the document-length normalization setting. */
    public static final Tag[] TAGS_FILTER = {
        new Tag(FILTER_NONE, "No normalization"),
        new Tag(FILTER_NORMALIZE_ALL, "Normalize all data"),
        new Tag(FILTER_NORMALIZE_TEST_ONLY, "Normalize test data only")
    };
    static final long serialVersionUID = 8249106275278565424L;

    // ---- attribute selection and naming ----

    /** Attributes the filter operates on; narrowed to string attributes when the dictionary is built. */
    protected Range m_SelectedRange;
    /** Prefix put in front of every generated word-attribute name. */
    private String m_Prefix;

    // ---- state learned from the first batch ----

    /** Maps each kept (stemmed) word to the index of its output attribute. */
    private TreeMap<String, Integer> m_Dictionary;
    /** Indexed by output attribute: number of first-batch documents containing the word. */
    private int[] m_DocsCounts;
    /** Number of instances in the first batch; -1 until the dictionary has been built. */
    private int m_NumInstances;
    /** Mean Euclidean length of the first-batch word vectors; negative until computed. */
    private double m_AvgDocLength;

    // ---- word value options ----

    /** Emit occurrence counts instead of 0/1 presence values. */
    private boolean m_OutputCounts;
    /** Replace each word value f by log(1 + f). */
    private boolean m_TFTransform;
    /** Multiply each word value by log(numDocs / numDocsContainingWord). */
    private boolean m_IDFTransform;
    /** One of the FILTER_* normalization modes. */
    protected int m_filterType;

    // ---- dictionary construction options ----

    /** Number of words (per class dictionary) the filter tries to keep. */
    private int m_WordsToKeep;
    /** Pruning interval as a percentage of the dataset; a non-positive interval disables pruning. */
    private double m_PeriodicPruningRate;
    /** Minimum number of occurrences a word needs to be kept. */
    private int m_minTermFreq;
    /** When true a single dictionary is built instead of one per class value. */
    private boolean m_doNotOperateOnPerClassBasis;

    // ---- token processing ----

    /** Splits the text of the selected attributes into tokens. */
    private Tokenizer m_Tokenizer;
    /** Stemmer applied to every token. */
    private Stemmer m_Stemmer;
    /** Lower-case tokens before stemming and dictionary lookup. */
    private boolean m_lowerCaseTokens;
    /** Skip words found in the stoplist while building the dictionary. */
    private boolean m_useStoplist;
    /** Stopword file; a directory stands for "no custom file". */
    private File m_Stopwords;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: classes.dex */
    public class Count implements Serializable, RevisionHandler {
        static final long serialVersionUID = 2157223818584474321L;
        public int count;
        public int docCount;

        public Count(int i) {
            this.count = i;
        }

        @Override // weka.core.RevisionHandler
        public String getRevision() {
            return RevisionUtils.extract("$Revision: 10215 $");
        }
    }

    /**
     * Creates a filter with the default configuration: all attributes selected,
     * 0/1 word presence output, 1000 words to keep, no periodic pruning, no
     * normalization, no stemming, minimum term frequency 1 and a
     * {@link WordTokenizer}.
     */
    public StringToWordVector() {
        // Collaborators.
        m_SelectedRange = new Range("first-last");
        m_Dictionary = new TreeMap<>();
        m_Stemmer = new NullStemmer();
        // The working directory stands in for "no stopword file".
        m_Stopwords = new File(System.getProperty("user.dir"));
        m_Tokenizer = new WordTokenizer();

        // Plain option defaults.
        m_Prefix = "";
        m_OutputCounts = false;
        m_doNotOperateOnPerClassBasis = false;
        m_filterType = 0;
        m_minTermFreq = 1;
        m_WordsToKeep = 1000;
        m_PeriodicPruningRate = -1.0d;

        // Not known until a first batch has been processed.
        m_NumInstances = -1;
        m_AvgDocLength = -1.0d;
    }

    /**
     * Creates a filter with the default configuration, except for the number of
     * words to keep.
     *
     * @param i the number of words to keep
     */
    public StringToWordVector(int i) {
        // Same defaults as the no-argument constructor; only the word limit differs.
        this();
        m_WordsToKeep = i;
    }

    /**
     * Converts one input instance into a sparse word-vector instance (without
     * document-length normalization) and appends it to {@code arrayList}.
     *
     * <p>Non-selected attributes are copied to the front of the output instance;
     * the selected attributes are tokenized and every token found in the
     * dictionary sets (or increments) the value of its word attribute. The
     * optional TF and IDF transforms are then applied to the word values only.
     *
     * @param instance the input instance to convert
     * @param arrayList receives the converted instance
     * @return the number of copied (non-word) attributes, i.e. the index of the
     *         first word attribute in the output
     */
    private int convertInstancewoDocNorm(Instance instance, ArrayList<Instance> arrayList) {
        // Output attribute index -> value; sorted so the sparse arrays come out in index order.
        TreeMap<Integer, Double> contained = new TreeMap<>();
        Instances inputFormat = getInputFormat();
        Instances outputFormat = outputFormatPeek();

        // Pass 1: copy every attribute that is not converted to words.
        int firstCopy = 0;
        for (int attr = 0; attr < inputFormat.numAttributes(); attr++) {
            if (this.m_SelectedRange.isInRange(attr)) {
                continue;
            }
            int type = inputFormat.attribute(attr).type();
            if (type == Attribute.STRING || type == Attribute.RELATIONAL) {
                if (instance.isMissing(attr)) {
                    contained.put(firstCopy, Utils.missingValue());
                } else if (type == Attribute.STRING) {
                    Attribute target = outputFormat.attribute(firstCopy);
                    // Occupy slot 0 with a dummy so no real string is stored under value 0
                    // (a 0 would not be kept by the sparse representation).
                    if (target.numValues() == 0) {
                        target.addStringValue("Hack to defeat SparseInstance bug");
                    }
                    contained.put(firstCopy, (double) target.addStringValue(instance.stringValue(attr)));
                } else {
                    Attribute target = outputFormat.attribute(firstCopy);
                    // Same trick as for strings: keep real relations away from slot 0.
                    if (target.numValues() == 0) {
                        target.addRelation(target.relation());
                    }
                    contained.put(firstCopy, (double) target.addRelation(instance.relationalValue(attr)));
                }
            } else if (instance.value(attr) != KStarConstants.FLOOR) {
                // NOTE(review): FLOOR is presumably 0.0, i.e. only non-zero values are stored - confirm.
                contained.put(firstCopy, instance.value(attr));
            }
            firstCopy++;
        }

        // Pass 2: tokenize the selected attributes and record dictionary words.
        for (int attr = 0; attr < instance.numAttributes(); attr++) {
            if (!this.m_SelectedRange.isInRange(attr) || instance.isMissing(attr)) {
                continue;
            }
            this.m_Tokenizer.tokenize(instance.stringValue(attr));
            while (this.m_Tokenizer.hasMoreElements()) {
                String word = this.m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                Integer index = this.m_Dictionary.get(this.m_Stemmer.stem(word));
                if (index == null) {
                    continue; // not a dictionary word
                }
                Double previous = this.m_OutputCounts ? contained.get(index) : null;
                contained.put(index, previous == null ? 1.0d : previous.doubleValue() + 1.0d);
            }
        }

        // The loops below only overwrite values of existing keys, which is not a
        // structural modification, so iterating over keySet() while calling put() is safe.
        if (this.m_TFTransform) {
            for (Integer index : contained.keySet()) {
                if (index.intValue() >= firstCopy) {
                    contained.put(index, Math.log(1.0d + contained.get(index).doubleValue()));
                }
            }
        }
        if (this.m_IDFTransform) {
            for (Integer index : contained.keySet()) {
                if (index.intValue() >= firstCopy) {
                    // Divide in floating point: an int/int division would truncate the
                    // ratio (e.g. 10/4 -> 2) before the logarithm is taken.
                    double idf = Math.log(this.m_NumInstances / (double) this.m_DocsCounts[index.intValue()]);
                    contained.put(index, contained.get(index).doubleValue() * idf);
                }
            }
        }

        // Flatten the map into the parallel value/index arrays of a sparse instance.
        double[] values = new double[contained.size()];
        int[] indices = new int[contained.size()];
        int pos = 0;
        for (Integer index : contained.keySet()) {
            values[pos] = contained.get(index).doubleValue();
            indices[pos] = index.intValue();
            pos++;
        }
        SparseInstance sparseInstance = new SparseInstance(instance.weight(), values, indices, outputFormat.numAttributes());
        sparseInstance.setDataset(outputFormat);
        arrayList.add(sparseInstance);
        return firstCopy;
    }

    /**
     * Builds the word dictionary from the buffered input instances and sets the
     * output format.
     *
     * <p>Words are counted per class value (or in a single dictionary when
     * per-class operation is switched off or no class is set), optionally pruned
     * periodically, and then cut down to roughly {@code m_WordsToKeep} words per
     * dictionary subject to {@code m_minTermFreq}. The output format consists of
     * copies of the non-selected input attributes followed by one attribute per
     * kept word. As side effects {@code m_Dictionary}, {@code m_DocsCounts} and
     * {@code m_NumInstances} are updated.
     */
    private void determineDictionary() {
        Stopwords stopwords = new Stopwords();
        if (getUseStoplist()) {
            try {
                // Only an existing regular file is read; otherwise the Stopwords object is used as constructed.
                if (getStopwords().exists() && !getStopwords().isDirectory()) {
                    stopwords.read(getStopwords());
                }
            } catch (Exception e) {
                // Best effort: a read failure is reported and dictionary building carries on.
                e.printStackTrace();
            }
        }
        int classIndex = getInputFormat().classIndex();
        // i = number of dictionaries: one per class value, or a single shared one.
        int i = 1;
        if (!this.m_doNotOperateOnPerClassBasis && classIndex != -1) {
            i = getInputFormat().attribute(classIndex).numValues();
        }
        // One word -> Count map per dictionary.
        TreeMap[] treeMapArr = new TreeMap[i];
        for (int i2 = 0; i2 < i; i2++) {
            treeMapArr[i2] = new TreeMap();
        }
        determineSelectedRange();
        // Number of instances between two pruning passes (pruning is skipped unless this is positive).
        long round = Math.round((this.m_PeriodicPruningRate / 100.0d) * getInputFormat().numInstances());
        for (int i3 = 0; i3 < getInputFormat().numInstances(); i3++) {
            Instance instance = getInputFormat().instance(i3);
            // i4 = dictionary this instance contributes to.
            int i4 = 0;
            if (!this.m_doNotOperateOnPerClassBasis && classIndex != -1) {
                i4 = (int) instance.classValue();
            }
            // Used as a set: the distinct words seen in this document.
            Hashtable hashtable = new Hashtable();
            for (int i5 = 0; i5 < instance.numAttributes(); i5++) {
                if (this.m_SelectedRange.isInRange(i5) && !instance.isMissing(i5)) {
                    this.m_Tokenizer.tokenize(instance.stringValue(i5));
                    while (this.m_Tokenizer.hasMoreElements()) {
                        String intern = this.m_Tokenizer.nextElement().intern();
                        if (this.m_lowerCaseTokens) {
                            intern = intern.toLowerCase();
                        }
                        String stem = this.m_Stemmer.stem(intern);
                        // Stopwords are checked on the stemmed form.
                        if (!this.m_useStoplist || !stopwords.is(stem)) {
                            if (!hashtable.containsKey(stem)) {
                                hashtable.put(stem, new Integer(0));
                            }
                            Count count = (Count) treeMapArr[i4].get(stem);
                            if (count == null) {
                                treeMapArr[i4].put(stem, new Count(1));
                            } else {
                                count.count++;
                            }
                        }
                    }
                }
            }
            // Each distinct word of this document raises its document count by one.
            Enumeration keys = hashtable.keys();
            while (keys.hasMoreElements()) {
                Count count2 = (Count) treeMapArr[i4].get((String) keys.nextElement());
                if (count2 != null) {
                    count2.docCount++;
                } else {
                    System.err.println("Warning: A word should definitely be in the dictionary.Please check the code");
                }
            }
            // Periodic pruning: drop every word seen at most once so far, in all dictionaries.
            if (round > 0 && i3 % round == 0 && i3 > 0) {
                for (int i6 = 0; i6 < i; i6++) {
                    // Collect first, remove afterwards, so the map is not modified while iterating.
                    ArrayList arrayList = new ArrayList(1000);
                    for (String str : treeMapArr[i6].keySet()) {
                        if (((Count) treeMapArr[i6].get(str)).count <= 1) {
                            arrayList.add(str);
                        }
                    }
                    Iterator it = arrayList.iterator();
                    while (it.hasNext()) {
                        treeMapArr[i6].remove((String) it.next());
                    }
                }
            }
        }
        // i7 = total number of words over all dictionaries; iArr = count threshold per dictionary.
        int i7 = 0;
        int[] iArr = new int[i];
        for (int i8 = 0; i8 < i; i8++) {
            i7 += treeMapArr[i8].size();
            int[] iArr2 = new int[treeMapArr[i8].size()];
            int i9 = 0;
            Iterator it2 = treeMapArr[i8].keySet().iterator();
            while (it2.hasNext()) {
                iArr2[i9] = ((Count) treeMapArr[i8].get((String) it2.next())).count;
                i9++;
            }
            // Order the counts so the cut-off can be read by position from the end.
            sortArray(iArr2);
            if (iArr2.length < this.m_WordsToKeep) {
                // Fewer words than requested: only the minimum term frequency applies.
                iArr[i8] = this.m_minTermFreq;
            } else {
                iArr[i8] = Math.max(this.m_minTermFreq, iArr2[iArr2.length - this.m_WordsToKeep]);
            }
        }
        // Output attributes: copies of the non-selected input attributes come first.
        ArrayList arrayList2 = new ArrayList(getInputFormat().numAttributes() + i7);
        // i10 = position of the class attribute in the output, -1 if there is none.
        int i10 = -1;
        for (int i11 = 0; i11 < getInputFormat().numAttributes(); i11++) {
            if (!this.m_SelectedRange.isInRange(i11)) {
                if (getInputFormat().classIndex() == i11) {
                    i10 = arrayList2.size();
                }
                arrayList2.add((Attribute) getInputFormat().attribute(i11).copy());
            }
        }
        // Then one attribute per word that reaches its dictionary's threshold; a word
        // present in several dictionaries is added only once.
        TreeMap<String, Integer> treeMap = new TreeMap<>();
        int size = arrayList2.size();
        for (int i12 = 0; i12 < i; i12++) {
            for (String str2 : treeMapArr[i12].keySet()) {
                if (((Count) treeMapArr[i12].get(str2)).count >= iArr[i12] && treeMap.get(str2) == null) {
                    treeMap.put(str2, new Integer(size));
                    arrayList2.add(new Attribute(this.m_Prefix + str2));
                    size++;
                }
            }
        }
        // Document counts are indexed by output attribute and summed over all dictionaries.
        this.m_DocsCounts = new int[arrayList2.size()];
        for (String str3 : treeMap.keySet()) {
            int intValue = treeMap.get(str3).intValue();
            int i13 = 0;
            for (int i14 = 0; i14 < i; i14++) {
                Count count3 = (Count) treeMapArr[i14].get(str3);
                if (count3 != null) {
                    i13 += count3.docCount;
                }
            }
            this.m_DocsCounts[intValue] = i13;
        }
        arrayList2.trimToSize();
        this.m_Dictionary = treeMap;
        this.m_NumInstances = getInputFormat().numInstances();
        Instances instances = new Instances(getInputFormat().relationName(), (ArrayList<Attribute>) arrayList2, 0);
        instances.setClassIndex(i10);
        setOutputFormat(instances);
    }

    /**
     * Restricts the selected range to the string attributes of the input format.
     * If no range is set at all, every string attribute is selected first.
     */
    private void determineSelectedRange() {
        Instances format = getInputFormat();

        // No range configured: start from all string attributes (type code 2).
        if (m_SelectedRange == null) {
            StringBuilder allStrings = new StringBuilder();
            for (int attr = 0; attr < format.numAttributes(); attr++) {
                if (format.attribute(attr).type() == 2) {
                    allStrings.append(attr + 1).append(",");
                }
            }
            m_SelectedRange = new Range(allStrings.toString());
        }
        m_SelectedRange.setUpper(format.numAttributes() - 1);

        // Keep only the selected attributes that really are strings (1-based list).
        StringBuilder selectedStrings = new StringBuilder();
        for (int attr = 0; attr < format.numAttributes(); attr++) {
            if (m_SelectedRange.isInRange(attr) && format.attribute(attr).type() == 2) {
                selectedStrings.append(attr + 1).append(",");
            }
        }
        m_SelectedRange.setRanges(selectedStrings.toString());
        m_SelectedRange.setUpper(format.numAttributes() - 1);
    }

    /**
     * Command-line entry point: runs a default-configured filter with the given
     * arguments.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        StringToWordVector filter = new StringToWordVector();
        runFilter(filter, strArr);
    }

    /**
     * Rescales the word values of a converted instance so that the Euclidean
     * length of its word vector equals the average document length.
     *
     * @param instance the (sparse) instance to normalize in place
     * @param i index of the first word attribute; entries below it are left untouched
     * @throws Exception if the average document length has not been computed yet
     */
    private void normalizeInstance(Instance instance, int i) throws Exception {
        double sumOfSquares = KStarConstants.FLOOR;
        if (this.m_AvgDocLength < KStarConstants.FLOOR) {
            throw new Exception("Average document length not set.");
        }
        // Euclidean length of the word part of the instance.
        for (int pos = 0; pos < instance.numValues(); pos++) {
            if (instance.index(pos) >= i) {
                sumOfSquares += instance.valueSparse(pos) * instance.valueSparse(pos);
            }
        }
        double docLength = Math.sqrt(sumOfSquares);
        int pos = 0;
        while (pos < instance.numValues()) {
            if (instance.index(pos) >= i) {
                // Remember the attribute index before writing: once the entry has been
                // dropped, index(pos) refers to the following entry or lies past the end.
                int attributeIndex = instance.index(pos);
                double scaled = (instance.valueSparse(pos) * this.m_AvgDocLength) / docLength;
                instance.setValueSparse(pos, scaled);
                if (scaled == KStarConstants.FLOOR) {
                    System.err.println("setting value " + attributeIndex + " to zero.");
                    // NOTE(review): stepping back assumes setValueSparse removes an entry
                    // whose new value is zero, so the next entry now sits at pos - confirm.
                    pos--;
                }
            }
            pos++;
        }
    }

    /**
     * Sorts the given array in place into ascending order.
     *
     * <p>The previous hand-written shell sort used 1-based loop bounds on a
     * 0-based array and therefore never moved the first element (for example
     * {@code {3, 1, 2}} was left unchanged); the standard library sort is used
     * instead.
     *
     * @param iArr the array to sort
     */
    private static void sortArray(int[] iArr) {
        Arrays.sort(iArr);
    }

    /**
     * Returns the tip text for the IDF transform option.
     *
     * @return the description of the option
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n   fij*log(num of Docs/num of Docs with word i) \n      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns the tip text for the TF transform option.
     *
     * @return the description of the option
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n    log(1+fij) \n       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns the tip text for the attribute indices option.
     *
     * @return the description of the option
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. This is a comma separated list of attribute indices, with \"first\" and \"last\" valid values. Specify an inclusive range with \"-\". E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Returns the tip text for the attribute name prefix option.
     *
     * @return the description of the option
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. (default: \"\")";
    }

    /**
     * Signals the end of a batch. On the first batch the dictionary is built, all
     * buffered instances are converted (and normalized if requested) and pushed
     * to the output queue; on later batches only the input is flushed.
     *
     * @return true if there are instances pending output
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if converting or normalizing an instance fails
     */
    @Override // weka.filters.Filter
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }

        if (!isFirstBatchDone()) {
            // Per-class dictionaries need a nominal class with at least one known value.
            Instances format = getInputFormat();
            if (format.classIndex() >= 0) {
                boolean classUnusable = !format.classAttribute().isNominal()
                        || format.attributeStats(format.classIndex()).missingCount == format.numInstances();
                if (classUnusable) {
                    m_doNotOperateOnPerClassBasis = true;
                }
            }

            determineDictionary();

            // Convert every buffered instance; firstWord is the index of the first word attribute.
            ArrayList<Instance> converted = new ArrayList<>();
            int firstWord = 0;
            for (int n = 0; n < m_NumInstances; n++) {
                firstWord = convertInstancewoDocNorm(getInputFormat().instance(n), converted);
            }

            // Any normalization mode needs the average document length of the first batch.
            if (m_filterType != FILTER_NONE) {
                m_AvgDocLength = KStarConstants.FLOOR;
                for (Instance doc : converted) {
                    double sumOfSquares = KStarConstants.FLOOR;
                    for (int pos = 0; pos < doc.numValues(); pos++) {
                        if (doc.index(pos) >= firstWord) {
                            sumOfSquares += doc.valueSparse(pos) * doc.valueSparse(pos);
                        }
                    }
                    m_AvgDocLength += Math.sqrt(sumOfSquares);
                }
                m_AvgDocLength /= m_NumInstances;
            }

            // Only the "normalize all" mode touches the first batch itself.
            if (m_filterType == FILTER_NORMALIZE_ALL) {
                for (Instance doc : converted) {
                    normalizeInstance(doc, firstWord);
                }
            }

            for (Instance doc : converted) {
                push(doc);
            }
        }

        flushInput();
        m_NewBatch = true;
        m_FirstBatchDone = true;
        return numPendingOutput() != 0;
    }

    /**
     * Returns the tip text for the "do not operate on per-class basis" option.
     *
     * @return the description of the option
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the minimum term frequency is not enforced on a per-class basis but based on the documents in all the classes (even if a class attribute is set).";
    }

    /**
     * Returns the currently selected attribute range as a range string.
     *
     * @return the range string of the selected range
     */
    public String getAttributeIndices() {
        return this.m_SelectedRange.getRanges();
    }

    /**
     * Returns the prefix used for the names of the generated word attributes.
     *
     * @return the attribute name prefix
     */
    public String getAttributeNamePrefix() {
        return this.m_Prefix;
    }

    /**
     * Returns the capabilities of this filter: starting from a clean slate, all
     * attribute and class types are enabled, as are missing values, missing
     * class values and data without a class.
     *
     * @return the capabilities of this filter
     */
    @Override // weka.filters.Filter, weka.core.CapabilitiesHandler
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();

        // attributes
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);

        // class
        result.enableAllClasses();
        result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        result.enable(Capabilities.Capability.NO_CLASS);

        return result;
    }

    /**
     * Returns whether the word limit and minimum term frequency are applied over
     * all documents instead of per class.
     *
     * @return true if the filter does not operate on a per-class basis
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return this.m_doNotOperateOnPerClassBasis;
    }

    /**
     * Returns whether the IDF transform is applied to the word values.
     *
     * @return true if the IDF transform is enabled
     */
    public boolean getIDFTransform() {
        return this.m_IDFTransform;
    }

    /**
     * Returns whether the attribute selection is inverted.
     *
     * @return the invert flag of the selected range
     */
    public boolean getInvertSelection() {
        return this.m_SelectedRange.getInvert();
    }

    /**
     * Returns whether tokens are converted to lower case.
     *
     * @return true if tokens are lower-cased
     */
    public boolean getLowerCaseTokens() {
        return this.m_lowerCaseTokens;
    }

    /**
     * Returns the minimum term frequency.
     *
     * @return the minimum term frequency
     */
    public int getMinTermFreq() {
        return this.m_minTermFreq;
    }

    /**
     * Returns the document-length normalization mode as a tag from
     * {@link #TAGS_FILTER}.
     *
     * @return the selected normalization tag
     */
    public SelectedTag getNormalizeDocLength() {
        return new SelectedTag(this.m_filterType, TAGS_FILTER);
    }

    /**
     * Returns the current configuration of the filter as a command-line option
     * array.
     *
     * @return the options
     */
    @Override // weka.core.OptionHandler
    public String[] getOptions() {
        ArrayList<String> opts = new ArrayList<>();

        // Attribute selection and naming.
        opts.add("-R");
        opts.add(getSelectedRange().getRanges());
        if (getInvertSelection()) {
            opts.add("-V");
        }
        String prefix = getAttributeNamePrefix();
        if (!"".equals(prefix)) {
            opts.add("-P");
            opts.add(prefix);
        }

        // Dictionary size and pruning.
        opts.add("-W");
        opts.add(String.valueOf(getWordsToKeep()));
        opts.add("-prune-rate");
        opts.add(String.valueOf(getPeriodicPruning()));

        // Word value options.
        if (getOutputWordCounts()) {
            opts.add("-C");
        }
        if (getTFTransform()) {
            opts.add("-T");
        }
        if (getIDFTransform()) {
            opts.add("-I");
        }
        opts.add("-N");
        opts.add(String.valueOf(m_filterType));

        // Token processing.
        if (getLowerCaseTokens()) {
            opts.add("-L");
        }
        if (getUseStoplist()) {
            opts.add("-S");
        }
        Stemmer stemmer = getStemmer();
        if (stemmer != null) {
            opts.add("-stemmer");
            String stemmerSpec = stemmer.getClass().getName();
            if (stemmer instanceof OptionHandler) {
                String[] stemmerOptions = ((OptionHandler) stemmer).getOptions();
                stemmerSpec = stemmerSpec + TestInstances.DEFAULT_SEPARATORS + Utils.joinOptions(stemmerOptions);
            }
            opts.add(stemmerSpec.trim());
        }

        opts.add("-M");
        opts.add(String.valueOf(getMinTermFreq()));
        if (getDoNotOperateOnPerClassBasis()) {
            opts.add("-O");
        }

        // A directory stands for "no stopword file" and is therefore not reported.
        File stopwordFile = getStopwords();
        if (!stopwordFile.isDirectory()) {
            opts.add("-stopwords");
            opts.add(stopwordFile.getAbsolutePath());
        }

        opts.add("-tokenizer");
        Tokenizer tokenizer = getTokenizer();
        String tokenizerSpec = tokenizer.getClass().getName();
        if (tokenizer instanceof OptionHandler) {
            tokenizerSpec = tokenizerSpec + TestInstances.DEFAULT_SEPARATORS + Utils.joinOptions(tokenizer.getOptions());
        }
        opts.add(tokenizerSpec.trim());

        return opts.toArray(new String[opts.size()]);
    }

    /**
     * Returns whether word counts are output instead of 0/1 presence values.
     *
     * @return true if word counts are output
     */
    public boolean getOutputWordCounts() {
        return this.m_OutputCounts;
    }

    /**
     * Returns the periodic pruning rate (a percentage of the dataset).
     *
     * @return the pruning rate
     */
    public double getPeriodicPruning() {
        return this.m_PeriodicPruningRate;
    }

    /**
     * Returns the revision string extracted by {@code RevisionUtils}.
     *
     * @return the revision
     */
    @Override // weka.filters.Filter, weka.core.RevisionHandler
    public String getRevision() {
        return RevisionUtils.extract("$Revision: 10215 $");
    }

    /**
     * Returns the range of attributes the filter operates on.
     *
     * @return the selected range object (not a copy)
     */
    public Range getSelectedRange() {
        return this.m_SelectedRange;
    }

    /**
     * Returns the stemmer applied to the tokens.
     *
     * @return the stemmer
     */
    public Stemmer getStemmer() {
        return this.m_Stemmer;
    }

    /**
     * Returns the stopword file; a directory stands for "no custom file".
     *
     * @return the stopword file
     */
    public File getStopwords() {
        return this.m_Stopwords;
    }

    /**
     * Returns whether the TF transform is applied to the word values.
     *
     * @return true if the TF transform is enabled
     */
    public boolean getTFTransform() {
        return this.m_TFTransform;
    }

    /**
     * Returns the tokenizer used to split the text into tokens.
     *
     * @return the tokenizer
     */
    public Tokenizer getTokenizer() {
        return this.m_Tokenizer;
    }

    /**
     * Returns whether words on the stoplist are ignored.
     *
     * @return true if the stoplist is used
     */
    public boolean getUseStoplist() {
        return this.m_useStoplist;
    }

    /**
     * Returns the number of words the filter attempts to keep.
     *
     * @return the number of words to keep
     */
    public int getWordsToKeep() {
        return this.m_WordsToKeep;
    }

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Converts String attributes into a set of attributes representing word occurrence (depending on the tokenizer) information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data).";
    }

    /**
     * Feeds one instance to the filter. Until the first batch has been finished
     * the instance is only buffered; afterwards it is converted with the existing
     * dictionary, normalized if a normalization mode is set, and queued for
     * output.
     *
     * @param instance the input instance
     * @return true if the converted instance is available for collection
     * @throws IllegalStateException if no input format has been defined
     * @throws Exception if converting or normalizing the instance fails
     */
    @Override // weka.filters.Filter
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        if (isFirstBatchDone()) {
            // Dictionary already exists: convert right away.
            ArrayList<Instance> holder = new ArrayList<>();
            int firstWord = convertInstancewoDocNorm(instance, holder);
            Instance converted = holder.get(0);
            if (m_filterType != FILTER_NONE) {
                normalizeInstance(converted, firstWord);
            }
            push(converted);
            return true;
        }

        // First batch: keep the instance until batchFinished() builds the dictionary.
        bufferInput(instance);
        return false;
    }

    /**
     * Returns the tip text for the invert selection option.
     *
     * @return the description of the option
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. If false, only selected attributes in the range will be worked on; if true, only non-selected attributes will be processed.";
    }

    /**
     * Returns an enumeration describing the command-line options of this filter.
     *
     * @return the available options
     */
    @Override // weka.core.OptionHandler
    public Enumeration<Option> listOptions() {
        Vector<Option> options = new Vector<>();

        // Word value output.
        options.add(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));

        // Attribute selection and naming.
        options.add(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        options.add(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        options.add(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));

        // Dictionary size and pruning.
        options.add(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        options.add(new Option("\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));

        // Transforms and normalization.
        options.add(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        options.add(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        options.add(new Option("\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n\tto average length of training documents (default 0=don't normalize).", "N", 1, "-N"));

        // Token processing.
        options.add(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        options.add(new Option("\tIgnore words that are in the stoplist.", "S", 0, "-S"));
        options.add(new Option("\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        options.add(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        options.add(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        options.add(new Option("\tA file containing stopwords to override the default ones.\n\tUsing this option automatically sets the flag ('-S') to use the\n\tstoplist if the file exists.\n\tFormat: one stopword per line, lines starting with '#'\n\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>"));
        options.add(new Option("\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));

        return options.elements();
    }

    /**
     * Returns the tip text for the lower-case tokens option.
     *
     * @return the description of the option
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case before being added to the dictionary.";
    }

    /**
     * Returns the tip text for the minimum term frequency option.
     *
     * @return the description of the option
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. This is enforced on a per-class basis.";
    }

    /**
     * Returns the tip text for the document-length normalization option.
     *
     * @return the description of the option
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) should be normalized or not.";
    }

    /**
     * Returns the tip text for the output word counts option.
     *
     * @return the description of the option
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1(indicating presence or absence of a word).";
    }

    /**
     * Returns the tip text for the periodic pruning option.
     *
     * @return the description of the option
     */
    public String periodicPruningTipText() {
        return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. wordsToKeep prunes after creating a full dictionary. You may not have enough memory for this approach.";
    }

    /**
     * Sets the attributes to operate on from a range string; the existing range
     * object is updated in place.
     *
     * @param str the range string
     */
    public void setAttributeIndices(String str) {
        this.m_SelectedRange.setRanges(str);
    }

    /**
     * Sets the attributes to operate on from an array of attribute indices.
     *
     * @param iArr the attribute indices, converted with {@code Range.indicesToRangeList}
     */
    public void setAttributeIndicesArray(int[] iArr) {
        setAttributeIndices(Range.indicesToRangeList(iArr));
    }

    /**
     * Sets the prefix used for the names of the generated word attributes.
     *
     * @param str the attribute name prefix
     */
    public void setAttributeNamePrefix(String str) {
        this.m_Prefix = str;
    }

    /**
     * Sets whether the word limit and minimum term frequency are applied over all
     * documents instead of per class.
     *
     * @param z true to not operate on a per-class basis
     */
    public void setDoNotOperateOnPerClassBasis(boolean z) {
        this.m_doNotOperateOnPerClassBasis = z;
    }

    /**
     * Sets whether the IDF transform is applied to the word values.
     *
     * @param z true to enable the IDF transform
     */
    public void setIDFTransform(boolean z) {
        this.m_IDFTransform = z;
    }

    /**
     * Sets the format of the input instances and forgets everything learned from
     * a previous first batch.
     *
     * @param instances the input format
     * @return always false: the output format is not known before the first batch is finished
     * @throws Exception if the superclass rejects the input format
     */
    @Override // weka.filters.Filter
    public boolean setInputFormat(Instances instances) throws Exception {
        super.setInputFormat(instances);

        int lastAttribute = instances.numAttributes() - 1;
        m_SelectedRange.setUpper(lastAttribute);

        // Both values are recomputed while the first batch is processed.
        m_NumInstances = -1;
        m_AvgDocLength = -1.0d;
        return false;
    }

    /**
     * Sets whether the attribute selection is inverted.
     *
     * @param z the new invert flag of the selected range
     */
    public void setInvertSelection(boolean z) {
        this.m_SelectedRange.setInvert(z);
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param z true to lower-case tokens
     */
    public void setLowerCaseTokens(boolean z) {
        this.m_lowerCaseTokens = z;
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param i the minimum term frequency
     */
    public void setMinTermFreq(int i) {
        this.m_minTermFreq = i;
    }

    /**
     * Sets the document-length normalization mode. Tags that do not belong to
     * {@link #TAGS_FILTER} are ignored.
     *
     * @param selectedTag the tag selecting one of the FILTER_* modes
     */
    public void setNormalizeDocLength(SelectedTag selectedTag) {
        if (selectedTag.getTags() != TAGS_FILTER) {
            return; // tag from a different tag set: leave the current mode alone
        }
        m_filterType = selectedTag.getSelectedTag().getID();
    }

    /**
     * Configures the filter from a command-line option array. Options that are
     * absent are reset to their defaults.
     *
     * @param strArr the options to parse
     * @throws Exception if an option value is invalid, a stemmer or tokenizer
     *         class cannot be instantiated, or unrecognised options remain
     */
    @Override // weka.core.OptionHandler
    public void setOptions(String[] strArr) throws Exception {
        // Attribute selection and naming.
        String rangeSpec = Utils.getOption('R', strArr);
        setSelectedRange(rangeSpec.length() != 0 ? rangeSpec : "first-last");
        setInvertSelection(Utils.getFlag('V', strArr));

        String prefix = Utils.getOption('P', strArr);
        setAttributeNamePrefix(prefix.length() != 0 ? prefix : "");

        // Dictionary size and pruning.
        String wordsToKeep = Utils.getOption('W', strArr);
        setWordsToKeep(wordsToKeep.length() != 0 ? Integer.parseInt(wordsToKeep) : 1000);

        String pruneRate = Utils.getOption("prune-rate", strArr);
        setPeriodicPruning(pruneRate.length() > 0 ? Double.parseDouble(pruneRate) : -1.0d);

        String minTermFreq = Utils.getOption('M', strArr);
        setMinTermFreq(minTermFreq.length() != 0 ? Integer.parseInt(minTermFreq) : 1);

        // Word value flags.
        setOutputWordCounts(Utils.getFlag('C', strArr));
        setTFTransform(Utils.getFlag('T', strArr));
        setIDFTransform(Utils.getFlag('I', strArr));
        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', strArr));

        String normalization = Utils.getOption('N', strArr);
        int normalizationMode = normalization.length() != 0 ? Integer.parseInt(normalization) : 0;
        setNormalizeDocLength(new SelectedTag(normalizationMode, TAGS_FILTER));

        // Token processing.
        setLowerCaseTokens(Utils.getFlag('L', strArr));
        setUseStoplist(Utils.getFlag('S', strArr));

        String stemmerSpec = Utils.getOption("stemmer", strArr);
        if (stemmerSpec.length() == 0) {
            setStemmer(null);
        } else {
            String[] stemmerArgs = Utils.splitOptions(stemmerSpec);
            if (stemmerArgs.length == 0) {
                throw new Exception("Invalid stemmer specification string");
            }
            // First element is the class name; the rest are the stemmer's own options.
            String stemmerClass = stemmerArgs[0];
            stemmerArgs[0] = "";
            Stemmer stemmer = (Stemmer) Class.forName(stemmerClass).newInstance();
            if (stemmer instanceof OptionHandler) {
                ((OptionHandler) stemmer).setOptions(stemmerArgs);
            }
            setStemmer(stemmer);
        }

        String stopwordsPath = Utils.getOption("stopwords", strArr);
        setStopwords(stopwordsPath.length() != 0 ? new File(stopwordsPath) : null);

        String tokenizerSpec = Utils.getOption("tokenizer", strArr);
        if (tokenizerSpec.length() == 0) {
            setTokenizer(new WordTokenizer());
        } else {
            String[] tokenizerArgs = Utils.splitOptions(tokenizerSpec);
            if (tokenizerArgs.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            // Same layout as the stemmer specification.
            String tokenizerClass = tokenizerArgs[0];
            tokenizerArgs[0] = "";
            Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerClass).newInstance();
            if (tokenizer instanceof OptionHandler) {
                tokenizer.setOptions(tokenizerArgs);
            }
            setTokenizer(tokenizer);
        }

        Utils.checkForRemainingOptions(strArr);
    }

    /**
     * Sets whether word counts are output instead of 0/1 presence values.
     *
     * @param z true to output word counts
     */
    public void setOutputWordCounts(boolean z) {
        this.m_OutputCounts = z;
    }

    /**
     * Sets the periodic pruning rate (a percentage of the dataset).
     *
     * @param d the pruning rate
     */
    public void setPeriodicPruning(double d) {
        this.m_PeriodicPruningRate = d;
    }

    /**
     * Replaces the selected range with a new range built from the given string.
     *
     * @param str the range string
     */
    public void setSelectedRange(String str) {
        this.m_SelectedRange = new Range(str);
    }

    /**
     * Sets the stemmer applied to the tokens.
     *
     * @param stemmer the stemmer to use; null selects a {@link NullStemmer}
     */
    public void setStemmer(Stemmer stemmer) {
        m_Stemmer = (stemmer == null) ? new NullStemmer() : stemmer;
    }

    /**
     * Sets the stopword file. If the file exists and is a regular file, use of
     * the stoplist is switched on as well.
     *
     * @param file the stopword file; null selects the current working directory,
     *        which stands for "no custom file"
     */
    public void setStopwords(File file) {
        File target = (file != null) ? file : new File(System.getProperty("user.dir"));
        m_Stopwords = target;
        boolean usableFile = target.exists() && target.isFile();
        if (usableFile) {
            setUseStoplist(true);
        }
    }

    /**
     * Sets whether the TF transform is applied to the word values.
     *
     * @param z true to enable the TF transform
     */
    public void setTFTransform(boolean z) {
        this.m_TFTransform = z;
    }

    /**
     * Sets the tokenizer used to split the text into tokens.
     *
     * @param tokenizer the tokenizer to use (stored as given, without a null check)
     */
    public void setTokenizer(Tokenizer tokenizer) {
        this.m_Tokenizer = tokenizer;
    }

    /**
     * Sets whether words on the stoplist are ignored.
     *
     * @param z true to use the stoplist
     */
    public void setUseStoplist(boolean z) {
        this.m_useStoplist = z;
    }

    /**
     * Sets the number of words the filter attempts to keep.
     *
     * @param i the number of words to keep
     */
    public void setWordsToKeep(int i) {
        this.m_WordsToKeep = i;
    }

    /**
     * Returns the tip text for the stemmer option.
     *
     * @return the description of the option
     */
    public String stemmerTipText() {
        return "The stemming algorithm to use on the words.";
    }

    /**
     * Returns the tip text for the stopwords file option.
     *
     * @return the description of the option
     */
    public String stopwordsTipText() {
        return "The file containing the stopwords (if this is a directory then the default ones are used).";
    }

    /**
     * Returns the tip text for the tokenizer option.
     *
     * @return the description of the option
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Returns the tip text for the use-stoplist option.
     *
     * @return the description of the option
     */
    public String useStoplistTipText() {
        return "Ignores all the words that are on the stoplist, if set to true.";
    }

    /**
     * Returns the tip text for the words-to-keep option.
     *
     * @return the description of the option
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute assigned) to attempt to keep.";
    }
}
