/*
 * Decompiled with CFR 0.152.
 */
package weka.filters.unsupervised.attribute;

import java.io.File;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.SelectedTag;
import weka.core.SparseInstance;
import weka.core.Stopwords;
import weka.core.Tag;
import weka.core.Utils;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;

public class StringToWordVector
extends Filter
implements UnsupervisedFilter,
OptionHandler {
    /** Serialization version identifier. */
    static final long serialVersionUID = 8249106275278565424L;
    /** Range of attribute indices the filter operates on (defaults to all attributes). */
    protected Range m_SelectedRange = new Range("first-last");
    /** Maps each kept word to the (Integer) index of its attribute in the output format. */
    private TreeMap m_Dictionary = new TreeMap();
    /** If true, word counts are output instead of 0/1 presence values. */
    private boolean m_OutputCounts = false;
    /** Prefix prepended to the name of every created word attribute. */
    private String m_Prefix = "";
    /** For each output attribute index, the number of first-batch documents containing that word. */
    private int[] m_DocsCounts;
    /** Number of instances in the first batch; -1 until the dictionary has been built. */
    private int m_NumInstances = -1;
    /** Average Euclidean length of the first-batch word vectors; -1 until computed. */
    private double m_AvgDocLength = -1.0;
    /** Approximate number of words to keep (per dictionary). */
    private int m_WordsToKeep = 1000;
    /** Periodic pruning rate as a percentage of the dataset; values that round to <= 0 disable pruning. */
    private double m_PeriodicPruningRate = -1.0;
    /** If true, word frequencies are transformed into log(1 + fij). */
    private boolean m_TFTransform;
    /** Selected document-length normalization mode (one of the FILTER_* constants). */
    protected int m_filterType = 0;
    /** Normalization mode: no normalization. */
    public static final int FILTER_NONE = 0;
    /** Normalization mode: normalize all data. */
    public static final int FILTER_NORMALIZE_ALL = 1;
    /** Normalization mode: normalize test data only. */
    public static final int FILTER_NORMALIZE_TEST_ONLY = 2;
    /** Tags describing the available normalization modes. */
    public static final Tag[] TAGS_FILTER = new Tag[]{new Tag(0, "No normalization"), new Tag(1, "Normalize all data"), new Tag(2, "Normalize test data only")};
    /** If true, word frequencies are multiplied by log(numDocs / numDocsWithWord). */
    private boolean m_IDFTransform;
    /** If true, tokens are lower-cased before dictionary lookup/insertion. */
    private boolean m_lowerCaseTokens;
    /** If true, words found on the stoplist are ignored when building the dictionary. */
    private boolean m_useStoplist;
    /** Stemmer applied to every token (a NullStemmer by default). */
    private Stemmer m_Stemmer = new NullStemmer();
    /** Minimum term frequency a word needs in order to be kept. */
    private int m_minTermFreq = 1;
    /** If true, word limits are applied over all documents rather than per class value. */
    private boolean m_doNotOperateOnPerClassBasis = false;
    /** Stopword file; when it points at a directory no custom stopword file is read. */
    private File m_Stopwords = new File(System.getProperty("user.dir"));
    /** Tokenizer used to split string values into words. */
    private Tokenizer m_Tokenizer = new WordTokenizer();

    /**
     * Creates a filter that uses the default value for every setting.
     */
    public StringToWordVector() {
        super();
    }

    /**
     * Describes every command-line option understood by this filter.
     *
     * @return an enumeration of {@link Option} objects, one per supported option
     */
    public Enumeration listOptions() {
        Vector<Option> options = new Vector<Option>();

        // Output and attribute selection.
        options.add(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));
        options.add(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        options.add(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        options.add(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));

        // Dictionary size control.
        options.add(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        options.add(new Option("\tSpecify the rate (e.g., every 10% of the input dataset) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <rate as a percentage of dataset>"));

        // Value transformations.
        options.add(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        options.add(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        options.add(new Option("\tWhether to 0=not normalize/1=normalize all data/2=normalize test data only\n\tto average length of training documents (default 0=don't normalize).", "N", 1, "-N"));

        // Token processing.
        options.add(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        options.add(new Option("\tIgnore words that are in the stoplist.", "S", 0, "-S"));
        options.add(new Option("\tThe stemmering algorihtm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        options.add(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        options.add(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        options.add(new Option("\tA file containing stopwords to override the default ones.\n\tUsing this option automatically sets the flag ('-S') to use the\n\tstoplist if the file exists.\n\tFormat: one stopword per line, lines starting with '#'\n\tare interpreted as comments and ignored.", "stopwords", 1, "-stopwords <file>"));
        options.add(new Option("\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));

        return options.elements();
    }

    /**
     * Parses the given option array and configures the filter accordingly.
     * Options that are absent reset the corresponding setting to its default.
     *
     * <p>The options are consumed in a fixed order; note that {@code -stopwords}
     * is handled after {@code -S}, so a stopword file that exists switches the
     * stoplist on even when {@code -S} was not given (see {@link #setStopwords(File)}).
     *
     * @param stringArray the options to parse
     * @throws Exception if a numeric option cannot be parsed, if a stemmer or
     *         tokenizer specification is empty, or if the named class cannot be
     *         loaded/instantiated or rejects its own options
     */
    public void setOptions(String[] stringArray) throws Exception {
        String value = Utils.getOption('R', stringArray);
        if (value.length() != 0) {
            this.setSelectedRange(value);
        } else {
            this.setSelectedRange("first-last");
        }
        this.setInvertSelection(Utils.getFlag('V', stringArray));

        value = Utils.getOption('P', stringArray);
        if (value.length() != 0) {
            this.setAttributeNamePrefix(value);
        } else {
            this.setAttributeNamePrefix("");
        }

        value = Utils.getOption('W', stringArray);
        if (value.length() != 0) {
            this.setWordsToKeep(Integer.parseInt(value));
        } else {
            this.setWordsToKeep(1000);
        }

        value = Utils.getOption("prune-rate", stringArray);
        if (value.length() > 0) {
            this.setPeriodicPruning(Double.parseDouble(value));
        } else {
            this.setPeriodicPruning(-1.0);
        }

        value = Utils.getOption('M', stringArray);
        if (value.length() != 0) {
            this.setMinTermFreq(Integer.parseInt(value));
        } else {
            this.setMinTermFreq(1);
        }

        this.setOutputWordCounts(Utils.getFlag('C', stringArray));
        this.setTFTransform(Utils.getFlag('T', stringArray));
        this.setIDFTransform(Utils.getFlag('I', stringArray));
        this.setDoNotOperateOnPerClassBasis(Utils.getFlag('O', stringArray));

        String normalization = Utils.getOption('N', stringArray);
        if (normalization.length() != 0) {
            this.setNormalizeDocLength(new SelectedTag(Integer.parseInt(normalization), TAGS_FILTER));
        } else {
            this.setNormalizeDocLength(new SelectedTag(FILTER_NONE, TAGS_FILTER));
        }

        this.setLowerCaseTokens(Utils.getFlag('L', stringArray));
        this.setUseStoplist(Utils.getFlag('S', stringArray));

        // Stemmer: "<classname> [options...]"; an absent option selects the default stemmer.
        String stemmerSpec = Utils.getOption("stemmer", stringArray);
        if (stemmerSpec.length() == 0) {
            this.setStemmer(null);
        } else {
            String[] stemmerOptions = Utils.splitOptions(stemmerSpec);
            if (stemmerOptions.length == 0) {
                throw new Exception("Invalid stemmer specification string");
            }
            String stemmerClassName = stemmerOptions[0];
            // Blank out the class name so only the stemmer's own options remain.
            stemmerOptions[0] = "";
            Stemmer stemmer = (Stemmer) Class.forName(stemmerClassName).newInstance();
            if (stemmer instanceof OptionHandler) {
                ((OptionHandler) stemmer).setOptions(stemmerOptions);
            }
            this.setStemmer(stemmer);
        }

        value = Utils.getOption("stopwords", stringArray);
        if (value.length() != 0) {
            this.setStopwords(new File(value));
        } else {
            this.setStopwords(null);
        }

        // Tokenizer: "<classname> [options...]"; an absent option selects WordTokenizer.
        String tokenizerSpec = Utils.getOption("tokenizer", stringArray);
        if (tokenizerSpec.length() == 0) {
            this.setTokenizer(new WordTokenizer());
        } else {
            String[] tokenizerOptions = Utils.splitOptions(tokenizerSpec);
            if (tokenizerOptions.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            String tokenizerClassName = tokenizerOptions[0];
            tokenizerOptions[0] = "";
            Tokenizer tokenizer = (Tokenizer) Class.forName(tokenizerClassName).newInstance();
            if (tokenizer instanceof OptionHandler) {
                ((OptionHandler) tokenizer).setOptions(tokenizerOptions);
            }
            this.setTokenizer(tokenizer);
        }
    }

    /**
     * Returns the current configuration of the filter as an option array.
     *
     * @return the option strings, in the form accepted by {@code setOptions}
     */
    public String[] getOptions() {
        Vector<String> result = new Vector<String>();

        result.add("-R");
        result.add(getSelectedRange().getRanges());
        if (getInvertSelection()) {
            result.add("-V");
        }

        String prefix = getAttributeNamePrefix();
        if (!"".equals(prefix)) {
            result.add("-P");
            result.add(prefix);
        }

        result.add("-W");
        result.add(String.valueOf(getWordsToKeep()));
        result.add("-prune-rate");
        result.add(String.valueOf(getPeriodicPruning()));

        if (getOutputWordCounts()) {
            result.add("-C");
        }
        if (getTFTransform()) {
            result.add("-T");
        }
        if (getIDFTransform()) {
            result.add("-I");
        }

        result.add("-N");
        result.add(String.valueOf(m_filterType));

        if (getLowerCaseTokens()) {
            result.add("-L");
        }
        if (getUseStoplist()) {
            result.add("-S");
        }

        Stemmer stemmer = getStemmer();
        if (stemmer != null) {
            result.add("-stemmer");
            String stemmerSpec = stemmer.getClass().getName();
            if (stemmer instanceof OptionHandler) {
                String[] stemmerOptions = ((OptionHandler) ((Object) stemmer)).getOptions();
                stemmerSpec = stemmerSpec + " " + Utils.joinOptions(stemmerOptions);
            }
            result.add(stemmerSpec.trim());
        }

        result.add("-M");
        result.add(String.valueOf(getMinTermFreq()));
        if (getDoNotOperateOnPerClassBasis()) {
            result.add("-O");
        }

        // A directory means "no custom stopword file", so nothing is emitted for it.
        if (!getStopwords().isDirectory()) {
            result.add("-stopwords");
            result.add(getStopwords().getAbsolutePath());
        }

        result.add("-tokenizer");
        String tokenizerSpec = getTokenizer().getClass().getName();
        if (getTokenizer() instanceof OptionHandler) {
            tokenizerSpec = tokenizerSpec + " " + Utils.joinOptions(getTokenizer().getOptions());
        }
        result.add(tokenizerSpec.trim());

        return result.toArray(new String[result.size()]);
    }

    /**
     * Creates a filter with the given number of words to keep; every other
     * setting keeps its default value.
     *
     * @param n the approximate number of words to keep
     */
    public StringToWordVector(int n) {
        m_WordsToKeep = n;
    }

    /**
     * Returns the capabilities of this filter: all attribute and class types
     * are enabled, as are missing values, missing class values and data
     * without a class.
     *
     * @return the capabilities of this filter
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        // Start from a clean slate before switching individual capabilities on.
        result.disableAll();

        // Attributes.
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);

        // Class.
        result.enableAllClasses();
        result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
        result.enable(Capabilities.Capability.NO_CLASS);

        return result;
    }

    /**
     * Sets the format of the input instances and resets the statistics that
     * are gathered from the first batch.
     *
     * @param instances the structure of the incoming data
     * @return always {@code false}: the output format is only known once the
     *         dictionary has been built from the first batch
     * @throws Exception if the superclass rejects the input format
     */
    public boolean setInputFormat(Instances instances) throws Exception {
        super.setInputFormat(instances);

        int lastIndex = instances.numAttributes() - 1;
        m_SelectedRange.setUpper(lastIndex);

        // Forget anything learned from a previous first batch.
        m_NumInstances = -1;
        m_AvgDocLength = -1.0;
        return false;
    }

    /**
     * Feeds one instance into the filter. During the first batch the instance
     * is only buffered; afterwards it is converted immediately and queued for
     * output.
     *
     * @param instance the instance to process
     * @return {@code true} if a converted instance was queued for output,
     *         {@code false} if the instance was merely buffered
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if normalization of the converted instance fails
     */
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }

        if (!isFirstBatchDone()) {
            // The dictionary is not known yet: collect until batchFinished().
            bufferInput(instance);
            return false;
        }

        FastVector converted = new FastVector();
        int firstWordIndex = convertInstancewoDocNorm(instance, converted);
        Instance result = (Instance) converted.elementAt(0);
        if (m_filterType != FILTER_NONE) {
            normalizeInstance(result, firstWordIndex);
        }
        push(result);
        return true;
    }

    /**
     * Signals the end of a batch. For the first batch this builds the
     * dictionary, converts all buffered instances, computes the average
     * document length when normalization is enabled, optionally normalizes the
     * converted instances, and queues them for output.
     *
     * @return {@code true} if there are instances pending output
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if normalizing an instance fails
     */
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }

        if (!isFirstBatchDone()) {
            determineDictionary();

            // Convert every buffered instance; normalization is applied afterwards.
            FastVector converted = new FastVector();
            int firstWordIndex = 0;
            for (int i = 0; i < m_NumInstances; i++) {
                firstWordIndex = convertInstancewoDocNorm(getInputFormat().instance(i), converted);
            }

            if (m_filterType != FILTER_NONE) {
                // Average Euclidean length over the word attributes only.
                m_AvgDocLength = 0.0;
                for (int i = 0; i < converted.size(); i++) {
                    Instance doc = (Instance) converted.elementAt(i);
                    double sumOfSquares = 0.0;
                    for (int k = 0; k < doc.numValues(); k++) {
                        if (doc.index(k) >= firstWordIndex) {
                            sumOfSquares += doc.valueSparse(k) * doc.valueSparse(k);
                        }
                    }
                    m_AvgDocLength += Math.sqrt(sumOfSquares);
                }
                m_AvgDocLength /= (double) m_NumInstances;
            }

            if (m_filterType == FILTER_NORMALIZE_ALL) {
                for (int i = 0; i < converted.size(); i++) {
                    normalizeInstance((Instance) converted.elementAt(i), firstWordIndex);
                }
            }

            for (int i = 0; i < converted.size(); i++) {
                push((Instance) converted.elementAt(i));
            }
        }

        flushInput();
        m_NewBatch = true;
        m_FirstBatchDone = true;
        return numPendingOutput() != 0;
    }

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        return "Converts String attributes into a set of attributes representing word occurrence "
            + "(depending on the tokenizer) information from the text contained in the strings. "
            + "The set of words (attributes) is determined by the first batch filtered "
            + "(typically training data).";
    }

    /**
     * Tells whether word counts are output instead of 0/1 presence values.
     *
     * @return {@code true} if word counts are output
     */
    public boolean getOutputWordCounts() {
        return m_OutputCounts;
    }

    /**
     * Sets whether word counts are output instead of 0/1 presence values.
     *
     * @param bl {@code true} to output word counts
     */
    public void setOutputWordCounts(boolean bl) {
        m_OutputCounts = bl;
    }

    /**
     * Returns the tip text for the outputWordCounts property.
     *
     * @return the tip text
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1"
            + "(indicating presence or absence of a word).";
    }

    /**
     * Returns the range of attributes the filter operates on.
     *
     * @return the selected range object (not a copy)
     */
    public Range getSelectedRange() {
        return m_SelectedRange;
    }

    /**
     * Replaces the selected range with a new {@link Range} built from the
     * given range string.
     *
     * @param string the range specification, e.g. {@code "first-last"}
     */
    public void setSelectedRange(String string) {
        Range replacement = new Range(string);
        m_SelectedRange = replacement;
    }

    /**
     * Returns the tip text for the attributeIndices property.
     *
     * @return the tip text
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. "
            + "This is a comma separated list of attribute indices, with "
            + "\"first\" and \"last\" valid values. "
            + "Specify an inclusive range with \"-\". "
            + "E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Returns the selected attribute range as a range string.
     *
     * @return the range string of the currently selected range
     */
    public String getAttributeIndices() {
        return m_SelectedRange.getRanges();
    }

    /**
     * Updates the existing selected range with the given range string.
     *
     * @param string the range specification to apply
     */
    public void setAttributeIndices(String string) {
        m_SelectedRange.setRanges(string);
    }

    /**
     * Sets the selected attributes from an array of indices by converting the
     * array to a range list and delegating to {@code setAttributeIndices}.
     *
     * @param nArray the attribute indices to select
     */
    public void setAttributeIndicesArray(int[] nArray) {
        String rangeList = Range.indicesToRangeList(nArray);
        setAttributeIndices(rangeList);
    }

    /**
     * Returns the tip text for the invertSelection property.
     *
     * @return the tip text
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. "
            + "If false, only selected attributes in the range will be worked on; "
            + "if true, only non-selected attributes will be processed.";
    }

    /**
     * Tells whether the selected range is inverted.
     *
     * @return the invert flag of the selected range
     */
    public boolean getInvertSelection() {
        return m_SelectedRange.getInvert();
    }

    /**
     * Sets the invert flag of the selected range.
     *
     * @param bl {@code true} to invert the selection
     */
    public void setInvertSelection(boolean bl) {
        m_SelectedRange.setInvert(bl);
    }

    /**
     * Returns the prefix used for the names of the created word attributes.
     *
     * @return the attribute name prefix
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * Sets the prefix used for the names of the created word attributes.
     *
     * @param string the new prefix
     */
    public void setAttributeNamePrefix(String string) {
        m_Prefix = string;
    }

    /**
     * Returns the tip text for the attributeNamePrefix property.
     *
     * @return the tip text
     */
    public String attributeNamePrefixTipText() {
        return "Prefix for the created attribute names. "
            + "(default: \"\")";
    }

    /**
     * Returns the approximate number of words to keep.
     *
     * @return the number of words to keep
     */
    public int getWordsToKeep() {
        return m_WordsToKeep;
    }

    /**
     * Sets the approximate number of words to keep.
     *
     * @param n the number of words to keep
     */
    public void setWordsToKeep(int n) {
        m_WordsToKeep = n;
    }

    /**
     * Returns the tip text for the wordsToKeep property.
     *
     * @return the tip text
     */
    public String wordsToKeepTipText() {
        return "The number of words (per class if there is a class attribute "
            + "assigned) to attempt to keep.";
    }

    /**
     * Returns the periodic pruning rate.
     *
     * @return the pruning rate as a percentage of the dataset
     */
    public double getPeriodicPruning() {
        return m_PeriodicPruningRate;
    }

    /**
     * Sets the periodic pruning rate.
     *
     * @param d the pruning rate as a percentage of the dataset
     */
    public void setPeriodicPruning(double d) {
        m_PeriodicPruningRate = d;
    }

    /**
     * Returns the tip text for the periodicPruning property.
     *
     * @return the tip text
     */
    public String periodicPruningTipText() {
        return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. "
            + "wordsToKeep prunes after creating a full dictionary. "
            + "You may not have enough memory for this approach.";
    }

    /**
     * Tells whether the log(1 + fij) term-frequency transform is applied.
     *
     * @return {@code true} if the TF transform is enabled
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * Sets whether the log(1 + fij) term-frequency transform is applied.
     *
     * @param bl {@code true} to enable the TF transform
     */
    public void setTFTransform(boolean bl) {
        m_TFTransform = bl;
    }

    /**
     * Returns the tip text for the TFTransform property.
     *
     * @return the tip text
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n"
            + "    log(1+fij) \n"
            + "       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Tells whether the inverse-document-frequency transform is applied.
     *
     * @return {@code true} if the IDF transform is enabled
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * Sets whether the inverse-document-frequency transform is applied.
     *
     * @param bl {@code true} to enable the IDF transform
     */
    public void setIDFTransform(boolean bl) {
        m_IDFTransform = bl;
    }

    /**
     * Returns the tip text for the IDFTransform property.
     *
     * @return the tip text
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n"
            + "   fij*log(num of Docs/num of Docs with word i) \n"
            + "      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns the current document-length normalization mode wrapped in a
     * {@link SelectedTag} over {@code TAGS_FILTER}.
     *
     * @return the selected normalization mode
     */
    public SelectedTag getNormalizeDocLength() {
        SelectedTag selected = new SelectedTag(m_filterType, TAGS_FILTER);
        return selected;
    }

    /**
     * Sets the document-length normalization mode. The call is ignored unless
     * the tag was created over this class's {@code TAGS_FILTER} array.
     *
     * @param selectedTag the normalization mode to select
     */
    public void setNormalizeDocLength(SelectedTag selectedTag) {
        // Identity comparison on purpose: only tags drawn from TAGS_FILTER are accepted.
        if (selectedTag.getTags() != TAGS_FILTER) {
            return;
        }
        m_filterType = selectedTag.getSelectedTag().getID();
    }

    /**
     * Returns the tip text for the normalizeDocLength property.
     *
     * @return the tip text
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) "
            + "should be normalized or not.";
    }

    /**
     * Tells whether tokens are converted to lower case.
     *
     * @return {@code true} if tokens are lower-cased
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param bl {@code true} to lower-case tokens
     */
    public void setLowerCaseTokens(boolean bl) {
        m_lowerCaseTokens = bl;
    }

    /**
     * Returns the tip text for the doNotOperateOnPerClassBasis property.
     *
     * @return the tip text
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the "
            + "minimum term frequency is not enforced on a per-class "
            + "basis but based on the documents in all the classes "
            + "(even if a class attribute is set).";
    }

    /**
     * Tells whether word limits are applied over all documents instead of
     * per class value.
     *
     * @return {@code true} if per-class operation is disabled
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return m_doNotOperateOnPerClassBasis;
    }

    /**
     * Sets whether word limits are applied over all documents instead of
     * per class value.
     *
     * @param bl {@code true} to disable per-class operation
     */
    public void setDoNotOperateOnPerClassBasis(boolean bl) {
        m_doNotOperateOnPerClassBasis = bl;
    }

    /**
     * Returns the tip text for the minTermFreq property.
     *
     * @return the tip text
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. "
            + "This is enforced on a per-class basis.";
    }

    /**
     * Returns the minimum term frequency.
     *
     * @return the minimum term frequency
     */
    public int getMinTermFreq() {
        return m_minTermFreq;
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param n the minimum term frequency
     */
    public void setMinTermFreq(int n) {
        m_minTermFreq = n;
    }

    /**
     * Returns the tip text for the lowerCaseTokens property.
     *
     * @return the tip text
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case "
            + "before being added to the dictionary.";
    }

    /**
     * Tells whether words on the stoplist are ignored.
     *
     * @return {@code true} if the stoplist is used
     */
    public boolean getUseStoplist() {
        return m_useStoplist;
    }

    /**
     * Sets whether words on the stoplist are ignored.
     *
     * @param bl {@code true} to use the stoplist
     */
    public void setUseStoplist(boolean bl) {
        m_useStoplist = bl;
    }

    /**
     * Returns the tip text for the useStoplist property.
     *
     * @return the tip text
     */
    public String useStoplistTipText() {
        return "Ignores all the words that are on the stoplist, "
            + "if set to true.";
    }

    /**
     * Sets the stemmer applied to every token. Passing {@code null} selects a
     * {@link NullStemmer}.
     *
     * @param stemmer the stemmer to use, or {@code null} for the default
     */
    public void setStemmer(Stemmer stemmer) {
        if (stemmer == null) {
            m_Stemmer = new NullStemmer();
        } else {
            m_Stemmer = stemmer;
        }
    }

    /**
     * Returns the stemmer applied to every token.
     *
     * @return the current stemmer
     */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Returns the tip text for the stemmer property.
     *
     * @return the tip text
     */
    public String stemmerTipText() {
        return "The stemming algorithm to use on the words.";
    }

    /**
     * Sets the stopword file. A {@code null} argument is replaced by the
     * current working directory (the {@code user.dir} system property). If the
     * resulting path is an existing regular file, use of the stoplist is
     * switched on as a side effect.
     *
     * @param file the stopword file, or {@code null}
     */
    public void setStopwords(File file) {
        File target = (file != null) ? file : new File(System.getProperty("user.dir"));
        m_Stopwords = target;

        boolean isUsableFile = target.exists() && target.isFile();
        if (isUsableFile) {
            setUseStoplist(true);
        }
    }

    /**
     * Returns the configured stopword file (or directory).
     *
     * @return the stopword file
     */
    public File getStopwords() {
        return m_Stopwords;
    }

    /**
     * Returns the tip text for the stopwords property.
     *
     * @return the tip text
     */
    public String stopwordsTipText() {
        return "The file containing the stopwords "
            + "(if this is a directory then the default ones are used).";
    }

    /**
     * Sets the tokenizer used to split string values into words.
     *
     * @param tokenizer the tokenizer to use
     */
    public void setTokenizer(Tokenizer tokenizer) {
        m_Tokenizer = tokenizer;
    }

    /**
     * Returns the tokenizer used to split string values into words.
     *
     * @return the current tokenizer
     */
    public Tokenizer getTokenizer() {
        return m_Tokenizer;
    }

    /**
     * Returns the tip text for the tokenizer property.
     *
     * @return the tip text
     */
    public String tokenizerTipText() {
        return "The tokenizing algorithm to use on the strings.";
    }

    /**
     * Sorts the given array in place into ascending order.
     *
     * <p>The previous hand-rolled shell sort started its insertion passes at
     * index {@code h + 1} and stopped shifting at {@code j > h}, so the element
     * at index 0 was never compared or moved (e.g. {@code {5, 1, 2}} was left
     * unchanged). It also referenced a loop variable outside its scope. The
     * standard library sort handles the whole array.
     *
     * @param nArray the array to sort; must not be {@code null}
     */
    private static void sortArray(int[] nArray) {
        Arrays.sort(nArray);
    }

    /**
     * Narrows the selected range so that it only contains attributes whose
     * type code is 2. If no range object exists yet, one covering all such
     * attributes is created first.
     */
    private void determineSelectedRange() {
        Instances format = getInputFormat();
        int attributeCount = format.numAttributes();

        if (m_SelectedRange == null) {
            // No range configured: start from every attribute of type 2.
            // NOTE(review): 2 is presumably Attribute.STRING inlined by the compiler - confirm.
            StringBuilder all = new StringBuilder();
            for (int i = 0; i < attributeCount; i++) {
                if (format.attribute(i).type() == 2) {
                    all.append(i + 1 + ",");
                }
            }
            m_SelectedRange = new Range(all.toString());
        }
        m_SelectedRange.setUpper(attributeCount - 1);

        // Keep only the in-range attributes of type 2 (range indices are 1-based).
        StringBuilder kept = new StringBuilder();
        for (int i = 0; i < attributeCount; i++) {
            if (m_SelectedRange.isInRange(i) && format.attribute(i).type() == 2) {
                kept.append(i + 1 + ",");
            }
        }
        m_SelectedRange.setRanges(kept.toString());
        m_SelectedRange.setUpper(attributeCount - 1);
    }

    /**
     * Builds the word dictionary from the buffered first batch and sets the
     * filter's output format.
     *
     * <p>Steps:
     * <ol>
     *   <li>Optionally read a custom stopword file.</li>
     *   <li>Tokenize every selected attribute of every instance, counting
     *       term and document frequencies in one dictionary per class value
     *       (or a single dictionary when per-class operation is off or no
     *       class is set), pruning periodically if requested.</li>
     *   <li>Derive a minimum-count threshold per dictionary from
     *       {@code m_WordsToKeep} and {@code m_minTermFreq}.</li>
     *   <li>Create the output attributes: first copies of the non-selected
     *       input attributes, then one attribute per kept word.</li>
     *   <li>Record document counts per word attribute, the dictionary, and the
     *       number of instances, and publish the output format.</li>
     * </ol>
     */
    private void determineDictionary() {
        Stopwords stopwords = new Stopwords();
        if (this.getUseStoplist()) {
            try {
                if (this.getStopwords().exists() && !this.getStopwords().isDirectory()) {
                    stopwords.read(this.getStopwords());
                }
            }
            catch (Exception exception) {
                // Best effort: a stopword file that cannot be read is reported and skipped.
                exception.printStackTrace();
            }
        }

        // One dictionary per class value, unless disabled or there is no class.
        int classIndex = this.getInputFormat().classIndex();
        boolean perClass = !this.m_doNotOperateOnPerClassBasis && classIndex != -1;
        int numDictionaries = 1;
        if (perClass) {
            numDictionaries = this.getInputFormat().attribute(classIndex).numValues();
        }
        @SuppressWarnings("unchecked")
        TreeMap<String, Count>[] dictionaries = new TreeMap[numDictionaries];
        for (int i = 0; i < numDictionaries; ++i) {
            dictionaries[i] = new TreeMap<String, Count>();
        }

        this.determineSelectedRange();

        // Number of instances between two pruning passes (<= 0 disables pruning).
        long pruneInterval = Math.round(this.m_PeriodicPruningRate / 100.0 * (double)this.getInputFormat().numInstances());
        for (int i = 0; i < this.getInputFormat().numInstances(); ++i) {
            Instance instance = this.getInputFormat().instance(i);
            int dictIndex = 0;
            if (perClass) {
                dictIndex = (int)instance.classValue();
            }

            // Distinct words seen in this document, used for the document counts.
            Hashtable<String, Integer> wordsInDocument = new Hashtable<String, Integer>();
            for (int j = 0; j < instance.numAttributes(); ++j) {
                if (!this.m_SelectedRange.isInRange(j) || instance.isMissing(j)) continue;
                this.m_Tokenizer.tokenize(instance.stringValue(j));
                while (this.m_Tokenizer.hasMoreElements()) {
                    String word = ((String)this.m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens) {
                        word = word.toLowerCase();
                    }
                    word = this.m_Stemmer.stem(word);
                    if (this.m_useStoplist && stopwords.is(word)) continue;
                    // containsKey, not contains: Hashtable.contains() tests values.
                    if (!wordsInDocument.containsKey(word)) {
                        wordsInDocument.put(word, Integer.valueOf(0));
                    }
                    Count count = dictionaries[dictIndex].get(word);
                    if (count == null) {
                        dictionaries[dictIndex].put(word, new Count(1));
                    } else {
                        ++count.count;
                    }
                }
            }

            // Each distinct word of this document adds one to its document count.
            Enumeration<String> seenWords = wordsInDocument.keys();
            while (seenWords.hasMoreElements()) {
                String word = seenWords.nextElement();
                Count count = dictionaries[dictIndex].get(word);
                if (count != null) {
                    ++count.docCount;
                } else {
                    System.err.println("Warning: A word should definitely be in the dictionary.Please check the code");
                }
            }

            // Periodic pruning: drop words seen at most once so far.
            if (pruneInterval <= 0L || (long)i % pruneInterval != 0L || i <= 0) continue;
            for (int z = 0; z < numDictionaries; ++z) {
                Iterator<Count> counts = dictionaries[z].values().iterator();
                while (counts.hasNext()) {
                    if (counts.next().count <= 1) {
                        counts.remove();
                    }
                }
            }
        }

        // Minimum count a word needs in each dictionary to be kept.
        int totalWords = 0;
        int[] minCount = new int[numDictionaries];
        for (int z = 0; z < numDictionaries; ++z) {
            totalWords += dictionaries[z].size();
            int[] termCounts = new int[dictionaries[z].size()];
            int pos = 0;
            for (Count count : dictionaries[z].values()) {
                termCounts[pos++] = count.count;
            }
            StringToWordVector.sortArray(termCounts);
            if (termCounts.length < this.m_WordsToKeep) {
                // Fewer words than requested: only the minimum frequency applies.
                minCount[z] = this.m_minTermFreq;
            } else {
                minCount[z] = Math.max(this.m_minTermFreq, termCounts[termCounts.length - this.m_WordsToKeep]);
            }
        }

        // Output attributes: non-selected input attributes first ...
        FastVector attributes = new FastVector(totalWords + this.getInputFormat().numAttributes());
        int outputClassIndex = -1;
        for (int i = 0; i < this.getInputFormat().numAttributes(); ++i) {
            if (this.m_SelectedRange.isInRange(i)) continue;
            if (this.getInputFormat().classIndex() == i) {
                outputClassIndex = attributes.size();
            }
            attributes.addElement(this.getInputFormat().attribute(i).copy());
        }

        // ... then one attribute per kept word (a word kept for several classes is added once).
        TreeMap<String, Integer> newDictionary = new TreeMap<String, Integer>();
        int nextIndex = attributes.size();
        for (int z = 0; z < numDictionaries; ++z) {
            for (String word : dictionaries[z].keySet()) {
                Count count = dictionaries[z].get(word);
                if (count.count < minCount[z] || newDictionary.containsKey(word)) continue;
                newDictionary.put(word, Integer.valueOf(nextIndex++));
                attributes.addElement(new Attribute(this.m_Prefix + word));
            }
        }

        // Document frequency of each kept word, summed over all dictionaries.
        this.m_DocsCounts = new int[attributes.size()];
        for (String word : newDictionary.keySet()) {
            int attributeIndex = newDictionary.get(word).intValue();
            int docsCount = 0;
            for (int z = 0; z < numDictionaries; ++z) {
                Count count = dictionaries[z].get(word);
                if (count == null) continue;
                docsCount += count.docCount;
            }
            this.m_DocsCounts[attributeIndex] = docsCount;
        }

        attributes.trimToSize();
        this.m_Dictionary = newDictionary;
        this.m_NumInstances = this.getInputFormat().numInstances();

        Instances outputFormat = new Instances(this.getInputFormat().relationName(), attributes, 0);
        outputFormat.setClassIndex(outputClassIndex);
        this.setOutputFormat(outputFormat);
    }

    /**
     * Converts one input instance into a sparse word-vector instance (without
     * document-length normalization) and appends it to {@code fastVector}.
     *
     * <p>Non-selected attributes are copied to the front of the output; the
     * tokens of the selected attributes are looked up in the dictionary and
     * recorded as presence values or counts, optionally followed by the TF
     * and IDF transforms.
     *
     * @param instance the instance to convert
     * @param fastVector receives the converted {@link SparseInstance}
     * @return the index of the first word attribute in the output format,
     *         i.e. the number of copied (non-selected) attributes
     */
    private int convertInstancewoDocNorm(Instance instance, FastVector fastVector) {
        // Output attribute index -> value, kept sorted by index for the sparse instance.
        TreeMap<Integer, Double> contained = new TreeMap<Integer, Double>();

        // Copy all non-selected attributes to the front of the output.
        int firstCopy = 0;
        for (int i = 0; i < this.getInputFormat().numAttributes(); ++i) {
            if (this.m_SelectedRange.isInRange(i)) continue;
            if (this.getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Zero values are simply left out of the sparse representation.
                if (instance.value(i) != 0.0) {
                    contained.put(Integer.valueOf(firstCopy), Double.valueOf(instance.value(i)));
                }
            } else if (instance.isMissing(i)) {
                contained.put(Integer.valueOf(firstCopy), Double.valueOf(Instance.missingValue()));
            } else {
                // Reserve string index 0 with a dummy value so that a real
                // string never gets index 0 (which a sparse instance drops).
                if (this.outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                    this.outputFormatPeek().attribute(firstCopy).addStringValue("Hack to defeat SparseInstance bug");
                }
                int stringIndex = this.outputFormatPeek().attribute(firstCopy).addStringValue(instance.stringValue(i));
                contained.put(Integer.valueOf(firstCopy), Double.valueOf(stringIndex));
            }
            ++firstCopy;
        }

        // Tokenize the selected attributes and record the dictionary words.
        for (int j = 0; j < instance.numAttributes(); ++j) {
            if (!this.m_SelectedRange.isInRange(j) || instance.isMissing(j)) continue;
            this.m_Tokenizer.tokenize(instance.stringValue(j));
            while (this.m_Tokenizer.hasMoreElements()) {
                String word = (String)this.m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                word = this.m_Stemmer.stem(word);
                Integer index = (Integer)this.m_Dictionary.get(word);
                if (index == null) continue;
                if (this.m_OutputCounts) {
                    Double previous = contained.get(index);
                    double count = previous != null ? previous.doubleValue() + 1.0 : 1.0;
                    contained.put(index, Double.valueOf(count));
                } else {
                    contained.put(index, Double.valueOf(1.0));
                }
            }
        }

        // TF transform: log(1 + fij), applied to word attributes only.
        if (this.m_TFTransform) {
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                if (entry.getKey().intValue() < firstCopy) continue;
                entry.setValue(Double.valueOf(Math.log(entry.getValue().doubleValue() + 1.0)));
            }
        }

        // IDF transform: fij * log(numDocs / numDocsWithWord), word attributes only.
        if (this.m_IDFTransform) {
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                int index = entry.getKey().intValue();
                if (index < firstCopy) continue;
                double idf = Math.log((double)this.m_NumInstances / (double)this.m_DocsCounts[index]);
                entry.setValue(Double.valueOf(entry.getValue().doubleValue() * idf));
            }
        }

        // Flatten the sorted map into the parallel arrays a SparseInstance expects.
        double[] values = new double[contained.size()];
        int[] indices = new int[contained.size()];
        int pos = 0;
        for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
            indices[pos] = entry.getKey().intValue();
            values[pos] = entry.getValue().doubleValue();
            ++pos;
        }

        SparseInstance sparseInstance = new SparseInstance(instance.weight(), values, indices, this.outputFormatPeek().numAttributes());
        sparseInstance.setDataset(this.outputFormatPeek());
        fastVector.addElement(sparseInstance);
        return firstCopy;
    }

    /**
     * Rescales the word values of a sparse instance so that their Euclidean
     * length equals the average document length of the first batch.
     *
     * @param instance the instance to normalize in place
     * @param n index of the first word attribute; values stored at lower
     *        attribute indices are left untouched
     * @throws Exception if the average document length has not been set
     */
    private void normalizeInstance(Instance instance, int n) throws Exception {
        if (m_AvgDocLength < 0.0) {
            throw new Exception("Average document length not set.");
        }

        // Euclidean length of the word part of the vector.
        double sumOfSquares = 0.0;
        for (int k = 0; k < instance.numValues(); k++) {
            if (instance.index(k) >= n) {
                sumOfSquares += instance.valueSparse(k) * instance.valueSparse(k);
            }
        }
        double docLength = Math.sqrt(sumOfSquares);

        int k = 0;
        while (k < instance.numValues()) {
            if (instance.index(k) >= n) {
                double scaled = instance.valueSparse(k) * m_AvgDocLength / docLength;
                instance.setValueSparse(k, scaled);
                if (scaled == 0.0) {
                    System.err.println("setting value " + instance.index(k) + " to zero.");
                    // Stay on the same position instead of advancing.
                    // NOTE(review): presumably a zeroed sparse entry is removed, shifting the rest down - confirm.
                    continue;
                }
            }
            k++;
        }
    }

    /**
     * Returns the revision string of this class.
     *
     * @return the revision extracted from the embedded revision keyword
     */
    public String getRevision() {
        String keyword = "$Revision: 5547 $";
        return RevisionUtils.extract(keyword);
    }

    /**
     * Command-line entry point: runs a new filter instance with the given
     * arguments.
     *
     * @param stringArray the command-line arguments
     */
    public static void main(String[] stringArray) {
        StringToWordVector filter = new StringToWordVector();
        StringToWordVector.runFilter(filter, stringArray);
    }

    private class Count
    implements Serializable,
    RevisionHandler {
        static final long serialVersionUID = 2157223818584474321L;
        public int count;
        public int docCount;

        public Count(int n) {
            this.count = n;
        }

        public String getRevision() {
            return RevisionUtils.extract("$Revision: 5547 $");
        }
    }
}

