/*
 * Decompiled with CFR 0.152.
 */
package simpletree.textprocessing.processing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import simpletree.matrix.AbstractMatrix;
import simpletree.matrix.sparse.SparseMatrix;
import simpletree.matrix.sparse.SparseVector;
import simpletree.model.AbstractInstance;
import simpletree.textprocessing.corpus.Corpus;
import simpletree.textprocessing.processing.Ngram;
import simpletree.textprocessing.processing.Stopword;
import simpletree.textprocessing.processing.stemmer.StemmerFactory;

/**
 * Turns the documents of a {@link Corpus} into a term-frequency matrix.
 *
 * <p>Tokens that the {@link Stopword} list rejects are dropped, the remaining
 * tokens are stemmed with the configured stemmer, and tokens that stem to a
 * blank string are dropped as well. The surviving n-grams are filtered by
 * their accumulated frequency (see {@code lowerCut}/{@code upperCut}) to form
 * the vocabulary, i.e. the columns of the matrix.
 *
 * <p>Instances keep the settings of the most recent call as mutable state.
 */
public class Preprocessor {
    /** Source of the document ids, per-document n-grams, corpus n-grams and class data. */
    private Corpus corpus;
    /**
     * Vocabulary built by the most recent {@code getMatrix}/{@code getMatrixSelected}
     * call; {@code null} until one of them has been called.
     */
    private ArrayList<Ngram> ngrams;
    /** Stemmer type handed to {@link StemmerFactory#getInstance} for every token. */
    private StemmerFactory.StemmerType stemmer;
    /** Stored from the caller's argument; not read anywhere in this class. */
    private int numberGrams;
    /** Minimum accumulated frequency (inclusive) an n-gram needs to be kept. */
    private int lowerCut;
    /** Maximum accumulated frequency (inclusive); a negative value disables the upper bound. */
    private int upperCut;

    /**
     * Creates a preprocessor bound to the given corpus.
     *
     * @param corpus the corpus whose documents will be processed
     */
    public Preprocessor(Corpus corpus) {
        this.corpus = corpus;
    }

    /**
     * Builds the term-frequency matrix for every id returned by
     * {@code corpus.getIds()}. The vocabulary is derived from
     * {@code corpus.getCorpusNgrams()} and is afterwards available through
     * {@link #getNgrams()}.
     *
     * @param lowerCut    minimum accumulated frequency (inclusive) for an n-gram to be kept
     * @param upperCut    maximum accumulated frequency (inclusive); negative means no upper bound
     * @param numberGrams stored on the instance; not otherwise used by this class
     * @param stemmer     stemmer type applied to every token
     * @return one row per document id, one column per vocabulary n-gram
     * @throws IOException propagated from the corpus, stopword or stemmer access
     */
    public AbstractMatrix getMatrix(int lowerCut, int upperCut, int numberGrams, StemmerFactory.StemmerType stemmer) throws IOException {
        this.applySettings(lowerCut, upperCut, numberGrams, stemmer);
        this.ngrams = this.getCorpusNgrams();
        return this.getMatrix(this.corpus.getIds());
    }

    /**
     * Builds the term-frequency matrix for the given instances only. The
     * vocabulary is accumulated from the n-grams of exactly those documents
     * and is afterwards available through {@link #getNgrams()}.
     *
     * @param lowerCut    minimum accumulated frequency (inclusive) for an n-gram to be kept
     * @param upperCut    maximum accumulated frequency (inclusive); negative means no upper bound
     * @param numberGrams stored on the instance; not otherwise used by this class
     * @param stemmer     stemmer type applied to every token
     * @param selected    instances whose ids identify the documents to process
     * @return one row per selected instance, in the order of {@code selected}
     * @throws IOException propagated from the corpus, stopword or stemmer access
     */
    public AbstractMatrix getMatrixSelected(int lowerCut, int upperCut, int numberGrams, StemmerFactory.StemmerType stemmer, ArrayList<AbstractInstance> selected) throws IOException {
        this.applySettings(lowerCut, upperCut, numberGrams, stemmer);
        ArrayList<Integer> ids = new ArrayList<Integer>();
        for (AbstractInstance instance : selected) {
            ids.add(instance.getId());
        }
        this.ngrams = this.getCorpusNgrams(ids);
        return this.getMatrix(ids);
    }

    /**
     * Returns the vocabulary built by the last matrix call.
     *
     * @return the internal (not copied) vocabulary list, or {@code null} if no
     *         matrix has been built yet
     */
    public ArrayList<Ngram> getNgrams() {
        return this.ngrams;
    }

    /**
     * Computes the corpus-wide vocabulary for the given settings without
     * building a matrix. The settings are stored on the instance, but the
     * list returned by {@link #getNgrams()} is left untouched.
     *
     * @param lowerCut    minimum accumulated frequency (inclusive) for an n-gram to be kept
     * @param upperCut    maximum accumulated frequency (inclusive); negative means no upper bound
     * @param numberGrams stored on the instance; not otherwise used by this class
     * @param stemmer     stemmer type applied to every token
     * @return the filtered vocabulary, sorted by the natural order of {@link Ngram}
     * @throws IOException propagated from the corpus, stopword or stemmer access
     */
    public ArrayList<Ngram> getNgramsAccordingTo(int lowerCut, int upperCut, int numberGrams, StemmerFactory.StemmerType stemmer) throws IOException {
        this.applySettings(lowerCut, upperCut, numberGrams, stemmer);
        return this.getCorpusNgrams();
    }

    /** Stores the processing settings shared by all public entry points. */
    private void applySettings(int lowerCut, int upperCut, int numberGrams, StemmerFactory.StemmerType stemmer) {
        this.lowerCut = lowerCut;
        this.upperCut = upperCut;
        this.numberGrams = numberGrams;
        this.stemmer = stemmer;
    }

    /**
     * Builds one sparse row per document id using the current vocabulary
     * ({@code this.ngrams}) as columns, and logs the elapsed time.
     *
     * @param ids document ids, one matrix row each, in list order
     * @return the populated matrix with the vocabulary set as attribute names
     * @throws IOException propagated from the per-document n-gram extraction
     */
    private AbstractMatrix getMatrix(ArrayList<Integer> ids) throws IOException {
        long start = System.currentTimeMillis();
        SparseMatrix matrix = new SparseMatrix();
        for (int i = 0; i < ids.size(); ++i) {
            float[] vector = new float[this.ngrams.size()];
            HashMap<String, Integer> docNgrams = this.getNgrams(ids.get(i));
            int column = 0;
            for (Ngram n : this.ngrams) {
                // Terms absent from this document contribute a zero entry.
                Integer count = docNgrams.get(n.ngram);
                vector[column] = count != null ? count.floatValue() : 0.0f;
                ++column;
            }
            // NOTE(review): the class value is looked up by the row position i, not by the
            // document id. That matches when ids is corpus.getIds(); for a selected subset it
            // presumably picks the wrong entry - confirm against Corpus.getClassData().
            SparseVector spv = new SparseVector(vector, ids.get(i), this.corpus.getClassData()[i]);
            // Invoked through the AbstractMatrix type, as in the original call site.
            ((AbstractMatrix)matrix).addRow(spv);
        }
        ArrayList<String> attr = new ArrayList<String>();
        for (Ngram n : this.ngrams) {
            attr.add(n.ngram);
        }
        matrix.setAttributes(attr);
        long finish = System.currentTimeMillis();
        Logger.getLogger(this.getClass().getName()).log(Level.INFO, "Document collection processing time: " + (float)(finish - start) / 1000.0f + "s");
        return matrix;
    }

    /**
     * Builds the vocabulary from the documents with the given ids by summing
     * their per-document (already stopword-filtered and stemmed) frequencies.
     *
     * @param ids ids of the documents to accumulate
     * @return the filtered vocabulary, sorted by the natural order of {@link Ngram}
     * @throws IOException propagated from the per-document n-gram extraction
     */
    private ArrayList<Ngram> getCorpusNgrams(ArrayList<Integer> ids) throws IOException {
        HashMap<String, Integer> totals = new HashMap<String, Integer>();
        for (Integer id : ids) {
            HashMap<String, Integer> docNgrams = this.getNgrams(id);
            for (String key : docNgrams.keySet()) {
                Integer previous = totals.get(key);
                if (previous == null) {
                    totals.put(key, docNgrams.get(key));
                } else {
                    totals.put(key, previous + docNgrams.get(key));
                }
            }
        }
        return this.selectByFrequency(totals);
    }

    /**
     * Builds the vocabulary from {@code corpus.getCorpusNgrams()}, merging the
     * frequencies of tokens that share the same stem.
     *
     * @return the filtered vocabulary, sorted by the natural order of {@link Ngram}
     * @throws IOException propagated from the corpus, stopword or stemmer access
     */
    private ArrayList<Ngram> getCorpusNgrams() throws IOException {
        HashMap<String, Integer> totals = new HashMap<String, Integer>();
        Stopword stp = Stopword.getInstance();
        for (Ngram n : this.corpus.getCorpusNgrams()) {
            this.addNormalized(totals, n, stp);
        }
        return this.selectByFrequency(totals);
    }

    /**
     * Returns the stemmed, stopword-free n-gram frequencies of one document.
     *
     * @param id the document id passed to {@code corpus.getNgrams}
     * @return stem to frequency map; empty when the corpus returns {@code null} for the id
     * @throws IOException propagated from the corpus, stopword or stemmer access
     */
    private HashMap<String, Integer> getNgrams(int id) throws IOException {
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        Stopword stp = Stopword.getInstance();
        ArrayList<Ngram> fngrams = this.corpus.getNgrams(id);
        if (fngrams != null) {
            for (Ngram n : fngrams) {
                this.addNormalized(counts, n, stp);
            }
        }
        return counts;
    }

    /**
     * Adds one n-gram to {@code counts} under its stem. Stopwords and tokens
     * whose stem is blank after trimming are ignored. The key stored is the
     * stem exactly as the stemmer returned it (not trimmed).
     */
    private void addNormalized(HashMap<String, Integer> counts, Ngram n, Stopword stp) throws IOException {
        if (stp.isStopWord(n.ngram)) {
            return;
        }
        String token = StemmerFactory.getInstance(this.stemmer).stem(n.ngram);
        if (token.trim().length() <= 0) {
            return;
        }
        Integer previous = counts.get(token);
        if (previous == null) {
            counts.put(token, n.frequency);
        } else {
            counts.put(token, previous + n.frequency);
        }
    }

    /**
     * Keeps the entries whose frequency is at least {@code lowerCut} and, when
     * {@code upperCut} is non-negative, at most {@code upperCut}; the result is
     * sorted with {@link Collections#sort(java.util.List)}.
     */
    private ArrayList<Ngram> selectByFrequency(HashMap<String, Integer> counts) {
        ArrayList<Ngram> kept = new ArrayList<Ngram>();
        for (String key : counts.keySet()) {
            int freq = counts.get(key);
            if (freq < this.lowerCut) {
                continue;
            }
            if (this.upperCut >= 0 && freq > this.upperCut) {
                continue;
            }
            kept.add(new Ngram(key, freq));
        }
        Collections.sort(kept);
        return kept;
    }
}

