/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.tokenize.AbstractTokenizer;
import opennlp.tools.tokenize.TokSpanEventStream;
import opennlp.tools.tokenize.TokenContextGenerator;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;

/**
 * Tokenizer that first splits the input on whitespace (via
 * {@link WhitespaceTokenizer#INSTANCE}) and then asks a {@link MaxentModel},
 * position by position, whether each whitespace-delimited chunk should be
 * split further.
 *
 * <p>The spans and probabilities of the most recent {@link #tokenizePos(String)}
 * call are kept in instance-level lists that are cleared and refilled on every
 * call; {@link #getTokenProbabilities()} reads from that state. Because this
 * state is mutated per call, concurrent use of a single instance would
 * interleave results.
 */
public class TokenizerME extends AbstractTokenizer {
    /** Model outcome that triggers a split in {@link #tokenizePos(String)}. */
    public static final String SPLIT = "T";
    /** Outcome label for the non-split decision; not referenced within this class. */
    public static final String NO_SPLIT = "F";
    @Deprecated
    public static final Pattern alphaNumeric = Pattern.compile("^[A-Za-z0-9]+$");

    /** Pattern used by the alphanumeric shortcut in {@link #tokenizePos(String)}. */
    private final Pattern alphanumeric;
    /** Model evaluated at every candidate split position. */
    private final MaxentModel model;
    /** Produces the model context for a token and an offset inside it. */
    private final TokenContextGenerator cg;
    /** Value reported by {@link #useAlphaNumericOptimization()}. */
    private final boolean useAlphaNumericOptimization;
    /** Probabilities of the tokens produced by the last tokenizePos call. */
    private final List<Double> tokProbs = new ArrayList<Double>(50);
    /** Spans produced by the last tokenizePos call. */
    private final List<Span> newTokens = new ArrayList<Span>();

    /**
     * Creates a tokenizer configured entirely from the model's own
     * {@link TokenizerFactory}.
     *
     * @param model supplies the factory and the maxent model
     */
    public TokenizerME(TokenizerModel model) {
        TokenizerFactory modelFactory = model.getFactory();
        this.alphanumeric = modelFactory.getAlphaNumericPattern();
        this.cg = modelFactory.getContextGenerator();
        this.model = model.getMaxentModel();
        this.useAlphaNumericOptimization = modelFactory.isUseAlphaNumericOptmization();
    }

    /**
     * Creates a tokenizer whose alphanumeric pattern and context generator come
     * from a language-specific {@link Factory}, keyed by the model's language.
     *
     * @param model supplies the language code, abbreviation dictionary, maxent
     *              model and the alphanumeric-optimization flag
     * @param factory supplies the alphanumeric pattern and context generator
     */
    public TokenizerME(TokenizerModel model, Factory factory) {
        String lang = model.getLanguage();
        this.alphanumeric = factory.getAlphanumeric(lang);
        Set<String> abbreviationSet = TokenizerME.getAbbreviations(model.getAbbreviations());
        this.cg = factory.createTokenContextGenerator(lang, abbreviationSet);
        this.model = model.getMaxentModel();
        this.useAlphaNumericOptimization = model.useAlphaNumericOptimization();
    }

    /**
     * Converts an optional abbreviation dictionary to a string set.
     *
     * @param abbreviations the dictionary, may be {@code null}
     * @return an empty set for {@code null}, otherwise the dictionary's string set
     */
    private static Set<String> getAbbreviations(Dictionary abbreviations) {
        return abbreviations == null
                ? Collections.<String>emptySet()
                : abbreviations.asStringSet();
    }

    /**
     * Returns a copy of the probabilities recorded by the last
     * {@link #tokenizePos(String)} call, one entry per returned span.
     *
     * @return a newly allocated array of token probabilities
     */
    public double[] getTokenProbabilities() {
        double[] copy = new double[this.tokProbs.size()];
        int next = 0;
        for (Double prob : this.tokProbs) {
            copy[next++] = prob;
        }
        return copy;
    }

    /**
     * Tokenizes the given string and returns the token spans.
     *
     * <p>Chunks shorter than two characters, and (when the alphanumeric
     * optimization is on) chunks fully matching the alphanumeric pattern, are
     * emitted unchanged with probability 1.0. Every other chunk is handed to the
     * model for possible further splitting.
     *
     * @param d the text to tokenize
     * @return the spans of all resulting tokens, in order
     */
    @Override
    public Span[] tokenizePos(String d) {
        Span[] whitespaceSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
        this.newTokens.clear();
        this.tokProbs.clear();
        for (Span chunk : whitespaceSpans) {
            String tok = d.substring(chunk.getStart(), chunk.getEnd());
            boolean keepWhole = tok.length() < 2
                    || (this.useAlphaNumericOptimization() && this.alphanumeric.matcher(tok).matches());
            if (keepWhole) {
                this.newTokens.add(chunk);
                this.tokProbs.add(1.0);
            } else {
                this.splitWithModel(chunk, tok);
            }
        }
        return this.newTokens.toArray(new Span[this.newTokens.size()]);
    }

    /**
     * Evaluates the model at every interior position of one whitespace chunk and
     * appends the resulting spans and their probabilities to the instance lists.
     *
     * @param chunk span of the whitespace-delimited chunk in the original text
     * @param tok the chunk's text
     */
    private void splitWithModel(Span chunk, String tok) {
        int chunkStart = chunk.getStart();
        int chunkEnd = chunk.getEnd();
        int segmentStart = chunkStart;
        // Product of the best-outcome probabilities seen since the last split.
        double segmentProb = 1.0;
        for (int offset = 1; chunkStart + offset < chunkEnd; offset++) {
            double[] probs = this.model.eval(this.cg.getContext(tok, offset));
            String best = this.model.getBestOutcome(probs);
            segmentProb *= probs[this.model.getIndex(best)];
            if (best.equals(SPLIT)) {
                int splitAt = chunkStart + offset;
                this.newTokens.add(new Span(segmentStart, splitAt));
                this.tokProbs.add(segmentProb);
                segmentStart = splitAt;
                segmentProb = 1.0;
            }
        }
        // Whatever remains after the last split (or the whole chunk) is the final token.
        this.newTokens.add(new Span(segmentStart, chunkEnd));
        this.tokProbs.add(segmentProb);
    }

    /**
     * Trains a tokenizer model from the given samples.
     *
     * @param samples the training samples
     * @param factory supplies the alphanumeric settings and context generator
     *                used to build training events, and is stored in the model
     * @param mlParams training parameters whose settings select the trainer
     * @return the trained model
     * @throws IOException declared for failures propagated from the sample stream or trainer
     */
    public static TokenizerModel train(ObjectStream<TokenSample> samples, TokenizerFactory factory, TrainingParameters mlParams) throws IOException {
        HashMap<String, String> manifest = new HashMap<String, String>();
        TokSpanEventStream events = new TokSpanEventStream(
                samples,
                factory.isUseAlphaNumericOptmization(),
                factory.getAlphaNumericPattern(),
                factory.getContextGenerator());
        EventTrainer eventTrainer = TrainerFactory.getEventTrainer(mlParams.getSettings(), manifest);
        MaxentModel trained = eventTrainer.train(events);
        return new TokenizerModel(trained, manifest, factory);
    }

    /**
     * Reports whether chunks fully matching the alphanumeric pattern bypass the model.
     *
     * @return the flag set at construction time
     */
    public boolean useAlphaNumericOptimization() {
        return this.useAlphaNumericOptimization;
    }
}

