package org.apache.mahout.vectorizer.encoders;

import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.mahout.math.Vector;

/* loaded from: input_file:org/apache/mahout/vectorizer/encoders/TextValueEncoder.class */
/**
 * Encodes free text into a vector by splitting it into word tokens, counting how often each
 * distinct token occurs, and delegating each distinct token to a per-word encoder with a
 * weight that grows logarithmically with its count.
 *
 * <p>Text may be accumulated over several {@code addText} calls; the accumulated counts are
 * written to a vector and reset by {@link #flush(double, Vector)}. Instances therefore carry
 * mutable state between calls.
 */
public class TextValueEncoder extends FeatureVectorEncoder {
    /** Natural logarithm of 2; dividing a natural log by this converts it to base 2. */
    private static final double LOG_2 = Math.log(2.0d);

    /** Splits on runs of non-word characters ({@code \W+}) and drops empty tokens. */
    private static final Splitter ON_NON_WORD = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();

    /** Encoder that each distinct token is handed to when the counts are flushed. */
    private FeatureVectorEncoder wordEncoder;

    /** Occurrence counts of the tokens added since the last flush. */
    private final Multiset<String> counts;

    /**
     * Creates an encoder that starts with a {@link StaticWordValueEncoder} built from the same
     * name as its word encoder.
     *
     * @param str the name passed both to the superclass and to the default word encoder
     */
    public TextValueEncoder(String str) {
        super(str, 2);
        this.wordEncoder = new StaticWordValueEncoder(str);
        this.counts = HashMultiset.create();
    }

    /**
     * Adds the given text to the pending counts and immediately flushes them into the vector.
     * Any text previously added via {@code addText} and not yet flushed is written out as well.
     *
     * @param bArr   the text to encode, as UTF-8 bytes
     * @param d      the weight applied to every token
     * @param vector the vector to update
     */
    @Override
    public void addToVector(byte[] bArr, double d, Vector vector) {
        addText(bArr);
        flush(d, vector);
    }

    /**
     * Decodes the bytes as UTF-8 and adds the resulting text to the pending counts.
     *
     * @param bArr the text to add, as UTF-8 bytes
     */
    public void addText(byte[] bArr) {
        addText(new String(bArr, Charsets.UTF_8));
    }

    /**
     * Tokenizes the text and increments the pending count of every token found.
     *
     * @param charSequence the text to add
     */
    public void addText(CharSequence charSequence) {
        for (String word : tokenize(charSequence)) {
            this.counts.add(word);
        }
    }

    /**
     * Writes every distinct pending token to the vector through the word encoder and then
     * clears the pending counts.
     *
     * @param d      the weight applied to every token before the count-based scaling
     * @param vector the vector to update
     */
    public void flush(double d, Vector vector) {
        for (String word : this.counts.elementSet()) {
            // Scale the caller's weight by log2(1 + count) so repeated words count
            // for more, but only logarithmically.
            double termWeight = (d * Math.log1p(this.counts.count(word))) / LOG_2;
            this.wordEncoder.addToVector(word, termWeight, vector);
        }
        this.counts.clear();
    }

    /**
     * Always returns 0 in this class, regardless of the arguments.
     *
     * @param bArr unused
     * @param i    unused
     * @param str  unused
     * @param i2   unused
     * @return 0
     */
    @Override
    protected int hashForProbe(byte[] bArr, int i, String str, int i2) {
        return 0;
    }

    /**
     * Decodes the bytes as UTF-8, tokenizes the text, and collects the result of
     * {@link #hashForProbe(byte[], int, String, int)} for each token, in token order.
     *
     * @param bArr the text, as UTF-8 bytes
     * @param i    passed through to {@code hashForProbe}
     * @param str  passed through to {@code hashForProbe}
     * @param i2   passed through to {@code hashForProbe}
     * @return one hash per token; empty when the text yields no tokens
     */
    @Override
    public Iterable<Integer> hashesForProbe(byte[] bArr, int i, String str, int i2) {
        ArrayList<Integer> hashes = Lists.newArrayList();
        for (String word : tokenize(new String(bArr, Charsets.UTF_8))) {
            hashes.add(Integer.valueOf(hashForProbe(bytesForString(word), i, str, i2)));
        }
        return hashes;
    }

    /**
     * Splits text into tokens on runs of non-word characters, omitting empty tokens.
     * Subclasses may override this to change how text is tokenized.
     *
     * @param charSequence the text to split
     * @return the tokens in order of appearance
     */
    protected Iterable<String> tokenize(CharSequence charSequence) {
        return ON_NON_WORD.split(charSequence);
    }

    /**
     * Builds a bracketed, comma-separated description of the text by asking the word encoder
     * to describe each token.
     *
     * @param str the text to describe
     * @return a string of the form {@code [a, b, c]}, or {@code []} when there are no tokens
     */
    @Override
    public String asString(String str) {
        StringBuilder sb = new StringBuilder();
        sb.append('[');
        for (String word : tokenize(str)) {
            // Length 1 means only the opening bracket has been written so far.
            if (sb.length() > 1) {
                sb.append(", ");
            }
            sb.append(this.wordEncoder.asString(word));
        }
        sb.append(']');
        return sb.toString();
    }

    /**
     * Replaces the encoder used for individual tokens.
     *
     * @param featureVectorEncoder the encoder to use from now on
     */
    public final void setWordEncoder(FeatureVectorEncoder featureVectorEncoder) {
        this.wordEncoder = featureVectorEncoder;
    }
}
