package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
/**
 * Dictionary for user-supplied entries, loaded at runtime from CSV text.
 * Each non-comment line has the form:
 * {@code surface,segmentation,readings,partOfSpeech}
 * where segmentation and readings are space-separated lists of equal length.
 * <p>
 * Lookups run over an FST keyed by surface form; each match resolves to a
 * per-entry array of {@code [firstWordId, segmentLength1, segmentLength2, ...]}.
 */
public final class UserDictionary implements Dictionary {

  /** FST mapping surface forms to an ordinal used to index {@link #segmentations}. */
  private final TokenInfoFST fst;

  /** Per-entry arrays: [wordId of first segment, length of segment 1, length of segment 2, ...]. */
  private final int[][] segmentations;

  /** Per-word feature strings: reading + INTERNAL_SEPARATOR + part-of-speech. */
  private final String[] data;

  /** User-dictionary word ids start here so they never collide with system-dictionary ids. */
  private static final int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;

  /** Fixed word cost for all user entries (large negative: strongly preferred over system entries). */
  public static final int WORD_COST = -100000;

  /** Fixed left connection id shared by all user entries. */
  public static final int LEFT_ID = 5;

  /** Fixed right connection id shared by all user entries. */
  public static final int RIGHT_ID = 5;

  /**
   * Builds a UserDictionary from CSV text.
   *
   * @param reader source of CSV lines; {@code #} starts a comment that runs to end of line
   * @return a new dictionary, or {@code null} if the input contained no entries
   * @throws IOException if reading fails
   */
  public static UserDictionary open(Reader reader) throws IOException {
    BufferedReader br = new BufferedReader(reader);
    String line = null;
    List<String[]> featureEntries = new ArrayList<>();
    // Collect parsed CSV rows, stripping comments and skipping blank lines.
    while ((line = br.readLine()) != null) {
      line = line.replaceAll("#.*$", "");
      if (line.trim().length() == 0) {
        continue;
      }
      String[] values = CSVUtil.parse(line);
      featureEntries.add(values);
    }
    if (featureEntries.isEmpty()) {
      return null;
    } else {
      return new UserDictionary(featureEntries);
    }
  }

  private UserDictionary(List<String[]> featureEntries) throws IOException {
    int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
    // FST input must be added in sorted order, so sort entries by surface form first.
    Collections.sort(featureEntries, new Comparator<String[]>() {
      @Override
      public int compare(String[] left, String[] right) {
        return left[0].compareTo(right[0]);
      }
    });
    List<String> data = new ArrayList<>(featureEntries.size());
    List<int[]> segmentations = new ArrayList<>(featureEntries.size());
    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();
    long ord = 0;
    for (String[] values : featureEntries) {
      // BUG FIX: this previously used replaceAll(" *", " "). The pattern " *"
      // (zero-or-more) matches the empty string at every character boundary,
      // so replaceAll inserted a space between EVERY character, and split(" ")
      // then produced one bogus "segment" per character. " +" (one-or-more)
      // collapses runs of spaces to a single separator, as intended.
      String[] segmentation = values[1].replaceAll(" +", " ").split(" ");
      String[] readings = values[2].replaceAll(" +", " ").split(" ");
      String pos = values[3];
      if (segmentation.length != readings.length) {
        throw new RuntimeException("Illegal user dictionary entry " + values[0] +
            " - the number of segmentations (" + segmentation.length + ")" +
            " does not match the number of readings (" + readings.length + ")");
      }
      // wordIdAndLength[0] is the word id of the first segment; subsequent
      // slots hold each segment's character length. Each segment consumes its
      // own word id so feature lookups can index into data[] directly.
      int[] wordIdAndLength = new int[segmentation.length + 1];
      wordIdAndLength[0] = wordId;
      for (int i = 0; i < segmentation.length; i++) {
        wordIdAndLength[i + 1] = segmentation[i].length();
        data.add(readings[i] + INTERNAL_SEPARATOR + pos);
        wordId++;
      }
      // Copy the surface form into the FST's int-per-char input buffer.
      String token = values[0];
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord);
      segmentations.add(wordIdAndLength);
      ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish(), false);
    this.data = data.toArray(new String[data.size()]);
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
  }

  /**
   * Finds all user-dictionary matches in {@code chars[off..off+len)}.
   *
   * @param chars text to search
   * @param off   start offset into {@code chars}
   * @param len   number of characters to scan
   * @return array of {@code [wordId, position, length]} triples, one per matched
   *         segment; {@link #EMPTY_RESULT} if nothing matched
   * @throws IOException if FST traversal fails
   */
  public int[][] lookup(char[] chars, int off, int len) throws IOException {
    // TreeMap keeps matches ordered by start position; a longer match starting
    // at the same position overwrites a shorter one (longest-match wins).
    TreeMap<Integer, int[]> result = new TreeMap<>();
    boolean found = false;
    final FST.BytesReader fstReader = fst.getBytesReader();
    FST.Arc<Long> arc = new FST.Arc<>();
    int end = off + len;
    for (int startOffset = off; startOffset < end; startOffset++) {
      arc = fst.getFirstArc(arc);
      int output = 0;
      int remaining = end - startOffset;
      for (int i = 0; i < remaining; i++) {
        int ch = chars[startOffset + i];
        if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
          break; // no prefix continues from here; try next start position
        }
        output += arc.output.intValue();
        if (arc.isFinal()) {
          // Accumulated output is the entry's ordinal into segmentations.
          final int finalOutput = output + arc.nextFinalOutput.intValue();
          result.put(startOffset - off, segmentations[finalOutput]);
          found = true;
        }
      }
    }
    return found ? toIndexArray(result) : EMPTY_RESULT;
  }

  /** Returns the FST over user-dictionary surface forms. */
  public TokenInfoFST getFST() {
    return fst;
  }

  private static final int[][] EMPTY_RESULT = new int[0][];

  /**
   * Expands matches keyed by start position into flat {@code [wordId, position,
   * length]} triples, one per segment of each matched entry.
   */
  private int[][] toIndexArray(Map<Integer, int[]> input) {
    ArrayList<int[]> result = new ArrayList<>();
    for (Map.Entry<Integer, int[]> entry : input.entrySet()) {
      int[] wordIdAndLength = entry.getValue();
      int wordId = wordIdAndLength[0];
      int current = entry.getKey();
      for (int j = 1; j < wordIdAndLength.length; j++) {
        // Segment j gets consecutive word id wordId + j - 1 and advances the
        // position cursor by its character length.
        int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
        result.add(token);
        current += wordIdAndLength[j];
      }
    }
    return result.toArray(new int[result.size()][]);
  }

  /** Returns the {@code [wordId, lengths...]} segmentation array for an entry ordinal. */
  public int[] lookupSegmentation(int phraseID) {
    return segmentations[phraseID];
  }

  @Override
  public int getLeftId(int wordId) {
    return LEFT_ID; // all user entries share one left connection id
  }

  @Override
  public int getRightId(int wordId) {
    return RIGHT_ID; // all user entries share one right connection id
  }

  @Override
  public int getWordCost(int wordId) {
    return WORD_COST; // flat cost: user entries always strongly preferred
  }

  @Override
  public String getReading(int wordId, char surface[], int off, int len) {
    return getFeature(wordId, 0); // feature slot 0 = reading
  }

  @Override
  public String getPartOfSpeech(int wordId) {
    return getFeature(wordId, 1); // feature slot 1 = part-of-speech
  }

  @Override
  public String getBaseForm(int wordId, char surface[], int off, int len) {
    return null; // user entries carry no base form
  }

  @Override
  public String getPronunciation(int wordId, char surface[], int off, int len) {
    return null; // user entries carry no pronunciation
  }

  @Override
  public String getInflectionType(int wordId) {
    return null; // user entries carry no inflection data
  }

  @Override
  public String getInflectionForm(int wordId) {
    return null; // user entries carry no inflection data
  }

  /**
   * Splits the stored feature string for a word id into its fields
   * (reading, part-of-speech). Returns {@code null} if no data is stored.
   */
  private String[] getAllFeaturesArray(int wordId) {
    String allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET];
    if (allFeatures == null) {
      return null;
    }
    return allFeatures.split(INTERNAL_SEPARATOR);
  }

  /**
   * Returns selected feature fields for a word id as a comma-separated,
   * CSV-escaped string; with no field indices, all fields are returned.
   */
  private String getFeature(int wordId, int... fields) {
    String[] allFeatures = getAllFeaturesArray(wordId);
    if (allFeatures == null) {
      return null;
    }
    StringBuilder sb = new StringBuilder();
    if (fields.length == 0) {
      for (String feature : allFeatures) {
        sb.append(CSVUtil.quoteEscape(feature)).append(",");
      }
    } else if (fields.length == 1) {
      // Single field: returned raw (unescaped), matching historical behavior.
      sb.append(allFeatures[fields[0]]).append(",");
    } else {
      for (int field : fields) {
        sb.append(CSVUtil.quoteEscape(allFeatures[field])).append(",");
      }
    }
    // Drop the trailing comma appended by the loops above.
    return sb.deleteCharAt(sb.length() - 1).toString();
  }
}