package org.apache.lucene.analysis.ja.util;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
/**
 * Builds an {@link UnknownDictionaryWriter} from the {@code unk.def} and {@code char.def} files
 * found in a dictionary source directory.
 *
 * <p>Both files are decoded with the charset name handed to the constructor.
 */
class UnknownDictionaryBuilder {
  /** Entry that is always written to the dictionary ahead of the entries read from the file. */
  private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

  /** Matches a {@code #} comment together with any whitespace directly in front of it. */
  private static final Pattern COMMENT = Pattern.compile("\\s*#.*");

  /** Matches a run of one or more whitespace characters. */
  private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

  private final String encoding;

  /**
   * @param encoding charset name used to decode the dictionary source files
   */
  UnknownDictionaryBuilder(String encoding) {
    this.encoding = encoding;
  }

  /**
   * Reads {@code unk.def} and {@code char.def} from {@code dir} and returns the populated writer.
   *
   * @param dir directory containing the two definition files
   * @return writer holding the dictionary entries and the character definitions
   * @throws IOException if either file cannot be opened or read
   */
  public UnknownDictionaryWriter build(Path dir) throws IOException {
    UnknownDictionaryWriter unkDictionary = readDictionaryFile(dir.resolve("unk.def"));
    readCharacterDefinition(dir.resolve("char.def"), unkDictionary);
    return unkDictionary;
  }

  /** Reads the dictionary file at {@code path} using this builder's encoding. */
  private UnknownDictionaryWriter readDictionaryFile(Path path) throws IOException {
    return readDictionaryFile(path, encoding);
  }

  /**
   * Reads every line of the dictionary file, parses it as CSV and stores the entries in a new
   * writer, ordered by the character class of their first column.
   *
   * @param path dictionary file to read
   * @param encoding charset name used to decode the file
   * @return writer containing the n-gram entry followed by the sorted file entries
   * @throws IOException if the file cannot be opened or read
   */
  private UnknownDictionaryWriter readDictionaryFile(Path path, String encoding)
      throws IOException {
    UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
    // The n-gram entry is always stored first, before anything read from the file.
    dictionary.put(CSVUtil.parse(NGRAM_DICTIONARY_ENTRY));

    List<String[]> entries = new ArrayList<>();
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
        LineNumberReader lineReader = new LineNumberReader(reader)) {
      String line;
      while ((line = lineReader.readLine()) != null) {
        // Two extra "*" columns are appended to every line before parsing.
        // NOTE(review): presumably pads the entry to the column count the writer expects —
        // confirm against UnknownDictionaryWriter.put.
        entries.add(CSVUtil.parse(line + ",*,*"));
      }
    }

    // Entries are stored in ascending order of the character class id of their first column.
    entries.sort(
        Comparator.comparingInt(entry -> CharacterDefinition.lookupCharacterClass(entry[0])));
    for (String[] entry : entries) {
      dictionary.put(entry);
    }
    return dictionary;
  }

  /**
   * Reads the character definition file and registers its contents with {@code dictionary}.
   *
   * <p>After comments and surplus whitespace are removed, a line starting with {@code 0x} maps a
   * code point (or an inclusive {@code from..to} range) to a category; any other non-empty line
   * is read as {@code NAME invoke group length}.
   *
   * @param path character definition file to read
   * @param dictionary writer receiving the category mappings and invoke definitions
   * @throws IOException if the file cannot be opened or read
   * @throws IllegalArgumentException if a line lacks a required field
   */
  private void readCharacterDefinition(Path path, UnknownDictionaryWriter dictionary)
      throws IOException {
    try (Reader reader = Files.newBufferedReader(path, Charset.forName(encoding));
        LineNumberReader lineReader = new LineNumberReader(reader)) {
      String raw;
      while ((raw = lineReader.readLine()) != null) {
        String line = normalize(raw);
        if (line.isEmpty()) {
          continue;
        }
        // getLineNumber() counts the lines read so far, i.e. the 1-based number of this line.
        String location = path + ":" + lineReader.getLineNumber();
        if (line.startsWith("0x")) {
          putCategoryMapping(line, location, dictionary);
        } else {
          putInvokeDefinition(line, location, dictionary);
        }
      }
    }
  }

  /**
   * Removes a {@code #} comment, collapses whitespace runs into single spaces and trims both ends.
   *
   * <p>Trimming both ends matters: a leftover leading space would hide the {@code 0x} prefix and
   * produce an empty first field, and a whitespace-only line must come out empty so it is skipped.
   */
  private static String normalize(String raw) {
    String withoutComment = COMMENT.matcher(raw).replaceAll("");
    return WHITESPACE_RUN.matcher(withoutComment).replaceAll(" ").trim();
  }

  /** Registers the category of a {@code 0xNNNN CATEGORY} or {@code 0xNNNN..0xMMMM CATEGORY} line. */
  private static void putCategoryMapping(
      String line, String location, UnknownDictionaryWriter dictionary) {
    String[] values = line.split(" ", 2);
    if (values.length < 2) {
      throw new IllegalArgumentException(location + ": missing character category: " + line);
    }
    String category = values[1];
    if (!values[0].contains("..")) {
      dictionary.putCharacterCategory(Integer.decode(values[0]), category);
      return;
    }
    String[] codePoints = values[0].split("\\.\\.");
    if (codePoints.length < 2) {
      throw new IllegalArgumentException(location + ": incomplete code point range: " + line);
    }
    int cpFrom = Integer.decode(codePoints[0]);
    int cpTo = Integer.decode(codePoints[1]);
    // Both ends of the range are inclusive.
    for (int cp = cpFrom; cp <= cpTo; cp++) {
      dictionary.putCharacterCategory(cp, category);
    }
  }

  /** Registers the invoke definition of a {@code NAME invoke group length} line. */
  private static void putInvokeDefinition(
      String line, String location, UnknownDictionaryWriter dictionary) {
    String[] values = line.split(" ");
    if (values.length < 4) {
      throw new IllegalArgumentException(
          location + ": expected '<class> <invoke> <group> <length>': " + line);
    }
    String characterClassName = values[0];
    int invoke = Integer.parseInt(values[1]);
    int group = Integer.parseInt(values[2]);
    int length = Integer.parseInt(values[3]);
    dictionary.putInvokeDefinition(characterClassName, invoke, group, length);
  }
}