package org.apache.lucene.analysis.ko.dict;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
public abstract class BinaryDictionary implements Dictionary {
public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
public static final String = "ko_dict";
public static final String = "ko_dict_map";
public static final String = "ko_dict_pos";
public static final int VERSION = 1;
private final ByteBuffer buffer;
private final int[] targetMapOffsets, targetMap;
private final POS.Tag[] posDict;
protected BinaryDictionary() throws IOException {
InputStream mapIS = null, dictIS = null, posIS = null;
int[] targetMapOffsets = null, targetMap = null;
ByteBuffer buffer = null;
boolean success = false;
try {
mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
mapIS = new BufferedInputStream(mapIS);
DataInput in = new InputStreamDataInput(mapIS);
CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
targetMap = new int[in.readVInt()];
targetMapOffsets = new int[in.readVInt()];
int accum = 0, sourceId = 0;
for (int ofs = 0; ofs < targetMap.length; ofs++) {
final int val = in.readVInt();
if ((val & 0x01) != 0) {
targetMapOffsets[sourceId] = ofs;
sourceId++;
}
accum += val >>> 1;
targetMap[ofs] = accum;
}
if (sourceId + 1 != targetMapOffsets.length)
throw new IOException("targetMap file format broken");
targetMapOffsets[sourceId] = targetMap.length;
mapIS.close(); mapIS = null;
posIS = getResource(POSDICT_FILENAME_SUFFIX);
posIS = new BufferedInputStream(posIS);
in = new InputStreamDataInput(posIS);
CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
int posSize = in.readVInt();
posDict = new POS.Tag[posSize];
for (int j = 0; j < posSize; j++) {
posDict[j] = POS.resolveTag(in.readByte());
}
posIS.close(); posIS = null;
dictIS = getResource(DICT_FILENAME_SUFFIX);
in = new InputStreamDataInput(dictIS);
CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
final int size = in.readVInt();
final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
final ReadableByteChannel channel = Channels.newChannel(dictIS);
final int read = channel.read(tmpBuffer);
if (read != size) {
throw new EOFException("Cannot read whole dictionary");
}
dictIS.close(); dictIS = null;
buffer = tmpBuffer.asReadOnlyBuffer();
success = true;
} finally {
if (success) {
IOUtils.close(mapIS, dictIS);
} else {
IOUtils.closeWhileHandlingException(mapIS, dictIS);
}
}
this.targetMap = targetMap;
this.targetMapOffsets = targetMapOffsets;
this.buffer = buffer;
}
protected final InputStream getResource(String suffix) throws IOException {
return getClassResource(getClass(), suffix);
}
public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
if (is == null)
throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
return is;
}
public void lookupWordIds(int sourceId, IntsRef ref) {
ref.ints = targetMap;
ref.offset = targetMapOffsets[sourceId];
ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
}
@Override
public int getLeftId(int wordId) {
return buffer.getShort(wordId) >>> 2;
}
@Override
public int getRightId(int wordId) {
return buffer.getShort(wordId+2) >>> 2;
}
@Override
public int getWordCost(int wordId) {
return buffer.getShort(wordId + 4);
}
@Override
public POS.Type getPOSType(int wordId) {
byte value = (byte) (buffer.getShort(wordId) & 3);
return POS.resolveType(value);
}
@Override
public POS.Tag getLeftPOS(int wordId) {
return posDict[getLeftId(wordId)];
}
@Override
public POS.Tag getRightPOS(int wordId) {
POS.Type type = getPOSType(wordId);
if (type == POS.Type.MORPHEME || type == POS.Type.COMPOUND || hasSinglePOS(wordId)) {
return getLeftPOS(wordId);
} else {
byte value = buffer.get(wordId + 6);
return POS.resolveTag(value);
}
}
@Override
public String getReading(int wordId) {
if (hasReadingData(wordId)) {
int offset = wordId + 6;
return readString(offset);
}
return null;
}
@Override
public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) {
POS.Type posType = getPOSType(wordId);
if (posType == POS.Type.MORPHEME) {
return null;
}
int offset = wordId + 6;
boolean hasSinglePos = hasSinglePOS(wordId);
if (hasSinglePos == false) {
offset ++;
}
int length = buffer.get(offset++);
if (length == 0) {
return null;
}
Morpheme[] morphemes = new Morpheme[length];
int surfaceOffset = 0;
final POS.Tag leftPOS = getLeftPOS(wordId);
for (int i = 0; i < length; i++) {
final String form;
final POS.Tag tag = hasSinglePos ? leftPOS : POS.resolveTag(buffer.get(offset++));
if (posType == POS.Type.INFLECT) {
form = readString(offset);
offset += form.length() * 2 + 1;
} else {
int formLen = buffer.get(offset++);
form = new String(surfaceForm, off+surfaceOffset, formLen);
surfaceOffset += formLen;
}
morphemes[i] = new Morpheme(tag, form);
}
return morphemes;
}
private String readString(int offset) {
int strOffset = offset;
int len = buffer.get(strOffset++);
char text[] = new char[len];
for (int i = 0; i < len; i++) {
text[i] = buffer.getChar(strOffset + (i<<1));
}
return new String(text);
}
private boolean hasSinglePOS(int wordId) {
return (buffer.getShort(wordId+2) & HAS_SINGLE_POS) != 0;
}
private boolean hasReadingData(int wordId) {
return (buffer.getShort(wordId+2) & HAS_READING) != 0;
}
public static final int HAS_SINGLE_POS = 1;
public static final int HAS_READING = 2;
}