/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ja.dict;


import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IOUtils;

/**
 * Base class for a binary-encoded in-memory dictionary.
 */
public abstract class BinaryDictionary implements Dictionary {
Used to specify where (dictionary) resources get loaded from.
/** * Used to specify where (dictionary) resources get loaded from. */
public enum ResourceScheme { CLASSPATH, FILE } public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; public static final String DICT_HEADER = "kuromoji_dict"; public static final String TARGETMAP_HEADER = "kuromoji_dict_map"; public static final String POSDICT_HEADER = "kuromoji_dict_pos"; public static final int VERSION = 1; private final ResourceScheme resourceScheme; private final String resourcePath; private final ByteBuffer buffer; private final int[] targetMapOffsets, targetMap; private final String[] posDict; private final String[] inflTypeDict; private final String[] inflFormDict; protected BinaryDictionary() throws IOException { this(ResourceScheme.CLASSPATH, null); }
Params:
  • resourceScheme – - scheme for loading resources (FILE or CLASSPATH).
  • resourcePath – - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use this class's name as the path.
/** * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH). * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use * this class's name as the path. */
protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException { this.resourceScheme = resourceScheme; if (resourcePath == null) { if (resourceScheme != ResourceScheme.CLASSPATH) { throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme"); } this.resourcePath = getClass().getName().replace('.', '/'); } else { this.resourcePath = resourcePath; } InputStream mapIS = null, dictIS = null, posIS = null; int[] targetMapOffsets = null, targetMap = null; String[] posDict = null; String[] inflFormDict = null; String[] inflTypeDict = null; ByteBuffer buffer = null; boolean success = false; try { mapIS = getResource(TARGETMAP_FILENAME_SUFFIX); mapIS = new BufferedInputStream(mapIS); DataInput in = new InputStreamDataInput(mapIS); CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION); targetMap = new int[in.readVInt()]; targetMapOffsets = new int[in.readVInt()]; int accum = 0, sourceId = 0; for (int ofs = 0; ofs < targetMap.length; ofs++) { final int val = in.readVInt(); if ((val & 0x01) != 0) { targetMapOffsets[sourceId] = ofs; sourceId++; } accum += val >>> 1; targetMap[ofs] = accum; } if (sourceId + 1 != targetMapOffsets.length) throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length + ", targetMapOffsets.length=" + targetMapOffsets.length + ", sourceId=" + sourceId); targetMapOffsets[sourceId] = targetMap.length; mapIS.close(); mapIS = null; posIS = getResource(POSDICT_FILENAME_SUFFIX); posIS = new BufferedInputStream(posIS); in = new InputStreamDataInput(posIS); CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION); int posSize = in.readVInt(); posDict = new String[posSize]; inflTypeDict = new String[posSize]; inflFormDict = new String[posSize]; for (int j = 0; j < posSize; j++) { posDict[j] = in.readString(); inflTypeDict[j] = in.readString(); inflFormDict[j] = in.readString(); // this is how we encode null inflections if (inflTypeDict[j].length() == 0) 
{ inflTypeDict[j] = null; } if (inflFormDict[j].length() == 0) { inflFormDict[j] = null; } } posIS.close(); posIS = null; dictIS = getResource(DICT_FILENAME_SUFFIX); // no buffering here, as we load in one large buffer in = new InputStreamDataInput(dictIS); CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION); final int size = in.readVInt(); final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size); final ReadableByteChannel channel = Channels.newChannel(dictIS); final int read = channel.read(tmpBuffer); if (read != size) { throw new EOFException("Cannot read whole dictionary"); } dictIS.close(); dictIS = null; buffer = tmpBuffer.asReadOnlyBuffer(); success = true; } finally { if (success) { IOUtils.close(mapIS, posIS, dictIS); } else { IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS); } } this.targetMap = targetMap; this.targetMapOffsets = targetMapOffsets; this.posDict = posDict; this.inflTypeDict = inflTypeDict; this.inflFormDict = inflFormDict; this.buffer = buffer; } protected final InputStream getResource(String suffix) throws IOException { switch(resourceScheme) { case CLASSPATH: return getClassResource(resourcePath + suffix); case FILE: return Files.newInputStream(Paths.get(resourcePath + suffix)); default: throw new IllegalStateException("unknown resource scheme " + resourceScheme); } } // util, reused by ConnectionCosts and CharacterDefinition public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException { final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix); if (is == null) { throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix); } return is; } private InputStream getClassResource(String path) throws IOException { final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path); if (is == null) { throw new FileNotFoundException("Not in classpath: " + path); } return is; } public void lookupWordIds(int 
sourceId, IntsRef ref) { ref.ints = targetMap; ref.offset = targetMapOffsets[sourceId]; // targetMapOffsets always has one more entry pointing behind last: ref.length = targetMapOffsets[sourceId + 1] - ref.offset; } @Override public int getLeftId(int wordId) { return (buffer.getShort(wordId) & 0xffff) >>> 3; } @Override public int getRightId(int wordId) { return (buffer.getShort(wordId) & 0xffff) >>> 3; } @Override public int getWordCost(int wordId) { return buffer.getShort(wordId + 2); // Skip id } @Override public String getBaseForm(int wordId, char surfaceForm[], int off, int len) { if (hasBaseFormData(wordId)) { int offset = baseFormOffset(wordId); int data = buffer.get(offset++) & 0xff; int prefix = data >>> 4; int suffix = data & 0xF; char text[] = new char[prefix+suffix]; System.arraycopy(surfaceForm, off, text, 0, prefix); for (int i = 0; i < suffix; i++) { text[prefix+i] = buffer.getChar(offset + (i << 1)); } return new String(text); } else { return null; } } @Override public String getReading(int wordId, char surface[], int off, int len) { if (hasReadingData(wordId)) { int offset = readingOffset(wordId); int readingData = buffer.get(offset++) & 0xff; return readString(offset, readingData >>> 1, (readingData & 1) == 1); } else { // the reading is the surface form, with hiragana shifted to katakana char text[] = new char[len]; for (int i = 0; i < len; i++) { char ch = surface[off+i]; if (ch > 0x3040 && ch < 0x3097) { text[i] = (char)(ch + 0x60); } else { text[i] = ch; } } return new String(text); } } @Override public String getPartOfSpeech(int wordId) { return posDict[getLeftId(wordId)]; } @Override public String getPronunciation(int wordId, char surface[], int off, int len) { if (hasPronunciationData(wordId)) { int offset = pronunciationOffset(wordId); int pronunciationData = buffer.get(offset++) & 0xff; return readString(offset, pronunciationData >>> 1, (pronunciationData & 1) == 1); } else { return getReading(wordId, surface, off, len); // same as the 
reading } } @Override public String getInflectionType(int wordId) { return inflTypeDict[getLeftId(wordId)]; } @Override public String getInflectionForm(int wordId) { return inflFormDict[getLeftId(wordId)]; } private static int baseFormOffset(int wordId) { return wordId + 4; } private int readingOffset(int wordId) { int offset = baseFormOffset(wordId); if (hasBaseFormData(wordId)) { int baseFormLength = buffer.get(offset++) & 0xf; return offset + (baseFormLength << 1); } else { return offset; } } private int pronunciationOffset(int wordId) { if (hasReadingData(wordId)) { int offset = readingOffset(wordId); int readingData = buffer.get(offset++) & 0xff; final int readingLength; if ((readingData & 1) == 0) { readingLength = readingData & 0xfe; // UTF-16: mask off kana bit } else { readingLength = readingData >>> 1; } return offset + readingLength; } else { return readingOffset(wordId); } } private boolean hasBaseFormData(int wordId) { return (buffer.getShort(wordId) & HAS_BASEFORM) != 0; } private boolean hasReadingData(int wordId) { return (buffer.getShort(wordId) & HAS_READING) != 0; } private boolean hasPronunciationData(int wordId) { return (buffer.getShort(wordId) & HAS_PRONUNCIATION) != 0; } private String readString(int offset, int length, boolean kana) { char text[] = new char[length]; if (kana) { for (int i = 0; i < length; i++) { text[i] = (char) (0x30A0 + (buffer.get(offset + i) & 0xff)); } } else { for (int i = 0; i < length; i++) { text[i] = buffer.getChar(offset + (i << 1)); } } return new String(text); }
flag that the entry has baseform data. otherwise it's not inflected (same as surface form)
/** flag that the entry has baseform data. otherwise it's not inflected (same as surface form) */
public static final int HAS_BASEFORM = 1;
flag that the entry has reading data. otherwise reading is surface form converted to katakana
/** flag that the entry has reading data. otherwise reading is surface form converted to katakana */
public static final int HAS_READING = 2;
flag that the entry has pronunciation data. otherwise pronunciation is the reading
/** flag that the entry has pronunciation data. otherwise pronunciation is the reading */
public static final int HAS_PRONUNCIATION = 4; }