// org.apache.lucene/lucene-analyzers-nori/8.2.0 : org/apache/lucene/analysis/ko/dict/BinaryDictionary.java
//
// BinaryDictionary
// http://lucene.apache.org/lucene-parent/lucene-analyzers-nori: Lucene Nori Korean Morphological Analyzer (The Apache Software Foundation)
// Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko.dict;

import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;

import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;

// Base class for a binary-encoded in-memory dictionary.
/**
 * Base class for a binary-encoded in-memory dictionary.
 */
public abstract class BinaryDictionary implements Dictionary {
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";

  public static final String DICT_HEADER = "ko_dict";
  public static final String TARGETMAP_HEADER = "ko_dict_map";
  public static final String POSDICT_HEADER = "ko_dict_pos";
  public static final int VERSION = 1;

  private final ByteBuffer buffer;
  private final int[] targetMapOffsets, targetMap;
  private final POS.Tag[] posDict;

  protected BinaryDictionary() throws IOException {
    InputStream mapIS = null, dictIS = null, posIS = null;
    int[] targetMapOffsets = null, targetMap = null;
    ByteBuffer buffer = null;
    boolean success = false;
    try {
      mapIS = getResource(TARGETMAP_FILENAME_SUFFIX);
      mapIS = new BufferedInputStream(mapIS);
      DataInput in = new InputStreamDataInput(mapIS);
      CodecUtil.checkHeader(in, TARGETMAP_HEADER, VERSION, VERSION);
      targetMap = new int[in.readVInt()];
      targetMapOffsets = new int[in.readVInt()];
      int accum = 0, sourceId = 0;
      for (int ofs = 0; ofs < targetMap.length; ofs++) {
        final int val = in.readVInt();
        if ((val & 0x01) != 0) {
          targetMapOffsets[sourceId] = ofs;
          sourceId++;
        }
        accum += val >>> 1;
        targetMap[ofs] = accum;
      }
      if (sourceId + 1 != targetMapOffsets.length)
        throw new IOException("targetMap file format broken");
      targetMapOffsets[sourceId] = targetMap.length;
      mapIS.close(); mapIS = null;

      posIS = getResource(POSDICT_FILENAME_SUFFIX);
      posIS = new BufferedInputStream(posIS);
      in = new InputStreamDataInput(posIS);
      CodecUtil.checkHeader(in, POSDICT_HEADER, VERSION, VERSION);
      int posSize = in.readVInt();
      posDict = new POS.Tag[posSize];
      for (int j = 0; j < posSize; j++) {
        posDict[j] = POS.resolveTag(in.readByte());
      }
      posIS.close(); posIS = null;

      dictIS = getResource(DICT_FILENAME_SUFFIX);
      // no buffering here, as we load in one large buffer
      in = new InputStreamDataInput(dictIS);
      CodecUtil.checkHeader(in, DICT_HEADER, VERSION, VERSION);
      final int size = in.readVInt();
      final ByteBuffer tmpBuffer = ByteBuffer.allocateDirect(size);
      final ReadableByteChannel channel = Channels.newChannel(dictIS);
      final int read = channel.read(tmpBuffer);
      if (read != size) {
        throw new EOFException("Cannot read whole dictionary");
      }
      dictIS.close(); dictIS = null;
      buffer = tmpBuffer.asReadOnlyBuffer();
      success = true;
    } finally {
      if (success) {
        IOUtils.close(mapIS, dictIS);
      } else {
        IOUtils.closeWhileHandlingException(mapIS, dictIS);
      }
    }

    this.targetMap = targetMap;
    this.targetMapOffsets = targetMapOffsets;
    this.buffer = buffer;
  }
  
  protected final InputStream getResource(String suffix) throws IOException {
    return getClassResource(getClass(), suffix);
  }
  
  // util, reused by ConnectionCosts and CharacterDefinition
  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
    final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
    if (is == null)
      throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
    return is;
  }

  public void lookupWordIds(int sourceId, IntsRef ref) {
    ref.ints = targetMap;
    ref.offset = targetMapOffsets[sourceId];
    // targetMapOffsets always has one more entry pointing behind last:
    ref.length = targetMapOffsets[sourceId + 1] - ref.offset;
  }

  @Override
  public int getLeftId(int wordId) {
    return buffer.getShort(wordId) >>> 2;
  }
  
  @Override
  public int getRightId(int wordId) {
    return buffer.getShort(wordId+2) >>> 2; // Skip left id
  }
  
  @Override
  public int getWordCost(int wordId) {
    return buffer.getShort(wordId + 4);  // Skip left and right id
  }

  @Override
  public POS.Type getPOSType(int wordId) {
    byte value = (byte) (buffer.getShort(wordId) & 3);
    return POS.resolveType(value);
  }

  @Override
  public POS.Tag getLeftPOS(int wordId) {
    return posDict[getLeftId(wordId)];
  }

  @Override
  public POS.Tag getRightPOS(int wordId) {
    POS.Type type = getPOSType(wordId);
    if (type == POS.Type.MORPHEME || type == POS.Type.COMPOUND || hasSinglePOS(wordId)) {
      return getLeftPOS(wordId);
    } else {
      byte value = buffer.get(wordId + 6);
      return POS.resolveTag(value);
    }
  }

  @Override
  public String getReading(int wordId) {
    if (hasReadingData(wordId)) {
      int offset = wordId + 6;
      return readString(offset);
    }
    return null;
  }

  @Override
  public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) {
    POS.Type posType = getPOSType(wordId);
    if (posType == POS.Type.MORPHEME) {
      return null;
    }
    int offset = wordId + 6;
    boolean hasSinglePos = hasSinglePOS(wordId);
    if (hasSinglePos == false) {
      offset ++; // skip rightPOS
    }
    int length = buffer.get(offset++);
    if (length == 0) {
      return null;
    }
    Morpheme[] morphemes = new Morpheme[length];
    int surfaceOffset = 0;
    final POS.Tag leftPOS = getLeftPOS(wordId);
    for (int i = 0; i < length; i++) {
      final String form;
      final POS.Tag tag = hasSinglePos ? leftPOS : POS.resolveTag(buffer.get(offset++));
      if (posType == POS.Type.INFLECT) {
        form = readString(offset);
        offset += form.length() * 2 + 1;
      } else {
        int formLen = buffer.get(offset++);
        form = new String(surfaceForm, off+surfaceOffset, formLen);
        surfaceOffset += formLen;
      }
      morphemes[i] = new Morpheme(tag, form);
    }
    return morphemes;
  }

  private String readString(int offset) {
    int strOffset = offset;
    int len = buffer.get(strOffset++);
    char text[] = new char[len];
    for (int i = 0; i < len; i++) {
      text[i] = buffer.getChar(strOffset + (i<<1));
    }
    return new String(text);
  }

  private boolean hasSinglePOS(int wordId) {
    return (buffer.getShort(wordId+2) & HAS_SINGLE_POS) != 0;
  }

  private boolean hasReadingData(int wordId) {
    return (buffer.getShort(wordId+2) & HAS_READING) != 0;
  }

  flag that the entry has a single part of speech (leftPOS) /** flag that the entry has a single part of speech (leftPOS) */
  public static final int HAS_SINGLE_POS = 1;

  flag that the entry has reading data. otherwise reading is surface form /** flag that the entry has reading data. otherwise reading is surface form */
  public static final int HAS_READING = 2;
}
// org.apache.lucene/ lucene-analyzers-nori/ 8.2.0/ org/apache/lucene/analysis/ko/dict/BinaryDictionary.java