/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko.dict;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

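/*
 * Entry format sketch (illustrative, inferred from open() and the constructor below):
 * each non-blank line holds a surface form, optionally followed by its
 * whitespace-separated segmentation; anything after '#' is stripped as a comment.
 *
 *   세종              (simple noun)
 *   세종시 세종 시      (compound: surface form followed by its segments)
 *
 * A reader over such content is passed to UserDictionary.open(Reader), which
 * returns null when no entries remain after comment stripping.
 */
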
/**
 * Class for building a User Dictionary.
 * This class allows for adding custom nouns (세종) or compounds (세종시 세종 시).
 */
public final class UserDictionary implements Dictionary {

  // text -> wordID
  private final TokenInfoFST fst;

  public static final int WORD_COST = -100000;

  // NNG left
  public static final short LEFT_ID = 1781;

  // NNG right
  public static final short RIGHT_ID = 3533;

  // NNG right with hangul and a coda on the last char
  public static final short RIGHT_ID_T = 3535;

  // NNG right with hangul and no coda on the last char
  public static final short RIGHT_ID_F = 3534;

  // length, length... indexed by compound ID or null for simple noun
  private final int segmentations[][];
  private final short[] rightIds;

  public static UserDictionary open(Reader reader) throws IOException {
    BufferedReader br = new BufferedReader(reader);
    String line = null;
    List<String> entries = new ArrayList<>();

    // text + optional segmentations
    while ((line = br.readLine()) != null) {
      // Remove comments
      line = line.replaceAll("#.*$", "");

      // Skip empty lines or comment lines
      if (line.trim().length() == 0) {
        continue;
      }
      entries.add(line);
    }

    if (entries.isEmpty()) {
      return null;
    } else {
      return new UserDictionary(entries);
    }
  }

  private UserDictionary(List<String> entries) throws IOException {
    final CharacterDefinition charDef = CharacterDefinition.getInstance();
    entries.sort(Comparator.comparing(e -> e.split("\\s+")[0]));

    PositiveIntOutputs fstOutput = PositiveIntOutputs.getSingleton();
    Builder<Long> fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE2, fstOutput);
    IntsRefBuilder scratch = new IntsRefBuilder();

    String lastToken = null;
    List<int[]> segmentations = new ArrayList<>(entries.size());
    List<Short> rightIds = new ArrayList<>(entries.size());
    long ord = 0;
    for (String entry : entries) {
      String[] splits = entry.split("\\s+");
      String token = splits[0];
      if (token.equals(lastToken)) {
        continue;
      }
      char lastChar = entry.charAt(entry.length() - 1);
      if (charDef.isHangul(lastChar)) {
        if (charDef.hasCoda(lastChar)) {
          rightIds.add(RIGHT_ID_T);
        } else {
          rightIds.add(RIGHT_ID_F);
        }
      } else {
        rightIds.add(RIGHT_ID);
      }

      if (splits.length == 1) {
        segmentations.add(null);
      } else {
        int[] length = new int[splits.length - 1];
        int offset = 0;
        for (int i = 1; i < splits.length; i++) {
          length[i - 1] = splits[i].length();
          offset += splits[i].length();
        }
        if (offset > token.length()) {
          throw new IllegalArgumentException("Illegal user dictionary entry " + entry +
              " - the segmentation is bigger than the surface form (" + token + ")");
        }
        segmentations.add(length);
      }

      // add mapping to FST
      scratch.grow(token.length());
      scratch.setLength(token.length());
      for (int i = 0; i < token.length(); i++) {
        scratch.setIntAt(i, (int) token.charAt(i));
      }
      fstBuilder.add(scratch.get(), ord);
      lastToken = token;
      ord++;
    }
    this.fst = new TokenInfoFST(fstBuilder.finish());
    this.segmentations = segmentations.toArray(new int[segmentations.size()][]);
    this.rightIds = new short[rightIds.size()];
    for (int i = 0; i < rightIds.size(); i++) {
      this.rightIds[i] = rightIds.get(i);
    }
  }

  public TokenInfoFST getFST() {
    return fst;
  }

  @Override
  public int getLeftId(int wordId) {
    return LEFT_ID;
  }

  @Override
  public int getRightId(int wordId) {
    return rightIds[wordId];
  }

  @Override
  public int getWordCost(int wordId) {
    return WORD_COST;
  }

  @Override
  public POS.Type getPOSType(int wordId) {
    if (segmentations[wordId] == null) {
      return POS.Type.MORPHEME;
    } else {
      return POS.Type.COMPOUND;
    }
  }

  @Override
  public POS.Tag getLeftPOS(int wordId) {
    return POS.Tag.NNG;
  }

  @Override
  public POS.Tag getRightPOS(int wordId) {
    return POS.Tag.NNG;
  }

  @Override
  public String getReading(int wordId) {
    return null;
  }

  @Override
  public Morpheme[] getMorphemes(int wordId, char[] surfaceForm, int off, int len) {
    int[] segs = segmentations[wordId];
    if (segs == null) {
      return null;
    }
    int offset = 0;
    Morpheme[] morphemes = new Morpheme[segs.length];
    for (int i = 0; i < segs.length; i++) {
      morphemes[i] = new Morpheme(POS.Tag.NNG, new String(surfaceForm, off + offset, segs[i]));
      offset += segs[i];
    }
    return morphemes;
  }
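
  // Note (illustrative): the wordIds used above and returned by lookup() below are
  // the FST output ordinals assigned in insertion order by the constructor, so they
  // index directly into segmentations[] and rightIds[].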
  /**
   * Lookup words in text.
   * @param chars text
   * @param off offset into text
   * @param len length of text
   * @return list of matched wordIds
   */
  public List<Integer> lookup(char[] chars, int off, int len) throws IOException {
    List<Integer> result = new ArrayList<>();
    final FST.BytesReader fstReader = fst.getBytesReader();

    FST.Arc<Long> arc = new FST.Arc<>();
    int end = off + len;
    for (int startOffset = off; startOffset < end; startOffset++) {
      arc = fst.getFirstArc(arc);
      int output = 0;
      int remaining = end - startOffset;
      for (int i = 0; i < remaining; i++) {
        int ch = chars[startOffset + i];
        if (fst.findTargetArc(ch, arc, arc, i == 0, fstReader) == null) {
          break; // continue to next position
        }
        output += arc.output.intValue();
        if (arc.isFinal()) {
          final int finalOutput = output + arc.nextFinalOutput.intValue();
          result.add(finalOutput);
        }
      }
    }
    return result;
  }
}