/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ko;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.ko.dict.Dictionary;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.ko.dict.UserDictionary;
import org.apache.lucene.analysis.ko.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.ko.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.util.RollingCharBuffer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.FST;

import static java.lang.Character.UnicodeScript;
import static org.apache.lucene.analysis.TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;

/**
 * Tokenizer for Korean that uses morphological analysis.
 * <p>
 * This tokenizer sets a number of additional attributes:
 * <ul>
 *   <li>{@link PartOfSpeechAttribute} containing part-of-speech.
 *   <li>{@link ReadingAttribute} containing reading.
 * </ul>
 * <p>
 * This tokenizer uses a rolling Viterbi search to find the
 * least cost segmentation (path) of the incoming characters.
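 * <p>
 * A minimal usage sketch, following the standard Lucene TokenStream
 * consumer contract (the sample text is illustrative only):
 *
 * <pre>{@code
 * try (KoreanTokenizer tokenizer = new KoreanTokenizer()) {
 *   tokenizer.setReader(new java.io.StringReader("나는 책을 읽는다"));
 *   CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
 *   PartOfSpeechAttribute posAtt = tokenizer.addAttribute(PartOfSpeechAttribute.class);
 *   tokenizer.reset();
 *   while (tokenizer.incrementToken()) {
 *     System.out.println(termAtt + "/" + posAtt.getLeftPOS());
 *   }
 *   tokenizer.end();
 * }
 * }</pre>
 *
 * @lucene.experimental
 */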
public final class KoreanTokenizer extends Tokenizer {
  /**
   * Token type reflecting the original source of this token
   */
  public enum Type {
    /**
     * Known words from the system dictionary.
     */
    KNOWN,
    /**
     * Unknown words (heuristically segmented).
     */
    UNKNOWN,
    /**
     * Known words from the user dictionary.
     */
    USER
  }
  /**
   * Decompound mode: this determines how the tokenizer handles
   * {@link POS.Type#COMPOUND}, {@link POS.Type#INFLECT} and {@link POS.Type#PREANALYSIS} tokens.
   */
  public enum DecompoundMode {
    /**
     * No decomposition for compound.
     */
    NONE,
    /**
     * Decomposes compounds and discards the original form (default).
     */
    DISCARD,
    /**
     * Decomposes compounds and keeps the original form.
     */
    MIXED
  }
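  // Illustration (hedged; actual segmentations depend on the bundled dictionary):
  // given a compound such as "가거도항" (Gageodo Port), the modes behave roughly as:
  //   NONE    -> 가거도항
  //   DISCARD -> 가거도, 항
  //   MIXED   -> 가거도항 (kept at the same position, with position length
  //              spanning the decompounded tokens), 가거도, 항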
  /**
   * Default mode for the decompound of tokens ({@link DecompoundMode#DISCARD}).
   */
  public static final DecompoundMode DEFAULT_DECOMPOUND = DecompoundMode.DISCARD;

  private static final boolean VERBOSE = false;

  // For safety:
  private static final int MAX_UNKNOWN_WORD_LENGTH = 1024;
  private static final int MAX_BACKTRACE_GAP = 1024;

  private final EnumMap<Type, Dictionary> dictionaryMap = new EnumMap<>(Type.class);

  private final TokenInfoFST fst;
  private final TokenInfoDictionary dictionary;
  private final UnknownDictionary unkDictionary;
  private final ConnectionCosts costs;
  private final UserDictionary userDictionary;
  private final CharacterDefinition characterDefinition;

  private final FST.Arc<Long> arc = new FST.Arc<>();
  private final FST.BytesReader fstReader;
  private final IntsRef wordIdRef = new IntsRef();

  private final FST.BytesReader userFSTReader;
  private final TokenInfoFST userFST;

  private final boolean discardPunctuation;
  private final DecompoundMode mode;
  private final boolean outputUnknownUnigrams;

  private final RollingCharBuffer buffer = new RollingCharBuffer();

  private final WrappedPositionArray positions = new WrappedPositionArray();

  // True once we've hit the EOF from the input reader:
  private boolean end;

  // Last absolute position we backtraced from:
  private int lastBackTracePos;

  // Next absolute position to process:
  private int pos;

  // Already parsed, but not yet passed to caller, tokens:
  private final List<Token> pending = new ArrayList<>();

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
  private final PartOfSpeechAttribute posAtt = addAttribute(PartOfSpeechAttribute.class);
  private final ReadingAttribute readingAtt = addAttribute(ReadingAttribute.class);
  /**
   * Creates a new KoreanTokenizer with default parameters.
   * <p>
   * Uses the default AttributeFactory.
   */
  public KoreanTokenizer() {
    this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, null, DEFAULT_DECOMPOUND, false, true);
  }
  /**
   * Create a new KoreanTokenizer.
   *
   * @param factory the AttributeFactory to use
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param mode Decompound mode.
   * @param outputUnknownUnigrams If true outputs unigrams for unknown words.
   */
  public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams) {
    this(factory, userDictionary, mode, outputUnknownUnigrams, true);
  }
  /**
   * Create a new KoreanTokenizer.
   *
   * @param factory the AttributeFactory to use
   * @param userDictionary Optional: if non-null, user dictionary.
   * @param mode Decompound mode.
   * @param outputUnknownUnigrams If true outputs unigrams for unknown words.
   * @param discardPunctuation true if punctuation tokens should be dropped from the output.
   */
  public KoreanTokenizer(AttributeFactory factory, UserDictionary userDictionary, DecompoundMode mode, boolean outputUnknownUnigrams, boolean discardPunctuation) {
    super(factory);
    this.mode = mode;
    this.discardPunctuation = discardPunctuation;
    this.outputUnknownUnigrams = outputUnknownUnigrams;
    dictionary = TokenInfoDictionary.getInstance();
    fst = dictionary.getFST();
    unkDictionary = UnknownDictionary.getInstance();
    characterDefinition = unkDictionary.getCharacterDefinition();
    this.userDictionary = userDictionary;
    costs = ConnectionCosts.getInstance();
    fstReader = fst.getBytesReader();
    if (userDictionary != null) {
      userFST = userDictionary.getFST();
      userFSTReader = userFST.getBytesReader();
    } else {
      userFST = null;
      userFSTReader = null;
    }

    buffer.reset(this.input);

    resetState();

    dictionaryMap.put(Type.KNOWN, dictionary);
    dictionaryMap.put(Type.UNKNOWN, unkDictionary);
    dictionaryMap.put(Type.USER, userDictionary);
  }

  private GraphvizFormatter dotOut;
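  // Construction with a custom user dictionary, as a hedged sketch
  // (UserDictionary.open(Reader) is the nori module's factory method; the
  // entries shown follow the "<token> [<segment> ...]" line format and are
  // illustrative only):
  //
  //   UserDictionary userDict = UserDictionary.open(
  //       new StringReader("c++\n세종시 세종 시"));
  //   KoreanTokenizer tok = new KoreanTokenizer(
  //       DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDict,
  //       DecompoundMode.DISCARD, false, true);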
  /**
   * Expert: set this to produce graphviz (dot) output of
   * the Viterbi lattice
   */
  public void setGraphvizFormatter(GraphvizFormatter dotOut) {
    this.dotOut = dotOut;
  }

  @Override
  public void close() throws IOException {
    super.close();
    buffer.reset(input);
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    buffer.reset(input);
    resetState();
  }

  private void resetState() {
    positions.reset();
    pos = 0;
    end = false;
    lastBackTracePos = 0;
    pending.clear();

    // Add BOS:
    positions.get(0).add(0, 0, -1, -1, -1, -1, Type.KNOWN);
  }

  @Override
  public void end() throws IOException {
    super.end();
    // Set final offset
    int finalOffset = correctOffset(pos);
    offsetAtt.setOffset(finalOffset, finalOffset);
  }

  // Holds all back pointers arriving to this position:
  final static class Position {

    int pos;

    int count;

    // maybe single int array * 5?
    int[] costs = new int[8];
    int[] lastRightID = new int[8];
    int[] backPos = new int[8];
    int[] backWordPos = new int[8];
    int[] backIndex = new int[8];
    int[] backID = new int[8];
    Type[] backType = new Type[8];

    public void grow() {
      costs = ArrayUtil.grow(costs, 1+count);
      lastRightID = ArrayUtil.grow(lastRightID, 1+count);
      backPos = ArrayUtil.grow(backPos, 1+count);
      backWordPos = ArrayUtil.grow(backWordPos, 1+count);
      backIndex = ArrayUtil.grow(backIndex, 1+count);
      backID = ArrayUtil.grow(backID, 1+count);

      // NOTE: sneaky: grow separately because
      // ArrayUtil.grow will otherwise pick a different
      // length than the int[]s we just grew:
      final Type[] newBackType = new Type[backID.length];
      System.arraycopy(backType, 0, newBackType, 0, backType.length);
      backType = newBackType;
    }

    public void add(int cost, int lastRightID, int backPos, int backRPos, int backIndex, int backID, Type backType) {
      // NOTE: this isn't quite a true Viterbi search,
      // because we should check if lastRightID is
      // already present here, and only update if the new
      // cost is less than the current cost, instead of
      // simply appending. However, that will likely hurt
      // performance (usually we add a lastRightID only once),
      // and it means we actually create the full graph
      // intersection instead of a "normal" Viterbi lattice:
      if (count == costs.length) {
        grow();
      }
      this.costs[count] = cost;
      this.lastRightID[count] = lastRightID;
      this.backPos[count] = backPos;
      this.backWordPos[count] = backRPos;
      this.backIndex[count] = backIndex;
      this.backID[count] = backID;
      this.backType[count] = backType;
      count++;
    }

    public void reset() {
      count = 0;
    }
  }
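  // The Viterbi update performed by add() below, in sketch form (this mirrors
  // the code; it is not additional API). For a word spanning wordPos..endPos:
  //
  //   cost(idx) = fromPosData.costs[idx]
  //             + connectionCost(fromPosData.lastRightID[idx], leftID(word))
  //             + spacePenalty(leftPOS(word), numSpaces)
  //   leastCost = min over idx of cost(idx), then leastCost += wordCost(word)
  //
  // and the winning back pointer (leastIDX) is recorded at endPos.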
  /**
   * Returns the space penalty associated with the provided {@link POS.Tag}.
   *
   * @param leftPOS the left part of speech of the current token.
   * @param numSpaces the number of spaces before the current token.
   */
  private int computeSpacePenalty(POS.Tag leftPOS, int numSpaces) {
    int spacePenalty = 0;
    if (numSpaces > 0) {
      // TODO we should extract the penalty (left-space-penalty-factor) from the dicrc file.
      switch (leftPOS) {
        case E:
        case J:
        case VCP:
        case XSA:
        case XSN:
        case XSV:
          spacePenalty = 3000;
          break;

        default:
          break;
      }
    }
    return spacePenalty;
  }

  private void add(Dictionary dict, Position fromPosData, int wordPos, int endPos, int wordID, Type type) throws IOException {
    final POS.Tag leftPOS = dict.getLeftPOS(wordID);
    final int wordCost = dict.getWordCost(wordID);
    final int leftID = dict.getLeftId(wordID);
    int leastCost = Integer.MAX_VALUE;
    int leastIDX = -1;
    assert fromPosData.count > 0;
    for (int idx = 0; idx < fromPosData.count; idx++) {
      // The number of spaces before the term
      int numSpaces = wordPos - fromPosData.pos;
      // Cost is path cost so far, plus word cost (added at
      // end of loop), plus bigram cost and space penalty cost.
      final int cost = fromPosData.costs[idx] + costs.get(fromPosData.lastRightID[idx], leftID) + computeSpacePenalty(leftPOS, numSpaces);
      if (VERBOSE) {
        System.out.println(" fromIDX=" + idx + ": cost=" + cost + " (prevCost=" + fromPosData.costs[idx] + " wordCost=" + wordCost + " bgCost=" + costs.get(fromPosData.lastRightID[idx], leftID) +
            " spacePenalty=" + computeSpacePenalty(leftPOS, numSpaces) + ") leftID=" + leftID + " leftPOS=" + leftPOS.name() + ")");
      }
      if (cost < leastCost) {
        leastCost = cost;
        leastIDX = idx;
        if (VERBOSE) {
          System.out.println(" **");
        }
      }
    }

    leastCost += wordCost;

    if (VERBOSE) {
      System.out.println(" + cost=" + leastCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + leastIDX + " toPos=" + endPos + " toPos.idx=" + positions.get(endPos).count);
    }

    positions.get(endPos).add(leastCost, dict.getRightId(wordID), fromPosData.pos, wordPos, leastIDX, wordID, type);
  }

  @Override
  public boolean incrementToken() throws IOException {

    // parse() is able to return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation. So we loop here until we get a real
    // token or we end:
    while (pending.size() == 0) {
      if (end) {
        return false;
      }

      // Push Viterbi forward some more:
      parse();
    }

    final Token token = pending.remove(pending.size() - 1);

    int length = token.getLength();
    clearAttributes();
    assert length > 0;
    //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
    termAtt.copyBuffer(token.getSurfaceForm(), token.getOffset(), length);
    offsetAtt.setOffset(correctOffset(token.getStartOffset()), correctOffset(token.getEndOffset()));
    posAtt.setToken(token);
    readingAtt.setToken(token);
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    posLengthAtt.setPositionLength(token.getPositionLength());
    if (VERBOSE) {
      System.out.println(Thread.currentThread().getName() + ": incToken: return token=" + token);
    }
    return true;
  }

  // TODO: make generic'd version of this "circular array"?
  // It's a bit tricky because we do things to the Position
  // (eg, set .pos = N on reuse)...
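  // Index arithmetic for the circular buffer below (derived from getIndex()):
  // an absolute position pos maps to array slot
  //   index = nextWrite - (nextPos - pos), wrapped by += positions.length if negative.
  // For example, with positions.length=8, nextWrite=2, nextPos=10:
  //   pos=9 -> index 1; pos=7 -> index -1 -> wraps to 7.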
  static final class WrappedPositionArray {
    private Position[] positions = new Position[8];

    public WrappedPositionArray() {
      for (int i = 0; i < positions.length; i++) {
        positions[i] = new Position();
      }
    }

    // Next array index to write to in positions:
    private int nextWrite;

    // Next position to write:
    private int nextPos;

    // How many valid Position instances are held in the
    // positions array:
    private int count;

    public void reset() {
      nextWrite--;
      while (count > 0) {
        if (nextWrite == -1) {
          nextWrite = positions.length - 1;
        }
        positions[nextWrite--].reset();
        count--;
      }
      nextWrite = 0;
      nextPos = 0;
      count = 0;
    }

    /**
     * Get Position instance for this absolute position;
     * this is allowed to be arbitrarily far "in the
     * future" but cannot be before the last freeBefore.
     */
    public Position get(int pos) {
      while (pos >= nextPos) {
        //System.out.println("count=" + count + " vs len=" + positions.length);
        if (count == positions.length) {
          Position[] newPositions = new Position[ArrayUtil.oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
          //System.out.println("grow positions " + newPositions.length);
          System.arraycopy(positions, nextWrite, newPositions, 0, positions.length - nextWrite);
          System.arraycopy(positions, 0, newPositions, positions.length - nextWrite, nextWrite);
          for (int i = positions.length; i < newPositions.length; i++) {
            newPositions[i] = new Position();
          }
          nextWrite = positions.length;
          positions = newPositions;
        }
        if (nextWrite == positions.length) {
          nextWrite = 0;
        }
        // Should have already been reset:
        assert positions[nextWrite].count == 0;
        positions[nextWrite++].pos = nextPos++;
        count++;
      }
      assert inBounds(pos);
      final int index = getIndex(pos);
      assert positions[index].pos == pos;
      return positions[index];
    }

    public int getNextPos() {
      return nextPos;
    }

    // For assert:
    private boolean inBounds(int pos) {
      return pos < nextPos && pos >= nextPos - count;
    }

    private int getIndex(int pos) {
      int index = nextWrite - (nextPos - pos);
      if (index < 0) {
        index += positions.length;
      }
      return index;
    }

    public void freeBefore(int pos) {
      final int toFree = count - (nextPos - pos);
      assert toFree >= 0;
      assert toFree <= count;
      int index = nextWrite - count;
      if (index < 0) {
        index += positions.length;
      }
      for (int i = 0; i < toFree; i++) {
        if (index == positions.length) {
          index = 0;
        }
        //System.out.println(" fb idx=" + index);
        positions[index].reset();
        index++;
      }
      count -= toFree;
    }
  }

  /* Incrementally parse some more characters. This runs
   * the viterbi search forwards "enough" so that we
   * generate some more tokens. How much forward depends on
   * the chars coming in, since some chars could cause
   * longer-lasting ambiguity in the parsing. Once the
   * ambiguity is resolved, then we back trace, produce
   * the pending tokens, and return.
   */
  private void parse() throws IOException {
    if (VERBOSE) {
      System.out.println("\nPARSE");
    }

    // Index of the last character of unknown word:
    int unknownWordEndIndex = -1;

    // Maximum posAhead of user word in the entire input
    int userWordMaxPosAhead = -1;

    // Advances over each position (character):
    while (true) {

      if (buffer.get(pos) == -1) {
        // End
        break;
      }

      final Position posData = positions.get(pos);
      final boolean isFrontier = positions.getNextPos() == pos + 1;

      if (posData.count == 0) {
        // No arcs arrive here; move to next position:
        if (VERBOSE) {
          System.out.println(" no arcs in; skip pos=" + pos);
        }
        pos++;
        continue;
      }

      if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
        // We are at a "frontier", and only one node is
        // alive, so whatever the eventual best path is must
        // come through this node. So we can safely commit
        // to the prefix of the best path at this point:
        backtrace(posData, 0);

        // Re-base cost so we don't risk int overflow:
        posData.costs[0] = 0;
        if (pending.size() > 0) {
          return;
        } else {
          // This means the backtrace only produced
          // punctuation tokens, so we must keep parsing.
        }
      }

      if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) {
        // Safety: if we've buffered too much, force a
        // backtrace now. We find the least-cost partial
        // path, across all paths, backtrace from it, and
        // then prune all others. Note that this, in
        // general, can produce the wrong result, if the
        // total best path did not in fact back trace
        // through this partial best path. But it's the
        // best we can do... (short of not having a
        // safety!).
        // First pass: find least cost partial path so far,
        // including ending at future positions:
        int leastIDX = -1;
        int leastCost = Integer.MAX_VALUE;
        Position leastPosData = null;
        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
          final Position posData2 = positions.get(pos2);
          for (int idx = 0; idx < posData2.count; idx++) {
            //System.out.println(" idx=" + idx + " cost=" + cost);
            final int cost = posData2.costs[idx];
            if (cost < leastCost) {
              leastCost = cost;
              leastIDX = idx;
              leastPosData = posData2;
            }
          }
        }

        // We will always have at least one live path:
        assert leastIDX != -1;

        // Second pass: prune all but the best path:
        for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) {
          final Position posData2 = positions.get(pos2);
          if (posData2 != leastPosData) {
            posData2.reset();
          } else {
            if (leastIDX != 0) {
              posData2.costs[0] = posData2.costs[leastIDX];
              posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
              posData2.backPos[0] = posData2.backPos[leastIDX];
              posData2.backWordPos[0] = posData2.backWordPos[leastIDX];
              posData2.backIndex[0] = posData2.backIndex[leastIDX];
              posData2.backID[0] = posData2.backID[leastIDX];
              posData2.backType[0] = posData2.backType[leastIDX];
            }
            posData2.count = 1;
          }
        }

        backtrace(leastPosData, 0);

        // Re-base cost so we don't risk int overflow:
        Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0);

        if (pos != leastPosData.pos) {
          // We jumped into a future position:
          assert pos < leastPosData.pos;
          pos = leastPosData.pos;
        }
        if (pending.size() > 0) {
          return;
        } else {
          // This means the backtrace only produced
          // punctuation tokens, so we must keep parsing.
          continue;
        }
      }

      if (VERBOSE) {
        System.out.println("\n extend @ pos=" + pos + " char=" + (char) buffer.get(pos) + " hex=" + Integer.toHexString(buffer.get(pos)));
      }

      if (VERBOSE) {
        System.out.println(" " + posData.count + " arcs in");
      }

      // Move to the first character that is not a whitespace.
      // The whitespaces are added as a prefix for the term that we extract,
      // this information is then used when computing the cost for the term using
      // the space penalty factor.
      // They are removed when the final tokens are generated.
      if (Character.getType(buffer.get(pos)) == Character.SPACE_SEPARATOR) {
        int nextChar = buffer.get(++pos);
        while (nextChar != -1 && Character.getType(nextChar) == Character.SPACE_SEPARATOR) {
          pos++;
          nextChar = buffer.get(pos);
        }
      }
      if (buffer.get(pos) == -1) {
        pos = posData.pos;
      }

      boolean anyMatches = false;

      // First try user dict:
      if (userFST != null) {
        userFST.getFirstArc(arc);
        int output = 0;
        int maxPosAhead = 0;
        int outputMaxPosAhead = 0;
        int arcFinalOutMaxPosAhead = 0;

        for (int posAhead = pos; ; posAhead++) {
          final int ch = buffer.get(posAhead);
          if (ch == -1) {
            break;
          }
          if (userFST.findTargetArc(ch, arc, arc, posAhead == pos, userFSTReader) == null) {
            break;
          }
          output += arc.output.intValue();
          if (arc.isFinal()) {
            maxPosAhead = posAhead;
            outputMaxPosAhead = output;
            arcFinalOutMaxPosAhead = arc.nextFinalOutput.intValue();
            anyMatches = true;
          }
        }

        // Longest matching for user word
        if (anyMatches && maxPosAhead > userWordMaxPosAhead) {
          if (VERBOSE) {
            System.out.println(" USER word " + new String(buffer.get(pos, maxPosAhead - pos + 1)) + " toPos=" + (maxPosAhead + 1));
          }
          add(userDictionary, posData, pos, maxPosAhead + 1, outputMaxPosAhead + arcFinalOutMaxPosAhead, Type.USER);
          userWordMaxPosAhead = Math.max(userWordMaxPosAhead, maxPosAhead);
        }
      }

      // TODO: we can be more aggressive about user
      // matches? if we are "under" a user match then don't
      // extend KNOWN/UNKNOWN paths?
      if (!anyMatches) {
        // Next, try known dictionary matches
        fst.getFirstArc(arc);
        int output = 0;

        for (int posAhead = pos; ; posAhead++) {
          final int ch = buffer.get(posAhead);
          if (ch == -1) {
            break;
          }
          //System.out.println(" match " + (char) ch + " posAhead=" + posAhead);

          if (fst.findTargetArc(ch, arc, arc, posAhead == pos, fstReader) == null) {
            break;
          }

          output += arc.output.intValue();

          // Optimization: for known words that are too-long
          // (compound), we should pre-compute the 2nd
          // best segmentation and store it in the
          // dictionary instead of recomputing it each time a
          // match is found.
          if (arc.isFinal()) {
            dictionary.lookupWordIds(output + arc.nextFinalOutput.intValue(), wordIdRef);
            if (VERBOSE) {
              System.out.println(" KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs");
            }
            for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
              add(dictionary, posData, pos, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN);
              anyMatches = true;
            }
          }
        }
      }

      if (unknownWordEndIndex > posData.pos) {
        pos++;
        continue;
      }

      final char firstCharacter = (char) buffer.get(pos);
      if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) {

        // Find unknown match:
        int characterId = characterDefinition.getCharacterClass(firstCharacter);

        // NOTE: copied from UnknownDictionary.lookup:
        int unknownWordLength;
        if (!characterDefinition.isGroup(firstCharacter)) {
          unknownWordLength = 1;
        } else {
          // Extract unknown word. Characters with the same script are considered to be part of unknown word
          unknownWordLength = 1;
          UnicodeScript scriptCode = UnicodeScript.of((int) firstCharacter);
          final boolean isPunct = isPunctuation(firstCharacter);
          for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) {
            int next = buffer.get(posAhead);
            if (next == -1) {
              break;
            }
            char ch = (char) next;
            int chType = Character.getType(ch);
            UnicodeScript sc = UnicodeScript.of(next);
            boolean sameScript = isSameScript(scriptCode, sc)
                // Non-spacing marks inherit the script of their base character,
                // following recommendations from UTR #24.
                || chType == Character.NON_SPACING_MARK;

            if (sameScript
                && isPunctuation(ch, chType) == isPunct
                && characterDefinition.isGroup(ch)) {
              unknownWordLength++;
            } else {
              break;
            }

            // Update the script code and character class if the original script
            // is Inherited or Common.
            if (isCommonOrInherited(scriptCode) && isCommonOrInherited(sc) == false) {
              scriptCode = sc;
              characterId = characterDefinition.getCharacterClass(ch);
            }
          }
        }

        unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
        if (VERBOSE) {
          System.out.println(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs");
        }
        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
          add(unkDictionary, posData, pos, pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN);
        }
      }

      pos++;
    }

    end = true;

    if (pos > 0) {
      final Position endPosData = positions.get(pos);
      int leastCost = Integer.MAX_VALUE;
      int leastIDX = -1;
      if (VERBOSE) {
        System.out.println(" end: " + endPosData.count + " nodes");
      }
      for (int idx = 0; idx < endPosData.count; idx++) {
        // Add EOS cost:
        final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0);
        //System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
        if (cost < leastCost) {
          leastCost = cost;
          leastIDX = idx;
        }
      }

      backtrace(endPosData, leastIDX);
    } else {
      // No characters in the input string; return no tokens!
    }
  }

  // Backtrace from the provided position, back to the last
  // time we back-traced, accumulating the resulting tokens to
  // the pending list. The pending list is then in-reverse
  // (last token should be returned first).
  private void backtrace(final Position endPosData, final int fromIDX) {
    final int endPos = endPosData.pos;

    if (VERBOSE) {
      System.out.println("\n backtrace: endPos=" + endPos + " pos=" + pos + "; " + (pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
    }

    final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null) {
      dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    int pos = endPos;
    int bestIDX = fromIDX;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token. So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos) {
      //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
      final Position posData = positions.get(pos);
      assert bestIDX < posData.count;

      int backPos = posData.backPos[bestIDX];
      int backWordPos = posData.backWordPos[bestIDX];
      assert backPos >= lastBackTracePos : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
      // the length of the word without the whitespaces at the beginning.
      int length = pos - backWordPos;
      Type backType = posData.backType[bestIDX];
      int backID = posData.backID[bestIDX];
      int nextBestIDX = posData.backIndex[bestIDX];
      // the start of the word after the whitespace at the beginning.
      final int fragmentOffset = backWordPos - lastBackTracePos;
      assert fragmentOffset >= 0;

      final Dictionary dict = getDict(backType);

      if (outputUnknownUnigrams && backType == Type.UNKNOWN) {
        // outputUnknownUnigrams converts unknown word into unigrams:
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          if (i > 0 && Character.isLowSurrogate(fragment[fragmentOffset + i])) {
            i--;
            charLen = 2;
          }
          final DictionaryToken token = new DictionaryToken(Type.UNKNOWN,
              unkDictionary,
              CharacterDefinition.NGRAM,
              fragment,
              fragmentOffset + i,
              charLen,
              backWordPos + i,
              backWordPos + i + charLen
          );
          pending.add(token);
          if (VERBOSE) {
            System.out.println(" add token=" + pending.get(pending.size() - 1));
          }
        }
      } else {
        final DictionaryToken token = new DictionaryToken(backType,
            dict,
            backID,
            fragment,
            fragmentOffset,
            length,
            backWordPos,
            backWordPos + length
        );
        if (token.getPOSType() == POS.Type.MORPHEME || mode == DecompoundMode.NONE) {
          if (shouldFilterToken(token) == false) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          }
        } else {
          Dictionary.Morpheme[] morphemes = token.getMorphemes();
          if (morphemes == null) {
            pending.add(token);
            if (VERBOSE) {
              System.out.println(" add token=" + pending.get(pending.size() - 1));
            }
          } else {
            int endOffset = backWordPos + length;
            int posLen = 0;
            // decompose the compound
            for (int i = morphemes.length - 1; i >= 0; i--) {
              final Dictionary.Morpheme morpheme = morphemes[i];
              final Token compoundToken;
              if (token.getPOSType() == POS.Type.COMPOUND) {
                assert endOffset - morpheme.surfaceForm.length() >= 0;
                compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm,
                    endOffset - morpheme.surfaceForm.length(), endOffset);
              } else {
                compoundToken = new DecompoundToken(morpheme.posTag, morpheme.surfaceForm, token.getStartOffset(), token.getEndOffset());
              }
              if (i == 0 && mode == DecompoundMode.MIXED) {
                compoundToken.setPositionIncrement(0);
              }
              ++posLen;
              endOffset -= morpheme.surfaceForm.length();
              pending.add(compoundToken);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
            if (mode == DecompoundMode.MIXED) {
              token.setPositionLength(Math.max(1, posLen));
              pending.add(token);
              if (VERBOSE) {
                System.out.println(" add token=" + pending.get(pending.size() - 1));
              }
            }
          }
        }
      }

      if (discardPunctuation == false && backWordPos != backPos) {
        // Add a token for whitespaces between terms
        int offset = backPos - lastBackTracePos;
        int len = backWordPos - backPos;
        //System.out.println(offset + " " + fragmentOffset + " " + len + " " + backWordPos + " " + backPos);
        unkDictionary.lookupWordIds(characterDefinition.getCharacterClass(' '), wordIdRef);
        DictionaryToken spaceToken = new DictionaryToken(Type.UNKNOWN, unkDictionary, wordIdRef.ints[wordIdRef.offset], fragment, offset, len, backPos, backPos + len);
        pending.add(spaceToken);
      }

      pos = backPos;
      bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE) {
      System.out.println(" freeBefore pos=" + endPos);
    }

    // Notify the circular buffers that we are done with
    // these positions:
    buffer.freeBefore(endPos);
    positions.freeBefore(endPos);
  }

  Dictionary getDict(Type type) {
    return dictionaryMap.get(type);
  }

  private boolean shouldFilterToken(Token token) {
    return discardPunctuation && isPunctuation(token.getSurfaceForm()[token.getOffset()]);
  }

  private static boolean isPunctuation(char ch) {
    return isPunctuation(ch, Character.getType(ch));
  }

  private static boolean isPunctuation(char ch, int cid) {
    // special case for Hangul Letter Araea (interpunct)
    if (ch == 0x318D) {
      return true;
    }
    switch (cid) {
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
      case Character.CONTROL:
      case Character.FORMAT:
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
      case Character.INITIAL_QUOTE_PUNCTUATION:
      case Character.FINAL_QUOTE_PUNCTUATION:
        return true;
      default:
        return false;
    }
  }

  private static boolean isCommonOrInherited(UnicodeScript script) {
    return script == UnicodeScript.INHERITED || script == UnicodeScript.COMMON;
  }
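  // Worked examples for isSameScript() below (derived from the code):
  //   HANGUL vs HANGUL -> same script, so an unknown-word run keeps extending;
  //   HANGUL vs COMMON (e.g. digits) -> compatible, since COMMON/INHERITED match anything;
  //   HANGUL vs LATIN -> incompatible, so the unknown-word run breaks there.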
  /** Determine if two scripts are compatible. */
  private static boolean isSameScript(UnicodeScript scriptOne, UnicodeScript scriptTwo) {
    return scriptOne == scriptTwo
        || isCommonOrInherited(scriptOne)
        || isCommonOrInherited(scriptTwo);
  }
}