/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.fst;


import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.RamUsageEstimator;

// TODO: break this into WritableFST and ReadOnlyFST.. then
// we can have subclasses of ReadOnlyFST to handle the
// different byte[] level encodings (packed or
// not)... and things like nodeCount, arcCount are read only

// TODO: if FST is pure prefix trie we can do a more compact
// job, ie, once we are at a 'suffix only', just store the
// completion labels as a string not as a series of arcs.

// NOTE: while the FST is able to represent a non-final
// dead-end state (NON_FINAL_END_NODE=0), the layers above
// (FSTEnum, Util) have problems with this!!

Represents an finite state machine (FST), using a compact byte[] format.

The format is similar to what's used by Morfologik (https://github.com/morfologik/morfologik-stemming).

See the package documentation for some simple examples.

@lucene.experimental
/** Represents an finite state machine (FST), using a * compact byte[] format. * <p> The format is similar to what's used by Morfologik * (https://github.com/morfologik/morfologik-stemming). * * <p> See the {@link org.apache.lucene.util.fst package * documentation} for some simple examples. * * @lucene.experimental */
public final class FST<T> implements Accountable { private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FST.class); private static final long ARC_SHALLOW_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(Arc.class);
Specifies allowed range of each int input label for this FST.
/** Specifies allowed range of each int input label for * this FST. */
public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; static final int BIT_FINAL_ARC = 1 << 0; static final int BIT_LAST_ARC = 1 << 1; static final int BIT_TARGET_NEXT = 1 << 2; // TODO: we can free up a bit if we can nuke this: static final int BIT_STOP_NODE = 1 << 3;
This flag is set if the arc has an output.
/** This flag is set if the arc has an output. */
public static final int BIT_ARC_HAS_OUTPUT = 1 << 4; static final int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; // We use this as a marker (because this one flag is // illegal by itself ...): private static final byte ARCS_AS_ARRAY_PACKED = BIT_ARC_HAS_FINAL_OUTPUT; // this means either of these things in different contexts // in the midst of a direct array: private static final byte BIT_MISSING_ARC = 1 << 6; // at the start of a direct array: private static final byte ARCS_AS_ARRAY_WITH_GAPS = BIT_MISSING_ARC;
See Also:
  • shouldExpand(Builder, UnCompiledNode)
/** * @see #shouldExpand(Builder, Builder.UnCompiledNode) */
static final int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
See Also:
  • shouldExpand(Builder, UnCompiledNode)
/** * @see #shouldExpand(Builder, Builder.UnCompiledNode) */
static final int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
See Also:
  • shouldExpand(Builder, UnCompiledNode)
/** * @see #shouldExpand(Builder, Builder.UnCompiledNode) */
static final int FIXED_ARRAY_NUM_ARCS_DEEP = 10; // Increment version to change it private static final String FILE_FORMAT_NAME = "FST"; private static final int VERSION_START = 6; private static final int VERSION_CURRENT = 6; // Never serialized; just used to represent the virtual // final node w/ no arcs: private static final long FINAL_END_NODE = -1; // Never serialized; just used to represent the virtual // non-final node w/ no arcs: private static final long NON_FINAL_END_NODE = 0;
If arc has this label then that arc is final/accepted
/** If arc has this label then that arc is final/accepted */
public static final int END_LABEL = -1; public final INPUT_TYPE inputType; // if non-null, this FST accepts the empty string and // produces this output T emptyOutput;
A BytesStore, used during building, or during reading when the FST is very large (more than 1 GB). If the FST is less than 1 GB then bytesArray is set instead.
/** A {@link BytesStore}, used during building, or during reading when * the FST is very large (more than 1 GB). If the FST is less than 1 * GB then bytesArray is set instead. */
final BytesStore bytes; private final FSTStore fstStore; private long startNode = -1; public final Outputs<T> outputs; private Arc<T> cachedRootArcs[];
Represents a single arc.
/** Represents a single arc. */
public static final class Arc<T> { public int label; public T output;
To node (ord or address)
/** To node (ord or address) */
public long target; byte flags; public T nextFinalOutput; // address (into the byte[]), or ord/address if label == END_LABEL long nextArc;
Where the first arc in the array starts; only valid if bytesPerArc != 0
/** Where the first arc in the array starts; only valid if * bytesPerArc != 0 */
public long posArcsStart;
Non-zero if this arc is part of an array, which means all arcs for the node are encoded with a fixed number of bytes so that we can random access by index. We do when there are enough arcs leaving one node. It wastes some bytes but gives faster lookups.
/** Non-zero if this arc is part of an array, which means all * arcs for the node are encoded with a fixed number of bytes so * that we can random access by index. We do when there are enough * arcs leaving one node. It wastes some bytes but gives faster * lookups. */
public int bytesPerArc;
Where we are in the array; only valid if bytesPerArc != 0, and the array has no holes. arcIdx = Integer.MIN_VALUE indicates that the arc is part of a direct array, addressed by label.
/** Where we are in the array; only valid if bytesPerArc != 0, and the array has no holes. * arcIdx = Integer.MIN_VALUE indicates that the arc is part of a direct array, addressed by * label. */
public int arcIdx;
How many arc, if bytesPerArc == 0. Otherwise, the size of the arc array. If the array is direct, this may include holes. Otherwise it is also how many arcs are in the array
/** How many arc, if bytesPerArc == 0. Otherwise, the size of the arc array. If the array is * direct, this may include holes. Otherwise it is also how many arcs are in the array */
public int numArcs;
Returns this
/** Returns this */
public Arc<T> copyFrom(Arc<T> other) { label = other.label; target = other.target; flags = other.flags; output = other.output; nextFinalOutput = other.nextFinalOutput; nextArc = other.nextArc; bytesPerArc = other.bytesPerArc; if (bytesPerArc != 0) { posArcsStart = other.posArcsStart; arcIdx = other.arcIdx; numArcs = other.numArcs; } return this; } boolean flag(int flag) { return FST.flag(flags, flag); } public boolean isLast() { return flag(BIT_LAST_ARC); } public boolean isFinal() { return flag(BIT_FINAL_ARC); } @Override public String toString() { StringBuilder b = new StringBuilder(); b.append(" target=").append(target); b.append(" label=0x").append(Integer.toHexString(label)); if (flag(BIT_FINAL_ARC)) { b.append(" final"); } if (flag(BIT_LAST_ARC)) { b.append(" last"); } if (flag(BIT_TARGET_NEXT)) { b.append(" targetNext"); } if (flag(BIT_STOP_NODE)) { b.append(" stop"); } if (flag(BIT_ARC_HAS_OUTPUT)) { b.append(" output=").append(output); } if (flag(BIT_ARC_HAS_FINAL_OUTPUT)) { b.append(" nextFinalOutput=").append(nextFinalOutput); } if (bytesPerArc != 0) { b.append(" arcArray(idx=").append(arcIdx).append(" of ").append(numArcs).append(")"); } return b.toString(); } }; private static boolean flag(int flags, int bit) { return (flags & bit) != 0; } private final int version; // make a new empty FST, for building; Builder invokes // this ctor FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits) { this.inputType = inputType; this.outputs = outputs; version = VERSION_CURRENT; fstStore = null; bytes = new BytesStore(bytesPageBits); // pad: ensure no node gets address 0 which is reserved to mean // the stop state w/ no arcs bytes.writeByte((byte) 0); emptyOutput = null; } public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
Load a previously saved FST.
/** Load a previously saved FST. */
public FST(DataInput in, Outputs<T> outputs) throws IOException { this(in, outputs, new OnHeapFSTStore(DEFAULT_MAX_BLOCK_BITS)); }
Load a previously saved FST; maxBlockBits allows you to control the size of the byte[] pages used to hold the FST bytes.
/** Load a previously saved FST; maxBlockBits allows you to * control the size of the byte[] pages used to hold the FST bytes. */
public FST(DataInput in, Outputs<T> outputs, FSTStore fstStore) throws IOException { bytes = null; this.fstStore = fstStore; this.outputs = outputs; // NOTE: only reads most recent format; we don't have // back-compat promise for FSTs (they are experimental): version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_START, VERSION_CURRENT); if (in.readByte() == 1) { // accepts empty string // 1 KB blocks: BytesStore emptyBytes = new BytesStore(10); int numBytes = in.readVInt(); emptyBytes.copyBytes(in, numBytes); // De-serialize empty-string output: BytesReader reader = emptyBytes.getReverseReader(); // NoOutputs uses 0 bytes when writing its output, // so we have to check here else BytesStore gets // angry: if (numBytes > 0) { reader.setPosition(numBytes-1); } emptyOutput = outputs.readFinalOutput(reader); } else { emptyOutput = null; } final byte t = in.readByte(); switch(t) { case 0: inputType = INPUT_TYPE.BYTE1; break; case 1: inputType = INPUT_TYPE.BYTE2; break; case 2: inputType = INPUT_TYPE.BYTE4; break; default: throw new IllegalStateException("invalid input type " + t); } startNode = in.readVLong(); long numBytes = in.readVLong(); this.fstStore.init(in, numBytes); cacheRootArcs(); } public INPUT_TYPE getInputType() { return inputType; } private long ramBytesUsed(Arc<T>[] arcs) { long size = 0; if (arcs != null) { size += RamUsageEstimator.shallowSizeOf(arcs); for (Arc<T> arc : arcs) { if (arc != null) { size += ARC_SHALLOW_RAM_BYTES_USED; if (arc.output != null && arc.output != outputs.getNoOutput()) { size += outputs.ramBytesUsed(arc.output); } if (arc.nextFinalOutput != null && arc.nextFinalOutput != outputs.getNoOutput()) { size += outputs.ramBytesUsed(arc.nextFinalOutput); } } } } return size; } private int cachedArcsBytesUsed; @Override public long ramBytesUsed() { long size = BASE_RAM_BYTES_USED; if (this.fstStore != null) { size += this.fstStore.ramBytesUsed(); } else { size += bytes.ramBytesUsed(); } size += cachedArcsBytesUsed; return size; } @Override public String toString() { return getClass().getSimpleName() + "(input=" + inputType + ",output=" + outputs; } void finish(long newStartNode) throws IOException { assert newStartNode <= bytes.getPosition(); if (startNode != -1) { throw new IllegalStateException("already finished"); } if (newStartNode == FINAL_END_NODE && emptyOutput != null) { newStartNode = 0; } startNode = newStartNode; bytes.finish(); cacheRootArcs(); } // Optionally caches first 128 labels @SuppressWarnings({"rawtypes","unchecked"}) private void cacheRootArcs() throws IOException { // We should only be called once per FST: assert cachedArcsBytesUsed == 0; final Arc<T> arc = new Arc<>(); getFirstArc(arc); if (targetHasArcs(arc)) { final BytesReader in = getBytesReader(); Arc<T>[] arcs = (Arc<T>[]) new Arc[0x80]; readFirstRealTargetArc(arc.target, arc, in); int count = 0; while(true) { assert arc.label != END_LABEL; if (arc.label < arcs.length) { arcs[arc.label] = new Arc<T>().copyFrom(arc); } else { break; } if (arc.isLast()) { break; } readNextRealArc(arc, in); count++; } int cacheRAM = (int) ramBytesUsed(arcs); // Don't cache if there are only a few arcs or if the cache would use > 20% RAM of the FST itself: if (count >= FIXED_ARRAY_NUM_ARCS_SHALLOW && cacheRAM < ramBytesUsed()/5) { cachedRootArcs = arcs; cachedArcsBytesUsed = cacheRAM; } } } public T getEmptyOutput() { return emptyOutput; } void setEmptyOutput(T v) throws IOException { if (emptyOutput != null) { emptyOutput = outputs.merge(emptyOutput, v); } else { emptyOutput = v; } } public void save(DataOutput out) throws IOException { if (startNode == -1) { throw new IllegalStateException("call finish first"); } CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); // TODO: really we should encode this as an arc, arriving // to the root node, instead of special casing here: if (emptyOutput != null) { // Accepts empty string out.writeByte((byte) 1); // Serialize empty-string output: RAMOutputStream ros = new RAMOutputStream(); outputs.writeFinalOutput(emptyOutput, ros); byte[] emptyOutputBytes = new byte[(int) ros.getFilePointer()]; ros.writeTo(emptyOutputBytes, 0); // reverse final int stopAt = emptyOutputBytes.length/2; int upto = 0; while(upto < stopAt) { final byte b = emptyOutputBytes[upto]; emptyOutputBytes[upto] = emptyOutputBytes[emptyOutputBytes.length-upto-1]; emptyOutputBytes[emptyOutputBytes.length-upto-1] = b; upto++; } out.writeVInt(emptyOutputBytes.length); out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); } else { out.writeByte((byte) 0); } final byte t; if (inputType == INPUT_TYPE.BYTE1) { t = 0; } else if (inputType == INPUT_TYPE.BYTE2) { t = 1; } else { t = 2; } out.writeByte(t); out.writeVLong(startNode); if (bytes != null) { long numBytes = bytes.getPosition(); out.writeVLong(numBytes); bytes.writeTo(out); } else { assert fstStore != null; fstStore.writeTo(out); } }
Writes an automaton to a file.
/** * Writes an automaton to a file. */
public void save(final Path path) throws IOException { try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(path))) { save(new OutputStreamDataOutput(os)); } }
Reads an automaton from a file.
/** * Reads an automaton from a file. */
public static <T> FST<T> read(Path path, Outputs<T> outputs) throws IOException { try (InputStream is = Files.newInputStream(path)) { return new FST<>(new InputStreamDataInput(new BufferedInputStream(is)), outputs); } } private void writeLabel(DataOutput out, int v) throws IOException { assert v >= 0: "v=" + v; if (inputType == INPUT_TYPE.BYTE1) { assert v <= 255: "v=" + v; out.writeByte((byte) v); } else if (inputType == INPUT_TYPE.BYTE2) { assert v <= 65535: "v=" + v; out.writeShort((short) v); } else { out.writeVInt(v); } }
Reads one BYTE1/2/4 label from the provided DataInput.
/** Reads one BYTE1/2/4 label from the provided {@link DataInput}. */
public int readLabel(DataInput in) throws IOException { final int v; if (inputType == INPUT_TYPE.BYTE1) { // Unsigned byte: v = in.readByte() & 0xFF; } else if (inputType == INPUT_TYPE.BYTE2) { // Unsigned short: v = in.readShort() & 0xFFFF; } else { v = in.readVInt(); } return v; }
returns true if the node at this address has any outgoing arcs
/** returns true if the node at this address has any * outgoing arcs */
public static<T> boolean targetHasArcs(Arc<T> arc) { return arc.target > 0; } // serializes new node by appending its bytes to the end // of the current byte[] long addNode(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn) throws IOException { T NO_OUTPUT = outputs.getNoOutput(); //System.out.println("FST.addNode pos=" + bytes.getPosition() + " numArcs=" + nodeIn.numArcs); if (nodeIn.numArcs == 0) { if (nodeIn.isFinal) { return FINAL_END_NODE; } else { return NON_FINAL_END_NODE; } } final long startAddress = builder.bytes.getPosition(); //System.out.println(" startAddr=" + startAddress); final boolean doFixedArray = shouldExpand(builder, nodeIn); if (doFixedArray) { //System.out.println(" fixedArray"); if (builder.reusedBytesPerArc.length < nodeIn.numArcs) { builder.reusedBytesPerArc = new int[ArrayUtil.oversize(nodeIn.numArcs, 1)]; } } builder.arcCount += nodeIn.numArcs; final int lastArc = nodeIn.numArcs-1; long lastArcStart = builder.bytes.getPosition(); int maxBytesPerArc = 0; for(int arcIdx=0; arcIdx < nodeIn.numArcs; arcIdx++) { final Builder.Arc<T> arc = nodeIn.arcs[arcIdx]; final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; int flags = 0; //System.out.println(" arc " + arcIdx + " label=" + arc.label + " -> target=" + target.node); if (arcIdx == lastArc) { flags += BIT_LAST_ARC; } if (builder.lastFrozenNode == target.node && !doFixedArray) { // TODO: for better perf (but more RAM used) we // could avoid this except when arc is "near" the // last arc: flags += BIT_TARGET_NEXT; } if (arc.isFinal) { flags += BIT_FINAL_ARC; if (arc.nextFinalOutput != NO_OUTPUT) { flags += BIT_ARC_HAS_FINAL_OUTPUT; } } else { assert arc.nextFinalOutput == NO_OUTPUT; } boolean targetHasArcs = target.node > 0; if (!targetHasArcs) { flags += BIT_STOP_NODE; } if (arc.output != NO_OUTPUT) { flags += BIT_ARC_HAS_OUTPUT; } builder.bytes.writeByte((byte) flags); writeLabel(builder.bytes, arc.label); // System.out.println(" write arc: label=" + (char) arc.label + " flags=" + flags + " target=" + target.node + " pos=" + bytes.getPosition() + " output=" + outputs.outputToString(arc.output)); if (arc.output != NO_OUTPUT) { outputs.write(arc.output, builder.bytes); //System.out.println(" write output"); } if (arc.nextFinalOutput != NO_OUTPUT) { //System.out.println(" write final output"); outputs.writeFinalOutput(arc.nextFinalOutput, builder.bytes); } if (targetHasArcs && (flags & BIT_TARGET_NEXT) == 0) { assert target.node > 0; //System.out.println(" write target"); builder.bytes.writeVLong(target.node); } // just write the arcs "like normal" on first pass, but record how many bytes each one took // and max byte size: if (doFixedArray) { builder.reusedBytesPerArc[arcIdx] = (int) (builder.bytes.getPosition() - lastArcStart); lastArcStart = builder.bytes.getPosition(); maxBytesPerArc = Math.max(maxBytesPerArc, builder.reusedBytesPerArc[arcIdx]); //System.out.println(" bytes=" + builder.reusedBytesPerArc[arcIdx]); } } // TODO: try to avoid wasteful cases: disable doFixedArray in that case /* * * LUCENE-4682: what is a fair heuristic here? * It could involve some of these: * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount? * 2. how much binSearch saves over scan: nodeIn.numArcs * 3. waste: numBytes vs numBytesExpanded * * the one below just looks at #3 if (doFixedArray) { // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor???? int numBytes = lastArcStart - startAddress; int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs; if (numBytesExpanded > numBytes*1.25) { doFixedArray = false; } } */ if (doFixedArray) { final int MAX_HEADER_SIZE = 11; // header(byte) + numArcs(vint) + numBytes(vint) assert maxBytesPerArc > 0; // 2nd pass just "expands" all arcs to take up a fixed byte size // If more than (1 / DIRECT_ARC_LOAD_FACTOR) of the "slots" would be occupied, write an arc // array that may have holes in it so that we can address the arcs directly by label without // binary search int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1; boolean writeDirectly = labelRange > 0 && labelRange < Builder.DIRECT_ARC_LOAD_FACTOR * nodeIn.numArcs; // TODO: LUCENE-8920 tighten up RAM usage. Until then, direct // arc encoding is disabled due to excessive memory usage found // in some cases writeDirectly = false; //System.out.println("write int @pos=" + (fixedArrayStart-4) + " numArcs=" + nodeIn.numArcs); // create the header // TODO: clean this up: or just rewind+reuse and deal with it byte header[] = new byte[MAX_HEADER_SIZE]; ByteArrayDataOutput bad = new ByteArrayDataOutput(header); // write a "false" first arc: if (writeDirectly) { bad.writeByte(ARCS_AS_ARRAY_WITH_GAPS); bad.writeVInt(labelRange); } else { bad.writeByte(ARCS_AS_ARRAY_PACKED); bad.writeVInt(nodeIn.numArcs); } bad.writeVInt(maxBytesPerArc); int headerLen = bad.getPosition(); final long fixedArrayStart = startAddress + headerLen; if (writeDirectly) { writeArrayWithGaps(builder, nodeIn, fixedArrayStart, maxBytesPerArc, labelRange); } else { writeArrayPacked(builder, nodeIn, fixedArrayStart, maxBytesPerArc); } // now write the header builder.bytes.writeBytes(startAddress, header, 0, headerLen); } final long thisNodeAddress = builder.bytes.getPosition()-1; builder.bytes.reverse(startAddress, thisNodeAddress); builder.nodeCount++; return thisNodeAddress; } private void writeArrayPacked(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc) { // expand the arcs in place, backwards long srcPos = builder.bytes.getPosition(); long destPos = fixedArrayStart + nodeIn.numArcs * maxBytesPerArc; assert destPos >= srcPos; if (destPos > srcPos) { builder.bytes.skipBytes((int) (destPos - srcPos)); for(int arcIdx = nodeIn.numArcs - 1; arcIdx >= 0; arcIdx--) { destPos -= maxBytesPerArc; srcPos -= builder.reusedBytesPerArc[arcIdx]; //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); if (srcPos != destPos) { //System.out.println(" copy len=" + builder.reusedBytesPerArc[arcIdx]); assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; builder.bytes.copyBytes(srcPos, destPos, builder.reusedBytesPerArc[arcIdx]); } } } } private void writeArrayWithGaps(Builder<T> builder, Builder.UnCompiledNode<T> nodeIn, long fixedArrayStart, int maxBytesPerArc, int labelRange) { // expand the arcs in place, backwards long srcPos = builder.bytes.getPosition(); long destPos = fixedArrayStart + labelRange * maxBytesPerArc; // if destPos == srcPos it means all the arcs were the same length, and the array of them is *already* direct assert destPos >= srcPos; if (destPos > srcPos) { builder.bytes.skipBytes((int) (destPos - srcPos)); int arcIdx = nodeIn.numArcs - 1; int firstLabel = nodeIn.arcs[0].label; int nextLabel = nodeIn.arcs[arcIdx].label; for (int directArcIdx = labelRange - 1; directArcIdx >= 0; directArcIdx--) { destPos -= maxBytesPerArc; if (directArcIdx == nextLabel - firstLabel) { int arcLen = builder.reusedBytesPerArc[arcIdx]; srcPos -= arcLen; //System.out.println(" direct pack idx=" + directArcIdx + " arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos + " label=" + nextLabel); if (srcPos != destPos) { //System.out.println(" copy len=" + builder.reusedBytesPerArc[arcIdx]); assert destPos > srcPos: "destPos=" + destPos + " srcPos=" + srcPos + " arcIdx=" + arcIdx + " maxBytesPerArc=" + maxBytesPerArc + " reusedBytesPerArc[arcIdx]=" + builder.reusedBytesPerArc[arcIdx] + " nodeIn.numArcs=" + nodeIn.numArcs; builder.bytes.copyBytes(srcPos, destPos, arcLen); if (arcIdx == 0) { break; } } --arcIdx; nextLabel = nodeIn.arcs[arcIdx].label; } else { assert directArcIdx > arcIdx; // mark this as a missing arc //System.out.println(" direct pack idx=" + directArcIdx + " no arc"); builder.bytes.writeByte(destPos, BIT_MISSING_ARC); } } } }
Fills virtual 'start' arc, ie, an empty incoming arc to the FST's start node
/** Fills virtual 'start' arc, ie, an empty incoming arc to * the FST's start node */
public Arc<T> getFirstArc(Arc<T> arc) { T NO_OUTPUT = outputs.getNoOutput(); if (emptyOutput != null) { arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; arc.nextFinalOutput = emptyOutput; if (emptyOutput != NO_OUTPUT) { arc.flags |= BIT_ARC_HAS_FINAL_OUTPUT; } } else { arc.flags = BIT_LAST_ARC; arc.nextFinalOutput = NO_OUTPUT; } arc.output = NO_OUTPUT; // If there are no nodes, ie, the FST only accepts the // empty string, then startNode is 0 arc.target = startNode; return arc; }
Follows the follow arc and reads the last arc of its target; this changes the provided arc (2nd arg) in-place and returns it.
Returns:Returns the second argument (arc).
/** Follows the <code>follow</code> arc and reads the last * arc of its target; this changes the provided * <code>arc</code> (2nd arg) in-place and returns it. * * @return Returns the second argument * (<code>arc</code>). */
public Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { //System.out.println("readLast"); if (!targetHasArcs(follow)) { //System.out.println(" end node"); assert follow.isFinal(); arc.label = END_LABEL; arc.target = FINAL_END_NODE; arc.output = follow.nextFinalOutput; arc.flags = BIT_LAST_ARC; return arc; } else { in.setPosition(follow.target); final byte b = in.readByte(); if (b == ARCS_AS_ARRAY_PACKED || b == ARCS_AS_ARRAY_WITH_GAPS) { // array: jump straight to end arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); arc.posArcsStart = in.getPosition(); if (b == ARCS_AS_ARRAY_WITH_GAPS) { arc.arcIdx = Integer.MIN_VALUE; arc.nextArc = arc.posArcsStart - (arc.numArcs - 1) * arc.bytesPerArc; } else { arc.arcIdx = arc.numArcs - 2; } } else { arc.flags = b; // non-array: linear scan arc.bytesPerArc = 0; //System.out.println(" scan"); while(!arc.isLast()) { // skip this arc: readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { outputs.skipOutput(in); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.skipFinalOutput(in); } if (arc.flag(BIT_STOP_NODE)) { } else if (arc.flag(BIT_TARGET_NEXT)) { } else { readUnpackedNodeTarget(in); } arc.flags = in.readByte(); } // Undo the byte flags we read: in.skipBytes(-1); arc.nextArc = in.getPosition(); } readNextRealArc(arc, in); assert arc.isLast(); return arc; } } private long readUnpackedNodeTarget(BytesReader in) throws IOException { return in.readVLong(); }
Follow the follow arc and read the first arc of its target; this changes the provided arc (2nd arg) in-place and returns it.
Returns:Returns the second argument (arc).
/** * Follow the <code>follow</code> arc and read the first arc of its target; * this changes the provided <code>arc</code> (2nd arg) in-place and returns * it. * * @return Returns the second argument (<code>arc</code>). */
public Arc<T> readFirstTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { //int pos = address; //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); if (follow.isFinal()) { // Insert "fake" final first arc: arc.label = END_LABEL; arc.output = follow.nextFinalOutput; arc.flags = BIT_FINAL_ARC; if (follow.target <= 0) { arc.flags |= BIT_LAST_ARC; } else { // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; } arc.target = FINAL_END_NODE; //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); return arc; } else { return readFirstRealTargetArc(follow.target, arc, in); } } public Arc<T> readFirstRealTargetArc(long node, Arc<T> arc, final BytesReader in) throws IOException { final long address = node; in.setPosition(address); //System.out.println(" flags=" + arc.flags); byte flags = in.readByte(); if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) { //System.out.println(" fixedArray"); // this is first arc in a fixed-array arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); if (flags == ARCS_AS_ARRAY_PACKED) { arc.arcIdx = -1; } else { arc.arcIdx = Integer.MIN_VALUE; } arc.nextArc = arc.posArcsStart = in.getPosition(); //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); } else { //arc.flags = b; arc.nextArc = address; arc.bytesPerArc = 0; } return readNextRealArc(arc, in); }
Checks if arc's target state is in expanded (or vector) format.
Returns:Returns true if arc points to a state in an expanded array format.
/** * Checks if <code>arc</code>'s target state is in expanded (or vector) format. * * @return Returns <code>true</code> if <code>arc</code> points to a state in an * expanded array format. */
boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException { if (!targetHasArcs(follow)) { return false; } else { in.setPosition(follow.target); byte flags = in.readByte(); return flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS; } }
In-place read; returns the arc.
/** In-place read; returns the arc. */
public Arc<T> readNextArc(Arc<T> arc, BytesReader in) throws IOException { if (arc.label == END_LABEL) { // This was a fake inserted "final" arc if (arc.nextArc <= 0) { throw new IllegalArgumentException("cannot readNextArc when arc.isLast()=true"); } return readFirstRealTargetArc(arc.nextArc, arc, in); } else { return readNextRealArc(arc, in); } }
Peeks at next arc's label; does not alter arc. Do not call this if arc.isLast()!
/** Peeks at next arc's label; does not alter arc. Do * not call this if arc.isLast()! */
public int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException { assert !arc.isLast(); if (arc.label == END_LABEL) { //System.out.println(" nextArc fake " + //arc.nextArc); long pos = arc.nextArc; in.setPosition(pos); final byte flags = in.readByte(); if (flags == ARCS_AS_ARRAY_PACKED || flags == ARCS_AS_ARRAY_WITH_GAPS) { //System.out.println(" nextArc fixed array"); in.readVInt(); // Skip bytesPerArc: in.readVInt(); } else { in.setPosition(pos); } // skip flags in.readByte(); } else { if (arc.bytesPerArc != 0) { //System.out.println(" nextArc real array"); // arcs are in an array if (arc.arcIdx >= 0) { in.setPosition(arc.posArcsStart); // point at next arc, -1 to skip flags in.skipBytes((1 + arc.arcIdx) * arc.bytesPerArc + 1); } else { in.setPosition(arc.nextArc); byte flags = in.readByte(); // skip missing arcs while (flag(flags, BIT_MISSING_ARC)) { in.skipBytes(arc.bytesPerArc - 1); flags = in.readByte(); } } } else { // arcs are packed //System.out.println(" nextArc real packed"); // -1 to skip flags in.setPosition(arc.nextArc - 1); } } return readLabel(in); }
Never returns null, but you should never call this if arc.isLast() is true.
/** Never returns null, but you should never call this if * arc.isLast() is true. */
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException { // TODO: can't assert this because we call from readFirstArc // assert !flag(arc.flags, BIT_LAST_ARC); // this is a continuing arc in a fixed array if (arc.bytesPerArc != 0) { // arcs are in an array if (arc.arcIdx > Integer.MIN_VALUE) { arc.arcIdx++; assert arc.arcIdx < arc.numArcs; in.setPosition(arc.posArcsStart - arc.arcIdx * arc.bytesPerArc); arc.flags = in.readByte(); } else { assert arc.nextArc <= arc.posArcsStart && arc.nextArc > arc.posArcsStart - arc.numArcs * arc.bytesPerArc; in.setPosition(arc.nextArc); arc.flags = in.readByte(); while (flag(arc.flags, BIT_MISSING_ARC)) { // skip empty arcs arc.nextArc -= arc.bytesPerArc; in.skipBytes(arc.bytesPerArc - 1); arc.flags = in.readByte(); } } } else { // arcs are packed in.setPosition(arc.nextArc); arc.flags = in.readByte(); } arc.label = readLabel(in); if (arc.flag(BIT_ARC_HAS_OUTPUT)) { arc.output = outputs.read(in); } else { arc.output = outputs.getNoOutput(); } if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { arc.nextFinalOutput = outputs.readFinalOutput(in); } else { arc.nextFinalOutput = outputs.getNoOutput(); } if (arc.flag(BIT_STOP_NODE)) { if (arc.flag(BIT_FINAL_ARC)) { arc.target = FINAL_END_NODE; } else { arc.target = NON_FINAL_END_NODE; } if (arc.bytesPerArc == 0) { arc.nextArc = in.getPosition(); } else { arc.nextArc -= arc.bytesPerArc; } } else if (arc.flag(BIT_TARGET_NEXT)) { arc.nextArc = in.getPosition(); // TODO: would be nice to make this lazy -- maybe // caller doesn't need the target and is scanning arcs... if (!arc.flag(BIT_LAST_ARC)) { if (arc.bytesPerArc == 0) { // must scan seekToNextNode(in); } else { in.setPosition(arc.posArcsStart); in.skipBytes(arc.bytesPerArc * arc.numArcs); } } arc.target = in.getPosition(); } else { arc.target = readUnpackedNodeTarget(in); if (arc.bytesPerArc > 0 && arc.arcIdx == Integer.MIN_VALUE) { // nextArc was pointing to *this* arc when we entered; advance to the next // if it is a missing arc, we will skip it later arc.nextArc -= arc.bytesPerArc; } else { // in list and fixed table encodings, the next arc always follows this one arc.nextArc = in.getPosition(); } } return arc; } // LUCENE-5152: called only from asserts, to validate that the // non-cached arc lookup would produce the same result, to // catch callers that illegally modify shared structures with // the result (we shallow-clone the Arc itself, but e.g. a BytesRef // output is still shared): private boolean assertRootCachedArc(int label, Arc<T> cachedArc) throws IOException { Arc<T> arc = new Arc<>(); getFirstArc(arc); BytesReader in = getBytesReader(); Arc<T> result = findTargetArc(label, arc, arc, in, false); if (result == null) { assert cachedArc == null; } else { assert cachedArc != null; assert cachedArc.arcIdx == result.arcIdx; assert cachedArc.bytesPerArc == result.bytesPerArc; assert cachedArc.flags == result.flags; assert cachedArc.label == result.label; if (cachedArc.bytesPerArc == 0 || cachedArc.arcIdx == Integer.MIN_VALUE) { // in the sparse array case, this value is not valid, so don't assert it assert cachedArc.nextArc == result.nextArc; } assert cachedArc.nextFinalOutput.equals(result.nextFinalOutput); assert cachedArc.numArcs == result.numArcs; assert cachedArc.output.equals(result.output); assert cachedArc.posArcsStart == result.posArcsStart; assert cachedArc.target == result.target; } return true; } // TODO: could we somehow [partially] tableize arc lookups // like automaton?
Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc was not found, else the incoming arc.
/** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */
public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in) throws IOException { return findTargetArc(labelToMatch, follow, arc, in, true); }
Finds an arc leaving the incoming arc, replacing the arc in place. This returns null if the arc was not found, else the incoming arc.
/** Finds an arc leaving the incoming arc, replacing the arc in place. * This returns null if the arc was not found, else the incoming arc. */
private Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesReader in, boolean useRootArcCache) throws IOException { if (labelToMatch == END_LABEL) { if (follow.isFinal()) { if (follow.target <= 0) { arc.flags = BIT_LAST_ARC; } else { arc.flags = 0; // NOTE: nextArc is a node (not an address!) in this case: arc.nextArc = follow.target; } arc.output = follow.nextFinalOutput; arc.label = END_LABEL; return arc; } else { return null; } } // Short-circuit if this arc is in the root arc cache: if (useRootArcCache && cachedRootArcs != null && follow.target == startNode && labelToMatch < cachedRootArcs.length) { final Arc<T> result = cachedRootArcs[labelToMatch]; // LUCENE-5152: detect tricky cases where caller // modified previously returned cached root-arcs: assert assertRootCachedArc(labelToMatch, result); if (result == null) { return null; } else { arc.copyFrom(result); return arc; } } if (!targetHasArcs(follow)) { return null; } in.setPosition(follow.target); // System.out.println("fta label=" + (char) labelToMatch); byte flags = in.readByte(); if (flags == ARCS_AS_ARRAY_WITH_GAPS) { arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); arc.posArcsStart = in.getPosition(); // Array is direct; address by label in.skipBytes(1); int firstLabel = readLabel(in); int arcPos = labelToMatch - firstLabel; if (arcPos == 0) { arc.nextArc = arc.posArcsStart; } else if (arcPos > 0) { if (arcPos >= arc.numArcs) { return null; } in.setPosition(arc.posArcsStart - arc.bytesPerArc * arcPos); flags = in.readByte(); if (flag(flags, BIT_MISSING_ARC)) { return null; } // point to flags that we just read arc.nextArc = in.getPosition() + 1; } else { return null; } arc.arcIdx = Integer.MIN_VALUE; return readNextRealArc(arc, in); } else if (flags == ARCS_AS_ARRAY_PACKED) { arc.numArcs = in.readVInt(); arc.bytesPerArc = in.readVInt(); arc.posArcsStart = in.getPosition(); // Array is sparse; do binary search: int low = 0; int high = arc.numArcs - 1; while (low <= high) { //System.out.println(" cycle"); int mid = (low + high) >>> 1; // +1 to skip over flags in.setPosition(arc.posArcsStart - (arc.bytesPerArc * mid + 1)); int midLabel = readLabel(in); final int cmp = midLabel - labelToMatch; if (cmp < 0) { low = mid + 1; } else if (cmp > 0) { high = mid - 1; } else { arc.arcIdx = mid - 1; //System.out.println(" found!"); return readNextRealArc(arc, in); } } return null; } // Linear scan readFirstRealTargetArc(follow.target, arc, in); while(true) { //System.out.println(" non-bs cycle"); // TODO: we should fix this code to not have to create // object for the output of every arc we scan... only // for the matching arc, if found if (arc.label == labelToMatch) { //System.out.println(" found!"); return arc; } else if (arc.label > labelToMatch) { return null; } else if (arc.isLast()) { return null; } else { readNextRealArc(arc, in); } } } private void seekToNextNode(BytesReader in) throws IOException { while(true) { final int flags = in.readByte(); readLabel(in); if (flag(flags, BIT_ARC_HAS_OUTPUT)) { outputs.skipOutput(in); } if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { outputs.skipFinalOutput(in); } if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { readUnpackedNodeTarget(in); } if (flag(flags, BIT_LAST_ARC)) { return; } } }
Nodes will be expanded if their depth (distance from the root node) is <= this value and their number of arcs is >= FST<T>.FIXED_ARRAY_NUM_ARCS_SHALLOW.

Fixed array consumes more RAM but enables binary search on the arcs (instead of a linear scan) on lookup by arc label.

See Also:
Returns:true if node should be stored in an expanded (array) form.
/** * Nodes will be expanded if their depth (distance from the root node) is * &lt;= this value and their number of arcs is &gt;= * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. * * <p> * Fixed array consumes more RAM but enables binary search on the arcs * (instead of a linear scan) on lookup by arc label. * * @return <code>true</code> if <code>node</code> should be stored in an * expanded (array) form. * * @see #FIXED_ARRAY_NUM_ARCS_DEEP * @see Builder.UnCompiledNode#depth */
private boolean shouldExpand(Builder<T> builder, Builder.UnCompiledNode<T> node) { return builder.allowArrayArcs && ((node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP); }
Returns a BytesReader for this FST, positioned at position 0.
/** Returns a {@link BytesReader} for this FST, positioned at * position 0. */
public BytesReader getBytesReader() { if (this.fstStore != null) { return this.fstStore.getReverseBytesReader(); } else { return bytes.getReverseReader(); } }
Reads bytes stored in an FST.
/** Reads bytes stored in an FST. */
public static abstract class BytesReader extends DataInput {
Get current read position.
/** Get current read position. */
public abstract long getPosition();
Set current read position.
/** Set current read position. */
public abstract void setPosition(long pos);
Returns true if this reader uses reversed bytes under-the-hood.
/** Returns true if this reader uses reversed bytes * under-the-hood. */
public abstract boolean reversed(); } /* public void countSingleChains() throws IOException { // TODO: must assert this FST was built with // "willRewrite" final List<ArcAndState<T>> queue = new ArrayList<>(); // TODO: use bitset to not revisit nodes already // visited FixedBitSet seen = new FixedBitSet(1+nodeCount); int saved = 0; queue.add(new ArcAndState<T>(getFirstArc(new Arc<T>()), new IntsRef())); Arc<T> scratchArc = new Arc<>(); while(queue.size() > 0) { //System.out.println("cycle size=" + queue.size()); //for(ArcAndState<T> ent : queue) { // System.out.println(" " + Util.toBytesRef(ent.chain, new BytesRef())); // } final ArcAndState<T> arcAndState = queue.get(queue.size()-1); seen.set(arcAndState.arc.node); final BytesRef br = Util.toBytesRef(arcAndState.chain, new BytesRef()); if (br.length > 0 && br.bytes[br.length-1] == -1) { br.length--; } //System.out.println(" top node=" + arcAndState.arc.target + " chain=" + br.utf8ToString()); if (targetHasArcs(arcAndState.arc) && !seen.get(arcAndState.arc.target)) { // push readFirstTargetArc(arcAndState.arc, scratchArc); //System.out.println(" push label=" + (char) scratchArc.label); //System.out.println(" tonode=" + scratchArc.target + " last?=" + scratchArc.isLast()); final IntsRef chain = IntsRef.deepCopyOf(arcAndState.chain); chain.grow(1+chain.length); // TODO //assert scratchArc.label != END_LABEL; chain.ints[chain.length] = scratchArc.label; chain.length++; if (scratchArc.isLast()) { if (scratchArc.target != -1 && inCounts[scratchArc.target] == 1) { //System.out.println(" append"); } else { if (arcAndState.chain.length > 1) { saved += chain.length-2; try { System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef()).utf8ToString()); } catch (AssertionError ae) { System.out.println("chain: " + Util.toBytesRef(chain, new BytesRef())); } } chain.length = 0; } } else { //System.out.println(" reset"); if (arcAndState.chain.length > 1) { saved += arcAndState.chain.length-2; try { System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); } catch (AssertionError ae) { System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef())); } } if (scratchArc.target != -1 && inCounts[scratchArc.target] != 1) { chain.length = 0; } else { chain.ints[0] = scratchArc.label; chain.length = 1; } } // TODO: instead of new Arc() we can re-use from // a by-depth array queue.add(new ArcAndState<T>(new Arc<T>().copyFrom(scratchArc), chain)); } else if (!arcAndState.arc.isLast()) { // next readNextArc(arcAndState.arc); //System.out.println(" next label=" + (char) arcAndState.arc.label + " len=" + arcAndState.chain.length); if (arcAndState.chain.length != 0) { arcAndState.chain.ints[arcAndState.chain.length-1] = arcAndState.arc.label; } } else { if (arcAndState.chain.length > 1) { saved += arcAndState.chain.length-2; System.out.println("chain: " + Util.toBytesRef(arcAndState.chain, new BytesRef()).utf8ToString()); } // pop //System.out.println(" pop"); queue.remove(queue.size()-1); while(queue.size() > 0 && queue.get(queue.size()-1).arc.isLast()) { queue.remove(queue.size()-1); } if (queue.size() > 0) { final ArcAndState<T> arcAndState2 = queue.get(queue.size()-1); readNextArc(arcAndState2.arc); //System.out.println(" read next=" + (char) arcAndState2.arc.label + " queue=" + queue.size()); assert arcAndState2.arc.label != END_LABEL; if (arcAndState2.chain.length != 0) { arcAndState2.chain.ints[arcAndState2.chain.length-1] = arcAndState2.arc.label; } } } } System.out.println("TOT saved " + saved); } */ }