/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.automaton;


import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.lucene.util.UnicodeUtil;

Class to construct DFAs that match a word within some edit distance.

Implements the algorithm described in: Schulz and Mihov: Fast String Correction with Levenshtein Automata

@lucene.experimental
/** * Class to construct DFAs that match a word within some edit distance. * <p> * Implements the algorithm described in: * Schulz and Mihov: Fast String Correction with Levenshtein Automata * @lucene.experimental */
public class LevenshteinAutomata {
Maximum edit distance this class can generate an automaton for. @lucene.internal
/** Maximum edit distance this class can generate an automaton for. * @lucene.internal */
public static final int MAXIMUM_SUPPORTED_DISTANCE = 2; /* input word */ final int word[]; /* the automata alphabet. */ final int alphabet[]; /* the maximum symbol in the alphabet (e.g. 255 for UTF-8 or 10FFFF for UTF-32) */ final int alphaMax; /* the ranges outside of alphabet */ final int rangeLower[]; final int rangeUpper[]; int numRanges = 0; ParametricDescription descriptions[];
Create a new LevenshteinAutomata for some input String. Optionally count transpositions as a primitive edit.
/** * Create a new LevenshteinAutomata for some input String. * Optionally count transpositions as a primitive edit. */
public LevenshteinAutomata(String input, boolean withTranspositions) { this(codePoints(input), Character.MAX_CODE_POINT, withTranspositions); }
Expert: specify a custom maximum possible symbol (alphaMax); default is Character.MAX_CODE_POINT.
/** * Expert: specify a custom maximum possible symbol * (alphaMax); default is Character.MAX_CODE_POINT. */
public LevenshteinAutomata(int[] word, int alphaMax, boolean withTranspositions) { this.word = word; this.alphaMax = alphaMax; // calculate the alphabet SortedSet<Integer> set = new TreeSet<>(); for (int i = 0; i < word.length; i++) { int v = word[i]; if (v > alphaMax) { throw new IllegalArgumentException("alphaMax exceeded by symbol " + v + " in word"); } set.add(v); } alphabet = new int[set.size()]; Iterator<Integer> iterator = set.iterator(); for (int i = 0; i < alphabet.length; i++) alphabet[i] = iterator.next(); rangeLower = new int[alphabet.length + 2]; rangeUpper = new int[alphabet.length + 2]; // calculate the unicode range intervals that exclude the alphabet // these are the ranges for all unicode characters not in the alphabet int lower = 0; for (int i = 0; i < alphabet.length; i++) { int higher = alphabet[i]; if (higher > lower) { rangeLower[numRanges] = lower; rangeUpper[numRanges] = higher - 1; numRanges++; } lower = higher + 1; } /* add the final endpoint */ if (lower <= alphaMax) { rangeLower[numRanges] = lower; rangeUpper[numRanges] = alphaMax; numRanges++; } descriptions = new ParametricDescription[] { null, /* for n=0, we do not need to go through the trouble */ withTranspositions ? new Lev1TParametricDescription(word.length) : new Lev1ParametricDescription(word.length), withTranspositions ? new Lev2TParametricDescription(word.length) : new Lev2ParametricDescription(word.length), }; } private static int[] codePoints(String input) { int length = Character.codePointCount(input, 0, input.length()); int word[] = new int[length]; for (int i = 0, j = 0, cp = 0; i < input.length(); i += Character.charCount(cp)) { word[j++] = cp = input.codePointAt(i); } return word; }
Compute a DFA that accepts all strings within an edit distance of n.

All automata have the following properties:

  • They are deterministic (DFA).
  • There are no transitions to dead states.
  • They are not minimal (some transitions could be combined).
/** * Compute a DFA that accepts all strings within an edit distance of <code>n</code>. * <p> * All automata have the following properties: * <ul> * <li>They are deterministic (DFA). * <li>There are no transitions to dead states. * <li>They are not minimal (some transitions could be combined). * </ul> */
public Automaton toAutomaton(int n) { return toAutomaton(n, ""); }
Compute a DFA that accepts all strings within an edit distance of n, matching the specified exact prefix.

All automata have the following properties:

  • They are deterministic (DFA).
  • There are no transitions to dead states.
  • They are not minimal (some transitions could be combined).
/** * Compute a DFA that accepts all strings within an edit distance of <code>n</code>, * matching the specified exact prefix. * <p> * All automata have the following properties: * <ul> * <li>They are deterministic (DFA). * <li>There are no transitions to dead states. * <li>They are not minimal (some transitions could be combined). * </ul> */
public Automaton toAutomaton(int n, String prefix) { assert prefix != null; if (n == 0) { return Automata.makeString(prefix + UnicodeUtil.newString(word, 0, word.length)); } if (n >= descriptions.length) return null; final int range = 2*n+1; ParametricDescription description = descriptions[n]; // the number of states is based on the length of the word and n final int numStates = description.size(); final int numTransitions = numStates * Math.min(1 + 2 * n, alphabet.length); final int prefixStates = prefix != null ? prefix.codePointCount(0, prefix.length()) : 0; final Automaton a = new Automaton(numStates + prefixStates, numTransitions); int lastState; if (prefix != null) { // Insert prefix lastState = a.createState(); for (int i = 0, cp = 0; i < prefix.length(); i += Character.charCount(cp)) { int state = a.createState(); cp = prefix.codePointAt(i); a.addTransition(lastState, state, cp, cp); lastState = state; } } else { lastState = a.createState(); } int stateOffset = lastState; a.setAccept(lastState, description.isAccept(0)); // create all states, and mark as accept states if appropriate for (int i = 1; i < numStates; i++) { int state = a.createState(); a.setAccept(state, description.isAccept(i)); } // TODO: this creates bogus states/transitions (states are final, have self loops, and can't be reached from an init state) // create transitions from state to state for (int k = 0; k < numStates; k++) { final int xpos = description.getPosition(k); if (xpos < 0) continue; final int end = xpos + Math.min(word.length - xpos, range); for (int x = 0; x < alphabet.length; x++) { final int ch = alphabet[x]; // get the characteristic vector at this position wrt ch final int cvec = getVector(ch, xpos, end); int dest = description.transition(k, xpos, cvec); if (dest >= 0) { a.addTransition(stateOffset+k, stateOffset+dest, ch); } } // add transitions for all other chars in unicode // by definition, their characteristic vectors are always 0, // because they do not exist in the input string. int dest = description.transition(k, xpos, 0); // by definition if (dest >= 0) { for (int r = 0; r < numRanges; r++) { a.addTransition(stateOffset+k, stateOffset+dest, rangeLower[r], rangeUpper[r]); } } } a.finishState(); assert a.isDeterministic(); return a; }
Get the characteristic vector X(x, V) where V is substring(pos, end)
/** * Get the characteristic vector <code>X(x, V)</code> * where V is <code>substring(pos, end)</code> */
int getVector(int x, int pos, int end) { int vector = 0; for (int i = pos; i < end; i++) { vector <<= 1; if (word[i] == x) vector |= 1; } return vector; }
A ParametricDescription describes the structure of a Levenshtein DFA for some degree n.

There are four components of a parametric description, all parameterized on the length of the word w:

  1. The number of states: size()
  2. The set of final states: isAccept(int)
  3. The transition function: transition(int, int, int)
  4. Minimal boundary function: getPosition(int)
/** * A ParametricDescription describes the structure of a Levenshtein DFA for some degree n. * <p> * There are four components of a parametric description, all parameterized on the length * of the word <code>w</code>: * <ol> * <li>The number of states: {@link #size()} * <li>The set of final states: {@link #isAccept(int)} * <li>The transition function: {@link #transition(int, int, int)} * <li>Minimal boundary function: {@link #getPosition(int)} * </ol> */
static abstract class ParametricDescription { protected final int w; protected final int n; private final int[] minErrors; ParametricDescription(int w, int n, int[] minErrors) { this.w = w; this.n = n; this.minErrors = minErrors; }
Return the number of states needed to compute a Levenshtein DFA
/** * Return the number of states needed to compute a Levenshtein DFA */
int size() { return minErrors.length * (w+1); };
Returns true if the state in any Levenshtein DFA is an accept state (final state).
/** * Returns true if the <code>state</code> in any Levenshtein DFA is an accept state (final state). */
boolean isAccept(int absState) { // decode absState -> state, offset int state = absState/(w+1); int offset = absState%(w+1); assert offset >= 0; return w - offset + minErrors[state] <= n; }
Returns the position in the input word for a given state. This is the minimal boundary for the state.
/** * Returns the position in the input word for a given <code>state</code>. * This is the minimal boundary for the state. */
int getPosition(int absState) { return absState % (w+1); }
Returns the state number for a transition from the given state, assuming position and characteristic vector vector
/** * Returns the state number for a transition from the given <code>state</code>, * assuming <code>position</code> and characteristic vector <code>vector</code> */
abstract int transition(int state, int position, int vector); private final static long[] MASKS = new long[] {0x1,0x3,0x7,0xf, 0x1f,0x3f,0x7f,0xff, 0x1ff,0x3ff,0x7ff,0xfff, 0x1fff,0x3fff,0x7fff,0xffff, 0x1ffff,0x3ffff,0x7ffff,0xfffff, 0x1fffff,0x3fffff,0x7fffff,0xffffff, 0x1ffffff,0x3ffffff,0x7ffffff,0xfffffff, 0x1fffffff,0x3fffffff,0x7fffffffL,0xffffffffL, 0x1ffffffffL,0x3ffffffffL,0x7ffffffffL,0xfffffffffL, 0x1fffffffffL,0x3fffffffffL,0x7fffffffffL,0xffffffffffL, 0x1ffffffffffL,0x3ffffffffffL,0x7ffffffffffL,0xfffffffffffL, 0x1fffffffffffL,0x3fffffffffffL,0x7fffffffffffL,0xffffffffffffL, 0x1ffffffffffffL,0x3ffffffffffffL,0x7ffffffffffffL,0xfffffffffffffL, 0x1fffffffffffffL,0x3fffffffffffffL,0x7fffffffffffffL,0xffffffffffffffL, 0x1ffffffffffffffL,0x3ffffffffffffffL,0x7ffffffffffffffL,0xfffffffffffffffL, 0x1fffffffffffffffL,0x3fffffffffffffffL,0x7fffffffffffffffL}; protected int unpack(long[] data, int index, int bitsPerValue) { final long bitLoc = bitsPerValue * index; final int dataLoc = (int) (bitLoc >> 6); final int bitStart = (int) (bitLoc & 63); //System.out.println("index=" + index + " dataLoc=" + dataLoc + " bitStart=" + bitStart + " bitsPerV=" + bitsPerValue); if (bitStart + bitsPerValue <= 64) { // not split return (int) ((data[dataLoc] >> bitStart) & MASKS[bitsPerValue-1]); } else { // split final int part = 64-bitStart; return (int) (((data[dataLoc] >> bitStart) & MASKS[part-1]) + ((data[1+dataLoc] & MASKS[bitsPerValue-part-1]) << part)); } } } }