org.apache.lucene/lucene-suggest/8.2.0 : org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java

FuzzySuggester
http://lucene.apache.org/lucene-parent/lucene-suggest: Lucene Suggest Module (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; // javadocs
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

Implements a fuzzy AnalyzingSuggester. The similarity measurement is based on the Damerau-Levenshtein (optimal string alignment) algorithm, though you can explicitly choose classic Levenshtein by passing false
for the transpositions parameter.
 At most, this query will match terms up to 2 edits. Higher distances are not supported. Note that the fuzzy distance is measured in "byte space" on the bytes returned by the TokenStream's TermToBytesRefAttribute, usually UTF8. By default the analyzed bytes must be at least 3 DEFAULT_MIN_FUZZY_LENGTH bytes before any edits are considered. Furthermore, the first 1 DEFAULT_NON_FUZZY_PREFIX byte is not allowed to be edited. We allow up to 1 (@link #DEFAULT_MAX_EDITS} edit. If unicodeAware parameter in the constructor is set to true, maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code points (actual letters) instead of bytes. 

NOTE: This suggester does not boost suggestions that
required no edits over suggestions that did require
edits.  This is a known limitation.

Note: complex query analyzers can have a significant impact on the lookup
performance. It's recommended to not use analyzers that drop or inject terms
like synonyms to keep the complexity of the prefix intersection low for good
lookup performance. At index time, complex analyzers can safely be used.

@lucene.experimental /**
 * Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
 * based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
 * you can explicitly choose classic Levenshtein by passing <code>false</code>
 * for the <code>transpositions</code> parameter.
 * <p>
 * At most, this query will match terms up to
 * {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
 * edits. Higher distances are not supported.  Note that the
 * fuzzy distance is measured in "byte space" on the bytes
 * returned by the {@link TokenStream}'s {@link
 * TermToBytesRefAttribute}, usually UTF8.  By default
 * the analyzed bytes must be at least 3 {@link
 * #DEFAULT_MIN_FUZZY_LENGTH} bytes before any edits are
 * considered.  Furthermore, the first 1 {@link
 * #DEFAULT_NON_FUZZY_PREFIX} byte is not allowed to be
 * edited.  We allow up to 1 (@link
 * #DEFAULT_MAX_EDITS} edit.
 * If {@link #unicodeAware} parameter in the constructor is set to true, maxEdits,
 * minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code 
 * points (actual letters) instead of bytes. 
 *
 * <p>
 * NOTE: This suggester does not boost suggestions that
 * required no edits over suggestions that did require
 * edits.  This is a known limitation.
 *
 * <p>
 * Note: complex query analyzers can have a significant impact on the lookup
 * performance. It's recommended to not use analyzers that drop or inject terms
 * like synonyms to keep the complexity of the prefix intersection low for good
 * lookup performance. At index time, complex analyzers can safely be used.
 * </p>
 *
 * @lucene.experimental
 */
public final class FuzzySuggester extends AnalyzingSuggester {
  private final int maxEdits;
  private final boolean transpositions;
  private final int nonFuzzyPrefix;
  private final int minFuzzyLength;
  private final boolean unicodeAware;

  Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix 
 parameters in Unicode code points (actual letters)
 instead of bytes. /** Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix 
   *  parameters in Unicode code points (actual letters)
   *  instead of bytes. */
  public static final boolean DEFAULT_UNICODE_AWARE = false;

  The default minimum length of the key passed to Lookup.lookup before any edits are allowed. /**
   * The default minimum length of the key passed to {@link
   * #lookup} before any edits are allowed.
   */
  public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;

  The default prefix length where edits are not allowed.
/**
   * The default prefix length where edits are not allowed.
   */
  public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
  
  The default maximum number of edits for fuzzy
suggestions.
/**
   * The default maximum number of edits for fuzzy
   * suggestions.
   */
  public static final int DEFAULT_MAX_EDITS = 1;
  
  The default transposition value passed to LevenshteinAutomata /**
   * The default transposition value passed to {@link LevenshteinAutomata}
   */
  public static final boolean DEFAULT_TRANSPOSITIONS = true;

  Creates a FuzzySuggester instance initialized with default values. 
Params: analyzer – the analyzer used for this suggester/**
   * Creates a {@link FuzzySuggester} instance initialized with default values.
   * 
   * @param analyzer the analyzer used for this suggester
   */
  public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) {
    this(tempDir, tempFileNamePrefix, analyzer, analyzer);
  }
  
  Creates a FuzzySuggester instance with an index and query analyzer initialized with default values. 
Params: indexAnalyzer – 
          Analyzer that will be used for analyzing suggestions while building the index.
queryAnalyzer – 
          Analyzer that will be used for analyzing query text during lookup/**
   * Creates a {@link FuzzySuggester} instance with an index and query analyzer initialized with default values.
   * 
   * @param indexAnalyzer
   *           Analyzer that will be used for analyzing suggestions while building the index.
   * @param queryAnalyzer
   *           Analyzer that will be used for analyzing query text during lookup
   */
  public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
         DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE);
  }

  Creates a FuzzySuggester instance. 
Params: indexAnalyzer – Analyzer that will be used for
       analyzing suggestions while building the index.
queryAnalyzer – Analyzer that will be used for
       analyzing query text during lookup
options – see AnalyzingSuggester.EXACT_FIRST, AnalyzingSuggester.PRESERVE_SEP
maxSurfaceFormsPerAnalyzedForm – Maximum number of
       surface forms to keep for a single analyzed form.
       When there are too many surface forms we discard the
       lowest weighted ones.
maxGraphExpansions – Maximum number of graph paths
       to expand from the analyzed form.  Set this to -1 for
       no limit.
preservePositionIncrements – Whether position holes should appear in the automaton
maxEdits – must be >= 0 and <= LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE .
transpositions – true if transpositions should be treated as a primitive 
       edit operation. If this is false, comparisons will implement the classic
       Levenshtein algorithm.
nonFuzzyPrefix – length of common (non-fuzzy) prefix (see default DEFAULT_NON_FUZZY_PREFIX
minFuzzyLength – minimum length of lookup key before any edits are allowed (see default DEFAULT_MIN_FUZZY_LENGTH)
unicodeAware – operate Unicode code points instead of bytes./**
   * Creates a {@link FuzzySuggester} instance.
   * 
   * @param indexAnalyzer Analyzer that will be used for
   *        analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *        analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *        surface forms to keep for a single analyzed form.
   *        When there are too many surface forms we discard the
   *        lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *        to expand from the analyzed form.  Set this to -1 for
   *        no limit.
   * @param preservePositionIncrements Whether position holes should appear in the automaton
   * @param maxEdits must be &gt;= 0 and &lt;= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} .
   * @param transpositions <code>true</code> if transpositions should be treated as a primitive 
   *        edit operation. If this is false, comparisons will implement the classic
   *        Levenshtein algorithm.
   * @param nonFuzzyPrefix length of common (non-fuzzy) prefix (see default {@link #DEFAULT_NON_FUZZY_PREFIX}
   * @param minFuzzyLength minimum length of lookup key before any edits are allowed (see default {@link #DEFAULT_MIN_FUZZY_LENGTH})
   * @param unicodeAware operate Unicode code points instead of bytes.
   */
  public FuzzySuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
                        int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions,
                        boolean preservePositionIncrements, int maxEdits, boolean transpositions,
                        int nonFuzzyPrefix, int minFuzzyLength, boolean unicodeAware) {
    super(tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements);
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
    }
    if (nonFuzzyPrefix < 0) {
      throw new IllegalArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
    }
    if (minFuzzyLength < 0) {
      throw new IllegalArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
    }
    
    this.maxEdits = maxEdits;
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
    this.unicodeAware = unicodeAware;
  }
  
  @Override
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {

    // TODO: right now there's no penalty for fuzzy/edits,
    // ie a completion whose prefix matched exactly what the
    // user typed gets no boost over completions that
    // required an edit, which get no boost over completions
    // requiring two edits.  I suspect a multiplicative
    // factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...

    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
    /*
      Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
      w.write(levA.toDot());
      w.close();
      System.out.println("Wrote LevA to out.dot");
    */
    return FSTUtil.intersectPrefixPaths(levA, fst);
  }

  @Override
  protected Automaton convertAutomaton(Automaton a) {
    if (unicodeAware) {
      Automaton utf8automaton = new UTF32ToUTF8().convert(a);
      utf8automaton = Operations.determinize(utf8automaton, DEFAULT_MAX_DETERMINIZED_STATES);
      return utf8automaton;
    } else {
      return a;
    }
  }

  @Override
  TokenStreamToAutomaton getTokenStreamToAutomaton() {
    final TokenStreamToAutomaton tsta = super.getTokenStreamToAutomaton();
    tsta.setUnicodeArcs(unicodeAware);
    return tsta;
  }

  Automaton toLevenshteinAutomata(Automaton automaton) {
    List<Automaton> subs = new ArrayList<>();
    FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
    for (IntsRef string; (string = finiteStrings.next()) != null;) {
      if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
        subs.add(Automata.makeString(string.ints, string.offset, string.length));
      } else {
        int ints[] = new int[string.length-nonFuzzyPrefix];
        System.arraycopy(string.ints, string.offset+nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? Character.MAX_CODE_POINT : 255, transpositions);
        subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
      }
    }

    if (subs.isEmpty()) {
      // automaton is empty, there is no accepted paths through it
      return Automata.makeEmpty(); // matches nothing
    } else if (subs.size() == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs.get(0);
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = Operations.union(subs);
      // TODO: we could call toLevenshteinAutomata() before det? 
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      return Operations.determinize(a, DEFAULT_MAX_DETERMINIZED_STATES);
    }
  }
}
/

org.apache.lucene/ lucene-suggest/ 8.2.0/ org/apache/lucene/search/suggest/analyzing/FuzzySuggester.java