/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

/**
 * A {@link CompletionQuery} that matches documents containing terms
 * within an edit distance of the specified prefix.
 * <p>
 * This query boosts documents relative to how similar the indexed terms are to the
 * provided prefix.
 * <p>
 * Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg'
 * against a field 'suggest_field' is as follows:
 *
 * <pre class="prettyprint">
 *  CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
 * </pre>
 *
 * @lucene.experimental
 */
public class FuzzyCompletionQuery extends PrefixCompletionQuery {
  /**
   * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
   * parameters in Unicode code points (actual letters)
   * instead of bytes.
   */
public static final boolean DEFAULT_UNICODE_AWARE = false;
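
  /*
   * Illustrative note (not part of the original source): with the default of false, the
   * length parameters below are measured in UTF-8 bytes, so a two-letter prefix such as
   * "mü" counts as 3 (1 byte for 'm' + 2 bytes for 'ü'); with unicodeAware=true it
   * counts as 2 code points.
   */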
  /** The default minimum length of the key before any edits are allowed. */
public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;
  /** The default prefix length where edits are not allowed. */
public static final int DEFAULT_NON_FUZZY_PREFIX = 1;
  /** The default maximum number of edits for fuzzy suggestions. */
public static final int DEFAULT_MAX_EDITS = 1;
  /** The default transposition value passed to {@link LevenshteinAutomata} */
  public static final boolean DEFAULT_TRANSPOSITIONS = true;

  private final int maxEdits;
  private final boolean transpositions;
  private final int nonFuzzyPrefix;
  private final int minFuzzyLength;
  private final boolean unicodeAware;
  private final int maxDeterminizedStates;

  /**
   * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)}
   * with no filter
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
    this(analyzer, term, null);
  }

  /**
   * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer,
   * int, boolean, int, int, boolean, int)}
   * with defaults for <code>maxEdits</code>, <code>transpositions</code>,
   * <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>,
   * <code>unicodeAware</code> and <code>maxDeterminizedStates</code>
   *
   * See {@link #DEFAULT_MAX_EDITS}, {@link #DEFAULT_TRANSPOSITIONS},
   * {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
   * {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES}
   * for defaults
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
    this(analyzer, term, filter, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS,
        DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE,
        Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }
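
  /*
   * Minimal usage sketch (illustrative, not part of this class): assuming an index that
   * already contains SuggestField values for "suggest_field", and an IndexReader "reader"
   * and Analyzer "analyzer" in scope, the constructors above can be used through
   * SuggestIndexSearcher like this:
   *
   *   SuggestIndexSearcher searcher = new SuggestIndexSearcher(reader);
   *   CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
   *   TopSuggestDocs top = searcher.suggest(query, 5, true); // top 5, skipping duplicate surface forms
   *   for (TopSuggestDocs.SuggestScoreDoc hit : top.scoreLookupDocs()) {
   *     // hit.key is the suggestion text; hit.score reflects how much of the analyzed
   *     // prefix matched exactly (see FuzzyCompletionWeight below)
   *   }
   */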
  /**
   * Constructs an analyzed fuzzy prefix completion query
   *
   * @param analyzer used to analyze the provided {@link Term#text()}
   * @param term query is run against {@link Term#field()} and {@link Term#text()}
   *             is analyzed with <code>analyzer</code>
   * @param filter used to query on a subset of documents
   * @param maxEdits maximum number of acceptable edits
   * @param transpositions value passed to {@link LevenshteinAutomata}
   * @param nonFuzzyPrefix prefix length where edits are not allowed
   * @param minFuzzyLength minimum prefix length before any edits are allowed
   * @param unicodeAware treat prefix as unicode rather than bytes
   * @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata}
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter,
                              int maxEdits, boolean transpositions, int nonFuzzyPrefix,
                              int minFuzzyLength, boolean unicodeAware, int maxDeterminizedStates) {
    super(analyzer, term, filter);
    this.maxEdits = maxEdits;
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
    this.unicodeAware = unicodeAware;
    this.maxDeterminizedStates = maxDeterminizedStates;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    final Automaton originalAutomata;
    try (CompletionTokenStream stream =
             (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text())) {
      originalAutomata = stream.toAutomaton(unicodeAware);
    }
    Set<IntsRef> refs = new HashSet<>();
    Automaton automaton = toLevenshteinAutomata(originalAutomata, refs);
    if (unicodeAware) {
      Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
      utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
      automaton = utf8automaton;
    }
    // TODO Accumulating all refs is bad, because the resulting set may be very big.
    // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
    return new FuzzyCompletionWeight(this, automaton, refs);
  }

  private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
    List<Automaton> subs = new ArrayList<>();
    FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
    for (IntsRef string; (string = finiteStrings.next()) != null;) {
      refs.add(IntsRef.deepCopyOf(string));

      if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
        subs.add(Automata.makeString(string.ints, string.offset, string.length));
      } else {
        int[] ints = new int[string.length - nonFuzzyPrefix];
        System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints,
            unicodeAware ? Character.MAX_CODE_POINT : 255,
            transpositions);
        subs.add(lev.toAutomaton(maxEdits, UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix)));
      }
    }

    if (subs.isEmpty()) {
      // automaton is empty, there are no accepted paths through it
      return Automata.makeEmpty(); // matches nothing
    } else if (subs.size() == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs.get(0);
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = Operations.union(subs);
      // TODO: we could call toLevenshteinAutomata() before det?
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      return Operations.determinize(a, maxDeterminizedStates);
    }
  }
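
  /*
   * Worked example (illustrative only, using the defaults): for the analyzed prefix
   * "subg" with nonFuzzyPrefix=1, minFuzzyLength=3 and maxEdits=1, toLevenshteinAutomata()
   * builds an automaton that accepts "s" followed by any string within one edit of "ubg".
   * An indexed suggestion such as "suggestion" therefore remains a candidate, since its
   * prefix "sug" is "subg" with one deletion, whereas a prefix differing in the first
   * character (e.g. "tubg") is rejected because no edits are allowed inside the
   * non-fuzzy prefix.
   */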
  /** Get the maximum edit distance for fuzzy matches */
  public int getMaxEdits() {
    return maxEdits;
  }
  /** Return whether transpositions count as a single edit */
  public boolean isTranspositions() {
    return transpositions;
  }
  /** Get the length of a prefix where no edits are permitted */
  public int getNonFuzzyPrefix() {
    return nonFuzzyPrefix;
  }
  /** Get the minimum length of a term considered for matching */
  public int getMinFuzzyLength() {
    return minFuzzyLength;
  }
  /** Return true if lengths are measured in unicode code-points rather than bytes */
  public boolean isUnicodeAware() {
    return unicodeAware;
  }
  /** Get the maximum number of determinized states permitted */
  public int getMaxDeterminizedStates() {
    return maxDeterminizedStates;
  }

  @Override
  public String toString(String field) {
    StringBuilder buffer = new StringBuilder();
    if (!getField().equals(field)) {
      buffer.append(getField());
      buffer.append(":");
    }
    buffer.append(getTerm().text());
    buffer.append('*');
    buffer.append('~');
    buffer.append(Integer.toString(maxEdits));
    if (getFilter() != null) {
      buffer.append(",");
      buffer.append("filter");
      buffer.append(getFilter().toString());
    }
    return buffer.toString();
  }

  private static class FuzzyCompletionWeight extends CompletionWeight {
    private final Set<IntsRef> refs;

    int currentBoost = 0;

    public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs) throws IOException {
      super(query, automaton);
      this.refs = refs;
    }

    @Override
    protected void setNextMatch(IntsRef pathPrefix) {
      // NOTE: the last letter of the matched prefix for the exact
      // match never makes it through here,
      // so an exact match and a match with only an edit at the
      // end is boosted the same
      int maxCount = 0;
      for (IntsRef ref : refs) {
        int minLength = Math.min(ref.length, pathPrefix.length);
        int count = 0;
        for (int i = 0; i < minLength; i++) {
          if (ref.ints[i + ref.offset] == pathPrefix.ints[i + pathPrefix.offset]) {
            count++;
          } else {
            break;
          }
        }
        maxCount = Math.max(maxCount, count);
      }
      currentBoost = maxCount;
    }

    @Override
    protected float boost() {
      return currentBoost;
    }
  }
}