/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.spell;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Locale;
import java.util.PriorityQueue;

Simple automaton-based spellchecker.

Candidates are presented directly from the term dictionary, based on Levenshtein distance. This is an alternative to SpellChecker if you are using an edit-distance-like metric such as Levenshtein or JaroWinklerDistance.

A practical benefit of this spellchecker is that it requires no additional datastructures (neither in RAM nor on disk) to do its work.

See Also:
@lucene.experimental
/** * Simple automaton-based spellchecker. * <p> * Candidates are presented directly from the term dictionary, based on * Levenshtein distance. This is an alternative to {@link SpellChecker} * if you are using an edit-distance-like metric such as Levenshtein * or {@link JaroWinklerDistance}. * <p> * A practical benefit of this spellchecker is that it requires no additional * datastructures (neither in RAM nor on disk) to do its work. * * @see LevenshteinAutomata * @see FuzzyTermsEnum * * @lucene.experimental */
public class DirectSpellChecker {
The default StringDistance, Damerau-Levenshtein distance implemented internally via LevenshteinAutomata.

Note: this is the fastest distance metric, because Damerau-Levenshtein is used to draw candidates from the term dictionary: this just re-uses the scoring.

/** The default StringDistance, Damerau-Levenshtein distance implemented internally * via {@link LevenshteinAutomata}. * <p> * Note: this is the fastest distance metric, because Damerau-Levenshtein is used * to draw candidates from the term dictionary: this just re-uses the scoring. */
public static final StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
maximum edit distance for candidate terms
/** maximum edit distance for candidate terms */
private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
minimum prefix for candidate terms
/** minimum prefix for candidate terms */
private int minPrefix = 1;
maximum number of top-N inspections per suggestion
/** maximum number of top-N inspections per suggestion */
private int maxInspections = 5;
minimum accuracy for a term to match
/** minimum accuracy for a term to match */
private float accuracy = SpellChecker.DEFAULT_ACCURACY;
value in [0..1] (or absolute number >= 1) representing the minimum number of documents (of the total) where a term should appear.
/** value in [0..1] (or absolute number &gt;= 1) representing the minimum * number of documents (of the total) where a term should appear. */
private float thresholdFrequency = 0f;
minimum length of a query word to return suggestions
/** minimum length of a query word to return suggestions */
private int minQueryLength = 4;
value in [0..1] (or absolute number >= 1) representing the maximum number of documents (of the total) a query term can appear in to be corrected.
/** value in [0..1] (or absolute number &gt;= 1) representing the maximum * number of documents (of the total) a query term can appear in to * be corrected. */
private float maxQueryFrequency = 0.01f;
true if the spellchecker should lowercase terms
/** true if the spellchecker should lowercase terms */
private boolean lowerCaseTerms = true;
the comparator to use
/** the comparator to use */
private Comparator<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
the string distance to use
/** the string distance to use */
private StringDistance distance = INTERNAL_LEVENSHTEIN;
Creates a DirectSpellChecker with default configuration values
/** Creates a DirectSpellChecker with default configuration values */
public DirectSpellChecker() {}
Get the maximum number of Levenshtein edit-distances to draw candidate terms from.
/** Get the maximum number of Levenshtein edit-distances to draw * candidate terms from. */
public int getMaxEdits() { return maxEdits; }
Sets the maximum number of Levenshtein edit-distances to draw candidate terms from. This value can be 1 or 2. The default is 2.

Note: a large number of spelling errors occur with an edit distance of 1, by setting this value to 1 you can increase both performance and precision at the cost of recall.

/** Sets the maximum number of Levenshtein edit-distances to draw * candidate terms from. This value can be 1 or 2. The default is 2. * <p> * Note: a large number of spelling errors occur with an edit distance * of 1, by setting this value to 1 you can increase both performance * and precision at the cost of recall. */
public void setMaxEdits(int maxEdits) { if (maxEdits < 1 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) throw new UnsupportedOperationException("Invalid maxEdits"); this.maxEdits = maxEdits; }
Get the minimal number of characters that must match exactly
/** * Get the minimal number of characters that must match exactly */
public int getMinPrefix() { return minPrefix; }
Sets the minimal number of initial characters (default: 1) that must match exactly.

This can improve both performance and accuracy of results, as misspellings are commonly not the first character.

/** * Sets the minimal number of initial characters (default: 1) * that must match exactly. * <p> * This can improve both performance and accuracy of results, * as misspellings are commonly not the first character. */
public void setMinPrefix(int minPrefix) { this.minPrefix = minPrefix; }
Get the maximum number of top-N inspections per suggestion
/** * Get the maximum number of top-N inspections per suggestion */
public int getMaxInspections() { return maxInspections; }
Set the maximum number of top-N inspections (default: 5) per suggestion.

Increasing this number can improve the accuracy of results, at the cost of performance.

/** * Set the maximum number of top-N inspections (default: 5) per suggestion. * <p> * Increasing this number can improve the accuracy of results, at the cost * of performance. */
public void setMaxInspections(int maxInspections) { this.maxInspections = maxInspections; }
Get the minimal accuracy from the StringDistance for a match
/** * Get the minimal accuracy from the StringDistance for a match */
public float getAccuracy() { return accuracy; }
Set the minimal accuracy required (default: 0.5f) from a StringDistance for a suggestion match.
/** * Set the minimal accuracy required (default: 0.5f) from a StringDistance * for a suggestion match. */
public void setAccuracy(float accuracy) { this.accuracy = accuracy; }
Get the minimal threshold of documents a term must appear for a match
/** * Get the minimal threshold of documents a term must appear for a match */
public float getThresholdFrequency() { return thresholdFrequency; }
Set the minimal threshold of documents a term must appear for a match.

This can improve quality by only suggesting high-frequency terms. Note that very high values might decrease performance slightly, by forcing the spellchecker to draw more candidates from the term dictionary, but a practical value such as 1 can be very useful towards improving quality.

This can be specified as a relative percentage of documents such as 0.5f, or it can be specified as an absolute whole document frequency, such as 4f. Absolute document frequencies may not be fractional.

/** * Set the minimal threshold of documents a term must appear for a match. * <p> * This can improve quality by only suggesting high-frequency terms. Note that * very high values might decrease performance slightly, by forcing the spellchecker * to draw more candidates from the term dictionary, but a practical value such * as <code>1</code> can be very useful towards improving quality. * <p> * This can be specified as a relative percentage of documents such as 0.5f, * or it can be specified as an absolute whole document frequency, such as 4f. * Absolute document frequencies may not be fractional. */
public void setThresholdFrequency(float thresholdFrequency) { if (thresholdFrequency >= 1f && thresholdFrequency != (int) thresholdFrequency) throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); this.thresholdFrequency = thresholdFrequency; }
Get the minimum length of a query term needed to return suggestions
/** Get the minimum length of a query term needed to return suggestions */
public int getMinQueryLength() { return minQueryLength; }
Set the minimum length of a query term (default: 4) needed to return suggestions.

Very short query terms will often cause only bad suggestions with any distance metric.

/** * Set the minimum length of a query term (default: 4) needed to return suggestions. * <p> * Very short query terms will often cause only bad suggestions with any distance * metric. */
public void setMinQueryLength(int minQueryLength) { this.minQueryLength = minQueryLength; }
Get the maximum threshold of documents a query term can appear in order to provide suggestions.
/** * Get the maximum threshold of documents a query term can appear in order * to provide suggestions. */
public float getMaxQueryFrequency() { return maxQueryFrequency; }
Set the maximum threshold (default: 0.01f) of documents a query term can appear in order to provide suggestions.

Very high-frequency terms are typically spelled correctly. Additionally, this can increase performance as it will do no work for the common case of correctly-spelled input terms.

This can be specified as a relative percentage of documents such as 0.5f, or it can be specified as an absolute whole document frequency, such as 4f. Absolute document frequencies may not be fractional.

/** * Set the maximum threshold (default: 0.01f) of documents a query term can * appear in order to provide suggestions. * <p> * Very high-frequency terms are typically spelled correctly. Additionally, * this can increase performance as it will do no work for the common case * of correctly-spelled input terms. * <p> * This can be specified as a relative percentage of documents such as 0.5f, * or it can be specified as an absolute whole document frequency, such as 4f. * Absolute document frequencies may not be fractional. */
public void setMaxQueryFrequency(float maxQueryFrequency) { if (maxQueryFrequency >= 1f && maxQueryFrequency != (int) maxQueryFrequency) throw new IllegalArgumentException("Fractional absolute document frequencies are not allowed"); this.maxQueryFrequency = maxQueryFrequency; }
true if the spellchecker should lowercase terms
/** true if the spellchecker should lowercase terms */
public boolean getLowerCaseTerms() { return lowerCaseTerms; }
True if the spellchecker should lowercase terms (default: true)

This is a convenience method, if your index field has more complicated analysis (such as StandardTokenizer removing punctuation), it's probably better to turn this off, and instead run your query terms through your Analyzer first.

If this option is not on, case differences count as an edit!

/** * True if the spellchecker should lowercase terms (default: true) * <p> * This is a convenience method, if your index field has more complicated * analysis (such as StandardTokenizer removing punctuation), it's probably * better to turn this off, and instead run your query terms through your * Analyzer first. * <p> * If this option is not on, case differences count as an edit! */
public void setLowerCaseTerms(boolean lowerCaseTerms) { this.lowerCaseTerms = lowerCaseTerms; }
Get the current comparator in use.
/** * Get the current comparator in use. */
public Comparator<SuggestWord> getComparator() { return comparator; }
Set the comparator for sorting suggestions. The default is SuggestWordQueue.DEFAULT_COMPARATOR
/** * Set the comparator for sorting suggestions. * The default is {@link SuggestWordQueue#DEFAULT_COMPARATOR} */
public void setComparator(Comparator<SuggestWord> comparator) { this.comparator = comparator; }
Get the string distance metric in use.
/** * Get the string distance metric in use. */
public StringDistance getDistance() { return distance; }
Set the string distance metric. The default is INTERNAL_LEVENSHTEIN

Note: because this spellchecker draws its candidates from the term dictionary using Damerau-Levenshtein, it works best with an edit-distance-like string metric. If you use a different metric than the default, you might want to consider increasing setMaxInspections(int) to draw more candidates for your metric to rank.

/** * Set the string distance metric. * The default is {@link #INTERNAL_LEVENSHTEIN} * <p> * Note: because this spellchecker draws its candidates from the term * dictionary using Damerau-Levenshtein, it works best with an edit-distance-like * string metric. If you use a different metric than the default, * you might want to consider increasing {@link #setMaxInspections(int)} * to draw more candidates for your metric to rank. */
public void setDistance(StringDistance distance) { this.distance = distance; } /** * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode) * suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)} */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir) throws IOException { return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } /** * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float) * suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)} * */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode) throws IOException { return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy); }
Suggest similar words.

Unlike SpellChecker, the similarity used to fetch the most relevant terms is an edit distance, therefore typically a low value for numSug will work very well.

Params:
  • term – Term you want to spell check on
  • numSug – the maximum number of suggested words
  • ir – IndexReader to find terms from
  • suggestMode – specifies when to return suggested words
  • accuracy – return only suggested words that match with this similarity
Throws:
Returns:sorted list of the suggested words according to the comparator
/** * Suggest similar words. * * <p>Unlike {@link SpellChecker}, the similarity used to fetch the most * relevant terms is an edit distance, therefore typically a low value * for numSug will work very well. * * @param term Term you want to spell check on * @param numSug the maximum number of suggested words * @param ir IndexReader to find terms from * @param suggestMode specifies when to return suggested words * @param accuracy return only suggested words that match with this similarity * @return sorted list of the suggested words according to the comparator * @throws IOException If there is a low-level I/O error. */
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy) throws IOException { final CharsRefBuilder spare = new CharsRefBuilder(); String text = term.text(); if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) return new SuggestWord[0]; if (lowerCaseTerms) { term = new Term(term.field(), text.toLowerCase(Locale.ROOT)); } int docfreq = ir.docFreq(term); if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) { return new SuggestWord[0]; } int maxDoc = ir.maxDoc(); if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency) { return new SuggestWord[0]; } else if (docfreq > (int) Math.ceil(maxQueryFrequency * (float)maxDoc)) { return new SuggestWord[0]; } if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0; if (thresholdFrequency >= 1f) { docfreq = Math.max(docfreq, (int) thresholdFrequency); } else if (thresholdFrequency > 0f) { docfreq = Math.max(docfreq, (int)(thresholdFrequency * (float)maxDoc)-1); } Collection<ScoreTerm> terms = null; int inspections = numSug * maxInspections; // try ed=1 first, in case we get lucky terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare); if (maxEdits > 1 && terms.size() < inspections) { HashSet<ScoreTerm> moreTerms = new HashSet<>(); moreTerms.addAll(terms); moreTerms.addAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare)); terms = moreTerms; } // create the suggestword response, sort it, and trim it to size. SuggestWord suggestions[] = new SuggestWord[terms.size()]; int index = suggestions.length - 1; for (ScoreTerm s : terms) { SuggestWord suggestion = new SuggestWord(); if (s.termAsString == null) { spare.copyUTF8Bytes(s.term); s.termAsString = spare.toString(); } suggestion.string = s.termAsString; suggestion.score = s.score; suggestion.freq = s.docfreq; suggestions[index--] = suggestion; } ArrayUtil.timSort(suggestions, Collections.reverseOrder(comparator)); if (numSug < suggestions.length) { SuggestWord trimmed[] = new SuggestWord[numSug]; System.arraycopy(suggestions, 0, trimmed, 0, numSug); suggestions = trimmed; } return suggestions; }
Provide spelling corrections based on several parameters.
Params:
  • term – The term to suggest spelling corrections for
  • numSug – The maximum number of spelling corrections
  • ir – The index reader to fetch the candidate spelling corrections from
  • docfreq – The minimum document frequency a potential suggestion need to have in order to be included
  • editDistance – The maximum edit distance candidates are allowed to have
  • accuracy – The minimum accuracy a suggested spelling correction needs to have in order to be included
  • spare – a chars scratch
Throws:
Returns:a collection of spelling corrections sorted by ScoreTerm's natural order.
/** * Provide spelling corrections based on several parameters. * * @param term The term to suggest spelling corrections for * @param numSug The maximum number of spelling corrections * @param ir The index reader to fetch the candidate spelling corrections from * @param docfreq The minimum document frequency a potential suggestion need to have in order to be included * @param editDistance The maximum edit distance candidates are allowed to have * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included * @param spare a chars scratch * @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. * @throws IOException If I/O related errors occur */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException { AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); Terms terms = MultiTerms.getTerms(ir, term.field()); if (terms == null) { return Collections.emptyList(); } FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>(); BytesRef queryTerm = new BytesRef(term.text()); BytesRef candidateTerm; ScoreTerm st = new ScoreTerm(); BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class); while ((candidateTerm = e.next()) != null) { // For FuzzyQuery, boost is the score: float score = boostAtt.getBoost(); // ignore uncompetitive hits if (stQueue.size() >= numSug && score <= stQueue.peek().boost) { continue; } // ignore exact match of the same term if (queryTerm.bytesEquals(candidateTerm)) { continue; } int df = e.docFreq(); // check docFreq if required if (df <= docfreq) { continue; } final String termAsString; if (distance == INTERNAL_LEVENSHTEIN) { // delay creating strings until the end termAsString = null; } else { spare.copyUTF8Bytes(candidateTerm); termAsString = spare.toString(); score = distance.getDistance(term.text(), termAsString); } if (score < accuracy) { continue; } // add new entry in PQ st.term = BytesRef.deepCopyOf(candidateTerm); st.boost = score; st.docfreq = df; st.termAsString = termAsString; st.score = score; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm(); maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); } return stQueue; }
Holds a spelling correction for internal usage inside DirectSpellChecker.
/** * Holds a spelling correction for internal usage inside {@link DirectSpellChecker}. */
protected static class ScoreTerm implements Comparable<ScoreTerm> {
The actual spellcheck correction.
/** * The actual spellcheck correction. */
public BytesRef term;
The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
/** * The boost representing the similarity from the FuzzyTermsEnum (internal similarity score) */
public float boost;
The df of the spellcheck correction.
/** * The df of the spellcheck correction. */
public int docfreq;
The spellcheck correction represented as string, can be null.
/** * The spellcheck correction represented as string, can be <code>null</code>. */
public String termAsString;
The similarity score.
/** * The similarity score. */
public float score;
Constructor.
/** * Constructor. */
public ScoreTerm() { } @Override public int compareTo(ScoreTerm other) { if (term.bytesEquals(other.term)) return 0; // consistent with equals if (this.boost == other.boost) return other.term.compareTo(this.term); else return Float.compare(this.boost, other.boost); } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; ScoreTerm other = (ScoreTerm) obj; if (term == null) { if (other.term != null) return false; } else if (!term.bytesEquals(other.term)) return false; return true; } } }