/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.LimitedFiniteStringsIterator;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import org.apache.lucene.util.fst.Util;

import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;

/**
 * Suggester that first analyzes the surface form, adds the
 * analyzed form to a weighted FST, and then does the same
 * thing at lookup time.  This means lookup is based on the
 * analyzed form while suggestions are still the surface
 * form(s).
 *
 * <p>
 * This can result in powerful suggester functionality.  For
 * example, if you use an analyzer removing stop words,
 * then the partial text "ghost chr..." could see the
 * suggestion "The Ghost of Christmas Past".  Note that
 * position increments MUST NOT be preserved for this example
 * to work, so you should call the constructor with the
 * <code>preservePositionIncrements</code> parameter set to
 * false.
 *
 * <p>
 * If SynonymFilter is used to map wifi and wireless network to
 * hotspot then the partial text "wirele..." could suggest
 * "wifi router".  Token normalization like stemmers, accent
 * removal, etc., would allow suggestions to ignore such
 * variations.
 *
 * <p>
 * When two matching suggestions have the same weight, they
 * are tie-broken by the analyzed form.  If their analyzed
 * form is the same then the order is undefined.
 *
 * <p>
 * There are some limitations:
 * <ul>
 *
 *   <li> A lookup from a query like "net" in English won't
 *        be any different than "net " (ie, user added a
 *        trailing space) because analyzers don't reflect
 *        when they've seen a token separator and when they
 *        haven't.
 *
 *   <li> If you're using {@code StopFilter}, and the user will
 *        type "fast apple", but so far all they've typed is
 *        "fast a", again because the analyzer doesn't convey whether
 *        it's seen a token separator after the "a",
 *        {@code StopFilter} will remove that "a", causing
 *        far more matches than you'd expect.
 *
 *   <li> Lookups with the empty string return no results
 *        instead of all results.
 * </ul>
 *
 * @lucene.experimental
 */
// redundant 'implements Accountable' to workaround javadocs bugs
public class AnalyzingSuggester extends Lookup implements Accountable {
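
  // Illustrative usage sketch (not part of this class; identifiers such as
  // "dir" and "entries" are placeholders for a Directory and an InputIterator
  // over weighted suggestions):
  //
  //   AnalyzingSuggester suggester =
  //       new AnalyzingSuggester(dir, "suggest", new StandardAnalyzer());
  //   suggester.build(entries);
  //   List<Lookup.LookupResult> top5 = suggester.lookup("ghost chr", false, 5);
  //
  // Each LookupResult carries the surface form and its weight; when payloads
  // are enabled it also carries the stored payload.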
  /**
   * FST&lt;Weight,Surface&gt;:
   *  input is the analyzed form, with a null byte between terms
   *  weights are encoded as costs: (Integer.MAX_VALUE-weight)
   *  surface is the original, unanalyzed form.
   */
private FST<Pair<Long,BytesRef>> fst = null;
  /**
   * Analyzer that will be used for analyzing suggestions at
   * index time.
   */
private final Analyzer indexAnalyzer;
  /**
   * Analyzer that will be used for analyzing suggestions at
   * query time.
   */
private final Analyzer queryAnalyzer;
  /** True if exact match suggestions should always be returned first. */
private final boolean exactFirst;
  /** True if separator between tokens should be preserved. */
private final boolean preserveSep;
  /** Include this flag in the options parameter to {@link
   *  #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to always
   *  return the exact match first, regardless of score.  This
   *  has no performance impact but could result in
   *  low-quality suggestions. */
public static final int EXACT_FIRST = 1;
  /** Include this flag in the options parameter to {@link
   *  #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)} to preserve
   *  token separators when matching. */
public static final int PRESERVE_SEP = 2;
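
  // For example, the convenience constructors below pass both flags combined
  // as the default: EXACT_FIRST | PRESERVE_SEP.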
  /** Represents the separation between tokens, if
   *  PRESERVE_SEP was specified. */
private static final int SEP_LABEL = '\u001F';
  /** Marks end of the analyzed input and start of dedup
   *  byte. */
private static final int END_BYTE = 0x0;
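
  // Resulting FST input layout for each entry:
  //
  //   [analyzed bytes] [END_BYTE] [dedup byte]
  //
  // where the dedup byte disambiguates up to 256 surface forms that share the
  // same analyzed form (see build()).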
  /** Maximum number of dup surface forms (different surface
   *  forms for the same analyzed form). */
private final int maxSurfaceFormsPerAnalyzedForm;
  /** Maximum graph paths to index for a single analyzed
   *  surface form.  This only matters if your analyzer
   *  makes lots of alternate paths (e.g. contains
   *  SynonymFilter). */
  private final int maxGraphExpansions;

  private final Directory tempDir;
  private final String tempFileNamePrefix;
  /** Highest number of analyzed paths we saw for any single
   *  input surface form.  For analyzers that never create
   *  graphs this will always be 1. */
  private int maxAnalyzedPathsForOneInput;

  private boolean hasPayloads;

  private static final int PAYLOAD_SEP = '\u001f';
  /** Whether position holes should appear in the automaton. */
private boolean preservePositionIncrements;
  /** Number of entries the lookup was built with */
  private long count = 0;

  /**
   * Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
   * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
   */
  public AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer analyzer) {
    this(tempDir, tempFileNamePrefix, analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true);
  }

  /**
   * Calls {@link #AnalyzingSuggester(Directory,String,Analyzer,Analyzer,int,int,int,boolean)
   * AnalyzingSuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true)}
   */
  public AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
    this(tempDir, tempFileNamePrefix, indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true);
  }
  /**
   * Creates a new suggester.
   *
   * @param indexAnalyzer Analyzer that will be used for
   *   analyzing suggestions while building the index.
   * @param queryAnalyzer Analyzer that will be used for
   *   analyzing query text during lookup
   * @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
   * @param maxSurfaceFormsPerAnalyzedForm Maximum number of
   *   surface forms to keep for a single analyzed form.
   *   When there are too many surface forms we discard the
   *   lowest weighted ones.
   * @param maxGraphExpansions Maximum number of graph paths
   *   to expand from the analyzed form.  Set this to -1 for
   *   no limit.
   * @param preservePositionIncrements Whether position holes
   *   should appear in the automata
   */
  public AnalyzingSuggester(Directory tempDir, String tempFileNamePrefix, Analyzer indexAnalyzer, Analyzer queryAnalyzer,
      int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, boolean preservePositionIncrements) {
    this.indexAnalyzer = indexAnalyzer;
    this.queryAnalyzer = queryAnalyzer;
    if ((options & ~(EXACT_FIRST | PRESERVE_SEP)) != 0) {
      throw new IllegalArgumentException("options should only contain EXACT_FIRST and PRESERVE_SEP; got " + options);
    }
    this.exactFirst = (options & EXACT_FIRST) != 0;
    this.preserveSep = (options & PRESERVE_SEP) != 0;

    // NOTE: this is just an implementation limitation; if
    // somehow this is a problem we could fix it by using
    // more than one byte to disambiguate ... but 256 seems
    // like it should be way more than enough.
    if (maxSurfaceFormsPerAnalyzedForm <= 0 || maxSurfaceFormsPerAnalyzedForm > 256) {
      throw new IllegalArgumentException("maxSurfaceFormsPerAnalyzedForm must be > 0 and <= 256 (got: " + maxSurfaceFormsPerAnalyzedForm + ")");
    }
    this.maxSurfaceFormsPerAnalyzedForm = maxSurfaceFormsPerAnalyzedForm;

    if (maxGraphExpansions < 1 && maxGraphExpansions != -1) {
      throw new IllegalArgumentException("maxGraphExpansions must be -1 (no limit) or > 0 (got: " + maxGraphExpansions + ")");
    }
    this.maxGraphExpansions = maxGraphExpansions;
    this.preservePositionIncrements = preservePositionIncrements;
    this.tempDir = tempDir;
    this.tempFileNamePrefix = tempFileNamePrefix;
  }
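
  // A hedged construction example (argument values are illustrative):
  //
  //   AnalyzingSuggester suggester = new AnalyzingSuggester(
  //       tempDir, "suggest",            // scratch Directory + temp-file prefix
  //       indexAnalyzer, queryAnalyzer,  // may differ, e.g. stopwords only at index time
  //       EXACT_FIRST | PRESERVE_SEP,    // options
  //       256,                           // maxSurfaceFormsPerAnalyzedForm
  //       -1,                            // maxGraphExpansions: no limit
  //       true);                         // preservePositionIncrements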
  /** Returns byte size of the underlying FST. */
  @Override
  public long ramBytesUsed() {
    return fst == null ? 0 : fst.ramBytesUsed();
  }

  @Override
  public Collection<Accountable> getChildResources() {
    if (fst == null) {
      return Collections.emptyList();
    } else {
      return Collections.singletonList(Accountables.namedAccountable("fst", fst));
    }
  }

  // Replaces SEP with epsilon or remaps them if
  // we were asked to preserve them:
  private Automaton replaceSep(Automaton a) {

    int numStates = a.getNumStates();
    Automaton.Builder result = new Automaton.Builder(numStates, a.getNumTransitions());

    // Copy all states over
    result.copyStates(a);

    // Go in reverse topo sort so we know we only have to
    // make one pass:
    Transition t = new Transition();
    int[] topoSortStates = Operations.topoSortStates(a);
    for(int i=0;i<topoSortStates.length;i++) {
      int state = topoSortStates[topoSortStates.length-1-i];
      int count = a.initTransition(state, t);
      for(int j=0;j<count;j++) {
        a.getNextTransition(t);
        if (t.min == TokenStreamToAutomaton.POS_SEP) {
          assert t.max == TokenStreamToAutomaton.POS_SEP;
          if (preserveSep) {
            // Remap to SEP_LABEL:
            result.addTransition(state, t.dest, SEP_LABEL);
          } else {
            result.addEpsilon(state, t.dest);
          }
        } else if (t.min == TokenStreamToAutomaton.HOLE) {
          assert t.max == TokenStreamToAutomaton.HOLE;

          // Just remove the hole: there will then be two
          // SEP tokens next to each other, which will only
          // match another hole at search time.  Note that
          // it will also match an empty-string token ... if
          // that's somehow a problem we can always map HOLE
          // to a dedicated byte (and escape it in the
          // input).
          result.addEpsilon(state, t.dest);
        } else {
          result.addTransition(state, t.dest, t.min, t.max);
        }
      }
    }

    return result.finish();
  }
  /** Used by subclass to change the lookup automaton, if
   *  necessary. */
  protected Automaton convertAutomaton(Automaton a) {
    return a;
  }

  TokenStreamToAutomaton getTokenStreamToAutomaton() {
    final TokenStreamToAutomaton tsta = new TokenStreamToAutomaton();
    tsta.setPreservePositionIncrements(preservePositionIncrements);
    tsta.setFinalOffsetGapAsHole(true);
    return tsta;
  }

  private static class AnalyzingComparator implements Comparator<BytesRef> {

    private final boolean hasPayloads;

    public AnalyzingComparator(boolean hasPayloads) {
      this.hasPayloads = hasPayloads;
    }

    private final ByteArrayDataInput readerA = new ByteArrayDataInput();
    private final ByteArrayDataInput readerB = new ByteArrayDataInput();
    private final BytesRef scratchA = new BytesRef();
    private final BytesRef scratchB = new BytesRef();

    @Override
    public int compare(BytesRef a, BytesRef b) {

      // First by analyzed form:
      readerA.reset(a.bytes, a.offset, a.length);
      scratchA.length = readerA.readShort();
      scratchA.bytes = a.bytes;
      scratchA.offset = readerA.getPosition();

      readerB.reset(b.bytes, b.offset, b.length);
      scratchB.bytes = b.bytes;
      scratchB.length = readerB.readShort();
      scratchB.offset = readerB.getPosition();

      int cmp = scratchA.compareTo(scratchB);
      if (cmp != 0) {
        return cmp;
      }
      readerA.skipBytes(scratchA.length);
      readerB.skipBytes(scratchB.length);

      // Next by cost:
      long aCost = readerA.readInt();
      long bCost = readerB.readInt();
      assert decodeWeight(aCost) >= 0;
      assert decodeWeight(bCost) >= 0;
      if (aCost < bCost) {
        return -1;
      } else if (aCost > bCost) {
        return 1;
      }

      // Finally by surface form:
      if (hasPayloads) {
        scratchA.length = readerA.readShort();
        scratchB.length = readerB.readShort();
        scratchA.offset = readerA.getPosition();
        scratchB.offset = readerB.getPosition();
      } else {
        scratchA.offset = readerA.getPosition();
        scratchB.offset = readerB.getPosition();
        scratchA.length = readerA.length() - readerA.getPosition();
        scratchB.length = readerB.length() - readerB.getPosition();
      }

      assert scratchA.isValid();
      assert scratchB.isValid();

      return scratchA.compareTo(scratchB);
    }
  }
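
  // Each entry fed to the OfflineSorter (and parsed by the comparator above)
  // is laid out as:
  //
  //   analyzed length (short) | analyzed bytes | cost (int = Integer.MAX_VALUE - weight)
  //     | surface length (short, only when hasPayloads) | surface bytes | payload bytes
  //
  // so entries sort first by analyzed form, then by cost, then by surface form.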
  @Override
  public void build(InputIterator iterator) throws IOException {
    if (iterator.hasContexts()) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }

    hasPayloads = iterator.hasPayloads();

    OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));

    IndexOutput tempInput = tempDir.createTempOutput(tempFileNamePrefix, "input", IOContext.DEFAULT);
    OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
    OfflineSorter.ByteSequencesReader reader = null;

    BytesRefBuilder scratch = new BytesRefBuilder();

    TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton();

    String tempSortedFileName = null;

    count = 0;
    byte buffer[] = new byte[8];
    try {
      ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);

      for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null;) {
        LimitedFiniteStringsIterator finiteStrings =
            new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
        for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
          Util.toBytesRef(string, scratch);

          // length of the analyzed text (FST input)
          if (scratch.length() > Short.MAX_VALUE-2) {
            throw new IllegalArgumentException("cannot handle analyzed forms > " + (Short.MAX_VALUE-2) +
                " in length (got " + scratch.length() + ")");
          }
          short analyzedLength = (short) scratch.length();

          // compute the required length:
          // analyzed sequence + weight (4) + surface + analyzedLength (short)
          int requiredLength = analyzedLength + 4 + surfaceForm.length + 2;

          BytesRef payload;

          if (hasPayloads) {
            if (surfaceForm.length > (Short.MAX_VALUE-2)) {
              throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) +
                  " in length (got " + surfaceForm.length + ")");
            }
            payload = iterator.payload();
            // payload + surfaceLength (short)
            requiredLength += payload.length + 2;
          } else {
            payload = null;
          }

          buffer = ArrayUtil.grow(buffer, requiredLength);

          output.reset(buffer);

          output.writeShort(analyzedLength);

          output.writeBytes(scratch.bytes(), 0, scratch.length());

          output.writeInt(encodeWeight(iterator.weight()));

          if (hasPayloads) {
            for(int i=0;i<surfaceForm.length;i++) {
              if (surfaceForm.bytes[i] == PAYLOAD_SEP) {
                throw new IllegalArgumentException(
                    "surface form cannot contain unit separator character U+001F; this character is reserved");
              }
            }
            output.writeShort((short) surfaceForm.length);
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
            output.writeBytes(payload.bytes, payload.offset, payload.length);
          } else {
            output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
          }

          assert output.getPosition() == requiredLength: output.getPosition() + " vs " + requiredLength;
          writer.write(buffer, 0, output.getPosition());
        }
        maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, finiteStrings.size());
      }
      CodecUtil.writeFooter(tempInput);
      writer.close();

      // Sort all input/output pairs (required by FST.Builder):
      tempSortedFileName = sorter.sort(tempInput.getName());

      // Free disk space:
      tempDir.deleteFile(tempInput.getName());

      reader = new OfflineSorter.ByteSequencesReader(
          tempDir.openChecksumInput(tempSortedFileName, IOContext.READONCE), tempSortedFileName);

      PairOutputs<Long,BytesRef> outputs =
          new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton());
      Builder<Pair<Long,BytesRef>> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      // Build FST:
      BytesRefBuilder previousAnalyzed = null;
      BytesRefBuilder analyzed = new BytesRefBuilder();
      BytesRef surface = new BytesRef();
      IntsRefBuilder scratchInts = new IntsRefBuilder();
      ByteArrayDataInput input = new ByteArrayDataInput();

      // Used to remove duplicate surface forms (but we
      // still index the highest-weight one).  We clear
      // this when we see a new analyzed form, so it cannot
      // grow unbounded (at most 256 entries):
      Set<BytesRef> seenSurfaceForms = new HashSet<>();

      int dedup = 0;
      while (true) {
        BytesRef bytes = reader.next();
        if (bytes == null) {
          break;
        }
        input.reset(bytes.bytes, bytes.offset, bytes.length);
        short analyzedLength = input.readShort();
        analyzed.grow(analyzedLength+2);
        input.readBytes(analyzed.bytes(), 0, analyzedLength);
        analyzed.setLength(analyzedLength);

        long cost = input.readInt();

        surface.bytes = bytes.bytes;
        if (hasPayloads) {
          surface.length = input.readShort();
          surface.offset = input.getPosition();
        } else {
          surface.offset = input.getPosition();
          surface.length = bytes.length - surface.offset;
        }

        if (previousAnalyzed == null) {
          previousAnalyzed = new BytesRefBuilder();
          previousAnalyzed.copyBytes(analyzed.get());
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else if (analyzed.get().equals(previousAnalyzed.get())) {
          dedup++;
          if (dedup >= maxSurfaceFormsPerAnalyzedForm) {
            // More than maxSurfaceFormsPerAnalyzedForm
            // dups: skip the rest:
            continue;
          }
          if (seenSurfaceForms.contains(surface)) {
            continue;
          }
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        } else {
          dedup = 0;
          previousAnalyzed.copyBytes(analyzed);
          seenSurfaceForms.clear();
          seenSurfaceForms.add(BytesRef.deepCopyOf(surface));
        }

        // TODO: I think we can avoid the extra 2 bytes when
        // there is no dup (dedup==0), but we'd have to fix
        // the exactFirst logic ... which would be sort of
        // hairy because we'd need to special case the two
        // (dup/not dup)...

        // NOTE: must be byte 0 so we sort before whatever
        // is next
        analyzed.append((byte) 0);
        analyzed.append((byte) dedup);

        Util.toIntsRef(analyzed.get(), scratchInts);
        //System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
        if (!hasPayloads) {
          builder.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
        } else {
          int payloadOffset = input.getPosition() + surface.length;
          int payloadLength = bytes.length - payloadOffset;
          BytesRef br = new BytesRef(surface.length + 1 + payloadLength);
          System.arraycopy(surface.bytes, surface.offset, br.bytes, 0, surface.length);
          br.bytes[surface.length] = PAYLOAD_SEP;
          System.arraycopy(bytes.bytes, payloadOffset, br.bytes, surface.length+1, payloadLength);
          br.length = br.bytes.length;
          builder.add(scratchInts.get(), outputs.newPair(cost, br));
        }
      }
      fst = builder.finish();

      //Util.dotToFile(fst, "/tmp/suggest.dot");
    } finally {
      IOUtils.closeWhileHandlingException(reader, writer);
      IOUtils.deleteFilesIgnoringExceptions(tempDir, tempInput.getName(), tempSortedFileName);
    }
  }

  @Override
  public boolean store(DataOutput output) throws IOException {
    output.writeVLong(count);
    if (fst == null) {
      return false;
    }

    fst.save(output);
    output.writeVInt(maxAnalyzedPathsForOneInput);
    output.writeByte((byte) (hasPayloads ? 1 : 0));
    return true;
  }
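
  // A hedged persistence sketch: store() accepts any DataOutput and load()
  // any DataInput; "dir" and the file name below are placeholders.
  //
  //   try (IndexOutput out = dir.createOutput("suggester.bin", IOContext.DEFAULT)) {
  //     suggester.store(out);
  //   }
  //   try (IndexInput in = dir.openInput("suggester.bin", IOContext.DEFAULT)) {
  //     suggester.load(in);
  //   }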
  @Override
  public boolean load(DataInput input) throws IOException {
    count = input.readVLong();
    this.fst = new FST<>(input, new PairOutputs<>(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()));
    maxAnalyzedPathsForOneInput = input.readVInt();
    hasPayloads = input.readByte() == 1;
    return true;
  }

  private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRefBuilder spare) {
    LookupResult result;
    if (hasPayloads) {
      int sepIndex = -1;
      for(int i=0;i<output2.length;i++) {
        if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
          sepIndex = i;
          break;
        }
      }
      assert sepIndex != -1;
      spare.grow(sepIndex);

      final int payloadLen = output2.length - sepIndex - 1;
      spare.copyUTF8Bytes(output2.bytes, output2.offset, sepIndex);
      BytesRef payload = new BytesRef(payloadLen);
      System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
      payload.length = payloadLen;
      result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
    } else {
      spare.grow(output2.length);
      spare.copyUTF8Bytes(output2);
      result = new LookupResult(spare.toString(), decodeWeight(output1));
    }

    return result;
  }

  private boolean sameSurfaceForm(BytesRef key, BytesRef output2) {
    if (hasPayloads) {
      // output2 has at least PAYLOAD_SEP byte:
      if (key.length >= output2.length) {
        return false;
      }
      for(int i=0;i<key.length;i++) {
        if (key.bytes[key.offset+i] != output2.bytes[output2.offset+i]) {
          return false;
        }
      }
      return output2.bytes[output2.offset + key.length] == PAYLOAD_SEP;
    } else {
      return key.bytesEquals(output2);
    }
  }
  @Override
  public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
    assert num > 0;

    if (onlyMorePopular) {
      throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
    }
    if (contexts != null) {
      throw new IllegalArgumentException("this suggester doesn't support contexts");
    }
    if (fst == null) {
      return Collections.emptyList();
    }

    //System.out.println("lookup key=" + key + " num=" + num);
    for (int i = 0; i < key.length(); i++) {
      if (key.charAt(i) == 0x1E) {
        throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
      }
      if (key.charAt(i) == 0x1F) {
        throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
      }
    }
    final BytesRef utf8Key = new BytesRef(key);
    try {

      Automaton lookupAutomaton = toLookupAutomaton(key);

      final CharsRefBuilder spare = new CharsRefBuilder();

      //System.out.println("  now intersect exactFirst=" + exactFirst);

      // Intersect automaton w/ suggest wFST and get all
      // prefix starting nodes & their outputs:
      //final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);

      //System.out.println("  prefixPaths: " + prefixPaths.size());

      BytesReader bytesReader = fst.getBytesReader();

      FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<>();

      final List<LookupResult> results = new ArrayList<>();

      List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);

      if (exactFirst) {

        int count = 0;
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            count++;
          }
        }

        // Searcher just to find the single exact only
        // match, if present:
        Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
        searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm,
            count * maxSurfaceFormsPerAnalyzedForm, weightComparator);

        // NOTE: we could almost get away with only using
        // the first start node.  The only catch is if
        // maxSurfaceFormsPerAnalyzedForm had kicked in and
        // pruned our exact match from one of these nodes
        // ...:
        for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
          if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
            // This node has END_BYTE arc leaving, meaning it's an
            // "exact" match:
            searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
          }
        }

        TopResults<Pair<Long,BytesRef>> completions = searcher.search();
        assert completions.isComplete;

        // NOTE: this is rather inefficient: we enumerate
        // every matching "exactly the same analyzed form"
        // path, and then do linear scan to see if one of
        // these exactly matches the input.  It should be
        // possible (though hairy) to do something similar
        // to getByOutput, since the surface form is encoded
        // into the FST output, so we more efficiently hone
        // in on the exact surface-form match.  Still, I
        // suspect very little time is spent in this linear
        // search: it's bounded by how many prefix start
        // nodes we have and the
        // maxSurfaceFormsPerAnalyzedForm:
        for(Result<Pair<Long,BytesRef>> completion : completions) {
          BytesRef output2 = completion.output.output2;
          if (sameSurfaceForm(utf8Key, output2)) {
            results.add(getLookupResult(completion.output.output1, output2, spare));
            break;
          }
        }

        if (results.size() == num) {
          // That was quick:
          return results;
        }
      }

      Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
                                                            num - results.size(),
                                                            num * maxAnalyzedPathsForOneInput,
                                                            weightComparator) {
        private final Set<BytesRef> seen = new HashSet<>();

        @Override
        protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {

          // Dedup: when the input analyzes to a graph we
          // can get duplicate surface forms:
          if (seen.contains(output.output2)) {
            return false;
          }
          seen.add(output.output2);

          if (!exactFirst) {
            return true;
          } else {
            // In exactFirst mode, don't accept any paths
            // matching the surface form since that will
            // create duplicate results:
            if (sameSurfaceForm(utf8Key, output.output2)) {
              // We found exact match, which means we should
              // have already found it in the first search:
              assert results.size() == 1;
              return false;
            } else {
              return true;
            }
          }
        }
      };

      prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);

      for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
        searcher.addStartPaths(path.fstNode, path.output, true, path.input);
      }

      TopResults<Pair<Long,BytesRef>> completions = searcher.search();
      assert completions.isComplete;

      for(Result<Pair<Long,BytesRef>> completion : completions) {

        LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);

        // TODO: for fuzzy case would be nice to return
        // how many edits were required
        //System.out.println("  result=" + result);

        results.add(result);

        if (results.size() == num) {
          // In the exactFirst=true case the search may
          // produce one extra path
          break;
        }
      }

      return results;
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  @Override
  public long getCount() {
    return count;
  }
  /** Returns all prefix paths to initialize the search. */
  protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                       Automaton lookupAutomaton,
                                                                       FST<Pair<Long,BytesRef>> fst)
    throws IOException {
    return prefixPaths;
  }

  final Automaton toAutomaton(final BytesRef surfaceForm, final TokenStreamToAutomaton ts2a) throws IOException {
    // Analyze surface form:
    Automaton automaton;
    try (TokenStream ts = indexAnalyzer.tokenStream("", surfaceForm.utf8ToString())) {

      // Create corresponding automaton: labels are bytes
      // from each analyzed token, with byte 0 used as
      // separator between tokens:
      automaton = ts2a.toAutomaton(ts);
    }

    automaton = replaceSep(automaton);
    automaton = convertAutomaton(automaton);

    // TODO: LUCENE-5660 re-enable this once we disallow massive suggestion strings
    // assert SpecialOperations.isFinite(automaton);

    // Get all paths from the automaton (there can be
    // more than one path, eg if the analyzer created a
    // graph using SynFilter or WDF):

    return automaton;
  }

  final Automaton toLookupAutomaton(final CharSequence key) throws IOException {
    // TODO: is there a Reader from a CharSequence?
    // Turn tokenstream into automaton:
    Automaton automaton = null;
    try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
      automaton = getTokenStreamToAutomaton().toAutomaton(ts);
    }

    automaton = replaceSep(automaton);

    // TODO: we can optimize this somewhat by determinizing
    // while we convert
    automaton = Operations.determinize(automaton, DEFAULT_MAX_DETERMINIZED_STATES);
    return automaton;
  }
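
  // For example, with PRESERVE_SEP the analyzed form of "ghost christmas"
  // keeps an explicit SEP_LABEL byte between the two tokens, while without it
  // replaceSep() turns the position separator into an epsilon transition, so
  // the tokens run together when matching.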
  /**
   * Returns the weight associated with an input string, or
   * null if it does not exist.  This suggester does not
   * support this operation; it always throws
   * {@link UnsupportedOperationException}.
   */
  public Object get(CharSequence key) {
    throw new UnsupportedOperationException();
  }
  /** cost -&gt; weight */
  private static int decodeWeight(long encoded) {
    return (int)(Integer.MAX_VALUE - encoded);
  }
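
  // Worked example: a weight of 10 is stored as cost Integer.MAX_VALUE - 10,
  // and decodeWeight(encodeWeight(10)) == 10.  Lower costs sort first in the
  // FST's top-N search, so higher-weighted suggestions are returned earlier.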
  /** weight -&gt; cost */
  private static int encodeWeight(long value) {
    if (value < 0 || value > Integer.MAX_VALUE) {
      throw new UnsupportedOperationException("cannot encode value: " + value);
    }
    return Integer.MAX_VALUE - (int)value;
  }

  static final Comparator<Pair<Long,BytesRef>> weightComparator = new Comparator<Pair<Long,BytesRef>> () {
    @Override
    public int compare(Pair<Long,BytesRef> left, Pair<Long,BytesRef> right) {
      return left.output1.compareTo(right.output1);
    }
  };
}