/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.analyzing;

import java.io.Closeable;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SegmentReader;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

// TODO:
//   - a PostingsFormat that stores super-high-freq terms as
//     a bitset should be a win for the prefix terms?
//     (LUCENE-5052)
//   - we could offer a better integration with
//     DocumentDictionary and NRT?  so that your suggester
//     "automatically" keeps in sync w/ your index

/** Analyzes the input text and then suggests matches based
 *  on prefix matches to any tokens in the indexed text.
 *  This also highlights the tokens that match.
 *
 *  <p>This suggester supports payloads.  Matches are sorted only
 *  by the suggest weight; it would be nice to support
 *  blended score + weight sort in the future.  This means
 *  this suggester best applies when there is a strong
 *  a-priori ranking of all the suggestions.
 *
 *  <p>This suggester supports contexts, including arbitrary binary
 *  terms.
 *
 *  @lucene.experimental */
public class AnalyzingInfixSuggester extends Lookup implements Closeable {
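
// Example usage (a minimal sketch; the analyzer, path and input iterator below are
// illustrative assumptions, not part of this class):
//
//   Directory suggestDir = FSDirectory.open(Paths.get("/tmp/suggest-index"));
//   Analyzer analyzer = new StandardAnalyzer();
//   AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(suggestDir, analyzer);
//   suggester.build(inputIterator);   // any InputIterator over (text, weight, payload)
//   List<Lookup.LookupResult> hits = suggester.lookup("app", 5, true, true);
//   suggester.close();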

/** edgegrams for searching short prefixes without Prefix Query
 *  that's controlled by {@linkplain #minPrefixChars} */
protected final static String TEXTGRAMS_FIELD_NAME = "textgrams";
/** Field name used for the indexed text. */
protected final static String TEXT_FIELD_NAME = "text";

/** Field name used for the indexed text, as a
 *  StringField, for exact lookup. */
protected final static String EXACT_TEXT_FIELD_NAME = "exacttext";

/** Field name used for the indexed context, as a
 *  StringField and a SortedSetDVField, for filtering. */
protected final static String CONTEXTS_FIELD_NAME = "contexts";
/** Analyzer used at search time */
protected final Analyzer queryAnalyzer;
/** Analyzer used at index time */
protected final Analyzer indexAnalyzer;

private final Directory dir;
final int minPrefixChars;
private final boolean allTermsRequired;
private final boolean highlight;
private final boolean commitOnBuild;
private final boolean closeIndexWriterOnBuild;
/** Used for ongoing NRT additions/updates. */
protected IndexWriter writer;
/** {@link IndexSearcher} used for lookups. */
protected SearcherManager searcherMgr;
/** Used to manage concurrent access to searcherMgr */
protected final Object searcherMgrLock = new Object();

/** Default minimum number of leading characters before
 *  PrefixQuery is used (4). */
public static final int DEFAULT_MIN_PREFIX_CHARS = 4;
/** Default boolean clause option for multiple terms matching (all terms required). */
public static final boolean DEFAULT_ALL_TERMS_REQUIRED = true;

/** Default highlighting option. */
public static final boolean DEFAULT_HIGHLIGHT = true;
/** Default option to close the IndexWriter once the index has been built. */
protected final static boolean DEFAULT_CLOSE_INDEXWRITER_ON_BUILD = true;
/** How we sort the postings and search results. */
private static final Sort SORT = new Sort(new SortField("weight", SortField.Type.LONG, true));

/** Create a new instance, loading from a previously built
 *  AnalyzingInfixSuggester directory, if it exists.  This directory must be
 *  private to the infix suggester (i.e., not an external
 *  Lucene index).  Note that {@link #close}
 *  will also close the provided directory. */
public AnalyzingInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
  this(dir, analyzer, analyzer, DEFAULT_MIN_PREFIX_CHARS, false, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}

/** Create a new instance, loading from a previously built
 *  AnalyzingInfixSuggester directory, if it exists.  This directory must be
 *  private to the infix suggester (i.e., not an external
 *  Lucene index).  Note that {@link #close}
 *  will also close the provided directory.
 *
 *  @param minPrefixChars Minimum number of leading characters
 *     before PrefixQuery is used (default 4).
 *     Prefixes shorter than this are indexed as character
 *     ngrams (increasing index size but making lookups
 *     faster).
 *
 *  @param commitOnBuild Call commit after the index has finished building.  This would persist the
 *     suggester index to disk and future instances of this suggester can use this pre-built dictionary. */
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
                               boolean commitOnBuild) throws IOException {
  this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, DEFAULT_ALL_TERMS_REQUIRED, DEFAULT_HIGHLIGHT);
}

/** Create a new instance, loading from a previously built
 *  AnalyzingInfixSuggester directory, if it exists.  This directory must be
 *  private to the infix suggester (i.e., not an external
 *  Lucene index).  Note that {@link #close}
 *  will also close the provided directory.
 *
 *  @param minPrefixChars Minimum number of leading characters
 *     before PrefixQuery is used (default 4).
 *     Prefixes shorter than this are indexed as character
 *     ngrams (increasing index size but making lookups
 *     faster).
 *
 *  @param commitOnBuild Call commit after the index has finished building.  This would persist the
 *     suggester index to disk and future instances of this suggester can use this pre-built dictionary.
 *
 *  @param allTermsRequired All terms in the suggest query must be matched.
 *  @param highlight Highlight suggest query in suggestions. */
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
                               boolean commitOnBuild, boolean allTermsRequired, boolean highlight) throws IOException {
  this(dir, indexAnalyzer, queryAnalyzer, minPrefixChars, commitOnBuild, allTermsRequired, highlight, DEFAULT_CLOSE_INDEXWRITER_ON_BUILD);
}

/** Create a new instance, loading from a previously built
 *  AnalyzingInfixSuggester directory, if it exists.  This directory must be
 *  private to the infix suggester (i.e., not an external
 *  Lucene index).  Note that {@link #close}
 *  will also close the provided directory.
 *
 *  @param minPrefixChars Minimum number of leading characters
 *     before PrefixQuery is used (default 4).
 *     Prefixes shorter than this are indexed as character
 *     ngrams (increasing index size but making lookups
 *     faster).
 *
 *  @param commitOnBuild Call commit after the index has finished building.  This would persist the
 *     suggester index to disk and future instances of this suggester can use this pre-built dictionary.
 *
 *  @param allTermsRequired All terms in the suggest query must be matched.
 *  @param highlight Highlight suggest query in suggestions.
 *  @param closeIndexWriterOnBuild If true, the IndexWriter will be closed after the index has finished building. */
public AnalyzingInfixSuggester(Directory dir, Analyzer indexAnalyzer, Analyzer queryAnalyzer, int minPrefixChars,
                               boolean commitOnBuild, boolean allTermsRequired, boolean highlight,
                               boolean closeIndexWriterOnBuild) throws IOException {

  if (minPrefixChars < 0) {
    throw new IllegalArgumentException("minPrefixChars must be >= 0; got: " + minPrefixChars);
  }

  this.queryAnalyzer = queryAnalyzer;
  this.indexAnalyzer = indexAnalyzer;
  this.dir = dir;
  this.minPrefixChars = minPrefixChars;
  this.commitOnBuild = commitOnBuild;
  this.allTermsRequired = allTermsRequired;
  this.highlight = highlight;
  this.closeIndexWriterOnBuild = closeIndexWriterOnBuild;

  if (DirectoryReader.indexExists(dir)) {
    // Already built; open it:
    searcherMgr = new SearcherManager(dir, null);
  }
}

/** Override this to customize index settings, e.g. which
 *  codec to use. */
protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
  IndexWriterConfig iwc = new IndexWriterConfig(indexAnalyzer);
  iwc.setOpenMode(openMode);

  // This way all merged segments will be sorted at
  // merge time, allow for per-segment early termination
  // when those segments are searched:
  iwc.setIndexSort(SORT);

  return iwc;
}
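
// Example override of getIndexWriterConfig (a sketch; the buffer size is an illustrative assumption):
//
//   AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(dir, analyzer) {
//     @Override
//     protected IndexWriterConfig getIndexWriterConfig(Analyzer indexAnalyzer, IndexWriterConfig.OpenMode openMode) {
//       IndexWriterConfig iwc = super.getIndexWriterConfig(indexAnalyzer, openMode);
//       iwc.setRAMBufferSizeMB(64.0);   // e.g. tune the indexing buffer, or set a custom codec here
//       return iwc;
//     }
//   };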

/** Subclass can override to choose a specific {@link Directory} implementation. */
protected Directory getDirectory(Path path) throws IOException {
  return FSDirectory.open(path);
}

@Override
public void build(InputIterator iter) throws IOException {
  synchronized (searcherMgrLock) {
    if (searcherMgr != null) {
      searcherMgr.close();
      searcherMgr = null;
    }

    if (writer != null) {
      writer.close();
      writer = null;
    }

    boolean success = false;
    try {
      // First pass: build a temporary normal Lucene index,
      // just indexing the suggestions as they iterate:
      writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
      //long t0 = System.nanoTime();

      // TODO: use threads?
      BytesRef text;
      while ((text = iter.next()) != null) {
        BytesRef payload;
        if (iter.hasPayloads()) {
          payload = iter.payload();
        } else {
          payload = null;
        }

        add(text, iter.contexts(), iter.weight(), payload);
      }

      //System.out.println("initial indexing time: " + ((System.nanoTime()-t0)/1000000) + " msec");
      if (commitOnBuild || closeIndexWriterOnBuild) {
        commit();
      }
      searcherMgr = new SearcherManager(writer, null);
      success = true;
    } finally {
      if (success) {
        if (closeIndexWriterOnBuild) {
          writer.close();
          writer = null;
        }
      } else {  // failure
        if (writer != null) {
          writer.rollback();
          writer = null;
        }
      }
    }
  }
}
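
// Example of building a persistent suggester (a sketch; with commitOnBuild=true the built
// index is committed to the Directory, so a later instance opened on it skips the rebuild):
//
//   AnalyzingInfixSuggester suggester =
//       new AnalyzingInfixSuggester(dir, indexAnalyzer, queryAnalyzer, 4, true);
//   suggester.build(inputIterator);   // commits on completion
//   suggester.close();
//   // later, a new instance over the same Directory reuses the pre-built index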

/** Commits all pending changes made to this suggester to disk.
 *
 *  @see IndexWriter#commit */
public void commit() throws IOException {
  if (writer == null) {
    if (searcherMgr == null || closeIndexWriterOnBuild == false) {
      throw new IllegalStateException("Cannot commit on a closed writer. Add documents first");
    }
    // else no-op: writer was committed and closed after the index was built, so commit is unnecessary
  } else {
    writer.commit();
  }
}

private Analyzer getGramAnalyzer() {
  return new AnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
      return indexAnalyzer;
    }

    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
      assert !(fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars == 0) :
          "no need \"textgrams\" when minPrefixChars=" + minPrefixChars;
      if (fieldName.equals(TEXTGRAMS_FIELD_NAME) && minPrefixChars > 0) {
        // TODO: should use an EdgeNGramTokenFilterFactory here
        TokenFilter filter = new EdgeNGramTokenFilter(components.getTokenStream(), 1, minPrefixChars, false);
        return new TokenStreamComponents(components.getSource(), filter);
      } else {
        return components;
      }
    }
  };
}

private synchronized void ensureOpen() throws IOException {
  if (writer == null) {
    if (DirectoryReader.indexExists(dir)) {
      // Already built; open it:
      writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.APPEND));
    } else {
      writer = new IndexWriter(dir, getIndexWriterConfig(getGramAnalyzer(), IndexWriterConfig.OpenMode.CREATE));
    }
    synchronized (searcherMgrLock) {
      SearcherManager oldSearcherMgr = searcherMgr;
      searcherMgr = new SearcherManager(writer, null);
      if (oldSearcherMgr != null) {
        oldSearcherMgr.close();
      }
    }
  }
}

/** Adds a new suggestion.  Be sure to use {@link #update}
 *  instead if you want to replace a previous suggestion.
 *  After adding or updating a batch of new suggestions,
 *  you must call {@link #refresh} in the end in order to
 *  see the suggestions in {@link #lookup} */
public void add(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
  ensureOpen();
  writer.addDocument(buildDocument(text, contexts, weight, payload));
}

/** Updates a previous suggestion, matching the exact same
 *  text as before.  Use this to change the weight or
 *  payload of an already added suggestion.  If you know
 *  this text is not already present you can use {@link #add}
 *  instead.  After adding or updating a batch of new
 *  suggestions, you must call {@link #refresh} in the end
 *  in order to see the suggestions in {@link #lookup} */
public void update(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
  ensureOpen();
  writer.updateDocument(new Term(EXACT_TEXT_FIELD_NAME, text.utf8ToString()),
                        buildDocument(text, contexts, weight, payload));
}

private Document buildDocument(BytesRef text, Set<BytesRef> contexts, long weight, BytesRef payload) throws IOException {
  String textString = text.utf8ToString();
  Document doc = new Document();
  FieldType ft = getTextFieldType();
  doc.add(new Field(TEXT_FIELD_NAME, textString, ft));
  if (minPrefixChars > 0) {
    doc.add(new Field(TEXTGRAMS_FIELD_NAME, textString, ft));
  }
  doc.add(new StringField(EXACT_TEXT_FIELD_NAME, textString, Field.Store.NO));
  doc.add(new BinaryDocValuesField(TEXT_FIELD_NAME, text));
  doc.add(new NumericDocValuesField("weight", weight));
  if (payload != null) {
    doc.add(new BinaryDocValuesField("payloads", payload));
  }
  if (contexts != null) {
    for (BytesRef context : contexts) {
      doc.add(new StringField(CONTEXTS_FIELD_NAME, context, Field.Store.NO));
      doc.add(new SortedSetDocValuesField(CONTEXTS_FIELD_NAME, context));
    }
  }
  return doc;
}

/** Reopens the underlying searcher; it's best to "batch
 *  up" many additions/updates, and then call refresh
 *  once in the end. */
public void refresh() throws IOException {
  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }
  if (writer != null) {
    searcherMgr.maybeRefreshBlocking();
  }
  // else no-op: writer was committed and closed after the index was built
  // and before searchMgr was constructed, so refresh is unnecessary
}
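
// Example NRT update flow (a sketch; the texts, weights and payloads are illustrative):
//
//   suggester.update(new BytesRef("lucene in action"), null, 10L, new BytesRef("meta"));
//   suggester.add(new BytesRef("lucene for dummies"), null, 3L, null);
//   suggester.refresh();   // make the whole batch visible to lookup()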

/**
 * Subclass can override this method to change the field type of the text field
 * e.g. to change the index options
 */
protected FieldType getTextFieldType() {
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS);
  ft.setOmitNorms(true);
  return ft;
}

@Override
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) throws IOException {
  return lookup(key, contexts, num, allTermsRequired, highlight);
}
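
// Example getTextFieldType override (a sketch): a subclass could also index positions,
// e.g. if it wants phrase-style matching against the suggestion text:
//
//   @Override
//   protected FieldType getTextFieldType() {
//     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
//     ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
//     ft.setOmitNorms(true);
//     return ft;
//   }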
/** Lookup, without any context. */
public List<LookupResult> lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  return lookup(key, (BooleanQuery) null, num, allTermsRequired, doHighlight);
}

/** Lookup, with context but without booleans.  Context booleans default to SHOULD,
 *  so each suggestion must have at least one of the contexts. */
public List<LookupResult> lookup(CharSequence key, Set<BytesRef> contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  return lookup(key, toQuery(contexts), num, allTermsRequired, doHighlight);
}

/** This is called if the last token isn't ended
 *  (e.g. user did not type a space after it).  Return an
 *  appropriate Query clause to add to the BooleanQuery. */
protected Query getLastTokenQuery(String token) throws IOException {
  if (token.length() < minPrefixChars) {
    // The leading ngram was directly indexed:
    return new TermQuery(new Term(TEXTGRAMS_FIELD_NAME, token));
  }

  return new PrefixQuery(new Term(TEXT_FIELD_NAME, token));
}

/** Retrieve suggestions, specifying whether all terms
 *  must match ({@code allTermsRequired}) and whether the hits
 *  should be highlighted ({@code doHighlight}). */
public List<LookupResult> lookup(CharSequence key, Map<BytesRef, BooleanClause.Occur> contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  return lookup(key, toQuery(contextInfo), num, allTermsRequired, doHighlight);
}

private BooleanQuery toQuery(Map<BytesRef,BooleanClause.Occur> contextInfo) {
  if (contextInfo == null || contextInfo.isEmpty()) {
    return null;
  }

  BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
  for (Map.Entry<BytesRef,BooleanClause.Occur> entry : contextInfo.entrySet()) {
    addContextToQuery(contextFilter, entry.getKey(), entry.getValue());
  }

  return contextFilter.build();
}

private BooleanQuery toQuery(Set<BytesRef> contextInfo) {
  if (contextInfo == null || contextInfo.isEmpty()) {
    return null;
  }

  BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
  for (BytesRef context : contextInfo) {
    addContextToQuery(contextFilter, context, BooleanClause.Occur.SHOULD);
  }
  return contextFilter.build();
}

/**
 * This method is handy as we do not need access to internal fields such as CONTEXTS_FIELD_NAME in order to build queries.
 * However, this may not be its best location.
 *
 * @param query an instance of {@link BooleanQuery}
 * @param context the context
 * @param clause one of {@link Occur}
 */
public void addContextToQuery(BooleanQuery.Builder query, BytesRef context, BooleanClause.Occur clause) {
  // NOTE: we "should" wrap this in
  // ConstantScoreQuery, or maybe send this as a
  // Filter instead to search.

  // TODO: if we had a BinaryTermField we could fix
  // this "must be valid utf8" limitation:
  query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context)), clause);
}

/**
 * This is an advanced method providing the capability to send down to the suggester any
 * arbitrary Lucene query to be used to filter the result of the suggester
 *
 * @param key the keyword being looked for
 * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester.
 *                     {@link #addContextToQuery} could be used to build this contextQuery.
 * @param num number of items to return
 * @param allTermsRequired all searched terms must match or not
 * @param doHighlight if true, the matching term will be highlighted in the search result
 * @return the result of the suggester
 * @throws IOException if there is an IO exception while reading data from the index
 */
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {

  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }

  final BooleanClause.Occur occur;
  if (allTermsRequired) {
    occur = BooleanClause.Occur.MUST;
  } else {
    occur = BooleanClause.Occur.SHOULD;
  }

  BooleanQuery.Builder query;
  Set<String> matchedTokens;
  String prefixToken = null;

  try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
    //long t0 = System.currentTimeMillis();
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    String lastToken = null;
    query = new BooleanQuery.Builder();
    int maxEndOffset = -1;
    matchedTokens = new HashSet<>();
    while (ts.incrementToken()) {
      if (lastToken != null) {
        matchedTokens.add(lastToken);
        query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
      }
      lastToken = termAtt.toString();
      if (lastToken != null) {
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      }
    }
    ts.end();

    if (lastToken != null) {
      Query lastQuery;
      if (maxEndOffset == offsetAtt.endOffset()) {
        // Use PrefixQuery (or the ngram equivalent) when
        // there was no trailing discarded chars in the
        // string (e.g. whitespace), so that if query does
        // not end with a space we show prefix matches for
        // that token:
        lastQuery = getLastTokenQuery(lastToken);
        prefixToken = lastToken;
      } else {
        // Use TermQuery for an exact match if there were
        // trailing discarded chars (e.g. whitespace), so
        // that if query ends with a space we only show
        // exact matches for that term:
        matchedTokens.add(lastToken);
        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
      }

      if (lastQuery != null) {
        query.add(lastQuery, occur);
      }
    }

    if (contextQuery != null) {
      boolean allMustNot = true;
      for (BooleanClause clause : contextQuery.clauses()) {
        if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
          allMustNot = false;
          break;
        }
      }

      if (allMustNot) {
        // All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
        for (BooleanClause clause : contextQuery.clauses()) {
          query.add(clause);
        }
      } else if (allTermsRequired == false) {
        // We must carefully upgrade the query clauses to MUST:
        BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
        newQuery.add(query.build(), BooleanClause.Occur.MUST);
        newQuery.add(contextQuery, BooleanClause.Occur.MUST);
        query = newQuery;
      } else {
        // Add contextQuery as sub-query
        query.add(contextQuery, BooleanClause.Occur.MUST);
      }
    }
  }

  // TODO: we could allow blended sort here, combining
  // weight w/ score.  Now we ignore score and sort only
  // by weight:
  Query finalQuery = finishQuery(query, allTermsRequired);

  //System.out.println("finalQuery=" + finalQuery);

  // Sort by weight, descending:
  TopFieldCollector c = TopFieldCollector.create(SORT, num, 1);
  List<LookupResult> results = null;
  SearcherManager mgr;
  IndexSearcher searcher;
  synchronized (searcherMgrLock) {
    mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
    searcher = mgr.acquire();
  }
  try {
    //System.out.println("got searcher=" + searcher);
    searcher.search(finalQuery, c);

    TopFieldDocs hits = c.topDocs();

    // Slower way if postings are not pre-sorted by weight:
    // hits = searcher.search(query, null, num, SORT);
    results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
  } finally {
    mgr.release(searcher);
  }

  //System.out.println((System.currentTimeMillis() - t0) + " msec for infix suggest");
  //System.out.println(results);

  return results;
}
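
// Example context-filtered lookup (a sketch; the context values are illustrative):
//
//   BooleanQuery.Builder contextFilter = new BooleanQuery.Builder();
//   suggester.addContextToQuery(contextFilter, new BytesRef("books"), BooleanClause.Occur.SHOULD);
//   suggester.addContextToQuery(contextFilter, new BytesRef("movies"), BooleanClause.Occur.SHOULD);
//   List<Lookup.LookupResult> hits = suggester.lookup("app", contextFilter.build(), 5, true, true);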

/**
 * Create the results based on the search hits.
 * Can be overridden by subclass to add particular behavior (e.g. weight transformation).
 * Note that there is no prefix token (the {@code prefixToken} argument will
 * be null) whenever the final token in the incoming request was in fact finished
 * (had trailing characters, such as white-space).
 *
 * @throws IOException If there are problems reading fields from the underlying Lucene index.
 */
protected List<LookupResult> createResults(IndexSearcher searcher, TopFieldDocs hits, int num, CharSequence charSequence,
                                           boolean doHighlight, Set<String> matchedTokens, String prefixToken) throws IOException {

  List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
  List<LookupResult> results = new ArrayList<>();
  for (int i = 0; i < hits.scoreDocs.length; i++) {
    FieldDoc fd = (FieldDoc) hits.scoreDocs[i];
    BinaryDocValues textDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), TEXT_FIELD_NAME);
    textDV.advance(fd.doc);
    BytesRef term = textDV.binaryValue();
    String text = term.utf8ToString();
    long score = (Long) fd.fields[0];

    // This will just be null if app didn't pass payloads to build():
    // TODO: maybe just stored fields?  they compress...
    BinaryDocValues payloadsDV = MultiDocValues.getBinaryValues(searcher.getIndexReader(), "payloads");

    BytesRef payload;
    if (payloadsDV != null) {
      if (payloadsDV.advance(fd.doc) == fd.doc) {
        payload = BytesRef.deepCopyOf(payloadsDV.binaryValue());
      } else {
        payload = new BytesRef(BytesRef.EMPTY_BYTES);
      }
    } else {
      payload = null;
    }

    // Must look up sorted-set by segment:
    int segment = ReaderUtil.subIndex(fd.doc, leaves);
    SortedSetDocValues contextsDV = leaves.get(segment).reader().getSortedSetDocValues(CONTEXTS_FIELD_NAME);
    Set<BytesRef> contexts;
    if (contextsDV != null) {
      contexts = new HashSet<BytesRef>();
      int targetDocID = fd.doc - leaves.get(segment).docBase;
      if (contextsDV.advance(targetDocID) == targetDocID) {
        long ord;
        while ((ord = contextsDV.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
          BytesRef context = BytesRef.deepCopyOf(contextsDV.lookupOrd(ord));
          contexts.add(context);
        }
      }
    } else {
      contexts = null;
    }

    LookupResult result;

    if (doHighlight) {
      result = new LookupResult(text, highlight(text, matchedTokens, prefixToken), score, payload, contexts);
    } else {
      result = new LookupResult(text, score, payload, contexts);
    }

    results.add(result);
  }

  return results;
}

/** Subclass can override this to tweak the Query before
 *  searching. */
protected Query finishQuery(BooleanQuery.Builder in, boolean allTermsRequired) {
  return in.build();
}

/** Override this method to customize the Object
 *  representing a single highlighted suggestion; the
 *  result is set on each {@link org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey}
 *  member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
  try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    StringBuilder sb = new StringBuilder();
    int upto = 0;
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      int startOffset = offsetAtt.startOffset();
      int endOffset = offsetAtt.endOffset();
      if (upto < startOffset) {
        addNonMatch(sb, text.substring(upto, startOffset));
        upto = startOffset;
      } else if (upto > startOffset) {
        continue;
      }

      if (matchedTokens.contains(token)) {
        // Token matches.
        addWholeMatch(sb, text.substring(startOffset, endOffset), token);
        upto = endOffset;
      } else if (prefixToken != null && token.startsWith(prefixToken)) {
        addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
        upto = endOffset;
      }
    }
    ts.end();
    int endOffset = offsetAtt.endOffset();
    if (upto < endOffset) {
      addNonMatch(sb, text.substring(upto));
    }
    return sb.toString();
  }
}

/** Called while highlighting a single result, to append a
 *  non-matching chunk of text from the suggestion to the
 *  provided fragments list.
 *  @param sb The {@code StringBuilder} to append to
 *  @param text The text chunk to add */
protected void addNonMatch(StringBuilder sb, String text) {
  sb.append(text);
}

/** Called while highlighting a single result, to append
 *  the whole matched token to the provided fragments list.
 *  @param sb The {@code StringBuilder} to append to
 *  @param surface The surface form (original) text
 *  @param analyzed The analyzed token corresponding to the surface form text */
protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
  sb.append("<b>");
  sb.append(surface);
  sb.append("</b>");
}
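
// Example highlight customization (a sketch; the <em> markup is an illustrative choice):
//
//   AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(dir, analyzer) {
//     @Override
//     protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
//       sb.append("<em>").append(surface).append("</em>");
//     }
//   };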

/** Called while highlighting a single result, to append a
 *  matched prefix token, to the provided fragments list.
 *  @param sb The {@code StringBuilder} to append to
 *  @param surface The fragment of the surface form
 *                 (indexed during {@link #build}), corresponding to this match
 *  @param analyzed The analyzed token that matched
 *  @param prefixToken The prefix of the token that matched */
protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
  // TODO: apps can try to invert their analysis logic
  // here, e.g. downcase the two before checking prefix:
  if (prefixToken.length() >= surface.length()) {
    addWholeMatch(sb, surface, analyzed);
    return;
  }
  sb.append("<b>");
  sb.append(surface.substring(0, prefixToken.length()));
  sb.append("</b>");
  sb.append(surface.substring(prefixToken.length()));
}

@Override
public boolean store(DataOutput in) throws IOException {
  return false;
}

@Override
public boolean load(DataInput out) throws IOException {
  return false;
}

@Override
public void close() throws IOException {
  if (searcherMgr != null) {
    searcherMgr.close();
    searcherMgr = null;
  }
  if (writer != null) {
    writer.close();
    writer = null;
  }
  if (dir != null) {
    dir.close();
  }
}

@Override
public long ramBytesUsed() {
  long mem = RamUsageEstimator.shallowSizeOf(this);
  try {
    if (searcherMgr != null) {
      SearcherManager mgr;
      IndexSearcher searcher;
      synchronized (searcherMgrLock) {
        mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
        searcher = mgr.acquire();
      }
      try {
        for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
          LeafReader reader = FilterLeafReader.unwrap(context.reader());
          if (reader instanceof SegmentReader) {
            mem += ((SegmentReader) context.reader()).ramBytesUsed();
          }
        }
      } finally {
        mgr.release(searcher);
      }
    }
    return mem;
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}

@Override
public Collection<Accountable> getChildResources() {
  List<Accountable> resources = new ArrayList<>();
  try {
    if (searcherMgr != null) {
      SearcherManager mgr;
      IndexSearcher searcher;
      synchronized (searcherMgrLock) {
        mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
        searcher = mgr.acquire();
      }
      try {
        for (LeafReaderContext context : searcher.getIndexReader().leaves()) {
          LeafReader reader = FilterLeafReader.unwrap(context.reader());
          if (reader instanceof SegmentReader) {
            resources.add(Accountables.namedAccountable("segment", (SegmentReader) reader));
          }
        }
      } finally {
        mgr.release(searcher);
      }
    }
    return Collections.unmodifiableList(resources);
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
}

@Override
public long getCount() throws IOException {
  if (searcherMgr == null) {
    return 0;
  }
  SearcherManager mgr;
  IndexSearcher searcher;
  synchronized (searcherMgrLock) {
    mgr = searcherMgr; // acquire & release on same SearcherManager, via local reference
    searcher = mgr.acquire();
  }
  try {
    return searcher.getIndexReader().numDocs();
  } finally {
    mgr.release(searcher);
  }
}
}