/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;

import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;

Collector that collects completion and score, along with document id

Non scoring collector that collect completions in order of their pre-computed scores.

NOTE: One document can be collected multiple times if a document is matched for multiple unique completions for a given query

Subclasses should only override collect(int, CharSequence, CharSequence, float).

NOTE: SimpleCollector.setScorer(Scorable) and collect(int) is not used

@lucene.experimental
/** * {@link org.apache.lucene.search.Collector} that collects completion and * score, along with document id * <p> * Non scoring collector that collect completions in order of their * pre-computed scores. * <p> * NOTE: One document can be collected multiple times if a document * is matched for multiple unique completions for a given query * <p> * Subclasses should only override * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}. * <p> * NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and * {@link #collect(int)} is not used * * @lucene.experimental */
public class TopSuggestDocsCollector extends SimpleCollector { private final SuggestScoreDocPriorityQueue priorityQueue; private final int num;
Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them
/** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
private final List<SuggestScoreDoc> pendingResults;
Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment
/** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
final CharArraySet seenSurfaceForms;
Document base offset for the current Leaf
/** Document base offset for the current Leaf */
protected int docBase;
Sole constructor Collects at most num completions with corresponding document and weight
/** * Sole constructor * * Collects at most <code>num</code> completions * with corresponding document and weight */
public TopSuggestDocsCollector(int num, boolean skipDuplicates) { if (num <= 0) { throw new IllegalArgumentException("'num' must be > 0"); } this.num = num; this.priorityQueue = new SuggestScoreDocPriorityQueue(num); if (skipDuplicates) { seenSurfaceForms = new CharArraySet(num, false); pendingResults = new ArrayList<>(); } else { seenSurfaceForms = null; pendingResults = null; } }
Returns true if duplicates are filtered out
/** Returns true if duplicates are filtered out */
protected boolean doSkipDuplicates() { return seenSurfaceForms != null; }
Returns the number of results to be collected
/** * Returns the number of results to be collected */
public int getCountToCollect() { return num; } @Override protected void doSetNextReader(LeafReaderContext context) throws IOException { docBase = context.docBase; if (seenSurfaceForms != null) { seenSurfaceForms.clear(); // NOTE: this also clears the priorityQueue: for (SuggestScoreDoc hit : priorityQueue.getResults()) { pendingResults.add(hit); } } }
Called for every matched completion, similar to LeafCollector.collect(int) but for completions. NOTE: collection at the leaf level is guaranteed to be in descending order of score
/** * Called for every matched completion, * similar to {@link org.apache.lucene.search.LeafCollector#collect(int)} * but for completions. * * NOTE: collection at the leaf level is guaranteed to be in * descending order of score */
public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException { SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score); if (current == priorityQueue.insertWithOverflow(current)) { // if the current SuggestScoreDoc has overflown from pq, // we can assume all of the successive collections from // this leaf will be overflown as well // TODO: reuse the overflow instance? throw new CollectionTerminatedException(); } }
Returns at most num Top scoring TopSuggestDocss
/** * Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s */
public TopSuggestDocs get() throws IOException { SuggestScoreDoc[] suggestScoreDocs; if (seenSurfaceForms != null) { // NOTE: this also clears the priorityQueue: for (SuggestScoreDoc hit : priorityQueue.getResults()) { pendingResults.add(hit); } // Deduplicate all hits: we already dedup'd efficiently within each segment by // truncating the FST top paths search, but across segments there may still be dups: seenSurfaceForms.clear(); // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num * // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice: Collections.sort(pendingResults, (a, b) -> { // sort by higher score int cmp = Float.compare(b.score, a.score); if (cmp == 0) { // tie break by completion key cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key); if (cmp == 0) { // prefer smaller doc id, in case of a tie cmp = Integer.compare(a.doc, b.doc); } } return cmp; }); List<SuggestScoreDoc> hits = new ArrayList<>(); for (SuggestScoreDoc hit : pendingResults) { if (seenSurfaceForms.contains(hit.key) == false) { seenSurfaceForms.add(hit.key); hits.add(hit); if (hits.size() == num) { break; } } } suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]); } else { suggestScoreDocs = priorityQueue.getResults(); } if (suggestScoreDocs.length > 0) { return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs); } else { return TopSuggestDocs.EMPTY; } }
Ignored
/** * Ignored */
@Override public void collect(int doc) throws IOException { // {@link #collect(int, CharSequence, CharSequence, long)} is used // instead }
Ignored
/** * Ignored */
@Override public ScoreMode scoreMode() { return ScoreMode.COMPLETE; } }