org.apache.lucene/lucene-suggest/8.2.0 : org/apache/lucene/search/suggest/document/TopSuggestDocsCollector.java

TopSuggestDocsCollector
http://lucene.apache.org/lucene-parent/lucene-suggest: Lucene Suggest Module (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionTerminatedException;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.suggest.Lookup;

import static org.apache.lucene.search.suggest.document.TopSuggestDocs.SuggestScoreDoc;

Collector that collects completion and score, along with document id 
Non scoring collector that collect completions in order of their
pre-computed scores.

NOTE: One document can be collected multiple times if a document
is matched for multiple unique completions for a given query
 Subclasses should only override collect(int, CharSequence, CharSequence, float). 
 NOTE: SimpleCollector.setScorer(Scorable) and collect(int) is not used 
@lucene.experimental /**
 * {@link org.apache.lucene.search.Collector} that collects completion and
 * score, along with document id
 * <p>
 * Non scoring collector that collect completions in order of their
 * pre-computed scores.
 * <p>
 * NOTE: One document can be collected multiple times if a document
 * is matched for multiple unique completions for a given query
 * <p>
 * Subclasses should only override
 * {@link TopSuggestDocsCollector#collect(int, CharSequence, CharSequence, float)}.
 * <p>
 * NOTE: {@link #setScorer(org.apache.lucene.search.Scorable)} and
 * {@link #collect(int)} is not used
 *
 * @lucene.experimental
 */
public class TopSuggestDocsCollector extends SimpleCollector {

  private final SuggestScoreDocPriorityQueue priorityQueue;
  private final int num;

  Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them /** Only set if we are deduplicating hits: holds all per-segment hits until the end, when we dedup them */
  private final List<SuggestScoreDoc> pendingResults;

  Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment /** Only set if we are deduplicating hits: holds all surface forms seen so far in the current segment */
  final CharArraySet seenSurfaceForms;

  Document base offset for the current Leaf /** Document base offset for the current Leaf */
  protected int docBase;

  Sole constructor
Collects at most num completions
with corresponding document and weight
/**
   * Sole constructor
   *
   * Collects at most <code>num</code> completions
   * with corresponding document and weight
   */
  public TopSuggestDocsCollector(int num, boolean skipDuplicates) {
    if (num <= 0) {
      throw new IllegalArgumentException("'num' must be > 0");
    }
    this.num = num;
    this.priorityQueue = new SuggestScoreDocPriorityQueue(num);
    if (skipDuplicates) {
      seenSurfaceForms = new CharArraySet(num, false);
      pendingResults = new ArrayList<>();
    } else {
      seenSurfaceForms = null;
      pendingResults = null;
    }
  }

  Returns true if duplicates are filtered out /** Returns true if duplicates are filtered out */
  protected boolean doSkipDuplicates() {
    return seenSurfaceForms != null;
  }

  Returns the number of results to be collected
/**
   * Returns the number of results to be collected
   */
  public int getCountToCollect() {
    return num;
  }

  @Override
  protected void doSetNextReader(LeafReaderContext context) throws IOException {
    docBase = context.docBase;
    if (seenSurfaceForms != null) {
      seenSurfaceForms.clear();
      // NOTE: this also clears the priorityQueue:
      for (SuggestScoreDoc hit : priorityQueue.getResults()) {
        pendingResults.add(hit);
      }
    }
  }

  Called for every matched completion, similar to LeafCollector.collect(int) but for completions. NOTE: collection at the leaf level is guaranteed to be in descending order of score /**
   * Called for every matched completion,
   * similar to {@link org.apache.lucene.search.LeafCollector#collect(int)}
   * but for completions.
   *
   * NOTE: collection at the leaf level is guaranteed to be in
   * descending order of score
   */
  public void collect(int docID, CharSequence key, CharSequence context, float score) throws IOException {
    SuggestScoreDoc current = new SuggestScoreDoc(docBase + docID, key, context, score);
    if (current == priorityQueue.insertWithOverflow(current)) {
      // if the current SuggestScoreDoc has overflown from pq,
      // we can assume all of the successive collections from
      // this leaf will be overflown as well
      // TODO: reuse the overflow instance?
      throw new CollectionTerminatedException();
    }
  }

  Returns at most num Top scoring TopSuggestDocss /**
   * Returns at most <code>num</code> Top scoring {@link org.apache.lucene.search.suggest.document.TopSuggestDocs}s
   */
  public TopSuggestDocs get() throws IOException {

    SuggestScoreDoc[] suggestScoreDocs;
    
    if (seenSurfaceForms != null) {
      // NOTE: this also clears the priorityQueue:
      for (SuggestScoreDoc hit : priorityQueue.getResults()) {
        pendingResults.add(hit);
      }

      // Deduplicate all hits: we already dedup'd efficiently within each segment by
      // truncating the FST top paths search, but across segments there may still be dups:
      seenSurfaceForms.clear();

      // TODO: we could use a priority queue here to make cost O(N * log(num)) instead of O(N * log(N)), where N = O(num *
      // numSegments), but typically numSegments is smallish and num is smallish so this won't matter much in practice:

      Collections.sort(pendingResults,
          (a, b) -> {
            // sort by higher score
            int cmp = Float.compare(b.score, a.score);
            if (cmp == 0) {
              // tie break by completion key
              cmp = Lookup.CHARSEQUENCE_COMPARATOR.compare(a.key, b.key);
              if (cmp == 0) {
                // prefer smaller doc id, in case of a tie
                cmp = Integer.compare(a.doc, b.doc);
              }
            }
            return cmp;
          });

      List<SuggestScoreDoc> hits = new ArrayList<>();
      
      for (SuggestScoreDoc hit : pendingResults) {
        if (seenSurfaceForms.contains(hit.key) == false) {
          seenSurfaceForms.add(hit.key);
          hits.add(hit);
          if (hits.size() == num) {
            break;
          }
        }
      }
      suggestScoreDocs = hits.toArray(new SuggestScoreDoc[0]);
    } else {
      suggestScoreDocs = priorityQueue.getResults();
    }

    if (suggestScoreDocs.length > 0) {
      return new TopSuggestDocs(new TotalHits(suggestScoreDocs.length, TotalHits.Relation.EQUAL_TO), suggestScoreDocs);
    } else {
      return TopSuggestDocs.EMPTY;
    }
  }

  Ignored
/**
   * Ignored
   */
  @Override
  public void collect(int doc) throws IOException {
    // {@link #collect(int, CharSequence, CharSequence, long)} is used
    // instead
  }

  Ignored
/**
   * Ignored
   */
  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE;
  }
}
/

org.apache.lucene/ lucene-suggest/ 8.2.0/ org/apache/lucene/search/suggest/document/TopSuggestDocsCollector.java