/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
// A Collector implementation that collects the top-scoring hits, returning them as a
// TopDocs. This is used by IndexSearcher to implement TopDocs-based search. Hits are
// sorted by score descending and then (when the scores are tied) docID ascending. When
// you create an instance of this collector you should know in advance whether documents
// are going to be collected in doc ID order or not.
// NOTE: The values Float.NaN and Float.NEGATIVE_INFINITY are not valid scores. This
// collector will not properly collect hits with such scores.
/**
* A {@link Collector} implementation that collects the top-scoring hits,
* returning them as a {@link TopDocs}. This is used by {@link IndexSearcher} to
* implement {@link TopDocs}-based search. Hits are sorted by score descending
* and then (when the scores are tied) docID ascending. When you create an
* instance of this collector you should know in advance whether documents are
* going to be collected in doc Id order or not.
*
* <p><b>NOTE</b>: The values {@link Float#NaN} and
* {@link Float#NEGATIVE_INFINITY} are not valid scores. This
* collector will not properly collect hits with such
* scores.
*/
public abstract class TopScoreDocCollector extends TopDocsCollector<ScoreDoc> {
/**
 * Base {@link LeafCollector} that remembers the {@link Scorable} handed to
 * {@link #setScorer(Scorable)} so that subclasses can read it from
 * {@code collect}.
 */
abstract static class ScorerLeafCollector implements LeafCollector {

  /** The most recently supplied scorer; {@code null} until {@link #setScorer} is called. */
  Scorable scorer;

  /**
   * Stores the given scorer in {@link #scorer}.
   *
   * @param scorer the scorer to remember
   */
  @Override
  public void setScorer(final Scorable scorer) throws IOException {
    this.scorer = scorer;
  }
}
/**
 * Collector used when no "after" document is supplied: every hit whose score
 * beats the current queue top replaces it.
 */
private static class SimpleTopScoreDocCollector extends TopScoreDocCollector {

  SimpleTopScoreDocCollector(int numHits, int totalHitsThreshold) {
    super(numHits, totalHitsThreshold);
  }

  /**
   * Returns a leaf collector that adds {@code context.docBase} to each
   * collected doc id before storing it in the queue.
   */
  @Override
  public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
    final int segmentBase = context.docBase;
    return new ScorerLeafCollector() {

      @Override
      public void setScorer(Scorable scorer) throws IOException {
        super.setScorer(scorer);
        // Hand the current bound (if one applies) to the new scorer right away.
        updateMinCompetitiveScore(scorer);
      }

      @Override
      public void collect(int doc) throws IOException {
        final float hitScore = scorer.score();

        // This collector relies on scorers producing non-negative values;
        // note that the check is also false for NaN.
        assert hitScore >= 0;

        totalHits++;

        if (hitScore <= pqTop.score) {
          // Docs are returned in increasing doc id order and HitQueue favors
          // lower doc ids on ties, so a hit with a score equal to the queue
          // top cannot compete either: it is counted but not stored.
          final boolean countStillExact = totalHitsRelation == TotalHits.Relation.EQUAL_TO;
          if (countStillExact && totalHits > totalHitsThreshold) {
            // totalHitsThreshold was just exceeded: start setting the
            // minimum competitive score from now on.
            updateMinCompetitiveScore(scorer);
          }
        } else {
          // Competitive hit: overwrite the current queue top in place, then
          // fetch the new top and refresh the bound.
          pqTop.score = hitScore;
          pqTop.doc = segmentBase + doc;
          pqTop = pq.updateTop();
          updateMinCompetitiveScore(scorer);
        }
      }
    };
  }
}
/**
 * Collector used for paging: hits that sort at or before {@code after} are
 * counted in {@code totalHits} but never stored in the queue.
 */
private static class PagingTopScoreDocCollector extends TopScoreDocCollector {

  /** The hit after which collection starts (the bottom of the previous page). */
  private final ScoreDoc after;

  /** Number of hits that were actually written into the queue. */
  private int collectedHits;

  PagingTopScoreDocCollector(int numHits, ScoreDoc after, int totalHitsThreshold) {
    super(numHits, totalHitsThreshold);
    this.after = after;
    this.collectedHits = 0;
  }

  /** Returns the smaller of the number of stored hits and the queue size. */
  @Override
  protected int topDocsSize() {
    return Math.min(collectedHits, pq.size());
  }

  /**
   * Builds the result; a {@code null} results array is replaced by an empty
   * one so the total hit count is still reported.
   */
  @Override
  protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
    final ScoreDoc[] hits = (results == null) ? new ScoreDoc[0] : results;
    return new TopDocs(new TotalHits(totalHits, totalHitsRelation), hits);
  }

  @Override
  public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
    final int segmentBase = context.docBase;
    // Doc id of the "after" hit expressed relative to this segment.
    final int afterDocInSegment = after.doc - segmentBase;

    return new ScorerLeafCollector() {
      @Override
      public void collect(int doc) throws IOException {
        final float hitScore = scorer.score();

        // This collector relies on scorers producing non-negative values;
        // note that the check is also false for NaN.
        assert hitScore >= 0;

        totalHits++;

        final boolean onPreviousPage =
            hitScore > after.score
                || (hitScore == after.score && doc <= afterDocInSegment);
        if (onPreviousPage) {
          // Hit was collected on a previous page: count it, do not store it.
          if (totalHitsRelation == TotalHits.Relation.EQUAL_TO
              && totalHits > totalHitsThreshold) {
            // totalHitsThreshold was just exceeded: start setting the
            // minimum competitive score from now on.
            updateMinCompetitiveScore(scorer);
          }
          return;
        }

        if (hitScore <= pqTop.score) {
          // Docs are returned in increasing doc id order and HitQueue favors
          // lower doc ids on ties, so an equal score cannot compete either.
          return;
        }

        // Competitive hit: overwrite the current queue top in place, then
        // fetch the new top and refresh the bound.
        collectedHits++;
        pqTop.score = hitScore;
        pqTop.doc = segmentBase + doc;
        pqTop = pq.updateTop();
        updateMinCompetitiveScore(scorer);
      }
    };
  }
}
// Creates a new TopScoreDocCollector given the number of hits to collect and the number
// of hits to count accurately.
// NOTE: If the total hit count of the top docs is less than or exactly totalHitsThreshold
// then this value is accurate. On the other hand, if the TopDocs.totalHits value is
// greater than totalHitsThreshold then its value is a lower bound of the hit count. A
// value of Integer.MAX_VALUE will make the hit count accurate but will also likely make
// query processing slower.
// NOTE: The instances returned by this method pre-allocate a full array of length
// numHits, and fill the array with sentinel objects.
/**
 * Creates a new {@link TopScoreDocCollector} for the first page of results,
 * given the number of hits to collect and the number of hits to count
 * accurately. Delegates to {@link #create(int, ScoreDoc, int)} with no
 * "after" document.
 *
 * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
 * {@code totalHitsThreshold} then this value is accurate. On the other hand,
 * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
 * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
 * will make the hit count accurate but will also likely make query processing slower.
 *
 * <p><b>NOTE</b>: The instances returned by this method
 * pre-allocate a full array of length <code>numHits</code>, and fill the
 * array with sentinel objects.
 *
 * @param numHits the number of top hits to collect; must be greater than 0
 * @param totalHitsThreshold the number of hits to count accurately; must not be negative
 * @return a new collector
 * @throws IllegalArgumentException if {@code numHits <= 0} or {@code totalHitsThreshold < 0}
 */
public static TopScoreDocCollector create(int numHits, int totalHitsThreshold) {
  final ScoreDoc noPreviousPage = null;
  return create(numHits, noPreviousPage, totalHitsThreshold);
}
// Creates a new TopScoreDocCollector given the number of hits to collect, the bottom of
// the previous page, and the number of hits to count accurately.
// NOTE: If the total hit count of the top docs is less than or exactly totalHitsThreshold
// then this value is accurate. On the other hand, if the TopDocs.totalHits value is
// greater than totalHitsThreshold then its value is a lower bound of the hit count. A
// value of Integer.MAX_VALUE will make the hit count accurate but will also likely make
// query processing slower.
// NOTE: The instances returned by this method pre-allocate a full array of length
// numHits, and fill the array with sentinel objects.
/**
 * Creates a new {@link TopScoreDocCollector} given the number of hits to
 * collect, the bottom of the previous page, and the number of hits to count
 * accurately.
 *
 * <p><b>NOTE</b>: If the total hit count of the top docs is less than or exactly
 * {@code totalHitsThreshold} then this value is accurate. On the other hand,
 * if the {@link TopDocs#totalHits} value is greater than {@code totalHitsThreshold}
 * then its value is a lower bound of the hit count. A value of {@link Integer#MAX_VALUE}
 * will make the hit count accurate but will also likely make query processing slower.
 *
 * <p><b>NOTE</b>: The instances returned by this method
 * pre-allocate a full array of length <code>numHits</code>, and fill the
 * array with sentinel objects.
 *
 * @param numHits the number of top hits to collect; must be greater than 0
 * @param after the bottom hit of the previous page, or {@code null} to collect the first page
 * @param totalHitsThreshold the number of hits to count accurately; must not be negative
 * @return a paging collector when {@code after} is non-null, otherwise a plain one
 * @throws IllegalArgumentException if {@code numHits <= 0} or {@code totalHitsThreshold < 0}
 */
public static TopScoreDocCollector create(int numHits, ScoreDoc after, int totalHitsThreshold) {
  // Validate arguments up front, in the same order callers have always seen.
  if (numHits <= 0) {
    throw new IllegalArgumentException("numHits must be > 0; please use TotalHitCountCollector if you just need the total hit count");
  }
  if (totalHitsThreshold < 0) {
    throw new IllegalArgumentException("totalHitsThreshold must be >= 0, got " + totalHitsThreshold);
  }

  // No previous page: the simple collector is enough.
  if (after == null) {
    return new SimpleTopScoreDocCollector(numHits, totalHitsThreshold);
  }
  return new PagingTopScoreDocCollector(numHits, after, totalHitsThreshold);
}
/** Number of hits to count accurately before a minimum competitive score may be set. */
final int totalHitsThreshold;

/** Current top of the hit queue, i.e. the entry the next competitive hit overwrites. */
ScoreDoc pqTop;

/**
 * Package-private constructor; prevents instantiation from outside the
 * package. Use one of the {@code create} factory methods instead.
 *
 * @param numHits size of the hit queue
 * @param totalHitsThreshold the number of hits to count accurately
 */
TopScoreDocCollector(int numHits, int totalHitsThreshold) {
  super(new HitQueue(numHits, true));
  // HitQueue implements getSentinelObject to return a ScoreDoc, so top() is
  // already initialized at this point.
  this.pqTop = pq.top();
  this.totalHitsThreshold = totalHitsThreshold;
}
/**
 * Wraps the collected hits in a {@link TopDocs}.
 *
 * @param results the hits to return, or {@code null} if there are none
 * @param start unused by this implementation
 * @return {@code EMPTY_TOPDOCS} when {@code results} is {@code null}, otherwise a
 *     {@link TopDocs} carrying the current total hit count and relation
 */
@Override
protected TopDocs newTopDocs(ScoreDoc[] results, int start) {
  return results == null
      ? EMPTY_TOPDOCS
      : new TopDocs(new TotalHits(totalHits, totalHitsRelation), results);
}
/**
 * Reports how scores are consumed by this collector.
 *
 * @return {@link ScoreMode#COMPLETE} when {@code totalHitsThreshold} is
 *     {@link Integer#MAX_VALUE}, otherwise {@link ScoreMode#TOP_SCORES}
 */
@Override
public ScoreMode scoreMode() {
  if (totalHitsThreshold == Integer.MAX_VALUE) {
    return ScoreMode.COMPLETE;
  }
  return ScoreMode.TOP_SCORES;
}
/**
 * Tells the scorer the smallest score that can still enter the queue, and
 * marks the total hit count as a lower bound.
 *
 * <p>Does nothing unless more than {@code totalHitsThreshold} hits have been
 * counted and the queue top holds a real hit rather than a sentinel.
 *
 * @param scorer the scorer to notify
 * @throws IOException if the scorer fails to accept the new bound
 */
protected void updateMinCompetitiveScore(Scorable scorer) throws IOException {
  if (totalHits <= totalHitsThreshold) {
    // Still counting accurately; no bound may be set yet.
    return;
  }
  if (pqTop == null) {
    return;
  }
  if (pqTop.score == Float.NEGATIVE_INFINITY) {
    // -Infinity is the score of sentinels: the queue is not full of real hits.
    return;
  }

  // Ties are broken on doc id and docs are collected in doc id order, so only
  // a strictly greater score can compete: require the next float up.
  final float minCompetitive = Math.nextUp(pqTop.score);
  scorer.setMinCompetitiveScore(minCompetitive);
  totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
}
}