/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.util.Arrays;

import org.apache.lucene.util.BytesRefHash;

Ranks passages found by UnifiedHighlighter.

Each passage is scored as a miniature document within the document. The final score is computed as norm * ∑ (weight * tf). The default implementation is norm * BM25.

@lucene.experimental
/** * Ranks passages found by {@link UnifiedHighlighter}. * <p> * Each passage is scored as a miniature document within the document. * The final score is computed as {@link #norm} * &sum; ({@link #weight} * {@link #tf}). * The default implementation is {@link #norm} * BM25. * * @lucene.experimental */
public class PassageScorer { // TODO: this formula is completely made up. It might not provide relevant snippets!
BM25 k1 parameter, controls term frequency normalization
/** * BM25 k1 parameter, controls term frequency normalization */
final float k1;
BM25 b parameter, controls length normalization.
/** * BM25 b parameter, controls length normalization. */
final float b;
A pivot used for length normalization.
/** * A pivot used for length normalization. */
final float pivot;
Creates PassageScorer with these default values:
  • k1 = 1.2,
  • b = 0.75.
  • pivot = 87
/** * Creates PassageScorer with these default values: * <ul> * <li>{@code k1 = 1.2}, * <li>{@code b = 0.75}. * <li>{@code pivot = 87} * </ul> */
public PassageScorer() { // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ? // 87 is typical average english sentence length. this(1.2f, 0.75f, 87f); }
Creates PassageScorer with specified scoring parameters
Params:
  • k1 – Controls non-linear term frequency normalization (saturation).
  • b – Controls to what degree passage length normalizes tf values.
  • pivot – Pivot value for length normalization (some rough idea of average sentence length in characters).
/** * Creates PassageScorer with specified scoring parameters * * @param k1 Controls non-linear term frequency normalization (saturation). * @param b Controls to what degree passage length normalizes tf values. * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters). */
public PassageScorer(float k1, float b, float pivot) { this.k1 = k1; this.b = b; this.pivot = pivot; }
Computes term importance, given its in-document statistics.
Params:
  • contentLength – length of document in characters
  • totalTermFreq – number of time term occurs in document
Returns:term importance
/** * Computes term importance, given its in-document statistics. * * @param contentLength length of document in characters * @param totalTermFreq number of time term occurs in document * @return term importance */
public float weight(int contentLength, int totalTermFreq) { // approximate #docs from content length float numDocs = 1 + contentLength / pivot; // numDocs not numDocs - docFreq (ala DFR), since we approximate numDocs return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D) / (totalTermFreq + 0.5D)); }
Computes term weight, given the frequency within the passage and the passage's length.
Params:
  • freq – number of occurrences of within this passage
  • passageLen – length of the passage in characters.
Returns:term weight
/** * Computes term weight, given the frequency within the passage * and the passage's length. * * @param freq number of occurrences of within this passage * @param passageLen length of the passage in characters. * @return term weight */
public float tf(int freq, int passageLen) { float norm = k1 * ((1 - b) + b * (passageLen / pivot)); return freq / (freq + norm); }
Normalize a passage according to its position in the document.

Typically passages towards the beginning of the document are more useful for summarizing the contents.

The default implementation is 1 + 1/log(pivot + passageStart)

Params:
  • passageStart – start offset of the passage
Returns:a boost value multiplied into the passage's core.
/** * Normalize a passage according to its position in the document. * <p> * Typically passages towards the beginning of the document are * more useful for summarizing the contents. * <p> * The default implementation is <code>1 + 1/log(pivot + passageStart)</code> * * @param passageStart start offset of the passage * @return a boost value multiplied into the passage's core. */
public float norm(int passageStart) { return 1 + 1 / (float) Math.log(pivot + passageStart); } public float score(Passage passage, int contentLength) { float score = 0; BytesRefHash termsHash = new BytesRefHash(); int hitCount = passage.getNumMatches(); int[] termFreqsInPassage = new int[hitCount]; // maximum size int[] termFreqsInDoc = new int[hitCount]; Arrays.fill(termFreqsInPassage, 0); for (int i = 0; i < passage.getNumMatches(); i++) { int termIndex = termsHash.add(passage.getMatchTerms()[i]); if (termIndex < 0) { termIndex = -(termIndex + 1); } else { termFreqsInDoc[termIndex] = passage.getMatchTermFreqsInDoc()[i]; } termFreqsInPassage[termIndex]++; } for (int i = 0; i < termsHash.size(); i++) { score += tf(termFreqsInPassage[i], passage.getLength()) * weight(contentLength, termFreqsInDoc[i]); } score *= norm(passage.getStartOffset()); return score; } }