/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;


import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.similarities.Normalization.NoNormalization;

Implements the divergence from randomness (DFR) framework introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002. Probabilistic models of information retrieval based on measuring the divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002), 357-389.

The DFR scoring formula is composed of three separate components: the basic model, the aftereffect and an additional normalization component, represented by the classes BasicModel, AfterEffect and Normalization, respectively. The names of these classes were chosen to match the names of their counterparts in the Terrier IR engine.

To construct a DFRSimilarity, you must specify the implementations for all three components of DFR:

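  1. BasicModel: Basic model of information content:
       • BasicModelG: Geometric approximation of Bose-Einstein
       • BasicModelIn: Inverse document frequency
       • BasicModelIne: Inverse expected document frequency [mixture of Poisson and IDF]
       • BasicModelIF: Inverse term frequency [approximation of I(ne)]
  2. AfterEffect: First normalization of information gain:
       • AfterEffectL: Laplace's law of succession
       • AfterEffectB: Ratio of two Bernoulli processes
  3. Normalization: Second (length) normalization:
       • NormalizationH1: Uniform distribution of term frequency
       • NormalizationH2: term frequency density inversely related to length
       • NormalizationH3: term frequency normalization provided by Dirichlet prior
       • NormalizationZ: term frequency normalization provided by a Zipfian relation
       • NoNormalization: no second normalization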

Note that qtf, the multiplicity of term-occurrence in the query, is not handled by this implementation.

Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson approximation of the Binomial) and D (Divergence approximation of the Binomial) are not implemented because their formula couldn't be written in a way that makes scores non-decreasing with the normalized term frequency.

See Also:
  • BasicModel
  • AfterEffect
  • Normalization
@lucene.experimental
/** * Implements the <em>divergence from randomness (DFR)</em> framework * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002. * Probabilistic models of information retrieval based on measuring the * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002), * 357-389. * <p>The DFR scoring formula is composed of three separate components: the * <em>basic model</em>, the <em>aftereffect</em> and an additional * <em>normalization</em> component, represented by the classes * {@code BasicModel}, {@code AfterEffect} and {@code Normalization}, * respectively. The names of these classes were chosen to match the names of * their counterparts in the Terrier IR engine.</p> * <p>To construct a DFRSimilarity, you must specify the implementations for * all three components of DFR: * <ol> * <li>{@link BasicModel}: Basic model of information content: * <ul> * <li>{@link BasicModelG}: Geometric approximation of Bose-Einstein * <li>{@link BasicModelIn}: Inverse document frequency * <li>{@link BasicModelIne}: Inverse expected document * frequency [mixture of Poisson and IDF] * <li>{@link BasicModelIF}: Inverse term frequency * [approximation of I(ne)] * </ul> * <li>{@link AfterEffect}: First normalization of information * gain: * <ul> * <li>{@link AfterEffectL}: Laplace's law of succession * <li>{@link AfterEffectB}: Ratio of two Bernoulli processes * </ul> * <li>{@link Normalization}: Second (length) normalization: * <ul> * <li>{@link NormalizationH1}: Uniform distribution of term * frequency * <li>{@link NormalizationH2}: term frequency density inversely * related to length * <li>{@link NormalizationH3}: term frequency normalization * provided by Dirichlet prior * <li>{@link NormalizationZ}: term frequency normalization provided * by a Zipfian relation * <li>{@link NoNormalization}: no second normalization * </ul> * </ol> * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query, * is not handled by this implementation.</p> * <p> 
Note that basic models BE (Limiting form of Bose-Einstein), P (Poisson * approximation of the Binomial) and D (Divergence approximation of the * Binomial) are not implemented because their formula couldn't be written in * a way that makes scores non-decreasing with the normalized term frequency. * @see BasicModel * @see AfterEffect * @see Normalization * @lucene.experimental */
public class DFRSimilarity extends SimilarityBase {
The basic model for information content.
/**
 * The basic model for information content.
 * <p>
 * Assigned once by the constructor, which rejects {@code null}, so this
 * field is never {@code null}. It produces the final per-document value in
 * {@code score} from the normalized term frequency and the after-effect
 * factor.
 */
protected final BasicModel basicModel;
The first normalization of the information content.
/**
 * The first normalization of the information content.
 * <p>
 * Assigned once by the constructor, which rejects {@code null}, so this
 * field is never {@code null}. Its {@code scoreTimes1pTfn(stats)} value is
 * handed to the basic model when scoring.
 */
protected final AfterEffect afterEffect;
The term frequency normalization.
/**
 * The term frequency normalization.
 * <p>
 * Assigned once by the constructor, which rejects {@code null}, so this
 * field is never {@code null}. Its {@code tfn(stats, freq, docLen)} result
 * is the normalized term frequency passed on to the basic model.
 */
protected final Normalization normalization;
Creates DFRSimilarity from the three components.

Note that null values are not allowed: if you want no normalization, instead pass NoNormalization.

Params:
  • basicModel – Basic model of information content
  • afterEffect – First normalization of information gain
  • normalization – Second (length) normalization
/** * Creates DFRSimilarity from the three components. * <p> * Note that <code>null</code> values are not allowed: * if you want no normalization, instead pass * {@link NoNormalization}. * @param basicModel Basic model of information content * @param afterEffect First normalization of information gain * @param normalization Second (length) normalization */
public DFRSimilarity(BasicModel basicModel, AfterEffect afterEffect, Normalization normalization) { if (basicModel == null || afterEffect == null || normalization == null) { throw new NullPointerException("null parameters not allowed."); } this.basicModel = basicModel; this.afterEffect = afterEffect; this.normalization = normalization; } @Override protected double score(BasicStats stats, double freq, double docLen) { double tfn = normalization.tfn(stats, freq, docLen); double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); return stats.getBoost() * basicModel.score(stats, tfn, aeTimes1pTfn); } @Override protected void explain(List<Explanation> subs, BasicStats stats, double freq, double docLen) { if (stats.getBoost() != 1.0d) { subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost")); } Explanation normExpl = normalization.explain(stats, freq, docLen); double tfn = normalization.tfn(stats, freq, docLen); double aeTimes1pTfn = afterEffect.scoreTimes1pTfn(stats); subs.add(normExpl); subs.add(basicModel.explain(stats, tfn, aeTimes1pTfn)); subs.add(afterEffect.explain(stats, tfn)); } @Override protected Explanation explain( BasicStats stats, Explanation freq, double docLen) { List<Explanation> subs = new ArrayList<>(); explain(subs, stats, freq.getValue().doubleValue(), docLen); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * " + "basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:", subs); } @Override public String toString() { return "DFR " + basicModel.toString() + afterEffect.toString() + normalization.toString(); }
Returns the basic model of information content
/**
 * Returns the basic model of information content.
 *
 * @return the {@link BasicModel} this similarity was constructed with
 */
public BasicModel getBasicModel() {
  return this.basicModel;
}
Returns the first normalization
/**
 * Returns the first normalization.
 *
 * @return the {@link AfterEffect} this similarity was constructed with
 */
public AfterEffect getAfterEffect() {
  return this.afterEffect;
}
Returns the second normalization
/**
 * Returns the second normalization.
 *
 * @return the {@link Normalization} this similarity was constructed with
 */
public Normalization getNormalization() {
  return this.normalization;
}
}