/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;


import org.apache.lucene.search.Explanation;

Implements the Divergence from Independence (DFI) model based on Chi-square statistics (i.e., standardized Chi-squared distance from independence in term frequency tf).

DFI is both parameter-free and non-parametric:

  • parameter-free: it does not require any parameter tuning or training.
  • non-parametric: it does not make any assumptions about word frequency distributions on document collections.

It is highly recommended not to remove stopwords (very common terms: the, of, and, to, a, in, for, is, on, that, etc) with this similarity.

For more information see: A nonparametric term weighting method for information retrieval based on measuring the divergence from independence

See Also:
  • IndependenceStandardized
  • IndependenceSaturated
  • IndependenceChiSquared
@lucene.experimental
/** * Implements the <em>Divergence from Independence (DFI)</em> model based on Chi-square statistics * (i.e., standardized Chi-squared distance from independence in term frequency tf). * <p> * DFI is both parameter-free and non-parametric: * <ul> * <li>parameter-free: it does not require any parameter tuning or training.</li> * <li>non-parametric: it does not make any assumptions about word frequency distributions on document collections.</li> * </ul> * <p> * It is highly recommended <b>not</b> to remove stopwords (very common terms: the, of, and, to, a, in, for, is, on, that, etc) with this similarity. * <p> * For more information see: <a href="http://dx.doi.org/10.1007/s10791-013-9225-4">A nonparametric term weighting method for information retrieval based on measuring the divergence from independence</a> * * @lucene.experimental * @see org.apache.lucene.search.similarities.IndependenceStandardized * @see org.apache.lucene.search.similarities.IndependenceSaturated * @see org.apache.lucene.search.similarities.IndependenceChiSquared */
public class DFISimilarity extends SimilarityBase { private final Independence independence;
Create DFI with the specified divergence from independence measure
Params:
  • independenceMeasure – measure of divergence from independence
/** * Create DFI with the specified divergence from independence measure * @param independenceMeasure measure of divergence from independence */
public DFISimilarity(Independence independenceMeasure) { this.independence = independenceMeasure; } @Override protected double score(BasicStats stats, double freq, double docLen) { final double expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1); // if the observed frequency is less than or equal to the expected value, then return zero. if (freq <= expected) return 0; final double measure = independence.score(freq, expected); return stats.getBoost() * log2(measure + 1); }
Returns the measure of independence
/** * Returns the measure of independence */
public Independence getIndependence() { return independence; } @Override protected Explanation explain( BasicStats stats, Explanation freq, double docLen) { final double expected = (stats.getTotalTermFreq() + 1) * docLen / (stats.getNumberOfFieldTokens() + 1); if (freq.getValue().doubleValue() <= expected){ return Explanation.match((float) 0, "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), equals to 0"); } Explanation explExpected = Explanation.match((float) expected, "expected, computed as (F + 1) * dl / (T + 1) from:", Explanation.match(stats.getTotalTermFreq(), "F, total number of occurrences of term across all docs"), Explanation.match((float) docLen, "dl, length of field"), Explanation.match(stats.getNumberOfFieldTokens(), "T, total number of tokens in the field")); final double measure = independence.score(freq.getValue().doubleValue(), expected); Explanation explMeasure = Explanation.match((float) measure, "measure, computed as independence.score(freq, expected) from:", freq, explExpected); return Explanation.match( (float) score(stats, freq.getValue().doubleValue(), docLen), "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() +"), computed as boost * log2(measure + 1) from:", Explanation.match( (float)stats.getBoost(), "boost, query boost"), explMeasure); } @Override public String toString() { return "DFI(" + independence + ")"; } }