/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.math3.distribution;

import org.apache.commons.math3.exception.NotStrictlyPositiveException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.random.RandomGenerator;
import org.apache.commons.math3.random.Well19937c;
import org.apache.commons.math3.util.FastMath;

Implementation of the Zipf distribution.

Parameters: For a random variable X whose values are distributed according to this distribution, the probability mass function is given by

 P(X = k) = H(N,s) * 1 / k^s for k = 1,2,...,N. 
H(N,s) is the normalizing constant which corresponds to the generalized harmonic number of order N of s.

  • N is the number of elements
  • s is the exponent
See Also:
/** * Implementation of the Zipf distribution. * <p> * <strong>Parameters:</strong> * For a random variable {@code X} whose values are distributed according to this * distribution, the probability mass function is given by * <pre> * P(X = k) = H(N,s) * 1 / k^s for {@code k = 1,2,...,N}. * </pre> * {@code H(N,s)} is the normalizing constant * which corresponds to the generalized harmonic number of order N of s. * <p> * <ul> * <li>{@code N} is the number of elements</li> * <li>{@code s} is the exponent</li> * </ul> * @see <a href="https://en.wikipedia.org/wiki/Zipf's_law">Zipf's law (Wikipedia)</a> * @see <a href="https://en.wikipedia.org/wiki/Harmonic_number#Generalized_harmonic_numbers">Generalized harmonic numbers</a> */
public class ZipfDistribution extends AbstractIntegerDistribution {
Serializable version identifier.
/** Serializable version identifier. */
private static final long serialVersionUID = -140627372283420404L;
Number of elements.
/** Number of elements. */
private final int numberOfElements;
Exponent parameter of the distribution.
/** Exponent parameter of the distribution. */
private final double exponent;
Cached numerical mean
/** Cached numerical mean */
private double numericalMean = Double.NaN;
Whether or not the numerical mean has been calculated
/** Whether or not the numerical mean has been calculated */
private boolean numericalMeanIsCalculated = false;
Cached numerical variance
/** Cached numerical variance */
private double numericalVariance = Double.NaN;
Whether or not the numerical variance has been calculated
/** Whether or not the numerical variance has been calculated */
private boolean numericalVarianceIsCalculated = false;
The sampler to be used for the sample() method
/** The sampler to be used for the sample() method */
private transient ZipfRejectionInversionSampler sampler;
Create a new Zipf distribution with the given number of elements and exponent.

Note: this constructor will implicitly create an instance of Well19937c as random generator to be used for sampling only (see sample() and AbstractIntegerDistribution.sample(int)). In case no sampling is needed for the created distribution, it is advised to pass null as random generator via the appropriate constructors to avoid the additional initialisation overhead.

Params:
  • numberOfElements – Number of elements.
  • exponent – Exponent.
Throws:
/** * Create a new Zipf distribution with the given number of elements and * exponent. * <p> * <b>Note:</b> this constructor will implicitly create an instance of * {@link Well19937c} as random generator to be used for sampling only (see * {@link #sample()} and {@link #sample(int)}). In case no sampling is * needed for the created distribution, it is advised to pass {@code null} * as random generator via the appropriate constructors to avoid the * additional initialisation overhead. * * @param numberOfElements Number of elements. * @param exponent Exponent. * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} * or {@code exponent <= 0}. */
public ZipfDistribution(final int numberOfElements, final double exponent) { this(new Well19937c(), numberOfElements, exponent); }
Creates a Zipf distribution.
Params:
  • rng – Random number generator.
  • numberOfElements – Number of elements.
  • exponent – Exponent.
Throws:
Since:3.1
/** * Creates a Zipf distribution. * * @param rng Random number generator. * @param numberOfElements Number of elements. * @param exponent Exponent. * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} * or {@code exponent <= 0}. * @since 3.1 */
public ZipfDistribution(RandomGenerator rng, int numberOfElements, double exponent) throws NotStrictlyPositiveException { super(rng); if (numberOfElements <= 0) { throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, numberOfElements); } if (exponent <= 0) { throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, exponent); } this.numberOfElements = numberOfElements; this.exponent = exponent; }
Get the number of elements (e.g. corpus size) for the distribution.
Returns:the number of elements
/** * Get the number of elements (e.g. corpus size) for the distribution. * * @return the number of elements */
public int getNumberOfElements() { return numberOfElements; }
Get the exponent characterizing the distribution.
Returns:the exponent
/** * Get the exponent characterizing the distribution. * * @return the exponent */
public double getExponent() { return exponent; }
{@inheritDoc}
/** {@inheritDoc} */
public double probability(final int x) { if (x <= 0 || x > numberOfElements) { return 0.0; } return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent); }
{@inheritDoc}
/** {@inheritDoc} */
@Override public double logProbability(int x) { if (x <= 0 || x > numberOfElements) { return Double.NEGATIVE_INFINITY; } return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent)); }
{@inheritDoc}
/** {@inheritDoc} */
public double cumulativeProbability(final int x) { if (x <= 0) { return 0.0; } else if (x >= numberOfElements) { return 1.0; } return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent); }
{@inheritDoc} For number of elements N and exponent s, the mean is Hs1 / Hs, where
  • Hs1 = generalizedHarmonic(N, s - 1),
  • Hs = generalizedHarmonic(N, s).
/** * {@inheritDoc} * * For number of elements {@code N} and exponent {@code s}, the mean is * {@code Hs1 / Hs}, where * <ul> * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> * </ul> */
public double getNumericalMean() { if (!numericalMeanIsCalculated) { numericalMean = calculateNumericalMean(); numericalMeanIsCalculated = true; } return numericalMean; }
Returns:the mean of this distribution
/** * Used by {@link #getNumericalMean()}. * * @return the mean of this distribution */
protected double calculateNumericalMean() { final int N = getNumberOfElements(); final double s = getExponent(); final double Hs1 = generalizedHarmonic(N, s - 1); final double Hs = generalizedHarmonic(N, s); return Hs1 / Hs; }
{@inheritDoc} For number of elements N and exponent s, the mean is (Hs2 / Hs) - (Hs1^2 / Hs^2), where
  • Hs2 = generalizedHarmonic(N, s - 2),
  • Hs1 = generalizedHarmonic(N, s - 1),
  • Hs = generalizedHarmonic(N, s).
/** * {@inheritDoc} * * For number of elements {@code N} and exponent {@code s}, the mean is * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where * <ul> * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li> * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> * </ul> */
public double getNumericalVariance() { if (!numericalVarianceIsCalculated) { numericalVariance = calculateNumericalVariance(); numericalVarianceIsCalculated = true; } return numericalVariance; }
Returns:the variance of this distribution
/** * Used by {@link #getNumericalVariance()}. * * @return the variance of this distribution */
protected double calculateNumericalVariance() { final int N = getNumberOfElements(); final double s = getExponent(); final double Hs2 = generalizedHarmonic(N, s - 2); final double Hs1 = generalizedHarmonic(N, s - 1); final double Hs = generalizedHarmonic(N, s); return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); }
Calculates the Nth generalized harmonic number. See Harmonic Series.
Params:
  • n – Term in the series to calculate (must be larger than 1)
  • m – Exponent (special case m = 1 is the harmonic series).
Returns:the nth generalized harmonic number.
/** * Calculates the Nth generalized harmonic number. See * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic * Series</a>. * * @param n Term in the series to calculate (must be larger than 1) * @param m Exponent (special case {@code m = 1} is the harmonic series). * @return the n<sup>th</sup> generalized harmonic number. */
private double generalizedHarmonic(final int n, final double m) { double value = 0; for (int k = n; k > 0; --k) { value += 1.0 / FastMath.pow(k, m); } return value; }
{@inheritDoc} The lower bound of the support is always 1 no matter the parameters.
Returns:lower bound of the support (always 1)
/** * {@inheritDoc} * * The lower bound of the support is always 1 no matter the parameters. * * @return lower bound of the support (always 1) */
public int getSupportLowerBound() { return 1; }
{@inheritDoc} The upper bound of the support is the number of elements.
Returns:upper bound of the support
/** * {@inheritDoc} * * The upper bound of the support is the number of elements. * * @return upper bound of the support */
public int getSupportUpperBound() { return getNumberOfElements(); }
{@inheritDoc} The support of this distribution is connected.
Returns:true
/** * {@inheritDoc} * * The support of this distribution is connected. * * @return {@code true} */
public boolean isSupportConnected() { return true; }
{@inheritDoc}
/** * {@inheritDoc} */
@Override public int sample() { if (sampler == null) { sampler = new ZipfRejectionInversionSampler(numberOfElements, exponent); } return sampler.sample(random); }
Utility class implementing a rejection inversion sampling method for a discrete, bounded Zipf distribution that is based on the method described in

Wolfgang Hörmann and Gerhard Derflinger "Rejection-inversion to generate variates from monotone discrete distributions." ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184.

The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). The original method uses H(x) := (v + x)^(1 - q) / (1 - q) as the integral of the hat function. This function is undefined for q = 1, which is the reason for the limitation of the exponent. If instead the integral function H(x) := ((v + x)^(1 - q) - 1) / (1 - q) is used, for which a meaningful limit exists for q = 1, the method works for all positive exponents.

The following implementation uses v := 0 and generates integral numbers in the range [1, numberOfElements]. This is different to the original method where v is defined to be positive and numbers are taken from [0, i_max]. This explains why the implementation looks slightly different.

Since:3.6
/** * Utility class implementing a rejection inversion sampling method for a discrete, * bounded Zipf distribution that is based on the method described in * <p> * Wolfgang Hörmann and Gerhard Derflinger * "Rejection-inversion to generate variates from monotone discrete distributions." * ACM Transactions on Modeling and Computer Simulation (TOMACS) 6.3 (1996): 169-184. * <p> * The paper describes an algorithm for exponents larger than 1 (Algorithm ZRI). * The original method uses {@code H(x) := (v + x)^(1 - q) / (1 - q)} * as the integral of the hat function. This function is undefined for * q = 1, which is the reason for the limitation of the exponent. * If instead the integral function * {@code H(x) := ((v + x)^(1 - q) - 1) / (1 - q)} is used, * for which a meaningful limit exists for q = 1, * the method works for all positive exponents. * <p> * The following implementation uses v := 0 and generates integral numbers * in the range [1, numberOfElements]. This is different to the original method * where v is defined to be positive and numbers are taken from [0, i_max]. * This explains why the implementation looks slightly different. * * @since 3.6 */
static final class ZipfRejectionInversionSampler {
Exponent parameter of the distribution.
/** Exponent parameter of the distribution. */
private final double exponent;
Number of elements.
/** Number of elements. */
private final int numberOfElements;
Constant equal to hIntegral(1.5) - 1.
/** Constant equal to {@code hIntegral(1.5) - 1}. */
private final double hIntegralX1;
Constant equal to hIntegral(numberOfElements + 0.5).
/** Constant equal to {@code hIntegral(numberOfElements + 0.5)}. */
private final double hIntegralNumberOfElements;
Constant equal to 2 - hIntegralInverse(hIntegral(2.5) - h(2).
/** Constant equal to {@code 2 - hIntegralInverse(hIntegral(2.5) - h(2)}. */
private final double s;
Simple constructor.
Params:
  • numberOfElements – number of elements
  • exponent – exponent parameter of the distribution
/** Simple constructor. * @param numberOfElements number of elements * @param exponent exponent parameter of the distribution */
ZipfRejectionInversionSampler(final int numberOfElements, final double exponent) { this.exponent = exponent; this.numberOfElements = numberOfElements; this.hIntegralX1 = hIntegral(1.5) - 1d; this.hIntegralNumberOfElements = hIntegral(numberOfElements + 0.5); this.s = 2d - hIntegralInverse(hIntegral(2.5) - h(2)); }
Generate one integral number in the range [1, numberOfElements].
Params:
  • random – random generator to use
Returns:generated integral number in the range [1, numberOfElements]
/** Generate one integral number in the range [1, numberOfElements]. * @param random random generator to use * @return generated integral number in the range [1, numberOfElements] */
int sample(final RandomGenerator random) { while(true) { final double u = hIntegralNumberOfElements + random.nextDouble() * (hIntegralX1 - hIntegralNumberOfElements); // u is uniformly distributed in (hIntegralX1, hIntegralNumberOfElements] double x = hIntegralInverse(u); int k = (int)(x + 0.5); // Limit k to the range [1, numberOfElements] // (k could be outside due to numerical inaccuracies) if (k < 1) { k = 1; } else if (k > numberOfElements) { k = numberOfElements; } // Here, the distribution of k is given by: // // P(k = 1) = C * (hIntegral(1.5) - hIntegralX1) = C // P(k = m) = C * (hIntegral(m + 1/2) - hIntegral(m - 1/2)) for m >= 2 // // where C := 1 / (hIntegralNumberOfElements - hIntegralX1) if (k - x <= s || u >= hIntegral(k + 0.5) - h(k)) { // Case k = 1: // // The right inequality is always true, because replacing k by 1 gives // u >= hIntegral(1.5) - h(1) = hIntegralX1 and u is taken from // (hIntegralX1, hIntegralNumberOfElements]. // // Therefore, the acceptance rate for k = 1 is P(accepted | k = 1) = 1 // and the probability that 1 is returned as random value is // P(k = 1 and accepted) = P(accepted | k = 1) * P(k = 1) = C = C / 1^exponent // // Case k >= 2: // // The left inequality (k - x <= s) is just a short cut // to avoid the more expensive evaluation of the right inequality // (u >= hIntegral(k + 0.5) - h(k)) in many cases. // // If the left inequality is true, the right inequality is also true: // Theorem 2 in the paper is valid for all positive exponents, because // the requirements h'(x) = -exponent/x^(exponent + 1) < 0 and // (-1/hInverse'(x))'' = (1+1/exponent) * x^(1/exponent-1) >= 0 // are both fulfilled. // Therefore, f(x) := x - hIntegralInverse(hIntegral(x + 0.5) - h(x)) // is a non-decreasing function. If k - x <= s holds, // k - x <= s + f(k) - f(2) is obviously also true which is equivalent to // -x <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), // -hIntegralInverse(u) <= -hIntegralInverse(hIntegral(k + 0.5) - h(k)), // and finally u >= hIntegral(k + 0.5) - h(k). // // Hence, the right inequality determines the acceptance rate: // P(accepted | k = m) = h(m) / (hIntegrated(m+1/2) - hIntegrated(m-1/2)) // The probability that m is returned is given by // P(k = m and accepted) = P(accepted | k = m) * P(k = m) = C * h(m) = C / m^exponent. // // In both cases the probabilities are proportional to the probability mass function // of the Zipf distribution. return k; } } }
H(x) :=
  • (x^(1-exponent) - 1)/(1 - exponent), if exponent != 1
  • log(x), if exponent == 1
H(x) is an integral function of h(x), the derivative of H(x) is h(x).
Params:
  • x – free parameter
Returns:H(x)
/** * {@code H(x) :=} * <ul> * <li>{@code (x^(1-exponent) - 1)/(1 - exponent)}, if {@code exponent != 1}</li> * <li>{@code log(x)}, if {@code exponent == 1}</li> * </ul> * H(x) is an integral function of h(x), * the derivative of H(x) is h(x). * * @param x free parameter * @return {@code H(x)} */
private double hIntegral(final double x) { final double logX = FastMath.log(x); return helper2((1d-exponent)*logX)*logX; }
h(x) := 1/x^exponent
Params:
  • x – free parameter
Returns:h(x)
/** * {@code h(x) := 1/x^exponent} * * @param x free parameter * @return h(x) */
private double h(final double x) { return FastMath.exp(-exponent * FastMath.log(x)); }
The inverse function of H(x).
Params:
  • x – free parameter
Returns:y for which H(y) = x
/** * The inverse function of H(x). * * @param x free parameter * @return y for which {@code H(y) = x} */
private double hIntegralInverse(final double x) { double t = x*(1d-exponent); if (t < -1d) { // Limit value to the range [-1, +inf). // t could be smaller than -1 in some rare cases due to numerical errors. t = -1; } return FastMath.exp(helper1(t)*x); }
Helper function that calculates log(1+x)/x.

A Taylor series expansion is used, if x is close to 0.

Params:
  • x – a value larger than or equal to -1
Returns:log(1+x)/x
/** * Helper function that calculates {@code log(1+x)/x}. * <p> * A Taylor series expansion is used, if x is close to 0. * * @param x a value larger than or equal to -1 * @return {@code log(1+x)/x} */
static double helper1(final double x) { if (FastMath.abs(x)>1e-8) { return FastMath.log1p(x)/x; } else { return 1.-x*((1./2.)-x*((1./3.)-x*(1./4.))); } }
Helper function to calculate (exp(x)-1)/x.

A Taylor series expansion is used, if x is close to 0.

Params:
  • x – free parameter
Returns:(exp(x)-1)/x if x is non-zero, or 1 if x=0
/** * Helper function to calculate {@code (exp(x)-1)/x}. * <p> * A Taylor series expansion is used, if x is close to 0. * * @param x free parameter * @return {@code (exp(x)-1)/x} if x is non-zero, or 1 if x=0 */
static double helper2(final double x) { if (FastMath.abs(x)>1e-8) { return FastMath.expm1(x)/x; } else { return 1.+x*(1./2.)*(1.+x*(1./3.)*(1.+x*(1./4.))); } } } }