/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;

/**
 * Collects hits for subsequent faceting, using sampling if needed. Once you've
 * run a search and collected hits into this, instantiate one of the
 * {@link Facets} subclasses to do the facet counting. Note that this collector
 * does not collect the scores of matching docs (i.e.
 * {@link FacetsCollector.MatchingDocs#scores} is {@code null}).
 * <p>
 * If you require the original set of hits, you can call
 * {@link #getOriginalMatchingDocs()}. Also, since the counts of the top facets
 * are based on the sampled set, you can amortize the counts by calling
 * {@link #amortizeFacetCounts}.
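 * <p>
 * Example usage (a sketch only; {@code searcher}, {@code taxoReader},
 * {@code config} and {@code query} are assumed to be set up elsewhere):
 * <pre>{@code
 * RandomSamplingFacetsCollector fc = new RandomSamplingFacetsCollector(10000);
 * searcher.search(query, fc);
 * Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
 * FacetResult result = facets.getTopChildren(10, "Author");
 * result = fc.amortizeFacetCounts(result, config, searcher);
 * }</pre>
 */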
public class RandomSamplingFacetsCollector extends FacetsCollector {
  /**
   * Faster alternative for java.util.Random, inspired by
   * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-performance-in-java/
   * <p>
   * Has a period of 2^64-1
   */
  private static class XORShift64Random {

    private long x;
    /** Creates an xorshift random generator using the provided seed */
    public XORShift64Random(long seed) {
      x = seed == 0 ? 0xdeadbeef : seed;
    }
    /** Get the next random long value */
    public long randomLong() {
      x ^= (x << 21);
      x ^= (x >>> 35);
      x ^= (x << 4);
      return x;
    }
    /** Get the next random int, between 0 (inclusive) and n (exclusive) */
    public int nextInt(int n) {
      int res = (int) (randomLong() % n);
      return (res < 0) ? -res : res;
    }

  }

  private final static int NOT_CALCULATED = -1;

  private final int sampleSize;
  private final XORShift64Random random;

  private double samplingRate;
  private List<MatchingDocs> sampledDocs;

  private int totalHits = NOT_CALCULATED;
  private int leftoverBin = NOT_CALCULATED;
  private int leftoverIndex = NOT_CALCULATED;
  /**
   * Constructor with the given sample size and default seed.
   *
   * @see #RandomSamplingFacetsCollector(int, long)
   */
  public RandomSamplingFacetsCollector(int sampleSize) {
    this(sampleSize, 0);
  }
  /**
   * Constructor with the given sample size and seed.
   *
   * @param sampleSize
   *          The preferred sample size. If the number of hits is greater than
   *          the size, sampling will be done using a sample ratio of sampling
   *          size / totalN. For example: 1000 hits, sample size = 10 results in
   *          samplingRatio of 0.01. If the number of hits is lower, no sampling
   *          is done at all
   * @param seed
   *          The random seed. If {@code 0} then a seed will be chosen for you.
   */
  public RandomSamplingFacetsCollector(int sampleSize, long seed) {
    super(false);
    this.sampleSize = sampleSize;
    this.random = new XORShift64Random(seed);
    this.sampledDocs = null;
  }
  /**
   * Returns the sampled list of the matching documents. Note that a
   * {@link FacetsCollector.MatchingDocs} instance is returned per segment, even
   * if no hits from that segment are included in the sampled set.
   * <p>
   * Note: One or more of the MatchingDocs might be empty (not containing any
   * hits) as a result of sampling.
   * <p>
   * Note: {@code MatchingDocs.totalHits} is copied from the original
   * MatchingDocs; {@code scores} is set to {@code null}.
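   * <p>
   * A sketch of iterating the (possibly sampled) hits directly; {@code collector}
   * stands for an instance of this class that was already passed to a search:
   * <pre>{@code
   * for (MatchingDocs md : collector.getMatchingDocs()) {
   *   DocIdSetIterator it = md.bits.iterator();
   *   if (it == null) {
   *     continue; // no hits in this segment
   *   }
   *   for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
   *     // doc is a segment-local doc id, relative to md.context
   *   }
   * }
   * }</pre>
   */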
  @Override
  public List<MatchingDocs> getMatchingDocs() {
    List<MatchingDocs> matchingDocs = super.getMatchingDocs();

    if (totalHits == NOT_CALCULATED) {
      totalHits = 0;
      for (MatchingDocs md : matchingDocs) {
        totalHits += md.totalHits;
      }
    }

    if (totalHits <= sampleSize) {
      return matchingDocs;
    }

    if (sampledDocs == null) {
      samplingRate = (1.0 * sampleSize) / totalHits;
      sampledDocs = createSampledDocs(matchingDocs);
    }
    return sampledDocs;
  }
  /** Returns the original matching documents. */
  public List<MatchingDocs> getOriginalMatchingDocs() {
    return super.getMatchingDocs();
  }
  /** Create a sampled copy of the matching documents list. */
  private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
    List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size());
    for (MatchingDocs docs : matchingDocsList) {
      sampledDocsList.add(createSample(docs));
    }
    return sampledDocsList;
  }
  /** Create a sampled copy of the given hits. */
  private MatchingDocs createSample(MatchingDocs docs) {
    int maxdoc = docs.context.reader().maxDoc();

    // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
    FixedBitSet sampleDocs = new FixedBitSet(maxdoc);

    int binSize = (int) (1.0 / samplingRate);

    try {
      int counter = 0;
      int limit, randomIndex;
      if (leftoverBin != NOT_CALCULATED) {
        limit = leftoverBin;
        // either NOT_CALCULATED, which means we already sampled from that bin,
        // or the next document to sample
        randomIndex = leftoverIndex;
      } else {
        limit = binSize;
        randomIndex = random.nextInt(binSize);
      }
      final DocIdSetIterator it = docs.bits.iterator();
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        if (counter == randomIndex) {
          sampleDocs.set(doc);
        }
        counter++;
        if (counter >= limit) {
          counter = 0;
          limit = binSize;
          randomIndex = random.nextInt(binSize);
        }
      }

      if (counter == 0) {
        // we either exhausted the bin and the iterator at the same time, or
        // this segment had no results. in the latter case we might want to
        // carry leftover to the next segment as is, but that complicates the
        // code and doesn't seem so important.
        leftoverBin = leftoverIndex = NOT_CALCULATED;
      } else {
        leftoverBin = limit - counter;
        if (randomIndex > counter) {
          // the document to sample is in the next bin
          leftoverIndex = randomIndex - counter;
        } else if (randomIndex < counter) {
          // we sampled a document from the bin, so just skip over remaining
          // documents in the bin in the next segment.
          leftoverIndex = NOT_CALCULATED;
        }
      }

      return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  /**
   * Note: if you use a counting {@link Facets} implementation, you can amortize
   * the sampled counts by calling this method. Uses the {@link FacetsConfig} and
   * the {@link IndexSearcher} to determine the upper bound for each facet value.
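   * <p>
   * A sketch of typical use; {@code facets}, {@code config}, {@code searcher}
   * and the collector {@code fc} are assumed to be set up as in the class-level
   * example:
   * <pre>{@code
   * FacetResult sampled = facets.getTopChildren(10, "Author");
   * FacetResult corrected = fc.amortizeFacetCounts(sampled, config, searcher);
   * }</pre>
   */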
  public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher)
      throws IOException {
    if (res == null || totalHits <= sampleSize) {
      return res;
    }

    LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
    IndexReader reader = searcher.getIndexReader();
    DimConfig dimConfig = config.getDimConfig(res.dim);

    // +2 to prepend dimension, append child label
    String[] childPath = new String[res.path.length + 2];
    childPath[0] = res.dim;

    System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse

    for (int i = 0; i < res.labelValues.length; i++) {
      childPath[res.path.length + 1] = res.labelValues[i].label;
      String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
      int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
      int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
      correctedCount = Math.min(max, correctedCount);
      fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
    }

    // cap the total count on the total number of non-deleted documents in the reader
    int correctedTotalCount = res.value.intValue();
    if (correctedTotalCount > 0) {
      correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
    }

    return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
  }
  /** Returns the sampling rate that was used. */
  public double getSamplingRate() {
    return samplingRate;
  }

}