org.apache.lucene/lucene-core/8.2.0 : org/apache/lucene/search/CachingCollector.java

CachingCollector
http://lucene.apache.org/lucene-parent/lucene-core: Apache Lucene Java Core (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.ArrayUtil;

Caches all docs, and optionally also scores, coming from a search, and is then able to replay them to another collector. You specify the max RAM this class may use. Once the collection is done, call isCached. If this returns true, you can use replay(Collector) against a new collector. If it returns false, this means too much RAM was required and you must instead re-run the original search. NOTE: this class consumes 4 (or 8 bytes, if
scoring is cached) per collected document.  If the result
set is large this can easily be a very substantial amount
of RAM!
See the Lucene modules/grouping module for more
details including a full code example.
@lucene.experimental /**
 * Caches all docs, and optionally also scores, coming from
 * a search, and is then able to replay them to another
 * collector.  You specify the max RAM this class may use.
 * Once the collection is done, call {@link #isCached}. If
 * this returns true, you can use {@link #replay(Collector)}
 * against a new collector.  If it returns false, this means
 * too much RAM was required and you must instead re-run the
 * original search.
 *
 * <p><b>NOTE</b>: this class consumes 4 (or 8 bytes, if
 * scoring is cached) per collected document.  If the result
 * set is large this can easily be a very substantial amount
 * of RAM!
 *
 * <p>See the Lucene <tt>modules/grouping</tt> module for more
 * details including a full code example.</p>
 *
 * @lucene.experimental
 */
public abstract class CachingCollector extends FilterCollector {

  private static final int INITIAL_ARRAY_SIZE = 128;

  private static final class CachedScorable extends Scorable {

    // NOTE: these members are package-private b/c that way accessing them from
    // the outer class does not incur access check by the JVM. The same
    // situation would be if they were defined in the outer class as private
    // members.
    int doc;
    float score;

    @Override
    public final float score() { return score; }

    @Override
    public int docID() {
      return doc;
    }

  }

  private static class NoScoreCachingCollector extends CachingCollector {

    List<LeafReaderContext> contexts;
    List<int[]> docs;
    int maxDocsToCache;
    NoScoreCachingLeafCollector lastCollector;

    NoScoreCachingCollector(Collector in, int maxDocsToCache) {
      super(in);
      this.maxDocsToCache = maxDocsToCache;
      contexts = new ArrayList<>();
      docs = new ArrayList<>();
    }

    protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
      return new NoScoreCachingLeafCollector(in, maxDocsToCache);
    }

    // note: do *not* override needScore to say false. Just because we aren't caching the score doesn't mean the
    //   wrapped collector doesn't need it to do its job.

    public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
      postCollection();
      final LeafCollector in = this.in.getLeafCollector(context);
      if (contexts != null) {
        contexts.add(context);
      }
      if (maxDocsToCache >= 0) {
        return lastCollector = wrap(in, maxDocsToCache);
      } else {
        return in;
      }
    }

    protected void invalidate() {
      maxDocsToCache = -1;
      contexts = null;
      this.docs = null;
    }

    protected void postCollect(NoScoreCachingLeafCollector collector) {
      final int[] docs = collector.cachedDocs();
      maxDocsToCache -= docs.length;
      this.docs.add(docs);
    }

    private void postCollection() {
      if (lastCollector != null) {
        if (!lastCollector.hasCache()) {
          invalidate();
        } else {
          postCollect(lastCollector);
        }
        lastCollector = null;
      }
    }

    protected void collect(LeafCollector collector, int i) throws IOException {
      final int[] docs = this.docs.get(i);
      for (int doc : docs) {
        collector.collect(doc);
      }
    }

    public void replay(Collector other) throws IOException {
      postCollection();
      if (!isCached()) {
        throw new IllegalStateException("cannot replay: cache was cleared because too much RAM was required");
      }
      assert docs.size() == contexts.size();
      for (int i = 0; i < contexts.size(); ++i) {
        final LeafReaderContext context = contexts.get(i);
        final LeafCollector collector = other.getLeafCollector(context);
        collect(collector, i);
      }
    }

  }

  private static class ScoreCachingCollector extends NoScoreCachingCollector {

    List<float[]> scores;

    ScoreCachingCollector(Collector in, int maxDocsToCache) {
      super(in, maxDocsToCache);
      scores = new ArrayList<>();
    }

    protected NoScoreCachingLeafCollector wrap(LeafCollector in, int maxDocsToCache) {
      return new ScoreCachingLeafCollector(in, maxDocsToCache);
    }

    @Override
    protected void postCollect(NoScoreCachingLeafCollector collector) {
      final ScoreCachingLeafCollector coll = (ScoreCachingLeafCollector) collector;
      super.postCollect(coll);
      scores.add(coll.cachedScores());
    }

    Ensure the scores are collected so they can be replayed, even if the wrapped collector doesn't need them. /** Ensure the scores are collected so they can be replayed, even if the wrapped collector doesn't need them. */
    @Override
    public ScoreMode scoreMode() {
      return ScoreMode.COMPLETE;
    }

    @Override
    protected void collect(LeafCollector collector, int i) throws IOException {
      final int[] docs = this.docs.get(i);
      final float[] scores = this.scores.get(i);
      assert docs.length == scores.length;
      final CachedScorable scorer = new CachedScorable();
      collector.setScorer(scorer);
      for (int j = 0; j < docs.length; ++j) {
        scorer.doc = docs[j];
        scorer.score = scores[j];
        collector.collect(scorer.doc);
      }
    }
  }

  private class NoScoreCachingLeafCollector extends FilterLeafCollector {

    final int maxDocsToCache;
    int[] docs;
    int docCount;

    NoScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
      super(in);
      this.maxDocsToCache = maxDocsToCache;
      docs = new int[Math.min(maxDocsToCache, INITIAL_ARRAY_SIZE)];
      docCount = 0;
    }

    protected void grow(int newLen) {
      docs = ArrayUtil.growExact(docs, newLen);
    }

    protected void invalidate() {
      docs = null;
      docCount = -1;
      cached = false;
    }

    protected void buffer(int doc) throws IOException {
      docs[docCount] = doc;
    }

    @Override
    public void collect(int doc) throws IOException {
      if (docs != null) {
        if (docCount >= docs.length) {
          if (docCount >= maxDocsToCache) {
            invalidate();
          } else {
            final int newLen = Math.min(ArrayUtil.oversize(docCount + 1, Integer.BYTES), maxDocsToCache);
            grow(newLen);
          }
        }
        if (docs != null) {
          buffer(doc);
          ++docCount;
        }
      }
      super.collect(doc);
    }

    boolean hasCache() {
      return docs != null;
    }

    int[] cachedDocs() {
      return docs == null ? null : ArrayUtil.copyOfSubArray(docs, 0, docCount);
    }

  }

  private class ScoreCachingLeafCollector extends NoScoreCachingLeafCollector {

    Scorable scorer;
    float[] scores;

    ScoreCachingLeafCollector(LeafCollector in, int maxDocsToCache) {
      super(in, maxDocsToCache);
      scores = new float[docs.length];
    }

    @Override
    public void setScorer(Scorable scorer) throws IOException {
      this.scorer = scorer;
      super.setScorer(scorer);
    }

    @Override
    protected void grow(int newLen) {
      super.grow(newLen);
      scores = ArrayUtil.growExact(scores, newLen);
    }

    @Override
    protected void invalidate() {
      super.invalidate();
      scores = null;
    }

    @Override
    protected void buffer(int doc) throws IOException {
      super.buffer(doc);
      scores[docCount] = scorer.score();
    }

    float[] cachedScores() {
      return docs == null ? null : ArrayUtil.copyOfSubArray(scores, 0, docCount);
    }
  }

  Creates a CachingCollector which does not wrap another collector. The cached documents and scores can later be 
replayed. /**
   * Creates a {@link CachingCollector} which does not wrap another collector.
   * The cached documents and scores can later be {@link #replay(Collector)
   * replayed}.
   */
  public static CachingCollector create(boolean cacheScores, double maxRAMMB) {
    Collector other = new SimpleCollector() {

      @Override
      public void collect(int doc) {}

      @Override
      public ScoreMode scoreMode() {
        return ScoreMode.COMPLETE;
      }

    };
    return create(other, cacheScores, maxRAMMB);
  }

  Create a new CachingCollector that wraps the given collector and caches documents and scores up to the specified RAM threshold. 
Params: other – 
         the Collector to wrap and delegate calls to.
cacheScores – 
         whether to cache scores in addition to document IDs. Note that
         this increases the RAM consumed per doc
maxRAMMB – 
         the maximum RAM in MB to consume for caching the documents and
         scores. If the collector exceeds the threshold, no documents and
         scores are cached./**
   * Create a new {@link CachingCollector} that wraps the given collector and
   * caches documents and scores up to the specified RAM threshold.
   *
   * @param other
   *          the Collector to wrap and delegate calls to.
   * @param cacheScores
   *          whether to cache scores in addition to document IDs. Note that
   *          this increases the RAM consumed per doc
   * @param maxRAMMB
   *          the maximum RAM in MB to consume for caching the documents and
   *          scores. If the collector exceeds the threshold, no documents and
   *          scores are cached.
   */
  public static CachingCollector create(Collector other, boolean cacheScores, double maxRAMMB) {
    int bytesPerDoc = Integer.BYTES;
    if (cacheScores) {
      bytesPerDoc += Float.BYTES;
    }
    final int maxDocsToCache = (int) ((maxRAMMB * 1024 * 1024) / bytesPerDoc);
    return create(other, cacheScores, maxDocsToCache);
  }

  Create a new CachingCollector that wraps the given collector and caches documents and scores up to the specified max docs threshold. 
Params: other – 
         the Collector to wrap and delegate calls to.
cacheScores – 
         whether to cache scores in addition to document IDs. Note that
         this increases the RAM consumed per doc
maxDocsToCache – 
         the maximum number of documents for caching the documents and
         possible the scores. If the collector exceeds the threshold,
         no documents and scores are cached./**
   * Create a new {@link CachingCollector} that wraps the given collector and
   * caches documents and scores up to the specified max docs threshold.
   *
   * @param other
   *          the Collector to wrap and delegate calls to.
   * @param cacheScores
   *          whether to cache scores in addition to document IDs. Note that
   *          this increases the RAM consumed per doc
   * @param maxDocsToCache
   *          the maximum number of documents for caching the documents and
   *          possible the scores. If the collector exceeds the threshold,
   *          no documents and scores are cached.
   */
  public static CachingCollector create(Collector other, boolean cacheScores, int maxDocsToCache) {
    return cacheScores ? new ScoreCachingCollector(other, maxDocsToCache) : new NoScoreCachingCollector(other, maxDocsToCache);
  }

  private boolean cached;

  private CachingCollector(Collector in) {
    super(in);
    cached = true;
  }

  Return true is this collector is able to replay collection.
/**
   * Return true is this collector is able to replay collection.
   */
  public final boolean isCached() {
    return cached;
  }

  Replays the cached doc IDs (and scores) to the given Collector. If this instance does not cache scores, then Scorer is not set on other.setScorer as well as scores are not replayed. 
Throws: IllegalStateException – 
          if this collector is not cached (i.e., if the RAM limits were too
          low for the number of documents + scores to cache).
IllegalArgumentException – 
          if the given Collect's does not support out-of-order collection,
          while the collector passed to the ctor does./**
   * Replays the cached doc IDs (and scores) to the given Collector. If this
   * instance does not cache scores, then Scorer is not set on
   * {@code other.setScorer} as well as scores are not replayed.
   *
   * @throws IllegalStateException
   *           if this collector is not cached (i.e., if the RAM limits were too
   *           low for the number of documents + scores to cache).
   * @throws IllegalArgumentException
   *           if the given Collect's does not support out-of-order collection,
   *           while the collector passed to the ctor does.
   */
  public abstract void replay(Collector other) throws IOException;

}
/

org.apache.lucene/ lucene-core/ 8.2.0/ org/apache/lucene/search/CachingCollector.java