/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import com.carrotsearch.hppc.LongIntScatterMap;
import com.carrotsearch.hppc.cursors.LongIntCursor;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.PriorityQueue;


/** {@link Facets} implementation that computes counts for
 *  all unique long values, more efficiently counting small values (0-1023) using an int array,
 *  and switching to a <code>HashMap</code> for values above 1023.
 *  Retrieve all facet counts, in value order, with {@link #getAllChildrenSortByValue},
 *  or get the topN values sorted by count with {@link #getTopChildrenSortByCount}.
 *
 *  @lucene.experimental */
public class LongValueFacetCounts extends Facets {
/** Used for all values that are < 1K. */
private final int[] counts = new int[1024];
/** Used for all values that are >= 1K. */
private final LongIntScatterMap hashCounts = new LongIntScatterMap();

private final String field;
/** Total number of values counted, which is the subset of hits that had a value for this field. */
private int totCount;
/** Create {@code LongValueFacetCounts}, using either single-valued {@link
 *  NumericDocValues} or multi-valued {@link SortedNumericDocValues} from the
 *  specified field. */
public LongValueFacetCounts(String field, FacetsCollector hits, boolean multiValued) throws IOException {
  this(field, null, hits, multiValued);
}
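// Usage sketch (illustrative only; the searcher, query and the "year" field are hypothetical,
// not defined by this class): collect hits, then count a single-valued long field.
//
//   FacetsCollector fc = new FacetsCollector();
//   FacetsCollector.search(searcher, query, 10, fc);
//   LongValueFacetCounts facets = new LongValueFacetCounts("year", fc, false);
//   FacetResult top = facets.getTopChildrenSortByCount(10);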
/** Create {@code LongValueFacetCounts}, using the provided
 *  {@link LongValuesSource}.  If hits is
 *  null then all facets are counted. */
public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits) throws IOException {
  this(field, valueSource, hits, false);
}
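// Sketch of counting through a LongValuesSource instead of raw doc values (illustrative;
// "price" is a hypothetical long doc values field). A values source is useful when the
// counted value is derived at query time rather than stored verbatim:
//
//   LongValuesSource source = LongValuesSource.fromLongField("price");
//   LongValueFacetCounts facets = new LongValueFacetCounts("price", source, fc);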
/** Create {@code LongValueFacetCounts}, using the provided
 *  {@link LongValuesSource}. */
public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits,
                            boolean multiValued) throws IOException {
  this.field = field;
  if (valueSource == null) {
    if (multiValued) {
      countMultiValued(field, hits.getMatchingDocs());
    } else {
      count(field, hits.getMatchingDocs());
    }
  } else {
    // value source is always single valued
    if (multiValued) {
      throw new IllegalArgumentException("can only compute multi-valued facets directly from doc values (when valueSource is null)");
    }
    count(valueSource, hits.getMatchingDocs());
  }
}
/** Counts all facet values for this reader.  This produces the same result as computing
 *  facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient. */
public LongValueFacetCounts(String field, IndexReader reader, boolean multiValued) throws IOException {
  this.field = field;
  if (multiValued) {
    countAllMultiValued(reader, field);
  } else {
    countAll(reader, field);
  }
}
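// Sketch of counting every document in the index with no query (illustrative; "year" is a
// hypothetical field). This walks the doc values for the whole reader directly, which is
// cheaper than first collecting hits for a MatchAllDocsQuery:
//
//   LongValueFacetCounts allCounts = new LongValueFacetCounts("year", reader, false);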
/** Counts all facet values for the provided {@link LongValuesSource}.  This produces the
 *  same result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery},
 *  but is more efficient. */
public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader) throws IOException {
  this.field = field;
  countAll(valueSource, field, reader);
}

private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs) throws IOException {
  for (MatchingDocs hits : matchingDocs) {
    LongValues fv = valueSource.getValues(hits.context, null);

    // NOTE: this is not as efficient as working directly with the doc values APIs in the sparse case
    // because we are doing a linear scan across all hits, but this API is more flexible since a
    // LongValuesSource can compute interesting values at query time

    DocIdSetIterator docs = hits.bits.iterator();
    for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS;) {
      // Skip missing docs:
      if (fv.advanceExact(doc)) {
        increment(fv.longValue());
        totCount++;
      }

      doc = docs.nextDoc();
    }
  }
}

private void count(String field, List<MatchingDocs> matchingDocs) throws IOException {
  for (MatchingDocs hits : matchingDocs) {
    NumericDocValues fv = hits.context.reader().getNumericDocValues(field);
    if (fv == null) {
      continue;
    }
    countOneSegment(fv, hits);
  }
}

private void countOneSegment(NumericDocValues values, MatchingDocs hits) throws IOException {
  DocIdSetIterator it = ConjunctionDISI.intersectIterators(
                           Arrays.asList(hits.bits.iterator(), values));
  for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
    increment(values.longValue());
    totCount++;
  }
}
/** Counts directly from SortedNumericDocValues. */
private void countMultiValued(String field, List<MatchingDocs> matchingDocs) throws IOException {
  for (MatchingDocs hits : matchingDocs) {
    SortedNumericDocValues values = hits.context.reader().getSortedNumericDocValues(field);
    if (values == null) {
      // this field has no doc values for this segment
      continue;
    }

    NumericDocValues singleValues = DocValues.unwrapSingleton(values);

    if (singleValues != null) {
      countOneSegment(singleValues, hits);
    } else {
      DocIdSetIterator it = ConjunctionDISI.intersectIterators(
                               Arrays.asList(hits.bits.iterator(), values));

      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        int limit = values.docValueCount();
        totCount += limit;
        for (int i = 0; i < limit; i++) {
          increment(values.nextValue());
        }
      }
    }
  }
}
/** Optimized version that directly counts all doc values. */
private void countAll(IndexReader reader, String field) throws IOException {
  for (LeafReaderContext context : reader.leaves()) {
    NumericDocValues values = context.reader().getNumericDocValues(field);
    if (values == null) {
      // this field has no doc values for this segment
      continue;
    }
    countAllOneSegment(values);
  }
}

private void countAllOneSegment(NumericDocValues values) throws IOException {
  int doc;
  while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    totCount++;
    increment(values.longValue());
  }
}

private void countAll(LongValuesSource valueSource, String field, IndexReader reader) throws IOException {
  for (LeafReaderContext context : reader.leaves()) {
    LongValues fv = valueSource.getValues(context, null);
    int maxDoc = context.reader().maxDoc();
    for (int doc = 0; doc < maxDoc; doc++) {
      // Skip missing docs:
      if (fv.advanceExact(doc)) {
        increment(fv.longValue());
        totCount++;
      }
    }
  }
}

private void countAllMultiValued(IndexReader reader, String field) throws IOException {
  for (LeafReaderContext context : reader.leaves()) {
    SortedNumericDocValues values = context.reader().getSortedNumericDocValues(field);
    if (values == null) {
      // this field has no doc values for this segment
      continue;
    }

    NumericDocValues singleValues = DocValues.unwrapSingleton(values);

    if (singleValues != null) {
      countAllOneSegment(singleValues);
    } else {
      int doc;
      while ((doc = values.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        int limit = values.docValueCount();
        totCount += limit;
        for (int i = 0; i < limit; i++) {
          increment(values.nextValue());
        }
      }
    }
  }
}

private void increment(long value) {
  if (value >= 0 && value < counts.length) {
    counts[(int) value]++;
  } else {
    hashCounts.addTo(value, 1);
  }
}

@Override
public FacetResult getTopChildren(int topN, String dim, String... path) {
  if (dim.equals(field) == false) {
    throw new IllegalArgumentException("invalid dim \"" + dim + "\"; should be \"" + field + "\"");
  }
  if (path.length != 0) {
    throw new IllegalArgumentException("path.length should be 0");
  }
  return getTopChildrenSortByCount(topN);
}
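// Bucketing in increment() above: values in 0..1023 land in the counts array (for example,
// increment(5) bumps counts[5]), while anything outside that range, such as increment(1500)
// or increment(-3), is tallied in the hashCounts map instead.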
/** Reusable hash entry to hold long facet value and int count. */
private static class Entry {
  int count;
  long value;
}
/** Returns the specified top number of facets, sorted by count. */
public FacetResult getTopChildrenSortByCount(int topN) {
  PriorityQueue<Entry> pq = new PriorityQueue<Entry>(Math.min(topN, counts.length + hashCounts.size())) {
      @Override
      protected boolean lessThan(Entry a, Entry b) {
        // sort by count descending, breaking ties by value ascending:
        return a.count < b.count || (a.count == b.count && a.value > b.value);
      }
    };

  int childCount = 0;
  Entry e = null;
  for (int i = 0; i < counts.length; i++) {
    if (counts[i] != 0) {
      childCount++;
      if (e == null) {
        e = new Entry();
      }
      e.value = i;
      e.count = counts[i];
      e = pq.insertWithOverflow(e);
    }
  }

  if (hashCounts.size() != 0) {
    childCount += hashCounts.size();
    for (LongIntCursor c : hashCounts) {
      int count = c.value;
      if (count != 0) {
        if (e == null) {
          e = new Entry();
        }
        e.value = c.key;
        e.count = count;
        e = pq.insertWithOverflow(e);
      }
    }
  }

  LabelAndValue[] results = new LabelAndValue[pq.size()];
  while (pq.size() != 0) {
    Entry entry = pq.pop();
    results[pq.size()] = new LabelAndValue(Long.toString(entry.value), entry.count);
  }

  return new FacetResult(field, new String[0], totCount, results, childCount);
}
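// Reading the result (illustrative): each LabelAndValue pairs the long value, rendered as a
// string label, with its count.
//
//   FacetResult top = facets.getTopChildrenSortByCount(10);
//   for (LabelAndValue lv : top.labelValues) {
//     long value = Long.parseLong(lv.label);  // the facet value
//     int count = lv.value.intValue();        // number of hits with that value
//   }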
/** Returns all unique values seen, sorted by value. */
public FacetResult getAllChildrenSortByValue() {
  List<LabelAndValue> labelValues = new ArrayList<>();

  // compact & sort hash table's arrays by value
  int[] hashCounts = new int[this.hashCounts.size()];
  long[] hashValues = new long[this.hashCounts.size()];

  int upto = 0;
  for (LongIntCursor c : this.hashCounts) {
    if (c.value != 0) {
      hashCounts[upto] = c.value;
      hashValues[upto] = c.key;
      upto++;
    }
  }

  assert upto == this.hashCounts.size() : "upto=" + upto + " hashCounts.size=" + this.hashCounts.size();

  new InPlaceMergeSorter() {
    @Override
    public int compare(int i, int j) {
      return Long.compare(hashValues[i], hashValues[j]);
    }

    @Override
    public void swap(int i, int j) {
      int x = hashCounts[i];
      hashCounts[i] = hashCounts[j];
      hashCounts[j] = x;

      long y = hashValues[j];
      hashValues[j] = hashValues[i];
      hashValues[i] = y;
    }
  }.sort(0, upto);

  boolean countsAdded = false;
  for (int i = 0; i < upto; i++) {
    if (countsAdded == false && hashValues[i] >= counts.length) {
      countsAdded = true;
      appendCounts(labelValues);
    }

    labelValues.add(new LabelAndValue(Long.toString(hashValues[i]), hashCounts[i]));
  }

  if (countsAdded == false) {
    appendCounts(labelValues);
  }

  return new FacetResult(field, new String[0], totCount,
                         labelValues.toArray(new LabelAndValue[0]), labelValues.size());
}

private void appendCounts(List<LabelAndValue> labelValues) {
  for (int i = 0; i < counts.length; i++) {
    if (counts[i] != 0) {
      labelValues.add(new LabelAndValue(Long.toString(i), counts[i]));
    }
  }
}

@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
  // TODO: should we impl this?
  throw new UnsupportedOperationException();
}

@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
  return Collections.singletonList(getTopChildren(topN, field));
}

@Override
public String toString() {
  StringBuilder b = new StringBuilder();
  b.append("LongValueFacetCounts totCount=");
  b.append(totCount);
  b.append(":\n");
  for (int i = 0; i < counts.length; i++) {
    if (counts[i] != 0) {
      b.append(" ");
      b.append(i);
      b.append(" -> count=");
      b.append(counts[i]);
      b.append('\n');
    }
  }
  if (hashCounts.size() != 0) {
    for (LongIntCursor c : hashCounts) {
      if (c.value != 0) {
        b.append(" ");
        b.append(c.key);
        b.append(" -> count=");
        b.append(c.value);
        b.append('\n');
      }
    }
  }

  return b.toString();
}
}