org.apache.lucene/lucene-core/8.2.0 : org/apache/lucene/util/MSBRadixSorter.java

MSBRadixSorter
http://lucene.apache.org/lucene-parent/lucene-core: Apache Lucene Java Core (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util;

import java.util.Arrays;

Radix sorter for variable-length strings. This class sorts based on the most significant byte first and falls back to IntroSorter when the size of the buckets to sort becomes small. It is NOT stable. Worst-case memory usage is about 2.3 KB. @lucene.internal /** Radix sorter for variable-length strings. This class sorts based on the most
 *  significant byte first and falls back to {@link IntroSorter} when the size
 *  of the buckets to sort becomes small. It is <b>NOT</b> stable.
 *  Worst-case memory usage is about {@code 2.3 KB}.
 *  @lucene.internal */
public abstract class MSBRadixSorter extends Sorter {

  // after that many levels of recursion we fall back to introsort anyway
  // this is used as a protection against the fact that radix sort performs
  // worse when there are long common prefixes (probably because of cache
  // locality)
  private static final int LEVEL_THRESHOLD = 8;
  // size of histograms: 256 + 1 to indicate that the string is finished
  private static final int HISTOGRAM_SIZE = 257;
  // buckets below this size will be sorted with introsort
  private static final int LENGTH_THRESHOLD = 100;

  // we store one histogram per recursion level
  private final int[][] histograms = new int[LEVEL_THRESHOLD][];
  private final int[] endOffsets = new int[HISTOGRAM_SIZE];
  private final int[] commonPrefix;

  private final int maxLength;

  Sole constructor.
Params: maxLength – the maximum length of keys, pass Integer.MAX_VALUE if unknown./**
   * Sole constructor.
   * @param maxLength the maximum length of keys, pass {@link Integer#MAX_VALUE} if unknown.
   */
  protected MSBRadixSorter(int maxLength) {
    this.maxLength = maxLength;
    this.commonPrefix = new int[Math.min(24, maxLength)];
  }

  Return the k-th byte of the entry at index i, or -1 if its length is less than or equal to k. This may only be called with a value of i between 0 included and maxLength excluded. /** Return the k-th byte of the entry at index {@code i}, or {@code -1} if
   * its length is less than or equal to {@code k}. This may only be called
   * with a value of {@code i} between {@code 0} included and
   * {@code maxLength} excluded. */
  protected abstract int byteAt(int i, int k);

  Get a fall-back sorter which may assume that the first k bytes of all compared strings are equal. /** Get a fall-back sorter which may assume that the first k bytes of all compared strings are equal. */
  protected Sorter getFallbackSorter(int k) {
    return new IntroSorter() {
      @Override
      protected void swap(int i, int j) {
        MSBRadixSorter.this.swap(i, j);
      }

      @Override
      protected int compare(int i, int j) {
        for (int o = k; o < maxLength; ++o) {
          final int b1 = byteAt(i, o);
          final int b2 = byteAt(j, o);
          if (b1 != b2) {
            return b1 - b2;
          } else if (b1 == -1) {
            break;
          }
        }
        return 0;
      }

      @Override
      protected void setPivot(int i) {
        pivot.setLength(0);
        for (int o = k; o < maxLength; ++o) {
          final int b = byteAt(i, o);
          if (b == -1) {
            break;
          }
          pivot.append((byte) b);
        }
      }

      @Override
      protected int comparePivot(int j) {
        for (int o = 0; o < pivot.length(); ++o) {
          final int b1 = pivot.byteAt(o) & 0xff;
          final int b2 = byteAt(j, k + o);
          if (b1 != b2) {
            return b1 - b2;
          }
        }
        if (k + pivot.length() == maxLength) {
          return 0;
        }
        return -1 - byteAt(j, k + pivot.length());
      }

      private final BytesRefBuilder pivot = new BytesRefBuilder();
    };
  }

  @Override
  protected final int compare(int i, int j) {
    throw new UnsupportedOperationException("unused: not a comparison-based sort");
  }

  @Override
  public void sort(int from, int to) {
    checkRange(from, to);
    sort(from, to, 0, 0);
  }

  private void sort(int from, int to, int k, int l) {
    if (to - from <= LENGTH_THRESHOLD || l >= LEVEL_THRESHOLD) {
      introSort(from, to, k);
    } else {
      radixSort(from, to, k, l);
    }
  }

  private void introSort(int from, int to, int k) {
    getFallbackSorter(k).sort(from, to);
  }

  Params: k – the character number to compare
l – the level of recursion/**
   * @param k the character number to compare
   * @param l the level of recursion
   */
  private void radixSort(int from, int to, int k, int l) {
    int[] histogram = histograms[l];
    if (histogram == null) {
      histogram = histograms[l] = new int[HISTOGRAM_SIZE];
    } else {
      Arrays.fill(histogram, 0);
    }

    final int commonPrefixLength = computeCommonPrefixLengthAndBuildHistogram(from, to, k, histogram);
    if (commonPrefixLength > 0) {
      // if there are no more chars to compare or if all entries fell into the
      // first bucket (which means strings are shorter than k) then we are done
      // otherwise recurse
      if (k + commonPrefixLength < maxLength
          && histogram[0] < to - from) {
        radixSort(from, to, k + commonPrefixLength, l);
      }
      return;
    }
    assert assertHistogram(commonPrefixLength, histogram);

    int[] startOffsets = histogram;
    int[] endOffsets = this.endOffsets;
    sumHistogram(histogram, endOffsets);
    reorder(from, to, startOffsets, endOffsets, k);
    endOffsets = startOffsets;

    if (k + 1 < maxLength) {
      // recurse on all but the first bucket since all keys are equals in this
      // bucket (we already compared all bytes)
      for (int prev = endOffsets[0], i = 1; i < HISTOGRAM_SIZE; ++i) {
        int h = endOffsets[i];
        final int bucketLen = h - prev;
        if (bucketLen > 1) {
          sort(from + prev, from + h, k + 1, l + 1);
        }
        prev = h;
      }
    }
  }

  // only used from assert
  private boolean assertHistogram(int commonPrefixLength, int[] histogram) {
    int numberOfUniqueBytes = 0;
    for (int freq : histogram) {
      if (freq > 0) {
        numberOfUniqueBytes++;
      }
    }
    if (numberOfUniqueBytes == 1) {
      assert commonPrefixLength >= 1;
    } else {
      assert commonPrefixLength == 0 : commonPrefixLength;
    }
    return true;
  }

  Return a number for the k-th character between 0 and HISTOGRAM_SIZE. /** Return a number for the k-th character between 0 and {@link #HISTOGRAM_SIZE}. */
  private int getBucket(int i, int k) {
    return byteAt(i, k) + 1;
  }

  Build a histogram of the number of values per bucket and return a common prefix length for all visited values. @see #buildHistogram /** Build a histogram of the number of values per {@link #getBucket(int, int) bucket}
   *  and return a common prefix length for all visited values.
   *  @see #buildHistogram */
  private int computeCommonPrefixLengthAndBuildHistogram(int from, int to, int k, int[] histogram) {
    final int[] commonPrefix = this.commonPrefix;
    int commonPrefixLength = Math.min(commonPrefix.length, maxLength - k);
    for (int j = 0; j < commonPrefixLength; ++j) {
      final int b = byteAt(from, k + j);
      commonPrefix[j] = b;
      if (b == -1) {
        commonPrefixLength = j + 1;
        break;
      }
    }

    int i;
    outer: for (i = from + 1; i < to; ++i) {
      for (int j = 0; j < commonPrefixLength; ++j) {
        final int b = byteAt(i, k + j);
        if (b != commonPrefix[j]) {
          commonPrefixLength = j;
          if (commonPrefixLength == 0) { // we have no common prefix
            histogram[commonPrefix[0] + 1] = i - from;
            histogram[b + 1] = 1;
            break outer;
          }
          break;
        }
      }
    }

    if (i < to) {
      // the loop got broken because there is no common prefix
      assert commonPrefixLength == 0;
      buildHistogram(i + 1, to, k, histogram);
    } else {
      assert commonPrefixLength > 0;
      histogram[commonPrefix[0] + 1] = to - from;
    }

    return commonPrefixLength;
  }

  Build an histogram of the k-th characters of values occurring between offsets from and to, using getBucket. /** Build an histogram of the k-th characters of values occurring between
   *  offsets {@code from} and {@code to}, using {@link #getBucket}. */
  private void buildHistogram(int from, int to, int k, int[] histogram) {
    for (int i = from; i < to; ++i) {
      histogram[getBucket(i, k)]++;
    }
  }

  Accumulate values of the histogram so that it does not store counts but start offsets. endOffsets will store the end offsets. /** Accumulate values of the histogram so that it does not store counts but
   *  start offsets. {@code endOffsets} will store the end offsets. */
  private static void sumHistogram(int[] histogram, int[] endOffsets) {
    int accum = 0;
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int count = histogram[i];
      histogram[i] = accum;
      accum += count;
      endOffsets[i] = accum;
    }
  }

  Reorder based on start/end offsets for each bucket. When this method
returns, startOffsets and endOffsets are equal.
Params: startOffsets – start offsets per bucket
endOffsets – end offsets per bucket/**
   * Reorder based on start/end offsets for each bucket. When this method
   * returns, startOffsets and endOffsets are equal.
   * @param startOffsets start offsets per bucket
   * @param endOffsets end offsets per bucket
   */
  private void reorder(int from, int to, int[] startOffsets, int[] endOffsets, int k) {
    // reorder in place, like the dutch flag problem
    for (int i = 0; i < HISTOGRAM_SIZE; ++i) {
      final int limit = endOffsets[i];
      for (int h1 = startOffsets[i]; h1 < limit; h1 = startOffsets[i]) {
        final int b = getBucket(from + h1, k);
        final int h2 = startOffsets[b]++;
        swap(from + h1, from + h2);
      }
    }
  }
}
/

org.apache.lucene/ lucene-core/ 8.2.0/ org/apache/lucene/util/MSBRadixSorter.java