/*
 * Copyright (C) 2010, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import org.eclipse.jgit.internal.JGitText;

Support HistogramDiff by computing occurrence counts of elements.

Each element in the range being considered is put into a hash table, tracking the number of times that distinct element appears in the sequence. Once all elements have been inserted from sequence A, each element of sequence B is probed in the hash table and the longest common subsequence with the lowest occurrence count in A is used as the result.

Type parameters:
  • <S> – type of the base sequence.
/** * Support {@link HistogramDiff} by computing occurrence counts of elements. * <p> * Each element in the range being considered is put into a hash table, tracking * the number of times that distinct element appears in the sequence. Once all * elements have been inserted from sequence A, each element of sequence B is * probed in the hash table and the longest common subsequence with the lowest * occurrence count in A is used as the result. * * @param <S> * type of the base sequence. */
final class HistogramDiffIndex<S extends Sequence> { private static final int REC_NEXT_SHIFT = 28 + 8; private static final int REC_PTR_SHIFT = 8; private static final int REC_PTR_MASK = (1 << 28) - 1; private static final int REC_CNT_MASK = (1 << 8) - 1; private static final int MAX_PTR = REC_PTR_MASK; private static final int MAX_CNT = (1 << 8) - 1; private final int maxChainLength; private final HashedSequenceComparator<S> cmp; private final HashedSequence<S> a; private final HashedSequence<S> b; private final Edit region; /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */ private final int[] table;
Number of low bits to discard from a key to index HistogramDiffIndex<S>.table.
/** Number of low bits to discard from a key to index {@link #table}. */
private final int keyShift;
Describes a unique element in sequence A. The records in this table are actually 3-tuples of:
  • index of next record in this table that has same hash code
  • index of first element in this occurrence chain
  • occurrence count for this element (length of locs list)
The occurrence count is capped at HistogramDiffIndex<S>.MAX_CNT, as the field is only a few bits wide. Elements that occur more frequently will have their count capped.
/** * Describes a unique element in sequence A. * * The records in this table are actually 3-tuples of: * <ul> * <li>index of next record in this table that has same hash code</li> * <li>index of first element in this occurrence chain</li> * <li>occurrence count for this element (length of locs list)</li> * </ul> * * The occurrence count is capped at {@link #MAX_CNT}, as the field is only * a few bits wide. Elements that occur more frequently will have their * count capped. */
private long[] recs;
Number of elements in HistogramDiffIndex<S>.recs; also is the unique element count.
/** Number of elements in {@link #recs}; also is the unique element count. */
private int recCnt;
For ptr, next[ptr - ptrShift] has subsequent index. For the sequence element ptr, the value stored at location next[ptr - ptrShift] is the next occurrence of the exact same element in the sequence. Chains always run from the lowest index to the largest index. Therefore the array will store next[1] = 2, but never next[2] = 1. This allows a chain to terminate with 0, as 0 would never be a valid next element. The array is sized to be region.getLengthA() and element indexes are converted to array indexes by subtracting HistogramDiffIndex<S>.ptrShift, which is just a cached version of region.beginA.
/** * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index. * * For the sequence element {@code ptr}, the value stored at location * {@code next[ptr - ptrShift]} is the next occurrence of the exact same * element in the sequence. * * Chains always run from the lowest index to the largest index. Therefore * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}. * This allows a chain to terminate with {@code 0}, as {@code 0} would never * be a valid next element. * * The array is sized to be {@code region.getLengthA()} and element indexes * are converted to array indexes by subtracting {@link #ptrShift}, which is * just a cached version of {@code region.beginA}. */
private int[] next;
For element ptr in A, index of the record in HistogramDiffIndex<S>.recs array. The record at recs[recIdx[ptr - ptrShift]] is the record describing all occurrences of the element appearing in sequence A at position ptr. The record is needed to get the occurrence count of the element, or to locate all other occurrences of that element within sequence A. This index provides constant-time access to the record, and avoids needing to scan the hash chain.
/** * For element {@code ptr} in A, index of the record in {@link #recs} array. * * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record * describing all occurrences of the element appearing in sequence A at * position {@code ptr}. The record is needed to get the occurrence count of * the element, or to locate all other occurrences of that element within * sequence A. This index provides constant-time access to the record, and * avoids needing to scan the hash chain. */
private int[] recIdx;
Value to subtract from element indexes to key HistogramDiffIndex<S>.next array.
/** Value to subtract from element indexes to key {@link #next} array. */
private int ptrShift; private Edit lcs; private int cnt; private boolean hasCommon; HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp, HashedSequence<S> a, HashedSequence<S> b, Edit r) { this.maxChainLength = maxChainLength; this.cmp = cmp; this.a = a; this.b = b; this.region = r; if (region.endA >= MAX_PTR) throw new IllegalArgumentException( JGitText.get().sequenceTooLargeForDiffAlgorithm); final int sz = r.getLengthA(); final int tableBits = tableBits(sz); table = new int[1 << tableBits]; keyShift = 32 - tableBits; ptrShift = r.beginA; recs = new long[Math.max(4, sz >>> 3)]; next = new int[sz]; recIdx = new int[sz]; } Edit findLongestCommonSequence() { if (!scanA()) return null; lcs = new Edit(0, 0); cnt = maxChainLength + 1; for (int bPtr = region.beginB; bPtr < region.endB;) bPtr = tryLongestCommonSequence(bPtr); return hasCommon && maxChainLength < cnt ? null : lcs; } private boolean scanA() { // Scan the elements backwards, inserting them into the hash table // as we go. Going in reverse places the earliest occurrence of any // element at the start of the chain, so we consider earlier matches // before later matches. // SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) { final int tIdx = hash(a, ptr); int chainLen = 0; for (int rIdx = table[tIdx]; rIdx != 0;) { final long rec = recs[rIdx]; if (cmp.equals(a, recPtr(rec), a, ptr)) { // ptr is identical to another element. Insert it onto // the front of the existing element chain. // int newCnt = recCnt(rec) + 1; if (MAX_CNT < newCnt) newCnt = MAX_CNT; recs[rIdx] = recCreate(recNext(rec), ptr, newCnt); next[ptr - ptrShift] = recPtr(rec); recIdx[ptr - ptrShift] = rIdx; continue SCAN; } rIdx = recNext(rec); chainLen++; } if (chainLen == maxChainLength) return false; // This is the first time we have ever seen this particular // element in the sequence. Construct a new chain for it. // final int rIdx = ++recCnt; if (rIdx == recs.length) { int sz = Math.min(recs.length << 1, 1 + region.getLengthA()); long[] n = new long[sz]; System.arraycopy(recs, 0, n, 0, recs.length); recs = n; } recs[rIdx] = recCreate(table[tIdx], ptr, 1); recIdx[ptr - ptrShift] = rIdx; table[tIdx] = rIdx; } return true; } private int tryLongestCommonSequence(int bPtr) { int bNext = bPtr + 1; int rIdx = table[hash(b, bPtr)]; for (long rec; rIdx != 0; rIdx = recNext(rec)) { rec = recs[rIdx]; // If there are more occurrences in A, don't use this chain. if (recCnt(rec) > cnt) { if (!hasCommon) hasCommon = cmp.equals(a, recPtr(rec), b, bPtr); continue; } int as = recPtr(rec); if (!cmp.equals(a, as, b, bPtr)) continue; hasCommon = true; TRY_LOCATIONS: for (;;) { int np = next[as - ptrShift]; int bs = bPtr; int ae = as + 1; int be = bs + 1; int rc = recCnt(rec); while (region.beginA < as && region.beginB < bs && cmp.equals(a, as - 1, b, bs - 1)) { as--; bs--; if (1 < rc) rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]])); } while (ae < region.endA && be < region.endB && cmp.equals(a, ae, b, be)) { if (1 < rc) rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]])); ae++; be++; } if (bNext < be) bNext = be; if (lcs.getLengthA() < ae - as || rc < cnt) { // If this region is the longest, or there are less // occurrences of it in A, its now our LCS. // lcs.beginA = as; lcs.beginB = bs; lcs.endA = ae; lcs.endB = be; cnt = rc; } // Because we added elements in reverse order index 0 // cannot possibly be the next position. Its the first // element of the sequence and thus would have been the // value of as at the start of the TRY_LOCATIONS loop. // if (np == 0) break TRY_LOCATIONS; while (np < ae) { // The next location to consider was actually within // the LCS we examined above. Don't reconsider it. // np = next[np - ptrShift]; if (np == 0) break TRY_LOCATIONS; } as = np; } } return bNext; } private int hash(HashedSequence<S> s, int idx) { return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift; } private static long recCreate(int next, int ptr, int cnt) { return ((long) next << REC_NEXT_SHIFT) // | ((long) ptr << REC_PTR_SHIFT) // | cnt; } private static int recNext(long rec) { return (int) (rec >>> REC_NEXT_SHIFT); } private static int recPtr(long rec) { return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK; } private static int recCnt(long rec) { return ((int) rec) & REC_CNT_MASK; } private static int tableBits(int sz) { int bits = 31 - Integer.numberOfLeadingZeros(sz); if (bits == 0) bits = 1; if (1 << bits < sz) bits++; return bits; } }