org.eclipse.jgit/org.eclipse.jgit/5.5.0.201909110433-r : org/eclipse/jgit/diff/HistogramDiffIndex.java

HistogramDiffIndex
http://www.eclipse.org/jgit//org.eclipse.jgit: Repository access and algorithms (Eclipse JGit Project)
Eclipse Distribution License (New BSD License)
Andrey Loskutov
Christian Halstrick
Dave Borowitz
David Pursehouse
Gunnar Wagenknecht
Jonathan Nieder
Jonathan Tan
Matthias Sohn
Sasa Zivkov
Terry Parker
Thomas Wolf
/*
 * Copyright (C) 2010, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import org.eclipse.jgit.internal.JGitText;

Support HistogramDiff by computing occurrence counts of elements. 
Each element in the range being considered is put into a hash table, tracking
the number of times that distinct element appears in the sequence. Once all
elements have been inserted from sequence A, each element of sequence B is
probed in the hash table and the longest common subsequence with the lowest
occurrence count in A is used as the result.
Type parameters: <S> – 
           type of the base sequence./**
 * Support {@link HistogramDiff} by computing occurrence counts of elements.
 * <p>
 * Each element in the range being considered is put into a hash table, tracking
 * the number of times that distinct element appears in the sequence. Once all
 * elements have been inserted from sequence A, each element of sequence B is
 * probed in the hash table and the longest common subsequence with the lowest
 * occurrence count in A is used as the result.
 *
 * @param <S>
 *            type of the base sequence.
 */
final class HistogramDiffIndex<S extends Sequence> {
	private static final int REC_NEXT_SHIFT = 28 + 8;

	private static final int REC_PTR_SHIFT = 8;

	private static final int REC_PTR_MASK = (1 << 28) - 1;

	private static final int REC_CNT_MASK = (1 << 8) - 1;

	private static final int MAX_PTR = REC_PTR_MASK;

	private static final int MAX_CNT = (1 << 8) - 1;

	private final int maxChainLength;

	private final HashedSequenceComparator<S> cmp;

	private final HashedSequence<S> a;

	private final HashedSequence<S> b;

	private final Edit region;

	Keyed by hash(HashedSequence, int) for HistogramDiffIndex<S>.recs index. /** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
	private final int[] table;

	Number of low bits to discard from a key to index HistogramDiffIndex<S>.table. /** Number of low bits to discard from a key to index {@link #table}. */
	private final int keyShift;

	Describes a unique element in sequence A.
The records in this table are actually 3-tuples of:

index of next record in this table that has same hash code
index of first element in this occurrence chain
occurrence count for this element (length of locs list)
 The occurrence count is capped at HistogramDiffIndex<S>.MAX_CNT, as the field is only a few bits wide. Elements that occur more frequently will have their count capped. /**
	 * Describes a unique element in sequence A.
	 *
	 * The records in this table are actually 3-tuples of:
	 * <ul>
	 * <li>index of next record in this table that has same hash code</li>
	 * <li>index of first element in this occurrence chain</li>
	 * <li>occurrence count for this element (length of locs list)</li>
	 * </ul>
	 *
	 * The occurrence count is capped at {@link #MAX_CNT}, as the field is only
	 * a few bits wide. Elements that occur more frequently will have their
	 * count capped.
	 */
	private long[] recs;

	Number of elements in HistogramDiffIndex<S>.recs; also is the unique element count. /** Number of elements in {@link #recs}; also is the unique element count. */
	private int recCnt;

	For ptr, next[ptr - ptrShift] has subsequent index. For the sequence element ptr, the value stored at location next[ptr - ptrShift] is the next occurrence of the exact same element in the sequence. Chains always run from the lowest index to the largest index. Therefore the array will store next[1] = 2, but never next[2] = 1. This allows a chain to terminate with 0, as 0 would never be a valid next element. The array is sized to be region.getLengthA() and element indexes are converted to array indexes by subtracting HistogramDiffIndex<S>.ptrShift, which is just a cached version of region.beginA. /**
	 * For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
	 *
	 * For the sequence element {@code ptr}, the value stored at location
	 * {@code next[ptr - ptrShift]} is the next occurrence of the exact same
	 * element in the sequence.
	 *
	 * Chains always run from the lowest index to the largest index. Therefore
	 * the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
	 * This allows a chain to terminate with {@code 0}, as {@code 0} would never
	 * be a valid next element.
	 *
	 * The array is sized to be {@code region.getLengthA()} and element indexes
	 * are converted to array indexes by subtracting {@link #ptrShift}, which is
	 * just a cached version of {@code region.beginA}.
	 */
	private int[] next;

	For element ptr in A, index of the record in HistogramDiffIndex<S>.recs array. The record at recs[recIdx[ptr - ptrShift]] is the record describing all occurrences of the element appearing in sequence A at position ptr. The record is needed to get the occurrence count of the element, or to locate all other occurrences of that element within sequence A. This index provides constant-time access to the record, and avoids needing to scan the hash chain. /**
	 * For element {@code ptr} in A, index of the record in {@link #recs} array.
	 *
	 * The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
	 * describing all occurrences of the element appearing in sequence A at
	 * position {@code ptr}. The record is needed to get the occurrence count of
	 * the element, or to locate all other occurrences of that element within
	 * sequence A. This index provides constant-time access to the record, and
	 * avoids needing to scan the hash chain.
	 */
	private int[] recIdx;

	Value to subtract from element indexes to key HistogramDiffIndex<S>.next array. /** Value to subtract from element indexes to key {@link #next} array. */
	private int ptrShift;

	private Edit lcs;

	private int cnt;

	private boolean hasCommon;

	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
			HashedSequence<S> a, HashedSequence<S> b, Edit r) {
		this.maxChainLength = maxChainLength;
		this.cmp = cmp;
		this.a = a;
		this.b = b;
		this.region = r;

		if (region.endA >= MAX_PTR)
			throw new IllegalArgumentException(
					JGitText.get().sequenceTooLargeForDiffAlgorithm);

		final int sz = r.getLengthA();
		final int tableBits = tableBits(sz);
		table = new int[1 << tableBits];
		keyShift = 32 - tableBits;
		ptrShift = r.beginA;

		recs = new long[Math.max(4, sz >>> 3)];
		next = new int[sz];
		recIdx = new int[sz];
	}

	Edit findLongestCommonSequence() {
		if (!scanA())
			return null;

		lcs = new Edit(0, 0);
		cnt = maxChainLength + 1;

		for (int bPtr = region.beginB; bPtr < region.endB;)
			bPtr = tryLongestCommonSequence(bPtr);

		return hasCommon && maxChainLength < cnt ? null : lcs;
	}

	private boolean scanA() {
		// Scan the elements backwards, inserting them into the hash table
		// as we go. Going in reverse places the earliest occurrence of any
		// element at the start of the chain, so we consider earlier matches
		// before later matches.
		//
		SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
			final int tIdx = hash(a, ptr);

			int chainLen = 0;
			for (int rIdx = table[tIdx]; rIdx != 0;) {
				final long rec = recs[rIdx];
				if (cmp.equals(a, recPtr(rec), a, ptr)) {
					// ptr is identical to another element. Insert it onto
					// the front of the existing element chain.
					//
					int newCnt = recCnt(rec) + 1;
					if (MAX_CNT < newCnt)
						newCnt = MAX_CNT;
					recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
					next[ptr - ptrShift] = recPtr(rec);
					recIdx[ptr - ptrShift] = rIdx;
					continue SCAN;
				}

				rIdx = recNext(rec);
				chainLen++;
			}

			if (chainLen == maxChainLength)
				return false;

			// This is the first time we have ever seen this particular
			// element in the sequence. Construct a new chain for it.
			//
			final int rIdx = ++recCnt;
			if (rIdx == recs.length) {
				int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
				long[] n = new long[sz];
				System.arraycopy(recs, 0, n, 0, recs.length);
				recs = n;
			}

			recs[rIdx] = recCreate(table[tIdx], ptr, 1);
			recIdx[ptr - ptrShift] = rIdx;
			table[tIdx] = rIdx;
		}
		return true;
	}

	private int tryLongestCommonSequence(int bPtr) {
		int bNext = bPtr + 1;
		int rIdx = table[hash(b, bPtr)];
		for (long rec; rIdx != 0; rIdx = recNext(rec)) {
			rec = recs[rIdx];

			// If there are more occurrences in A, don't use this chain.
			if (recCnt(rec) > cnt) {
				if (!hasCommon)
					hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
				continue;
			}

			int as = recPtr(rec);
			if (!cmp.equals(a, as, b, bPtr))
				continue;

			hasCommon = true;
			TRY_LOCATIONS: for (;;) {
				int np = next[as - ptrShift];
				int bs = bPtr;
				int ae = as + 1;
				int be = bs + 1;
				int rc = recCnt(rec);

				while (region.beginA < as && region.beginB < bs
						&& cmp.equals(a, as - 1, b, bs - 1)) {
					as--;
					bs--;
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
				}
				while (ae < region.endA && be < region.endB
						&& cmp.equals(a, ae, b, be)) {
					if (1 < rc)
						rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
					ae++;
					be++;
				}

				if (bNext < be)
					bNext = be;
				if (lcs.getLengthA() < ae - as || rc < cnt) {
					// If this region is the longest, or there are less
					// occurrences of it in A, its now our LCS.
					//
					lcs.beginA = as;
					lcs.beginB = bs;
					lcs.endA = ae;
					lcs.endB = be;
					cnt = rc;
				}

				// Because we added elements in reverse order index 0
				// cannot possibly be the next position. Its the first
				// element of the sequence and thus would have been the
				// value of as at the start of the TRY_LOCATIONS loop.
				//
				if (np == 0)
					break TRY_LOCATIONS;

				while (np < ae) {
					// The next location to consider was actually within
					// the LCS we examined above. Don't reconsider it.
					//
					np = next[np - ptrShift];
					if (np == 0)
						break TRY_LOCATIONS;
				}

				as = np;
			}
		}
		return bNext;
	}

	private int hash(HashedSequence<S> s, int idx) {
		return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
	}

	private static long recCreate(int next, int ptr, int cnt) {
		return ((long) next << REC_NEXT_SHIFT) //
				| ((long) ptr << REC_PTR_SHIFT) //
				| cnt;
	}

	private static int recNext(long rec) {
		return (int) (rec >>> REC_NEXT_SHIFT);
	}

	private static int recPtr(long rec) {
		return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
	}

	private static int recCnt(long rec) {
		return ((int) rec) & REC_CNT_MASK;
	}

	private static int tableBits(int sz) {
		int bits = 31 - Integer.numberOfLeadingZeros(sz);
		if (bits == 0)
			bits = 1;
		if (1 << bits < sz)
			bits++;
		return bits;
	}
}
/

org.eclipse.jgit/ org.eclipse.jgit/ 5.5.0.201909110433-r/ org/eclipse/jgit/diff/HistogramDiffIndex.java