org.eclipse.jgit/org.eclipse.jgit/5.5.0.201909110433-r : org/eclipse/jgit/diff/SimilarityIndex.java

SimilarityIndex
http://www.eclipse.org/jgit//org.eclipse.jgit: Repository access and algorithms (Eclipse JGit Project)
Eclipse Distribution License (New BSD License)
Andrey Loskutov
Christian Halstrick
Dave Borowitz
David Pursehouse
Gunnar Wagenknecht
Jonathan Nieder
Jonathan Tan
Matthias Sohn
Sasa Zivkov
Terry Parker
Thomas Wolf
/*
 * Copyright (C) 2010, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectLoader;
import org.eclipse.jgit.lib.ObjectStream;

Index structure of lines/blocks in one file.
 This structure can be used to compute an approximation of the similarity between two files. The index is used by SimilarityRenameDetector to compute scores between files. 

To save space in memory, this index uses a space efficient encoding which
will not exceed 1 MiB per instance. The index starts out at a smaller size
(closer to 2 KiB), but may grow as more distinct blocks within the scanned
file are discovered.
Since: 4.0/**
 * Index structure of lines/blocks in one file.
 * <p>
 * This structure can be used to compute an approximation of the similarity
 * between two files. The index is used by
 * {@link org.eclipse.jgit.diff.SimilarityRenameDetector} to compute scores
 * between files.
 * <p>
 * To save space in memory, this index uses a space efficient encoding which
 * will not exceed 1 MiB per instance. The index starts out at a smaller size
 * (closer to 2 KiB), but may grow as more distinct blocks within the scanned
 * file are discovered.
 *
 * @since 4.0
 */
public class SimilarityIndex {
	A special TableFullException used in place of OutOfMemoryError. /** A special {@link TableFullException} used in place of OutOfMemoryError. */
	public static final TableFullException
			TABLE_FULL_OUT_OF_MEMORY = new TableFullException();

	Shift to apply before storing a key.

Within the 64 bit table record space, we leave the highest bit unset so
all values are positive. The lower 32 bits to count bytes.
/**
	 * Shift to apply before storing a key.
	 * <p>
	 * Within the 64 bit table record space, we leave the highest bit unset so
	 * all values are positive. The lower 32 bits to count bytes.
	 */
	private static final int KEY_SHIFT = 32;

	Maximum value of the count field, also mask to extract the count. /** Maximum value of the count field, also mask to extract the count. */
	private static final long MAX_COUNT = (1L << KEY_SHIFT) - 1;

	Total amount of bytes hashed into the structure, including \n. This is
usually the size of the file minus number of CRLF encounters.
/**
	 * Total amount of bytes hashed into the structure, including \n. This is
	 * usually the size of the file minus number of CRLF encounters.
	 */
	private long hashedCnt;

	Number of non-zero entries in idHash. /** Number of non-zero entries in {@link #idHash}. */
	private int idSize;

	idSize that triggers idHash to double in size. /** {@link #idSize} that triggers {@link #idHash} to double in size. */
	private int idGrowAt;

	Pairings of content keys and counters.

Slots in the table are actually two ints wedged into a single long. The
upper 32 bits stores the content key, and the remaining lower bits stores
the number of bytes associated with that key. Empty slots are denoted by
0, which cannot occur because the count cannot be 0. Values can only be
positive, which we enforce during key addition.
/**
	 * Pairings of content keys and counters.
	 * <p>
	 * Slots in the table are actually two ints wedged into a single long. The
	 * upper 32 bits stores the content key, and the remaining lower bits stores
	 * the number of bytes associated with that key. Empty slots are denoted by
	 * 0, which cannot occur because the count cannot be 0. Values can only be
	 * positive, which we enforce during key addition.
	 */
	private long[] idHash;

	idHash.length == 1 << idHashBits. /** {@code idHash.length == 1 << idHashBits}. */
	private int idHashBits;

	Create a new similarity index for the given object
Params: obj – 
           the object to hash
Throws: IOException – 
            file contents cannot be read from the repository.
TableFullException – 
            object hashing overflowed the storage capacity of the
            SimilarityIndex.
Returns: similarity index for this object/**
	 * Create a new similarity index for the given object
	 *
	 * @param obj
	 *            the object to hash
	 * @return similarity index for this object
	 * @throws java.io.IOException
	 *             file contents cannot be read from the repository.
	 * @throws org.eclipse.jgit.diff.SimilarityIndex.TableFullException
	 *             object hashing overflowed the storage capacity of the
	 *             SimilarityIndex.
	 */
	public static SimilarityIndex create(ObjectLoader obj) throws IOException,
			TableFullException {
		SimilarityIndex idx = new SimilarityIndex();
		idx.hash(obj);
		idx.sort();
		return idx;
	}

	SimilarityIndex() {
		idHashBits = 8;
		idHash = new long[1 << idHashBits];
		idGrowAt = growAt(idHashBits);
	}

	void hash(ObjectLoader obj) throws MissingObjectException, IOException,
			TableFullException {
		if (obj.isLarge()) {
			hashLargeObject(obj);
		} else {
			byte[] raw = obj.getCachedBytes();
			hash(raw, 0, raw.length);
		}
	}

	private void hashLargeObject(ObjectLoader obj) throws IOException,
			TableFullException {
		boolean text;
		try (ObjectStream in1 = obj.openStream()) {
			text = !RawText.isBinary(in1);
		}

		try (ObjectStream in2 = obj.openStream()) {
			hash(in2, in2.getSize(), text);
		}
	}

	void hash(byte[] raw, int ptr, int end) throws TableFullException {
		final boolean text = !RawText.isBinary(raw);
		hashedCnt = 0;
		while (ptr < end) {
			int hash = 5381;
			int blockHashedCnt = 0;
			int start = ptr;

			// Hash one line, or one block, whichever occurs first.
			do {
				int c = raw[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < end && raw[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (ptr < end && ptr - start < 64);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
		}
	}

	void hash(InputStream in, long remaining, boolean text) throws IOException,
			TableFullException {
		byte[] buf = new byte[4096];
		int ptr = 0;
		int cnt = 0;

		while (0 < remaining) {
			int hash = 5381;
			int blockHashedCnt = 0;

			// Hash one line, or one block, whichever occurs first.
			int n = 0;
			do {
				if (ptr == cnt) {
					ptr = 0;
					cnt = in.read(buf, 0, buf.length);
					if (cnt <= 0)
						throw new EOFException();
				}

				n++;
				int c = buf[ptr++] & 0xff;
				// Ignore CR in CRLF sequence if text
				if (text && c == '\r' && ptr < cnt && buf[ptr] == '\n')
					continue;
				blockHashedCnt++;
				if (c == '\n')
					break;
				hash = (hash << 5) + hash + c;
			} while (n < 64 && n < remaining);
			hashedCnt += blockHashedCnt;
			add(hash, blockHashedCnt);
			remaining -= n;
		}
	}

	Sort the internal table so it can be used for efficient scoring.

Once sorted, additional lines/blocks cannot be added to the index.
/**
	 * Sort the internal table so it can be used for efficient scoring.
	 * <p>
	 * Once sorted, additional lines/blocks cannot be added to the index.
	 */
	void sort() {
		// Sort the array. All of the empty space will wind up at the front,
		// because we forced all of the keys to always be positive. Later
		// we only work with the back half of the array.
		//
		Arrays.sort(idHash);
	}

	Compute the similarity score between this index and another.

A region of a file is defined as a line in a text file or a fixed-size
block in a binary file. To prepare an index, each region in the file is
hashed; the values and counts of hashes are retained in a sorted table.
Define the similarity fraction F as the count of matching regions
between the two files divided between the maximum count of regions in
either file. The similarity score is F multiplied by the maxScore
constant, yielding a range [0, maxScore]. It is defined as maxScore for
the degenerate case of two empty files.

The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
Params: dst – 
           the other index
maxScore – 
           the score representing a 100% match
Returns: the similarity score/**
	 * Compute the similarity score between this index and another.
	 * <p>
	 * A region of a file is defined as a line in a text file or a fixed-size
	 * block in a binary file. To prepare an index, each region in the file is
	 * hashed; the values and counts of hashes are retained in a sorted table.
	 * Define the similarity fraction F as the count of matching regions
	 * between the two files divided between the maximum count of regions in
	 * either file. The similarity score is F multiplied by the maxScore
	 * constant, yielding a range [0, maxScore]. It is defined as maxScore for
	 * the degenerate case of two empty files.
	 * <p>
	 * The similarity score is symmetrical; i.e. a.score(b) == b.score(a).
	 *
	 * @param dst
	 *            the other index
	 * @param maxScore
	 *            the score representing a 100% match
	 * @return the similarity score
	 */
	public int score(SimilarityIndex dst, int maxScore) {
		long max = Math.max(hashedCnt, dst.hashedCnt);
		if (max == 0)
			return maxScore;
		return (int) ((common(dst) * maxScore) / max);
	}

	long common(SimilarityIndex dst) {
		return common(this, dst);
	}

	private static long common(SimilarityIndex src, SimilarityIndex dst) {
		int srcIdx = src.packedIndex(0);
		int dstIdx = dst.packedIndex(0);
		long[] srcHash = src.idHash;
		long[] dstHash = dst.idHash;
		return common(srcHash, srcIdx, dstHash, dstIdx);
	}

	private static long common(long[] srcHash, int srcIdx, //
			long[] dstHash, int dstIdx) {
		if (srcIdx == srcHash.length || dstIdx == dstHash.length)
			return 0;

		long common = 0;
		int srcKey = keyOf(srcHash[srcIdx]);
		int dstKey = keyOf(dstHash[dstIdx]);

		for (;;) {
			if (srcKey == dstKey) {
				common += Math.min(countOf(srcHash[srcIdx]),
						countOf(dstHash[dstIdx]));

				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);

			} else if (srcKey < dstKey) {
				// Regions of src which do not appear in dst.
				if (++srcIdx == srcHash.length)
					break;
				srcKey = keyOf(srcHash[srcIdx]);

			} else /* if (dstKey < srcKey) */{
				// Regions of dst which do not appear in src.
				if (++dstIdx == dstHash.length)
					break;
				dstKey = keyOf(dstHash[dstIdx]);
			}
		}

		return common;
	}

	// Testing only
	int size() {
		return idSize;
	}

	// Testing only
	int key(int idx) {
		return keyOf(idHash[packedIndex(idx)]);
	}

	// Testing only
	long count(int idx) {
		return countOf(idHash[packedIndex(idx)]);
	}

	// Brute force approach only for testing.
	int findIndex(int key) {
		for (int i = 0; i < idSize; i++)
			if (key(i) == key)
				return i;
		return -1;
	}

	private int packedIndex(int idx) {
		return (idHash.length - idSize) + idx;
	}

	void add(int key, int cnt) throws TableFullException {
		key = (key * 0x9e370001) >>> 1; // Mix bits and ensure not negative.

		int j = slot(key);
		for (;;) {
			long v = idHash[j];
			if (v == 0) {
				// Empty slot in the table, store here.
				if (idGrowAt <= idSize) {
					grow();
					j = slot(key);
					continue;
				}
				idHash[j] = pair(key, cnt);
				idSize++;
				return;

			} else if (keyOf(v) == key) {
				// Same key, increment the counter. If it overflows, fail
				// indexing to prevent the key from being impacted.
				//
				idHash[j] = pair(key, countOf(v) + cnt);
				return;

			} else if (++j >= idHash.length) {
				j = 0;
			}
		}
	}

	private static long pair(int key, long cnt) throws TableFullException {
		if (MAX_COUNT < cnt)
			throw new TableFullException();
		return (((long) key) << KEY_SHIFT) | cnt;
	}

	private int slot(int key) {
		// We use 31 - idHashBits because the upper bit was already forced
		// to be 0 and we want the remaining high bits to be used as the
		// table slot.
		//
		return key >>> (31 - idHashBits);
	}

	private static int growAt(int idHashBits) {
		return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
	}

	private void grow() throws TableFullException {
		if (idHashBits == 30)
			throw new TableFullException();

		long[] oldHash = idHash;
		int oldSize = idHash.length;

		idHashBits++;
		idGrowAt = growAt(idHashBits);

		try {
			idHash = new long[1 << idHashBits];
		} catch (OutOfMemoryError noMemory) {
			throw TABLE_FULL_OUT_OF_MEMORY;
		}

		for (int i = 0; i < oldSize; i++) {
			long v = oldHash[i];
			if (v != 0) {
				int j = slot(keyOf(v));
				while (idHash[j] != 0)
					if (++j >= idHash.length)
						j = 0;
				idHash[j] = v;
			}
		}
	}

	private static int keyOf(long v) {
		return (int) (v >>> KEY_SHIFT);
	}

	private static long countOf(long v) {
		return v & MAX_COUNT;
	}

	Thrown by create() when file is too large. /** Thrown by {@code create()} when file is too large. */
	public static class TableFullException extends Exception {
		private static final long serialVersionUID = 1L;
	}
}
Params:	obj – the object to hash
Throws:	IOException – file contents cannot be read from the repository. TableFullException – object hashing overflowed the storage capacity of the SimilarityIndex.
Returns:	similarity index for this object
Params:	dst – the other index maxScore – the score representing a 100% match
Returns:	the similarity score
/

org.eclipse.jgit/ org.eclipse.jgit/ 5.5.0.201909110433-r/ org/eclipse/jgit/diff/SimilarityIndex.java