/*
 * Copyright (C) 2008-2009, Google Inc.
 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.util;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.eclipse.jgit.lib.ObjectChecker.author;
import static org.eclipse.jgit.lib.ObjectChecker.committer;
import static org.eclipse.jgit.lib.ObjectChecker.encoding;
import static org.eclipse.jgit.lib.ObjectChecker.tagger;

import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.eclipse.jgit.annotations.Nullable;
import org.eclipse.jgit.errors.BinaryBlobException;
import org.eclipse.jgit.lib.Constants;
import org.eclipse.jgit.lib.PersonIdent;

Handy utility functions to parse raw object contents.
/** * Handy utility functions to parse raw object contents. */
public final class RawParseUtils {
UTF-8 charset constant.
Since:2.2
Deprecated:use StandardCharsets.UTF_8 instead
/** * UTF-8 charset constant. * * @since 2.2 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead */
@Deprecated public static final Charset UTF8_CHARSET = UTF_8; private static final byte[] digits10; private static final byte[] digits16; private static final byte[] footerLineKeyChars; private static final Map<String, Charset> encodingAliases; static { encodingAliases = new HashMap<>(); encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$ encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$ digits10 = new byte['9' + 1]; Arrays.fill(digits10, (byte) -1); for (char i = '0'; i <= '9'; i++) digits10[i] = (byte) (i - '0'); digits16 = new byte['f' + 1]; Arrays.fill(digits16, (byte) -1); for (char i = '0'; i <= '9'; i++) digits16[i] = (byte) (i - '0'); for (char i = 'a'; i <= 'f'; i++) digits16[i] = (byte) ((i - 'a') + 10); for (char i = 'A'; i <= 'F'; i++) digits16[i] = (byte) ((i - 'A') + 10); footerLineKeyChars = new byte['z' + 1]; footerLineKeyChars['-'] = 1; for (char i = '0'; i <= '9'; i++) footerLineKeyChars[i] = 1; for (char i = 'A'; i <= 'Z'; i++) footerLineKeyChars[i] = 1; for (char i = 'a'; i <= 'z'; i++) footerLineKeyChars[i] = 1; }
Determine if b[ptr] matches src.
Params:
  • b – the buffer to scan.
  • ptr – first position within b, this should match src[0].
  • src – the buffer to test for equality with b.
Returns:ptr + src.length if b[ptr..src.length] == src; else -1.
/** * Determine if b[ptr] matches src. * * @param b * the buffer to scan. * @param ptr * first position within b, this should match src[0]. * @param src * the buffer to test for equality with b. * @return ptr + src.length if b[ptr..src.length] == src; else -1. */
public static final int match(byte[] b, int ptr, byte[] src) { if (ptr + src.length > b.length) return -1; for (int i = 0; i < src.length; i++, ptr++) if (b[ptr] != src[i]) return -1; return ptr; } private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
Format a base 10 numeric into a temporary buffer.

Formatting is performed backwards. The method starts at offset o-1 and ends at o-1-digits, where digits is the number of positions necessary to store the base 10 value.

The argument and return values from this method make it easy to chain writing, for example:

final byte[] tmp = new byte[64];
int ptr = tmp.length;
tmp[--ptr] = '\n';
ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
tmp[--ptr] = ' ';
ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
tmp[--ptr] = 0;
final String str = new String(tmp, ptr, tmp.length - ptr);
Params:
  • b – buffer to write into.
  • o – one offset past the location where writing will begin; writing proceeds towards lower index values.
  • value – the value to store.
Returns:the new offset value o. This is the position of the last byte written. Additional writing should start at one position earlier.
/** * Format a base 10 numeric into a temporary buffer. * <p> * Formatting is performed backwards. The method starts at offset * <code>o-1</code> and ends at <code>o-1-digits</code>, where * <code>digits</code> is the number of positions necessary to store the * base 10 value. * <p> * The argument and return values from this method make it easy to chain * writing, for example: * </p> * * <pre> * final byte[] tmp = new byte[64]; * int ptr = tmp.length; * tmp[--ptr] = '\n'; * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); * tmp[--ptr] = ' '; * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); * tmp[--ptr] = 0; * final String str = new String(tmp, ptr, tmp.length - ptr); * </pre> * * @param b * buffer to write into. * @param o * one offset past the location where writing will begin; writing * proceeds towards lower index values. * @param value * the value to store. * @return the new offset value <code>o</code>. This is the position of * the last byte written. Additional writing should start at one * position earlier. */
public static int formatBase10(final byte[] b, int o, int value) { if (value == 0) { b[--o] = '0'; return o; } final boolean isneg = value < 0; if (isneg) value = -value; while (value != 0) { b[--o] = base10byte[value % 10]; value /= 10; } if (isneg) b[--o] = '-'; return o; }
Parse a base 10 numeric from a sequence of ASCII digits into an int.

Digit sequences can begin with an optional run of spaces before the sequence, and may start with a '+' or a '-' to indicate sign position. Any other characters will cause the method to stop and return the current result to the caller.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start parsing digits at.
  • ptrResult – optional location to return the new ptr value through. If null the ptr value will be discarded.
Returns:the value at this location; 0 if the location is not a valid numeric.
/** * Parse a base 10 numeric from a sequence of ASCII digits into an int. * <p> * Digit sequences can begin with an optional run of spaces before the * sequence, and may start with a '+' or a '-' to indicate sign position. * Any other characters will cause the method to stop and return the current * result to the caller. * * @param b * buffer to scan. * @param ptr * position within buffer to start parsing digits at. * @param ptrResult * optional location to return the new ptr value through. If null * the ptr value will be discarded. * @return the value at this location; 0 if the location is not a valid * numeric. */
public static final int parseBase10(final byte[] b, int ptr, final MutableInteger ptrResult) { int r = 0; int sign = 0; try { final int sz = b.length; while (ptr < sz && b[ptr] == ' ') ptr++; if (ptr >= sz) return 0; switch (b[ptr]) { case '-': sign = -1; ptr++; break; case '+': ptr++; break; } while (ptr < sz) { final byte v = digits10[b[ptr]]; if (v < 0) break; r = (r * 10) + v; ptr++; } } catch (ArrayIndexOutOfBoundsException e) { // Not a valid digit. } if (ptrResult != null) ptrResult.value = ptr; return sign < 0 ? -r : r; }
Parse a base 10 numeric from a sequence of ASCII digits into a long.

Digit sequences can begin with an optional run of spaces before the sequence, and may start with a '+' or a '-' to indicate sign position. Any other characters will cause the method to stop and return the current result to the caller.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start parsing digits at.
  • ptrResult – optional location to return the new ptr value through. If null the ptr value will be discarded.
Returns:the value at this location; 0 if the location is not a valid numeric.
/** * Parse a base 10 numeric from a sequence of ASCII digits into a long. * <p> * Digit sequences can begin with an optional run of spaces before the * sequence, and may start with a '+' or a '-' to indicate sign position. * Any other characters will cause the method to stop and return the current * result to the caller. * * @param b * buffer to scan. * @param ptr * position within buffer to start parsing digits at. * @param ptrResult * optional location to return the new ptr value through. If null * the ptr value will be discarded. * @return the value at this location; 0 if the location is not a valid * numeric. */
public static final long parseLongBase10(final byte[] b, int ptr, final MutableInteger ptrResult) { long r = 0; int sign = 0; try { final int sz = b.length; while (ptr < sz && b[ptr] == ' ') ptr++; if (ptr >= sz) return 0; switch (b[ptr]) { case '-': sign = -1; ptr++; break; case '+': ptr++; break; } while (ptr < sz) { final byte v = digits10[b[ptr]]; if (v < 0) break; r = (r * 10) + v; ptr++; } } catch (ArrayIndexOutOfBoundsException e) { // Not a valid digit. } if (ptrResult != null) ptrResult.value = ptr; return sign < 0 ? -r : r; }
Parse 4 character base 16 (hex) formatted string to unsigned integer.

The number is read in network byte order, that is, most significant nybble first.

Params:
  • bs – buffer to parse digits from; positions [p, p+4) will be parsed.
  • p – first position within the buffer to parse.
Throws:
Returns:the integer value.
/** * Parse 4 character base 16 (hex) formatted string to unsigned integer. * <p> * The number is read in network byte order, that is, most significant * nybble first. * * @param bs * buffer to parse digits from; positions {@code [p, p+4)} will * be parsed. * @param p * first position within the buffer to parse. * @return the integer value. * @throws java.lang.ArrayIndexOutOfBoundsException * if the string is not hex formatted. */
public static final int parseHexInt16(final byte[] bs, final int p) { int r = digits16[bs[p]] << 4; r |= digits16[bs[p + 1]]; r <<= 4; r |= digits16[bs[p + 2]]; r <<= 4; r |= digits16[bs[p + 3]]; if (r < 0) throw new ArrayIndexOutOfBoundsException(); return r; }
Parse 8 character base 16 (hex) formatted string to unsigned integer.

The number is read in network byte order, that is, most significant nybble first.

Params:
  • bs – buffer to parse digits from; positions [p, p+8) will be parsed.
  • p – first position within the buffer to parse.
Throws:
Returns:the integer value.
/** * Parse 8 character base 16 (hex) formatted string to unsigned integer. * <p> * The number is read in network byte order, that is, most significant * nybble first. * * @param bs * buffer to parse digits from; positions {@code [p, p+8)} will * be parsed. * @param p * first position within the buffer to parse. * @return the integer value. * @throws java.lang.ArrayIndexOutOfBoundsException * if the string is not hex formatted. */
public static final int parseHexInt32(final byte[] bs, final int p) { int r = digits16[bs[p]] << 4; r |= digits16[bs[p + 1]]; r <<= 4; r |= digits16[bs[p + 2]]; r <<= 4; r |= digits16[bs[p + 3]]; r <<= 4; r |= digits16[bs[p + 4]]; r <<= 4; r |= digits16[bs[p + 5]]; r <<= 4; r |= digits16[bs[p + 6]]; final int last = digits16[bs[p + 7]]; if (r < 0 || last < 0) throw new ArrayIndexOutOfBoundsException(); return (r << 4) | last; }
Parse 16 character base 16 (hex) formatted string to unsigned long.

The number is read in network byte order, that is, most significant nibble first.

Params:
  • bs – buffer to parse digits from; positions [p, p+16) will be parsed.
  • p – first position within the buffer to parse.
Throws:
Returns:the integer value.
Since:4.3
/** * Parse 16 character base 16 (hex) formatted string to unsigned long. * <p> * The number is read in network byte order, that is, most significant * nibble first. * * @param bs * buffer to parse digits from; positions {@code [p, p+16)} will * be parsed. * @param p * first position within the buffer to parse. * @return the integer value. * @throws java.lang.ArrayIndexOutOfBoundsException * if the string is not hex formatted. * @since 4.3 */
public static final long parseHexInt64(final byte[] bs, final int p) { long r = digits16[bs[p]] << 4; r |= digits16[bs[p + 1]]; r <<= 4; r |= digits16[bs[p + 2]]; r <<= 4; r |= digits16[bs[p + 3]]; r <<= 4; r |= digits16[bs[p + 4]]; r <<= 4; r |= digits16[bs[p + 5]]; r <<= 4; r |= digits16[bs[p + 6]]; r <<= 4; r |= digits16[bs[p + 7]]; r <<= 4; r |= digits16[bs[p + 8]]; r <<= 4; r |= digits16[bs[p + 9]]; r <<= 4; r |= digits16[bs[p + 10]]; r <<= 4; r |= digits16[bs[p + 11]]; r <<= 4; r |= digits16[bs[p + 12]]; r <<= 4; r |= digits16[bs[p + 13]]; r <<= 4; r |= digits16[bs[p + 14]]; final int last = digits16[bs[p + 15]]; if (r < 0 || last < 0) throw new ArrayIndexOutOfBoundsException(); return (r << 4) | last; }
Parse a single hex digit to its numeric value (0-15).
Params:
  • digit – hex character to parse.
Throws:
Returns:numeric value, in the range 0-15.
/** * Parse a single hex digit to its numeric value (0-15). * * @param digit * hex character to parse. * @return numeric value, in the range 0-15. * @throws java.lang.ArrayIndexOutOfBoundsException * if the input digit is not a valid hex digit. */
public static final int parseHexInt4(final byte digit) { final byte r = digits16[digit]; if (r < 0) throw new ArrayIndexOutOfBoundsException(); return r; }
Parse a Git style timezone string.

The sequence "-0315" will be parsed as the numeric value -195, as the lower two positions count minutes, not 100ths of an hour.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start parsing digits at.
Returns:the timezone at this location, expressed in minutes.
/** * Parse a Git style timezone string. * <p> * The sequence "-0315" will be parsed as the numeric value -195, as the * lower two positions count minutes, not 100ths of an hour. * * @param b * buffer to scan. * @param ptr * position within buffer to start parsing digits at. * @return the timezone at this location, expressed in minutes. */
public static final int parseTimeZoneOffset(byte[] b, int ptr) { return parseTimeZoneOffset(b, ptr, null); }
Parse a Git style timezone string.

The sequence "-0315" will be parsed as the numeric value -195, as the lower two positions count minutes, not 100ths of an hour.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start parsing digits at.
  • ptrResult – optional location to return the new ptr value through. If null the ptr value will be discarded.
Returns:the timezone at this location, expressed in minutes.
Since:4.1
/** * Parse a Git style timezone string. * <p> * The sequence "-0315" will be parsed as the numeric value -195, as the * lower two positions count minutes, not 100ths of an hour. * * @param b * buffer to scan. * @param ptr * position within buffer to start parsing digits at. * @param ptrResult * optional location to return the new ptr value through. If null * the ptr value will be discarded. * @return the timezone at this location, expressed in minutes. * @since 4.1 */
public static final int parseTimeZoneOffset(final byte[] b, int ptr, MutableInteger ptrResult) { final int v = parseBase10(b, ptr, ptrResult); final int tzMins = v % 100; final int tzHours = v / 100; return tzHours * 60 + tzMins; }
Locate the first position after a given character.
Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for chrA at.
  • chrA – character to find.
Returns:new position just after chrA.
/** * Locate the first position after a given character. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for chrA at. * @param chrA * character to find. * @return new position just after chrA. */
public static final int next(byte[] b, int ptr, char chrA) { final int sz = b.length; while (ptr < sz) { if (b[ptr++] == chrA) return ptr; } return ptr; }
Locate the first position after the next LF.

This method stops on the first '\n' it finds.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for LF at.
Returns:new position just after the first LF found.
/** * Locate the first position after the next LF. * <p> * This method stops on the first '\n' it finds. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for LF at. * @return new position just after the first LF found. */
public static final int nextLF(byte[] b, int ptr) { return next(b, ptr, '\n'); }
Locate the first position after either the given character or LF.

This method stops on the first match it finds from either chrA or '\n'.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for chrA or LF at.
  • chrA – character to find.
Returns:new position just after the first chrA or LF to be found.
/** * Locate the first position after either the given character or LF. * <p> * This method stops on the first match it finds from either chrA or '\n'. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for chrA or LF at. * @param chrA * character to find. * @return new position just after the first chrA or LF to be found. */
public static final int nextLF(byte[] b, int ptr, char chrA) { final int sz = b.length; while (ptr < sz) { final byte c = b[ptr++]; if (c == chrA || c == '\n') return ptr; } return ptr; }
Locate the end of the header. Note that headers may be more than one line long.
Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for the end-of-header.
Returns:new position just after the header. This is either b.length, or the index of the header's terminating newline.
Since:5.1
/** * Locate the end of the header. Note that headers may be * more than one line long. * @param b * buffer to scan. * @param ptr * position within buffer to start looking for the end-of-header. * @return new position just after the header. This is either * b.length, or the index of the header's terminating newline. * @since 5.1 */
public static final int headerEnd(final byte[] b, int ptr) { final int sz = b.length; while (ptr < sz) { final byte c = b[ptr++]; if (c == '\n' && (ptr == sz || b[ptr] != ' ')) { return ptr - 1; } } return ptr - 1; }
Find the start of the contents of a given header.
Params:
  • b – buffer to scan.
  • headerName – header to search for
  • ptr – position within buffer to start looking for header at.
Returns:new position at the start of the header's contents, -1 for not found
Since:5.1
/** * Find the start of the contents of a given header. * * @param b * buffer to scan. * @param headerName * header to search for * @param ptr * position within buffer to start looking for header at. * @return new position at the start of the header's contents, -1 for * not found * @since 5.1 */
public static final int headerStart(byte[] headerName, byte[] b, int ptr) { // Start by advancing to just past a LF or buffer start if (ptr != 0) { ptr = nextLF(b, ptr - 1); } while (ptr < b.length - (headerName.length + 1)) { boolean found = true; for (int i = 0; i < headerName.length; i++) { if (headerName[i] != b[ptr++]) { found = false; break; } } if (found && b[ptr++] == ' ') { return ptr; } ptr = nextLF(b, ptr); } return -1; }
Locate the first position before a given character.
Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for chrA at.
  • chrA – character to find.
Returns:new position just before chrA, -1 for not found
/** * Locate the first position before a given character. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for chrA at. * @param chrA * character to find. * @return new position just before chrA, -1 for not found */
public static final int prev(byte[] b, int ptr, char chrA) { if (ptr == b.length) --ptr; while (ptr >= 0) { if (b[ptr--] == chrA) return ptr; } return ptr; }
Locate the first position before the previous LF.

This method stops on the first '\n' it finds.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for LF at.
Returns:new position just before the first LF found, -1 for not found
/** * Locate the first position before the previous LF. * <p> * This method stops on the first '\n' it finds. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for LF at. * @return new position just before the first LF found, -1 for not found */
public static final int prevLF(byte[] b, int ptr) { return prev(b, ptr, '\n'); }
Locate the previous position before either the given character or LF.

This method stops on the first match it finds from either chrA or '\n'.

Params:
  • b – buffer to scan.
  • ptr – position within buffer to start looking for chrA or LF at.
  • chrA – character to find.
Returns:new position just before the first chrA or LF to be found, -1 for not found
/** * Locate the previous position before either the given character or LF. * <p> * This method stops on the first match it finds from either chrA or '\n'. * * @param b * buffer to scan. * @param ptr * position within buffer to start looking for chrA or LF at. * @param chrA * character to find. * @return new position just before the first chrA or LF to be found, -1 for * not found */
public static final int prevLF(byte[] b, int ptr, char chrA) { if (ptr == b.length) --ptr; while (ptr >= 0) { final byte c = b[ptr--]; if (c == chrA || c == '\n') return ptr; } return ptr; }
Index the region between [ptr, end) to find line starts.

The returned list is 1 indexed. Index 0 contains Integer.MIN_VALUE to pad the list out.

Using a 1 indexed list means that line numbers can be directly accessed from the list, so list.get(1) (aka get line 1) returns ptr.

The last element (index map.size()-1) always contains end.

Params:
  • buf – buffer to scan.
  • ptr – position within the buffer corresponding to the first byte of line 1.
  • end – 1 past the end of the content within buf.
Returns:a line map indicating the starting position of each line.
/** * Index the region between <code>[ptr, end)</code> to find line starts. * <p> * The returned list is 1 indexed. Index 0 contains * {@link java.lang.Integer#MIN_VALUE} to pad the list out. * <p> * Using a 1 indexed list means that line numbers can be directly accessed * from the list, so <code>list.get(1)</code> (aka get line 1) returns * <code>ptr</code>. * <p> * The last element (index <code>map.size()-1</code>) always contains * <code>end</code>. * * @param buf * buffer to scan. * @param ptr * position within the buffer corresponding to the first byte of * line 1. * @param end * 1 past the end of the content within <code>buf</code>. * @return a line map indicating the starting position of each line. */
public static final IntList lineMap(byte[] buf, int ptr, int end) { IntList map = new IntList((end - ptr) / 36); map.fillTo(1, Integer.MIN_VALUE); for (; ptr < end; ptr = nextLF(buf, ptr)) { map.add(ptr); } map.add(end); return map; }
Like lineMap(byte[], int, int) but throw BinaryBlobException if a NUL byte is encountered.
Params:
  • buf – buffer to scan.
  • ptr – position within the buffer corresponding to the first byte of line 1.
  • end – 1 past the end of the content within buf.
Throws:
Returns:a line map indicating the starting position of each line.
Since:5.0
/** * Like {@link #lineMap(byte[], int, int)} but throw * {@link BinaryBlobException} if a NUL byte is encountered. * * @param buf * buffer to scan. * @param ptr * position within the buffer corresponding to the first byte of * line 1. * @param end * 1 past the end of the content within <code>buf</code>. * @return a line map indicating the starting position of each line. * @throws BinaryBlobException * if a NUL byte is found. * @since 5.0 */
public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end) throws BinaryBlobException { IntList map = lineMapOrNull(buf, ptr, end); if (map == null) { throw new BinaryBlobException(); } return map; } @Nullable private static IntList lineMapOrNull(byte[] buf, int ptr, int end) { // Experimentally derived from multiple source repositories // the average number of bytes/line is 36. Its a rough guess // to initially size our map close to the target. IntList map = new IntList((end - ptr) / 36); map.add(Integer.MIN_VALUE); boolean foundLF = true; for (; ptr < end; ptr++) { if (foundLF) { map.add(ptr); } if (buf[ptr] == '\0') { return null; } foundLF = (buf[ptr] == '\n'); } map.add(end); return map; }
Locate the "author " header line data.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the commit buffer and does not accidentally look at message body.
Returns:position just after the space in "author ", so the first character of the author's name. If no author header can be located -1 is returned.
/** * Locate the "author " header line data. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the * commit buffer and does not accidentally look at message body. * @return position just after the space in "author ", so the first * character of the author's name. If no author header can be * located -1 is returned. */
public static final int author(byte[] b, int ptr) { final int sz = b.length; if (ptr == 0) ptr += 46; // skip the "tree ..." line. while (ptr < sz && b[ptr] == 'p') ptr += 48; // skip this parent. return match(b, ptr, author); }
Locate the "committer " header line data.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the commit buffer and does not accidentally look at message body.
Returns:position just after the space in "committer ", so the first character of the committer's name. If no committer header can be located -1 is returned.
/** * Locate the "committer " header line data. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the * commit buffer and does not accidentally look at message body. * @return position just after the space in "committer ", so the first * character of the committer's name. If no committer header can be * located -1 is returned. */
public static final int committer(byte[] b, int ptr) { final int sz = b.length; if (ptr == 0) ptr += 46; // skip the "tree ..." line. while (ptr < sz && b[ptr] == 'p') ptr += 48; // skip this parent. if (ptr < sz && b[ptr] == 'a') ptr = nextLF(b, ptr); return match(b, ptr, committer); }
Locate the "tagger " header line data.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the tag buffer and does not accidentally look at message body.
Returns:position just after the space in "tagger ", so the first character of the tagger's name. If no tagger header can be located -1 is returned.
/** * Locate the "tagger " header line data. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the tag * buffer and does not accidentally look at message body. * @return position just after the space in "tagger ", so the first * character of the tagger's name. If no tagger header can be * located -1 is returned. */
public static final int tagger(byte[] b, int ptr) { final int sz = b.length; if (ptr == 0) ptr += 48; // skip the "object ..." line. while (ptr < sz) { if (b[ptr] == '\n') return -1; final int m = match(b, ptr, tagger); if (m >= 0) return m; ptr = nextLF(b, ptr); } return -1; }
Locate the "encoding " header line.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the buffer and does not accidentally look at the message body.
Returns:position just after the space in "encoding ", so the first character of the encoding's name. If no encoding header can be located -1 is returned (and UTF-8 should be assumed).
/** * Locate the "encoding " header line. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the * buffer and does not accidentally look at the message body. * @return position just after the space in "encoding ", so the first * character of the encoding's name. If no encoding header can be * located -1 is returned (and UTF-8 should be assumed). */
public static final int encoding(byte[] b, int ptr) { final int sz = b.length; while (ptr < sz) { if (b[ptr] == '\n') return -1; if (b[ptr] == 'e') break; ptr = nextLF(b, ptr); } return match(b, ptr, encoding); }
Parse the "encoding " header as a string.

Locates the "encoding " header (if present) and returns its value.

Params:
  • b – buffer to scan.
Returns:the encoding header as specified in the commit; null if the header was not present and should be assumed.
Since:4.2
/** * Parse the "encoding " header as a string. * <p> * Locates the "encoding " header (if present) and returns its value. * * @param b * buffer to scan. * @return the encoding header as specified in the commit; null if the * header was not present and should be assumed. * @since 4.2 */
@Nullable public static String parseEncodingName(byte[] b) { int enc = encoding(b, 0); if (enc < 0) { return null; } int lf = nextLF(b, enc); return decode(UTF_8, b, enc, lf - 1); }
Parse the "encoding " header into a character set reference.

Locates the "encoding " header (if present) by first calling encoding(byte[], int) and then returns the proper character set to apply to this buffer to evaluate its contents as character data.

If no encoding header is present UTF-8 is assumed.

Params:
  • b – buffer to scan.
Throws:
Returns:the Java character set representation. Never null.
/** * Parse the "encoding " header into a character set reference. * <p> * Locates the "encoding " header (if present) by first calling * {@link #encoding(byte[], int)} and then returns the proper character set * to apply to this buffer to evaluate its contents as character data. * <p> * If no encoding header is present {@code UTF-8} is assumed. * * @param b * buffer to scan. * @return the Java character set representation. Never null. * @throws IllegalCharsetNameException * if the character set requested by the encoding header is * malformed and unsupportable. * @throws UnsupportedCharsetException * if the JRE does not support the character set requested by * the encoding header. */
public static Charset parseEncoding(byte[] b) { String enc = parseEncodingName(b); if (enc == null) { return UTF_8; } String name = enc.trim(); try { return Charset.forName(name); } catch (IllegalCharsetNameException | UnsupportedCharsetException badName) { Charset aliased = charsetForAlias(name); if (aliased != null) { return aliased; } throw badName; } }
Parse a name string (e.g. author, committer, tagger) into a PersonIdent.

Leading spaces won't be trimmed from the string, i.e. will show up in the parsed name afterwards.

Params:
  • in – the string to parse a name from.
Returns:the parsed identity or null in case the identity could not be parsed.
/** * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. * <p> * Leading spaces won't be trimmed from the string, i.e. will show up in the * parsed name afterwards. * * @param in * the string to parse a name from. * @return the parsed identity or null in case the identity could not be * parsed. */
public static PersonIdent parsePersonIdent(String in) { return parsePersonIdent(Constants.encode(in), 0); }
Parse a name line (e.g. author, committer, tagger) into a PersonIdent.

When passing in a value for nameB callers should use the return value of author(byte[], int) or committer(byte[], int), as these methods provide the proper position within the buffer.

Params:
  • raw – the buffer to parse character data from.
  • nameB – first position of the identity information. This should be the first position after the space which delimits the header field name (e.g. "author" or "committer") from the rest of the identity line.
Returns:the parsed identity or null in case the identity could not be parsed.
/** * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. * <p> * When passing in a value for <code>nameB</code> callers should use the * return value of {@link #author(byte[], int)} or * {@link #committer(byte[], int)}, as these methods provide the proper * position within the buffer. * * @param raw * the buffer to parse character data from. * @param nameB * first position of the identity information. This should be the * first position after the space which delimits the header field * name (e.g. "author" or "committer") from the rest of the * identity line. * @return the parsed identity or null in case the identity could not be * parsed. */
public static PersonIdent parsePersonIdent(byte[] raw, int nameB) { Charset cs; try { cs = parseEncoding(raw); } catch (IllegalCharsetNameException | UnsupportedCharsetException e) { // Assume UTF-8 for person identities, usually this is correct. // If not decode() will fall back to the ISO-8859-1 encoding. cs = UTF_8; } final int emailB = nextLF(raw, nameB, '<'); final int emailE = nextLF(raw, emailB, '>'); if (emailB >= raw.length || raw[emailB] == '\n' || (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) return null; final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? emailB - 2 : emailB - 1; final String name = decode(cs, raw, nameB, nameEnd); final String email = decode(cs, raw, emailB, emailE - 1); // Start searching from end of line, as after first name-email pair, // another name-email pair may occur. We will ignore all kinds of // "junk" following the first email. // // We've to use (emailE - 1) for the case that raw[email] is LF, // otherwise we would run too far. "-2" is necessary to position // before the LF in case of LF termination resp. the penultimate // character if there is no trailing LF. final int tzBegin = lastIndexOfTrim(raw, ' ', nextLF(raw, emailE - 1) - 2) + 1; if (tzBegin <= emailE) // No time/zone, still valid return new PersonIdent(name, email, 0, 0); final int whenBegin = Math.max(emailE, lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); if (whenBegin >= tzBegin - 1) // No time/zone, still valid return new PersonIdent(name, email, 0, 0); final long when = parseLongBase10(raw, whenBegin, null); final int tz = parseTimeZoneOffset(raw, tzBegin); return new PersonIdent(name, email, when * 1000L, tz); }
Parse a name data (e.g. as within a reflog) into a PersonIdent.

When passing in a value for nameB callers should use the return value of author(byte[], int) or committer(byte[], int), as these methods provide the proper position within the buffer.

Params:
  • raw – the buffer to parse character data from.
  • nameB – first position of the identity information. This should be the first position after the space which delimits the header field name (e.g. "author" or "committer") from the rest of the identity line.
Returns:the parsed identity. Never null.
/** * Parse a name data (e.g. as within a reflog) into a PersonIdent. * <p> * When passing in a value for <code>nameB</code> callers should use the * return value of {@link #author(byte[], int)} or * {@link #committer(byte[], int)}, as these methods provide the proper * position within the buffer. * * @param raw * the buffer to parse character data from. * @param nameB * first position of the identity information. This should be the * first position after the space which delimits the header field * name (e.g. "author" or "committer") from the rest of the * identity line. * @return the parsed identity. Never null. */
public static PersonIdent parsePersonIdentOnly(final byte[] raw, final int nameB) { int stop = nextLF(raw, nameB); int emailB = nextLF(raw, nameB, '<'); int emailE = nextLF(raw, emailB, '>'); final String name; final String email; if (emailE < stop) { email = decode(raw, emailB, emailE - 1); } else { email = "invalid"; //$NON-NLS-1$ } if (emailB < stop) name = decode(raw, nameB, emailB - 2); else name = decode(raw, nameB, stop); final MutableInteger ptrout = new MutableInteger(); long when; int tz; if (emailE < stop) { when = parseLongBase10(raw, emailE + 1, ptrout); tz = parseTimeZoneOffset(raw, ptrout.value); } else { when = 0; tz = 0; } return new PersonIdent(name, email, when * 1000L, tz); }
Locate the end of a footer line key string.

If the region at raw[ptr] matches ^[A-Za-z0-9-]+: (e.g. "Signed-off-by: A. U. Thor\n") then this method returns the position of the first ':'.

If the region at raw[ptr] does not match ^[A-Za-z0-9-]+: then this method returns -1.

Params:
  • raw – buffer to scan.
  • ptr – first position within raw to consider as a footer line key.
Returns:position of the ':' which terminates the footer line key if this is otherwise a valid footer line key; otherwise -1.
/** * Locate the end of a footer line key string. * <p> * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. * "Signed-off-by: A. U. Thor\n") then this method returns the position of * the first ':'. * <p> * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} * then this method returns -1. * * @param raw * buffer to scan. * @param ptr * first position within raw to consider as a footer line key. * @return position of the ':' which terminates the footer line key if this * is otherwise a valid footer line key; otherwise -1. */
public static int endOfFooterLineKey(byte[] raw, int ptr) { try { for (;;) { final byte c = raw[ptr]; if (footerLineKeyChars[c] == 0) { if (c == ':') return ptr; return -1; } ptr++; } } catch (ArrayIndexOutOfBoundsException e) { return -1; } }
Decode a buffer under UTF-8, if possible. If the byte stream cannot be decoded that way, the platform default is tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
Params:
  • buffer – buffer to pull raw bytes from.
Returns:a string representation of the range [start,end), after decoding the region through the specified character set.
/** * Decode a buffer under UTF-8, if possible. * * If the byte stream cannot be decoded that way, the platform default is tried * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. * * @param buffer * buffer to pull raw bytes from. * @return a string representation of the range <code>[start,end)</code>, * after decoding the region through the specified character set. */
public static String decode(byte[] buffer) { return decode(buffer, 0, buffer.length); }
Decode a buffer under UTF-8, if possible. If the byte stream cannot be decoded that way, the platform default is tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
Params:
  • buffer – buffer to pull raw bytes from.
  • start – start position in buffer
  • end – one position past the last location within the buffer to take data from.
Returns:a string representation of the range [start,end), after decoding the region through the specified character set.
/** * Decode a buffer under UTF-8, if possible. * * If the byte stream cannot be decoded that way, the platform default is * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. * * @param buffer * buffer to pull raw bytes from. * @param start * start position in buffer * @param end * one position past the last location within the buffer to take * data from. * @return a string representation of the range <code>[start,end)</code>, * after decoding the region through the specified character set. */
public static String decode(final byte[] buffer, final int start, final int end) { return decode(UTF_8, buffer, start, end); }
Decode a buffer under the specified character set if possible. If the byte stream cannot be decoded that way, the platform default is tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
Params:
  • cs – character set to use when decoding the buffer.
  • buffer – buffer to pull raw bytes from.
Returns:a string representation of the range [start,end), after decoding the region through the specified character set.
/** * Decode a buffer under the specified character set if possible. * * If the byte stream cannot be decoded that way, the platform default is tried * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. * * @param cs * character set to use when decoding the buffer. * @param buffer * buffer to pull raw bytes from. * @return a string representation of the range <code>[start,end)</code>, * after decoding the region through the specified character set. */
public static String decode(Charset cs, byte[] buffer) { return decode(cs, buffer, 0, buffer.length); }
Decode a region of the buffer under the specified character set if possible. If the byte stream cannot be decoded that way, the platform default is tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
Params:
  • cs – character set to use when decoding the buffer.
  • buffer – buffer to pull raw bytes from.
  • start – first position within the buffer to take data from.
  • end – one position past the last location within the buffer to take data from.
Returns:a string representation of the range [start,end), after decoding the region through the specified character set.
/** * Decode a region of the buffer under the specified character set if possible. * * If the byte stream cannot be decoded that way, the platform default is tried * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. * * @param cs * character set to use when decoding the buffer. * @param buffer * buffer to pull raw bytes from. * @param start * first position within the buffer to take data from. * @param end * one position past the last location within the buffer to take * data from. * @return a string representation of the range <code>[start,end)</code>, * after decoding the region through the specified character set. */
public static String decode(final Charset cs, final byte[] buffer, final int start, final int end) { try { return decodeNoFallback(cs, buffer, start, end); } catch (CharacterCodingException e) { // Fall back to an ISO-8859-1 style encoding. At least all of // the bytes will be present in the output. // return extractBinaryString(buffer, start, end); } }
Decode a region of the buffer under the specified character set if possible. If the byte stream cannot be decoded that way, the platform default is tried and if that too fails, an exception is thrown.
Params:
  • cs – character set to use when decoding the buffer.
  • buffer – buffer to pull raw bytes from.
  • start – first position within the buffer to take data from.
  • end – one position past the last location within the buffer to take data from.
Throws:
Returns:a string representation of the range [start,end), after decoding the region through the specified character set.
/** * Decode a region of the buffer under the specified character set if * possible. * * If the byte stream cannot be decoded that way, the platform default is * tried and if that too fails, an exception is thrown. * * @param cs * character set to use when decoding the buffer. * @param buffer * buffer to pull raw bytes from. * @param start * first position within the buffer to take data from. * @param end * one position past the last location within the buffer to take * data from. * @return a string representation of the range <code>[start,end)</code>, * after decoding the region through the specified character set. * @throws java.nio.charset.CharacterCodingException * the input is not in any of the tested character sets. */
public static String decodeNoFallback(final Charset cs, final byte[] buffer, final int start, final int end) throws CharacterCodingException { ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); b.mark(); // Try our built-in favorite. The assumption here is that // decoding will fail if the data is not actually encoded // using that encoder. try { return decode(b, UTF_8); } catch (CharacterCodingException e) { b.reset(); } if (!cs.equals(UTF_8)) { // Try the suggested encoding, it might be right since it was // provided by the caller. try { return decode(b, cs); } catch (CharacterCodingException e) { b.reset(); } } // Try the default character set. A small group of people // might actually use the same (or very similar) locale. Charset defcs = Charset.defaultCharset(); if (!defcs.equals(cs) && !defcs.equals(UTF_8)) { try { return decode(b, defcs); } catch (CharacterCodingException e) { b.reset(); } } throw new CharacterCodingException(); }
Decode a region of the buffer under the ISO-8859-1 encoding. Each byte is treated as a single character in the 8859-1 character encoding, performing a raw binary->char conversion.
Params:
  • buffer – buffer to pull raw bytes from.
  • start – first position within the buffer to take data from.
  • end – one position past the last location within the buffer to take data from.
Returns:a string representation of the range [start,end).
/** * Decode a region of the buffer under the ISO-8859-1 encoding. * * Each byte is treated as a single character in the 8859-1 character * encoding, performing a raw binary-&gt;char conversion. * * @param buffer * buffer to pull raw bytes from. * @param start * first position within the buffer to take data from. * @param end * one position past the last location within the buffer to take * data from. * @return a string representation of the range <code>[start,end)</code>. */
public static String extractBinaryString(final byte[] buffer, final int start, final int end) { final StringBuilder r = new StringBuilder(end - start); for (int i = start; i < end; i++) r.append((char) (buffer[i] & 0xff)); return r.toString(); } private static String decode(ByteBuffer b, Charset charset) throws CharacterCodingException { final CharsetDecoder d = charset.newDecoder(); d.onMalformedInput(CodingErrorAction.REPORT); d.onUnmappableCharacter(CodingErrorAction.REPORT); return d.decode(b).toString(); }
Locate the position of the commit message body.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the commit buffer.
Returns:position of the user's message buffer.
/** * Locate the position of the commit message body. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the * commit buffer. * @return position of the user's message buffer. */
public static final int commitMessage(byte[] b, int ptr) { final int sz = b.length; if (ptr == 0) ptr += 46; // skip the "tree ..." line. while (ptr < sz && b[ptr] == 'p') ptr += 48; // skip this parent. // Skip any remaining header lines, ignoring what their actual // header line type is. This is identical to the logic for a tag. // return tagMessage(b, ptr); }
Locate the position of the tag message body.
Params:
  • b – buffer to scan.
  • ptr – position in buffer to start the scan at. Most callers should pass 0 to ensure the scan starts from the beginning of the tag buffer.
Returns:position of the user's message buffer.
/** * Locate the position of the tag message body. * * @param b * buffer to scan. * @param ptr * position in buffer to start the scan at. Most callers should * pass 0 to ensure the scan starts from the beginning of the tag * buffer. * @return position of the user's message buffer. */
public static final int tagMessage(byte[] b, int ptr) { final int sz = b.length; if (ptr == 0) ptr += 48; // skip the "object ..." line. while (ptr < sz && b[ptr] != '\n') ptr = nextLF(b, ptr); if (ptr < sz && b[ptr] == '\n') return ptr + 1; return -1; }
Locate the end of a paragraph.

A paragraph is ended by two consecutive LF bytes or CRLF pairs

Params:
  • b – buffer to scan.
  • start – position in buffer to start the scan at. Most callers will want to pass the first position of the commit message (as found by commitMessage(byte[], int).
Returns:position of the LF at the end of the paragraph; b.length if no paragraph end could be located.
/** * Locate the end of a paragraph. * <p> * A paragraph is ended by two consecutive LF bytes or CRLF pairs * * @param b * buffer to scan. * @param start * position in buffer to start the scan at. Most callers will * want to pass the first position of the commit message (as * found by {@link #commitMessage(byte[], int)}. * @return position of the LF at the end of the paragraph; * <code>b.length</code> if no paragraph end could be located. */
public static final int endOfParagraph(byte[] b, int start) { int ptr = start; final int sz = b.length; while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r')) ptr = nextLF(b, ptr); if (ptr > start && b[ptr - 1] == '\n') ptr--; if (ptr > start && b[ptr - 1] == '\r') ptr--; return ptr; }
Get last index of ch in raw, trimming spaces.
Params:
  • raw – buffer to scan.
  • ch – character to find.
  • pos – starting position.
Returns:last index of ch in raw, trimming spaces.
Since:4.1
/** * Get last index of {@code ch} in raw, trimming spaces. * * @param raw * buffer to scan. * @param ch * character to find. * @param pos * starting position. * @return last index of {@code ch} in raw, trimming spaces. * @since 4.1 */
public static int lastIndexOfTrim(byte[] raw, char ch, int pos) { while (pos >= 0 && raw[pos] == ' ') pos--; while (pos >= 0 && raw[pos] != ch) pos--; return pos; } private static Charset charsetForAlias(String name) { return encodingAliases.get(StringUtils.toLowerCase(name)); } private RawParseUtils() { // Don't create instances of a static only utility. } }