/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */

/*
 * This package is based on the work done by Keiron Liddle, Aftex Software
 * <keiron@aftexsw.com> to whom the Ant project is very grateful for his
 * great code.
 */

package org.apache.tools.bzip2;

import java.io.IOException;
import java.io.OutputStream;

An output stream that compresses into the BZip2 format (without the file header chars) into another stream.

The compression requires large amounts of memory. Thus you should call the close() method as soon as possible, to force CBZip2OutputStream to release the allocated memory.

You can shrink the amount of allocated memory and maybe raise the compression speed by choosing a lower blocksize, which in turn may cause a lower compression ratio. You can avoid unnecessary memory allocation by avoiding using a blocksize which is bigger than the size of the input.

You can compute the memory usage for compressing by the following formula:

400k + (9 * blocksize).

To get the memory required for decompression by CBZip2InputStream use

65k + (5 * blocksize).
Memory usage by blocksize
Blocksize Compression
memory usage
Decompression
memory usage
100k 1300k 565k
200k 2200k 1065k
300k 3100k 1565k
400k 4000k 2065k
500k 4900k 2565k
600k 5800k 3065k
700k 6700k 3565k
800k 7600k 4065k
900k 8500k 4565k

For decompression CBZip2InputStream allocates less memory if the bzipped input is smaller than one block.

Instances of this class are not threadsafe.

TODO: Update to BZip2 1.0.1

/**
 * An output stream that compresses into the BZip2 format (without the file
 * header chars) into another stream.
 *
 * <p>
 * The compression requires large amounts of memory. Thus you should call the
 * {@link #close() close()} method as soon as possible, to force
 * <code>CBZip2OutputStream</code> to release the allocated memory.
 * </p>
 *
 * <p>You can shrink the amount of allocated memory and maybe raise
 * the compression speed by choosing a lower blocksize, which in turn
 * may cause a lower compression ratio. You can avoid unnecessary
 * memory allocation by avoiding using a blocksize which is bigger
 * than the size of the input.</p>
 *
 * <p>You can compute the memory usage for compressing by the
 * following formula:</p>
 *
 * <pre>
 *       400k + (9 * blocksize)
 * </pre>
 *
 * <p>To get the memory required for decompression by {@link
 * CBZip2InputStream CBZip2InputStream} use</p>
 *
 * <pre>
 *       65k + (5 * blocksize)
 * </pre>
 *
 * <table style="border:1px solid black">
 * <caption>Memory usage by blocksize</caption>
 * <tr>
 * <th style="text-align:right">Blocksize</th>
 * <th style="text-align:right">Compression<br>memory usage</th>
 * <th style="text-align:right">Decompression<br>memory usage</th>
 * </tr>
 * <tr>
 * <td style="text-align:right">100k</td>
 * <td style="text-align:right">1300k</td>
 * <td style="text-align:right">565k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">200k</td>
 * <td style="text-align:right">2200k</td>
 * <td style="text-align:right">1065k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">300k</td>
 * <td style="text-align:right">3100k</td>
 * <td style="text-align:right">1565k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">400k</td>
 * <td style="text-align:right">4000k</td>
 * <td style="text-align:right">2065k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">500k</td>
 * <td style="text-align:right">4900k</td>
 * <td style="text-align:right">2565k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">600k</td>
 * <td style="text-align:right">5800k</td>
 * <td style="text-align:right">3065k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">700k</td>
 * <td style="text-align:right">6700k</td>
 * <td style="text-align:right">3565k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">800k</td>
 * <td style="text-align:right">7600k</td>
 * <td style="text-align:right">4065k</td>
 * </tr>
 * <tr>
 * <td style="text-align:right">900k</td>
 * <td style="text-align:right">8500k</td>
 * <td style="text-align:right">4565k</td>
 * </tr>
 * </table>
 *
 * <p>
 * For decompression <code>CBZip2InputStream</code> allocates less memory if the
 * bzipped input is smaller than one block.
 * </p>
 *
 * <p>
 * Instances of this class are not threadsafe.
 * </p>
 *
 * <p>
 * TODO: Update to BZip2 1.0.1
 * </p>
 *
 */
public class CBZip2OutputStream extends OutputStream implements BZip2Constants {
The minimum supported blocksize == 1.
/** * The minimum supported blocksize <code> == 1</code>. */
public static final int MIN_BLOCKSIZE = 1;
The maximum supported blocksize == 9.
/** * The maximum supported blocksize <code> == 9</code>. */
public static final int MAX_BLOCKSIZE = 9;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int SETMASK = (1 << 21);
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int CLEARMASK = (~SETMASK);
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int GREATER_ICOST = 15;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int LESSER_ICOST = 0;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int SMALL_THRESH = 20;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int DEPTH_THRESH = 10;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.
/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. */
protected static final int WORK_FACTOR = 30;
This constant is accessible by subclasses for historical purposes. If you don't know what it means then you don't need it.

If you are ever unlucky/improbable enough to get a stack overflow whilst sorting, increase the following constant and try again. In practice I have never seen the stack go above 27 elems, so the following limit seems very generous.

/** * This constant is accessible by subclasses for historical * purposes. If you don't know what it means then you don't need * it. * <p>If you are ever unlucky/improbable enough to get a stack * overflow whilst sorting, increase the following constant and * try again. In practice I have never seen the stack go above 27 * elems, so the following limit seems very generous.</p> */
protected static final int QSORT_STACK_SIZE = 1000;
Knuth's increments seem to work better than Incerpi-Sedgewick here. Possibly because the number of elems to sort is usually small, typically <= 20.
/** * Knuth's increments seem to work better than Incerpi-Sedgewick here. * Possibly because the number of elems to sort is usually small, typically * &lt;= 20. */
@SuppressWarnings("unused") private static final int[] INCS = {1, 4, 13, 40, 121, 364, 1093, 3280, 9841, 29524, 88573, 265720, 797161, 2391484};
This method is accessible by subclasses for historical purposes. If you don't know what it does then you don't need it.
Params:
  • len – char[]
  • freq – int[]
  • alphaSize – int
  • maxLen – int
/** * This method is accessible by subclasses for historical * purposes. If you don't know what it does then you don't need * it. * * @param len char[] * @param freq char[] * @param alphaSize int * @param maxLen int */
protected static void hbMakeCodeLengths(char[] len, int[] freq, int alphaSize, int maxLen) { /* * Nodes and heap entries run from 1. Entry 0 for both the heap and * nodes is a sentinel. */ final int[] heap = new int[MAX_ALPHA_SIZE * 2]; final int[] weight = new int[MAX_ALPHA_SIZE * 2]; final int[] parent = new int[MAX_ALPHA_SIZE * 2]; for (int i = alphaSize; --i >= 0;) { weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8; } for (boolean tooLong = true; tooLong;) { tooLong = false; int nNodes = alphaSize; int nHeap = 0; heap[0] = 0; weight[0] = 0; parent[0] = -2; for (int i = 1; i <= alphaSize; i++) { parent[i] = -1; nHeap++; heap[nHeap] = i; int zz = nHeap; int tmp = heap[zz]; while (weight[tmp] < weight[heap[zz >> 1]]) { heap[zz] = heap[zz >> 1]; zz >>= 1; } heap[zz] = tmp; } // assert (nHeap < (MAX_ALPHA_SIZE + 2)) : nHeap; while (nHeap > 1) { int n1 = heap[1]; heap[1] = heap[nHeap]; nHeap--; int yy = 0; int zz = 1; int tmp = heap[1]; while (true) { yy = zz << 1; if (yy > nHeap) { break; } if ((yy < nHeap) && (weight[heap[yy + 1]] < weight[heap[yy]])) { yy++; } if (weight[tmp] < weight[heap[yy]]) { break; } heap[zz] = heap[yy]; zz = yy; } heap[zz] = tmp; int n2 = heap[1]; heap[1] = heap[nHeap]; nHeap--; yy = 0; zz = 1; tmp = heap[1]; while (true) { yy = zz << 1; if (yy > nHeap) { break; } if ((yy < nHeap) && (weight[heap[yy + 1]] < weight[heap[yy]])) { yy++; } if (weight[tmp] < weight[heap[yy]]) { break; } heap[zz] = heap[yy]; zz = yy; } heap[zz] = tmp; nNodes++; parent[n1] = parent[n2] = nNodes; final int weight_n1 = weight[n1]; final int weight_n2 = weight[n2]; weight[nNodes] = (((weight_n1 & 0xffffff00) + (weight_n2 & 0xffffff00)) | (1 + (((weight_n1 & 0x000000ff) > (weight_n2 & 0x000000ff)) ? 
(weight_n1 & 0x000000ff) : (weight_n2 & 0x000000ff)) )); parent[nNodes] = -1; nHeap++; heap[nHeap] = nNodes; tmp = 0; zz = nHeap; tmp = heap[zz]; final int weight_tmp = weight[tmp]; while (weight_tmp < weight[heap[zz >> 1]]) { heap[zz] = heap[zz >> 1]; zz >>= 1; } heap[zz] = tmp; } // assert (nNodes < (MAX_ALPHA_SIZE * 2)) : nNodes; for (int i = 1; i <= alphaSize; i++) { int j = 0; int k = i; for (int parent_k; (parent_k = parent[k]) >= 0;) { k = parent_k; j++; } len[i - 1] = (char) j; if (j > maxLen) { tooLong = true; } } if (tooLong) { for (int i = 1; i < alphaSize; i++) { int j = weight[i] >> 8; j = 1 + (j >> 1); weight[i] = j << 8; } } } } private static void hbMakeCodeLengths(final byte[] len, final int[] freq, final Data dat, final int alphaSize, final int maxLen) { /* * Nodes and heap entries run from 1. Entry 0 for both the heap and * nodes is a sentinel. */ final int[] heap = dat.heap; final int[] weight = dat.weight; final int[] parent = dat.parent; for (int i = alphaSize; --i >= 0;) { weight[i + 1] = (freq[i] == 0 ? 
1 : freq[i]) << 8; } for (boolean tooLong = true; tooLong;) { tooLong = false; int nNodes = alphaSize; int nHeap = 0; heap[0] = 0; weight[0] = 0; parent[0] = -2; for (int i = 1; i <= alphaSize; i++) { parent[i] = -1; nHeap++; heap[nHeap] = i; int zz = nHeap; int tmp = heap[zz]; while (weight[tmp] < weight[heap[zz >> 1]]) { heap[zz] = heap[zz >> 1]; zz >>= 1; } heap[zz] = tmp; } while (nHeap > 1) { int n1 = heap[1]; heap[1] = heap[nHeap]; nHeap--; int yy = 0; int zz = 1; int tmp = heap[1]; while (true) { yy = zz << 1; if (yy > nHeap) { break; } if ((yy < nHeap) && (weight[heap[yy + 1]] < weight[heap[yy]])) { yy++; } if (weight[tmp] < weight[heap[yy]]) { break; } heap[zz] = heap[yy]; zz = yy; } heap[zz] = tmp; int n2 = heap[1]; heap[1] = heap[nHeap]; nHeap--; yy = 0; zz = 1; tmp = heap[1]; while (true) { yy = zz << 1; if (yy > nHeap) { break; } if ((yy < nHeap) && (weight[heap[yy + 1]] < weight[heap[yy]])) { yy++; } if (weight[tmp] < weight[heap[yy]]) { break; } heap[zz] = heap[yy]; zz = yy; } heap[zz] = tmp; nNodes++; parent[n1] = parent[n2] = nNodes; final int weight_n1 = weight[n1]; final int weight_n2 = weight[n2]; weight[nNodes] = ((weight_n1 & 0xffffff00) + (weight_n2 & 0xffffff00)) | (1 + (((weight_n1 & 0x000000ff) > (weight_n2 & 0x000000ff)) ? (weight_n1 & 0x000000ff) : (weight_n2 & 0x000000ff))); parent[nNodes] = -1; nHeap++; heap[nHeap] = nNodes; tmp = 0; zz = nHeap; tmp = heap[zz]; final int weight_tmp = weight[tmp]; while (weight_tmp < weight[heap[zz >> 1]]) { heap[zz] = heap[zz >> 1]; zz >>= 1; } heap[zz] = tmp; } for (int i = 1; i <= alphaSize; i++) { int j = 0; int k = i; for (int parent_k; (parent_k = parent[k]) >= 0;) { k = parent_k; j++; } len[i - 1] = (byte) j; if (j > maxLen) { tooLong = true; } } if (tooLong) { for (int i = 1; i < alphaSize; i++) { int j = weight[i] >> 8; j = 1 + (j >> 1); weight[i] = j << 8; } } } }
Index of the last char in the block, so the block size == last + 1.
/** * Index of the last char in the block, so the block size == last + 1. */
private int last;
Always in the range 1 .. 9 (enforced by the constructor). The current block size is 100000 * this number.
/**
 * Always in the range 1 .. 9 (enforced by the constructor). The current
 * block size is 100000 * this number.
 */
private final int blockSize100k; private int bsBuff; private int bsLive; private final CRC crc = new CRC(); private int nInUse; private int nMTF; private int currentChar = -1; private int runLength = 0; private int blockCRC; private int combinedCRC; private final int allowableBlockSize;
All memory intensive stuff.
/** * All memory intensive stuff. */
private Data data; private BlockSort blockSorter; private OutputStream out;
Chooses a blocksize based on the given length of the data to compress.
Params:
  • inputLength – The length of the data which will be compressed by CBZip2OutputStream.
Returns: The blocksize, between MIN_BLOCKSIZE and MAX_BLOCKSIZE both inclusive. For a non-positive inputLength (zero or negative) this method always returns MAX_BLOCKSIZE.
/**
 * Picks a blocksize that suits the amount of data to be compressed.
 *
 * @param inputLength
 *            The length of the data which will be compressed by
 *            <code>CBZip2OutputStream</code>.
 * @return The blocksize, between {@link #MIN_BLOCKSIZE} and
 *         {@link #MAX_BLOCKSIZE} both inclusive. For an
 *         <code>inputLength</code> that is zero or negative this method
 *         always returns <code>MAX_BLOCKSIZE</code>.
 */
public static int chooseBlockSize(long inputLength) {
    if (inputLength <= 0) {
        return MAX_BLOCKSIZE;
    }
    // one 100k unit per started 132000 bytes of input, capped at 9
    final long units = (inputLength / 132000) + 1;
    return (units < 9) ? (int) units : 9;
}
Constructs a new CBZip2OutputStream with a blocksize of 900k.

Attention: The caller is responsible to write the two BZip2 magic bytes "BZ" to the specified stream prior to calling this constructor.

Params:
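  • out – the destination stream.
Throws:
  • IOException – if an I/O error occurs in the specified stream.
  • NullPointerException – if out == null.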
/**
 * Constructs a new <code>CBZip2OutputStream</code> with a blocksize of 900k
 * by delegating to the two-argument constructor with
 * {@link #MAX_BLOCKSIZE}.
 *
 * <p>
 * <b>Attention: </b>The caller is responsible to write the two BZip2 magic
 * bytes <code>"BZ"</code> to the specified stream prior to calling this
 * constructor.
 * </p>
 *
 * @param out
 *            the destination stream.
 *
 * @throws IOException
 *             if an I/O error occurs in the specified stream.
 * @throws NullPointerException
 *             if <code>out == null</code>.
 */
public CBZip2OutputStream(final OutputStream out) throws IOException { this(out, MAX_BLOCKSIZE); }
Constructs a new CBZip2OutputStream with specified blocksize.

Attention: The caller is responsible to write the two BZip2 magic bytes "BZ" to the specified stream prior to calling this constructor.

Params:
  • out – the destination stream.
  • blockSize – the blockSize as 100k units.
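Throws:
  • IOException – if an I/O error occurs in the specified stream.
  • IllegalArgumentException – if (blockSize < 1) || (blockSize > 9).
  • NullPointerException – if out == null.
See Also:
  • MIN_BLOCKSIZE
  • MAX_BLOCKSIZE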
/**
 * Constructs a new <code>CBZip2OutputStream</code> with specified blocksize.
 *
 * <p>
 * <b>Attention: </b>The caller is responsible to write the two BZip2 magic
 * bytes <code>"BZ"</code> to the specified stream prior to calling this
 * constructor.
 * </p>
 *
 * <p>
 * All arguments are validated before any compression buffers are
 * allocated.
 * </p>
 *
 * @param out
 *            the destination stream.
 * @param blockSize
 *            the blockSize as 100k units.
 *
 * @throws IOException
 *             if an I/O error occurs in the specified stream.
 * @throws IllegalArgumentException
 *             if <code>(blockSize &lt; 1) || (blockSize &gt; 9)</code>.
 * @throws NullPointerException
 *             if <code>out == null</code>.
 *
 * @see #MIN_BLOCKSIZE
 * @see #MAX_BLOCKSIZE
 */
public CBZip2OutputStream(final OutputStream out, final int blockSize)
    throws IOException {
    super();

    if (blockSize < MIN_BLOCKSIZE) {
        throw new IllegalArgumentException("blockSize(" + blockSize
                                           + ") < 1");
    }
    if (blockSize > MAX_BLOCKSIZE) {
        throw new IllegalArgumentException("blockSize(" + blockSize
                                           + ") > 9");
    }
    // Fail fast: previously a null stream was only noticed inside init(),
    // after the large work buffers had already been allocated.
    if (out == null) {
        throw new NullPointerException("out == null");
    }

    this.blockSize100k = blockSize;
    this.out = out;

    /* 20 is just a paranoia constant */
    this.allowableBlockSize =
        (this.blockSize100k * BZip2Constants.baseBlockSize) - 20;
    init();
}
{@inheritDoc}
/**
 * {@inheritDoc}
 *
 * @throws IOException if this stream has already been finished or closed,
 *                     or if writing a completed block fails.
 */
@Override
public void write(final int b) throws IOException {
    if (this.out == null) {
        throw new IOException("closed");
    }
    write0(b);
}
Writes the current byte to the buffer, run-length encoding it if it has been repeated at least four times (the first step RLEs sequences of four identical bytes).

Flushes the current block before writing data if it is full.

"write to the buffer" means adding to data.buffer starting two steps "after" this.last - initially starting at index 1 (not 0) - and updating this.last to point to the last index written minus 1.

/**
 * Writes the current byte to the buffer, run-length encoding it
 * if it has been repeated at least four times (the first step
 * RLEs sequences of four identical bytes).
 *
 * <p>Flushes the current block before writing data if it is
 * full.</p>
 *
 * <p>"write to the buffer" means adding to data.buffer starting
 * two steps "after" this.last - initially starting at index 1
 * (not 0) - and updating this.last to point to the last index
 * written minus 1.</p>
 *
 * @throws IOException if emitting a full block fails.
 */
private void writeRun() throws IOException {
    final int lastShadow = this.last;

    if (lastShadow >= this.allowableBlockSize) {
        // block is full: emit it, start a new one and store the run there
        endBlock();
        initBlock();
        writeRun();
        return;
    }

    final int symbol = this.currentChar;
    final Data dataShadow = this.data;
    dataShadow.inUse[symbol] = true;
    final byte ch = (byte) symbol;

    final int run = this.runLength;
    this.crc.updateCRC(symbol, run);

    final byte[] block = dataShadow.block;
    final int first = lastShadow + 2;

    if (run >= 1 && run <= 3) {
        // short runs are stored verbatim
        for (int i = 0; i < run; i++) {
            block[first + i] = ch;
        }
        this.last = lastShadow + run;
    } else {
        // four copies of the byte followed by the number of extra repeats
        final int extra = run - 4;
        dataShadow.inUse[extra] = true;
        for (int i = 0; i < 4; i++) {
            block[first + i] = ch;
        }
        block[first + 4] = (byte) extra;
        this.last = lastShadow + 5;
    }
}
Overridden to close the stream.
/** * Overridden to close the stream. */
@Override protected void finalize() throws Throwable { finish(); super.finalize(); } public void finish() throws IOException { if (out != null) { try { if (this.runLength > 0) { writeRun(); } this.currentChar = -1; endBlock(); endCompression(); } finally { this.out = null; this.data = null; this.blockSorter = null; } } } @Override public void close() throws IOException { if (out != null) { OutputStream outShadow = this.out; finish(); outShadow.close(); } } @Override public void flush() throws IOException { OutputStream outShadow = this.out; if (outShadow != null) { outShadow.flush(); } } private void init() throws IOException { // write magic: done by caller who created this stream // this.out.write('B'); // this.out.write('Z'); this.data = new Data(this.blockSize100k); this.blockSorter = new BlockSort(this.data); /* * Write `magic' bytes h indicating file-format == huffmanised, followed * by a digit indicating blockSize100k. */ bsPutUByte('h'); bsPutUByte('0' + this.blockSize100k); this.combinedCRC = 0; initBlock(); } private void initBlock() { // blockNo++; this.crc.initialiseCRC(); this.last = -1; // ch = 0; boolean[] inUse = this.data.inUse; for (int i = 256; --i >= 0;) { inUse[i] = false; } } private void endBlock() throws IOException { this.blockCRC = this.crc.getFinalCRC(); this.combinedCRC = (this.combinedCRC << 1) | (this.combinedCRC >>> 31); this.combinedCRC ^= this.blockCRC; // empty block at end of file if (this.last == -1) { return; } /* sort the block and establish posn of original string */ blockSort(); /* * A 6-byte block header, the value chosen arbitrarily as 0x314159265359 * :-). A 32 bit value does not really give a strong enough guarantee * that the value will not appear by chance in the compressed * datastream. Worst-case probability of this event, for a 900k block, * is about 2.0e-3 for 32 bits, 1.0e-5 for 40 bits and 4.0e-8 for 48 * bits. For a compressed file of size 100Gb -- about 100000 blocks -- * only a 48-bit marker will do. 
NB: normal compression/ decompression * do not rely on these statistical properties. They are only important * when trying to recover blocks from damaged files. */ bsPutUByte(0x31); bsPutUByte(0x41); bsPutUByte(0x59); bsPutUByte(0x26); bsPutUByte(0x53); bsPutUByte(0x59); /* Now the block's CRC, so it is in a known place. */ bsPutInt(this.blockCRC); /* Now a single bit indicating no randomisation. */ bsW(1, 0); /* Finally, block's contents proper. */ moveToFrontCodeAndSend(); } private void endCompression() throws IOException { /* * Now another magic 48-bit number, 0x177245385090, to indicate the end * of the last block. (sqrt(pi), if you want to know. I did want to use * e, but it contains too much repetition -- 27 18 28 18 28 46 -- for me * to feel statistically comfortable. Call me paranoid.) */ bsPutUByte(0x17); bsPutUByte(0x72); bsPutUByte(0x45); bsPutUByte(0x38); bsPutUByte(0x50); bsPutUByte(0x90); bsPutInt(this.combinedCRC); bsFinishedWithStream(); }
Returns the blocksize parameter specified at construction time.
Returns:int
/**
 * Returns the blocksize parameter specified at construction time.
 *
 * @return the blocksize in units of 100k, as passed to the constructor.
 */
public final int getBlockSize() {
    return this.blockSize100k;
}

/**
 * Compresses <code>len</code> bytes of <code>buf</code>, starting at
 * index <code>offs</code>.
 *
 * @param buf the data to compress.
 * @param offs index of the first byte to write.
 * @param len number of bytes to write.
 *
 * @throws IndexOutOfBoundsException
 *             if <code>offs</code> or <code>len</code> is negative or the
 *             range reaches beyond the end of <code>buf</code>.
 * @throws IOException
 *             if this stream has been finished or closed, or if writing a
 *             completed block fails.
 */
@Override
public void write(final byte[] buf, int offs, final int len)
    throws IOException {
    if (offs < 0) {
        throw new IndexOutOfBoundsException("offs(" + offs + ") < 0.");
    }
    if (len < 0) {
        throw new IndexOutOfBoundsException("len(" + len + ") < 0.");
    }
    // Compare against the remaining space instead of adding offs and len:
    // the sum can overflow int and would then slip past the check.
    if (len > buf.length - offs) {
        throw new IndexOutOfBoundsException("offs(" + offs + ") + len("
                                            + len + ") > buf.length("
                                            + buf.length + ").");
    }
    if (this.out == null) {
        throw new IOException("stream closed");
    }

    for (int hi = offs + len; offs < hi;) {
        write0(buf[offs++]);
    }
}
Keeps track of the last bytes written and implicitly performs run-length encoding as the first step of the bzip2 algorithm.
/**
 * Keeps track of the last bytes written and implicitly performs
 * run-length encoding as the first step of the bzip2 algorithm.
 *
 * <p>A {@code currentChar} of -1 means that no run is in progress. A
 * run is handed to {@code writeRun()} when a different byte arrives or
 * as soon as its length exceeds 254.</p>
 *
 * @param b the byte to add; only the low 8 bits are used
 * @throws IOException if {@code writeRun()} fails
 */
private void write0(int b) throws IOException {
    if (this.currentChar != -1) {
        b &= 0xff;
        if (this.currentChar == b) {
            if (++this.runLength > 254) {
                // Run reached its maximum length: flush it and start afresh.
                writeRun();
                this.currentChar = -1;
                this.runLength = 0;
            }
            // else nothing to do
        } else {
            // A different byte ends the current run and starts a new one.
            writeRun();
            this.runLength = 1;
            this.currentChar = b;
        }
    } else {
        // No run in progress: this byte starts one.
        this.currentChar = b & 0xff;
        this.runLength++;
    }
}

/**
 * Assigns a code value to every symbol from its code length.
 *
 * <p>Lengths are visited from {@code minLen} to {@code maxLen}. Within
 * one length, symbols receive consecutive values in index order; the
 * running value is shifted left by one bit before the next length.</p>
 *
 * @param code      receives the code value of each symbol
 * @param length    code length of each symbol, read as unsigned bytes
 * @param minLen    smallest length to assign
 * @param maxLen    largest length to assign
 * @param alphaSize number of symbols to process
 */
private static void hbAssignCodes(final int[] code, final byte[] length,
                                  final int minLen, final int maxLen,
                                  final int alphaSize) {
    int vec = 0;
    for (int n = minLen; n <= maxLen; n++) {
        for (int i = 0; i < alphaSize; i++) {
            if ((length[i] & 0xff) == n) {
                code[i] = vec;
                vec++;
            }
        }
        vec <<= 1;
    }
}

/**
 * Writes out the bits still pending in the bit buffer, one byte at a
 * time taken from the top of {@code bsBuff}.
 *
 * @throws IOException if writing to the underlying stream fails
 */
private void bsFinishedWithStream() throws IOException {
    while (this.bsLive > 0) {
        int ch = this.bsBuff >> 24;
        this.out.write(ch); // write 8-bit
        this.bsBuff <<= 8;
        this.bsLive -= 8;
    }
}

/**
 * Appends {@code n} bits holding the value {@code v} to the bit buffer.
 *
 * <p>Complete bytes are first written from the top of the buffer, so
 * that fewer than 8 bits remain pending; {@code v} is then placed
 * directly below them.</p>
 *
 * @param n number of bits to append
 * @param v value to append
 * @throws IOException if writing to the underlying stream fails
 */
private void bsW(final int n, final int v) throws IOException {
    final OutputStream outShadow = this.out;
    int bsLiveShadow = this.bsLive;
    int bsBuffShadow = this.bsBuff;

    while (bsLiveShadow >= 8) {
        outShadow.write(bsBuffShadow >> 24); // write 8-bit
        bsBuffShadow <<= 8;
        bsLiveShadow -= 8;
    }

    // NOTE(review): v is not masked to n bits here; callers presumably
    // pass values that fit -- confirm before adding new call sites.
    this.bsBuff = bsBuffShadow | (v << (32 - bsLiveShadow - n));
    this.bsLive = bsLiveShadow + n;
}

/**
 * Appends the value {@code c} as 8 bits.
 *
 * @param c the value to write
 * @throws IOException if writing to the underlying stream fails
 */
private void bsPutUByte(final int c) throws IOException {
    bsW(8, c);
}

/**
 * Appends a 32 bit value as four bytes, most significant byte first.
 *
 * @param u the value to write
 * @throws IOException if writing to the underlying stream fails
 */
private void bsPutInt(final int u) throws IOException {
    bsW(8, (u >> 24) & 0xff);
    bsW(8, (u >> 16) & 0xff);
    bsW(8, (u >> 8) & 0xff);
    bsW(8, u & 0xff);
}

/**
 * Builds the coding tables for the current block and writes the
 * mapping table, the selectors, the coding tables and the coded block
 * data, in that order.
 *
 * @throws IOException if writing to the underlying stream fails
 */
private void sendMTFValues() throws IOException {
    final byte[][] len = this.data.sendMTFValues_len;
    // nInUse byte symbols plus two more (see RUNA/RUNB and the end-of-block
    // symbol used in generateMTFValues).
    final int alphaSize = this.nInUse + 2;

    for (int t = N_GROUPS; --t >= 0;) {
        byte[] len_t = len[t];
        for (int v = alphaSize; --v >= 0;) {
            len_t[v] = GREATER_ICOST;
        }
    }

    /* Decide how many coding tables to use */
    // assert (this.nMTF > 0) : this.nMTF;
    final int nGroups = (this.nMTF < 200) ? 2 : (this.nMTF < 600) ? 3
        : (this.nMTF < 1200) ? 4 : (this.nMTF < 2400) ? 5 : 6;

    /* Generate an initial set of coding tables */
    sendMTFValues0(nGroups, alphaSize);

    /*
     * Iterate up to N_ITERS times to improve the tables.
     */
    final int nSelectors = sendMTFValues1(nGroups, alphaSize);

    /* Compute MTF values for the selectors. */
    sendMTFValues2(nGroups, nSelectors);

    /* Assign actual codes for the tables. */
    sendMTFValues3(nGroups, alphaSize);

    /* Transmit the mapping table. */
    sendMTFValues4();

    /* Now the selectors. */
    sendMTFValues5(nGroups, nSelectors);

    /* Now the coding tables. */
    sendMTFValues6(nGroups, alphaSize);

    /* And finally, the block data proper */
    sendMTFValues7();
}

/**
 * Generates the initial coding tables: the alphabet is cut into
 * {@code nGroups} consecutive symbol ranges of roughly equal total
 * frequency, and each table gets LESSER_ICOST for the symbols of its
 * range and GREATER_ICOST for all others.
 *
 * @param nGroups   number of coding tables to initialise
 * @param alphaSize number of symbols in the alphabet
 */
private void sendMTFValues0(final int nGroups, final int alphaSize) {
    final byte[][] len = this.data.sendMTFValues_len;
    final int[] mtfFreq = this.data.mtfFreq;

    int remF = this.nMTF;   // frequency not yet assigned to a range
    int gs = 0;             // first symbol of the current range

    for (int nPart = nGroups; nPart > 0; nPart--) {
        final int tFreq = remF / nPart;   // target frequency for this range
        int ge = gs - 1;                  // last symbol of the current range
        int aFreq = 0;                    // frequency accumulated so far

        while (aFreq < tFreq && ge < alphaSize - 1) {
            aFreq += mtfFreq[++ge];
        }

        // For every other inner range, give the last symbol back.
        if (ge > gs && nPart != nGroups && nPart != 1
            && (nGroups - nPart & 1) != 0) {
            aFreq -= mtfFreq[ge--];
        }

        final byte[] len_np = len[nPart - 1];
        for (int v = alphaSize; --v >= 0;) {
            if (v >= gs && v <= ge) {
                len_np[v] = LESSER_ICOST;
            } else {
                len_np[v] = GREATER_ICOST;
            }
        }

        gs = ge + 1;
        remF -= aFreq;
    }
}

/**
 * Refines the coding tables over N_ITERS iterations.
 *
 * <p>In each iteration the symbols are processed in groups of up to
 * G_SIZE; every group is assigned the table that codes it most cheaply,
 * the choice is recorded in the selector array, and the tables are then
 * rebuilt from the frequencies collected per table.</p>
 *
 * @param nGroups   number of coding tables in use
 * @param alphaSize number of symbols in the alphabet
 * @return the number of selectors recorded in the last iteration
 */
private int sendMTFValues1(final int nGroups, final int alphaSize) {
    final Data dataShadow = this.data;
    final int[][] rfreq = dataShadow.sendMTFValues_rfreq;
    final int[] fave = dataShadow.sendMTFValues_fave;
    final short[] cost = dataShadow.sendMTFValues_cost;
    final char[] sfmap = dataShadow.sfmap;
    final byte[] selector = dataShadow.selector;
    final byte[][] len = dataShadow.sendMTFValues_len;
    final byte[] len_0 = len[0];
    final byte[] len_1 = len[1];
    final byte[] len_2 = len[2];
    final byte[] len_3 = len[3];
    final byte[] len_4 = len[4];
    final byte[] len_5 = len[5];
    final int nMTFShadow = this.nMTF;

    int nSelectors = 0;

    for (int iter = 0; iter < N_ITERS; iter++) {
        // Clear the per-table counters for this iteration.
        for (int t = nGroups; --t >= 0;) {
            fave[t] = 0;
            int[] rfreqt = rfreq[t];
            for (int i = alphaSize; --i >= 0;) {
                rfreqt[i] = 0;
            }
        }

        nSelectors = 0;

        for (int gs = 0; gs < this.nMTF;) {
            /* Set group start & end marks. */

            /*
             * Calculate the cost of this group as coded by each of the
             * coding tables.
             */

            final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1);

            if (nGroups == N_GROUPS) {
                // unrolled version of the else-block

                short cost0 = 0;
                short cost1 = 0;
                short cost2 = 0;
                short cost3 = 0;
                short cost4 = 0;
                short cost5 = 0;

                for (int i = gs; i <= ge; i++) {
                    final int icv = sfmap[i];
                    cost0 += len_0[icv] & 0xff;
                    cost1 += len_1[icv] & 0xff;
                    cost2 += len_2[icv] & 0xff;
                    cost3 += len_3[icv] & 0xff;
                    cost4 += len_4[icv] & 0xff;
                    cost5 += len_5[icv] & 0xff;
                }

                cost[0] = cost0;
                cost[1] = cost1;
                cost[2] = cost2;
                cost[3] = cost3;
                cost[4] = cost4;
                cost[5] = cost5;

            } else {
                for (int t = nGroups; --t >= 0;) {
                    cost[t] = 0;
                }

                for (int i = gs; i <= ge; i++) {
                    final int icv = sfmap[i];
                    for (int t = nGroups; --t >= 0;) {
                        cost[t] += len[t][icv] & 0xff;
                    }
                }
            }

            /*
             * Find the coding table which is best for this group, and
             * record its identity in the selector table.
             */
            int bt = -1;
            for (int t = nGroups, bc = 999999999; --t >= 0;) {
                final int cost_t = cost[t];
                if (cost_t < bc) {
                    bc = cost_t;
                    bt = t;
                }
            }

            fave[bt]++;
            selector[nSelectors] = (byte) bt;
            nSelectors++;

            /*
             * Increment the symbol frequencies for the selected table.
             */
            final int[] rfreq_bt = rfreq[bt];
            for (int i = gs; i <= ge; i++) {
                rfreq_bt[sfmap[i]]++;
            }

            gs = ge + 1;
        }

        /*
         * Recompute the tables based on the accumulated frequencies.
         */
        for (int t = 0; t < nGroups; t++) {
            // NOTE(review): 20 is presumably the maximum code length
            // (cf. the commented-out assertion in sendMTFValues3).
            hbMakeCodeLengths(len[t], rfreq[t], this.data, alphaSize, 20);
        }
    }

    return nSelectors;
}

/**
 * Move-to-front codes the selectors: for each selector, stores in
 * {@code selectorMtf} the position at which its table index was found
 * in the list of table indices, and moves that index to the front.
 *
 * @param nGroups    number of coding tables in use
 * @param nSelectors number of selectors to process
 */
private void sendMTFValues2(final int nGroups, final int nSelectors) {
    // assert (nGroups < 8) : nGroups;

    final Data dataShadow = this.data;
    byte[] pos = dataShadow.sendMTFValues2_pos;

    for (int i = nGroups; --i >= 0;) {
        pos[i] = (byte) i;
    }

    for (int i = 0; i < nSelectors; i++) {
        final byte ll_i = dataShadow.selector[i];
        byte tmp = pos[0];
        int j = 0;

        // Shift entries back by one until the wanted index is found.
        while (ll_i != tmp) {
            j++;
            byte tmp2 = tmp;
            tmp = pos[j];
            pos[j] = tmp2;
        }

        pos[0] = tmp;
        dataShadow.selectorMtf[i] = (byte) j;
    }
}

/**
 * Determines the smallest and largest code length of each coding table
 * and assigns the code values via {@code hbAssignCodes}.
 *
 * @param nGroups   number of coding tables in use
 * @param alphaSize number of symbols in the alphabet
 */
private void sendMTFValues3(final int nGroups, final int alphaSize) {
    int[][] code = this.data.sendMTFValues_code;
    byte[][] len = this.data.sendMTFValues_len;

    for (int t = 0; t < nGroups; t++) {
        int minLen = 32;
        int maxLen = 0;
        final byte[] len_t = len[t];
        for (int i = alphaSize; --i >= 0;) {
            final int l = len_t[i] & 0xff;
            if (l > maxLen) {
                maxLen = l;
            }
            if (l < minLen) {
                minLen = l;
            }
        }

        // assert (maxLen <= 20) : maxLen;
        // assert (minLen >= 1) : minLen;

        hbAssignCodes(code[t], len[t], minLen, maxLen, alphaSize);
    }
}

/**
 * Writes the table of byte values used in the block: 16 bits telling
 * which ranges of 16 consecutive values contain a used value, followed
 * by 16 bits for each such range, one bit per value.
 *
 * @throws IOException if writing to the underlying stream fails
 */
private void sendMTFValues4() throws IOException {
    final boolean[] inUse = this.data.inUse;
    final boolean[] inUse16 = this.data.sentMTFValues4_inUse16;

    for (int i = 16; --i >= 0;) {
        inUse16[i] = false;
        final int i16 = i * 16;
        for (int j = 16; --j >= 0;) {
            if (inUse[i16 + j]) {
                inUse16[i] = true;
            }
        }
    }

    for (int i = 0; i < 16; i++) {
        bsW(1, inUse16[i] ? 1 : 0);
    }

    // From here on the bit buffer is handled through local copies, which
    // are stored back into the fields at the end of the method.
    final OutputStream outShadow = this.out;
    int bsLiveShadow = this.bsLive;
    int bsBuffShadow = this.bsBuff;

    for (int i = 0; i < 16; i++) {
        if (inUse16[i]) {
            final int i16 = i * 16;
            for (int j = 0; j < 16; j++) {
                // inlined: bsW(1, inUse[i16 + j] ? 1 : 0);
                while (bsLiveShadow >= 8) {
                    outShadow.write(bsBuffShadow >> 24); // write 8-bit
                    bsBuffShadow <<= 8;
                    bsLiveShadow -= 8;
                }
                if (inUse[i16 + j]) {
                    bsBuffShadow |= 1 << (32 - bsLiveShadow - 1);
                }
                bsLiveShadow++;
            }
        }
    }

    this.bsBuff = bsBuffShadow;
    this.bsLive = bsLiveShadow;
}

/**
 * Writes the number of coding tables (3 bits), the number of selectors
 * (15 bits) and then every move-to-front coded selector as that many
 * 1 bits followed by a single 0 bit.
 *
 * @param nGroups    number of coding tables in use
 * @param nSelectors number of selectors to write
 * @throws IOException if writing to the underlying stream fails
 */
private void sendMTFValues5(final int nGroups, final int nSelectors)
    throws IOException {
    bsW(3, nGroups);
    bsW(15, nSelectors);

    final OutputStream outShadow = this.out;
    final byte[] selectorMtf = this.data.selectorMtf;

    int bsLiveShadow = this.bsLive;
    int bsBuffShadow = this.bsBuff;

    for (int i = 0; i < nSelectors; i++) {
        for (int j = 0, hj = selectorMtf[i] & 0xff; j < hj; j++) {
            // inlined: bsW(1, 1);
            while (bsLiveShadow >= 8) {
                outShadow.write(bsBuffShadow >> 24);
                bsBuffShadow <<= 8;
                bsLiveShadow -= 8;
            }
            bsBuffShadow |= 1 << (32 - bsLiveShadow - 1);
            bsLiveShadow++;
        }

        // inlined: bsW(1, 0);
        while (bsLiveShadow >= 8) {
            outShadow.write(bsBuffShadow >> 24);
            bsBuffShadow <<= 8;
            bsLiveShadow -= 8;
        }
        // bsBuffShadow |= 0 << (32 - bsLiveShadow - 1);
        bsLiveShadow++;
    }

    this.bsBuff = bsBuffShadow;
    this.bsLive = bsLiveShadow;
}

/**
 * Writes the code lengths of every coding table.
 *
 * <p>Each table starts with the length of symbol 0 in 5 bits. For every
 * symbol the running length is then adjusted to the symbol's length
 * with the 2-bit values 2 (increment) and 3 (decrement), and a single
 * 0 bit ends the symbol.</p>
 *
 * @param nGroups   number of coding tables in use
 * @param alphaSize number of symbols in the alphabet
 * @throws IOException if writing to the underlying stream fails
 */
private void sendMTFValues6(final int nGroups, final int alphaSize)
    throws IOException {
    final byte[][] len = this.data.sendMTFValues_len;
    final OutputStream outShadow = this.out;

    int bsLiveShadow = this.bsLive;
    int bsBuffShadow = this.bsBuff;

    for (int t = 0; t < nGroups; t++) {
        byte[] len_t = len[t];
        int curr = len_t[0] & 0xff;

        // inlined: bsW(5, curr);
        while (bsLiveShadow >= 8) {
            outShadow.write(bsBuffShadow >> 24); // write 8-bit
            bsBuffShadow <<= 8;
            bsLiveShadow -= 8;
        }
        bsBuffShadow |= curr << (32 - bsLiveShadow - 5);
        bsLiveShadow += 5;

        for (int i = 0; i < alphaSize; i++) {
            int lti = len_t[i] & 0xff;
            while (curr < lti) {
                // inlined: bsW(2, 2);
                while (bsLiveShadow >= 8) {
                    outShadow.write(bsBuffShadow >> 24); // write 8-bit
                    bsBuffShadow <<= 8;
                    bsLiveShadow -= 8;
                }
                bsBuffShadow |= 2 << (32 - bsLiveShadow - 2);
                bsLiveShadow += 2;

                curr++; /* 10 */
            }

            while (curr > lti) {
                // inlined: bsW(2, 3);
                while (bsLiveShadow >= 8) {
                    outShadow.write(bsBuffShadow >> 24); // write 8-bit
                    bsBuffShadow <<= 8;
                    bsLiveShadow -= 8;
                }
                bsBuffShadow |= 3 << (32 - bsLiveShadow - 2);
                bsLiveShadow += 2;

                curr--; /* 11 */
            }

            // inlined: bsW(1, 0);
            while (bsLiveShadow >= 8) {
                outShadow.write(bsBuffShadow >> 24); // write 8-bit
                bsBuffShadow <<= 8;
                bsLiveShadow -= 8;
            }
            // bsBuffShadow |= 0 << (32 - bsLiveShadow - 1);
            bsLiveShadow++;
        }
    }

    this.bsBuff = bsBuffShadow;
    this.bsLive = bsLiveShadow;
}

/**
 * Writes the block data proper: the symbols in {@code sfmap} are
 * processed in groups of up to G_SIZE, each group being coded with the
 * table named by the next selector.
 *
 * @throws IOException if writing to the underlying stream fails
 */
private void sendMTFValues7() throws IOException {
    final Data dataShadow = this.data;
    final byte[][] len = dataShadow.sendMTFValues_len;
    final int[][] code = dataShadow.sendMTFValues_code;
    final OutputStream outShadow = this.out;
    final byte[] selector = dataShadow.selector;
    final char[] sfmap = dataShadow.sfmap;
    final int nMTFShadow = this.nMTF;

    int selCtr = 0;

    int bsLiveShadow = this.bsLive;
    int bsBuffShadow = this.bsBuff;

    for (int gs = 0; gs < nMTFShadow;) {
        final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1);
        final int selector_selCtr = selector[selCtr] & 0xff;
        final int[] code_selCtr = code[selector_selCtr];
        final byte[] len_selCtr = len[selector_selCtr];

        while (gs <= ge) {
            final int sfmap_i = sfmap[gs];

            //
            // inlined: bsW(len_selCtr[sfmap_i] & 0xff,
            // code_selCtr[sfmap_i]);
            //
            while (bsLiveShadow >= 8) {
                outShadow.write(bsBuffShadow >> 24);
                bsBuffShadow <<= 8;
                bsLiveShadow -= 8;
            }
            final int n = len_selCtr[sfmap_i] & 0xFF;
            bsBuffShadow |= code_selCtr[sfmap_i] << (32 - bsLiveShadow - n);
            bsLiveShadow += n;

            gs++;
        }

        gs = ge + 1;
        selCtr++;
    }

    this.bsBuff = bsBuffShadow;
    this.bsLive = bsLiveShadow;
}

/**
 * Writes the 24 bit {@code origPtr} of the sorted block, then generates
 * and sends the move-to-front values.
 *
 * @throws IOException if writing to the underlying stream fails
 */
private void moveToFrontCodeAndSend() throws IOException {
    bsW(24, this.data.origPtr);
    generateMTFValues();
    sendMTFValues();
}

/**
 * Sorts the current block by delegating to the block sorter.
 */
private void blockSort() {
    blockSorter.blockSort(data, last);
}

/**
 * Performs Move-To-Front on the Burrows-Wheeler transformed
 * buffer, storing the MTFed data in data.sfmap in RUNA/RUNB
 * run-length-encoded form.
 *
 * <p>Keeps track of byte frequencies in data.mtfFreq at the same time.</p>
 */
private void generateMTFValues() {
    final int lastShadow = this.last;
    final Data dataShadow = this.data;
    final boolean[] inUse = dataShadow.inUse;
    final byte[] block = dataShadow.block;
    final int[] fmap = dataShadow.fmap;
    final char[] sfmap = dataShadow.sfmap;
    final int[] mtfFreq = dataShadow.mtfFreq;
    final byte[] unseqToSeq = dataShadow.unseqToSeq;
    final byte[] yy = dataShadow.generateMTFValues_yy;

    // make maps
    // (each byte value in use gets a consecutive index)
    int nInUseShadow = 0;
    for (int i = 0; i < 256; i++) {
        if (inUse[i]) {
            unseqToSeq[i] = (byte) nInUseShadow;
            nInUseShadow++;
        }
    }
    this.nInUse = nInUseShadow;

    // Symbol written once at the very end of the block.
    final int eob = nInUseShadow + 1;

    for (int i = eob; i >= 0; i--) {
        mtfFreq[i] = 0;
    }

    // The move-to-front list starts out in index order.
    for (int i = nInUseShadow; --i >= 0;) {
        yy[i] = (byte) i;
    }

    int wr = 0;      // next write position in sfmap
    int zPend = 0;   // pending count of consecutive front-of-list hits

    for (int i = 0; i <= lastShadow; i++) {
        final byte ll_i = unseqToSeq[block[fmap[i]] & 0xff];
        byte tmp = yy[0];
        int j = 0;

        // Find the symbol in the list, shifting the entries before it back.
        while (ll_i != tmp) {
            j++;
            byte tmp2 = tmp;
            tmp = yy[j];
            yy[j] = tmp2;
        }
        yy[0] = tmp;

        if (j == 0) {
            zPend++;
        } else {
            if (zPend > 0) {
                // Emit the pending count as a sequence of RUNA/RUNB symbols.
                zPend--;
                while (true) {
                    if ((zPend & 1) == 0) {
                        sfmap[wr] = RUNA;
                        wr++;
                        mtfFreq[RUNA]++;
                    } else {
                        sfmap[wr] = RUNB;
                        wr++;
                        mtfFreq[RUNB]++;
                    }

                    if (zPend >= 2) {
                        zPend = (zPend - 2) >> 1;
                    } else {
                        break;
                    }
                }
                zPend = 0;
            }
            // Non-zero positions are stored shifted up by one.
            sfmap[wr] = (char) (j + 1);
            wr++;
            mtfFreq[j + 1]++;
        }
    }

    // Emit a count still pending at the end of the block.
    if (zPend > 0) {
        zPend--;
        while (true) {
            if ((zPend & 1) == 0) {
                sfmap[wr] = RUNA;
                wr++;
                mtfFreq[RUNA]++;
            } else {
                sfmap[wr] = RUNB;
                wr++;
                mtfFreq[RUNB]++;
            }

            if (zPend >= 2) {
                zPend = (zPend - 2) >> 1;
            } else {
                break;
            }
        }
    }

    sfmap[wr] = (char) eob;
    mtfFreq[eob]++;
    this.nMTF = wr + 1;
}

/**
 * Holds the working arrays used while compressing a block. The three
 * large arrays are sized from the block size in the constructor; the
 * byte counts in the trailing comments are the original author's
 * figures.
 */
static final class Data {

    // with blockSize 900k
    /* maps unsigned byte => "does it occur in block" */
    final boolean[] inUse = new boolean[256]; // 256 byte
    final byte[] unseqToSeq = new byte[256]; // 256 byte
    final int[] mtfFreq = new int[MAX_ALPHA_SIZE]; // 1032 byte
    final byte[] selector = new byte[MAX_SELECTORS]; // 18002 byte
    final byte[] selectorMtf = new byte[MAX_SELECTORS]; // 18002 byte

    final byte[] generateMTFValues_yy = new byte[256]; // 256 byte
    final byte[][] sendMTFValues_len = new byte[N_GROUPS][MAX_ALPHA_SIZE]; // 1548
    // byte
    final int[][] sendMTFValues_rfreq = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192
    // byte
    final int[] sendMTFValues_fave = new int[N_GROUPS]; // 24 byte
    final short[] sendMTFValues_cost = new short[N_GROUPS]; // 12 byte
    final int[][] sendMTFValues_code = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192
    // byte
    final byte[] sendMTFValues2_pos = new byte[N_GROUPS]; // 6 byte
    final boolean[] sentMTFValues4_inUse16 = new boolean[16]; // 16 byte

    final int[] heap = new int[MAX_ALPHA_SIZE + 2]; // 1040 byte
    final int[] weight = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte
    final int[] parent = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte

    // ------------
    // 333408 byte

    /* holds the RLEd block of original data starting at index 1.
     * After sorting the last byte added to the buffer is at index
     * 0. */
    final byte[] block; // 900021 byte
    /* maps index in Burrows-Wheeler transformed block => index of
     * byte in original block */
    final int[] fmap; // 3600000 byte
    final char[] sfmap; // 3600000 byte
    // ------------
    // 8433529 byte
    // ============

    // NOTE(review): the unindented text lines below are not Java; they
    // look like a rendered copy of the Javadoc that follows them.
Index of original line in Burrows-Wheeler table.

This is the index in fmap that points to the last byte of the original data.

    /**
     * Index of original line in Burrows-Wheeler table.
     *
     * <p>This is the index in fmap that points to the last byte
     * of the original data.</p>
     */
    int origPtr;

    /**
     * Allocates the block-size dependent arrays.
     *
     * @param blockSize100k the block size as a multiple of
     *        {@code BZip2Constants.baseBlockSize}
     */
    Data(int blockSize100k) {
        super();

        final int n = blockSize100k * BZip2Constants.baseBlockSize;
        this.block = new byte[(n + 1 + NUM_OVERSHOOT_BYTES)];
        this.fmap = new int[n];
        this.sfmap = new char[2 * n];
    }

}

}