/*
 * Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
 * Use of this file is governed by the BSD 3-clause license that
 * can be found in the LICENSE.txt file in the project root.
 */

package org.antlr.v4.runtime;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

This class represents the primary interface for creating CharStreams from a variety of sources as of 4.7. The motivation was to support Unicode code points > U+FFFF. ANTLRInputStream and ANTLRFileStream are now deprecated in favor of the streams created by this interface. DEPRECATED: new ANTLRFileStream("myinputfile") NEW: CharStreams.fromFileName("myinputfile") WARNING: If you use both the deprecated and the new streams, you will see a nontrivial performance degradation. This speed hit is because the Lexer's internal code goes from a monomorphic to megamorphic dynamic dispatch to get characters from the input stream. Java's on-the-fly compiler (JIT) is unable to perform the same optimizations so stick with either the old or the new streams, if performance is a primary concern. See the extreme debugging and spelunking needed to identify this issue in our timing rig: https://github.com/antlr/antlr4/pull/1781 The ANTLR character streams still buffer all the input when you create the stream, as they have done for ~20 years. If you need unbuffered access, please note that it becomes challenging to create parse trees. The parse tree has to point to tokens which will either point into a stale location in an unbuffered stream or you have to copy the characters out of the buffer into the token. That defeats the purpose of unbuffered input. Per the ANTLR book, unbuffered streams are primarily useful for processing infinite streams *during the parse.* The new streams also use 8-bit buffers when possible so this new interface supports character streams that use half as much memory as the old ANTLRFileStream, which assumed 16-bit characters. A big shout out to Ben Hamilton (github bhamiltoncx) for his superhuman efforts across all targets to get true Unicode 3.1 support for U+10FFFF. @since 4.7
/**
 * Primary entry point, as of 4.7, for building {@link CharStream}s from a
 * variety of sources. It was introduced to support Unicode code points
 * above U+FFFF. {@link ANTLRInputStream} and {@link ANTLRFileStream} are
 * deprecated in favor of the streams produced here.
 *
 * <pre>
 * DEPRECATED: {@code new ANTLRFileStream("myinputfile")}
 * NEW:        {@code CharStreams.fromFileName("myinputfile")}
 * </pre>
 *
 * <p>WARNING: mixing the deprecated and the new streams in one program
 * causes a nontrivial performance degradation. The {@link Lexer}'s
 * internal code then goes from a monomorphic to a megamorphic dynamic
 * dispatch to fetch characters from the input stream, and Java's
 * on-the-fly compiler (JIT) can no longer perform the same optimizations.
 * Stick with either the old or the new streams if performance is a
 * primary concern. See the extreme debugging and spelunking needed to
 * identify this issue in our timing rig:
 *
 * <p>https://github.com/antlr/antlr4/pull/1781
 *
 * <p>The ANTLR character streams still buffer all the input when the
 * stream is created, as they have done for ~20 years. If you need
 * unbuffered access, note that it becomes challenging to create parse
 * trees: the tree has to point to tokens which either point into a stale
 * location in an unbuffered stream, or the characters have to be copied
 * out of the buffer into the token, which defeats the purpose of
 * unbuffered input. Per the ANTLR book, unbuffered streams are primarily
 * useful for processing infinite streams *during the parse.*
 *
 * <p>The new streams also use 8-bit buffers when possible, so character
 * streams created here can use half as much memory as the old
 * {@link ANTLRFileStream}, which assumed 16-bit characters.
 *
 * <p>A big shout out to Ben Hamilton (github bhamiltoncx) for his
 * superhuman efforts across all targets to get true Unicode 3.1 support
 * for U+10FFFF.
 *
 * @since 4.7
 */
public final class CharStreams {
    /** Buffer size, in elements, that the factory methods pass along when the caller supplies none. */
    private static final int DEFAULT_BUFFER_SIZE = 4096;

    /** Utility class holding only static factories; do not construct. */
    private CharStreams() {
    }
Creates a CharStream given a path to a UTF-8 encoded file on disk. Reads the entire contents of the file into the result before returning.
/**
 * Builds a {@link CharStream} from a UTF-8 encoded file on disk.
 *
 * <p>The entire file is read into the result before this method returns.
 *
 * @param path location of the file to read
 * @return a stream over the decoded contents of the file
 * @throws IOException if the charset-aware overload it delegates to fails
 */
public static CharStream fromPath(Path path) throws IOException {
    final Charset utf8 = StandardCharsets.UTF_8;
    return fromPath(path, utf8);
}
Creates a CharStream given a path to a file on disk and the charset of the bytes contained in the file. Reads the entire contents of the file into the result before returning.
/**
 * Builds a {@link CharStream} from a file on disk whose bytes are encoded
 * with the given charset.
 *
 * <p>The entire file is read into the result before this method returns.
 * The file size is queried first and handed to the channel-based factory
 * as the expected input size; decoding uses
 * {@link CodingErrorAction#REPLACE}, and the path's string form becomes
 * the source name.
 *
 * @param path location of the file to read
 * @param charset encoding of the bytes stored in the file
 * @return a stream over the decoded contents of the file
 * @throws IOException if the size query, opening the file, or reading it fails
 */
public static CharStream fromPath(Path path, Charset charset) throws IOException {
    final long fileSize = Files.size(path);
    try (ReadableByteChannel fileChannel = Files.newByteChannel(path)) {
        final String sourceName = path.toString();
        return fromChannel(
            fileChannel,
            charset,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            sourceName,
            fileSize);
    }
}
Creates a CharStream given a string containing a path to a UTF-8 file on disk. Reads the entire contents of the file into the result before returning.
/**
 * Builds a {@link CharStream} from the UTF-8 file named by
 * {@code fileName}.
 *
 * <p>The entire file is read into the result before this method returns.
 *
 * @param fileName string form of the path to the file on disk
 * @return a stream over the decoded contents of the file
 * @throws IOException if the path-based factory it delegates to fails
 */
public static CharStream fromFileName(String fileName) throws IOException {
    final Path file = Paths.get(fileName);
    return fromPath(file, StandardCharsets.UTF_8);
}
Creates a CharStream given a string containing a path to a file on disk and the charset of the bytes contained in the file. Reads the entire contents of the file into the result before returning.
/**
 * Builds a {@link CharStream} from the file named by {@code fileName},
 * decoding its bytes with the given charset.
 *
 * <p>The entire file is read into the result before this method returns.
 *
 * @param fileName string form of the path to the file on disk
 * @param charset encoding of the bytes stored in the file
 * @return a stream over the decoded contents of the file
 * @throws IOException if the path-based factory it delegates to fails
 */
public static CharStream fromFileName(String fileName, Charset charset) throws IOException {
    final Path file = Paths.get(fileName);
    return fromPath(file, charset);
}
Creates a CharStream given an opened InputStream containing UTF-8 bytes. Reads the entire contents of the InputStream into the result before returning, then closes the InputStream.
/**
 * Builds a {@link CharStream} from an already opened {@link InputStream}
 * that yields UTF-8 bytes.
 *
 * <p>The entire contents of the {@code InputStream} are read into the
 * result before returning, after which the {@code InputStream} is closed.
 *
 * @param is the opened stream to drain
 * @return a stream over the decoded contents
 * @throws IOException if the charset-aware overload it delegates to fails
 */
public static CharStream fromStream(InputStream is) throws IOException {
    final Charset utf8 = StandardCharsets.UTF_8;
    return fromStream(is, utf8);
}
Creates a CharStream given an opened InputStream and the charset of the bytes contained in the stream. Reads the entire contents of the InputStream into the result before returning, then closes the InputStream.
/**
 * Builds a {@link CharStream} from an already opened {@link InputStream}
 * whose bytes are encoded with the given charset.
 *
 * <p>The entire contents of the {@code InputStream} are read into the
 * result before returning, after which the {@code InputStream} is closed.
 * The input size is passed on as {@code -1}, i.e. unknown.
 *
 * @param is the opened stream to drain
 * @param charset encoding of the bytes in the stream
 * @return a stream over the decoded contents
 * @throws IOException if the size-aware overload it delegates to fails
 */
public static CharStream fromStream(InputStream is, Charset charset) throws IOException {
    final long unknownSize = -1;
    return fromStream(is, charset, unknownSize);
}

/**
 * Builds a {@link CharStream} from an already opened {@link InputStream},
 * the charset of its bytes, and the expected number of input bytes.
 *
 * <p>The stream is wrapped in a {@link ReadableByteChannel} and handed to
 * the channel-based factory with {@link CodingErrorAction#REPLACE} and
 * {@link IntStream#UNKNOWN_SOURCE_NAME}. The channel is closed when this
 * method exits.
 *
 * @param is the opened stream to drain
 * @param charset encoding of the bytes in the stream
 * @param inputSize expected size of the input, or {@code -1} when it is
 *        not known
 * @return a stream over the decoded contents
 * @throws IOException if the channel-based factory it delegates to fails
 */
public static CharStream fromStream(InputStream is, Charset charset, long inputSize) throws IOException {
    try (ReadableByteChannel streamChannel = Channels.newChannel(is)) {
        final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
        return fromChannel(
            streamChannel,
            charset,
            DEFAULT_BUFFER_SIZE,
            CodingErrorAction.REPLACE,
            sourceName,
            inputSize);
    }
}
Creates a CharStream given an opened ReadableByteChannel containing UTF-8 bytes. Reads the entire contents of the channel into the result before returning, then closes the channel.
/**
 * Builds a {@link CharStream} from an already opened
 * {@link ReadableByteChannel} that yields UTF-8 bytes.
 *
 * <p>The entire contents of the {@code channel} are read into the result
 * before returning, after which the {@code channel} is closed.
 *
 * @param channel the opened channel to drain
 * @return a stream over the decoded contents
 * @throws IOException if the charset-taking overload it delegates to fails
 */
public static CharStream fromChannel(ReadableByteChannel channel) throws IOException {
    final Charset utf8 = StandardCharsets.UTF_8;
    return fromChannel(channel, utf8);
}
Creates a CharStream given an opened ReadableByteChannel and the charset of the bytes contained in the channel. Reads the entire contents of the channel into the result before returning, then closes the channel.
/**
 * Creates a {@link CharStream} given an opened {@link ReadableByteChannel}
 * and the charset of the bytes contained in the channel.
 *
 * <p>Reads the entire contents of the {@code channel} into the result
 * before returning, then closes the {@code channel}. Decoding uses
 * {@link CodingErrorAction#REPLACE}, the source name is
 * {@link IntStream#UNKNOWN_SOURCE_NAME}, and the input size is passed as
 * {@code -1} (unknown).
 *
 * @param channel the opened channel to drain
 * @param charset encoding of the bytes in the channel
 * @return a stream over the decoded contents
 * @throws IOException if the six-argument overload it delegates to fails
 */
public static CharStream fromChannel(ReadableByteChannel channel, Charset charset) throws IOException {
    // Delegate to the six-argument overload so that `charset` is honored.
    // The four-argument overload always decodes as UTF-8, so calling it
    // from here silently dropped the caller's charset.
    return fromChannel(
        channel,
        charset,
        DEFAULT_BUFFER_SIZE,
        CodingErrorAction.REPLACE,
        IntStream.UNKNOWN_SOURCE_NAME,
        -1);
}
Creates a CharStream given a Reader. Closes the reader before returning.
/**
 * Builds a {@link CharStream} from a {@link Reader}, using
 * {@link IntStream#UNKNOWN_SOURCE_NAME} as the source name. The reader is
 * closed before this method returns.
 *
 * @param r the reader to drain
 * @return a stream over the characters read
 * @throws IOException if the source-name overload it delegates to fails
 */
public static CodePointCharStream fromReader(Reader r) throws IOException {
    final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
    return fromReader(r, sourceName);
}
Creates a CharStream given a Reader and its source name. Closes the reader before returning.
/**
 * Builds a {@link CharStream} from a {@link Reader} and the name of the
 * source it reads from. The reader is closed before this method returns,
 * whether or not reading succeeded.
 *
 * @param r the reader to drain
 * @param sourceName name to attach to the resulting stream
 * @return a stream over the characters read
 * @throws IOException if reading from or closing the reader fails
 */
public static CodePointCharStream fromReader(Reader r, String sourceName) throws IOException {
    try {
        CodePointBuffer.Builder builder = CodePointBuffer.builder(DEFAULT_BUFFER_SIZE);
        CharBuffer chunk = CharBuffer.allocate(DEFAULT_BUFFER_SIZE);
        // Pull one chunk at a time until the reader signals end of input (-1).
        for (int charsRead = r.read(chunk); charsRead != -1; charsRead = r.read(chunk)) {
            chunk.flip();           // switch the chunk from filling to draining
            builder.append(chunk);
            chunk.compact();        // keep anything unconsumed and resume filling
        }
        CodePointBuffer codePoints = builder.build();
        return CodePointCharStream.fromBuffer(codePoints, sourceName);
    }
    finally {
        r.close();
    }
}
Creates a CharStream given a String.
/**
 * Builds a {@link CharStream} over the contents of a {@link String},
 * using {@link IntStream#UNKNOWN_SOURCE_NAME} as the source name.
 *
 * @param s the text to expose as a stream
 * @return a stream over the characters of {@code s}
 */
public static CodePointCharStream fromString(String s) {
    final String sourceName = IntStream.UNKNOWN_SOURCE_NAME;
    return fromString(s, sourceName);
}
Creates a CharStream given a String and the sourceName from which it came.
/** * Creates a {@link CharStream} given a {@link String} and the {@code sourceName} * from which it came. */
public static CodePointCharStream fromString(String s, String sourceName) { // Initial guess assumes no code points > U+FFFF: one code // point for each code unit in the string CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder(s.length()); // TODO: CharBuffer.wrap(String) rightfully returns a read-only buffer // which doesn't expose its array, so we make a copy. CharBuffer cb = CharBuffer.allocate(s.length()); cb.put(s); cb.flip(); codePointBufferBuilder.append(cb); return CodePointCharStream.fromBuffer(codePointBufferBuilder.build(), sourceName); }
Creates a CharStream given an opened ReadableByteChannel containing UTF-8 bytes. Reads the entire contents of the channel into the result before returning, then closes the channel.
/** * Creates a {@link CharStream} given an opened {@link ReadableByteChannel} * containing UTF-8 bytes. * * Reads the entire contents of the {@code channel} into * the result before returning, then closes the {@code channel}. */
public static CodePointCharStream fromChannel( ReadableByteChannel channel, int bufferSize, CodingErrorAction decodingErrorAction, String sourceName) throws IOException { return fromChannel(channel, StandardCharsets.UTF_8, bufferSize, decodingErrorAction, sourceName, -1); } public static CodePointCharStream fromChannel( ReadableByteChannel channel, Charset charset, int bufferSize, CodingErrorAction decodingErrorAction, String sourceName, long inputSize) throws IOException { try { ByteBuffer utf8BytesIn = ByteBuffer.allocate(bufferSize); CharBuffer utf16CodeUnitsOut = CharBuffer.allocate(bufferSize); if (inputSize == -1) { inputSize = bufferSize; } else if (inputSize > Integer.MAX_VALUE) { // ByteBuffer et al don't support long sizes throw new IOException(String.format("inputSize %d larger than max %d", inputSize, Integer.MAX_VALUE)); } CodePointBuffer.Builder codePointBufferBuilder = CodePointBuffer.builder((int) inputSize); CharsetDecoder decoder = charset .newDecoder() .onMalformedInput(decodingErrorAction) .onUnmappableCharacter(decodingErrorAction); boolean endOfInput = false; while (!endOfInput) { int bytesRead = channel.read(utf8BytesIn); endOfInput = (bytesRead == -1); utf8BytesIn.flip(); CoderResult result = decoder.decode( utf8BytesIn, utf16CodeUnitsOut, endOfInput); if (result.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) { result.throwException(); } utf16CodeUnitsOut.flip(); codePointBufferBuilder.append(utf16CodeUnitsOut); utf8BytesIn.compact(); utf16CodeUnitsOut.compact(); } // Handle any bytes at the end of the file which need to // be represented as errors or substitution characters. 
CoderResult flushResult = decoder.flush(utf16CodeUnitsOut); if (flushResult.isError() && decodingErrorAction.equals(CodingErrorAction.REPORT)) { flushResult.throwException(); } utf16CodeUnitsOut.flip(); codePointBufferBuilder.append(utf16CodeUnitsOut); CodePointBuffer codePointBuffer = codePointBufferBuilder.build(); return CodePointCharStream.fromBuffer(codePointBuffer, sourceName); } finally { channel.close(); } } }