/* Woodstox Lite ("wool") XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;


import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.XmlConsts;

Since JDK does not come with UTF-32/UCS-4, let's implement a simple decoder to use.
/** * Since JDK does not come with UTF-32/UCS-4, let's implement a simple * decoder to use. */
public final class Utf32Reader extends Reader { final static char NULL_CHAR = (char) 0; final ReaderConfig mConfig; protected InputStream mIn; protected byte[] mBuffer; protected int mPtr; protected int mLength; final boolean mBigEndian;
Although input is fine with full Unicode set, Java still uses 16-bit chars, so we may have to split high-order chars into surrogate pairs.
/** * Although input is fine with full Unicode set, Java still uses * 16-bit chars, so we may have to split high-order chars into * surrogate pairs. */
char mSurrogate = NULL_CHAR;
Total read character count; used for error reporting purposes
/** * Total read character count; used for error reporting purposes */
int mCharCount = 0;
Total read byte count; used for error reporting purposes
/** * Total read byte count; used for error reporting purposes */
int mByteCount = 0; /* //////////////////////////////////////// // Life-cycle //////////////////////////////////////// */ public Utf32Reader(ReaderConfig cfg, InputStream in, byte[] buf, int ptr, int len, boolean isBigEndian) { mConfig = cfg; mBigEndian = isBigEndian; } /* //////////////////////////////////////// // Reader API //////////////////////////////////////// */ @Override public void close() throws IOException { InputStream in = mIn; if (in != null) { mIn = null; freeBuffers(); in.close(); } } char[] mTmpBuf = null;
Although this method is implemented by the base class, AND it should never be called by Woodstox code, let's still implement it bit more efficiently just in case
/** * Although this method is implemented by the base class, AND it should * never be called by Woodstox code, let's still implement it bit more * efficiently just in case */
@Override public int read() throws IOException { if (mTmpBuf == null) { mTmpBuf = new char[1]; } if (read(mTmpBuf, 0, 1) < 1) { return -1; } return mTmpBuf[0]; } /* //////////////////////////////////////// // Public API //////////////////////////////////////// */ @Override public int read(char[] cbuf, int start, int len) throws IOException { // Already EOF? if (mBuffer == null) { return -1; } if (len < 1) { return len; } // Let's then ensure there's enough room... if (start < 0 || (start+len) > cbuf.length) { reportBounds(cbuf, start, len); } len += start; int outPtr = start; // Ok, first; do we have a surrogate from last round? if (mSurrogate != NULL_CHAR) { cbuf[outPtr++] = mSurrogate; mSurrogate = NULL_CHAR; // No need to load more, already got one char } else { /* Note: we'll try to avoid blocking as much as possible. As a * result, we only need to get 4 bytes for a full char. */ int left = (mLength - mPtr); if (left < 4) { if (!loadMore(left)) { // (legal) EOF? return -1; } } } byte[] buf = mBuffer; main_loop: while (outPtr < len) { int ptr = mPtr; int ch; if (mBigEndian) { ch = (buf[ptr] << 24) | ((buf[ptr+1] & 0xFF) << 16) | ((buf[ptr+2] & 0xFF) << 8) | (buf[ptr+3] & 0xFF); } else { ch = (buf[ptr] & 0xFF) | ((buf[ptr+1] & 0xFF) << 8) | ((buf[ptr+2] & 0xFF) << 16) | (buf[ptr+3] << 24); } mPtr += 4; // Does it need to be split to surrogates? if (ch >= 0xD800) { // Illegal? if (ch > XmlConsts.MAX_UNICODE_CHAR) { reportInvalid(ch, outPtr-start, "(above "+Integer.toHexString(XmlConsts.MAX_UNICODE_CHAR)+") "); } if (ch > 0xFFFF) { // need to split into surrogates? ch -= 0x10000; // to normalize it starting with 0x0 cbuf[outPtr++] = (char) (0xD800 + (ch >> 10)); // hmmh. can this ever be 0? (not legal, at least?) ch = (0xDC00 | (ch & 0x03FF)); // Room for second part? if (outPtr >= len) { // nope mSurrogate = (char) ch; break main_loop; } } else { // in 16-bit range... just need validity checks if (ch < 0xE000) { reportInvalid(ch, outPtr-start, "(a surrogate char) "); } else if (ch >= 0xFFFE) { reportInvalid(ch, outPtr-start, ""); } } } cbuf[outPtr++] = (char) ch; if (mPtr >= mLength) { break main_loop; } } len = outPtr - start; mCharCount += len; return len; } /* //////////////////////////////////////// // Internal methods //////////////////////////////////////// */
Params:
  • available – Number of "unused" bytes in the input buffer
Returns:True, if enough bytes were read to allow decoding of at least one full character; false if EOF was encountered instead.
/** * @param available Number of "unused" bytes in the input buffer * * @return True, if enough bytes were read to allow decoding of at least * one full character; false if EOF was encountered instead. */
private boolean loadMore(int available) throws IOException { mByteCount += (mLength - available); // Bytes that need to be moved to the beginning of buffer? if (available > 0) { if (mPtr > 0) { for (int i = 0; i < available; ++i) { mBuffer[i] = mBuffer[mPtr+i]; } mPtr = 0; } mLength = available; } else { /* Ok; here we can actually reasonably expect an EOF, * so let's do a separate read right away: */ mPtr = 0; int count = mIn.read(mBuffer); if (count < 1) { mLength = 0; if (count < 0) { // -1 freeBuffers(); // to help GC? return false; } // 0 count is no good; let's err out reportStrangeStream(); } mLength = count; } /* Need at least 4 bytes; if we don't get that many, it's an * error. */ while (mLength < 4) { int count = mIn.read(mBuffer, mLength, mBuffer.length - mLength); if (count < 1) { if (count < 0) { // -1, EOF... no good! freeBuffers(); // to help GC? reportUnexpectedEOF(mLength, 4); } // 0 count is no good; let's err out reportStrangeStream(); } mLength += count; } return true; } public final void freeBuffers() { byte[] buf = mBuffer; if (buf != null) { mBuffer = null; if (mConfig != null) { mConfig.freeFullBBuffer(buf); } } } /* ////////////////////////////////////////// // Error reporting ////////////////////////////////////////// */ private void reportUnexpectedEOF(int gotBytes, int needed) throws IOException { int bytePos = mByteCount + gotBytes; int charPos = mCharCount; throw new CharConversionException("Unexpected EOF in the middle of a 4-byte UTF-32 char: got " +gotBytes+", needed "+needed +", at char #"+charPos+", byte #"+bytePos+")"); } private void reportInvalid(int value, int offset, String msg) throws IOException { int bytePos = mByteCount + mPtr - 1; int charPos = mCharCount + offset; throw new CharConversionException("Invalid UTF-32 character 0x" +Integer.toHexString(value) +msg+" at char #"+charPos+", byte #"+bytePos+")"); } protected void reportBounds(char[] cbuf, int start, int len) throws IOException { throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]"); } protected void reportStrangeStream() throws IOException { throw new IOException("Strange I/O stream, returned 0 bytes on read"); } }