java/12/java.base/java/lang/StringCoding.java (new version) from
java/8/java/lang/StringCoding.java (old version).
+921
-196
Showing changes in
/*
- * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.lang;
import java.io.UnsupportedEncodingException;
import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.MalformedInputException;
+import java.nio.charset.UnmappableCharacterException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
-import sun.misc.MessageUtils;
+import jdk.internal.HotSpotIntrinsicCandidate;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
+import static java.lang.String.LATIN1;
+import static java.lang.String.UTF16;
+import static java.lang.String.COMPACT_STRINGS;
+import static java.lang.Character.isSurrogate;
+import static java.lang.Character.highSurrogate;
+import static java.lang.Character.lowSurrogate;
+import static java.lang.Character.isSupplementaryCodePoint;
+import static java.lang.StringUTF16.putChar;
+
/**
* Utility class for string encoding and decoding.
*/
class StringCoding {
private StringCoding() { } // private: instances cannot be created from outside this class
/** Per-thread cached decoder and encoder, each held through a SoftReference */
- private final static ThreadLocal<SoftReference<StringDecoder>> decoder =
+ private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
new ThreadLocal<>();
- private final static ThreadLocal<SoftReference<StringEncoder>> encoder =
+ private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
new ThreadLocal<>();
- private static boolean warnUnsupportedCharset = true;
+ private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; // this and the two charsets below are matched by identity (cs == ...) to select the built-in coders
+ private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
+ private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
/**
* Returns the object cached in the given thread-local, or null when the
* current thread has no entry or the soft reference no longer holds a value.
*/
private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
SoftReference<T> sr = tl.get();
if (sr == null)
return null;
return sr.get();
}
/**
* Stores ob in the given thread-local for the current thread, wrapped in a
* new SoftReference.
*/
private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
- tl.set(new SoftReference<T>(ob));
+ tl.set(new SoftReference<>(ob));
}
// Trim the given byte array to the given length. The array itself is
// returned only when it already has that length and either isTrusted is
// set or no security manager is installed; otherwise a copy is returned.
- //
- private static byte[] safeTrim(byte[] ba, int len, Charset cs, boolean isTrusted) {
+ private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
return ba;
else
return Arrays.copyOf(ba, len);
}
- // Trim the given char array to the given length
- //
- private static char[] safeTrim(char[] ca, int len,
- Charset cs, boolean isTrusted) {
- if (len == ca.length && (isTrusted || System.getSecurityManager() == null))
- return ca;
- else
- return Arrays.copyOf(ca, len);
- }
-
/**
* Returns len multiplied by expansionFactor, converted to an int. Used in
* this file to size output buffers from a coder's max-per-unit factor.
*/
private static int scale(int len, float expansionFactor) {
// We need to perform double, not float, arithmetic; otherwise
// we lose low order bits when len is larger than 2**24.
return (int)(len * (double)expansionFactor);
}
/**
* Returns the charset for the given name, or null when
* Charset.isSupported(csn) is false.
*/
private static Charset lookupCharset(String csn) {
if (Charset.isSupported(csn)) {
try {
return Charset.forName(csn);
} catch (UnsupportedCharsetException x) {
throw new Error(x); // unexpected: isSupported() returned true just above
}
}
return null;
}
- private static void warnUnsupportedCharset(String csn) {
- if (warnUnsupportedCharset) {
- // Use sun.misc.MessageUtils rather than the Logging API or
- // System.err since this method may be called during VM
- // initialization before either is available.
- MessageUtils.err("WARNING: Default charset " + csn +
- " not supported, using ISO-8859-1 instead");
- warnUnsupportedCharset = false;
+ static class Result { // mutable holder for a decoded string: the backing bytes plus the coder (LATIN1 or UTF16)
+ byte[] value;
+ byte coder;
+
+ Result with() { // sets this holder to the empty string and returns it
+ coder = COMPACT_STRINGS ? LATIN1 : UTF16;
+ value = new byte[0];
+ return this;
+ }
+
+ Result with(char[] val, int off, int len) { // stores val[off..off+len): LATIN1 when compress() yields an array, UTF16 otherwise
+ if (String.COMPACT_STRINGS) {
+ byte[] bs = StringUTF16.compress(val, off, len);
+ if (bs != null) { // presumably null means the chars do not all fit in LATIN1 -- confirm in StringUTF16.compress
+ value = bs;
+ coder = LATIN1;
+ return this;
+ }
+ }
+ coder = UTF16;
+ value = StringUTF16.toBytes(val, off, len);
+ return this;
+ }
+
+ Result with(byte[] val, byte coder) { // stores val itself (no copy) together with the given coder
+ this.coder = coder;
+ value = val;
+ return this;
}
}
+ @HotSpotIntrinsicCandidate
+ public static boolean hasNegatives(byte[] ba, int off, int len) { // true if any byte in ba[off..off+len) is negative, i.e. has its high bit set
+ for (int i = off; i < off + len; i++) {
+ if (ba[i] < 0) {
+ return true;
+ }
+ }
+ return false;
+ }
// -- Decoding --
- private static class StringDecoder {
+ static class StringDecoder { // decoder bound to one charset, configured to REPLACE malformed/unmappable input; cached per thread by the decode() methods of this file
private final String requestedCharsetName;
private final Charset cs;
+ private final boolean isASCIICompatible;
private final CharsetDecoder cd;
- private final boolean isTrusted;
+ protected final Result result; // single Result instance reused by every decode() call on this decoder
- private StringDecoder(Charset cs, String rcn) {
+ StringDecoder(Charset cs, String rcn) { // rcn is the charset name as requested by the caller; it is kept for later cache look-ups
this.requestedCharsetName = rcn;
this.cs = cs;
this.cd = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
- this.isTrusted = (cs.getClass().getClassLoader0() == null);
+ this.result = new Result();
+ this.isASCIICompatible = (cd instanceof ArrayDecoder) && // enables the no-negative-bytes fast path in decode()
+ ((ArrayDecoder)cd).isASCIICompatible();
}
String charsetName() {
if (cs instanceof HistoricallyNamedCharset)
return ((HistoricallyNamedCharset)cs).historicalName();
return cs.name();
}
final String requestedCharsetName() {
return requestedCharsetName;
}
- char[] decode(byte[] ba, int off, int len) {
+ Result decode(byte[] ba, int off, int len) { // decodes ba[off..off+len); the returned object is this.result, so the next call overwrites it
+ if (len == 0) {
+ return result.with();
+ }
+ // fastpath for ascii compatible
+ if (isASCIICompatible && !hasNegatives(ba, off, len)) {
+ if (COMPACT_STRINGS) {
+ return result.with(Arrays.copyOfRange(ba, off, off + len),
+ LATIN1);
+ } else {
+ return result.with(StringLatin1.inflate(ba, off, len), UTF16);
+ }
+ }
int en = scale(len, cd.maxCharsPerByte()); // capacity of the char buffer handed to the decoder
char[] ca = new char[en];
- if (len == 0)
- return ca;
if (cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
- return safeTrim(ca, clen, cs, isTrusted);
- } else {
- cd.reset();
- ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
- CharBuffer cb = CharBuffer.wrap(ca);
- try {
- CoderResult cr = cd.decode(bb, cb, true);
- if (!cr.isUnderflow())
- cr.throwException();
- cr = cd.flush(cb);
- if (!cr.isUnderflow())
- cr.throwException();
- } catch (CharacterCodingException x) {
- // Substitution is always enabled,
- // so this shouldn't happen
- throw new Error(x);
- }
- return safeTrim(ca, cb.position(), cs, isTrusted);
+ return result.with(ca, 0, clen);
}
- }
- }
-
- static char[] decode(String charsetName, byte[] ba, int off, int len)
- throws UnsupportedEncodingException
- {
- StringDecoder sd = deref(decoder);
- String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
- if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
- || csn.equals(sd.charsetName()))) {
- sd = null;
- try {
- Charset cs = lookupCharset(csn);
- if (cs != null)
- sd = new StringDecoder(cs, csn);
- } catch (IllegalCharsetNameException x) {}
- if (sd == null)
- throw new UnsupportedEncodingException(csn);
- set(decoder, sd);
- }
- return sd.decode(ba, off, len);
- }
-
- static char[] decode(Charset cs, byte[] ba, int off, int len) {
- // (1)We never cache the "external" cs, the only benefit of creating
- // an additional StringDe/Encoder object to wrap it is to share the
- // de/encode() method. These SD/E objects are short-lifed, the young-gen
- // gc should be able to take care of them well. But the best approash
- // is still not to generate them if not really necessary.
- // (2)The defensive copy of the input byte/char[] has a big performance
- // impact, as well as the outgoing result byte/char[]. Need to do the
- // optimization check of (sm==null && classLoader0==null) for both.
- // (3)getClass().getClassLoader0() is expensive
- // (4)There might be a timing gap in isTrusted setting. getClassLoader0()
- // is only chcked (and then isTrusted gets set) when (SM==null). It is
- // possible that the SM==null for now but then SM is NOT null later
- // when safeTrim() is invoked...the "safe" way to do is to redundant
- // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
- // but it then can be argued that the SM is null when the opertaion
- // is started...
- CharsetDecoder cd = cs.newDecoder();
- int en = scale(len, cd.maxCharsPerByte());
- char[] ca = new char[en];
- if (len == 0)
- return ca;
- boolean isTrusted = false;
- if (System.getSecurityManager() != null) {
- if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
- ba = Arrays.copyOfRange(ba, off, off + len);
- off = 0;
- }
- }
- cd.onMalformedInput(CodingErrorAction.REPLACE)
- .onUnmappableCharacter(CodingErrorAction.REPLACE)
- .reset();
- if (cd instanceof ArrayDecoder) {
- int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
- return safeTrim(ca, clen, cs, isTrusted);
- } else {
+ cd.reset();
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
- return safeTrim(ca, cb.position(), cs, isTrusted);
+ return result.with(ca, 0, cb.position());
}
}
- static char[] decode(byte[] ba, int off, int len) {
- String csn = Charset.defaultCharset().name();
- try {
- // use charset name decode() variant which provides caching.
- return decode(csn, ba, off, len);
- } catch (UnsupportedEncodingException x) {
- warnUnsupportedCharset(csn);
+ static Result decode(String charsetName, byte[] ba, int off, int len) // decodes ba[off..off+len) with the named charset; a null name selects ISO-8859-1
+ throws UnsupportedEncodingException
+ {
+ StringDecoder sd = deref(decoder);
+ String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
+ if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) // cached decoder is reused only when csn matches one of its names
+ || csn.equals(sd.charsetName()))) {
+ sd = null;
+ try {
+ Charset cs = lookupCharset(csn);
+ if (cs != null) {
+ if (cs == UTF_8) { // the three built-in charsets return directly and leave the cached decoder untouched
+ return decodeUTF8(ba, off, len, true);
+ }
+ if (cs == ISO_8859_1) {
+ return decodeLatin1(ba, off, len);
+ }
+ if (cs == US_ASCII) {
+ return decodeASCII(ba, off, len);
+ }
+ sd = new StringDecoder(cs, csn);
+ }
+ } catch (IllegalCharsetNameException x) {} // sd stays null and is reported as unsupported just below
+ if (sd == null)
+ throw new UnsupportedEncodingException(csn);
+ set(decoder, sd);
}
- try {
- return decode("ISO-8859-1", ba, off, len);
- } catch (UnsupportedEncodingException x) {
- // If this code is hit during VM initialization, MessageUtils is
- // the only way we will be able to get any kind of error message.
- MessageUtils.err("ISO-8859-1 charset not available: "
- + x.toString());
- // If we can not find ISO-8859-1 (a required encoding) then things
- // are seriously wrong with the installation.
- System.exit(1);
- return null;
+ return sd.decode(ba, off, len);
+ }
+
+ static Result decode(Charset cs, byte[] ba, int off, int len) { // decodes ba[off..off+len) with cs; malformed and unmappable input is replaced
+ if (cs == UTF_8) {
+ return decodeUTF8(ba, off, len, true);
}
+ if (cs == ISO_8859_1) {
+ return decodeLatin1(ba, off, len);
+ }
+ if (cs == US_ASCII) {
+ return decodeASCII(ba, off, len);
+ }
+
+ // (1)We never cache the "external" cs, the only benefit of creating
+ // an additional StringDe/Encoder object to wrap it is to share the
+ // de/encode() method. These SD/E objects are short-lived, the young-gen
+ // gc should be able to take care of them well. But the best approach
+ // is still not to generate them if not really necessary.
+ // (2)The defensive copy of the input byte/char[] has a big performance
+ // impact, as well as the outgoing result byte/char[]. Need to do the
+ // optimization check of (sm==null && classLoader0==null) for both.
+ // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
+ // is only checked (and then isTrusted gets set) when (SM==null). It is
+ // possible that the SM==null for now but then SM is NOT null later
+ // when safeTrim() is invoked...the "safe" way to do is to redundant
+ // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
+ // but it then can be argued that the SM is null when the operation
+ // is started... (NOTE(review): this method no longer calls safeTrim() or keeps an isTrusted flag; the note appears carried over from the encoder side)
+ CharsetDecoder cd = cs.newDecoder();
+ // ascii fastpath
+ if ((cd instanceof ArrayDecoder) &&
+ ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
+ return decodeLatin1(ba, off, len); // input has no negative bytes, so the Latin-1 path is used for it
+ }
+ int en = scale(len, cd.maxCharsPerByte());
+ if (len == 0) {
+ return new Result().with();
+ }
+ cd.onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE)
+ .reset();
+ char[] ca = new char[en];
+ if (cd instanceof ArrayDecoder) {
+ int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
+ return new Result().with(ca, 0, clen); // a fresh Result per call: nothing is cached for a caller-supplied Charset
+ }
+ if (cs.getClass().getClassLoader0() != null && // copy the input range before wrapping it for a charset class with a non-null class loader while a security manager is installed
+ System.getSecurityManager() != null) {
+ ba = Arrays.copyOfRange(ba, off, off + len);
+ off = 0;
+ }
+ ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
+ CharBuffer cb = CharBuffer.wrap(ca);
+ try {
+ CoderResult cr = cd.decode(bb, cb, true);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ cr = cd.flush(cb);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ } catch (CharacterCodingException x) {
+ // Substitution is always enabled,
+ // so this shouldn't happen
+ throw new Error(x);
+ }
+ return new Result().with(ca, 0, cb.position());
+ }
+
+ static Result decode(byte[] ba, int off, int len) { // decodes ba[off..off+len) with Charset.defaultCharset()
+ Charset cs = Charset.defaultCharset();
+ if (cs == UTF_8) {
+ return decodeUTF8(ba, off, len, true);
+ }
+ if (cs == ISO_8859_1) {
+ return decodeLatin1(ba, off, len);
+ }
+ if (cs == US_ASCII) {
+ return decodeASCII(ba, off, len);
+ }
+ StringDecoder sd = deref(decoder);
+ if (sd == null || !cs.name().equals(sd.cs.name())) { // no cached decoder for this thread, or it belongs to a different charset
+ sd = new StringDecoder(cs, cs.name());
+ set(decoder, sd);
+ }
+ return sd.decode(ba, off, len);
}
// -- Encoding --
private static class StringEncoder { // encoder bound to one charset, configured to REPLACE malformed/unmappable input; cached per thread by the encode() methods of this file
private Charset cs;
private CharsetEncoder ce;
+ private final boolean isASCIICompatible;
private final String requestedCharsetName;
private final boolean isTrusted; // true when the charset class has a null class loader; passed to safeTrim()
private StringEncoder(Charset cs, String rcn) {
this.requestedCharsetName = rcn;
this.cs = cs;
this.ce = cs.newEncoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
this.isTrusted = (cs.getClass().getClassLoader0() == null);
+ this.isASCIICompatible = (ce instanceof ArrayEncoder) && // enables the no-negative-bytes fast path in encode()
+ ((ArrayEncoder)ce).isASCIICompatible();
}
String charsetName() {
if (cs instanceof HistoricallyNamedCharset)
return ((HistoricallyNamedCharset)cs).historicalName();
return cs.name();
}
final String requestedCharsetName() {
return requestedCharsetName;
}
- byte[] encode(char[] ca, int off, int len) {
+ byte[] encode(byte coder, byte[] val) { // encodes the string bytes val, interpreted according to coder (LATIN1 or UTF16)
+ // fastpath for ascii compatible
+ if (coder == LATIN1 && isASCIICompatible &&
+ !hasNegatives(val, 0, val.length)) {
+ return Arrays.copyOf(val, val.length);
+ }
+ int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
- if (len == 0)
+ if (len == 0) {
return ba;
- if (ce instanceof ArrayEncoder) {
- int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
- return safeTrim(ba, blen, cs, isTrusted);
- } else {
- ce.reset();
- ByteBuffer bb = ByteBuffer.wrap(ba);
- CharBuffer cb = CharBuffer.wrap(ca, off, len);
- try {
- CoderResult cr = ce.encode(cb, bb, true);
- if (!cr.isUnderflow())
- cr.throwException();
- cr = ce.flush(bb);
- if (!cr.isUnderflow())
- cr.throwException();
- } catch (CharacterCodingException x) {
- // Substitution is always enabled,
- // so this shouldn't happen
- throw new Error(x);
- }
- return safeTrim(ba, bb.position(), cs, isTrusted);
}
+ if (ce instanceof ArrayEncoder) {
+ int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
+ : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
+ if (blen != -1) { // on -1 control falls through to the CharBuffer path below; presumably -1 signals the array encoder declined -- confirm in ArrayEncoder
+ return safeTrim(ba, blen, isTrusted);
+ }
+ }
+ char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
+ : StringUTF16.toChars(val);
+ ce.reset();
+ ByteBuffer bb = ByteBuffer.wrap(ba);
+ CharBuffer cb = CharBuffer.wrap(ca, 0, len);
+ try {
+ CoderResult cr = ce.encode(cb, bb, true);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ cr = ce.flush(bb);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ } catch (CharacterCodingException x) {
+ // Substitution is always enabled,
+ // so this shouldn't happen
+ throw new Error(x);
+ }
+ return safeTrim(ba, bb.position(), isTrusted);
}
}
- static byte[] encode(String charsetName, char[] ca, int off, int len)
+ static byte[] encode(String charsetName, byte coder, byte[] val) // encodes val (interpreted per coder) with the named charset; a null name selects ISO-8859-1
throws UnsupportedEncodingException
{
StringEncoder se = deref(encoder);
String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
if ((se == null) || !(csn.equals(se.requestedCharsetName()) // cached encoder is reused only when csn matches one of its names
|| csn.equals(se.charsetName()))) {
se = null;
try {
Charset cs = lookupCharset(csn);
- if (cs != null)
+ if (cs != null) {
+ if (cs == UTF_8) { // the three built-in charsets return directly and leave the cached encoder untouched
+ return encodeUTF8(coder, val, true);
+ }
+ if (cs == ISO_8859_1) {
+ return encode8859_1(coder, val);
+ }
+ if (cs == US_ASCII) {
+ return encodeASCII(coder, val);
+ }
se = new StringEncoder(cs, csn);
+ }
} catch (IllegalCharsetNameException x) {} // se stays null and is reported as unsupported just below
- if (se == null)
+ if (se == null) {
throw new UnsupportedEncodingException (csn);
+ }
set(encoder, se);
}
- return se.encode(ca, off, len);
+ return se.encode(coder, val);
}
- static byte[] encode(Charset cs, char[] ca, int off, int len) {
+ static byte[] encode(Charset cs, byte coder, byte[] val) { // encodes val (interpreted per coder) with cs; malformed and unmappable input is replaced
+ if (cs == UTF_8) {
+ return encodeUTF8(coder, val, true);
+ }
+ if (cs == ISO_8859_1) {
+ return encode8859_1(coder, val);
+ }
+ if (cs == US_ASCII) {
+ return encodeASCII(coder, val);
+ }
CharsetEncoder ce = cs.newEncoder();
+ // fastpath for ascii compatible
+ if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
+ ((ArrayEncoder)ce).isASCIICompatible() &&
+ !hasNegatives(val, 0, val.length)))) {
+ return Arrays.copyOf(val, val.length);
+ }
+ int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
- if (len == 0)
+ if (len == 0) {
return ba;
- boolean isTrusted = false;
- if (System.getSecurityManager() != null) {
- if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
- ca = Arrays.copyOfRange(ca, off, off + len);
- off = 0;
- }
}
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
- int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
- return safeTrim(ba, blen, cs, isTrusted);
- } else {
- ByteBuffer bb = ByteBuffer.wrap(ba);
- CharBuffer cb = CharBuffer.wrap(ca, off, len);
- try {
- CoderResult cr = ce.encode(cb, bb, true);
- if (!cr.isUnderflow())
- cr.throwException();
- cr = ce.flush(bb);
- if (!cr.isUnderflow())
- cr.throwException();
- } catch (CharacterCodingException x) {
- throw new Error(x);
+ int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
+ : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
+ if (blen != -1) { // on -1 control falls through to the CharBuffer path below
+ return safeTrim(ba, blen, true);
}
- return safeTrim(ba, bb.position(), cs, isTrusted);
+ }
+ boolean isTrusted = cs.getClass().getClassLoader0() == null || // decides whether safeTrim() may return ba itself instead of a copy
+ System.getSecurityManager() == null;
+ char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
+ : StringUTF16.toChars(val);
+ ByteBuffer bb = ByteBuffer.wrap(ba);
+ CharBuffer cb = CharBuffer.wrap(ca, 0, len);
+ try {
+ CoderResult cr = ce.encode(cb, bb, true);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ cr = ce.flush(bb);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ } catch (CharacterCodingException x) {
+ throw new Error(x);
+ }
+ return safeTrim(ba, bb.position(), isTrusted);
+ }
+
+ static byte[] encode(byte coder, byte[] val) { // encodes val (interpreted per coder) with Charset.defaultCharset()
+ Charset cs = Charset.defaultCharset();
+ if (cs == UTF_8) {
+ return encodeUTF8(coder, val, true);
+ }
+ if (cs == ISO_8859_1) {
+ return encode8859_1(coder, val);
+ }
+ if (cs == US_ASCII) {
+ return encodeASCII(coder, val);
+ }
+ StringEncoder se = deref(encoder);
+ if (se == null || !cs.name().equals(se.cs.name())) { // no cached encoder for this thread, or it belongs to a different charset
+ se = new StringEncoder(cs, cs.name());
+ set(encoder, se);
+ }
+ return se.encode(coder, val);
+ }
+
+ /**
+ * Print a message directly to stderr, bypassing all character conversion
+ * methods. Declared native; no caller appears in the visible part of this
+ * file.
+ * @param msg message to print
+ */
+ private static native void err(String msg);
+
+ /* The cached Result for each thread: reused (and overwritten) by decodeASCII, decodeLatin1 and the UTF-8 decoders below */
+ private static final ThreadLocal<StringCoding.Result>
+ resultCached = new ThreadLocal<>() {
+ protected StringCoding.Result initialValue() {
+ return new StringCoding.Result();
+ }};
+
+ ////////////////////////// ascii //////////////////////////////
+
+ private static Result decodeASCII(byte[] ba, int off, int len) { // decodes ba[off..off+len) as US-ASCII into the per-thread Result; negative bytes become repl
+ Result result = resultCached.get();
+ if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
+ return result.with(Arrays.copyOfRange(ba, off, off + len),
+ LATIN1);
+ }
+ byte[] dst = new byte[len<<1]; // UTF16 output: two bytes per char
+ int dp = 0;
+ while (dp < len) {
+ int b = ba[off++];
+ putChar(dst, dp++, (b >= 0) ? (char)b : repl);
+ }
+ return result.with(dst, UTF16);
+ }
+
+ private static byte[] encodeASCII(byte coder, byte[] val) { // encodes val (interpreted per coder) as US-ASCII, writing '?' for every char outside 0..0x7f
+ if (coder == LATIN1) {
+ byte[] dst = new byte[val.length];
+ for (int i = 0; i < val.length; i++) {
+ if (val[i] < 0) { // LATIN1 byte 0x80..0xff
+ dst[i] = '?';
+ } else {
+ dst[i] = val[i];
+ }
+ }
+ return dst;
+ }
+ int len = val.length >> 1; // number of UTF16 chars
+ byte[] dst = new byte[len];
+ int dp = 0;
+ for (int i = 0; i < len; i++) {
+ char c = StringUTF16.getChar(val, i);
+ if (c < 0x80) {
+ dst[dp++] = (byte)c;
+ continue;
+ }
+ if (Character.isHighSurrogate(c) && i + 1 < len && // a high/low surrogate pair is consumed together and yields a single '?'
+ Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
+ i++;
+ }
+ dst[dp++] = '?';
+ }
+ if (len == dp) {
+ return dst;
+ }
+ return Arrays.copyOf(dst, dp); // shorter than len when at least one surrogate pair was collapsed
+ }
+
+ ////////////////////////// latin1/8859_1 ///////////////////////////
+
+ private static Result decodeLatin1(byte[] ba, int off, int len) { // decodes ba[off..off+len) as ISO-8859-1 into the per-thread Result
+ Result result = resultCached.get();
+ if (COMPACT_STRINGS) {
+ return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); // the input bytes are used as the LATIN1 value, copied
+ } else {
+ return result.with(StringLatin1.inflate(ba, off, len), UTF16);
+ }
+ }
+
+ @HotSpotIntrinsicCandidate
+ private static int implEncodeISOArray(byte[] sa, int sp,
+ byte[] da, int dp, int len) { // copies up to len UTF16 chars of sa, starting at char index sp, into da at dp; stops at the first char above 0xFF and returns the number copied
+ int i = 0;
+ for (; i < len; i++) {
+ char c = StringUTF16.getChar(sa, sp++);
+ if (c > '\u00FF')
+ break;
+ da[dp++] = (byte)c;
+ }
+ return i;
+ }
+
+ private static byte[] encode8859_1(byte coder, byte[] val) { // replacing variant: unencodable chars become '?'
+ return encode8859_1(coder, val, true);
+ }
+
+ private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { // encodes val as ISO-8859-1; with doReplace false an unencodable char makes throwUnmappable() throw
+ if (coder == LATIN1) {
+ return Arrays.copyOf(val, val.length);
+ }
+ int len = val.length >> 1; // chars still to encode; updated after each replacement
+ byte[] dst = new byte[len];
+ int dp = 0;
+ int sp = 0;
+ int sl = len;
+ while (sp < sl) {
+ int ret = implEncodeISOArray(val, sp, dst, dp, len);
+ sp = sp + ret;
+ dp = dp + ret;
+ if (ret != len) { // stopped early: the char at sp is above 0xFF
+ if (!doReplace) {
+ throwUnmappable(sp, 1);
+ }
+ char c = StringUTF16.getChar(val, sp++);
+ if (Character.isHighSurrogate(c) && sp < sl && // a high/low surrogate pair is consumed together and yields a single '?'
+ Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
+ sp++;
+ }
+ dst[dp++] = '?';
+ len = sl - sp;
+ }
+ }
+ if (dp == dst.length) {
+ return dst;
+ }
+ return Arrays.copyOf(dst, dp);
+ }
+
+ //////////////////////////////// utf8 ////////////////////////////////////
+
+ private static boolean isNotContinuation(int b) { // true unless the low 8 bits of b have the form 10xxxxxx
+ return (b & 0xc0) != 0x80;
+ }
+
+ private static boolean isMalformed3(int b1, int b2, int b3) { // true if b2 or b3 is not of the form 10xxxxxx, or b1 is 0xe0 with b2 in 0x80..0x9f; b1 is expected sign-extended (it is compared with (byte)0xe0)
+ return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
+ (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
+ }
+
+ private static boolean isMalformed3_2(int b1, int b2) { // same checks as isMalformed3 but on the first two bytes only (no third byte available)
+ return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
+ (b2 & 0xc0) != 0x80;
+ }
+
+ private static boolean isMalformed4(int b2, int b3, int b4) { // true if any of the three trailing bytes is not of the form 10xxxxxx
+ return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
+ (b4 & 0xc0) != 0x80;
+ }
+
+ private static boolean isMalformed4_2(int b1, int b2) { // checks the first two bytes of a 4-byte sequence; compares against 0xf0/0xf4/0x90/0xbf, so both arguments must already be masked to 0..0xff
+ return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
+ (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
+ (b2 & 0xc0) != 0x80;
+ }
+
+ private static boolean isMalformed4_3(int b3) { // true if b3 is not of the form 10xxxxxx
+ return (b3 & 0xc0) != 0x80;
+ }
+
+ // for nb == 3/4: returns how many bytes of the malformed sequence at src[sp] the caller advances over (it is added to sp in decodeUTF8_0)
+ private static int malformedN(byte[] src, int sp, int nb) {
+ if (nb == 3) {
+ int b1 = src[sp++];
+ int b2 = src[sp++]; // no need to lookup b3
+ return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
+ isNotContinuation(b2)) ? 1 : 2;
+ } else if (nb == 4) { // speed is not a concern here
+ int b1 = src[sp++] & 0xff;
+ int b2 = src[sp++] & 0xff;
+ if (b1 > 0xf4 ||
+ (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
+ (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
+ isNotContinuation(b2))
+ return 1;
+ if (isNotContinuation(src[sp++]))
+ return 2;
+ return 3;
+ }
+ assert false;
+ return -1;
+ }
+
+ private static void throwMalformed(int off, int nb) { // always throws: IllegalArgumentException whose cause is MalformedInputException(nb)
+ String msg = "malformed input off : " + off + ", length : " + nb;
+ throw new IllegalArgumentException(msg, new MalformedInputException(nb));
+ }
+
+ private static void throwMalformed(byte[] val) { // always throws; the reported offset is the index of the first negative byte, or val.length if there is none
+ int dp = 0;
+ while (dp < val.length && val[dp] >=0) { dp++; }
+ throwMalformed(dp, 1);
+ }
+
+ private static void throwUnmappable(int off, int nb) { // always throws: IllegalArgumentException whose cause is UnmappableCharacterException(nb)
+ String msg = "malformed input off : " + off + ", length : " + nb; // NOTE(review): text says "malformed input" although the cause is an UnmappableCharacterException -- looks copied from throwMalformed; confirm before changing
+ throw new IllegalArgumentException(msg, new UnmappableCharacterException(nb));
+ }
+
+ private static void throwUnmappable(byte[] val) { // always throws; the reported offset is the index of the first negative byte, or val.length if there is none
+ int dp = 0;
+ while (dp < val.length && val[dp] >=0) { dp++; }
+ throwUnmappable(dp, 1);
+ }
+
+ private static char repl = '\ufffd'; // char written by the decoders in this file in place of malformed input; NOTE(review): never reassigned in the visible code, could be final
+
+ private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
+ // ascii-biased fast path: the extra scan has some cost for input that is not ascii-only
+ if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
+ return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
+ LATIN1);
+ return decodeUTF8_0(src, sp, len, doReplace);
+ }
+
+ private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { // decodes src[sp..sp+len) as UTF-8 into the per-thread Result; malformed input becomes repl, or makes throwMalformed() throw when doReplace is false
+ Result ret = resultCached.get();
+
+ int sl = sp + len;
+ int dp = 0;
+ byte[] dst = new byte[len]; // LATIN1 attempt: at most one output byte per input byte
+
+ if (COMPACT_STRINGS) {
+ while (sp < sl) {
+ int b1 = src[sp];
+ if (b1 >= 0) {
+ dst[dp++] = (byte)b1;
+ sp++;
+ continue;
+ }
+ if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && // two-byte sequences with these lead bytes decode to U+0080..U+00FF
+ sp + 1 < sl) {
+ int b2 = src[sp + 1];
+ if (!isNotContinuation(b2)) {
+ dst[dp++] = (byte)(((b1 << 6) ^ b2)^
+ (((byte) 0xC0 << 6) ^
+ ((byte) 0x80 << 0)));
+ sp += 2;
+ continue;
+ }
+ }
+ // anything not a latin1, including the repl
+ // we have to go with the utf16
+ break;
+ }
+ if (sp == sl) { // whole input consumed by the LATIN1 loop
+ if (dp != dst.length) {
+ dst = Arrays.copyOf(dst, dp);
+ }
+ return ret.with(dst, LATIN1);
+ }
+ }
+ if (dp == 0) {
+ dst = new byte[len << 1];
+ } else {
+ byte[] buf = new byte[len << 1];
+ StringLatin1.inflate(dst, 0, buf, 0, dp); // carry the dp chars already decoded as LATIN1 over into the UTF16 buffer
+ dst = buf;
+ }
+ while (sp < sl) {
+ int b1 = src[sp++];
+ if (b1 >= 0) {
+ putChar(dst, dp++, (char) b1);
+ } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // lead byte 110xxxxx of a 2-byte sequence, excluding 0xc0 and 0xc1
+ if (sp < sl) {
+ int b2 = src[sp++];
+ if (isNotContinuation(b2)) {
+ if (!doReplace) {
+ throwMalformed(sp - 1, 1);
+ }
+ putChar(dst, dp++, repl);
+ sp--;
+ } else {
+ putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
+ (((byte) 0xC0 << 6) ^
+ ((byte) 0x80 << 0))));
+ }
+ continue;
+ }
+ if (!doReplace) {
+ throwMalformed(sp, 1); // underflow()
+ }
+ putChar(dst, dp++, repl);
+ break;
+ } else if ((b1 >> 4) == -2) { // lead byte 1110xxxx of a 3-byte sequence
+ if (sp + 1 < sl) {
+ int b2 = src[sp++];
+ int b3 = src[sp++];
+ if (isMalformed3(b1, b2, b3)) {
+ if (!doReplace) {
+ throwMalformed(sp - 3, 3);
+ }
+ putChar(dst, dp++, repl);
+ sp -= 3;
+ sp += malformedN(src, sp, 3);
+ } else {
+ char c = (char)((b1 << 12) ^
+ (b2 << 6) ^
+ (b3 ^
+ (((byte) 0xE0 << 12) ^
+ ((byte) 0x80 << 6) ^
+ ((byte) 0x80 << 0))));
+ if (isSurrogate(c)) { // a surrogate code unit encoded in 3 bytes is treated as malformed
+ if (!doReplace) {
+ throwMalformed(sp - 3, 3);
+ }
+ putChar(dst, dp++, repl);
+ } else {
+ putChar(dst, dp++, c);
+ }
+ }
+ continue;
+ }
+ if (sp < sl && isMalformed3_2(b1, src[sp])) {
+ if (!doReplace) {
+ throwMalformed(sp - 1, 2);
+ }
+ putChar(dst, dp++, repl);
+ continue;
+ }
+ if (!doReplace){
+ throwMalformed(sp, 1);
+ }
+ putChar(dst, dp++, repl);
+ break;
+ } else if ((b1 >> 3) == -2) { // lead byte 11110xxx of a 4-byte sequence
+ if (sp + 2 < sl) {
+ int b2 = src[sp++];
+ int b3 = src[sp++];
+ int b4 = src[sp++];
+ int uc = ((b1 << 18) ^
+ (b2 << 12) ^
+ (b3 << 6) ^
+ (b4 ^
+ (((byte) 0xF0 << 18) ^
+ ((byte) 0x80 << 12) ^
+ ((byte) 0x80 << 6) ^
+ ((byte) 0x80 << 0))));
+ if (isMalformed4(b2, b3, b4) ||
+ !isSupplementaryCodePoint(uc)) { // shortest form check
+ if (!doReplace) {
+ throwMalformed(sp - 4, 4);
+ }
+ putChar(dst, dp++, repl);
+ sp -= 4;
+ sp += malformedN(src, sp, 4);
+ } else {
+ putChar(dst, dp++, highSurrogate(uc));
+ putChar(dst, dp++, lowSurrogate(uc));
+ }
+ continue;
+ }
+ b1 &= 0xff; // isMalformed4_2 compares against unsigned byte values
+ if (b1 > 0xf4 ||
+ sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
+ if (!doReplace) {
+ throwMalformed(sp - 1, 1); // or 2
+ }
+ putChar(dst, dp++, repl);
+ continue;
+ }
+ if (!doReplace) {
+ throwMalformed(sp - 1, 1);
+ }
+ sp++;
+ putChar(dst, dp++, repl);
+ if (sp < sl && isMalformed4_3(src[sp])) {
+ continue;
+ }
+ break;
+ } else {
+ if (!doReplace) {
+ throwMalformed(sp - 1, 1);
+ }
+ putChar(dst, dp++, repl);
+ }
+ }
+ if (dp != len) { // dp counts chars: shrink the UTF16 buffer to dp chars, two bytes each
+ dst = Arrays.copyOf(dst, dp << 1);
+ }
+ return ret.with(dst, UTF16);
+ }
+
+ private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { // encodes val (interpreted per coder) as UTF-8; doReplace is only consulted on the UTF16 path
+ if (coder == UTF16)
+ return encodeUTF8_UTF16(val, doReplace);
+
+ if (!hasNegatives(val, 0, val.length)) // LATIN1 content with no negative bytes: the output is a copy of val
+ return Arrays.copyOf(val, val.length);
+
+ int dp = 0;
+ byte[] dst = new byte[val.length << 1]; // worst case: every LATIN1 byte expands to two bytes
+ for (int sp = 0; sp < val.length; sp++) {
+ byte c = val[sp];
+ if (c < 0) { // 0x80..0xff becomes a two-byte sequence
+ dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
+ dst[dp++] = (byte)(0x80 | (c & 0x3f));
+ } else {
+ dst[dp++] = c;
+ }
+ }
+ if (dp == dst.length)
+ return dst;
+ return Arrays.copyOf(dst, dp);
+ }
+
+ private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { // encodes UTF16 string bytes as UTF-8; an unpaired surrogate becomes '?' or, when doReplace is false, makes throwUnmappable() throw
+ int dp = 0;
+ int sp = 0;
+ int sl = val.length >> 1; // number of UTF16 chars
+ byte[] dst = new byte[sl * 3]; // sized for three output bytes per char
+ char c;
+ while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
+ // ascii fast loop;
+ dst[dp++] = (byte)c;
+ sp++;
+ }
+ while (sp < sl) {
+ c = StringUTF16.getChar(val, sp++);
+ if (c < 0x80) {
+ dst[dp++] = (byte)c;
+ } else if (c < 0x800) {
+ dst[dp++] = (byte)(0xc0 | (c >> 6));
+ dst[dp++] = (byte)(0x80 | (c & 0x3f));
+ } else if (Character.isSurrogate(c)) {
+ int uc = -1; // stays negative unless c starts a valid high/low pair
+ char c2;
+ if (Character.isHighSurrogate(c) && sp < sl &&
+ Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
+ uc = Character.toCodePoint(c, c2);
+ }
+ if (uc < 0) {
+ if (doReplace) {
+ dst[dp++] = '?';
+ } else {
+ throwUnmappable(sp - 1, 1); // or 2, does not matter here
+ }
+ } else {
+ dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
+ dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
+ dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
+ dst[dp++] = (byte)(0x80 | (uc & 0x3f));
+ sp++; // 2 chars
+ }
+ } else {
+ // 3 bytes, 16 bits
+ dst[dp++] = (byte)(0xe0 | ((c >> 12)));
+ dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
+ dst[dp++] = (byte)(0x80 | (c & 0x3f));
+ }
+ }
+ if (dp == dst.length) {
+ return dst;
+ }
+ return Arrays.copyOf(dst, dp);
+ }
+
+ ////////////////////// for j.u.z.ZipCoder //////////////////////////
+
+ /*
+  * Builds a String from the UTF-8 bytes src[off, off + len).
+  * Malformed or unmappable input raises an IllegalArgumentException
+  * rather than being replaced.
+  */
+ static String newStringUTF8NoRepl(byte[] src, int off, int len) {
+     boolean asciiOnly = COMPACT_STRINGS && !hasNegatives(src, off, len);
+     if (asciiOnly) {
+         // the requested range is copied and used as LATIN1 bytes
+         byte[] copy = Arrays.copyOfRange(src, off, off + len);
+         return new String(copy, LATIN1);
+     }
+     Result decoded = decodeUTF8_0(src, off, len, false);
+     return new String(decoded.value, decoded.coder);
+ }
+
+ /*
+  * Encodes s as UTF-8. Unmappable input raises an
+  * IllegalArgumentException rather than being replaced.
+  */
+ static byte[] getBytesUTF8NoRepl(String s) {
+     byte coder = s.coder();
+     byte[] value = s.value();
+     return encodeUTF8(coder, value, false);
+ }
+
+ ////////////////////// for j.n.f.Files //////////////////////////
+
+ /* Reports whether hasNegatives finds no negative byte anywhere in src. */
+ private static boolean isASCII(byte[] src) {
+     boolean anyNegative = hasNegatives(src, 0, src.length);
+     return !anyNegative;
+ }
+
+ /*
+  * Creates a String from Latin-1 bytes: src is inflated and used with
+  * the UTF16 coder when COMPACT_STRINGS is off, otherwise src is passed
+  * to the String constructor as LATIN1.
+  */
+ private static String newStringLatin1(byte[] src) {
+     if (!COMPACT_STRINGS) {
+         byte[] inflated = StringLatin1.inflate(src, 0, src.length);
+         return new String(inflated, UTF16);
+     }
+     return new String(src, LATIN1);
+ }
+
+ /*
+  * Decodes src using cs, reporting undecodable input as a checked
+  * CharacterCodingException instead of substituting replacement chars.
+  *
+  * Delegates to newStringNoRepl1, which signals coding failures as an
+  * IllegalArgumentException whose cause is the CharacterCodingException;
+  * that cause is unwrapped and rethrown here. An IllegalArgumentException
+  * that does not carry such a cause is propagated unchanged.
+  */
+ static String newStringNoRepl(byte[] src, Charset cs) throws CharacterCodingException {
+     try {
+         return newStringNoRepl1(src, cs);
+     } catch (IllegalArgumentException e) {
+         // newStringNoRepl1 throws IAE with MalformedInputException or CCE as the cause
+         Throwable cause = e.getCause();
+         if (cause instanceof CharacterCodingException) {
+             // MalformedInputException is a CharacterCodingException, so it is
+             // rethrown here with its runtime type intact.
+             throw (CharacterCodingException) cause;
+         }
+         // No coding exception attached: surface the original IAE rather than
+         // masking it with a NullPointerException or ClassCastException.
+         throw e;
      }
  }
- static byte[] encode(char[] ca, int off, int len) {
- String csn = Charset.defaultCharset().name();
- try {
- // use charset name encode() variant which provides caching.
- return encode(csn, ca, off, len);
- } catch (UnsupportedEncodingException x) {
- warnUnsupportedCharset(csn);
+ static String newStringNoRepl1(byte[] src, Charset cs) { // decodes src with cs; CharsetDecoder failures are rethrown as IllegalArgumentException (see the catch below)
+ if (cs == UTF_8) {
+ if (COMPACT_STRINGS && isASCII(src))
+ return new String(src, LATIN1); // ASCII-only input: src itself is handed to the String constructor
+ Result ret = decodeUTF8_0(src, 0, src.length, false); // NOTE(review): 'false' presumably selects throw-instead-of-replace -- confirm against decodeUTF8_0
+ return new String(ret.value, ret.coder);
}
- try {
- return encode("ISO-8859-1", ca, off, len);
- } catch (UnsupportedEncodingException x) {
- // If this code is hit during VM initialization, MessageUtils is
- // the only way we will be able to get any kind of error message.
- MessageUtils.err("ISO-8859-1 charset not available: "
- + x.toString());
- // If we can not find ISO-8859-1 (a required encoding) then things
- // are seriously wrong with the installation.
- System.exit(1);
- return null;
+ if (cs == ISO_8859_1) {
+ return newStringLatin1(src);
}
+ if (cs == US_ASCII) {
+ if (isASCII(src)) {
+ return newStringLatin1(src);
+ } else {
+ throwMalformed(src); // NOTE(review): presumably always throws; if it returned, control would fall through to the generic decoder path below
+ }
+ }
+
+ CharsetDecoder cd = cs.newDecoder();
+ // ascii fastpath
+ if ((cd instanceof ArrayDecoder) &&
+ ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) {
+ return newStringLatin1(src);
+ }
+ int len = src.length;
+ if (len == 0) {
+ return "";
+ }
+ int en = scale(len, cd.maxCharsPerByte()); // NOTE(review): presumably an upper bound on the decoded char count -- confirm against scale()
+ char[] ca = new char[en];
+ if (cs.getClass().getClassLoader0() != null && // charset class has a non-null class loader and a security manager is set:
+ System.getSecurityManager() != null) {
+ src = Arrays.copyOf(src, len); // the decoder is then given a private copy instead of the caller's array
+ }
+ ByteBuffer bb = ByteBuffer.wrap(src);
+ CharBuffer cb = CharBuffer.wrap(ca);
+ try {
+ CoderResult cr = cd.decode(bb, cb, true); // endOfInput == true: all of src is supplied in this single call
+ if (!cr.isUnderflow())
+ cr.throwException(); // any result other than underflow is raised as an exception
+ cr = cd.flush(cb);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ } catch (CharacterCodingException x) {
+ throw new IllegalArgumentException(x); // todo
+ }
+ Result ret = resultCached.get().with(ca, 0, cb.position()); // only the first cb.position() chars of ca are used
+ return new String(ret.value, ret.coder);
+ }
+
+ /*
+  * Encodes s using cs, reporting unencodable input as a checked
+  * CharacterCodingException instead of substituting replacement bytes.
+  *
+  * Delegates to getBytesNoRepl1, which signals coding failures as an
+  * IllegalArgumentException whose cause is the CharacterCodingException;
+  * that cause is unwrapped and rethrown here. An IllegalArgumentException
+  * that does not carry such a cause is propagated unchanged.
+  */
+ static byte[] getBytesNoRepl(String s, Charset cs) throws CharacterCodingException {
+     try {
+         return getBytesNoRepl1(s, cs);
+     } catch (IllegalArgumentException e) {
+         // getBytesNoRepl1 throws IAE with UnmappableCharacterException or CCE as the cause
+         Throwable cause = e.getCause();
+         if (cause instanceof CharacterCodingException) {
+             // UnmappableCharacterException is a CharacterCodingException, so it
+             // is rethrown here with its runtime type intact.
+             throw (CharacterCodingException) cause;
+         }
+         // No coding exception attached: surface the original IAE rather than
+         // masking it with a NullPointerException or ClassCastException.
+         throw e;
+     }
+ }
+
+ static byte[] getBytesNoRepl1(String s, Charset cs) {
+ byte[] val = s.value();
+ byte coder = s.coder();
+ if (cs == UTF_8) {
+ if (coder == LATIN1 && isASCII(val)) {
+ return val;
+ }
+ return encodeUTF8(coder, val, false);
+ }
+ if (cs == ISO_8859_1) {
+ if (coder == LATIN1) {
+ return val;
+ }
+ return encode8859_1(coder, val, false);
+ }
+ if (cs == US_ASCII) {
+ if (coder == LATIN1) {
+ if (isASCII(val)) {
+ return val;
+ } else {
+ throwUnmappable(val);
+ }
+ }
+ }
+ CharsetEncoder ce = cs.newEncoder();
+ // fastpath for ascii compatible
+ if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
+ ((ArrayEncoder)ce).isASCIICompatible() &&
+ isASCII(val)))) {
+ return val;
+ }
+ int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
+ int en = scale(len, ce.maxBytesPerChar());
+ byte[] ba = new byte[en];
+ if (len == 0) {
+ return ba;
+ }
+ if (ce instanceof ArrayEncoder) {
+ int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
+ : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
+ if (blen != -1) {
+ return safeTrim(ba, blen, true);
+ }
+ }
+ boolean isTrusted = cs.getClass().getClassLoader0() == null ||
+ System.getSecurityManager() == null;
+ char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
+ : StringUTF16.toChars(val);
+ ByteBuffer bb = ByteBuffer.wrap(ba);
+ CharBuffer cb = CharBuffer.wrap(ca, 0, len);
+ try {
+ CoderResult cr = ce.encode(cb, bb, true);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ cr = ce.flush(bb);
+ if (!cr.isUnderflow())
+ cr.throwException();
+ } catch (CharacterCodingException x) {
+ throw new IllegalArgumentException(x);
+ }
+ return safeTrim(ba, bb.position(), isTrusted);
}
}