/* Copyright 2002-2006 Elliotte Rusty Harold
   
   This library is free software; you can redistribute it and/or modify
   it under the terms of version 2.1 of the GNU Lesser General Public 
   License as published by the Free Software Foundation.
   
   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
   GNU Lesser General Public License for more details.
   
   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the 
   Free Software Foundation, Inc., 59 Temple Place, Suite 330, 
   Boston, MA 02111-1307  USA
   
   You can contact Elliotte Rusty Harold by sending e-mail to
   elharo@ibiblio.org. Please include the word "XOM" in the
   subject line. The XOM home page is located at http://www.xom.nu/
*/

package nu.xom;

import java.io.IOException;
import java.io.Writer;

This class is responsible for writing strings with the necessary escaping for their context.

Author:Elliotte Rusty Harold
Version:1.2d1
/**
 * <p>
 *   Writes strings to an underlying <code>Writer</code>, applying
 *   whatever escaping is required by the context (element content,
 *   attribute value, or markup) in which each string appears.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.2d1
 *
 */
abstract class TextWriter {

    /** The destination all output is written to. */
    protected final Writer out;
    /** The encoding name supplied at construction; reported in
     *  <code>UnavailableCharacterException</code>s. */
    protected final String encoding;

    private String lineSeparator = "\r\n";
    // true if the user has requested a specific
    // line separator
    boolean lineSeparatorSet = false;
    private boolean inDocType = false;

    // A value of 0 means "no wrapping" / "no indenting" respectively
    private int     maxLength = 0;
    private int     indent = 0;
    private String  indentString = "";
    protected int   column = 0;
    // Is an xml:space="preserve" attribute in scope?
    private boolean preserveSpace = false;
    protected boolean normalize = false;


    /**
     * Creates a writer that sends its output to <code>out</code>.
     *
     * @param out the destination for all output
     * @param encoding the encoding name associated with this writer
     */
    protected TextWriter(Writer out, String encoding) {
        this.out = out;
        this.encoding = encoding;
    }


    /**
     * Clears the per-document position state: the column counter,
     * the count of suppressed indents, and the white space and
     * linefeed tracking flags.
     */
    void reset() {
        column = 0;
        fakeIndents = 0;
        lastCharacterWasSpace = false;
        skipFollowingLinefeed = false;
    }


    /** True if the most recently written character counted as white space. */
    protected boolean lastCharacterWasSpace = false;
Indicates whether a linefeed is just half of a \r\n pair used for a line break.
/** * Indicates whether a linefeed is just half of a \r\n pair * used for a line break. */
protected boolean skipFollowingLinefeed = false; // Needed for memory between calls. private char highSurrogate; private boolean isHighSurrogate(int c) { return c >= 0xD800 && c <= 0xDBFF; } private boolean isLowSurrogate(int c) { return c >= 0xDC00 && c <= 0xDFFF; } final void writePCDATA(char c) throws IOException { switch(c) { case '\r': if (!adjustingWhiteSpace() && !lineSeparatorSet) { out.write("&#x0D;"); column += 6; justBroke=false; } else { breakLine(); lastCharacterWasSpace = true; } skipFollowingLinefeed = true; break; case 14: // unreachable case 15: // unreachable case 16: // unreachable case 17: // unreachable case 18: // unreachable case 19: // unreachable case 20: // unreachable case 21: // unreachable case 22: // unreachable case 23: // unreachable case 24: // unreachable case 25: // unreachable case 26: // unreachable case 27: // unreachable case 28: // unreachable case 29: // unreachable case 30: // unreachable case 31: // unreachable throw new XMLException("Bad character snuck into document"); case ' ': write(c); break; case '!': write(c); break; case '"': write(c); break; case '#': write(c); break; case '$': write(c); break; case '%': write(c); break; case '&': out.write("&amp;"); column += 5; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; break; case '\'': write(c); break; case '(': write(c); break; case ')': write(c); break; case '*': write(c); break; case '+': write(c); break; case ',': write(c); break; case '-': write(c); break; case '.': write(c); break; case '/': write(c); break; case '0': write(c); break; case '1': write(c); break; case '2': write(c); break; case '3': write(c); break; case '4': write(c); break; case '5': write(c); break; case '6': write(c); break; case '7': write(c); break; case '8': write(c); break; case '9': write(c); break; case ':': write(c); break; case ';': write(c); break; case '<': out.write("&lt;"); column += 4; lastCharacterWasSpace = false; skipFollowingLinefeed = false; 
justBroke = false; break; case '=': write(c); break; case '>': out.write("&gt;"); column += 4; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; break; default: if (needsEscaping(c)) writeEscapedChar(c); else write(c); } } private void writeEscapedChar(char c) throws IOException { if (isHighSurrogate(c)) { //store and wait for low half highSurrogate = c; } else if (isLowSurrogate(c)) { // decode and write entity reference // I am assuming here that nothing allows the // text to be created with a malformed surrogate // pair such as a low surrogate that is not immediately // preceded by a high surrogate int uchar = UnicodeUtil.combineSurrogatePair(highSurrogate, c); String s = "&#x" + Integer.toHexString(uchar).toUpperCase() + ';'; out.write(s); column += s.length(); lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; } else { String s = "&#x" + Integer.toHexString(c).toUpperCase() + ';'; out.write(s); column += s.length(); lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke=false; } } private boolean adjustingWhiteSpace() { return maxLength > 0 || indent > 0; } // This is the same as writePCDATA except that it // also needs to escape " as &quot; and tab as "&#x09;". // I'm not escaping the single quote because Serializer // always uses double quotes to contain // values. final void writeAttributeValue(char c) throws IOException { switch(c) { // Handle white space that the parser might normalize // on roundtrip. We only escape them if the serializer // is not adjusting white space; that is indent is 0 // and maxLength is 0. 
case '\t': if (!adjustingWhiteSpace()) { out.write("&#x09;"); column += 6; lastCharacterWasSpace = true; skipFollowingLinefeed = false; justBroke=false; } else { write(' '); } break; case '\n': if (skipFollowingLinefeed) { skipFollowingLinefeed = false; return; } else if (adjustingWhiteSpace()) { out.write(" "); lastCharacterWasSpace = true; justBroke=false; } else { if (lineSeparatorSet) { escapeBreakLine(); } else { out.write("&#x0A;"); column += 6; justBroke=false; } lastCharacterWasSpace = true; } break; case 11: // unreachable case 12: // unreachable throw new XMLException("Bad character snuck into document"); case '\r': if (adjustingWhiteSpace()) { out.write(" "); lastCharacterWasSpace = true; skipFollowingLinefeed = true; justBroke=false; } else { if (lineSeparatorSet) { escapeBreakLine(); skipFollowingLinefeed = true; } else { out.write("&#x0D;"); column += 6; justBroke=false; } } break; case 14: // unreachable case 15: // unreachable case 16: // unreachable case 17: // unreachable case 18: // unreachable case 19: // unreachable case 20: // unreachable case 21: // unreachable case 22: // unreachable case 23: // unreachable case 24: // unreachable case 25: // unreachable case 26: // unreachable case 27: // unreachable case 28: // unreachable case 29: // unreachable case 30: // unreachable case 31: // unreachable throw new XMLException("Bad character snuck into document"); case ' ': write(c); break; case '!': write(c); break; case '"': out.write("&quot;"); column += 6; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke=false; break; case '#': write(c); break; case '$': write(c); break; case '%': write(c); break; case '&': out.write("&amp;"); column += 5; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; break; case '\'': write(c); break; case '(': write(c); break; case ')': write(c); break; case '*': write(c); break; case '+': write(c); break; case ',': write(c); break; case '-': write(c); break; case '.': 
write(c); break; case '/': write(c); break; case '0': write(c); break; case '1': write(c); break; case '2': write(c); break; case '3': write(c); break; case '4': write(c); break; case '5': write(c); break; case '6': write(c); break; case '7': write(c); break; case '8': write(c); break; case '9': write(c); break; case ':': write(c); break; case ';': write(c); break; case '<': out.write("&lt;"); column += 4; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; break; case '=': write(c); break; case '>': out.write("&gt;"); column += 4; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke = false; break; default: if (needsEscaping(c)) writeEscapedChar(c); else write(c); } } // XXX We might be able to optimize this by using switch statements // in the methods that call this to separate out the special cases. // --\n, \t, space, etc.--and passing them to a different method // thus avoiding the if tests here. See if this method shows up as // a HotSpot in profiling. void write(char c) throws IOException { // Carriage returns are completely handled by // writePCDATA and writeAttributeValue. They never // enter this method. 
if ((c == ' ' || c == '\n' || c == '\t')) { if (needsBreak()) { breakLine(); skipFollowingLinefeed = false; } else if (preserveSpace || (indent <= 0 && maxLength <= 0)) { // We're neither indenting nor wrapping // so we need to preserve white space if (c == ' ' || c == '\t') { out.write(c); skipFollowingLinefeed = false; column++; justBroke=false; } else { // (c == '\n') if (!lineSeparatorSet || !skipFollowingLinefeed) { writeLineSeparator(c); } skipFollowingLinefeed = false; column = 0; } } else if (!lastCharacterWasSpace) { out.write(' '); column++; skipFollowingLinefeed = false; justBroke=false; } lastCharacterWasSpace = true; } else { out.write(c); // don't increment column for high surrogate, only low surrogate if (c < 0xd800 || c > 0xDBFF) column++; lastCharacterWasSpace = false; skipFollowingLinefeed = false; justBroke=false; } } private void writeLineSeparator(char c) throws IOException { if (!inDocType && (!lineSeparatorSet || preserveSpace)) out.write(c); else if (lineSeparator.equals("\r\n")) { out.write("\r\n"); } else if (lineSeparator.equals("\n")) { out.write('\n'); } else { // lineSeparator.equals("\r")) out.write('\r'); } // Remember, there are only three possible line separators } private boolean needsBreak() { if (maxLength <= 0 || preserveSpace) return false; // Better algorithm needed: Should look ahead in the // stream, see if there's a white space character // between here and the maxLength, Then again, simple is good. 
// Here we just assume there's probably space somewhere // within the next ten characters return column >= maxLength - 10; } protected boolean justBroke = false; boolean justBroke() { return justBroke; } final void breakLine() throws IOException { out.write(lineSeparator); out.write(indentString); column = indentString.length(); lastCharacterWasSpace = true; justBroke = true; } private final void escapeBreakLine() throws IOException { if ("\n".equals(lineSeparator)) { out.write("&#x0A;"); column += 6; } else if ("\r\n".equals(lineSeparator)) { out.write("&#x0D;&#x0A;"); column += 12; } else { out.write("&#x0D;"); column += 6; } lastCharacterWasSpace = true; } // Note that when this method is called directly, then // normalization is not performed on c. Currently this is // only called for ASCII characters like <, >, and the space, // which should be OK final void writeMarkup(char c) throws IOException { if (needsEscaping(c)) { throw new UnavailableCharacterException(c, encoding); } write(c); } // XXX should we have a special package protected // method to be used only for ASCII characters we know don't need escaping or // normalization such as <, /, A-Z, etc.? 
void writePCDATA(String s) throws IOException { s = normalize(s); int length = s.length(); for (int i=0; i < length; i++) { writePCDATA(s.charAt(i)); } } void writeAttributeValue(String s) throws IOException { s = normalize(s); int length = s.length(); for (int i=0; i < length; i++) { writeAttributeValue(s.charAt(i)); } } void writeMarkup(String s) throws IOException { s = normalize(s); int length = s.length(); for (int i=0; i < length; i++) { writeMarkup(s.charAt(i)); } } // This is for ASCII characters like < and = we know are // available in all encodings and do not need to be normalized void writeUncheckedMarkup(String s) throws IOException { int length = s.length(); for (int i=0; i < length; i++) { write(s.charAt(i)); } } protected String normalize(String s) { if (normalize) { return UnicodeUtil.normalize(s); } return s; } boolean isIndenting() { return indentString.length() > 0; } private int fakeIndents = 0; private final static String _128_SPACES=" "; private final static int _128 = 128; void incrementIndent() { if (indent == 0) return; String newIndent; int length = indentString.length() + indent; if (indentString.length() + indent < _128) { newIndent = _128_SPACES.substring(0, length); } else { StringBuffer sb = new StringBuffer(length); sb.append(_128_SPACES); for (int i = _128; i < length; i++) { sb.append(' '); } newIndent = sb.toString(); } // limit maximum indent to half of maximum line length if (maxLength > 0 && newIndent.length() > maxLength / 2) { fakeIndents++; } else this.indentString = newIndent; } void decrementIndent() { if (indent == 0) return; else if (fakeIndents > 0) fakeIndents--; else { indentString = indentString.substring( 0, indentString.length()-indent ); } } String getEncoding() { return this.encoding; }

Returns the String used as a line separator. This is always "\n", "\r", or "\r\n".

Returns:the line separator
/**
 * <p>
 *   Reports the string currently written at each line break.
 *   It is always one of "\n", "\r", or "\r\n".
 * </p>
 *
 * @return the line separator in use
 */
String getLineSeparator() {
    return this.lineSeparator;
}

Sets the lineSeparator. This can only be one of the three strings "\n", "\r", or "\r\n". All other values are forbidden.

Params:
  • lineSeparator – the lineSeparator to set
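Throws:
  • IllegalArgumentException – if you attempt to use any line separator other than "\n", "\r", or "\r\n"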
/**
 * <p>
 *   Chooses the string written at each line break and records
 *   that the choice was made explicitly. Only "\n", "\r", and
 *   "\r\n" are accepted.
 * </p>
 *
 * @param lineSeparator the line separator to use
 *
 * @throws IllegalArgumentException if the argument is anything
 *     other than "\n", "\r", or "\r\n"
 */
void setLineSeparator(String lineSeparator) {

    boolean legal = lineSeparator.equals("\n")
      || lineSeparator.equals("\r")
      || lineSeparator.equals("\r\n");
    if (!legal) {
        throw new IllegalArgumentException("Illegal Line Separator");
    }

    this.lineSeparator = lineSeparator;
    this.lineSeparatorSet = true;

}


/**
 * Records whether output is currently inside the document type
 * declaration.
 *
 * @param inDocType true while the DOCTYPE is being written
 */
void setInDocType(boolean inDocType) {
    this.inDocType = inDocType;
}

Returns the number of spaces this serializer indents.

Returns:the number of spaces this serializer indents
/**
 * <p>
 *   Reports how many spaces are added per level of indentation.
 * </p>
 *
 * @return the number of spaces this serializer indents
 */
int getIndent() {
    return this.indent;
}

Returns the maximum line length.

Returns:the maximum line length.
/**
 * <p>
 *   Reports the suggested maximum line length.
 * </p>
 *
 * @return the maximum line length
 */
int getMaxLength() {
    return this.maxLength;
}

Sets the suggested maximum line length for this serializer. In some circumstances this may not be respected.

Params:
  • maxLength – the maxLength to set
/**
 * <p>
 *   Sets the suggested maximum line length for this serializer.
 *   In some circumstances this may not be respected.
 *   A negative argument is stored as 0.
 * </p>
 *
 * @param maxLength the suggested maximum line length
 */
void setMaxLength(int maxLength) {
    this.maxLength = Math.max(maxLength, 0);
}

Sets the number of spaces to indent each successive level in the hierarchy. Use 0 for no extra indenting.

Params:
  • indent – the indent to set
/**
 * <p>
 *   Sets how many spaces each successive level of the hierarchy
 *   is indented by. Use 0 for no extra indenting. The value is
 *   stored as given; no range check is made here.
 * </p>
 *
 * @param indent the number of spaces per level
 */
void setIndent(int indent) {
    this.indent = indent;
}


/**
 * Flushes the underlying writer.
 *
 * @throws IOException if the underlying writer fails to flush
 */
void flush() throws IOException {
    this.out.flush();
}


/**
 * Decides whether a character may be written as itself. Where
 * this returns true, content characters are written as character
 * references and markup characters cause an
 * <code>UnavailableCharacterException</code>.
 *
 * @param c the character to test
 *
 * @return true if c must not be written literally
 */
abstract boolean needsEscaping(char c);

Used to track the current status of xml:space. This is false by default, unless an xml:space="preserve" attribute is in-scope. When such an attribute is in-scope, white space is not adjusted even if indenting and/or a maximum line length has been requested.

Returns: true if an xml:space="preserve" attribute is in-scope
/**
 * <p>
 *   Reports the current status of xml:space.
 *   This is false by default, and true only while an
 *   <code>xml:space="preserve"</code> attribute is in-scope.
 *   While it is true, white space is not adjusted even if
 *   indenting and/or a maximum line length has been requested.
 * </p>
 *
 * @return true if an <code>xml:space="preserve"</code> attribute
 *     is in-scope
 */
boolean isPreserveSpace() {
    return this.preserveSpace;
}
Params:
  • preserveSpace – whether to preserve all white space
/**
 * Turns white space preservation on or off.
 *
 * @param preserveSpace whether to preserve all white space
 */
void setPreserveSpace(boolean preserveSpace) {
    this.preserveSpace = preserveSpace;
}
Returns:the current column number
/**
 * Reports the value of the column counter.
 *
 * @return the current column number
 */
int getColumnNumber() {
    return column;
}

If true, this property indicates serialization will perform Unicode normalization on all data using normalization form C (NFC). Performing Unicode normalization does change the document's infoset. The default is false; do not normalize.

This feature has not yet been benchmarked or optimized. It may result in substantially slower code.

Params:
  • normalize – true if normalization is performed; false if it isn't.
/**
 * <p>
 *   Turns Unicode normalization on or off. When on, all data
 *   is converted to normalization form C (NFC) as it is
 *   serialized. Performing Unicode normalization
 *   does change the document's infoset.
 *   The default is false; do not normalize.
 * </p>
 *
 * <p>
 *   This feature has not yet been benchmarked or optimized.
 *   It may result in substantially slower code.
 * </p>
 *
 * @param normalize true if normalization is performed;
 *     false if it isn't
 */
void setNFC(boolean normalize) {
    this.normalize = normalize;
}

If true, this property indicates serialization will perform Unicode normalization on all data using normalization form C (NFC). The default is false; do not normalize.

Returns:true if this serialization performs Unicode normalization; false if it doesn't.
/**
 * <p>
 *   Reports whether serialization performs Unicode normalization
 *   on all data using normalization form C (NFC).
 *   The default is false; do not normalize.
 * </p>
 *
 * @return true if this serialization performs Unicode
 *     normalization; false if it doesn't
 */
boolean getNFC() {
    return normalize;
}


/**
 * Writes a name by passing it to writeMarkup(String).
 *
 * @param name the name to write
 *
 * @throws IOException if the underlying writer fails
 */
void writeName(String name) throws IOException {
    this.writeMarkup(name);
}

}