package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attributes;

import static org.jsoup.internal.Normalizer.lowerCase;

/**
 * Parse tokens for the Tokeniser.
 */
abstract class Token { TokenType type; private Token() { } String tokenType() { return this.getClass().getSimpleName(); }
Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every piece of data, which immediately get GCed.
/** * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every * piece of data, which immediately get GCed. */
abstract Token reset(); static void reset(StringBuilder sb) { if (sb != null) { sb.delete(0, sb.length()); } } static final class Doctype extends Token { final StringBuilder name = new StringBuilder(); String pubSysKey = null; final StringBuilder publicIdentifier = new StringBuilder(); final StringBuilder systemIdentifier = new StringBuilder(); boolean forceQuirks = false; Doctype() { type = TokenType.Doctype; } @Override Token reset() { reset(name); pubSysKey = null; reset(publicIdentifier); reset(systemIdentifier); forceQuirks = false; return this; } String getName() { return name.toString(); } String getPubSysKey() { return pubSysKey; } String getPublicIdentifier() { return publicIdentifier.toString(); } public String getSystemIdentifier() { return systemIdentifier.toString(); } public boolean isForceQuirks() { return forceQuirks; } } static abstract class Tag extends Token { protected String tagName; protected String normalName; // lc version of tag name, for case insensitive tree build private String pendingAttributeName; // attribute names are generally caught in one hop, not accumulated private StringBuilder pendingAttributeValue = new StringBuilder(); // but values are accumulated, from e.g. & in hrefs private String pendingAttributeValueS; // try to get attr vals in one shot, vs Builder private boolean hasEmptyAttributeValue = false; // distinguish boolean attribute from empty string value private boolean hasPendingAttributeValue = false; boolean selfClosing = false; Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used). 
@Override Tag reset() { tagName = null; normalName = null; pendingAttributeName = null; reset(pendingAttributeValue); pendingAttributeValueS = null; hasEmptyAttributeValue = false; hasPendingAttributeValue = false; selfClosing = false; attributes = null; return this; } final void newAttribute() { if (attributes == null) attributes = new Attributes(); if (pendingAttributeName != null) { // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here pendingAttributeName = pendingAttributeName.trim(); if (pendingAttributeName.length() > 0) { String value; if (hasPendingAttributeValue) value = pendingAttributeValue.length() > 0 ? pendingAttributeValue.toString() : pendingAttributeValueS; else if (hasEmptyAttributeValue) value = ""; else value = null; attributes.put(pendingAttributeName, value); } } pendingAttributeName = null; hasEmptyAttributeValue = false; hasPendingAttributeValue = false; reset(pendingAttributeValue); pendingAttributeValueS = null; } final void finaliseTag() { // finalises for emit if (pendingAttributeName != null) { // todo: check if attribute name exists; if so, drop and error newAttribute(); } }
Preserves case
/** Preserves case */
final String name() { // preserves case, for input into Tag.valueOf (which may drop case) Validate.isFalse(tagName == null || tagName.length() == 0); return tagName; }
Lower case
/** Lower case */
final String normalName() { // lower case, used in tree building for working out where in tree it should go return normalName; } final Tag name(String name) { tagName = name; normalName = lowerCase(name); return this; } final boolean isSelfClosing() { return selfClosing; } @SuppressWarnings({"TypeMayBeWeakened"}) final Attributes getAttributes() { return attributes; } // these appenders are rarely hit in not null state-- caused by null chars. final void appendTagName(String append) { tagName = tagName == null ? append : tagName.concat(append); normalName = lowerCase(tagName); } final void appendTagName(char append) { appendTagName(String.valueOf(append)); } final void appendAttributeName(String append) { pendingAttributeName = pendingAttributeName == null ? append : pendingAttributeName.concat(append); } final void appendAttributeName(char append) { appendAttributeName(String.valueOf(append)); } final void appendAttributeValue(String append) { ensureAttributeValue(); if (pendingAttributeValue.length() == 0) { pendingAttributeValueS = append; } else { pendingAttributeValue.append(append); } } final void appendAttributeValue(char append) { ensureAttributeValue(); pendingAttributeValue.append(append); } final void appendAttributeValue(char[] append) { ensureAttributeValue(); pendingAttributeValue.append(append); } final void appendAttributeValue(int[] appendCodepoints) { ensureAttributeValue(); for (int codepoint : appendCodepoints) { pendingAttributeValue.appendCodePoint(codepoint); } } final void setEmptyAttributeValue() { hasEmptyAttributeValue = true; } private void ensureAttributeValue() { hasPendingAttributeValue = true; // if on second hit, we'll need to move to the builder if (pendingAttributeValueS != null) { pendingAttributeValue.append(pendingAttributeValueS); pendingAttributeValueS = null; } } } final static class StartTag extends Tag { StartTag() { super(); attributes = new Attributes(); type = TokenType.StartTag; } @Override Tag reset() { super.reset(); 
attributes = new Attributes(); // todo - would prefer these to be null, but need to check Element assertions return this; } StartTag nameAttr(String name, Attributes attributes) { this.tagName = name; this.attributes = attributes; normalName = lowerCase(tagName); return this; } @Override public String toString() { if (attributes != null && attributes.size() > 0) return "<" + name() + " " + attributes.toString() + ">"; else return "<" + name() + ">"; } } final static class EndTag extends Tag{ EndTag() { super(); type = TokenType.EndTag; } @Override public String toString() { return "</" + name() + ">"; } } final static class Comment extends Token { final StringBuilder data = new StringBuilder(); boolean bogus = false; @Override Token reset() { reset(data); bogus = false; return this; } Comment() { type = TokenType.Comment; } String getData() { return data.toString(); } @Override public String toString() { return "<!--" + getData() + "-->"; } } static class Character extends Token { private String data; Character() { super(); type = TokenType.Character; } @Override Token reset() { data = null; return this; } Character data(String data) { this.data = data; return this; } String getData() { return data; } @Override public String toString() { return getData(); } } final static class CData extends Character { CData(String data) { super(); this.data(data); } @Override public String toString() { return "<![CDATA[" + getData() + "]]>"; } } final static class EOF extends Token { EOF() { type = Token.TokenType.EOF; } @Override Token reset() { return this; } } final boolean isDoctype() { return type == TokenType.Doctype; } final Doctype asDoctype() { return (Doctype) this; } final boolean isStartTag() { return type == TokenType.StartTag; } final StartTag asStartTag() { return (StartTag) this; } final boolean isEndTag() { return type == TokenType.EndTag; } final EndTag asEndTag() { return (EndTag) this; } final boolean isComment() { return type == TokenType.Comment; } final 
Comment asComment() { return (Comment) this; } final boolean isCharacter() { return type == TokenType.Character; } final boolean isCData() { return this instanceof CData; } final Character asCharacter() { return (Character) this; } final boolean isEOF() { return type == TokenType.EOF; } public enum TokenType { Doctype, StartTag, EndTag, Comment, Character, // note no CData - treated in builder as an extension of Character EOF } }