package org.jsoup.parser;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Entities;
import java.util.Arrays;
final class Tokeniser {
/** Code point emitted in place of a numeric character reference that is out of range or unparseable. */
static final char replacementChar = '\uFFFD';

// Characters that, when they directly follow an '&', mean no character reference is present.
// Sorted in the static initializer below because it is handed to reader.matchesAnySorted(...).
private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};

/** First numeric reference value that is remapped through {@link #win1252Extensions}. */
static final int win1252ExtensionsStart = 0x80;

/**
 * Replacement code points for numeric references in the range
 * {@code win1252ExtensionsStart .. win1252ExtensionsStart + length - 1} (0x80..0x9F);
 * indexed by {@code value - win1252ExtensionsStart}.
 */
static final int[] win1252Extensions = new int[] {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
};

static {
    // The literal above is written for readability, not in sorted order.
    Arrays.sort(notCharRefCharsSorted);
}
private final CharacterReader reader;
private final ParseErrorList errors;
private TokeniserState state = TokeniserState.Data;
private Token emitPending;
private boolean isEmitPending = false;
private String = null;
private StringBuilder charsBuilder = new StringBuilder(1024);
StringBuilder dataBuffer = new StringBuilder(1024);
Token.Tag tagPending;
Token.StartTag startPending = new Token.StartTag();
Token.EndTag endPending = new Token.EndTag();
Token.Character charPending = new Token.Character();
Token.Doctype doctypePending = new Token.Doctype();
Token.Comment = new Token.Comment();
private String lastStartTag;
/**
 * Creates a tokeniser over the supplied input.
 *
 * @param reader the character source to tokenise
 * @param errors the list that tokenisation errors are added to
 */
Tokeniser(CharacterReader reader, ParseErrorList errors) {
    this.errors = errors;
    this.reader = reader;
}
/**
 * Drives the state machine until a token has been emitted, then returns the next token.
 * <p>
 * Buffered character data is returned first. In that case {@code isEmitPending} is left set,
 * so the pending non-character token is handed out by a subsequent call.
 *
 * @return the buffered character token if there is one, otherwise the pending token
 */
Token read() {
    while (!isEmitPending) {
        state.read(this, reader);
    }

    final StringBuilder buffered = charsBuilder;
    if (buffered.length() != 0) {
        // Several strings were emitted: flush the accumulated text as one character token.
        final String text = buffered.toString();
        buffered.setLength(0);
        charsString = null;
        return charPending.data(text);
    }

    if (charsString != null) {
        // Exactly one string was emitted, so the builder was never needed.
        final Token chars = charPending.data(charsString);
        charsString = null;
        return chars;
    }

    isEmitPending = false;
    return emitPending;
}
/**
 * Stores a non-character token to be returned by the next {@code read()}.
 * <p>
 * For a start tag, its name is remembered in {@code lastStartTag}. For an end tag that carries
 * attributes, a parse error is reported.
 *
 * @param token the token to emit; fails validation if a previously emitted token is still unread
 */
void emit(Token token) {
    Validate.isFalse(isEmitPending, "There is an unread token pending!");

    emitPending = token;
    isEmitPending = true;

    if (token.type == Token.TokenType.StartTag) {
        lastStartTag = ((Token.StartTag) token).tagName;
    } else if (token.type == Token.TokenType.EndTag && ((Token.EndTag) token).attributes != null) {
        error("Attributes incorrectly present on end tag");
    }
}
/**
 * Buffers character data until the next {@code read()}.
 * <p>
 * The first string is only remembered in {@code charsString}. Once a second string arrives,
 * the first is copied into {@code charsBuilder} and all further strings are appended there.
 *
 * @param str the character data to buffer
 */
void emit(final String str) {
    if (charsString == null) {
        charsString = str;
        return;
    }

    // An empty builder means the first string has not been moved over yet.
    if (charsBuilder.length() == 0) {
        charsBuilder.append(charsString);
    }
    charsBuilder.append(str);
}
/**
 * Buffers the given characters as character data.
 *
 * @param chars the characters to emit
 */
void emit(char[] chars) {
    emit(new String(chars));
}
/**
 * Buffers the given Unicode code points as character data.
 *
 * @param codepoints the code points to emit; the whole array is used
 */
void emit(int[] codepoints) {
    final int count = codepoints.length;
    emit(new String(codepoints, 0, count));
}
/**
 * Buffers a single character as character data.
 *
 * @param c the character to emit
 */
void emit(char c) {
    emit(Character.toString(c));
}
/**
 * @return the state the tokeniser is currently in
 */
TokeniserState getState() {
    return this.state;
}
/**
 * Switches the tokeniser to the given state without consuming any input.
 *
 * @param state the state to move to
 */
void transition(TokeniserState state) {
this.state = state;
}
/**
 * Advances the reader, then switches the tokeniser to the given state.
 *
 * @param state the state to move to
 */
void advanceTransition(TokeniserState state) {
    reader.advance();
    transition(state);
}
// Reusable result buffers, so decoding a reference does not allocate a new array per call.
private final int[] codepointHolder = new int[1];
private final int[] multipointHolder = new int[2];

/**
 * Tries to decode a character reference at the reader's current position (the caller has already
 * consumed the leading {@code '&'}).
 * <p>
 * The returned array is one of two internal buffers that are reused by every call, so the caller
 * must read the values before invoking this method again.
 *
 * @param additionalAllowedCharacter if non-null and equal to the current character, no reference is consumed
 * @param inAttribute whether the reference occurs inside an attribute value
 * @return one or two code points, or {@code null} if no reference was consumed (the reader is then
 *         left at, or rewound to, the position it had on entry)
 */
int[] consumeCharacterReference(Character additionalAllowedCharacter, boolean inAttribute) {
    if (reader.isEmpty()) {
        return null;
    }
    if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) {
        return null;
    }
    if (reader.matchesAnySorted(notCharRefCharsSorted)) {
        return null;
    }

    // Remember the start so a failed match can be undone with rewindToMark().
    reader.mark();
    return reader.matchConsume("#")
        ? consumeNumericReference()
        : consumeNamedReference(inAttribute);
}

/** Decodes a decimal or hexadecimal numeric reference; the leading '#' has already been consumed. */
private int[] consumeNumericReference() {
    final int[] codeRef = codepointHolder;
    boolean isHexMode = reader.matchConsumeIgnoreCase("X");
    String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
    if (numRef.length() == 0) {
        characterReferenceError("numeric reference with no numerals");
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }

    int charval = -1;
    try {
        // parseInt rather than valueOf: the result is used as a primitive, so boxing is wasted work.
        charval = Integer.parseInt(numRef, isHexMode ? 16 : 10);
    } catch (NumberFormatException ignored) {
        // Deliberately ignored (e.g. more digits than fit in an int): charval stays -1 and is
        // reported as out of range just below.
    }

    // Unparseable, surrogate range, or beyond the last Unicode code point.
    if (charval == -1 || (charval >= 0xD800 && charval <= 0xDFFF) || charval > 0x10FFFF) {
        characterReferenceError("character outside of valid range");
        codeRef[0] = replacementChar;
        return codeRef;
    }

    if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
        characterReferenceError("character is not a valid unicode code point");
        charval = win1252Extensions[charval - win1252ExtensionsStart];
    }
    codeRef[0] = charval;
    return codeRef;
}

/** Decodes a named reference such as {@code amp;}; rewinds the reader and returns null if the name is not accepted. */
private int[] consumeNamedReference(boolean inAttribute) {
    String nameRef = reader.consumeLetterThenDigitSequence();
    boolean looksLegit = reader.matches(';');
    // Base entities are accepted without a trailing ';'; all other named entities require it.
    boolean found = Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit);
    if (!found) {
        reader.rewindToMark();
        if (looksLegit) {
            characterReferenceError(String.format("invalid named reference '%s'", nameRef));
        }
        return null;
    }

    // Inside an attribute, a name followed by a letter, digit, '=', '-' or '_' is left as literal text.
    if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
        reader.rewindToMark();
        return null;
    }
    if (!reader.matchConsume(";")) {
        characterReferenceError("missing semicolon");
    }

    int numChars = Entities.codepointsForName(nameRef, multipointHolder);
    if (numChars == 1) {
        codepointHolder[0] = multipointHolder[0];
        return codepointHolder;
    }
    if (numChars != 2) {
        Validate.fail("Unexpected characters returned for " + nameRef);
    }
    return multipointHolder;
}
/**
 * Resets the reusable start or end tag token and makes it the pending tag.
 *
 * @param start {@code true} to use the start tag token, {@code false} for the end tag token
 * @return the token now held in {@code tagPending}
 */
Token.Tag createTagPending(boolean start) {
    if (start) {
        tagPending = startPending.reset();
    } else {
        tagPending = endPending.reset();
    }
    return tagPending;
}
/**
 * Finalises the pending tag and emits it.
 */
void emitTagPending() {
    final Token.Tag pending = tagPending;
    pending.finaliseTag();
    emit(pending);
}
void () {
commentPending.reset();
}
void () {
emit(commentPending);
}
/**
 * Calls {@code reset()} on the reusable {@code doctypePending} token.
 */
void createDoctypePending() {
    this.doctypePending.reset();
}
/**
 * Emits the reusable {@code doctypePending} token.
 */
void emitDoctypePending() {
    emit(this.doctypePending);
}
/**
 * Passes {@code dataBuffer} to {@code Token.reset} so it can be reused as a scratch buffer.
 */
void createTempBuffer() {
    Token.reset(this.dataBuffer);
}
/**
 * Checks whether the pending tag matches the most recently emitted start tag.
 *
 * @return {@code true} if a start tag has been emitted and the pending tag's name equals it,
 *         ignoring case; {@code false} otherwise
 */
boolean isAppropriateEndTagToken() {
    if (lastStartTag == null) {
        return false;
    }
    return tagPending.name().equalsIgnoreCase(lastStartTag);
}
/**
 * @return the name of the most recently emitted start tag, or {@code null} if none has been emitted
 */
String appropriateEndTagName() {
    return this.lastStartTag;
}
/**
 * Records an "unexpected character" error at the reader's current position, provided the
 * error list still accepts errors.
 *
 * @param state the state reported in the error message
 */
void error(TokeniserState state) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(
        reader.pos(),
        "Unexpected character '%s' in input state [%s]",
        reader.current(),
        state));
}
/**
 * Records an "unexpected end of file" error at the reader's current position, provided the
 * error list still accepts errors.
 *
 * @param state the state reported in the error message
 */
void eofError(TokeniserState state) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(
        reader.pos(),
        "Unexpectedly reached end of file (EOF) in input state [%s]",
        state));
}
/**
 * Records an invalid-character-reference error at the reader's current position, provided the
 * error list still accepts errors.
 *
 * @param message detail appended to the "Invalid character reference" prefix
 */
private void characterReferenceError(String message) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(
        reader.pos(),
        "Invalid character reference: %s",
        message));
}
/**
 * Records an error with the given message at the reader's current position, provided the
 * error list still accepts errors.
 *
 * @param errorMsg the message to record
 */
void error(String errorMsg) {
    if (!errors.canAddError()) {
        return;
    }
    errors.add(new ParseError(reader.pos(), errorMsg));
}
/**
 * Reports whether the current node is in the HTML namespace.
 *
 * @return always {@code true} in this implementation
 */
// NOTE(review): hard-coded result; presumably a placeholder until namespaces are tracked — confirm.
boolean currentNodeInHtmlNS() {
return true;
}
/**
 * Consumes the rest of the reader and returns its content with character references decoded.
 * An {@code '&'} that does not start a decodable reference is kept as a literal ampersand.
 *
 * @param inAttribute whether the text is an attribute value; passed on to {@code consumeCharacterReference}
 * @return the unescaped text
 */
String unescapeEntities(boolean inAttribute) {
    StringBuilder out = StringUtil.borrowBuilder();
    while (!reader.isEmpty()) {
        out.append(reader.consumeTo('&'));
        if (!reader.matches('&')) {
            continue;
        }

        reader.consume();
        int[] codepoints = consumeCharacterReference(null, inAttribute);
        if (codepoints == null || codepoints.length == 0) {
            out.append('&');
            continue;
        }

        out.appendCodePoint(codepoints[0]);
        if (codepoints.length == 2) {
            out.appendCodePoint(codepoints[1]);
        }
    }
    return StringUtil.releaseBuilder(out);
}
}