/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.xerces.impl.xpath.regex;

import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Vector;

// A Regular Expression Parser.
// @xerces.internal
// Version: $Id: RegexParser.java 1129306 2011-05-30 19:18:04Z sandygao $
/** * A Regular Expression Parser. * * @xerces.internal * * @version $Id: RegexParser.java 1129306 2011-05-30 19:18:04Z sandygao $ */
class RegexParser { static final int T_CHAR = 0; static final int T_EOF = 1; static final int T_OR = 2; // '|' static final int T_STAR = 3; // '*' static final int T_PLUS = 4; // '+' static final int T_QUESTION = 5; // '?' static final int T_LPAREN = 6; // '(' static final int T_RPAREN = 7; // ')' static final int T_DOT = 8; // '.' static final int T_LBRACKET = 9; // '[' static final int T_BACKSOLIDUS = 10; // '\' static final int T_CARET = 11; // '^' static final int T_DOLLAR = 12; // '$' static final int T_LPAREN2 = 13; // '(?:' static final int T_LOOKAHEAD = 14; // '(?=' static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' static final int T_LOOKBEHIND = 16; // '(?<=' static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' static final int T_INDEPENDENT = 18; // '(?>' static final int T_SET_OPERATIONS = 19; // '(?[' static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class static final int T_COMMENT = 21; // '(?#' static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] static final int T_CONDITION = 23; // '(?(' static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class static class ReferencePosition { int refNumber; int position; ReferencePosition(int n, int pos) { this.refNumber = n; this.position = pos; } } int offset; String regex; int regexlen; int options; ResourceBundle resources; int chardata; int nexttoken; static protected final int S_NORMAL = 0; static protected final int S_INBRACKETS = 1; static protected final int S_INXBRACKETS = 2; int context = S_NORMAL; int parenOpened = 1; int parennumber = 1; boolean hasBackReferences; Vector references = null; public RegexParser() { this.setLocale(Locale.getDefault()); } public RegexParser(Locale locale) { this.setLocale(locale); } public void setLocale(Locale locale) { try { if (locale != null) { this.resources = ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message", locale); } else { this.resources = 
ResourceBundle.getBundle("org.apache.xerces.impl.xpath.regex.message"); } } catch (MissingResourceException mre) { throw new RuntimeException("Installation Problem??? Couldn't load messages: " + mre.getMessage()); } } final ParseException ex(String key, int loc) { return new ParseException(this.resources.getString(key), loc); } protected final boolean isSet(int flag) { return (this.options & flag) == flag; } synchronized Token parse(String regex, int options) throws ParseException { this.options = options; this.offset = 0; this.setContext(S_NORMAL); this.parennumber = 1; this.parenOpened = 1; this.hasBackReferences = false; this.regex = regex; if (this.isSet(RegularExpression.EXTENDED_COMMENT)) this.regex = REUtil.stripExtendedComment(this.regex); this.regexlen = this.regex.length(); this.next(); Token ret = this.parseRegex(); if (this.offset != this.regexlen) throw ex("parser.parse.1", this.offset); if (this.read() != T_EOF) { throw ex("parser.parse.1", this.offset-1); } if (this.references != null) { for (int i = 0; i < this.references.size(); i ++) { ReferencePosition position = (ReferencePosition)this.references.elementAt(i); if (this.parennumber <= position.refNumber) throw ex("parser.parse.2", position.position); } this.references.removeAllElements(); } return ret; } /* public RegularExpression createRegex(String regex, int options) throws ParseException { Token tok = this.parse(regex, options); return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); } */ protected final void setContext(int con) { this.context = con; } final int read() { return this.nexttoken; } final void next() { if (this.offset >= this.regexlen) { this.chardata = -1; this.nexttoken = T_EOF; return; } int ret; int ch = this.regex.charAt(this.offset++); this.chardata = ch; if (this.context == S_INBRACKETS) { // In a character class, this.chardata has one character, that is to say, // a pair of surrogates is composed and stored to this.chardata. 
switch (ch) { case '\\': ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; case '-': // Allow character class subtraction (regardless of whether we are in // XML Schema mode or not) if (this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { this.offset++; ret = T_XMLSCHEMA_CC_SUBTRACTION; } else ret = T_CHAR; break; case '[': if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { this.offset++; ret = T_POSIX_CHARCLASS_START; break; } // Through down default: if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { int low = this.regex.charAt(this.offset); if (REUtil.isLowSurrogate(low)) { this.chardata = REUtil.composeFromSurrogates(ch, low); this.offset ++; } } ret = T_CHAR; } this.nexttoken = ret; return; } switch (ch) { case '|': ret = T_OR; break; case '*': ret = T_STAR; break; case '+': ret = T_PLUS; break; case '?': ret = T_QUESTION; break; case ')': ret = T_RPAREN; break; case '.': ret = T_DOT; break; case '[': ret = T_LBRACKET; break; case '^': if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; } else { ret = T_CARET; } break; case '$': if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; } else { ret = T_DOLLAR; } break; case '(': ret = T_LPAREN; if (this.offset >= this.regexlen) break; if (this.regex.charAt(this.offset) != '?') break; if (++this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-1); ch = this.regex.charAt(this.offset++); switch (ch) { case ':': ret = T_LPAREN2; break; case '=': ret = T_LOOKAHEAD; break; case '!': ret = T_NEGATIVELOOKAHEAD; break; case '[': ret = T_SET_OPERATIONS; break; case '>': ret = T_INDEPENDENT; break; case '<': if (this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-3); ch = this.regex.charAt(this.offset++); if (ch == '=') { ret = T_LOOKBEHIND; } else if (ch == '!') 
{ ret = T_NEGATIVELOOKBEHIND; } else throw ex("parser.next.3", this.offset-3); break; case '#': while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset++); if (ch == ')') break; } if (ch != ')') throw ex("parser.next.4", this.offset-1); ret = T_COMMENT; break; default: if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options this.offset --; ret = T_MODIFIERS; break; } else if (ch == '(') { // conditional ret = T_CONDITION; // this.offsets points the next of '('. break; } throw ex("parser.next.2", this.offset-2); } break; case '\\': ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; default: ret = T_CHAR; } this.nexttoken = ret; }
// regex ::= term (`|` term)*
// term ::= factor+
// factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
//            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
//            | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
// atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
//          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
/** * regex ::= term (`|` term)* * term ::= factor+ * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' * | atom (('*' | '+' | '?' | minmax ) '?'? )?) * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')' * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block */
Token parseRegex() throws ParseException { Token tok = this.parseTerm(); Token parent = null; while (this.read() == T_OR) { this.next(); // '|' if (parent == null) { parent = Token.createUnion(); parent.addChild(tok); tok = parent; } tok.addChild(this.parseTerm()); } return tok; }
// term ::= factor+
/** * term ::= factor+ */
Token parseTerm() throws ParseException { int ch = this.read(); if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { return Token.createEmpty(); } else { Token tok = this.parseFactor(); Token concat = null; while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { if (concat == null) { concat = Token.createConcat(); concat.addChild(tok); tok = concat; } concat.addChild(this.parseFactor()); //tok = Token.createConcat(tok, this.parseFactor()); } return tok; } } // ---------------------------------------------------------------- Token processCaret() throws ParseException { this.next(); return Token.token_linebeginning; } Token processDollar() throws ParseException { this.next(); return Token.token_lineend; } Token processLookahead() throws ParseException { this.next(); Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processNegativelookahead() throws ParseException { this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processLookbehind() throws ParseException { this.next(); Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processNegativelookbehind() throws ParseException { this.next(); Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // ')' return tok; } Token processBacksolidus_A() throws ParseException { this.next(); return Token.token_stringbeginning; } Token processBacksolidus_Z() throws ParseException { this.next(); return Token.token_stringend2; } Token processBacksolidus_z() throws ParseException { this.next(); return Token.token_stringend; 
} Token processBacksolidus_b() throws ParseException { this.next(); return Token.token_wordedge; } Token processBacksolidus_B() throws ParseException { this.next(); return Token.token_not_wordedge; } Token processBacksolidus_lt() throws ParseException { this.next(); return Token.token_wordbeginning; } Token processBacksolidus_gt() throws ParseException { this.next(); return Token.token_wordend; } Token processStar(Token tok) throws ParseException { this.next(); if (this.read() == T_QUESTION) { this.next(); return Token.createNGClosure(tok); } else return Token.createClosure(tok); } Token processPlus(Token tok) throws ParseException { // X+ -> XX* this.next(); if (this.read() == T_QUESTION) { this.next(); return Token.createConcat(tok, Token.createNGClosure(tok)); } else return Token.createConcat(tok, Token.createClosure(tok)); } Token processQuestion(Token tok) throws ParseException { // X? -> X| this.next(); Token par = Token.createUnion(); if (this.read() == T_QUESTION) { this.next(); par.addChild(Token.createEmpty()); par.addChild(tok); } else { par.addChild(tok); par.addChild(Token.createEmpty()); } return par; } boolean checkQuestion(int off) { return off < this.regexlen && this.regex.charAt(off) == '?'; } Token processParen() throws ParseException { this.next(); int p = this.parenOpened++; Token tok = Token.createParen(this.parseRegex(), p); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.parennumber++; this.next(); // Skips ')' return tok; } Token processParen2() throws ParseException { this.next(); Token tok = Token.createParen(this.parseRegex(), 0); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // Skips ')' return tok; } Token processCondition() throws ParseException { // this.offset points the next of '(' if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); // Parses a condition. 
int refno = -1; Token condition = null; int ch = this.regex.charAt(this.offset); if ('1' <= ch && ch <= '9') { refno = ch-'0'; int finalRefno = refno; if (this.parennumber <= refno) throw ex("parser.parse.2", this.offset); while (this.offset + 1 < this.regexlen) { ch = this.regex.charAt(this.offset + 1); if ('0' <= ch && ch <= '9') { refno = (refno * 10) + (ch - '0'); if (refno < this.parennumber) { finalRefno= refno; ++this.offset; } else { break; } } else { break; } } this.hasBackReferences = true; if (this.references == null) this.references = new Vector(); this.references.addElement(new ReferencePosition(finalRefno, this.offset)); this.offset ++; if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); this.offset ++; } else { if (ch == '?') this.offset --; // Points '('. this.next(); condition = this.parseFactor(); switch (condition.type) { case Token.LOOKAHEAD: case Token.NEGATIVELOOKAHEAD: case Token.LOOKBEHIND: case Token.NEGATIVELOOKBEHIND: break; case Token.ANCHOR: if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); break; default: throw ex("parser.factor.5", this.offset); } } // Parses yes/no-patterns. this.next(); Token yesPattern = this.parseRegex(); Token noPattern = null; if (yesPattern.type == Token.UNION) { if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); noPattern = yesPattern.getChild(1); yesPattern = yesPattern.getChild(0); } if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); return Token.createCondition(refno, condition, yesPattern, noPattern); } Token processModifiers() throws ParseException { // this.offset points the next of '?'. // modifiers ::= [imsw]* ('-' [imsw]*)? ':' int add = 0, mask = 0, ch = -1; while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset); int v = REUtil.getOptionValue(ch); if (v == 0) break; // '-' or ':'? 
add |= v; this.offset ++; } if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); if (ch == '-') { this.offset ++; while (this.offset < this.regexlen) { ch = this.regex.charAt(this.offset); int v = REUtil.getOptionValue(ch); if (v == 0) break; // ':'? mask |= v; this.offset ++; } if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); } Token tok; if (ch == ':') { this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); } else if (ch == ')') { // such as (?-i) this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); } else throw ex("parser.factor.3", this.offset); return tok; } Token processIndependent() throws ParseException { this.next(); Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); // Skips ')' return tok; } Token processBacksolidus_c() throws ParseException { int ch2; // Must be in 0x0040-0x005f if (this.offset >= this.regexlen || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) throw ex("parser.atom.1", this.offset-1); this.next(); return Token.createChar(ch2-0x40); } Token processBacksolidus_C() throws ParseException { throw ex("parser.process.1", this.offset); } Token processBacksolidus_i() throws ParseException { Token tok = Token.createChar('i'); this.next(); return tok; } Token processBacksolidus_I() throws ParseException { throw ex("parser.process.1", this.offset); } Token processBacksolidus_g() throws ParseException { this.next(); return Token.getGraphemePattern(); } Token processBacksolidus_X() throws ParseException { this.next(); return Token.getCombiningCharacterSequence(); } Token processBackreference() throws ParseException { int refnum = this.chardata-'0'; int finalRefnum = refnum; if (this.parennumber <= refnum) throw 
ex("parser.parse.2", this.offset-2); while (this.offset < this.regexlen) { final int ch = this.regex.charAt(this.offset); if ('0' <= ch && ch <= '9') { refnum = (refnum * 10) + (ch - '0'); if (refnum < this.parennumber) { ++this.offset; finalRefnum = refnum; this.chardata = ch; } else { break; } } else { break; } } Token tok = Token.createBackReference(finalRefnum); this.hasBackReferences = true; if (this.references == null) this.references = new Vector(); this.references.addElement(new ReferencePosition(finalRefnum, this.offset-2)); this.next(); return tok; } // ----------------------------------------------------------------
// factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
//            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
//            | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
//            | '(?#' [^)]* ')'
// minmax ::= '{' min (',' max?)? '}'
// min ::= [0-9]+
// max ::= [0-9]+
/** * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' * | atom (('*' | '+' | '?' | minmax ) '?'? )?) * | '(?=' regex ')' | '(?!' regex ')' | '(?&lt;=' regex ')' | '(?&lt;!' regex ')' * | '(?#' [^)]* ')' * minmax ::= '{' min (',' max?)? '}' * min ::= [0-9]+ * max ::= [0-9]+ */
Token parseFactor() throws ParseException { int ch = this.read(); Token tok; switch (ch) { case T_CARET: return this.processCaret(); case T_DOLLAR: return this.processDollar(); case T_LOOKAHEAD: return this.processLookahead(); case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); case T_LOOKBEHIND: return this.processLookbehind(); case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); case T_COMMENT: this.next(); return Token.createEmpty(); case T_BACKSOLIDUS: switch (this.chardata) { case 'A': return this.processBacksolidus_A(); case 'Z': return this.processBacksolidus_Z(); case 'z': return this.processBacksolidus_z(); case 'b': return this.processBacksolidus_b(); case 'B': return this.processBacksolidus_B(); case '<': return this.processBacksolidus_lt(); case '>': return this.processBacksolidus_gt(); } // through down } tok = this.parseAtom(); ch = this.read(); switch (ch) { case T_STAR: return this.processStar(tok); case T_PLUS: return this.processPlus(tok); case T_QUESTION: return this.processQuestion(tok); case T_CHAR: if (this.chardata == '{' && this.offset < this.regexlen) { int off = this.offset; // this.offset -> next of '{' int min = 0, max = -1; if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { min = ch -'0'; while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { min = min*10 +ch-'0'; if (min < 0) throw ex("parser.quantifier.5", this.offset); } } else { throw ex("parser.quantifier.1", this.offset); } max = min; if (ch == ',') { if (off >= this.regexlen) { throw ex("parser.quantifier.3", this.offset); } else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { max = ch -'0'; // {min,max} while (off < this.regexlen && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { max = max*10 +ch-'0'; if (max < 0) throw ex("parser.quantifier.5", this.offset); } if (min > max) throw ex("parser.quantifier.4", this.offset); } else { // assume {min,} max = -1; } } if (ch != '}') throw 
ex("parser.quantifier.2", this.offset); if (this.checkQuestion(off)) { // off -> next of '}' tok = Token.createNGClosure(tok); this.offset = off+1; } else { tok = Token.createClosure(tok); this.offset = off; } tok.setMin(min); tok.setMax(max); //System.err.println("CLOSURE: "+min+", "+max); this.next(); } } return tok; }
// atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
//          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
//          | '(?>' regex ')'
// char ::= '\\' | '\' [efnrt] | bmp-code | character-1
/** * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block * | '(?>' regex ')' * char ::= '\\' | '\' [efnrt] | bmp-code | character-1 */
Token parseAtom() throws ParseException { int ch = this.read(); Token tok = null; switch (ch) { case T_LPAREN: return this.processParen(); case T_LPAREN2: return this.processParen2(); // '(?:' case T_CONDITION: return this.processCondition(); // '(?(' case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) case T_INDEPENDENT: return this.processIndependent(); case T_DOT: this.next(); // Skips '.' tok = Token.token_dot; break; /** * char-class ::= '[' ( '^'? range ','?)+ ']' * range ::= '\d' | '\w' | '\s' | category-block | range-char * | range-char '-' range-char * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] */ case T_LBRACKET: return this.parseCharacterClass(true); case T_SET_OPERATIONS: return this.parseSetOperations(); case T_BACKSOLIDUS: switch (this.chardata) { case 'd': case 'D': case 'w': case 'W': case 's': case 'S': tok = this.getTokenForShorthand(this.chardata); this.next(); return tok; case 'e': case 'f': case 'n': case 'r': case 't': case 'u': case 'v': case 'x': { int ch2 = this.decodeEscaped(); if (ch2 < 0x10000) { tok = Token.createChar(ch2); } else { tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); } } break; case 'c': return this.processBacksolidus_c(); case 'C': return this.processBacksolidus_C(); case 'i': return this.processBacksolidus_i(); case 'I': return this.processBacksolidus_I(); case 'g': return this.processBacksolidus_g(); case 'X': return this.processBacksolidus_X(); case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return this.processBackreference(); case 'P': case 'p': int pstart = this.offset; tok = processBacksolidus_pP(this.chardata); if (tok == null) throw this.ex("parser.atom.5", pstart); break; default: tok = Token.createChar(this.chardata); } this.next(); break; case T_CHAR: if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}') throw 
this.ex("parser.atom.4", this.offset-1); tok = Token.createChar(this.chardata); int high = this.chardata; this.next(); if (REUtil.isHighSurrogate(high) && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { char[] sur = new char[2]; sur[0] = (char)high; sur[1] = (char)this.chardata; tok = Token.createParen(Token.createString(new String(sur)), 0); this.next(); } break; default: throw this.ex("parser.atom.4", this.offset-1); } return tok; } protected RangeToken processBacksolidus_pP(int c) throws ParseException { this.next(); if (this.read() != T_CHAR || this.chardata != '{') throw this.ex("parser.atom.2", this.offset-1); // handle category escape boolean positive = c == 'p'; int namestart = this.offset; int nameend = this.regex.indexOf('}', namestart); if (nameend < 0) throw this.ex("parser.atom.3", this.offset); String pname = this.regex.substring(namestart, nameend); this.offset = nameend+1; return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); } int processCIinCharacterClass(RangeToken tok, int c) { return this.decodeEscaped(); }
// char-class ::= '[' ( '^'? range ','?)+ ']'
// range ::= '\d' | '\w' | '\s' | category-block | range-char
//           | range-char '-' range-char
// range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
// bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
/** * char-class ::= '[' ( '^'? range ','?)+ ']' * range ::= '\d' | '\w' | '\s' | category-block | range-char * | range-char '-' range-char * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] */
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { this.setContext(S_INBRACKETS); this.next(); // '[' boolean nrange = false; RangeToken base = null; RangeToken tok; if (this.read() == T_CHAR && this.chardata == '^') { nrange = true; this.next(); // '^' if (useNrange) { tok = Token.createNRange(); } else { base = Token.createRange(); base.addRange(0, Token.UTF16_MAX); tok = Token.createRange(); } } else { tok = Token.createRange(); } int type; boolean firstloop = true; while ((type = this.read()) != T_EOF) { if (type == T_CHAR && this.chardata == ']' && !firstloop) break; int c = this.chardata; boolean end = false; if (type == T_BACKSOLIDUS) { switch (c) { case 'd': case 'D': case 'w': case 'W': case 's': case 'S': tok.mergeRanges(this.getTokenForShorthand(c)); end = true; break; case 'i': case 'I': case 'c': case 'C': c = this.processCIinCharacterClass(tok, c); if (c < 0) end = true; break; case 'p': case 'P': int pstart = this.offset; RangeToken tok2 = this.processBacksolidus_pP(c); if (tok2 == null) throw this.ex("parser.atom.5", pstart); tok.mergeRanges(tok2); end = true; break; default: c = this.decodeEscaped(); } // \ + c } // backsolidus // POSIX Character class such as [:alnum:] else if (type == T_POSIX_CHARCLASS_START) { int nameend = this.regex.indexOf(':', this.offset); if (nameend < 0) throw this.ex("parser.cc.1", this.offset); boolean positive = true; if (this.regex.charAt(this.offset) == '^') { this.offset ++; positive = false; } String name = this.regex.substring(this.offset, nameend); RangeToken range = Token.getRange(name, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE)); if (range == null) throw this.ex("parser.cc.3", this.offset); tok.mergeRanges(range); end = true; if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') throw this.ex("parser.cc.1", nameend); this.offset = nameend+2; } else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) { if (nrange) { nrange = false; if (useNrange) { 
tok = (RangeToken) Token.complementRanges(tok); } else { base.subtractRanges(tok); tok = base; } } RangeToken range2 = this.parseCharacterClass(false); tok.subtractRanges(range2); if (this.read() != T_CHAR || this.chardata != ']') { throw this.ex("parser.cc.5", this.offset); } break; // Exit this loop } this.next(); if (!end) { // if not shorthands... if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { tok.addRange(c, c); } else { addCaseInsensitiveChar(tok, c); } } else if (type == T_XMLSCHEMA_CC_SUBTRACTION) { throw this.ex("parser.cc.8", this.offset-1); } else { this.next(); // Skips '-' if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); if (type == T_CHAR && this.chardata == ']') { if (!this.isSet(RegularExpression.IGNORE_CASE) || c > 0xffff) { tok.addRange(c, c); } else { addCaseInsensitiveChar(tok, c); } tok.addRange('-', '-'); } else { int rangeend = this.chardata; if (type == T_BACKSOLIDUS) { rangeend = this.decodeEscaped(); } this.next(); if (c > rangeend) { throw this.ex("parser.ope.3", this.offset-1); } if (!this.isSet(RegularExpression.IGNORE_CASE) || (c > 0xffff && rangeend > 0xffff)) { tok.addRange(c, rangeend); } else { addCaseInsensitiveCharRange(tok, c, rangeend); } } } } if (this.isSet(RegularExpression.SPECIAL_COMMA) && this.read() == T_CHAR && this.chardata == ',') { this.next(); } firstloop = false; } if (this.read() == T_EOF) { throw this.ex("parser.cc.2", this.offset); } if (!useNrange && nrange) { base.subtractRanges(tok); tok = base; } tok.sortRanges(); tok.compactRanges(); this.setContext(S_NORMAL); this.next(); // Skips ']' return tok; }
// '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
/**
     * Parses a set-operation group.
     *
     * <pre>
     * '(?[' ... ']' (('-' | '+' | '&amp;') '[' ... ']')? ')'
     * </pre>
     *
     * @return the range token obtained by applying each operator, left to
     *         right, to the first character class
     */
    protected RangeToken parseSetOperations() throws ParseException {
        RangeToken tok = this.parseCharacterClass(false);
        int type;
        while ((type = this.read()) != T_RPAREN) {
            int ch = this.chardata;
            if (type == T_CHAR && (ch == '-' || ch == '&')
                || type == T_PLUS) {
                this.next();
                if (this.read() != T_LBRACKET) {
                    throw ex("parser.ope.1", this.offset-1);
                }
                RangeToken t2 = this.parseCharacterClass(false);
                if (type == T_PLUS) {
                    tok.mergeRanges(t2);
                } else if (ch == '-') {
                    tok.subtractRanges(t2);
                } else if (ch == '&') {
                    tok.intersectRanges(t2);
                } else {
                    throw new RuntimeException("ASSERT");
                }
            } else {
                throw ex("parser.ope.2", this.offset-1);
            }
        }
        this.next();
        return tok;
    }

    /**
     * Returns the token for one of the shorthand escapes d, D, w, W, s, S.
     * With USE_UNICODE_CATEGORY set the token comes from Token.getRange,
     * otherwise from the predefined Token constants.
     *
     * @param ch the escape letter
     * @throws RuntimeException if <code>ch</code> is not a shorthand letter
     */
    Token getTokenForShorthand(int ch) {
        Token tok;
        switch (ch) {
          case 'd':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("Nd", true) : Token.token_0to9;
            break;
          case 'D':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("Nd", false) : Token.token_not_0to9;
            break;
          case 'w':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsWord", true) : Token.token_wordchars;
            break;
          case 'W':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
            break;
          case 's':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsSpace", true) : Token.token_spaces;
            break;
          case 'S':
            tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
                ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
            break;

          default:
            throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
        }
        return tok;
    }

    /**
     * Decodes the backslash escape that is the current token and returns the
     * character it denotes. Handles e, f, n, r, t, x (two hex digits or a
     * braced hex number), u (four hex digits) and v (six hex digits); any
     * other escaped character is returned unchanged. The last token belonging
     * to the escape is left as the current token.
     *
     * @return the decoded character value
     * @throws ParseException if the current token is not an escape, the hex
     *         digits are malformed or out of range, or the escape is A, Z or z
     */
    int decodeEscaped() throws ParseException {
        if (this.read() != T_BACKSOLIDUS) {
            throw ex("parser.next.1", this.offset-1);
        }
        int c = this.chardata;
        switch (c) {
          case 'e':  c = 0x1b;  break;          // ESCAPE U+001B
          case 'f':  c = '\f';  break;          // FORM FEED U+000C
          case 'n':  c = '\n';  break;          // LINE FEED U+000A
          case 'r':  c = '\r';  break;          // CARRIAGE RETURN U+000D
          case 't':  c = '\t';  break;          // HORIZONTAL TABULATION U+0009
          //case 'v':  c = 0x0b;  break;        // VERTICAL TABULATION U+000B
          case 'x':
            this.next();
            if (this.read() != T_CHAR) {
                throw ex("parser.descape.1", this.offset-1);
            }
            if (this.chardata == '{') {
                // Braced form: any number of hex digits up to the closing brace.
                int v1 = 0;
                int uv = 0;
                do {
                    this.next();
                    if (this.read() != T_CHAR) {
                        throw ex("parser.descape.1", this.offset-1);
                    }
                    if ((v1 = hexChar(this.chardata)) < 0) {
                        break;
                    }
                    if (uv > uv*16) {           // int overflow
                        throw ex("parser.descape.2", this.offset-1);
                    }
                    uv = uv*16+v1;
                } while (true);
                if (this.chardata != '}') {
                    throw ex("parser.descape.3", this.offset-1);
                }
                if (uv > Token.UTF16_MAX) {
                    throw ex("parser.descape.4", this.offset-1);
                }
                c = uv;
            } else {
                // Two hex digits; the first one is already the current token.
                int v1 = 0;
                if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) {
                    throw ex("parser.descape.1", this.offset-1);
                }
                c = this.readHexDigits(v1, 1);
            }
            break;

          case 'u':                             // four hex digits
            c = this.readHexDigits(0, 4);
            break;

          case 'v':                             // six hex digits
            c = this.readHexDigits(0, 6);
            if (c > Token.UTF16_MAX) {
                // Same message key as the braced x form above.
                throw ex("parser.descape.4", this.offset-1);
            }
            break;

          case 'A':
          case 'Z':
          case 'z':
            throw ex("parser.descape.5", this.offset-2);
          default:
        }
        return c;
    }

    /**
     * Reads <code>count</code> further hex digits, advancing to a new token
     * before each one, and appends them to <code>value</code>.
     *
     * @throws ParseException if a token is not a hex digit character
     */
    private int readHexDigits(int value, int count) {
        for (int i = 0;  i < count;  i ++) {
            this.next();
            int digit;
            if (this.read() != T_CHAR || (digit = hexChar(this.chardata)) < 0) {
                throw ex("parser.descape.1", this.offset-1);
            }
            value = value*16+digit;
        }
        return value;
    }

    /** Returns the value (0-15) of a hex digit character, or -1 if it is not one. */
    static private final int hexChar(int ch) {
        if (ch < '0')  return -1;
        if (ch > 'f')  return -1;
        if (ch <= '9')  return ch-'0';
        if (ch < 'A')  return -1;
        if (ch <= 'F')  return ch-'A'+10;
        if (ch < 'a')  return -1;
        return ch-'a'+10;
    }

    /**
     * Adds <code>c</code> to <code>tok</code> plus every even-indexed entry
     * of the array CaseInsensitiveMap.get(c) returns, if any.
     */
    static protected final void addCaseInsensitiveChar(RangeToken tok, int c) {
        final int[] caseMap = CaseInsensitiveMap.get(c);
        tok.addRange(c, c);

        if (caseMap != null) {
            for (int i=0; i<caseMap.length; i+=2) {
                tok.addRange(caseMap[i], caseMap[i]);
            }
        }
    }

    /**
     * Adds the range between <code>start</code> and <code>end</code> (given
     * in either order) to <code>tok</code>, then for each character in it
     * adds the even-indexed entries of its CaseInsensitiveMap array, if any.
     */
    static protected final void addCaseInsensitiveCharRange(RangeToken tok, int start, int end) {
        int[] caseMap;
        int r1, r2;
        if (start <= end) {
            r1 = start;
            r2 = end;
        } else {
            r1 = end;
            r2 = start;
        }

        tok.addRange(r1, r2);
        for (int ch = r1;  ch <= r2;  ch++) {
            caseMap = CaseInsensitiveMap.get(ch);
            if (caseMap != null) {
                for (int i=0; i<caseMap.length; i+=2) {
                    tok.addRange(caseMap[i], caseMap[i]);
                }
            }
        }
    }
}