/* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * * ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Rhino code, released * May 6, 1998. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 1997-1999 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Norris Boyd * Igor Bukanov * Brendan Eich * Matthias Radestock * * Alternatively, the contents of this file may be used under the terms of * the GNU General Public License Version 2 or later (the "GPL"), in which * case the provisions of the GPL are applicable instead of those above. If * you wish to allow use of your version of this file only under the terms of * the GPL and not to allow others to use your version of this file under the * MPL, indicate your decision by deleting the provisions above and replacing * them with the notice and other provisions required by the GPL. If you do * not delete the provisions above, a recipient may use your version of this * file under either the MPL or the GPL. 
* * ***** END LICENSE BLOCK ***** */ package org.mozilla.javascript.regexp; import java.io.Serializable; import org.mozilla.javascript.Context; import org.mozilla.javascript.Function; import org.mozilla.javascript.IdFunctionObject; import org.mozilla.javascript.IdScriptableObject; import org.mozilla.javascript.Kit; import org.mozilla.javascript.ScriptRuntime; import org.mozilla.javascript.Scriptable; import org.mozilla.javascript.ScriptableObject; import org.mozilla.javascript.Undefined; /** * This class implements the RegExp native object. * * Revision History: * Implementation in C by Brendan Eich * Initial port to Java by Norris Boyd from jsregexp.c version 1.36 * Merged up to version 1.38, which included Unicode support. * Merged bug fixes in version 1.39. * Merged JSFUN13_BRANCH changes up to 1.32.2.13 * * @author Brendan Eich * @author Norris Boyd */ public class NativeRegExp extends IdScriptableObject implements Function { static final long serialVersionUID = 4965263491464903264L; private static final Object REGEXP_TAG = new Object(); public static final int JSREG_GLOB = 0x1; // 'g' flag: global public static final int JSREG_FOLD = 0x2; // 'i' flag: fold public static final int JSREG_MULTILINE = 0x4; // 'm' flag: multiline //type of match to perform public static final int TEST = 0; public static final int MATCH = 1; public static final int PREFIX = 2; private static final boolean debug = false; private static final byte REOP_EMPTY = 0; /* match rest of input against rest of r.e. 
*/ private static final byte REOP_ALT = 1; /* alternative subexpressions in kid and next */ private static final byte REOP_BOL = 2; /* beginning of input (or line if multiline) */ private static final byte REOP_EOL = 3; /* end of input (or line if multiline) */ private static final byte REOP_WBDRY = 4; /* match "" at word boundary */ private static final byte REOP_WNONBDRY = 5; /* match "" at word non-boundary */ private static final byte REOP_QUANT = 6; /* quantified atom: atom{1,2} */ private static final byte REOP_STAR = 7; /* zero or more occurrences of kid */ private static final byte REOP_PLUS = 8; /* one or more occurrences of kid */ private static final byte REOP_OPT = 9; /* optional subexpression in kid */ private static final byte REOP_LPAREN = 10; /* left paren bytecode: kid is u.num'th sub-regexp */ private static final byte REOP_RPAREN = 11; /* right paren bytecode */ private static final byte REOP_DOT = 12; /* stands for any character */ // private static final byte REOP_CCLASS = 13; /* character class: [a-f] */ private static final byte REOP_DIGIT = 14; /* match a digit char: [0-9] */ private static final byte REOP_NONDIGIT = 15; /* match a non-digit char: [^0-9] */ private static final byte REOP_ALNUM = 16; /* match an alphanumeric char: [0-9a-z_A-Z] */ private static final byte REOP_NONALNUM = 17; /* match a non-alphanumeric char: [^0-9a-z_A-Z] */ private static final byte REOP_SPACE = 18; /* match a whitespace char */ private static final byte REOP_NONSPACE = 19; /* match a non-whitespace char */ private static final byte REOP_BACKREF = 20; /* back-reference (e.g., \1) to a parenthetical */ private static final byte REOP_FLAT = 21; /* match a flat string */ private static final byte REOP_FLAT1 = 22; /* match a single char */ private static final byte REOP_JUMP = 23; /* for deoptimized closure loops */ // private static final byte REOP_DOTSTAR = 24; /* optimize .* to use a single opcode */ // private static final byte REOP_ANCHOR = 25; /* like .* 
but skips left context to unanchored r.e. */ // private static final byte REOP_EOLONLY = 26; /* $ not preceded by any pattern */ // private static final byte REOP_UCFLAT = 27; /* flat Unicode string; len immediate counts chars */ private static final byte REOP_UCFLAT1 = 28; /* single Unicode char */ // private static final byte REOP_UCCLASS = 29; /* Unicode character class, vector of chars to match */ // private static final byte REOP_NUCCLASS = 30; /* negated Unicode character class */ // private static final byte REOP_BACKREFi = 31; /* case-independent REOP_BACKREF */ private static final byte REOP_FLATi = 32; /* case-independent REOP_FLAT */ private static final byte REOP_FLAT1i = 33; /* case-independent REOP_FLAT1 */ // private static final byte REOP_UCFLATi = 34; /* case-independent REOP_UCFLAT */ private static final byte REOP_UCFLAT1i = 35; /* case-independent REOP_UCFLAT1 */ // private static final byte REOP_ANCHOR1 = 36; /* first-char discriminating REOP_ANCHOR */ // private static final byte REOP_NCCLASS = 37; /* negated 8-bit character class */ // private static final byte REOP_DOTSTARMIN = 38; /* ungreedy version of REOP_DOTSTAR */ // private static final byte REOP_LPARENNON = 39; /* non-capturing version of REOP_LPAREN */ // private static final byte REOP_RPARENNON = 40; /* non-capturing version of REOP_RPAREN */ private static final byte REOP_ASSERT = 41; /* zero width positive lookahead assertion */ private static final byte REOP_ASSERT_NOT = 42; /* zero width negative lookahead assertion */ private static final byte REOP_ASSERTTEST = 43; /* sentinel at end of assertion child */ private static final byte REOP_ASSERTNOTTEST = 44; /* sentinel at end of !assertion child */ private static final byte REOP_MINIMALSTAR = 45; /* non-greedy version of * */ private static final byte REOP_MINIMALPLUS = 46; /* non-greedy version of + */ private static final byte REOP_MINIMALOPT = 47; /* non-greedy version of ? 
*/ private static final byte REOP_MINIMALQUANT = 48; /* non-greedy version of {} */ private static final byte REOP_ENDCHILD = 49; /* sentinel at end of quantifier child */ private static final byte REOP_CLASS = 50; /* character class with index */ private static final byte REOP_REPEAT = 51; /* directs execution of greedy quantifier */ private static final byte REOP_MINIMALREPEAT = 52; /* directs execution of non-greedy quantifier */ private static final byte REOP_END = 53; public static void init(Context cx, Scriptable scope, boolean sealed) { NativeRegExp proto = new NativeRegExp(); proto.re = (RECompiled)compileRE(cx, "", null, false); proto.activatePrototypeMap(MAX_PROTOTYPE_ID); proto.setParentScope(scope); proto.setPrototype(getObjectPrototype(scope)); NativeRegExpCtor ctor = new NativeRegExpCtor(); // Bug #324006: ECMA-262 15.10.6.1 says "The initial value of // RegExp.prototype.constructor is the builtin RegExp constructor." proto.put("constructor", proto, ctor); ScriptRuntime.setFunctionProtoAndParent(ctor, scope); ctor.setImmunePrototypeProperty(proto); if (sealed) { proto.sealObject(); ctor.sealObject(); } defineProperty(scope, "RegExp", ctor, ScriptableObject.DONTENUM); } NativeRegExp(Scriptable scope, Object regexpCompiled) { this.re = (RECompiled)regexpCompiled; this.lastIndex = 0; ScriptRuntime.setObjectProtoAndParent(this, scope); } @Override public String getClassName() { return "RegExp"; } public Object call(Context cx, Scriptable scope, Scriptable thisObj, Object[] args) { return execSub(cx, scope, args, MATCH); } public Scriptable construct(Context cx, Scriptable scope, Object[] args) { return (Scriptable)execSub(cx, scope, args, MATCH); } Scriptable compile(Context cx, Scriptable scope, Object[] args) { if (args.length > 0 && args[0] instanceof NativeRegExp) { if (args.length > 1 && args[1] != Undefined.instance) { // report error throw ScriptRuntime.typeError0("msg.bad.regexp.compile"); } NativeRegExp thatObj = (NativeRegExp) args[0]; this.re = 
thatObj.re; this.lastIndex = thatObj.lastIndex; return this; } String s = args.length == 0 ? "" : ScriptRuntime.toString(args[0]); String global = args.length > 1 && args[1] != Undefined.instance ? ScriptRuntime.toString(args[1]) : null; this.re = (RECompiled)compileRE(cx, s, global, false); this.lastIndex = 0; return this; } @Override public String toString() { StringBuffer buf = new StringBuffer(); buf.append('/'); if (re.source.length != 0) { buf.append(re.source); } else { // See bugzilla 226045 buf.append("(?:)"); } buf.append('/'); if ((re.flags & JSREG_GLOB) != 0) buf.append('g'); if ((re.flags & JSREG_FOLD) != 0) buf.append('i'); if ((re.flags & JSREG_MULTILINE) != 0) buf.append('m'); return buf.toString(); } NativeRegExp() { } private static RegExpImpl getImpl(Context cx) { return (RegExpImpl) ScriptRuntime.getRegExpProxy(cx); } private Object execSub(Context cx, Scriptable scopeObj, Object[] args, int matchType) { RegExpImpl reImpl = getImpl(cx); String str; if (args.length == 0) { str = reImpl.input; if (str == null) { reportError("msg.no.re.input.for", toString()); } } else { str = ScriptRuntime.toString(args[0]); } double d = ((re.flags & JSREG_GLOB) != 0) ? lastIndex : 0; Object rval; if (d < 0 || str.length() < d) { lastIndex = 0; rval = null; } else { int indexp[] = { (int)d }; rval = executeRegExp(cx, scopeObj, reImpl, str, indexp, matchType); if ((re.flags & JSREG_GLOB) != 0) { lastIndex = (rval == null || rval == Undefined.instance) ? 
0 : indexp[0]; } } return rval; } static Object compileRE(Context cx, String str, String global, boolean flat) { RECompiled regexp = new RECompiled(); regexp.source = str.toCharArray(); int length = str.length(); int flags = 0; if (global != null) { for (int i = 0; i < global.length(); i++) { char c = global.charAt(i); if (c == 'g') { flags |= JSREG_GLOB; } else if (c == 'i') { flags |= JSREG_FOLD; } else if (c == 'm') { flags |= JSREG_MULTILINE; } else { reportError("msg.invalid.re.flag", String.valueOf(c)); } } } regexp.flags = flags; CompilerState state = new CompilerState(cx, regexp.source, length, flags); if (flat && length > 0) { if (debug) { System.out.println("flat = \"" + str + "\""); } state.result = new RENode(REOP_FLAT); state.result.chr = state.cpbegin[0]; state.result.length = length; state.result.flatIndex = 0; state.progLength += 5; } else if (!parseDisjunction(state)) return null; regexp.program = new byte[state.progLength + 1]; if (state.classCount != 0) { regexp.classList = new RECharSet[state.classCount]; regexp.classCount = state.classCount; } int endPC = emitREBytecode(state, regexp, 0, state.result); regexp.program[endPC++] = REOP_END; if (debug) { System.out.println("Prog. 
length = " + endPC); for (int i = 0; i < endPC; i++) { System.out.print(regexp.program[i]); if (i < (endPC - 1)) System.out.print(", "); } System.out.println(); } regexp.parenCount = state.parenCount; // If re starts with literal, init anchorCh accordingly switch (regexp.program[0]) { case REOP_UCFLAT1: case REOP_UCFLAT1i: regexp.anchorCh = (char)getIndex(regexp.program, 1); break; case REOP_FLAT1: case REOP_FLAT1i: regexp.anchorCh = (char)(regexp.program[1] & 0xFF); break; case REOP_FLAT: case REOP_FLATi: int k = getIndex(regexp.program, 1); regexp.anchorCh = regexp.source[k]; break; } if (debug) { if (regexp.anchorCh >= 0) { System.out.println("Anchor ch = '" + (char)regexp.anchorCh + "'"); } } return regexp; } static boolean isDigit(char c) { return '0' <= c && c <= '9'; } private static boolean isWord(char c) { return Character.isLetter(c) || isDigit(c) || c == '_'; } private static boolean isLineTerm(char c) { return ScriptRuntime.isJSLineTerminator(c); } private static boolean isREWhiteSpace(int c) { return (c == '\u0020' || c == '\u0009' || c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029 || c == '\u000C' || c == '\u000B' || c == '\u00A0' || Character.getType((char)c) == Character.SPACE_SEPARATOR); } /* * * 1. If IgnoreCase is false, return ch. * 2. Let u be ch converted to upper case as if by calling * String.prototype.toUpperCase on the one-character string ch. * 3. If u does not consist of a single character, return ch. * 4. Let cu be u's character. * 5. If ch's code point value is greater than or equal to decimal 128 and cu's * code point value is less than decimal 128, then return ch. * 6. Return cu. 
*/ private static char upcase(char ch) { if (ch < 128) { if ('a' <= ch && ch <= 'z') { return (char)(ch + ('A' - 'a')); } return ch; } char cu = Character.toUpperCase(ch); if ((ch >= 128) && (cu < 128)) return ch; return cu; } private static char downcase(char ch) { if (ch < 128) { if ('A' <= ch && ch <= 'Z') { return (char)(ch + ('a' - 'A')); } return ch; } char cl = Character.toLowerCase(ch); if ((ch >= 128) && (cl < 128)) return ch; return cl; } /* * Validates and converts hex ascii value. */ private static int toASCIIHexDigit(int c) { if (c < '0') return -1; if (c <= '9') { return c - '0'; } c |= 0x20; if ('a' <= c && c <= 'f') { return c - 'a' + 10; } return -1; } /* * Top-down regular expression grammar, based closely on Perl4. * * regexp: altern A regular expression is one or more * altern '|' regexp alternatives separated by vertical bar. */ private static boolean parseDisjunction(CompilerState state) { if (!parseAlternative(state)) return false; char[] source = state.cpbegin; int index = state.cp; if (index != source.length && source[index] == '|') { RENode altResult; ++state.cp; altResult = new RENode(REOP_ALT); altResult.kid = state.result; if (!parseDisjunction(state)) return false; altResult.kid2 = state.result; state.result = altResult; /* ALT, , ..., JUMP, ... JUMP */ state.progLength += 9; } return true; } /* * altern: item An alternative is one or more items, * item altern concatenated together. 
*/
    /*
     * Parses terms until the end of the pattern, a '|', or (inside a group)
     * a ')' is reached, chaining the terms through RENode.next.
     * state.result receives the first term, or an REOP_EMPTY node when no
     * term was parsed. Returns false as soon as parseTerm fails.
     */
    private static boolean parseAlternative(CompilerState state)
    {
        RENode headTerm = null;
        RENode tailTerm = null;
        char[] source = state.cpbegin;
        while (true) {
            if (state.cp == state.cpend || source[state.cp] == '|'
                || (state.parenNesting != 0 && source[state.cp] == ')'))
            {
                if (headTerm == null) {
                    state.result = new RENode(REOP_EMPTY);
                }
                else
                    state.result = headTerm;
                return true;
            }
            if (!parseTerm(state))
                return false;
            if (headTerm == null)
                headTerm = state.result;
            else {
                // Append the new term and advance tailTerm to the last node
                // of whatever chain the term brought with it.
                if (tailTerm == null) {
                    headTerm.next = state.result;
                    tailTerm = state.result;
                    while (tailTerm.next != null) tailTerm = tailTerm.next;
                }
                else {
                    tailTerm.next = state.result;
                    tailTerm = tailTerm.next;
                    while (tailTerm.next != null) tailTerm = tailTerm.next;
                }
            }
        }
    }

    /*
     * calculate the total size of the bitmap required for a class expression
     *
     * Scans src[index..end) (the text between '[' and ']') and stores in
     * target.bmsize the largest character value the class can contain;
     * 65535 as soon as one of \D \s \S \w \W is seen. With the 'i' flag the
     * larger of the upper/lower case variant of each character is used.
     * Returns false after reporting "msg.bad.range" for an inverted range or
     * for a class escape used as a range end point.
     */
    private static boolean calculateBitmapSize(CompilerState state, RENode target,
                                               char[] src, int index, int end)
    {
        char rangeStart = 0;
        char c;
        int n;
        int nDigits;
        int i;
        int max = 0;
        boolean inRange = false;

        target.bmsize = 0;

        if (index == end)
            return true;

        // A leading '^' only negates the class; it does not affect the size.
        if (src[index] == '^')
            ++index;

        while (index != end) {
            int localMax = 0;
            nDigits = 2;
            switch (src[index]) {
            case '\\':
                ++index;
                c = src[index++];
                switch (c) {
                case 'b':
                    localMax = 0x8;
                    break;
                case 'f':
                    localMax = 0xC;
                    break;
                case 'n':
                    localMax = 0xA;
                    break;
                case 'r':
                    localMax = 0xD;
                    break;
                case 't':
                    localMax = 0x9;
                    break;
                case 'v':
                    localMax = 0xB;
                    break;
                case 'c':
                    // NOTE(review): the guard inspects src[index + 1] while the
                    // value is taken from src[index]; confirm against the code
                    // that later fills the bitmap before changing either side.
                    if (((index + 1) < end) && Character.isLetter(src[index + 1]))
                        localMax = (char)(src[index++] & 0x1F);
                    else
                        localMax = '\\';
                    break;
                case 'u':
                    nDigits += 2;
                    // fall thru...
                case 'x':
                    n = 0;
                    for (i = 0; (i < nDigits) && (index < end); i++) {
                        c = src[index++];
                        n = Kit.xDigitToInt(c, n);
                        if (n < 0) {
                            // Back off to accepting the original
                            // '\' as a literal
                            index -= (i + 1);
                            n = '\\';
                            break;
                        }
                    }
                    localMax = n;
                    break;
                case 'd':
                    if (inRange) {
                        reportError("msg.bad.range", "");
                        return false;
                    }
                    localMax = '9';
                    break;
                case 'D':
                case 's':
                case 'S':
                case 'w':
                case 'W':
                    if (inRange) {
                        reportError("msg.bad.range", "");
                        return false;
                    }
                    // These escapes get the maximum size, 65535.
                    target.bmsize = 65535;
                    return true;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    /*
                     *  This is a non-ECMA extension - decimal escapes (in this
                     *  case, octal!) are supposed to be an error inside class
                     *  ranges, but supported here for backwards compatibility.
                     *
                     */
                    n = (c - '0');
                    c = src[index];
                    if ('0' <= c && c <= '7') {
                        index++;
                        n = 8 * n + (c - '0');
                        c = src[index];
                        if ('0' <= c && c <= '7') {
                            index++;
                            i = 8 * n + (c - '0');
                            // A third digit is only taken when the value
                            // stays within 0377 (255).
                            if (i <= 0377)
                                n = i;
                            else
                                index--;
                        }
                    }
                    localMax = n;
                    break;

                default:
                    localMax = c;
                    break;
                }
                break;
            default:
                localMax = src[index++];
                break;
            }
            if (inRange) {
                if (rangeStart > localMax) {
                    reportError("msg.bad.range", "");
                    return false;
                }
                inRange = false;
            }
            else {
                // A '-' that is not the last character of the class starts a
                // range; its upper bound is read by the next iteration.
                if (index < (end - 1)) {
                    if (src[index] == '-') {
                        ++index;
                        inRange = true;
                        rangeStart = (char)localMax;
                        continue;
                    }
                }
            }
            if ((state.flags & JSREG_FOLD) != 0){
                char cu = upcase((char)localMax);
                char cd = downcase((char)localMax);
                localMax = (cu >= cd) ? cu : cd;
            }
            if (localMax > max)
                max = localMax;
        }
        target.bmsize = max;
        return true;
    }

    /*
     *  item:       assertion               An item is either an assertion or
     *              quantatom               a quantified atom.
     *
     *  assertion:  '^'                     Assertions match beginning of string
     *                                      (or line if the class static property
     *                                      RegExp.multiline is true).
     *              '$'                     End of string (or line if the class
     *                                      static property RegExp.multiline is
     *                                      true).
     *              '\b'                    Word boundary (between \w and \W).
     *              '\B'                    Word non-boundary.
     *
     *  quantatom:  atom                    An unquantified atom.
     *              quantatom '{' n ',' m '}'
     *                                      Atom must occur between n and m times.
* quantatom '{' n ',' '}' Atom must occur at least n times. * quantatom '{' n '}' Atom must occur exactly n times. * quantatom '*' Zero or more times (same as {0,}). * quantatom '+' One or more times (same as {1,}). * quantatom '?' Zero or one time (same as {0,1}). * * any of which can be optionally followed by '?' for ungreedy * * atom: '(' regexp ')' A parenthesized regexp (what matched * can be addressed using a backreference, * see '\' n below). * '.' Matches any char except '\n'. * '[' classlist ']' A character class. * '[' '^' classlist ']' A negated character class. * '\f' Form Feed. * '\n' Newline (Line Feed). * '\r' Carriage Return. * '\t' Horizontal Tab. * '\v' Vertical Tab. * '\d' A digit (same as [0-9]). * '\D' A non-digit. * '\w' A word character, [0-9a-z_A-Z]. * '\W' A non-word character. * '\s' A whitespace character, [ \b\f\n\r\t\v]. * '\S' A non-whitespace character. * '\' n A backreference to the nth (n decimal * and positive) parenthesized expression. * '\' octal An octal escape sequence (octal must be * two or three digits long, unless it is * 0 for the null character). * '\x' hex A hex escape (hex must be two digits). * '\c' ctrl A control character, ctrl is a letter. * '\' literalatomchar Any character except one of the above * that follow '\' in an atom. * otheratomchar Any character not first among the other * atom right-hand sides. 
*/ private static void doFlat(CompilerState state, char c) { state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = -1; state.progLength += 3; } private static int getDecimalValue(char c, CompilerState state, int maxValue, String overflowMessageId) { boolean overflow = false; int start = state.cp; char[] src = state.cpbegin; int value = c - '0'; for (; state.cp != state.cpend; ++state.cp) { c = src[state.cp]; if (!isDigit(c)) { break; } if (!overflow) { int digit = c - '0'; if (value < (maxValue - digit) / 10) { value = value * 10 + digit; } else { overflow = true; value = maxValue; } } } if (overflow) { reportError(overflowMessageId, String.valueOf(src, start, state.cp - start)); } return value; } private static boolean parseTerm(CompilerState state) { char[] src = state.cpbegin; char c = src[state.cp++]; int nDigits = 2; int parenBaseCount = state.parenCount; int num, tmp; RENode term; int termStart; switch (c) { /* assertions and atoms */ case '^': state.result = new RENode(REOP_BOL); state.progLength++; return true; case '$': state.result = new RENode(REOP_EOL); state.progLength++; return true; case '\\': if (state.cp < state.cpend) { c = src[state.cp++]; switch (c) { /* assertion escapes */ case 'b' : state.result = new RENode(REOP_WBDRY); state.progLength++; return true; case 'B': state.result = new RENode(REOP_WNONBDRY); state.progLength++; return true; /* Decimal escape */ case '0': /* * Under 'strict' ECMA 3, we interpret \0 as NUL and don't accept octal. * However, (XXX and since Rhino doesn't have a 'strict' mode) we'll just * behave the old way for compatibility reasons. 
* (see http://bugzilla.mozilla.org/show_bug.cgi?id=141078) * */ reportWarning(state.cx, "msg.bad.backref", ""); /* octal escape */ num = 0; while (state.cp < state.cpend) { c = src[state.cp]; if ((c >= '0') && (c <= '7')) { state.cp++; tmp = 8 * num + (c - '0'); if (tmp > 0377) break; num = tmp; } else break; } c = (char)(num); doFlat(state, c); break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': termStart = state.cp - 1; num = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.backref"); if (num > state.parenCount) reportWarning(state.cx, "msg.bad.backref", ""); /* * n > 9 or > count of parentheses, * then treat as octal instead. */ if ((num > 9) && (num > state.parenCount)) { state.cp = termStart; num = 0; while (state.cp < state.cpend) { c = src[state.cp]; if ((c >= '0') && (c <= '7')) { state.cp++; tmp = 8 * num + (c - '0'); if (tmp > 0377) break; num = tmp; } else break; } c = (char)(num); doFlat(state, c); break; } /* otherwise, it's a back-reference */ state.result = new RENode(REOP_BACKREF); state.result.parenIndex = num - 1; state.progLength += 3; break; /* Control escape */ case 'f': c = 0xC; doFlat(state, c); break; case 'n': c = 0xA; doFlat(state, c); break; case 'r': c = 0xD; doFlat(state, c); break; case 't': c = 0x9; doFlat(state, c); break; case 'v': c = 0xB; doFlat(state, c); break; /* Control letter */ case 'c': if (((state.cp + 1) < state.cpend) && Character.isLetter(src[state.cp + 1])) c = (char)(src[state.cp++] & 0x1F); else { /* back off to accepting the original '\' as a literal */ --state.cp; c = '\\'; } doFlat(state, c); break; /* UnicodeEscapeSequence */ case 'u': nDigits += 2; // fall thru... 
/* HexEscapeSequence */ case 'x': { int n = 0; int i; for (i = 0; (i < nDigits) && (state.cp < state.cpend); i++) { c = src[state.cp++]; n = Kit.xDigitToInt(c, n); if (n < 0) { // Back off to accepting the original // 'u' or 'x' as a literal state.cp -= (i + 2); n = src[state.cp++]; break; } } c = (char)(n); } doFlat(state, c); break; /* Character class escapes */ case 'd': state.result = new RENode(REOP_DIGIT); state.progLength++; break; case 'D': state.result = new RENode(REOP_NONDIGIT); state.progLength++; break; case 's': state.result = new RENode(REOP_SPACE); state.progLength++; break; case 'S': state.result = new RENode(REOP_NONSPACE); state.progLength++; break; case 'w': state.result = new RENode(REOP_ALNUM); state.progLength++; break; case 'W': state.result = new RENode(REOP_NONALNUM); state.progLength++; break; /* IdentityEscape */ default: state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = state.cp - 1; state.progLength += 3; break; } break; } else { /* a trailing '\' is an error */ reportError("msg.trail.backslash", ""); return false; } case '(': { RENode result = null; termStart = state.cp; if (state.cp + 1 < state.cpend && src[state.cp] == '?' && ((c = src[state.cp + 1]) == '=' || c == '!' || c == ':')) { state.cp += 2; if (c == '=') { result = new RENode(REOP_ASSERT); /* ASSERT, , ... ASSERTTEST */ state.progLength += 4; } else if (c == '!') { result = new RENode(REOP_ASSERT_NOT); /* ASSERTNOT, , ... ASSERTNOTTEST */ state.progLength += 4; } } else { result = new RENode(REOP_LPAREN); /* LPAREN, , ... 
RPAREN, */ state.progLength += 6; result.parenIndex = state.parenCount++; } ++state.parenNesting; if (!parseDisjunction(state)) return false; if (state.cp == state.cpend || src[state.cp] != ')') { reportError("msg.unterm.paren", ""); return false; } ++state.cp; --state.parenNesting; if (result != null) { result.kid = state.result; state.result = result; } break; } case ')': reportError("msg.re.unmatched.right.paren", ""); return false; case '[': state.result = new RENode(REOP_CLASS); termStart = state.cp; state.result.startIndex = termStart; while (true) { if (state.cp == state.cpend) { reportError("msg.unterm.class", ""); return false; } if (src[state.cp] == '\\') state.cp++; else { if (src[state.cp] == ']') { state.result.kidlen = state.cp - termStart; break; } } state.cp++; } state.result.index = state.classCount++; /* * Call calculateBitmapSize now as we want any errors it finds * to be reported during the parse phase, not at execution. */ if (!calculateBitmapSize(state, state.result, src, termStart, state.cp++)) return false; state.progLength += 3; /* CLASS, */ break; case '.': state.result = new RENode(REOP_DOT); state.progLength++; break; case '*': case '+': case '?': reportError("msg.bad.quant", String.valueOf(src[state.cp - 1])); return false; default: state.result = new RENode(REOP_FLAT); state.result.chr = c; state.result.length = 1; state.result.flatIndex = state.cp - 1; state.progLength += 3; break; } term = state.result; if (state.cp == state.cpend) { return true; } boolean hasQ = false; switch (src[state.cp]) { case '+': state.result = new RENode(REOP_QUANT); state.result.min = 1; state.result.max = -1; /* , , , ... */ state.progLength += 8; hasQ = true; break; case '*': state.result = new RENode(REOP_QUANT); state.result.min = 0; state.result.max = -1; /* , , , ... */ state.progLength += 8; hasQ = true; break; case '?': state.result = new RENode(REOP_QUANT); state.result.min = 0; state.result.max = 1; /* , , , ... 
*/ state.progLength += 8; hasQ = true; break; case '{': /* balance '}' */ { int min = 0; int max = -1; int leftCurl = state.cp; /* For Perl etc. compatibility, if quntifier does not match * \{\d+(,\d*)?\} exactly back off from it * being a quantifier, and chew it up as a literal * atom next time instead. */ c = src[++state.cp]; if (isDigit(c)) { ++state.cp; min = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.min"); c = src[state.cp]; if (c == ',') { c = src[++state.cp]; if (isDigit(c)) { ++state.cp; max = getDecimalValue(c, state, 0xFFFF, "msg.overlarge.max"); c = src[state.cp]; if (min > max) { reportError("msg.max.lt.min", String.valueOf(src[state.cp])); return false; } } } else { max = min; } /* balance '{' */ if (c == '}') { state.result = new RENode(REOP_QUANT); state.result.min = min; state.result.max = max; // QUANT, , , , // , ... state.progLength += 12; hasQ = true; } } if (!hasQ) { state.cp = leftCurl; } break; } } if (!hasQ) return true; ++state.cp; state.result.kid = term; state.result.parenIndex = parenBaseCount; state.result.parenCount = state.parenCount - parenBaseCount; if ((state.cp < state.cpend) && (src[state.cp] == '?')) { ++state.cp; state.result.greedy = false; } else state.result.greedy = true; return true; } private static void resolveForwardJump(byte[] array, int from, int pc) { if (from > pc) throw Kit.codeBug(); addIndex(array, from, pc - from); } private static int getOffset(byte[] array, int pc) { return getIndex(array, pc); } private static int addIndex(byte[] array, int pc, int index) { if (index < 0) throw Kit.codeBug(); if (index > 0xFFFF) throw Context.reportRuntimeError("Too complex regexp"); array[pc] = (byte)(index >> 8); array[pc + 1] = (byte)(index); return pc + 2; } private static int getIndex(byte[] array, int pc) { return ((array[pc] & 0xFF) << 8) | (array[pc + 1] & 0xFF); } private static final int OFFSET_LEN = 2; private static final int INDEX_LEN = 2; private static int emitREBytecode(CompilerState state, RECompiled 
re, int pc, RENode t) { RENode nextAlt; int nextAltFixup, nextTermFixup; byte[] program = re.program; while (t != null) { program[pc++] = t.op; switch (t.op) { case REOP_EMPTY: --pc; break; case REOP_ALT: nextAlt = t.kid2; nextAltFixup = pc; /* address of next alternate */ pc += OFFSET_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_JUMP; nextTermFixup = pc; /* address of following term */ pc += OFFSET_LEN; resolveForwardJump(program, nextAltFixup, pc); pc = emitREBytecode(state, re, pc, nextAlt); program[pc++] = REOP_JUMP; nextAltFixup = pc; pc += OFFSET_LEN; resolveForwardJump(program, nextTermFixup, pc); resolveForwardJump(program, nextAltFixup, pc); break; case REOP_FLAT: /* * Consecutize FLAT's if possible. */ if (t.flatIndex != -1) { while ((t.next != null) && (t.next.op == REOP_FLAT) && ((t.flatIndex + t.length) == t.next.flatIndex)) { t.length += t.next.length; t.next = t.next.next; } } if ((t.flatIndex != -1) && (t.length > 1)) { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_FLATi; else program[pc - 1] = REOP_FLAT; pc = addIndex(program, pc, t.flatIndex); pc = addIndex(program, pc, t.length); } else { if (t.chr < 256) { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_FLAT1i; else program[pc - 1] = REOP_FLAT1; program[pc++] = (byte)(t.chr); } else { if ((state.flags & JSREG_FOLD) != 0) program[pc - 1] = REOP_UCFLAT1i; else program[pc - 1] = REOP_UCFLAT1; pc = addIndex(program, pc, t.chr); } } break; case REOP_LPAREN: pc = addIndex(program, pc, t.parenIndex); pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_RPAREN; pc = addIndex(program, pc, t.parenIndex); break; case REOP_BACKREF: pc = addIndex(program, pc, t.parenIndex); break; case REOP_ASSERT: nextTermFixup = pc; pc += OFFSET_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ASSERTTEST; resolveForwardJump(program, nextTermFixup, pc); break; case REOP_ASSERT_NOT: nextTermFixup = pc; pc += OFFSET_LEN; pc = 
emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ASSERTNOTTEST; resolveForwardJump(program, nextTermFixup, pc); break; case REOP_QUANT: if ((t.min == 0) && (t.max == -1)) program[pc - 1] = (t.greedy) ? REOP_STAR : REOP_MINIMALSTAR; else if ((t.min == 0) && (t.max == 1)) program[pc - 1] = (t.greedy) ? REOP_OPT : REOP_MINIMALOPT; else if ((t.min == 1) && (t.max == -1)) program[pc - 1] = (t.greedy) ? REOP_PLUS : REOP_MINIMALPLUS; else { if (!t.greedy) program[pc - 1] = REOP_MINIMALQUANT; pc = addIndex(program, pc, t.min); // max can be -1 which addIndex does not accept pc = addIndex(program, pc, t.max + 1); } pc = addIndex(program, pc, t.parenCount); pc = addIndex(program, pc, t.parenIndex); nextTermFixup = pc; pc += OFFSET_LEN; pc = emitREBytecode(state, re, pc, t.kid); program[pc++] = REOP_ENDCHILD; resolveForwardJump(program, nextTermFixup, pc); break; case REOP_CLASS: pc = addIndex(program, pc, t.index); re.classList[t.index] = new RECharSet(t.bmsize, t.startIndex, t.kidlen); break; default: break; } t = t.next; } return pc; } private static void pushProgState(REGlobalData gData, int min, int max, REBackTrackData backTrackLastToSave, int continuation_pc, int continuation_op) { gData.stateStackTop = new REProgState(gData.stateStackTop, min, max, gData.cp, backTrackLastToSave, continuation_pc, continuation_op); } private static REProgState popProgState(REGlobalData gData) { REProgState state = gData.stateStackTop; gData.stateStackTop = state.previous; return state; } private static void pushBackTrackState(REGlobalData gData, byte op, int target) { gData.backTrackStackTop = new REBackTrackData(gData, op, target); } /* * Consecutive literal characters. 
*/ private static boolean flatNMatcher(REGlobalData gData, int matchChars, int length, char[] chars, int end) { if ((gData.cp + length) > end) return false; for (int i = 0; i < length; i++) { if (gData.regexp.source[matchChars + i] != chars[gData.cp + i]) { return false; } } gData.cp += length; return true; } private static boolean flatNIMatcher(REGlobalData gData, int matchChars, int length, char[] chars, int end) { if ((gData.cp + length) > end) return false; for (int i = 0; i < length; i++) { if (upcase(gData.regexp.source[matchChars + i]) != upcase(chars[gData.cp + i])) { return false; } } gData.cp += length; return true; } /* 1. Evaluate DecimalEscape to obtain an EscapeValue E. 2. If E is not a character then go to step 6. 3. Let ch be E's character. 4. Let A be a one-element RECharSet containing the character ch. 5. Call CharacterSetMatcher(A, false) and return its Matcher result. 6. E must be an integer. Let n be that integer. 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception. 8. Return an internal Matcher closure that takes two arguments, a State x and a Continuation c, and performs the following: 1. Let cap be x's captures internal array. 2. Let s be cap[n]. 3. If s is undefined, then call c(x) and return its result. 4. Let e be x's endIndex. 5. Let len be s's length. 6. Let f be e+len. 7. If f>InputLength, return failure. 8. If there exists an integer i between 0 (inclusive) and len (exclusive) such that Canonicalize(s[i]) is not the same character as Canonicalize(Input [e+i]), then return failure. 9. Let y be the State (f, cap). 10. Call c(y) and return its result. 
*/
    /*
     * Match the text captured by paren group parenIndex against the input
     * at gData.cp.  A group with no capture (index -1) matches trivially
     * without consuming input.  When JSREG_FOLD is set both sides are
     * compared through upcase().  On success gData.cp is advanced by the
     * capture length.
     */
    private static boolean backrefMatcher(REGlobalData gData, int parenIndex,
                                          char[] chars, int end)
    {
        int len;
        int i;
        int parenContent = gData.parens_index(parenIndex);
        if (parenContent == -1)
            return true;

        len = gData.parens_length(parenIndex);
        if ((gData.cp + len) > end)
            return false;

        if ((gData.regexp.flags & JSREG_FOLD) != 0) {
            for (i = 0; i < len; i++) {
                if (upcase(chars[parenContent + i]) != upcase(chars[gData.cp + i]))
                    return false;
            }
        }
        else {
            for (i = 0; i < len; i++) {
                if (chars[parenContent + i] != chars[gData.cp + i])
                    return false;
            }
        }
        gData.cp += len;
        return true;
    }


    /*
     * Add a single character to the RECharSet.
     *
     * Throws RuntimeException when c is larger than cs.length, the highest
     * character code the bitmap was sized for.
     */
    private static void addCharacterToCharSet(RECharSet cs, char c)
    {
        int byteIndex = (c / 8);
        if (c > cs.length)
            throw new RuntimeException(
                "regexp class character 0x" + Integer.toHexString(c)
                + " exceeds bitmap bound " + cs.length);
        cs.bits[byteIndex] |= 1 << (c & 0x7);
    }


    /*
     * Add a character range, c1 to c2 (inclusive) to the RECharSet.
     *
     * Throws RuntimeException when c2 is larger than cs.length or when the
     * range is reversed (c1 > c2).
     */
    private static void addCharacterRangeToCharSet(RECharSet cs, char c1, char c2)
    {
        int i;

        int byteIndex1 = (c1 / 8);
        int byteIndex2 = (c2 / 8);

        if ((c2 > cs.length) || (c1 > c2))
            throw new RuntimeException(
                "invalid regexp class range 0x" + Integer.toHexString(c1)
                + "-0x" + Integer.toHexString(c2)
                + " for bitmap bound " + cs.length);

        /* from here on c1 and c2 are bit positions inside their bytes */
        c1 &= 0x7;
        c2 &= 0x7;

        if (byteIndex1 == byteIndex2) {
            cs.bits[byteIndex1] |= ((0xFF) >> (7 - (c2 - c1))) << c1;
        }
        else {
            cs.bits[byteIndex1] |= 0xFF << c1;
            for (i = byteIndex1 + 1; i < byteIndex2; i++)
                cs.bits[i] = (byte)0xFF;
            cs.bits[byteIndex2] |= (0xFF) >> (7 - c2);
        }
    }

    /*
     * Compile the source of the class into a RECharSet.  The conversion
     * runs at most once per RECharSet: the converted flag is checked and
     * set while holding the RECharSet's monitor.
     */
    private static void processCharSet(REGlobalData gData, RECharSet charSet)
    {
        synchronized (charSet) {
            if (!charSet.converted) {
                processCharSetImpl(gData, charSet);
                charSet.converted = true;
            }
        }
    }


    /*
     * Build charSet.bits and charSet.sense from the class text found in
     * gData.regexp.source at [startIndex, startIndex + strlength).
     * A leading '^' clears sense.  Escapes are decoded, \d \D \s \S \w \W
     * add whole sets, and "a-b" adds a range; with JSREG_FOLD both the
     * upcase() and downcase() forms are added.
     */
    private static void processCharSetImpl(REGlobalData gData, RECharSet charSet)
    {
        int src = charSet.startIndex;
        int end = src + charSet.strlength;

        char rangeStart = 0, thisCh;
        int byteLength;
        char c;
        int n;
        int nDigits;
        int i;
        boolean inRange = false;

        charSet.sense = true;
        byteLength = (charSet.length / 8) + 1;
        charSet.bits = new byte[byteLength];

        if (src == end)
            return;

        if (gData.regexp.source[src] == '^') {
            charSet.sense = false;
            ++src;
        }

        while (src != end) {
            nDigits = 2;
            switch (gData.regexp.source[src]) {
            case '\\':
                ++src;
                c = gData.regexp.source[src++];
                switch (c) {
                case 'b':
                    thisCh = 0x8;
                    break;
                case 'f':
                    thisCh = 0xC;
                    break;
                case 'n':
                    thisCh = 0xA;
                    break;
                case 'r':
                    thisCh = 0xD;
                    break;
                case 't':
                    thisCh = 0x9;
                    break;
                case 'v':
                    thisCh = 0xB;
                    break;
                case 'c':
                    /*
                     * NOTE(review): the lookahead tests source[src + 1]
                     * while the character consumed is source[src].  Left
                     * unchanged here because the pass that sizes the
                     * bitmap is not part of this block and may need to
                     * stay in step - confirm before altering.
                     */
                    if (((src + 1) < end) && isWord(gData.regexp.source[src + 1]))
                        thisCh = (char)(gData.regexp.source[src++] & 0x1F);
                    else {
                        --src;
                        thisCh = '\\';
                    }
                    break;
                case 'u':
                    nDigits += 2;
                    // fall thru
                case 'x':
                    n = 0;
                    for (i = 0; (i < nDigits) && (src < end); i++) {
                        c = gData.regexp.source[src++];
                        int digit = toASCIIHexDigit(c);
                        if (digit < 0) {
                            /* back off to accepting the original '\'
                             * as a literal
                             */
                            src -= (i + 1);
                            n = '\\';
                            break;
                        }
                        n = (n << 4) | digit;
                    }
                    thisCh = (char)(n);
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    /*
                     *  This is a non-ECMA extension - decimal escapes (in this
                     *  case, octal!) are supposed to be an error inside class
                     *  ranges, but supported here for backwards compatibility.
                     *
                     */
                    n = (c - '0');
                    c = gData.regexp.source[src];
                    if ('0' <= c && c <= '7') {
                        src++;
                        n = 8 * n + (c - '0');
                        c = gData.regexp.source[src];
                        if ('0' <= c && c <= '7') {
                            src++;
                            i = 8 * n + (c - '0');
                            /* a third digit is kept only if the value fits in a byte */
                            if (i <= 0377)
                                n = i;
                            else
                                src--;
                        }
                    }
                    thisCh = (char)(n);
                    break;

                case 'd':
                    addCharacterRangeToCharSet(charSet, '0', '9');
                    continue;   /* don't need range processing */
                case 'D':
                    addCharacterRangeToCharSet(charSet, (char)0, (char)('0' - 1));
                    addCharacterRangeToCharSet(charSet, (char)('9' + 1),
                                                (char)(charSet.length));
                    continue;
                case 's':
                    for (i = charSet.length; i >= 0; i--)
                        if (isREWhiteSpace(i))
                            addCharacterToCharSet(charSet, (char)(i));
                    continue;
                case 'S':
                    for (i = charSet.length; i >= 0; i--)
                        if (!isREWhiteSpace(i))
                            addCharacterToCharSet(charSet, (char)(i));
                    continue;
                case 'w':
                    for (i = charSet.length; i >= 0; i--)
                        if (isWord((char)i))
                            addCharacterToCharSet(charSet, (char)(i));
                    continue;
                case 'W':
                    for (i = charSet.length; i >= 0; i--)
                        if (!isWord((char)i))
                            addCharacterToCharSet(charSet, (char)(i));
                    continue;
                default:
                    thisCh = c;
                    break;

                }
                break;

            default:
                thisCh = gData.regexp.source[src++];
                break;

            }
            if (inRange) {
                if ((gData.regexp.flags & JSREG_FOLD) != 0) {
                    addCharacterRangeToCharSet(charSet,
                                               upcase(rangeStart),
                                               upcase(thisCh));
                    addCharacterRangeToCharSet(charSet,
                                               downcase(rangeStart),
                                               downcase(thisCh));
                } else {
                    addCharacterRangeToCharSet(charSet, rangeStart, thisCh);
                }
                inRange = false;
            }
            else {
                if ((gData.regexp.flags & JSREG_FOLD) != 0) {
                    addCharacterToCharSet(charSet, upcase(thisCh));
                    addCharacterToCharSet(charSet, downcase(thisCh));
                } else {
                    addCharacterToCharSet(charSet, thisCh);
                }
                /* a '-' that is not the last class character starts a range */
                if (src < (end - 1)) {
                    if (gData.regexp.source[src] == '-') {
                        ++src;
                        inRange = true;
                        rangeStart = thisCh;
                    }
                }
            }
        }
    }


    /*
     *   Initialize the character set if this is the first call.
* Test the bit - if the ^ flag was specified, non-inclusion is a success
     *
     *   A positive class (sense == true) fails when the bitmap bound is 0,
     *   when ch lies above the bound, or when the bit for ch is clear; a
     *   negated class succeeds in exactly those cases.
     */
    private static boolean classMatcher(REGlobalData gData,
                                        RECharSet charSet, char ch)
    {
        if (!charSet.converted) {
            processCharSet(gData, charSet);
        }

        int byteIndex = ch / 8;
        if (charSet.sense) {
            if ((charSet.length == 0) ||
                 ( (ch > charSet.length)
                    || ((charSet.bits[byteIndex] & (1 << (ch & 0x7))) == 0) ))
                return false;
        } else {
            if (! ((charSet.length == 0) ||
                     ( (ch > charSet.length)
                        || ((charSet.bits[byteIndex] & (1 << (ch & 0x7))) == 0) )))
                return false;
        }
        return true;
    }

    /*
     * Interpret gData.regexp.program against chars, starting at gData.cp;
     * `end` is the exclusive upper bound of the input.
     *
     * Returns true once REOP_END is executed.  Returns false when an op
     * fails and the backtrack stack is empty.
     *
     * Loop protocol: a case that leaves the switch with `break` has only
     * set `result`; the code after the switch then either backtracks (on
     * failure) or fetches the next opcode from program[pc].  A case that
     * ends with `continue` has already chosen the next op and pc itself.
     */
    private static boolean executeREBytecode(REGlobalData gData, char[] chars, int end)
    {
        int pc = 0;
        byte program[] = gData.regexp.program;
        int currentContinuation_op;
        int currentContinuation_pc;
        boolean result = false;

        /* with no enclosing quantifier the continuation is "end of program" */
        currentContinuation_pc = 0;
        currentContinuation_op = REOP_END;
        if (debug) {
            System.out.println("Input = \"" + new String(chars) + "\", start at " + gData.cp);
        }
        int op = program[pc++];
        for (;;) {
            if (debug) {
                System.out.println("Testing at " + gData.cp + ", op = " + op);
            }
            switch (op) {
            case REOP_EMPTY:
                result = true;
                break;
            case REOP_BOL:
                /* position 0 always matches; elsewhere only after a line
                   terminator and only in multiline mode */
                if (gData.cp != 0) {
                    if (gData.multiline ||
                            ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
                        if (!isLineTerm(chars[gData.cp - 1])) {
                            result = false;
                            break;
                        }
                    }
                    else {
                        result = false;
                        break;
                    }
                }
                result = true;
                break;
            case REOP_EOL:
                /* mirror image of REOP_BOL for the end of input */
                if (gData.cp != end) {
                    if (gData.multiline ||
                            ((gData.regexp.flags & JSREG_MULTILINE) != 0)) {
                        if (!isLineTerm(chars[gData.cp])) {
                            result = false;
                            break;
                        }
                    }
                    else {
                        result = false;
                        break;
                    }
                }
                result = true;
                break;
            case REOP_WBDRY:
                result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1])) ^ !((gData.cp < end) && isWord(chars[gData.cp])));
                break;
            case REOP_WNONBDRY:
                result = ((gData.cp == 0 || !isWord(chars[gData.cp - 1])) ^ ((gData.cp < end) && isWord(chars[gData.cp])));
                break;
            /* the single-character tests below consume one char on success */
            case REOP_DOT:
                result = (gData.cp != end && !isLineTerm(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_DIGIT:
                result = (gData.cp != end && isDigit(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_NONDIGIT:
                result = (gData.cp != end && !isDigit(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_SPACE:
                result = (gData.cp != end && isREWhiteSpace(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_NONSPACE:
                result = (gData.cp != end && !isREWhiteSpace(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_ALNUM:
                result = (gData.cp != end && isWord(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_NONALNUM:
                result = (gData.cp != end && !isWord(chars[gData.cp]));
                if (result) {
                    gData.cp++;
                }
                break;
            case REOP_FLAT:
                {
                    /* operands: index into the regexp source, then length */
                    int offset = getIndex(program, pc);
                    pc += INDEX_LEN;
                    int length = getIndex(program, pc);
                    pc += INDEX_LEN;
                    result = flatNMatcher(gData, offset, length, chars, end);
                }
                break;
            case REOP_FLATi:
                {
                    int offset = getIndex(program, pc);
                    pc += INDEX_LEN;
                    int length = getIndex(program, pc);
                    pc += INDEX_LEN;
                    result = flatNIMatcher(gData, offset, length, chars, end);
                }
                break;
            case REOP_FLAT1:
                {
                    /* the literal is stored in one byte; mask off sign extension */
                    char matchCh = (char)(program[pc++] & 0xFF);
                    result = (gData.cp != end && chars[gData.cp] == matchCh);
                    if (result) {
                        gData.cp++;
                    }
                }
                break;
            case REOP_FLAT1i:
                {
                    char matchCh = (char)(program[pc++] & 0xFF);
                    result = (gData.cp != end && upcase(chars[gData.cp]) == upcase(matchCh));
                    if (result) {
                        gData.cp++;
                    }
                }
                break;
            case REOP_UCFLAT1:
                {
                    char matchCh = (char)getIndex(program, pc);
                    pc += INDEX_LEN;
                    result = (gData.cp != end && chars[gData.cp] == matchCh);
                    if (result) {
                        gData.cp++;
                    }
                }
                break;
            case REOP_UCFLAT1i:
                {
                    char matchCh = (char)getIndex(program, pc);
                    pc += INDEX_LEN;
                    result = (gData.cp != end && upcase(chars[gData.cp]) == upcase(matchCh));
                    if (result) {
                        gData.cp++;
                    }
                }
                break;
            case REOP_ALT:
                {
                    int nextpc;
                    byte nextop;
                    pushProgState(gData, 0, 0, null,
                                  currentContinuation_pc,
                                  currentContinuation_op);
                    /* remember the second alternative as a backtrack point,
                       then fall into the first one */
                    nextpc = pc + getOffset(program, pc);
                    nextop = program[nextpc++];
                    pushBackTrackState(gData, nextop, nextpc);
                    pc += INDEX_LEN;
                    op = program[pc++];
                }
                continue;

            case REOP_JUMP:
                {
                    int offset;
                    /* drop the state pushed by REOP_ALT and restore its
                       continuation before jumping */
                    REProgState state = popProgState(gData);
                    currentContinuation_pc = state.continuation_pc;
                    currentContinuation_op = state.continuation_op;
                    offset = getOffset(program, pc);
                    pc += offset;
                    op = program[pc++];
                }
                continue;

            case REOP_LPAREN:
                {
                    int parenIndex = getIndex(program, pc);
                    pc += INDEX_LEN;
                    /* record where the capture starts; length is set by RPAREN */
                    gData.set_parens(parenIndex, gData.cp, 0);
                    op = program[pc++];
                }
                continue;
            case REOP_RPAREN:
                {
                    int cap_index;
                    int parenIndex = getIndex(program, pc);
                    pc += INDEX_LEN;
                    cap_index = gData.parens_index(parenIndex);
                    gData.set_parens(parenIndex, cap_index,
                                     gData.cp - cap_index);
                    if (parenIndex > gData.lastParen)
                        gData.lastParen = parenIndex;
                    op = program[pc++];
                }
                continue;
            case REOP_BACKREF:
                {
                    int parenIndex = getIndex(program, pc);
                    pc += INDEX_LEN;
                    result = backrefMatcher(gData, parenIndex, chars, end);
                }
                break;

            case REOP_CLASS:
                {
                    int index = getIndex(program, pc);
                    pc += INDEX_LEN;
                    if (gData.cp != end) {
                        if (classMatcher(gData, gData.regexp.classList[index],
                                         chars[gData.cp]))
                        {
                            gData.cp++;
                            result = true;
                            break;
                        }
                    }
                    result = false;
                }
                break;

            case REOP_ASSERT:
            case REOP_ASSERT_NOT:
                {
                    byte testOp;
                    /* save the current backtrack top so the matching test
                       op can restore it */
                    pushProgState(gData, 0, 0, gData.backTrackStackTop,
                                  currentContinuation_pc,
                                  currentContinuation_op);
                    if (op == REOP_ASSERT) {
                        testOp = REOP_ASSERTTEST;
                    } else {
                        testOp = REOP_ASSERTNOTTEST;
                    }
                    /* if the assertion body fails, backtracking resumes at
                       the test op with result == false */
                    pushBackTrackState(gData, testOp,
                                       pc + getOffset(program, pc));
                    pc += INDEX_LEN;
                    op = program[pc++];
                }
                continue;

            case REOP_ASSERTTEST:
            case REOP_ASSERTNOTTEST:
                {
                    REProgState state = popProgState(gData);
                    /* assertions consume no input: rewind to where they began */
                    gData.cp = state.index;
                    gData.backTrackStackTop = state.backTrack;
                    currentContinuation_pc = state.continuation_pc;
                    currentContinuation_op = state.continuation_op;
                    if (result) {
                        if (op == REOP_ASSERTTEST) {
                            result = true;
                        } else {
                            result = false;
                        }
                    } else {
                        if (op == REOP_ASSERTTEST) {
                            // Do nothing
                        } else {
                            result = true;
                        }
                    }
                }
                break;

            case REOP_STAR:
            case REOP_PLUS:
            case REOP_OPT:
            case REOP_QUANT:
            case REOP_MINIMALSTAR:
            case REOP_MINIMALPLUS:
            case REOP_MINIMALOPT:
            case REOP_MINIMALQUANT:
                {
                    int min, max;
                    boolean greedy = false;
                    switch (op) {
                      case REOP_STAR:
                        greedy = true;
                        // fallthrough
                      case REOP_MINIMALSTAR:
                        min = 0;
                        max = -1;
                        break;
                      case REOP_PLUS:
                        greedy = true;
                        // fallthrough
                      case REOP_MINIMALPLUS:
                        min = 1;
                        max = -1;
                        break;
                      case REOP_OPT:
                        greedy = true;
                        // fallthrough
                      case REOP_MINIMALOPT:
                        min = 0;
                        max = 1;
                        break;
                      case REOP_QUANT:
                        greedy = true;
                        // fallthrough
                      case REOP_MINIMALQUANT:
                        min = getOffset(program, pc);
                        pc += INDEX_LEN;
                        // See comments in emitREBytecode for " - 1" reason
                        max = getOffset(program, pc) - 1;
                        pc += INDEX_LEN;
                        break;
                      default:
                        throw Kit.codeBug();
                    }
                    pushProgState(gData, min, max, null,
                                  currentContinuation_pc,
                                  currentContinuation_op);
                    if (greedy) {
                        currentContinuation_op = REOP_REPEAT;
                        currentContinuation_pc = pc;
                        pushBackTrackState(gData, REOP_REPEAT, pc);
                        /* Step over parenCount, parenIndex & next-term offset */
                        pc += 3 * INDEX_LEN;
                        op = program[pc++];
                    } else {
                        if (min != 0) {
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            /* parenCount, parenIndex & next-term offset */
                            pc += 3 * INDEX_LEN;
                            op = program[pc++];
                        } else {
                            /* a minimal quantifier with min 0 first tries
                               to match nothing: skip the child entirely */
                            pushBackTrackState(gData, REOP_MINIMALREPEAT, pc);
                            popProgState(gData);
                            pc += 2 * INDEX_LEN;  // parenCount & parenIndex
                            pc = pc + getOffset(program, pc);
                            op = program[pc++];
                        }
                    }
                }
                continue;

            case REOP_ENDCHILD:
                // Use the current continuation.
                pc = currentContinuation_pc;
                op = currentContinuation_op;
                continue;

            case REOP_REPEAT:
                {
                    REProgState state = popProgState(gData);
                    if (!result) {
                        //
                        // There's been a failure, see if we have enough
                        // children.
                        //
                        if (state.min == 0)
                            result = true;
                        currentContinuation_pc = state.continuation_pc;
                        currentContinuation_op = state.continuation_op;
                        pc += 2 * INDEX_LEN;  /* parenCount & parenIndex */
                        pc = pc + getOffset(program, pc);
                        break;
                    }
                    else {
                        if (state.min == 0 && gData.cp == state.index) {
                            // matched an empty string, that'll get us nowhere
                            result = false;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset(program, pc);
                            break;
                        }
                        /* one more child matched: shrink the remaining
                           bounds (-1 means "no upper bound") */
                        int new_min = state.min, new_max = state.max;
                        if (new_min != 0) new_min--;
                        if (new_max != -1) new_max--;
                        if (new_max == 0) {
                            result = true;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset(program, pc);
                            break;
                        }
                        pushProgState(gData, new_min, new_max, null,
                                      state.continuation_pc,
                                      state.continuation_op);
                        currentContinuation_op = REOP_REPEAT;
                        currentContinuation_pc = pc;
                        pushBackTrackState(gData, REOP_REPEAT, pc);
                        int parenCount = getIndex(program, pc);
                        pc += INDEX_LEN;
                        int parenIndex = getIndex(program, pc);
                        pc += 2 * INDEX_LEN;
                        op = program[pc++];
                        /* captures inside the child are reset before the
                           next iteration */
                        for (int k = 0; k < parenCount; k++) {
                            gData.set_parens(parenIndex + k, -1, 0);
                        }
                    }
                }
                continue;

            case REOP_MINIMALREPEAT:
                {
                    REProgState state = popProgState(gData);
                    if (!result) {
                        //
                        // Non-greedy failure - try to consume another child.
                        //
                        if (state.max == -1 || state.max > 0) {
                            pushProgState(gData, state.min, state.max, null,
                                          state.continuation_pc,
                                          state.continuation_op);
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            int parenCount = getIndex(program, pc);
                            pc += INDEX_LEN;
                            int parenIndex = getIndex(program, pc);
                            pc += 2 * INDEX_LEN;
                            for (int k = 0; k < parenCount; k++) {
                                gData.set_parens(parenIndex + k, -1, 0);
                            }
                            op = program[pc++];
                            continue;
                        } else {
                            // Don't need to adjust pc since we're going to pop.
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            break;
                        }
                    } else {
                        if (state.min == 0 && gData.cp == state.index) {
                            // Matched an empty string, that'll get us nowhere.
                            result = false;
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            break;
                        }
                        int new_min = state.min, new_max = state.max;
                        if (new_min != 0) new_min--;
                        if (new_max != -1) new_max--;
                        pushProgState(gData, new_min, new_max, null,
                                      state.continuation_pc,
                                      state.continuation_op);
                        if (new_min != 0) {
                            /* minimum not yet reached: another child is
                               mandatory */
                            currentContinuation_op = REOP_MINIMALREPEAT;
                            currentContinuation_pc = pc;
                            int parenCount = getIndex(program, pc);
                            pc += INDEX_LEN;
                            int parenIndex = getIndex(program, pc);
                            pc += 2 * INDEX_LEN;
                            for (int k = 0; k < parenCount; k++) {
                                gData.set_parens(parenIndex + k, -1, 0);
                            }
                            op = program[pc++];
                        } else {
                            /* minimum satisfied: try the rest of the
                               pattern first, keeping this op as the
                               backtrack point */
                            currentContinuation_pc = state.continuation_pc;
                            currentContinuation_op = state.continuation_op;
                            pushBackTrackState(gData, REOP_MINIMALREPEAT, pc);
                            popProgState(gData);
                            pc += 2 * INDEX_LEN;
                            pc = pc + getOffset(program, pc);
                            op = program[pc++];
                        }
                        continue;
                    }
                }

            case REOP_END:
                return true;

            default:
                throw Kit.codeBug();

            }
            /*
             *  If the match failed and there's a backtrack option, take it.
             *  Otherwise this is a complete and utter failure.
             */
            if (!result) {
                REBackTrackData backTrackData = gData.backTrackStackTop;
                if (backTrackData != null) {
                    gData.backTrackStackTop = backTrackData.previous;

                    gData.lastParen = backTrackData.lastParen;

                    // XXX: If backTrackData will no longer be used, then
                    // there is no need to clone backTrackData.parens
                    if (backTrackData.parens != null) {
                        gData.parens = backTrackData.parens.clone();
                    }

                    gData.cp = backTrackData.cp;

                    gData.stateStackTop = backTrackData.stateStackTop;

                    currentContinuation_op
                        = gData.stateStackTop.continuation_op;
                    currentContinuation_pc
                        = gData.stateStackTop.continuation_pc;
                    pc = backTrackData.continuation_pc;
                    op = backTrackData.continuation_op;
                    continue;
                }
                else
                    return false;
            }

            op = program[pc++];
        }

    }

    /*
     * Try the compiled regexp `re` at every position from start to end
     * (inclusive) of chars and stop at the first position where
     * executeREBytecode succeeds.  On success gData.skipped holds the
     * number of positions stepped over before the match and gData.cp the
     * position after it.
     */
    private static boolean matchRegExp(REGlobalData gData, RECompiled re,
                                       char[] chars, int start, int end,
                                       boolean multiline)
    {
        /* one packed (index, length) slot per capturing group */
        if (re.parenCount != 0) {
            gData.parens = new long[re.parenCount];
        } else {
            gData.parens = null;
        }

        gData.backTrackStackTop = null;

        gData.stateStackTop = null;

        gData.multiline = multiline;
        gData.regexp = re;
        gData.lastParen = 0;

        int anchorCh = gData.regexp.anchorCh;
        //
        // have to include the position beyond the last character
        //  in order to detect end-of-input/line condition
        //
        for (int i = start; i <= end; ++i) {
            //
            // If the first node is a literal match, step the index into
            // the string until that match is made, or fail if it can't be
            // found at all.
// if (anchorCh >= 0) { for (;;) { if (i == end) { return false; } char matchCh = chars[i]; if (matchCh == anchorCh || ((gData.regexp.flags & JSREG_FOLD) != 0 && upcase(matchCh) == upcase((char)anchorCh))) { break; } ++i; } } gData.cp = i; for (int j = 0; j < re.parenCount; j++) { gData.set_parens(j, -1, 0); } boolean result = executeREBytecode(gData, chars, end); gData.backTrackStackTop = null; gData.stateStackTop = null; if (result) { gData.skipped = i - start; return true; } } return false; } /* * indexp is assumed to be an array of length 1 */ Object executeRegExp(Context cx, Scriptable scopeObj, RegExpImpl res, String str, int indexp[], int matchType) { REGlobalData gData = new REGlobalData(); int start = indexp[0]; char[] charArray = str.toCharArray(); int end = charArray.length; if (start > end) start = end; // // Call the recursive matcher to do the real work. // boolean matches = matchRegExp(gData, re, charArray, start, end, res.multiline); if (!matches) { if (matchType != PREFIX) return null; return Undefined.instance; } int index = gData.cp; int i = index; indexp[0] = i; int matchlen = i - (start + gData.skipped); int ep = index; index -= matchlen; Object result; Scriptable obj; if (matchType == TEST) { /* * Testing for a match and updating cx.regExpImpl: don't allocate * an array object, do return true. */ result = Boolean.TRUE; obj = null; } else { /* * The array returned on match has element 0 bound to the matched * string, elements 1 through re.parenCount bound to the paren * matches, an index property telling the length of the left context, * and an input property referring to the input string. 
*/ Scriptable scope = getTopLevelScope(scopeObj); result = ScriptRuntime.newObject(cx, scope, "Array", null); obj = (Scriptable) result; String matchstr = new String(charArray, index, matchlen); obj.put(0, obj, matchstr); } if (re.parenCount == 0) { res.parens = null; res.lastParen = SubString.emptySubString; } else { SubString parsub = null; int num; res.parens = new SubString[re.parenCount]; for (num = 0; num < re.parenCount; num++) { int cap_index = gData.parens_index(num); String parstr; if (cap_index != -1) { int cap_length = gData.parens_length(num); parsub = new SubString(charArray, cap_index, cap_length); res.parens[num] = parsub; if (matchType == TEST) continue; parstr = parsub.toString(); obj.put(num+1, obj, parstr); } else { if (matchType != TEST) obj.put(num+1, obj, Undefined.instance); } } res.lastParen = parsub; } if (! (matchType == TEST)) { /* * Define the index and input properties last for better for/in loop * order (so they come after the elements). */ obj.put("index", obj, new Integer(start + gData.skipped)); obj.put("input", obj, str); } if (res.lastMatch == null) { res.lastMatch = new SubString(); res.leftContext = new SubString(); res.rightContext = new SubString(); } res.lastMatch.charArray = charArray; res.lastMatch.index = index; res.lastMatch.length = matchlen; res.leftContext.charArray = charArray; if (cx.getLanguageVersion() == Context.VERSION_1_2) { /* * JS1.2 emulated Perl4.0.1.8 (patch level 36) for global regexps used * in scalar contexts, and unintentionally for the string.match "list" * psuedo-context. On "hi there bye", the following would result: * * Language while(/ /g){print("$`");} s/ /$`/g * perl4.036 "hi", "there" "hihitherehi therebye" * perl5 "hi", "hi there" "hihitherehi therebye" * js1.2 "hi", "there" "hihitheretherebye" * * Insofar as JS1.2 always defined $` as "left context from the last * match" for global regexps, it was more consistent than perl4. 
*/ res.leftContext.index = start; res.leftContext.length = gData.skipped; } else { /* * For JS1.3 and ECMAv2, emulate Perl5 exactly: * * js1.3 "hi", "hi there" "hihitherehi therebye" */ res.leftContext.index = 0; res.leftContext.length = start + gData.skipped; } res.rightContext.charArray = charArray; res.rightContext.index = ep; res.rightContext.length = end - ep; return result; } int getFlags() { return re.flags; } private static void reportWarning(Context cx, String messageId, String arg) { if (cx.hasFeature(Context.FEATURE_STRICT_MODE)) { String msg = ScriptRuntime.getMessage1(messageId, arg); Context.reportWarning(msg); } } private static void reportError(String messageId, String arg) { String msg = ScriptRuntime.getMessage1(messageId, arg); throw ScriptRuntime.constructError("SyntaxError", msg); } // #string_id_map# private static final int Id_lastIndex = 1, Id_source = 2, Id_global = 3, Id_ignoreCase = 4, Id_multiline = 5, MAX_INSTANCE_ID = 5; @Override protected int getMaxInstanceId() { return MAX_INSTANCE_ID; } @Override protected int findInstanceIdInfo(String s) { int id; // #generated# Last update: 2007-05-09 08:16:24 EDT L0: { id = 0; String X = null; int c; int s_length = s.length(); if (s_length==6) { c=s.charAt(0); if (c=='g') { X="global";id=Id_global; } else if (c=='s') { X="source";id=Id_source; } } else if (s_length==9) { c=s.charAt(0); if (c=='l') { X="lastIndex";id=Id_lastIndex; } else if (c=='m') { X="multiline";id=Id_multiline; } } else if (s_length==10) { X="ignoreCase";id=Id_ignoreCase; } if (X!=null && X!=s && !X.equals(s)) id = 0; break L0; } // #/generated# // #/string_id_map# if (id == 0) return super.findInstanceIdInfo(s); int attr; switch (id) { case Id_lastIndex: attr = PERMANENT | DONTENUM; break; case Id_source: case Id_global: case Id_ignoreCase: case Id_multiline: attr = PERMANENT | READONLY | DONTENUM; break; default: throw new IllegalStateException(); } return instanceIdInfo(attr, id); } @Override protected String 
getInstanceIdName(int id) { switch (id) { case Id_lastIndex: return "lastIndex"; case Id_source: return "source"; case Id_global: return "global"; case Id_ignoreCase: return "ignoreCase"; case Id_multiline: return "multiline"; } return super.getInstanceIdName(id); } @Override protected Object getInstanceIdValue(int id) { switch (id) { case Id_lastIndex: return ScriptRuntime.wrapNumber(lastIndex); case Id_source: return new String(re.source); case Id_global: return ScriptRuntime.wrapBoolean((re.flags & JSREG_GLOB) != 0); case Id_ignoreCase: return ScriptRuntime.wrapBoolean((re.flags & JSREG_FOLD) != 0); case Id_multiline: return ScriptRuntime.wrapBoolean((re.flags & JSREG_MULTILINE) != 0); } return super.getInstanceIdValue(id); } @Override protected void setInstanceIdValue(int id, Object value) { if (id == Id_lastIndex) { lastIndex = ScriptRuntime.toNumber(value); return; } super.setInstanceIdValue(id, value); } @Override protected void initPrototypeId(int id) { String s; int arity; switch (id) { case Id_compile: arity=1; s="compile"; break; case Id_toString: arity=0; s="toString"; break; case Id_toSource: arity=0; s="toSource"; break; case Id_exec: arity=1; s="exec"; break; case Id_test: arity=1; s="test"; break; case Id_prefix: arity=1; s="prefix"; break; default: throw new IllegalArgumentException(String.valueOf(id)); } initPrototypeMethod(REGEXP_TAG, id, s, arity); } @Override public Object execIdCall(IdFunctionObject f, Context cx, Scriptable scope, Scriptable thisObj, Object[] args) { if (!f.hasTag(REGEXP_TAG)) { return super.execIdCall(f, cx, scope, thisObj, args); } int id = f.methodId(); switch (id) { case Id_compile: return realThis(thisObj, f).compile(cx, scope, args); case Id_toString: case Id_toSource: return realThis(thisObj, f).toString(); case Id_exec: return realThis(thisObj, f).execSub(cx, scope, args, MATCH); case Id_test: { Object x = realThis(thisObj, f).execSub(cx, scope, args, TEST); return Boolean.TRUE.equals(x) ? 
Boolean.TRUE : Boolean.FALSE; } case Id_prefix: return realThis(thisObj, f).execSub(cx, scope, args, PREFIX); } throw new IllegalArgumentException(String.valueOf(id)); } private static NativeRegExp realThis(Scriptable thisObj, IdFunctionObject f) { if (!(thisObj instanceof NativeRegExp)) throw incompatibleCallError(f); return (NativeRegExp)thisObj; } // #string_id_map# @Override protected int findPrototypeId(String s) { int id; // #generated# Last update: 2007-05-09 08:16:24 EDT L0: { id = 0; String X = null; int c; L: switch (s.length()) { case 4: c=s.charAt(0); if (c=='e') { X="exec";id=Id_exec; } else if (c=='t') { X="test";id=Id_test; } break L; case 6: X="prefix";id=Id_prefix; break L; case 7: X="compile";id=Id_compile; break L; case 8: c=s.charAt(3); if (c=='o') { X="toSource";id=Id_toSource; } else if (c=='t') { X="toString";id=Id_toString; } break L; } if (X!=null && X!=s && !X.equals(s)) id = 0; break L0; } // #/generated# return id; } private static final int Id_compile = 1, Id_toString = 2, Id_toSource = 3, Id_exec = 4, Id_test = 5, Id_prefix = 6, MAX_PROTOTYPE_ID = 6; // #/string_id_map# private RECompiled re; double lastIndex; /* index after last match, for //g iterator */ } // class NativeRegExp class RECompiled implements Serializable { static final long serialVersionUID = -6144956577595844213L; char []source; /* locked source string, sans // */ int parenCount; /* number of parenthesized submatches */ int flags; /* flags */ byte[] program; /* regular expression bytecode */ int classCount; /* count [...] bitmaps */ RECharSet[] classList; /* list of [...] bitmaps */ int anchorCh = -1; /* if >= 0, then re starts with this literal char */ } class RENode { RENode(byte op) { this.op = op; } byte op; /* r.e. 
op bytecode */
    RENode          next;       /* next in concatenation order */
    RENode          kid;        /* first operand */

    RENode          kid2;       /* second operand */
    int             num;        /* could be a number */
    int             parenIndex; /* or a parenthesis index */

                                /* or a range */
    int             min;
    int             max;
    int             parenCount;
    boolean         greedy;

                                /* or a character class */
    int             startIndex;
    int             kidlen;     /* length of string at kid, in chars */
    int             bmsize;     /* bitmap size, based on max char code */
    int             index;      /* index into class list */

                                /* or a literal sequence */
    char            chr;        /* of one character */
    int             length;     /* or many (via the index) */
    int             flatIndex;  /* which is -1 if not sourced */

}

/* Mutable state carried through one compilation of a regexp source. */
class CompilerState {

    CompilerState(Context cx, char[] source, int length, int flags)
    {
        this.cx = cx;
        this.flags = flags;
        /* the source buffer and the scan window over it */
        this.cpbegin = source;
        this.cpend = length;
        this.cp = 0;
        /* counters all start from zero */
        this.parenCount = 0;
        this.classCount = 0;
        this.progLength = 0;
    }

    Context     cx;
    char[]      cpbegin;
    int         cpend;
    int         cp;
    int         flags;
    int         parenCount;
    int         parenNesting;
    int         classCount;   /* number of [] encountered */
    int         progLength;   /* estimated bytecode length */
    RENode      result;
}

/* One entry of the interpreter's program-state stack. */
class REProgState
{
    REProgState(REProgState previous, int min, int max, int index,
                REBackTrackData backTrack,
                int continuation_pc, int continuation_op)
    {
        this.previous = previous;
        this.backTrack = backTrack;
        this.index = index;
        this.min = min;
        this.max = max;
        this.continuation_pc = continuation_pc;
        this.continuation_op = continuation_op;
    }

    REProgState previous; // previous state in stack

    int min;                      /* current quantifier min */
    int max;                      /* current quantifier max */
    int index;                    /* progress in text */
    int continuation_op;
    int continuation_pc;
    REBackTrackData backTrack; // used by ASSERT_  to recover state
}

/*
 * One entry of the backtrack stack: a snapshot of the matcher state taken
 * from REGlobalData at construction time, plus the op/pc to resume at.
 */
class REBackTrackData {

    REBackTrackData(REGlobalData gData, int op, int pc)
    {
        this.previous = gData.backTrackStackTop;
        this.continuation_op = op;
        this.continuation_pc = pc;
        this.lastParen = gData.lastParen;
        /* captures are copied so later matching cannot alter the snapshot */
        if (gData.parens != null) {
            this.parens = gData.parens.clone();
        }
        this.cp = gData.cp;
        this.stateStackTop = gData.stateStackTop;
    }

    REBackTrackData previous;

    int continuation_op;                /* where to backtrack to */
    int continuation_pc;
    int lastParen;
    long[] parens;                      /* parenthesis captures */
    int cp;                             /* char buffer index */
    REProgState stateStackTop;          /* state of op that backtracked */
}

/* Working state of one match attempt. */
class REGlobalData {
    boolean multiline;
    RECompiled regexp;              /* the RE in execution */
    int lastParen;                  /* highest paren set so far */
    int skipped;                    /* chars skipped anchoring this r.e. */

    int cp;                         /* char buffer index */
    long[] parens;                  /* parens captures */

    REProgState stateStackTop;       /* stack of state of current ancestors */

    REBackTrackData backTrackStackTop;  /* last matched-so-far position */


    /**
     * Get start of parenthesis capture contents, -1 for empty.
     * The start index is kept in the low 32 bits of the packed slot.
     */
    int parens_index(int i)
    {
        return (int)(parens[i]);
    }

    /**
     * Get length of parenthesis capture contents.
     * The length is kept in the high 32 bits of the packed slot.
     */
    int parens_length(int i)
    {
        return (int)(parens[i] >>> 32);
    }

    /**
     * Store a capture as one packed long: index in the low half, length
     * in the high half.
     */
    void set_parens(int i, int index, int length)
    {
        long lowHalf = index & 0xffffffffL;
        long highHalf = ((long)length) << 32;
        parens[i] = lowHalf | highHalf;
    }

}

/*
 * This struct holds a bitmap representation of a class from a regexp.
 * There's a list of these referenced by the classList field of RECompiled.
 * The initial state has startIndex set to the offset in the original
 * regexp source of the beginning of the class contents. The first use of
 * the class converts the source representation into a bitmap.
 *
 */
final class RECharSet implements Serializable
{
    static final long serialVersionUID = 7931787979395898394L;

    RECharSet(int length, int startIndex, int strlength)
    {
        this.startIndex = startIndex;
        this.strlength = strlength;
        this.length = length;
    }

    int length;
    int startIndex;
    int strlength;

    /* filled in on first use; never serialized */
    volatile transient boolean converted;
    volatile transient boolean sense;
    volatile transient byte[] bits;
}