diff --git a/std/regex/internal/ir.d b/std/regex/internal/ir.d index 28b199895d9..2bd98843c6e 100644 --- a/std/regex/internal/ir.d +++ b/std/regex/internal/ir.d @@ -47,40 +47,11 @@ CharMatcher[CodepointSet] matcherCache; } } -@trusted auto memoizeExpr(string expr)() -{ - if (__ctfe) - return mixin(expr); - alias T = typeof(mixin(expr)); - static T slot; - static bool initialized; - if (!initialized) - { - slot = mixin(expr); - initialized = true; - } - return slot; -} - -//property for \w character class -@property CodepointSet wordCharacter() -{ - return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc - | unicode.Me | unicode.Nd | unicode.Pc")(); -} - -@property CharMatcher wordMatcher() -{ - return memoizeExpr!("CharMatcher(wordCharacter)")(); -} +static CharMatcher wordMatcher = CharMatcher(wordCharacter); // some special Unicode white space characters private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; -// Characters that need escaping in string posed as regular expressions -alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', - ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); - //Regular expression engine/parser options: // global - search all nonoverlapping matches in input // casefold - case insensitive matching, do casefolding on match in unicode mode diff --git a/std/regex/internal/parser.d b/std/regex/internal/parser.d index e235292c6ae..7b7e9688411 100644 --- a/std/regex/internal/parser.d +++ b/std/regex/internal/parser.d @@ -4,10 +4,10 @@ */ module std.regex.internal.parser; -static import std.ascii; +import std.regex.internal.ir; import std.range.primitives, std.uni, std.meta, std.traits, std.typecons, std.exception; -import std.regex.internal.ir; +static import std.ascii; // package relevant info from parser into a regex object auto makeRegex(S, CG)(Parser!(S, CG) p) @@ -157,98 +157,6 @@ if (isSomeString!S) code[] = rev[]; } -//test if a given string starts with hex number of maxDigit that's a valid codepoint -//returns it's value and skips these maxDigit chars on success, throws on failure -dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit) -{ - //std.conv.parse is both @system and bogus - enforce(str.length >= maxDigit,"incomplete escape sequence"); - uint val; - for (int k = 0; k < maxDigit; k++) - { - immutable current = str[k];//accepts ascii only, so it's OK to index directly - if ('0' <= current && current <= '9') - val = val * 16 + current - '0'; - else if ('a' <= current && current <= 'f') - val = val * 16 + current -'a' + 10; - else if ('A' <= current && current <= 'F') - val = val * 16 + current - 'A' + 10; - else - throw new Exception("invalid escape sequence"); - } - enforce(val <= 0x10FFFF, "invalid codepoint"); - str = str[maxDigit..$]; - return val; -} - -@system unittest //BUG canFind is system -{ - import std.algorithm.searching : canFind; - string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; - string[] hex = [ "01", "ff", "00af", "10FFFF" ]; - int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; - foreach (v; non_hex) - assert(collectException(parseUniHex(v, v.length)).msg - .canFind("invalid escape sequence")); - foreach (i, v; hex) - assert(parseUniHex(v, v.length) == value[i]); - string over = "0011FFFF"; - assert(collectException(parseUniHex(over, over.length)).msg - .canFind("invalid codepoint")); -} - -auto caseEnclose(CodepointSet set) -{ - auto cased = set & unicode.LC; - foreach (dchar ch; cased.byCodepoint) - { - foreach (c; simpleCaseFoldings(ch)) - set |= c; - } - return set; -} - -/+ - fetch codepoint set corresponding to a name (InBlock or binary property) -+/ -@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) -{ - CodepointSet s = unicode(name); - //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) - if (casefold) - s = caseEnclose(s); - if (negated) - s = s.inverted; - return s; -} - -//basic stack, just in case it gets used anywhere else then Parser -@trusted struct Stack(T) -{ - T[] data; - @property bool empty(){ return data.empty; } - - @property size_t length(){ return data.length; } - - void push(T val){ data ~= val; } - - T pop() - { - assert(!empty); - auto val = data[$ - 1]; - data = data[0 .. $ - 1]; - if (!__ctfe) - cast(void) data.assumeSafeAppend(); - return val; - } - - @property ref T top() - { - assert(!empty); - return data[$ - 1]; - } -} - struct CodeGen { Bytecode[] ir; // resulting bytecode @@ -616,7 +524,7 @@ enum infinite = ~0u; struct Parser(R, Generator) if (isForwardRange!R && is(ElementType!R : dchar)) { - dchar _current; + dchar front; bool empty; R pat, origin; //keep full pattern for pretty printing error messages uint re_flags = 0; //global flags e.g. multiline + internal ones @@ -628,8 +536,8 @@ if (isForwardRange!R && is(ElementType!R : dchar)) pat = origin = pattern; //reserve slightly more then avg as sampled from unittests parseFlags(flags); - _current = ' ';//a safe default for freeform parsing - next(); + front = ' ';//a safe default for freeform parsing + popFront(); g.start(cast(uint) pat.length); try { @@ -642,61 +550,47 @@ if (isForwardRange!R && is(ElementType!R : dchar)) g.endPattern(1); } - @property dchar current(){ return _current; } - - bool _next() + void _popFront() { if (pat.empty) { empty = true; - return false; } - _current = pat.front; - pat.popFront(); - return true; + else + { + front = pat.front; + pat.popFront(); + } } void skipSpace() { - while (isWhite(current) && _next()){ } + while (!empty && isWhite(front)) _popFront(); } - bool next() + void popFront() { - if (re_flags & RegexOption.freeform) - { - immutable r = _next(); - skipSpace(); - return r; - } - else - return _next(); + _popFront(); + if (re_flags & RegexOption.freeform) skipSpace(); } + auto save(){ return this; } + //parsing number with basic overflow check uint parseDecimal() { uint r = 0; - while (std.ascii.isDigit(current)) + while (std.ascii.isDigit(front)) { if (r >= (uint.max/10)) error("Overflow in decimal number"); - r = 10*r + cast(uint)(current-'0'); - if (!next()) - break; + r = 10*r + cast(uint)(front-'0'); + popFront(); + if (empty) break; } return r; } - //parse control code of form \cXXX, c assumed to be the current symbol - dchar parseControlCode() - { - enforce(next(), "Unfinished escape sequence"); - enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'), - "Only letters are allowed after \\c"); - return current & 0x1f; - } - // @trusted void parseFlags(S)(S flags) {//@@@BUG@@@ text is @system @@ -730,73 +624,74 @@ if (isForwardRange!R && is(ElementType!R : dchar)) { debug(std_regex_parser) __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); - switch (current) + switch (front) { case '(': - next(); - if (current == '?') + popFront(); + if (front == '?') { - next(); - switch (current) + popFront(); + switch (front) { case '#': for (;;) { - if (!next()) - error("Unexpected end of pattern"); - if (current == ')') + popFront(); + enforce(!empty, "Unexpected end of pattern"); + if (front == ')') { - next(); + popFront(); break; } } break; case ':': g.genLogicGroup(); - next(); + popFront(); break; case '=': g.genLookaround(IR.LookaheadStart); - next(); + popFront(); break; case '!': g.genLookaround(IR.NeglookaheadStart); - next(); + popFront(); break; case 'P': - next(); - if (current != '<') - error("Expected '<' in named group"); + popFront(); + enforce(front == '<', "Expected '<' in named group"); string name; - if (!next() || !(isAlpha(current) || current == '_')) + popFront(); + if (empty || !(isAlpha(front) || front == '_')) error("Expected alpha starting a named group"); - name ~= current; - while (next() && (isAlpha(current) || - current == '_' || std.ascii.isDigit(current))) + name ~= front; + popFront(); + while (!empty && (isAlpha(front) || + front == '_' || std.ascii.isDigit(front))) { - name ~= current; + name ~= front; + popFront(); } - if (current != '>') - error("Expected '>' closing named group"); - next(); + enforce(front == '>', "Expected '>' closing named group"); + popFront(); g.genNamedGroup(name); break; case '<': - next(); - if (current == '=') + popFront(); + if (front == '=') g.genLookaround(IR.LookbehindStart); - else if (current == '!') + else if (front == '!') g.genLookaround(IR.NeglookbehindStart); else error("'!' or '=' expected after '<'"); - next(); + popFront(); break; default: uint enableFlags, disableFlags; bool enable = true; do { - switch (current) + switch (front) { case 's': if (enable) @@ -830,9 +725,9 @@ if (isForwardRange!R && is(ElementType!R : dchar)) default: error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); } - next(); - }while (current != ')'); - next(); + popFront(); + }while (front != ')'); + popFront(); re_flags |= enableFlags; re_flags &= ~disableFlags; } @@ -844,13 +739,13 @@ if (isForwardRange!R && is(ElementType!R : dchar)) break; case ')': enforce(g.nesting, "Unmatched ')'"); - next(); + popFront(); auto pair = g.onClose(); if (pair[0]) parseQuantifier(pair[1]); break; case '|': - next(); + popFront(); g.fixAlternation(); break; default://no groups or whatever @@ -875,7 +770,7 @@ if (isForwardRange!R && is(ElementType!R : dchar)) if (empty) return g.fixRepetition(offset); uint min, max; - switch (current) + switch (front) { case '*': min = 0; @@ -890,28 +785,27 @@ if (isForwardRange!R && is(ElementType!R : dchar)) max = infinite; break; case '{': - enforce(next(), "Unexpected end of regex pattern"); - enforce(std.ascii.isDigit(current), "First number required in repetition"); + popFront(); + enforce(!empty, "Unexpected end of regex pattern"); + enforce(std.ascii.isDigit(front), "First number required in repetition"); min = parseDecimal(); - if (current == '}') + if (front == '}') max = min; - else if (current == ',') + else if (front == ',') { - next(); - if (std.ascii.isDigit(current)) + popFront(); + if (std.ascii.isDigit(front)) max = parseDecimal(); - else if (current == '}') + else if (front == '}') max = infinite; else error("Unexpected symbol in regex pattern"); skipSpace(); - if (current != '}') - error("Unmatched '{' in regex pattern"); + enforce(front == '}', "Unmatched '{' in regex pattern"); } else error("Unexpected symbol in regex pattern"); - if (min > max) - error("Illegal {n,m} quantifier"); + enforce(min <= max, "Illegal {n,m} quantifier"); break; default: g.fixRepetition(offset); @@ -919,10 +813,11 @@ if (isForwardRange!R && is(ElementType!R : dchar)) } bool greedy = true; //check only if we managed to get new symbol - if (next() && current == '?') + popFront(); + if (!empty && front == '?') { greedy = false; - next(); + popFront(); } g.fixRepetition(offset, min, max, greedy); } @@ -932,7 +827,7 @@ if (isForwardRange!R && is(ElementType!R : dchar)) { if (empty) return; - switch (current) + switch (front) { case '*', '?', '+', '|', '{', '}': error("'*', '+', '?', '{', '}' not allowed in atom"); @@ -945,13 +840,14 @@ if (isForwardRange!R && is(ElementType!R : dchar)) CodepointSet set; g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); } - next(); + popFront(); break; case '[': parseCharset(); break; case '\\': - enforce(_next(), "Unfinished escape sequence"); + _popFront(); + enforce(!empty, "Unfinished escape sequence"); parseEscape(); break; case '^': @@ -959,20 +855,19 @@ if (isForwardRange!R && is(ElementType!R : dchar)) g.put(Bytecode(IR.Bol, 0)); else g.put(Bytecode(IR.Bof, 0)); - next(); + popFront(); break; case '$': if (re_flags & RegexOption.multiline) g.put(Bytecode(IR.Eol, 0)); else g.put(Bytecode(IR.Eof, 0)); - next(); + popFront(); break; default: - //FIXME: getCommonCasing in new std uni if (re_flags & RegexOption.casefold) { - auto range = simpleCaseFoldings(current); + auto range = simpleCaseFoldings(front); assert(range.length <= 5); if (range.length == 1) g.put(Bytecode(IR.Char, range.front)); @@ -981,507 +876,96 @@ if (isForwardRange!R && is(ElementType!R : dchar)) g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); } else - g.put(Bytecode(IR.Char, current)); - next(); + g.put(Bytecode(IR.Char, front)); + popFront(); } } - - - //CodepointSet operations relatively in order of priority - enum Operator:uint { - Open = 0, Negate, Difference, SymDifference, Intersection, Union, None - } - - //parse unit of CodepointSet spec, most notably escape sequences and char ranges - //also fetches next set operation - Tuple!(CodepointSet,Operator) parseCharTerm() - { - enum State{ Start, Char, Escape, CharDash, CharDashEscape, - PotentialTwinSymbolOperator } - Operator op = Operator.None; - dchar last; - CodepointSet set; - State state = State.Start; - - static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags) - { - if (re_flags & RegexOption.casefold) - { - auto range = simpleCaseFoldings(ch); - foreach (v; range) - set |= v; - } - else - set |= ch; - } - - static Operator twinSymbolOperator(dchar symbol) - { - switch (symbol) - { - case '|': - return Operator.Union; - case '-': - return Operator.Difference; - case '~': - return Operator.SymDifference; - case '&': - return Operator.Intersection; - default: - assert(false); - } - } - - L_CharTermLoop: - for (;;) - { - final switch (state) - { - case State.Start: - switch (current) - { - case '|': - case '-': - case '~': - case '&': - state = State.PotentialTwinSymbolOperator; - last = current; - break; - case '[': - op = Operator.Union; - goto case; - case ']': - break L_CharTermLoop; - case '\\': - state = State.Escape; - break; - default: - state = State.Char; - last = current; - } - break; - case State.Char: - // xxx last current xxx - switch (current) - { - case '|': - case '~': - case '&': - // then last is treated as normal char and added as implicit union - state = State.PotentialTwinSymbolOperator; - addWithFlags(set, last, re_flags); - last = current; - break; - case '-': // still need more info - state = State.CharDash; - break; - case '\\': - set |= last; - state = State.Escape; - break; - case '[': - op = Operator.Union; - goto case; - case ']': - addWithFlags(set, last, re_flags); - break L_CharTermLoop; - default: - state = State.Char; - addWithFlags(set, last, re_flags); - last = current; - } - break; - case State.PotentialTwinSymbolOperator: - // xxx last current xxxx - // where last = [|-&~] - if (current == last) - { - op = twinSymbolOperator(last); - next();//skip second twin char - break L_CharTermLoop; - } - goto case State.Char; - case State.Escape: - // xxx \ current xxx - switch (current) - { - case 'f': - last = '\f'; - state = State.Char; - break; - case 'n': - last = '\n'; - state = State.Char; - break; - case 'r': - last = '\r'; - state = State.Char; - break; - case 't': - last = '\t'; - state = State.Char; - break; - case 'v': - last = '\v'; - state = State.Char; - break; - case 'c': - last = parseControlCode(); - state = State.Char; - break; - foreach (val; Escapables) - { - case val: - } - last = current; - state = State.Char; - break; - case 'p': - set.add(parseUnicodePropertySpec(false)); - state = State.Start; - continue L_CharTermLoop; //next char already fetched - case 'P': - set.add(parseUnicodePropertySpec(true)); - state = State.Start; - continue L_CharTermLoop; //next char already fetched - case 'x': - last = parseUniHex(pat, 2); - state = State.Char; - break; - case 'u': - last = parseUniHex(pat, 4); - state = State.Char; - break; - case 'U': - last = parseUniHex(pat, 8); - state = State.Char; - break; - case 'd': - set.add(unicode.Nd); - state = State.Start; - break; - case 'D': - set.add(unicode.Nd.inverted); - state = State.Start; - break; - case 's': - set.add(unicode.White_Space); - state = State.Start; - break; - case 'S': - set.add(unicode.White_Space.inverted); - state = State.Start; - break; - case 'w': - set.add(wordCharacter); - state = State.Start; - break; - case 'W': - set.add(wordCharacter.inverted); - state = State.Start; - break; - default: - if (current >= privateUseStart && current <= privateUseEnd) - enforce(false, "no matching ']' found while parsing character class"); - enforce(false, "invalid escape sequence"); - } - break; - case State.CharDash: - // xxx last - current xxx - switch (current) - { - case '[': - op = Operator.Union; - goto case; - case ']': - //means dash is a single char not an interval specifier - addWithFlags(set, last, re_flags); - addWithFlags(set, '-', re_flags); - break L_CharTermLoop; - case '-'://set Difference again - addWithFlags(set, last, re_flags); - op = Operator.Difference; - next();//skip '-' - break L_CharTermLoop; - case '\\': - state = State.CharDashEscape; - break; - default: - enforce(last <= current, "inverted range"); - if (re_flags & RegexOption.casefold) - { - for (uint ch = last; ch <= current; ch++) - addWithFlags(set, ch, re_flags); - } - else - set.add(last, current + 1); - state = State.Start; - } - break; - case State.CharDashEscape: - //xxx last - \ current xxx - uint end; - switch (current) - { - case 'f': - end = '\f'; - break; - case 'n': - end = '\n'; - break; - case 'r': - end = '\r'; - break; - case 't': - end = '\t'; - break; - case 'v': - end = '\v'; - break; - foreach (val; Escapables) - { - case val: - } - end = current; - break; - case 'c': - end = parseControlCode(); - break; - case 'x': - end = parseUniHex(pat, 2); - break; - case 'u': - end = parseUniHex(pat, 4); - break; - case 'U': - end = parseUniHex(pat, 8); - break; - default: - if (current >= privateUseStart && current <= privateUseEnd) - enforce(false, "no matching ']' found while parsing character class"); - error("invalid escape sequence"); - } - // Lookahead to check if it's a \T - // where T is sub-pattern terminator in multi-pattern scheme - if (end == '\\' && !pat.empty) - { - if (pat.front >= privateUseStart && pat.front <= privateUseEnd) - enforce(false, "invalid escape sequence"); - } - enforce(last <= end,"inverted range"); - set.add(last, end + 1); - state = State.Start; - break; - } - enforce(next(), "unexpected end of CodepointSet"); - } - return tuple(set, op); - } - - alias ValStack = Stack!(CodepointSet); - alias OpStack = Stack!(Operator); - //parse and store IR for CodepointSet void parseCharset() { const save = re_flags; re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did - parseCharsetImpl(); + bool casefold = cast(bool)(re_flags & RegexOption.casefold); + g.charsetToIr(unicode.parseSet(this, casefold)); re_flags = save; - // Last next() in parseCharsetImp is executed w/o freeform flag + // Last next() in parseCharset is executed w/o freeform flag if (re_flags & RegexOption.freeform) skipSpace(); } - void parseCharsetImpl() - { - ValStack vstack; - OpStack opstack; - import std.functional : unaryFun; - // - static bool apply(Operator op, ref ValStack stack) - { - switch (op) - { - case Operator.Negate: - enforce(!stack.empty, "no operand for '^'"); - stack.top = stack.top.inverted; - break; - case Operator.Union: - auto s = stack.pop();//2nd operand - enforce(!stack.empty, "no operand for '||'"); - stack.top.add(s); - break; - case Operator.Difference: - auto s = stack.pop();//2nd operand - enforce(!stack.empty, "no operand for '--'"); - stack.top.sub(s); - break; - case Operator.SymDifference: - auto s = stack.pop();//2nd operand - enforce(!stack.empty, "no operand for '~~'"); - stack.top ~= s; - break; - case Operator.Intersection: - auto s = stack.pop();//2nd operand - enforce(!stack.empty, "no operand for '&&'"); - stack.top.intersect(s); - break; - default: - return false; - } - return true; - } - static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) - { - while (cond(opstack.top)) - { - if (!apply(opstack.pop(),vstack)) - return false;//syntax error - if (opstack.empty) - return false; - } - return true; - } - - L_CharsetLoop: - do - { - switch (current) - { - case '[': - opstack.push(Operator.Open); - enforce(next(), "unexpected end of character class"); - if (current == '^') - { - opstack.push(Operator.Negate); - enforce(next(), "unexpected end of character class"); - } - else if (current == ']') // []...] is special cased - { - enforce(next(), "wrong character set"); - auto pair = parseCharTerm(); - pair[0].add(']', ']'+1); - if (pair[1] != Operator.None) - { - if (opstack.top == Operator.Union) - unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); - opstack.push(pair[1]); - } - vstack.push(pair[0]); - } - break; - case ']': - enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), - "character class syntax error"); - enforce(!opstack.empty, "unmatched ']'"); - opstack.pop(); - if (!next() || opstack.empty) - break L_CharsetLoop; - auto pair = parseCharTerm(); - if (!pair[0].empty)//not only operator e.g. -- or ~~ - { - vstack.top.add(pair[0]);//apply union - } - if (pair[1] != Operator.None) - { - if (opstack.top == Operator.Union) - unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); - opstack.push(pair[1]); - } - break; - // - default://yet another pair of term(op)? - auto pair = parseCharTerm(); - if (pair[1] != Operator.None) - { - if (opstack.top == Operator.Union) - unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); - opstack.push(pair[1]); - } - vstack.push(pair[0]); - } - }while (!empty || !opstack.empty); - while (!opstack.empty) - { - enforce(opstack.top != Operator.Open, - "no matching ']' found while parsing character class"); - apply(opstack.pop(), vstack); - } - assert(vstack.length == 1); - g.charsetToIr(vstack.top); - } - //parse and generate IR for escape stand alone escape sequence @trusted void parseEscape() {//accesses array of appender import std.algorithm.iteration : sum; - switch (current) + switch (front) { - case 'f': next(); g.put(Bytecode(IR.Char, '\f')); break; - case 'n': next(); g.put(Bytecode(IR.Char, '\n')); break; - case 'r': next(); g.put(Bytecode(IR.Char, '\r')); break; - case 't': next(); g.put(Bytecode(IR.Char, '\t')); break; - case 'v': next(); g.put(Bytecode(IR.Char, '\v')); break; + case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break; + case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break; + case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break; + case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break; + case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break; case 'd': - next(); + popFront(); g.charsetToIr(unicode.Nd); break; case 'D': - next(); + popFront(); g.charsetToIr(unicode.Nd.inverted); break; - case 'b': next(); g.put(Bytecode(IR.Wordboundary, 0)); break; - case 'B': next(); g.put(Bytecode(IR.Notwordboundary, 0)); break; + case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break; + case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break; case 's': - next(); + popFront(); g.charsetToIr(unicode.White_Space); break; case 'S': - next(); + popFront(); g.charsetToIr(unicode.White_Space.inverted); break; case 'w': - next(); + popFront(); g.charsetToIr(wordCharacter); break; case 'W': - next(); + popFront(); g.charsetToIr(wordCharacter.inverted); break; case 'p': case 'P': - auto CodepointSet = parseUnicodePropertySpec(current == 'P'); - g.charsetToIr(CodepointSet); + bool casefold = cast(bool)(re_flags & RegexOption.casefold); + auto set = unicode.parsePropertySpec(this, front == 'P', casefold); + g.charsetToIr(set); break; case 'x': immutable code = parseUniHex(pat, 2); - next(); + popFront(); g.put(Bytecode(IR.Char,code)); break; case 'u': case 'U': - immutable code = parseUniHex(pat, current == 'u' ? 4 : 8); - next(); + immutable code = parseUniHex(pat, front == 'u' ? 4 : 8); + popFront(); g.put(Bytecode(IR.Char, code)); break; case 'c': //control codes - Bytecode code = Bytecode(IR.Char, parseControlCode()); - next(); + Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this)); + popFront(); g.put(code); break; case '0': - next(); + popFront(); g.put(Bytecode(IR.Char, 0));//NUL character break; case '1': .. case '9': - uint nref = cast(uint) current - '0'; + uint nref = cast(uint) front - '0'; immutable maxBackref = sum(g.groupStack.data); enforce(nref < maxBackref, "Backref to unseen group"); //perl's disambiguation rule i.e. //get next digit only if there is such group number - while (nref < maxBackref && next() && std.ascii.isDigit(current)) + popFront(); + while (nref < maxBackref && !empty && std.ascii.isDigit(front)) { - nref = nref * 10 + current - '0'; + nref = nref * 10 + front - '0'; + popFront(); } if (nref >= maxBackref) nref /= 10; @@ -1497,52 +981,22 @@ if (isForwardRange!R && is(ElementType!R : dchar)) g.markBackref(nref); break; default: - // Lookahead to check if it's a \T - // where T is sub-pattern terminator in multi-pattern scheme - if (current == '\\' && !pat.empty) + if (front == '\\' && !pat.empty) { - if (pat.front >= privateUseStart && current <= privateUseEnd) + if (pat.front >= privateUseStart && pat.front <= privateUseEnd) enforce(false, "invalid escape sequence"); } - if (current >= privateUseStart && current <= privateUseEnd) + if (front >= privateUseStart && front <= privateUseEnd) { - g.endPattern(current - privateUseStart + 1); + g.endPattern(front - privateUseStart + 1); break; } - auto op = Bytecode(IR.Char, current); - next(); + auto op = Bytecode(IR.Char, front); + popFront(); g.put(op); } } - //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, - //\ - assumed to be processed, p - is current - CodepointSet parseUnicodePropertySpec(bool negated) - { - enum MAX_PROPERTY = 128; - char[MAX_PROPERTY] result; - uint k = 0; - enforce(next(), "eof parsing unicode property spec"); - if (current == '{') - { - while (k < MAX_PROPERTY && next() && current !='}' && current !=':') - if (current != '-' && current != ' ' && current != '_') - result[k++] = cast(char) std.ascii.toLower(current); - enforce(k != MAX_PROPERTY, "invalid property name"); - enforce(current == '}', "} expected "); - } - else - {//single char properties e.g.: \pL, \pN ... - enforce(current < 0x80, "invalid property name"); - result[k++] = cast(char) current; - } - auto s = getUnicodeSet(result[0 .. k], negated, - cast(bool)(re_flags & RegexOption.casefold)); - enforce(!s.empty, "unrecognized unicode property spec"); - next(); - return s; - } - // @trusted void error(string msg) { diff --git a/std/regex/internal/tests.d b/std/regex/internal/tests.d index 1d4f9a26ae6..edd7b391bb5 100644 --- a/std/regex/internal/tests.d +++ b/std/regex/internal/tests.d @@ -1087,15 +1087,15 @@ alias Sequence(int B, int E) = staticIota!(B, E); @safe unittest { import std.algorithm.searching : canFind; - void willThrow(T)(T arg, string msg) + void willThrow(T, size_t line = __LINE__)(T arg, string msg) { auto e = collectException(regex(arg)); - assert(e.msg.canFind(msg), e.msg); + assert(e.msg.canFind(msg), to!string(line) ~ ": " ~ e.msg); } willThrow([r".", r"[\(\{[\]\}\)]"], "no matching ']' found while parsing character class"); willThrow([r"[\", r"123"], "no matching ']' found while parsing character class"); willThrow([r"[a-", r"123"], "no matching ']' found while parsing character class"); - willThrow([r"[a-\", r"123"], "invalid escape sequence"); + willThrow([r"[a-\", r"123"], "no matching ']' found while parsing character class"); willThrow([r"\", r"123"], "invalid escape sequence"); } @@ -1104,5 +1104,5 @@ alias Sequence(int B, int E) = staticIota!(B, E); { import std.algorithm.searching; auto e = collectException!RegexException(regex(q"<[^]>")); - assert(e.msg.canFind("no operand for '^'")); + assert(e.msg.canFind("no operand for '^'"), e.msg); } diff --git a/std/uni.d b/std/uni.d index 88307e6e00a..e41e744a678 100644 --- a/std/uni.d +++ b/std/uni.d @@ -713,7 +713,8 @@ import std.range.primitives; // back, ElementEncodingType, ElementType, empty, // save import std.traits; // isConvertibleToString, isIntegral, isSomeChar, // isSomeString, Unqual - +import std.exception : enforce, collectException; +static import std.ascii; // debug = std_uni; debug(std_uni) import std.stdio; // writefln, writeln @@ -6094,6 +6095,572 @@ template SetSearcher(alias table, string kind) } } +// Characters that need escaping in string posed as regular expressions +package alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-', + ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~'); + +package @trusted auto memoizeExpr(string expr)() +{ + if (__ctfe) + return mixin(expr); + alias T = typeof(mixin(expr)); + static T slot; + static bool initialized; + if (!initialized) + { + slot = mixin(expr); + initialized = true; + } + return slot; +} + +//property for \w character class +package @property @safe CodepointSet wordCharacter() +{ + return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc + | unicode.Me | unicode.Nd | unicode.Pc")(); +} + +//basic stack, just in case it gets used anywhere else then Parser +package struct Stack(T) +{ +@safe: + T[] data; + @property bool empty(){ return data.empty; } + + @property size_t length(){ return data.length; } + + void push(T val){ data ~= val; } + + @trusted T pop() + { + assert(!empty); + auto val = data[$ - 1]; + data = data[0 .. $ - 1]; + if (!__ctfe) + cast(void) data.assumeSafeAppend(); + return val; + } + + @property ref T top() + { + assert(!empty); + return data[$ - 1]; + } +} + +//test if a given string starts with hex number of maxDigit that's a valid codepoint +//returns it's value and skips these maxDigit chars on success, throws on failure +package dchar parseUniHex(Range)(ref Range str, size_t maxDigit) +{ + //std.conv.parse is both @system and bogus + uint val; + for (int k = 0; k < maxDigit; k++) + { + enforce(!str.empty, "incomplete escape sequence"); + //accepts ascii only, so it's OK to index directly + immutable current = str.front; + if ('0' <= current && current <= '9') + val = val * 16 + current - '0'; + else if ('a' <= current && current <= 'f') + val = val * 16 + current -'a' + 10; + else if ('A' <= current && current <= 'F') + val = val * 16 + current - 'A' + 10; + else + throw new Exception("invalid escape sequence"); + str.popFront(); + } + enforce(val <= 0x10FFFF, "invalid codepoint"); + return val; +} + +@system unittest //BUG canFind is system +{ + import std.algorithm.searching : canFind; + string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; + string[] hex = [ "01", "ff", "00af", "10FFFF" ]; + int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; + foreach (v; non_hex) + assert(collectException(parseUniHex(v, v.length)).msg + .canFind("invalid escape sequence")); + foreach (i, v; hex) + assert(parseUniHex(v, v.length) == value[i]); + string over = "0011FFFF"; + assert(collectException(parseUniHex(over, over.length)).msg + .canFind("invalid codepoint")); +} + +auto caseEnclose(CodepointSet set) +{ + auto cased = set & unicode.LC; + foreach (dchar ch; cased.byCodepoint) + { + foreach (c; simpleCaseFoldings(ch)) + set |= c; + } + return set; +} + +/+ + fetch codepoint set corresponding to a name (InBlock or binary property) ++/ +@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold) +{ + CodepointSet s = unicode(name); + //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC) + if (casefold) + s = caseEnclose(s); + if (negated) + s = s.inverted; + return s; +} + +@safe struct UnicodeSetParser(Range) +{ + import std.typecons : tuple, Tuple; + Range range; + bool casefold_; + + @property bool empty(){ return range.empty; } + @property dchar front(){ return range.front; } + void popFront(){ range.popFront(); } + + //CodepointSet operations relatively in order of priority + enum Operator:uint { + Open = 0, Negate, Difference, SymDifference, Intersection, Union, None + } + + //parse unit of CodepointSet spec, most notably escape sequences and char ranges + //also fetches next set operation + Tuple!(CodepointSet,Operator) parseCharTerm() + { + import std.range : drop; + enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD'; + enum State{ Start, Char, Escape, CharDash, CharDashEscape, + PotentialTwinSymbolOperator } + Operator op = Operator.None; + dchar last; + CodepointSet set; + State state = State.Start; + + void addWithFlags(ref CodepointSet set, uint ch) + { + if (casefold_) + { + auto range = simpleCaseFoldings(ch); + foreach (v; range) + set |= v; + } + else + set |= ch; + } + + static Operator twinSymbolOperator(dchar symbol) + { + switch (symbol) + { + case '|': + return Operator.Union; + case '-': + return Operator.Difference; + case '~': + return Operator.SymDifference; + case '&': + return Operator.Intersection; + default: + assert(false); + } + } + + L_CharTermLoop: + for (;;) + { + final switch (state) + { + case State.Start: + switch (front) + { + case '|': + case '-': + case '~': + case '&': + state = State.PotentialTwinSymbolOperator; + last = front; + break; + case '[': + op = Operator.Union; + goto case; + case ']': + break L_CharTermLoop; + case '\\': + state = State.Escape; + break; + default: + state = State.Char; + last = front; + } + break; + case State.Char: + // xxx last front xxx + switch (front) + { + case '|': + case '~': + case '&': + // then last is treated as normal char and added as implicit union + state = State.PotentialTwinSymbolOperator; + addWithFlags(set, last); + last = front; + break; + case '-': // still need more info + state = State.CharDash; + break; + case '\\': + set |= last; + state = State.Escape; + break; + case '[': + op = Operator.Union; + goto case; + case ']': + addWithFlags(set, last); + break L_CharTermLoop; + default: + state = State.Char; + addWithFlags(set, last); + last = front; + } + break; + case State.PotentialTwinSymbolOperator: + // xxx last front xxxx + // where last = [|-&~] + if (front == last) + { + op = twinSymbolOperator(last); + popFront();//skip second twin char + break L_CharTermLoop; + } + goto case State.Char; + case State.Escape: + // xxx \ front xxx + switch (front) + { + case 'f': + last = '\f'; + state = State.Char; + break; + case 'n': + last = '\n'; + state = State.Char; + break; + case 'r': + last = '\r'; + state = State.Char; + break; + case 't': + last = '\t'; + state = State.Char; + break; + case 'v': + last = '\v'; + state = State.Char; + break; + case 'c': + last = unicode.parseControlCode(this); + state = State.Char; + break; + foreach (val; Escapables) + { + case val: + } + last = front; + state = State.Char; + break; + case 'p': + set.add(unicode.parsePropertySpec(this, false, casefold_)); + state = State.Start; + continue L_CharTermLoop; //next char already fetched + case 'P': + set.add(unicode.parsePropertySpec(this, true, casefold_)); + state = State.Start; + continue L_CharTermLoop; //next char already fetched + case 'x': + popFront(); + last = parseUniHex(this, 2); + state = State.Char; + continue L_CharTermLoop; + case 'u': + popFront(); + last = parseUniHex(this, 4); + state = State.Char; + continue L_CharTermLoop; + case 'U': + popFront(); + last = parseUniHex(this, 8); + state = State.Char; + continue L_CharTermLoop; + case 'd': + set.add(unicode.Nd); + state = State.Start; + break; + case 'D': + set.add(unicode.Nd.inverted); + state = State.Start; + break; + case 's': + set.add(unicode.White_Space); + state = State.Start; + break; + case 'S': + set.add(unicode.White_Space.inverted); + state = State.Start; + break; + case 'w': + set.add(wordCharacter); + state = State.Start; + break; + case 'W': + set.add(wordCharacter.inverted); + state = State.Start; + break; + default: + if (front >= privateUseStart && front <= privateUseEnd) + enforce(false, "no matching ']' found while parsing character class"); + enforce(false, "invalid escape sequence"); + } + break; + case State.CharDash: + // xxx last - front xxx + switch (front) + { + case '[': + op = Operator.Union; + goto case; + case ']': + //means dash is a single char not an interval specifier + addWithFlags(set, last); + addWithFlags(set, '-'); + break L_CharTermLoop; + case '-'://set Difference again + addWithFlags(set, last); + op = Operator.Difference; + popFront();//skip '-' + break L_CharTermLoop; + case '\\': + state = State.CharDashEscape; + break; + default: + enforce(last <= front, "inverted range"); + if (casefold_) + { + for (uint ch = last; ch <= front; ch++) + addWithFlags(set, ch); + } + else + set.add(last, front + 1); + state = State.Start; + } + break; + case State.CharDashEscape: + //xxx last - \ front xxx + uint end; + switch (front) + { + case 'f': + end = '\f'; + break; + case 'n': + end = '\n'; + break; + case 'r': + end = '\r'; + break; + case 't': + end = '\t'; + break; + case 'v': + end = '\v'; + break; + foreach (val; Escapables) + { + case val: + } + end = front; + break; + case 'c': + end = unicode.parseControlCode(this); + break; + case 'x': + popFront(); + end = parseUniHex(this, 2); + enforce(last <= end,"inverted range"); + set.add(last, end + 1); + state = State.Start; + continue L_CharTermLoop; + case 'u': + popFront(); + end = parseUniHex(this, 4); + enforce(last <= end,"inverted range"); + set.add(last, end + 1); + state = State.Start; + continue L_CharTermLoop; + case 'U': + popFront(); + end = parseUniHex(this, 8); + enforce(last <= end,"inverted range"); + set.add(last, end + 1); + state = State.Start; + continue L_CharTermLoop; + default: + if (front >= privateUseStart && front <= privateUseEnd) + enforce(false, "no matching ']' found while parsing character class"); + enforce(false, "invalid escape sequence"); + } + // Lookahead to check if it's a \T + // where T is sub-pattern terminator in multi-pattern scheme + auto lookahead = range.save.drop(1); + if (end == '\\' && !lookahead.empty) + { + if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd) + enforce(false, "no matching ']' found while parsing character class"); + } + enforce(last <= end,"inverted range"); + set.add(last, end + 1); + state = State.Start; + break; + } + popFront(); + enforce(!empty, "unexpected end of CodepointSet"); + } + return tuple(set, op); + } + + alias ValStack = Stack!(CodepointSet); + alias OpStack = Stack!(Operator); + + CodepointSet parseSet() + { + ValStack vstack; + OpStack opstack; + import std.functional : unaryFun; + enforce(!empty, "unexpected end of input"); + enforce(front == '[', "expected '[' at the start of unicode set"); + // + static bool apply(Operator op, ref ValStack stack) + { + switch (op) + { + case Operator.Negate: + enforce(!stack.empty, "no operand for '^'"); + stack.top = stack.top.inverted; + break; + case Operator.Union: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '||'"); + stack.top.add(s); + break; + case Operator.Difference: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '--'"); + stack.top.sub(s); + break; + case Operator.SymDifference: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '~~'"); + stack.top ~= s; + break; + case Operator.Intersection: + auto s = stack.pop();//2nd operand + enforce(!stack.empty, "no operand for '&&'"); + stack.top.intersect(s); + break; + default: + return false; + } + return true; + } + static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack) + { + while (cond(opstack.top)) + { + if (!apply(opstack.pop(),vstack)) + return false;//syntax error + if (opstack.empty) + return false; + } + return true; + } + + L_CharsetLoop: + do + { + switch (front) + { + case '[': + opstack.push(Operator.Open); + popFront(); + enforce(!empty, "unexpected end of character class"); + if (front == '^') + { + opstack.push(Operator.Negate); + popFront(); + enforce(!empty, "unexpected end of character class"); + } + else if (front == ']') // []...] is special cased + { + popFront(); + enforce(!empty, "wrong character set"); + auto pair = parseCharTerm(); + pair[0].add(']', ']'+1); + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + vstack.push(pair[0]); + } + break; + case ']': + enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack), + "character class syntax error"); + enforce(!opstack.empty, "unmatched ']'"); + opstack.pop(); + popFront(); + if (opstack.empty) + break L_CharsetLoop; + auto pair = parseCharTerm(); + if (!pair[0].empty)//not only operator e.g. -- or ~~ + { + vstack.top.add(pair[0]);//apply union + } + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + break; + // + default://yet another pair of term(op)? + auto pair = parseCharTerm(); + if (pair[1] != Operator.None) + { + if (opstack.top == Operator.Union) + unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack); + opstack.push(pair[1]); + } + vstack.push(pair[0]); + } + + }while (!empty || !opstack.empty); + while (!opstack.empty) + apply(opstack.pop(),vstack); + assert(vstack.length == 1); + return vstack.top; + } +} + /** A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of a block, script or general category. @@ -6242,6 +6809,83 @@ template SetSearcher(alias table, string kind) assert(leadingVowel == unicode.hangulSyllableType.L); } + //parse control code of form \cXXX, c assumed to be the current symbol + static package dchar parseControlCode(Parser)(ref Parser p) + { + with(p) + { + popFront(); + enforce(!empty, "Unfinished escape sequence"); + enforce(('a' <= front && front <= 'z') + || ('A' <= front && front <= 'Z'), + "Only letters are allowed after \\c"); + return front & 0x1f; + } + } + + //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, + //\ - assumed to be processed, p - is current + static package CodepointSet parsePropertySpec(Range)(ref Range p, + bool negated, bool casefold) + { + with(p) + { + enum MAX_PROPERTY = 128; + char[MAX_PROPERTY] result; + uint k = 0; + popFront(); + enforce(!empty, "eof parsing unicode property spec"); + if (front == '{') + { + popFront(); + while (k < MAX_PROPERTY && !empty && front !='}' + && front !=':') + { + if (front != '-' && front != ' ' && front != '_') + result[k++] = cast(char) std.ascii.toLower(front); + popFront(); + } + enforce(k != MAX_PROPERTY, "invalid property name"); + enforce(front == '}', "} expected "); + } + else + {//single char properties e.g.: \pL, \pN ... + enforce(front < 0x80, "invalid property name"); + result[k++] = cast(char) front; + } + auto s = getUnicodeSet(result[0 .. k], negated, casefold); + enforce(!s.empty, "unrecognized unicode property spec"); + popFront(); + return s; + } + } + + /** + Parse unicode codepoint set from given `range` using standard regex + syntax '[...]'. The range is advanced skiping over regex set definition. + `casefold` parameter determines if the set should be casefolded - that is + include both lower and upper case versions for any letters in the set. + */ + static CodepointSet parseSet(Range)(ref Range range, bool casefold=false) + if (isInputRange!Range && is(ElementType!Range : dchar)) + { + auto usParser = UnicodeSetParser!Range(range, casefold); + auto set = usParser.parseSet(); + range = usParser.range; + return set; + } + + /// + @safe unittest + { + import std.uni : unicode; + string pat = "[a-zA-Z0-9]hello"; + auto set = unicode.parseSet(pat); + // check some of the codepoints + assert(set['a'] && set['A'] && set['9']); + assert(pat == "hello"); + } + private: alias ucmp = comparePropertyName;