| //Written in the D programming language |
| /* |
| Regular expression pattern parser. |
| */ |
| module std.regex.internal.parser; |
| |
| import std.regex.internal.ir; |
| import std.range.primitives, std.uni, std.meta, |
| std.traits, std.typecons, std.exception; |
| static import std.ascii; |
| |
| // package relevant info from parser into a regex object |
| auto makeRegex(S, CG)(Parser!(S, CG) p) |
| { |
| import std.regex.internal.backtracking : BacktrackingMatcher; |
| import std.regex.internal.thompson : ThompsonMatcher; |
| import std.algorithm.searching : canFind; |
| alias Char = BasicElementOf!S; |
| Regex!Char re; |
| auto g = p.g; |
| with(re) |
| { |
| ir = g.ir; |
| dict = g.dict; |
| ngroup = g.ngroup; |
| maxCounterDepth = g.counterDepth; |
| flags = p.re_flags; |
| charsets = g.charsets; |
| matchers = g.matchers; |
| backrefed = g.backrefed; |
| re.pattern = p.origin.idup; |
| re.postprocess(); |
| // check if we have backreferences, if so - use backtracking |
| if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...); |
| else if (re.backrefed.canFind!"a != 0") |
| factory = new RuntimeFactory!(BacktrackingMatcher, Char); |
| else |
| factory = new RuntimeFactory!(ThompsonMatcher, Char); |
| debug(std_regex_parser) |
| { |
| __ctfe || print(); |
| } |
| //@@@BUG@@@ (not reduced) |
| //somehow just using validate _collides_ with std.utf.validate (!) |
| version (assert) re.validateRe(); |
| } |
| return re; |
| } |
| |
| // helper for unittest |
| auto makeRegex(S)(S arg) |
| if (isSomeString!S) |
| { |
| return makeRegex(Parser!(S, CodeGen)(arg, "")); |
| } |
| |
| @system unittest |
| { |
| import std.algorithm.comparison : equal; |
| auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`); |
| auto nc = re.namedCaptures; |
| static assert(isRandomAccessRange!(typeof(nc))); |
| assert(!nc.empty); |
| assert(nc.length == 2); |
| assert(nc.equal(["name", "var"])); |
| assert(nc[0] == "name"); |
| assert(nc[1..$].equal(["var"])); |
| |
| re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`); |
| nc = re.namedCaptures; |
| assert(nc.length == 1); |
| assert(nc[0] == "named"); |
| assert(nc.front == "named"); |
| assert(nc.back == "named"); |
| |
| re = makeRegex(`(\w+) (\w+)`); |
| nc = re.namedCaptures; |
| assert(nc.empty); |
| |
| re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`); |
| nc = re.namedCaptures; |
| auto cp = nc.save; |
| assert(nc.equal(cp)); |
| nc.popFront(); |
| assert(nc.equal(cp[1..$])); |
| nc.popBack(); |
| assert(nc.equal(cp[1 .. $ - 1])); |
| } |
| |
| |
| @trusted void reverseBytecode()(Bytecode[] code) |
| { |
| Bytecode[] rev = new Bytecode[code.length]; |
| uint revPc = cast(uint) rev.length; |
| Stack!(Tuple!(uint, uint, uint)) stack; |
| uint start = 0; |
| uint end = cast(uint) code.length; |
| for (;;) |
| { |
| for (uint pc = start; pc < end; ) |
| { |
| immutable len = code[pc].length; |
| if (code[pc].code == IR.GotoEndOr) |
| break; //pick next alternation branch |
| if (code[pc].isAtom) |
| { |
| rev[revPc - len .. revPc] = code[pc .. pc + len]; |
| revPc -= len; |
| pc += len; |
| } |
| else if (code[pc].isStart || code[pc].isEnd) |
| { |
| //skip over other embedded lookbehinds they are reversed |
| if (code[pc].code == IR.LookbehindStart |
| || code[pc].code == IR.NeglookbehindStart) |
| { |
| immutable blockLen = len + code[pc].data |
| + code[pc].pairedLength; |
| rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen]; |
| pc += blockLen; |
| revPc -= blockLen; |
| continue; |
| } |
| immutable second = code[pc].indexOfPair(pc); |
| immutable secLen = code[second].length; |
| rev[revPc - secLen .. revPc] = code[second .. second + secLen]; |
| revPc -= secLen; |
| if (code[pc].code == IR.OrStart) |
| { |
| //we pass len bytes forward, but secLen in reverse |
| immutable revStart = revPc - (second + len - secLen - pc); |
| uint r = revStart; |
| uint i = pc + IRL!(IR.OrStart); |
| while (code[i].code == IR.Option) |
| { |
| if (code[i - 1].code != IR.OrStart) |
| { |
| assert(code[i - 1].code == IR.GotoEndOr); |
| rev[r - 1] = code[i - 1]; |
| } |
| rev[r] = code[i]; |
| auto newStart = i + IRL!(IR.Option); |
| auto newEnd = newStart + code[i].data; |
| auto newRpc = r + code[i].data + IRL!(IR.Option); |
| if (code[newEnd].code != IR.OrEnd) |
| { |
| newRpc--; |
| } |
| stack.push(tuple(newStart, newEnd, newRpc)); |
| r += code[i].data + IRL!(IR.Option); |
| i += code[i].data + IRL!(IR.Option); |
| } |
| pc = i; |
| revPc = revStart; |
| assert(code[pc].code == IR.OrEnd); |
| } |
| else |
| pc += len; |
| } |
| } |
| if (stack.empty) |
| break; |
| start = stack.top[0]; |
| end = stack.top[1]; |
| revPc = stack.top[2]; |
| stack.pop(); |
| } |
| code[] = rev[]; |
| } |
| |
| struct CodeGen |
| { |
| Bytecode[] ir; // resulting bytecode |
| Stack!(uint) fixupStack; // stack of opened start instructions |
| NamedGroup[] dict; // maps name -> user group number |
| Stack!(uint) groupStack; // stack of current number of group |
| uint nesting = 0; // group nesting level and repetitions step |
| uint lookaroundNest = 0; // nesting of lookaround |
| uint counterDepth = 0; // current depth of nested counted repetitions |
| CodepointSet[] charsets; // sets for char classes |
| const(CharMatcher)[] matchers; // matchers for char classes |
| uint[] backrefed; // bitarray for groups refered by backref |
| uint ngroup; // final number of groups (of all patterns) |
| |
| void start(uint length) |
| { |
| if (!__ctfe) |
| ir.reserve((length*5+2)/4); |
| fixupStack.push(0); |
| groupStack.push(1);//0 - whole match |
| } |
| |
| //mark referenced groups for latter processing |
| void markBackref(uint n) |
| { |
| if (n/32 >= backrefed.length) |
| backrefed.length = n/32 + 1; |
| backrefed[n / 32] |= 1 << (n & 31); |
| } |
| |
| bool isOpenGroup(uint n) |
| { |
| import std.algorithm.searching : canFind; |
| // walk the fixup stack and see if there are groups labeled 'n' |
| // fixup '0' is reserved for alternations |
| return fixupStack.data[1..$]. |
| canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)(); |
| } |
| |
| void put(Bytecode code) |
| { |
| enforce(ir.length < maxCompiledLength, |
| "maximum compiled pattern length is exceeded"); |
| ir ~= code; |
| } |
| |
| void putRaw(uint number) |
| { |
| enforce(ir.length < maxCompiledLength, |
| "maximum compiled pattern length is exceeded"); |
| ir ~= Bytecode.fromRaw(number); |
| } |
| |
| //try to generate optimal IR code for this CodepointSet |
| @trusted void charsetToIr(CodepointSet set) |
| {//@@@BUG@@@ writeln is @system |
| uint chars = cast(uint) set.length; |
| if (chars < Bytecode.maxSequence) |
| { |
| switch (chars) |
| { |
| case 1: |
| put(Bytecode(IR.Char, set.byCodepoint.front)); |
| break; |
| case 0: |
| throw new RegexException("empty CodepointSet not allowed"); |
| default: |
| foreach (ch; set.byCodepoint) |
| put(Bytecode(IR.OrChar, ch, chars)); |
| } |
| } |
| else |
| { |
| import std.algorithm.searching : countUntil; |
| const ivals = set.byInterval; |
| immutable n = charsets.countUntil(set); |
| if (n >= 0) |
| { |
| if (ivals.length*2 > maxCharsetUsed) |
| put(Bytecode(IR.Trie, cast(uint) n)); |
| else |
| put(Bytecode(IR.CodepointSet, cast(uint) n)); |
| return; |
| } |
| if (ivals.length*2 > maxCharsetUsed) |
| { |
| auto t = getMatcher(set); |
| put(Bytecode(IR.Trie, cast(uint) matchers.length)); |
| matchers ~= t; |
| debug(std_regex_allocation) writeln("Trie generated"); |
| } |
| else |
| { |
| put(Bytecode(IR.CodepointSet, cast(uint) charsets.length)); |
| matchers ~= CharMatcher.init; |
| } |
| charsets ~= set; |
| assert(charsets.length == matchers.length); |
| } |
| } |
| |
| void genLogicGroup() |
| { |
| nesting++; |
| pushFixup(length); |
| put(Bytecode(IR.Nop, 0)); |
| } |
| |
| void genGroup() |
| { |
| nesting++; |
| pushFixup(length); |
| immutable nglob = groupStack.top++; |
| enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); |
| put(Bytecode(IR.GroupStart, nglob)); |
| } |
| |
| void genNamedGroup(string name) |
| { |
| import std.array : insertInPlace; |
| import std.range : assumeSorted; |
| nesting++; |
| pushFixup(length); |
| immutable nglob = groupStack.top++; |
| enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); |
| auto t = NamedGroup(name, nglob); |
| auto d = assumeSorted!"a.name < b.name"(dict); |
| immutable ind = d.lowerBound(t).length; |
| insertInPlace(dict, ind, t); |
| put(Bytecode(IR.GroupStart, nglob)); |
| } |
| |
| //generate code for start of lookaround: (?= (?! (?<= (?<! |
| void genLookaround(IR opcode) |
| { |
| nesting++; |
| pushFixup(length); |
| put(Bytecode(opcode, 0)); |
| put(Bytecode.fromRaw(0)); |
| put(Bytecode.fromRaw(0)); |
| groupStack.push(0); |
| lookaroundNest++; |
| enforce(lookaroundNest <= maxLookaroundDepth, |
| "maximum lookaround depth is exceeded"); |
| } |
| |
| void endPattern(uint num) |
| { |
| import std.algorithm.comparison : max; |
| put(Bytecode(IR.End, num)); |
| ngroup = max(ngroup, groupStack.top); |
| groupStack.top = 1; // reset group counter |
| } |
| |
| //fixup lookaround with start at offset fix and append a proper *-End opcode |
| void fixLookaround(uint fix) |
| { |
| lookaroundNest--; |
| ir[fix] = Bytecode(ir[fix].code, |
| cast(uint) ir.length - fix - IRL!(IR.LookaheadStart)); |
| auto g = groupStack.pop(); |
| assert(!groupStack.empty); |
| ir[fix+1] = Bytecode.fromRaw(groupStack.top); |
| //groups are cumulative across lookarounds |
| ir[fix+2] = Bytecode.fromRaw(groupStack.top+g); |
| groupStack.top += g; |
| if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart) |
| { |
| reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]); |
| } |
| put(ir[fix].paired); |
| } |
| |
| // repetition of {1,1} |
| void fixRepetition(uint offset) |
| { |
| import std.algorithm.mutation : copy; |
| immutable replace = ir[offset].code == IR.Nop; |
| if (replace) |
| { |
| copy(ir[offset + 1 .. $], ir[offset .. $ - 1]); |
| ir.length -= 1; |
| } |
| } |
| |
| // repetition of {x,y} |
| void fixRepetition(uint offset, uint min, uint max, bool greedy) |
| { |
| static import std.algorithm.comparison; |
| import std.algorithm.mutation : copy; |
| import std.array : insertInPlace; |
| immutable replace = ir[offset].code == IR.Nop; |
| immutable len = cast(uint) ir.length - offset - replace; |
| if (max != infinite) |
| { |
| if (min != 1 || max != 1) |
| { |
| Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); |
| if (replace) |
| ir[offset] = op; |
| else |
| insertInPlace(ir, offset, op); |
| put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); |
| put(Bytecode.init); //hotspot |
| putRaw(1); |
| putRaw(min); |
| putRaw(max); |
| counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); |
| } |
| } |
| else if (min) //&& max is infinite |
| { |
| if (min != 1) |
| { |
| Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); |
| if (replace) |
| ir[offset] = op; |
| else |
| insertInPlace(ir, offset, op); |
| offset += 1;//so it still points to the repeated block |
| put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); |
| put(Bytecode.init); //hotspot |
| putRaw(1); |
| putRaw(min); |
| putRaw(min); |
| counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1); |
| } |
| else if (replace) |
| { |
| copy(ir[offset+1 .. $], ir[offset .. $-1]); |
| ir.length -= 1; |
| } |
| put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len)); |
| enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); |
| ir ~= ir[offset .. offset+len]; |
| //IR.InfinteX is always a hotspot |
| put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); |
| put(Bytecode.init); //merge index |
| } |
| else//vanila {0,inf} |
| { |
| Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); |
| if (replace) |
| ir[offset] = op; |
| else |
| insertInPlace(ir, offset, op); |
| //IR.InfinteX is always a hotspot |
| put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); |
| put(Bytecode.init); //merge index |
| } |
| } |
| |
| void fixAlternation() |
| { |
| import std.array : insertInPlace; |
| uint fix = fixupStack.top; |
| if (ir.length > fix && ir[fix].code == IR.Option) |
| { |
| ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix); |
| put(Bytecode(IR.GotoEndOr, 0)); |
| fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option |
| put(Bytecode(IR.Option, 0)); |
| return; |
| } |
| uint len, orStart; |
| //start a new option |
| if (fixupStack.length == 1) |
| {//only root entry, effectively no fixup |
| len = cast(uint) ir.length + IRL!(IR.GotoEndOr); |
| orStart = 0; |
| } |
| else |
| {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length |
| len = cast(uint) ir.length - fix - (ir[fix].length - 1); |
| orStart = fix + ir[fix].length; |
| } |
| insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); |
| assert(ir[orStart].code == IR.OrStart); |
| put(Bytecode(IR.GotoEndOr, 0)); |
| fixupStack.push(orStart); //fixup for StartOR |
| fixupStack.push(cast(uint) ir.length); //for second Option |
| put(Bytecode(IR.Option, 0)); |
| } |
| |
| // finalizes IR.Option, fix points to the first option of sequence |
| void finishAlternation(uint fix) |
| { |
| enforce(ir[fix].code == IR.Option, "no matching ')'"); |
| ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart)); |
| fix = fixupStack.pop(); |
| enforce(ir[fix].code == IR.OrStart, "no matching ')'"); |
| ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart)); |
| put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart))); |
| uint pc = fix + IRL!(IR.OrStart); |
| while (ir[pc].code == IR.Option) |
| { |
| pc = pc + ir[pc].data; |
| if (ir[pc].code != IR.GotoEndOr) |
| break; |
| ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); |
| pc += IRL!(IR.GotoEndOr); |
| } |
| put(Bytecode.fromRaw(0)); |
| } |
| |
| // returns: (flag - repetition possible?, fixup of the start of this "group") |
| Tuple!(bool, uint) onClose() |
| { |
| nesting--; |
| uint fix = popFixup(); |
| switch (ir[fix].code) |
| { |
| case IR.GroupStart: |
| put(Bytecode(IR.GroupEnd, ir[fix].data)); |
| return tuple(true, fix); |
| case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: |
| assert(lookaroundNest); |
| fixLookaround(fix); |
| return tuple(false, 0u); |
| case IR.Option: //| xxx ) |
| //two fixups: last option + full OR |
| finishAlternation(fix); |
| fix = topFixup; |
| switch (ir[fix].code) |
| { |
| case IR.GroupStart: |
| popFixup(); |
| put(Bytecode(IR.GroupEnd, ir[fix].data)); |
| return tuple(true, fix); |
| case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: |
| assert(lookaroundNest); |
| fix = popFixup(); |
| fixLookaround(fix); |
| return tuple(false, 0u); |
| default://(?:xxx) |
| popFixup(); |
| return tuple(true, fix); |
| } |
| default://(?:xxx) |
| return tuple(true, fix); |
| } |
| } |
| |
| uint popFixup(){ return fixupStack.pop(); } |
| |
| void pushFixup(uint val){ return fixupStack.push(val); } |
| |
| @property uint topFixup(){ return fixupStack.top; } |
| |
| @property size_t fixupLength(){ return fixupStack.data.length; } |
| |
| @property uint length(){ return cast(uint) ir.length; } |
| } |
| |
| // safety limits |
| enum maxGroupNumber = 2^^19; |
| enum maxLookaroundDepth = 16; |
| // *Bytecode.sizeof, i.e. 1Mb of bytecode alone |
| enum maxCompiledLength = 2^^18; |
| // amounts to up to 4 Mb of auxilary table for matching |
| enum maxCumulativeRepetitionLength = 2^^20; |
| // marker to indicate infinite repetition |
| enum infinite = ~0u; |
| |
| struct Parser(R, Generator) |
| if (isForwardRange!R && is(ElementType!R : dchar)) |
| { |
| dchar front; |
| bool empty; |
| R pat, origin; //keep full pattern for pretty printing error messages |
| uint re_flags = 0; //global flags e.g. multiline + internal ones |
| Generator g; |
| |
| @trusted this(S)(R pattern, S flags) |
| if (isSomeString!S) |
| { |
| pat = origin = pattern; |
| //reserve slightly more then avg as sampled from unittests |
| parseFlags(flags); |
| front = ' ';//a safe default for freeform parsing |
| popFront(); |
| g.start(cast(uint) pat.length); |
| try |
| { |
| parseRegex(); |
| } |
| catch (Exception e) |
| { |
| error(e.msg);//also adds pattern location |
| } |
| g.endPattern(1); |
| } |
| |
| void _popFront() |
| { |
| if (pat.empty) |
| { |
| empty = true; |
| } |
| else |
| { |
| front = pat.front; |
| pat.popFront(); |
| } |
| } |
| |
| void skipSpace() |
| { |
| while (!empty && isWhite(front)) _popFront(); |
| } |
| |
| void popFront() |
| { |
| _popFront(); |
| if (re_flags & RegexOption.freeform) skipSpace(); |
| } |
| |
| auto save(){ return this; } |
| |
| //parsing number with basic overflow check |
| uint parseDecimal() |
| { |
| uint r = 0; |
| while (std.ascii.isDigit(front)) |
| { |
| if (r >= (uint.max/10)) |
| error("Overflow in decimal number"); |
| r = 10*r + cast(uint)(front-'0'); |
| popFront(); |
| if (empty) break; |
| } |
| return r; |
| } |
| |
| // |
| @trusted void parseFlags(S)(S flags) |
| {//@@@BUG@@@ text is @system |
| import std.conv : text; |
| foreach (ch; flags)//flags are ASCII anyway |
| { |
| L_FlagSwitch: |
| switch (ch) |
| { |
| |
| foreach (i, op; __traits(allMembers, RegexOption)) |
| { |
| case RegexOptionNames[i]: |
| if (re_flags & mixin("RegexOption."~op)) |
| throw new RegexException(text("redundant flag specified: ",ch)); |
| re_flags |= mixin("RegexOption."~op); |
| break L_FlagSwitch; |
| } |
| default: |
| throw new RegexException(text("unknown regex flag '",ch,"'")); |
| } |
| } |
| } |
| |
| //parse and store IR for regex pattern |
| @trusted void parseRegex() |
| { |
| uint fix;//fixup pointer |
| |
| while (!empty) |
| { |
| debug(std_regex_parser) |
| __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data); |
| switch (front) |
| { |
| case '(': |
| popFront(); |
| if (front == '?') |
| { |
| popFront(); |
| switch (front) |
| { |
| case '#': |
| for (;;) |
| { |
| popFront(); |
| enforce(!empty, "Unexpected end of pattern"); |
| if (front == ')') |
| { |
| popFront(); |
| break; |
| } |
| } |
| break; |
| case ':': |
| g.genLogicGroup(); |
| popFront(); |
| break; |
| case '=': |
| g.genLookaround(IR.LookaheadStart); |
| popFront(); |
| break; |
| case '!': |
| g.genLookaround(IR.NeglookaheadStart); |
| popFront(); |
| break; |
| case 'P': |
| popFront(); |
| enforce(front == '<', "Expected '<' in named group"); |
| string name; |
| popFront(); |
| if (empty || !(isAlpha(front) || front == '_')) |
| error("Expected alpha starting a named group"); |
| name ~= front; |
| popFront(); |
| while (!empty && (isAlpha(front) || |
| front == '_' || std.ascii.isDigit(front))) |
| { |
| name ~= front; |
| popFront(); |
| } |
| enforce(front == '>', "Expected '>' closing named group"); |
| popFront(); |
| g.genNamedGroup(name); |
| break; |
| case '<': |
| popFront(); |
| if (front == '=') |
| g.genLookaround(IR.LookbehindStart); |
| else if (front == '!') |
| g.genLookaround(IR.NeglookbehindStart); |
| else |
| error("'!' or '=' expected after '<'"); |
| popFront(); |
| break; |
| default: |
| uint enableFlags, disableFlags; |
| bool enable = true; |
| do |
| { |
| switch (front) |
| { |
| case 's': |
| if (enable) |
| enableFlags |= RegexOption.singleline; |
| else |
| disableFlags |= RegexOption.singleline; |
| break; |
| case 'x': |
| if (enable) |
| enableFlags |= RegexOption.freeform; |
| else |
| disableFlags |= RegexOption.freeform; |
| break; |
| case 'i': |
| if (enable) |
| enableFlags |= RegexOption.casefold; |
| else |
| disableFlags |= RegexOption.casefold; |
| break; |
| case 'm': |
| if (enable) |
| enableFlags |= RegexOption.multiline; |
| else |
| disableFlags |= RegexOption.multiline; |
| break; |
| case '-': |
| if (!enable) |
| error(" unexpected second '-' in flags"); |
| enable = false; |
| break; |
| default: |
| error(" 's', 'x', 'i', 'm' or '-' expected after '(?' "); |
| } |
| popFront(); |
| }while (front != ')'); |
| popFront(); |
| re_flags |= enableFlags; |
| re_flags &= ~disableFlags; |
| } |
| } |
| else |
| { |
| g.genGroup(); |
| } |
| break; |
| case ')': |
| enforce(g.nesting, "Unmatched ')'"); |
| popFront(); |
| auto pair = g.onClose(); |
| if (pair[0]) |
| parseQuantifier(pair[1]); |
| break; |
| case '|': |
| popFront(); |
| g.fixAlternation(); |
| break; |
| default://no groups or whatever |
| immutable start = g.length; |
| parseAtom(); |
| parseQuantifier(start); |
| } |
| } |
| |
| if (g.fixupLength != 1) |
| { |
| fix = g.popFixup(); |
| g.finishAlternation(fix); |
| enforce(g.fixupLength == 1, "no matching ')'"); |
| } |
| } |
| |
| |
| //parse and store IR for atom-quantifier pair |
| @trusted void parseQuantifier(uint offset) |
| {//copy is @system |
| if (empty) |
| return g.fixRepetition(offset); |
| uint min, max; |
| switch (front) |
| { |
| case '*': |
| min = 0; |
| max = infinite; |
| break; |
| case '?': |
| min = 0; |
| max = 1; |
| break; |
| case '+': |
| min = 1; |
| max = infinite; |
| break; |
| case '{': |
| popFront(); |
| enforce(!empty, "Unexpected end of regex pattern"); |
| enforce(std.ascii.isDigit(front), "First number required in repetition"); |
| min = parseDecimal(); |
| if (front == '}') |
| max = min; |
| else if (front == ',') |
| { |
| popFront(); |
| if (std.ascii.isDigit(front)) |
| max = parseDecimal(); |
| else if (front == '}') |
| max = infinite; |
| else |
| error("Unexpected symbol in regex pattern"); |
| skipSpace(); |
| enforce(front == '}', "Unmatched '{' in regex pattern"); |
| } |
| else |
| error("Unexpected symbol in regex pattern"); |
| enforce(min <= max, "Illegal {n,m} quantifier"); |
| break; |
| default: |
| g.fixRepetition(offset); |
| return; |
| } |
| bool greedy = true; |
| //check only if we managed to get new symbol |
| popFront(); |
| if (!empty && front == '?') |
| { |
| greedy = false; |
| popFront(); |
| } |
| g.fixRepetition(offset, min, max, greedy); |
| } |
| |
| //parse and store IR for atom |
| void parseAtom() |
| { |
| if (empty) |
| return; |
| switch (front) |
| { |
| case '*', '?', '+', '|', '{', '}': |
| return error("'*', '+', '?', '{', '}' not allowed in atom"); |
| case '.': |
| if (re_flags & RegexOption.singleline) |
| g.put(Bytecode(IR.Any, 0)); |
| else |
| { |
| CodepointSet set; |
| g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted); |
| } |
| popFront(); |
| break; |
| case '[': |
| parseCharset(); |
| break; |
| case '\\': |
| _popFront(); |
| enforce(!empty, "Unfinished escape sequence"); |
| parseEscape(); |
| break; |
| case '^': |
| if (re_flags & RegexOption.multiline) |
| g.put(Bytecode(IR.Bol, 0)); |
| else |
| g.put(Bytecode(IR.Bof, 0)); |
| popFront(); |
| break; |
| case '$': |
| if (re_flags & RegexOption.multiline) |
| g.put(Bytecode(IR.Eol, 0)); |
| else |
| g.put(Bytecode(IR.Eof, 0)); |
| popFront(); |
| break; |
| default: |
| if (re_flags & RegexOption.casefold) |
| { |
| auto range = simpleCaseFoldings(front); |
| assert(range.length <= 5); |
| if (range.length == 1) |
| g.put(Bytecode(IR.Char, range.front)); |
| else |
| foreach (v; range) |
| g.put(Bytecode(IR.OrChar, v, cast(uint) range.length)); |
| } |
| else |
| g.put(Bytecode(IR.Char, front)); |
| popFront(); |
| } |
| } |
| |
| //parse and store IR for CodepointSet |
| void parseCharset() |
| { |
| const save = re_flags; |
| re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did |
| bool casefold = cast(bool)(re_flags & RegexOption.casefold); |
| g.charsetToIr(unicode.parseSet(this, casefold)); |
| re_flags = save; |
| // Last next() in parseCharset is executed w/o freeform flag |
| if (re_flags & RegexOption.freeform) skipSpace(); |
| } |
| |
| //parse and generate IR for escape stand alone escape sequence |
| @trusted void parseEscape() |
| {//accesses array of appender |
| import std.algorithm.iteration : sum; |
| switch (front) |
| { |
| case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break; |
| case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break; |
| case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break; |
| case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break; |
| case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break; |
| |
| case 'd': |
| popFront(); |
| g.charsetToIr(unicode.Nd); |
| break; |
| case 'D': |
| popFront(); |
| g.charsetToIr(unicode.Nd.inverted); |
| break; |
| case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break; |
| case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break; |
| case 's': |
| popFront(); |
| g.charsetToIr(unicode.White_Space); |
| break; |
| case 'S': |
| popFront(); |
| g.charsetToIr(unicode.White_Space.inverted); |
| break; |
| case 'w': |
| popFront(); |
| g.charsetToIr(wordCharacter); |
| break; |
| case 'W': |
| popFront(); |
| g.charsetToIr(wordCharacter.inverted); |
| break; |
| case 'p': case 'P': |
| bool casefold = cast(bool)(re_flags & RegexOption.casefold); |
| auto set = unicode.parsePropertySpec(this, front == 'P', casefold); |
| g.charsetToIr(set); |
| break; |
| case 'x': |
| immutable code = parseUniHex(pat, 2); |
| popFront(); |
| g.put(Bytecode(IR.Char,code)); |
| break; |
| case 'u': case 'U': |
| immutable code = parseUniHex(pat, front == 'u' ? 4 : 8); |
| popFront(); |
| g.put(Bytecode(IR.Char, code)); |
| break; |
| case 'c': //control codes |
| Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this)); |
| popFront(); |
| g.put(code); |
| break; |
| case '0': |
| popFront(); |
| g.put(Bytecode(IR.Char, 0));//NUL character |
| break; |
| case '1': .. case '9': |
| uint nref = cast(uint) front - '0'; |
| immutable maxBackref = sum(g.groupStack.data); |
| enforce(nref < maxBackref, "Backref to unseen group"); |
| //perl's disambiguation rule i.e. |
| //get next digit only if there is such group number |
| popFront(); |
| while (nref < maxBackref && !empty && std.ascii.isDigit(front)) |
| { |
| nref = nref * 10 + front - '0'; |
| popFront(); |
| } |
| if (nref >= maxBackref) |
| nref /= 10; |
| enforce(!g.isOpenGroup(nref), "Backref to open group"); |
| uint localLimit = maxBackref - g.groupStack.top; |
| if (nref >= localLimit) |
| { |
| g.put(Bytecode(IR.Backref, nref-localLimit)); |
| g.ir[$-1].setLocalRef(); |
| } |
| else |
| g.put(Bytecode(IR.Backref, nref)); |
| g.markBackref(nref); |
| break; |
| default: |
| if (front == '\\' && !pat.empty) |
| { |
| if (pat.front >= privateUseStart && pat.front <= privateUseEnd) |
| enforce(false, "invalid escape sequence"); |
| } |
| if (front >= privateUseStart && front <= privateUseEnd) |
| { |
| g.endPattern(front - privateUseStart + 1); |
| break; |
| } |
| auto op = Bytecode(IR.Char, front); |
| popFront(); |
| g.put(op); |
| } |
| } |
| |
| // |
| @trusted void error(string msg) |
| { |
| import std.array : appender; |
| import std.format.write : formattedWrite; |
| auto app = appender!string(); |
| formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`", |
| msg, origin[0..$-pat.length], pat); |
| throw new RegexException(app.data); |
| } |
| |
| alias Char = BasicElementOf!R; |
| |
| @property program() |
| { |
| return makeRegex(this); |
| } |
| } |
| |
| /+ |
| Postproces the IR, then optimize. |
| +/ |
| @trusted void postprocess(Char)(ref Regex!Char zis) |
| {//@@@BUG@@@ write is @system |
| with(zis) |
| { |
| struct FixedStack(T) |
| { |
| T[] arr; |
| uint _top; |
| //this(T[] storage){ arr = storage; _top = -1; } |
| @property ref T top(){ assert(!empty); return arr[_top]; } |
| void push(T x){ arr[++_top] = x; } |
| T pop() { assert(!empty); return arr[_top--]; } |
| @property bool empty(){ return _top == -1; } |
| } |
| auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1); |
| counterRange.push(1); |
| ulong cumRange = 0; |
| for (uint i = 0; i < ir.length; i += ir[i].length) |
| { |
| if (ir[i].hotspot) |
| { |
| assert(i + 1 < ir.length, |
| "unexpected end of IR while looking for hotspot"); |
| ir[i+1] = Bytecode.fromRaw(hotspotTableSize); |
| hotspotTableSize += counterRange.top; |
| } |
| switch (ir[i].code) |
| { |
| case IR.RepeatStart, IR.RepeatQStart: |
| uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart)); |
| assert(ir[repEnd].code == ir[i].paired.code); |
| immutable max = ir[repEnd + 4].raw; |
| ir[repEnd+2].raw = counterRange.top; |
| ir[repEnd+3].raw *= counterRange.top; |
| ir[repEnd+4].raw *= counterRange.top; |
| ulong cntRange = cast(ulong)(max)*counterRange.top; |
| cumRange += cntRange; |
| enforce(cumRange < maxCumulativeRepetitionLength, |
| "repetition length limit is exceeded"); |
| counterRange.push(cast(uint) cntRange + counterRange.top); |
| threadCount += counterRange.top; |
| break; |
| case IR.RepeatEnd, IR.RepeatQEnd: |
| threadCount += counterRange.top; |
| counterRange.pop(); |
| break; |
| case IR.GroupStart: |
| if (isBackref(ir[i].data)) |
| ir[i].setBackrefence(); |
| threadCount += counterRange.top; |
| break; |
| case IR.GroupEnd: |
| if (isBackref(ir[i].data)) |
| ir[i].setBackrefence(); |
| threadCount += counterRange.top; |
| break; |
| default: |
| threadCount += counterRange.top; |
| } |
| } |
| checkIfOneShot(); |
| if (!(flags & RegexInfo.oneShot)) |
| kickstart = Kickstart!Char(zis, new uint[](256)); |
| debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount); |
| optimize(zis); |
| } |
| } |
| |
| void fixupBytecode()(Bytecode[] ir) |
| { |
| Stack!uint fixups; |
| |
| with(IR) for (uint i=0; i<ir.length; i+= ir[i].length) |
| { |
| if (ir[i].isStart || ir[i].code == Option) |
| fixups.push(i); |
| else if (ir[i].code == OrEnd) |
| { |
| // Alternatives need more care |
| auto j = fixups.pop(); // last Option |
| ir[j].data = i - j - ir[j].length; |
| j = fixups.pop(); // OrStart |
| ir[j].data = i - j - ir[j].length; |
| ir[i].data = ir[j].data; |
| |
| // fixup all GotoEndOrs |
| j = j + IRL!(OrStart); |
| assert(ir[j].code == Option); |
| for (;;) |
| { |
| auto next = j + ir[j].data + IRL!(Option); |
| if (ir[next].code == IR.OrEnd) |
| break; |
| ir[next - IRL!(GotoEndOr)].data = i - next; |
| j = next; |
| } |
| } |
| else if (ir[i].code == GotoEndOr) |
| { |
| auto j = fixups.pop(); // Option |
| ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option |
| } |
| else if (ir[i].isEnd) |
| { |
| auto j = fixups.pop(); |
| ir[i].data = i - j - ir[j].length; |
| ir[j].data = ir[i].data; |
| } |
| } |
| assert(fixups.empty); |
| } |
| |
| void optimize(Char)(ref Regex!Char zis) |
| { |
| import std.array : insertInPlace; |
| CodepointSet nextSet(uint idx) |
| { |
| CodepointSet set; |
| with(zis) with(IR) |
| Outer: |
| for (uint i = idx; i < ir.length; i += ir[i].length) |
| { |
| switch (ir[i].code) |
| { |
| case Char: |
| set.add(ir[i].data, ir[i].data+1); |
| goto default; |
| //TODO: OrChar |
| case Trie, CodepointSet: |
| set = zis.charsets[ir[i].data]; |
| goto default; |
| case GroupStart,GroupEnd: |
| break; |
| default: |
| break Outer; |
| } |
| } |
| return set; |
| } |
| |
| with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length) |
| { |
| if (ir[i].code == InfiniteEnd) |
| { |
| auto set = nextSet(i+IRL!(InfiniteEnd)); |
| if (!set.empty && set.length < 10_000) |
| { |
| ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data); |
| ir[i - ir[i].data - IRL!(InfiniteStart)] = |
| Bytecode(InfiniteBloomStart, ir[i].data); |
| ir.insertInPlace(i+IRL!(InfiniteEnd), |
| Bytecode.fromRaw(cast(uint) zis.filters.length)); |
| zis.filters ~= BitTable(set); |
| fixupBytecode(ir); |
| } |
| } |
| } |
| } |
| |
| //IR code validator - proper nesting, illegal instructions, etc. |
| @trusted void validateRe(Char)(ref Regex!Char zis) |
| {//@@@BUG@@@ text is @system |
| import std.conv : text; |
| with(zis) |
| { |
| for (uint pc = 0; pc < ir.length; pc += ir[pc].length) |
| { |
| if (ir[pc].isStart || ir[pc].isEnd) |
| { |
| immutable dest = ir[pc].indexOfPair(pc); |
| assert(dest < ir.length, text("Wrong length in opcode at pc=", |
| pc, " ", dest, " vs ", ir.length)); |
| assert(ir[dest].paired == ir[pc], |
| text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest)); |
| } |
| else if (ir[pc].isAtom) |
| { |
| |
| } |
| else |
| assert(0, text("Unknown type of instruction at pc=", pc)); |
| } |
| } |
| } |