blob: 448bb99a9a132fd2a7c37cae59abec446b316a40 [file] [log] [blame]
//Written in the D programming language
/*
Regular expression pattern parser.
*/
module std.regex.internal.parser;
import std.regex.internal.ir;
import std.range.primitives, std.uni, std.meta,
std.traits, std.typecons, std.exception;
static import std.ascii;
// package relevant info from parser into a regex object
auto makeRegex(S, CG)(Parser!(S, CG) p)
{
import std.regex.internal.backtracking : BacktrackingMatcher;
import std.regex.internal.thompson : ThompsonMatcher;
import std.algorithm.searching : canFind;
alias Char = BasicElementOf!S;
Regex!Char re;
auto g = p.g;
with(re)
{
ir = g.ir;
dict = g.dict;
ngroup = g.ngroup;
maxCounterDepth = g.counterDepth;
flags = p.re_flags;
charsets = g.charsets;
matchers = g.matchers;
backrefed = g.backrefed;
re.pattern = p.origin.idup;
re.postprocess();
// check if we have backreferences, if so - use backtracking
if (__ctfe) factory = null; // allows us to use the awful enum re = regex(...);
else if (re.backrefed.canFind!"a != 0")
factory = new RuntimeFactory!(BacktrackingMatcher, Char);
else
factory = new RuntimeFactory!(ThompsonMatcher, Char);
debug(std_regex_parser)
{
__ctfe || print();
}
//@@@BUG@@@ (not reduced)
//somehow just using validate _collides_ with std.utf.validate (!)
version (assert) re.validateRe();
}
return re;
}
// helper for unittest
auto makeRegex(S)(S arg)
if (isSomeString!S)
{
return makeRegex(Parser!(S, CodeGen)(arg, ""));
}
@system unittest
{
import std.algorithm.comparison : equal;
auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
auto nc = re.namedCaptures;
static assert(isRandomAccessRange!(typeof(nc)));
assert(!nc.empty);
assert(nc.length == 2);
assert(nc.equal(["name", "var"]));
assert(nc[0] == "name");
assert(nc[1..$].equal(["var"]));
re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
nc = re.namedCaptures;
assert(nc.length == 1);
assert(nc[0] == "named");
assert(nc.front == "named");
assert(nc.back == "named");
re = makeRegex(`(\w+) (\w+)`);
nc = re.namedCaptures;
assert(nc.empty);
re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
nc = re.namedCaptures;
auto cp = nc.save;
assert(nc.equal(cp));
nc.popFront();
assert(nc.equal(cp[1..$]));
nc.popBack();
assert(nc.equal(cp[1 .. $ - 1]));
}
@trusted void reverseBytecode()(Bytecode[] code)
{
Bytecode[] rev = new Bytecode[code.length];
uint revPc = cast(uint) rev.length;
Stack!(Tuple!(uint, uint, uint)) stack;
uint start = 0;
uint end = cast(uint) code.length;
for (;;)
{
for (uint pc = start; pc < end; )
{
immutable len = code[pc].length;
if (code[pc].code == IR.GotoEndOr)
break; //pick next alternation branch
if (code[pc].isAtom)
{
rev[revPc - len .. revPc] = code[pc .. pc + len];
revPc -= len;
pc += len;
}
else if (code[pc].isStart || code[pc].isEnd)
{
//skip over other embedded lookbehinds they are reversed
if (code[pc].code == IR.LookbehindStart
|| code[pc].code == IR.NeglookbehindStart)
{
immutable blockLen = len + code[pc].data
+ code[pc].pairedLength;
rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
pc += blockLen;
revPc -= blockLen;
continue;
}
immutable second = code[pc].indexOfPair(pc);
immutable secLen = code[second].length;
rev[revPc - secLen .. revPc] = code[second .. second + secLen];
revPc -= secLen;
if (code[pc].code == IR.OrStart)
{
//we pass len bytes forward, but secLen in reverse
immutable revStart = revPc - (second + len - secLen - pc);
uint r = revStart;
uint i = pc + IRL!(IR.OrStart);
while (code[i].code == IR.Option)
{
if (code[i - 1].code != IR.OrStart)
{
assert(code[i - 1].code == IR.GotoEndOr);
rev[r - 1] = code[i - 1];
}
rev[r] = code[i];
auto newStart = i + IRL!(IR.Option);
auto newEnd = newStart + code[i].data;
auto newRpc = r + code[i].data + IRL!(IR.Option);
if (code[newEnd].code != IR.OrEnd)
{
newRpc--;
}
stack.push(tuple(newStart, newEnd, newRpc));
r += code[i].data + IRL!(IR.Option);
i += code[i].data + IRL!(IR.Option);
}
pc = i;
revPc = revStart;
assert(code[pc].code == IR.OrEnd);
}
else
pc += len;
}
}
if (stack.empty)
break;
start = stack.top[0];
end = stack.top[1];
revPc = stack.top[2];
stack.pop();
}
code[] = rev[];
}
struct CodeGen
{
Bytecode[] ir; // resulting bytecode
Stack!(uint) fixupStack; // stack of opened start instructions
NamedGroup[] dict; // maps name -> user group number
Stack!(uint) groupStack; // stack of current number of group
uint nesting = 0; // group nesting level and repetitions step
uint lookaroundNest = 0; // nesting of lookaround
uint counterDepth = 0; // current depth of nested counted repetitions
CodepointSet[] charsets; // sets for char classes
const(CharMatcher)[] matchers; // matchers for char classes
uint[] backrefed; // bitarray for groups refered by backref
uint ngroup; // final number of groups (of all patterns)
void start(uint length)
{
if (!__ctfe)
ir.reserve((length*5+2)/4);
fixupStack.push(0);
groupStack.push(1);//0 - whole match
}
//mark referenced groups for latter processing
void markBackref(uint n)
{
if (n/32 >= backrefed.length)
backrefed.length = n/32 + 1;
backrefed[n / 32] |= 1 << (n & 31);
}
bool isOpenGroup(uint n)
{
import std.algorithm.searching : canFind;
// walk the fixup stack and see if there are groups labeled 'n'
// fixup '0' is reserved for alternations
return fixupStack.data[1..$].
canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
}
void put(Bytecode code)
{
enforce(ir.length < maxCompiledLength,
"maximum compiled pattern length is exceeded");
ir ~= code;
}
void putRaw(uint number)
{
enforce(ir.length < maxCompiledLength,
"maximum compiled pattern length is exceeded");
ir ~= Bytecode.fromRaw(number);
}
//try to generate optimal IR code for this CodepointSet
@trusted void charsetToIr(CodepointSet set)
{//@@@BUG@@@ writeln is @system
uint chars = cast(uint) set.length;
if (chars < Bytecode.maxSequence)
{
switch (chars)
{
case 1:
put(Bytecode(IR.Char, set.byCodepoint.front));
break;
case 0:
throw new RegexException("empty CodepointSet not allowed");
default:
foreach (ch; set.byCodepoint)
put(Bytecode(IR.OrChar, ch, chars));
}
}
else
{
import std.algorithm.searching : countUntil;
const ivals = set.byInterval;
immutable n = charsets.countUntil(set);
if (n >= 0)
{
if (ivals.length*2 > maxCharsetUsed)
put(Bytecode(IR.Trie, cast(uint) n));
else
put(Bytecode(IR.CodepointSet, cast(uint) n));
return;
}
if (ivals.length*2 > maxCharsetUsed)
{
auto t = getMatcher(set);
put(Bytecode(IR.Trie, cast(uint) matchers.length));
matchers ~= t;
debug(std_regex_allocation) writeln("Trie generated");
}
else
{
put(Bytecode(IR.CodepointSet, cast(uint) charsets.length));
matchers ~= CharMatcher.init;
}
charsets ~= set;
assert(charsets.length == matchers.length);
}
}
void genLogicGroup()
{
nesting++;
pushFixup(length);
put(Bytecode(IR.Nop, 0));
}
void genGroup()
{
nesting++;
pushFixup(length);
immutable nglob = groupStack.top++;
enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
put(Bytecode(IR.GroupStart, nglob));
}
void genNamedGroup(string name)
{
import std.array : insertInPlace;
import std.range : assumeSorted;
nesting++;
pushFixup(length);
immutable nglob = groupStack.top++;
enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
auto t = NamedGroup(name, nglob);
auto d = assumeSorted!"a.name < b.name"(dict);
immutable ind = d.lowerBound(t).length;
insertInPlace(dict, ind, t);
put(Bytecode(IR.GroupStart, nglob));
}
//generate code for start of lookaround: (?= (?! (?<= (?<!
void genLookaround(IR opcode)
{
nesting++;
pushFixup(length);
put(Bytecode(opcode, 0));
put(Bytecode.fromRaw(0));
put(Bytecode.fromRaw(0));
groupStack.push(0);
lookaroundNest++;
enforce(lookaroundNest <= maxLookaroundDepth,
"maximum lookaround depth is exceeded");
}
void endPattern(uint num)
{
import std.algorithm.comparison : max;
put(Bytecode(IR.End, num));
ngroup = max(ngroup, groupStack.top);
groupStack.top = 1; // reset group counter
}
//fixup lookaround with start at offset fix and append a proper *-End opcode
void fixLookaround(uint fix)
{
lookaroundNest--;
ir[fix] = Bytecode(ir[fix].code,
cast(uint) ir.length - fix - IRL!(IR.LookaheadStart));
auto g = groupStack.pop();
assert(!groupStack.empty);
ir[fix+1] = Bytecode.fromRaw(groupStack.top);
//groups are cumulative across lookarounds
ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
groupStack.top += g;
if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
{
reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
}
put(ir[fix].paired);
}
// repetition of {1,1}
void fixRepetition(uint offset)
{
import std.algorithm.mutation : copy;
immutable replace = ir[offset].code == IR.Nop;
if (replace)
{
copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
ir.length -= 1;
}
}
// repetition of {x,y}
void fixRepetition(uint offset, uint min, uint max, bool greedy)
{
static import std.algorithm.comparison;
import std.algorithm.mutation : copy;
import std.array : insertInPlace;
immutable replace = ir[offset].code == IR.Nop;
immutable len = cast(uint) ir.length - offset - replace;
if (max != infinite)
{
if (min != 1 || max != 1)
{
Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
if (replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
put(Bytecode.init); //hotspot
putRaw(1);
putRaw(min);
putRaw(max);
counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
}
}
else if (min) //&& max is infinite
{
if (min != 1)
{
Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
if (replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
offset += 1;//so it still points to the repeated block
put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
put(Bytecode.init); //hotspot
putRaw(1);
putRaw(min);
putRaw(min);
counterDepth = std.algorithm.comparison.max(counterDepth, nesting+1);
}
else if (replace)
{
copy(ir[offset+1 .. $], ir[offset .. $-1]);
ir.length -= 1;
}
put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
ir ~= ir[offset .. offset+len];
//IR.InfinteX is always a hotspot
put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
put(Bytecode.init); //merge index
}
else//vanila {0,inf}
{
Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
if (replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
//IR.InfinteX is always a hotspot
put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
put(Bytecode.init); //merge index
}
}
void fixAlternation()
{
import std.array : insertInPlace;
uint fix = fixupStack.top;
if (ir.length > fix && ir[fix].code == IR.Option)
{
ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix);
put(Bytecode(IR.GotoEndOr, 0));
fixupStack.top = cast(uint) ir.length; //replace latest fixup for Option
put(Bytecode(IR.Option, 0));
return;
}
uint len, orStart;
//start a new option
if (fixupStack.length == 1)
{//only root entry, effectively no fixup
len = cast(uint) ir.length + IRL!(IR.GotoEndOr);
orStart = 0;
}
else
{//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
len = cast(uint) ir.length - fix - (ir[fix].length - 1);
orStart = fix + ir[fix].length;
}
insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
assert(ir[orStart].code == IR.OrStart);
put(Bytecode(IR.GotoEndOr, 0));
fixupStack.push(orStart); //fixup for StartOR
fixupStack.push(cast(uint) ir.length); //for second Option
put(Bytecode(IR.Option, 0));
}
// finalizes IR.Option, fix points to the first option of sequence
void finishAlternation(uint fix)
{
enforce(ir[fix].code == IR.Option, "no matching ')'");
ir[fix] = Bytecode(ir[fix].code, cast(uint) ir.length - fix - IRL!(IR.OrStart));
fix = fixupStack.pop();
enforce(ir[fix].code == IR.OrStart, "no matching ')'");
ir[fix] = Bytecode(IR.OrStart, cast(uint) ir.length - fix - IRL!(IR.OrStart));
put(Bytecode(IR.OrEnd, cast(uint) ir.length - fix - IRL!(IR.OrStart)));
uint pc = fix + IRL!(IR.OrStart);
while (ir[pc].code == IR.Option)
{
pc = pc + ir[pc].data;
if (ir[pc].code != IR.GotoEndOr)
break;
ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
pc += IRL!(IR.GotoEndOr);
}
put(Bytecode.fromRaw(0));
}
// returns: (flag - repetition possible?, fixup of the start of this "group")
Tuple!(bool, uint) onClose()
{
nesting--;
uint fix = popFixup();
switch (ir[fix].code)
{
case IR.GroupStart:
put(Bytecode(IR.GroupEnd, ir[fix].data));
return tuple(true, fix);
case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
assert(lookaroundNest);
fixLookaround(fix);
return tuple(false, 0u);
case IR.Option: //| xxx )
//two fixups: last option + full OR
finishAlternation(fix);
fix = topFixup;
switch (ir[fix].code)
{
case IR.GroupStart:
popFixup();
put(Bytecode(IR.GroupEnd, ir[fix].data));
return tuple(true, fix);
case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
assert(lookaroundNest);
fix = popFixup();
fixLookaround(fix);
return tuple(false, 0u);
default://(?:xxx)
popFixup();
return tuple(true, fix);
}
default://(?:xxx)
return tuple(true, fix);
}
}
uint popFixup(){ return fixupStack.pop(); }
void pushFixup(uint val){ return fixupStack.push(val); }
@property uint topFixup(){ return fixupStack.top; }
@property size_t fixupLength(){ return fixupStack.data.length; }
@property uint length(){ return cast(uint) ir.length; }
}
// safety limits
enum maxGroupNumber = 2^^19;
enum maxLookaroundDepth = 16;
// *Bytecode.sizeof, i.e. 1Mb of bytecode alone
enum maxCompiledLength = 2^^18;
// amounts to up to 4 Mb of auxilary table for matching
enum maxCumulativeRepetitionLength = 2^^20;
// marker to indicate infinite repetition
enum infinite = ~0u;
struct Parser(R, Generator)
if (isForwardRange!R && is(ElementType!R : dchar))
{
dchar front;
bool empty;
R pat, origin; //keep full pattern for pretty printing error messages
uint re_flags = 0; //global flags e.g. multiline + internal ones
Generator g;
@trusted this(S)(R pattern, S flags)
if (isSomeString!S)
{
pat = origin = pattern;
//reserve slightly more then avg as sampled from unittests
parseFlags(flags);
front = ' ';//a safe default for freeform parsing
popFront();
g.start(cast(uint) pat.length);
try
{
parseRegex();
}
catch (Exception e)
{
error(e.msg);//also adds pattern location
}
g.endPattern(1);
}
void _popFront()
{
if (pat.empty)
{
empty = true;
}
else
{
front = pat.front;
pat.popFront();
}
}
void skipSpace()
{
while (!empty && isWhite(front)) _popFront();
}
void popFront()
{
_popFront();
if (re_flags & RegexOption.freeform) skipSpace();
}
auto save(){ return this; }
//parsing number with basic overflow check
uint parseDecimal()
{
uint r = 0;
while (std.ascii.isDigit(front))
{
if (r >= (uint.max/10))
error("Overflow in decimal number");
r = 10*r + cast(uint)(front-'0');
popFront();
if (empty) break;
}
return r;
}
//
@trusted void parseFlags(S)(S flags)
{//@@@BUG@@@ text is @system
import std.conv : text;
foreach (ch; flags)//flags are ASCII anyway
{
L_FlagSwitch:
switch (ch)
{
foreach (i, op; __traits(allMembers, RegexOption))
{
case RegexOptionNames[i]:
if (re_flags & mixin("RegexOption."~op))
throw new RegexException(text("redundant flag specified: ",ch));
re_flags |= mixin("RegexOption."~op);
break L_FlagSwitch;
}
default:
throw new RegexException(text("unknown regex flag '",ch,"'"));
}
}
}
//parse and store IR for regex pattern
@trusted void parseRegex()
{
uint fix;//fixup pointer
while (!empty)
{
debug(std_regex_parser)
__ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.data);
switch (front)
{
case '(':
popFront();
if (front == '?')
{
popFront();
switch (front)
{
case '#':
for (;;)
{
popFront();
enforce(!empty, "Unexpected end of pattern");
if (front == ')')
{
popFront();
break;
}
}
break;
case ':':
g.genLogicGroup();
popFront();
break;
case '=':
g.genLookaround(IR.LookaheadStart);
popFront();
break;
case '!':
g.genLookaround(IR.NeglookaheadStart);
popFront();
break;
case 'P':
popFront();
enforce(front == '<', "Expected '<' in named group");
string name;
popFront();
if (empty || !(isAlpha(front) || front == '_'))
error("Expected alpha starting a named group");
name ~= front;
popFront();
while (!empty && (isAlpha(front) ||
front == '_' || std.ascii.isDigit(front)))
{
name ~= front;
popFront();
}
enforce(front == '>', "Expected '>' closing named group");
popFront();
g.genNamedGroup(name);
break;
case '<':
popFront();
if (front == '=')
g.genLookaround(IR.LookbehindStart);
else if (front == '!')
g.genLookaround(IR.NeglookbehindStart);
else
error("'!' or '=' expected after '<'");
popFront();
break;
default:
uint enableFlags, disableFlags;
bool enable = true;
do
{
switch (front)
{
case 's':
if (enable)
enableFlags |= RegexOption.singleline;
else
disableFlags |= RegexOption.singleline;
break;
case 'x':
if (enable)
enableFlags |= RegexOption.freeform;
else
disableFlags |= RegexOption.freeform;
break;
case 'i':
if (enable)
enableFlags |= RegexOption.casefold;
else
disableFlags |= RegexOption.casefold;
break;
case 'm':
if (enable)
enableFlags |= RegexOption.multiline;
else
disableFlags |= RegexOption.multiline;
break;
case '-':
if (!enable)
error(" unexpected second '-' in flags");
enable = false;
break;
default:
error(" 's', 'x', 'i', 'm' or '-' expected after '(?' ");
}
popFront();
}while (front != ')');
popFront();
re_flags |= enableFlags;
re_flags &= ~disableFlags;
}
}
else
{
g.genGroup();
}
break;
case ')':
enforce(g.nesting, "Unmatched ')'");
popFront();
auto pair = g.onClose();
if (pair[0])
parseQuantifier(pair[1]);
break;
case '|':
popFront();
g.fixAlternation();
break;
default://no groups or whatever
immutable start = g.length;
parseAtom();
parseQuantifier(start);
}
}
if (g.fixupLength != 1)
{
fix = g.popFixup();
g.finishAlternation(fix);
enforce(g.fixupLength == 1, "no matching ')'");
}
}
//parse and store IR for atom-quantifier pair
@trusted void parseQuantifier(uint offset)
{//copy is @system
if (empty)
return g.fixRepetition(offset);
uint min, max;
switch (front)
{
case '*':
min = 0;
max = infinite;
break;
case '?':
min = 0;
max = 1;
break;
case '+':
min = 1;
max = infinite;
break;
case '{':
popFront();
enforce(!empty, "Unexpected end of regex pattern");
enforce(std.ascii.isDigit(front), "First number required in repetition");
min = parseDecimal();
if (front == '}')
max = min;
else if (front == ',')
{
popFront();
if (std.ascii.isDigit(front))
max = parseDecimal();
else if (front == '}')
max = infinite;
else
error("Unexpected symbol in regex pattern");
skipSpace();
enforce(front == '}', "Unmatched '{' in regex pattern");
}
else
error("Unexpected symbol in regex pattern");
enforce(min <= max, "Illegal {n,m} quantifier");
break;
default:
g.fixRepetition(offset);
return;
}
bool greedy = true;
//check only if we managed to get new symbol
popFront();
if (!empty && front == '?')
{
greedy = false;
popFront();
}
g.fixRepetition(offset, min, max, greedy);
}
//parse and store IR for atom
void parseAtom()
{
if (empty)
return;
switch (front)
{
case '*', '?', '+', '|', '{', '}':
return error("'*', '+', '?', '{', '}' not allowed in atom");
case '.':
if (re_flags & RegexOption.singleline)
g.put(Bytecode(IR.Any, 0));
else
{
CodepointSet set;
g.charsetToIr(set.add('\n','\n'+1).add('\r', '\r'+1).inverted);
}
popFront();
break;
case '[':
parseCharset();
break;
case '\\':
_popFront();
enforce(!empty, "Unfinished escape sequence");
parseEscape();
break;
case '^':
if (re_flags & RegexOption.multiline)
g.put(Bytecode(IR.Bol, 0));
else
g.put(Bytecode(IR.Bof, 0));
popFront();
break;
case '$':
if (re_flags & RegexOption.multiline)
g.put(Bytecode(IR.Eol, 0));
else
g.put(Bytecode(IR.Eof, 0));
popFront();
break;
default:
if (re_flags & RegexOption.casefold)
{
auto range = simpleCaseFoldings(front);
assert(range.length <= 5);
if (range.length == 1)
g.put(Bytecode(IR.Char, range.front));
else
foreach (v; range)
g.put(Bytecode(IR.OrChar, v, cast(uint) range.length));
}
else
g.put(Bytecode(IR.Char, front));
popFront();
}
}
//parse and store IR for CodepointSet
void parseCharset()
{
const save = re_flags;
re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
bool casefold = cast(bool)(re_flags & RegexOption.casefold);
g.charsetToIr(unicode.parseSet(this, casefold));
re_flags = save;
// Last next() in parseCharset is executed w/o freeform flag
if (re_flags & RegexOption.freeform) skipSpace();
}
//parse and generate IR for escape stand alone escape sequence
@trusted void parseEscape()
{//accesses array of appender
import std.algorithm.iteration : sum;
switch (front)
{
case 'f': popFront(); g.put(Bytecode(IR.Char, '\f')); break;
case 'n': popFront(); g.put(Bytecode(IR.Char, '\n')); break;
case 'r': popFront(); g.put(Bytecode(IR.Char, '\r')); break;
case 't': popFront(); g.put(Bytecode(IR.Char, '\t')); break;
case 'v': popFront(); g.put(Bytecode(IR.Char, '\v')); break;
case 'd':
popFront();
g.charsetToIr(unicode.Nd);
break;
case 'D':
popFront();
g.charsetToIr(unicode.Nd.inverted);
break;
case 'b': popFront(); g.put(Bytecode(IR.Wordboundary, 0)); break;
case 'B': popFront(); g.put(Bytecode(IR.Notwordboundary, 0)); break;
case 's':
popFront();
g.charsetToIr(unicode.White_Space);
break;
case 'S':
popFront();
g.charsetToIr(unicode.White_Space.inverted);
break;
case 'w':
popFront();
g.charsetToIr(wordCharacter);
break;
case 'W':
popFront();
g.charsetToIr(wordCharacter.inverted);
break;
case 'p': case 'P':
bool casefold = cast(bool)(re_flags & RegexOption.casefold);
auto set = unicode.parsePropertySpec(this, front == 'P', casefold);
g.charsetToIr(set);
break;
case 'x':
immutable code = parseUniHex(pat, 2);
popFront();
g.put(Bytecode(IR.Char,code));
break;
case 'u': case 'U':
immutable code = parseUniHex(pat, front == 'u' ? 4 : 8);
popFront();
g.put(Bytecode(IR.Char, code));
break;
case 'c': //control codes
Bytecode code = Bytecode(IR.Char, unicode.parseControlCode(this));
popFront();
g.put(code);
break;
case '0':
popFront();
g.put(Bytecode(IR.Char, 0));//NUL character
break;
case '1': .. case '9':
uint nref = cast(uint) front - '0';
immutable maxBackref = sum(g.groupStack.data);
enforce(nref < maxBackref, "Backref to unseen group");
//perl's disambiguation rule i.e.
//get next digit only if there is such group number
popFront();
while (nref < maxBackref && !empty && std.ascii.isDigit(front))
{
nref = nref * 10 + front - '0';
popFront();
}
if (nref >= maxBackref)
nref /= 10;
enforce(!g.isOpenGroup(nref), "Backref to open group");
uint localLimit = maxBackref - g.groupStack.top;
if (nref >= localLimit)
{
g.put(Bytecode(IR.Backref, nref-localLimit));
g.ir[$-1].setLocalRef();
}
else
g.put(Bytecode(IR.Backref, nref));
g.markBackref(nref);
break;
default:
if (front == '\\' && !pat.empty)
{
if (pat.front >= privateUseStart && pat.front <= privateUseEnd)
enforce(false, "invalid escape sequence");
}
if (front >= privateUseStart && front <= privateUseEnd)
{
g.endPattern(front - privateUseStart + 1);
break;
}
auto op = Bytecode(IR.Char, front);
popFront();
g.put(op);
}
}
//
@trusted void error(string msg)
{
import std.array : appender;
import std.format.write : formattedWrite;
auto app = appender!string();
formattedWrite(app, "%s\nPattern with error: `%s` <--HERE-- `%s`",
msg, origin[0..$-pat.length], pat);
throw new RegexException(app.data);
}
alias Char = BasicElementOf!R;
@property program()
{
return makeRegex(this);
}
}
/+
Postproces the IR, then optimize.
+/
@trusted void postprocess(Char)(ref Regex!Char zis)
{//@@@BUG@@@ write is @system
with(zis)
{
struct FixedStack(T)
{
T[] arr;
uint _top;
//this(T[] storage){ arr = storage; _top = -1; }
@property ref T top(){ assert(!empty); return arr[_top]; }
void push(T x){ arr[++_top] = x; }
T pop() { assert(!empty); return arr[_top--]; }
@property bool empty(){ return _top == -1; }
}
auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
counterRange.push(1);
ulong cumRange = 0;
for (uint i = 0; i < ir.length; i += ir[i].length)
{
if (ir[i].hotspot)
{
assert(i + 1 < ir.length,
"unexpected end of IR while looking for hotspot");
ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
hotspotTableSize += counterRange.top;
}
switch (ir[i].code)
{
case IR.RepeatStart, IR.RepeatQStart:
uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
assert(ir[repEnd].code == ir[i].paired.code);
immutable max = ir[repEnd + 4].raw;
ir[repEnd+2].raw = counterRange.top;
ir[repEnd+3].raw *= counterRange.top;
ir[repEnd+4].raw *= counterRange.top;
ulong cntRange = cast(ulong)(max)*counterRange.top;
cumRange += cntRange;
enforce(cumRange < maxCumulativeRepetitionLength,
"repetition length limit is exceeded");
counterRange.push(cast(uint) cntRange + counterRange.top);
threadCount += counterRange.top;
break;
case IR.RepeatEnd, IR.RepeatQEnd:
threadCount += counterRange.top;
counterRange.pop();
break;
case IR.GroupStart:
if (isBackref(ir[i].data))
ir[i].setBackrefence();
threadCount += counterRange.top;
break;
case IR.GroupEnd:
if (isBackref(ir[i].data))
ir[i].setBackrefence();
threadCount += counterRange.top;
break;
default:
threadCount += counterRange.top;
}
}
checkIfOneShot();
if (!(flags & RegexInfo.oneShot))
kickstart = Kickstart!Char(zis, new uint[](256));
debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
optimize(zis);
}
}
void fixupBytecode()(Bytecode[] ir)
{
Stack!uint fixups;
with(IR) for (uint i=0; i<ir.length; i+= ir[i].length)
{
if (ir[i].isStart || ir[i].code == Option)
fixups.push(i);
else if (ir[i].code == OrEnd)
{
// Alternatives need more care
auto j = fixups.pop(); // last Option
ir[j].data = i - j - ir[j].length;
j = fixups.pop(); // OrStart
ir[j].data = i - j - ir[j].length;
ir[i].data = ir[j].data;
// fixup all GotoEndOrs
j = j + IRL!(OrStart);
assert(ir[j].code == Option);
for (;;)
{
auto next = j + ir[j].data + IRL!(Option);
if (ir[next].code == IR.OrEnd)
break;
ir[next - IRL!(GotoEndOr)].data = i - next;
j = next;
}
}
else if (ir[i].code == GotoEndOr)
{
auto j = fixups.pop(); // Option
ir[j].data = i - j + IRL!(GotoEndOr)- IRL!(Option); // to the next option
}
else if (ir[i].isEnd)
{
auto j = fixups.pop();
ir[i].data = i - j - ir[j].length;
ir[j].data = ir[i].data;
}
}
assert(fixups.empty);
}
void optimize(Char)(ref Regex!Char zis)
{
import std.array : insertInPlace;
CodepointSet nextSet(uint idx)
{
CodepointSet set;
with(zis) with(IR)
Outer:
for (uint i = idx; i < ir.length; i += ir[i].length)
{
switch (ir[i].code)
{
case Char:
set.add(ir[i].data, ir[i].data+1);
goto default;
//TODO: OrChar
case Trie, CodepointSet:
set = zis.charsets[ir[i].data];
goto default;
case GroupStart,GroupEnd:
break;
default:
break Outer;
}
}
return set;
}
with(zis) with(IR) for (uint i = 0; i < ir.length; i += ir[i].length)
{
if (ir[i].code == InfiniteEnd)
{
auto set = nextSet(i+IRL!(InfiniteEnd));
if (!set.empty && set.length < 10_000)
{
ir[i] = Bytecode(InfiniteBloomEnd, ir[i].data);
ir[i - ir[i].data - IRL!(InfiniteStart)] =
Bytecode(InfiniteBloomStart, ir[i].data);
ir.insertInPlace(i+IRL!(InfiniteEnd),
Bytecode.fromRaw(cast(uint) zis.filters.length));
zis.filters ~= BitTable(set);
fixupBytecode(ir);
}
}
}
}
//IR code validator - proper nesting, illegal instructions, etc.
@trusted void validateRe(Char)(ref Regex!Char zis)
{//@@@BUG@@@ text is @system
import std.conv : text;
with(zis)
{
for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
{
if (ir[pc].isStart || ir[pc].isEnd)
{
immutable dest = ir[pc].indexOfPair(pc);
assert(dest < ir.length, text("Wrong length in opcode at pc=",
pc, " ", dest, " vs ", ir.length));
assert(ir[dest].paired == ir[pc],
text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
}
else if (ir[pc].isAtom)
{
}
else
assert(0, text("Unknown type of instruction at pc=", pc));
}
}
}