blob: ffc9779923ae4a8b8fca559d957b065e62465c10 [file] [log] [blame]
/*
Implementation of backtracking std.regex engine.
Contains both compile-time and run-time versions.
*/
module std.regex.internal.backtracking;
package(std.regex):
import core.stdc.stdlib, std.range.primitives, std.traits, std.typecons;
import std.regex.internal.ir;
/+
BacktrackingMatcher implements backtracking scheme of matching
regular expressions.
+/
template BacktrackingMatcher(bool CTregex)
{
@trusted struct BacktrackingMatcher(Char, Stream = Input!Char)
if (is(Char : dchar))
{
alias DataIndex = Stream.DataIndex;
struct State
{//top bit in pc is set if saved along with matches
DataIndex index;
uint pc, counter, infiniteNesting;
}
static assert(State.sizeof % size_t.sizeof == 0);
enum stateSize = State.sizeof / size_t.sizeof;
enum initialStack = 1 << 11; // items in a block of segmented stack
alias String = const(Char)[];
alias RegEx = Regex!Char;
alias MatchFn = bool function (ref BacktrackingMatcher!(Char, Stream));
RegEx re; //regex program
static if (CTregex)
MatchFn nativeFn; //native code for that program
//Stream state
Stream s;
DataIndex index;
dchar front;
bool exhausted;
//backtracking machine state
uint pc, counter;
DataIndex lastState = 0; //top of state stack
static if (!CTregex)
uint infiniteNesting;
size_t[] memory;
Trace[] merge;
static struct Trace
{
ulong mask;
size_t offset;
bool mark(size_t idx)
{
immutable d = idx - offset;
if (d < 64) // including overflow
{
immutable p = mask & (1UL << d);
mask |= 1UL << d;
return p != 0;
}
else
{
offset = idx;
mask = 1;
return false;
}
}
}
//local slice of matches, global for backref
Group!DataIndex[] matches, backrefed;
static if (__traits(hasMember,Stream, "search"))
{
enum kicked = true;
}
else
enum kicked = false;
static size_t initialMemory(const ref RegEx re)
{
return stackSize(re)*size_t.sizeof + re.hotspotTableSize*Trace.sizeof;
}
static size_t stackSize(const ref RegEx re)
{
size_t itemSize = stateSize
+ re.ngroup * (Group!DataIndex).sizeof / size_t.sizeof;
return initialStack * itemSize + 2;
}
@property bool atStart(){ return index == 0; }
@property bool atEnd(){ return index == s.lastIndex && s.atEnd; }
void next()
{
if (!s.nextChar(front, index))
index = s.lastIndex;
}
void search()
{
static if (kicked)
{
if (!s.search(re.kickstart, front, index))
{
index = s.lastIndex;
}
}
else
next();
}
//
void newStack()
{
auto chunk = mallocArray!(size_t)(stackSize(re));
chunk[0] = cast(size_t)(memory.ptr);
chunk[1] = lastState;
memory = chunk[2..$];
lastState = 0;
}
bool prevStack()
{
// pointer to previous block
size_t* prev = cast(size_t*) memory.ptr[-2];
if (!prev)
{
// The last segment is freed in RegexMatch
return false;
}
else
{
import core.stdc.stdlib : free;
// memory used in previous block
size_t size = memory.ptr[-1];
free(memory.ptr-2);
memory = prev[0 .. size];
lastState = size;
return true;
}
}
void initExternalMemory(void[] memBlock)
{
merge = arrayInChunk!(Trace)(re.hotspotTableSize, memBlock);
merge[] = Trace.init;
memory = cast(size_t[]) memBlock;
memory[0] = 0; // hidden pointer
memory[1] = 0; // used size
memory = memory[2..$];
}
void initialize(ref RegEx program, Stream stream, void[] memBlock)
{
re = program;
s = stream;
exhausted = false;
initExternalMemory(memBlock);
backrefed = null;
}
auto dupTo(void[] memory)
{
typeof(this) tmp = this;
tmp.initExternalMemory(memory);
return tmp;
}
this(ref RegEx program, Stream stream, void[] memBlock, dchar ch, DataIndex idx)
{
initialize(program, stream, memBlock);
front = ch;
index = idx;
}
this(ref RegEx program, Stream stream, void[] memBlock)
{
initialize(program, stream, memBlock);
next();
}
auto fwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
{
alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
alias BackMatcher = BackMatcherTempl!(Char, Stream);
auto fwdMatcher = BackMatcher(matcher.re, s, memBlock, front, index);
return fwdMatcher;
}
auto bwdMatcher(ref BacktrackingMatcher matcher, void[] memBlock)
{
alias BackMatcherTempl = .BacktrackingMatcher!(CTregex);
alias BackMatcher = BackMatcherTempl!(Char, typeof(s.loopBack(index)));
auto fwdMatcher =
BackMatcher(matcher.re, s.loopBack(index), memBlock);
return fwdMatcher;
}
//
int matchFinalize()
{
immutable start = index;
immutable val = matchImpl();
if (val)
{//stream is updated here
matches[0].begin = start;
matches[0].end = index;
if (!(re.flags & RegexOption.global) || atEnd)
exhausted = true;
if (start == index)//empty match advances input
next();
return val;
}
else
return 0;
}
//lookup next match, fill matches with indices into input
int match(Group!DataIndex[] matches)
{
debug(std_regex_matcher)
{
writeln("------------------------------------------");
}
if (exhausted) //all matches collected
return false;
this.matches = matches;
if (re.flags & RegexInfo.oneShot)
{
exhausted = true;
const DataIndex start = index;
immutable m = matchImpl();
if (m)
{
matches[0].begin = start;
matches[0].end = index;
}
return m;
}
static if (kicked)
{
if (!re.kickstart.empty)
{
for (;;)
{
immutable val = matchFinalize();
if (val)
return val;
else
{
if (atEnd)
break;
search();
if (atEnd)
{
exhausted = true;
return matchFinalize();
}
}
}
exhausted = true;
return 0; //early return
}
}
//no search available - skip a char at a time
for (;;)
{
immutable val = matchFinalize();
if (val)
return val;
else
{
if (atEnd)
break;
next();
if (atEnd)
{
exhausted = true;
return matchFinalize();
}
}
}
exhausted = true;
return 0;
}
/+
match subexpression against input,
results are stored in matches
+/
int matchImpl()
{
static if (CTregex && is(typeof(nativeFn(this))))
{
debug(std_regex_ctr) writeln("using C-T matcher");
return nativeFn(this);
}
else
{
pc = 0;
counter = 0;
lastState = 0;
matches[] = Group!DataIndex.init;
auto start = s._index;
debug(std_regex_matcher)
writeln("Try match starting at ", s[index .. s.lastIndex]);
for (;;)
{
debug(std_regex_matcher)
writefln("PC: %s\tCNT: %s\t%s \tfront: %s src: %s",
pc, counter, disassemble(re.ir, pc, re.dict),
front, s._index);
switch (re.ir[pc].code)
{
case IR.OrChar://assumes IRL!(OrChar) == 1
if (atEnd)
goto L_backtrack;
uint len = re.ir[pc].sequence;
uint end = pc + len;
if (re.ir[pc].data != front && re.ir[pc+1].data != front)
{
for (pc = pc+2; pc < end; pc++)
if (re.ir[pc].data == front)
break;
if (pc == end)
goto L_backtrack;
}
pc = end;
next();
break;
case IR.Char:
if (atEnd || front != re.ir[pc].data)
goto L_backtrack;
pc += IRL!(IR.Char);
next();
break;
case IR.Any:
if (atEnd)
goto L_backtrack;
pc += IRL!(IR.Any);
next();
break;
case IR.CodepointSet:
if (atEnd || !re.charsets[re.ir[pc].data].scanFor(front))
goto L_backtrack;
next();
pc += IRL!(IR.CodepointSet);
break;
case IR.Trie:
if (atEnd || !re.matchers[re.ir[pc].data][front])
goto L_backtrack;
next();
pc += IRL!(IR.Trie);
break;
case IR.Wordboundary:
dchar back;
DataIndex bi;
//at start & end of input
if (atStart && wordMatcher[front])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if (atEnd && s.loopBack(index).nextChar(back, bi)
&& wordMatcher[back])
{
pc += IRL!(IR.Wordboundary);
break;
}
else if (s.loopBack(index).nextChar(back, bi))
{
immutable af = wordMatcher[front];
immutable ab = wordMatcher[back];
if (af ^ ab)
{
pc += IRL!(IR.Wordboundary);
break;
}
}
goto L_backtrack;
case IR.Notwordboundary:
dchar back;
DataIndex bi;
//at start & end of input
if (atStart && wordMatcher[front])
goto L_backtrack;
else if (atEnd && s.loopBack(index).nextChar(back, bi)
&& wordMatcher[back])
goto L_backtrack;
else if (s.loopBack(index).nextChar(back, bi))
{
immutable af = wordMatcher[front];
immutable ab = wordMatcher[back];
if (af ^ ab)
goto L_backtrack;
}
pc += IRL!(IR.Wordboundary);
break;
case IR.Bof:
if (atStart)
pc += IRL!(IR.Bol);
else
goto L_backtrack;
break;
case IR.Bol:
dchar back;
DataIndex bi;
if (atStart)
pc += IRL!(IR.Bol);
else if (s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n'))
{
pc += IRL!(IR.Bol);
}
else
goto L_backtrack;
break;
case IR.Eof:
if (atEnd)
pc += IRL!(IR.Eol);
else
goto L_backtrack;
break;
case IR.Eol:
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
//no matching inside \r\n
if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
pc += IRL!(IR.Eol);
}
else
goto L_backtrack;
break;
case IR.InfiniteStart, IR.InfiniteQStart:
pc += re.ir[pc].data + IRL!(IR.InfiniteStart);
//now pc is at end IR.Infinite(Q)End
uint len = re.ir[pc].data;
if (re.ir[pc].code == IR.InfiniteEnd)
{
pushState(pc+IRL!(IR.InfiniteEnd), counter);
pc -= len;
}
else
{
pushState(pc - len, counter);
pc += IRL!(IR.InfiniteEnd);
}
break;
case IR.InfiniteBloomStart:
pc += re.ir[pc].data + IRL!(IR.InfiniteBloomStart);
//now pc is at end IR.InfiniteBloomEnd
immutable len = re.ir[pc].data;
immutable filterIdx = re.ir[pc+2].raw;
if (re.filters[filterIdx][front])
pushState(pc+IRL!(IR.InfiniteBloomEnd), counter);
pc -= len;
break;
case IR.RepeatStart, IR.RepeatQStart:
pc += re.ir[pc].data + IRL!(IR.RepeatStart);
break;
case IR.RepeatEnd:
case IR.RepeatQEnd:
if (merge[re.ir[pc + 1].raw+counter].mark(index))
{
// merged!
goto L_backtrack;
}
//len, step, min, max
immutable len = re.ir[pc].data;
immutable step = re.ir[pc+2].raw;
immutable min = re.ir[pc+3].raw;
immutable max = re.ir[pc+4].raw;
if (counter < min)
{
counter += step;
pc -= len;
}
else if (counter < max)
{
if (re.ir[pc].code == IR.RepeatEnd)
{
pushState(pc + IRL!(IR.RepeatEnd), counter%step);
counter += step;
pc -= len;
}
else
{
pushState(pc - len, counter + step);
counter = counter%step;
pc += IRL!(IR.RepeatEnd);
}
}
else
{
counter = counter%step;
pc += IRL!(IR.RepeatEnd);
}
break;
case IR.InfiniteEnd:
case IR.InfiniteQEnd:
debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
if (merge[re.ir[pc + 1].raw+counter].mark(index))
{
// merged!
goto L_backtrack;
}
immutable len = re.ir[pc].data;
if (re.ir[pc].code == IR.InfiniteEnd)
{
pushState(pc + IRL!(IR.InfiniteEnd), counter);
pc -= len;
}
else
{
pushState(pc-len, counter);
pc += IRL!(IR.InfiniteEnd);
}
break;
case IR.InfiniteBloomEnd:
debug(std_regex_matcher) writeln("Infinited nesting:", infiniteNesting);
if (merge[re.ir[pc + 1].raw+counter].mark(index))
{
// merged!
goto L_backtrack;
}
immutable len = re.ir[pc].data;
immutable filterIdx = re.ir[pc+2].raw;
if (re.filters[filterIdx][front])
{
infiniteNesting--;
pushState(pc + IRL!(IR.InfiniteBloomEnd), counter);
infiniteNesting++;
}
pc -= len;
break;
case IR.OrEnd:
if (merge[re.ir[pc + 1].raw+counter].mark(index))
{
// merged!
goto L_backtrack;
}
pc += IRL!(IR.OrEnd);
break;
case IR.OrStart:
pc += IRL!(IR.OrStart);
goto case;
case IR.Option:
immutable len = re.ir[pc].data;
if (re.ir[pc+len].code == IR.GotoEndOr)//not a last one
{
pushState(pc + len + IRL!(IR.Option), counter); //remember 2nd branch
}
pc += IRL!(IR.Option);
break;
case IR.GotoEndOr:
pc = pc + re.ir[pc].data + IRL!(IR.GotoEndOr);
break;
case IR.GroupStart:
immutable n = re.ir[pc].data;
matches[n].begin = index;
debug(std_regex_matcher) writefln("IR group #%u starts at %u", n, index);
pc += IRL!(IR.GroupStart);
break;
case IR.GroupEnd:
immutable n = re.ir[pc].data;
matches[n].end = index;
debug(std_regex_matcher) writefln("IR group #%u ends at %u", n, index);
pc += IRL!(IR.GroupEnd);
break;
case IR.LookaheadStart:
case IR.NeglookaheadStart:
immutable len = re.ir[pc].data;
auto save = index;
immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
scope(exit) free(mem.ptr);
static if (Stream.isLoopback)
{
auto matcher = bwdMatcher(this, mem);
}
else
{
auto matcher = fwdMatcher(this, mem);
}
matcher.matches = matches[ms .. me];
matcher.backrefed = backrefed.empty ? matches : backrefed;
matcher.re.ir = re.ir[
pc+IRL!(IR.LookaheadStart) .. pc+IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd)
];
immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookaheadStart);
s.reset(save);
next();
if (!match)
goto L_backtrack;
else
{
pc += IRL!(IR.LookaheadStart)+len+IRL!(IR.LookaheadEnd);
}
break;
case IR.LookbehindStart:
case IR.NeglookbehindStart:
immutable len = re.ir[pc].data;
immutable ms = re.ir[pc+1].raw, me = re.ir[pc+2].raw;
auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
scope(exit) free(mem.ptr);
static if (Stream.isLoopback)
{
alias Matcher = BacktrackingMatcher!(Char, Stream);
auto matcher = Matcher(re, s, mem, front, index);
}
else
{
alias Matcher = BacktrackingMatcher!(Char, typeof(s.loopBack(index)));
auto matcher = Matcher(re, s.loopBack(index), mem);
}
matcher.matches = matches[ms .. me];
matcher.re.ir = re.ir[
pc + IRL!(IR.LookbehindStart) .. pc + IRL!(IR.LookbehindStart) + len + IRL!(IR.LookbehindEnd)
];
matcher.backrefed = backrefed.empty ? matches : backrefed;
immutable match = (matcher.matchImpl() != 0) ^ (re.ir[pc].code == IR.NeglookbehindStart);
if (!match)
goto L_backtrack;
else
{
pc += IRL!(IR.LookbehindStart)+len+IRL!(IR.LookbehindEnd);
}
break;
case IR.Backref:
immutable n = re.ir[pc].data;
auto referenced = re.ir[pc].localRef
? s[matches[n].begin .. matches[n].end]
: s[backrefed[n].begin .. backrefed[n].end];
while (!atEnd && !referenced.empty && front == referenced.front)
{
next();
referenced.popFront();
}
if (referenced.empty)
pc++;
else
goto L_backtrack;
break;
case IR.Nop:
pc += IRL!(IR.Nop);
break;
case IR.LookaheadEnd:
case IR.NeglookaheadEnd:
case IR.LookbehindEnd:
case IR.NeglookbehindEnd:
case IR.End:
// cleanup stale stack blocks if any
while (prevStack()) {}
return re.ir[pc].data;
default:
debug printBytecode(re.ir[0..$]);
assert(0);
L_backtrack:
if (!popState())
{
s.reset(start);
return 0;
}
}
}
}
assert(0);
}
@property size_t stackAvail()
{
return memory.length - lastState;
}
void stackPush(T)(T val)
if (!isDynamicArray!T)
{
*cast(T*)&memory[lastState] = val;
enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
lastState += delta;
debug(std_regex_matcher) writeln("push element SP= ", lastState);
}
void stackPush(T)(T[] val)
{
static assert(T.sizeof % size_t.sizeof == 0);
(cast(T*)&memory[lastState])[0 .. val.length]
= val[0..$];
lastState += val.length*(T.sizeof/size_t.sizeof);
debug(std_regex_matcher) writeln("push array SP= ", lastState);
}
void stackPop(T)(ref T val)
if (!isDynamicArray!T)
{
enum delta = (T.sizeof+size_t.sizeof/2)/size_t.sizeof;
lastState -= delta;
val = *cast(T*)&memory[lastState];
debug(std_regex_matcher) writeln("pop element SP= ", lastState);
}
void stackPop(T)(T[] val)
{
stackPop(val); // call ref version
}
void stackPop(T)(ref T[] val)
{
lastState -= val.length*(T.sizeof/size_t.sizeof);
val[0..$] = (cast(T*)&memory[lastState])[0 .. val.length];
debug(std_regex_matcher) writeln("pop array SP= ", lastState);
}
static if (!CTregex)
{
//helper function, saves engine state
void pushState(uint pc, uint counter)
{
if (stateSize + 2 * matches.length > stackAvail)
{
newStack();
}
*cast(State*)&memory[lastState] =
State(index, pc, counter, infiniteNesting);
lastState += stateSize;
memory[lastState .. lastState + 2 * matches.length] = (cast(size_t[]) matches)[];
lastState += 2*matches.length;
debug(std_regex_matcher)
writefln("Saved(pc=%s) front: %s src: %s",
pc, front, s[index .. s.lastIndex]);
}
//helper function, restores engine state
bool popState()
{
if (!lastState && !prevStack())
return false;
lastState -= 2*matches.length;
auto pm = cast(size_t[]) matches;
pm[] = memory[lastState .. lastState + 2 * matches.length];
lastState -= stateSize;
State* state = cast(State*)&memory[lastState];
index = state.index;
pc = state.pc;
counter = state.counter;
infiniteNesting = state.infiniteNesting;
debug(std_regex_matcher)
{
writefln("Restored matches", front, s[index .. s.lastIndex]);
foreach (i, m; matches)
writefln("Sub(%d) : %s..%s", i, m.begin, m.end);
}
s.reset(index);
next();
debug(std_regex_matcher)
writefln("Backtracked (pc=%s) front: %s src: %s",
pc, front, s[index .. s.lastIndex]);
return true;
}
}
}
}
//very shitty string formatter, $$ replaced with next argument converted to string
@trusted string ctSub( U...)(string format, U args)
{
import std.conv : to;
bool seenDollar;
foreach (i, ch; format)
{
if (ch == '$')
{
if (seenDollar)
{
static if (args.length > 0)
{
return format[0 .. i - 1] ~ to!string(args[0])
~ ctSub(format[i + 1 .. $], args[1 .. $]);
}
else
assert(0);
}
else
seenDollar = true;
}
else
seenDollar = false;
}
return format;
}
alias Sequence(int B, int E) = staticIota!(B, E);
struct CtContext
{
import std.conv : to, text;
//dirty flags
bool counter;
//to mark the portion of matches to save
int match, total_matches;
int reserved;
CodepointSet[] charsets;
//state of codegenerator
static struct CtState
{
string code;
int addr;
}
this(Char)(Regex!Char re)
{
match = 1;
reserved = 1; //first match is skipped
total_matches = re.ngroup;
charsets = re.charsets;
}
CtContext lookaround(uint s, uint e)
{
CtContext ct;
ct.total_matches = e - s;
ct.match = 1;
return ct;
}
//restore state having current context
string restoreCode()
{
string text;
//stack is checked in L_backtrack
text ~= counter
? "
stackPop(counter);"
: "
counter = 0;";
if (match < total_matches)
{
text ~= ctSub("
stackPop(matches[$$..$$]);", reserved, match);
text ~= ctSub("
matches[$$..$] = typeof(matches[0]).init;", match);
}
else
text ~= ctSub("
stackPop(matches[$$..$]);", reserved);
return text;
}
//save state having current context
string saveCode(uint pc, string count_expr="counter")
{
string text = ctSub("
if (stackAvail < $$*(Group!(DataIndex)).sizeof/size_t.sizeof + $$)
{
newStack();
}", match - reserved, cast(int) counter + 2);
if (match < total_matches)
text ~= ctSub("
stackPush(matches[$$..$$]);", reserved, match);
else
text ~= ctSub("
stackPush(matches[$$..$]);", reserved);
text ~= counter ? ctSub("
stackPush($$);", count_expr) : "";
text ~= ctSub("
stackPush(index); stackPush($$); \n", pc);
return text;
}
//
CtState ctGenBlock(Bytecode[] ir, int addr)
{
CtState result;
result.addr = addr;
while (!ir.empty)
{
auto n = ctGenGroup(ir, result.addr);
result.code ~= n.code;
result.addr = n.addr;
}
return result;
}
//
CtState ctGenGroup(ref Bytecode[] ir, int addr)
{
import std.algorithm.comparison : max;
auto bailOut = "goto L_backtrack;";
auto nextInstr = ctSub("goto case $$;", addr+1);
CtState r;
assert(!ir.empty);
switch (ir[0].code)
{
case IR.InfiniteStart, IR.InfiniteBloomStart,IR.InfiniteQStart, IR.RepeatStart, IR.RepeatQStart:
immutable infLoop =
ir[0].code == IR.InfiniteStart || ir[0].code == IR.InfiniteQStart ||
ir[0].code == IR.InfiniteBloomStart;
counter = counter ||
ir[0].code == IR.RepeatStart || ir[0].code == IR.RepeatQStart;
immutable len = ir[0].data;
auto nir = ir[ir[0].length .. ir[0].length+len];
r = ctGenBlock(nir, addr+1);
//start/end codegen
//r.addr is at last test+ jump of loop, addr+1 is body of loop
nir = ir[ir[0].length + len .. $];
r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr) ~ r.code;
r.code ~= ctGenFixupCode(nir, r.addr, addr+1);
r.addr += 2; //account end instruction + restore state
ir = nir;
break;
case IR.OrStart:
immutable len = ir[0].data;
auto nir = ir[ir[0].length .. ir[0].length+len];
r = ctGenAlternation(nir, addr);
ir = ir[ir[0].length + len .. $];
assert(ir[0].code == IR.OrEnd);
ir = ir[ir[0].length..$];
break;
case IR.LookaheadStart:
case IR.NeglookaheadStart:
case IR.LookbehindStart:
case IR.NeglookbehindStart:
immutable len = ir[0].data;
immutable behind = ir[0].code == IR.LookbehindStart || ir[0].code == IR.NeglookbehindStart;
immutable negative = ir[0].code == IR.NeglookaheadStart || ir[0].code == IR.NeglookbehindStart;
string fwdType = "typeof(fwdMatcher(matcher, []))";
string bwdType = "typeof(bwdMatcher(matcher, []))";
string fwdCreate = "fwdMatcher(matcher, mem)";
string bwdCreate = "bwdMatcher(matcher, mem)";
immutable start = IRL!(IR.LookbehindStart);
immutable end = IRL!(IR.LookbehindStart)+len+IRL!(IR.LookaheadEnd);
CtContext context = lookaround(ir[1].raw, ir[2].raw); //split off new context
auto slice = ir[start .. end];
r.code ~= ctSub(`
case $$: //fake lookaround "atom"
static if (typeof(matcher.s).isLoopback)
alias Lookaround = $$;
else
alias Lookaround = $$;
static bool matcher_$$(ref Lookaround matcher) @trusted
{
//(neg)lookaround piece start
$$
//(neg)lookaround piece ends
}
auto save = index;
auto mem = malloc(initialMemory(re))[0 .. initialMemory(re)];
scope(exit) free(mem.ptr);
static if (typeof(matcher.s).isLoopback)
auto lookaround = $$;
else
auto lookaround = $$;
lookaround.matches = matches[$$..$$];
lookaround.backrefed = backrefed.empty ? matches : backrefed;
lookaround.nativeFn = &matcher_$$; //hookup closure's binary code
int match = $$;
s.reset(save);
next();
if (match)
$$
else
$$`, addr,
behind ? fwdType : bwdType, behind ? bwdType : fwdType,
addr, context.ctGenRegEx(slice),
behind ? fwdCreate : bwdCreate, behind ? bwdCreate : fwdCreate,
ir[1].raw, ir[2].raw, //start - end of matches slice
addr,
negative ? "!lookaround.matchImpl()" : "lookaround.matchImpl()",
nextInstr, bailOut);
ir = ir[end .. $];
r.addr = addr + 1;
break;
case IR.LookaheadEnd: case IR.NeglookaheadEnd:
case IR.LookbehindEnd: case IR.NeglookbehindEnd:
ir = ir[IRL!(IR.LookaheadEnd) .. $];
r.addr = addr;
break;
default:
assert(ir[0].isAtom, text(ir[0].mnemonic));
r = ctGenAtom(ir, addr);
}
return r;
}
//generate source for bytecode contained in OrStart ... OrEnd
CtState ctGenAlternation(Bytecode[] ir, int addr)
{
CtState[] pieces;
CtState r;
enum optL = IRL!(IR.Option);
for (;;)
{
assert(ir[0].code == IR.Option);
auto len = ir[0].data;
if (optL+len < ir.length && ir[optL+len].code == IR.Option)//not a last option
{
auto nir = ir[optL .. optL+len-IRL!(IR.GotoEndOr)];
r = ctGenBlock(nir, addr+2);//space for Option + restore state
//r.addr+1 to account GotoEndOr at end of branch
r.code = ctGenFixupCode(ir[0 .. ir[0].length], addr, r.addr+1) ~ r.code;
addr = r.addr+1;//leave space for GotoEndOr
pieces ~= r;
ir = ir[optL + len .. $];
}
else
{
pieces ~= ctGenBlock(ir[optL..$], addr);
addr = pieces[$-1].addr;
break;
}
}
r = pieces[0];
for (uint i = 1; i < pieces.length; i++)
{
r.code ~= ctSub(`
case $$:
goto case $$; `, pieces[i-1].addr, addr);
r.code ~= pieces[i].code;
}
r.addr = addr;
return r;
}
// generate fixup code for instruction in ir,
// fixup means it has an alternative way for control flow
string ctGenFixupCode(Bytecode[] ir, int addr, int fixup)
{
return ctGenFixupCode(ir, addr, fixup); // call ref Bytecode[] version
}
string ctGenFixupCode(ref Bytecode[] ir, int addr, int fixup)
{
string r;
string testCode;
r = ctSub(`
case $$: debug(std_regex_matcher) writeln("#$$");`,
addr, addr);
switch (ir[0].code)
{
case IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart:
r ~= ctSub( `
goto case $$;`, fixup);
ir = ir[ir[0].length..$];
break;
case IR.InfiniteEnd:
testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
r ~= ctSub( `
if (merge[$$+counter].mark(index))
{
// merged!
goto L_backtrack;
}
$$
{
$$
}
goto case $$;
case $$: //restore state and go out of loop
$$
goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
addr+1, restoreCode());
ir = ir[ir[0].length..$];
break;
case IR.InfiniteBloomEnd:
//TODO: check bloom filter and skip on failure
testCode = ctQuickTest(ir[IRL!(IR.InfiniteBloomEnd) .. $],addr + 1);
r ~= ctSub( `
if (merge[$$+counter].mark(index))
{
// merged!
goto L_backtrack;
}
$$
{
$$
}
goto case $$;
case $$: //restore state and go out of loop
$$
goto case;`, ir[1].raw, testCode, saveCode(addr+1), fixup,
addr+1, restoreCode());
ir = ir[ir[0].length..$];
break;
case IR.InfiniteQEnd:
testCode = ctQuickTest(ir[IRL!(IR.InfiniteEnd) .. $],addr + 1);
auto altCode = testCode.length ? ctSub("else goto case $$;", fixup) : "";
r ~= ctSub( `
if (merge[$$+counter].mark(index))
{
// merged!
goto L_backtrack;
}
$$
{
$$
goto case $$;
}
$$
case $$://restore state and go inside loop
$$
goto case $$;`, ir[1].raw,
testCode, saveCode(addr+1), addr+2, altCode,
addr+1, restoreCode(), fixup);
ir = ir[ir[0].length..$];
break;
case IR.RepeatStart, IR.RepeatQStart:
r ~= ctSub( `
goto case $$;`, fixup);
ir = ir[ir[0].length..$];
break;
case IR.RepeatEnd, IR.RepeatQEnd:
//len, step, min, max
immutable len = ir[0].data;
immutable step = ir[2].raw;
immutable min = ir[3].raw;
immutable max = ir[4].raw;
r ~= ctSub(`
if (merge[$$+counter].mark(index))
{
// merged!
goto L_backtrack;
}
if (counter < $$)
{
debug(std_regex_matcher) writeln("RepeatEnd min case pc=", $$);
counter += $$;
goto case $$;
}`, ir[1].raw, min, addr, step, fixup);
if (ir[0].code == IR.RepeatEnd)
{
string counter_expr = ctSub("counter % $$", step);
r ~= ctSub(`
else if (counter < $$)
{
$$
counter += $$;
goto case $$;
}`, max, saveCode(addr+1, counter_expr), step, fixup);
}
else
{
string counter_expr = ctSub("counter % $$", step);
r ~= ctSub(`
else if (counter < $$)
{
$$
counter = counter % $$;
goto case $$;
}`, max, saveCode(addr+1,counter_expr), step, addr+2);
}
r ~= ctSub(`
else
{
counter = counter % $$;
goto case $$;
}
case $$: //restore state
$$
goto case $$;`, step, addr+2, addr+1, restoreCode(),
ir[0].code == IR.RepeatEnd ? addr+2 : fixup );
ir = ir[ir[0].length..$];
break;
case IR.Option:
r ~= ctSub( `
{
$$
}
goto case $$;
case $$://restore thunk to go to the next group
$$
goto case $$;`, saveCode(addr+1), addr+2,
addr+1, restoreCode(), fixup);
ir = ir[ir[0].length..$];
break;
default:
assert(0, text(ir[0].mnemonic));
}
return r;
}
string ctQuickTest(Bytecode[] ir, int id)
{
uint pc = 0;
while (pc < ir.length && ir[pc].isAtom)
{
if (ir[pc].code == IR.GroupStart || ir[pc].code == IR.GroupEnd)
{
pc++;
}
else if (ir[pc].code == IR.Backref)
break;
else
{
auto code = ctAtomCode(ir[pc..$], -1);
return ctSub(`
int test_$$()
{
$$ //$$
}
if (test_$$() >= 0)`, id, code.ptr ? code : "return 0;",
ir[pc].mnemonic, id);
}
}
return "";
}
//process & generate source for simple bytecodes at front of ir using address addr
CtState ctGenAtom(ref Bytecode[] ir, int addr)
{
CtState result;
result.code = ctAtomCode(ir, addr);
ir.popFrontN(ir[0].code == IR.OrChar ? ir[0].sequence : ir[0].length);
result.addr = addr + 1;
return result;
}
//D code for atom at ir using address addr, addr < 0 means quickTest
string ctAtomCode(Bytecode[] ir, int addr)
{
string code;
string bailOut, nextInstr;
if (addr < 0)
{
bailOut = "return -1;";
nextInstr = "return 0;";
}
else
{
bailOut = "goto L_backtrack;";
nextInstr = ctSub("goto case $$;", addr+1);
code ~= ctSub( `
case $$: debug(std_regex_matcher) writeln("#$$");
`, addr, addr);
}
switch (ir[0].code)
{
case IR.OrChar://assumes IRL!(OrChar) == 1
code ~= ctSub(`
if (atEnd)
$$`, bailOut);
immutable len = ir[0].sequence;
for (uint i = 0; i < len; i++)
{
code ~= ctSub( `
if (front == $$)
{
$$
$$
}`, ir[i].data, addr >= 0 ? "next();" :"", nextInstr);
}
code ~= ctSub( `
$$`, bailOut);
break;
case IR.Char:
code ~= ctSub( `
if (atEnd || front != $$)
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
case IR.Any:
code ~= ctSub( `
if (atEnd || (!(re.flags & RegexOption.singleline)
&& (front == '\r' || front == '\n')))
$$
$$
$$`, bailOut, addr >= 0 ? "next();" :"",nextInstr);
break;
case IR.CodepointSet:
if (charsets.length)
{
string name = `func_`~to!string(addr+1);
string funcCode = charsets[ir[0].data].toSourceCode(name);
code ~= ctSub( `
static $$
if (atEnd || !$$(front))
$$
$$
$$`, funcCode, name, bailOut, addr >= 0 ? "next();" :"", nextInstr);
}
else
code ~= ctSub( `
if (atEnd || !re.charsets[$$].scanFor(front))
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
case IR.Trie:
if (charsets.length && charsets[ir[0].data].byInterval.length <= 8)
goto case IR.CodepointSet;
code ~= ctSub( `
if (atEnd || !re.matchers[$$][front])
$$
$$
$$`, ir[0].data, bailOut, addr >= 0 ? "next();" :"", nextInstr);
break;
case IR.Wordboundary:
code ~= ctSub( `
dchar back;
DataIndex bi;
if (atStart && wordMatcher[front])
{
$$
}
else if (atEnd && s.loopBack(index).nextChar(back, bi)
&& wordMatcher[back])
{
$$
}
else if (s.loopBack(index).nextChar(back, bi))
{
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if (af ^ ab)
{
$$
}
}
$$`, nextInstr, nextInstr, nextInstr, bailOut);
break;
case IR.Notwordboundary:
code ~= ctSub( `
dchar back;
DataIndex bi;
//at start & end of input
if (atStart && wordMatcher[front])
$$
else if (atEnd && s.loopBack(index).nextChar(back, bi)
&& wordMatcher[back])
$$
else if (s.loopBack(index).nextChar(back, bi))
{
bool af = wordMatcher[front];
bool ab = wordMatcher[back];
if (af ^ ab)
$$
}
$$`, bailOut, bailOut, bailOut, nextInstr);
break;
case IR.Bol:
code ~= ctSub(`
dchar back;
DataIndex bi;
if (atStart || (s.loopBack(index).nextChar(back,bi)
&& endOfLine(back, front == '\n')))
{
debug(std_regex_matcher) writeln("BOL matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.Bof:
code ~= ctSub(`
if (atStart)
{
debug(std_regex_matcher) writeln("BOF matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.Eol:
code ~= ctSub(`
dchar back;
DataIndex bi;
debug(std_regex_matcher) writefln("EOL (front 0x%x) %s", front, s[index .. s.lastIndex]);
//no matching inside \r\n
if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back,bi)
&& back == '\r')))
{
debug(std_regex_matcher) writeln("EOL matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.Eof:
code ~= ctSub(`
if (atEnd)
{
debug(std_regex_matcher) writeln("BOF matched");
$$
}
else
$$`, nextInstr, bailOut);
break;
case IR.GroupStart:
code ~= ctSub(`
matches[$$].begin = index;
$$`, ir[0].data, nextInstr);
match = ir[0].data+1;
break;
case IR.GroupEnd:
code ~= ctSub(`
matches[$$].end = index;
$$`, ir[0].data, nextInstr);
break;
case IR.Backref:
string mStr = "auto referenced = ";
mStr ~= ir[0].localRef
? ctSub("s[matches[$$].begin .. matches[$$].end];",
ir[0].data, ir[0].data)
: ctSub("s[backrefed[$$].begin .. backrefed[$$].end];",
ir[0].data, ir[0].data);
code ~= ctSub( `
$$
while (!atEnd && !referenced.empty && front == referenced.front)
{
next();
referenced.popFront();
}
if (referenced.empty)
$$
else
$$`, mStr, nextInstr, bailOut);
break;
case IR.Nop:
case IR.End:
break;
default:
assert(0, text(ir[0].mnemonic, " is not supported yet"));
}
return code;
}
//generate D code for the whole regex
public string ctGenRegEx(Bytecode[] ir)
{
auto bdy = ctGenBlock(ir, 0);
auto r = `
import core.stdc.stdlib;
with(matcher)
{
pc = 0;
counter = 0;
lastState = 0;
matches[] = Group!DataIndex.init;
auto start = s._index;`;
r ~= `
goto StartLoop;
debug(std_regex_matcher) writeln("Try CT matching starting at ",s[index .. s.lastIndex]);
L_backtrack:
if (lastState || prevStack())
{
stackPop(pc);
stackPop(index);
s.reset(index);
next();
}
else
{
s.reset(start);
return false;
}
StartLoop:
switch (pc)
{
`;
r ~= bdy.code;
r ~= ctSub(`
case $$: break;`,bdy.addr);
r ~= `
default:
assert(0);
}
// cleanup stale stack blocks
while (prevStack()) {}
return true;
}
`;
return r;
}
}
string ctGenRegExCode(Char)(Regex!Char re)
{
auto context = CtContext(re);
return context.ctGenRegEx(re.ir);
}