| //Written in the D programming language |
| /* |
| Implementation of Thompson NFA std.regex engine. |
| Key point is evaluation of all possible threads (state) at each step |
| in a breadth-first manner, thereby geting some nice properties: |
| - looking at each character only once |
| - merging of equivalent threads, that gives matching process linear time complexity |
| */ |
| module std.regex.internal.thompson; |
| |
| package(std.regex): |
| |
| import std.range.primitives; |
| import std.regex.internal.ir; |
| |
| //State of VM thread |
| struct Thread(DataIndex) |
| { |
| Thread* next; //intrusive linked list |
| uint pc; |
| uint counter; //loop counter |
| uint uopCounter; //counts micro operations inside one macro instruction (e.g. BackRef) |
| Group!DataIndex[1] matches; |
| } |
| |
| //head-tail singly-linked list |
| struct ThreadList(DataIndex) |
| { |
| Thread!DataIndex* tip = null, toe = null; |
| //add new thread to the start of list |
| void insertFront(Thread!DataIndex* t) |
| { |
| if (tip) |
| { |
| t.next = tip; |
| tip = t; |
| } |
| else |
| { |
| t.next = null; |
| tip = toe = t; |
| } |
| } |
| //add new thread to the end of list |
| void insertBack(Thread!DataIndex* t) |
| { |
| if (toe) |
| { |
| toe.next = t; |
| toe = t; |
| } |
| else |
| tip = toe = t; |
| toe.next = null; |
| } |
| //move head element out of list |
| Thread!DataIndex* fetch() |
| { |
| auto t = tip; |
| if (tip == toe) |
| tip = toe = null; |
| else |
| tip = tip.next; |
| return t; |
| } |
| //non-destructive iteration of ThreadList |
| struct ThreadRange |
| { |
| const(Thread!DataIndex)* ct; |
| this(ThreadList tlist){ ct = tlist.tip; } |
| @property bool empty(){ return ct is null; } |
| @property const(Thread!DataIndex)* front(){ return ct; } |
| void popFront() |
| { |
| assert(ct); |
| ct = ct.next; |
| } |
| } |
| @property bool empty() |
| { |
| return tip == null; |
| } |
| ThreadRange opSlice() |
| { |
| return ThreadRange(this); |
| } |
| } |
| |
| template ThompsonOps(E, S, bool withInput:true) |
| { |
| @trusted: |
| static bool op(IR code:IR.End)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| finish(t, matches, re.ir[t.pc].data); |
| //fix endpoint of the whole match |
| matches[0].end = index; |
| recycle(t); |
| //cut off low priority threads |
| recycle(clist); |
| recycle(worklist); |
| debug(std_regex_matcher) writeln("Finished thread ", matches); |
| return false; // no more state to eval |
| } |
| } |
| |
| static bool op(IR code:IR.Wordboundary)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| dchar back; |
| DataIndex bi; |
| //at start & end of input |
| if (atStart && wordMatcher[front]) |
| { |
| t.pc += IRL!(IR.Wordboundary); |
| return true; |
| } |
| else if (atEnd && s.loopBack(index).nextChar(back, bi) |
| && wordMatcher[back]) |
| { |
| t.pc += IRL!(IR.Wordboundary); |
| return true; |
| } |
| else if (s.loopBack(index).nextChar(back, bi)) |
| { |
| bool af = wordMatcher[front]; |
| bool ab = wordMatcher[back]; |
| if (af ^ ab) |
| { |
| t.pc += IRL!(IR.Wordboundary); |
| return true; |
| } |
| } |
| return popState(e); |
| } |
| } |
| |
| static bool op(IR code:IR.Notwordboundary)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| dchar back; |
| DataIndex bi; |
| //at start & end of input |
| if (atStart && wordMatcher[front]) |
| { |
| return popState(e); |
| } |
| else if (atEnd && s.loopBack(index).nextChar(back, bi) |
| && wordMatcher[back]) |
| { |
| return popState(e); |
| } |
| else if (s.loopBack(index).nextChar(back, bi)) |
| { |
| bool af = wordMatcher[front]; |
| bool ab = wordMatcher[back] != 0; |
| if (af ^ ab) |
| { |
| return popState(e); |
| } |
| } |
| t.pc += IRL!(IR.Notwordboundary); |
| } |
| return true; |
| } |
| |
| static bool op(IR code:IR.Bof)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (atStart) |
| { |
| t.pc += IRL!(IR.Bof); |
| return true; |
| } |
| else |
| { |
| return popState(e); |
| } |
| } |
| } |
| |
| static bool op(IR code:IR.Bol)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| dchar back; |
| DataIndex bi; |
| if (atStart |
| ||(s.loopBack(index).nextChar(back,bi) |
| && startOfLine(back, front == '\n'))) |
| { |
| t.pc += IRL!(IR.Bol); |
| return true; |
| } |
| else |
| { |
| return popState(e); |
| } |
| } |
| } |
| |
| static bool op(IR code:IR.Eof)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (atEnd) |
| { |
| t.pc += IRL!(IR.Eol); |
| return true; |
| } |
| else |
| { |
| return popState(e); |
| } |
| } |
| } |
| |
| static bool op(IR code:IR.Eol)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| dchar back; |
| DataIndex bi; |
| //no matching inside \r\n |
| if (atEnd || (endOfLine(front, s.loopBack(index).nextChar(back, bi) |
| && back == '\r'))) |
| { |
| t.pc += IRL!(IR.Eol); |
| return true; |
| } |
| else |
| { |
| return popState(e); |
| } |
| |
| } |
| } |
| |
| static bool op(IR code:IR.InfiniteStart)(E e, S* state) |
| { |
| with(e) with(state) |
| t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteStart); |
| return op!(IR.InfiniteEnd)(e,state); |
| } |
| |
| static bool op(IR code:IR.InfiniteBloomStart)(E e, S* state) |
| { |
| with(e) with(state) |
| t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteBloomStart); |
| return op!(IR.InfiniteBloomEnd)(e,state); |
| } |
| |
| static bool op(IR code:IR.InfiniteQStart)(E e, S* state) |
| { |
| with(e) with(state) |
| t.pc += re.ir[t.pc].data + IRL!(IR.InfiniteQStart); |
| return op!(IR.InfiniteQEnd)(e,state); |
| } |
| |
| static bool op(IR code:IR.RepeatStart)(E e, S* state) |
| { |
| with(e) with(state) |
| t.pc += re.ir[t.pc].data + IRL!(IR.RepeatStart); |
| return op!(IR.RepeatEnd)(e,state); |
| } |
| |
| static bool op(IR code:IR.RepeatQStart)(E e, S* state) |
| { |
| with(e) with(state) |
| t.pc += re.ir[t.pc].data + IRL!(IR.RepeatQStart); |
| return op!(IR.RepeatQEnd)(e,state); |
| } |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.RepeatEnd || code == IR.RepeatQEnd) |
| { |
| with(e) with(state) |
| { |
| //len, step, min, max |
| uint len = re.ir[t.pc].data; |
| uint step = re.ir[t.pc+2].raw; |
| uint min = re.ir[t.pc+3].raw; |
| if (t.counter < min) |
| { |
| t.counter += step; |
| t.pc -= len; |
| return true; |
| } |
| if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; |
| } |
| else |
| { |
| debug(std_regex_matcher) |
| writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| return popState(e); |
| } |
| uint max = re.ir[t.pc+4].raw; |
| if (t.counter < max) |
| { |
| if (re.ir[t.pc].code == IR.RepeatEnd) |
| { |
| //queue out-of-loop thread |
| worklist.insertFront(fork(t, t.pc + IRL!(IR.RepeatEnd), t.counter % step)); |
| t.counter += step; |
| t.pc -= len; |
| } |
| else |
| { |
| //queue into-loop thread |
| worklist.insertFront(fork(t, t.pc - len, t.counter + step)); |
| t.counter %= step; |
| t.pc += IRL!(IR.RepeatEnd); |
| } |
| } |
| else |
| { |
| t.counter %= step; |
| t.pc += IRL!(IR.RepeatEnd); |
| } |
| return true; |
| } |
| } |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.InfiniteEnd || code == IR.InfiniteQEnd) |
| { |
| with(e) with(state) |
| { |
| if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; |
| } |
| else |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| return popState(e); |
| } |
| uint len = re.ir[t.pc].data; |
| uint pc1, pc2; //branches to take in priority order |
| if (re.ir[t.pc].code == IR.InfiniteEnd) |
| { |
| pc1 = t.pc - len; |
| pc2 = t.pc + IRL!(IR.InfiniteEnd); |
| } |
| else |
| { |
| pc1 = t.pc + IRL!(IR.InfiniteEnd); |
| pc2 = t.pc - len; |
| } |
| worklist.insertFront(fork(t, pc2, t.counter)); |
| t.pc = pc1; |
| return true; |
| } |
| } |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.InfiniteBloomEnd) |
| { |
| with(e) with(state) |
| { |
| if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; |
| } |
| else |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, index, genCounter, merge[re.ir[t.pc + 1].raw+t.counter] ); |
| return popState(e); |
| } |
| uint len = re.ir[t.pc].data; |
| uint pc1, pc2; //branches to take in priority order |
| pc1 = t.pc - len; |
| pc2 = t.pc + IRL!(IR.InfiniteBloomEnd); |
| uint filterIndex = re.ir[t.pc + 2].raw; |
| if (re.filters[filterIndex][front]) |
| worklist.insertFront(fork(t, pc2, t.counter)); |
| t.pc = pc1; |
| return true; |
| } |
| } |
| |
| static bool op(IR code:IR.OrEnd)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (merge[re.ir[t.pc + 1].raw+t.counter] < genCounter) |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) passed there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); |
| merge[re.ir[t.pc + 1].raw+t.counter] = genCounter; |
| t.pc += IRL!(IR.OrEnd); |
| } |
| else |
| { |
| debug(std_regex_matcher) writefln("A thread(pc=%s) got merged there : %s ; GenCounter=%s mergetab=%s", |
| t.pc, s[index .. s.lastIndex], genCounter, merge[re.ir[t.pc + 1].raw + t.counter] ); |
| return popState(e); |
| } |
| return true; |
| } |
| } |
| |
| static bool op(IR code:IR.OrStart)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| t.pc += IRL!(IR.OrStart); |
| return op!(IR.Option)(e,state); |
| } |
| } |
| |
| static bool op(IR code:IR.Option)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint next = t.pc + re.ir[t.pc].data + IRL!(IR.Option); |
| //queue next Option |
| if (re.ir[next].code == IR.Option) |
| { |
| worklist.insertFront(fork(t, next, t.counter)); |
| } |
| t.pc += IRL!(IR.Option); |
| return true; |
| } |
| } |
| |
| static bool op(IR code:IR.GotoEndOr)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| t.pc = t.pc + re.ir[t.pc].data + IRL!(IR.GotoEndOr); |
| return op!(IR.OrEnd)(e, state); |
| } |
| } |
| |
| static bool op(IR code:IR.GroupStart)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint n = re.ir[t.pc].data; |
| t.matches.ptr[n].begin = index; |
| t.pc += IRL!(IR.GroupStart); |
| return true; |
| } |
| } |
| static bool op(IR code:IR.GroupEnd)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint n = re.ir[t.pc].data; |
| t.matches.ptr[n].end = index; |
| t.pc += IRL!(IR.GroupEnd); |
| return true; |
| } |
| } |
| |
| static bool op(IR code:IR.Backref)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint n = re.ir[t.pc].data; |
| Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; |
| assert(source); |
| if (source[n].begin == source[n].end)//zero-width Backref! |
| { |
| t.pc += IRL!(IR.Backref); |
| return true; |
| } |
| else |
| { |
| size_t idx = source[n].begin + t.uopCounter; |
| size_t end = source[n].end; |
| if (s[idx .. end].front == front) |
| { |
| import std.utf : stride; |
| |
| t.uopCounter += stride(s[idx .. end], 0); |
| if (t.uopCounter + source[n].begin == source[n].end) |
| {//last codepoint |
| t.pc += IRL!(IR.Backref); |
| t.uopCounter = 0; |
| } |
| nlist.insertBack(t); |
| } |
| else |
| recycle(t); |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| } |
| |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.LookbehindStart || code == IR.NeglookbehindStart) |
| { |
| with(e) with(state) |
| { |
| uint len = re.ir[t.pc].data; |
| uint ms = re.ir[t.pc + 1].raw, me = re.ir[t.pc + 2].raw; |
| uint end = t.pc + len + IRL!(IR.LookbehindEnd) + IRL!(IR.LookbehindStart); |
| bool positive = re.ir[t.pc].code == IR.LookbehindStart; |
| static if (Stream.isLoopback) |
| auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); |
| else |
| auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); |
| matcher.backrefed = backrefed.empty ? t.matches : backrefed; |
| //backMatch |
| auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookbehindStart)); |
| freelist = matcher.freelist; |
| subCounters[t.pc] = matcher.genCounter; |
| if ((mRes != 0 ) ^ positive) |
| { |
| return popState(e); |
| } |
| t.pc = end; |
| return true; |
| } |
| } |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.LookaheadStart || code == IR.NeglookaheadStart) |
| { |
| with(e) with(state) |
| { |
| auto save = index; |
| uint len = re.ir[t.pc].data; |
| uint ms = re.ir[t.pc+1].raw, me = re.ir[t.pc+2].raw; |
| uint end = t.pc+len+IRL!(IR.LookaheadEnd)+IRL!(IR.LookaheadStart); |
| bool positive = re.ir[t.pc].code == IR.LookaheadStart; |
| static if (Stream.isLoopback) |
| auto matcher = bwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); |
| else |
| auto matcher = fwdMatcher(t.pc, end, me - ms, subCounters.get(t.pc, 0)); |
| matcher.backrefed = backrefed.empty ? t.matches : backrefed; |
| auto mRes = matcher.matchOneShot(t.matches.ptr[ms .. me], IRL!(IR.LookaheadStart)); |
| freelist = matcher.freelist; |
| subCounters[t.pc] = matcher.genCounter; |
| s.reset(index); |
| next(); |
| if ((mRes != 0) ^ positive) |
| { |
| return popState(e); |
| } |
| t.pc = end; |
| return true; |
| } |
| } |
| |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.LookaheadEnd || code == IR.NeglookaheadEnd || |
| code == IR.LookbehindEnd || code == IR.NeglookbehindEnd) |
| { |
| with(e) with(state) |
| { |
| finish(t, matches.ptr[0 .. re.ngroup], re.ir[t.pc].data); |
| recycle(t); |
| //cut off low priority threads |
| recycle(clist); |
| recycle(worklist); |
| return false; // no more state |
| } |
| } |
| |
| static bool op(IR code:IR.Nop)(E e, S* state) |
| { |
| with(state) t.pc += IRL!(IR.Nop); |
| return true; |
| } |
| |
| static bool op(IR code:IR.OrChar)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint len = re.ir[t.pc].sequence; |
| uint end = t.pc + len; |
| static assert(IRL!(IR.OrChar) == 1); |
| for (; t.pc < end; t.pc++) |
| if (re.ir[t.pc].data == front) |
| break; |
| if (t.pc != end) |
| { |
| t.pc = end; |
| nlist.insertBack(t); |
| } |
| else |
| recycle(t); |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| static bool op(IR code:IR.Char)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (front == re.ir[t.pc].data) |
| { |
| t.pc += IRL!(IR.Char); |
| nlist.insertBack(t); |
| } |
| else |
| recycle(t); |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| static bool op(IR code:IR.Any)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| t.pc += IRL!(IR.Any); |
| nlist.insertBack(t); |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| static bool op(IR code:IR.CodepointSet)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (re.charsets[re.ir[t.pc].data].scanFor(front)) |
| { |
| t.pc += IRL!(IR.CodepointSet); |
| nlist.insertBack(t); |
| } |
| else |
| { |
| recycle(t); |
| } |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| static bool op(IR code:IR.Trie)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| if (re.matchers[re.ir[t.pc].data][front]) |
| { |
| t.pc += IRL!(IR.Trie); |
| nlist.insertBack(t); |
| } |
| else |
| { |
| recycle(t); |
| } |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| } |
| |
| template ThompsonOps(E,S, bool withInput:false) |
| { |
| @trusted: |
| // can't match these without input |
| static bool op(IR code)(E e, S* state) |
| if (code == IR.Char || code == IR.OrChar || code == IR.CodepointSet |
| || code == IR.Trie || code == IR.Char || code == IR.Any) |
| { |
| return state.popState(e); |
| } |
| |
| // special case of zero-width backref |
| static bool op(IR code:IR.Backref)(E e, S* state) |
| { |
| with(e) with(state) |
| { |
| uint n = re.ir[t.pc].data; |
| Group!DataIndex* source = re.ir[t.pc].localRef ? t.matches.ptr : backrefed.ptr; |
| assert(source); |
| if (source[n].begin == source[n].end)//zero-width Backref! |
| { |
| t.pc += IRL!(IR.Backref); |
| return true; |
| } |
| else |
| return popState(e); |
| } |
| } |
| |
| // forward all control flow to normal versions |
| static bool op(IR code)(E e, S* state) |
| if (code != IR.Char && code != IR.OrChar && code != IR.CodepointSet |
| && code != IR.Trie && code != IR.Char && code != IR.Any && code != IR.Backref) |
| { |
| return ThompsonOps!(E,S,true).op!code(e,state); |
| } |
| } |
| |
| /+ |
| Thomspon matcher does all matching in lockstep, |
| never looking at the same char twice |
| +/ |
| @trusted class ThompsonMatcher(Char, StreamType = Input!Char): Matcher!Char |
| if (is(Char : dchar)) |
| { |
| alias DataIndex = Stream.DataIndex; |
| alias Stream = StreamType; |
| alias OpFunc = bool function(ThompsonMatcher, State*) pure; |
| alias BackMatcher = ThompsonMatcher!(Char, BackLooper!(Stream)); |
| alias OpBackFunc = bool function(BackMatcher, BackMatcher.State*) pure; |
| Thread!DataIndex* freelist; |
| ThreadList!DataIndex clist, nlist; |
| DataIndex[] merge; |
| Group!DataIndex[] backrefed; |
| const Regex!Char re; //regex program |
| Stream s; |
| dchar front; |
| DataIndex index; |
| DataIndex genCounter; //merge trace counter, goes up on every dchar |
| size_t[size_t] subCounters; //a table of gen counter per sub-engine: PC -> counter |
| OpFunc[] opCacheTrue; // pointers to Op!(IR.xyz) for each bytecode |
| OpFunc[] opCacheFalse; // ditto |
| OpBackFunc[] opCacheBackTrue; // ditto |
| OpBackFunc[] opCacheBackFalse; // ditto |
| size_t threadSize; |
| size_t _refCount; |
| int matched; |
| bool exhausted; |
| |
| final: |
| pure |
| static struct State |
| { |
| Thread!DataIndex* t; |
| ThreadList!DataIndex worklist; |
| Group!DataIndex[] matches; |
| |
| bool popState(E)(E e) |
| { |
| with(e) |
| { |
| recycle(t); |
| t = worklist.fetch(); |
| return t != null; |
| } |
| } |
| |
| } |
| |
| static if (__traits(hasMember,Stream, "search")) |
| { |
| enum kicked = true; |
| } |
| else |
| enum kicked = false; |
| |
| static size_t getThreadSize(const ref Regex!Char re) |
| { |
| return re.ngroup |
| ? (Thread!DataIndex).sizeof + (re.ngroup-1)*(Group!DataIndex).sizeof |
| : (Thread!DataIndex).sizeof - (Group!DataIndex).sizeof; |
| } |
| |
| static size_t initialMemory(const ref Regex!Char re) |
| { |
| return getThreadSize(re)*re.threadCount + re.hotspotTableSize*size_t.sizeof |
| +4*OpFunc.sizeof*re.ir.length; |
| } |
| |
| //true if it's start of input |
| @property bool atStart(){ return index == 0; } |
| |
| //true if it's end of input |
| @property bool atEnd(){ return index == s.lastIndex && s.atEnd; } |
| |
| override @property ref size_t refCount() @safe { return _refCount; } |
| |
| override @property ref const(Regex!Char) pattern() @safe { return re; } |
| |
| bool next() |
| { |
| if (!s.nextChar(front, index)) |
| { |
| index = s.lastIndex; |
| return false; |
| } |
| return true; |
| } |
| |
| static if (kicked) |
| { |
| bool search() |
| { |
| |
| if (!s.search(re.kickstart, front, index)) |
| { |
| index = s.lastIndex; |
| return false; |
| } |
| return true; |
| } |
| } |
| |
| void initExternalMemory(void[] memory) |
| { |
| threadSize = getThreadSize(re); |
| prepareFreeList(re.threadCount, memory); |
| if (re.hotspotTableSize) |
| { |
| merge = arrayInChunk!(DataIndex)(re.hotspotTableSize, memory); |
| merge[] = 0; |
| } |
| opCacheTrue = arrayInChunk!(OpFunc)(re.ir.length, memory); |
| opCacheFalse = arrayInChunk!(OpFunc)(re.ir.length, memory); |
| opCacheBackTrue = arrayInChunk!(OpBackFunc)(re.ir.length, memory); |
| opCacheBackFalse = arrayInChunk!(OpBackFunc)(re.ir.length, memory); |
| |
| for (uint pc = 0; pc<re.ir.length; pc += re.ir[pc].length) |
| { |
| L_dispatch: |
| switch (re.ir[pc].code) |
| { |
| foreach (e; __traits(allMembers, IR)) |
| { |
| mixin(`case IR.`~e~`: |
| opCacheTrue[pc] = &Ops!(true).op!(IR.`~e~`); |
| opCacheBackTrue[pc] = &BackOps!(true).op!(IR.`~e~`); |
| opCacheFalse[pc] = &Ops!(false).op!(IR.`~e~`); |
| opCacheBackFalse[pc] = &BackOps!(false).op!(IR.`~e~`); |
| break L_dispatch; |
| `); |
| } |
| default: |
| assert(0, "Unrecognized instruction "~re.ir[pc].mnemonic); |
| } |
| } |
| } |
| |
| override Matcher!Char rearm(in Char[] data) |
| { |
| exhausted = false; |
| matched = 0; |
| s = Stream(data); |
| return this; |
| } |
| |
| this()(ref const Regex!Char program, Stream stream, void[] memory) |
| { |
| // We are emplace'd to malloced memory w/o blitting T.init over it\ |
| // make sure we initialize all fields explicitly |
| _refCount = 1; |
| subCounters = null; |
| backrefed = null; |
| exhausted = false; |
| matched = 0; |
| re = program; |
| s = stream; |
| initExternalMemory(memory); |
| genCounter = 0; |
| } |
| |
| this(ThompsonMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) |
| { |
| _refCount = 1; |
| subCounters = matcher.subCounters; |
| s = stream; |
| auto code = matcher.re.ir[lo .. hi]; |
| re = matcher.re.withCode(code).withNGroup(nGroup); |
| threadSize = matcher.threadSize; |
| merge = matcher.merge; |
| freelist = matcher.freelist; |
| opCacheTrue = matcher.opCacheTrue[lo .. hi]; |
| opCacheBackTrue = matcher.opCacheBackTrue[lo .. hi]; |
| opCacheFalse = matcher.opCacheFalse[lo .. hi]; |
| opCacheBackFalse = matcher.opCacheBackFalse[lo .. hi]; |
| front = matcher.front; |
| index = matcher.index; |
| } |
| |
| this(BackMatcher matcher, size_t lo, size_t hi, uint nGroup, Stream stream) |
| { |
| _refCount = 1; |
| subCounters = matcher.subCounters; |
| s = stream; |
| auto code = matcher.re.ir[lo .. hi]; |
| re = matcher.re.withCode(code).withNGroup(nGroup); |
| threadSize = matcher.threadSize; |
| merge = matcher.merge; |
| freelist = matcher.freelist; |
| opCacheTrue = matcher.opCacheBackTrue[lo .. hi]; |
| opCacheBackTrue = matcher.opCacheTrue[lo .. hi]; |
| opCacheFalse = matcher.opCacheBackFalse[lo .. hi]; |
| opCacheBackFalse = matcher.opCacheFalse[lo .. hi]; |
| front = matcher.front; |
| index = matcher.index; |
| } |
| |
| auto fwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) |
| { |
| auto m = new ThompsonMatcher!(Char, Stream)(this, lo, hi, nGroup, s); |
| m.genCounter = counter; |
| return m; |
| } |
| |
| auto bwdMatcher()(size_t lo, size_t hi, uint nGroup, size_t counter) |
| { |
| alias BackLooper = typeof(s.loopBack(index)); |
| auto m = new ThompsonMatcher!(Char, BackLooper)(this, lo, hi, nGroup, s.loopBack(index)); |
| m.genCounter = counter; |
| m.next(); |
| return m; |
| } |
| |
| override void dupTo(Matcher!Char engine, void[] memory) |
| { |
| auto thompson = cast(ThompsonMatcher) engine; |
| thompson.s = s; |
| thompson.subCounters = null; |
| thompson.front = front; |
| thompson.index = index; |
| thompson.matched = matched; |
| thompson.exhausted = exhausted; |
| thompson.initExternalMemory(memory); |
| } |
| |
| override int match(Group!DataIndex[] matches) |
| { |
| debug(std_regex_matcher) |
| writeln("------------------------------------------"); |
| if (exhausted) |
| { |
| return false; |
| } |
| if (re.flags & RegexInfo.oneShot) |
| { |
| next(); |
| exhausted = true; |
| return matchOneShot(matches); |
| } |
| static if (kicked) |
| if (!re.kickstart.empty) |
| return matchImpl!(true)(matches); |
| return matchImpl!(false)(matches); |
| } |
| |
| //match the input and fill matches |
| int matchImpl(bool withSearch)(Group!DataIndex[] matches) |
| { |
| if (!matched && clist.empty) |
| { |
| static if (withSearch) |
| search(); |
| else |
| next(); |
| } |
| else//char in question is fetched in prev call to match |
| { |
| matched = 0; |
| } |
| State state; |
| state.matches = matches; |
| |
| if (!atEnd)//if no char |
| for (;;) |
| { |
| genCounter++; |
| debug(std_regex_matcher) |
| { |
| writefln("Threaded matching threads at %s", s[index .. s.lastIndex]); |
| foreach (t; clist[]) |
| { |
| assert(t); |
| writef("pc=%s ",t.pc); |
| write(t.matches); |
| writeln(); |
| } |
| } |
| for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) |
| { |
| eval!true(&state); |
| } |
| //if we already have match no need to push the engine |
| if (!matched) |
| { |
| state.t = createStart(index); |
| eval!true(&state);//new thread staring at this position |
| } |
| else if (nlist.empty) |
| { |
| debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); |
| break;//not a partial match for sure |
| } |
| clist = nlist; |
| nlist = (ThreadList!DataIndex).init; |
| if (clist.tip is null) |
| { |
| static if (withSearch) |
| { |
| if (!search()) |
| break; |
| } |
| else |
| { |
| if (!next()) |
| break; |
| } |
| } |
| else if (!next()) |
| { |
| if (!atEnd) return false; |
| exhausted = true; |
| break; |
| } |
| } |
| |
| genCounter++; //increment also on each end |
| debug(std_regex_matcher) writefln("Threaded matching threads at end"); |
| //try out all zero-width posibilities |
| for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) |
| { |
| eval!false(&state); |
| } |
| if (!matched) |
| { |
| state.t = createStart(index); |
| eval!false(&state);//new thread starting at end of input |
| } |
| if (matched) |
| {//in case NFA found match along the way |
| //and last possible longer alternative ultimately failed |
| s.reset(matches[0].end);//reset to last successful match |
| next();//and reload front character |
| //--- here the exact state of stream was restored --- |
| exhausted = atEnd || !(re.flags & RegexOption.global); |
| //+ empty match advances the input |
| if (!exhausted && matches[0].begin == matches[0].end) |
| next(); |
| } |
| return matched; |
| } |
| |
| /+ |
| handle succesful threads |
| +/ |
| void finish(const(Thread!DataIndex)* t, Group!DataIndex[] matches, int code) |
| { |
| matches.ptr[0 .. re.ngroup] = t.matches.ptr[0 .. re.ngroup]; |
| debug(std_regex_matcher) |
| { |
| writef("FOUND pc=%s prog_len=%s", |
| t.pc, re.ir.length); |
| if (!matches.empty) |
| writefln(": %s..%s", matches[0].begin, matches[0].end); |
| foreach (v; matches) |
| writefln("%d .. %d", v.begin, v.end); |
| } |
| matched = code; |
| } |
| |
| alias Ops(bool withInput) = ThompsonOps!(ThompsonMatcher, State, withInput); |
| alias BackOps(bool withInput) = ThompsonOps!(BackMatcher, BackMatcher.State, withInput); |
| |
| /+ |
| match thread against codepoint, cutting trough all 0-width instructions |
| and taking care of control flow, then add it to nlist |
| +/ |
| void eval(bool withInput)(State* state) |
| { |
| debug(std_regex_matcher) writeln("---- Evaluating thread"); |
| static if (withInput) |
| while (opCacheTrue.ptr[state.t.pc](this, state)){} |
| else |
| while (opCacheFalse.ptr[state.t.pc](this, state)){} |
| } |
| enum uint RestartPc = uint.max; |
| //match the input, evaluating IR without searching |
| int matchOneShot(Group!DataIndex[] matches, uint startPc = 0) |
| { |
| debug(std_regex_matcher) |
| { |
| writefln("---------------single shot match ----------------- "); |
| } |
| alias evalFn = eval; |
| assert(clist == (ThreadList!DataIndex).init || startPc == RestartPc); // incorrect after a partial match |
| assert(nlist == (ThreadList!DataIndex).init || startPc == RestartPc); |
| State state; |
| state.matches = matches; |
| if (!atEnd)//if no char |
| { |
| debug(std_regex_matcher) |
| { |
| writefln("-- Threaded matching threads at %s", s[index .. s.lastIndex]); |
| } |
| if (startPc != RestartPc) |
| { |
| state.t = createStart(index, startPc); |
| genCounter++; |
| evalFn!true(&state); |
| } |
| for (;;) |
| { |
| debug(std_regex_matcher) writeln("\n-- Started iteration of main cycle"); |
| genCounter++; |
| debug(std_regex_matcher) |
| { |
| foreach (t; clist[]) |
| { |
| assert(t); |
| } |
| } |
| for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) |
| { |
| evalFn!true(&state); |
| } |
| if (nlist.empty) |
| { |
| debug(std_regex_matcher) writeln("Stopped matching before consuming full input"); |
| break;//not a partial match for sure |
| } |
| clist = nlist; |
| nlist = (ThreadList!DataIndex).init; |
| if (!next()) |
| break; |
| debug(std_regex_matcher) writeln("-- Ended iteration of main cycle\n"); |
| } |
| } |
| genCounter++; //increment also on each end |
| debug(std_regex_matcher) writefln("-- Matching threads at end"); |
| //try out all zero-width posibilities |
| for (state.t = clist.fetch(); state.t; state.t = clist.fetch()) |
| { |
| evalFn!false(&state); |
| } |
| if (!matched) |
| { |
| state.t = createStart(index, startPc); |
| evalFn!false(&state); |
| } |
| return matched; |
| } |
| |
| //get a dirty recycled Thread |
| Thread!DataIndex* allocate() |
| { |
| assert(freelist, "not enough preallocated memory"); |
| Thread!DataIndex* t = freelist; |
| freelist = freelist.next; |
| return t; |
| } |
| |
| //link memory into a free list of Threads |
| void prepareFreeList(size_t size, ref void[] memory) |
| { |
| void[] mem = memory[0 .. threadSize*size]; |
| memory = memory[threadSize * size .. $]; |
| freelist = cast(Thread!DataIndex*)&mem[0]; |
| size_t i; |
| for (i = threadSize; i < threadSize*size; i += threadSize) |
| (cast(Thread!DataIndex*)&mem[i-threadSize]).next = cast(Thread!DataIndex*)&mem[i]; |
| (cast(Thread!DataIndex*)&mem[i-threadSize]).next = null; |
| } |
| |
| //dispose a thread |
| void recycle(Thread!DataIndex* t) |
| { |
| t.next = freelist; |
| freelist = t; |
| } |
| |
| //dispose list of threads |
| void recycle(ref ThreadList!DataIndex list) |
| { |
| if (list.tip) |
| { |
| // just put this head-tail list in front of freelist |
| list.toe.next = freelist; |
| freelist = list.tip; |
| list = list.init; |
| } |
| } |
| |
| //creates a copy of master thread with given pc |
| Thread!DataIndex* fork(Thread!DataIndex* master, uint pc, uint counter) |
| { |
| auto t = allocate(); |
| t.matches.ptr[0 .. re.ngroup] = master.matches.ptr[0 .. re.ngroup]; |
| t.pc = pc; |
| t.counter = counter; |
| t.uopCounter = 0; |
| return t; |
| } |
| |
| //creates a start thread |
| Thread!DataIndex* createStart(DataIndex index, uint pc = 0) |
| { |
| auto t = allocate(); |
| t.matches.ptr[0 .. re.ngroup] = (Group!DataIndex).init; |
| t.matches[0].begin = index; |
| t.pc = pc; |
| t.counter = 0; |
| t.uopCounter = 0; |
| return t; |
| } |
| } |