libphobos/src/std/regex/internal/ir.d - gcc - Git at Google

 /*
     Implementation of std.regex IR, an intermediate representation
     of a regular expression pattern.

     This is a common ground between frontend regex component (parser)
     and backend components - generators, matchers and other "filters".
 */
 module std.regex.internal.ir;

 package(std.regex):

 import std.exception, std.meta, std.range.primitives, std.traits, std.uni;

 debug(std_regex_parser) import std.stdio;
 // just a common trait, may be moved elsewhere
 alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);

 enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';

 // heuristic value determines maximum CodepointSet length suitable for linear search
 enum maxCharsetUsed = 6;

 // another variable to tweak behavior of caching generated Tries for character classes
 enum maxCachedMatchers = 8;

 alias Trie = CodepointSetTrie!(13, 8);
 alias makeTrie = codepointSetTrie!(13, 8);

 CharMatcher[CodepointSet] matcherCache;

 //accessor with caching
 @trusted CharMatcher getMatcher(CodepointSet set)
 {
     // almost all properties of AA are not @safe
     // https://issues.dlang.org/show_bug.cgi?id=6357
     if (__ctfe || maxCachedMatchers == 0)
         return CharMatcher(set);
     else
     {
         auto p = set in matcherCache;
         if (p)
             return *p;
         if (matcherCache.length == maxCachedMatchers)
         {
             // flush enmatchers in trieCache
             matcherCache = null;
         }
         return (matcherCache[set] = CharMatcher(set));
     }
 }

 @property ref wordMatcher()()
 {
     static immutable CharMatcher matcher = CharMatcher(wordCharacter);
     return matcher;
 }

 // some special Unicode white space characters
 private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';

 //Regular expression engine/parser options:
 // global - search  all nonoverlapping matches in input
 // casefold - case insensitive matching, do casefolding on match in unicode mode
 // freeform - ignore whitespace in pattern, to match space use [ ] or \s
 // multiline - switch  ^, $ detect start and end of linesinstead of just start and end of input
 enum RegexOption: uint {
     global = 0x1,
     casefold = 0x2,
     freeform = 0x4,
     nonunicode = 0x8,
     multiline = 0x10,
     singleline = 0x20
 }
 //do not reorder this list
 alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
 static assert( RegexOption.max < 0x80);

 package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
 {
     flags &= (RegexOption.max << 1) - 1;
     if (!flags)
         return "";
     char[RegexOptionNames.length] buffer = void;
     size_t pos = 0;
     foreach (i, flag; __traits(allMembers, RegexOption))
         if (flags & __traits(getMember, RegexOption, flag))
             buffer[pos++] = RegexOptionNames[i];
     return buffer[0 .. pos].idup;
 }

 // flags that allow guide execution of engine
 enum RegexInfo : uint { oneShot = 0x80 }

 // IR bit pattern: 0b1_xxxxx_yy
 // where yy indicates class of instruction, xxxxx for actual operation code
 //     00: atom, a normal instruction
 //     01: open, opening of a group, has length of contained IR in the low bits
 //     10: close, closing of a group, has length of contained IR in the low bits
 //     11 unused
 //
 // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
 // Possible changes:
 //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
 //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
 //  (like lookaround), or make it easier to identify hotspots.

 enum IR:uint {
     Char              = 0b1_00000_00, //a character
     Any               = 0b1_00001_00, //any character
     CodepointSet      = 0b1_00010_00, //a most generic CodepointSet [...]
     Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
     //match with any of a consecutive OrChar's in this sequence
     //(used for case insensitive match)
     //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
     //the drawback of this representation is that it is difficult
     // to detect a jump in the middle of it
     OrChar             = 0b1_00100_00,
     Nop                = 0b1_00101_00, //no operation (padding)
     End                = 0b1_00110_00, //end of program
     Bol                = 0b1_00111_00, //beginning of a line ^
     Eol                = 0b1_01000_00, //end of a line $
     Wordboundary       = 0b1_01001_00, //boundary of a word
     Notwordboundary    = 0b1_01010_00, //not a word boundary
     Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
     GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
     GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
     Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
     GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
     Bof                = 0b1_10000_00, //begining of "file" (string) ^
     Eof                = 0b1_10001_00, //end of "file" (string) $
     //... any additional atoms here

     OrStart            = 0b1_00000_01, //start of alternation group  (length)
     OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
     //with this instruction order
     //bit mask 0b1_00001_00 could be used to test/set greediness
     InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
     InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
     InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
     InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
     InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
     InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
     RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
     RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
     RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
     RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)

     //
     LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
     LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
     NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
     NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
     LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
     LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
     NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
     NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
 }

 //a shorthand for IR length - full length of specific opcode evaluated at compile time
 template IRL(IR code)
 {
     enum uint IRL =  lengthOfIR(code);
 }
 static assert(IRL!(IR.LookaheadStart) == 3);

 //how many parameters follow the IR, should be optimized fixing some IR bits
 int immediateParamsIR(IR i) @safe pure nothrow @nogc
 {
     switch (i)
     {
     case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
         return 1;  // merge table index
     case IR.InfiniteBloomEnd:
         return 2;  // bloom filter index + merge table index
     case IR.RepeatEnd, IR.RepeatQEnd:
         return 4;
     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
         return 2;  // start-end of captures used
     default:
         return 0;
     }
 }

 //full length of IR instruction inlcuding all parameters that might follow it
 int lengthOfIR(IR i) @safe pure nothrow @nogc
 {
     return 1 + immediateParamsIR(i);
 }

 //full length of the paired IR instruction inlcuding all parameters that might follow it
 int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
 {
     return 1 + immediateParamsIR(pairedIR(i));
 }

 //if the operation has a merge point (this relies on the order of the ops)
 bool hasMerge(IR i) @safe pure nothrow @nogc
 {
     return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
 }

 //is an IR that opens a "group"
 bool isStartIR(IR i) @safe pure nothrow @nogc
 {
     return (i&0b11)==0b01;
 }

 //is an IR that ends a "group"
 bool isEndIR(IR i) @safe pure nothrow @nogc
 {
     return (i&0b11)==0b10;
 }

 //is a standalone IR
 bool isAtomIR(IR i) @safe pure nothrow @nogc
 {
     return (i&0b11)==0b00;
 }

 //makes respective pair out of IR i, swapping start/end bits of instruction
 IR pairedIR(IR i) @safe pure nothrow @nogc
 {
     assert(isStartIR(i) || isEndIR(i));
     return cast(IR) (i ^ 0b11);
 }

 //encoded IR instruction
 @safe pure
 struct Bytecode
 {
     uint raw;
     //natural constraints
     enum maxSequence = 2+4;
     enum maxData = 1 << 22;
     enum maxRaw = 1 << 31;

 @safe pure:
     this(IR code, uint data)
     {
         assert(data < (1 << 22) && code < 256);
         raw = code << 24 | data;
     }

     this(IR code, uint data, uint seq)
     {
         assert(data < (1 << 22) && code < 256 );
         assert(seq >= 2 && seq < maxSequence);
         raw = code << 24 | (seq - 2)<<22 | data;
     }

     //store raw data
     static Bytecode fromRaw(uint data)
     {
         Bytecode t;
         t.raw = data;
         return t;
     }

     // bit twiddling helpers
     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
     @property uint data()() const { return raw & 0x003f_ffff; }

     @property void data()(uint val)
     {
         raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
     }

     // ditto
     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
     @property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }

     // ditto
     // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
     @property IR code()() const { return cast(IR)(raw >> 24); }

     //ditto
     @property bool hotspot() const { return hasMerge(code); }

     //test the class of this instruction
     @property bool isAtom() const { return isAtomIR(code); }

     //ditto
     @property bool isStart() const { return isStartIR(code); }

     //ditto
     @property bool isEnd() const { return isEndIR(code); }

     //number of arguments for this instruction
     @property int args() const { return immediateParamsIR(code); }

     //mark this GroupStart or GroupEnd as referenced in backreference
     void setBackrefence()
     {
         assert(code == IR.GroupStart || code == IR.GroupEnd);
         raw = raw | 1 << 23;
     }

     //is referenced
     @property bool backreference() const
     {
         assert(code == IR.GroupStart || code == IR.GroupEnd);
         return cast(bool)(raw & 1 << 23);
     }

     //mark as local reference (for backrefs in lookarounds)
     void setLocalRef()
     {
         assert(code == IR.Backref);
         raw = raw | 1 << 23;
     }

     //is a local ref
     @property bool localRef() const
     {
         assert(code == IR.Backref);
         return cast(bool)(raw & 1 << 23);
     }

     //human readable name of instruction
     @trusted @property string mnemonic()() const
     {//@@@BUG@@@ to is @system
         import std.conv : to;
         return to!string(code);
     }

     //full length of instruction
     @property uint length() const
     {
         return lengthOfIR(code);
     }

     //full length of respective start/end of this instruction
     @property uint pairedLength() const
     {
         return lengthOfPairedIR(code);
     }

     //returns bytecode of paired instruction (assuming this one is start or end)
     @property Bytecode paired() const
     {//depends on bit and struct layout order
         assert(isStart || isEnd);
         return Bytecode.fromRaw(raw ^ 0b11 << 24);
     }

     //gets an index into IR block of the respective pair
     uint indexOfPair(uint pc) const
     {
         assert(isStart || isEnd);
         return isStart ? pc + data + length  : pc - data - lengthOfPairedIR(code);
     }
 }

 static assert(Bytecode.sizeof == 4);


 //index entry structure for name --> number of submatch
 struct NamedGroup
 {
     string name;
     uint group;
 }

 //holds pair of start-end markers for a submatch
 struct Group(DataIndex)
 {
     DataIndex begin = DataIndex.max;
     DataIndex end   = DataIndex.min;

     bool opCast(T : bool)() const
     {
         return begin <= end;
     }

     @trusted string toString()() const
     {
         if (begin < end)
             return "(unmatched)";
         import std.array : appender;
         import std.format.write : formattedWrite;
         auto a = appender!string();
         formattedWrite(a, "%s..%s", begin, end);
         return a.data;
     }
 }

 //debugging tool, prints out instruction along with opcodes
 @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
 {
     import std.array : appender;
     import std.format.write : formattedWrite;
     auto output = appender!string();
     formattedWrite(output,"%s", irb[pc].mnemonic);
     switch (irb[pc].code)
     {
     case IR.Char:
         formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
         break;
     case IR.OrChar:
         formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
         break;
     case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
     IR.Option, IR.GotoEndOr, IR.OrStart:
         //forward-jump instructions
         uint len = irb[pc].data;
         formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
         break;
     case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
         uint len = irb[pc].data;
         formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
             pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
         break;
     case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
         uint len = irb[pc].data;
         formattedWrite(output, " pc=>%u", pc-len);
         break;
     case  IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
         uint len = irb[pc].data;
         formattedWrite(output, " pc=>%u", pc-len);
         break;
     case IR.GroupStart, IR.GroupEnd:
         uint n = irb[pc].data;
         string name;
         foreach (v;dict)
             if (v.group == n)
             {
                 name = "'"~v.name~"'";
                 break;
             }
         formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
                 name, n);
         break;
     case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
         uint len = irb[pc].data;
         uint start = irb[pc+1].raw, end = irb[pc+2].raw;
         formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
         break;
     case IR.Backref: case IR.CodepointSet: case IR.Trie:
         uint n = irb[pc].data;
         formattedWrite(output, " %u",  n);
         if (irb[pc].code == IR.Backref)
             formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
         break;
     default://all data-free instructions
     }
     if (irb[pc].hotspot)
         formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
     return output.data;
 }

 //disassemble the whole chunk
 @trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
 {
     import std.stdio : writeln;
     for (uint pc=0; pc<slice.length; pc += slice[pc].length)
         writeln("\t", disassemble(slice, pc, dict));
 }

 // Encapsulates memory management, explicit ref counting
 // and the exact type of engine created
 // there is a single instance per engine combination type x Char
 // In future may also maintain a (TLS?) cache of memory
 interface MatcherFactory(Char)
 {
 @safe:
     Matcher!Char create(const ref Regex!Char, in Char[] input) const;
     Matcher!Char dup(Matcher!Char m, in Char[] input) const;
     size_t incRef(Matcher!Char m) const;
     size_t decRef(Matcher!Char m) const;
 }

 // Only memory management, no compile-time vs run-time specialities
 abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
 {
     import core.memory : pureFree;
     import std.internal.memory : enforceMalloc;
     import core.memory : GC;
     // round up to next multiple of size_t for alignment purposes
     enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);

     EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;

     override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
     {
         immutable size = EngineType!Char.initialMemory(re) + classSize;
         auto memory = enforceMalloc(size)[0 .. size];
         scope(failure) pureFree(memory.ptr);
         GC.addRange(memory.ptr, classSize);
         auto engine = construct(re, input, memory);
         assert(engine.refCount == 1);
         assert(cast(void*) engine == memory.ptr);
         return engine;
     }

     override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
     {
         immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
         auto memory = enforceMalloc(size)[0 .. size];
         scope(failure) pureFree(memory.ptr);
         auto copy = construct(engine.pattern, input, memory);
         GC.addRange(memory.ptr, classSize);
         engine.dupTo(copy, memory[classSize .. size]);
         assert(copy.refCount == 1);
         return copy;
     }

     override size_t incRef(Matcher!Char m) const
     {
         return ++m.refCount;
     }

     override size_t decRef(Matcher!Char m) const  @trusted
     {
         assert(m.refCount != 0);
         auto cnt = --m.refCount;
         if (cnt == 0)
         {
             void* ptr = cast(void*) m;
             GC.removeRange(ptr);
             pureFree(ptr);
         }
         return cnt;
     }
 }

 // A factory for run-time engines
 class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
 {
     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
     {
         import core.lifetime : emplace;
         return emplace!(EngineType!Char)(memory[0 .. classSize],
             re, Input!Char(input), memory[classSize .. $]);
     }
 }

 // A factory for compile-time engine
 class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
 {
     override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
     {
         import core.lifetime : emplace;
         return emplace!(EngineType!Char)(memory[0 .. classSize],
             re, &func, Input!Char(input), memory[classSize .. $]);
     }
 }

 private auto defaultFactoryImpl(Char)(const ref Regex!Char re)
 {
     import std.regex.internal.backtracking : BacktrackingMatcher;
     import std.regex.internal.thompson : ThompsonMatcher;
     import std.algorithm.searching : canFind;
     static MatcherFactory!Char backtrackingFactory;
     static MatcherFactory!Char thompsonFactory;
     if (re.backrefed.canFind!"a != 0")
     {
         if (backtrackingFactory is null)
             backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
         return backtrackingFactory;
     }
     else
     {
         if (thompsonFactory is null)
             thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
         return thompsonFactory;
     }
 }

 // Used to generate a pure wrapper for defaultFactoryImpl. Based on the example in
 // the std.traits.SetFunctionAttributes documentation.
 auto assumePureFunction(T)(T t)
 if (isFunctionPointer!T)
 {
     enum attrs = functionAttributes!T | FunctionAttribute.pure_;
     return cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) t;
 }

 // A workaround for R-T enum re = regex(...)
 template defaultFactory(Char)
 {
     // defaultFactory is constructed as a safe, pure wrapper over defaultFactoryImpl.
     // It can be faked as pure because the static mutable variables are used to cache
     // the key and character matcher. The technique used avoids delegates and GC.
     @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
     {
         static auto impl(const ref Regex!Char re)
         {
             return defaultFactoryImpl(re);
         }

         static auto pureImpl(const ref Regex!Char re) @trusted
         {
             auto p = assumePureFunction(&impl);
             return p(re);
         }

         return pureImpl(re);
     }
 }

 // Defining it as an interface has the undesired side-effect:
 // casting any class to an interface silently adjusts pointer to point to a nested vtbl
 abstract class Matcher(Char)
 {
 abstract:
     // Get a (next) match
     int match(Group!size_t[] matches) pure;
     // This only maintains internal ref-count,
     // deallocation happens inside MatcherFactory
     @property ref size_t refCount() @safe;
     // Copy internal state to another engine, using memory arena 'memory'
     void dupTo(Matcher!Char m, void[] memory);
     // The pattern loaded
     @property ref const(Regex!Char) pattern() @safe;
     // Re-arm the engine with new Input
     Matcher rearm(in Char[] stream);
 }

 /++
     `Regex` object holds regular expression pattern in compiled form.
     Instances of this object are constructed via calls to `regex`.
     This is an intended form for caching and storage of frequently
     used regular expressions.
 +/
 struct Regex(Char)
 {
     //temporary workaround for identifier lookup
     CodepointSet[] charsets; //
     Bytecode[] ir;      //compiled bytecode of pattern


     @safe @property bool empty() const nothrow {  return ir is null; }
     /++
     `namedCaptures` returns a range of all named captures in a given regular expression.
     +/
     @safe @property auto namedCaptures()
     {
         static struct NamedGroupRange
         {
         private:
             const(NamedGroup)[] groups;
             size_t start;
             size_t end;
         public:
             this(const(NamedGroup)[] g, size_t s, size_t e)
             {
                 assert(s <= e);
                 assert(e <= g.length);
                 groups = g;
                 start = s;
                 end = e;
             }

             @property string front() { return groups[start].name; }
             @property string back() { return groups[end-1].name; }
             @property bool empty() { return start >= end; }
             @property size_t length() { return end - start; }
             alias opDollar = length;
             @property NamedGroupRange save()
             {
                 return NamedGroupRange(groups, start, end);
             }
             void popFront() { assert(!empty); start++; }
             void popBack() { assert(!empty); end--; }
             string opIndex()(size_t i)
             {
                 assert(start + i < end,
                        "Requested named group is out of range.");
                 return groups[start+i].name;
             }
             NamedGroupRange opSlice(size_t low, size_t high) {
                 assert(low <= high);
                 assert(start + high <= end);
                 return NamedGroupRange(groups, start + low, start + high);
             }
             NamedGroupRange opSlice() { return this.save; }
         }
         return NamedGroupRange(dict, 0, dict.length);
     }

 package(std.regex):
     import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
     const(NamedGroup)[] dict;              // maps name -> user group number
     uint ngroup;                           // number of internal groups
     uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
     uint hotspotTableSize;                 // number of entries in merge table
     uint threadCount;                      // upper bound on number of Thompson VM threads
     uint flags;                            // global regex flags
     public const(CharMatcher)[]  matchers; // tables that represent character sets
     public const(BitTable)[] filters;      // bloom filters for conditional loops
     uint[] backrefed;                      // bit array of backreferenced submatches
     Kickstart!Char kickstart;
     MatcherFactory!Char factory;           // produces optimal matcher for this pattern
     immutable(Char)[] pattern;             // copy of pattern to serve as cache key

     const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
     {
         auto r = cast() this;
         r.factory = factory;
         return r;
     }

     const(Regex) withFlags(uint newFlags) pure const @trusted
     {
         auto r = cast() this;
         r.flags = newFlags;
         return r;
     }

     const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
     {
         auto r = cast() this;
         r.ir = code.dup; // TODO: sidestep const instead?
         return r;
     }

     const(Regex) withNGroup(uint nGroup) pure const @trusted
     {
         auto r = cast() this;
         r.ngroup = nGroup;
         return r;
     }

     //bit access helper
     uint isBackref(uint n)
     {
         if (n/32 >= backrefed.length)
             return 0;
         return backrefed[n / 32] & (1 << (n & 31));
     }

     //check if searching is not needed
     void checkIfOneShot()
     {
     L_CheckLoop:
         for (uint i = 0; i < ir.length; i += ir[i].length)
         {
             switch (ir[i].code)
             {
                 case IR.Bof:
                     flags |= RegexInfo.oneShot;
                     break L_CheckLoop;
                 case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
                 IR.Wordboundary, IR.Notwordboundary:
                     break;
                 default:
                     break L_CheckLoop;
             }
         }
     }

     //print out disassembly a program's IR
     @trusted debug(std_regex_parser) void print() const
     {//@@@BUG@@@ write is system
         for (uint i = 0; i < ir.length; i += ir[i].length)
         {
             writefln("%d\t%s ", i, disassemble(ir, i, dict));
         }
         writeln("Total merge table size: ", hotspotTableSize);
         writeln("Max counter nesting depth: ", maxCounterDepth);
     }

     public string toString()() const
     {
         import std.format : format;
         static if (is(typeof(pattern) : string))
             alias patternString = pattern;
         else
         {
             import std.conv : to;
             auto patternString = conv.to!string(pattern);
         }
         auto quotedEscapedPattern = format("%(%s %)", [patternString]);
         auto flagString = regexOptionsToString(flags);
         return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
     }
 }

 // The stuff below this point is temporarrily part of IR module
 // but may need better place in the future (all internals)
 package(std.regex):

 //Simple UTF-string abstraction compatible with stream interface
 struct Input(Char)
 if (is(Char :dchar))
 {
     import std.utf : decode;
     alias DataIndex = size_t;
     enum bool isLoopback = false;
     alias String = const(Char)[];
     String _origin;
     size_t _index;

     //constructs Input object out of plain string
     this(String input, size_t idx = 0)
     {
         _origin = input;
         _index = idx;
     }

     //codepoint at current stream position
     pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
     {
         pos = _index;
         // DMD's inliner hates multiple return functions
         // but can live with single statement if/else bodies
         bool n = !(_index == _origin.length);
         if (n)
             res = decode(_origin, _index);
         return n;
     }
     @property bool atEnd(){
         return _index == _origin.length;
     }
     bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
     {
         size_t idx = kick.search(_origin, _index);
         _index = idx;
         return nextChar(res, pos);
     }

     //index of at End position
     @property size_t lastIndex(){   return _origin.length; }

     //support for backtracker engine, might not be present
     void reset(size_t index){   _index = index;  }

     String opSlice(size_t start, size_t end){   return _origin[start .. end]; }

     auto loopBack(size_t index){   return BackLooper!Input(this, index); }
 }

 struct BackLooperImpl(Input)
 {
     import std.utf : strideBack;
     alias DataIndex = size_t;
     alias String = Input.String;
     enum bool isLoopback = true;
     String _origin;
     size_t _index;
     this(Input input, size_t index)
     {
         _origin = input._origin;
         _index = index;
     }
     this(String input)
     {
         _origin = input;
         _index = input.length;
     }
     @trusted bool nextChar(ref dchar res,ref size_t pos)
     {
         pos = _index;
         if (_index == 0)
             return false;

         res = _origin[0.._index].back;
         _index -= strideBack(_origin, _index);

         return true;
     }
     @property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
     auto loopBack(size_t index){   return Input(_origin, index); }

     //support for backtracker engine, might not be present
     //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
     void reset(size_t index){   _index = index;  }

     String opSlice(size_t start, size_t end){   return _origin[end .. start]; }
     //index of at End position
     @property size_t lastIndex(){   return 0; }
 }

 template BackLooper(E)
 {
     static if (is(E : BackLooperImpl!U, U))
     {
         alias BackLooper = U;
     }
     else
     {
         alias BackLooper = BackLooperImpl!E;
     }
 }

 //both helpers below are internal, on its own are quite "explosive"
 //unsafe, no initialization of elements
 @system pure T[] mallocArray(T)(size_t len)
 {
     import core.memory : pureMalloc;
     return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len];
 }

 //very unsafe, no initialization
 @system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
 {
     auto ret = (cast(T*) chunk.ptr)[0 .. len];
     chunk = chunk[len * T.sizeof .. $];
     return ret;
 }

 //
 @trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
 {//equal is @system?
     import std.algorithm.comparison : equal;
     import std.algorithm.iteration : map;
     import std.conv : text;
     import std.range : assumeSorted;

     auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
     enforce(fnd < dict.length && equal(dict[fnd].name, name),
         text("no submatch named ", name));
     return dict[fnd].group;
 }

 // whether ch is one of unicode newline sequences
 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
 bool endOfLine()(dchar front, bool seenCr)
 {
     return ((front == '\n') ^ seenCr) || front == '\r'
     || front == NEL || front == LS || front == PS;
 }

 // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
 bool startOfLine()(dchar back, bool seenNl)
 {
     return ((back == '\r') ^ seenNl) || back == '\n'
     || back == NEL || back == LS || back == PS;
 }

 ///Exception object thrown in case of errors during regex compilation.
 public class RegexException : Exception
 {
     mixin basicExceptionCtors;
 }

 // simple 128-entry bit-table used with a hash function
 struct BitTable {
     uint[4] filter;

     this(CodepointSet set){
         foreach (iv; set.byInterval)
         {
             foreach (v; iv.a .. iv.b)
                 add(v);
         }
     }

     void add()(dchar ch){
         immutable i = index(ch);
         filter[i >> 5]  |=  1<<(i & 31);
     }
     // non-zero -> might be present, 0 -> absent
     bool opIndex()(dchar ch) const{
         immutable i = index(ch);
         return (filter[i >> 5]>>(i & 31)) & 1;
     }

     static uint index()(dchar ch){
         return ((ch >> 7) ^ ch) & 0x7F;
     }
 }

 struct CharMatcher {
     BitTable ascii; // fast path for ASCII
     Trie trie;      // slow path for Unicode

     this(CodepointSet set)
     {
         auto asciiSet = set & unicode.ASCII;
         ascii = BitTable(asciiSet);
         trie = makeTrie(set);
     }

     bool opIndex()(dchar ch) const
     {
         if (ch < 0x80)
             return ascii[ch];
         else
             return trie[ch];
     }
 }

 // Internal non-resizeble array, switches between inline storage and CoW
 // POD-only
 struct SmallFixedArray(T, uint SMALL=3)
 if (!hasElaborateDestructor!T)
 {
     import std.internal.memory : enforceMalloc;
     import core.memory : pureFree;
     static struct Payload
     {
         size_t refcount;
         T[0] placeholder;
         inout(T)* ptr() inout { return placeholder.ptr; }
     }
     static assert(Payload.sizeof == size_t.sizeof);
     union
     {
         Payload* big;
         T[SMALL] small;
     }
     size_t _sizeMask;
     enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
     enum SIZE_MASK = ~BIG_MASK;

     @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
     @property size_t length() const { return _sizeMask & SIZE_MASK; }

     this(size_t size)
     {
         if (size <= SMALL)
         {
             small[] = T.init;
             _sizeMask = size;
         }
         else
         {
             big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
             big.refcount = 1;
             _sizeMask = size | BIG_MASK;
         }
     }

     private @trusted @property inout(T)[] internalSlice() inout
     {
         return isBig ? big.ptr[0 .. length] : small[0 .. length];
     }

     this(this)
     {
         if (isBig)
         {
             big.refcount++;
         }
     }

     bool opEquals(SmallFixedArray a)
     {
         return internalSlice[] == a.internalSlice[];
     }

     size_t toHash() const
     {
         return hashOf(internalSlice[]);
     }

     ref inout(T) opIndex(size_t idx) inout
     {
         return internalSlice[idx];
     }

     // accesses big to test self-referencing so not @safe
     @trusted ref opAssign(SmallFixedArray arr)
     {
         if (isBig)
         {
             if (arr.isBig)
             {
                 if (big is arr.big) return this; // self-assign
                 else
                 {
                     abandonRef();
                     _sizeMask = arr._sizeMask;
                     big = arr.big;
                     big.refcount++;
                 }
             }
             else
             {
                 abandonRef();
                 _sizeMask = arr._sizeMask;
                 small = arr.small;
             }
         }
         else
         {
             if (arr.isBig)
             {
                 _sizeMask = arr._sizeMask;
                 big = arr.big;
                 big.refcount++;
             }
             else
             {
                 _sizeMask = arr._sizeMask;
                 small = arr.small;
             }
         }
         return this;
     }

     void mutate(scope void delegate(T[]) pure filler)
     {
         if (isBig && big.refcount != 1) // copy on write
         {
             auto oldSizeMask = _sizeMask;
             auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
             newbig.refcount = 1;
             abandonRef();
             big = newbig;
             _sizeMask = oldSizeMask;
         }
         filler(internalSlice);
     }

     ~this()
     {
         if (isBig)
         {
             abandonRef();
         }
     }

     @trusted private void abandonRef()
     {
         assert(isBig);
         if (--big.refcount == 0)
         {
             pureFree(big);
             _sizeMask = 0;
             assert(!isBig);
         }
     }
 }

 @system unittest
 {
     alias SA = SmallFixedArray!(int, 2);
     SA create(int[] data)
     {
         SA a = SA(data.length);
         a.mutate((slice) { slice[] = data[]; });
         assert(a.internalSlice == data);
         return a;
     }

     {
         SA a;
         a = SA(1);
         assert(a.length == 1);
         a = SA.init;
         assert(a.length == 0);
     }

     {
         SA a, b, c, d;
         assert(a.length == 0);
         assert(a.internalSlice == b.internalSlice);
         a = create([1]);
         assert(a.internalSlice == [1]);
         b = create([2, 3]);
         assert(b.internalSlice == [2, 3]);
         c = create([3, 4, 5]);
         d = create([5, 6, 7, 8]);
         assert(c.isBig);
         a = c;
         assert(a.isBig);
         assert(a.big is c.big);
         assert(a.big.refcount == 2);
         assert(a.internalSlice == [3, 4, 5]);
         assert(c.internalSlice == [3, 4, 5]);
         a = b;
         assert(!a.isBig);
         assert(a.internalSlice == [2, 3]);
         assert(c.big.refcount == 1);
         a = c;
         assert(c.big.refcount == 2);

         // mutate copies on write if ref-count is not 1
         a.mutate((slice){ slice[] = 1; });
         assert(a.internalSlice == [1, 1, 1]);
         assert(c.internalSlice == [3, 4, 5]);
         assert(a.isBig && c.isBig);
         assert(a.big.refcount == 1);
         assert(c.big.refcount == 1);

         auto e = d;
         assert(e.big.refcount == 2);
         auto f = d;
         f = a;
         assert(f.isBig);
         assert(f.internalSlice == [1, 1, 1]);
         assert(f.big.refcount == 2); // a & f
         assert(e.big.refcount == 2); // d & e
         a = c;
         assert(f.big.refcount == 1); // f
         assert(e.big.refcount == 2); // d & e
         a = a;
         a = a;
         a = a;
         assert(a.big.refcount == 2); // a & c
     }
 }