| ------------------------------------------------------------------------------ |
| -- -- |
| -- GNAT LIBRARY COMPONENTS -- |
| -- -- |
| -- G N A T . R E G P A T -- |
| -- -- |
| -- B o d y -- |
| -- -- |
| -- Copyright (C) 1986 by University of Toronto. -- |
| -- Copyright (C) 1999-2022, AdaCore -- |
| -- -- |
| -- GNAT is free software; you can redistribute it and/or modify it under -- |
| -- terms of the GNU General Public License as published by the Free Soft- -- |
| -- ware Foundation; either version 3, or (at your option) any later ver- -- |
| -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- |
| -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- |
| -- or FITNESS FOR A PARTICULAR PURPOSE. -- |
| -- -- |
| -- As a special exception under Section 7 of GPL version 3, you are granted -- |
| -- additional permissions described in the GCC Runtime Library Exception, -- |
| -- version 3.1, as published by the Free Software Foundation. -- |
| -- -- |
| -- You should have received a copy of the GNU General Public License and -- |
| -- a copy of the GCC Runtime Library Exception along with this program; -- |
| -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- |
| -- <http://www.gnu.org/licenses/>. -- |
| -- -- |
| -- GNAT was originally developed by the GNAT team at New York University. -- |
| -- Extensive contributions were provided by Ada Core Technologies Inc. -- |
| -- -- |
| ------------------------------------------------------------------------------ |
| |
| -- This is an altered Ada 95 version of the original V8 style regular |
| -- expression library written in C by Henry Spencer. Apart from the |
| -- translation to Ada, the interface has been considerably changed to |
| -- use the Ada String type instead of C-style nul-terminated strings. |
| |
| -- Beware that some of this code is subtly aware of the way operator |
| -- precedence is structured in regular expressions. Serious changes in |
| -- regular-expression syntax might require a total rethink. |
| |
| with System.IO; use System.IO; |
| with Ada.Characters.Handling; use Ada.Characters.Handling; |
| with Ada.Unchecked_Conversion; |
| |
| package body System.Regpat is |
| |
| Debug : constant Boolean := False; |
| -- Set to True to activate debug traces. This is normally set to constant |
| -- False to simply delete all the trace code. It is to be edited to True |
| -- for internal debugging of the package. |
| |
| ---------------------------- |
| -- Implementation details -- |
| ---------------------------- |
| |
| -- This is essentially a linear encoding of a nondeterministic |
| -- finite-state machine, also known as syntax charts or |
| -- "railroad normal form" in parsing technology. |
| |
| -- Each node is an opcode plus a "next" pointer, possibly plus an |
| -- operand. "Next" pointers of all nodes except BRANCH implement |
| -- concatenation; a "next" pointer with a BRANCH on both ends of it |
| -- is connecting two alternatives. |
| |
| -- The operand of some types of node is a literal string; for others, |
| -- it is a node leading into a sub-FSM. In particular, the operand of |
| -- a BRANCH node is the first node of the branch. |
| -- (NB this is *not* a tree structure: the tail of the branch connects |
| -- to the thing following the set of BRANCHes). |
| |
| -- You can see the exact byte-compiled version by using the Dump |
| -- subprogram. However, here are a few examples: |
| |
| -- (a|b): 1 : BRANCH (next at 9) |
| -- 4 : EXACT (next at 17) operand=a |
| -- 9 : BRANCH (next at 17) |
| -- 12 : EXACT (next at 17) operand=b |
| -- 17 : EOP (next at 0) |
| -- |
| -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} |
| -- 8 : OPEN 1 (next at 12) |
| -- 12 : EXACT (next at 18) operand=ab |
| -- 18 : CLOSE 1 (next at 22) |
| -- 22 : WHILEM (next at 0) |
| -- 25 : NOTHING (next at 28) |
| -- 28 : EOP (next at 0) |
| |
| -- The opcodes are: |
| |
| type Opcode is |
| |
| -- Name Operand? Meaning |
| |
| (EOP, -- no End of program |
| MINMOD, -- no Next operator is not greedy |
| |
| -- Classes of characters |
| |
| ANY, -- no Match any one character except newline |
| SANY, -- no Match any character, including new line |
| ANYOF, -- class Match any character in this class |
| EXACT, -- str Match this string exactly |
| EXACTF, -- str Match this string (case-folding is one) |
| NOTHING, -- no Match empty string |
| SPACE, -- no Match any whitespace character |
| NSPACE, -- no Match any non-whitespace character |
| DIGIT, -- no Match any numeric character |
| NDIGIT, -- no Match any non-numeric character |
| ALNUM, -- no Match any alphanumeric character |
| NALNUM, -- no Match any non-alphanumeric character |
| |
| -- Branches |
| |
| BRANCH, -- node Match this alternative, or the next |
| |
| -- Simple loops (when the following node is one character in length) |
| |
| STAR, -- node Match this simple thing 0 or more times |
| PLUS, -- node Match this simple thing 1 or more times |
| CURLY, -- 2num node Match this simple thing between n and m times. |
| |
| -- Complex loops |
| |
| CURLYX, -- 2num node Match this complex thing {n,m} times |
| -- The nums are coded on two characters each |
| |
| WHILEM, -- no Do curly processing and see if rest matches |
| |
| -- Matches after or before a word |
| |
| BOL, -- no Match "" at beginning of line |
| MBOL, -- no Same, assuming multiline (match after \n) |
| SBOL, -- no Same, assuming single line (don't match at \n) |
| EOL, -- no Match "" at end of line |
| MEOL, -- no Same, assuming multiline (match before \n) |
| SEOL, -- no Same, assuming single line (don't match at \n) |
| |
| BOUND, -- no Match "" at any word boundary |
| NBOUND, -- no Match "" at any word non-boundary |
| |
| -- Parenthesis groups handling |
| |
| REFF, -- num Match some already matched string, folded |
| OPEN, -- num Mark this point in input as start of #n |
| CLOSE); -- num Analogous to OPEN |
| |
| for Opcode'Size use 8; |
| |
| -- Opcode notes: |
| |
| -- BRANCH |
| -- The set of branches constituting a single choice are hooked |
| -- together with their "next" pointers, since precedence prevents |
| -- anything being concatenated to any individual branch. The |
| -- "next" pointer of the last BRANCH in a choice points to the |
| -- thing following the whole choice. This is also where the |
| -- final "next" pointer of each individual branch points; each |
| -- branch starts with the operand node of a BRANCH node. |
| |
| -- STAR,PLUS |
| -- '?', and complex '*' and '+', are implemented with CURLYX. |
| -- branches. Simple cases (one character per match) are implemented with |
| -- STAR and PLUS for speed and to minimize recursive plunges. |
| |
| -- OPEN,CLOSE |
| -- ...are numbered at compile time. |
| |
| -- EXACT, EXACTF |
| -- There are in fact two arguments, the first one is the length (minus |
| -- one of the string argument), coded on one character, the second |
| -- argument is the string itself, coded on length + 1 characters. |
| |
| -- A node is one char of opcode followed by two chars of "next" pointer. |
| -- "Next" pointers are stored as two 8-bit pieces, high order first. The |
| -- value is a positive offset from the opcode of the node containing it. |
| -- An operand, if any, simply follows the node. (Note that much of the |
| -- code generation knows about this implicit relationship.) |
| |
| -- Using two bytes for the "next" pointer is vast overkill for most |
| -- things, but allows patterns to get big without disasters. |
| |
| Next_Pointer_Bytes : constant := 3; |
| -- Points after the "next pointer" data. An instruction is therefore: |
| -- 1 byte: instruction opcode |
| -- 2 bytes: pointer to next instruction |
| -- * bytes: optional data for the instruction |
| |
| ----------------------- |
| -- Character classes -- |
| ----------------------- |
| -- This is the implementation for character classes ([...]) in the |
| -- syntax for regular expressions. Each character (0..256) has an |
| -- entry into the table. This makes for a very fast matching |
| -- algorithm. |
| |
| type Class_Byte is mod 256; |
| type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; |
| |
| type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; |
| Bit_Conversion : constant Bit_Conversion_Array := |
| [1, 2, 4, 8, 16, 32, 64, 128]; |
| |
| type Std_Class is (ANYOF_NONE, |
| ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] |
| ANYOF_NALNUM, |
| ANYOF_SPACE, -- Space class [ \t\n\r\f] |
| ANYOF_NSPACE, |
| ANYOF_DIGIT, -- Digit class [0-9] |
| ANYOF_NDIGIT, |
| ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] |
| ANYOF_NALNUMC, |
| ANYOF_ALPHA, -- Alpha class [a-zA-Z] |
| ANYOF_NALPHA, |
| ANYOF_ASCII, -- Ascii class (7 bits) 0..127 |
| ANYOF_NASCII, |
| ANYOF_CNTRL, -- Control class |
| ANYOF_NCNTRL, |
| ANYOF_GRAPH, -- Graphic class |
| ANYOF_NGRAPH, |
| ANYOF_LOWER, -- Lower case class [a-z] |
| ANYOF_NLOWER, |
| ANYOF_PRINT, -- printable class |
| ANYOF_NPRINT, |
| ANYOF_PUNCT, -- |
| ANYOF_NPUNCT, |
| ANYOF_UPPER, -- Upper case class [A-Z] |
| ANYOF_NUPPER, |
| ANYOF_XDIGIT, -- Hexadecimal digit |
| ANYOF_NXDIGIT |
| ); |
| |
| procedure Set_In_Class |
| (Bitmap : in out Character_Class; |
| C : Character); |
| -- Set the entry to True for C in the class Bitmap |
| |
| function Get_From_Class |
| (Bitmap : Character_Class; |
| C : Character) return Boolean; |
| -- Return True if the entry is set for C in the class Bitmap |
| |
| procedure Reset_Class (Bitmap : out Character_Class); |
| -- Clear all the entries in the class Bitmap |
| |
| pragma Inline (Set_In_Class); |
| pragma Inline (Get_From_Class); |
| pragma Inline (Reset_Class); |
| |
| ----------------------- |
| -- Local Subprograms -- |
| ----------------------- |
| |
| function "=" (Left : Character; Right : Opcode) return Boolean; |
| |
| function Is_Alnum (C : Character) return Boolean; |
| -- Return True if C is an alphanum character or an underscore ('_') |
| |
| function Is_White_Space (C : Character) return Boolean; |
| -- Return True if C is a whitespace character |
| |
| function Is_Printable (C : Character) return Boolean; |
| -- Return True if C is a printable character |
| |
| function Operand (P : Pointer) return Pointer; |
| -- Return a pointer to the first operand of the node at P |
| |
| function String_Length |
| (Program : Program_Data; |
| P : Pointer) return Program_Size; |
| -- Return the length of the string argument of the node at P |
| |
| function String_Operand (P : Pointer) return Pointer; |
| -- Return a pointer to the string argument of the node at P |
| |
| procedure Bitmap_Operand |
| (Program : Program_Data; |
| P : Pointer; |
| Op : out Character_Class); |
| -- Return a pointer to the string argument of the node at P |
| |
| function Get_Next |
| (Program : Program_Data; |
| IP : Pointer) return Pointer; |
| -- Dig the next instruction pointer out of a node |
| |
| procedure Optimize (Self : in out Pattern_Matcher); |
| -- Optimize a Pattern_Matcher by noting certain special cases |
| |
| function Read_Natural |
| (Program : Program_Data; |
| IP : Pointer) return Natural; |
| -- Return the 2-byte natural coded at position IP |
| |
| -- All of the subprograms above are tiny and should be inlined |
| |
| pragma Inline ("="); |
| pragma Inline (Is_Alnum); |
| pragma Inline (Is_White_Space); |
| pragma Inline (Get_Next); |
| pragma Inline (Operand); |
| pragma Inline (Read_Natural); |
| pragma Inline (String_Length); |
| pragma Inline (String_Operand); |
| |
| type Expression_Flags is record |
| Has_Width, -- Known never to match null string |
| Simple, -- Simple enough to be STAR/PLUS operand |
| SP_Start : Boolean; -- Starts with * or + |
| end record; |
| |
| Worst_Expression : constant Expression_Flags := (others => False); |
| -- Worst case |
| |
| procedure Dump_Until |
| (Program : Program_Data; |
| Index : in out Pointer; |
| Till : Pointer; |
| Indent : Natural; |
| Do_Print : Boolean := True); |
| -- Dump the program until the node Till (not included) is met. Every line |
| -- is indented with Index spaces at the beginning Dumps till the end if |
| -- Till is 0. |
| |
| procedure Dump_Operation |
| (Program : Program_Data; |
| Index : Pointer; |
| Indent : Natural); |
| -- Same as above, but only dumps a single operation, and compute its |
| -- indentation from the program. |
| |
| --------- |
| -- "=" -- |
| --------- |
| |
| function "=" (Left : Character; Right : Opcode) return Boolean is |
| begin |
| return Character'Pos (Left) = Opcode'Pos (Right); |
| end "="; |
| |
| -------------------- |
| -- Bitmap_Operand -- |
| -------------------- |
| |
| procedure Bitmap_Operand |
| (Program : Program_Data; |
| P : Pointer; |
| Op : out Character_Class) |
| is |
| function Convert is new Ada.Unchecked_Conversion |
| (Program_Data, Character_Class); |
| |
| begin |
| Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); |
| end Bitmap_Operand; |
| |
| ------------- |
| -- Compile -- |
| ------------- |
| |
| procedure Compile |
| (Matcher : out Pattern_Matcher; |
| Expression : String; |
| Final_Code_Size : out Program_Size; |
| Flags : Regexp_Flags := No_Flags) |
| is |
| -- We can't allocate space until we know how big the compiled form |
| -- will be, but we can't compile it (and thus know how big it is) |
| -- until we've got a place to put the code. So we cheat: we compile |
| -- it twice, once with code generation turned off and size counting |
| -- turned on, and once "for real". |
| |
| -- This also means that we don't allocate space until we are sure |
| -- that the thing really will compile successfully, and we never |
| -- have to move the code and thus invalidate pointers into it. |
| |
| -- Beware that the optimization-preparation code in here knows |
| -- about some of the structure of the compiled regexp. |
| |
| PM : Pattern_Matcher renames Matcher; |
| Program : Program_Data renames PM.Program; |
| |
| Emit_Ptr : Pointer := Program_First; |
| |
| Parse_Pos : Natural := Expression'First; -- Input-scan pointer |
| Parse_End : constant Natural := Expression'Last; |
| |
| ---------------------------- |
| -- Subprograms for Create -- |
| ---------------------------- |
| |
| procedure Emit (B : Character); |
| -- Output the Character B to the Program. If code-generation is |
| -- disabled, simply increments the program counter. |
| |
| function Emit_Node (Op : Opcode) return Pointer; |
| -- If code-generation is enabled, Emit_Node outputs the |
| -- opcode Op and reserves space for a pointer to the next node. |
| -- Return value is the location of new opcode, i.e. old Emit_Ptr. |
| |
| procedure Emit_Natural (IP : Pointer; N : Natural); |
| -- Split N on two characters at position IP |
| |
| procedure Emit_Class (Bitmap : Character_Class); |
| -- Emits a character class |
| |
| procedure Case_Emit (C : Character); |
| -- Emit C, after converting is to lower-case if the regular |
| -- expression is case insensitive. |
| |
| procedure Parse |
| (Parenthesized : Boolean; |
| Capturing : Boolean; |
| Flags : out Expression_Flags; |
| IP : out Pointer); |
| -- Parse regular expression, i.e. main body or parenthesized thing. |
| -- Caller must absorb opening parenthesis. Capturing should be set to |
| -- True when we have an open parenthesis from which we want the user |
| -- to extra text. |
| |
| procedure Parse_Branch |
| (Flags : out Expression_Flags; |
| First : Boolean; |
| IP : out Pointer); |
| -- Implements the concatenation operator and handles '|'. |
| -- First should be true if this is the first item of the alternative. |
| |
| procedure Parse_Piece |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer); |
| -- Parse something followed by possible [*+?] |
| |
| procedure Parse_Atom |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer); |
| -- Parse_Atom is the lowest level parse procedure. |
| -- |
| -- Optimization: Gobbles an entire sequence of ordinary characters so |
| -- that it can turn them into a single node, which is smaller to store |
| -- and faster to run. Backslashed characters are exceptions, each |
| -- becoming a separate node; the code is simpler that way and it's |
| -- not worth fixing. |
| |
| procedure Insert_Operator |
| (Op : Opcode; |
| Operand : Pointer; |
| Greedy : Boolean := True); |
| -- Insert_Operator inserts an operator in front of an already-emitted |
| -- operand and relocates the operand. This applies to PLUS and STAR. |
| -- If Minmod is True, then the operator is non-greedy. |
| |
| function Insert_Operator_Before |
| (Op : Opcode; |
| Operand : Pointer; |
| Greedy : Boolean; |
| Opsize : Pointer) return Pointer; |
| -- Insert an operator before Operand (and move the latter forward in the |
| -- program). Opsize is the size needed to represent the operator. This |
| -- returns the position at which the operator was inserted, and moves |
| -- Emit_Ptr after the new position of the operand. |
| |
| procedure Insert_Curly_Operator |
| (Op : Opcode; |
| Min : Natural; |
| Max : Natural; |
| Operand : Pointer; |
| Greedy : Boolean := True); |
| -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). |
| -- If Minmod is True, then the operator is non-greedy. |
| |
| procedure Link_Tail (P, Val : Pointer); |
| -- Link_Tail sets the next-pointer at the end of a node chain |
| |
| procedure Link_Operand_Tail (P, Val : Pointer); |
| -- Link_Tail on operand of first argument; noop if operand-less |
| |
| procedure Fail (M : String); |
| pragma No_Return (Fail); |
| -- Fail with a diagnostic message, if possible |
| |
| function Is_Curly_Operator (IP : Natural) return Boolean; |
| -- Return True if IP is looking at a '{' that is the beginning |
| -- of a curly operator, i.e. it matches {\d+,?\d*} |
| |
| function Is_Mult (IP : Natural) return Boolean; |
| -- Return True if C is a regexp multiplier: '+', '*' or '?' |
| |
| procedure Get_Curly_Arguments |
| (IP : Natural; |
| Min : out Natural; |
| Max : out Natural; |
| Greedy : out Boolean); |
| -- Parse the argument list for a curly operator. |
| -- It is assumed that IP is indeed pointing at a valid operator. |
| -- So what is IP and how come IP is not referenced in the body ??? |
| |
| procedure Parse_Character_Class (IP : out Pointer); |
| -- Parse a character class. |
| -- The calling subprogram should consume the opening '[' before. |
| |
| procedure Parse_Literal |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer); |
| -- Parse_Literal encodes a string of characters to be matched exactly |
| |
| function Parse_Posix_Character_Class return Std_Class; |
| -- Parse a posix character class, like [:alpha:] or [:^alpha:]. |
| -- The caller is supposed to absorb the opening [. |
| |
| pragma Inline (Is_Mult); |
| pragma Inline (Emit_Natural); |
| pragma Inline (Parse_Character_Class); -- since used only once |
| |
| --------------- |
| -- Case_Emit -- |
| --------------- |
| |
| procedure Case_Emit (C : Character) is |
| begin |
| if (Flags and Case_Insensitive) /= 0 then |
| Emit (To_Lower (C)); |
| |
| else |
| -- Dump current character |
| |
| Emit (C); |
| end if; |
| end Case_Emit; |
| |
| ---------- |
| -- Emit -- |
| ---------- |
| |
| procedure Emit (B : Character) is |
| begin |
| if Emit_Ptr <= PM.Size then |
| Program (Emit_Ptr) := B; |
| end if; |
| |
| Emit_Ptr := Emit_Ptr + 1; |
| end Emit; |
| |
| ---------------- |
| -- Emit_Class -- |
| ---------------- |
| |
| procedure Emit_Class (Bitmap : Character_Class) is |
| subtype Program31 is Program_Data (0 .. 31); |
| |
| function Convert is new Ada.Unchecked_Conversion |
| (Character_Class, Program31); |
| |
| begin |
| -- What is the mysterious constant 31 here??? Can't it be expressed |
| -- symbolically (size of integer - 1 or some such???). In any case |
| -- it should be declared as a constant (and referenced presumably |
| -- as this constant + 1 below. |
| |
| if Emit_Ptr + 31 <= PM.Size then |
| Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); |
| end if; |
| |
| Emit_Ptr := Emit_Ptr + 32; |
| end Emit_Class; |
| |
| ------------------ |
| -- Emit_Natural -- |
| ------------------ |
| |
| procedure Emit_Natural (IP : Pointer; N : Natural) is |
| begin |
| if IP + 1 <= PM.Size then |
| Program (IP + 1) := Character'Val (N / 256); |
| Program (IP) := Character'Val (N mod 256); |
| end if; |
| end Emit_Natural; |
| |
| --------------- |
| -- Emit_Node -- |
| --------------- |
| |
| function Emit_Node (Op : Opcode) return Pointer is |
| Result : constant Pointer := Emit_Ptr; |
| |
| begin |
| if Emit_Ptr + 2 <= PM.Size then |
| Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); |
| Program (Emit_Ptr + 1) := ASCII.NUL; |
| Program (Emit_Ptr + 2) := ASCII.NUL; |
| end if; |
| |
| Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; |
| return Result; |
| end Emit_Node; |
| |
| ---------- |
| -- Fail -- |
| ---------- |
| |
| procedure Fail (M : String) is |
| begin |
| raise Expression_Error with M; |
| end Fail; |
| |
| ------------------------- |
| -- Get_Curly_Arguments -- |
| ------------------------- |
| |
| procedure Get_Curly_Arguments |
| (IP : Natural; |
| Min : out Natural; |
| Max : out Natural; |
| Greedy : out Boolean) |
| is |
| pragma Unreferenced (IP); |
| |
| Save_Pos : Natural := Parse_Pos + 1; |
| |
| begin |
| Min := 0; |
| Max := Max_Curly_Repeat; |
| |
| while Expression (Parse_Pos) /= '}' |
| and then Expression (Parse_Pos) /= ',' |
| loop |
| Parse_Pos := Parse_Pos + 1; |
| end loop; |
| |
| Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); |
| |
| if Expression (Parse_Pos) = ',' then |
| Save_Pos := Parse_Pos + 1; |
| while Expression (Parse_Pos) /= '}' loop |
| Parse_Pos := Parse_Pos + 1; |
| end loop; |
| |
| if Save_Pos /= Parse_Pos then |
| Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); |
| end if; |
| |
| else |
| Max := Min; |
| end if; |
| |
| if Parse_Pos < Expression'Last |
| and then Expression (Parse_Pos + 1) = '?' |
| then |
| Greedy := False; |
| Parse_Pos := Parse_Pos + 1; |
| |
| else |
| Greedy := True; |
| end if; |
| end Get_Curly_Arguments; |
| |
| --------------------------- |
| -- Insert_Curly_Operator -- |
| --------------------------- |
| |
| procedure Insert_Curly_Operator |
| (Op : Opcode; |
| Min : Natural; |
| Max : Natural; |
| Operand : Pointer; |
| Greedy : Boolean := True) |
| is |
| Old : Pointer; |
| begin |
| Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); |
| Emit_Natural (Old + Next_Pointer_Bytes, Min); |
| Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); |
| end Insert_Curly_Operator; |
| |
| ---------------------------- |
| -- Insert_Operator_Before -- |
| ---------------------------- |
| |
| function Insert_Operator_Before |
| (Op : Opcode; |
| Operand : Pointer; |
| Greedy : Boolean; |
| Opsize : Pointer) return Pointer |
| is |
| Dest : constant Pointer := Emit_Ptr; |
| Old : Pointer; |
| Size : Pointer := Opsize; |
| |
| begin |
| -- If not greedy, we have to emit another opcode first |
| |
| if not Greedy then |
| Size := Size + Next_Pointer_Bytes; |
| end if; |
| |
| -- Move the operand in the byte-compilation, so that we can insert |
| -- the operator before it. |
| |
| if Emit_Ptr + Size <= PM.Size then |
| Program (Operand + Size .. Emit_Ptr + Size) := |
| Program (Operand .. Emit_Ptr); |
| end if; |
| |
| -- Insert the operator at the position previously occupied by the |
| -- operand. |
| |
| Emit_Ptr := Operand; |
| |
| if not Greedy then |
| Old := Emit_Node (MINMOD); |
| Link_Tail (Old, Old + Next_Pointer_Bytes); |
| end if; |
| |
| Old := Emit_Node (Op); |
| Emit_Ptr := Dest + Size; |
| return Old; |
| end Insert_Operator_Before; |
| |
| --------------------- |
| -- Insert_Operator -- |
| --------------------- |
| |
| procedure Insert_Operator |
| (Op : Opcode; |
| Operand : Pointer; |
| Greedy : Boolean := True) |
| is |
| Discard : Pointer; |
| pragma Warnings (Off, Discard); |
| begin |
| Discard := Insert_Operator_Before |
| (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); |
| end Insert_Operator; |
| |
| ----------------------- |
| -- Is_Curly_Operator -- |
| ----------------------- |
| |
| function Is_Curly_Operator (IP : Natural) return Boolean is |
| Scan : Natural := IP; |
| |
| begin |
| if Expression (Scan) /= '{' |
| or else Scan + 2 > Expression'Last |
| or else not Is_Digit (Expression (Scan + 1)) |
| then |
| return False; |
| end if; |
| |
| Scan := Scan + 1; |
| |
| -- The first digit |
| |
| loop |
| Scan := Scan + 1; |
| |
| if Scan > Expression'Last then |
| return False; |
| end if; |
| |
| exit when not Is_Digit (Expression (Scan)); |
| end loop; |
| |
| if Expression (Scan) = ',' then |
| loop |
| Scan := Scan + 1; |
| |
| if Scan > Expression'Last then |
| return False; |
| end if; |
| |
| exit when not Is_Digit (Expression (Scan)); |
| end loop; |
| end if; |
| |
| return Expression (Scan) = '}'; |
| end Is_Curly_Operator; |
| |
| ------------- |
| -- Is_Mult -- |
| ------------- |
| |
| function Is_Mult (IP : Natural) return Boolean is |
| C : constant Character := Expression (IP); |
| |
| begin |
| return C = '*' |
| or else C = '+' |
| or else C = '?' |
| or else (C = '{' and then Is_Curly_Operator (IP)); |
| end Is_Mult; |
| |
| ----------------------- |
| -- Link_Operand_Tail -- |
| ----------------------- |
| |
| procedure Link_Operand_Tail (P, Val : Pointer) is |
| begin |
| if P <= PM.Size and then Program (P) = BRANCH then |
| Link_Tail (Operand (P), Val); |
| end if; |
| end Link_Operand_Tail; |
| |
| --------------- |
| -- Link_Tail -- |
| --------------- |
| |
| procedure Link_Tail (P, Val : Pointer) is |
| Scan : Pointer; |
| Temp : Pointer; |
| Offset : Pointer; |
| |
| begin |
| -- Find last node (the size of the pattern matcher might be too |
| -- small, so don't try to read past its end). |
| |
| Scan := P; |
| while Scan + Next_Pointer_Bytes <= PM.Size loop |
| Temp := Get_Next (Program, Scan); |
| exit when Temp = Scan; |
| Scan := Temp; |
| end loop; |
| |
| Offset := Val - Scan; |
| |
| Emit_Natural (Scan + 1, Natural (Offset)); |
| end Link_Tail; |
| |
| ----------- |
| -- Parse -- |
| ----------- |
| |
| -- Combining parenthesis handling with the base level of regular |
| -- expression is a trifle forced, but the need to tie the tails of the |
| -- the branches to what follows makes it hard to avoid. |
| |
| procedure Parse |
| (Parenthesized : Boolean; |
| Capturing : Boolean; |
| Flags : out Expression_Flags; |
| IP : out Pointer) |
| is |
| E : String renames Expression; |
| Br, Br2 : Pointer; |
| Ender : Pointer; |
| Par_No : Natural; |
| New_Flags : Expression_Flags; |
| Have_Branch : Boolean := False; |
| |
| begin |
| Flags := (Has_Width => True, others => False); -- Tentatively |
| |
| -- Make an OPEN node, if parenthesized |
| |
| if Parenthesized and then Capturing then |
| if Matcher.Paren_Count > Max_Paren_Count then |
| Fail ("too many ()"); |
| end if; |
| |
| Par_No := Matcher.Paren_Count + 1; |
| Matcher.Paren_Count := Matcher.Paren_Count + 1; |
| IP := Emit_Node (OPEN); |
| Emit (Character'Val (Par_No)); |
| else |
| IP := 0; |
| Par_No := 0; |
| end if; |
| |
| -- Pick up the branches, linking them together |
| |
| Parse_Branch (New_Flags, True, Br); |
| |
| if Br = 0 then |
| IP := 0; |
| return; |
| end if; |
| |
| if Parse_Pos <= Parse_End |
| and then E (Parse_Pos) = '|' |
| then |
| Insert_Operator (BRANCH, Br); |
| Have_Branch := True; |
| end if; |
| |
| if IP /= 0 then |
| Link_Tail (IP, Br); -- OPEN -> first |
| else |
| IP := Br; |
| end if; |
| |
| if not New_Flags.Has_Width then |
| Flags.Has_Width := False; |
| end if; |
| |
| Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; |
| |
| while Parse_Pos <= Parse_End |
| and then (E (Parse_Pos) = '|') |
| loop |
| Parse_Pos := Parse_Pos + 1; |
| Parse_Branch (New_Flags, False, Br); |
| |
| if Br = 0 then |
| IP := 0; |
| return; |
| end if; |
| |
| Link_Tail (IP, Br); -- BRANCH -> BRANCH |
| |
| if not New_Flags.Has_Width then |
| Flags.Has_Width := False; |
| end if; |
| |
| Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; |
| end loop; |
| |
| -- Make a closing node, and hook it on the end |
| |
| if Parenthesized then |
| if Capturing then |
| Ender := Emit_Node (CLOSE); |
| Emit (Character'Val (Par_No)); |
| Link_Tail (IP, Ender); |
| |
| else |
| -- Need to keep looking after the closing parenthesis |
| Ender := Emit_Ptr; |
| end if; |
| |
| else |
| Ender := Emit_Node (EOP); |
| Link_Tail (IP, Ender); |
| end if; |
| |
| if Have_Branch and then Emit_Ptr <= PM.Size + 1 then |
| |
| -- Hook the tails of the branches to the closing node |
| |
| Br := IP; |
| loop |
| Link_Operand_Tail (Br, Ender); |
| Br2 := Get_Next (Program, Br); |
| exit when Br2 = Br; |
| Br := Br2; |
| end loop; |
| end if; |
| |
| -- Check for proper termination |
| |
| if Parenthesized then |
| if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then |
| Fail ("unmatched ()"); |
| end if; |
| |
| Parse_Pos := Parse_Pos + 1; |
| |
| elsif Parse_Pos <= Parse_End then |
| if E (Parse_Pos) = ')' then |
| Fail ("unmatched ')'"); |
| else |
| Fail ("junk on end"); -- "Can't happen" |
| end if; |
| end if; |
| end Parse; |
| |
| ---------------- |
| -- Parse_Atom -- |
| ---------------- |
| |
| procedure Parse_Atom |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer) |
| is |
| C : Character; |
| |
| begin |
| -- Tentatively set worst expression case |
| |
| Expr_Flags := Worst_Expression; |
| |
| C := Expression (Parse_Pos); |
| Parse_Pos := Parse_Pos + 1; |
| |
| case (C) is |
| when '^' => |
| IP := |
| Emit_Node |
| (if (Flags and Multiple_Lines) /= 0 then MBOL |
| elsif (Flags and Single_Line) /= 0 then SBOL |
| else BOL); |
| |
| when '$' => |
| IP := |
| Emit_Node |
| (if (Flags and Multiple_Lines) /= 0 then MEOL |
| elsif (Flags and Single_Line) /= 0 then SEOL |
| else EOL); |
| |
| when '.' => |
| IP := |
| Emit_Node |
| (if (Flags and Single_Line) /= 0 then SANY else ANY); |
| |
| Expr_Flags.Has_Width := True; |
| Expr_Flags.Simple := True; |
| |
| when '[' => |
| Parse_Character_Class (IP); |
| Expr_Flags.Has_Width := True; |
| Expr_Flags.Simple := True; |
| |
| when '(' => |
| declare |
| New_Flags : Expression_Flags; |
| |
| begin |
| if Parse_Pos <= Parse_End - 1 |
| and then Expression (Parse_Pos) = '?' |
| and then Expression (Parse_Pos + 1) = ':' |
| then |
| Parse_Pos := Parse_Pos + 2; |
| |
| -- Non-capturing parenthesis |
| |
| Parse (True, False, New_Flags, IP); |
| |
| else |
| -- Capturing parenthesis |
| |
| Parse (True, True, New_Flags, IP); |
| Expr_Flags.Has_Width := |
| Expr_Flags.Has_Width or else New_Flags.Has_Width; |
| Expr_Flags.SP_Start := |
| Expr_Flags.SP_Start or else New_Flags.SP_Start; |
| if IP = 0 then |
| return; |
| end if; |
| end if; |
| end; |
| |
| when '|' | ASCII.LF | ')' => |
| Fail ("internal urp"); -- Supposed to be caught earlier |
| |
| when '?' | '+' | '*' => |
| Fail (C & " follows nothing"); |
| |
| when '{' => |
| if Is_Curly_Operator (Parse_Pos - 1) then |
| Fail (C & " follows nothing"); |
| else |
| Parse_Literal (Expr_Flags, IP); |
| end if; |
| |
| when '\' => |
| if Parse_Pos > Parse_End then |
| Fail ("trailing \"); |
| end if; |
| |
| Parse_Pos := Parse_Pos + 1; |
| |
| case Expression (Parse_Pos - 1) is |
| when 'b' => |
| IP := Emit_Node (BOUND); |
| |
| when 'B' => |
| IP := Emit_Node (NBOUND); |
| |
| when 's' => |
| IP := Emit_Node (SPACE); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'S' => |
| IP := Emit_Node (NSPACE); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'd' => |
| IP := Emit_Node (DIGIT); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'D' => |
| IP := Emit_Node (NDIGIT); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'w' => |
| IP := Emit_Node (ALNUM); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'W' => |
| IP := Emit_Node (NALNUM); |
| Expr_Flags.Simple := True; |
| Expr_Flags.Has_Width := True; |
| |
| when 'A' => |
| IP := Emit_Node (SBOL); |
| |
| when 'G' => |
| IP := Emit_Node (SEOL); |
| |
| when '0' .. '9' => |
| IP := Emit_Node (REFF); |
| |
| declare |
| Save : constant Natural := Parse_Pos - 1; |
| |
| begin |
| while Parse_Pos <= Expression'Last |
| and then Is_Digit (Expression (Parse_Pos)) |
| loop |
| Parse_Pos := Parse_Pos + 1; |
| end loop; |
| |
| Emit (Character'Val (Natural'Value |
| (Expression (Save .. Parse_Pos - 1)))); |
| end; |
| |
| when others => |
| Parse_Pos := Parse_Pos - 1; |
| Parse_Literal (Expr_Flags, IP); |
| end case; |
| |
| when others => |
| Parse_Literal (Expr_Flags, IP); |
| end case; |
| end Parse_Atom; |
| |
| ------------------ |
| -- Parse_Branch -- |
| ------------------ |
| |
| procedure Parse_Branch |
| (Flags : out Expression_Flags; |
| First : Boolean; |
| IP : out Pointer) |
| is |
| E : String renames Expression; |
| Chain : Pointer; |
| Last : Pointer; |
| New_Flags : Expression_Flags; |
| |
| Discard : Pointer; |
| pragma Warnings (Off, Discard); |
| |
| begin |
| Flags := Worst_Expression; -- Tentatively |
| IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); |
| |
| Chain := 0; |
| while Parse_Pos <= Parse_End |
| and then E (Parse_Pos) /= ')' |
| and then E (Parse_Pos) /= ASCII.LF |
| and then E (Parse_Pos) /= '|' |
| loop |
| Parse_Piece (New_Flags, Last); |
| |
| if Last = 0 then |
| IP := 0; |
| return; |
| end if; |
| |
| Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; |
| |
| if Chain = 0 then -- First piece |
| Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; |
| else |
| Link_Tail (Chain, Last); |
| end if; |
| |
| Chain := Last; |
| end loop; |
| |
| -- Case where loop ran zero CURLY |
| |
| if Chain = 0 then |
| Discard := Emit_Node (NOTHING); |
| end if; |
| end Parse_Branch; |
| |
| --------------------------- |
| -- Parse_Character_Class -- |
| --------------------------- |
| |
| procedure Parse_Character_Class (IP : out Pointer) is |
| Bitmap : Character_Class; |
| Invert : Boolean := False; |
| In_Range : Boolean := False; |
| Named_Class : Std_Class := ANYOF_NONE; |
| Value : Character; |
| Last_Value : Character := ASCII.NUL; |
| |
| begin |
| Reset_Class (Bitmap); |
| |
| -- Do we have an invert character class ? |
| |
| if Parse_Pos <= Parse_End |
| and then Expression (Parse_Pos) = '^' |
| then |
| Invert := True; |
| Parse_Pos := Parse_Pos + 1; |
| end if; |
| |
| -- First character can be ] or - without closing the class |
| |
| if Parse_Pos <= Parse_End |
| and then (Expression (Parse_Pos) = ']' |
| or else Expression (Parse_Pos) = '-') |
| then |
| Set_In_Class (Bitmap, Expression (Parse_Pos)); |
| Parse_Pos := Parse_Pos + 1; |
| end if; |
| |
| -- While we don't have the end of the class |
| |
| while Parse_Pos <= Parse_End |
| and then Expression (Parse_Pos) /= ']' |
| loop |
| Named_Class := ANYOF_NONE; |
| Value := Expression (Parse_Pos); |
| Parse_Pos := Parse_Pos + 1; |
| |
| -- Do we have a Posix character class |
| if Value = '[' then |
| Named_Class := Parse_Posix_Character_Class; |
| |
| elsif Value = '\' then |
| if Parse_Pos = Parse_End then |
| Fail ("Trailing \"); |
| end if; |
| Value := Expression (Parse_Pos); |
| Parse_Pos := Parse_Pos + 1; |
| |
| case Value is |
| when 'w' => Named_Class := ANYOF_ALNUM; |
| when 'W' => Named_Class := ANYOF_NALNUM; |
| when 's' => Named_Class := ANYOF_SPACE; |
| when 'S' => Named_Class := ANYOF_NSPACE; |
| when 'd' => Named_Class := ANYOF_DIGIT; |
| when 'D' => Named_Class := ANYOF_NDIGIT; |
| when 'n' => Value := ASCII.LF; |
| when 'r' => Value := ASCII.CR; |
| when 't' => Value := ASCII.HT; |
| when 'f' => Value := ASCII.FF; |
| when 'e' => Value := ASCII.ESC; |
| when 'a' => Value := ASCII.BEL; |
| |
| -- when 'x' => ??? hexadecimal value |
| -- when 'c' => ??? control character |
| -- when '0'..'9' => ??? octal character |
| |
| when others => null; |
| end case; |
| end if; |
| |
| -- Do we have a character class? |
| |
| if Named_Class /= ANYOF_NONE then |
| |
| -- A range like 'a-\d' or 'a-[:digit:] is not a range |
| |
| if In_Range then |
| Set_In_Class (Bitmap, Last_Value); |
| Set_In_Class (Bitmap, '-'); |
| In_Range := False; |
| end if; |
| |
| -- Expand the range |
| |
| case Named_Class is |
| when ANYOF_NONE => null; |
| |
| when ANYOF_ALNUM | ANYOF_ALNUMC => |
| for Value in Class_Byte'Range loop |
| if Is_Alnum (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NALNUM | ANYOF_NALNUMC => |
| for Value in Class_Byte'Range loop |
| if not Is_Alnum (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_SPACE => |
| for Value in Class_Byte'Range loop |
| if Is_White_Space (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NSPACE => |
| for Value in Class_Byte'Range loop |
| if not Is_White_Space (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_DIGIT => |
| for Value in Class_Byte'Range loop |
| if Is_Digit (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NDIGIT => |
| for Value in Class_Byte'Range loop |
| if not Is_Digit (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_ALPHA => |
| for Value in Class_Byte'Range loop |
| if Is_Letter (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NALPHA => |
| for Value in Class_Byte'Range loop |
| if not Is_Letter (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_ASCII => |
| for Value in 0 .. 127 loop |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end loop; |
| |
| when ANYOF_NASCII => |
| for Value in 128 .. 255 loop |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end loop; |
| |
| when ANYOF_CNTRL => |
| for Value in Class_Byte'Range loop |
| if Is_Control (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NCNTRL => |
| for Value in Class_Byte'Range loop |
| if not Is_Control (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_GRAPH => |
| for Value in Class_Byte'Range loop |
| if Is_Graphic (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NGRAPH => |
| for Value in Class_Byte'Range loop |
| if not Is_Graphic (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_LOWER => |
| for Value in Class_Byte'Range loop |
| if Is_Lower (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NLOWER => |
| for Value in Class_Byte'Range loop |
| if not Is_Lower (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_PRINT => |
| for Value in Class_Byte'Range loop |
| if Is_Printable (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NPRINT => |
| for Value in Class_Byte'Range loop |
| if not Is_Printable (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_PUNCT => |
| for Value in Class_Byte'Range loop |
| if Is_Printable (Character'Val (Value)) |
| and then not Is_White_Space (Character'Val (Value)) |
| and then not Is_Alnum (Character'Val (Value)) |
| then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NPUNCT => |
| for Value in Class_Byte'Range loop |
| if not Is_Printable (Character'Val (Value)) |
| or else Is_White_Space (Character'Val (Value)) |
| or else Is_Alnum (Character'Val (Value)) |
| then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_UPPER => |
| for Value in Class_Byte'Range loop |
| if Is_Upper (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NUPPER => |
| for Value in Class_Byte'Range loop |
| if not Is_Upper (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_XDIGIT => |
| for Value in Class_Byte'Range loop |
| if Is_Hexadecimal_Digit (Character'Val (Value)) then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| when ANYOF_NXDIGIT => |
| for Value in Class_Byte'Range loop |
| if not Is_Hexadecimal_Digit |
| (Character'Val (Value)) |
| then |
| Set_In_Class (Bitmap, Character'Val (Value)); |
| end if; |
| end loop; |
| |
| end case; |
| |
| -- Not a character range |
| |
| elsif not In_Range then |
| Last_Value := Value; |
| |
| if Parse_Pos > Expression'Last then |
| Fail ("Empty character class []"); |
| end if; |
| |
| if Expression (Parse_Pos) = '-' |
| and then Parse_Pos < Parse_End |
| and then Expression (Parse_Pos + 1) /= ']' |
| then |
| Parse_Pos := Parse_Pos + 1; |
| In_Range := True; |
| else |
| Set_In_Class (Bitmap, Value); |
| end if; |
| |
| -- Else in a character range |
| |
| else |
| if Last_Value > Value then |
| Fail ("Invalid Range [" & Last_Value'Img |
| & "-" & Value'Img & "]"); |
| end if; |
| |
| while Last_Value <= Value loop |
| Set_In_Class (Bitmap, Last_Value); |
| Last_Value := Character'Succ (Last_Value); |
| end loop; |
| |
| In_Range := False; |
| |
| end if; |
| |
| end loop; |
| |
| -- Optimize case-insensitive ranges (put the upper case or lower |
| -- case character into the bitmap) |
| |
| if (Flags and Case_Insensitive) /= 0 then |
| for C in Character'Range loop |
| if Get_From_Class (Bitmap, C) then |
| Set_In_Class (Bitmap, To_Lower (C)); |
| Set_In_Class (Bitmap, To_Upper (C)); |
| end if; |
| end loop; |
| end if; |
| |
| -- Optimize inverted classes |
| |
| if Invert then |
| for J in Bitmap'Range loop |
| Bitmap (J) := not Bitmap (J); |
| end loop; |
| end if; |
| |
| Parse_Pos := Parse_Pos + 1; |
| |
| -- Emit the class |
| |
| IP := Emit_Node (ANYOF); |
| Emit_Class (Bitmap); |
| end Parse_Character_Class; |
| |
| ------------------- |
| -- Parse_Literal -- |
| ------------------- |
| |
| -- This is a bit tricky due to quoted chars and due to |
| -- the multiplier characters '*', '+', and '?' that |
| -- take the SINGLE char previous as their operand. |
| |
| -- On entry, the character at Parse_Pos - 1 is going to go |
| -- into the string, no matter what it is. It could be |
| -- following a \ if Parse_Atom was entered from the '\' case. |
| |
| -- Basic idea is to pick up a good char in C and examine |
| -- the next char. If Is_Mult (C) then twiddle, if it's a \ |
| -- then frozzle and if it's another magic char then push C and |
| -- terminate the string. If none of the above, push C on the |
| -- string and go around again. |
| |
| -- Start_Pos is used to remember where "the current character" |
| -- starts in the string, if due to an Is_Mult we need to back |
| -- up and put the current char in a separate 1-character string. |
| -- When Start_Pos is 0, C is the only char in the string; |
| -- this is used in Is_Mult handling, and in setting the SIMPLE |
| -- flag at the end. |
| |
| procedure Parse_Literal |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer) |
| is |
| Start_Pos : Natural := 0; |
| C : Character; |
| Length_Ptr : Pointer; |
| |
| Has_Special_Operator : Boolean := False; |
| |
| begin |
| Expr_Flags := Worst_Expression; -- Ensure Expr_Flags is initialized |
| Parse_Pos := Parse_Pos - 1; -- Look at current character |
| |
| IP := |
| Emit_Node |
| (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); |
| |
| Length_Ptr := Emit_Ptr; |
| Emit_Ptr := String_Operand (IP); |
| |
| Parse_Loop : |
| loop |
| C := Expression (Parse_Pos); -- Get current character |
| |
| case C is |
| when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => |
| |
| if Start_Pos = 0 then |
| Start_Pos := Parse_Pos; |
| Emit (C); -- First character is always emitted |
| else |
| exit Parse_Loop; -- Else we are done |
| end if; |
| |
| when '?' | '+' | '*' | '{' => |
| |
| if Start_Pos = 0 then |
| Start_Pos := Parse_Pos; |
| Emit (C); -- First character is always emitted |
| |
| -- Are we looking at an operator, or is this |
| -- simply a normal character ? |
| |
| elsif not Is_Mult (Parse_Pos) then |
| Start_Pos := Parse_Pos; |
| Case_Emit (C); |
| |
| else |
| -- We've got something like "abc?d". Mark this as a |
| -- special case. What we want to emit is a first |
| -- constant string for "ab", then one for "c" that will |
| -- ultimately be transformed with a CURLY operator, A |
| -- special case has to be handled for "a?", since there |
| -- is no initial string to emit. |
| |
| Has_Special_Operator := True; |
| exit Parse_Loop; |
| end if; |
| |
| when '\' => |
| Start_Pos := Parse_Pos; |
| |
| if Parse_Pos = Parse_End then |
| Fail ("Trailing \"); |
| |
| else |
| case Expression (Parse_Pos + 1) is |
| when 'b' | 'B' | 's' | 'S' | 'd' | 'D' |
| | 'w' | 'W' | '0' .. '9' | 'G' | 'A' |
| => exit Parse_Loop; |
| when 'n' => Emit (ASCII.LF); |
| when 't' => Emit (ASCII.HT); |
| when 'r' => Emit (ASCII.CR); |
| when 'f' => Emit (ASCII.FF); |
| when 'e' => Emit (ASCII.ESC); |
| when 'a' => Emit (ASCII.BEL); |
| when others => Emit (Expression (Parse_Pos + 1)); |
| end case; |
| |
| Parse_Pos := Parse_Pos + 1; |
| end if; |
| |
| when others => |
| Start_Pos := Parse_Pos; |
| Case_Emit (C); |
| end case; |
| |
| Parse_Pos := Parse_Pos + 1; |
| exit Parse_Loop when Parse_Pos > Parse_End |
| or else Emit_Ptr - Length_Ptr = 254; |
| end loop Parse_Loop; |
| |
| -- Is the string followed by a '*+?{' operator ? If yes, and if there |
| -- is an initial string to emit, do it now. |
| |
| if Has_Special_Operator |
| and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes |
| then |
| Emit_Ptr := Emit_Ptr - 1; |
| Parse_Pos := Start_Pos; |
| end if; |
| |
| if Length_Ptr <= PM.Size then |
| Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); |
| end if; |
| |
| Expr_Flags.Has_Width := True; |
| |
| -- Slight optimization when there is a single character |
| |
| if Emit_Ptr = Length_Ptr + 2 then |
| Expr_Flags.Simple := True; |
| end if; |
| end Parse_Literal; |
| |
| ----------------- |
| -- Parse_Piece -- |
| ----------------- |
| |
| -- Note that the branching code sequences used for '?' and the |
| -- general cases of '*' and + are somewhat optimized: they use |
| -- the same NOTHING node as both the endmarker for their branch |
| -- list and the body of the last branch. It might seem that |
| -- this node could be dispensed with entirely, but the endmarker |
| -- role is not redundant. |
| |
| procedure Parse_Piece |
| (Expr_Flags : out Expression_Flags; |
| IP : out Pointer) |
| is |
| Op : Character; |
| New_Flags : Expression_Flags; |
| Greedy : Boolean := True; |
| |
| begin |
| Parse_Atom (New_Flags, IP); |
| |
| if IP = 0 |
| or else Parse_Pos > Parse_End |
| or else not Is_Mult (Parse_Pos) |
| then |
| Expr_Flags := New_Flags; |
| return; |
| end if; |
| |
| Op := Expression (Parse_Pos); |
| |
| Expr_Flags := |
| (if Op /= '+' |
| then (SP_Start => True, others => False) |
| else (Has_Width => True, others => False)); |
| |
| -- Detect non greedy operators in the easy cases |
| |
| if Op /= '{' |
| and then Parse_Pos + 1 <= Parse_End |
| and then Expression (Parse_Pos + 1) = '?' |
| then |
| Greedy := False; |
| Parse_Pos := Parse_Pos + 1; |
| end if; |
| |
| -- Generate the byte code |
| |
| case Op is |
| when '*' => |
| |
| if New_Flags.Simple then |
| Insert_Operator (STAR, IP, Greedy); |
| else |
| Link_Tail (IP, Emit_Node (WHILEM)); |
| Insert_Curly_Operator |
| (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); |
| Link_Tail (IP, Emit_Node (NOTHING)); |
| end if; |
| |
| when '+' => |
| |
| if New_Flags.Simple then |
| Insert_Operator (PLUS, IP, Greedy); |
| else |
| Link_Tail (IP, Emit_Node (WHILEM)); |
| Insert_Curly_Operator |
| (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); |
| Link_Tail (IP, Emit_Node (NOTHING)); |
| end if; |
| |
| when '?' => |
| if New_Flags.Simple then |
| Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); |
| else |
| Link_Tail (IP, Emit_Node (WHILEM)); |
| Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); |
| Link_Tail (IP, Emit_Node (NOTHING)); |
| end if; |
| |
| when '{' => |
| declare |
| Min, Max : Natural; |
| |
| begin |
| Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); |
| |
| if New_Flags.Simple then |
| Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); |
| else |
| Link_Tail (IP, Emit_Node (WHILEM)); |
| Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); |
| Link_Tail (IP, Emit_Node (NOTHING)); |
| end if; |
| end; |
| |
| when others => |
| null; |
| end case; |
| |
| Parse_Pos := Parse_Pos + 1; |
| |
| if Parse_Pos <= Parse_End |
| and then Is_Mult (Parse_Pos) |
| then |
| Fail ("nested *+{"); |
| end if; |
| end Parse_Piece; |
| |
| --------------------------------- |
| -- Parse_Posix_Character_Class -- |
| --------------------------------- |
| |
| function Parse_Posix_Character_Class return Std_Class is |
| Invert : Boolean := False; |
| Class : Std_Class := ANYOF_NONE; |
| E : String renames Expression; |
| |
| -- Class names. Note that code assumes that the length of all |
| -- classes starting with the same letter have the same length. |
| |
| Alnum : constant String := "alnum:]"; |
| Alpha : constant String := "alpha:]"; |
| Ascii_C : constant String := "ascii:]"; |
| Cntrl : constant String := "cntrl:]"; |
| Digit : constant String := "digit:]"; |
| Graph : constant String := "graph:]"; |
| Lower : constant String := "lower:]"; |
| Print : constant String := "print:]"; |
| Punct : constant String := "punct:]"; |
| Space : constant String := "space:]"; |
| Upper : constant String := "upper:]"; |
| Word : constant String := "word:]"; |
| Xdigit : constant String := "xdigit:]"; |
| |
| begin |
| -- Case of character class specified |
| |
| if Parse_Pos <= Parse_End |
| and then Expression (Parse_Pos) = ':' |
| then |
| Parse_Pos := Parse_Pos + 1; |
| |
| -- Do we have something like: [[:^alpha:]] |
| |
| if Parse_Pos <= Parse_End |
| and then Expression (Parse_Pos) = '^' |
| then |
| Invert := True; |
| Parse_Pos := Parse_Pos + 1; |
| end if; |
| |
| -- Check for class names based on first letter |
| |
| case Expression (Parse_Pos) is |
| when 'a' => |
| |
| -- All 'a' classes have the same length (Alnum'Length) |
| |
| if Parse_Pos + Alnum'Length - 1 <= Parse_End then |
| if |
| E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum |
| then |
| Class := |
| (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); |
| Parse_Pos := Parse_Pos + Alnum'Length; |
| |
| elsif |
| E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha |
| then |
| Class := |
| (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); |
| Parse_Pos := Parse_Pos + Alpha'Length; |
| |
| elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = |
| Ascii_C |
| then |
| Class := |
| (if Invert then ANYOF_NASCII else ANYOF_ASCII); |
| Parse_Pos := Parse_Pos + Ascii_C'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'c' => |
| if Parse_Pos + Cntrl'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl |
| then |
| Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); |
| Parse_Pos := Parse_Pos + Cntrl'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'd' => |
| if Parse_Pos + Digit'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit |
| then |
| Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); |
| Parse_Pos := Parse_Pos + Digit'Length; |
| end if; |
| |
| when 'g' => |
| if Parse_Pos + Graph'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph |
| then |
| Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); |
| Parse_Pos := Parse_Pos + Graph'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'l' => |
| if Parse_Pos + Lower'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower |
| then |
| Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); |
| Parse_Pos := Parse_Pos + Lower'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'p' => |
| |
| -- All 'p' classes have the same length |
| |
| if Parse_Pos + Print'Length - 1 <= Parse_End then |
| if |
| E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print |
| then |
| Class := |
| (if Invert then ANYOF_NPRINT else ANYOF_PRINT); |
| Parse_Pos := Parse_Pos + Print'Length; |
| |
| elsif |
| E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct |
| then |
| Class := |
| (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); |
| Parse_Pos := Parse_Pos + Punct'Length; |
| |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 's' => |
| if Parse_Pos + Space'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space |
| then |
| Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); |
| Parse_Pos := Parse_Pos + Space'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'u' => |
| if Parse_Pos + Upper'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper |
| then |
| Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); |
| Parse_Pos := Parse_Pos + Upper'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'w' => |
| if Parse_Pos + Word'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word |
| then |
| Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); |
| Parse_Pos := Parse_Pos + Word'Length; |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when 'x' => |
| if Parse_Pos + Xdigit'Length - 1 <= Parse_End |
| and then |
| E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit |
| then |
| Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); |
| Parse_Pos := Parse_Pos + Xdigit'Length; |
| |
| else |
| Fail ("Invalid character class: " & E); |
| end if; |
| |
| when others => |
| Fail ("Invalid character class: " & E); |
| end case; |
| |
| -- Character class not specified |
| |
| else |
| return ANYOF_NONE; |
| end if; |
| |
| return Class; |
| end Parse_Posix_Character_Class; |
| |
| -- Local Declarations |
| |
| Result : Pointer; |
| |
| Expr_Flags : Expression_Flags; |
| |
| -- Start of processing for Compile |
| |
| begin |
| Parse (False, False, Expr_Flags, Result); |
| |
| if Result = 0 then |
| Fail ("Couldn't compile expression"); |
| end if; |
| |
| Final_Code_Size := Emit_Ptr - 1; |
| |
| -- Do we want to actually compile the expression, or simply get the |
| -- code size ??? |
| |
| if Emit_Ptr <= PM.Size then |
| Optimize (PM); |
| end if; |
| |
| PM.Flags := Flags; |
| end Compile; |
| |
| function Compile |
| (Expression : String; |
| Flags : Regexp_Flags := No_Flags) return Pattern_Matcher |
| is |
| -- Assume the compiled regexp will fit in 1000 chars. If it does not we |
| -- will have to compile a second time once the correct size is known. If |
| -- it fits, we save a significant amount of time by avoiding the second |
| -- compilation. |
| |
| Dummy : Pattern_Matcher (1000); |
| Size : Program_Size; |
| |
| begin |
| Compile (Dummy, Expression, Size, Flags); |
| |
| if Size <= Dummy.Size then |
| return Pattern_Matcher' |
| (Size => Size, |
| First => Dummy.First, |
| Anchored => Dummy.Anchored, |
| Must_Have => Dummy.Must_Have, |
| Must_Have_Length => Dummy.Must_Have_Length, |
| Paren_Count => Dummy.Paren_Count, |
| Flags => Dummy.Flags, |
| Program => |
| Dummy.Program |
| (Dummy.Program'First .. Dummy.Program'First + Size - 1)); |
| else |
| -- We have to recompile now that we know the size |
| -- ??? Can we use Ada 2005's return construct ? |
| |
| declare |
| Result : Pattern_Matcher (Size); |
| begin |
| Compile (Result, Expression, Size, Flags); |
| return Result; |
| end; |
| end if; |
| end Compile; |
| |
| procedure Compile |
| (Matcher : out Pattern_Matcher; |
| Expression : String; |
| Flags : Regexp_Flags := No_Flags) |
| is |
| Size : Program_Size; |
| |
| begin |
| Compile (Matcher, Expression, Size, Flags); |
| |
| if Size > Matcher.Size then |
| raise Expression_Error with "Pattern_Matcher is too small"; |
| end if; |
| end Compile; |
| |
| -------------------- |
| -- Dump_Operation -- |
| -------------------- |
| |
| procedure Dump_Operation |
| (Program : Program_Data; |
| Index : Pointer; |
| Indent : Natural) |
| is |
| Current : Pointer := Index; |
| begin |
| Dump_Until (Program, Current, Current + 1, Indent); |
| end Dump_Operation; |
| |
| ---------------- |
| -- Dump_Until -- |
| ---------------- |
| |
| procedure Dump_Until |
| (Program : Program_Data; |
| Index : in out Pointer; |
| Till : Pointer; |
| Indent : Natural; |
| Do_Print : Boolean := True) |
| is |
| function Image (S : String) return String; |
| -- Remove leading space |
| |
| ----------- |
| -- Image -- |
| ----------- |
| |
| function Image (S : String) return String is |
| begin |
| if S (S'First) = ' ' then |
| return S (S'First + 1 .. S'Last); |
| else |
| return S; |
| end if; |
| end Image; |
| |
| -- Local variables |
| |
| Op : Opcode; |
| Next : Pointer; |
| Length : Pointer; |
| Local_Indent : Natural := Indent; |
| |
| -- Start of processing for Dump_Until |
| |
| begin |
| while Index < Till loop |
| Op := Opcode'Val (Character'Pos ((Program (Index)))); |
| Next := Get_Next (Program, Index); |
| |
| if Do_Print then |
| declare |
| Point : constant String := Pointer'Image (Index); |
| begin |
| Put ([1 .. 4 - Point'Length => ' '] |
| & Point & ":" |
| & [1 .. Local_Indent * 2 => ' '] & Opcode'Image (Op)); |
| end; |
| |
| -- Print the parenthesis number |
| |
| if Op = OPEN or else Op = CLOSE or else Op = REFF then |
| Put (Image (Natural'Image |
| (Character'Pos |
| (Program (Index + Next_Pointer_Bytes))))); |
| end if; |
| |
| if Next = Index then |
| Put (" (-)"); |
| else |
| Put (" (" & Image (Pointer'Image (Next)) & ")"); |
| end if; |
| end if; |
| |
| case Op is |
| when ANYOF => |
| declare |
| Bitmap : Character_Class; |
| Last : Character := ASCII.NUL; |
| Current : Natural := 0; |
| Current_Char : Character; |
| |
| begin |
| Bitmap_Operand (Program, Index, Bitmap); |
| |
| if Do_Print then |
| Put ("["); |
| |
| while Current <= 255 loop |
| Current_Char := Character'Val (Current); |
| |
| -- First item in a range |
| |
| if Get_From_Class (Bitmap, Current_Char) then |
| Last := Current_Char; |
| |
| -- Search for the last item in the range |
| |
| loop |
| Current := Current + 1; |
| exit when Current > 255; |
| Current_Char := Character'Val (Current); |
| exit when |
| not Get_From_Class (Bitmap, Current_Char); |
| end loop; |
| |
| if not Is_Graphic (Last) then |
| Put (Last'Img); |
| else |
| Put (Last); |
| end if; |
| |
| if Character'Succ (Last) /= Current_Char then |
| Put ("\-" & Character'Pred (Current_Char)); |
| end if; |
| |
| else |
| Current := Current + 1; |
| end if; |
| end loop; |
| |
| Put_Line ("]"); |
| end if; |
| |
| Index := Index + Next_Pointer_Bytes + Bitmap'Length; |
| end; |
| |
| when EXACT | EXACTF => |
| Length := String_Length (Program, Index); |
| if Do_Print then |
| Put (" (" & Image (Program_Size'Image (Length + 1)) |
| & " chars) <" |
| & String (Program (String_Operand (Index) |
| .. String_Operand (Index) |
| + Length))); |
| Put_Line (">"); |
| end if; |
| |
| Index := String_Operand (Index) + Length + 1; |
| |
| -- Node operand |
| |
| when BRANCH | STAR | PLUS => |
| if Do_Print then |
| New_Line; |
| end if; |
| |
| Index := Index + Next_Pointer_Bytes; |
| Dump_Until (Program, Index, Pointer'Min (Next, Till), |
| Local_Indent + 1, Do_Print); |
| |
| when CURLY | CURLYX => |
| if Do_Print then |
| Put_Line |
| (" {" |
| & Image (Natural'Image |
| (Read_Natural (Program, Index + Next_Pointer_Bytes))) |
| & "," |
| & Image (Natural'Image (Read_Natural (Program, Index + 5))) |
| & "}"); |
| end if; |
| |
| Index := Index + 7; |
| Dump_Until (Program, Index, Pointer'Min (Next, Till), |
| Local_Indent + 1, Do_Print); |
| |
| when OPEN => |
| if Do_Print then |
| New_Line; |
| end if; |
| |
| Index := Index + 4; |
| Local_Indent := Local_Indent + 1; |
| |
| when CLOSE | REFF => |
| if Do_Print then |
| New_Line; |
| end if; |
| |
| Index := Index + 4; |
| |
| if Op = CLOSE then |
| Local_Indent := Local_Indent - 1; |
| end if; |
| |
| when others => |
| Index := Index + Next_Pointer_Bytes; |
| |
| if Do_Print then |
| New_Line; |
| end if; |
| |
| exit when Op = EOP; |
| end case; |
| end loop; |
| end Dump_Until; |
| |
| ---------- |
| -- Dump -- |
| ---------- |
| |
| procedure Dump (Self : Pattern_Matcher) is |
| Program : Program_Data renames Self.Program; |
| Index : Pointer := Program'First; |
| |
| -- Start of processing for Dump |
| |
| begin |
| Put_Line ("Must start with (Self.First) = " |
| & Character'Image (Self.First)); |
| |
| if (Self.Flags and Case_Insensitive) /= 0 then |
| Put_Line (" Case_Insensitive mode"); |
| end if; |
| |
| if (Self.Flags and Single_Line) /= 0 then |
| Put_Line (" Single_Line mode"); |
| end if; |
| |
| if (Self.Flags and Multiple_Lines) /= 0 then |
| Put_Line (" Multiple_Lines mode"); |
| end if; |
| |
| Dump_Until (Program, Index, Self.Program'Last + 1, 0); |
| end Dump; |
| |
| -------------------- |
| -- Get_From_Class -- |
| -------------------- |
| |
| function Get_From_Class |
| (Bitmap : Character_Class; |
| C : Character) return Boolean |
| is |
| Value : constant Class_Byte := Character'Pos (C); |
| begin |
| return |
| (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; |
| end Get_From_Class; |
| |
| -------------- |
| -- Get_Next -- |
| -------------- |
| |
| function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is |
| begin |
| return IP + Pointer (Read_Natural (Program, IP + 1)); |
| end Get_Next; |
| |
| -------------- |
| -- Is_Alnum -- |
| -------------- |
| |
| function Is_Alnum (C : Character) return Boolean is |
| begin |
| return Is_Alphanumeric (C) or else C = '_'; |
| end Is_Alnum; |
| |
| ------------------ |
| -- Is_Printable -- |
| ------------------ |
| |
| function Is_Printable (C : Character) return Boolean is |
| begin |
| -- Printable if space or graphic character or other whitespace |
| -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) |
| |
| return C in Character'Val (32) .. Character'Val (126) |
| or else C in ASCII.HT .. ASCII.CR; |
| end Is_Printable; |
| |
| -------------------- |
| -- Is_White_Space -- |
| -------------------- |
| |
| function Is_White_Space (C : Character) return Boolean is |
| begin |
| -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 |
| |
| return C = ' ' or else C in ASCII.HT .. ASCII.CR; |
| end Is_White_Space; |
| |
| ----------- |
| -- Match -- |
| ----------- |
| |
| procedure Match |
| (Self : Pattern_Matcher; |
| Data : String; |
| Matches : out Match_Array; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) |
| is |
| Program : Program_Data renames Self.Program; -- Shorter notation |
| |
| First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); |
| Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); |
| |
| -- Global work variables |
| |
| Input_Pos : Natural; -- String-input pointer |
| BOL_Pos : Natural; -- Beginning of input, for ^ check |
| Matched : Boolean := False; -- Until proven True |
| |
| Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, |
| Matches'Last)); |
| -- Stores the value of all the parenthesis pairs. |
| -- We do not use directly Matches, so that we can also use back |
| -- references (REFF) even if Matches is too small. |
| |
| type Natural_Array is array (Match_Count range <>) of Natural; |
| Matches_Tmp : Natural_Array (Matches_Full'Range); |
| -- Save the opening position of parenthesis |
| |
| Last_Paren : Natural := 0; |
| -- Last parenthesis seen |
| |
| Greedy : Boolean := True; |
| -- True if the next operator should be greedy |
| |
| type Current_Curly_Record; |
| type Current_Curly_Access is access all Current_Curly_Record; |
| type Current_Curly_Record is record |
| Paren_Floor : Natural; -- How far back to strip parenthesis data |
| Cur : Integer; -- How many instances of scan we've matched |
| Min : Natural; -- Minimal number of scans to match |
| Max : Natural; -- Maximal number of scans to match |
| Greedy : Boolean; -- Whether to work our way up or down |
| Scan : Pointer; -- The thing to match |
| Next : Pointer; -- What has to match after it |
| Lastloc : Natural; -- Where we started matching this scan |
| Old_Cc : Current_Curly_Access; -- Before we started this one |
| end record; |
| -- Data used to handle the curly operator and the plus and star |
| -- operators for complex expressions. |
| |
| Current_Curly : Current_Curly_Access := null; |
| -- The curly currently being processed |
| |
| ----------------------- |
| -- Local Subprograms -- |
| ----------------------- |
| |
| function Index (Start : Positive; C : Character) return Natural; |
| -- Find character C in Data starting at Start and return position |
| |
| function Repeat |
| (IP : Pointer; |
| Max : Natural := Natural'Last) return Natural; |
| -- Repeatedly match something simple, report how many |
| -- It only matches on things of length 1. |
| -- Starting from Input_Pos, it matches at most Max CURLY. |
| |
| function Try (Pos : Positive) return Boolean; |
| -- Try to match at specific point |
| |
| function Match (IP : Pointer) return Boolean; |
| -- This is the main matching routine. Conceptually the strategy |
| -- is simple: check to see whether the current node matches, |
| -- call self recursively to see whether the rest matches, |
| -- and then act accordingly. |
| -- |
| -- In practice Match makes some effort to avoid recursion, in |
| -- particular by going through "ordinary" nodes (that don't |
| -- need to know whether the rest of the match failed) by |
| -- using a loop instead of recursion. |
| -- Why is the above comment part of the spec rather than body ??? |
| |
| function Match_Whilem return Boolean; |
| -- Return True if a WHILEM matches the Current_Curly |
| |
| function Recurse_Match (IP : Pointer; From : Natural) return Boolean; |
| pragma Inline (Recurse_Match); |
| -- Calls Match recursively. It saves and restores the parenthesis |
| -- status and location in the input stream correctly, so that |
| -- backtracking is possible |
| |
| function Match_Simple_Operator |
| (Op : Opcode; |
| Scan : Pointer; |
| Next : Pointer; |
| Greedy : Boolean) return Boolean; |
| -- Return True it the simple operator (possibly non-greedy) matches |
| |
| Dump_Indent : Integer := -1; |
| procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); |
| procedure Dump_Error (Msg : String); |
| -- Debug: print the current context |
| |
| pragma Inline (Index); |
| pragma Inline (Repeat); |
| |
| -- These are two complex functions, but used only once |
| |
| pragma Inline (Match_Whilem); |
| pragma Inline (Match_Simple_Operator); |
| |
| ----------- |
| -- Index -- |
| ----------- |
| |
| function Index (Start : Positive; C : Character) return Natural is |
| begin |
| for J in Start .. Last_In_Data loop |
| if Data (J) = C then |
| return J; |
| end if; |
| end loop; |
| |
| return 0; |
| end Index; |
| |
| ------------------- |
| -- Recurse_Match -- |
| ------------------- |
| |
| function Recurse_Match (IP : Pointer; From : Natural) return Boolean is |
| L : constant Natural := Last_Paren; |
| Tmp_F : constant Match_Array := |
| Matches_Full (From + 1 .. Matches_Full'Last); |
| Start : constant Natural_Array := |
| Matches_Tmp (From + 1 .. Matches_Tmp'Last); |
| Input : constant Natural := Input_Pos; |
| |
| Dump_Indent_Save : constant Integer := Dump_Indent; |
| |
| begin |
| if Match (IP) then |
| return True; |
| end if; |
| |
| Last_Paren := L; |
| Matches_Full (Tmp_F'Range) := Tmp_F; |
| Matches_Tmp (Start'Range) := Start; |
| Input_Pos := Input; |
| Dump_Indent := Dump_Indent_Save; |
| return False; |
| end Recurse_Match; |
| |
| ------------------ |
| -- Dump_Current -- |
| ------------------ |
| |
| procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is |
| Length : constant := 10; |
| Pos : constant String := Integer'Image (Input_Pos); |
| |
| begin |
| if Prefix then |
| Put ([1 .. 5 - Pos'Length => ' ']); |
| Put (Pos & " <" |
| & Data (Input_Pos |
| .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); |
| Put ([1 .. Length - 1 - Last_In_Data + Input_Pos => ' ']); |
| Put ("> |"); |
| |
| else |
| Put (" "); |
| end if; |
| |
| Dump_Operation (Program, Scan, Indent => Dump_Indent); |
| end Dump_Current; |
| |
| ---------------- |
| -- Dump_Error -- |
| ---------------- |
| |
| procedure Dump_Error (Msg : String) is |
| begin |
| Put (" | "); |
| Put ([1 .. Dump_Indent * 2 => ' ']); |
| Put_Line (Msg); |
| end Dump_Error; |
| |
| ----------- |
| -- Match -- |
| ----------- |
| |
| function Match (IP : Pointer) return Boolean is |
| Scan : Pointer := IP; |
| Next : Pointer; |
| Op : Opcode; |
| Result : Boolean; |
| |
| begin |
| Dump_Indent := Dump_Indent + 1; |
| |
| State_Machine : |
| loop |
| pragma Assert (Scan /= 0); |
| |
| -- Determine current opcode and count its usage in debug mode |
| |
| Op := Opcode'Val (Character'Pos (Program (Scan))); |
| |
| -- Calculate offset of next instruction. Second character is most |
| -- significant in Program_Data. |
| |
| Next := Get_Next (Program, Scan); |
| |
| if Debug then |
| Dump_Current (Scan); |
| end if; |
| |
| case Op is |
| when EOP => |
| Dump_Indent := Dump_Indent - 1; |
| return True; -- Success |
| |
| when BRANCH => |
| if Program (Next) /= BRANCH then |
| Next := Operand (Scan); -- No choice, avoid recursion |
| |
| else |
| loop |
| if Recurse_Match (Operand (Scan), 0) then |
| Dump_Indent := Dump_Indent - 1; |
| return True; |
| end if; |
| |
| Scan := Get_Next (Program, Scan); |
| exit when Scan = 0 or else Program (Scan) /= BRANCH; |
| end loop; |
| |
| exit State_Machine; |
| end if; |
| |
| when NOTHING => |
| null; |
| |
| when BOL => |
| exit State_Machine when Input_Pos /= BOL_Pos |
| and then ((Self.Flags and Multiple_Lines) = 0 |
| or else Data (Input_Pos - 1) /= ASCII.LF); |
| |
| when MBOL => |
| exit State_Machine when Input_Pos /= BOL_Pos |
| and then Data (Input_Pos - 1) /= ASCII.LF; |
| |
| when SBOL => |
| exit State_Machine when Input_Pos /= BOL_Pos; |
| |
| when EOL => |
| |
| -- A combination of MEOL and SEOL |
| |
| if (Self.Flags and Multiple_Lines) = 0 then |
| |
| -- Single line mode |
| |
| exit State_Machine when Input_Pos <= Data'Last; |
| |
| elsif Input_Pos <= Last_In_Data then |
| exit State_Machine when Data (Input_Pos) /= ASCII.LF; |
| else |
| exit State_Machine when Last_In_Data /= Data'Last; |
| end if; |
| |
| when MEOL => |
| if Input_Pos <= Last_In_Data then |
| exit State_Machine when Data (Input_Pos) /= ASCII.LF; |
| else |
| exit State_Machine when Last_In_Data /= Data'Last; |
| end if; |
| |
| when SEOL => |
| |
| -- If there is a character before Data'Last (even if |
| -- Last_In_Data stops before then), we can't have the |
| -- end of the line. |
| |
| exit State_Machine when Input_Pos <= Data'Last; |
| |
| when BOUND | NBOUND => |
| |
| -- Was last char in word ? |
| |
| declare |
| N : Boolean := False; |
| Ln : Boolean := False; |
| |
| begin |
| if Input_Pos /= First_In_Data then |
| N := Is_Alnum (Data (Input_Pos - 1)); |
| end if; |
| |
| Ln := |
| (if Input_Pos > Last_In_Data |
| then False |
| else Is_Alnum (Data (Input_Pos))); |
| |
| if Op = BOUND then |
| if N = Ln then |
| exit State_Machine; |
| end if; |
| else |
| if N /= Ln then |
| exit State_Machine; |
| end if; |
| end if; |
| end; |
| |
| when SPACE => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else not Is_White_Space (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when NSPACE => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else Is_White_Space (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when DIGIT => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else not Is_Digit (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when NDIGIT => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else Is_Digit (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when ALNUM => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else not Is_Alnum (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when NALNUM => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else Is_Alnum (Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| |
| when ANY => |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else Data (Input_Pos) = ASCII.LF; |
| Input_Pos := Input_Pos + 1; |
| |
| when SANY => |
| exit State_Machine when Input_Pos > Last_In_Data; |
| Input_Pos := Input_Pos + 1; |
| |
| when EXACT => |
| declare |
| Opnd : Pointer := String_Operand (Scan); |
| Current : Positive := Input_Pos; |
| Last : constant Pointer := |
| Opnd + String_Length (Program, Scan); |
| |
| begin |
| while Opnd <= Last loop |
| exit State_Machine when Current > Last_In_Data |
| or else Program (Opnd) /= Data (Current); |
| Current := Current + 1; |
| Opnd := Opnd + 1; |
| end loop; |
| |
| Input_Pos := Current; |
| end; |
| |
| when EXACTF => |
| declare |
| Opnd : Pointer := String_Operand (Scan); |
| Current : Positive := Input_Pos; |
| |
| Last : constant Pointer := |
| Opnd + String_Length (Program, Scan); |
| |
| begin |
| while Opnd <= Last loop |
| exit State_Machine when Current > Last_In_Data |
| or else Program (Opnd) /= To_Lower (Data (Current)); |
| Current := Current + 1; |
| Opnd := Opnd + 1; |
| end loop; |
| |
| Input_Pos := Current; |
| end; |
| |
| when ANYOF => |
| declare |
| Bitmap : Character_Class; |
| begin |
| Bitmap_Operand (Program, Scan, Bitmap); |
| exit State_Machine when Input_Pos > Last_In_Data |
| or else not Get_From_Class (Bitmap, Data (Input_Pos)); |
| Input_Pos := Input_Pos + 1; |
| end; |
| |
| when OPEN => |
| declare |
| No : constant Natural := |
| Character'Pos (Program (Operand (Scan))); |
| begin |
| Matches_Tmp (No) := Input_Pos; |
| end; |
| |
| when CLOSE => |
| declare |
| No : constant Natural := |
| Character'Pos (Program (Operand (Scan))); |
| |
| begin |
| Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); |
| |
| if Last_Paren < No then |
| Last_Paren := No; |
| end if; |
| end; |
| |
| when REFF => |
| declare |
| No : constant Natural := |
| Character'Pos (Program (Operand (Scan))); |
| |
| Data_Pos : Natural; |
| |
| begin |
| -- If we haven't seen that parenthesis yet |
| |
| if Last_Paren < No then |
| Dump_Indent := Dump_Indent - 1; |
| |
| if Debug then |
| Dump_Error ("REFF: No match, backtracking"); |
| end if; |
| |
| return False; |
| end if; |
| |
| Data_Pos := Matches_Full (No).First; |
| |
| while Data_Pos <= Matches_Full (No).Last loop |
| if Input_Pos > Last_In_Data |
| or else Data (Input_Pos) /= Data (Data_Pos) |
| then |
| Dump_Indent := Dump_Indent - 1; |
| |
| if Debug then |
| Dump_Error ("REFF: No match, backtracking"); |
| end if; |
| |
| return False; |
| end if; |
| |
| Input_Pos := Input_Pos + 1; |
| Data_Pos := Data_Pos + 1; |
| end loop; |
| end; |
| |
| when MINMOD => |
| Greedy := False; |
| |
| when STAR | PLUS | CURLY => |
| declare |
| Greed : constant Boolean := Greedy; |
| begin |
| Greedy := True; |
| Result := Match_Simple_Operator (Op, Scan, Next, Greed); |
| Dump_Indent := Dump_Indent - 1; |
| return Result; |
| end; |
| |
| when CURLYX => |
| |
| -- Looking at something like: |
| |
| -- 1: CURLYX {n,m} (->4) |
| -- 2: code for complex thing (->3) |
| -- 3: WHILEM (->0) |
| -- 4: NOTHING |
| |
| declare |
| Min : constant Natural := |
| Read_Natural (Program, Scan + Next_Pointer_Bytes); |
| Max : constant Natural := |
| Read_Natural |
| (Program, Scan + Next_Pointer_Bytes + 2); |
| Cc : aliased Current_Curly_Record; |
| |
| Has_Match : Boolean; |
| |
| begin |
| Cc := (Paren_Floor => Last_Paren, |
| Cur => -1, |
| Min => Min, |
| Max => Max, |
| Greedy => Greedy, |
| Scan => Scan + 7, |
| Next => Next, |
| Lastloc => 0, |
| Old_Cc => Current_Curly); |
| Greedy := True; |
| Current_Curly := Cc'Unchecked_Access; |
| |
| Has_Match := Match (Next - Next_Pointer_Bytes); |
| |
| -- Start on the WHILEM |
| |
| Current_Curly := Cc.Old_Cc; |
| Dump_Indent := Dump_Indent - 1; |
| |
| if not Has_Match then |
| if Debug then |
| Dump_Error ("CURLYX failed..."); |
| end if; |
| end if; |
| |
| return Has_Match; |
| end; |
| |
| when WHILEM => |
| Result := Match_Whilem; |
| Dump_Indent := Dump_Indent - 1; |
| |
| if Debug and then not Result then |
| Dump_Error ("WHILEM: no match, backtracking"); |
| end if; |
| |
| return Result; |
| end case; |
| |
| Scan := Next; |
| end loop State_Machine; |
| |
| if Debug then |
| Dump_Error ("failed..."); |
| Dump_Indent := Dump_Indent - 1; |
| end if; |
| |
| -- If we get here, there is no match. For successful matches when EOP |
| -- is the terminating point. |
| |
| return False; |
| end Match; |
| |
| --------------------------- |
| -- Match_Simple_Operator -- |
| --------------------------- |
| |
| function Match_Simple_Operator |
| (Op : Opcode; |
| Scan : Pointer; |
| Next : Pointer; |
| Greedy : Boolean) return Boolean |
| is |
| Next_Char : Character := ASCII.NUL; |
| Next_Char_Known : Boolean := False; |
| No : Integer; -- Can be negative |
| Min : Natural; |
| Max : Natural := Natural'Last; |
| Operand_Code : Pointer; |
| Old : Natural; |
| Last_Pos : Natural; |
| Save : constant Natural := Input_Pos; |
| |
| begin |
| -- Lookahead to avoid useless match attempts when we know what |
| -- character comes next. |
| |
| if Program (Next) = EXACT then |
| Next_Char := Program (String_Operand (Next)); |
| Next_Char_Known := True; |
| end if; |
| |
| -- Find the minimal and maximal values for the operator |
| |
| case Op is |
| when STAR => |
| Min := 0; |
| Operand_Code := Operand (Scan); |
| |
| when PLUS => |
| Min := 1; |
| Operand_Code := Operand (Scan); |
| |
| when others => |
| Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); |
| Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); |
| Operand_Code := Scan + 7; |
| end case; |
| |
| if Debug then |
| Dump_Current (Operand_Code, Prefix => False); |
| end if; |
| |
| -- Non greedy operators |
| |
| if not Greedy then |
| |
| -- Test we can repeat at least Min times |
| |
| if Min /= 0 then |
| No := Repeat (Operand_Code, Min); |
| |
| if No < Min then |
| if Debug then |
| Dump_Error ("failed... matched" & No'Img & " times"); |
| end if; |
| |
| return False; |
| end if; |
| end if; |
| |
| Old := Input_Pos; |
| |
| -- Find the place where 'next' could work |
| |
| if Next_Char_Known then |
| |
| -- Last position to check |
| |
| if Max = Natural'Last then |
| Last_Pos := Last_In_Data; |
| else |
| Last_Pos := Input_Pos + Max; |
| |
| if Last_Pos > Last_In_Data then |
| Last_Pos := Last_In_Data; |
| end if; |
| end if; |
| |
| -- Look for the first possible opportunity |
| |
| if Debug then |
| Dump_Error ("Next_Char must be " & Next_Char); |
| end if; |
| |
| loop |
| -- Find the next possible position |
| |
| while Input_Pos <= Last_Pos |
| and then Data (Input_Pos) /= Next_Char |
| loop |
| Input_Pos := Input_Pos + 1; |
| end loop; |
| |
| if Input_Pos > Last_Pos then |
| return False; |
| end if; |
| |
| -- Check that we still match if we stop at the position we |
| -- just found. |
| |
| declare |
| Num : constant Natural := Input_Pos - Old; |
| |
| begin |
| Input_Pos := Old; |
| |
| if Debug then |
| Dump_Error ("Would we still match at that position?"); |
| end if; |
| |
| if Repeat (Operand_Code, Num) < Num then |
| return False; |
| end if; |
| end; |
| |
| -- Input_Pos now points to the new position |
| |
| if Match (Get_Next (Program, Scan)) then |
| return True; |
| end if; |
| |
| Old := Input_Pos; |
| Input_Pos := Input_Pos + 1; |
| end loop; |
| |
| -- We do not know what the next character is |
| |
| else |
| while Max >= Min loop |
| if Debug then |
| Dump_Error ("Non-greedy repeat, N=" & Min'Img); |
| Dump_Error ("Do we still match Next if we stop here?"); |
| end if; |
| |
| -- If the next character matches |
| |
| if Recurse_Match (Next, 1) then |
| return True; |
| end if; |
| |
| Input_Pos := Save + Min; |
| |
| -- Could not or did not match -- move forward |
| |
| if Repeat (Operand_Code, 1) /= 0 then |
| Min := Min + 1; |
| else |
| if Debug then |
| Dump_Error ("Non-greedy repeat failed..."); |
| end if; |
| |
| return False; |
| end if; |
| end loop; |
| end if; |
| |
| return False; |
| |
| -- Greedy operators |
| |
| else |
| No := Repeat (Operand_Code, Max); |
| |
| if Debug and then No < Min then |
| Dump_Error ("failed... matched" & No'Img & " times"); |
| end if; |
| |
| -- ??? Perl has some special code here in case the next |
| -- instruction is of type EOL, since $ and \Z can match before |
| -- *and* after newline at the end. |
| |
| -- ??? Perl has some special code here in case (paren) is True |
| |
| -- Else, if we don't have any parenthesis |
| |
| while No >= Min loop |
| if not Next_Char_Known |
| or else (Input_Pos <= Last_In_Data |
| and then Data (Input_Pos) = Next_Char) |
| then |
| if Match (Next) then |
| return True; |
| end if; |
| end if; |
| |
| -- Could not or did not work, we back up |
| |
| No := No - 1; |
| Input_Pos := Save + No; |
| end loop; |
| |
| return False; |
| end if; |
| end Match_Simple_Operator; |
| |
| ------------------ |
| -- Match_Whilem -- |
| ------------------ |
| |
| -- This is really hard to understand, because after we match what we |
| -- are trying to match, we must make sure the rest of the REx is going |
| -- to match for sure, and to do that we have to go back UP the parse |
| -- tree by recursing ever deeper. And if it fails, we have to reset |
| -- our parent's current state that we can try again after backing off. |
| |
| function Match_Whilem return Boolean is |
| Cc : constant Current_Curly_Access := Current_Curly; |
| |
| N : constant Natural := Cc.Cur + 1; |
| Ln : Natural := 0; |
| |
| Lastloc : constant Natural := Cc.Lastloc; |
| -- Detection of 0-len |
| |
| begin |
| -- If degenerate scan matches "", assume scan done |
| |
| if Input_Pos = Cc.Lastloc |
| and then N >= Cc.Min |
| then |
| -- Temporarily restore the old context, and check that we |
| -- match was comes after CURLYX. |
| |
| Current_Curly := Cc.Old_Cc; |
| |
| if Current_Curly /= null then |
| Ln := Current_Curly.Cur; |
| end if; |
| |
| if Match (Cc.Next) then |
| return True; |
| end if; |
| |
| if Current_Curly /= null then |
| Current_Curly.Cur := Ln; |
| end if; |
| |
| Current_Curly := Cc; |
| return False; |
| end if; |
| |
| -- First, just match a string of min scans |
| |
| if N < Cc.Min then |
| Cc.Cur := N; |
| Cc.Lastloc := Input_Pos; |
| |
| if Debug then |
| Dump_Error |
| ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); |
| end if; |
| |
| if Match (Cc.Scan) then |
| return True; |
| end if; |
| |
| Cc.Cur := N - 1; |
| Cc.Lastloc := Lastloc; |
| |
| if Debug then |
| Dump_Error ("failed..."); |
| end if; |
| |
| return False; |
| end if; |
| |
| -- Prefer next over scan for minimal matching |
| |
| if not Cc.Greedy then |
| Current_Curly := Cc.Old_Cc; |
| |
| if Current_Curly /= null then |
| Ln := Current_Curly.Cur; |
| end if; |
| |
| if Recurse_Match (Cc.Next, Cc.Paren_Floor) then |
| return True; |
| end if; |
| |
| if Current_Curly /= null then |
| Current_Curly.Cur := Ln; |
| end if; |
| |
| Current_Curly := Cc; |
| |
| -- Maximum greed exceeded ? |
| |
| if N >= Cc.Max then |
| if Debug then |
| Dump_Error ("failed..."); |
| end if; |
| return False; |
| end if; |
| |
| -- Try scanning more and see if it helps |
| Cc.Cur := N; |
| Cc.Lastloc := Input_Pos; |
| |
| if Debug then |
| Dump_Error ("Next failed, what about Current?"); |
| end if; |
| |
| if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then |
| return True; |
| end if; |
| |
| Cc.Cur := N - 1; |
| Cc.Lastloc := Lastloc; |
| return False; |
| end if; |
| |
| -- Prefer scan over next for maximal matching |
| |
| if N < Cc.Max then -- more greed allowed ? |
| Cc.Cur := N; |
| Cc.Lastloc := Input_Pos; |
| |
| if Debug then |
| Dump_Error ("Recurse at current position"); |
| end if; |
| |
| if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then |
| return True; |
| end if; |
| end if; |
| |
| -- Failed deeper matches of scan, so see if this one works |
| |
| Current_Curly := Cc.Old_Cc; |
| |
| if Current_Curly /= null then |
| Ln := Current_Curly.Cur; |
| end if; |
| |
| if Debug then |
| Dump_Error ("Failed matching for later positions"); |
| end if; |
| |
| if Match (Cc.Next) then |
| return True; |
| end if; |
| |
| if Current_Curly /= null then |
| Current_Curly.Cur := Ln; |
| end if; |
| |
| Current_Curly := Cc; |
| Cc.Cur := N - 1; |
| Cc.Lastloc := Lastloc; |
| |
| if Debug then |
| Dump_Error ("failed..."); |
| end if; |
| |
| return False; |
| end Match_Whilem; |
| |
| ------------ |
| -- Repeat -- |
| ------------ |
| |
| function Repeat |
| (IP : Pointer; |
| Max : Natural := Natural'Last) return Natural |
| is |
| Scan : Natural := Input_Pos; |
| Last : Natural; |
| Op : constant Opcode := |
| Opcode'Val (Character'Pos (Program (IP))); |
| Count : Natural; |
| C : Character; |
| Bitmap : Character_Class; |
| |
| begin |
| if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then |
| Last := Last_In_Data; |
| else |
| Last := Scan + Max - 1; |
| end if; |
| |
| case Op is |
| when ANY => |
| while Scan <= Last |
| and then Data (Scan) /= ASCII.LF |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when SANY => |
| Scan := Last + 1; |
| |
| when EXACT => |
| |
| -- The string has only one character if Repeat was called |
| |
| C := Program (String_Operand (IP)); |
| while Scan <= Last |
| and then C = Data (Scan) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when EXACTF => |
| |
| -- The string has only one character if Repeat was called |
| |
| C := Program (String_Operand (IP)); |
| while Scan <= Last |
| and then To_Lower (C) = Data (Scan) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when ANYOF => |
| Bitmap_Operand (Program, IP, Bitmap); |
| |
| while Scan <= Last |
| and then Get_From_Class (Bitmap, Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when ALNUM => |
| while Scan <= Last |
| and then Is_Alnum (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when NALNUM => |
| while Scan <= Last |
| and then not Is_Alnum (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when SPACE => |
| while Scan <= Last |
| and then Is_White_Space (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when NSPACE => |
| while Scan <= Last |
| and then not Is_White_Space (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when DIGIT => |
| while Scan <= Last |
| and then Is_Digit (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when NDIGIT => |
| while Scan <= Last |
| and then not Is_Digit (Data (Scan)) |
| loop |
| Scan := Scan + 1; |
| end loop; |
| |
| when others => |
| raise Program_Error; |
| end case; |
| |
| Count := Scan - Input_Pos; |
| Input_Pos := Scan; |
| return Count; |
| end Repeat; |
| |
| --------- |
| -- Try -- |
| --------- |
| |
| function Try (Pos : Positive) return Boolean is |
| begin |
| Input_Pos := Pos; |
| Last_Paren := 0; |
| Matches_Full := [others => No_Match]; |
| |
| if Match (Program_First) then |
| Matches_Full (0) := (Pos, Input_Pos - 1); |
| return True; |
| end if; |
| |
| return False; |
| end Try; |
| |
| -- Start of processing for Match |
| |
| begin |
| -- Do we have the regexp Never_Match? |
| |
| if Self.Size = 0 then |
| Matches := [others => No_Match]; |
| return; |
| end if; |
| |
| -- If there is a "must appear" string, look for it |
| |
| if Self.Must_Have_Length > 0 then |
| declare |
| First : constant Character := Program (Self.Must_Have); |
| Must_First : constant Pointer := Self.Must_Have; |
| Must_Last : constant Pointer := |
| Must_First + Pointer (Self.Must_Have_Length - 1); |
| Next_Try : Natural := Index (First_In_Data, First); |
| |
| begin |
| while Next_Try /= 0 |
| and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) |
| = String (Program (Must_First .. Must_Last)) |
| loop |
| Next_Try := Index (Next_Try + 1, First); |
| end loop; |
| |
| if Next_Try = 0 then |
| Matches := [others => No_Match]; |
| return; -- Not present |
| end if; |
| end; |
| end if; |
| |
| -- Mark beginning of line for ^ |
| |
| BOL_Pos := Data'First; |
| |
| -- Simplest case first: an anchored match need be tried only once |
| |
| if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then |
| Matched := Try (First_In_Data); |
| |
| elsif Self.Anchored then |
| declare |
| Next_Try : Natural := First_In_Data; |
| begin |
| -- Test the first position in the buffer |
| Matched := Try (Next_Try); |
| |
| -- Else only test after newlines |
| |
| if not Matched then |
| while Next_Try <= Last_In_Data loop |
| while Next_Try <= Last_In_Data |
| and then Data (Next_Try) /= ASCII.LF |
| loop |
| Next_Try := Next_Try + 1; |
| end loop; |
| |
| Next_Try := Next_Try + 1; |
| |
| if Next_Try <= Last_In_Data then |
| Matched := Try (Next_Try); |
| exit when Matched; |
| end if; |
| end loop; |
| end if; |
| end; |
| |
| elsif Self.First /= ASCII.NUL then |
| -- We know what char (modulo casing) it must start with |
| |
| if (Self.Flags and Case_Insensitive) = 0 |
| or else Self.First not in 'a' .. 'z' |
| then |
| declare |
| Next_Try : Natural := Index (First_In_Data, Self.First); |
| begin |
| while Next_Try /= 0 loop |
| Matched := Try (Next_Try); |
| exit when Matched; |
| Next_Try := Index (Next_Try + 1, Self.First); |
| end loop; |
| end; |
| else |
| declare |
| Uc_First : constant Character := To_Upper (Self.First); |
| |
| function Case_Insensitive_Index |
| (Start : Positive) return Natural; |
| -- Search for both Self.First and To_Upper (Self.First). |
| -- If both are nonzero, return the smaller one; if exactly |
| -- one is nonzero, return it; if both are zero, return zero. |
| |
| --------------------------- |
| -- Case_Insenstive_Index -- |
| --------------------------- |
| |
| function Case_Insensitive_Index |
| (Start : Positive) return Natural |
| is |
| Lc_Index : constant Natural := Index (Start, Self.First); |
| Uc_Index : constant Natural := Index (Start, Uc_First); |
| begin |
| if Lc_Index = 0 then |
| return Uc_Index; |
| elsif Uc_Index = 0 then |
| return Lc_Index; |
| else |
| return Natural'Min (Lc_Index, Uc_Index); |
| end if; |
| end Case_Insensitive_Index; |
| |
| Next_Try : Natural := Case_Insensitive_Index (First_In_Data); |
| begin |
| while Next_Try /= 0 loop |
| Matched := Try (Next_Try); |
| exit when Matched; |
| Next_Try := Case_Insensitive_Index (Next_Try + 1); |
| end loop; |
| end; |
| end if; |
| |
| else |
| -- Messy cases: try all locations (including for the empty string) |
| |
| Matched := Try (First_In_Data); |
| |
| if not Matched then |
| for S in First_In_Data + 1 .. Last_In_Data loop |
| Matched := Try (S); |
| exit when Matched; |
| end loop; |
| end if; |
| end if; |
| |
| -- Matched has its value |
| |
| for J in Last_Paren + 1 .. Matches'Last loop |
| Matches_Full (J) := No_Match; |
| end loop; |
| |
| Matches := Matches_Full (Matches'Range); |
| end Match; |
| |
| ----------- |
| -- Match -- |
| ----------- |
| |
| function Match |
| (Self : Pattern_Matcher; |
| Data : String; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) return Natural |
| is |
| Matches : Match_Array (0 .. 0); |
| |
| begin |
| Match (Self, Data, Matches, Data_First, Data_Last); |
| if Matches (0) = No_Match then |
| return Data'First - 1; |
| else |
| return Matches (0).First; |
| end if; |
| end Match; |
| |
| function Match |
| (Self : Pattern_Matcher; |
| Data : String; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) return Boolean |
| is |
| Matches : Match_Array (0 .. 0); |
| |
| begin |
| Match (Self, Data, Matches, Data_First, Data_Last); |
| return Matches (0).First >= Data'First; |
| end Match; |
| |
| procedure Match |
| (Expression : String; |
| Data : String; |
| Matches : out Match_Array; |
| Size : Program_Size := Auto_Size; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) |
| is |
| PM : Pattern_Matcher (Size); |
| Finalize_Size : Program_Size; |
| begin |
| if Size = 0 then |
| Match (Compile (Expression), Data, Matches, Data_First, Data_Last); |
| else |
| Compile (PM, Expression, Finalize_Size); |
| Match (PM, Data, Matches, Data_First, Data_Last); |
| end if; |
| end Match; |
| |
| ----------- |
| -- Match -- |
| ----------- |
| |
| function Match |
| (Expression : String; |
| Data : String; |
| Size : Program_Size := Auto_Size; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) return Natural |
| is |
| PM : Pattern_Matcher (Size); |
| Final_Size : Program_Size; |
| begin |
| if Size = 0 then |
| return Match (Compile (Expression), Data, Data_First, Data_Last); |
| else |
| Compile (PM, Expression, Final_Size); |
| return Match (PM, Data, Data_First, Data_Last); |
| end if; |
| end Match; |
| |
| ----------- |
| -- Match -- |
| ----------- |
| |
| function Match |
| (Expression : String; |
| Data : String; |
| Size : Program_Size := Auto_Size; |
| Data_First : Integer := -1; |
| Data_Last : Positive := Positive'Last) return Boolean |
| is |
| Matches : Match_Array (0 .. 0); |
| PM : Pattern_Matcher (Size); |
| Final_Size : Program_Size; |
| begin |
| if Size = 0 then |
| Match (Compile (Expression), Data, Matches, Data_First, Data_Last); |
| else |
| Compile (PM, Expression, Final_Size); |
| Match (PM, Data, Matches, Data_First, Data_Last); |
| end if; |
| |
| return Matches (0).First >= Data'First; |
| end Match; |
| |
| ------------- |
| -- Operand -- |
| ------------- |
| |
| function Operand (P : Pointer) return Pointer is |
| begin |
| return P + Next_Pointer_Bytes; |
| end Operand; |
| |
| -------------- |
| -- Optimize -- |
| -------------- |
| |
| procedure Optimize (Self : in out Pattern_Matcher) is |
| Scan : Pointer; |
| Program : Program_Data renames Self.Program; |
| |
| begin |
| -- Start with safe defaults (no optimization): |
| -- * No known first character of match |
| -- * Does not necessarily start at beginning of line |
| -- * No string known that has to appear in data |
| |
| Self.First := ASCII.NUL; |
| Self.Anchored := False; |
| Self.Must_Have := Program'Last + 1; |
| Self.Must_Have_Length := 0; |
| |
| Scan := Program_First; -- First instruction (can be anything) |
| |
| if Program (Scan) = EXACT then |
| Self.First := Program (String_Operand (Scan)); |
| |
| elsif Program (Scan) = EXACTF then |
| Self.First := To_Lower (Program (String_Operand (Scan))); |
| |
| elsif Program (Scan) = BOL |
| or else Program (Scan) = SBOL |
| or else Program (Scan) = MBOL |
| then |
| Self.Anchored := True; |
| end if; |
| end Optimize; |
| |
| ----------------- |
| -- Paren_Count -- |
| ----------------- |
| |
| function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is |
| begin |
| return Regexp.Paren_Count; |
| end Paren_Count; |
| |
| ----------- |
| -- Quote -- |
| ----------- |
| |
| function Quote (Str : String) return String is |
| S : String (1 .. Str'Length * 2); |
| Last : Natural := 0; |
| |
| begin |
| for J in Str'Range loop |
| case Str (J) is |
| when '^' | '$' | '|' | '*' | '+' | '?' | '{' | |
| '}' | '[' | ']' | '(' | ')' | '\' | '.' => |
| |
| S (Last + 1) := '\'; |
| S (Last + 2) := Str (J); |
| Last := Last + 2; |
| |
| when others => |
| S (Last + 1) := Str (J); |
| Last := Last + 1; |
| end case; |
| end loop; |
| |
| return S (1 .. Last); |
| end Quote; |
| |
| ------------------ |
| -- Read_Natural -- |
| ------------------ |
| |
| function Read_Natural |
| (Program : Program_Data; |
| IP : Pointer) return Natural |
| is |
| begin |
| return Character'Pos (Program (IP)) + |
| 256 * Character'Pos (Program (IP + 1)); |
| end Read_Natural; |
| |
| ----------------- |
| -- Reset_Class -- |
| ----------------- |
| |
| procedure Reset_Class (Bitmap : out Character_Class) is |
| begin |
| Bitmap := [others => 0]; |
| end Reset_Class; |
| |
| ------------------ |
| -- Set_In_Class -- |
| ------------------ |
| |
| procedure Set_In_Class |
| (Bitmap : in out Character_Class; |
| C : Character) |
| is |
| Value : constant Class_Byte := Character'Pos (C); |
| begin |
| Bitmap (Value / 8) := Bitmap (Value / 8) |
| or Bit_Conversion (Value mod 8); |
| end Set_In_Class; |
| |
| ------------------- |
| -- String_Length -- |
| ------------------- |
| |
| function String_Length |
| (Program : Program_Data; |
| P : Pointer) return Program_Size |
| is |
| begin |
| pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); |
| return Character'Pos (Program (P + Next_Pointer_Bytes)); |
| end String_Length; |
| |
| -------------------- |
| -- String_Operand -- |
| -------------------- |
| |
| function String_Operand (P : Pointer) return Pointer is |
| begin |
| return P + 4; |
| end String_Operand; |
| |
| end System.Regpat; |