| /* Extended regular expression matching and search library, |
| version 0.12. |
| (Implements POSIX draft P1003.2/D11.2, except for some of the |
| internationalization features.) |
| |
| Copyright (C) 1993-2021 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, write to the Free |
| Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA |
| 02110-1301 USA. */ |
| |
| /* This file has been modified for usage in libiberty. It includes "xregex.h" |
| instead of <regex.h>. The "xregex.h" header file renames all external |
| routines with an "x" prefix so they do not collide with the native regex |
| routines or with other components' regex routines. */ |
| /* AIX requires this to be the first thing in the file. */ |
| #if defined _AIX && !defined __GNUC__ && !defined REGEX_MALLOC |
| #pragma alloca |
| #endif |
| |
| #undef _GNU_SOURCE |
| #define _GNU_SOURCE |
| |
| #ifndef INSIDE_RECURSION |
| # ifdef HAVE_CONFIG_H |
| # include <config.h> |
| # endif |
| #endif |
| |
| #include <ansidecl.h> |
| |
| #ifndef INSIDE_RECURSION |
| |
| # if defined STDC_HEADERS && !defined emacs |
| # include <stddef.h> |
| # define PTR_INT_TYPE ptrdiff_t |
| # else |
| /* We need this for `regex.h', and perhaps for the Emacs include files. */ |
| # include <sys/types.h> |
| # define PTR_INT_TYPE long |
| # endif |
| |
| # define WIDE_CHAR_SUPPORT (HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_BTOWC) |
| |
| /* For platforms which support the ISO C amendment 1 functionality we |
| support user defined character classes. */ |
| # if defined _LIBC || WIDE_CHAR_SUPPORT |
| /* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ |
| # include <wchar.h> |
| # include <wctype.h> |
| # endif |
| |
| # ifdef _LIBC |
| /* We have to keep the namespace clean. */ |
| # define regfree(preg) __regfree (preg) |
| # define regexec(pr, st, nm, pm, ef) __regexec (pr, st, nm, pm, ef) |
| # define regcomp(preg, pattern, cflags) __regcomp (preg, pattern, cflags) |
| # define regerror(errcode, preg, errbuf, errbuf_size) \ |
| __regerror(errcode, preg, errbuf, errbuf_size) |
| # define re_set_registers(bu, re, nu, st, en) \ |
| __re_set_registers (bu, re, nu, st, en) |
| # define re_match_2(bufp, string1, size1, string2, size2, pos, regs, stop) \ |
| __re_match_2 (bufp, string1, size1, string2, size2, pos, regs, stop) |
| # define re_match(bufp, string, size, pos, regs) \ |
| __re_match (bufp, string, size, pos, regs) |
| # define re_search(bufp, string, size, startpos, range, regs) \ |
| __re_search (bufp, string, size, startpos, range, regs) |
| # define re_compile_pattern(pattern, length, bufp) \ |
| __re_compile_pattern (pattern, length, bufp) |
| # define re_set_syntax(syntax) __re_set_syntax (syntax) |
| # define re_search_2(bufp, st1, s1, st2, s2, startpos, range, regs, stop) \ |
| __re_search_2 (bufp, st1, s1, st2, s2, startpos, range, regs, stop) |
| # define re_compile_fastmap(bufp) __re_compile_fastmap (bufp) |
| |
| # define btowc __btowc |
| |
| /* We are also using some library internals. */ |
| # include <locale/localeinfo.h> |
| # include <locale/elem-hash.h> |
| # include <langinfo.h> |
| # include <locale/coll-lookup.h> |
| # endif |
| |
| /* This is for other GNU distributions with internationalized messages. */ |
| # if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC |
| # include <libintl.h> |
| # ifdef _LIBC |
| # undef gettext |
| # define gettext(msgid) __dcgettext ("libc", msgid, LC_MESSAGES) |
| # endif |
| # else |
| # define gettext(msgid) (msgid) |
| # endif |
| |
| # ifndef gettext_noop |
| /* This define is so xgettext can find the internationalizable |
| strings. */ |
| # define gettext_noop(String) String |
| # endif |
| |
| /* The `emacs' switch turns on certain matching commands |
| that make sense only in Emacs. */ |
| # ifdef emacs |
| |
| # include "lisp.h" |
| # include "buffer.h" |
| # include "syntax.h" |
| |
| # else /* not emacs */ |
| |
| /* If we are not linking with Emacs proper, |
| we can't use the relocating allocator |
| even if config.h says that we can. */ |
| # undef REL_ALLOC |
| |
| # if defined STDC_HEADERS || defined _LIBC |
| # include <stdlib.h> |
| # else |
| char *malloc (); |
| char *realloc (); |
| # endif |
| |
| /* When used in Emacs's lib-src, we need to get bzero and bcopy somehow. |
| If nothing else has been done, use the method below. */ |
| # ifdef INHIBIT_STRING_HEADER |
| # if !(defined HAVE_BZERO && defined HAVE_BCOPY) |
| # if !defined bzero && !defined bcopy |
| # undef INHIBIT_STRING_HEADER |
| # endif |
| # endif |
| # endif |
| |
| /* This is the normal way of making sure we have a bcopy and a bzero. |
| This is used in most programs--a few other programs avoid this |
| by defining INHIBIT_STRING_HEADER. */ |
| # ifndef INHIBIT_STRING_HEADER |
| # if defined HAVE_STRING_H || defined STDC_HEADERS || defined _LIBC |
| # include <string.h> |
| # ifndef bzero |
| # ifndef _LIBC |
| # define bzero(s, n) ((void) memset (s, '\0', n)) |
| # else |
| # define bzero(s, n) __bzero (s, n) |
| # endif |
| # endif |
| # else |
| # include <strings.h> |
| # ifndef memcmp |
| # define memcmp(s1, s2, n) bcmp (s1, s2, n) |
| # endif |
| # ifndef memcpy |
| # define memcpy(d, s, n) (bcopy (s, d, n), (d)) |
| # endif |
| # endif |
| # endif |
| |
| /* Define the syntax stuff for \<, \>, etc. */ |
| |
| /* This must be nonzero for the wordchar and notwordchar pattern |
| commands in re_match_2. */ |
| # ifndef Sword |
| # define Sword 1 |
| # endif |
| |
| # ifdef SWITCH_ENUM_BUG |
| # define SWITCH_ENUM_CAST(x) ((int)(x)) |
| # else |
| # define SWITCH_ENUM_CAST(x) (x) |
| # endif |
| |
| # endif /* not emacs */ |
| |
| # if defined _LIBC || HAVE_LIMITS_H |
| # include <limits.h> |
| # endif |
| |
| # ifndef MB_LEN_MAX |
| # define MB_LEN_MAX 1 |
| # endif |
| |
| /* Get the interface, including the syntax bits. */ |
| # include "xregex.h" /* change for libiberty */ |
| |
| /* isalpha etc. are used for the character classes. */ |
| # include <ctype.h> |
| |
| /* Jim Meyering writes: |
| |
| "... Some ctype macros are valid only for character codes that |
| isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when |
| using /bin/cc or gcc but without giving an ansi option). So, all |
| ctype uses should be through macros like ISPRINT... If |
| STDC_HEADERS is defined, then autoconf has verified that the ctype |
| macros don't need to be guarded with references to isascii. ... |
| Defining isascii to 1 should let any compiler worth its salt |
| eliminate the && through constant folding." |
| Solaris defines some of these symbols so we must undefine them first. */ |
| |
| # undef ISASCII |
| # if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII) |
| # define ISASCII(c) 1 |
| # else |
| # define ISASCII(c) isascii(c) |
| # endif |
| |
| # ifdef isblank |
| # define ISBLANK(c) (ISASCII (c) && isblank (c)) |
| # else |
| # define ISBLANK(c) ((c) == ' ' || (c) == '\t') |
| # endif |
| # ifdef isgraph |
| # define ISGRAPH(c) (ISASCII (c) && isgraph (c)) |
| # else |
| # define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c)) |
| # endif |
| |
| # undef ISPRINT |
| # define ISPRINT(c) (ISASCII (c) && isprint (c)) |
| # define ISDIGIT(c) (ISASCII (c) && isdigit (c)) |
| # define ISALNUM(c) (ISASCII (c) && isalnum (c)) |
| # define ISALPHA(c) (ISASCII (c) && isalpha (c)) |
| # define ISCNTRL(c) (ISASCII (c) && iscntrl (c)) |
| # define ISLOWER(c) (ISASCII (c) && islower (c)) |
| # define ISPUNCT(c) (ISASCII (c) && ispunct (c)) |
| # define ISSPACE(c) (ISASCII (c) && isspace (c)) |
| # define ISUPPER(c) (ISASCII (c) && isupper (c)) |
| # define ISXDIGIT(c) (ISASCII (c) && isxdigit (c)) |
| |
| # ifdef _tolower |
| # define TOLOWER(c) _tolower(c) |
| # else |
| # define TOLOWER(c) tolower(c) |
| # endif |
| |
| # ifndef NULL |
| # define NULL (void *)0 |
| # endif |
| |
| /* We remove any previous definition of `SIGN_EXTEND_CHAR', |
| since ours (we hope) works properly with all combinations of |
| machines, compilers, `char' and `unsigned char' argument types. |
| (Per Bothner suggested the basic approach.) */ |
| # undef SIGN_EXTEND_CHAR |
| # if __STDC__ |
| # define SIGN_EXTEND_CHAR(c) ((signed char) (c)) |
| # else /* not __STDC__ */ |
| /* As in Harbison and Steele. */ |
| # define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128) |
| # endif |
| |
| # ifndef emacs |
| /* How many characters in the character set. */ |
| # define CHAR_SET_SIZE 256 |
| |
| # ifdef SYNTAX_TABLE |
| |
| extern char *re_syntax_table; |
| |
| # else /* not SYNTAX_TABLE */ |
| |
| static char re_syntax_table[CHAR_SET_SIZE]; |
| |
| static void init_syntax_once (void); |
| |
| /* Fill in re_syntax_table the first time this is called: every |
|    character code below CHAR_SET_SIZE for which ISALNUM holds, and the |
|    underscore, get the value Sword; every other entry is zero.  Calls |
|    after the first return without touching the table.  */ |
| static void |
| init_syntax_once (void) |
| { |
|   static int table_ready = 0; |
|   register int ch; |
| |
|   if (!table_ready) |
|     { |
|       /* Start from an all-zero table, then mark the word constituents.  */ |
|       bzero (re_syntax_table, sizeof re_syntax_table); |
| |
|       ch = 0; |
|       while (ch < CHAR_SET_SIZE) |
|         { |
|           if (ISALNUM (ch)) |
|             re_syntax_table[ch] = Sword; |
|           ++ch; |
|         } |
| |
|       re_syntax_table['_'] = Sword; |
| |
|       table_ready = 1; |
|     } |
| } |
| |
| # endif /* not SYNTAX_TABLE */ |
| |
| # define SYNTAX(c) re_syntax_table[(unsigned char) (c)] |
| |
| # endif /* emacs */ |
| |
| /* Integer type for pointers. */ |
| # if !defined _LIBC && !defined HAVE_UINTPTR_T |
| typedef unsigned long int uintptr_t; |
| # endif |
| |
| /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we |
| use `alloca' instead of `malloc'. This is because using malloc in |
| re_search* or re_match* could cause memory leaks when C-g is used in |
| Emacs; also, malloc is slower and causes storage fragmentation. On |
| the other hand, malloc is more portable, and easier to debug. |
| |
| Because we sometimes use alloca, some routines have to be macros, |
| not functions -- `alloca'-allocated space disappears at the end of the |
| function it is called in. */ |
| |
| # ifdef REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE malloc |
| # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize) |
| # define REGEX_FREE free |
| |
| # else /* not REGEX_MALLOC */ |
| |
| /* Emacs already defines alloca, sometimes. */ |
| # ifndef alloca |
| |
| /* Make alloca work the best possible way. */ |
| # ifdef __GNUC__ |
| # define alloca __builtin_alloca |
| # else /* not __GNUC__ */ |
| # if HAVE_ALLOCA_H |
| # include <alloca.h> |
| # endif /* HAVE_ALLOCA_H */ |
| # endif /* not __GNUC__ */ |
| |
| # endif /* not alloca */ |
| |
| # define REGEX_ALLOCATE alloca |
| |
| /* Assumes a `char *destination' variable. */ |
| # define REGEX_REALLOCATE(source, osize, nsize) \ |
| (destination = (char *) alloca (nsize), \ |
| memcpy (destination, source, osize)) |
| |
| /* No need to do anything to free, after alloca. */ |
| # define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */ |
| |
| # endif /* not REGEX_MALLOC */ |
| |
| /* Define how to allocate the failure stack. */ |
| |
| # if defined REL_ALLOC && defined REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE_STACK(size) \ |
| r_alloc (&failure_stack_ptr, (size)) |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
| r_re_alloc (&failure_stack_ptr, (nsize)) |
| # define REGEX_FREE_STACK(ptr) \ |
| r_alloc_free (&failure_stack_ptr) |
| |
| # else /* not using relocating allocator */ |
| |
| # ifdef REGEX_MALLOC |
| |
| # define REGEX_ALLOCATE_STACK malloc |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize) |
| # define REGEX_FREE_STACK free |
| |
| # else /* not REGEX_MALLOC */ |
| |
| # define REGEX_ALLOCATE_STACK alloca |
| |
| # define REGEX_REALLOCATE_STACK(source, osize, nsize) \ |
| REGEX_REALLOCATE (source, osize, nsize) |
| /* No need to explicitly free anything. */ |
| # define REGEX_FREE_STACK(arg) |
| |
| # endif /* not REGEX_MALLOC */ |
| # endif /* not using relocating allocator */ |
| |
| |
| /* True if `size1' is nonzero and PTR is pointing anywhere inside |
| `string1' or just past its end. This works if PTR is NULL, which is |
| a good thing. */ |
| # define FIRST_STRING_P(ptr) \ |
| (size1 && string1 <= (ptr) && (ptr) <= string1 + size1) |
| |
| /* (Re)Allocate N items of type T using malloc, or fail. */ |
| # define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t))) |
| # define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t))) |
| # define RETALLOC_IF(addr, n, t) \ |
| if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t) |
| # define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t))) |
| |
| # define BYTEWIDTH 8 /* In bits. */ |
| |
| # define STREQ(s1, s2) ((strcmp (s1, s2) == 0)) |
| |
| # undef MAX |
| # undef MIN |
| # define MAX(a, b) ((a) > (b) ? (a) : (b)) |
| # define MIN(a, b) ((a) < (b) ? (a) : (b)) |
| |
| typedef char boolean; |
| # define false 0 |
| # define true 1 |
| |
| static reg_errcode_t byte_regex_compile (const char *pattern, size_t size, |
| reg_syntax_t syntax, |
| struct re_pattern_buffer *bufp); |
| |
| static int byte_re_match_2_internal (struct re_pattern_buffer *bufp, |
| const char *string1, int size1, |
| const char *string2, int size2, |
| int pos, |
| struct re_registers *regs, |
| int stop); |
| static int byte_re_search_2 (struct re_pattern_buffer *bufp, |
| const char *string1, int size1, |
| const char *string2, int size2, |
| int startpos, int range, |
| struct re_registers *regs, int stop); |
| static int byte_re_compile_fastmap (struct re_pattern_buffer *bufp); |
| |
| #ifdef MBS_SUPPORT |
| static reg_errcode_t wcs_regex_compile (const char *pattern, size_t size, |
| reg_syntax_t syntax, |
| struct re_pattern_buffer *bufp); |
| |
| |
| static int wcs_re_match_2_internal (struct re_pattern_buffer *bufp, |
| const char *cstring1, int csize1, |
| const char *cstring2, int csize2, |
| int pos, |
| struct re_registers *regs, |
| int stop, |
| wchar_t *string1, int size1, |
| wchar_t *string2, int size2, |
| int *mbs_offset1, int *mbs_offset2); |
| static int wcs_re_search_2 (struct re_pattern_buffer *bufp, |
| const char *string1, int size1, |
| const char *string2, int size2, |
| int startpos, int range, |
| struct re_registers *regs, int stop); |
| static int wcs_re_compile_fastmap (struct re_pattern_buffer *bufp); |
| #endif |
| |
| /* These are the command codes that appear in compiled regular |
| expressions. Some opcodes are followed by argument bytes. A |
| command code can specify any interpretation whatsoever for its |
| arguments. Zero bytes may appear in the compiled regular expression. */ |
| |
| /* Opcodes of the compiled pattern. Only no_op has an explicit value (0); |
| every later enumerator takes the next integer, so the numeric value of |
| an opcode depends on its position in this list and on which of the |
| conditionally compiled entries (exactn_bin under MBS_SUPPORT, the |
| trailing group under emacs) are present. */ |
| typedef enum |
| { |
| no_op = 0, |
| |
| /* Succeed right away--no more backtracking. */ |
| succeed, |
| |
| /* Followed by one byte giving n, then by n literal bytes. */ |
| exactn, |
| |
| # ifdef MBS_SUPPORT |
| /* Same as exactn, but contains binary data. */ |
| exactn_bin, |
| # endif |
| |
| /* Matches any (more or less) character. */ |
| anychar, |
| |
| /* Matches any one char belonging to specified set. First |
| following byte is number of bitmap bytes. Then come bytes |
| for a bitmap saying which chars are in. Bits in each byte |
| are ordered low-bit-first. A character is in the set if its |
| bit is 1. A character too large to have a bit in the map is |
| automatically not in the set. */ |
| /* ifdef MBS_SUPPORT, following element is length of character |
| classes, length of collating symbols, length of equivalence |
| classes, length of character ranges, and length of characters. |
| Next, character class element, collating symbols elements, |
| equivalence class elements, range elements, and character |
| elements follow. |
| See regex_compile function. */ |
| charset, |
| |
| /* Same parameters as charset, but match any character that is |
| not one of those specified. */ |
| charset_not, |
| |
| /* Start remembering the text that is matched, for storing in a |
| register. Followed by one byte with the register number, in |
| the range 0 to one less than the pattern buffer's re_nsub |
| field. Then followed by one byte with the number of groups |
| inner to this one. (This last has to be part of the |
| start_memory only because we need it in the on_failure_jump |
| of re_match_2.) */ |
| start_memory, |
| |
| /* Stop remembering the text that is matched and store it in a |
| memory register. Followed by one byte with the register |
| number, in the range 0 to one less than `re_nsub' in the |
| pattern buffer, and one byte with the number of inner groups, |
| just like `start_memory'. (We need the number of inner |
| groups here because we don't have any easy way of finding the |
| corresponding start_memory when we're at a stop_memory.) */ |
| stop_memory, |
| |
| /* Match a duplicate of something remembered. Followed by one |
| byte containing the register number. */ |
| duplicate, |
| |
| /* Fail unless at beginning of line. */ |
| begline, |
| |
| /* Fail unless at end of line. */ |
| endline, |
| |
| /* Succeeds if at beginning of buffer (if emacs) or at beginning |
| of string to be matched (if not). */ |
| begbuf, |
| |
| /* Analogously, for end of buffer/string. */ |
| endbuf, |
| |
| /* Followed by two-byte relative address to which to jump. */ |
| jump, |
| |
| /* Same as jump, but marks the end of an alternative. */ |
| jump_past_alt, |
| |
| /* Followed by two-byte relative address of place to resume at |
| in case of failure. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| on_failure_jump, |
| |
| /* Like on_failure_jump, but pushes a placeholder instead of the |
| current string position when executed. */ |
| on_failure_keep_string_jump, |
| |
| /* Throw away latest failure point and then jump to following |
| two-byte relative address. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| pop_failure_jump, |
| |
| /* Change to pop_failure_jump if we know we won't have to backtrack |
| to match; otherwise change to jump. This is used to jump |
| back to the beginning of a repeat. If what follows this jump |
| clearly won't match what the repeat does, such that we can be |
| sure that there is no use backtracking out of repetitions |
| already matched, then we change it to a pop_failure_jump. |
| Followed by two-byte address. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| maybe_pop_jump, |
| |
| /* Jump to following two-byte address, and push a dummy failure |
| point. This failure point will be thrown away if an attempt |
| is made to use it for a failure. A `+' construct makes this |
| before the first repeat. Also used as an intermediary kind |
| of jump when compiling an alternative. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| dummy_failure_jump, |
| |
| /* Push a dummy failure point and continue. Used at the end of |
| alternatives. */ |
| push_dummy_failure, |
| |
| /* Followed by two-byte relative address and two-byte number n. |
| After matching N times, jump to the address upon failure. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| succeed_n, |
| |
| /* Followed by two-byte relative address, and two-byte number n. |
| Jump to the address N times, then fail. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| jump_n, |
| |
| /* Set the following two-byte relative address to the |
| subsequent two-byte number. The address *includes* the two |
| bytes of number. */ |
| /* ifdef MBS_SUPPORT, the size of address is 1. */ |
| set_number_at, |
| |
| wordchar, /* Matches any word-constituent character. */ |
| notwordchar, /* Matches any char that is not a word-constituent. */ |
| |
| wordbeg, /* Succeeds if at word beginning. */ |
| wordend, /* Succeeds if at word end. */ |
| |
| wordbound, /* Succeeds if at a word boundary. */ |
| notwordbound /* Succeeds if not at a word boundary. */ |
| |
| /* The Emacs-only opcodes below are appended after notwordbound; the |
| leading comma on before_dot supplies the separator that |
| notwordbound itself omits. */ |
| # ifdef emacs |
| ,before_dot, /* Succeeds if before point. */ |
| at_dot, /* Succeeds if at point. */ |
| after_dot, /* Succeeds if after point. */ |
| |
| /* Matches any character whose syntax is specified. Followed by |
| a byte which contains a syntax code, e.g., Sword. */ |
| syntaxspec, |
| |
| /* Matches any character whose syntax is not that specified. */ |
| notsyntaxspec |
| # endif /* emacs */ |
| } re_opcode_t; |
| #endif /* not INSIDE_RECURSION */ |
| |
| |
| #ifdef BYTE |
| # define CHAR_T char |
| # define UCHAR_T unsigned char |
| # define COMPILED_BUFFER_VAR bufp->buffer |
| # define OFFSET_ADDRESS_SIZE 2 |
| # define PREFIX(name) byte_##name |
| # define ARG_PREFIX(name) name |
| # define PUT_CHAR(c) putchar (c) |
| #else |
| # ifdef WCHAR |
| # define CHAR_T wchar_t |
| # define UCHAR_T wchar_t |
| # define COMPILED_BUFFER_VAR wc_buffer |
| # define OFFSET_ADDRESS_SIZE 1 /* the size which STORE_NUMBER macro use */ |
| # define CHAR_CLASS_SIZE ((__alignof__(wctype_t)+sizeof(wctype_t))/sizeof(CHAR_T)+1) |
| # define PREFIX(name) wcs_##name |
| # define ARG_PREFIX(name) c##name |
| /* Should we use wide stream??  The expansion deliberately carries no |
|    trailing semicolon, like the BYTE definition of PUT_CHAR, so that |
|    `PUT_CHAR (c);' is exactly one statement and stays valid as the body |
|    of an unbraced `if' that is followed by `else'.  */ |
| # define PUT_CHAR(c) printf ("%C", c) |
| # define TRUE 1 |
| # define FALSE 0 |
| # else |
| # ifdef MBS_SUPPORT |
| # define WCHAR |
| # define INSIDE_RECURSION |
| # include "regex.c" |
| # undef INSIDE_RECURSION |
| # endif |
| # define BYTE |
| # define INSIDE_RECURSION |
| # include "regex.c" |
| # undef INSIDE_RECURSION |
| # endif |
| #endif |
| |
| #ifdef INSIDE_RECURSION |
| /* Common operations on the compiled pattern. */ |
| |
| /* Store NUMBER in two contiguous bytes starting at DESTINATION. */ |
| /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ |
| |
| # ifdef WCHAR |
| # define STORE_NUMBER(destination, number) \ |
| do { \ |
| *(destination) = (UCHAR_T)(number); \ |
| } while (0) |
| # else /* BYTE */ |
| # define STORE_NUMBER(destination, number) \ |
| do { \ |
| (destination)[0] = (number) & 0377; \ |
| (destination)[1] = (number) >> 8; \ |
| } while (0) |
| # endif /* WCHAR */ |
| |
| /* Same as STORE_NUMBER, except increment DESTINATION to |
| the byte after where the number is stored. Therefore, DESTINATION |
| must be an lvalue. */ |
| /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ |
| |
| # define STORE_NUMBER_AND_INCR(destination, number) \ |
| do { \ |
| STORE_NUMBER (destination, number); \ |
| (destination) += OFFSET_ADDRESS_SIZE; \ |
| } while (0) |
| |
| /* Put into DESTINATION a number stored in two contiguous bytes starting |
| at SOURCE. */ |
| /* ifdef MBS_SUPPORT, we store NUMBER in 1 element. */ |
| |
| # ifdef WCHAR |
| # define EXTRACT_NUMBER(destination, source) \ |
| do { \ |
| (destination) = *(source); \ |
| } while (0) |
| # else /* BYTE */ |
| # define EXTRACT_NUMBER(destination, source) \ |
| do { \ |
| (destination) = *(source) & 0377; \ |
| (destination) += ((unsigned) SIGN_EXTEND_CHAR (*((source) + 1))) << 8; \ |
| } while (0) |
| # endif |
| |
| # ifdef DEBUG |
| /* Function form of EXTRACT_NUMBER, compiled only when DEBUG is defined: |
|    store in *DEST the number encoded at SOURCE.  In the WCHAR build the |
|    number is the single element *SOURCE.  In the BYTE build it occupies |
|    two bytes: SOURCE[0] is the low byte and SOURCE[1], sign-extended, is |
|    the high byte.  */ |
| static void PREFIX(extract_number) (int *dest, UCHAR_T *source); |
| static void |
| PREFIX(extract_number) (int *dest, UCHAR_T *source) |
| { |
| # ifdef WCHAR |
|   *dest = *source; |
| # else /* BYTE */ |
|   int temp = SIGN_EXTEND_CHAR (*(source + 1)); |
|   *dest = *source & 0377; |
|   /* TEMP is negative whenever the encoded number is negative, and |
|      left-shifting a negative signed value is undefined behavior.  Do |
|      the shift on an unsigned value, exactly as the EXTRACT_NUMBER |
|      macro does.  */ |
|   *dest += ((unsigned) temp) << 8; |
| # endif |
| } |
| |
| # ifndef EXTRACT_MACROS /* To debug the macros. */ |
| # undef EXTRACT_NUMBER |
| # define EXTRACT_NUMBER(dest, src) PREFIX(extract_number) (&dest, src) |
| # endif /* not EXTRACT_MACROS */ |
| |
| # endif /* DEBUG */ |
| |
| /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number. |
| SOURCE must be an lvalue. */ |
| |
| # define EXTRACT_NUMBER_AND_INCR(destination, source) \ |
| do { \ |
| EXTRACT_NUMBER (destination, source); \ |
| (source) += OFFSET_ADDRESS_SIZE; \ |
| } while (0) |
| |
| # ifdef DEBUG |
| /* Function form of EXTRACT_NUMBER_AND_INCR, compiled only when DEBUG is |
|    defined: decode the number found at *SOURCE into *DESTINATION, then |
|    advance *SOURCE by OFFSET_ADDRESS_SIZE elements so that it points |
|    just past that number.  */ |
| static void PREFIX(extract_number_and_incr) (int *destination, |
|                                              UCHAR_T **source); |
| static void |
| PREFIX(extract_number_and_incr) (int *destination, UCHAR_T **source) |
| { |
|   UCHAR_T *encoded = *source; |
| |
|   PREFIX(extract_number) (destination, encoded); |
|   *source = encoded + OFFSET_ADDRESS_SIZE; |
| } |
| |
| # ifndef EXTRACT_MACROS |
| # undef EXTRACT_NUMBER_AND_INCR |
| # define EXTRACT_NUMBER_AND_INCR(dest, src) \ |
| PREFIX(extract_number_and_incr) (&dest, &src) |
| # endif /* not EXTRACT_MACROS */ |
| |
| # endif /* DEBUG */ |
| |
| |
| |
| /* If DEBUG is defined, Regex prints many voluminous messages about what |
| it is doing (if the variable `debug' is nonzero). If linked with the |
| main program in `iregex.c', you can enter patterns and strings |
| interactively. And if linked with the main program in `main.c' and |
| the other test files, you can run the already-written tests. */ |
| |
| # ifdef DEBUG |
| |
| # ifndef DEFINED_ONCE |
| |
| /* We use standard I/O for debugging. */ |
| # include <stdio.h> |
| |
| /* It is useful to test things that ``must'' be true when debugging. */ |
| # include <assert.h> |
| |
| static int debug; |
| |
| # define DEBUG_STATEMENT(e) e |
| # define DEBUG_PRINT1(x) if (debug) printf (x) |
| # define DEBUG_PRINT2(x1, x2) if (debug) printf (x1, x2) |
| # define DEBUG_PRINT3(x1, x2, x3) if (debug) printf (x1, x2, x3) |
| # define DEBUG_PRINT4(x1, x2, x3, x4) if (debug) printf (x1, x2, x3, x4) |
| # endif /* not DEFINED_ONCE */ |
| |
| # define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \ |
| if (debug) PREFIX(print_partial_compiled_pattern) (s, e) |
| # define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \ |
| if (debug) PREFIX(print_double_string) (w, s1, sz1, s2, sz2) |
| |
| |
| /* Print the fastmap in human-readable form. */ |
| |
| # ifndef DEFINED_ONCE |
| /* Write to standard output every index below 1 << BYTEWIDTH whose |
|    FASTMAP entry is nonzero, as characters.  A run of two or more |
|    consecutive set entries is written as FIRST-LAST; an isolated entry |
|    is written as the single character.  A newline ends the output.  */ |
| void |
| print_fastmap (char *fastmap) |
| { |
|   const unsigned limit = 1 << BYTEWIDTH; |
|   unsigned pos = 0; |
| |
|   while (pos < limit) |
|     { |
|       unsigned first; |
| |
|       if (!fastmap[pos]) |
|         { |
|           pos++; |
|           continue; |
|         } |
| |
|       /* POS is the first set entry of a run: print it, then skip to |
|          the entry just past the end of the run.  */ |
|       first = pos++; |
|       putchar (first); |
|       while (pos < limit && fastmap[pos]) |
|         pos++; |
| |
|       /* Only runs longer than one entry get the "-LAST" suffix.  */ |
|       if (pos - 1 != first) |
|         { |
|           printf ("-"); |
|           putchar (pos - 1); |
|         } |
|     } |
|   putchar ('\n'); |
| } |
| # endif /* not DEFINED_ONCE */ |
| |
| |
| /* Print a compiled pattern string in human-readable form, starting at |
|    the START pointer into it and ending just before the pointer END. |
| |
|    Each command is written on its own line, preceded by its offset from |
|    START and a tab.  Jump targets and `set_number_at' locations are |
|    likewise printed as offsets from START.  An opcode that is not |
|    recognized is printed as `?' followed by its numeric value.  A final |
|    line gives the offset at which decoding stopped.  If START is NULL, |
|    only "(null)" is printed.  */ |
| |
| void |
| PREFIX(print_partial_compiled_pattern) (UCHAR_T *start, UCHAR_T *end) |
| { |
|   int mcnt, mcnt2; |
|   UCHAR_T *p1; |
|   UCHAR_T *p = start; |
|   UCHAR_T *pend = end; |
| |
|   if (start == NULL) |
|     { |
|       printf ("(null)\n"); |
|       return; |
|     } |
| |
|   /* Loop over pattern commands. */ |
|   while (p < pend) |
|     { |
|       /* Offsets are pointer differences (ptrdiff_t): print them with |
|          %td where that is known to be available, otherwise cast to |
|          long and use %ld.  */ |
| # ifdef _LIBC |
|       printf ("%td:\t", p - start); |
| # else |
|       printf ("%ld:\t", (long int) (p - start)); |
| # endif |
| |
|       switch ((re_opcode_t) *p++) |
|         { |
|         case no_op: |
|           printf ("/no_op"); |
|           break; |
| |
|         case exactn: |
|           /* One count element, then that many literal elements.  */ |
|           mcnt = *p++; |
|           printf ("/exactn/%d", mcnt); |
|           do |
|             { |
|               putchar ('/'); |
|               PUT_CHAR (*p++); |
|             } |
|           while (--mcnt); |
|           break; |
| |
| # ifdef MBS_SUPPORT |
|         case exactn_bin: |
|           /* Same layout as exactn, but the elements are shown in hex.  */ |
|           mcnt = *p++; |
|           printf ("/exactn_bin/%d", mcnt); |
|           do |
|             { |
|               printf("/%lx", (long int) *p++); |
|             } |
|           while (--mcnt); |
|           break; |
| # endif /* MBS_SUPPORT */ |
| |
|         case start_memory: |
|           mcnt = *p++; |
|           printf ("/start_memory/%d/%ld", mcnt, (long int) *p++); |
|           break; |
| |
|         case stop_memory: |
|           mcnt = *p++; |
|           printf ("/stop_memory/%d/%ld", mcnt, (long int) *p++); |
|           break; |
| |
|         case duplicate: |
|           printf ("/duplicate/%ld", (long int) *p++); |
|           break; |
| |
|         case anychar: |
|           printf ("/anychar"); |
|           break; |
| |
|         case charset: |
|         case charset_not: |
|           { |
| # ifdef WCHAR |
|             int i, length; |
|             wchar_t *workp = p; |
|             printf ("/charset [%s", |
|                     (re_opcode_t) *(workp - 1) == charset_not ? "^" : ""); |
|             /* WORKP walks the five length fields read below while P is |
|                moved past them to the data they describe.  */ |
|             p += 5; |
|             length = *workp++; /* the length of char_classes */ |
|             for (i=0 ; i<length ; i++) |
|               printf("[:%lx:]", (long int) *p++); |
|             length = *workp++; /* the length of collating_symbol */ |
|             for (i=0 ; i<length ;) |
|               { |
|                 printf("[."); |
|                 /* Each symbol is zero-terminated; I counts every |
|                    element consumed, terminator included.  */ |
|                 while(*p != 0) |
|                   PUT_CHAR((i++,*p++)); |
|                 i++,p++; |
|                 printf(".]"); |
|               } |
|             length = *workp++; /* the length of equivalence_class */ |
|             for (i=0 ; i<length ;) |
|               { |
|                 printf("[="); |
|                 while(*p != 0) |
|                   PUT_CHAR((i++,*p++)); |
|                 i++,p++; |
|                 printf("=]"); |
|               } |
|             length = *workp++; /* the length of char_range */ |
|             for (i=0 ; i<length ; i++) |
|               { |
|                 wchar_t range_start = *p++; |
|                 wchar_t range_end = *p++; |
|                 printf("%C-%C", range_start, range_end); |
|               } |
|             length = *workp++; /* the length of char */ |
|             for (i=0 ; i<length ; i++) |
|               printf("%C", *p++); |
|             putchar (']'); |
| # else |
|             register int c, last = -100; |
|             register int in_range = 0; |
| |
|             printf ("/charset [%s", |
|                     (re_opcode_t) *(p - 1) == charset_not ? "^" : ""); |
| |
|             assert (p + *p < pend); |
| |
|             /* *P is the number of bitmap bytes; bit C % 8 of bitmap |
|                byte C / 8 says whether character C is in the set.  */ |
|             for (c = 0; c < 256; c++) |
|               if (c / 8 < *p |
|                   && (p[1 + (c/8)] & (1 << (c % 8)))) |
|                 { |
|                   /* Are we starting a range? */ |
|                   if (last + 1 == c && ! in_range) |
|                     { |
|                       putchar ('-'); |
|                       in_range = 1; |
|                     } |
|                   /* Have we broken a range? */ |
|                   else if (last + 1 != c && in_range) |
|                     { |
|                       putchar (last); |
|                       in_range = 0; |
|                     } |
| |
|                   if (! in_range) |
|                     putchar (c); |
| |
|                   last = c; |
|                 } |
| |
|             if (in_range) |
|               putchar (last); |
| |
|             putchar (']'); |
| |
|             /* Skip the count element and the bitmap.  */ |
|             p += 1 + *p; |
| # endif /* WCHAR */ |
|           } |
|           break; |
| |
|         case begline: |
|           printf ("/begline"); |
|           break; |
| |
|         case endline: |
|           printf ("/endline"); |
|           break; |
| |
|         case on_failure_jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/on_failure_jump to %td", p + mcnt - start); |
| # else |
|           printf ("/on_failure_jump to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case on_failure_keep_string_jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/on_failure_keep_string_jump to %td", p + mcnt - start); |
| # else |
|           printf ("/on_failure_keep_string_jump to %ld", |
|                   (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case dummy_failure_jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/dummy_failure_jump to %td", p + mcnt - start); |
| # else |
|           printf ("/dummy_failure_jump to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case push_dummy_failure: |
|           printf ("/push_dummy_failure"); |
|           break; |
| |
|         case maybe_pop_jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/maybe_pop_jump to %td", p + mcnt - start); |
| # else |
|           printf ("/maybe_pop_jump to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case pop_failure_jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/pop_failure_jump to %td", p + mcnt - start); |
| # else |
|           printf ("/pop_failure_jump to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case jump_past_alt: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/jump_past_alt to %td", p + mcnt - start); |
| # else |
|           printf ("/jump_past_alt to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case jump: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
| # ifdef _LIBC |
|           printf ("/jump to %td", p + mcnt - start); |
| # else |
|           printf ("/jump to %ld", (long int) (p + mcnt - start)); |
| # endif |
|           break; |
| |
|         case succeed_n: |
|           /* The target is relative to the position just after the |
|              address, so compute it before reading the count.  */ |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
|           p1 = p + mcnt; |
|           PREFIX(extract_number_and_incr) (&mcnt2, &p); |
| # ifdef _LIBC |
|           printf ("/succeed_n to %td, %d times", p1 - start, mcnt2); |
| # else |
|           printf ("/succeed_n to %ld, %d times", |
|                   (long int) (p1 - start), mcnt2); |
| # endif |
|           break; |
| |
|         case jump_n: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
|           p1 = p + mcnt; |
|           PREFIX(extract_number_and_incr) (&mcnt2, &p); |
|           /* P1 - START is a ptrdiff_t, not an int; passing it to %d is |
|              a format/argument mismatch, so print it the same way as |
|              every other offset in this function.  */ |
| # ifdef _LIBC |
|           printf ("/jump_n to %td, %d times", p1 - start, mcnt2); |
| # else |
|           printf ("/jump_n to %ld, %d times", |
|                   (long int) (p1 - start), mcnt2); |
| # endif |
|           break; |
| |
|         case set_number_at: |
|           PREFIX(extract_number_and_incr) (&mcnt, &p); |
|           p1 = p + mcnt; |
|           PREFIX(extract_number_and_incr) (&mcnt2, &p); |
| # ifdef _LIBC |
|           printf ("/set_number_at location %td to %d", p1 - start, mcnt2); |
| # else |
|           printf ("/set_number_at location %ld to %d", |
|                   (long int) (p1 - start), mcnt2); |
| # endif |
|           break; |
| |
|         case wordbound: |
|           printf ("/wordbound"); |
|           break; |
| |
|         case notwordbound: |
|           printf ("/notwordbound"); |
|           break; |
| |
|         case wordbeg: |
|           printf ("/wordbeg"); |
|           break; |
| |
|         case wordend: |
|           printf ("/wordend"); |
|           break; |
| |
| # ifdef emacs |
|         case before_dot: |
|           printf ("/before_dot"); |
|           break; |
| |
|         case at_dot: |
|           printf ("/at_dot"); |
|           break; |
| |
|         case after_dot: |
|           printf ("/after_dot"); |
|           break; |
| |
|         case syntaxspec: |
|           printf ("/syntaxspec"); |
|           mcnt = *p++; |
|           printf ("/%d", mcnt); |
|           break; |
| |
|         case notsyntaxspec: |
|           printf ("/notsyntaxspec"); |
|           mcnt = *p++; |
|           printf ("/%d", mcnt); |
|           break; |
| # endif /* emacs */ |
| |
|         case wordchar: |
|           printf ("/wordchar"); |
|           break; |
| |
|         case notwordchar: |
|           printf ("/notwordchar"); |
|           break; |
| |
|         case begbuf: |
|           printf ("/begbuf"); |
|           break; |
| |
|         case endbuf: |
|           printf ("/endbuf"); |
|           break; |
| |
|         default: |
|           /* Unrecognized opcode: show its numeric value.  */ |
|           printf ("?%ld", (long int) *(p-1)); |
|         } |
| |
|       putchar ('\n'); |
|     } |
| |
| # ifdef _LIBC |
|   printf ("%td:\tend of pattern.\n", p - start); |
| # else |
|   printf ("%ld:\tend of pattern.\n", (long int) (p - start)); |
| # endif |
| } |
| |
| |
| void |
| PREFIX(print_compiled_pattern) (struct re_pattern_buffer *bufp) |
| { |
| UCHAR_T *buffer = (UCHAR_T*) bufp->buffer; |
| |
| PREFIX(print_partial_compiled_pattern) (buffer, buffer |
| + bufp->used / sizeof(UCHAR_T)); |
| printf ("%ld bytes used/%ld bytes allocated.\n", |
| bufp->used, bufp->allocated); |
| |
| if (bufp->fastmap_accurate && bufp->fastmap) |
| { |
| printf ("fastmap: "); |
| print_fastmap (bufp->fastmap); |
| } |
| |
| # ifdef _LIBC |
| printf ("re_nsub: %Zd\t", bufp->re_nsub); |
| # else |
| printf ("re_nsub: %ld\t", (long int) bufp->re_nsub); |
| # endif |
| printf ("regs_alloc: %d\t", bufp->regs_allocated); |
| printf ("can_be_null: %d\t", bufp->can_be_null); |
| printf ("newline_anchor: %d\n", bufp->newline_anchor); |
| printf ("no_sub: %d\t", bufp->no_sub); |
| printf ("not_bol: %d\t", bufp->not_bol); |
| printf ("not_eol: %d\t", bufp->not_eol); |
| printf ("syntax: %lx\n", bufp->syntax); |
| /* Perhaps we should print the translate table? */ |
| } |
| |
| |
| void |
| PREFIX(print_double_string) (const CHAR_T *where, const CHAR_T *string1, |
| int size1, const CHAR_T *string2, int size2) |
| { |
| int this_char; |
| |
| if (where == NULL) |
| printf ("(null)"); |
| else |
| { |
| int cnt; |
| |
| if (FIRST_STRING_P (where)) |
| { |
| for (this_char = where - string1; this_char < size1; this_char++) |
| PUT_CHAR (string1[this_char]); |
| |
| where = string2; |
| } |
| |
| cnt = 0; |
| for (this_char = where - string2; this_char < size2; this_char++) |
| { |
| PUT_CHAR (string2[this_char]); |
| if (++cnt > 100) |
| { |
| fputs ("...", stdout); |
| break; |
| } |
| } |
| } |
| } |
| |
| # ifndef DEFINED_ONCE |
/* Write the single character C to stderr.  */
void
printchar (int c)
{
  (void) fputc (c, stderr);
}
| # endif |
| |
| # else /* not DEBUG */ |
| |
/* Without DEBUG every debugging hook expands to nothing.  */
# ifndef DEFINED_ONCE
/* Assertions are compiled out as well.  */
#  undef assert
#  define assert(e)

#  define DEBUG_STATEMENT(e)
#  define DEBUG_PRINT1(x)
#  define DEBUG_PRINT2(x1, x2)
#  define DEBUG_PRINT3(x1, x2, x3)
#  define DEBUG_PRINT4(x1, x2, x3, x4)
# endif /* not DEFINED_ONCE */

/* These two sit outside the DEFINED_ONCE guard, so they are defined
   again whenever this part of the file is processed.  */
# define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
# define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
| |
| # endif /* not DEBUG */ |
| |
| |
| |
| # ifdef WCHAR |
| /* This convert a multibyte string to a wide character string. |
| And write their correspondances to offset_buffer(see below) |
| and write whether each wchar_t is binary data to is_binary. |
| This assume invalid multibyte sequences as binary data. |
| We assume offset_buffer and is_binary is already allocated |
| enough space. */ |
| |
| static size_t convert_mbs_to_wcs (CHAR_T *dest, const unsigned char* src, |
| size_t len, int *offset_buffer, |
| char *is_binary); |
| static size_t |
| convert_mbs_to_wcs (CHAR_T *dest, const unsigned char*src, size_t len, |
| int *offset_buffer, char *is_binary) |
| /* It hold correspondances between src(char string) and |
| dest(wchar_t string) for optimization. |
| e.g. src = "xxxyzz" |
| dest = {'X', 'Y', 'Z'} |
| (each "xxx", "y" and "zz" represent one multibyte character |
| corresponding to 'X', 'Y' and 'Z'.) |
| offset_buffer = {0, 0+3("xxx"), 0+3+1("y"), 0+3+1+2("zz")} |
| = {0, 3, 4, 6} |
| */ |
| { |
| wchar_t *pdest = dest; |
| const unsigned char *psrc = src; |
| size_t wc_count = 0; |
| |
| mbstate_t mbs; |
| int i, consumed; |
| size_t mb_remain = len; |
| size_t mb_count = 0; |
| |
| /* Initialize the conversion state. */ |
| memset (&mbs, 0, sizeof (mbstate_t)); |
| |
| offset_buffer[0] = 0; |
| for( ; mb_remain > 0 ; ++wc_count, ++pdest, mb_remain -= consumed, |
| psrc += consumed) |
| { |
| #ifdef _LIBC |
| consumed = __mbrtowc (pdest, psrc, mb_remain, &mbs); |
| #else |
| consumed = mbrtowc (pdest, psrc, mb_remain, &mbs); |
| #endif |
| |
| if (consumed <= 0) |
| /* failed to convert. maybe src contains binary data. |
| So we consume 1 byte manualy. */ |
| { |
| *pdest = *psrc; |
| consumed = 1; |
| is_binary[wc_count] = TRUE; |
| } |
| else |
| is_binary[wc_count] = FALSE; |
| /* In sjis encoding, we use yen sign as escape character in |
| place of reverse solidus. So we convert 0x5c(yen sign in |
| sjis) to not 0xa5(yen sign in UCS2) but 0x5c(reverse |
| solidus in UCS2). */ |
| if (consumed == 1 && (int) *psrc == 0x5c && (int) *pdest == 0xa5) |
| *pdest = (wchar_t) *psrc; |
| |
| offset_buffer[wc_count + 1] = mb_count += consumed; |
| } |
| |
| /* Fill remain of the buffer with sentinel. */ |
| for (i = wc_count + 1 ; i <= len ; i++) |
| offset_buffer[i] = mb_count + 1; |
| |
| return wc_count; |
| } |
| |
| # endif /* WCHAR */ |
| |
| #else /* not INSIDE_RECURSION */ |
| |
/* The regexp syntax currently in force.  `re_set_syntax' stores into
   it, and it may also be assigned to directly: every pattern buffer
   keeps its own copy of the syntax, so changing this between regex
   compilations is fine.  */
/* Deliberately left without an initializer, because initialized
   variables in Emacs become read-only after dumping.  */
reg_syntax_t re_syntax_options;


/* Select the exact regexp syntax used for compilation, for the sake of
   utilities that historically understand different, incompatible
   syntaxes.

   SYNTAX is a bit mask built from the bits defined in regex.h.  The
   value returned is the syntax that was in force before the call.  */

reg_syntax_t
re_set_syntax (reg_syntax_t syntax)
{
  const reg_syntax_t previous = re_syntax_options;

# ifdef DEBUG
  /* The debug switch simply mirrors the RE_DEBUG bit of SYNTAX.  */
  debug = (syntax & RE_DEBUG) ? 1 : 0;
# endif /* DEBUG */

  re_syntax_options = syntax;
  return previous;
}
| # ifdef _LIBC |
| weak_alias (__re_set_syntax, re_set_syntax) |
| # endif |
| |
| /* This table gives an error message for each of the error codes listed |
| in regex.h. Obviously the order here has to be same as there. |
| POSIX doesn't require that we do anything for REG_NOERROR, |
| but why not be nice? */ |
| |
| static const char *re_error_msgid[] = |
| { |
| gettext_noop ("Success"), /* REG_NOERROR */ |
| gettext_noop ("No match"), /* REG_NOMATCH */ |
| gettext_noop ("Invalid regular expression"), /* REG_BADPAT */ |
| gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */ |
| gettext_noop ("Invalid character class name"), /* REG_ECTYPE */ |
| gettext_noop ("Trailing backslash"), /* REG_EESCAPE */ |
| gettext_noop ("Invalid back reference"), /* REG_ESUBREG */ |
| gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */ |
| gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */ |
| gettext_noop ("Unmatched \\{"), /* REG_EBRACE */ |
| gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */ |
| gettext_noop ("Invalid range end"), /* REG_ERANGE */ |
| gettext_noop ("Memory exhausted"), /* REG_ESPACE */ |
| gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */ |
| gettext_noop ("Premature end of regular expression"), /* REG_EEND */ |
| gettext_noop ("Regular expression too big"), /* REG_ESIZE */ |
| gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ |
| }; |
| |
| #endif /* INSIDE_RECURSION */ |
| |
#ifndef DEFINED_ONCE
/* Avoiding alloca during matching, to placate r_alloc.  */

/* MATCH_MAY_ALLOCATE is defined unless the searching and matching
   functions have to be kept from calling alloca.  On some systems
   alloca is implemented on top of malloc; with the relocating
   allocator in use, a malloc call could trigger a relocation which,
   if the strings being searched live in the ralloc heap, would move
   the data out from under the regexp routines.

   A second reason to avoid allocation: Emacs processes input from X in
   a signal handler, and processing X input may call malloc.  If input
   arrives while a matching routine is inside malloc, we are in
   trouble.  Emacs cannot simply block input around the matching
   routines, because interrupts would then go unnoticed; so it blocks
   input around every regexp call except the matching calls, and leaves
   those unprotected, trusting them not to malloc.  */

/* The normal case: allocation is allowed.  */
# define MATCH_MAY_ALLOCATE

/* With GNU C we are never really using the C alloca, whatever
   config.h may claim, so no precautions are needed for it.  */
# if defined __GNUC__
#  undef C_ALLOCA
# endif

/* The match routines may not allocate when (1) they would do so with
   malloc and (2) malloc is not safe for them to use.
   Note that with REL_ALLOC defined, matching would not use malloc for
   the failure stack but would still use it for the register vectors,
   so REL_ALLOC should not affect this.  */
# if defined emacs && (defined C_ALLOCA || defined REGEX_MALLOC)
#  undef MATCH_MAY_ALLOCATE
# endif
#endif /* not DEFINED_ONCE */
| |
| #ifdef INSIDE_RECURSION |
| /* Failure stack declarations and macros; both re_compile_fastmap and |
| re_match_2 use a failure stack. These have to be macros because of |
| REGEX_ALLOCATE_STACK. */ |
| |
| |
/* How many failure points to make room for when matching starts.  This
   is only the initial allocation, not a hard limit: more space is
   allocated once it is exceeded.  A definition supplied before this
   point takes precedence.  */
# if !defined INIT_FAILURE_ALLOC
#  define INIT_FAILURE_ALLOC 5
# endif
| |
| /* Roughly the maximum number of failure points on the stack. Would be |
| exactly that if always used MAX_FAILURE_ITEMS items each time we failed. |
| This is a variable only so users of regex can assign to it; we never |
| change it ourselves. */ |
| |
| # ifdef INT_IS_16BIT |
| |
| # ifndef DEFINED_ONCE |
| # if defined MATCH_MAY_ALLOCATE |
| /* 4400 was enough to cause a crash on Alpha OSF/1, |
| whose default stack limit is 2mb. */ |
| long int re_max_failures = 4000; |
| # else |
| long int re_max_failures = 2000; |
| # endif |
| # endif |
| |
| union PREFIX(fail_stack_elt) |
| { |
| UCHAR_T *pointer; |
| long int integer; |
| }; |
| |
| typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); |
| |
| typedef struct |
| { |
| PREFIX(fail_stack_elt_t) *stack; |
| unsigned long int size; |
| unsigned long int avail; /* Offset of next open position. */ |
| } PREFIX(fail_stack_type); |
| |
| # else /* not INT_IS_16BIT */ |
| |
| # ifndef DEFINED_ONCE |
| # if defined MATCH_MAY_ALLOCATE |
| /* 4400 was enough to cause a crash on Alpha OSF/1, |
| whose default stack limit is 2mb. */ |
| int re_max_failures = 4000; |
| # else |
| int re_max_failures = 2000; |
| # endif |
| # endif |
| |
| union PREFIX(fail_stack_elt) |
| { |
| UCHAR_T *pointer; |
| int integer; |
| }; |
| |
| typedef union PREFIX(fail_stack_elt) PREFIX(fail_stack_elt_t); |
| |
| typedef struct |
| { |
| PREFIX(fail_stack_elt_t) *stack; |
| unsigned size; |
| unsigned avail; /* Offset of next open position. */ |
| } PREFIX(fail_stack_type); |
| |
| # endif /* INT_IS_16BIT */ |
| |
# ifndef DEFINED_ONCE
/* Predicates on the failure stack.  The first and last look at the
   variable `fail_stack'; the middle one looks through the pointer
   `fail_stack_ptr'.  */
#  define FAIL_STACK_EMPTY()     (!fail_stack.avail)
#  define FAIL_STACK_PTR_EMPTY() (!fail_stack_ptr->avail)
#  define FAIL_STACK_FULL()      (fail_stack.size == fail_stack.avail)
# endif
| |
| |
/* Set up and tear down the failure stack held in the variable
   `fail_stack'.

   When matching may allocate, INIT_FAIL_STACK obtains room for
   INIT_FAILURE_ALLOC slots and does `return -2' from the enclosing
   function if that allocation fails; RESET_FAIL_STACK releases the
   storage again.  Otherwise INIT_FAIL_STACK merely marks the stack as
   empty and RESET_FAIL_STACK does nothing.  */

# ifdef MATCH_MAY_ALLOCATE
#  define INIT_FAIL_STACK()                                             \
  do                                                                    \
    {                                                                   \
      fail_stack.stack = (PREFIX(fail_stack_elt_t) *)                   \
        REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC                        \
                              * sizeof (PREFIX(fail_stack_elt_t)));     \
                                                                        \
      if (fail_stack.stack == NULL)                                     \
        return -2;                                                      \
                                                                        \
      fail_stack.size = INIT_FAILURE_ALLOC;                             \
      fail_stack.avail = 0;                                             \
    }                                                                   \
  while (0)

#  define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
# else
#  define INIT_FAIL_STACK()                                             \
  do                                                                    \
    {                                                                   \
      fail_stack.avail = 0;                                             \
    }                                                                   \
  while (0)

#  define RESET_FAIL_STACK()
# endif
| |
| |
/* Double the capacity of FAIL_STACK, up to approximately
   `re_max_failures' items.

   The value is 1 on success.  It is 0 when the stack was already
   larger than the limit (nothing is touched in that case) or when the
   reallocation ran out of memory (the `stack' member has then been
   overwritten with NULL and `size' is unchanged).

   REGEX_REALLOCATE_STACK requires `destination' be declared.  */

# define DOUBLE_FAIL_STACK(fail_stack)                                  \
  ((fail_stack).size <= (unsigned) (re_max_failures * MAX_FAILURE_ITEMS) \
   && ((fail_stack).stack = (PREFIX(fail_stack_elt_t) *)                \
       REGEX_REALLOCATE_STACK ((fail_stack).stack,                      \
         (fail_stack).size * sizeof (PREFIX(fail_stack_elt_t)),         \
         ((fail_stack).size << 1) * sizeof (PREFIX(fail_stack_elt_t)))) \
      != NULL                                                           \
   && ((fail_stack).size <<= 1, 1))
| |
| |
/* Push pointer POINTER on FAIL_STACK.
   Return 1 if was able to do so and 0 if ran out of memory allocating
   space to do so.

   The fullness test is made on the FAIL_STACK argument itself, so the
   macro works for any stack it is handed, and POINTER is parenthesized
   so that an arbitrary expression can be passed.  */
# define PUSH_PATTERN_OP(POINTER, FAIL_STACK)                           \
  ((((FAIL_STACK).avail == (FAIL_STACK).size)                           \
    && !DOUBLE_FAIL_STACK (FAIL_STACK))                                 \
   ? 0                                                                  \
   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = (POINTER),     \
      1))
| |
/* Primitive pushes onto the failure stack.  All three store into the
   next free slot of the variable `fail_stack' and advance `avail'; none
   of them checks for room, so they should probably only be used from
   within `PUSH_FAILURE_POINT'.  */

/* Push ITEM as a pointer (it is cast to UCHAR_T *).  */
# define PUSH_FAILURE_POINTER(item)                                     \
  fail_stack.stack[fail_stack.avail++].pointer = (UCHAR_T *) (item)

/* Push ITEM as an integer.  */
# define PUSH_FAILURE_INT(item)                                         \
  fail_stack.stack[fail_stack.avail++].integer = (item)

/* Push ITEM as a whole fail_stack_elt_t.  */
# define PUSH_FAILURE_ELT(item)                                         \
  fail_stack.stack[fail_stack.avail++] = (item)

/* The matching pops.  Each steps `avail' back by one and yields the
   requested view of that slot.  All assume that `fail_stack' is
   nonempty.  */
# define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
# define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
# define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]

/* Failure point ids are pushed and popped only when debugging; in
   other builds these two expand to nothing.  */
# ifdef DEBUG
#  define DEBUG_PUSH PUSH_FAILURE_INT
#  define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
# else
#  define DEBUG_PUSH(item)
#  define DEBUG_POP(item_addr)
# endif
| |
| |
/* Save on the failure stack everything needed to resume matching at
   PATTERN_PLACE / STRING_PLACE should the current attempt fail.

   Pushed, in this order: for every register from lowest_active_reg
   through highest_active_reg its start, its end and its info word;
   then the lowest and the highest active register numbers; then
   PATTERN_PLACE; then STRING_PLACE; and, when debugging, the failure
   id.

   Requires the variables fail_stack, regstart, regend, reg_info,
   lowest_active_reg and highest_active_reg (and, when debugging,
   failure_id, nfailure_points_pushed and num_regs_pushed) to be
   declared.  `destination', which DOUBLE_FAIL_STACK needs, is declared
   by the macro itself.

   Does `return FAILURE_CODE' if it runs out of memory.  */

# define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
  do \
    { \
      char *destination; \
      /* The register index is an active_reg_t, not an int: there is \
         no guarantee that an int is wide enough for the values it \
         has to hold.  */ \
      active_reg_t this_reg; \
 \
      DEBUG_STATEMENT (failure_id++); \
      DEBUG_STATEMENT (nfailure_points_pushed++); \
      DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
      DEBUG_PRINT2 (" Before push, next avail: %d\n", (fail_stack).avail); \
      DEBUG_PRINT2 (" size: %d\n", (fail_stack).size); \
 \
      DEBUG_PRINT2 (" slots needed: %ld\n", NUM_FAILURE_ITEMS); \
      DEBUG_PRINT2 (" available: %d\n", REMAINING_AVAIL_SLOTS); \
 \
      /* Grow the stack until the whole failure point fits.  */ \
      while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) \
        { \
          if (!DOUBLE_FAIL_STACK (fail_stack)) \
            return failure_code; \
 \
          DEBUG_PRINT2 ("\n Doubled stack; size now: %d\n", \
                        (fail_stack).size); \
          DEBUG_PRINT2 (" slots available: %d\n", REMAINING_AVAIL_SLOTS); \
        } \
 \
      /* The registers go first.  */ \
      DEBUG_PRINT1 ("\n"); \
 \
      for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
           this_reg++) \
        { \
          DEBUG_PRINT2 (" Pushing reg: %lu\n", this_reg); \
          DEBUG_STATEMENT (num_regs_pushed++); \
 \
          DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
          PUSH_FAILURE_POINTER (regstart[this_reg]); \
 \
          DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
          PUSH_FAILURE_POINTER (regend[this_reg]); \
 \
          DEBUG_PRINT2 (" info: %p\n ", \
                        reg_info[this_reg].word.pointer); \
          DEBUG_PRINT2 (" match_null=%d", \
                        REG_MATCH_NULL_STRING_P (reg_info[this_reg])); \
          DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg])); \
          DEBUG_PRINT2 (" matched_something=%d", \
                        MATCHED_SOMETHING (reg_info[this_reg])); \
          DEBUG_PRINT2 (" ever_matched=%d", \
                        EVER_MATCHED_SOMETHING (reg_info[this_reg])); \
          DEBUG_PRINT1 ("\n"); \
          PUSH_FAILURE_ELT (reg_info[this_reg].word); \
        } \
 \
      /* Then the range of registers that was just saved...  */ \
      DEBUG_PRINT2 (" Pushing low active reg: %ld\n", lowest_active_reg); \
      PUSH_FAILURE_INT (lowest_active_reg); \
 \
      DEBUG_PRINT2 (" Pushing high active reg: %ld\n", highest_active_reg); \
      PUSH_FAILURE_INT (highest_active_reg); \
 \
      /* ...and finally where to resume in the pattern and string.  */ \
      DEBUG_PRINT2 (" Pushing pattern %p:\n", pattern_place); \
      DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend); \
      PUSH_FAILURE_POINTER (pattern_place); \
 \
      DEBUG_PRINT2 (" Pushing string %p: `", string_place); \
      DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2, \
                                 size2); \
      DEBUG_PRINT1 ("'\n"); \
      PUSH_FAILURE_POINTER (string_place); \
 \
      DEBUG_PRINT2 (" Pushing failure id: %u\n", failure_id); \
      DEBUG_PUSH (failure_id); \
    } \
  while (0)
| |
# ifndef DEFINED_ONCE
/* Number of stack items pushed and popped for each register.  */
#  define NUM_REG_ITEMS 3

/* Number of items pushed per failure point besides the registers.  */
#  ifdef DEBUG
#   define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */
#  else
#   define NUM_NONREG_ITEMS 4
#  endif

/* Upper bound used for sizing: we push at most this many items.
   We used to use (num_regs - 1), which is the number of registers
   this regexp will save; but that was changed to 5
   to avoid stack overflow for a regexp with lots of parens.  */
#  define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)

/* Number of items one failure point really takes, given the current
   values of highest_active_reg and lowest_active_reg.  */
#  define NUM_FAILURE_ITEMS                                             \
  (((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS)       \
   + NUM_NONREG_ITEMS)

/* How many items can still be added to the stack without overflowing
   it.  */
#  define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
# endif /* not DEFINED_ONCE */
| |
| |
/* Pop what PUSH_FAILURE_POINT pushed, in the reverse order.

   The results are stored into the parameters, all of which should be
   lvalues:
     STR -- the saved data position (left alone if the saved value is
       NULL).
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack' and `set_regs_matched_done'
   and (if debugging) `bufp', `pend', `string1', `size1', `string2',
   `size2' and `nfailure_points_popped'.

   The expansion is a bare compound statement, not a do-while.  */
# define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) \
{ \
  DEBUG_STATEMENT (unsigned failure_id;) \
  active_reg_t this_reg; \
  const UCHAR_T *string_temp; \
 \
  assert (!FAIL_STACK_EMPTY ()); \
 \
  /* Remove failure points and point to how many regs pushed.  */ \
  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
  DEBUG_PRINT2 (" Before pop, next avail: %d\n", fail_stack.avail); \
  DEBUG_PRINT2 (" size: %d\n", fail_stack.size); \
 \
  assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
 \
  DEBUG_POP (&failure_id); \
  DEBUG_PRINT2 (" Popping failure id: %u\n", failure_id); \
 \
  /* A NULL saved string location came from an \
     on_failure_keep_string_jump opcode; the NULL is thrown away so \
     that the current position in the string is retained.  */ \
  string_temp = POP_FAILURE_POINTER (); \
  if (string_temp != NULL) \
    str = (const CHAR_T *) string_temp; \
 \
  DEBUG_PRINT2 (" Popping string %p: `", str); \
  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2); \
  DEBUG_PRINT1 ("'\n"); \
 \
  pat = (UCHAR_T *) POP_FAILURE_POINTER (); \
  DEBUG_PRINT2 (" Popping pattern %p:\n", pat); \
  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend); \
 \
  /* Restore register info.  */ \
  high_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping high active reg: %ld\n", high_reg); \
 \
  low_reg = (active_reg_t) POP_FAILURE_INT (); \
  DEBUG_PRINT2 (" Popping low active reg: %ld\n", low_reg); \
 \
  /* Registers come off highest first, each as info, end, start.  */ \
  for (this_reg = high_reg; this_reg >= low_reg; this_reg--) \
    { \
      DEBUG_PRINT2 (" Popping reg: %ld\n", this_reg); \
 \
      reg_info[this_reg].word = POP_FAILURE_ELT (); \
      DEBUG_PRINT2 (" info: %p\n", \
                    reg_info[this_reg].word.pointer); \
 \
      regend[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" end: %p\n", regend[this_reg]); \
 \
      regstart[this_reg] = (const CHAR_T *) POP_FAILURE_POINTER (); \
      DEBUG_PRINT2 (" start: %p\n", regstart[this_reg]); \
    } \
 \
  set_regs_matched_done = 0; \
  DEBUG_STATEMENT (nfailure_points_popped++); \
} /* POP_FAILURE_POINT */
| |
| /* Structure for per-register (a.k.a. per-group) information. |
| Other register information, such as the |
| starting and ending positions (which are addresses), and the list of |
| inner groups (which is a bits list) are maintained in separate |
| variables. |
| |
| We are making a (strictly speaking) nonportable assumption here: that |
| the compiler will pack our bit fields into something that fits into |
| the type of `word', i.e., is something that fits into one item on the |
| failure stack. */ |
| |
| |
| /* Declarations and macros for re_match_2. */ |
| |
| typedef union |
| { |
| PREFIX(fail_stack_elt_t) word; |
| struct |
| { |
| /* This field is one if this group can match the empty string, |
| zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */ |
| # define MATCH_NULL_UNSET_VALUE 3 |
| unsigned match_null_string_p : 2; |
| unsigned is_active : 1; |
| unsigned matched_something : 1; |
| unsigned ever_matched_something : 1; |
| } bits; |
| } PREFIX(register_info_type); |
| |
# ifndef DEFINED_ONCE
/* Accessors for the flag bits of a register info record R.  */
#  define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
#  define IS_ACTIVE(R)  ((R).bits.is_active)
#  define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
#  define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)


/* Call this when a real character has been matched.  For every
   register from lowest_active_reg through highest_active_reg (the
   subexpressions we are currently inside) it sets both the `matched'
   and the `ever matched' flag.  The work is skipped when
   `set_regs_matched_done' is already set, and the macro sets it.  */
#  define SET_REGS_MATCHED()                                            \
  do                                                                    \
    {                                                                   \
      if (!set_regs_matched_done)                                       \
        {                                                               \
          active_reg_t r;                                               \
          set_regs_matched_done = 1;                                    \
          for (r = lowest_active_reg; r <= highest_active_reg; r++)     \
            {                                                           \
              EVER_MATCHED_SOMETHING (reg_info[r]) = 1;                 \
              MATCHED_SOMETHING (reg_info[r]) = 1;                      \
            }                                                           \
        }                                                               \
    }                                                                   \
  while (0)
# endif /* not DEFINED_ONCE */
| |
| /* Registers are set to a sentinel when they haven't yet matched. */ |
| static CHAR_T PREFIX(reg_unset_dummy); |
| # define REG_UNSET_VALUE (&PREFIX(reg_unset_dummy)) |
| # define REG_UNSET(e) ((e) == REG_UNSET_VALUE) |
| |
| /* Subroutine declarations and macros for regex_compile. */ |
| static void PREFIX(store_op1) (re_opcode_t op, UCHAR_T *loc, int arg); |
| static void PREFIX(store_op2) (re_opcode_t op, UCHAR_T *loc, |
| int arg1, int arg2); |
| static void PREFIX(insert_op1) (re_opcode_t op, UCHAR_T *loc, |
| int arg, UCHAR_T *end); |
| static void PREFIX(insert_op2) (re_opcode_t op, UCHAR_T *loc, |
| int arg1, int arg2, UCHAR_T *end); |
| static boolean PREFIX(at_begline_loc_p) (const CHAR_T *pattern, |
| const CHAR_T *p, |
| reg_syntax_t syntax); |
| static boolean PREFIX(at_endline_loc_p) (const CHAR_T *p, |
| const CHAR_T *pend, |
| reg_syntax_t syntax); |
| # ifdef WCHAR |
| static reg_errcode_t wcs_compile_range (CHAR_T range_start, |
| const CHAR_T **p_ptr, |
| const CHAR_T *pend, |
| char *translate, |
| reg_syntax_t syntax, |
| UCHAR_T *b, |
| CHAR_T *char_set); |
| static void insert_space (int num, CHAR_T *loc, CHAR_T *end); |
| # else /* BYTE */ |
| static reg_errcode_t byte_compile_range (unsigned int range_start, |
| const char **p_ptr, |
| const char *pend, |
| char *translate, |
| reg_syntax_t syntax, |
| unsigned char *b); |
| # endif /* WCHAR */ |
| |
/* Fetch the next character of the uncompiled pattern into C, applying
   `translate' when it is non-null, and advance `p'.  Does
   `return REG_EEND' when `p' has already reached `pend'.  The character
   is also cast from a signed character in the constant string passed to
   us by the user to an unsigned type that we can use as an array index
   (in, e.g., `translate').  */
/* In the wide character case we translate only if the character is
   <= 0xff, because it is impossible to allocate a 4GB array for some
   encodings which have a 4 byte character set, like UCS4.  */
# ifndef PATFETCH
#  ifdef WCHAR
#   define PATFETCH(c)                                                  \
  do                                                                    \
    {                                                                   \
      if (p == pend)                                                    \
        return REG_EEND;                                                \
      c = (UCHAR_T) *p++;                                               \
      if (translate && (c <= 0xff))                                     \
        c = (UCHAR_T) translate[c];                                     \
    }                                                                   \
  while (0)
#  else /* BYTE */
#   define PATFETCH(c)                                                  \
  do                                                                    \
    {                                                                   \
      if (p == pend)                                                    \
        return REG_EEND;                                                \
      c = (unsigned char) *p++;                                         \
      if (translate)                                                    \
        c = (unsigned char) translate[c];                               \
    }                                                                   \
  while (0)
#  endif /* WCHAR */
# endif

/* Like PATFETCH, but never translates.  */
# define PATFETCH_RAW(c)                                                \
  do                                                                    \
    {                                                                   \
      if (p == pend)                                                    \
        return REG_EEND;                                                \
      c = (UCHAR_T) *p++;                                               \
    }                                                                   \
  while (0)

/* Step back one character in the pattern.  */
# define PATUNFETCH p--
| |
| |
/* Yield translate[D] when `translate' is non-null, and just D
   otherwise.  The subscript is cast because some data is declared as
   `char *' (to avoid warnings when a string constant is passed), yet a
   character used as a subscript must be unsigned.  */
/* In the wide character case we translate only if the character is
   <= 0xff, because it is impossible to allocate a 4GB array for some
   encodings which have a 4 byte character set, like UCS4.  */

# ifndef TRANSLATE
#  ifdef WCHAR
#   define TRANSLATE(d)                                                 \
  ((!translate || ((UCHAR_T) (d)) > 0xff)                               \
   ? (d) : (char) translate[(unsigned char) (d)])
#  else /* BYTE */
#   define TRANSLATE(d)                                                 \
  (!translate ? (char) (d) : (char) translate[(unsigned char) (d)])
#  endif /* WCHAR */
# endif
| |
| |
/* Macros for emitting the compiled pattern into `buffer'.  */

/* Size to use when the buffer has not been allocated on entry.  */
# define INIT_BUF_SIZE (32 * sizeof(UCHAR_T))

/* Keep extending the buffer until N more elements fit after `b'.  The
   expansion is a `while' statement whose body is EXTEND_BUFFER (); the
   caller supplies the terminating semicolon.  */
# ifdef WCHAR
#  define GET_BUFFER_SPACE(n)                                           \
    while (((unsigned long)b - (unsigned long)COMPILED_BUFFER_VAR       \
            + (n)*sizeof(CHAR_T)) > bufp->allocated)                    \
      EXTEND_BUFFER ()
# else /* BYTE */
#  define GET_BUFFER_SPACE(n)                                           \
    while ((unsigned long) (b - bufp->buffer + (n)) > bufp->allocated)  \
      EXTEND_BUFFER ()
# endif /* WCHAR */
| |
/* Append C to the buffer, first making sure there is room for one more
   element.  */
# define BUF_PUSH(c)                                                    \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (1);                                             \
      *b++ = (UCHAR_T) (c);                                             \
    }                                                                   \
  while (0)


/* Append C1 and then C2, first making sure there is room for two more
   elements.  */
# define BUF_PUSH_2(c1, c2)                                             \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (2);                                             \
      *b++ = (UCHAR_T) (c1);                                            \
      *b++ = (UCHAR_T) (c2);                                            \
    }                                                                   \
  while (0)


/* The same for three elements: C1, C2, then C3.  */
# define BUF_PUSH_3(c1, c2, c3)                                         \
  do                                                                    \
    {                                                                   \
      GET_BUFFER_SPACE (3);                                             \
      *b++ = (UCHAR_T) (c1);                                            \
      *b++ = (UCHAR_T) (c2);                                            \
      *b++ = (UCHAR_T) (c3);                                            \
    }                                                                   \
  while (0)
| |
/* Store a jump with opcode OP at LOC to location TO.  The offset that
   is stored is relative: the distance from LOC to TO less the size of
   the jump itself, i.e. one opcode plus OFFSET_ADDRESS_SIZE.  */
# define STORE_JUMP(op, loc, to)                                        \
  PREFIX(store_op1) (op, loc,                                           \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)))

/* The same, for a jump that carries a second argument ARG.  */
# define STORE_JUMP2(op, loc, to, arg)                                  \
  PREFIX(store_op2) (op, loc,                                           \
                     (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)),  \
                     arg)

/* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer
   end.  */
# define INSERT_JUMP(op, loc, to)                                       \
  PREFIX(insert_op1) (op, loc,                                          \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                      b)

/* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer
   end.  */
# define INSERT_JUMP2(op, loc, to, arg)                                 \
  PREFIX(insert_op2) (op, loc,                                          \
                      (int) ((to) - (loc) - (1 + OFFSET_ADDRESS_SIZE)), \
                      arg, b)
| |
/* The limit below is not arbitrary: the arguments that represent
   offsets into the pattern are two bytes long.  So if 2^16 bytes turns
   out to be too small, many things would have to change.  */
/* Any other compiler which, like MSC, has an allocation limit below
   2^16 bytes will have to use an approach similar to what is done
   below for MSC and drop MAX_BUF_SIZE a bit.  Otherwise you may end up
   reallocating to 0 bytes, which is not going to work too well.
   You have been warned!!  */
# ifndef DEFINED_ONCE
#  if defined _MSC_VER && !defined WIN32
/* Microsoft C 16-bit versions limit malloc to approx 65512 bytes.
   The REALLOC define eliminates a flurry of conversion warnings,
   but is not required.  */
#   define MAX_BUF_SIZE 65500L
#   define REALLOC(p,s) realloc ((p), (size_t) (s))
#  else
#   define MAX_BUF_SIZE (1L << 16)
#   define REALLOC(p,s) realloc ((p), (s))
#  endif

/* Helpers for extending the buffer: after a realloc, the pointers that
   pointed into the old block are reset to the corresponding places in
   the new one by adding `incr'.  With bounded pointers the bounds are
   adjusted too, and ELSE_EXTEND_BUFFER_HIGH_BOUND supplies an `else'
   branch that only refreshes the high bounds; otherwise it is empty.
   If extending the buffer results in it being larger than
   MAX_BUF_SIZE, memory exhausted is flagged.  */
#  if __BOUNDED_POINTERS__
#   define SET_HIGH_BOUND(P) (__ptrhigh (P) = __ptrlow (P) + bufp->allocated)
#   define MOVE_BUFFER_POINTER(P)                                       \
  (__ptrlow (P) += incr, SET_HIGH_BOUND (P), __ptrvalue (P) += incr)
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND                                \
  else                                                                  \
    {                                                                   \
      SET_HIGH_BOUND (b);                                               \
      SET_HIGH_BOUND (begalt);                                          \
      if (fixup_alt_jump)                                               \
        SET_HIGH_BOUND (fixup_alt_jump);                                \
      if (laststart)                                                    \
        SET_HIGH_BOUND (laststart);                                     \
      if (pending_exact)                                                \
        SET_HIGH_BOUND (pending_exact);                                 \
    }
#  else
#   define MOVE_BUFFER_POINTER(P) (P) += incr
#   define ELSE_EXTEND_BUFFER_HIGH_BOUND
#  endif
# endif /* not DEFINED_ONCE */
| |
/* Grow the compiled-pattern buffer: double `bufp->allocated', capped
   at MAX_BUF_SIZE, reallocate, and if the block moved relocate every
   pointer into it (`b', `begalt' and, when non-null, `fixup_alt_jump',
   `laststart' and `pending_exact').

   On failure the enclosing function is left through FREE_STACK_RETURN,
   so the storage that macro covers is released instead of leaked: with
   REG_ESIZE when the buffer cannot grow any further, with REG_ESPACE
   when the reallocation fails.  The new block and its size are stored
   into BUFP only after the reallocation has succeeded, hence after a
   failure BUFP still describes the old, valid buffer.

   Can only be expanded where FREE_STACK_RETURN, `bufp' and the
   pointers named above are in scope.  */
# ifdef WCHAR
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    UCHAR_T *new_buffer = old_buffer; \
    unsigned long new_allocated; \
    int wchar_count; \
    if (bufp->allocated + sizeof(UCHAR_T) > MAX_BUF_SIZE) \
      FREE_STACK_RETURN (REG_ESIZE); \
    new_allocated = bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    /* How many characters the new buffer can have?  */ \
    wchar_count = new_allocated / sizeof(UCHAR_T); \
    if (wchar_count == 0) wchar_count = 1; \
    /* Go through a temporary so a failure does not lose the old block.  */ \
    RETALLOC (new_buffer, wchar_count, UCHAR_T); \
    if (new_buffer == NULL) \
      FREE_STACK_RETURN (REG_ESPACE); \
    COMPILED_BUFFER_VAR = new_buffer; \
    bufp->buffer = (char*)COMPILED_BUFFER_VAR; \
    /* Truncate the buffer to CHAR_T align.  */ \
    bufp->allocated = wchar_count * sizeof(UCHAR_T); \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# else /* BYTE */
#  define EXTEND_BUFFER() \
  do { \
    UCHAR_T *old_buffer = COMPILED_BUFFER_VAR; \
    UCHAR_T *new_buffer; \
    unsigned long new_allocated; \
    /* `>=', not `==': a buffer already above the limit must not shrink.  */ \
    if (bufp->allocated >= MAX_BUF_SIZE) \
      FREE_STACK_RETURN (REG_ESIZE); \
    new_allocated = bufp->allocated << 1; \
    if (new_allocated > MAX_BUF_SIZE) \
      new_allocated = MAX_BUF_SIZE; \
    /* Go through a temporary so a failure does not lose the old block.  */ \
    new_buffer = (UCHAR_T *) REALLOC (old_buffer, new_allocated); \
    if (new_buffer == NULL) \
      FREE_STACK_RETURN (REG_ESPACE); \
    bufp->buffer = new_buffer; \
    bufp->allocated = new_allocated; \
    /* If the buffer moved, move all the pointers into it.  */ \
    if (old_buffer != COMPILED_BUFFER_VAR) \
      { \
        PTR_INT_TYPE incr = COMPILED_BUFFER_VAR - old_buffer; \
        MOVE_BUFFER_POINTER (b); \
        MOVE_BUFFER_POINTER (begalt); \
        if (fixup_alt_jump) \
          MOVE_BUFFER_POINTER (fixup_alt_jump); \
        if (laststart) \
          MOVE_BUFFER_POINTER (laststart); \
        if (pending_exact) \
          MOVE_BUFFER_POINTER (pending_exact); \
      } \
    ELSE_EXTEND_BUFFER_HIGH_BOUND \
  } while (0)
# endif /* WCHAR */
| |
# ifndef DEFINED_ONCE
/* The register number argument to {start,stop}_memory occupies a single
   byte, so the number of groups we can report things about is limited
   to what fits in that byte.  */
#  define MAX_REGNUM 255

/* Patterns may still contain more than `MAX_REGNUM' registers; the
   excess is simply ignored.  */
typedef unsigned int regnum_t;


/* Types and macros for the compile stack.  */

/* Offsets can point forwards as well as backwards, so this type has to
   cover -(MAX_BUF_SIZE - 1) through MAX_BUF_SIZE - 1.  `int' may not be
   enough for that when sizeof (int) == 2.  */
typedef long int pattern_offset_t;

/* One entry of the compile stack: four offsets into the pattern buffer
   plus a register number.  */
typedef struct
{
  pattern_offset_t begalt_offset;
  pattern_offset_t fixup_alt_jump;
  pattern_offset_t inner_group_offset;
  pattern_offset_t laststart_offset;
  regnum_t regnum;
} compile_stack_elt_t;


/* The compile stack itself: an array of entries, its capacity and the
   number of entries in use.  */
typedef struct
{
  compile_stack_elt_t *stack;
  unsigned int size;
  unsigned int avail;			/* Offset of next open position.  */
} compile_stack_type;


/* Number of entries the stack starts out with.  */
#  define INIT_COMPILE_STACK_SIZE 32

/* These all operate on a variable named `compile_stack' that must be
   in scope where they are expanded.  */
#  define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
#  define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)

/* The next available element.  */
#  define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])

# endif /* not DEFINED_ONCE */
| |
/* Set the bit for character C in the bitmap that `b' points to.  C is
   first reduced to an unsigned char; C / BYTEWIDTH then selects the
   byte and C % BYTEWIDTH the bit within it.  Both uses of C are
   parenthesized so that the cast applies to the whole argument even
   when it is an expression such as `x - 1'.  */
# ifndef DEFINED_ONCE
#  define SET_LIST_BIT(c) \
  (b[((unsigned char) (c)) / BYTEWIDTH] \
   |= 1 << (((unsigned char) (c)) % BYTEWIDTH))
# endif /* DEFINED_ONCE */
| |
/* Read an unsigned decimal number from the uncompiled pattern into NUM.
   Characters are fetched with PATFETCH into `c' until the pattern ends
   or a non-digit has been fetched; that non-digit stays consumed and is
   left in `c'.  A negative NUM is reset to 0 before the first digit is
   added.  Once NUM exceeds RE_DUP_MAX, further digits are still
   consumed but no longer accumulated.  If no digit is seen, NUM is left
   untouched.  */
# define GET_UNSIGNED_NUMBER(num) \
  { \
    for (; p != pend; ) \
      { \
        PATFETCH (c); \
        if (!(c >= '0' && c <= '9')) \
          break; \
        if (num > RE_DUP_MAX) \
          continue; \
        if (num < 0) \
          num = 0; \
        num = num * 10 + c - '0'; \
      } \
  }
| |
# ifndef DEFINED_ONCE
#  if defined _LIBC || WIDE_CHAR_SUPPORT
/* The GNU C library provides support for user-defined character classes
   and the functions from ISO C amendment 1.  */
#   ifdef CHARCLASS_NAME_MAX
#    define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX
#   else
/* This shouldn't happen but some implementation might still have this
   problem.  Use a reasonable default value.  */
#    define CHAR_CLASS_MAX_LENGTH 256
#   endif

/* Class names are looked up with wctype (__wctype inside libc).  */
#   ifdef _LIBC
#    define IS_CHAR_CLASS(string) __wctype (string)
#   else
#    define IS_CHAR_CLASS(string) wctype (string)
#   endif
#  else
#   define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */

/* Without wide character support only these twelve names are accepted
   as character classes.  */
#   define IS_CHAR_CLASS(string) \
   (STREQ (string, "alnum") || STREQ (string, "alpha") \
    || STREQ (string, "blank") || STREQ (string, "cntrl") \
    || STREQ (string, "digit") || STREQ (string, "graph") \
    || STREQ (string, "lower") || STREQ (string, "print") \
    || STREQ (string, "punct") || STREQ (string, "space") \
    || STREQ (string, "upper") || STREQ (string, "xdigit"))
#  endif
# endif /* DEFINED_ONCE */
| |
| # ifndef MATCH_MAY_ALLOCATE |
| |
| /* If we cannot allocate large objects within re_match_2_internal, |
| we make the fail stack and register vectors global. |
| The fail stack, we grow to the maximum size when a regexp |
| is compiled. |
| The register vectors, we adjust in size each time we |
| compile a regexp, according to the number of registers it needs. */ |
| |
| static PREFIX(fail_stack_type) fail_stack; |
| |
| /* Size with which the following vectors are currently allocated. |
| That is so we can make them bigger as needed, |
| but never make them smaller. */ |
| # ifdef DEFINED_ONCE |
| static int regs_allocated_size; |
| |
| static const char ** regstart, ** regend; |
| static const char ** old_regstart, ** old_regend; |
| static const char **best_regstart, **best_regend; |
| static const char **reg_dummy; |
| # endif /* DEFINED_ONCE */ |
| |
| static PREFIX(register_info_type) *PREFIX(reg_info); |
| static PREFIX(register_info_type) *PREFIX(reg_info_dummy); |
| |
| /* Make the register vectors big enough for NUM_REGS registers, |
| but don't make them smaller. */ |
| |
| static void |
| PREFIX(regex_grow_registers) (int num_regs) |
| { |
| if (num_regs > regs_allocated_size) |
| { |
| RETALLOC_IF (regstart, num_regs, const char *); |
| RETALLOC_IF (regend, num_regs, const char *); |
| RETALLOC_IF (old_regstart, num_regs, const char *); |
| RETALLOC_IF (old_regend, num_regs, const char *); |
| RETALLOC_IF (best_regstart, num_regs, const char *); |
| RETALLOC_IF (best_regend, num_regs, const char *); |
| RETALLOC_IF (PREFIX(reg_info), num_regs, PREFIX(register_info_type)); |
| RETALLOC_IF (reg_dummy, num_regs, const char *); |
| RETALLOC_IF (PREFIX(reg_info_dummy), num_regs, PREFIX(register_info_type)); |
| |
| regs_allocated_size = num_regs; |
| } |
| } |
| |
| # endif /* not MATCH_MAY_ALLOCATE */ |
| |
# ifndef DEFINED_ONCE
/* Forward declaration; the definition is not in this part of the file.
   The compile stack descriptor is passed by value, not by pointer.
   NOTE(review): presumably tells whether REGNUM is the register of a
   group currently recorded on COMPILE_STACK -- confirm against the
   definition.  */
static boolean group_in_compile_stack (compile_stack_type compile_stack,
                                       regnum_t regnum);
# endif /* not DEFINED_ONCE */
| |
| /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX. |
| Returns one of error codes defined in `regex.h', or zero for success. |
| |
| Assumes the `allocated' (and perhaps `buffer') and `translate' |
| fields are set in BUFP on entry. |
| |
| If it succeeds, results are put in BUFP (if it returns an error, the |
| contents of BUFP are undefined): |
| `buffer' is the compiled pattern; |
| `syntax' is set to SYNTAX; |
| `used' is set to the length of the compiled pattern; |
| `fastmap_accurate' is zero; |
| `re_nsub' is the number of subexpressions in PATTERN; |
| `not_bol' and `not_eol' are zero; |
| |
| The `fastmap' and `newline_anchor' fields are neither |
| examined nor set. */ |
| |
/* Leave the enclosing function with VALUE after freeing the storage it
   allocated: the compile stack and, in the wide-character build, also
   `pattern', `mbs_offset' and `is_binary'.  The calls to free are
   chained with the comma operator inside the return expression, so the
   expansion is one `return' statement (without a trailing
   semicolon).  */
# ifdef WCHAR
#  define FREE_STACK_RETURN(value) \
  return (free(pattern), \
          free(mbs_offset), \
          free(is_binary), \
          free (compile_stack.stack), \
          value)
# else
#  define FREE_STACK_RETURN(value) \
  return (free (compile_stack.stack), value)
# endif /* WCHAR */
| |
| static reg_errcode_t |
| PREFIX(regex_compile) (const char *ARG_PREFIX(pattern), |
| size_t ARG_PREFIX(size), reg_syntax_t syntax, |
| struct re_pattern_buffer *bufp) |
| { |
| /* We fetch characters from PATTERN here. Even though PATTERN is |
| `char *' (i.e., signed), we declare these variables as unsigned, so |
| they can be reliably used as array indices. */ |
| register UCHAR_T c, c1; |
| |
| #ifdef WCHAR |
| /* A temporary space to keep wchar_t pattern and compiled pattern. */ |
| CHAR_T *pattern, *COMPILED_BUFFER_VAR; |
| size_t size; |
| /* offset buffer for optimization. See convert_mbs_to_wc. */ |
| int *mbs_offset = NULL; |
| /* It hold whether each wchar_t is binary data or not. */ |
| char *is_binary = NULL; |
| /* A flag whether exactn is handling binary data or not. */ |
| char is_exactn_bin = FALSE; |
| #endif /* WCHAR */ |
| |
| /* A random temporary spot in PATTERN. */ |
| const CHAR_T *p1; |
| |
| /* Points to the end of the buffer, where we should append. */ |
| register UCHAR_T *b; |
| |
| /* Keeps track of unclosed groups. */ |
| compile_stack_type compile_stack; |
| |
| /* Points to the current (ending) position in the pattern. */ |
| #ifdef WCHAR |
| const CHAR_T *p; |
| const CHAR_T *pend; |
| #else /* BYTE */ |
| const CHAR_T *p = pattern; |
| const CHAR_T *pend = pattern + size; |
| #endif /* WCHAR */ |
| |
| /* How to translate the characters in the pattern. */ |
| RE_TRANSLATE_TYPE translate = bufp->translate; |
| |
| /* Address of the count-byte of the most recently inserted `exactn' |
| command. This makes it possible to tell if a new exact-match |
| character can be added to that command or if the character requires |
| a new `exactn' command. */ |
| UCHAR_T *pending_exact = 0; |
| |
| /* Address of start of the most recently finished expression. |
| This tells, e.g., postfix * where to find the start of its |
| operand. Reset at the beginning of groups and alternatives. */ |
| UCHAR_T *laststart = 0; |
| |
| /* Address of beginning of regexp, or inside of last group. */ |
| UCHAR_T *begalt; |
| |
| /* Address of the place where a forward jump should go to the end of |
| the containing expression. Each alternative of an `or' -- except the |
| last -- ends with a forward jump of this sort. */ |
| UCHAR_T *fixup_alt_jump = 0; |
| |
| /* Counts open-groups as they are encountered. Remembered for the |
| matching close-group on the compile stack, so the same register |
| number is put in the stop_memory as the start_memory. */ |
| regnum_t regnum = 0; |
| |
| #ifdef WCHAR |
| /* Initialize the wchar_t PATTERN and offset_buffer. */ |
| p = pend = pattern = TALLOC(csize + 1, CHAR_T); |
| mbs_offset = TALLOC(csize + 1, int); |
| is_binary = TALLOC(csize + 1, char); |
| if (pattern == NULL || mbs_offset == NULL || is_binary == NULL) |
| { |
| free(pattern); |
| free(mbs_offset); |
| free(is_binary); |
| return REG_ESPACE; |
| } |
| pattern[csize] = L'\0'; /* sentinel */ |
| size = convert_mbs_to_wcs(pattern, cpattern, csize, mbs_offset, is_binary); |
| pend = p + size; |
| if (size < 0) |
| { |
| free(pattern); |
| free(mbs_offset); |
| free(is_binary); |
| return REG_BADPAT; |
| } |
| #endif |
| |
| #ifdef DEBUG |
| DEBUG_PRINT1 ("\nCompiling pattern: "); |
| if (debug) |
| { |
| unsigned debug_count; |
| |
| for (debug_count = 0; debug_count < size; debug_count++) |
| PUT_CHAR (pattern[debug_count]); |
| putchar ('\n'); |
| } |
| #endif /* DEBUG */ |
| |
| /* Initialize the compile stack. */ |
| compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t); |
| if (compile_stack.stack == NULL) |
| { |
| #ifdef WCHAR |
| free(pattern); |
| free(mbs_offset); |
| free(is_binary); |
| #endif |
| return REG_ESPACE; |
| } |
| |
| compile_stack.size = INIT_COMPILE_STACK_SIZE; |
| compile_stack.avail = 0; |
| |
| /* Initialize the pattern buffer. */ |
| bufp->syntax = syntax; |
| bufp->fastmap_accurate = 0; |
| bufp->not_bol = bufp->not_eol = 0; |
| |
| /* Set `used' to zero, so that if we return an error, the pattern |
| printer (for debugging) will think there's no pattern. We reset it |
| at the end. */ |
| bufp->used = 0; |
| |
| /* Always count groups, whether or not bufp->no_sub is set. */ |
| bufp->re_nsub = 0; |
| |
| #if !defined emacs && !defined SYNTAX_TABLE |
| /* Initialize the syntax table. */ |
| init_syntax_once (); |
| #endif |
| |
| if (bufp->allocated == 0) |
| { |
| if (bufp->buffer) |
| { /* If zero allocated, but buffer is non-null, try to realloc |
| enough space. This loses if buffer's address is bogus, but |
| that is the user's responsibility. */ |
| #ifdef WCHAR |
| /* Free bufp->buffer and allocate an array for wchar_t pattern |
| buffer. */ |
| free(bufp->buffer); |
| COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE/sizeof(UCHAR_T), |
| UCHAR_T); |
| #else |
| RETALLOC (COMPILED_BUFFER_VAR, INIT_BUF_SIZE, UCHAR_T); |
| #endif /* WCHAR */ |
| } |
| else |
| { /* Caller did not allocate a buffer. Do it for them. */ |
| COMPILED_BUFFER_VAR = TALLOC (INIT_BUF_SIZE / sizeof(UCHAR_T), |
| UCHAR_T); |
| } |
| |
| if (!COMPILED_BUFFER_VAR) FREE_STACK_RETURN (REG_ESPACE); |
| #ifdef WCHAR |
| bufp->buffer = (char*)COMPILED_BUFFER_VAR; |
| #endif /* WCHAR */ |
| bufp->allocated = INIT_BUF_SIZE; |
| } |
| #ifdef WCHAR |
| else |
| COMPILED_BUFFER_VAR = (UCHAR_T*) bufp->buffer; |
| #endif |
| |
| begalt = b = COMPILED_BUFFER_VAR; |
| |
| /* Loop through the uncompiled pattern until we're at the end. */ |
| while (p != pend) |
| { |
| PATFETCH (c); |
| |
| switch (c) |
| { |
| case '^': |
| { |
| if ( /* If at start of pattern, it's an operator. */ |
| p == pattern + 1 |
| /* If context independent, it's an operator. */ |
| || syntax & RE_CONTEXT_INDEP_ANCHORS |
| /* Otherwise, depends on what's come before. */ |
| || PREFIX(at_begline_loc_p) (pattern, p, syntax)) |
| BUF_PUSH (begline); |
| else |
| goto normal_char; |
| } |
| break; |
| |
| |
| case '$': |
| { |
| if ( /* If at end of pattern, it's an operator. */ |
| p == pend |
| /* If context independent, it's an operator. */ |
| || syntax & RE_CONTEXT_INDEP_ANCHORS |
| /* Otherwise, depends on what's next. */ |
| || PREFIX(at_endline_loc_p) (p, pend, syntax)) |
| BUF_PUSH (endline); |
| else |
| goto normal_char; |
| } |
| break; |
| |
| |
| case '+': |
| case '?': |
| if ((syntax & RE_BK_PLUS_QM) |
| || (syntax & RE_LIMITED_OPS)) |
| goto normal_char; |
| /* Fall through. */ |
| handle_plus: |
| case '*': |
| /* If there is no previous pattern... */ |
| if (!laststart) |
| { |
| if (syntax & RE_CONTEXT_INVALID_OPS) |
| FREE_STACK_RETURN (REG_BADRPT); |
| else if (!(syntax & RE_CONTEXT_INDEP_OPS)) |
| goto normal_char; |
| } |
| |
| { |
| /* Are we optimizing this jump? */ |
| boolean keep_string_p = false; |
| |
| /* 1 means zero (many) matches is allowed. */ |
| char zero_times_ok = 0, many_times_ok = 0; |
| |
| /* If there is a sequence of repetition chars, collapse it |
| down to just one (the right one). We can't combine |
| interval operators with these because of, e.g., `a{2}*', |
| which should only match an even number of `a's. */ |
| |
| for (;;) |
| { |
| zero_times_ok |= c != '+'; |
| many_times_ok |= c != '?'; |
| |
| if (p == pend) |
| break; |
| |
| PATFETCH (c); |
| |
| if (c == '*' |
| || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?'))) |
| ; |
| |
| else if (syntax & RE_BK_PLUS_QM && c == '\\') |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| PATFETCH (c1); |
| if (!(c1 == '+' || c1 == '?')) |
| { |
| PATUNFETCH; |
| PATUNFETCH; |
| break; |
| } |
| |
| c = c1; |
| } |
| else |
| { |
| PATUNFETCH; |
| break; |
| } |
| |
| /* If we get here, we found another repeat character. */ |
| } |
| |
| /* Star, etc. applied to an empty pattern is equivalent |
| to an empty pattern. */ |
| if (!laststart) |
| break; |
| |
| /* Now we know whether or not zero matches is allowed |
| and also whether or not two or more matches is allowed. */ |
| if (many_times_ok) |
| { /* More than one repetition is allowed, so put in at the |
| end a backward relative jump from `b' to before the next |
| jump we're going to put in below (which jumps from |
| laststart to after this jump). |
| |
| But if we are at the `*' in the exact sequence `.*\n', |
| insert an unconditional jump backwards to the ., |
| instead of the beginning of the loop. This way we only |
| push a failure point once, instead of every time |
| through the loop. */ |
| assert (p - 1 > pattern); |
| |
| /* Allocate the space for the jump. */ |
| GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); |
| |
| /* We know we are not at the first character of the pattern, |
| because laststart was nonzero. And we've already |
| incremented `p', by the way, to be the character after |
| the `*'. Do we have to do something analogous here |
| for null bytes, because of RE_DOT_NOT_NULL? */ |
| if (TRANSLATE (*(p - 2)) == TRANSLATE ('.') |
| && zero_times_ok |
| && p < pend && TRANSLATE (*p) == TRANSLATE ('\n') |
| && !(syntax & RE_DOT_NEWLINE)) |
| { /* We have .*\n. */ |
| STORE_JUMP (jump, b, laststart); |
| keep_string_p = true; |
| } |
| else |
| /* Anything else. */ |
| STORE_JUMP (maybe_pop_jump, b, laststart - |
| (1 + OFFSET_ADDRESS_SIZE)); |
| |
| /* We've added more stuff to the buffer. */ |
| b += 1 + OFFSET_ADDRESS_SIZE; |
| } |
| |
| /* On failure, jump from laststart to b + 3, which will be the |
| end of the buffer after this jump is inserted. */ |
| /* ifdef WCHAR, 'b + 1 + OFFSET_ADDRESS_SIZE' instead of |
| 'b + 3'. */ |
| GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); |
| INSERT_JUMP (keep_string_p ? on_failure_keep_string_jump |
| : on_failure_jump, |
| laststart, b + 1 + OFFSET_ADDRESS_SIZE); |
| pending_exact = 0; |
| b += 1 + OFFSET_ADDRESS_SIZE; |
| |
| if (!zero_times_ok) |
| { |
| /* At least one repetition is required, so insert a |
| `dummy_failure_jump' before the initial |
| `on_failure_jump' instruction of the loop. This |
| effects a skip over that instruction the first time |
| we hit that loop. */ |
| GET_BUFFER_SPACE (1 + OFFSET_ADDRESS_SIZE); |
| INSERT_JUMP (dummy_failure_jump, laststart, laststart + |
| 2 + 2 * OFFSET_ADDRESS_SIZE); |
| b += 1 + OFFSET_ADDRESS_SIZE; |
| } |
| } |
| break; |
| |
| |
| case '.': |
| laststart = b; |
| BUF_PUSH (anychar); |
| break; |
| |
| |
| case '[': |
| { |
| boolean had_char_class = false; |
| #ifdef WCHAR |
| CHAR_T range_start = 0xffffffff; |
| #else |
| unsigned int range_start = 0xffffffff; |
| #endif |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| #ifdef WCHAR |
| /* We assume a charset(_not) structure as a wchar_t array. |
| charset[0] = (re_opcode_t) charset(_not) |
| charset[1] = l (= length of char_classes) |
| charset[2] = m (= length of collating_symbols) |
| charset[3] = n (= length of equivalence_classes) |
| charset[4] = o (= length of char_ranges) |
| charset[5] = p (= length of chars) |
| |
| charset[6] = char_class (wctype_t) |
| charset[6+CHAR_CLASS_SIZE] = char_class (wctype_t) |
| ... |
| charset[l+5] = char_class (wctype_t) |
| |
| charset[l+6] = collating_symbol (wchar_t) |
| ... |
| charset[l+m+5] = collating_symbol (wchar_t) |
| ifdef _LIBC we use the index if |
| _NL_COLLATE_SYMB_EXTRAMB instead of |
| wchar_t string. |
| |
| charset[l+m+6] = equivalence_classes (wchar_t) |
| ... |
| charset[l+m+n+5] = equivalence_classes (wchar_t) |
| ifdef _LIBC we use the index in |
| _NL_COLLATE_WEIGHT instead of |
| wchar_t string. |
| |
| charset[l+m+n+6] = range_start |
| charset[l+m+n+7] = range_end |
| ... |
| charset[l+m+n+2o+4] = range_start |
| charset[l+m+n+2o+5] = range_end |
| ifdef _LIBC we use the value looked up |
| in _NL_COLLATE_COLLSEQ instead of |
| wchar_t character. |
| |
| charset[l+m+n+2o+6] = char |
| ... |
| charset[l+m+n+2o+p+5] = char |
| |
| */ |
| |
| /* We need at least 6 spaces: the opcode, the length of |
| char_classes, the length of collating_symbols, the length of |
| equivalence_classes, the length of char_ranges, the length of |
| chars. */ |
| GET_BUFFER_SPACE (6); |
| |
| /* Save b as laststart. And We use laststart as the pointer |
| to the first element of the charset here. |
| In other words, laststart[i] indicates charset[i]. */ |
| laststart = b; |
| |
| /* We test `*p == '^' twice, instead of using an if |
| statement, so we only need one BUF_PUSH. */ |
| BUF_PUSH (*p == '^' ? charset_not : charset); |
| if (*p == '^') |
| p++; |
| |
| /* Push the length of char_classes, the length of |
| collating_symbols, the length of equivalence_classes, the |
| length of char_ranges and the length of chars. */ |
| BUF_PUSH_3 (0, 0, 0); |
| BUF_PUSH_2 (0, 0); |
| |
| /* Remember the first position in the bracket expression. */ |
| p1 = p; |
| |
| /* charset_not matches newline according to a syntax bit. */ |
| if ((re_opcode_t) b[-6] == charset_not |
| && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
| { |
| BUF_PUSH('\n'); |
| laststart[5]++; /* Update the length of characters */ |
| } |
| |
| /* Read in characters and ranges, setting map bits. */ |
| for (;;) |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| PATFETCH (c); |
| |
| /* \ might escape characters inside [...] and [^...]. */ |
| if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| PATFETCH (c1); |
| BUF_PUSH(c1); |
| laststart[5]++; /* Update the length of chars */ |
| range_start = c1; |
| continue; |
| } |
| |
| /* Could be the end of the bracket expression. If it's |
| not (i.e., when the bracket expression is `[]' so |
| far), the ']' character bit gets set way below. */ |
| if (c == ']' && p != p1 + 1) |
| break; |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character class. */ |
| if (had_char_class && c == '-' && *p != ']') |
| FREE_STACK_RETURN (REG_ERANGE); |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character: if this is a hyphen not at the |
| beginning or the end of a list, then it's the range |
| operator. */ |
| if (c == '-' |
| && !(p - 2 >= pattern && p[-2] == '[') |
| && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
| && *p != ']') |
| { |
| reg_errcode_t ret; |
| /* Allocate the space for range_start and range_end. */ |
| GET_BUFFER_SPACE (2); |
| /* Update the pointer to indicate end of buffer. */ |
| b += 2; |
| ret = wcs_compile_range (range_start, &p, pend, translate, |
| syntax, b, laststart); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| else if (p[0] == '-' && p[1] != ']') |
| { /* This handles ranges made up of characters only. */ |
| reg_errcode_t ret; |
| |
| /* Move past the `-'. */ |
| PATFETCH (c1); |
| /* Allocate the space for range_start and range_end. */ |
| GET_BUFFER_SPACE (2); |
| /* Update the pointer to indicate end of buffer. */ |
| b += 2; |
| ret = wcs_compile_range (c, &p, pend, translate, syntax, b, |
| laststart); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| |
| /* See if we're at the beginning of a possible character |
| class. */ |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
| { /* Leave room for the null. */ |
| char str[CHAR_CLASS_MAX_LENGTH + 1]; |
| |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[:'. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == ':' && *p == ']') || p == pend) |
| break; |
| if (c1 < CHAR_CLASS_MAX_LENGTH) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| /* If isn't a word bracketed by `[:' and `:]': |
| undo the ending character, the letters, and leave |
| the leading `:' and `[' (but store them as character). */ |
| if (c == ':' && *p == ']') |
| { |
| wctype_t wt; |
| uintptr_t alignedp; |
| |
| /* Query the character class as wctype_t. */ |
| wt = IS_CHAR_CLASS (str); |
| if (wt == 0) |
| FREE_STACK_RETURN (REG_ECTYPE); |
| |
| /* Throw away the ] at the end of the character |
| class. */ |
| PATFETCH (c); |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| /* Allocate the space for character class. */ |
| GET_BUFFER_SPACE(CHAR_CLASS_SIZE); |
| /* Update the pointer to indicate end of buffer. */ |
| b += CHAR_CLASS_SIZE; |
| /* Move data which follow character classes |
| not to violate the data. */ |
| insert_space(CHAR_CLASS_SIZE, |
| laststart + 6 + laststart[1], |
| b - 1); |
| alignedp = ((uintptr_t)(laststart + 6 + laststart[1]) |
| + __alignof__(wctype_t) - 1) |
| & ~(uintptr_t)(__alignof__(wctype_t) - 1); |
| /* Store the character class. */ |
| *((wctype_t*)alignedp) = wt; |
| /* Update length of char_classes */ |
| laststart[1] += CHAR_CLASS_SIZE; |
| |
| had_char_class = true; |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| BUF_PUSH ('['); |
| BUF_PUSH (':'); |
| laststart[5] += 2; /* Update the length of characters */ |
| range_start = ':'; |
| had_char_class = false; |
| } |
| } |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && (*p == '=' |
| || *p == '.')) |
| { |
| CHAR_T str[128]; /* Should be large enough. */ |
| CHAR_T delim = *p; /* '=' or '.' */ |
| # ifdef _LIBC |
| uint32_t nrules = |
| _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
| # endif |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[=' or '[[.'. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == delim && *p == ']') || p == pend) |
| break; |
| if (c1 < sizeof (str) - 1) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| if (c == delim && *p == ']' && str[0] != '\0') |
| { |
| unsigned int i, offset; |
| /* If we have no collation data we use the default |
| collation in which each character is in a class |
| by itself. It also means that ASCII is the |
| character set and therefore we cannot have character |
| with more than one byte in the multibyte |
| representation. */ |
| |
| /* If not defined _LIBC, we push the name and |
| `\0' for the sake of matching performance. */ |
| int datasize = c1 + 1; |
| |
| # ifdef _LIBC |
| int32_t idx = 0; |
| if (nrules == 0) |
| # endif |
| { |
| if (c1 != 1) |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| } |
| # ifdef _LIBC |
| else |
| { |
| const int32_t *table; |
| const int32_t *weights; |
| const int32_t *extra; |
| const int32_t *indirect; |
| wint_t *cp; |
| |
| /* This #include defines a local function! */ |
| # include <locale/weightwc.h> |
| |
| if(delim == '=') |
| { |
| /* We push the index for equivalence class. */ |
| cp = (wint_t*)str; |
| |
| table = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_TABLEWC); |
| weights = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_WEIGHTWC); |
| extra = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_EXTRAWC); |
| indirect = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_INDIRECTWC); |
| |
| idx = findidx ((const wint_t**)&cp); |
| if (idx == 0 || cp < (wint_t*) str + c1) |
| /* This is no valid character. */ |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| |
| str[0] = (wchar_t)idx; |
| } |
| else /* delim == '.' */ |
| { |
| /* We push collation sequence value |
| for collating symbol. */ |
| int32_t table_size; |
| const int32_t *symb_table; |
| const unsigned char *extra; |
| int32_t idx; |
| int32_t elem; |
| int32_t second; |
| int32_t hash; |
| char char_str[c1]; |
| |
| /* We have to convert the name to a single-byte |
| string. This is possible since the names |
| consist of ASCII characters and the internal |
| representation is UCS4. */ |
| for (i = 0; i < c1; ++i) |
| char_str[i] = str[i]; |
| |
| table_size = |
| _NL_CURRENT_WORD (LC_COLLATE, |
| _NL_COLLATE_SYMB_HASH_SIZEMB); |
| symb_table = (const int32_t *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_SYMB_TABLEMB); |
| extra = (const unsigned char *) |
| _NL_CURRENT (LC_COLLATE, |
| _NL_COLLATE_SYMB_EXTRAMB); |
| |
| /* Locate the character in the hashing table. */ |
| hash = elem_hash (char_str, c1); |
| |
| idx = 0; |
| elem = hash % table_size; |
| second = hash % (table_size - 2); |
| while (symb_table[2 * elem] != 0) |
| { |
| /* First compare the hashing value. */ |
| if (symb_table[2 * elem] == hash |
| && c1 == extra[symb_table[2 * elem + 1]] |
| && memcmp (char_str, |
| &extra[symb_table[2 * elem + 1] |
| + 1], c1) == 0) |
| { |
| /* Yep, this is the entry. */ |
| idx = symb_table[2 * elem + 1]; |
| idx += 1 + extra[idx]; |
| break; |
| } |
| |
| /* Next entry. */ |
| elem += second; |
| } |
| |
| if (symb_table[2 * elem] != 0) |
| { |
| /* Compute the index of the byte sequence |
| in the table. */ |
| idx += 1 + extra[idx]; |
| /* Adjust for the alignment. */ |
| idx = (idx + 3) & ~3; |
| |
| str[0] = (wchar_t) idx + 4; |
| } |
| else if (symb_table[2 * elem] == 0 && c1 == 1) |
| { |
| /* No valid character. Match it as a |
| single byte character. */ |
| had_char_class = false; |
| BUF_PUSH(str[0]); |
| /* Update the length of characters */ |
| laststart[5]++; |
| range_start = str[0]; |
| |
| /* Throw away the ] at the end of the |
| collating symbol. */ |
| PATFETCH (c); |
| /* exit from the switch block. */ |
| continue; |
| } |
| else |
| FREE_STACK_RETURN (REG_ECOLLATE); |
| } |
| datasize = 1; |
| } |
| # endif |
| /* Throw away the ] at the end of the equivalence |
| class (or collating symbol). */ |
| PATFETCH (c); |
| |
| /* Allocate the space for the equivalence class |
| (or collating symbol) (and '\0' if needed). */ |
| GET_BUFFER_SPACE(datasize); |
| /* Update the pointer to indicate end of buffer. */ |
| b += datasize; |
| |
| if (delim == '=') |
| { /* equivalence class */ |
| /* Calculate the offset of char_ranges, |
| which is next to equivalence_classes. */ |
| offset = laststart[1] + laststart[2] |
| + laststart[3] +6; |
| /* Insert space. */ |
| insert_space(datasize, laststart + offset, b - 1); |
| |
| /* Write the equivalence_class and \0. */ |
| for (i = 0 ; i < datasize ; i++) |
| laststart[offset + i] = str[i]; |
| |
| /* Update the length of equivalence_classes. */ |
| laststart[3] += datasize; |
| had_char_class = true; |
| } |
| else /* delim == '.' */ |
| { /* collating symbol */ |
| /* Calculate the offset of the equivalence_classes, |
| which is next to collating_symbols. */ |
| offset = laststart[1] + laststart[2] + 6; |
| /* Insert space and write the collationg_symbol |
| and \0. */ |
| insert_space(datasize, laststart + offset, b-1); |
| for (i = 0 ; i < datasize ; i++) |
| laststart[offset + i] = str[i]; |
| |
| /* In re_match_2_internal if range_start < -1, we |
| assume -range_start is the offset of the |
| collating symbol which is specified as |
| the character of the range start. So we assign |
| -(laststart[1] + laststart[2] + 6) to |
| range_start. */ |
| range_start = -(laststart[1] + laststart[2] + 6); |
| /* Update the length of collating_symbol. */ |
| laststart[2] += datasize; |
| had_char_class = false; |
| } |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| BUF_PUSH ('['); |
| BUF_PUSH (delim); |
| laststart[5] += 2; /* Update the length of characters */ |
| range_start = delim; |
| had_char_class = false; |
| } |
| } |
| else |
| { |
| had_char_class = false; |
| BUF_PUSH(c); |
| laststart[5]++; /* Update the length of characters */ |
| range_start = c; |
| } |
| } |
| |
| #else /* BYTE */ |
| /* Ensure that we have enough space to push a charset: the |
| opcode, the length count, and the bitset; 34 bytes in all. */ |
| GET_BUFFER_SPACE (34); |
| |
| laststart = b; |
| |
| /* We test `*p == '^' twice, instead of using an if |
| statement, so we only need one BUF_PUSH. */ |
| BUF_PUSH (*p == '^' ? charset_not : charset); |
| if (*p == '^') |
| p++; |
| |
| /* Remember the first position in the bracket expression. */ |
| p1 = p; |
| |
| /* Push the number of bytes in the bitmap. */ |
| BUF_PUSH ((1 << BYTEWIDTH) / BYTEWIDTH); |
| |
| /* Clear the whole map. */ |
| bzero (b, (1 << BYTEWIDTH) / BYTEWIDTH); |
| |
| /* charset_not matches newline according to a syntax bit. */ |
| if ((re_opcode_t) b[-2] == charset_not |
| && (syntax & RE_HAT_LISTS_NOT_NEWLINE)) |
| SET_LIST_BIT ('\n'); |
| |
| /* Read in characters and ranges, setting map bits. */ |
| for (;;) |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| PATFETCH (c); |
| |
| /* \ might escape characters inside [...] and [^...]. */ |
| if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) && c == '\\') |
| { |
| if (p == pend) FREE_STACK_RETURN (REG_EESCAPE); |
| |
| PATFETCH (c1); |
| SET_LIST_BIT (c1); |
| range_start = c1; |
| continue; |
| } |
| |
| /* Could be the end of the bracket expression. If it's |
| not (i.e., when the bracket expression is `[]' so |
| far), the ']' character bit gets set way below. */ |
| if (c == ']' && p != p1 + 1) |
| break; |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character class. */ |
| if (had_char_class && c == '-' && *p != ']') |
| FREE_STACK_RETURN (REG_ERANGE); |
| |
| /* Look ahead to see if it's a range when the last thing |
| was a character: if this is a hyphen not at the |
| beginning or the end of a list, then it's the range |
| operator. */ |
| if (c == '-' |
| && !(p - 2 >= pattern && p[-2] == '[') |
| && !(p - 3 >= pattern && p[-3] == '[' && p[-2] == '^') |
| && *p != ']') |
| { |
| reg_errcode_t ret |
| = byte_compile_range (range_start, &p, pend, translate, |
| syntax, b); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| |
| else if (p[0] == '-' && p[1] != ']') |
| { /* This handles ranges made up of characters only. */ |
| reg_errcode_t ret; |
| |
| /* Move past the `-'. */ |
| PATFETCH (c1); |
| |
| ret = byte_compile_range (c, &p, pend, translate, syntax, b); |
| if (ret != REG_NOERROR) FREE_STACK_RETURN (ret); |
| range_start = 0xffffffff; |
| } |
| |
| /* See if we're at the beginning of a possible character |
| class. */ |
| |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') |
| { /* Leave room for the null. */ |
| char str[CHAR_CLASS_MAX_LENGTH + 1]; |
| |
| PATFETCH (c); |
| c1 = 0; |
| |
| /* If pattern is `[[:'. */ |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (;;) |
| { |
| PATFETCH (c); |
| if ((c == ':' && *p == ']') || p == pend) |
| break; |
| if (c1 < CHAR_CLASS_MAX_LENGTH) |
| str[c1++] = c; |
| else |
| /* This is in any case an invalid class name. */ |
| str[0] = '\0'; |
| } |
| str[c1] = '\0'; |
| |
| /* If it is a word bracketed by `[:' and `:]', handle it as a |
| character class. If it isn't: undo the ending character, the |
| letters, and leave the leading `:' and `[' (but set bits for them). */ |
| if (c == ':' && *p == ']') |
| { |
| # if defined _LIBC || WIDE_CHAR_SUPPORT |
| boolean is_lower = STREQ (str, "lower"); |
| boolean is_upper = STREQ (str, "upper"); |
| wctype_t wt; |
| int ch; |
| |
| wt = IS_CHAR_CLASS (str); |
| if (wt == 0) |
| FREE_STACK_RETURN (REG_ECTYPE); |
| |
| /* Throw away the ] at the end of the character |
| class. */ |
| PATFETCH (c); |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (ch = 0; ch < 1 << BYTEWIDTH; ++ch) |
| { |
| # ifdef _LIBC |
| if (__iswctype (__btowc (ch), wt)) |
| SET_LIST_BIT (ch); |
| # else |
| if (iswctype (btowc (ch), wt)) |
| SET_LIST_BIT (ch); |
| # endif |
| |
| if (translate && (is_upper || is_lower) |
| && (ISUPPER (ch) || ISLOWER (ch))) |
| SET_LIST_BIT (ch); |
| } |
| |
| had_char_class = true; |
| # else |
| int ch; |
| boolean is_alnum = STREQ (str, "alnum"); |
| boolean is_alpha = STREQ (str, "alpha"); |
| boolean is_blank = STREQ (str, "blank"); |
| boolean is_cntrl = STREQ (str, "cntrl"); |
| boolean is_digit = STREQ (str, "digit"); |
| boolean is_graph = STREQ (str, "graph"); |
| boolean is_lower = STREQ (str, "lower"); |
| boolean is_print = STREQ (str, "print"); |
| boolean is_punct = STREQ (str, "punct"); |
| boolean is_space = STREQ (str, "space"); |
| boolean is_upper = STREQ (str, "upper"); |
| boolean is_xdigit = STREQ (str, "xdigit"); |
| |
| if (!IS_CHAR_CLASS (str)) |
| FREE_STACK_RETURN (REG_ECTYPE); |
| |
| /* Throw away the ] at the end of the character |
| class. */ |
| PATFETCH (c); |
| |
| if (p == pend) FREE_STACK_RETURN (REG_EBRACK); |
| |
| for (ch = 0; ch < 1 << BYTEWIDTH; ch++) |
| { |
| /* This was split into 3 if's to |
| avoid an arbitrary limit in some compiler. */ |
| if ( (is_alnum && ISALNUM (ch)) |
| || (is_alpha && ISALPHA (ch)) |
| || (is_blank && ISBLANK (ch)) |
| || (is_cntrl && ISCNTRL (ch))) |
| SET_LIST_BIT (ch); |
| if ( (is_digit && ISDIGIT (ch)) |
| || (is_graph && ISGRAPH (ch)) |
| || (is_lower && ISLOWER (ch)) |
| || (is_print && ISPRINT (ch))) |
| SET_LIST_BIT (ch); |
| if ( (is_punct && ISPUNCT (ch)) |
| || (is_space && ISSPACE (ch)) |
| || (is_upper && ISUPPER (ch)) |
| || (is_xdigit && ISXDIGIT (ch))) |
| SET_LIST_BIT (ch); |
| if ( translate && (is_upper || is_lower) |
| && (ISUPPER (ch) || ISLOWER (ch))) |
| SET_LIST_BIT (ch); |
| } |
| had_char_class = true; |
| # endif /* libc || wctype.h */ |
| } |
| else |
| { |
| c1++; |
| while (c1--) |
| PATUNFETCH; |
| SET_LIST_BIT ('['); |
| SET_LIST_BIT (':'); |
| range_start = ':'; |
| had_char_class = false; |
| } |
| } |
| else if (syntax & RE_CHAR_CLASSES && c == '[' && *p == '=') |
| { |
| unsigned char str[MB_LEN_MAX + 1]; |
| # ifdef _LIBC |
| uint32_t nrules = |
| _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); |
| # endif |