| /* search.c -- How to search large bodies of text. */ |
| |
| /* This file is part of GNU Info, a program for reading online documentation |
| stored in Info format. |
| |
| Copyright (C) 1993, 97 Free Software Foundation, Inc. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; if not, write to the Free Software |
| Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| |
| Written by Brian Fox (bfox@ai.mit.edu). */ |
| |
| #include "info.h" |
| |
| #include "search.h" |
| #include "nodes.h" |
| |
| /* The search functions take two arguments: |
| |
| 1) a string to search for, and |
| |
| 2) a pointer to a SEARCH_BINDING which contains the buffer, start, |
| and end of the search. |
| |
| They return a long, which is the offset from the start of the buffer |
| at which the match was found. An offset of -1 indicates failure. */ |
| |
| /* A function which makes a binding with buffer and bounds. */ |
| SEARCH_BINDING * |
| make_binding (buffer, start, end) |
| char *buffer; |
| long start, end; |
| { |
| SEARCH_BINDING *binding; |
| |
| binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); |
| binding->buffer = buffer; |
| binding->start = start; |
| binding->end = end; |
| binding->flags = 0; |
| |
| return (binding); |
| } |
| |
| /* Make a copy of BINDING without duplicating the data. */ |
| SEARCH_BINDING * |
| copy_binding (binding) |
| SEARCH_BINDING *binding; |
| { |
| SEARCH_BINDING *copy; |
| |
| copy = make_binding (binding->buffer, binding->start, binding->end); |
| copy->flags = binding->flags; |
| return (copy); |
| } |
| |
| |
| /* **************************************************************** */ |
| /* */ |
| /* The Actual Searching Functions */ |
| /* */ |
| /* **************************************************************** */ |
| |
| /* Search forwards or backwards for the text delimited by BINDING. |
| The search is forwards if BINDING->start is greater than BINDING->end. */ |
| long |
| search (string, binding) |
| char *string; |
| SEARCH_BINDING *binding; |
| { |
| long result; |
| |
| /* If the search is backwards, then search backwards, otherwise forwards. */ |
| if (binding->start > binding->end) |
| result = search_backward (string, binding); |
| else |
| result = search_forward (string, binding); |
| |
| return (result); |
| } |
| |
| /* Search forwards for STRING through the text delimited in BINDING. */ |
| long |
| search_forward (string, binding) |
| char *string; |
| SEARCH_BINDING *binding; |
| { |
| register int c, i, len; |
| register char *buff, *end; |
| char *alternate = (char *)NULL; |
| |
| len = strlen (string); |
| |
| /* We match characters in the search buffer against STRING and ALTERNATE. |
| ALTERNATE is a case reversed version of STRING; this is cheaper than |
| case folding each character before comparison. Alternate is only |
| used if the case folding bit is turned on in the passed BINDING. */ |
| |
| if (binding->flags & S_FoldCase) |
| { |
| alternate = xstrdup (string); |
| |
| for (i = 0; i < len; i++) |
| { |
| if (islower (alternate[i])) |
| alternate[i] = toupper (alternate[i]); |
| else if (isupper (alternate[i])) |
| alternate[i] = tolower (alternate[i]); |
| } |
| } |
| |
| buff = binding->buffer + binding->start; |
| end = binding->buffer + binding->end + 1; |
| |
| while (buff < (end - len)) |
| { |
| for (i = 0; i < len; i++) |
| { |
| c = buff[i]; |
| |
| if ((c != string[i]) && (!alternate || c != alternate[i])) |
| break; |
| } |
| |
| if (!string[i]) |
| { |
| if (alternate) |
| free (alternate); |
| if (binding->flags & S_SkipDest) |
| buff += len; |
| return ((long) (buff - binding->buffer)); |
| } |
| |
| buff++; |
| } |
| |
| if (alternate) |
| free (alternate); |
| |
| return ((long) -1); |
| } |
| |
| /* Search for STRING backwards through the text delimited in BINDING. */ |
| long |
| search_backward (input_string, binding) |
| char *input_string; |
| SEARCH_BINDING *binding; |
| { |
| register int c, i, len; |
| register char *buff, *end; |
| char *string; |
| char *alternate = (char *)NULL; |
| |
| len = strlen (input_string); |
| |
| /* Reverse the characters in the search string. */ |
| string = (char *)xmalloc (1 + len); |
| for (c = 0, i = len - 1; input_string[c]; c++, i--) |
| string[i] = input_string[c]; |
| |
| string[c] = '\0'; |
| |
| /* We match characters in the search buffer against STRING and ALTERNATE. |
| ALTERNATE is a case reversed version of STRING; this is cheaper than |
| case folding each character before comparison. ALTERNATE is only |
| used if the case folding bit is turned on in the passed BINDING. */ |
| |
| if (binding->flags & S_FoldCase) |
| { |
| alternate = xstrdup (string); |
| |
| for (i = 0; i < len; i++) |
| { |
| if (islower (alternate[i])) |
| alternate[i] = toupper (alternate[i]); |
| else if (isupper (alternate[i])) |
| alternate[i] = tolower (alternate[i]); |
| } |
| } |
| |
| buff = binding->buffer + binding->start - 1; |
| end = binding->buffer + binding->end; |
| |
| while (buff > (end + len)) |
| { |
| for (i = 0; i < len; i++) |
| { |
| c = *(buff - i); |
| |
| if (c != string[i] && (alternate && c != alternate[i])) |
| break; |
| } |
| |
| if (!string[i]) |
| { |
| free (string); |
| if (alternate) |
| free (alternate); |
| |
| if (binding->flags & S_SkipDest) |
| buff -= len; |
| return ((long) (1 + (buff - binding->buffer))); |
| } |
| |
| buff--; |
| } |
| |
| free (string); |
| if (alternate) |
| free (alternate); |
| |
| return ((long) -1); |
| } |
| |
| /* Find STRING in LINE, returning the offset of the end of the string. |
| Return an offset of -1 if STRING does not appear in LINE. The search |
| is bound by the end of the line (i.e., either NEWLINE or 0). */ |
| int |
| string_in_line (string, line) |
| char *string, *line; |
| { |
| register int end; |
| SEARCH_BINDING binding; |
| |
| /* Find the end of the line. */ |
| for (end = 0; line[end] && line[end] != '\n'; end++); |
| |
| /* Search for STRING within these confines. */ |
| binding.buffer = line; |
| binding.start = 0; |
| binding.end = end; |
| binding.flags = S_FoldCase | S_SkipDest; |
| |
| return (search_forward (string, &binding)); |
| } |
| |
| /* Return non-zero if STRING is the first text to appear at BINDING. */ |
| int |
| looking_at (string, binding) |
| char *string; |
| SEARCH_BINDING *binding; |
| { |
| long search_end; |
| |
| search_end = search (string, binding); |
| |
| /* If the string was not found, SEARCH_END is -1. If the string was found, |
| but not right away, SEARCH_END is != binding->start. Otherwise, the |
| string was found at binding->start. */ |
| return (search_end == binding->start); |
| } |
| |
| /* **************************************************************** */ |
| /* */ |
| /* Small String Searches */ |
| /* */ |
| /* **************************************************************** */ |
| |
| /* Function names that start with "skip" are passed a string, and return |
| an offset from the start of that string. Function names that start |
| with "find" are passed a SEARCH_BINDING, and return an absolute position |
| marker of the item being searched for. "Find" functions return a value |
| of -1 if the item being looked for couldn't be found. */ |
| |
| /* Return the index of the first non-whitespace character in STRING. */ |
| int |
| skip_whitespace (string) |
| char *string; |
| { |
| register int i; |
| |
| for (i = 0; string && whitespace (string[i]); i++); |
| return (i); |
| } |
| |
| /* Return the index of the first non-whitespace or newline character in |
| STRING. */ |
| int |
| skip_whitespace_and_newlines (string) |
| char *string; |
| { |
| register int i; |
| |
| for (i = 0; string && (whitespace (string[i]) || string[i] == '\n'); i++); |
| return (i); |
| } |
| |
| /* Return the index of the first whitespace character in STRING. */ |
| int |
| skip_non_whitespace (string) |
| char *string; |
| { |
| register int i; |
| |
| for (i = 0; string && !whitespace (string[i]); i++); |
| return (i); |
| } |
| |
| /* Return the index of the first non-node character in STRING. Note that |
| this function contains quite a bit of hair to ignore periods in some |
| special cases. This is because we here at GNU ship some info files which |
| contain nodenames that contain periods. No such nodename can start with |
| a period, or continue with whitespace, newline, or ')' immediately following |
| the period. If second argument NEWLINES_OKAY is non-zero, newlines should |
| be skipped while parsing out the nodename specification. */ |
| int |
| skip_node_characters (string, newlines_okay) |
| char *string; |
| int newlines_okay; |
| { |
| register int c, i = 0; |
| int paren_seen = 0; |
| int paren = 0; |
| |
| /* Handle special case. This is when another function has parsed out the |
| filename component of the node name, and we just want to parse out the |
| nodename proper. In that case, a period at the start of the nodename |
| indicates an empty nodename. */ |
| if (string && *string == '.') |
| return (0); |
| |
| if (string && *string == '(') |
| { |
| paren++; |
| paren_seen++; |
| i++; |
| } |
| |
| for (; string && (c = string[i]); i++) |
| { |
| if (paren) |
| { |
| if (c == '(') |
| paren++; |
| else if (c == ')') |
| paren--; |
| |
| continue; |
| } |
| |
| /* If the character following the close paren is a space or period, |
| then this node name has no more characters associated with it. */ |
| if (c == '\t' || |
| c == ',' || |
| c == INFO_TAGSEP || |
| ((!newlines_okay) && (c == '\n')) || |
| ((paren_seen && string[i - 1] == ')') && |
| (c == ' ' || c == '.')) || |
| (c == '.' && |
| ( |
| #if 0 |
| /* This test causes a node name ending in a period, like `This.', not to |
| be found. The trailing . is stripped. This occurs in the jargon |
| file (`I see no X here.' is a node name). */ |
| (!string[i + 1]) || |
| #endif |
| (whitespace_or_newline (string[i + 1])) || |
| (string[i + 1] == ')')))) |
| break; |
| } |
| return (i); |
| } |
| |
| |
| /* **************************************************************** */ |
| /* */ |
| /* Searching FILE_BUFFER's */ |
| /* */ |
| /* **************************************************************** */ |
| |
| /* Return the absolute position of the first occurence of a node separator in |
| BINDING-buffer. The search starts at BINDING->start. Return -1 if no node |
| separator was found. */ |
| long |
| find_node_separator (binding) |
| SEARCH_BINDING *binding; |
| { |
| register long i; |
| char *body; |
| |
| body = binding->buffer; |
| |
| /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are |
| optional, but the DELETE and NEWLINE are not. This separator holds |
| true for all separated elements in an Info file, including the tags |
| table (if present) and the indirect tags table (if present). */ |
| for (i = binding->start; i < binding->end - 1; i++) |
| if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && |
| (body[i + 2] == '\n' || |
| (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || |
| ((body[i] == INFO_COOKIE) && |
| (body[i + 1] == '\n' || |
| (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) |
| return (i); |
| return (-1); |
| } |
| |
| /* Return the length of the node separator characters that BODY is |
| currently pointing at. */ |
| int |
| skip_node_separator (body) |
| char *body; |
| { |
| register int i; |
| |
| i = 0; |
| |
| if (body[i] == INFO_FF) |
| i++; |
| |
| if (body[i++] != INFO_COOKIE) |
| return (0); |
| |
| if (body[i] == INFO_FF) |
| i++; |
| |
| if (body[i++] != '\n') |
| return (0); |
| |
| return (i); |
| } |
| |
| /* Return the number of characters from STRING to the start of |
| the next line. */ |
| int |
| skip_line (string) |
| char *string; |
| { |
| register int i; |
| |
| for (i = 0; string && string[i] && string[i] != '\n'; i++); |
| |
| if (string[i] == '\n') |
| i++; |
| |
| return (i); |
| } |
| |
| /* Return the absolute position of the beginning of a tags table in this |
| binding starting the search at binding->start. */ |
| long |
| find_tags_table (binding) |
| SEARCH_BINDING *binding; |
| { |
| SEARCH_BINDING search; |
| long position; |
| |
| search.buffer = binding->buffer; |
| search.start = binding->start; |
| search.end = binding->end; |
| search.flags = S_FoldCase; |
| |
| while ((position = find_node_separator (&search)) != -1 ) |
| { |
| search.start = position; |
| search.start += skip_node_separator (search.buffer + search.start); |
| |
| if (looking_at (TAGS_TABLE_BEG_LABEL, &search)) |
| return (position); |
| } |
| return (-1); |
| } |
| |
| /* Return the absolute position of the node named NODENAME in BINDING. |
| This is a brute force search, and we wish to avoid it when possible. |
| This function is called when a tag (indirect or otherwise) doesn't |
| really point to the right node. It returns the absolute position of |
| the separator preceding the node. */ |
| long |
| find_node_in_binding (nodename, binding) |
| char *nodename; |
| SEARCH_BINDING *binding; |
| { |
| long position; |
| int offset, namelen; |
| SEARCH_BINDING search; |
| |
| namelen = strlen (nodename); |
| |
| search.buffer = binding->buffer; |
| search.start = binding->start; |
| search.end = binding->end; |
| search.flags = 0; |
| |
| while ((position = find_node_separator (&search)) != -1) |
| { |
| search.start = position; |
| search.start += skip_node_separator (search.buffer + search.start); |
| |
| offset = string_in_line (INFO_NODE_LABEL, search.buffer + search.start); |
| |
| if (offset == -1) |
| continue; |
| |
| search.start += offset; |
| search.start += skip_whitespace (search.buffer + search.start); |
| offset = skip_node_characters |
| (search.buffer + search.start, DONT_SKIP_NEWLINES); |
| |
| /* Notice that this is an exact match. You cannot grovel through |
| the buffer with this function looking for random nodes. */ |
| if ((offset == namelen) && |
| (search.buffer[search.start] == nodename[0]) && |
| (strncmp (search.buffer + search.start, nodename, offset) == 0)) |
| return (position); |
| } |
| return (-1); |
| } |