info/scan.c - texinfo - Git at Google

 /* scan.c -- scanning Info files and nodes

    Copyright 1993-2022 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Originally written by Brian Fox. */

 #include "info.h"
 #include "session.h"
 #include "scan.h"
 #include "util.h"
 #include "tag.h"

 #include <langinfo.h>
 #if HAVE_ICONV
 # include <iconv.h>
 #endif
 #include <wchar.h>
 #ifdef __MINGW32__
 /* MinGW uses a replacement nl_langinfo, see pcterm.c.  */
 # define nl_langinfo rpl_nl_langinfo
 extern char * rpl_nl_langinfo (nl_item);
 /* MinGW uses its own replacement wcwidth, see pcterm.c for the
    reasons.  Since Gnulib's wchar.h might redirect wcwidth to
    rpl_wcwidth, we explicitly undo that here.  */
 #undef wcwidth
 #endif

 #ifdef __hpux
 #define va_copy(ap1,ap2) memcpy((&ap1),(&ap2),sizeof(va_list))
 #endif

 /* Variable which holds the most recent filename parsed as a result of
    calling info_parse_xxx ().  The string is owned by this file:
    info_parse_node () frees the previous value at the start of each call
    and leaves a null pointer here when no filename was read. */
 char *info_parsed_filename = NULL;

 /* Variable which holds the most recent nodename parsed as a result of
    calling info_parse_xxx ().  Like info_parsed_filename, it is freed and
    replaced by each call to info_parse_node (). */
 char *info_parsed_nodename = NULL;

 /* Scan a file name written as "(NAME)" at the start of STRING.  Nested
    parentheses inside NAME are accepted as long as they balance.  If the
    string ends while parentheses are still open, NAME is taken to stop at
    the first ")" that was seen.

    When FILENAME is not null, *FILENAME receives a newly allocated copy of
    NAME, without the enclosing parentheses.  The return value is the number
    of bytes consumed, i.e. the length of NAME plus two.

    If STRING does not begin with "(", return 0 and leave *FILENAME alone.
    If no ")" is found at all, return 0 and store a null pointer in
    *FILENAME. */
 int
 read_bracketed_filename (char *string, char **filename)
 {
   char *name;
   int depth = 1;        /* Unclosed "(" so far, counting the leading one. */
   int first_close = -1; /* Index within NAME of the first ")", if any. */
   int name_len;

   if (string[0] != '(')
     return 0;

   name = string + 1;
   for (name_len = 0; name[name_len] != '\0'; name_len++)
     {
       char c = name[name_len];

       if (c == '(')
         depth++;
       else if (c == ')')
         {
           if (first_close < 0)
             first_close = name_len;

           if (--depth == 0)
             break; /* Matching ")" found; NAME_LEN is its index. */
         }
     }

   if (depth > 0)
     {
       /* Ran off the end of the string with parentheses still open. */
       if (first_close < 0)
         {
           if (filename)
             *filename = 0;
           return 0;
         }
       name_len = first_close;
     }

   if (filename)
     {
       /* xcalloc zero-fills, which supplies the terminating null byte. */
       *filename = xcalloc (1, name_len + 1);
       memcpy (*filename, name, name_len);
     }

   return name_len + 2; /* NAME plus "(" and ")". */
 }

 /* Split STRING, of the form "(FILE)NODE", into its two parts and store
    them in INFO_PARSED_FILENAME and INFO_PARSED_NODENAME.  Both variables
    are owned by this function and must not be freed by the caller; their
    previous values are released on every call.

    INFO_PARSED_FILENAME is null when there is no "(FILE)" part.
    INFO_PARSED_NODENAME is null when STRING is null or empty; otherwise it
    holds the (possibly empty) text following the file name, with
    whitespace canonicalized. */
 void
 info_parse_node (char *string)
 {
   char *p;
   int name_len;

   /* Throw away the results of the previous call. */
   free (info_parsed_filename);
   info_parsed_filename = 0;
   free (info_parsed_nodename);
   info_parsed_nodename = 0;

   if (string == 0 || string[0] == '\0')
     return;

   p = string;
   p += skip_whitespace_and_newlines (p);

   /* Optional "(FILE)" component. */
   p += read_bracketed_filename (p, &info_parsed_filename);

   /* Everything after that, up to the end of the string, is the node. */
   p += skip_whitespace_and_newlines (p);
   name_len = read_quoted_string (p, "", 0, &info_parsed_nodename);

   if (name_len != 0)
     canonicalize_whitespace (info_parsed_nodename);
 }

 /* Set *OUTPUT to a newly allocated copy of the string starting at START
    and finishing at a character in TERMINATOR, unless START[0] is the
    INFO_QUOTE byte (octal 177), in which case copy the string from START+1
    until the next occurrence of that byte.  If TERMINATOR is an empty
    string, finish at a null character.

    LINES is the number of lines that the string can span.  If LINES is
    zero, there is no limit.  While scanning, the input is temporarily
    truncated after LINES lines by writing a null byte into it; the
    overwritten byte is restored before returning.

    Return the length of the string in the input, including any quoting
    bytes that are actually present.  The returned length never exceeds the
    length of START.  If TERMINATOR is non-empty and the closing character
    (or closing quote) is missing, the input is invalid: return 0 and set
    *OUTPUT to a null pointer. */
 long
 read_quoted_string (char *start, char *terminator, int lines, char **output)
 {
   long len;
   char *text;          /* First byte of the string proper. */
   char *limit = 0;     /* Where the input was cut short, if anywhere. */
   char saved_char = '\0';
   int quoted, closed;

   if (lines)
     {
       int i;

       limit = start;
       for (i = 0; i < lines; i++)
         {
           limit = strchr (limit, '\n');
           if (!limit)
             break; /* End of input string reached. */
           limit++;
         }
       if (limit)
         {
           saved_char = *limit;
           *limit = '\0';
         }
     }

   quoted = (start[0] == '\177');
   text = quoted ? start + 1 : start;
   len = strcspn (text, quoted ? "\177" : terminator);

   /* Whether we stopped at a closing byte rather than at the end of the
      (possibly truncated) input. */
   closed = (text[len] != '\0');

   if (*terminator && !closed)
     {
       /* No terminator, or no closing 177 byte. */
       len = 0;
       *output = 0;
     }
   else
     {
       *output = xmalloc (len + 1);
       memcpy (*output, text, len);
       (*output)[len] = '\0';

       /* Count the quoting bytes.  The closing one is only counted if it
          is really there; otherwise the result would point past the end
          of the input. */
       if (quoted)
         len += closed ? 2 : 1;
     }

   if (limit)
     *limit = saved_char;
   return len;
 }


 /* **************************************************************** */
 /*                                                                  */
 /*                  Finding and Building Menus                      */
 /*                                                                  */
 /* **************************************************************** */

 /* Look up the menu item of NODE whose label is LABEL, ignoring case.
    Only references of type REFERENCE_MENU_ITEM are considered.  Return the
    matching entry, or a null pointer if there is none.  The result points
    into NODE's reference list and must not be freed by the caller.

    If SLOPPY is non-zero and no entry matches exactly, fall back to the
    first menu item whose label merely begins with LABEL, so that
    "Buffers" is found for a LABEL of "buffer". */
 REFERENCE *
 info_get_menu_entry_by_label (NODE *node, char *label, int sloppy)
 {
   REFERENCE **refs = node->references;
   REFERENCE **rp;
   REFERENCE *prefix_match = 0;

   if (!refs)
     return 0;

   for (rp = refs; *rp; rp++)
     {
       REFERENCE *candidate = *rp;

       if (candidate->type != REFERENCE_MENU_ITEM)
         continue;

       /* An exact match always wins, even over an earlier prefix match. */
       if (mbscasecmp (label, candidate->label) == 0)
         return candidate;

       /* Remember only the first prefix match. */
       if (sloppy && !prefix_match
           && mbsncasecmp (candidate->label, label, strlen (label)) == 0)
         prefix_match = candidate;
     }

   return prefix_match;
 }

 /* Build a new null-terminated REFERENCE * array holding the elements of
    REF1 followed by the elements of REF2.  Either argument may be a null
    pointer, which is treated as an empty list.  Only the array is new; the
    REFERENCE objects themselves are shared with the input lists. */
 REFERENCE **
 info_concatenate_references (REFERENCE **ref1, REFERENCE **ref2)
 {
   REFERENCE **lists[2];
   REFERENCE **result, **out;
   int total = 0;
   int which, k;

   lists[0] = ref1;
   lists[1] = ref2;

   /* First pass: count the slots needed. */
   for (which = 0; which < 2; which++)
     if (lists[which])
       for (k = 0; lists[which][k]; k++)
         total++;

   result = xmalloc ((1 + total) * sizeof (REFERENCE *));

   /* Second pass: copy the pointers over, in order. */
   out = result;
   for (which = 0; which < 2; which++)
     if (lists[which])
       for (k = 0; lists[which][k]; k++)
         *out++ = lists[which][k];

   *out = NULL;
   return result;
 }

 /* Return a newly allocated duplicate of SRC.  The label, filename and
    nodename strings are duplicated as well (null pointers stay null), so
    the copy can be freed independently of the original. */
 REFERENCE *
 info_copy_reference (REFERENCE *src)
 {
   REFERENCE *copy = xmalloc (sizeof (REFERENCE));

   /* Plain values are taken over unchanged. */
   copy->type = src->type;
   copy->line_number = src->line_number;
   copy->start = src->start;
   copy->end = src->end;

   /* Strings get their own storage. */
   copy->label = NULL;
   if (src->label)
     copy->label = xstrdup (src->label);

   copy->filename = NULL;
   if (src->filename)
     copy->filename = xstrdup (src->filename);

   copy->nodename = NULL;
   if (src->nodename)
     copy->nodename = xstrdup (src->nodename);

   return copy;
 }

 /* Copy a null-terminated list of references, duplicating each reference
    in turn with info_copy_reference.  Return a null pointer if REF1 is a
    null pointer. */
 REFERENCE **
 info_copy_references (REFERENCE **ref1)
 {
   REFERENCE **copy;
   int count = 0;
   int k;

   if (!ref1)
     return 0;

   /* Find out how many entries there are. */
   while (ref1[count])
     count++;

   copy = xmalloc ((1 + count) * sizeof (REFERENCE *));

   for (k = 0; k < count; k++)
     copy[k] = info_copy_reference (ref1[k]);
   copy[count] = NULL;

   return copy;
 }

 /* Free REF together with its label, filename and nodename strings.  A
    null REF is ignored. */
 void
 info_reference_free (REFERENCE *ref)
 {
   if (!ref)
     return;

   free (ref->nodename);
   free (ref->filename);
   free (ref->label);
   free (ref);
 }

 /* Free every reference in the null-terminated list REFERENCES, and then
    the list itself.  A null REFERENCES is ignored. */
 void
 info_free_references (REFERENCE **references)
 {
   REFERENCE **rp;

   if (!references)
     return;

   for (rp = references; *rp; rp++)
     info_reference_free (*rp);

   free (references);
 }

 /* Allocate a REFERENCE whose filename and nodename are copies of FILENAME
    and NODENAME (either may be a null pointer, in which case the field is
    null too).  The label is null and all remaining fields are zero. */
 REFERENCE *
 info_new_reference (char *filename, char *nodename)
 {
   REFERENCE *ref = xmalloc (sizeof (REFERENCE));

   ref->type = 0;
   ref->line_number = 0;
   ref->start = 0;
   ref->end = 0;
   ref->label = 0;

   ref->filename = 0;
   if (filename)
     ref->filename = xstrdup (filename);

   ref->nodename = 0;
   if (nodename)
     ref->nodename = xstrdup (nodename);

   return ref;
 }


 /* Search for sequences of whitespace or newlines in STRING, replacing
    all such sequences with just a single space.  Remove whitespace from
    start and end of string.  STRING is modified in place; a null STRING
    is ignored.

    The result is never longer than the input, so the string is compacted
    within its own storage and no temporary buffer is needed. */
 void
 canonicalize_whitespace (char *string)
 {
   char *src;        /* Next byte to read. */
   char *dst;        /* Next byte to write; never ahead of SRC. */
   int pending = 0;  /* Non-zero if whitespace was skipped since the last
                        byte written. */

   if (!string)
     return;

   dst = string;
   for (src = string; *src; src++)
     {
       if (whitespace_or_newline (*src))
         {
           pending = 1;
           continue;
         }

       if (pending)
         {
           pending = 0;

           /* Suppress whitespace at start of string. */
           if (dst != string)
             *dst++ = ' ';
         }

       *dst++ = *src;
     }

   /* Whitespace at the end of the input is never written, because a space
      is only emitted in front of a following non-whitespace byte.  The
      check below is kept as a safeguard for a trailing byte that
      whitespace () accepts. */
   if (dst != string && whitespace (dst[-1]))
     dst--;

   *dst = '\0';
 }


 /* **************************************************************** */
 /*                                                                  */
 /*                          Scanning node                           */
 /*                                                                  */
 /* **************************************************************** */

 /* Whether to strip syntax from the text of nodes. */
 int preprocess_nodes_p;

 /* Whether contents of nodes should be rewritten.  Set by
    init_conversion () when a character encoding conversion is needed. */
 static int rewrite_p;

 /* inptr is moved forward through the body of a node. */
 static char *inptr;

 /* Pointer to first byte of node (after node separator). */
 static char *input_start;

 /* Number of bytes in node contents. */
 static size_t input_length;

 /* Buffer that rewritten node text is appended to.  It is initialized by
    init_output_stream () only when rewrite_p is set. */
 struct text_buffer output_buf;

 /* Pointer into a tags table for the file to the anchor we need to adjust as
    a result of byte counts changing due to character encoding conversion or
    inserted/deleted text. */
 static TAG **anchor_to_adjust;
 /* Offset within file buffer of first byte of node, used for anchor
    adjustment. */
 static int node_offset;

 /* Difference so far between the number of bytes input in the file and
    bytes output.  Used to adjust the values of anchors in nodes. */
 static long int output_bytes_difference;

 /* Whether we are converting the character encoding of the file. */
 static int convert_encoding_p;

 #if HAVE_ICONV

 /* Whether text in file is encoded in UTF-8. */
 static int file_is_in_utf8;

 /* Used for conversion from file encoding to output encoding. */
 static iconv_t iconv_to_output;

 /* Conversion from file encoding to UTF-8.  Only opened by
    init_conversion () when the file is not already in UTF-8. */
 static iconv_t iconv_to_utf8;

 #endif /* HAVE_ICONV */

 /* Decide whether the contents of file buffer FB have to be converted to
    the character encoding of the current locale, and open the iconv
    descriptors needed for that.  On return convert_encoding_p says whether
    conversion is active; when it is, rewrite_p is switched on as well.

    No conversion is set up if FB is null, if Info was built without iconv,
    if the encoding of the file is unknown or the same as that of the
    locale, or if iconv does not provide the required conversions. */
 void
 init_conversion (FILE_BUFFER *fb)
 {
   char *target_encoding;

   convert_encoding_p = 0;

   /* Node being processed does not come from an Info file. */
   if (fb == 0)
     return;

 #if HAVE_ICONV
   file_is_in_utf8 = 0;

   /* Nothing can be done for a file of unknown encoding. */
   if (fb->encoding == 0)
     return;

   /* The locale tells us which encoding the output should be in. */
   target_encoding = nl_langinfo (CODESET);

   /* File and locale agree: contents can be used as they are. */
   if (strcasecmp(target_encoding, fb->encoding) == 0)
     return;

   iconv_to_output = iconv_open (target_encoding, fb->encoding);
   if (iconv_to_output == (iconv_t) -1)
     return; /* iconv cannot convert between these encodings. */

   if (strcasecmp ("UTF8", fb->encoding) == 0
       || strcasecmp ("UTF-8", fb->encoding) == 0)
     file_is_in_utf8 = 1;
   else
     {
       /* A second conversion, to UTF-8, is needed for files in any other
          encoding. */
       iconv_to_utf8 = iconv_open ("UTF-8", fb->encoding);
       if (iconv_to_utf8 == (iconv_t) -1)
         {
           /* Give up on conversion altogether. */
           iconv_close (iconv_to_output);
           iconv_to_output = (iconv_t) -1;
           return;
         }
     }

   convert_encoding_p = 1;
   rewrite_p = 1;
 #endif /* HAVE_ICONV */
 }

 /* Close the iconv descriptors that init_conversion () opened.  Does
    nothing if no conversion is active or Info was built without iconv. */
 void
 close_conversion (void)
 {
 #if HAVE_ICONV
   if (!convert_encoding_p)
     return;

   /* The UTF-8 descriptor only exists for files in other encodings. */
   if (!file_is_in_utf8)
     iconv_close (iconv_to_utf8);

   iconv_close (iconv_to_output);
   iconv_to_output = (iconv_t) -1;
 #endif
 }

 /* Prepare for writing out a node from file buffer FB: reset the running
    byte difference, set up encoding conversion, and initialize the output
    buffer if the node contents are going to be rewritten. */
 static void
 init_output_stream (FILE_BUFFER *fb)
 {
   output_bytes_difference = 0;
   init_conversion (fb);

   /* rewrite_p may just have been turned on by init_conversion. */
   if (rewrite_p)
     text_buffer_init (&output_buf);
 }

 /* Snapshot taken by save_conversion_state (): the write offset in
    output_buf, the read position in the input, and the byte difference
    accumulated so far. */
 static size_t saved_offset;
 static char *saved_inptr;
 static long saved_difference;

 /* Remember the current state of the input and output streams so that
    reset_conversion () can return to it later. */
 void
 save_conversion_state (void)
 {
   saved_inptr = inptr;
   saved_difference = output_bytes_difference;
   saved_offset = text_buffer_off (&output_buf);
 }

 /* Return to the state recorded by the last call to
    save_conversion_state (): the input position, the write offset of the
    output buffer and the accumulated byte difference are all restored. */
 void
 reset_conversion (void)
 {
   inptr = saved_inptr;
   output_bytes_difference = saved_difference;
   text_buffer_off (&output_buf) = saved_offset;
 }

 /* Append the next N input bytes to the output buffer exactly as they
    are, with no encoding conversion, and advance inptr past them. */
 static void
 copy_direct (long n)
 {
   char *chunk = inptr;

   inptr = chunk + n;
   text_buffer_add_string (&output_buf, chunk, n);
 }

 /* Write an ASCII approximation of the UTF-8 character at *FROM to the
    output buffer.  *FROM_LEFT is the number of input bytes available.

    If the bytes at *FROM start with one of the sequences in the table
    below, its replacement is written, *FROM and *FROM_LEFT are moved past
    the whole sequence, and 1 is returned.  Otherwise a question mark is
    written, *FROM and *FROM_LEFT are moved on by a single byte only, and
    0 is returned. */
 static int
 degrade_utf8 (char **from, size_t *from_left)
 {
   static struct encoding_replacement
   {
     char *from_string;
     char *to_string;
   } er[] = {
     {"\xE2\x80\x98","'"}, /* Opening single quote */
     {"\xE2\x80\x99","'"}, /* Closing single quote */
     {"\xE2\x80\x9C","\""},/* Opening double quote */
     {"\xE2\x80\x9D","\""},/* Closing double quote */
     {"\xC2\xA9","(C)"},   /* Copyright symbol */
     {"\xC2\xBB",">>"},    /* Closing double angle brackets */

     {"\xE2\x86\x92","->"},/* Right arrow */
     {"\xE2\x87\x92","=>"},/* Right double arrow */
     {"\xE2\x8A\xA3","-|"},/* Print symbol */
     {"\xE2\x98\x85","-!-"}, /* Point symbol */
     {"\xE2\x86\xA6","==>"}, /* Expansion symbol */

     {"\xE2\x80\x90","-"},  /* Hyphen */
     {"\xE2\x80\x91","-"},  /* Non-breaking hyphen */
     {"\xE2\x80\x92","-"},  /* Figure dash */
     {"\xE2\x80\x93","-"},  /* En dash */
     {"\xE2\x80\x94","--"},  /* Em dash */
     {"\xE2\x88\x92","-"},  /* Minus sign */
     {"\xE2\x80\xA6","..."},  /* Ellipsis */
     {"\xE2\x80\xA2","*"},  /* Bullet */

     {"\xC3\xA0","a`"},   /* Lower case letter a with grave accent */
     {"\xC3\xA2","a^"},   /* Lower case letter a with circumflex */
     {"\xC3\xA4","a\""},  /* Lower case letter a with diaeresis */
     {"\xC3\xA6","ae"},   /* Lower case letter ae ligature */
     {"\xC3\xA9","e'"},   /* Lower case letter e with acute accent */
     {"\xC3\xA8","e`"},   /* Lower case letter e with grave accent */
     {"\xC3\xAA","e^"},   /* Lower case letter e with circumflex */
     {"\xC3\xAB","e\""},  /* Lower case letter e with diaeresis */
     {"\xC3\xB6","o\""},  /* Lower case letter o with diaeresis */
     {"\xC3\xBC","u\""},  /* Lower case letter u with diaeresis */
     {"\xC3\x84", "A\""},  /* Upper case letter A with diaeresis. */
     {"\xC3\x96", "O\""},  /* Upper case letter O with diaeresis. */
     {"\xC3\x9c", "U\""},  /* Upper case letter U with diaeresis. */

     {"\xC3\xB1","n~"},  /* Lower case letter n with tilde */
     {"\xC3\x87","C,"},  /* Upper case letter C with cedilla */
     {"\xC3\xA7","c,"},  /* Lower case letter c with cedilla */
     {"\xC3\x9f","ss"},  /* Lower case letter sharp s */

     {0, 0}
   };

   int k;

   for (k = 0; er[k].from_string != 0; k++)
     {
       size_t width = strlen (er[k].from_string);

       /* Only compare when the whole sequence fits in the remaining
          input, so that we never read past its end. */
       if (width <= *from_left
           && strncmp (er[k].from_string, *from, width) == 0)
         {
           text_buffer_add_string (&output_buf, er[k].to_string,
                                   strlen(er[k].to_string));
           *from += width;
           *from_left -= width;
           return 1;
         }
     }

   /* No replacement known.  Output a question mark.  Alternatives would
      be the ASCII substitute character SUB (^Z), or passing the original
      bytes through. */
   text_buffer_add_string (&output_buf, "?", 1);

   /* Only a single byte is skipped, as the length in bytes of the
      character is not known here. */
   (*from)++;
   (*from_left)--;

   return 0;
 }

 /* Convert N bytes of input, starting at inptr, from the file encoding to
    the output encoding and append the result to the output buffer.  inptr
    is advanced past the bytes consumed and output_bytes_difference is
    updated by the difference between N and the number of bytes written.

    Characters that cannot be represented in the output encoding are
    degraded to ASCII with degrade_utf8 ().

    If the N bytes end in the middle of a multibyte character, more input
    is read to complete it, as long as it lies within the node.  The return
    value is the number of such extra input bytes consumed beyond N.
    Without iconv support nothing is done and 0 is returned. */
 static int
 copy_converting (long n)
 {
 #if !HAVE_ICONV
   return 0;
 #else
   size_t bytes_left, orig_bytes_left;
   int extra_at_end;
   size_t iconv_ret;
   long output_start;

   size_t utf8_char_free;
   char utf8_char[4]; /* Maximum 4 bytes in a UTF-8 character */
   char *utf8_char_ptr, *orig_inptr;
   size_t i;

   /* Use n as an estimate of how many bytes will be required
      in target encoding. */
   text_buffer_alloc (&output_buf, (size_t) n);

   output_start = text_buffer_off (&output_buf);
   bytes_left = n;
   extra_at_end = 0;
   while (1)
     {
       iconv_ret = text_buffer_iconv (&output_buf, iconv_to_output,
                                      (ICONV_CONST char **)&inptr, &bytes_left);

       /* Make sure libiconv flushes out the last converted character.
          This is required when the conversion is stateful, in which
          case libiconv might not output the last character, waiting to
          see whether it should be combined with the next one.  */
       if (iconv_ret != (size_t) -1
           && text_buffer_iconv (&output_buf, iconv_to_output,
                                 NULL, NULL) != (size_t) -1)
         /* Success: all of input converted. */
         break;

       /* There's been an error while converting. */
       switch (errno)
         {
         case EINVAL:
           /* Incomplete byte sequence at end of input buffer.  Try to read
              more. */

           /* Check that there is at least one more byte within the node
              contents after the bytes we were asked to convert.  The
              comparison is arranged so that nothing is subtracted from an
              unsigned value, which would wrap around for a node shorter
              than two bytes. */
           if ((size_t) (inptr - input_start) + bytes_left + 1
               <= input_length)
             {
               bytes_left++;
               extra_at_end++;
             }
           else
             {
               copy_direct (bytes_left);
               bytes_left = 0;
             }
           continue;
         default: /* Unknown error */
           info_error (_("Error converting file character encoding"));

           /* Skip past current input and hope we don't get an
              error next time. */
           inptr += bytes_left;
           return 0;
         case EILSEQ:
           /* Byte sequence in input not recognized.  Degrade to ASCII.  */
           break;
         }

       /* Flush any waiting input in iconv_to_output and enter the
          default shift state. */
       text_buffer_iconv (&output_buf, iconv_to_output, NULL, NULL);

       if (file_is_in_utf8)
         {
           degrade_utf8 (&inptr, &bytes_left);
           continue;
         }

       /* If file is not in UTF-8, we degrade to ASCII in two steps:
          first convert the character to UTF-8, then look up a replacement
          string.  Note that mixing iconv_to_output and iconv_to_utf8
          on the same input may not work well if the input encoding
          is stateful.  We could deal with this by always converting to
          UTF-8 first; then we could mix conversions on the UTF-8 stream. */

       /* We want to read exactly one character.  Do this by
          restricting size of output buffer. */
       utf8_char_ptr = utf8_char;
       orig_inptr = inptr;
       orig_bytes_left = bytes_left;
       for (i = 1; i <= 4; i++)
         {
           utf8_char_free = i;
           errno = 0;
           iconv_ret = iconv (iconv_to_utf8, (ICONV_CONST char **)&inptr,
                              &bytes_left, &utf8_char_ptr, &utf8_char_free);
           if ((iconv_ret == (size_t) -1 && errno != E2BIG)
               /* If we managed to convert a character: */
               || utf8_char_ptr > utf8_char)
             break;
         }

       /* errno == E2BIG if iconv ran out of output buffer,
          which is expected. */
       if (iconv_ret == (size_t) -1 && errno != E2BIG)
         {
           /* Character is not recognized.  Copy a single byte.  */
           inptr = orig_inptr;   /* iconv might have incremented inptr  */
           copy_direct (1);
           bytes_left = orig_bytes_left - 1;
         }
       else
         {
           utf8_char_ptr = utf8_char;
           /* i is width of UTF-8 character */
           degrade_utf8 (&utf8_char_ptr, &i);
           /* If we are done, make sure iconv flushes the last character.  */
           if (bytes_left == 0)
             {
               /* Offer the whole buffer to the flush.  utf8_char_free is
                  left over from the conversion above and is normally
                  zero, which would leave iconv no room to write.  */
               utf8_char_ptr = utf8_char;
               utf8_char_free = sizeof utf8_char;
               iconv (iconv_to_utf8, NULL, NULL,
                      &utf8_char_ptr, &utf8_char_free);
               if (utf8_char_ptr > utf8_char)
                 {
                   /* Only the bytes just written are valid input for
                      degrade_utf8.  */
                   i = utf8_char_ptr - utf8_char;
                   utf8_char_ptr = utf8_char;
                   degrade_utf8 (&utf8_char_ptr, &i);
                 }
             }
         }
     }

   /* Must cast because the difference between unsigned size_t is always
      positive. */
   output_bytes_difference +=
     n - ((signed long) text_buffer_off (&output_buf) - output_start);

   return extra_at_end;
 #endif /* HAVE_ICONV */
 }

 /* Functions below are named from the perspective of the preprocess_nodes_p
    flag being on. */

 /* Pass the next N bytes of the node through to the output.  If the node
    is not being rewritten this merely advances inptr.  Otherwise the bytes
    are appended to the output buffer, converted to the output encoding if
    conversion is active, and the adjusted offsets of any anchors passed on
    the way are recorded. */
 static void
 copy_input_to_output (long n)
 {
   long remaining;

   if (!rewrite_p)
     {
       inptr += n;
       return;
     }

   for (remaining = n; remaining > 0; )
     {
       if (convert_encoding_p)
         {
           long chunk = remaining;
           long overshoot;

           if (anchor_to_adjust)
             {
               /* Position of the next anchor relative to the node. */
               long anchor_pos =
                 (*anchor_to_adjust)->nodestart - node_offset;

               if (anchor_pos < 0)
                 anchor_to_adjust = 0; /* error in input file */
               else if (anchor_pos < (inptr-input_start) + remaining)
                 {
                   /* The anchor lies within the text to be copied.  Stop
                      just after it so that its adjusted position can be
                      worked out below. */
                   chunk = anchor_pos - (inptr-input_start)+1;
                   if (chunk < 0)
                     {
                       chunk = remaining;
                       anchor_to_adjust = 0;
                     }
                 }
             }

           /* More than CHUNK bytes are consumed if the chunk ended in
              an incomplete byte sequence. */
           overshoot = copy_converting (chunk);

           remaining -= chunk + overshoot;
         }
       else
         {
           copy_direct (remaining);
           remaining = 0;
         }

       /* Record the adjusted position of every anchor we have now gone
          past, using the byte difference accumulated so far. */
       if (anchor_to_adjust)
         while ((*anchor_to_adjust)->nodestart - node_offset
                <= inptr - input_start)
           {
             (*anchor_to_adjust)->nodestart_adjusted
                = (*anchor_to_adjust)->nodestart - output_bytes_difference;

             anchor_to_adjust++;
             if (!*anchor_to_adjust
                 || (*anchor_to_adjust)->cache.nodelen != 0)
               {
                 anchor_to_adjust = 0;
                 break;
               }
           }
     }
 }

 /* Leave the next N bytes of input out of the preprocessed output.  When
    nodes are not being preprocessed but are still being rewritten, the
    bytes are copied to the output instead of being dropped. */
 static void
 skip_input (long n)
 {
   if (!preprocess_nodes_p && rewrite_p)
     {
       /* We are expanding tags only.  Do not skip input. */
       copy_input_to_output (n);
       return;
     }

   inptr += n;

   /* Dropped bytes count towards the input/output difference. */
   if (preprocess_nodes_p)
     output_bytes_difference += n;
 }

 /* When preprocessing nodes, append the N bytes at INPUT to the output
    buffer as text that has no counterpart in the input, and account for
    them in output_bytes_difference.  Does nothing otherwise. */
 static void
 write_extra_bytes_to_output (char *input, long n)
 {
   if (!preprocess_nodes_p)
     return;

   output_bytes_difference -= n;
   text_buffer_add_string (&output_buf, input, n);
 }

 /* Like write_extra_bytes_to_output, but the bytes are written whenever
    the node is being rewritten, even with preprocess_nodes=Off. */
 static void
 write_tag_contents (char *input, long n)
 {
   if (!rewrite_p)
     return;

   output_bytes_difference -= n;
   text_buffer_add_string (&output_buf, input, n);
 }

 /* Like skip_input, but the N bytes are dropped whenever the node is being
    rewritten, even when !preprocess_nodes_p.  Does nothing if the node is
    not being rewritten. */
 static void
 skip_tag_contents (long n)
 {
   if (!rewrite_p)
     return;

   output_bytes_difference += n;
   inptr += n;
 }

 /* Parse the first line of NODE's contents, which is a list of
    "LABEL: VALUE" fields separated by commas (or tabs or carriage
    returns), and set NODE->next, NODE->prev and NODE->up from it.  The
    pointers are cleared first and each recognized field is stored as a
    newly allocated copy of its value as it appears in the line, including
    any "(FILE)" part and any quoting bytes.  Other fields are skipped.

    Nothing is done, and the pointers are left as they are, if the first
    line is empty. */
 static void
 parse_top_node_line (NODE *node)
 {
   /* Labels in the order they are tried, with the place to store the
      value, or null if the value is not kept. */
   struct
   {
     const char *label;
     char **dest;
   } fields[6];
   char *p;
   char *unquoted;
   int field_len;
   int k;

   /* If the first line is empty, leave it in.  This is the case
      in the index-apropos window. */
   if (node->contents[0] == '\n')
     return;

   node->next = node->prev = node->up = 0;

   fields[0].label = INFO_FILE_LABEL;    fields[0].dest = 0;
   fields[1].label = INFO_NODE_LABEL;    fields[1].dest = 0;
   fields[2].label = INFO_PREV_LABEL;    fields[2].dest = &node->prev;
   fields[3].label = INFO_ALTPREV_LABEL; fields[3].dest = &node->prev;
   fields[4].label = INFO_NEXT_LABEL;    fields[4].dest = &node->next;
   fields[5].label = INFO_UP_LABEL;      fields[5].dest = &node->up;

   p = node->contents;
   for (;;)
     {
       char **store_in = 0;

       p += skip_whitespace (p);

       /* Identify the field.  If the label is not recognized, nothing
          is stored and the code below skips to the next separator. */
       for (k = 0; k < 6; k++)
         {
           size_t label_len = strlen (fields[k].label);

           if (strncasecmp (p, fields[k].label, label_len) == 0)
             {
               p += label_len;
               store_in = fields[k].dest;
               break;
             }
         }

       p += skip_whitespace (p);

       /* Length of a leading "(FILE)" component, if there is one. */
       if (*p == '(')
         field_len = read_bracketed_filename (p, 0);
       else
         field_len = 0;

       /* Add the length of the node name, or of the file name after
          "File:".  A full stop is deliberately not a terminator, so that
          it can be used in file names. */
       field_len += read_quoted_string (p + field_len,
                                        "\n\r\t,", 1, &unquoted);
       if (store_in)
         {
           *store_in = xmalloc (field_len + 1);
           memcpy (*store_in, p, field_len);
           (*store_in)[field_len] = '\0';
         }

       /* Only the length was of interest, not the unquoted text. */
       free (unquoted);
       p += field_len;

       if (*p == '\n' || *p == '\0')
         break;

       p++; /* Step over the field terminator. */
     }
 }

 /* Deal with the text that introduces a reference: "\n* " for a menu item,
    or "*Note" / "*note" for a cross-reference.  Depending on the settings
    the text is copied to the output, replaced or hidden.  On entry INPTR
    is on the first byte of the introducing sequence; on success it is left
    on the first non-whitespace byte of the reference label.

    ENTRY->type is set, and with preprocess_nodes Off ENTRY->start as well.
    IN_PARENTHESES is non-zero if the reference directly follows an opening
    parenthesis.

    Return 1 on success.  Return 0 if "*Note" is not followed by whitespace
    and so does not introduce a reference; the text has then been copied to
    the output unchanged. */
 static int
 scan_reference_marker (REFERENCE *entry, int in_parentheses)
 {
   int is_menu_item = (inptr[0] == '\n');

   /* With preprocess_nodes Off the cursor is put on the "*" when moving
      between references, so record where it ends up. */
   if (!preprocess_nodes_p)
     {
       if (rewrite_p)
         entry->start = text_buffer_off(&output_buf);
       else
         entry->start = inptr - input_start;
     }

   if (is_menu_item)
     {
       entry->type = REFERENCE_MENU_ITEM;

       /* The match began at the newline before the "*". */
       if (!preprocess_nodes_p)
         entry->start++;

       copy_input_to_output (strlen ("\n* "));
     }
   else
     {
       entry->type = REFERENCE_XREF;

       /* "*Note" only counts when whitespace follows, so that it is left
          alone when, e.g., it is surrounded by inverted commas. */
       if (!strchr (" \t\r\n", inptr[strlen ("*Note")]))
         {
           copy_input_to_output (strlen ("*Note:"));
           return 0;
         }

       /* @inforef and @xref produce "*Note " in Info output and "See" in
          HTML and print.  @ref and @pxref produce "*note ", and in HTML
          and print either nothing or "see"; the last two cannot easily
          be told apart here. */
       /* TODO: Internationalize these strings, but only if we know the
          language of the document. */
       if (inptr[1] == 'N')
         {
           write_extra_bytes_to_output ("See", 3);
           in_parentheses = 1;
         }
       else if (in_parentheses)
         {
           /* "see" is only supplied for input like "(*note ...)", as
              produced by @pxref.  Everywhere else "*note" is simply
              dropped: guessing from the preceding words whether a "see"
              is wanted cannot be made reliable, and document writers can
              add one themselves. */
           write_extra_bytes_to_output ("see", 3);
         }

       skip_input (strlen ("*Note"));
       if (!in_parentheses)
         skip_input (skip_whitespace (inptr));
     }

   /* Copy any white space before label. */
   copy_input_to_output (skip_whitespace_and_newlines (inptr));

   return 1;
 }

 /* Output the label of a reference and record it in ENTRY.

    INPTR should be on the first non-whitespace byte of the label when this
    function is called.  It is left at the first character after the colon
    terminating the label.  IN_INDEX is non-zero when scanning an index
    node, where a label may itself contain colons.

    On success ENTRY->label is a newly allocated, whitespace-canonicalized
    copy of the label (converted with iconv_to_output when a conversion
    descriptor is active and the conversion succeeds, otherwise a raw copy
    of the input bytes).  ENTRY->end is always set; ENTRY->start is only set
    here when preprocess_nodes_p is on.  ENTRY->filename and ENTRY->nodename
    may be filled in as a side effect of parsing the label.

    Return 1 on success, or 0 if invalid syntax is encountered. */
 static int
 scan_reference_label (REFERENCE *entry, int in_index)
 {
   int max_lines;
   int len, label_len = 0;

   /* Handle case of cross-reference like (FILE)NODE::. */
   if (inptr[0] == '(' && !in_index)
     label_len = read_bracketed_filename (inptr, &entry->filename);

   /* Search forward to ":" to get label name.  Cross-references may have
      a newline in the middle, menu items may not. */
   if (entry->type == REFERENCE_MENU_ITEM)
     max_lines = 1;
   else
     max_lines = 2;

   if (!in_index || inptr[label_len] == '\177')
     {
       len = read_quoted_string (inptr + label_len, ":", max_lines,
                                 &entry->nodename);
       if (!len)
         return 0; /* Input invalid. */

       /* Only touch the node name once we know that one was read. */
       canonicalize_whitespace (entry->nodename);
       label_len += len;
     }
   else
     {
       /* If in an index node, go forward to the last colon on the line
          (not preceded by a newline, NUL or DEL).  This is in order to
          support index entries containing colons.  This should work fine
          as long as the node name does not contain a colon as well. */

       char *p = inptr + label_len;
       int n, m = 0;

       while (1)
         {
           n = strcspn (p, ":\n\177");
           if (p[n] != ':')
             break;

           /* Step over this colon and keep looking for a later one. */
           m += n + 1;
           p += n + 1;
         }
       if (m == 0)
         return 0; /* no : found */

       /* M counts up to and including the last colon; the label itself
          stops just before it. */
       label_len += m - 1;
     }

 #if HAVE_ICONV
   if (iconv_to_output != (iconv_t) -1 && iconv_to_output != (iconv_t) 0)
     {
       static struct text_buffer label_text;
       size_t inbytesleft = label_len;
       size_t converted_len;
       char *p = inptr;

       text_buffer_reset (&label_text);
       text_buffer_alloc (&label_text, label_len);

       /* Convert the label, then make sure libiconv flushes out the last
          converted character.  If either step fails, fall back to copying
          the unconverted bytes. */
       if (text_buffer_iconv (&label_text, iconv_to_output,
                              (ICONV_CONST char **)&p,
                              &inbytesleft) == (size_t) -1
           || text_buffer_iconv (&label_text, iconv_to_output,
                                 NULL, NULL) == (size_t) -1)
         goto no_convert;

       text_buffer_add_char (&label_text, '\0');

       /* Copy with xmalloc, as the unconverted path below does, rather than
          an unchecked strdup whose NULL result would be dereferenced later
          by users of ENTRY->label. */
       converted_len = strlen (label_text.base);
       entry->label = xmalloc (converted_len + 1);
       memcpy (entry->label, label_text.base, converted_len + 1);
     }
   else
 #endif
     {
   no_convert:
       entry->label = xmalloc (label_len + 1);
       memcpy (entry->label, inptr, label_len);
       entry->label[label_len] = '\0';
     }
   canonicalize_whitespace (entry->label);

   if (preprocess_nodes_p)
     entry->start = text_buffer_off (&output_buf);

   /* Write text of label. */
   copy_input_to_output (label_len);

   /* The end offset is relative to the output buffer when the node is being
      rewritten, and to the start of the input otherwise. */
   if (rewrite_p)
     entry->end = text_buffer_off (&output_buf);
   else
     entry->end = inptr - input_start;

   /* Colon after label. */
   if (*inptr)
     skip_input (1);
   /* Don't mess up the margin of a menu description. */
   if (entry->type == REFERENCE_MENU_ITEM)
     write_extra_bytes_to_output (" ", 1);

   return 1;
 }

 /* INPTR should be at the first character after the colon
    terminating the label.  Return 0 on syntax error. */
 static int
 scan_reference_target (REFERENCE *entry, NODE *node, int in_parentheses)
 {
   int i;

   /* This entry continues with a specific target.  Parse the
      file name and node name from the specification. */

   if (entry->type == REFERENCE_XREF)
     {
       int length = 0; /* Length of specification */
       char *target_start = inptr;
       char *nl_off = 0;
       int space_at_start_of_line = 0;

       length += skip_whitespace_and_newlines (inptr);

       length += read_bracketed_filename (inptr + length, &entry->filename);

       length += skip_whitespace_and_newlines (inptr + length);

       /* Get the node name. */
       length += read_quoted_string (inptr + length, ",.", 2, &entry->nodename);

       skip_input (length);

       /* Check if there is a newline in the target. */
       nl_off = strchr (target_start, '\n');
       if (nl_off)
         {
           if (nl_off < inptr)
             space_at_start_of_line = skip_whitespace (nl_off + 1);
           else
             nl_off = 0;
         }
       canonicalize_whitespace (entry->nodename);

       if (entry->filename)
         {
           /* Heuristic of whether it's worth outputing a newline before the
              filename.  This checks whether the newline appears more
              than half way through the text, and therefore which side is
              longer. */
           if (nl_off
               && nl_off < target_start + (length - space_at_start_of_line) / 2)
             {
               int i;
               write_extra_bytes_to_output ("\n", 1);

               for (i = 0; i < space_at_start_of_line; i++)
                 write_extra_bytes_to_output (" ", 1);
               skip_input (strspn (inptr, " "));
               nl_off = 0;
             }
           else

           if (*inptr != '\n')
             {
               write_extra_bytes_to_output (" ", 1);
             }
           write_extra_bytes_to_output ("(", 1);
           write_extra_bytes_to_output (entry->filename,
                                        strlen (entry->filename));
           write_extra_bytes_to_output (" manual)",
                                        strlen (" manual)"));
         }

       /* Hide terminating punctuation if we are in a reference
          like "(*note Label:(file)node.)". */
       if (in_parentheses && inptr[0] == '.')
         skip_input (1);

       /* Copy any terminating punctuation before the optional newline. */
       copy_input_to_output (strspn (inptr, ".),"));

       /* Output a newline if one is needed.  Don't do it at the end of
          a paragraph. */
       if (nl_off && *inptr != '\n')
         {
           int i;

           write_extra_bytes_to_output ("\n", 1);
           for (i = 0; i < space_at_start_of_line; i++)
             write_extra_bytes_to_output (" ", 1);
           skip_input (strspn (inptr, " "));
         }
     }
   else /* entry->type == REFERENCE_MENU_ITEM */
     {
       int line_len;
       int length = 0; /* Length of specification */

       length = skip_whitespace (inptr);
       length += read_bracketed_filename (inptr + length, &entry->filename);
       length += strspn (inptr + length, " ");

       /* Get the node name. */
       length += read_quoted_string (inptr + length, ",.\t\n", 2,
                                     &entry->nodename);
       if (inptr[length] == '.') /* A '.' terminating the entry. */
         length++;
       canonicalize_whitespace (entry->nodename);

       if (node->flags & N_IsDir)
         {
           /* Set line_len to length of line so far. */

           char *linestart;
           linestart = memrchr (input_start, '\n', inptr - input_start);
           if (!linestart)
             linestart = input_start;
           else
             linestart++; /* Point to first character after newline. */
           line_len = inptr - linestart;
         }

       if (node->flags & N_IsIndex)
         /* Show the name of the node the index entry refers to. */
         copy_input_to_output (length);
       else
         {
           skip_input (length);

           if ((node->flags & N_IsDir) && inptr[strspn (inptr, " ")] == '\n')
             {
               /* For a dir node, if there is no more text in this line,
                  check if there is a menu entry description in the next
                  line to the right of the end of the label, and display it
                  in this line. */
               skip_input (strspn (inptr, " "));
               if (line_len <= strspn (inptr + 1, " "))
                 skip_input (1 + line_len);
             }
           else
             {
               for (i = 0; i < length; i++)
                 write_extra_bytes_to_output (" ", 1);
             }
         }

       /* Parse "(line ...)" part of menus, if any.  */
       {
         char *lineptr = inptr;
         /* Skip any whitespace first, and then a newline in case the item
            was so long to contain the ``(line ...)'' string in the same
            physical line.  */
         lineptr += skip_whitespace (inptr);
         if (*lineptr == '\n')
           lineptr += 1 + skip_whitespace (lineptr + 1);

         if (!strncmp (lineptr, "(line ", strlen ("(line ")))
           {
             lineptr += strlen ("(line ");
             entry->line_number = strtol (lineptr, 0, 0);
           }
         else
           entry->line_number = 0;
       }
     }

   return 1;
 }

 /* Bounds-checked read of PTR[INDEX].  BASE is the start of the allocated
    block that PTR points into, and the block is at least LEN bytes long.
    Return PTR[INDEX] if that byte lies within BASE[0] .. BASE[LEN - 1],
    and 0 otherwise. */
 static char
 safe_string_index (char *ptr, long index, char *base, long len)
 {
   /* Position of the requested byte relative to BASE. */
   long target = (ptr - base) + index;

   if (target >= 0 && target < len)
     return ptr[index];

   return 0;
 }

 /* Process an index marker ("^@^H[index^@^H]") or an image marker
    ("^@^H[image ...^@^H]") located at INPTR.

    If the tag expands successfully, switch to rewriting the node (if that
    was not already the case), copy everything up to the tag to the output,
    write the expansion, and skip the body of the tag.  NODE is flagged
    N_IsIndex when *IN_INDEX is set after the expansion.  If the text is not
    a valid tag, a single byte is copied to the output instead.  FB is
    passed on to init_output_stream. */
 static void
 scan_info_tag (NODE *node, int *in_index, FILE_BUFFER *fb)
 {
   char *tag_start = inptr;
   char *tag_end = tag_start;
   struct text_buffer *expansion = xmalloc (sizeof (struct text_buffer));

   text_buffer_init (expansion);

   if (!tag_expand (&tag_end, input_start + input_length, expansion,
                    in_index))
     {
       /* It was not a valid tag. */
       copy_input_to_output (tag_start - inptr + 1);
     }
   else
     {
       if (*in_index)
         node->flags |= N_IsIndex;

       if (!rewrite_p)
         {
           rewrite_p = 1;
           init_output_stream (fb);

           /* Rewind inptr to the start of the node so that the
              copy_input_to_output call below picks up all of the
              preceding contents. */
           inptr = node->contents;
         }

       /* Write out up to tag. */
       copy_input_to_output (tag_start - inptr);

       write_tag_contents (text_buffer_base (expansion),
                           text_buffer_off (expansion));

       /* Skip past body of tag. */
       skip_tag_contents (tag_end - inptr);
     }

   text_buffer_free (expansion);
   free (expansion);
 }

 /* Non-zero if CONTENTS begins with STRING, ignoring case.  STRING is
    evaluated twice. */
 #define looking_at_string(contents, string) \
   (strncasecmp (contents, string, strlen (string)) == 0)

 static char *
 forward_to_info_syntax (char *contents)
 {
   /* Loop until just before the end of the input.  The '- 3' prevents us
      accessing memory after the end of the input, and none of the strings we
      are looking for are shorter than 3 bytes. */
   while (contents < input_start + input_length - 3)
     {
       /* Menu entry comes first to optimize for the case of looking through a
          long index node. */
       if (looking_at_string (contents, INFO_MENU_ENTRY_LABEL)
           || looking_at_string (contents, INFO_XREF_LABEL)
           || !memcmp (contents, "\0\b[", 3))
         return contents;
       contents++;
     }
   return 0;
 }

 /* Scan contents of NODE, recording cross-references and similar.

    Convert character encoding of node contents to that of the user if the two
    are known to be different.  If PREPROCESS_NODES_P == 1, remove Info syntax
    in contents.

    If FB is non-null, it is the file containing the node, and TAG_PTR is an
    offset into FB->tags.  If the node contents are rewritten, adjust anchors
    that occur in the node and store adjusted value as TAG->nodestart_adjusted,
    otherwise simply copy TAG->nodestart to TAG->nodestart_adjusted for each
    anchor in the node.

    On normal completion NODE->references is set to a null-terminated array
    of the references found.  If the contents were rewritten, NODE->contents
    and NODE->nodelen are replaced with the rewritten text and its length and
    N_WasRewritten is set in NODE->flags.  Note that TAG_PTR is dereferenced
    whenever FB is non-null. */
 void
 scan_node_contents (NODE *node, FILE_BUFFER *fb, TAG **tag_ptr)
 {
   /* Set once "* Menu:" has been seen; newline-introduced menu entries are
      only recognized after that. */
   int in_menu = 0;
   char *match;

   REFERENCE **refs = NULL;
   size_t refs_index = 0, refs_slots = 0;

   /* Whether an index tag was seen. */
   int in_index = 0;

   /* Rewriting starts out enabled only if preprocessing was requested;
      scan_info_tag may switch it on later. */
   rewrite_p = preprocess_nodes_p;

   init_output_stream (fb);

   if (fb)
     {
       char *file_contents;

       /* Set anchor_to_adjust to first anchor in node, if any. */
       /* The tag following this node's tag is only treated as an anchor of
          this node if it exists and its cache.nodelen is 0. */
       anchor_to_adjust = tag_ptr + 1;
       if (!*anchor_to_adjust)
         anchor_to_adjust = 0;
       else if (*anchor_to_adjust
                && (*anchor_to_adjust)->cache.nodelen != 0)
         anchor_to_adjust = 0;

       /* The node's text lives either in FB itself or in a subfile. */
       if (!node->subfile)
         file_contents = fb->contents;
       else
         {
           FILE_BUFFER *f = info_find_subfile (node->subfile);
           if (!f)
             return; /* This shouldn't happen. */
           file_contents = f->contents;
         }
       /* Offset of the node text within the file, past the node
          separator. */
       node_offset = (*tag_ptr)->nodestart
         + skip_node_separator (file_contents + (*tag_ptr)->nodestart);
     }
   else
     anchor_to_adjust = 0;

   /* Initialize refs to point to array of one null pointer in case
      there are no results.  This way we know if refs has been initialized
      even if it is empty. */
   refs = calloc (1, sizeof *refs);
   refs_slots = 1;

   parse_top_node_line (node);

   /* This should be the only time we assign to inptr in this function -
      all other assignment should be done with the helper functions above. */
   inptr = node->contents;
   input_start = node->contents;
   input_length = node->nodelen;


   /* Visit each candidate piece of Info syntax in turn, until none is left
      or the candidate lies past the end of the node. */
   while ((match = forward_to_info_syntax (inptr))
           && match < node->contents + node->nodelen)
     {
       int in_parentheses = 0;
       REFERENCE *entry;

       /* Write out up to match */
       copy_input_to_output (match - inptr);

       if ((in_menu && match[0] == '\n') || match[0] == '*')
         {
           /* Menu entry or cross reference. */
           /* Create REFERENCE entity. */
           entry = info_new_reference (0, 0);

           /* Detect input of the form "(*n...", i.e. a lower-case "*note"
              directly after an opening parenthesis.  The bounds-checked
              accessor is used because inptr may be at the very start of
              the input. */
           if (safe_string_index (inptr, -1, input_start, input_length) == '('
              && safe_string_index (inptr, 1, input_start, input_length) == 'n')
             in_parentheses = 1;

           /* Saved so that reset_conversion can be used below if this turns
              out not to be a reference. */
           save_conversion_state ();

           if (!scan_reference_marker (entry, in_parentheses))
             goto not_a_reference;

           if (!scan_reference_label (entry, in_index))
             goto not_a_reference;

           /* If this reference entry continues with another ':' then the target
              of the reference is given by the label. */
           if (*inptr == ':')
             {
               int label_len;
               skip_input (1);
               if (entry->type == REFERENCE_MENU_ITEM)
                 write_extra_bytes_to_output (" ", 1);

               /* Remove the DEL bytes from a label like "(FOO)^?BAR^?::". */
               /* The first DEL is closed up with memmove; the terminator
                  written at label_len - 2 then drops the trailing DEL. */
               label_len = strlen (entry->label);
               if (label_len >= 2 && entry->label[label_len - 1] == 0177)
                 {
                   char *p = strchr (entry->label, '\177');
                   memmove (p, p + 1, label_len - (p - entry->label) - 1);
                   entry->label[label_len - 2] = '\0';
                 }
             }
           else
             {
               /* Proceed to read the rest of the reference. */
               /* TODO: we should probably not allow references of the form
                  "(file)node1:node2." or "(file1)node1:(file2)node2", so
                  bail out here if entry->filename is non-null. */

               /* Discard whatever was parsed out of the label; the explicit
                  target replaces it. */
               free (entry->filename); entry->filename = 0;
               free (entry->nodename); entry->nodename = 0;
               if (!scan_reference_target (entry, node, in_parentheses))
                 goto not_a_reference;
             }

           /* This block is never entered by falling through; it is only
              reached by jumping to the not_a_reference label inside it. */
           if (0)
             {
               char *cur_inptr;

 not_a_reference:
               /* This is not a menu entry or reference.  Do not add to our
                  list. */
               cur_inptr = inptr;
               reset_conversion ();
               copy_input_to_output (cur_inptr - inptr);

               info_reference_free (entry);
               continue;
             }

           add_pointer_to_array (entry, refs_index, refs, refs_slots, 50);
         }
       /* Was "* Menu:" seen?  If so, search for menu entries hereafter. */
       else if (!in_menu && !strncmp (match, INFO_MENU_LABEL,
                                strlen (INFO_MENU_LABEL)))
         {
           in_menu = 1;
           skip_input (strlen ("\n* Menu:"));
           if (*inptr == '\n')
             skip_input (strspn (inptr, "\n") - 1); /* Keep one newline. */

         }
       else if (match[0] == '\0') /* Info tag */
         {
           scan_info_tag (node, &in_index, fb);
         }
       else
         /* A newline-introduced match outside of a menu: pass one byte
            through and carry on searching. */
         copy_input_to_output (1);
     }

   /* If we haven't accidentally gone past the end of the node, write
      out the rest of it. */
   if (inptr < node->contents + node->nodelen)
     copy_input_to_output ((node->contents + node->nodelen) - inptr);

   /* Null to terminate buffer. */
   if (rewrite_p)
     text_buffer_add_string (&output_buf, "\0", 1);

   /* Free resources used in character encoding conversion. */
   close_conversion ();

   node->references = refs;

   if (rewrite_p)
     {
       /* Contents from an earlier rewrite are freed before being
          replaced. */
       if (node->flags & N_WasRewritten)
         free (node->contents);
       node->contents = text_buffer_base (&output_buf);
       node->flags |= N_WasRewritten;

       /* output_buf.off is the offset of the next character to be
          written.  Subtracting 1 gives the offset of our terminating
          null, that is, the length. */
       node->nodelen = text_buffer_off (&output_buf) - 1;
     }
   else if (fb && tag_ptr)
     {
       /* Set nodestart_adjusted for all of the anchors in this node. */
       /* The anchors are the consecutive tags after this node's tag whose
          cache.nodelen is 0. */
       tag_ptr++;
       while (*tag_ptr && (*tag_ptr)->cache.nodelen == 0)
         {
           (*tag_ptr)->nodestart_adjusted = (*tag_ptr)->nodestart
                                              - output_bytes_difference;
           tag_ptr++;
         }
     }
 }