/* tta/C/main/node_name_normalization.c - texinfo - Git at Google */

 /* Copyright 2010-2026 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>. */

 /* In sync with Texinfo::Convert::NodeNameNormalization */

 #include <config.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
 #include "unistr.h"

 /* also for xvasprintf */
 #include "text.h"
 #include "command_ids.h"
 #include "element_types.h"
 #include "tree_types.h"
 #include "types_data.h"
 /* isascii_alnum bug */
 #include "base_utils.h"
 #include "tree.h"
 #include "extra.h"
 #include "builtin_commands.h"
 /* for xasprintf whitespace_chars find_innermost_accent_contents */
 #include "utils.h"
 #include "debug.h"
 #include "call_perl_function.h"
 #include "unicode.h"
 /* nobrace_symbol_text */
 #include "convert_to_text.h"
 #include "convert_utils.h"
 #include "node_name_normalization.h"

 static const char *command_normalization_text[BUILTIN_CMD_NUMBER];

 /* Should be called only once */
 void
 setup_node_name_normalization (void)
 {
   int i;

   for (i = 0; i < BUILTIN_CMD_NUMBER; i++)
     {
       if (unicode_character_brace_no_arg_commands[i].codepoint)
         command_normalization_text[i]
           = unicode_character_brace_no_arg_commands[i].text;
       else if (text_brace_no_arg_commands[i])
         command_normalization_text[i] = text_brace_no_arg_commands[i];
       else if (nobrace_symbol_text[i])
         {
           if (i == CM_ASTERISK)
             command_normalization_text[i] = " ";
           else
             command_normalization_text[i] = nobrace_symbol_text[i];
         }
     }
 }

 #define ADD(x) text_append (result, x)
 void
 convert_to_normalized_internal (const ELEMENT *e, TEXT *result)
 {
   enum command_id cmd;

   if (type_data[e->type].flags & TF_text)
     {
       if (e->type != ET_ignorable_spaces_after_command
           && e->type != ET_ignorable_spaces_before_command
           && e->type != ET_spaces_at_end
           && e->type != ET_spaces_before_paragraph
           && e->type != ET_space_at_end_menu_node
           && e->type != ET_spaces_after_close_brace
           && e->e.text->end > 0)
         {
           char *text_norm_spaces = collapse_spaces (e->e.text->text);
           ADD(text_norm_spaces);
           free (text_norm_spaces);
         }
       return;
     }

   cmd = element_builtin_data_cmd (e);

   if ((e->type == ET_postamble_after_end
        || e->type == ET_preamble_before_beginning
        || e->type == ET_preamble_before_setfilename)
       || (cmd
           && ((cmd == CM_anchor
                || cmd == CM_footnote
                || cmd == CM_shortcaption
                || cmd == CM_caption
                || cmd == CM_hyphenation
                || cmd == CM_namedanchor
                || cmd == CM_sortas
                || cmd == CM_seealso
                || cmd == CM_seeentry)
              /* here ignore the 'regular' line commands */
               || (e->e.c->contents.number > 0
                   && e->e.c->contents.list[0]->type == ET_line_arg)
              /* here ignore the root-level line commands, @node and
                 sectioning commands */
               || (e->e.c->contents.number > 0
                   && e->e.c->contents.list[0]->type == ET_arguments_line
                   && e->e.c->contents.list[0]->e.c->contents.number > 0
                   && e->e.c->contents.list[0]->e.c->contents.list[0]->type
                                                    == ET_line_arg))))
     return;

   if (cmd)
     {
       if (command_normalization_text[cmd])
         ADD(command_normalization_text[cmd]);
       else if (builtin_command_data[cmd].flags & CF_accent)
         {
           if (e->e.c->contents.number > 0)
             {
               ACCENTS_STACK *accent_stack
                 = find_innermost_accent_contents (e);
               TEXT accent_text;
               char *accented_char;

               if (!accent_stack->argument)
                 {
                   destroy_accent_stack (accent_stack);
                   return;
                 }

               text_init (&accent_text);
               convert_to_normalized_internal (accent_stack->argument,
                                               &accent_text);

       /* We pass undef as last resort formatting function, because we know that
          unicode_accent is used, and it cannot fail/return undef. */
               accented_char = encoded_accents (0, accent_text.text,
                                         &accent_stack->stack, "utf-8", 0, 0);

               ADD(accented_char);
               free (accented_char);
               free (accent_text.text);
               destroy_accent_stack (accent_stack);
             }
           return;
         }
       else if (builtin_command_data[cmd].flags & CF_ref)
         {
           int order_index = 0;
           int *arguments_order = ref_5_args_order;
           if (cmd == CM_inforef || cmd == CM_link)
             arguments_order = ref_3_args_order;
           while (arguments_order[order_index] >= 0)
             {
               /* no risk with that casting as idx < 5 */
               size_t idx = (size_t) arguments_order[order_index];
               if (e->e.c->contents.number > idx)
                 {
                   TEXT arg_text;

                   text_init (&arg_text);
                   convert_to_normalized_internal (
                     e->e.c->contents.list[idx], &arg_text);
                   if (arg_text.end > 0)
                     {
                       char *non_space_char = arg_text.text
                               + strspn (arg_text.text, whitespace_chars);
                       if (*non_space_char)
                         {
                           ADD (arg_text.text);
                           free (arg_text.text);
                           break;
                         }
                     }
                 }
               order_index++;
             }
           return;
         }
       else if (e->e.c->contents.number > 0
                && (e->e.c->contents.list[0]->type == ET_brace_container
                    || e->e.c->contents.list[0]->type == ET_brace_arg
                    || cmd == CM_math))
         {
           convert_to_normalized_internal (e->e.c->contents.list[0], result);
           return;
         }
     }
   if (e->e.c->contents.number > 0)
     {
       size_t i;
       for (i = 0; i < e->e.c->contents.number; i++)
         convert_to_normalized_internal (e->e.c->contents.list[i], result);
     }
 }
 #undef ADD

 /* Return value to be freed by caller. */
 char *
 convert_to_normalized (const ELEMENT *e)
 {
   TEXT result;

   if (!e)
     return strdup ("");
   text_init (&result);
   /* this is needed for a test result with empty listoffloats */
   text_append (&result, "");
   convert_to_normalized_internal (e, &result);
   return result.text;
 }

 void
 protect_unicode_char (const char *text, TEXT *result)
 {
   uint8_t *encoded_u8;
   const uint8_t *next;
   ucs4_t next_char;
   char *str;

   /* determine unicode codepoint */
   encoded_u8 = utf8_from_string (text);
   next = u8_next (&next_char, encoded_u8);
   if (next && *next)
     bug ("Something left on next_str/encoded_u8\n");
   free (encoded_u8);

   if (next_char <= 0xFFFF)
     {
       xasprintf (&str, "%04lx", next_char);
       text_append_n (result, "_", 1);
     }
   else
     {
       xasprintf (&str, "%06lx", next_char);
       text_append_n (result, "__", 2);
     }
   text_append (result, str);
   free (str);
 }

 /* to be freed by caller */
 char *unicode_to_protected (const char *text)
 {
   TEXT result;
   const char *p = text;

   text_init (&result);
   text_append (&result, "");

   while (*p)
     {
       int n = strspn (p, " ");
       if (n)
         {
           text_append_n (&result, "-", 1);
           p += n;
           if (!*p)
             break;
         }

       if (isascii_alnum (*p))
         {
           text_append_n (&result, p, 1);
           p++;
         }
       else
         {
           int char_len = 1;
           char *next_str;

           /* Count any UTF-8 continuation bytes. */
           while ((p[char_len] & 0xC0) == 0x80)
             char_len++;

           next_str = strndup (p, char_len);
           protect_unicode_char (next_str, &result);

           free (next_str);
           p += char_len;
         }
     }
   return (result.text);
 }

 /* to be freed by caller */
 char *normalize_top_name (const char *text)
 {
   if (strlen (text) == 3)
     {
       char *normalized = strdup (text);
       char *p;

       for (p = normalized; *p; p++)
         if (isascii_alnum (*p))
           {
             *p = tolower (*p);
           }
         else
           {
             free (normalized);
             return strdup (text);
           }

       if (!strcmp (normalized, "top"))
         {
           free (normalized);
           return strdup ("Top");
         }

       free (normalized);
       return strdup (text);
     }
   return strdup (text);
 }

 /* to be freed by caller */
 char *
 convert_to_node_identifier (const ELEMENT *element)
 {
   char *converted_name = convert_to_normalized (element);
   char *normalized_name = normalize_NFC (converted_name);
   char *protected = unicode_to_protected (normalized_name);
   char *result = normalize_top_name (protected);

   free (protected);
   free (converted_name);
   free (normalized_name);
   return result;
 }

 char *
 convert_contents_to_node_identifier (const ELEMENT *e)
 {
   ELEMENT *tmp = new_element (ET_NONE);
   char *result;

   tmp->e.c->contents = e->e.c->contents;
   result = convert_to_node_identifier (tmp);
   tmp->e.c->contents.list = 0;
   destroy_element (tmp);

   return result;
 }

 /* to be freed by caller */
 char *
 convert_to_identifier (const ELEMENT *element)
 {
   char *converted_name = convert_to_normalized (element);
   char *normalized_name = normalize_NFC (converted_name);
   char *result = unicode_to_protected (normalized_name);

   free (converted_name);
   free (normalized_name);
   return result;
 }

 char *
 convert_contents_to_identifier (const ELEMENT *e)
 {
   ELEMENT *tmp = new_element (ET_NONE);
   char *result;

   tmp->e.c->contents = e->e.c->contents;
   result = convert_to_identifier (tmp);
   tmp->e.c->contents.list = 0;
   destroy_element (tmp);

   return result;
 }

 static char *
 unicode_to_transliterate (char *text, int external,
                           int in_test, int no_unidecode)
 {
   char *result;
   int status;

   if (external)
     {
       result = call_nodenamenormalization_unicode_to_transliterate (text,
                                                    in_test, no_unidecode);
       if (result)
         return result;
     }

   /* We silence the transliteration errors that may happen (for example on
      solaris 11).  The calling code should never depend on a specific
      transliteration result, transliteration should only be used for
      internal identifiers. */
   result = encode_string (text, "us-ascii//TRANSLIT", &status, 0,
                           ieh_skip, 0);

   return result;
 }

 char *
 normalize_transliterate_texinfo (const ELEMENT *e, int external_translit,
                                  int in_test, int no_unidecode)
 {
   char *converted_name = convert_to_normalized (e);
   char *normalized_name = normalize_NFC (converted_name);
   char *transliterated = unicode_to_transliterate (normalized_name,
                           external_translit, in_test, no_unidecode);
   char *result = unicode_to_protected (transliterated);

   free (converted_name);
   free (normalized_name);
   free (transliterated);
   return result;
 }

 char *
 normalize_transliterate_texinfo_contents (const ELEMENT *e,
                     int external_translit, int in_test, int no_unidecode)
 {
   ELEMENT *tmp = new_element (ET_NONE);
   char *result;

   tmp->e.c->contents = e->e.c->contents;
   result = normalize_transliterate_texinfo (tmp, external_translit,
                                             in_test, no_unidecode);
   tmp->e.c->contents.list = 0;
   destroy_element (tmp);

   return result;
 }
	/* Copyright 2010-2026 Free Software Foundation, Inc.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	/* In sync with Texinfo::Convert::NodeNameNormalization */

	#include <config.h>
	#include <string.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <ctype.h>
	#include "unistr.h"

	/* also for xvasprintf */
	#include "text.h"
	#include "command_ids.h"
	#include "element_types.h"
	#include "tree_types.h"
	#include "types_data.h"
	/* isascii_alnum bug */
	#include "base_utils.h"
	#include "tree.h"
	#include "extra.h"
	#include "builtin_commands.h"
	/* for xasprintf whitespace_chars find_innermost_accent_contents */
	#include "utils.h"
	#include "debug.h"
	#include "call_perl_function.h"
	#include "unicode.h"
	/* nobrace_symbol_text */
	#include "convert_to_text.h"
	#include "convert_utils.h"
	#include "node_name_normalization.h"

	static const char *command_normalization_text[BUILTIN_CMD_NUMBER];

	/* Should be called only once */
	void
	setup_node_name_normalization (void)
	{
	int i;

	for (i = 0; i < BUILTIN_CMD_NUMBER; i++)
	{
	if (unicode_character_brace_no_arg_commands[i].codepoint)
	command_normalization_text[i]
	= unicode_character_brace_no_arg_commands[i].text;
	else if (text_brace_no_arg_commands[i])
	command_normalization_text[i] = text_brace_no_arg_commands[i];
	else if (nobrace_symbol_text[i])
	{
	if (i == CM_ASTERISK)
	command_normalization_text[i] = " ";
	else
	command_normalization_text[i] = nobrace_symbol_text[i];
	}
	}
	}

	#define ADD(x) text_append (result, x)
	void
	convert_to_normalized_internal (const ELEMENT e, TEXT result)
	{
	enum command_id cmd;

	if (type_data[e->type].flags & TF_text)
	{
	if (e->type != ET_ignorable_spaces_after_command
	&& e->type != ET_ignorable_spaces_before_command
	&& e->type != ET_spaces_at_end
	&& e->type != ET_spaces_before_paragraph
	&& e->type != ET_space_at_end_menu_node
	&& e->type != ET_spaces_after_close_brace
	&& e->e.text->end > 0)
	{
	char *text_norm_spaces = collapse_spaces (e->e.text->text);
	ADD(text_norm_spaces);
	free (text_norm_spaces);
	}
	return;
	}

	cmd = element_builtin_data_cmd (e);

	if ((e->type == ET_postamble_after_end
	\|\| e->type == ET_preamble_before_beginning
	\|\| e->type == ET_preamble_before_setfilename)
	\|\| (cmd
	&& ((cmd == CM_anchor
	\|\| cmd == CM_footnote
	\|\| cmd == CM_shortcaption
	\|\| cmd == CM_caption
	\|\| cmd == CM_hyphenation
	\|\| cmd == CM_namedanchor
	\|\| cmd == CM_sortas
	\|\| cmd == CM_seealso
	\|\| cmd == CM_seeentry)
	/* here ignore the 'regular' line commands */
	\|\| (e->e.c->contents.number > 0
	&& e->e.c->contents.list[0]->type == ET_line_arg)
	/* here ignore the root-level line commands, @node and
	sectioning commands */
	\|\| (e->e.c->contents.number > 0
	&& e->e.c->contents.list[0]->type == ET_arguments_line
	&& e->e.c->contents.list[0]->e.c->contents.number > 0
	&& e->e.c->contents.list[0]->e.c->contents.list[0]->type
	== ET_line_arg))))
	return;

	if (cmd)
	{
	if (command_normalization_text[cmd])
	ADD(command_normalization_text[cmd]);
	else if (builtin_command_data[cmd].flags & CF_accent)
	{
	if (e->e.c->contents.number > 0)
	{
	ACCENTS_STACK *accent_stack
	= find_innermost_accent_contents (e);
	TEXT accent_text;
	char *accented_char;

	if (!accent_stack->argument)
	{
	destroy_accent_stack (accent_stack);
	return;
	}

	text_init (&accent_text);
	convert_to_normalized_internal (accent_stack->argument,
	&accent_text);

	/* We pass undef as last resort formatting function, because we know that
	unicode_accent is used, and it cannot fail/return undef. */
	accented_char = encoded_accents (0, accent_text.text,
	&accent_stack->stack, "utf-8", 0, 0);

	ADD(accented_char);
	free (accented_char);
	free (accent_text.text);
	destroy_accent_stack (accent_stack);
	}
	return;
	}
	else if (builtin_command_data[cmd].flags & CF_ref)
	{
	int order_index = 0;
	int *arguments_order = ref_5_args_order;
	if (cmd == CM_inforef \|\| cmd == CM_link)
	arguments_order = ref_3_args_order;
	while (arguments_order[order_index] >= 0)
	{
	/* no risk with that casting as idx < 5 */
	size_t idx = (size_t) arguments_order[order_index];
	if (e->e.c->contents.number > idx)
	{
	TEXT arg_text;

	text_init (&arg_text);
	convert_to_normalized_internal (
	e->e.c->contents.list[idx], &arg_text);
	if (arg_text.end > 0)
	{
	char *non_space_char = arg_text.text
	+ strspn (arg_text.text, whitespace_chars);
	if (*non_space_char)
	{
	ADD (arg_text.text);
	free (arg_text.text);
	break;
	}
	}
	}
	order_index++;
	}
	return;
	}
	else if (e->e.c->contents.number > 0
	&& (e->e.c->contents.list[0]->type == ET_brace_container
	\|\| e->e.c->contents.list[0]->type == ET_brace_arg
	\|\| cmd == CM_math))
	{
	convert_to_normalized_internal (e->e.c->contents.list[0], result);
	return;
	}
	}
	if (e->e.c->contents.number > 0)
	{
	size_t i;
	for (i = 0; i < e->e.c->contents.number; i++)
	convert_to_normalized_internal (e->e.c->contents.list[i], result);
	}
	}
	#undef ADD

	/* Return value to be freed by caller. */
	char *
	convert_to_normalized (const ELEMENT *e)
	{
	TEXT result;

	if (!e)
	return strdup ("");
	text_init (&result);
	/* this is needed for a test result with empty listoffloats */
	text_append (&result, "");
	convert_to_normalized_internal (e, &result);
	return result.text;
	}

	void
	protect_unicode_char (const char text, TEXT result)
	{
	uint8_t *encoded_u8;
	const uint8_t *next;
	ucs4_t next_char;
	char *str;

	/* determine unicode codepoint */
	encoded_u8 = utf8_from_string (text);
	next = u8_next (&next_char, encoded_u8);
	if (next && *next)
	bug ("Something left on next_str/encoded_u8\n");
	free (encoded_u8);

	if (next_char <= 0xFFFF)
	{
	xasprintf (&str, "%04lx", next_char);
	text_append_n (result, "_", 1);
	}
	else
	{
	xasprintf (&str, "%06lx", next_char);
	text_append_n (result, "__", 2);
	}
	text_append (result, str);
	free (str);
	}

	/* to be freed by caller */
	char unicode_to_protected (const char text)
	{
	TEXT result;
	const char *p = text;

	text_init (&result);
	text_append (&result, "");

	while (*p)
	{
	int n = strspn (p, " ");
	if (n)
	{
	text_append_n (&result, "-", 1);
	p += n;
	if (!*p)
	break;
	}

	if (isascii_alnum (*p))
	{
	text_append_n (&result, p, 1);
	p++;
	}
	else
	{
	int char_len = 1;
	char *next_str;

	/* Count any UTF-8 continuation bytes. */
	while ((p[char_len] & 0xC0) == 0x80)
	char_len++;

	next_str = strndup (p, char_len);
	protect_unicode_char (next_str, &result);

	free (next_str);
	p += char_len;
	}
	}
	return (result.text);
	}

	/* to be freed by caller */
	char normalize_top_name (const char text)
	{
	if (strlen (text) == 3)
	{
	char *normalized = strdup (text);
	char *p;

	for (p = normalized; *p; p++)
	if (isascii_alnum (*p))
	{
	p = tolower (p);
	}
	else
	{
	free (normalized);
	return strdup (text);
	}

	if (!strcmp (normalized, "top"))
	{
	free (normalized);
	return strdup ("Top");
	}

	free (normalized);
	return strdup (text);
	}
	return strdup (text);
	}

	/* to be freed by caller */
	char *
	convert_to_node_identifier (const ELEMENT *element)
	{
	char *converted_name = convert_to_normalized (element);
	char *normalized_name = normalize_NFC (converted_name);
	char *protected = unicode_to_protected (normalized_name);
	char *result = normalize_top_name (protected);

	free (protected);
	free (converted_name);
	free (normalized_name);
	return result;
	}

	char *
	convert_contents_to_node_identifier (const ELEMENT *e)
	{
	ELEMENT *tmp = new_element (ET_NONE);
	char *result;

	tmp->e.c->contents = e->e.c->contents;
	result = convert_to_node_identifier (tmp);
	tmp->e.c->contents.list = 0;
	destroy_element (tmp);

	return result;
	}

	/* to be freed by caller */
	char *
	convert_to_identifier (const ELEMENT *element)
	{
	char *converted_name = convert_to_normalized (element);
	char *normalized_name = normalize_NFC (converted_name);
	char *result = unicode_to_protected (normalized_name);

	free (converted_name);
	free (normalized_name);
	return result;
	}

	char *
	convert_contents_to_identifier (const ELEMENT *e)
	{
	ELEMENT *tmp = new_element (ET_NONE);
	char *result;

	tmp->e.c->contents = e->e.c->contents;
	result = convert_to_identifier (tmp);
	tmp->e.c->contents.list = 0;
	destroy_element (tmp);

	return result;
	}

	static char *
	unicode_to_transliterate (char *text, int external,
	int in_test, int no_unidecode)
	{
	char *result;
	int status;

	if (external)
	{
	result = call_nodenamenormalization_unicode_to_transliterate (text,
	in_test, no_unidecode);
	if (result)
	return result;
	}

	/* We silence the transliteration errors that may happen (for example on
	solaris 11). The calling code should never depend on a specific
	transliteration result, transliteration should only be used for
	internal identifiers. */
	result = encode_string (text, "us-ascii//TRANSLIT", &status, 0,
	ieh_skip, 0);

	return result;
	}

	char *
	normalize_transliterate_texinfo (const ELEMENT *e, int external_translit,
	int in_test, int no_unidecode)
	{
	char *converted_name = convert_to_normalized (e);
	char *normalized_name = normalize_NFC (converted_name);
	char *transliterated = unicode_to_transliterate (normalized_name,
	external_translit, in_test, no_unidecode);
	char *result = unicode_to_protected (transliterated);

	free (converted_name);
	free (normalized_name);
	free (transliterated);
	return result;
	}

	char *
	normalize_transliterate_texinfo_contents (const ELEMENT *e,
	int external_translit, int in_test, int no_unidecode)
	{
	ELEMENT *tmp = new_element (ET_NONE);
	char *result;

	tmp->e.c->contents = e->e.c->contents;
	result = normalize_transliterate_texinfo (tmp, external_translit,
	in_test, no_unidecode);
	tmp->e.c->contents.list = 0;
	destroy_element (tmp);

	return result;
	}