blob: 7c0a18659ff178a05ee1c30f19b8def29a199371 [file]
/* Copyright 2010-2026 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* In sync with Texinfo::Convert::NodeNameNormalization */
#include <config.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "unistr.h"
/* also for xvasprintf */
#include "text.h"
#include "command_ids.h"
#include "element_types.h"
#include "tree_types.h"
#include "types_data.h"
/* isascii_alnum bug */
#include "base_utils.h"
#include "tree.h"
#include "extra.h"
#include "builtin_commands.h"
/* for xasprintf whitespace_chars find_innermost_accent_contents */
#include "utils.h"
#include "debug.h"
#include "call_perl_function.h"
#include "unicode.h"
/* nobrace_symbol_text */
#include "convert_to_text.h"
#include "convert_utils.h"
#include "node_name_normalization.h"
/* Text appended by convert_to_normalized_internal for a command, indexed
   by builtin command id.  The entries are filled in by
   setup_node_name_normalization; as the array has static storage, the
   entries that are never assigned remain null pointers. */
static const char *command_normalization_text[BUILTIN_CMD_NUMBER];
/* Fill command_normalization_text for the builtin commands.  For each
   command the first available source is used, in this order:
   unicode_character_brace_no_arg_commands (when a codepoint is set),
   text_brace_no_arg_commands, then nobrace_symbol_text.
   Should be called only once. */
void
setup_node_name_normalization (void)
{
  int cmd;

  for (cmd = 0; cmd < BUILTIN_CMD_NUMBER; cmd++)
    {
      const char **slot = &command_normalization_text[cmd];

      if (unicode_character_brace_no_arg_commands[cmd].codepoint)
        {
          *slot = unicode_character_brace_no_arg_commands[cmd].text;
          continue;
        }

      if (text_brace_no_arg_commands[cmd])
        {
          *slot = text_brace_no_arg_commands[cmd];
          continue;
        }

      /* CM_ASTERISK is mapped to a single space instead of its
         nobrace_symbol_text entry. */
      if (nobrace_symbol_text[cmd])
        *slot = (cmd == CM_ASTERISK) ? " " : nobrace_symbol_text[cmd];
    }
}
#define ADD(x) text_append (result, x)

/* Append to RESULT the text used for node name normalization of the tree
   rooted at E.

   - Text elements: the ignorable space types listed below and empty text
     contribute nothing; other text is appended after collapse_spaces.
   - Preamble/postamble containers, a fixed list of commands (anchor,
     footnote, captions, ...) and commands whose first content is a line
     argument are skipped entirely.
   - A command with an entry in command_normalization_text gets that text
     appended; its contents, if any, are then converted as well.
   - Accent commands are converted through encoded_accents using "utf-8".
   - Ref commands contribute the first argument, following
     ref_5_args_order (ref_3_args_order for CM_inforef and CM_link), whose
     conversion contains something other than whitespace.
   - Commands whose first content is a brace container or brace argument,
     and CM_math, contribute the conversion of that first content only.
   - In any other case all the contents are converted in order. */
void
convert_to_normalized_internal (const ELEMENT *e, TEXT *result)
{
  enum command_id cmd;

  if (type_data[e->type].flags & TF_text)
    {
      if (e->type != ET_ignorable_spaces_after_command
          && e->type != ET_spaces_at_end
          && e->type != ET_spaces_before_paragraph
          && e->type != ET_space_at_end_menu_node
          && e->type != ET_spaces_after_close_brace
          && e->type != ET_spaces_before_argument
          && e->type != ET_spaces_after_argument
          && e->e.text->end > 0)
        {
          char *text_norm_spaces = collapse_spaces (e->e.text->text);
          ADD(text_norm_spaces);
          free (text_norm_spaces);
        }
      return;
    }

  cmd = element_builtin_data_cmd (e);

  if ((e->type == ET_postamble_after_end
       || e->type == ET_preamble_before_beginning
       || e->type == ET_preamble_before_setfilename)
      || (cmd
          && ((cmd == CM_anchor
               || cmd == CM_footnote
               || cmd == CM_shortcaption
               || cmd == CM_caption
               || cmd == CM_hyphenation
               || cmd == CM_namedanchor
               || cmd == CM_sortas
               || cmd == CM_seealso
               || cmd == CM_seeentry)
              /* here ignore the 'regular' line commands */
              || (e->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->type == ET_line_arg)
              /* here ignore the root-level line commands, @node and
                 sectioning commands */
              || (e->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->type == ET_arguments_line
                  && e->e.c->contents.list[0]->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->e.c->contents.list[0]->type
                       == ET_line_arg))))
    return;

  if (cmd)
    {
      if (command_normalization_text[cmd])
        ADD(command_normalization_text[cmd]);
      else if (builtin_command_data[cmd].flags & CF_accent)
        {
          if (e->e.c->contents.number > 0)
            {
              ACCENTS_STACK *accent_stack
                = find_innermost_accent_contents (e);
              TEXT accent_text;
              char *accented_char;

              if (!accent_stack->argument)
                {
                  destroy_accent_stack (accent_stack);
                  return;
                }

              text_init (&accent_text);
              /* Make sure that accent_text.text is a string, possibly
                 empty, even if the argument contributes no text, in the
                 same way as convert_to_normalized does for its result. */
              text_append (&accent_text, "");
              convert_to_normalized_internal (accent_stack->argument,
                                              &accent_text);
              /* A null last resort formatting function is passed, because
                 we know that unicode_accent is used, and it cannot fail
                 or return null. */
              accented_char = encoded_accents (0, accent_text.text,
                                   &accent_stack->stack, "utf-8", 0, 0);
              ADD(accented_char);
              free (accented_char);
              free (accent_text.text);
              destroy_accent_stack (accent_stack);
            }
          return;
        }
      else if (builtin_command_data[cmd].flags & CF_ref)
        {
          int order_index = 0;
          int *arguments_order = ref_5_args_order;

          if (cmd == CM_inforef || cmd == CM_link)
            arguments_order = ref_3_args_order;

          /* The order array is terminated by a negative value */
          while (arguments_order[order_index] >= 0)
            {
              /* no risk with that casting as idx < 5 */
              size_t idx = (size_t) arguments_order[order_index];

              if (e->e.c->contents.number > idx)
                {
                  TEXT arg_text;
                  int found = 0;

                  text_init (&arg_text);
                  convert_to_normalized_internal (
                              e->e.c->contents.list[idx], &arg_text);
                  if (arg_text.end > 0)
                    {
                      const char *non_space_char = arg_text.text
                        + strspn (arg_text.text, whitespace_chars);
                      if (*non_space_char)
                        {
                          ADD (arg_text.text);
                          found = 1;
                        }
                    }
                  /* Release the scratch buffer in every case, including
                     for an argument holding only whitespace, which would
                     otherwise be leaked. */
                  free (arg_text.text);
                  if (found)
                    break;
                }
              order_index++;
            }
          return;
        }
      else if (e->e.c->contents.number > 0
               && (e->e.c->contents.list[0]->type == ET_brace_container
                   || e->e.c->contents.list[0]->type == ET_brace_arg
                   || cmd == CM_math))
        {
          convert_to_normalized_internal (e->e.c->contents.list[0], result);
          return;
        }
    }

  if (e->e.c->contents.number > 0)
    {
      size_t i;
      for (i = 0; i < e->e.c->contents.number; i++)
        convert_to_normalized_internal (e->e.c->contents.list[i], result);
    }
}
#undef ADD
/* Convert the tree rooted at E with convert_to_normalized_internal and
   return the resulting string.  An empty string is returned for a null E.
   Return value to be freed by caller. */
char *
convert_to_normalized (const ELEMENT *e)
{
  TEXT normalized;

  if (e)
    {
      text_init (&normalized);
      /* this is needed for a test result with empty listoffloats */
      text_append (&normalized, "");
      convert_to_normalized_internal (e, &normalized);
      return normalized.text;
    }

  return strdup ("");
}
/* Append to RESULT the protected form of the character in TEXT: '_'
   followed by the code point as at least four lower-case hexadecimal
   digits for code points up to 0xFFFF, "__" followed by at least six
   digits for higher code points.
   TEXT is expected to hold exactly one UTF-8 encoded character; bug is
   called if something remains after the first character. */
void
protect_unicode_char (const char *text, TEXT *result)
{
  uint8_t *encoded_u8;
  const uint8_t *next;
  ucs4_t next_char;
  char *str;

  /* determine unicode codepoint */
  encoded_u8 = utf8_from_string (text);
  next = u8_next (&next_char, encoded_u8);
  if (next && *next)
    bug ("Something left on next_str/encoded_u8\n");
  free (encoded_u8);

  /* ucs4_t is a 32-bit type, while the l length modifier requires an
     unsigned long argument: convert explicitly, as a mismatch between
     the conversion specification and the argument type is undefined. */
  if (next_char <= 0xFFFF)
    {
      xasprintf (&str, "%04lx", (unsigned long) next_char);
      text_append_n (result, "_", 1);
    }
  else
    {
      xasprintf (&str, "%06lx", (unsigned long) next_char);
      text_append_n (result, "__", 2);
    }

  text_append (result, str);
  free (str);
}
/* Return TEXT with each run of spaces replaced by a single '-', ASCII
   alphanumeric characters kept as is, and every other character replaced
   by the output of protect_unicode_char.
   to be freed by caller */
char *unicode_to_protected (const char *text)
{
  TEXT result;
  const char *p = text;

  text_init (&result);
  text_append (&result, "");

  while (*p)
    {
      /* strspn returns a size_t, keep the byte counts in that type
         instead of narrowing them to int. */
      size_t n = strspn (p, " ");
      if (n)
        {
          text_append_n (&result, "-", 1);
          p += n;
          if (!*p)
            break;
        }

      if (isascii_alnum (*p))
        {
          text_append_n (&result, p, 1);
          p++;
        }
      else
        {
          size_t char_len = 1;
          char *next_str;

          /* Count any UTF-8 continuation bytes. */
          while ((p[char_len] & 0xC0) == 0x80)
            char_len++;

          next_str = strndup (p, char_len);
          protect_unicode_char (next_str, &result);
          free (next_str);
          p += char_len;
        }
    }

  return (result.text);
}
/* Return "Top" if TEXT is "top" in any combination of upper and lower
   case, and a copy of TEXT otherwise.
   The letters are compared directly: no scratch copy is allocated (the
   result of which would need to be checked before use), and the
   comparison does not depend on the locale.
   to be freed by caller */
char *normalize_top_name (const char *text)
{
  if (strlen (text) == 3
      && (text[0] == 't' || text[0] == 'T')
      && (text[1] == 'o' || text[1] == 'O')
      && (text[2] == 'p' || text[2] == 'P'))
    return strdup ("Top");

  return strdup (text);
}
/* Return the node identifier for ELEMENT: the tree is converted with
   convert_to_normalized, then passed in turn through normalize_NFC,
   unicode_to_protected and normalize_top_name.
   to be freed by caller */
char *
convert_to_node_identifier (const ELEMENT *element)
{
  char *normalized_text;
  char *nfc_text;
  char *protected_text;
  char *identifier;

  /* Each step returns a new string; the input of a step is released as
     soon as the step is done. */
  normalized_text = convert_to_normalized (element);
  nfc_text = normalize_NFC (normalized_text);
  free (normalized_text);

  protected_text = unicode_to_protected (nfc_text);
  free (nfc_text);

  identifier = normalize_top_name (protected_text);
  free (protected_text);

  return identifier;
}
/* Same as convert_to_node_identifier, but applied to the contents of E
   instead of E itself.  The contents are lent to a temporary ET_NONE
   element for the duration of the conversion.
   to be freed by caller */
char *
convert_contents_to_node_identifier (const ELEMENT *e)
{
  ELEMENT *contents_holder = new_element (ET_NONE);
  char *identifier;

  contents_holder->e.c->contents = e->e.c->contents;
  identifier = convert_to_node_identifier (contents_holder);

  /* The list is still used by E: reset the pointer before destroying
     the temporary element (presumably so that destroy_element does not
     release it). */
  contents_holder->e.c->contents.list = 0;
  destroy_element (contents_holder);

  return identifier;
}
/* Return the identifier for ELEMENT: the tree is converted with
   convert_to_normalized, then passed through normalize_NFC and
   unicode_to_protected.  Unlike convert_to_node_identifier,
   normalize_top_name is not applied.
   to be freed by caller */
char *
convert_to_identifier (const ELEMENT *element)
{
  char *normalized_text;
  char *nfc_text;
  char *identifier;

  normalized_text = convert_to_normalized (element);
  nfc_text = normalize_NFC (normalized_text);
  free (normalized_text);

  identifier = unicode_to_protected (nfc_text);
  free (nfc_text);

  return identifier;
}
/* Same as convert_to_identifier, but applied to the contents of E
   instead of E itself.  The contents are lent to a temporary ET_NONE
   element for the duration of the conversion.
   to be freed by caller */
char *
convert_contents_to_identifier (const ELEMENT *e)
{
  ELEMENT *contents_holder = new_element (ET_NONE);
  char *identifier;

  contents_holder->e.c->contents = e->e.c->contents;
  identifier = convert_to_identifier (contents_holder);

  /* The list is still used by E: reset the pointer before destroying
     the temporary element (presumably so that destroy_element does not
     release it). */
  contents_holder->e.c->contents.list = 0;
  destroy_element (contents_holder);

  return identifier;
}
/* Transliterate TEXT with encode_string, using the "us-ascii//TRANSLIT"
   encoding.

   Transliteration/protection with GNU iconv leads to a result different
   from Perl for some characters.  It seems that the iconv result depends
   on the locale, and quite a few '?' are output, probably when there is
   no obvious transliteration.  In those cases, the Unidecode
   transliterations are not necessarily very good, either.  There is
   no reason to think that all the iconv implementations transliterate
   the same way, nor the same as Perl, therefore differences are expected.
 */
static char *
unicode_to_transliterate (char *text)
{
  /* Set by encode_string, not checked here */
  int status;

  /* We silence the transliteration errors that may happen (for example on
     solaris 11).  The calling code should never depend on a specific
     transliteration result, transliteration should only be used for
     internal identifiers. */
  return encode_string (text, "us-ascii//TRANSLIT", &status, 0,
                        ieh_skip, 0);
}
/* Return the transliterated identifier for E: the tree is converted with
   convert_to_normalized, then passed in turn through normalize_NFC,
   unicode_to_transliterate and unicode_to_protected.
   to be freed by caller */
char *
normalize_transliterate_texinfo (const ELEMENT *e)
{
  char *normalized_text;
  char *nfc_text;
  char *ascii_text;
  char *identifier;

  /* Each step returns a new string; the input of a step is released as
     soon as the step is done. */
  normalized_text = convert_to_normalized (e);
  nfc_text = normalize_NFC (normalized_text);
  free (normalized_text);

  ascii_text = unicode_to_transliterate (nfc_text);
  free (nfc_text);

  identifier = unicode_to_protected (ascii_text);
  free (ascii_text);

  return identifier;
}
/* Same as normalize_transliterate_texinfo, but applied to the contents
   of E instead of E itself.  The contents are lent to a temporary ET_NONE
   element for the duration of the conversion.
   to be freed by caller */
char *
normalize_transliterate_texinfo_contents (const ELEMENT *e)
{
  ELEMENT *contents_holder = new_element (ET_NONE);
  char *identifier;

  contents_holder->e.c->contents = e->e.c->contents;
  identifier = normalize_transliterate_texinfo (contents_holder);

  /* The list is still used by E: reset the pointer before destroying
     the temporary element (presumably so that destroy_element does not
     release it). */
  contents_holder->e.c->contents.list = 0;
  destroy_element (contents_holder);

  return identifier;
}