blob: 4c36e97807702b054ef1e12be590c333924687c2 [file] [log] [blame]
/* Copyright 2010-2026 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* In sync with Texinfo::Convert::NodeNameNormalization */
#include <config.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include "unistr.h"
/* also for xvasprintf */
#include "text.h"
#include "command_ids.h"
#include "element_types.h"
#include "tree_types.h"
#include "types_data.h"
/* for isascii_alnum and bug */
#include "base_utils.h"
#include "tree.h"
#include "extra.h"
#include "builtin_commands.h"
/* for xasprintf whitespace_chars find_innermost_accent_contents */
#include "utils.h"
#include "debug.h"
#include "call_perl_function.h"
#include "unicode.h"
/* nobrace_symbol_text */
#include "convert_to_text.h"
#include "convert_utils.h"
#include "node_name_normalization.h"
/* Per-command replacement text used by convert_to_normalized_internal,
   indexed by builtin command id.  Entries not set by
   setup_node_name_normalization keep their static zero initialization. */
static const char *command_normalization_text[BUILTIN_CMD_NUMBER];

/* Fill command_normalization_text.  For each builtin command the first
   matching source wins, in this order:
     - unicode_character_brace_no_arg_commands (entry with a codepoint),
     - text_brace_no_arg_commands,
     - nobrace_symbol_text, except that CM_ASTERISK maps to a single space.
   Should be called only once */
void
setup_node_name_normalization (void)
{
  int cmd;

  for (cmd = 0; cmd < BUILTIN_CMD_NUMBER; cmd++)
    {
      const char **slot = &command_normalization_text[cmd];

      if (unicode_character_brace_no_arg_commands[cmd].codepoint)
        {
          *slot = unicode_character_brace_no_arg_commands[cmd].text;
          continue;
        }

      if (text_brace_no_arg_commands[cmd])
        {
          *slot = text_brace_no_arg_commands[cmd];
          continue;
        }

      if (nobrace_symbol_text[cmd])
        {
          if (cmd == CM_ASTERISK)
            *slot = " ";
          else
            *slot = nobrace_symbol_text[cmd];
        }
    }
}
#define ADD(x) text_append (result, x)

/* Append to RESULT the normalized text of the tree rooted at E.

   - Text elements are appended with their spaces collapsed by
     collapse_spaces, unless the element type is one of the ignorable
     space types listed below or the text is empty.
   - Preamble/postamble elements, the commands listed below (@anchor,
     @footnote, ...) and commands whose first content is a line argument
     (directly, or inside an arguments_line element) contribute nothing.
   - Commands with an entry in command_normalization_text are replaced
     by that text.
   - Accent commands are converted with encoded_accents, in UTF-8.
   - For reference commands, only the first argument, in the order given
     by ref_5_args_order (ref_3_args_order for @inforef and @link), whose
     conversion contains a non whitespace character is used.
   - For other commands whose first content is a brace container or
     brace argument (and for @math), only that first content is converted.
   - Otherwise all the contents are converted in order.  */
void
convert_to_normalized_internal (const ELEMENT *e, TEXT *result)
{
  enum command_id cmd;

  if (type_data[e->type].flags & TF_text)
    {
      if (e->type != ET_ignorable_spaces_after_command
          && e->type != ET_ignorable_spaces_before_command
          && e->type != ET_spaces_at_end
          && e->type != ET_spaces_before_paragraph
          && e->type != ET_space_at_end_menu_node
          && e->type != ET_spaces_after_close_brace
          && e->e.text->end > 0)
        {
          char *text_norm_spaces = collapse_spaces (e->e.text->text);
          ADD(text_norm_spaces);
          free (text_norm_spaces);
        }
      return;
    }

  cmd = element_builtin_data_cmd (e);

  if ((e->type == ET_postamble_after_end
       || e->type == ET_preamble_before_beginning
       || e->type == ET_preamble_before_setfilename)
      || (cmd
          && ((cmd == CM_anchor
               || cmd == CM_footnote
               || cmd == CM_shortcaption
               || cmd == CM_caption
               || cmd == CM_hyphenation
               || cmd == CM_namedanchor
               || cmd == CM_sortas
               || cmd == CM_seealso
               || cmd == CM_seeentry)
              /* here ignore the 'regular' line commands */
              || (e->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->type == ET_line_arg)
              /* here ignore the root-level line commands, @node and
                 sectioning commands */
              || (e->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->type == ET_arguments_line
                  && e->e.c->contents.list[0]->e.c->contents.number > 0
                  && e->e.c->contents.list[0]->e.c->contents.list[0]->type
                       == ET_line_arg))))
    return;

  if (cmd)
    {
      if (command_normalization_text[cmd])
        ADD(command_normalization_text[cmd]);
      else if (builtin_command_data[cmd].flags & CF_accent)
        {
          if (e->e.c->contents.number > 0)
            {
              ACCENTS_STACK *accent_stack
                = find_innermost_accent_contents (e);
              TEXT accent_text;
              char *accented_char;

              /* nothing to put the accents on */
              if (!accent_stack->argument)
                {
                  destroy_accent_stack (accent_stack);
                  return;
                }

              text_init (&accent_text);
              convert_to_normalized_internal (accent_stack->argument,
                                              &accent_text);
              /* We pass a null last resort formatting function, because
                 we know that unicode_accent is used, and it cannot fail
                 or return a null result. */
              accented_char = encoded_accents (0, accent_text.text,
                                 &accent_stack->stack, "utf-8", 0, 0);
              ADD(accented_char);
              free (accented_char);
              free (accent_text.text);
              destroy_accent_stack (accent_stack);
            }
          return;
        }
      else if (builtin_command_data[cmd].flags & CF_ref)
        {
          int order_index = 0;
          int *arguments_order = ref_5_args_order;

          if (cmd == CM_inforef || cmd == CM_link)
            arguments_order = ref_3_args_order;

          /* the order array is terminated by a negative value */
          while (arguments_order[order_index] >= 0)
            {
              /* no risk with that casting as idx < 5 */
              size_t idx = (size_t) arguments_order[order_index];

              if (e->e.c->contents.number > idx)
                {
                  TEXT arg_text;
                  int arg_used = 0;

                  text_init (&arg_text);
                  convert_to_normalized_internal (
                    e->e.c->contents.list[idx], &arg_text);
                  if (arg_text.end > 0)
                    {
                      const char *non_space_char = arg_text.text
                                  + strspn (arg_text.text, whitespace_chars);
                      if (*non_space_char)
                        {
                          ADD (arg_text.text);
                          arg_used = 1;
                        }
                    }
                  /* Release the temporary buffer in every case, including
                     when the argument is empty or whitespace only and is
                     therefore skipped; it used to be leaked then.  As in
                     the accent case above, the buffer is freed without
                     checking that something was appended to it. */
                  free (arg_text.text);
                  if (arg_used)
                    break;
                }
              order_index++;
            }
          return;
        }
      else if (e->e.c->contents.number > 0
               && (e->e.c->contents.list[0]->type == ET_brace_container
                   || e->e.c->contents.list[0]->type == ET_brace_arg
                   || cmd == CM_math))
        {
          convert_to_normalized_internal (e->e.c->contents.list[0], result);
          return;
        }
    }

  if (e->e.c->contents.number > 0)
    {
      size_t i;
      for (i = 0; i < e->e.c->contents.number; i++)
        convert_to_normalized_internal (e->e.c->contents.list[i], result);
    }
}
#undef ADD
/* Convert the tree rooted at E with convert_to_normalized_internal and
   return the accumulated string.  An empty string is returned for a
   null E.
   Return value to be freed by caller. */
char *
convert_to_normalized (const ELEMENT *e)
{
  TEXT normalized;

  if (e == 0)
    return strdup ("");

  text_init (&normalized);

  /* this is needed for a test result with empty listoffloats */
  text_append (&normalized, "");

  convert_to_normalized_internal (e, &normalized);

  return normalized.text;
}
/* Append to RESULT the protected form of the character at the start of
   TEXT: "_" followed by the codepoint as at least 4 lowercase hexadecimal
   digits if the codepoint is at most 0xFFFF, "__" followed by at least
   6 hexadecimal digits otherwise.
   TEXT is expected to hold exactly one character; bug is called if
   something is left after the first character. */
void
protect_unicode_char (const char *text, TEXT *result)
{
  uint8_t *encoded_u8;
  const uint8_t *next;
  ucs4_t next_char;
  char *str;

  /* determine unicode codepoint */
  encoded_u8 = utf8_from_string (text);
  next = u8_next (&next_char, encoded_u8);
  if (next && *next)
    bug ("Something left on next_str/encoded_u8\n");
  free (encoded_u8);

  /* The codepoint is cast to unsigned long to match the %lx conversion;
     passing an ucs4_t directly is a type mismatch wherever ucs4_t is
     narrower than unsigned long. */
  if (next_char <= 0xFFFF)
    {
      xasprintf (&str, "%04lx", (unsigned long) next_char);
      text_append_n (result, "_", 1);
    }
  else
    {
      xasprintf (&str, "%06lx", (unsigned long) next_char);
      text_append_n (result, "__", 2);
    }
  text_append (result, str);
  free (str);
}
/* Return a protected copy of TEXT:
     - each run of spaces becomes a single "-",
     - ASCII alphanumeric characters (as determined by isascii_alnum) are
       copied unchanged,
     - any other character, taken as one byte plus the UTF-8 continuation
       bytes following it, is replaced by the output of
       protect_unicode_char.
   to be freed by caller */
char *unicode_to_protected (const char *text)
{
  TEXT result;
  const char *p = text;

  text_init (&result);
  /* same as in convert_to_normalized; presumably makes sure that a string
     is returned even if nothing else is appended */
  text_append (&result, "");

  while (*p)
    {
      /* lengths are kept as size_t, the type returned by strspn, to avoid
         narrowing to int */
      size_t n = strspn (p, " ");
      if (n)
        {
          text_append_n (&result, "-", 1);
          p += n;
          if (!*p)
            break;
        }

      if (isascii_alnum (*p))
        {
          text_append_n (&result, p, 1);
          p++;
        }
      else
        {
          size_t char_len = 1;
          char *next_str;

          /* Count any UTF-8 continuation bytes. */
          while ((p[char_len] & 0xC0) == 0x80)
            char_len++;

          next_str = strndup (p, char_len);
          protect_unicode_char (next_str, &result);
          free (next_str);
          p += char_len;
        }
    }
  return (result.text);
}
/* Return "Top" if TEXT is a three characters string made only of ASCII
   alphanumeric characters that is equal to "top" once converted to
   lower case.  Return a copy of TEXT in all the other cases.
   to be freed by caller */
char *normalize_top_name (const char *text)
{
  char lowered[4];
  size_t i;

  if (strlen (text) != 3)
    return strdup (text);

  for (i = 0; i < 3; i++)
    {
      /* any non alphanumeric character rules out a match */
      if (!isascii_alnum (text[i]))
        return strdup (text);
      lowered[i] = tolower (text[i]);
    }
  lowered[3] = '\0';

  if (strcmp (lowered, "top") != 0)
    return strdup (text);

  return strdup ("Top");
}
/* Chain convert_to_normalized, normalize_NFC, unicode_to_protected and
   normalize_top_name on ELEMENT and return the final string.
   to be freed by caller */
char *
convert_to_node_identifier (const ELEMENT *element)
{
  char *converted;
  char *nfc;
  char *protected_name;
  char *identifier;

  converted = convert_to_normalized (element);
  nfc = normalize_NFC (converted);
  protected_name = unicode_to_protected (nfc);
  identifier = normalize_top_name (protected_name);

  /* the intermediate strings are not needed anymore */
  free (protected_name);
  free (converted);
  free (nfc);

  return identifier;
}
/* Same as convert_to_node_identifier, but applied to the contents of E
   only: the contents are converted through a temporary ET_NONE element.
   The result is returned by convert_to_node_identifier, which documents
   it as to be freed by caller. */
char *
convert_contents_to_node_identifier (const ELEMENT *e)
{
  char *identifier;
  ELEMENT *container = new_element (ET_NONE);

  /* the temporary element shares the contents of E */
  container->e.c->contents = e->e.c->contents;

  identifier = convert_to_node_identifier (container);

  /* reset the shared list before destroying the temporary element,
     presumably to make sure that the contents of E are not released */
  container->e.c->contents.list = 0;
  destroy_element (container);

  return identifier;
}
/* Chain convert_to_normalized, normalize_NFC and unicode_to_protected on
   ELEMENT and return the final string.  Contrary to
   convert_to_node_identifier, normalize_top_name is not applied.
   to be freed by caller */
char *
convert_to_identifier (const ELEMENT *element)
{
  char *converted;
  char *nfc;
  char *identifier;

  converted = convert_to_normalized (element);
  nfc = normalize_NFC (converted);
  identifier = unicode_to_protected (nfc);

  /* the intermediate strings are not needed anymore */
  free (converted);
  free (nfc);

  return identifier;
}
/* Same as convert_to_identifier, but applied to the contents of E only:
   the contents are converted through a temporary ET_NONE element.
   The result is returned by convert_to_identifier, which documents it as
   to be freed by caller. */
char *
convert_contents_to_identifier (const ELEMENT *e)
{
  char *identifier;
  ELEMENT *container = new_element (ET_NONE);

  /* the temporary element shares the contents of E */
  container->e.c->contents = e->e.c->contents;

  identifier = convert_to_identifier (container);

  /* reset the shared list before destroying the temporary element,
     presumably to make sure that the contents of E are not released */
  container->e.c->contents.list = 0;
  destroy_element (container);

  return identifier;
}
/* Return a transliteration of TEXT.  If EXTERNAL is set,
   call_nodenamenormalization_unicode_to_transliterate is tried first with
   IN_TEST and NO_UNIDECODE, and its result is returned if not null.
   Otherwise, or if that call returned a null pointer, TEXT is converted
   with encode_string to "us-ascii//TRANSLIT". */
static char *
unicode_to_transliterate (char *text, int external,
                          int in_test, int no_unidecode)
{
  /* set by encode_string, not used afterwards */
  int status;

  if (external)
    {
      char *external_result
        = call_nodenamenormalization_unicode_to_transliterate (text,
                                                 in_test, no_unidecode);
      if (external_result)
        return external_result;
    }

  /* We silence the transliteration errors that may happen (for example on
     solaris 11).  The calling code should never depend on a specific
     transliteration result, transliteration should only be used for
     internal identifiers. */
  return encode_string (text, "us-ascii//TRANSLIT", &status, 0,
                        ieh_skip, 0);
}
/* Chain convert_to_normalized, normalize_NFC, unicode_to_transliterate
   and unicode_to_protected on E and return the final string.
   EXTERNAL_TRANSLIT, IN_TEST and NO_UNIDECODE are passed unchanged to
   unicode_to_transliterate.
   The result is returned by unicode_to_protected, which documents it as
   to be freed by caller. */
char *
normalize_transliterate_texinfo (const ELEMENT *e, int external_translit,
                                 int in_test, int no_unidecode)
{
  char *converted;
  char *nfc;
  char *ascii_text;
  char *identifier;

  converted = convert_to_normalized (e);
  nfc = normalize_NFC (converted);
  ascii_text = unicode_to_transliterate (nfc, external_translit,
                                         in_test, no_unidecode);
  identifier = unicode_to_protected (ascii_text);

  /* the intermediate strings are not needed anymore */
  free (converted);
  free (nfc);
  free (ascii_text);

  return identifier;
}
/* Same as normalize_transliterate_texinfo, but applied to the contents of
   E only: the contents are converted through a temporary ET_NONE element.
   The result comes from normalize_transliterate_texinfo unchanged. */
char *
normalize_transliterate_texinfo_contents (const ELEMENT *e,
                    int external_translit, int in_test, int no_unidecode)
{
  char *identifier;
  ELEMENT *container = new_element (ET_NONE);

  /* the temporary element shares the contents of E */
  container->e.c->contents = e->e.c->contents;

  identifier = normalize_transliterate_texinfo (container,
                              external_translit, in_test, no_unidecode);

  /* reset the shared list before destroying the temporary element,
     presumably to make sure that the contents of E are not released */
  container->e.c->contents.list = 0;
  destroy_element (container);

  return identifier;
}