/* Copyright 2010-2026 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
| |
| /* In sync with Texinfo::Convert::NodeNameNormalization */ |
| |
| #include <config.h> |
| #include <string.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <ctype.h> |
| #include "unistr.h" |
| |
| /* also for xvasprintf */ |
| #include "text.h" |
| #include "command_ids.h" |
| #include "element_types.h" |
| #include "tree_types.h" |
| #include "types_data.h" |
| /* isascii_alnum bug */ |
| #include "base_utils.h" |
| #include "tree.h" |
| #include "extra.h" |
| #include "builtin_commands.h" |
| /* for xasprintf whitespace_chars find_innermost_accent_contents */ |
| #include "utils.h" |
| #include "debug.h" |
| #include "call_perl_function.h" |
| #include "unicode.h" |
| /* nobrace_symbol_text */ |
| #include "convert_to_text.h" |
| #include "convert_utils.h" |
| #include "node_name_normalization.h" |
| |
| static const char *command_normalization_text[BUILTIN_CMD_NUMBER]; |
| |
| /* Should be called only once */ |
| void |
| setup_node_name_normalization (void) |
| { |
| int i; |
| |
| for (i = 0; i < BUILTIN_CMD_NUMBER; i++) |
| { |
| if (unicode_character_brace_no_arg_commands[i].codepoint) |
| command_normalization_text[i] |
| = unicode_character_brace_no_arg_commands[i].text; |
| else if (text_brace_no_arg_commands[i]) |
| command_normalization_text[i] = text_brace_no_arg_commands[i]; |
| else if (nobrace_symbol_text[i]) |
| { |
| if (i == CM_ASTERISK) |
| command_normalization_text[i] = " "; |
| else |
| command_normalization_text[i] = nobrace_symbol_text[i]; |
| } |
| } |
| } |
| |
| #define ADD(x) text_append (result, x) |
| void |
| convert_to_normalized_internal (const ELEMENT *e, TEXT *result) |
| { |
| enum command_id cmd; |
| |
| if (type_data[e->type].flags & TF_text) |
| { |
| if (e->type != ET_ignorable_spaces_after_command |
| && e->type != ET_ignorable_spaces_before_command |
| && e->type != ET_spaces_at_end |
| && e->type != ET_spaces_before_paragraph |
| && e->type != ET_space_at_end_menu_node |
| && e->type != ET_spaces_after_close_brace |
| && e->e.text->end > 0) |
| { |
| char *text_norm_spaces = collapse_spaces (e->e.text->text); |
| ADD(text_norm_spaces); |
| free (text_norm_spaces); |
| } |
| return; |
| } |
| |
| cmd = element_builtin_data_cmd (e); |
| |
| if ((e->type == ET_postamble_after_end |
| || e->type == ET_preamble_before_beginning |
| || e->type == ET_preamble_before_setfilename) |
| || (cmd |
| && ((cmd == CM_anchor |
| || cmd == CM_footnote |
| || cmd == CM_shortcaption |
| || cmd == CM_caption |
| || cmd == CM_hyphenation |
| || cmd == CM_namedanchor |
| || cmd == CM_sortas |
| || cmd == CM_seealso |
| || cmd == CM_seeentry) |
| /* here ignore the 'regular' line commands */ |
| || (e->e.c->contents.number > 0 |
| && e->e.c->contents.list[0]->type == ET_line_arg) |
| /* here ignore the root-level line commands, @node and |
| sectioning commands */ |
| || (e->e.c->contents.number > 0 |
| && e->e.c->contents.list[0]->type == ET_arguments_line |
| && e->e.c->contents.list[0]->e.c->contents.number > 0 |
| && e->e.c->contents.list[0]->e.c->contents.list[0]->type |
| == ET_line_arg)))) |
| return; |
| |
| if (cmd) |
| { |
| if (command_normalization_text[cmd]) |
| ADD(command_normalization_text[cmd]); |
| else if (builtin_command_data[cmd].flags & CF_accent) |
| { |
| if (e->e.c->contents.number > 0) |
| { |
| ACCENTS_STACK *accent_stack |
| = find_innermost_accent_contents (e); |
| TEXT accent_text; |
| char *accented_char; |
| |
| if (!accent_stack->argument) |
| { |
| destroy_accent_stack (accent_stack); |
| return; |
| } |
| |
| text_init (&accent_text); |
| convert_to_normalized_internal (accent_stack->argument, |
| &accent_text); |
| |
| /* We pass undef as last resort formatting function, because we know that |
| unicode_accent is used, and it cannot fail/return undef. */ |
| accented_char = encoded_accents (0, accent_text.text, |
| &accent_stack->stack, "utf-8", 0, 0); |
| |
| ADD(accented_char); |
| free (accented_char); |
| free (accent_text.text); |
| destroy_accent_stack (accent_stack); |
| } |
| return; |
| } |
| else if (builtin_command_data[cmd].flags & CF_ref) |
| { |
| int order_index = 0; |
| int *arguments_order = ref_5_args_order; |
| if (cmd == CM_inforef || cmd == CM_link) |
| arguments_order = ref_3_args_order; |
| while (arguments_order[order_index] >= 0) |
| { |
| /* no risk with that casting as idx < 5 */ |
| size_t idx = (size_t) arguments_order[order_index]; |
| if (e->e.c->contents.number > idx) |
| { |
| TEXT arg_text; |
| |
| text_init (&arg_text); |
| convert_to_normalized_internal ( |
| e->e.c->contents.list[idx], &arg_text); |
| if (arg_text.end > 0) |
| { |
| char *non_space_char = arg_text.text |
| + strspn (arg_text.text, whitespace_chars); |
| if (*non_space_char) |
| { |
| ADD (arg_text.text); |
| free (arg_text.text); |
| break; |
| } |
| } |
| } |
| order_index++; |
| } |
| return; |
| } |
| else if (e->e.c->contents.number > 0 |
| && (e->e.c->contents.list[0]->type == ET_brace_container |
| || e->e.c->contents.list[0]->type == ET_brace_arg |
| || cmd == CM_math)) |
| { |
| convert_to_normalized_internal (e->e.c->contents.list[0], result); |
| return; |
| } |
| } |
| if (e->e.c->contents.number > 0) |
| { |
| size_t i; |
| for (i = 0; i < e->e.c->contents.number; i++) |
| convert_to_normalized_internal (e->e.c->contents.list[i], result); |
| } |
| } |
| #undef ADD |
| |
| /* Return value to be freed by caller. */ |
| char * |
| convert_to_normalized (const ELEMENT *e) |
| { |
| TEXT result; |
| |
| if (!e) |
| return strdup (""); |
| text_init (&result); |
| /* this is needed for a test result with empty listoffloats */ |
| text_append (&result, ""); |
| convert_to_normalized_internal (e, &result); |
| return result.text; |
| } |
| |
| void |
| protect_unicode_char (const char *text, TEXT *result) |
| { |
| uint8_t *encoded_u8; |
| const uint8_t *next; |
| ucs4_t next_char; |
| char *str; |
| |
| /* determine unicode codepoint */ |
| encoded_u8 = utf8_from_string (text); |
| next = u8_next (&next_char, encoded_u8); |
| if (next && *next) |
| bug ("Something left on next_str/encoded_u8\n"); |
| free (encoded_u8); |
| |
| if (next_char <= 0xFFFF) |
| { |
| xasprintf (&str, "%04lx", next_char); |
| text_append_n (result, "_", 1); |
| } |
| else |
| { |
| xasprintf (&str, "%06lx", next_char); |
| text_append_n (result, "__", 2); |
| } |
| text_append (result, str); |
| free (str); |
| } |
| |
| /* to be freed by caller */ |
| char *unicode_to_protected (const char *text) |
| { |
| TEXT result; |
| const char *p = text; |
| |
| text_init (&result); |
| text_append (&result, ""); |
| |
| while (*p) |
| { |
| int n = strspn (p, " "); |
| if (n) |
| { |
| text_append_n (&result, "-", 1); |
| p += n; |
| if (!*p) |
| break; |
| } |
| |
| if (isascii_alnum (*p)) |
| { |
| text_append_n (&result, p, 1); |
| p++; |
| } |
| else |
| { |
| int char_len = 1; |
| char *next_str; |
| |
| /* Count any UTF-8 continuation bytes. */ |
| while ((p[char_len] & 0xC0) == 0x80) |
| char_len++; |
| |
| next_str = strndup (p, char_len); |
| protect_unicode_char (next_str, &result); |
| |
| free (next_str); |
| p += char_len; |
| } |
| } |
| return (result.text); |
| } |
| |
/* Return "Top" if TEXT is the three letters word "top" in any letter
   case, and a copy of TEXT otherwise.
   The comparison is done directly on ASCII letters, such that no
   intermediate copy of TEXT is needed and the result does not depend on
   the locale.
   To be freed by caller. */
char *normalize_top_name (const char *text)
{
  static const char lower_top[] = "top";
  static const char upper_top[] = "TOP";
  size_t i;

  /* The terminating null character is compared too, in order to check
     that TEXT ends after the third letter.  The comparison stops at the
     first mismatch, so TEXT is never read past its end. */
  for (i = 0; i < sizeof (lower_top); i++)
    {
      if (text[i] != lower_top[i] && text[i] != upper_top[i])
        return strdup (text);
    }

  return strdup ("Top");
}
| |
| /* to be freed by caller */ |
| char * |
| convert_to_node_identifier (const ELEMENT *element) |
| { |
| char *converted_name = convert_to_normalized (element); |
| char *normalized_name = normalize_NFC (converted_name); |
| char *protected = unicode_to_protected (normalized_name); |
| char *result = normalize_top_name (protected); |
| |
| free (protected); |
| free (converted_name); |
| free (normalized_name); |
| return result; |
| } |
| |
| char * |
| convert_contents_to_node_identifier (const ELEMENT *e) |
| { |
| ELEMENT *tmp = new_element (ET_NONE); |
| char *result; |
| |
| tmp->e.c->contents = e->e.c->contents; |
| result = convert_to_node_identifier (tmp); |
| tmp->e.c->contents.list = 0; |
| destroy_element (tmp); |
| |
| return result; |
| } |
| |
| /* to be freed by caller */ |
| char * |
| convert_to_identifier (const ELEMENT *element) |
| { |
| char *converted_name = convert_to_normalized (element); |
| char *normalized_name = normalize_NFC (converted_name); |
| char *result = unicode_to_protected (normalized_name); |
| |
| free (converted_name); |
| free (normalized_name); |
| return result; |
| } |
| |
| char * |
| convert_contents_to_identifier (const ELEMENT *e) |
| { |
| ELEMENT *tmp = new_element (ET_NONE); |
| char *result; |
| |
| tmp->e.c->contents = e->e.c->contents; |
| result = convert_to_identifier (tmp); |
| tmp->e.c->contents.list = 0; |
| destroy_element (tmp); |
| |
| return result; |
| } |
| |
| static char * |
| unicode_to_transliterate (char *text, int external, |
| int in_test, int no_unidecode) |
| { |
| char *result; |
| int status; |
| |
| if (external) |
| { |
| result = call_nodenamenormalization_unicode_to_transliterate (text, |
| in_test, no_unidecode); |
| if (result) |
| return result; |
| } |
| |
| /* We silence the transliteration errors that may happen (for example on |
| solaris 11). The calling code should never depend on a specific |
| transliteration result, transliteration should only be used for |
| internal identifiers. */ |
| result = encode_string (text, "us-ascii//TRANSLIT", &status, 0, |
| ieh_skip, 0); |
| |
| return result; |
| } |
| |
| char * |
| normalize_transliterate_texinfo (const ELEMENT *e, int external_translit, |
| int in_test, int no_unidecode) |
| { |
| char *converted_name = convert_to_normalized (e); |
| char *normalized_name = normalize_NFC (converted_name); |
| char *transliterated = unicode_to_transliterate (normalized_name, |
| external_translit, in_test, no_unidecode); |
| char *result = unicode_to_protected (transliterated); |
| |
| free (converted_name); |
| free (normalized_name); |
| free (transliterated); |
| return result; |
| } |
| |
| char * |
| normalize_transliterate_texinfo_contents (const ELEMENT *e, |
| int external_translit, int in_test, int no_unidecode) |
| { |
| ELEMENT *tmp = new_element (ET_NONE); |
| char *result; |
| |
| tmp->e.c->contents = e->e.c->contents; |
| result = normalize_transliterate_texinfo (tmp, external_translit, |
| in_test, no_unidecode); |
| tmp->e.c->contents.list = 0; |
| destroy_element (tmp); |
| |
| return result; |
| } |