blob: 950f8b7e44294a19890bba90a33dd6ad5ae21d64 [file]
/* Copyright 2010-2026 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* In sync with Texinfo::Convert::Unicode */
#include <config.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include <stddef.h>
#include "unictype.h"
#include "uninorm.h"
#include "unistr.h"
#include "text.h"
#include "command_ids.h"
#include "tree_types.h"
#include "converter_types.h"
/* for fatal */
#include "base_utils.h"
#include "errors.h"
/* for element_builtin_cmd */
#include "builtin_commands.h"
/* for xasprintf to_upper_or_lower_multibyte normalize_encoding_name */
#include "utils.h"
#include "unicode.h"
/* define unicode_diacritics base_unicode_map extra_unicode_map */
#include "cmd_unicode.c"
#include "accent_tables_8bit_codepoints.c"
/* Per-command Unicode character data, indexed by builtin command id.
   The table is zeroed and then filled by setup_unicode_data from
   base_unicode_map and extra_unicode_map; entries for commands present
   in neither map stay zeroed. */
COMMAND_UNICODE unicode_character_brace_no_arg_commands[BUILTIN_CMD_NUMBER];
void
setup_unicode_data (void)
{
int i;
memset (&unicode_character_brace_no_arg_commands, 0,
BUILTIN_CMD_NUMBER * sizeof (COMMAND_UNICODE));
for (i = 0; i < BUILTIN_CMD_NUMBER; i++)
{
const COMMAND_UNICODE_CHARACTER *unicode_character = 0;
int is_extra = 0;
if (base_unicode_map[i].codepoint)
unicode_character = &base_unicode_map[i];
else if (extra_unicode_map[i].codepoint)
{
unicode_character = &extra_unicode_map[i];
is_extra = 1;
}
if (unicode_character)
{
unicode_character_brace_no_arg_commands[i].codepoint
= unicode_character->codepoint;
unicode_character_brace_no_arg_commands[i].text
= unicode_character->text;
unicode_character_brace_no_arg_commands[i].css_string
= unicode_character->css_string;
unicode_character_brace_no_arg_commands[i].is_extra = is_extra;
}
}
}
/* Return a newly allocated copy of TEXT typed as an UTF-8 uint8_t string.
   The bytes are copied unchanged.  To be freed by the caller. */
uint8_t *
utf8_from_string (const char *text)
{
  char *duplicate = strdup (text);

  return (uint8_t *) duplicate;
}
/* Return a newly allocated char string copy of the NUL-terminated UTF-8
   string ENCODED_U8.  The bytes are copied unchanged.  To be freed by the
   caller. */
char *
string_from_utf8 (const uint8_t *encoded_u8)
{
  const char *bytes = (char *) encoded_u8;

  return strdup (bytes);
}
/* Return a newly allocated string holding TEXT normalized with
   UNINORM_NFC.  To be freed by the caller. */
char *
normalize_NFC (const char *text)
{
  uint8_t *source_u8 = utf8_from_string (text);
  /* the terminating NUL is counted in the length given to u8_normalize
     in order to have it included in the normalized string */
  size_t source_len = u8_strlen (source_u8) + 1;
  size_t composed_len;
  uint8_t *composed_u8;
  char *composed;

  composed_u8 = u8_normalize (UNINORM_NFC, source_u8, source_len,
                              NULL, &composed_len);
  free (source_u8);

  composed = string_from_utf8 (composed_u8);
  free (composed_u8);

  return composed;
}
/* Return a newly allocated string holding TEXT normalized with
   UNINORM_NFKD.  To be freed by the caller. */
char *
normalize_NFKD (const char *text)
{
  uint8_t *source_u8 = utf8_from_string (text);
  /* the terminating NUL is counted in the length given to u8_normalize
     in order to have it included in the normalized string */
  size_t source_len = u8_strlen (source_u8) + 1;
  size_t decomposed_len;
  uint8_t *decomposed_u8;
  char *decomposed;

  decomposed_u8 = u8_normalize (UNINORM_NFKD, source_u8, source_len,
                                NULL, &decomposed_len);
  free (source_u8);

  decomposed = string_from_utf8 (decomposed_u8);
  free (decomposed_u8);

  return decomposed;
}
/* Return non-zero if CODEPOINT is a letter (general category L) or an
   ASCII digit.  Used to check the two characters joined by @tieaccent. */
static int
is_tieaccent_base_character (ucs4_t codepoint)
{
  if (uc_is_general_category (codepoint, UC_CATEGORY_L))
    return 1;
  /* ASCII digits */
  return codepoint >= 0x0030 && codepoint <= 0x0039;
}

/* Apply the accent command element E to TEXT, using Unicode.
   INDEX_IN_STACK is the index of E in STACK; the element at
   INDEX_IN_STACK - 1, if any, is the accent command E appears in.
   Return a newly allocated string, or 0 if the command of E is not
   @dotless and has no diacritic in unicode_diacritics. */
char *
unicode_accent (const char *text, const ELEMENT *e, int index_in_stack,
                const ELEMENT_STACK *stack)
{
  static TEXT accented_text;
  char *composed = 0;

  /*
    special handling of @dotless{i}.
    \x{0131}\x{0308} for @dotless{i} @" doesn't lead to NFC 00ef.
    so it is set to a real dotless i only if not in an accent command.
    Do the same for dotless j, even though we have no clear idea on
    what is going on for that character.
   */
  if (e->e.c->cmd == CM_dotless)
    {
      int in_accent_command = 0;

      if (index_in_stack != 0
          && unicode_diacritics[element_builtin_cmd (
                                  stack->stack[index_in_stack-1])].text)
        in_accent_command = 1;

      if (!in_accent_command)
        {
          /* dotless i in UTF-8 */
          if (!strcmp (text, "i"))
            return strdup ("\xc4\xb1");
          /* dotless j in UTF-8 */
          if (!strcmp (text, "j"))
            return strdup ("\xc8\xb7");
        }
      /* also correct for dotless I as dotless I is I */
      return strdup (text);
    }

  /* not an accent command with a known diacritic */
  if (!unicode_diacritics[e->e.c->cmd].text)
    return 0;

  if (e->e.c->cmd == CM_tieaccent)
    {
      /* tieaccent diacritic is naturally and correctly composed
         between two characters */
      uint8_t *text_u8 = utf8_from_string (text);
      ucs4_t first_char;
      ucs4_t second_char;
      const uint8_t *after_first = u8_next (&first_char, text_u8);

      if (after_first
          && is_tieaccent_base_character (first_char)
          && u8_next (&second_char, after_first)
          && is_tieaccent_base_character (second_char))
        {
          /* room for the 6 bytes given as limit to u8_uctomb and a NUL */
          uint8_t first_char_u8[7];
          char *first_char_text;
          char *after_first_text;
          int first_char_len = u8_uctomb (first_char_u8, first_char, 6);

          if (first_char_len < 0)
            fatal ("u8_uctomb returns negative value");
          first_char_u8[first_char_len] = 0;

          /* first character, then the diacritic, then the remaining text */
          text_init (&accented_text);

          first_char_text = string_from_utf8 (first_char_u8);
          text_append (&accented_text, first_char_text);
          free (first_char_text);

          text_append (&accented_text,
                       unicode_diacritics[e->e.c->cmd].text);

          after_first_text = string_from_utf8 (after_first);
          text_append (&accented_text, after_first_text);
          free (after_first_text);

          composed = normalize_NFC (accented_text.text);
          free (accented_text.text);
        }
      free (text_u8);

      if (composed)
        return composed;
      /* otherwise the diacritic is appended, as for the other accents */
    }

  text_init (&accented_text);
  text_append (&accented_text, text);
  text_append (&accented_text, unicode_diacritics[e->e.c->cmd].text);
  composed = normalize_NFC (accented_text.text);
  free (accented_text.text);

  return composed;
}
/* Comparison function for bsearch on arrays of strings: A and B both
   point to a pointer to a NUL-terminated string; the strings are
   compared with strcmp. */
static int
compare_strings (const void *a, const void *b)
{
  const char *const *left = a;
  const char *const *right = b;

  return strcmp (*left, *right);
}
/* Format TEXT with the accent commands of STACK for the eight bit
   encoding at index ENCODING_INDEX in unicode_to_eight_bit.

   STACK holds the accent command elements, the innermost one (applied
   first to TEXT) at index stack->top - 1 and the outermost one at index 0.

   The accents are first applied one after the other with unicode_accent
   (with to_upper_or_lower_multibyte called on each intermediate result
   if SET_CASE is set).  Starting from TEXT, the intermediate results are
   kept as long as their first character is ASCII (below 127) or listed
   in the codepoints of the encoding.  The accent commands remaining are
   formatted with FORMAT_ACCENT, which is called with SELF, the current
   result, the accent command element, its index in STACK, STACK and
   SET_CASE, and should return a newly allocated string.

   Return a newly allocated string. */
char *
format_eight_bit_accents_stack (CONVERTER *self, const char *text,
                const ELEMENT_STACK *stack, int encoding_index,
                char *(*format_accent)(CONVERTER *self, const char *text,
                            const ELEMENT *element, int index_in_stack,
                            const ELEMENT_STACK *stack, int set_case),
                int set_case)
{
  int i, j, k;
  char *result = strdup (text);
  char *prev_eight_bit;
  char *new_eight_bit;
  int const stack_nr = stack->top;
  /* results_stack[stack_nr] is TEXT, results_stack[n] for n < stack_nr
     is the result of applying the accents of index stack_nr -1 down to n */
  char **results_stack
    = malloc ((stack_nr +1) * sizeof (char *));

  memset (results_stack, 0, (stack_nr +1) * sizeof (char *));
  results_stack[stack_nr] = strdup (text);

  for (i = stack_nr -1; i >= 0; i--)
    {
      const ELEMENT *accent_command = stack->stack[i];

      results_stack[i] = unicode_accent (results_stack[i+1], accent_command,
                                         i, stack);
      if (!results_stack[i])
        {
          /* decrease a last time as if the loop had been gone through */
          i--;
          break;
        }
      else if (set_case)
        {
          char *cased
            = to_upper_or_lower_multibyte (results_stack[i], set_case);
          free (results_stack[i]);
          results_stack[i] = cased;
        }
    }
  /* undo the last decrease of i.  i is now the lowest index in
     results_stack that was set by the loop (possibly to 0 if
     unicode_accent failed), or stack_nr if there was no accent. */
  i++;

  /*
   At this point we have the unicode character results for the accent
   commands stack, with all the intermediate results.
   For each one we'll check if it is possible to encode it in the
   current eight bit output encoding table and, if so, set the result
   to the character.
   */
  prev_eight_bit = strdup ("");

  for (j = stack_nr; j >= i; j--)
    {
      ucs4_t first_char;
      uint8_t *encoded_u8;

      new_eight_bit = 0;

      if (!results_stack[j])
        break;

      encoded_u8 = utf8_from_string (results_stack[j]);
      u8_next (&first_char, encoded_u8);
      free (encoded_u8);

      /* ucs4_t is a 32 bit type, it needs to be converted to match the
         unsigned long expected by %lX */
      if (first_char < 127)
        xasprintf (&new_eight_bit, "%02lX", (unsigned long) first_char);
      else if (first_char <= 0xFFFF)
        {
          char *codepoint;
          const char *const *found;

          xasprintf (&codepoint, "%04lX", (unsigned long) first_char);
          /* bsearch returns a pointer to the matching array element,
             which is itself a pointer to the codepoint string */
          found = bsearch (&codepoint,
                     unicode_to_eight_bit[encoding_index].codepoints,
                     unicode_to_eight_bit[encoding_index].number,
                     sizeof (const char *), compare_strings);
          if (found)
            new_eight_bit = strdup (*found);
          free (codepoint);
        }

      /* no corresponding character in the encoding */
      if (!new_eight_bit)
        break;

      /*
       If the new eight bit character is the same as the one found with
       one less accent (and it isn't a @dotless{i}), there is no
       corresponding 8bit character.  It may happen in 2 cases:

       -> there are 2 characters in accent.  This could happen, for example
          if an accent that cannot be rendered is found and it leads to
          appending or prepending a character.  For example this happens for
          @={@,{@~{n}}}, where @,{@~{n}} is expanded to 2 characters:
          n with a tilde, followed by a ,
          In that case, the additional diacritic is appended, which
          means that it is composed with the , and leaves n with a tilde
          untouched.
       -> the diacritic is appended but the normal form doesn't lead
          to a composed character, such that the first character
          of the string is unchanged.  This, for example, happens for
          @ubaraccent{a} since there is no composed accent with a and an
          underbar.

       For j == stack_nr, prev_eight_bit is the empty string and
       new_eight_bit is never empty, so stack->stack[j] is not accessed.
       */
      if (!strcmp (new_eight_bit, prev_eight_bit)
          && !(stack->stack[j]->e.c->cmd == CM_dotless
               && !strcmp (results_stack[j], "i")))
        {
          free (new_eight_bit);
          break;
        }

      free (result);
      result = strdup (results_stack[j]);
      free (prev_eight_bit);
      prev_eight_bit = strdup (new_eight_bit);
      free (new_eight_bit);
    }

  free (prev_eight_bit);

  /* results_stack[stack_nr] is the text without accent, there is no
     accent command at that index in STACK.  If even that text could not
     be kept, skip it and start at the innermost accent command. */
  if (j == stack_nr)
    j--;

  /*
   handle the remaining accents, that have not been converted to 8bit
   compatible unicode
   */
  for (; j >= 0; j--)
    {
      const ELEMENT *accent_command = stack->stack[j];
      /* j, not i, is the index of accent_command in STACK */
      char *formatted_result
       = (*format_accent) (self, result, accent_command, j, stack, set_case);
      free (result);
      result = formatted_result;
    }

  for (k = stack_nr; k >= i; k--)
    {
      free (results_stack[k]);
    }
  free (results_stack);

  return result;
}
/* Format TEXT with the accent commands of STACK, using Unicode.
   The accent commands are applied with unicode_accent starting from the
   one at index stack->top - 1 and going down, until unicode_accent
   returns 0 for a command.  The case of the result is then changed with
   to_upper_or_lower_multibyte if SET_CASE is set.  The commands left,
   starting with the one unicode_accent could not handle, are formatted
   with FORMAT_ACCENT.
   Return a newly allocated string. */
char *
format_unicode_accents_stack_internal (CONVERTER *self, const char *text,
                const ELEMENT_STACK *stack,
                char *(*format_accent)(CONVERTER *self, const char *text,
                            const ELEMENT *element, int index_in_stack,
                            const ELEMENT_STACK *stack, int set_case),
                int set_case)
{
  char *result = strdup (text);
  int idx = stack->top - 1;

  /* accents that can be rendered as unicode */
  while (idx >= 0)
    {
      char *accented = unicode_accent (result, stack->stack[idx],
                                       idx, stack);
      if (!accented)
        break;

      free (result);
      result = accented;
      idx--;
    }

  if (set_case)
    {
      char *cased = to_upper_or_lower_multibyte (result, set_case);
      free (result);
      result = cased;
    }

  /* remaining accents, idx is left on the first command not handled */
  while (idx >= 0)
    {
      char *formatted = (*format_accent) (self, result, stack->stack[idx],
                                          idx, stack, set_case);
      free (result);
      result = formatted;
      idx--;
    }

  return result;
}
/* Format TEXT with the accent commands of STACK for the output encoding
   ENCODING.  format_unicode_accents_stack_internal is used for utf-8 and
   format_eight_bit_accents_stack for the encodings listed in
   unicode_to_eight_bit; FORMAT_ACCENT and SET_CASE are passed along.
   Return a newly allocated string, or 0 if ENCODING is not set or is
   not one of those encodings. */
char *
encoded_accents (CONVERTER *self, const char *text, const ELEMENT_STACK *stack,
                 const char *encoding,
                 char *(*format_accent)(CONVERTER *self, const char *text,
                             const ELEMENT *element, int index_in_stack,
                             const ELEMENT_STACK *stack, int set_case),
                 int set_case)
{
  int possible_encoding;
  char *normalized_encoding;
  char *formatted = 0;

  if (!encoding)
    return 0;

  /*
   in case an encoding is directly specified with -c OUTPUT_ENCODING_NAME
   in upper case to match with the encodings in Texinfo input, we convert
   to lower case to match the encoding names used here.  In the code
   encoding names are lower cased early.
   */
  normalized_encoding = normalize_encoding_name (encoding,
                                                 &possible_encoding);

  if (possible_encoding)
    {
      if (!strcmp (normalized_encoding, "utf-8"))
        formatted = format_unicode_accents_stack_internal (self, text,
                                           stack, format_accent, set_case);
      else
        {
          const size_t encodings_nr = sizeof (unicode_to_eight_bit)
                                       / sizeof (unicode_to_eight_bit[0]);
          size_t idx;

          for (idx = 0; idx < encodings_nr; idx++)
            {
              if (strcmp (normalized_encoding,
                          unicode_to_eight_bit[idx].encoding))
                continue;

              formatted = format_eight_bit_accents_stack (self, text, stack,
                                     (int) idx, format_accent, set_case);
              break;
            }
        }
    }

  free (normalized_encoding);
  return formatted;
}
/* CODEPOINT is a string describing an hexadecimal number with
   letters in upper case.
   Return -1 if ENCODING is utf-8 or is not set (utf-8 is then considered
   to be the default).  Return the index in unicode_to_eight_bit + 1 if
   ENCODING is an encoding of unicode_to_eight_bit and CODEPOINT is below
   127 or is in the codepoints of that encoding.  Return 0 if the encoding
   is unknown or CODEPOINT is not represented in the encoding. */
int
unicode_point_decoded_in_encoding (const char *encoding,
                                   const char *codepoint)
{
  const size_t encodings_nr = sizeof (unicode_to_eight_bit)
                               / sizeof (unicode_to_eight_bit[0]);
  int possible_encoding;
  char *normalized_encoding;
  int status = 0;
  size_t idx;

  /* if encoding is not set, consider that it is the default, utf-8 */
  if (!encoding)
    return -1;

  normalized_encoding = normalize_encoding_name (encoding,
                                                 &possible_encoding);

  if (possible_encoding)
    {
      if (!strcmp (normalized_encoding, "utf-8"))
        status = -1;
      else
        for (idx = 0; idx < encodings_nr; idx++)
          {
            if (strcmp (normalized_encoding,
                        unicode_to_eight_bit[idx].encoding))
              continue;

            /* below 127 the character is in every encoding.
               excludes 127 \x{7F} DEL */
            if (strtoul (codepoint, NULL, 16) < 127
                || bsearch (&codepoint,
                            unicode_to_eight_bit[idx].codepoints,
                            unicode_to_eight_bit[idx].number,
                            sizeof (char *), compare_strings))
              status = (int) idx + 1;

            /* the encoding was found, no need to look further */
            break;
          }
    }

  free (normalized_encoding);
  /* status is still 0 for an unknown encoding or if not represented
     in encoding */
  return status;
}
const char *
unicode_brace_no_arg_command (enum command_id cmd, const char *encoding)
{
if (unicode_character_brace_no_arg_commands[cmd].text
&& unicode_point_decoded_in_encoding (encoding,
unicode_character_brace_no_arg_commands[cmd].codepoint))
return unicode_character_brace_no_arg_commands[cmd].text;
else
return 0;
}