tta/C/main/unicode.c - texinfo - Git at Google

 /* Copyright 2010-2026 Free Software Foundation, Inc.

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>. */

 /* In sync with Texinfo::Convert::Unicode */

 #include <config.h>

 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <ctype.h>
 #include <stddef.h>
 #include "unictype.h"
 #include "uninorm.h"
 #include "unistr.h"

 #include "text.h"
 #include "command_ids.h"
 #include "tree_types.h"
 #include "converter_types.h"
 /* for fatal */
 #include "base_utils.h"
 #include "errors.h"
 /* for element_builtin_cmd */
 #include "builtin_commands.h"
 /* for xasprintf to_upper_or_lower_multibyte normalize_encoding_name */
 #include "utils.h"
 #include "unicode.h"

 /* define unicode_diacritics base_unicode_map extra_unicode_map */
 #include "cmd_unicode.c"

 #include "accent_tables_8bit_codepoints.c"


 /* Unicode information for each builtin command, indexed by command id.
    Zeroed and then filled in by setup_unicode_data. */
 COMMAND_UNICODE unicode_character_brace_no_arg_commands[BUILTIN_CMD_NUMBER];

 void
 setup_unicode_data (void)
 {
   int i;

   memset (&unicode_character_brace_no_arg_commands, 0,
           BUILTIN_CMD_NUMBER * sizeof (COMMAND_UNICODE));

   for (i = 0; i < BUILTIN_CMD_NUMBER; i++)
     {
       const COMMAND_UNICODE_CHARACTER *unicode_character = 0;
       int is_extra = 0;
       if (base_unicode_map[i].codepoint)
         unicode_character = &base_unicode_map[i];
       else if (extra_unicode_map[i].codepoint)
         {
           unicode_character = &extra_unicode_map[i];
           is_extra = 1;
         }

       if (unicode_character)
         {
           unicode_character_brace_no_arg_commands[i].codepoint
             = unicode_character->codepoint;
           unicode_character_brace_no_arg_commands[i].text
             = unicode_character->text;
           unicode_character_brace_no_arg_commands[i].css_string
             = unicode_character->css_string;
           unicode_character_brace_no_arg_commands[i].is_extra = is_extra;
         }
     }
 }

 /* Return a newly allocated copy of the NUL-terminated string TEXT, typed
    as an array of uint8_t.  The bytes are copied unchanged.  The result
    is to be released with free; it is NULL if the allocation fails. */
 uint8_t *
 utf8_from_string (const char *text)
 {
   size_t size = strlen (text) + 1;
   uint8_t *copy = malloc (size);

   if (copy)
     memcpy (copy, text, size);
   return copy;
 }

 /* Return a newly allocated copy of the NUL-terminated uint8_t string
    ENCODED_U8, typed as a char string.  The bytes are copied unchanged.
    The result is to be released with free; it is NULL if the allocation
    fails. */
 char *
 string_from_utf8 (const uint8_t *encoded_u8)
 {
   const char *bytes = (const char *) encoded_u8;
   size_t size = strlen (bytes) + 1;
   char *copy = malloc (size);

   if (copy)
     memcpy (copy, bytes, size);
   return copy;
 }

 /* Return a newly allocated copy of TEXT normalized with u8_normalize,
    using UNINORM_NFKD if NFKD is non-zero and UNINORM_NFC otherwise.
    A failure of u8_normalize is reported with fatal instead of passing
    a null pointer on to string_from_utf8. */
 static char *
 normalize_utf8_text (const char *text, int nfkd)
 {
   size_t normalized_length;
   char *result;
   uint8_t *encoded_u8 = utf8_from_string (text);
   /* +1 to have the terminating NUL included in the string */
   uint8_t *normalized_u8
     = u8_normalize (nfkd ? UNINORM_NFKD : UNINORM_NFC, encoded_u8,
                     u8_strlen (encoded_u8) + 1,
                     NULL, &normalized_length);

   free (encoded_u8);
   if (!normalized_u8)
     {
       fatal ("u8_normalize failed");
       /* not reached if fatal does not return */
       return 0;
     }
   result = string_from_utf8 (normalized_u8);
   free (normalized_u8);
   return result;
 }

 /* Return a newly allocated NFC normalization of TEXT.  To be released
    with free. */
 char *
 normalize_NFC (const char *text)
 {
   return normalize_utf8_text (text, 0);
 }

 /* Return a newly allocated NFKD normalization of TEXT.  To be released
    with free. */
 char *
 normalize_NFKD (const char *text)
 {
   return normalize_utf8_text (text, 1);
 }

 /* True if C is a Unicode letter or an ASCII digit, the characters between
    which a tie accent diacritic is placed. */
 static int
 tie_accent_base_char (ucs4_t c)
 {
   return uc_is_general_category (c, UC_CATEGORY_L)
          || (c >= 0x0030 && c <= 0x0039);
 }

 /* Return a newly allocated string with the accent command E applied to
    TEXT as Unicode, or 0 if there is no diacritic associated with the
    command in unicode_diacritics.  INDEX_IN_STACK is the index of E in
    STACK; it is used to look at the command preceding E in STACK. */
 char *
 unicode_accent (const char *text, const ELEMENT *e, int index_in_stack,
                 const ELEMENT_STACK *stack)
 {
   static TEXT accented_text;
   const char *diacritic;
   char *result = 0;

   /*
   special handling of @dotless{i}.
   \x{0131}\x{0308} for @dotless{i} @" doesn't lead to NFC 00ef.
   so it is set to a real dotless i only if not in an accent command.
   Do the same for dotless j, even though we have no clear idea on
   what is going on for that character.
   */
   if (e->e.c->cmd == CM_dotless)
     {
       int in_accent_command
         = (index_in_stack != 0
            && unicode_diacritics[element_builtin_cmd (
                                 stack->stack[index_in_stack-1])].text);

       if (!in_accent_command)
         {
           /* dotless i in UTF-8 */
           if (!strcmp (text, "i"))
             return strdup ("\xc4\xb1");
           /* dotless j in UTF-8 */
           if (!strcmp (text, "j"))
             return strdup ("\xc8\xb7");
         }
       /* also correct for dotless I as dotless I is I */
       return strdup (text);
     }

   diacritic = unicode_diacritics[e->e.c->cmd].text;
   if (!diacritic)
     return 0;

   if (e->e.c->cmd == CM_tieaccent)
     {
       /* tieaccent diacritic is naturally and correctly composed
          between two characters */
       uint8_t *encoded_u8 = utf8_from_string (text);
       ucs4_t first_char;
       const uint8_t *after_first = u8_next (&first_char, encoded_u8);

       if (after_first && tie_accent_base_char (first_char))
         {
           ucs4_t second_char;
           const uint8_t *after_second = u8_next (&second_char, after_first);

           if (after_second && tie_accent_base_char (second_char))
             {
               uint8_t *first_char_u8 = malloc (7 * sizeof (uint8_t));
               int first_char_len = u8_uctomb (first_char_u8, first_char, 6);
               char *first_char_text;
               char *next_text;

               if (first_char_len < 0)
                 fatal ("u8_uctomb returns negative value");
               first_char_u8[first_char_len] = 0;
               first_char_text = string_from_utf8 (first_char_u8);
               free (first_char_u8);
               next_text = string_from_utf8 (after_first);

               /* first character, diacritic, then the rest of the text */
               text_init (&accented_text);
               text_append (&accented_text, first_char_text);
               text_append (&accented_text, diacritic);
               text_append (&accented_text, next_text);
               free (first_char_text);
               free (next_text);

               result = normalize_NFC (accented_text.text);
               free (accented_text.text);
             }
         }
       free (encoded_u8);
       if (result)
         return result;
     }

   /* general case: the diacritic follows the whole text */
   text_init (&accented_text);
   text_append (&accented_text, text);
   text_append (&accented_text, diacritic);
   result = normalize_NFC (accented_text.text);
   free (accented_text.text);

   return result;
 }

 /* Comparison function for bsearch on an array of strings.  A and B each
    point to a pointer to a NUL-terminated string; the two strings are
    compared with strcmp. */
 static int
 compare_strings (const void *a, const void *b)
 {
   const char *const *left = a;
   const char *const *right = b;

   return strcmp (*left, *right);
 }

 char *
 format_eight_bit_accents_stack (CONVERTER *self, const char *text,
                       const ELEMENT_STACK *stack, int encoding_index,
   char *(*format_accent)(CONVERTER *self, const char *text,
                          const ELEMENT *element, int index_in_stack,
                          const ELEMENT_STACK *stack, int set_case),
   int set_case)
 {
   int i, j, k;
   char *result = strdup (text);
   char *prev_eight_bit;
   char *new_eight_bit;
   int const stack_nr = stack->top;
   char **results_stack
      = malloc ((stack_nr +1) * sizeof (char *));

   memset (results_stack, 0, (stack_nr +1) * sizeof (char *));

   results_stack[stack_nr] = strdup (text);

   for (i = stack_nr -1; i >= 0; i--)
     {
       const ELEMENT *accent_command = stack->stack[i];
       results_stack[i] = unicode_accent (results_stack[i+1], accent_command,
                                          i, stack);
       if (!results_stack[i])
         {
           /* decrease a last time as if the loop had been gone through */
           i--;
           break;
         }
       else if (set_case)
         {
           char *cased = to_upper_or_lower_multibyte (results_stack[i], set_case);
           free (results_stack[i]);
           results_stack[i] = cased;
         }
     }
   /* undo the last decrease of i */
   i++;

   /*
     At this point we have the unicode character results for the accent
     commands stack, with all the intermediate results.
     For each one we'll check if it is possible to encode it in the
     current eight bit output encoding table and, if so, set the result
     to the character.
    */

   prev_eight_bit = strdup ("");

   for (j = stack_nr; j >= i; j--)
     {
       new_eight_bit = 0;
       if (!results_stack[j])
         break;

       uint8_t *encoded_u8 = utf8_from_string (results_stack[j]);
       ucs4_t first_char;
       u8_next (&first_char, encoded_u8);
       free (encoded_u8);
       if (first_char < 127)
         xasprintf (&new_eight_bit, "%02lX", first_char);
       else
         {
           char *codepoint;
           if (first_char <= 0xFFFF)
             {
               xasprintf (&codepoint, "%04lX", first_char);
               const char *found = (const char *)bsearch (&codepoint,
                              unicode_to_eight_bit[encoding_index].codepoints,
                              unicode_to_eight_bit[encoding_index].number,
                              sizeof (const char *), compare_strings);
               if (found)
                 new_eight_bit = strdup (found);

               free (codepoint);
             }
         }
       if (!new_eight_bit)
         break;

    /*
     # in that case, the new eight bit character is the same than the one
     # found with one less character (and it isn't a @dotless{i}). It may
     # hapen in 2 case, both meaning that there is no corresponding 8bit char:
     #
     # -> there are 2 characters in accent. This could happen, for example
     #    if an accent that cannot be rendered is found and it leads to
     #    appending or prepending a character. For example this happens for
     #    @={@,{@~{n}}}, where @,{@~{n}} is expanded to 2 characters:
     #    n with a tilde, followed by a ,
     #    In that case, the additional diacritic is appended, which
     #    means that it is composed with the , and leaves n with a tilde
     #    untouched.
     # -> the diacritic is appended but the normal form doesn't lead
     #    to a composed character, such that the first character
     #    of the string is unchanged. This, for example, happens for
     #    @ubaraccent{a} since there is no composed accent with a and an
     #    underbar.
     */
       if (!strcmp (new_eight_bit, prev_eight_bit)
           && !(stack->stack[j]->e.c->cmd == CM_dotless
                && !strcmp (results_stack[j], "i")))
         {
           free (new_eight_bit);
           break;
         }
       free (result);
       result = strdup (results_stack[j]);
       free (prev_eight_bit);
       prev_eight_bit = strdup (new_eight_bit);
       free (new_eight_bit);
     }

   free (prev_eight_bit);

   /*
     handle the remaining accents, that have not been converted to 8bit
     compatible unicode
    */
   for (; j >= 0; j--)
     {
       const ELEMENT *accent_command = stack->stack[j];
       char *formatted_result
           = (*format_accent) (self, result, accent_command, i, stack, set_case);
       free (result);
       result = formatted_result;
     }

   for (k = stack_nr; k >= i; k--)
     {
       free (results_stack[k]);
     }
   free (results_stack);

   return result;
 }

 /* Format TEXT with the accent commands in STACK as Unicode.  The accent
    commands are applied with unicode_accent starting from the last element
    of STACK, until one has no Unicode result.  If SET_CASE is set, the
    result obtained so far is passed through to_upper_or_lower_multibyte.
    The remaining accent commands, if any, are formatted by calling
    FORMAT_ACCENT.  Returns a newly allocated string. */
 char *
 format_unicode_accents_stack_internal (CONVERTER *self, const char *text,
   const ELEMENT_STACK *stack,
   char *(*format_accent)(CONVERTER *self, const char *text,
                          const ELEMENT *element, int index_in_stack,
                          const ELEMENT_STACK *stack, int set_case),
   int set_case)
 {
   int index = stack->top - 1;
   char *result = strdup (text);

   /* accents with a Unicode diacritic */
   while (index >= 0)
     {
       char *accented = unicode_accent (result, stack->stack[index],
                                        index, stack);
       if (!accented)
         break;

       free (result);
       result = accented;
       index--;
     }

   if (set_case)
     {
       char *cased = to_upper_or_lower_multibyte (result, set_case);
       free (result);
       result = cased;
     }

   /* accents left over, starting at the one without Unicode result */
   while (index >= 0)
     {
       char *formatted
         = (*format_accent) (self, result, stack->stack[index], index,
                             stack, set_case);
       free (result);
       result = formatted;
       index--;
     }

   return result;
 }

 /* Format TEXT with the accent commands in STACK for the output encoding
    ENCODING.  For utf-8 format_unicode_accents_stack_internal is called,
    for an encoding listed in unicode_to_eight_bit
    format_eight_bit_accents_stack is called; their newly allocated result
    is returned.  Returns 0 if ENCODING is unset, is not considered to be
    a possible encoding by normalize_encoding_name, or is not handled
    here. */
 char *
 encoded_accents (CONVERTER *self, const char *text, const ELEMENT_STACK *stack,
   const char *encoding,
   char *(*format_accent)(CONVERTER *self, const char *text,
                          const ELEMENT *element, int index_in_stack,
                          const ELEMENT_STACK *stack, int set_case),
   int set_case)
 {
   char *normalized_encoding;
   char *result = 0;
   int possible_encoding;

   if (!encoding)
     return 0;

  /*
   in case an encoding is directly specified with -c OUTPUT_ENCODING_NAME
   in upper case to match with the encodings in Texinfo input, we convert
   to lower case to match the encoding names used here.  In the code
   encoding names are lower cased early.
  */
   normalized_encoding = normalize_encoding_name (encoding,
                                                  &possible_encoding);
   if (possible_encoding)
     {
       if (!strcmp (normalized_encoding, "utf-8"))
         result = format_unicode_accents_stack_internal (self, text, stack,
                                                   format_accent, set_case);
       else
         {
           const size_t encodings_nr = sizeof (unicode_to_eight_bit)
                                         / sizeof (unicode_to_eight_bit[0]);
           size_t idx;

           for (idx = 0; idx < encodings_nr; idx++)
             {
               if (!strcmp (normalized_encoding,
                            unicode_to_eight_bit[idx].encoding))
                 {
                   result = format_eight_bit_accents_stack (self, text,
                                    stack, (int) idx, format_accent, set_case);
                   break;
                 }
             }
         }
     }

   free (normalized_encoding);
   return result;
 }


 /* CODEPOINT is a string describing an hexadecimal number with
    letters in upper case.
    Returns -1 if ENCODING is unset (utf-8 is then assumed) or is utf-8.
    Returns the index of the encoding in unicode_to_eight_bit + 1 if
    ENCODING is found there and CODEPOINT is below 127 (DEL excluded) or is
    listed in the codepoints of the encoding.
    Returns 0 if the encoding is unknown or if CODEPOINT is not represented
    in the encoding. */
 int
 unicode_point_decoded_in_encoding (const char *encoding,
                                    const char *codepoint)
 {
   char *normalized_encoding;
   int possible_encoding;
   int result = 0;

   /* if encoding is not set, consider that it is the default, utf-8 */
   if (!encoding)
     return -1;

   normalized_encoding = normalize_encoding_name (encoding,
                                                  &possible_encoding);
   if (possible_encoding)
     {
       if (!strcmp (normalized_encoding, "utf-8"))
         result = -1;
       else
         {
           const size_t encodings_nr = sizeof (unicode_to_eight_bit)
                                         / sizeof (unicode_to_eight_bit[0]);
           size_t idx;

           for (idx = 0; idx < encodings_nr; idx++)
             {
               if (strcmp (normalized_encoding,
                           unicode_to_eight_bit[idx].encoding))
                 continue;

               /* the comparison with 127 excludes \x{7F} DEL */
               if (strtoul (codepoint, NULL, 16) < 127
                   || bsearch (&codepoint,
                               unicode_to_eight_bit[idx].codepoints,
                               unicode_to_eight_bit[idx].number,
                               sizeof (char *), compare_strings))
                 result = (int) idx + 1;
               /* only one entry can match the encoding name */
               break;
             }
         }
     }

   free (normalized_encoding);
   return result;
 }

 const char *
 unicode_brace_no_arg_command (enum command_id cmd, const char *encoding)
 {
   if (unicode_character_brace_no_arg_commands[cmd].text
       && unicode_point_decoded_in_encoding (encoding,
           unicode_character_brace_no_arg_commands[cmd].codepoint))
     return unicode_character_brace_no_arg_commands[cmd].text;
   else
     return 0;
 }
	/* Copyright 2010-2026 Free Software Foundation, Inc.

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>. */

	/* In sync with Texinfo::Convert::Unicode */

	#include <config.h>

	#include <stdio.h>
	#include <string.h>
	#include <stdlib.h>
	#include <ctype.h>
	#include <stddef.h>
	#include "unictype.h"
	#include "uninorm.h"
	#include "unistr.h"

	#include "text.h"
	#include "command_ids.h"
	#include "tree_types.h"
	#include "converter_types.h"
	/* for fatal */
	#include "base_utils.h"
	#include "errors.h"
	/* for element_builtin_cmd */
	#include "builtin_commands.h"
	/* for xasprintf to_upper_or_lower_multibyte normalize_encoding_name */
	#include "utils.h"
	#include "unicode.h"

	/* define unicode_diacritics base_unicode_map extra_unicode_map */
	#include "cmd_unicode.c"

	#include "accent_tables_8bit_codepoints.c"


	/* Unicode information for each builtin command, indexed by command id.
	   Zeroed and then filled in by setup_unicode_data. */
	COMMAND_UNICODE unicode_character_brace_no_arg_commands[BUILTIN_CMD_NUMBER];

	void
	setup_unicode_data (void)
	{
	int i;

	memset (&unicode_character_brace_no_arg_commands, 0,
	BUILTIN_CMD_NUMBER * sizeof (COMMAND_UNICODE));

	for (i = 0; i < BUILTIN_CMD_NUMBER; i++)
	{
	const COMMAND_UNICODE_CHARACTER *unicode_character = 0;
	int is_extra = 0;
	if (base_unicode_map[i].codepoint)
	unicode_character = &base_unicode_map[i];
	else if (extra_unicode_map[i].codepoint)
	{
	unicode_character = &extra_unicode_map[i];
	is_extra = 1;
	}

	if (unicode_character)
	{
	unicode_character_brace_no_arg_commands[i].codepoint
	= unicode_character->codepoint;
	unicode_character_brace_no_arg_commands[i].text
	= unicode_character->text;
	unicode_character_brace_no_arg_commands[i].css_string
	= unicode_character->css_string;
	unicode_character_brace_no_arg_commands[i].is_extra = is_extra;
	}
	}
	}

	/* Return a newly allocated copy of the NUL-terminated string TEXT, typed
	   as an array of uint8_t.  The bytes are copied unchanged.  The result
	   is to be released with free; it is NULL if the allocation fails. */
	uint8_t *
	utf8_from_string (const char *text)
	{
	  size_t size = strlen (text) + 1;
	  uint8_t *copy = malloc (size);

	  if (copy)
	    memcpy (copy, text, size);
	  return copy;
	}

	/* Return a newly allocated copy of the NUL-terminated uint8_t string
	   ENCODED_U8, typed as a char string.  The bytes are copied unchanged.
	   The result is to be released with free; it is NULL if the allocation
	   fails. */
	char *
	string_from_utf8 (const uint8_t *encoded_u8)
	{
	  const char *bytes = (const char *) encoded_u8;
	  size_t size = strlen (bytes) + 1;
	  char *copy = malloc (size);

	  if (copy)
	    memcpy (copy, bytes, size);
	  return copy;
	}

	/* Return a newly allocated copy of TEXT normalized with u8_normalize,
	   using UNINORM_NFKD if NFKD is non-zero and UNINORM_NFC otherwise.
	   A failure of u8_normalize is reported with fatal instead of passing
	   a null pointer on to string_from_utf8. */
	static char *
	normalize_utf8_text (const char *text, int nfkd)
	{
	  size_t normalized_length;
	  char *result;
	  uint8_t *encoded_u8 = utf8_from_string (text);
	  /* +1 to have the terminating NUL included in the string */
	  uint8_t *normalized_u8
	    = u8_normalize (nfkd ? UNINORM_NFKD : UNINORM_NFC, encoded_u8,
	                    u8_strlen (encoded_u8) + 1,
	                    NULL, &normalized_length);

	  free (encoded_u8);
	  if (!normalized_u8)
	    {
	      fatal ("u8_normalize failed");
	      /* not reached if fatal does not return */
	      return 0;
	    }
	  result = string_from_utf8 (normalized_u8);
	  free (normalized_u8);
	  return result;
	}

	/* Return a newly allocated NFC normalization of TEXT.  To be released
	   with free. */
	char *
	normalize_NFC (const char *text)
	{
	  return normalize_utf8_text (text, 0);
	}

	/* Return a newly allocated NFKD normalization of TEXT.  To be released
	   with free. */
	char *
	normalize_NFKD (const char *text)
	{
	  return normalize_utf8_text (text, 1);
	}

	char *
	unicode_accent (const char text, const ELEMENT e, int index_in_stack,
	const ELEMENT_STACK *stack)
	{
	char *result = 0;

	/*
	special handling of @dotless{i}.
	\x{0131}\x{0308} for @dotless{i} @" doesn't lead to NFC 00ef.
	so it is set to a real dotless i only if not in an accent command.
	Do the same for dotless j, even though we have no clear idea on
	what is going on for that character.
	*/

	if (e->e.c->cmd == CM_dotless)
	{
	if (index_in_stack == 0
	\|\| !unicode_diacritics[element_builtin_cmd (
	stack->stack[index_in_stack-1])].text)
	{
	if (!strcmp (text, "i"))
	/* dotless i in UTF-8 */
	return strdup ("\xc4\xb1");
	else if (!strcmp (text, "j"))
	return strdup ("\xc8\xb7");
	}
	/* also correct for dotless I as dotless I is I */
	return strdup (text);
	}

	if (unicode_diacritics[e->e.c->cmd].text)
	{
	static TEXT accented_text;
	if (e->e.c->cmd == CM_tieaccent)
	{
	/* tieaccent diacritic is naturally and correctly composed
	between two characters */
	uint8_t *encoded_u8 = utf8_from_string (text);
	const uint8_t *next;
	ucs4_t first_char;
	next = u8_next (&first_char, encoded_u8);
	if (next && (uc_is_general_category (first_char, UC_CATEGORY_L)
	/* ASCII digits */
	\|\| (first_char >= 0x0030 && first_char <= 0x0039)))
	{
	const uint8_t *remaining;
	ucs4_t second_char;
	remaining = u8_next (&second_char, next);
	if (remaining && (uc_is_general_category (second_char, UC_CATEGORY_L)
	/* ASCII digits */
	\|\| (second_char >= 0x0030 && second_char <= 0x0039)))
	{
	char *first_char_text;
	char *next_text;
	uint8_t first_char_u8 = malloc (7 sizeof (uint8_t));
	int first_char_len = u8_uctomb (first_char_u8, first_char, 6);
	if (first_char_len < 0)
	fatal ("u8_uctomb returns negative value");
	first_char_u8[first_char_len] = 0;
	first_char_text = string_from_utf8 (first_char_u8);
	free (first_char_u8);
	text_init (&accented_text);
	text_append (&accented_text, first_char_text);
	free (first_char_text);
	text_append (&accented_text, unicode_diacritics[e->e.c->cmd].text);
	next_text = string_from_utf8 (next);
	text_append (&accented_text, next_text);
	free (next_text);
	result = normalize_NFC (accented_text.text);
	free (accented_text.text);
	}
	}
	free (encoded_u8);
	if (result)
	return result;
	}
	text_init (&accented_text);
	text_append (&accented_text, text);
	text_append (&accented_text, unicode_diacritics[e->e.c->cmd].text);
	result = normalize_NFC (accented_text.text);
	free (accented_text.text);
	}

	return result;
	}

	/* Comparison function for bsearch on an array of strings.  A and B each
	   point to a pointer to a NUL-terminated string; the two strings are
	   compared with strcmp. */
	static int
	compare_strings (const void *a, const void *b)
	{
	  const char *const *left = a;
	  const char *const *right = b;

	  return strcmp (*left, *right);
	}

	char *
	format_eight_bit_accents_stack (CONVERTER self, const char text,
	const ELEMENT_STACK *stack, int encoding_index,
	char (format_accent)(CONVERTER self, const char text,
	const ELEMENT *element, int index_in_stack,
	const ELEMENT_STACK *stack, int set_case),
	int set_case)
	{
	int i, j, k;
	char *result = strdup (text);
	char *prev_eight_bit;
	char *new_eight_bit;
	int const stack_nr = stack->top;
	char **results_stack
	= malloc ((stack_nr +1) * sizeof (char *));

	memset (results_stack, 0, (stack_nr +1) * sizeof (char *));

	results_stack[stack_nr] = strdup (text);

	for (i = stack_nr -1; i >= 0; i--)
	{
	const ELEMENT *accent_command = stack->stack[i];
	results_stack[i] = unicode_accent (results_stack[i+1], accent_command,
	i, stack);
	if (!results_stack[i])
	{
	/* decrease a last time as if the loop had been gone through */
	i--;
	break;
	}
	else if (set_case)
	{
	char *cased = to_upper_or_lower_multibyte (results_stack[i], set_case);
	free (results_stack[i]);
	results_stack[i] = cased;
	}
	}
	/* undo the last decrease of i */
	i++;

	/*
	At this point we have the unicode character results for the accent
	commands stack, with all the intermediate results.
	For each one we'll check if it is possible to encode it in the
	current eight bit output encoding table and, if so, set the result
	to the character.
	*/

	prev_eight_bit = strdup ("");

	for (j = stack_nr; j >= i; j--)
	{
	new_eight_bit = 0;
	if (!results_stack[j])
	break;

	uint8_t *encoded_u8 = utf8_from_string (results_stack[j]);
	ucs4_t first_char;
	u8_next (&first_char, encoded_u8);
	free (encoded_u8);
	if (first_char < 127)
	xasprintf (&new_eight_bit, "%02lX", first_char);
	else
	{
	char *codepoint;
	if (first_char <= 0xFFFF)
	{
	xasprintf (&codepoint, "%04lX", first_char);
	const char found = (const char )bsearch (&codepoint,
	unicode_to_eight_bit[encoding_index].codepoints,
	unicode_to_eight_bit[encoding_index].number,
	sizeof (const char *), compare_strings);
	if (found)
	new_eight_bit = strdup (found);

	free (codepoint);
	}
	}
	if (!new_eight_bit)
	break;

	/*
	# in that case, the new eight bit character is the same than the one
	# found with one less character (and it isn't a @dotless{i}). It may
	# hapen in 2 case, both meaning that there is no corresponding 8bit char:
	#
	# -> there are 2 characters in accent. This could happen, for example
	# if an accent that cannot be rendered is found and it leads to
	# appending or prepending a character. For example this happens for
	# @={@,{@~{n}}}, where @,{@~{n}} is expanded to 2 characters:
	# n with a tilde, followed by a ,
	# In that case, the additional diacritic is appended, which
	# means that it is composed with the , and leaves n with a tilde
	# untouched.
	# -> the diacritic is appended but the normal form doesn't lead
	# to a composed character, such that the first character
	# of the string is unchanged. This, for example, happens for
	# @ubaraccent{a} since there is no composed accent with a and an
	# underbar.
	*/
	if (!strcmp (new_eight_bit, prev_eight_bit)
	&& !(stack->stack[j]->e.c->cmd == CM_dotless
	&& !strcmp (results_stack[j], "i")))
	{
	free (new_eight_bit);
	break;
	}
	free (result);
	result = strdup (results_stack[j]);
	free (prev_eight_bit);
	prev_eight_bit = strdup (new_eight_bit);
	free (new_eight_bit);
	}

	free (prev_eight_bit);

	/*
	handle the remaining accents, that have not been converted to 8bit
	compatible unicode
	*/
	for (; j >= 0; j--)
	{
	const ELEMENT *accent_command = stack->stack[j];
	char *formatted_result
	= (*format_accent) (self, result, accent_command, i, stack, set_case);
	free (result);
	result = formatted_result;
	}

	for (k = stack_nr; k >= i; k--)
	{
	free (results_stack[k]);
	}
	free (results_stack);

	return result;
	}

	/* Format TEXT with the accent commands in STACK as Unicode.  The accent
	   commands are applied with unicode_accent starting from the last element
	   of STACK, until one has no Unicode result.  If SET_CASE is set, the
	   result obtained so far is passed through to_upper_or_lower_multibyte.
	   The remaining accent commands, if any, are formatted by calling
	   FORMAT_ACCENT.  Returns a newly allocated string. */
	char *
	format_unicode_accents_stack_internal (CONVERTER *self, const char *text,
	  const ELEMENT_STACK *stack,
	  char *(*format_accent)(CONVERTER *self, const char *text,
	                         const ELEMENT *element, int index_in_stack,
	                         const ELEMENT_STACK *stack, int set_case),
	  int set_case)
	{
	  int index = stack->top - 1;
	  char *result = strdup (text);

	  /* accents with a Unicode diacritic */
	  while (index >= 0)
	    {
	      char *accented = unicode_accent (result, stack->stack[index],
	                                       index, stack);
	      if (!accented)
	        break;

	      free (result);
	      result = accented;
	      index--;
	    }

	  if (set_case)
	    {
	      char *cased = to_upper_or_lower_multibyte (result, set_case);
	      free (result);
	      result = cased;
	    }

	  /* accents left over, starting at the one without Unicode result */
	  while (index >= 0)
	    {
	      char *formatted
	        = (*format_accent) (self, result, stack->stack[index], index,
	                            stack, set_case);
	      free (result);
	      result = formatted;
	      index--;
	    }

	  return result;
	}

	/* Format TEXT with the accent commands in STACK for the output encoding
	   ENCODING.  For utf-8 format_unicode_accents_stack_internal is called,
	   for an encoding listed in unicode_to_eight_bit
	   format_eight_bit_accents_stack is called; their newly allocated result
	   is returned.  Returns 0 if ENCODING is unset, is not considered to be
	   a possible encoding by normalize_encoding_name, or is not handled
	   here. */
	char *
	encoded_accents (CONVERTER *self, const char *text, const ELEMENT_STACK *stack,
	  const char *encoding,
	  char *(*format_accent)(CONVERTER *self, const char *text,
	                         const ELEMENT *element, int index_in_stack,
	                         const ELEMENT_STACK *stack, int set_case),
	  int set_case)
	{
	  char *normalized_encoding;
	  char *result = 0;
	  int possible_encoding;

	  if (!encoding)
	    return 0;

	 /*
	  in case an encoding is directly specified with -c OUTPUT_ENCODING_NAME
	  in upper case to match with the encodings in Texinfo input, we convert
	  to lower case to match the encoding names used here.  In the code
	  encoding names are lower cased early.
	 */
	  normalized_encoding = normalize_encoding_name (encoding,
	                                                 &possible_encoding);
	  if (possible_encoding)
	    {
	      if (!strcmp (normalized_encoding, "utf-8"))
	        result = format_unicode_accents_stack_internal (self, text, stack,
	                                                  format_accent, set_case);
	      else
	        {
	          const size_t encodings_nr = sizeof (unicode_to_eight_bit)
	                                        / sizeof (unicode_to_eight_bit[0]);
	          size_t idx;

	          for (idx = 0; idx < encodings_nr; idx++)
	            {
	              if (!strcmp (normalized_encoding,
	                           unicode_to_eight_bit[idx].encoding))
	                {
	                  result = format_eight_bit_accents_stack (self, text,
	                                   stack, (int) idx, format_accent, set_case);
	                  break;
	                }
	            }
	        }
	    }

	  free (normalized_encoding);
	  return result;
	}


	/* CODEPOINT is a string describing an hexadecimal number with
	   letters in upper case.
	   Returns -1 if ENCODING is unset (utf-8 is then assumed) or is utf-8.
	   Returns the index of the encoding in unicode_to_eight_bit + 1 if
	   ENCODING is found there and CODEPOINT is below 127 (DEL excluded) or is
	   listed in the codepoints of the encoding.
	   Returns 0 if the encoding is unknown or if CODEPOINT is not represented
	   in the encoding. */
	int
	unicode_point_decoded_in_encoding (const char *encoding,
	                                   const char *codepoint)
	{
	  char *normalized_encoding;
	  int possible_encoding;
	  int result = 0;

	  /* if encoding is not set, consider that it is the default, utf-8 */
	  if (!encoding)
	    return -1;

	  normalized_encoding = normalize_encoding_name (encoding,
	                                                 &possible_encoding);
	  if (possible_encoding)
	    {
	      if (!strcmp (normalized_encoding, "utf-8"))
	        result = -1;
	      else
	        {
	          const size_t encodings_nr = sizeof (unicode_to_eight_bit)
	                                        / sizeof (unicode_to_eight_bit[0]);
	          size_t idx;

	          for (idx = 0; idx < encodings_nr; idx++)
	            {
	              if (strcmp (normalized_encoding,
	                          unicode_to_eight_bit[idx].encoding))
	                continue;

	              /* the comparison with 127 excludes \x{7F} DEL */
	              if (strtoul (codepoint, NULL, 16) < 127
	                  || bsearch (&codepoint,
	                              unicode_to_eight_bit[idx].codepoints,
	                              unicode_to_eight_bit[idx].number,
	                              sizeof (char *), compare_strings))
	                result = (int) idx + 1;
	              /* only one entry can match the encoding name */
	              break;
	            }
	        }
	    }

	  free (normalized_encoding);
	  return result;
	}

	const char *
	unicode_brace_no_arg_command (enum command_id cmd, const char *encoding)
	{
	if (unicode_character_brace_no_arg_commands[cmd].text
	&& unicode_point_decoded_in_encoding (encoding,
	unicode_character_brace_no_arg_commands[cmd].codepoint))
	return unicode_character_brace_no_arg_commands[cmd].text;
	else
	return 0;
	}