blob: 1b6f46afee39785b8136ee51fbe6a570154d5acc [file] [log] [blame]
/* libga68 unicode support routines.
Copyright (C) 2009-2025 Free Software Foundation, Inc.
Copyright (C) 2025 Jose E. Marchesi.
GCC is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 3, or (at your option) any later version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
Under Section 7 of GPL version 3, you are granted additional permissions
described in the GCC Runtime Library Exception, version 3.1, as published by
the Free Software Foundation.
You should have received a copy of the GNU General Public License and a copy
of the GCC Runtime Library Exception along with this program; see the files
COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* The code in this file has been copied from the unistr gnulib module, written
by Bruno Haible, and adapted to support strides. */
#include <stddef.h> /* For ptrdiff_t */
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include "ga68.h"
/* CMP (n1, n2) performs a three-valued comparison on n1 vs. n2, where
n1 and n2 are expressions without side effects, that evaluate to real
numbers (excluding NaN).
It returns
1 if n1 > n2
0 if n1 == n2
-1 if n1 < n2
The naïve code (n1 > n2 ? 1 : n1 < n2 ? -1 : 0) produces a conditional
jump with nearly all GCC versions up to GCC 10.
This variant (n1 < n2 ? -1 : n1 > n2) produces a conditional with many
GCC versions up to GCC 9.
The better code (n1 > n2) - (n1 < n2) from Hacker's Delight § 2-9
avoids conditional jumps in all GCC versions >= 3.4. */
#define CMP(n1, n2) (((n1) > (n2)) - ((n1) < (n2)))
/* MIN(a,b) returns the minimum of A and B. */
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif
/* Compare two UCS-4 strings of same lenght, lexicographically.
Return -1, 0 or 1. */
static int
_libga68_u32_cmp (const uint32_t *s1, size_t stride1,
const uint32_t *s2, size_t stride2,
size_t n)
{
stride1 = stride1 / sizeof (uint32_t);
stride2 = stride2 / sizeof (uint32_t);
for (; n > 0;)
{
uint32_t uc1 = *s1;
s1 += stride1;
uint32_t uc2 = *s2;
s2 += stride2;
if (uc1 == uc2)
{
n--;
continue;
}
/* Note that uc1 and uc2 each have at most 31 bits. */
return (int)uc1 - (int)uc2;
/* > 0 if uc1 > uc2, < 0 if uc1 < uc2. */
}
return 0;
}
/* Compare two UCS-4 strings of perhaps different lenghts, lexicographically.
Return -1, 0 or 1. */
int
_libga68_u32_cmp2 (const uint32_t *s1, size_t n1, size_t stride1,
const uint32_t *s2, size_t n2, size_t stride2)
{
int cmp = _libga68_u32_cmp (s1, stride1, s2, stride2, MIN (n1, n2));
if (cmp == 0)
cmp = CMP (n1, n2);
return cmp;
}
/* Get the UCS code for the first character of a given UTF-8 string. */
int
_libga68_u8_mbtouc (uint32_t *puc, const uint8_t *s, size_t n)
{
uint8_t c = *s;
if (c < 0x80)
{
*puc = c;
return 1;
}
else if (c >= 0xc2)
{
if (c < 0xe0)
{
if (n >= 2)
{
if ((s[1] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x1f) << 6)
| (unsigned int) (s[1] ^ 0x80);
return 2;
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return 1;
}
}
else if (c < 0xf0)
{
if (n >= 3)
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xe1 || s[1] >= 0xa0)
&& (c != 0xed || s[1] < 0xa0))
{
if ((s[2] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x0f) << 12)
| ((unsigned int) (s[1] ^ 0x80) << 6)
| (unsigned int) (s[2] ^ 0x80);
return 3;
}
/* invalid multibyte character */
*puc = 0xfffd;
return 2;
}
/* invalid multibyte character */
*puc = 0xfffd;
return 1;
}
else
{
*puc = 0xfffd;
if (n == 1)
{
/* incomplete multibyte character */
return 1;
}
else
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xe1 || s[1] >= 0xa0)
&& (c != 0xed || s[1] < 0xa0))
{
/* incomplete multibyte character */
return 2;
}
else
{
/* invalid multibyte character */
return 1;
}
}
}
}
else if (c <= 0xf4)
{
if (n >= 4)
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
&& (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if ((s[2] ^ 0x80) < 0x40)
{
if ((s[3] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x07) << 18)
| ((unsigned int) (s[1] ^ 0x80) << 12)
| ((unsigned int) (s[2] ^ 0x80) << 6)
| (unsigned int) (s[3] ^ 0x80);
return 4;
}
/* invalid multibyte character */
*puc = 0xfffd;
return 3;
}
/* invalid multibyte character */
*puc = 0xfffd;
return 2;
}
/* invalid multibyte character */
*puc = 0xfffd;
return 1;
}
else
{
*puc = 0xfffd;
if (n == 1)
{
/* incomplete multibyte character */
return 1;
}
else
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
&& (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if (n == 2)
{
/* incomplete multibyte character */
return 2;
}
else
{
if ((s[2] ^ 0x80) < 0x40)
{
/* incomplete multibyte character */
return 3;
}
else
{
/* invalid multibyte character */
return 2;
}
}
}
else
{
/* invalid multibyte character */
return 1;
}
}
}
}
}
/* invalid multibyte character */
*puc = 0xfffd;
return 1;
}
/* Encode a given UCS code in UTF-8. */
int
_libga68_u8_uctomb (uint8_t *s, uint32_t uc, ptrdiff_t n)
{
if (uc < 0x80)
{
if (n > 0)
{
s[0] = uc;
return 1;
}
/* else return -2, below. */
}
else
{
int count;
if (uc < 0x800)
count = 2;
else if (uc < 0x10000)
{
if (uc < 0xd800 || uc >= 0xe000)
count = 3;
else
return -1;
}
else if (uc < 0x110000)
count = 4;
else
return -1;
if (n >= count)
{
switch (count) /* note: code falls through cases! */
{
case 4: s[3] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x10000;
/* Fallthrough. */
case 3: s[2] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0x800;
/* Fallthrough. */
case 2: s[1] = 0x80 | (uc & 0x3f); uc = uc >> 6; uc |= 0xc0;
/*case 1:*/ s[0] = uc;
}
return count;
}
}
return -2;
}
/* Convert UCS-4 to UTF-8 */
uint8_t *
_libga68_u32_to_u8 (const uint32_t *s, size_t n, size_t stride,
uint8_t *resultbuf, size_t *lengthp)
{
const uint32_t *s_end;
/* Output string accumulator. */
uint8_t *result;
size_t allocated;
size_t length;
stride = stride / sizeof (uint32_t);
s_end = s + (n * stride);
if (resultbuf != NULL)
{
result = resultbuf;
allocated = *lengthp;
}
else
{
result = NULL;
allocated = 0;
}
length = 0;
/* Invariants:
result is either == resultbuf or == NULL or malloc-allocated.
If length > 0, then result != NULL. */
while (s < s_end)
{
uint32_t uc;
int count;
/* Fetch a Unicode character from the input string. */
uc = *s;
s += stride;
/* No need to call the safe variant u32_mbtouc, because
u8_uctomb will verify uc anyway. */
/* Store it in the output string. */
count = _libga68_u8_uctomb (result + length, uc, allocated - length);
if (count == -1)
{
if (!(result == resultbuf || result == NULL))
free (result);
errno = EILSEQ;
return NULL;
}
if (count == -2)
{
uint8_t *memory;
allocated = (allocated > 0 ? 2 * allocated : 12);
if (length + 6 > allocated)
allocated = length + 6;
if (result == resultbuf || result == NULL)
memory = (uint8_t *) _libga68_malloc (allocated * sizeof (uint8_t));
else
memory =
(uint8_t *) _libga68_realloc (result, allocated * sizeof (uint8_t));
if (result == resultbuf && length > 0)
memcpy ((char *) memory, (char *) result,
length * sizeof (uint8_t));
result = memory;
count = _libga68_u8_uctomb (result + length, uc, allocated - length);
if (count < 0)
abort ();
}
length += count;
}
if (length == 0)
{
if (result == NULL)
{
/* Return a non-NULL value. NULL means error. */
result = (uint8_t *) _libga68_malloc (1);
if (result == NULL)
{
errno = ENOMEM;
return NULL;
}
}
}
else if (result != resultbuf && length < allocated)
{
/* Shrink the allocated memory if possible. */
uint8_t *memory;
memory = (uint8_t *) _libga68_realloc_unchecked (result, length * sizeof (uint8_t));
if (memory != NULL)
result = memory;
}
*lengthp = length;
return result;
}
/* Used by ga68_u8_to_u32 below. */
static int
_libga68_u8_mbtoucr (uint32_t *puc, const uint8_t *s, size_t n)
{
uint8_t c = *s;
if (c < 0x80)
{
*puc = c;
return 1;
}
else if (c >= 0xc2)
{
if (c < 0xe0)
{
if (n >= 2)
{
if ((s[1] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x1f) << 6)
| (unsigned int) (s[1] ^ 0x80);
return 2;
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
else if (c < 0xf0)
{
if (n >= 2)
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xe1 || s[1] >= 0xa0)
&& (c != 0xed || s[1] < 0xa0))
{
if (n >= 3)
{
if ((s[2] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x0f) << 12)
| ((unsigned int) (s[1] ^ 0x80) << 6)
| (unsigned int) (s[2] ^ 0x80);
return 3;
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
else if (c <= 0xf4)
{
if (n >= 2)
{
if ((s[1] ^ 0x80) < 0x40
&& (c >= 0xf1 || s[1] >= 0x90)
&& (c < 0xf4 || (/* c == 0xf4 && */ s[1] < 0x90)))
{
if (n >= 3)
{
if ((s[2] ^ 0x80) < 0x40)
{
if (n >= 4)
{
if ((s[3] ^ 0x80) < 0x40)
{
*puc = ((unsigned int) (c & 0x07) << 18)
| ((unsigned int) (s[1] ^ 0x80) << 12)
| ((unsigned int) (s[2] ^ 0x80) << 6)
| (unsigned int) (s[3] ^ 0x80);
return 4;
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
/* invalid multibyte character */
}
else
{
/* incomplete multibyte character */
*puc = 0xfffd;
return -2;
}
}
}
/* invalid multibyte character */
*puc = 0xfffd;
return -1;
}
/* Convert UTF-8 to UTF-32/UCS-4 */
uint32_t *
_libga68_u8_to_u32 (const uint8_t *s, size_t n, uint32_t *resultbuf, size_t *lengthp)
{
const uint8_t *s_end = s + n;
/* Output string accumulator. */
uint32_t *result;
size_t allocated;
size_t length;
if (resultbuf != NULL)
{
result = resultbuf;
allocated = *lengthp;
}
else
{
result = NULL;
allocated = 0;
}
length = 0;
/* Invariants:
result is either == resultbuf or == NULL or malloc-allocated.
If length > 0, then result != NULL. */
while (s < s_end)
{
uint32_t uc;
int count;
/* Fetch a Unicode character from the input string. */
count = _libga68_u8_mbtoucr (&uc, s, s_end - s);
if (count < 0)
{
if (!(result == resultbuf || result == NULL))
free (result);
errno = EILSEQ;
return NULL;
}
s += count;
/* Store it in the output string. */
if (length + 1 > allocated)
{
uint32_t *memory;
allocated = (allocated > 0 ? 2 * allocated : 12);
if (length + 1 > allocated)
allocated = length + 1;
if (result == resultbuf || result == NULL)
memory = (uint32_t *) _libga68_malloc (allocated * sizeof (uint32_t));
else
memory =
(uint32_t *) _libga68_realloc (result, allocated * sizeof (uint32_t));
if (result == resultbuf && length > 0)
memcpy ((char *) memory, (char *) result,
length * sizeof (uint32_t));
result = memory;
}
result[length++] = uc;
}
if (length == 0)
{
if (result == NULL)
{
/* Return a non-NULL value. NULL means error. */
result = (uint32_t *) _libga68_malloc (1);
}
}
else if (result != resultbuf && length < allocated)
{
/* Shrink the allocated memory if possible. */
uint32_t *memory;
memory = (uint32_t *) _libga68_realloc_unchecked (result, length * sizeof (uint32_t));
if (memory != NULL)
result = memory;
}
*lengthp = length;
return result;
}