| |
| /* Compiler implementation of the D programming language |
| * Copyright (C) 2003-2019 by The D Language Foundation, All Rights Reserved |
| * written by Walter Bright |
| * http://www.digitalmars.com |
| * Distributed under the Boost Software License, Version 1.0. |
| * http://www.boost.org/LICENSE_1_0.txt |
| * https://github.com/D-Programming-Language/dmd/blob/master/src/utf.c |
| */ |
| |
| /// Description of UTF-8 in [1]. Unicode non-characters and private-use |
| /// code points described in [2],[4]. |
| /// |
| /// References: |
| /// [1] http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 |
| /// [2] http://en.wikipedia.org/wiki/Unicode |
| /// [3] http://unicode.org/faq/utf_bom.html |
| /// [4] http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf |
| |
| #include "utf.h" |
| |
| /* The following encodings are valid, except for the 5 and 6 byte |
| * combinations: |
| * 0xxxxxxx |
| * 110xxxxx 10xxxxxx |
| * 1110xxxx 10xxxxxx 10xxxxxx |
| * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| */ |
| const unsigned UTF8_STRIDE[256] = |
| { |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
| 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
| 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
| 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
| 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
| 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, |
| 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, |
| }; |
| |
| // UTF-8 decoding errors |
| char const UTF8_DECODE_OUTSIDE_CODE_SPACE[] = "Outside Unicode code space"; |
| char const UTF8_DECODE_TRUNCATED_SEQUENCE[] = "Truncated UTF-8 sequence"; |
| char const UTF8_DECODE_OVERLONG[] = "Overlong UTF-8 sequence"; |
| char const UTF8_DECODE_INVALID_TRAILER[] = "Invalid trailing code unit"; |
| char const UTF8_DECODE_INVALID_CODE_POINT[] = "Invalid code point decoded"; |
| |
| // UTF-16 decoding errors |
| char const UTF16_DECODE_TRUNCATED_SEQUENCE[]= "Truncated UTF-16 sequence"; |
| char const UTF16_DECODE_INVALID_SURROGATE[] = "Invalid low surrogate"; |
| char const UTF16_DECODE_UNPAIRED_SURROGATE[]= "Unpaired surrogate"; |
| char const UTF16_DECODE_INVALID_CODE_POINT[]= "Invalid code point decoded"; |
| |
| /// The Unicode code space is the range of code points [0x000000,0x10FFFF] |
| /// except the UTF-16 surrogate pairs in the range [0xD800,0xDFFF] |
| /// and non-characters (which end in 0xFFFE or 0xFFFF). |
| bool utf_isValidDchar(dchar_t c) |
| { |
| // TODO: Whether non-char code points should be rejected is pending review |
| // largest character code point |
| if (c > 0x10FFFF) |
| return false; |
| // surrogate pairs |
| if (0xD800 <= c && c <= 0xDFFF) |
| return false; |
| // non-characters |
| if ((c & 0xFFFFFE) == 0x00FFFE) |
| return false; |
| return true; |
| } |
| |
| /******************************* |
| * Return !=0 if unicode alpha. |
| * Use table from C99 Appendix D. |
| */ |
| |
| bool isUniAlpha(dchar_t c) |
| { |
| size_t high = ALPHA_TABLE_LENGTH - 1; |
| // Shortcut search if c is out of range |
| size_t low |
| = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0; |
| // Binary search |
| while (low <= high) |
| { |
| size_t mid = (low + high) >> 1; |
| if (c < ALPHA_TABLE[mid][0]) |
| high = mid - 1; |
| else if (ALPHA_TABLE[mid][1] < c) |
| low = mid + 1; |
| else |
| { |
| assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]); |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /** |
| * Returns the code length of c in code units. |
| */ |
| |
| int utf_codeLengthChar(dchar_t c) |
| { |
| if (c <= 0x7F) |
| return 1; |
| if (c <= 0x7FF) |
| return 2; |
| if (c <= 0xFFFF) |
| return 3; |
| if (c <= 0x10FFFF) |
| return 4; |
| assert(false); |
| } |
| |
| int utf_codeLengthWchar(dchar_t c) |
| { |
| return c <= 0xFFFF ? 1 : 2; |
| } |
| |
| /** |
| * Returns the code length of c in code units for the encoding. |
| * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32. |
| */ |
| |
| int utf_codeLength(int sz, dchar_t c) |
| { |
| if (sz == 1) |
| return utf_codeLengthChar(c); |
| if (sz == 2) |
| return utf_codeLengthWchar(c); |
| assert(sz == 4); |
| return 1; |
| } |
| |
| void utf_encodeChar(utf8_t *s, dchar_t c) |
| { |
| assert(s != NULL); |
| assert(utf_isValidDchar(c)); |
| if (c <= 0x7F) |
| { |
| s[0] = static_cast<utf8_t>(c); |
| } |
| else if (c <= 0x07FF) |
| { |
| s[0] = static_cast<utf8_t>(0xC0 | (c >> 6)); |
| s[1] = static_cast<utf8_t>(0x80 | (c & 0x3F)); |
| } |
| else if (c <= 0xFFFF) |
| { |
| s[0] = static_cast<utf8_t>(0xE0 | (c >> 12)); |
| s[1] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F)); |
| s[2] = static_cast<utf8_t>(0x80 | (c & 0x3F)); |
| } |
| else if (c <= 0x10FFFF) |
| { |
| s[0] = static_cast<utf8_t>(0xF0 | (c >> 18)); |
| s[1] = static_cast<utf8_t>(0x80 | ((c >> 12) & 0x3F)); |
| s[2] = static_cast<utf8_t>(0x80 | ((c >> 6) & 0x3F)); |
| s[3] = static_cast<utf8_t>(0x80 | (c & 0x3F)); |
| } |
| else |
| assert(0); |
| } |
| |
| void utf_encodeWchar(utf16_t *s, dchar_t c) |
| { |
| assert(s != NULL); |
| assert(utf_isValidDchar(c)); |
| if (c <= 0xFFFF) |
| { |
| s[0] = static_cast<utf16_t>(c); |
| } |
| else |
| { |
| s[0] = static_cast<utf16_t>((((c - 0x010000) >> 10) & 0x03FF) + 0xD800); |
| s[1] = static_cast<utf16_t>(((c - 0x010000) & 0x03FF) + 0xDC00); |
| } |
| } |
| |
| void utf_encode(int sz, void *s, dchar_t c) |
| { |
| if (sz == 1) |
| utf_encodeChar((utf8_t *)s, c); |
| else if (sz == 2) |
| utf_encodeWchar((utf16_t *)s, c); |
| else |
| { |
| assert(sz == 4); |
| *((utf32_t *)s) = c; |
| } |
| } |
| |
| /******************************************** |
| * Decode a UTF-8 sequence as a single UTF-32 code point. |
| * Returns: |
| * NULL success |
| * !=NULL error message string |
| */ |
| |
| const char *utf_decodeChar(utf8_t const *s, size_t len, size_t *pidx, dchar_t *presult) |
| { |
| assert(s != NULL); |
| assert(pidx != NULL); |
| assert(presult != NULL); |
| size_t i = (*pidx)++; |
| assert(i < len); |
| utf8_t u = s[i]; |
| // Pre-stage results for ASCII and error cases |
| *presult = u; |
| |
| //printf("utf_decodeChar(s = %02x, %02x, %02x len = %d)\n", u, s[1], s[2], len); |
| |
| // Get expected sequence length |
| size_t n = UTF8_STRIDE[u]; |
| switch (n) |
| { |
| case 1: // ASCII |
| return UTF8_DECODE_OK; |
| case 2: case 3: case 4: // multi-byte UTF-8 |
| break; |
| default: // 5- or 6-byte sequence |
| return UTF8_DECODE_OUTSIDE_CODE_SPACE; |
| } |
| if (len < i + n) // source too short |
| return UTF8_DECODE_TRUNCATED_SEQUENCE; |
| |
| // Pick off 7 - n low bits from first code unit |
| utf32_t c = u & ((1 << (7 - n)) - 1); |
| /* The following combinations are overlong, and illegal: |
| * 1100000x (10xxxxxx) |
| * 11100000 100xxxxx (10xxxxxx) |
| * 11110000 1000xxxx (10xxxxxx 10xxxxxx) |
| * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) |
| * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) |
| */ |
| utf8_t u2 = s[++i]; |
| // overlong combination |
| if ((u & 0xFE) == 0xC0 || |
| (u == 0xE0 && (u2 & 0xE0) == 0x80) || |
| (u == 0xF0 && (u2 & 0xF0) == 0x80) || |
| (u == 0xF8 && (u2 & 0xF8) == 0x80) || |
| (u == 0xFC && (u2 & 0xFC) == 0x80)) |
| return UTF8_DECODE_OVERLONG; |
| // Decode remaining bits |
| for (n += i - 1; i != n; ++i) |
| { |
| u = s[i]; |
| if ((u & 0xC0) != 0x80) // trailing bytes are 10xxxxxx |
| return UTF8_DECODE_INVALID_TRAILER; |
| c = (c << 6) | (u & 0x3F); |
| } |
| if (!utf_isValidDchar(c)) |
| return UTF8_DECODE_INVALID_CODE_POINT; |
| *pidx = i; |
| *presult = c; |
| return UTF8_DECODE_OK; |
| } |
| |
| /******************************************** |
| * Decode a UTF-16 sequence as a single UTF-32 code point. |
| * Returns: |
| * NULL success |
| * !=NULL error message string |
| */ |
| |
| const char *utf_decodeWchar(utf16_t const *s, size_t len, size_t *pidx, dchar_t *presult) |
| { |
| assert(s != NULL); |
| assert(pidx != NULL); |
| assert(presult != NULL); |
| size_t i = (*pidx)++; |
| assert(i < len); |
| // Pre-stage results for ASCII and error cases |
| utf32_t u = *presult = s[i]; |
| |
| if (u < 0x80) // ASCII |
| return UTF16_DECODE_OK; |
| if (0xD800 <= u && u <= 0xDBFF) // Surrogate pair |
| { if (len <= i + 1) |
| return UTF16_DECODE_TRUNCATED_SEQUENCE; |
| utf16_t u2 = s[i + 1]; |
| if (u2 < 0xDC00 || 0xDFFF < u) |
| return UTF16_DECODE_INVALID_SURROGATE; |
| u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); |
| ++*pidx; |
| } |
| else if (0xDC00 <= u && u <= 0xDFFF) |
| return UTF16_DECODE_UNPAIRED_SURROGATE; |
| if (!utf_isValidDchar(u)) |
| return UTF16_DECODE_INVALID_CODE_POINT; |
| *presult = u; |
| return UTF16_DECODE_OK; |
| } |