| /******************************************** |
| * Encode and decode UTF-8, UTF-16 and UTF-32 strings. |
| * |
| * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D |
| * wchar type. |
| * For Posix systems, the C wchar_t type is UTF-32 and corresponds to |
| * the D utf.dchar type. |
| * |
| * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF). |
| * |
| * See_Also: |
| * $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> |
| * $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> |
| * $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) |
| * |
| * Copyright: Copyright Digital Mars 2003 - 2016. |
| * License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0). |
| * Authors: Walter Bright, Sean Kelly |
| * Source: $(DRUNTIMESRC src/rt/util/_utf.d) |
| */ |
| |
| module rt.util.utf; |
| |
| extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure; |
| |
| /******************************* |
| * Checks whether c lies in the range of code points accepted by this module. |
| * |
| * A value is accepted when it is no greater than 0x10FFFF and is not in the |
| * UTF-16 surrogate range 0xD800 .. 0xDFFF. |
| * |
| * \uFFFE and \uFFFF are accepted on purpose: the Unicode standard permits |
| * them for application internal use, although they are not allowed for |
| * interchange. (thanks to Arcane Jill) |
| * |
| * Returns: true if c is accepted, false if not. |
| */ |
| |
| @safe @nogc pure nothrow |
| bool isValidDchar(dchar c) |
| { |
|     // Beyond the last code point. |
|     if (c > 0x10FFFF) |
|         return false; |
| |
|     // Surrogate halves are never characters on their own. |
|     if (c >= 0xD800 && c <= 0xDFFF) |
|         return false; |
| |
|     // Everything else, including 0xFFFE and 0xFFFF, is accepted. |
|     return true; |
| } |
| |
| unittest |
| { |
|     debug(utf) printf("utf.isValidDchar.unittest\n"); |
| |
|     // Ordinary character and a value far above the Unicode range. |
|     assert(isValidDchar(cast(dchar)'a')); |
|     assert(!isValidDchar(cast(dchar)0x1FFFFF)); |
| |
|     // Both edges of the surrogate range. |
|     assert(isValidDchar(cast(dchar)0xD7FF)); |
|     assert(!isValidDchar(cast(dchar)0xD800)); |
|     assert(!isValidDchar(cast(dchar)0xDFFF)); |
|     assert(isValidDchar(cast(dchar)0xE000)); |
| |
|     // Noncharacters that are deliberately accepted. |
|     assert(isValidDchar(cast(dchar)0xFFFE)); |
|     assert(isValidDchar(cast(dchar)0xFFFF)); |
| |
|     // Upper limit of the code point range. |
|     assert(isValidDchar(cast(dchar)0x10FFFF)); |
|     assert(!isValidDchar(cast(dchar)0x110000)); |
| } |
| |
| |
| |
| /* |
| * Lookup table indexed by the value of the first byte of a UTF-8 sequence. |
| * Each entry is the length in bytes announced by that byte, or 0xFF when the |
| * byte cannot start a sequence. |
| */ |
| static immutable UTF8stride = |
| [ |
|     cast(ubyte) |
|     // 0x00 .. 0x7F: 0xxxxxxx, single byte |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
|     // 0x80 .. 0xBF: 10xxxxxx, continuation bytes, not a sequence start |
|     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
|     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
|     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
|     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, |
|     // 0xC0 .. 0xDF: 110xxxxx, two bytes |
|     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
|     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
|     // 0xE0 .. 0xEF: 1110xxxx, three bytes |
|     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, |
|     // 0xF0 .. 0xF7: four bytes; 0xF8 .. 0xFB: five; 0xFC .. 0xFD: six; |
|     // 0xFE and 0xFF: not a sequence start |
|     4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, |
| ]; |
| |
| /** |
| * stride() returns the length of the UTF-8 sequence starting at index i |
| * in string s. |
| * |
| * Only the byte s[i] is looked at (through the UTF8stride table); the bytes |
| * following it are not examined. |
| * Returns: |
| * The number of bytes in the UTF-8 sequence, or |
| * 0xFF meaning s[i] is not the start of a UTF-8 sequence. |
| */ |
| @safe @nogc pure nothrow |
| uint stride(in char[] s, size_t i) |
| { |
|     immutable char lead = s[i]; |
|     return UTF8stride[lead]; |
| } |
| |
| /** |
| * stride() returns the length of the UTF-16 sequence starting at index i |
| * in string s. |
| * |
| * Returns: 2 if s[i] is a high surrogate (0xD800 .. 0xDBFF), otherwise 1. |
| * The code unit after a high surrogate is not examined. |
| */ |
| @safe @nogc pure nothrow |
| uint stride(in wchar[] s, size_t i) |
| { |
|     immutable wchar unit = s[i]; |
| |
|     if (unit >= 0xD800 && unit <= 0xDBFF) |
|         return 2; |
|     return 1; |
| } |
| |
| /** |
| * stride() returns the length of a UTF-32 sequence starting at index i |
| * in string s. |
| * |
| * Neither s nor i is inspected: the function exists so that the three |
| * stride() overloads can be called uniformly. |
| * Returns: The return value will always be 1. |
| */ |
| @safe @nogc pure nothrow |
| uint stride(in dchar[] s, size_t i) |
| { |
| return 1; |
| } |
| |
| /******************************************* |
| * Given an index i into an array of characters s[], |
| * and assuming that index i is at the start of a UTF character, |
| * determine the number of UCS characters up to that index i. |
| * |
| * The string is walked from the beginning one sequence at a time. |
| * onUnicodeError() is called when a byte that cannot start a UTF-8 sequence |
| * is met on the way, or when the walk steps over i (meaning i was not at the |
| * start of a character). |
| * |
| * Params: |
| * s = UTF-8 text |
| * i = index into s of the start of a character |
| * Returns: the number of characters in s[0 .. i] |
| */ |
| @safe pure |
| size_t toUCSindex(in char[] s, size_t i) |
| { |
|     size_t n; |
|     size_t j; |
| |
|     for (j = 0; j < i; ) |
|     { |
|         immutable uint len = stride(s, j); |
| |
|         // stride() answers 0xFF for a byte that cannot start a sequence. |
|         // Adding that to j would skip 255 bytes and could silently count |
|         // the bad byte as one character, so report it here instead. |
|         if (len == 0xFF) |
|             onUnicodeError("invalid UTF-8 sequence", j); |
|         j += len; |
|         n++; |
|     } |
|     if (j > i) |
|     { |
|         onUnicodeError("invalid UTF-8 sequence", j); |
|     } |
|     return n; |
| } |
| |
| /** |
| * Given an index i into the UTF-16 array s[], and assuming that index i is at |
| * the start of a UTF character, determine the number of UCS characters up to |
| * that index i. |
| * |
| * onUnicodeError() is called when walking the string steps over i. |
| */ |
| @safe pure |
| size_t toUCSindex(in wchar[] s, size_t i) |
| { |
|     size_t count = 0; |
|     size_t pos = 0; |
| |
|     while (pos < i) |
|     { |
|         pos += stride(s, pos); |
|         ++count; |
|     } |
| |
|     // The loop leaves pos >= i, so "not equal" means i was stepped over. |
|     if (pos != i) |
|         onUnicodeError("invalid UTF-16 sequence", pos); |
| |
|     return count; |
| } |
| |
| /** |
| * Given an index i into the UTF-32 array s[], determine the number of UCS |
| * characters up to that index i. |
| * |
| * The index is returned unchanged and s is not inspected. |
| */ |
| @safe @nogc pure nothrow |
| size_t toUCSindex(in dchar[] s, size_t i) |
| { |
| return i; |
| } |
| |
| /****************************************** |
| * Given a UCS index n into an array of characters s[], return the UTF index. |
| * |
| * The first n sequences of s are skipped using the UTF8stride table. |
| * onUnicodeError() is called for a byte that cannot start a UTF-8 sequence. |
| * |
| * Params: |
| * s = UTF-8 text |
| * n = number of characters to skip |
| * Returns: the index into s just past the first n characters |
| */ |
| @safe pure |
| size_t toUTFindex(in char[] s, size_t n) |
| { |
|     size_t offset = 0; |
| |
|     for (size_t remaining = n; remaining != 0; --remaining) |
|     { |
|         immutable uint width = UTF8stride[s[offset]]; |
| |
|         if (width == 0xFF) |
|             onUnicodeError("invalid UTF-8 sequence", offset); |
|         offset += width; |
|     } |
|     return offset; |
| } |
| |
| /** |
| * Given a UCS index n into the UTF-16 array s[], return the UTF index. |
| * |
| * A high surrogate (0xD800 .. 0xDBFF) advances the index by two code units, |
| * any other code unit by one. No validation is performed. |
| */ |
| @safe @nogc pure nothrow |
| size_t toUTFindex(in wchar[] s, size_t n) |
| { |
|     size_t offset = 0; |
| |
|     for (size_t remaining = n; remaining != 0; --remaining) |
|     { |
|         immutable wchar unit = s[offset]; |
| |
|         if (unit >= 0xD800 && unit <= 0xDBFF) |
|             offset += 2; |
|         else |
|             offset += 1; |
|     } |
|     return offset; |
| } |
| |
| /** |
| * Given a UCS index n into the UTF-32 array s[], return the UTF index. |
| * |
| * The index is returned unchanged and s is not inspected. |
| */ |
| @safe @nogc pure nothrow |
| size_t toUTFindex(in dchar[] s, size_t n) |
| { |
| return n; |
| } |
| |
| /* =================== Decode ======================= */ |
| |
| /*************** |
| * Decodes and returns the character starting at s[idx]. On success idx is |
| * advanced past the decoded character. |
| * |
| * If the character is not well formed, the error is reported by calling |
| * onUnicodeError() with the index of the first byte of the sequence, and idx |
| * remains unchanged. (onUnicodeError() is only declared in this module; it is |
| * expected to throw rather than return.) |
| * |
| * Rejected input: a continuation byte in first position, 5 and 6 byte |
| * sequences, sequences running past the end of s, overlong encodings, |
| * trailing bytes that are not of the form 10xxxxxx, and results for which |
| * isValidDchar() is false. |
| * |
| * Params: |
| * s = UTF-8 text to decode from |
| * idx = index of the first byte to decode; must be less than s.length |
| * Returns: the decoded character |
| */ |
| @safe pure |
| dchar decode(in char[] s, ref size_t idx) |
| in |
| { |
| assert(idx >= 0 && idx < s.length); |
| } |
| out (result) |
| { |
| assert(isValidDchar(result)); |
| } |
| body |
| { |
| size_t len = s.length; |
| dchar V; |
| size_t i = idx; |
| char u = s[i]; |
| |
| if (u & 0x80) |
| { uint n; |
| char u2; |
| |
| /* The following encodings are valid, except for the 5 and 6 byte |
| * combinations: |
| * 0xxxxxxx |
| * 110xxxxx 10xxxxxx |
| * 1110xxxx 10xxxxxx 10xxxxxx |
| * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| * |
| * The loop below counts the leading 1 bits of the first byte: it stops |
| * at the first n for which bit (7 - n) of u is clear, so n ends up as |
| * the total length of the sequence in bytes. |
| * n == 1 means u is a 10xxxxxx continuation byte, which cannot start a |
| * sequence; n > 4 means a 5 or 6 byte form (or 0xFE / 0xFF). |
| */ |
| for (n = 1; ; n++) |
| { |
| if (n > 4) |
| goto Lerr; // only do the first 4 of 6 encodings |
| if (((u << n) & 0x80) == 0) |
| { |
| if (n == 1) |
| goto Lerr; |
| break; |
| } |
| } |
| |
| // Pick off (7 - n) significant bits of B from first byte of octet |
| V = cast(dchar)(u & ((1 << (7 - n)) - 1)); |
| |
| if (i + (n - 1) >= len) |
| goto Lerr; // off end of string |
| |
| /* The following combinations are overlong, and illegal: |
| * 1100000x (10xxxxxx) |
| * 11100000 100xxxxx (10xxxxxx) |
| * 11110000 1000xxxx (10xxxxxx 10xxxxxx) |
| * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) |
| * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) |
| * |
| * The first two bytes are enough to recognise them. Reading s[i + 1] |
| * is in bounds here because n >= 2 and the length check above passed. |
| * The 0xF8 and 0xFC tests cannot match, since sequences with n > 4 |
| * were already rejected by the loop above. |
| */ |
| u2 = s[i + 1]; |
| if ((u & 0xFE) == 0xC0 || |
| (u == 0xE0 && (u2 & 0xE0) == 0x80) || |
| (u == 0xF0 && (u2 & 0xF0) == 0x80) || |
| (u == 0xF8 && (u2 & 0xF8) == 0x80) || |
| (u == 0xFC && (u2 & 0xFC) == 0x80)) |
| goto Lerr; // overlong combination |
| |
| for (uint j = 1; j != n; j++) |
| { |
| u = s[i + j]; |
| if ((u & 0xC0) != 0x80) |
| goto Lerr; // trailing bytes are 10xxxxxx |
| V = (V << 6) | (u & 0x3F); |
| } |
| if (!isValidDchar(V)) |
| goto Lerr; |
| i += n; |
| } |
| else |
| { |
| V = cast(dchar) u; |
| i++; |
| } |
| |
| idx = i; |
| return V; |
| |
| Lerr: |
| onUnicodeError("invalid UTF-8 sequence", i); |
| return V; // dummy return |
| } |
| |
| unittest |
| { |
|     size_t i; |
|     dchar c; |
| |
|     debug(utf) printf("utf.decode.unittest\n"); |
| |
|     // ASCII: one byte per character, index advances by one each time |
|     static s1 = "abcd"c; |
|     i = 0; |
|     c = decode(s1, i); |
|     assert(c == cast(dchar)'a'); |
|     assert(i == 1); |
|     c = decode(s1, i); |
|     assert(c == cast(dchar)'b'); |
|     assert(i == 2); |
| |
|     // Two byte sequence |
|     static s2 = "\xC2\xA9"c; |
|     i = 0; |
|     c = decode(s2, i); |
|     assert(c == cast(dchar)'\u00A9'); |
|     assert(i == 2); |
| |
|     // Three byte sequence |
|     static s3 = "\xE2\x89\xA0"c; |
|     i = 0; |
|     c = decode(s3, i); |
|     assert(c == cast(dchar)'\u2260'); |
|     assert(i == 3); |
| |
|     // Malformed input: every entry must be rejected |
|     static s4 = |
|     [   "\xE2\x89"c[],                  // too short |
|         "\xC0\x8A", |
|         "\xE0\x80\x8A", |
|         "\xF0\x80\x80\x8A", |
|         "\xF8\x80\x80\x80\x8A", |
|         "\xFC\x80\x80\x80\x80\x8A", |
|     ]; |
| |
|     foreach (bad; s4) |
|     { |
|         // The "did it fail" check is kept outside the try block. Putting |
|         // assert(0) inside a try with catch (Throwable) would swallow the |
|         // assertion failure itself, and the test could never fail. |
|         bool rejected = false; |
| |
|         i = 0; |
|         try |
|         { |
|             c = decode(bad, i); |
|         } |
|         catch (Throwable o) |
|         { |
|             rejected = true; |
|         } |
|         assert(rejected); |
|         assert(i == 0);                 // idx must be left unchanged on error |
|     } |
| } |
| |
| /** |
| * Decodes and returns the character starting at s[idx] in a UTF-16 string. |
| * On success idx is advanced past the decoded character (by one or two code |
| * units). |
| * |
| * On malformed input onUnicodeError() is called with a message describing the |
| * problem and the index of the offending code unit; idx remains unchanged. |
| * Reported problems are: a high surrogate at the end of s, a high surrogate |
| * not followed by a low surrogate, a low surrogate on its own, and the values |
| * 0xFFFE and 0xFFFF. |
| */ |
| @safe pure |
| dchar decode(in wchar[] s, ref size_t idx) |
| in |
| { |
|     assert(idx >= 0 && idx < s.length); |
| } |
| out (result) |
| { |
|     assert(isValidDchar(result)); |
| } |
| body |
| { |
|     immutable size_t start = idx; |
|     uint code = s[start]; |
|     size_t next = start + 1;            // a single code unit unless proven otherwise |
|     string problem;                     // stays empty while the input is well formed |
| |
|     if (code >= 0xD800 && code <= 0xDBFF) |
|     { |
|         // High surrogate: a low surrogate has to follow |
|         if (start + 1 == s.length) |
|         { |
|             problem = "surrogate UTF-16 high value past end of string"; |
|         } |
|         else |
|         { |
|             immutable uint low = s[start + 1]; |
| |
|             if (low < 0xDC00 || low > 0xDFFF) |
|             { |
|                 problem = "surrogate UTF-16 low value out of range"; |
|             } |
|             else |
|             { |
|                 code = ((code - 0xD7C0) << 10) + (low - 0xDC00); |
|                 next = start + 2; |
|             } |
|         } |
|     } |
|     else if (code >= 0xDC00 && code <= 0xDFFF) |
|     { |
|         problem = "unpaired surrogate UTF-16 value"; |
|     } |
|     else if (code == 0xFFFE || code == 0xFFFF) |
|     { |
|         problem = "illegal UTF-16 value"; |
|     } |
| |
|     if (problem.length != 0) |
|     { |
|         onUnicodeError(problem, start); |
|         return cast(dchar)code;         // dummy return |
|     } |
| |
|     idx = next; |
|     return cast(dchar)code; |
| } |
| |
| /** |
| * Returns the character s[idx] of a UTF-32 string and advances idx by one. |
| * |
| * If isValidDchar() rejects the value, onUnicodeError() is called with the |
| * index of the offending element and idx remains unchanged. |
| */ |
| @safe pure |
| dchar decode(in dchar[] s, ref size_t idx) |
| in |
| { |
|     assert(idx >= 0 && idx < s.length); |
| } |
| body |
| { |
|     immutable size_t pos = idx; |
|     immutable dchar c = s[pos]; |
| |
|     if (isValidDchar(c)) |
|     { |
|         idx = pos + 1; |
|         return c; |
|     } |
| |
|     onUnicodeError("invalid UTF-32 value", pos); |
|     return c;                           // dummy return |
| } |
| |
| |
| /* =================== Encode ======================= */ |
| |
| /******************************* |
| * Encodes character c as UTF-8 and appends the resulting 1 to 4 bytes to |
| * array s[]. |
| * |
| * c must satisfy isValidDchar(); this is checked by the in contract. |
| */ |
| @safe pure nothrow |
| void encode(ref char[] s, dchar c) |
| in |
| { |
|     assert(isValidDchar(c)); |
| } |
| body |
| { |
|     // ASCII needs no staging buffer |
|     if (c <= 0x7F) |
|     { |
|         s ~= cast(char) c; |
|         return; |
|     } |
| |
|     char[4] units; |
|     size_t count; |
| |
|     if (c <= 0x7FF) |
|     { |
|         units[0] = cast(char)(0xC0 | (c >> 6)); |
|         units[1] = cast(char)(0x80 | (c & 0x3F)); |
|         count = 2; |
|     } |
|     else if (c <= 0xFFFF) |
|     { |
|         units[0] = cast(char)(0xE0 | (c >> 12)); |
|         units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); |
|         units[2] = cast(char)(0x80 | (c & 0x3F)); |
|         count = 3; |
|     } |
|     else if (c <= 0x10FFFF) |
|     { |
|         units[0] = cast(char)(0xF0 | (c >> 18)); |
|         units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); |
|         units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); |
|         units[3] = cast(char)(0x80 | (c & 0x3F)); |
|         count = 4; |
|     } |
|     else |
|     { |
|         assert(0); |
|     } |
| |
|     s ~= units[0 .. count]; |
| } |
| |
| unittest |
| { |
|     debug(utf) printf("utf.encode.unittest\n"); |
| |
|     // Characters appended one after another, and the buffer expected after |
|     // each step (1, 2 and 3 byte encodings respectively). |
|     immutable dchar[3] appended = ['a', '\u00A9', '\u2260']; |
|     immutable string[3] expected = |
|     [ |
|         "abcda", |
|         "abcda\xC2\xA9", |
|         "abcda\xC2\xA9\xE2\x89\xA0", |
|     ]; |
|     // Comparing against "abcda\u00A9" was disabled in the original test |
|     // with the remark "BUG: fix compiler"; it stays left out here. |
| |
|     char[] s = "abcd".dup; |
| |
|     foreach (k, ch; appended) |
|     { |
|         encode(s, ch); |
|         assert(s.length == expected[k].length); |
|         assert(s == expected[k]); |
|     } |
| } |
| |
| /** |
| * Encodes character c as UTF-16 and appends the resulting one or two code |
| * units to array s[]. |
| * |
| * c must satisfy isValidDchar(); this is checked by the in contract. |
| */ |
| @safe pure nothrow |
| void encode(ref wchar[] s, dchar c) |
| in |
| { |
|     assert(isValidDchar(c)); |
| } |
| body |
| { |
|     if (c > 0xFFFF) |
|     { |
|         // Outside the BMP: split the 20 bit offset into a surrogate pair |
|         immutable uint offset = c - 0x10000; |
|         wchar[2] pair; |
| |
|         pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800); |
|         pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00); |
|         s ~= pair; |
|     } |
|     else |
|     { |
|         s ~= cast(wchar) c; |
|     } |
| } |
| |
| /** |
| * Appends character c to the UTF-32 array s[]. |
| * |
| * c must satisfy isValidDchar(); this is checked by the in contract. |
| */ |
| @safe pure nothrow |
| void encode(ref dchar[] s, dchar c) |
| in |
| { |
| assert(isValidDchar(c)); |
| } |
| body |
| { |
| s ~= c; |
| } |
| |
| /** |
| Returns the number of code units needed to encode $(D c) when $(D C) is the |
| code unit type. The length is returned in code units, not in bytes. |
| |
| $(D C) is selected by size: 1 byte means UTF-8, 2 bytes UTF-16 and 4 bytes |
| UTF-32; any other size is rejected at compile time. For UTF-8 a value of |
| $(D c) above 0x10FFFF hits $(D assert(false)). |
| */ |
| @safe pure nothrow @nogc |
| ubyte codeLength(C)(dchar c) |
| { |
|     static if (C.sizeof == 1) |
|     { |
|         if (c > 0x10FFFF) |
|             assert(false); |
|         if (c > 0xFFFF) |
|             return 4; |
|         if (c > 0x7FF) |
|             return 3; |
|         if (c > 0x7F) |
|             return 2; |
|         return 1; |
|     } |
|     else static if (C.sizeof == 2) |
|     { |
|         if (c > 0xFFFF) |
|             return 2; |
|         return 1; |
|     } |
|     else |
|     { |
|         static assert(C.sizeof == 4); |
|         return 1; |
|     } |
| } |
| |
| /* =================== Validation ======================= */ |
| |
| /*********************************** |
| Checks to see if string is well formed or not. $(D S) can be an array |
| of $(D char), $(D wchar), or $(D dchar). |
| |
| The string is decoded from start to end with $(D decode), which reports |
| malformed input through $(D onUnicodeError). Use to check all untrusted |
| input for correctness. |
| */ |
| @safe pure |
| void validate(S)(in S s) |
| { |
|     size_t pos = 0; |
| |
|     // decode() moves pos forward past each character it accepts |
|     while (pos < s.length) |
|         decode(s, pos); |
| } |
| |
| /* =================== Conversion to UTF8 ======================= */ |
| |
| /** |
| * Encodes character c as UTF-8 into the caller supplied buffer buf. |
| * |
| * c must satisfy isValidDchar(); this is checked by the in contract. |
| * Up to 4 bytes are written starting at buf[0]. |
| * |
| * Returns: the slice of buf holding the encoded character (1 to 4 bytes) |
| */ |
| @safe pure nothrow @nogc |
| char[] toUTF8(char[] buf, dchar c) |
| in |
| { |
|     assert(isValidDchar(c)); |
| } |
| body |
| { |
|     if (c > 0x10FFFF) |
|         assert(0); |
| |
|     if (c > 0xFFFF) |
|     { |
|         buf[0] = cast(char)(0xF0 | (c >> 18)); |
|         buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); |
|         buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); |
|         buf[3] = cast(char)(0x80 | (c & 0x3F)); |
|         return buf[0 .. 4]; |
|     } |
| |
|     if (c > 0x7FF) |
|     { |
|         buf[0] = cast(char)(0xE0 | (c >> 12)); |
|         buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); |
|         buf[2] = cast(char)(0x80 | (c & 0x3F)); |
|         return buf[0 .. 3]; |
|     } |
| |
|     if (c > 0x7F) |
|     { |
|         buf[0] = cast(char)(0xC0 | (c >> 6)); |
|         buf[1] = cast(char)(0x80 | (c & 0x3F)); |
|         return buf[0 .. 2]; |
|     } |
| |
|     buf[0] = cast(char) c; |
|     return buf[0 .. 1]; |
| } |
| |
| /******************* |
| * Encodes string s into UTF-8 and returns the encoded string. |
| * |
| * The input is already UTF-8, so s itself is returned without copying. |
| * The in contract runs validate(s) on the input. |
| */ |
| @safe pure nothrow |
| string toUTF8(string s) |
| in |
| { |
| validate(s); |
| } |
| body |
| { |
| return s; |
| } |
| |
| /** |
| * Encodes the UTF-16 string s into UTF-8 and returns the encoded string. |
| * |
| * A leading run of ASCII code units is copied directly; from the first |
| * non-ASCII code unit on, the rest of s is decoded character by character |
| * and re-encoded with encode(). |
| */ |
| @trusted pure |
| string toUTF8(in wchar[] s) |
| { |
|     immutable size_t total = s.length; |
|     char[] result; |
| |
|     result.length = total; |
| |
|     // fast path for ascii |
|     size_t pos = 0; |
|     while (pos < total && s[pos] <= 0x7F) |
|     { |
|         result[pos] = cast(char) s[pos]; |
|         ++pos; |
|     } |
| |
|     if (pos < total) |
|     { |
|         // Keep the ASCII prefix and append the encoded remainder |
|         result.length = pos; |
|         foreach (dchar ch; s[pos .. total]) |
|         { |
|             encode(result, ch); |
|         } |
|     } |
|     return cast(string) result; |
| } |
| |
| /** |
| * Encodes the UTF-32 string s into UTF-8 and returns the encoded string. |
| * |
| * A leading run of ASCII characters is copied directly; from the first |
| * non-ASCII character on, the rest of s is appended with encode(). |
| */ |
| @trusted pure |
| string toUTF8(in dchar[] s) |
| { |
|     immutable size_t total = s.length; |
|     char[] result; |
| |
|     result.length = total; |
| |
|     // fast path for ascii |
|     size_t pos = 0; |
|     while (pos < total && s[pos] <= 0x7F) |
|     { |
|         result[pos] = cast(char) s[pos]; |
|         ++pos; |
|     } |
| |
|     if (pos < total) |
|     { |
|         // Keep the ASCII prefix and append the encoded remainder |
|         result.length = pos; |
|         foreach (dchar d; s[pos .. total]) |
|         { |
|             encode(result, d); |
|         } |
|     } |
|     return cast(string) result; |
| } |
| |
| /* =================== Conversion to UTF16 ======================= */ |
| |
| /** |
| * Encodes character c as UTF-16 into the caller supplied buffer buf. |
| * |
| * c must satisfy isValidDchar(); this is checked by the in contract. |
| * |
| * Returns: the slice of buf holding the encoded character: one code unit, |
| * or a surrogate pair for characters above 0xFFFF |
| */ |
| @safe pure nothrow @nogc |
| wchar[] toUTF16(wchar[] buf, dchar c) |
| in |
| { |
|     assert(isValidDchar(c)); |
| } |
| body |
| { |
|     if (c > 0xFFFF) |
|     { |
|         immutable uint offset = c - 0x10000; |
| |
|         buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800); |
|         buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00); |
|         return buf[0 .. 2]; |
|     } |
| |
|     buf[0] = cast(wchar) c; |
|     return buf[0 .. 1]; |
| } |
| |
| /**************** |
| * Encodes string s into UTF-16 and returns the encoded string. |
| * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take |
| * an LPWSTR or LPCWSTR argument. |
| * |
| * ASCII bytes are widened directly; everything else goes through decode() |
| * and encode(). |
| */ |
| @trusted pure |
| wstring toUTF16(in char[] s) |
| { |
|     immutable size_t total = s.length; |
|     wchar[] result; |
| |
|     // Size the array once, then empty it again before appending |
|     result.length = total; |
|     result.length = 0; |
| |
|     size_t pos = 0; |
|     while (pos < total) |
|     { |
|         immutable char lead = s[pos]; |
| |
|         if (lead > 0x7F) |
|         { |
|             // decode() advances pos past the whole sequence |
|             immutable dchar c = decode(s, pos); |
|             encode(result, c); |
|         } |
|         else |
|         { |
|             ++pos; |
|             result ~= cast(wchar) lead; |
|         } |
|     } |
|     return cast(wstring) result; |
| } |
| |
| alias const(wchar)* wptr; |
| /** |
| * Encodes string s into UTF-16, appends a terminating zero code unit and |
| * returns a pointer to the first code unit of the result. |
| * |
| * ASCII bytes are widened directly; everything else goes through decode() |
| * and encode(). |
| */ |
| @safe pure |
| wptr toUTF16z(in char[] s) |
| { |
|     immutable size_t total = s.length; |
|     wchar[] result; |
| |
|     // Size the array once (with room for the terminator), then empty it |
|     // again before appending |
|     result.length = total + 1; |
|     result.length = 0; |
| |
|     size_t pos = 0; |
|     while (pos < total) |
|     { |
|         immutable char lead = s[pos]; |
| |
|         if (lead > 0x7F) |
|         { |
|             // decode() advances pos past the whole sequence |
|             immutable dchar c = decode(s, pos); |
|             encode(result, c); |
|         } |
|         else |
|         { |
|             ++pos; |
|             result ~= cast(wchar) lead; |
|         } |
|     } |
| |
|     result ~= '\000'; |
|     return &result[0]; |
| } |
| |
| /** |
| * Encodes string s into UTF-16 and returns the encoded string. |
| * |
| * The input is already UTF-16, so s itself is returned without copying. |
| * The in contract runs validate(s) on the input. |
| */ |
| @safe pure nothrow |
| wstring toUTF16(wstring s) |
| in |
| { |
| validate(s); |
| } |
| body |
| { |
| return s; |
| } |
| |
| /** |
| * Encodes the UTF-32 string s into UTF-16 and returns the encoded string. |
| * |
| * Every element of s is passed to encode(). |
| */ |
| @trusted pure nothrow |
| wstring toUTF16(in dchar[] s) |
| { |
|     wchar[] result; |
| |
|     // Size the array once, then empty it again before appending |
|     result.length = s.length; |
|     result.length = 0; |
| |
|     foreach (dchar c; s) |
|     { |
|         encode(result, c); |
|     } |
|     return cast(wstring) result; |
| } |
| |
| /* =================== Conversion to UTF32 ======================= */ |
| |
| /***** |
| * Encodes string s into UTF-32 and returns the encoded string. |
| * |
| * ASCII bytes are widened directly; everything else goes through decode(). |
| */ |
| @trusted pure |
| dstring toUTF32(in char[] s) |
| { |
|     immutable size_t total = s.length; |
|     dchar[] result; |
| |
|     result.length = total;              // result will never be longer than s[] |
| |
|     size_t produced = 0; |
|     size_t pos = 0; |
|     while (pos < total) |
|     { |
|         dchar c = s[pos]; |
| |
|         if (c < 0x80) |
|             ++pos;                      // c is ascii, no need for decode |
|         else |
|             c = decode(s, pos); |
| |
|         result[produced] = c; |
|         ++produced; |
|     } |
|     return cast(dstring) result[0 .. produced]; |
| } |
| |
| /** |
| * Encodes the UTF-16 string s into UTF-32 and returns the encoded string. |
| * |
| * ASCII code units are widened directly; everything else goes through |
| * decode(). |
| */ |
| @trusted pure |
| dstring toUTF32(in wchar[] s) |
| { |
|     immutable size_t total = s.length; |
|     dchar[] result; |
| |
|     result.length = total;              // result will never be longer than s[] |
| |
|     size_t produced = 0; |
|     size_t pos = 0; |
|     while (pos < total) |
|     { |
|         dchar c = s[pos]; |
| |
|         if (c < 0x80) |
|             ++pos;                      // c is ascii, no need for decode |
|         else |
|             c = decode(s, pos); |
| |
|         result[produced] = c; |
|         ++produced; |
|     } |
|     return cast(dstring) result[0 .. produced]; |
| } |
| |
| /** |
| * Encodes string s into UTF-32 and returns the encoded string. |
| * |
| * The input is already UTF-32, so s itself is returned without copying. |
| * The in contract runs validate(s) on the input. |
| */ |
| @safe pure nothrow |
| dstring toUTF32(dstring s) |
| in |
| { |
| validate(s); |
| } |
| body |
| { |
| return s; |
| } |
| |
| /* ================================ tests ================================== */ |
| |
| unittest |
| { |
|     debug(utf) printf("utf.toUTF.unittest\n"); |
| |
|     // Converts src between all three encodings and compares every result |
|     // with the same text written as a literal of the target type. |
|     static void roundTrip(string src, wstring asUTF16, dstring asUTF32) |
|     { |
|         // starting from UTF-8 |
|         auto w = toUTF16(src); |
|         assert(w == asUTF16); |
|         auto d = toUTF32(src); |
|         assert(d == asUTF32); |
| |
|         // starting from UTF-16 |
|         auto c = toUTF8(w); |
|         assert(c == src); |
|         d = toUTF32(w); |
|         assert(d == asUTF32); |
| |
|         // starting from UTF-32 |
|         c = toUTF8(d); |
|         assert(c == src); |
|         w = toUTF16(d); |
|         assert(w == asUTF16); |
|     } |
| |
|     // ASCII only |
|     roundTrip("hello", "hello", "hello"); |
| |
|     // a character needing three bytes in UTF-8 |
|     roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o"); |
| |
|     // a character needing a surrogate pair in UTF-16 |
|     roundTrip("he\U000BAAAAllo", "he\U000BAAAAllo", "he\U000BAAAAllo"); |
| |
|     // single character conversion into a caller supplied buffer |
|     wchar[2] buf; |
|     auto ret = toUTF16(buf, '\U000BAAAA'); |
|     assert(ret == "\U000BAAAA"); |
| } |