libphobos/libdruntime/rt/util/utf.d - gcc - Git at Google

 /********************************************
  * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
  *
  * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
  * wchar type.
  * For Posix systems, the C wchar_t type is UTF-32 and corresponds to
  * the D utf.dchar type.
  *
  * UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
  *
  * See_Also:
  *      $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
  *      $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
  *      $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
  *
  * Copyright: Copyright Digital Mars 2003 - 2016.
  * License:   $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
  * Authors:   Walter Bright, Sean Kelly
  * Source:    $(DRUNTIMESRC src/rt/util/_utf.d)
  */

 module rt.util.utf;

 extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

 /*******************************
  * Tell whether c is accepted by this module as a UTF-32 character.
  *
  * Accepted values are everything below 0xD800 and everything from
  * 0xE000 through 0x10FFFF. \uFFFE and \uFFFF are accepted on purpose:
  * an application may use them internally, even though the Unicode
  * standard does not allow them for interchange.
  *
  * Returns: true if c is accepted, false if not.
  */

 @safe @nogc pure nothrow
 bool isValidDchar(dchar c)
 {
     // Everything below the 0xD800 .. 0xDFFF block is accepted outright.
     if (c < 0xD800)
         return true;

     // Otherwise c has to sit above that block and not exceed 0x10FFFF.
     // 0xFFFE and 0xFFFF are deliberately not filtered out here.
     return c > 0xDFFF && c <= 0x10FFFF;
 }

 // Smoke test for isValidDchar: one accepted value ('a') and one value
 // above 0x10FFFF that must be rejected.
 unittest
 {
     debug(utf) printf("utf.isValidDchar.unittest\n");
     assert(isValidDchar(cast(dchar)'a') == true);
     assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
 }


 /* Lookup table with 256 entries, indexed by the first byte of a UTF-8
  * sequence (see stride() and toUTFindex()):
  *   0x00 .. 0x7F  -> 1
  *   0x80 .. 0xBF  -> 0xFF  (marker: not the start of a sequence)
  *   0xC0 .. 0xDF  -> 2
  *   0xE0 .. 0xEF  -> 3
  *   0xF0 .. 0xF7  -> 4
  *   0xF8 .. 0xFB  -> 5
  *   0xFC .. 0xFD  -> 6
  *   0xFE .. 0xFF  -> 0xFF  (marker: not the start of a sequence)
  * Each source row below holds 16 entries.
  */
 static immutable UTF8stride =
 [
     cast(ubyte)
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
     4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
 ];

 /**
  * stride() looks up the length of the UTF-8 sequence whose first byte
  * is s[i].
  * Returns:
  *      The number of bytes in the UTF-8 sequence, or
  *      0xFF if s[i] is not the start of a UTF-8 sequence.
  */
 @safe @nogc pure nothrow
 uint stride(in char[] s, size_t i)
 {
     const char lead = s[i];
     return UTF8stride[lead];
 }

 /**
  * stride() returns the length, in UTF-16 code units, of the sequence
  * starting at index i in string s.
  * Returns: 2 if s[i] lies in 0xD800 .. 0xDBFF, otherwise 1.
  */
 @safe @nogc pure nothrow
 uint stride(in wchar[] s, size_t i)
 {
     const uint unit = s[i];
     // A unit in 0xD800 .. 0xDBFF is counted together with the unit after it.
     const bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;
     return leadsPair ? 2 : 1;
 }

 /**
  * stride() returns the length of a UTF-32 sequence starting at index i
  * in string s.
  * Neither s nor i is inspected by this overload.
  * Returns: The return value will always be 1.
  */
 @safe @nogc pure nothrow
 uint stride(in dchar[] s, size_t i)
 {
     return 1;
 }

 /*******************************************
  * Given an index i into an array of characters s[],
  * and assuming that index i is at the start of a UTF character,
  * determine the number of UCS characters up to that index i.
  *
  * The string is walked from index 0 one sequence at a time. An error is
  * reported through onUnicodeError when a byte that cannot start a UTF-8
  * sequence is met (stride() yields 0xFF), or when the walk steps over i,
  * meaning i was not at the start of a sequence.
  *
  * Returns: the number of sequences that start before index i.
  */
 @safe pure
 size_t toUCSindex(in char[] s, size_t i)
 {
     size_t n;
     size_t j;

     for (j = 0; j < i; )
     {
         immutable uint step = stride(s, j);

         // 0xFF is the "not a sequence start" marker, not a length; adding
         // it to j would skip unrelated bytes and misreport the position.
         if (step == 0xFF)
             onUnicodeError("invalid UTF-8 sequence", j);
         j += step;
         n++;
     }
     if (j > i)
     {
         // The last sequence straddles i.
         onUnicodeError("invalid UTF-8 sequence", j);
     }
     return n;
 }

 /**
  * UTF-16 overload: counts the sequences of s[] that start before code
  * unit index i. If the walk steps over i (i falls inside a two-unit
  * sequence), the error is reported through onUnicodeError.
  */
 @safe pure
 size_t toUCSindex(in wchar[] s, size_t i)
 {
     size_t count = 0;
     size_t pos = 0;

     while (pos < i)
     {
         pos += stride(s, pos);
         ++count;
     }

     if (pos > i)
         onUnicodeError("invalid UTF-16 sequence", pos);

     return count;
 }

 /**
  * UTF-32 overload: the index i is returned unchanged; s is not inspected.
  */
 @safe @nogc pure nothrow
 size_t toUCSindex(in dchar[] s, size_t i)
 {
     return i;
 }

 /******************************************
  * Given a UCS index n into an array of characters s[], return the UTF index.
  *
  * Walks n sequences from the start of s[], using the UTF8stride table on
  * the first byte of each. A first byte whose table entry is 0xFF is
  * reported through onUnicodeError.
  */
 @safe pure
 size_t toUTFindex(in char[] s, size_t n)
 {
     size_t pos = 0;

     for (size_t remaining = n; remaining != 0; --remaining)
     {
         const uint len = UTF8stride[s[pos]];
         if (len == 0xFF)
             onUnicodeError("invalid UTF-8 sequence", pos);
         pos += len;
     }
     return pos;
 }

 /**
  * UTF-16 overload: returns the code unit index reached after walking n
  * sequences from the start of s[]. A unit in 0xD800 .. 0xDBFF advances
  * the index by two, any other unit by one.
  */
 @safe @nogc pure nothrow
 size_t toUTFindex(in wchar[] s, size_t n)
 {
     size_t pos = 0;

     for (size_t left = n; left != 0; --left)
     {
         const wchar unit = s[pos];
         const bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;

         pos += leadsPair ? 2 : 1;
     }
     return pos;
 }

 /**
  * UTF-32 overload: the index n is returned unchanged; s is not inspected.
  */
 @safe @nogc pure nothrow
 size_t toUTFindex(in dchar[] s, size_t n)
 {
     return n;
 }

 /* =================== Decode ======================= */

 /***************
  * Decodes and returns character starting at s[idx]. idx is advanced past the
  * decoded character. If the character is not well formed, the error is
  * reported through onUnicodeError and idx remains unchanged (idx is only
  * written on the success path).
  *
  * NOTE(review): this comment used to say a UtfException is thrown; the
  * actual exception type is decided by onUnicodeError, which is only declared
  * in this module - confirm against its definition.
  *
  * Params:
  *      s   = UTF-8 input
  *      idx = index of the first byte of the sequence; must be below s.length
  * Returns: the decoded code point, which satisfies isValidDchar.
  */
 @safe pure
 dchar decode(in char[] s, ref size_t idx)
     in
     {
         assert(idx >= 0 && idx < s.length);
     }
     out (result)
     {
         assert(isValidDchar(result));
     }
     body
     {
         size_t len = s.length;
         dchar V;
         size_t i = idx;
         char u = s[i];

         if (u & 0x80)
         {   uint n;
             char u2;

             /* The following encodings are valid, except for the 5 and 6 byte
              * combinations:
              *  0xxxxxxx
              *  110xxxxx 10xxxxxx
              *  1110xxxx 10xxxxxx 10xxxxxx
              *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
              *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
              *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
              */
             // Count the leading 1 bits of the first byte; that count is the
             // sequence length n. n == 1 means a 10xxxxxx byte was found where
             // a sequence should start. After the loop n is 2, 3 or 4.
             for (n = 1; ; n++)
             {
                 if (n > 4)
                     goto Lerr;          // only do the first 4 of 6 encodings
                 if (((u << n) & 0x80) == 0)
                 {
                     if (n == 1)
                         goto Lerr;
                     break;
                 }
             }

             // Pick off (7 - n) significant bits of B from first byte of octet
             V = cast(dchar)(u & ((1 << (7 - n)) - 1));

             if (i + (n - 1) >= len)
                 goto Lerr;                      // off end of string

             /* The following combinations are overlong, and illegal:
              *  1100000x (10xxxxxx)
              *  11100000 100xxxxx (10xxxxxx)
              *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
              *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
              *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
              */
             // Lead bytes 0xF8 and 0xFC give n > 4 and were already rejected
             // by the loop above, so the last two clauses cannot match here.
             u2 = s[i + 1];
             if ((u & 0xFE) == 0xC0 ||
                 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
                 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
                 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
                 (u == 0xFC && (u2 & 0xFC) == 0x80))
                 goto Lerr;                      // overlong combination

             // Fold in 6 payload bits from each of the n - 1 following bytes.
             for (uint j = 1; j != n; j++)
             {
                 u = s[i + j];
                 if ((u & 0xC0) != 0x80)
                     goto Lerr;                  // trailing bytes are 10xxxxxx
                 V = (V << 6) | (u & 0x3F);
             }
             // Rejects values in 0xD800 .. 0xDFFF and values above 0x10FFFF.
             if (!isValidDchar(V))
                 goto Lerr;
             i += n;
         }
         else
         {
             // High bit clear: the byte is the code point.
             V = cast(dchar) u;
             i++;
         }

         idx = i;
         return V;

       // On every error path i still equals the idx passed in, so the start
       // of the offending sequence is what gets reported.
       Lerr:
       onUnicodeError("invalid UTF-8 sequence", i);
     return V; // dummy return
     }

 // Tests decode() on UTF-8 input: well-formed 1, 2 and 3 byte sequences,
 // then a list of malformed inputs that must each be rejected.
 unittest
 {   size_t i;
     dchar c;

     debug(utf) printf("utf.decode.unittest\n");

     static s1 = "abcd"c;
     i = 0;
     c = decode(s1, i);
     assert(c == cast(dchar)'a');
     assert(i == 1);
     c = decode(s1, i);
     assert(c == cast(dchar)'b');
     assert(i == 2);

     static s2 = "\xC2\xA9"c;
     i = 0;
     c = decode(s2, i);
     assert(c == cast(dchar)'\u00A9');
     assert(i == 2);

     static s3 = "\xE2\x89\xA0"c;
     i = 0;
     c = decode(s3, i);
     assert(c == cast(dchar)'\u2260');
     assert(i == 3);

     static s4 =
     [   "\xE2\x89"c[],          // too short
         "\xC0\x8A",
         "\xE0\x80\x8A",
         "\xF0\x80\x80\x8A",
         "\xF8\x80\x80\x80\x8A",
         "\xFC\x80\x80\x80\x80\x8A",
     ];

     for (int j = 0; j < s4.length; j++)
     {
         // The "did not throw" check must live outside the try block:
         // an assert placed inside it would be swallowed by the
         // catch (Throwable) below and the test could never fail.
         bool rejected = false;

         i = 0;
         try
         {
             c = decode(s4[j], i);
         }
         catch (Throwable o)
         {
             rejected = true;
         }
         assert(rejected);
         assert(i == 0);         // idx is left unchanged on error
     }
 }

 /**
  * Decodes and returns the code point starting at s[idx] in UTF-16 input.
  * On success idx is advanced by the one or two code units consumed.
  *
  * The following are reported through onUnicodeError, leaving idx unchanged:
  * a unit in 0xD800 .. 0xDBFF that is the last unit of s, or that is not
  * followed by a unit in 0xDC00 .. 0xDFFF; a unit in 0xDC00 .. 0xDFFF on
  * its own; and the values 0xFFFE and 0xFFFF.
  *
  * Params:
  *      s   = UTF-16 input
  *      idx = index of the first code unit; must be below s.length
  * Returns: the decoded code point, which satisfies isValidDchar.
  */
 @safe pure
 dchar decode(in wchar[] s, ref size_t idx)
     in
     {
         assert(idx >= 0 && idx < s.length);
     }
     out (result)
     {
         assert(isValidDchar(result));
     }
     body
     {
         string msg;
         size_t i = idx;
         uint u = s[i];

         if (u & ~0x7F)
         {   if (u >= 0xD800 && u <= 0xDBFF)
             {   uint u2;

                 if (i + 1 == s.length)
                 {   msg = "surrogate UTF-16 high value past end of string";
                     goto Lerr;
                 }
                 u2 = s[i + 1];
                 if (u2 < 0xDC00 || u2 > 0xDFFF)
                 {   msg = "surrogate UTF-16 low value out of range";
                     goto Lerr;
                 }
                 // 0xD7C0 is 0xD800 - 0x40, so the shift yields
                 // ((u - 0xD800) << 10) + 0x10000 in a single step.
                 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
                 i += 2;
             }
             else if (u >= 0xDC00 && u <= 0xDFFF)
             {   msg = "unpaired surrogate UTF-16 value";
                 goto Lerr;
             }
             else if (u == 0xFFFE || u == 0xFFFF)
             {   msg = "illegal UTF-16 value";
                 goto Lerr;
             }
             else
                 i++;
         }
         else
         {
             // Values up to 0x7F need no further checks.
             i++;
         }

         idx = i;
         return cast(dchar)u;

       // i still equals the idx passed in on every error path.
       Lerr:
           onUnicodeError(msg, i);
         return cast(dchar)u; // dummy return
     }

 /**
  * UTF-32 overload: returns s[idx] and advances idx by one when the value
  * satisfies isValidDchar. Otherwise the error is reported through
  * onUnicodeError and idx is left unchanged.
  */
 @safe pure
 dchar decode(in dchar[] s, ref size_t idx)
     in
     {
         assert(idx >= 0 && idx < s.length);
     }
     body
     {
         const size_t pos = idx;
         const dchar c = s[pos];

         if (isValidDchar(c))
         {
             idx = pos + 1;
             return c;
         }

         onUnicodeError("invalid UTF-32 value", pos);
         return c; // dummy return
     }


 /* =================== Encode ======================= */

 /*******************************
  * Encodes character c as UTF-8 and appends the resulting 1 to 4 bytes
  * to array s[]. c must satisfy isValidDchar (checked by the in contract).
  */
 @safe pure nothrow
 void encode(ref char[] s, dchar c)
     in
     {
         assert(isValidDchar(c));
     }
     body
     {
         // Values up to 0x7F are appended as a single byte.
         if (c <= 0x7F)
         {
             s ~= cast(char) c;
             return;
         }

         char[4] units;
         size_t count;

         if (c <= 0x7FF)
         {
             units[0] = cast(char)(0xC0 | (c >> 6));
             units[1] = cast(char)(0x80 | (c & 0x3F));
             count = 2;
         }
         else if (c <= 0xFFFF)
         {
             units[0] = cast(char)(0xE0 | (c >> 12));
             units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             units[2] = cast(char)(0x80 | (c & 0x3F));
             count = 3;
         }
         else if (c <= 0x10FFFF)
         {
             units[0] = cast(char)(0xF0 | (c >> 18));
             units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
             units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             units[3] = cast(char)(0x80 | (c & 0x3F));
             count = 4;
         }
         else
         {
             assert(0);
         }

         s ~= units[0 .. count];
     }

 // Tests the UTF-8 encode(): appends a 1-byte, a 2-byte and a 3-byte
 // character in turn and checks the growing buffer after each step.
 unittest
 {
     debug(utf) printf("utf.encode.unittest\n");

     char[] s = "abcd".dup;
     encode(s, cast(dchar)'a');
     assert(s.length == 5);
     assert(s == "abcda");

     encode(s, cast(dchar)'\u00A9');
     assert(s.length == 7);
     assert(s == "abcda\xC2\xA9");
     //assert(s == "abcda\u00A9");       // BUG: fix compiler

     encode(s, cast(dchar)'\u2260');
     assert(s.length == 10);
     assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
 }

 /**
  * UTF-16 overload: appends c to s[] as one code unit when c <= 0xFFFF,
  * otherwise as a pair of code units (0xD800-based, then 0xDC00-based).
  * c must satisfy isValidDchar (checked by the in contract).
  */
 @safe pure nothrow
 void encode(ref wchar[] s, dchar c)
     in
     {
         assert(isValidDchar(c));
     }
     body
     {
         if (c <= 0xFFFF)
         {
             s ~= cast(wchar) c;
             return;
         }

         // Split the 20 bits above 0x10000 into two 10-bit halves.
         const uint offset = c - 0x10000;
         wchar[2] pair;

         pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
         pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
         s ~= pair;
     }

 /**
  * UTF-32 overload: appends c to s[] unchanged.
  * c must satisfy isValidDchar (checked by the in contract).
  */
 @safe pure nothrow
 void encode(ref dchar[] s, dchar c)
     in
     {
         assert(isValidDchar(c));
     }
     body
     {
         s ~= c;
     }

 /**
 Returns the number of code units of type $(D C) needed to encode $(D c).
 The result counts code units (characters of type $(D C)), not bytes.
 $(D C) is selected by size: 1 byte gives 1 to 4, 2 bytes gives 1 or 2,
 4 bytes always gives 1.
  */
 @safe pure nothrow @nogc
 ubyte codeLength(C)(dchar c)
 {
     static if (C.sizeof == 1)
     {
         // Values above 0x10FFFF have no encoding.
         if (c > 0x10FFFF)
             assert(false);
         if (c > 0xFFFF)
             return 4;
         if (c > 0x7FF)
             return 3;
         if (c > 0x7F)
             return 2;
         return 1;
     }
     else static if (C.sizeof == 2)
     {
         if (c <= 0xFFFF)
             return 1;
         return 2;
     }
     else
     {
         static assert(C.sizeof == 4);
         return 1;
     }
 }

 /* =================== Validation ======================= */

 /***********************************
 Checks whether string s is well formed by running decode() over it from
  start to end. $(D S) can be an array of $(D char), $(D wchar), or
  $(D dchar). Any error is raised by decode() through onUnicodeError.
  Use to check all untrusted input for correctness.
  */
 @safe pure
 void validate(S)(in S s)
 {
     immutable size_t end = s.length;
     size_t pos = 0;

     // decode() advances pos past each sequence it accepts.
     while (pos < end)
         decode(s, pos);
 }

 /* =================== Conversion to UTF8 ======================= */

 /**
  * Encodes c as UTF-8 into the caller's buffer buf and returns the slice
  * of buf that was written (1 to 4 bytes). c must satisfy isValidDchar
  * (checked by the in contract); buf must be long enough for the encoding.
  */
 @safe pure nothrow @nogc
 char[] toUTF8(char[] buf, dchar c)
     in
     {
         assert(isValidDchar(c));
     }
     body
     {
         // One byte: 0xxxxxxx
         if (c <= 0x7F)
         {
             buf[0] = cast(char) c;
             return buf[0 .. 1];
         }

         // Two bytes: 110xxxxx 10xxxxxx
         if (c <= 0x7FF)
         {
             buf[0] = cast(char)(0xC0 | (c >> 6));
             buf[1] = cast(char)(0x80 | (c & 0x3F));
             return buf[0 .. 2];
         }

         // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
         if (c <= 0xFFFF)
         {
             buf[0] = cast(char)(0xE0 | (c >> 12));
             buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             buf[2] = cast(char)(0x80 | (c & 0x3F));
             return buf[0 .. 3];
         }

         // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         if (c <= 0x10FFFF)
         {
             buf[0] = cast(char)(0xF0 | (c >> 18));
             buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
             buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             buf[3] = cast(char)(0x80 | (c & 0x3F));
             return buf[0 .. 4];
         }

         assert(0);
     }

 /*******************
  * Encodes string s into UTF-8 and returns the encoded string.
  *
  * For a string argument nothing needs converting: s itself is returned.
  * The in contract runs validate(s) on the input.
  */
 @safe pure nothrow
 string toUTF8(string s)
     in
     {
         validate(s);
     }
     body
     {
         return s;
     }

 /**
  * Converts UTF-16 input to a newly allocated UTF-8 string.
  * Units up to 0x7F are copied one for one; from the first larger unit
  * onwards the rest of s is iterated as dchar and appended with encode().
  */
 @trusted pure
 string toUTF8(in wchar[] s)
 {
     char[] result;

     result.length = s.length;

     foreach (pos, wchar unit; s)
     {
         if (unit > 0x7F)
         {
             // Leave the fast path: keep what was copied so far and
             // encode everything from here on.
             result.length = pos;
             foreach (dchar ch; s[pos .. $])
             {
                 encode(result, ch);
             }
             break;
         }
         result[pos] = cast(char) unit;  // fast path for ascii
     }
     return cast(string) result;
 }

 /**
  * Converts UTF-32 input to a newly allocated UTF-8 string.
  * Values up to 0x7F are copied one for one; from the first larger value
  * onwards the rest of s is appended with encode().
  */
 @trusted pure
 string toUTF8(in dchar[] s)
 {
     char[] result;

     result.length = s.length;

     foreach (pos, dchar c; s)
     {
         if (c > 0x7F)
         {
             // Leave the fast path: keep what was copied so far and
             // encode everything from here on.
             result.length = pos;
             foreach (dchar d; s[pos .. $])
             {
                 encode(result, d);
             }
             break;
         }
         result[pos] = cast(char) c;     // fast path for ascii
     }
     return cast(string) result;
 }

 /* =================== Conversion to UTF16 ======================= */

 /**
  * Encodes c as UTF-16 into the caller's buffer buf and returns the slice
  * of buf that was written (1 or 2 code units). c must satisfy isValidDchar
  * (checked by the in contract); buf must be long enough for the encoding.
  */
 @safe pure nothrow @nogc
 wchar[] toUTF16(wchar[] buf, dchar c)
     in
     {
         assert(isValidDchar(c));
     }
     body
     {
         if (c > 0xFFFF)
         {
             // Split the 20 bits above 0x10000 into two 10-bit halves.
             const uint offset = c - 0x10000;

             buf[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
             buf[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
             return buf[0 .. 2];
         }

         buf[0] = cast(wchar) c;
         return buf[0 .. 1];
     }

 /****************
  * Encodes string s into UTF-16 and returns the encoded string.
  * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
  * an LPWSTR or LPCWSTR argument.
  *
  * Bytes up to 0x7F are appended directly; anything else goes through
  * decode() and encode().
  */
 @trusted pure
 wstring toUTF16(in char[] s)
 {
     wchar[] r;
     const size_t slen = s.length;

     r.length = slen;
     r.length = 0;

     size_t pos = 0;
     while (pos < slen)
     {
         const char unit = s[pos];

         if (unit > 0x7F)
         {
             // decode() advances pos past the whole sequence.
             const dchar c = decode(s, pos);
             encode(r, c);
             continue;
         }

         ++pos;
         r ~= cast(wchar) unit;
     }
     return cast(wstring) r;
 }

 alias const(wchar)* wptr;
 /**
  * Encodes string s into UTF-16, appends a terminating zero code unit and
  * returns a pointer to the first code unit of the result.
  */
 @safe pure
 wptr toUTF16z(in char[] s)
 {
     wchar[] r;
     const size_t slen = s.length;

     r.length = slen + 1;
     r.length = 0;

     size_t pos = 0;
     while (pos < slen)
     {
         const char unit = s[pos];

         if (unit > 0x7F)
         {
             // decode() advances pos past the whole sequence.
             const dchar c = decode(s, pos);
             encode(r, c);
             continue;
         }

         ++pos;
         r ~= cast(wchar) unit;
     }

     r ~= '\000';
     return &r[0];
 }

 /**
  * wstring overload: nothing needs converting, so s itself is returned.
  * The in contract runs validate(s) on the input.
  */
 @safe pure nothrow
 wstring toUTF16(wstring s)
     in
     {
         validate(s);
     }
     body
     {
         return s;
     }

 /**
  * Converts UTF-32 input to a newly allocated UTF-16 string by passing
  * every element of s to encode().
  */
 @trusted pure nothrow
 wstring toUTF16(in dchar[] s)
 {
     wchar[] r;

     r.length = s.length;
     r.length = 0;

     foreach (dchar c; s)
     {
         encode(r, c);
     }
     return cast(wstring) r;
 }

 /* =================== Conversion to UTF32 ======================= */

 /*****
  * Encodes string s into UTF-32 and returns the encoded string.
  *
  * Bytes below 0x80 are copied directly; anything else goes through
  * decode(). The result is a slice of a buffer sized to s.length.
  */
 @trusted pure
 dstring toUTF32(in char[] s)
 {
     dchar[] r;
     const size_t slen = s.length;

     r.length = slen;            // r[] will never be longer than s[]

     size_t written = 0;
     size_t pos = 0;
     while (pos < slen)
     {
         dchar c = s[pos];

         if (c < 0x80)
             ++pos;              // c is ascii, no need for decode
         else
             c = decode(s, pos); // advances pos past the sequence

         r[written] = c;
         ++written;
     }
     return cast(dstring) r[0 .. written];
 }

 /**
  * Converts UTF-16 input to a UTF-32 string.
  * Units below 0x80 are copied directly; anything else goes through
  * decode(). The result is a slice of a buffer sized to s.length.
  */
 @trusted pure
 dstring toUTF32(in wchar[] s)
 {
     dchar[] r;
     const size_t slen = s.length;

     r.length = slen;            // r[] will never be longer than s[]

     size_t written = 0;
     size_t pos = 0;
     while (pos < slen)
     {
         dchar c = s[pos];

         if (c < 0x80)
             ++pos;              // c is ascii, no need for decode
         else
             c = decode(s, pos); // advances pos past the sequence

         r[written] = c;
         ++written;
     }
     return cast(dstring) r[0 .. written];
 }

 /**
  * dstring overload: nothing needs converting, so s itself is returned.
  * The in contract runs validate(s) on the input.
  */
 @safe pure nothrow
 dstring toUTF32(dstring s)
     in
     {
         validate(s);
     }
     body
     {
         return s;
     }

 /* ================================ tests ================================== */

 // Round-trip tests for toUTF8 / toUTF16 / toUTF32. Each group starts from a
 // UTF-8 literal, converts it to the other two encodings, then converts
 // those results back and across, comparing against the same literal.
 unittest
 {
     debug(utf) printf("utf.toUTF.unittest\n");

     // Group 1: ASCII only.
     auto c = "hello"c[];
     auto w = toUTF16(c);
     assert(w == "hello");
     auto d = toUTF32(c);
     assert(d == "hello");

     c = toUTF8(w);
     assert(c == "hello");
     d = toUTF32(w);
     assert(d == "hello");

     c = toUTF8(d);
     assert(c == "hello");
     w = toUTF16(d);
     assert(w == "hello");


     // Group 2: contains \u1234 (three UTF-8 bytes, one UTF-16 unit).
     c = "hel\u1234o";
     w = toUTF16(c);
     assert(w == "hel\u1234o");
     d = toUTF32(c);
     assert(d == "hel\u1234o");

     c = toUTF8(w);
     assert(c == "hel\u1234o");
     d = toUTF32(w);
     assert(d == "hel\u1234o");

     c = toUTF8(d);
     assert(c == "hel\u1234o");
     w = toUTF16(d);
     assert(w == "hel\u1234o");


     // Group 3: contains \U000BAAAA (four UTF-8 bytes, two UTF-16 units).
     c = "he\U000BAAAAllo";
     w = toUTF16(c);
     //foreach (wchar c; w) printf("c = x%x\n", c);
     //foreach (wchar c; cast(wstring)"he\U000BAAAAllo") printf("c = x%x\n", c);
     assert(w == "he\U000BAAAAllo");
     d = toUTF32(c);
     assert(d == "he\U000BAAAAllo");

     c = toUTF8(w);
     assert(c == "he\U000BAAAAllo");
     d = toUTF32(w);
     assert(d == "he\U000BAAAAllo");

     c = toUTF8(d);
     assert(c == "he\U000BAAAAllo");
     w = toUTF16(d);
     assert(w == "he\U000BAAAAllo");

     // Buffer-based single-character overload of toUTF16.
     wchar[2] buf;
     auto ret = toUTF16(buf, '\U000BAAAA');
     assert(ret == "\U000BAAAA");
 }
	/********************************************
	* Encode and decode UTF-8, UTF-16 and UTF-32 strings.
	*
	* For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
	* wchar type.
	* For Posix systems, the C wchar_t type is UTF-32 and corresponds to
	* the D utf.dchar type.
	*
	* UTF character support is restricted to (\u0000 <= character <= \U0010FFFF).
	*
	* See_Also:
	* $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
	* $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
	* $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
	*
	* Copyright: Copyright Digital Mars 2003 - 2016.
	* License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
	* Authors: Walter Bright, Sean Kelly
	* Source: $(DRUNTIMESRC src/rt/util/_utf.d)
	*/

	module rt.util.utf;

	extern (C) void onUnicodeError( string msg, size_t idx, string file = __FILE__, size_t line = __LINE__ ) @safe pure;

	/*******************************
	 * Tell whether c is accepted by this module as a UTF-32 character.
	 *
	 * Accepted values are everything below 0xD800 and everything from
	 * 0xE000 through 0x10FFFF. \uFFFE and \uFFFF are accepted on purpose:
	 * an application may use them internally, even though the Unicode
	 * standard does not allow them for interchange.
	 *
	 * Returns: true if c is accepted, false if not.
	 */

	@safe @nogc pure nothrow
	bool isValidDchar(dchar c)
	{
	    // Everything below the 0xD800 .. 0xDFFF block is accepted outright.
	    if (c < 0xD800)
	        return true;

	    // Otherwise c has to sit above that block and not exceed 0x10FFFF.
	    // 0xFFFE and 0xFFFF are deliberately not filtered out here.
	    return c > 0xDFFF && c <= 0x10FFFF;
	}

	// Smoke test for isValidDchar: one accepted value ('a') and one value
	// above 0x10FFFF that must be rejected.
	unittest
	{
	debug(utf) printf("utf.isValidDchar.unittest\n");
	assert(isValidDchar(cast(dchar)'a') == true);
	assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
	}



	/* Lookup table with 256 entries, indexed by the first byte of a UTF-8
	 * sequence (see stride() and toUTFindex()):
	 *   0x00 .. 0x7F  -> 1
	 *   0x80 .. 0xBF  -> 0xFF  (marker: not the start of a sequence)
	 *   0xC0 .. 0xDF  -> 2
	 *   0xE0 .. 0xEF  -> 3
	 *   0xF0 .. 0xF7  -> 4
	 *   0xF8 .. 0xFB  -> 5
	 *   0xFC .. 0xFD  -> 6
	 *   0xFE .. 0xFF  -> 0xFF  (marker: not the start of a sequence)
	 * Each source row below holds 16 entries.
	 */
	static immutable UTF8stride =
	[
	cast(ubyte)
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
	0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
	4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
	];

	/**
	 * stride() looks up the length of the UTF-8 sequence whose first byte
	 * is s[i].
	 * Returns:
	 *      The number of bytes in the UTF-8 sequence, or
	 *      0xFF if s[i] is not the start of a UTF-8 sequence.
	 */
	@safe @nogc pure nothrow
	uint stride(in char[] s, size_t i)
	{
	    const char lead = s[i];
	    return UTF8stride[lead];
	}

	/**
	 * stride() returns the length, in UTF-16 code units, of the sequence
	 * starting at index i in string s.
	 * Returns: 2 if s[i] lies in 0xD800 .. 0xDBFF, otherwise 1.
	 */
	@safe @nogc pure nothrow
	uint stride(in wchar[] s, size_t i)
	{
	    const uint unit = s[i];
	    // A unit in 0xD800 .. 0xDBFF is counted together with the unit after it.
	    const bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;
	    return leadsPair ? 2 : 1;
	}

	/**
	* stride() returns the length of a UTF-32 sequence starting at index i
	* in string s.
	* Neither s nor i is inspected by this overload.
	* Returns: The return value will always be 1.
	*/
	@safe @nogc pure nothrow
	uint stride(in dchar[] s, size_t i)
	{
	return 1;
	}

	/*******************************************
	 * Given an index i into an array of characters s[],
	 * and assuming that index i is at the start of a UTF character,
	 * determine the number of UCS characters up to that index i.
	 *
	 * The string is walked from index 0 one sequence at a time. An error is
	 * reported through onUnicodeError when a byte that cannot start a UTF-8
	 * sequence is met (stride() yields 0xFF), or when the walk steps over i,
	 * meaning i was not at the start of a sequence.
	 *
	 * Returns: the number of sequences that start before index i.
	 */
	@safe pure
	size_t toUCSindex(in char[] s, size_t i)
	{
	    size_t n;
	    size_t j;

	    for (j = 0; j < i; )
	    {
	        immutable uint step = stride(s, j);

	        // 0xFF is the "not a sequence start" marker, not a length; adding
	        // it to j would skip unrelated bytes and misreport the position.
	        if (step == 0xFF)
	            onUnicodeError("invalid UTF-8 sequence", j);
	        j += step;
	        n++;
	    }
	    if (j > i)
	    {
	        // The last sequence straddles i.
	        onUnicodeError("invalid UTF-8 sequence", j);
	    }
	    return n;
	}

	/**
	 * UTF-16 overload: counts the sequences of s[] that start before code
	 * unit index i. If the walk steps over i (i falls inside a two-unit
	 * sequence), the error is reported through onUnicodeError.
	 */
	@safe pure
	size_t toUCSindex(in wchar[] s, size_t i)
	{
	    size_t count = 0;
	    size_t pos = 0;

	    while (pos < i)
	    {
	        pos += stride(s, pos);
	        ++count;
	    }

	    if (pos > i)
	        onUnicodeError("invalid UTF-16 sequence", pos);

	    return count;
	}

	/**
	 * UTF-32 overload: the index i is returned unchanged; s is not inspected.
	 */
	@safe @nogc pure nothrow
	size_t toUCSindex(in dchar[] s, size_t i)
	{
	return i;
	}

	/******************************************
	 * Given a UCS index n into an array of characters s[], return the UTF index.
	 *
	 * Walks n sequences from the start of s[], using the UTF8stride table on
	 * the first byte of each. A first byte whose table entry is 0xFF is
	 * reported through onUnicodeError.
	 */
	@safe pure
	size_t toUTFindex(in char[] s, size_t n)
	{
	    size_t pos = 0;

	    for (size_t remaining = n; remaining != 0; --remaining)
	    {
	        const uint len = UTF8stride[s[pos]];
	        if (len == 0xFF)
	            onUnicodeError("invalid UTF-8 sequence", pos);
	        pos += len;
	    }
	    return pos;
	}

	/**
	 * UTF-16 overload: returns the code unit index reached after walking n
	 * sequences from the start of s[]. A unit in 0xD800 .. 0xDBFF advances
	 * the index by two, any other unit by one.
	 */
	@safe @nogc pure nothrow
	size_t toUTFindex(in wchar[] s, size_t n)
	{
	    size_t pos = 0;

	    for (size_t left = n; left != 0; --left)
	    {
	        const wchar unit = s[pos];
	        const bool leadsPair = unit >= 0xD800 && unit <= 0xDBFF;

	        pos += leadsPair ? 2 : 1;
	    }
	    return pos;
	}

	/**
	 * UTF-32 overload: the index n is returned unchanged; s is not inspected.
	 */
	@safe @nogc pure nothrow
	size_t toUTFindex(in dchar[] s, size_t n)
	{
	return n;
	}

	/* =================== Decode ======================= */

	/***************
	 * Decodes and returns character starting at s[idx]. idx is advanced past the
	 * decoded character. If the character is not well formed, the error is
	 * reported through onUnicodeError and idx remains unchanged (idx is only
	 * written on the success path).
	 *
	 * NOTE(review): this comment used to say a UtfException is thrown; the
	 * actual exception type is decided by onUnicodeError, which is only declared
	 * in this module - confirm against its definition.
	 *
	 * Params:
	 *      s   = UTF-8 input
	 *      idx = index of the first byte of the sequence; must be below s.length
	 * Returns: the decoded code point, which satisfies isValidDchar.
	 */
	@safe pure
	dchar decode(in char[] s, ref size_t idx)
	    in
	    {
	        assert(idx >= 0 && idx < s.length);
	    }
	    out (result)
	    {
	        assert(isValidDchar(result));
	    }
	    body
	    {
	        size_t len = s.length;
	        dchar V;
	        size_t i = idx;
	        char u = s[i];

	        if (u & 0x80)
	        {   uint n;
	            char u2;

	            /* The following encodings are valid, except for the 5 and 6 byte
	             * combinations:
	             *  0xxxxxxx
	             *  110xxxxx 10xxxxxx
	             *  1110xxxx 10xxxxxx 10xxxxxx
	             *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	             *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	             *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	             */
	            // Count the leading 1 bits of the first byte; that count is the
	            // sequence length n. n == 1 means a 10xxxxxx byte was found where
	            // a sequence should start. After the loop n is 2, 3 or 4.
	            for (n = 1; ; n++)
	            {
	                if (n > 4)
	                    goto Lerr;          // only do the first 4 of 6 encodings
	                if (((u << n) & 0x80) == 0)
	                {
	                    if (n == 1)
	                        goto Lerr;
	                    break;
	                }
	            }

	            // Pick off (7 - n) significant bits of B from first byte of octet
	            V = cast(dchar)(u & ((1 << (7 - n)) - 1));

	            if (i + (n - 1) >= len)
	                goto Lerr;                      // off end of string

	            /* The following combinations are overlong, and illegal:
	             *  1100000x (10xxxxxx)
	             *  11100000 100xxxxx (10xxxxxx)
	             *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
	             *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
	             *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
	             */
	            // Lead bytes 0xF8 and 0xFC give n > 4 and were already rejected
	            // by the loop above, so the last two clauses cannot match here.
	            u2 = s[i + 1];
	            if ((u & 0xFE) == 0xC0 ||
	                (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
	                (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
	                (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
	                (u == 0xFC && (u2 & 0xFC) == 0x80))
	                goto Lerr;                      // overlong combination

	            // Fold in 6 payload bits from each of the n - 1 following bytes.
	            for (uint j = 1; j != n; j++)
	            {
	                u = s[i + j];
	                if ((u & 0xC0) != 0x80)
	                    goto Lerr;                  // trailing bytes are 10xxxxxx
	                V = (V << 6) | (u & 0x3F);
	            }
	            // Rejects values in 0xD800 .. 0xDFFF and values above 0x10FFFF.
	            if (!isValidDchar(V))
	                goto Lerr;
	            i += n;
	        }
	        else
	        {
	            // High bit clear: the byte is the code point.
	            V = cast(dchar) u;
	            i++;
	        }

	        idx = i;
	        return V;

	      // On every error path i still equals the idx passed in, so the start
	      // of the offending sequence is what gets reported.
	      Lerr:
	        onUnicodeError("invalid UTF-8 sequence", i);
	        return V; // dummy return
	    }

	// Tests decode() on UTF-8 input: well-formed 1, 2 and 3 byte sequences,
	// then a list of malformed inputs that must each be rejected.
	unittest
	{   size_t i;
	    dchar c;

	    debug(utf) printf("utf.decode.unittest\n");

	    static s1 = "abcd"c;
	    i = 0;
	    c = decode(s1, i);
	    assert(c == cast(dchar)'a');
	    assert(i == 1);
	    c = decode(s1, i);
	    assert(c == cast(dchar)'b');
	    assert(i == 2);

	    static s2 = "\xC2\xA9"c;
	    i = 0;
	    c = decode(s2, i);
	    assert(c == cast(dchar)'\u00A9');
	    assert(i == 2);

	    static s3 = "\xE2\x89\xA0"c;
	    i = 0;
	    c = decode(s3, i);
	    assert(c == cast(dchar)'\u2260');
	    assert(i == 3);

	    static s4 =
	    [   "\xE2\x89"c[],          // too short
	        "\xC0\x8A",
	        "\xE0\x80\x8A",
	        "\xF0\x80\x80\x8A",
	        "\xF8\x80\x80\x80\x8A",
	        "\xFC\x80\x80\x80\x80\x8A",
	    ];

	    for (int j = 0; j < s4.length; j++)
	    {
	        // The "did not throw" check must live outside the try block:
	        // an assert placed inside it would be swallowed by the
	        // catch (Throwable) below and the test could never fail.
	        bool rejected = false;

	        i = 0;
	        try
	        {
	            c = decode(s4[j], i);
	        }
	        catch (Throwable o)
	        {
	            rejected = true;
	        }
	        assert(rejected);
	        assert(i == 0);         // idx is left unchanged on error
	    }
	}

	/**
	 * Decodes and returns the code point starting at s[idx] in UTF-16 input.
	 * On success idx is advanced by the one or two code units consumed.
	 *
	 * The following are reported through onUnicodeError, leaving idx unchanged:
	 * a unit in 0xD800 .. 0xDBFF that is the last unit of s, or that is not
	 * followed by a unit in 0xDC00 .. 0xDFFF; a unit in 0xDC00 .. 0xDFFF on
	 * its own; and the values 0xFFFE and 0xFFFF.
	 *
	 * Params:
	 *      s   = UTF-16 input
	 *      idx = index of the first code unit; must be below s.length
	 * Returns: the decoded code point, which satisfies isValidDchar.
	 */
	@safe pure
	dchar decode(in wchar[] s, ref size_t idx)
	    in
	    {
	        assert(idx >= 0 && idx < s.length);
	    }
	    out (result)
	    {
	        assert(isValidDchar(result));
	    }
	    body
	    {
	        string msg;
	        size_t i = idx;
	        uint u = s[i];

	        if (u & ~0x7F)
	        {   if (u >= 0xD800 && u <= 0xDBFF)
	            {   uint u2;

	                if (i + 1 == s.length)
	                {   msg = "surrogate UTF-16 high value past end of string";
	                    goto Lerr;
	                }
	                u2 = s[i + 1];
	                if (u2 < 0xDC00 || u2 > 0xDFFF)
	                {   msg = "surrogate UTF-16 low value out of range";
	                    goto Lerr;
	                }
	                // 0xD7C0 is 0xD800 - 0x40, so the shift yields
	                // ((u - 0xD800) << 10) + 0x10000 in a single step.
	                u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
	                i += 2;
	            }
	            else if (u >= 0xDC00 && u <= 0xDFFF)
	            {   msg = "unpaired surrogate UTF-16 value";
	                goto Lerr;
	            }
	            else if (u == 0xFFFE || u == 0xFFFF)
	            {   msg = "illegal UTF-16 value";
	                goto Lerr;
	            }
	            else
	                i++;
	        }
	        else
	        {
	            // Values up to 0x7F need no further checks.
	            i++;
	        }

	        idx = i;
	        return cast(dchar)u;

	      // i still equals the idx passed in on every error path.
	      Lerr:
	        onUnicodeError(msg, i);
	        return cast(dchar)u; // dummy return
	    }

	/**
	 * UTF-32 overload: returns s[idx] and advances idx by one when the value
	 * satisfies isValidDchar. Otherwise the error is reported through
	 * onUnicodeError and idx is left unchanged.
	 */
	@safe pure
	dchar decode(in dchar[] s, ref size_t idx)
	    in
	    {
	        assert(idx >= 0 && idx < s.length);
	    }
	    body
	    {
	        const size_t pos = idx;
	        const dchar c = s[pos];

	        if (isValidDchar(c))
	        {
	            idx = pos + 1;
	            return c;
	        }

	        onUnicodeError("invalid UTF-32 value", pos);
	        return c; // dummy return
	    }


	/* =================== Encode ======================= */

	/*******************************
	 * Encodes character c as UTF-8 and appends the resulting 1 to 4 bytes
	 * to array s[]. c must satisfy isValidDchar (checked by the in contract).
	 */
	@safe pure nothrow
	void encode(ref char[] s, dchar c)
	    in
	    {
	        assert(isValidDchar(c));
	    }
	    body
	    {
	        // Values up to 0x7F are appended as a single byte.
	        if (c <= 0x7F)
	        {
	            s ~= cast(char) c;
	            return;
	        }

	        char[4] units;
	        size_t count;

	        if (c <= 0x7FF)
	        {
	            units[0] = cast(char)(0xC0 | (c >> 6));
	            units[1] = cast(char)(0x80 | (c & 0x3F));
	            count = 2;
	        }
	        else if (c <= 0xFFFF)
	        {
	            units[0] = cast(char)(0xE0 | (c >> 12));
	            units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
	            units[2] = cast(char)(0x80 | (c & 0x3F));
	            count = 3;
	        }
	        else if (c <= 0x10FFFF)
	        {
	            units[0] = cast(char)(0xF0 | (c >> 18));
	            units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
	            units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
	            units[3] = cast(char)(0x80 | (c & 0x3F));
	            count = 4;
	        }
	        else
	        {
	            assert(0);
	        }

	        s ~= units[0 .. count];
	    }

	// Tests the UTF-8 encode(): appends a 1-byte, a 2-byte and a 3-byte
	// character in turn and checks the growing buffer after each step.
	unittest
	{
	debug(utf) printf("utf.encode.unittest\n");

	char[] s = "abcd".dup;
	encode(s, cast(dchar)'a');
	assert(s.length == 5);
	assert(s == "abcda");

	encode(s, cast(dchar)'\u00A9');
	assert(s.length == 7);
	assert(s == "abcda\xC2\xA9");
	//assert(s == "abcda\u00A9"); // BUG: fix compiler

	encode(s, cast(dchar)'\u2260');
	assert(s.length == 10);
	assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
	}

	/**
	 * UTF-16 overload: appends c to s[] as one code unit when c <= 0xFFFF,
	 * otherwise as a pair of code units (0xD800-based, then 0xDC00-based).
	 * c must satisfy isValidDchar (checked by the in contract).
	 */
	@safe pure nothrow
	void encode(ref wchar[] s, dchar c)
	    in
	    {
	        assert(isValidDchar(c));
	    }
	    body
	    {
	        if (c <= 0xFFFF)
	        {
	            s ~= cast(wchar) c;
	            return;
	        }

	        // Split the 20 bits above 0x10000 into two 10-bit halves.
	        const uint offset = c - 0x10000;
	        wchar[2] pair;

	        pair[0] = cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800);
	        pair[1] = cast(wchar) ((offset & 0x3FF) + 0xDC00);
	        s ~= pair;
	    }

	/** ditto */
	// UTF-32 overload: the code point is its own encoding, so it is
	// appended as-is once the in-contract has accepted it.
	@safe pure nothrow
	void encode(ref dchar[] s, dchar c)
	in { assert(isValidDchar(c)); }
	body
	{
	    s ~= c;
	}

	/**
	Returns how many code units of type $(D C) are needed to encode the
	code point $(D c). The result is a count of $(D C) elements, not bytes.

	For one-byte $(D C), a $(D c) above 0x10FFFF hits $(D assert(false)).
	*/
	@safe pure nothrow @nogc
	ubyte codeLength(C)(dchar c)
	{
	    static if (C.sizeof == 1)
	    {
	        // UTF-8: 1 to 4 code units.
	        if (c > 0x10FFFF)
	            assert(false);
	        return c <= 0x7F ? 1 : c <= 0x7FF ? 2 : c <= 0xFFFF ? 3 : 4;
	    }
	    else static if (C.sizeof == 2)
	    {
	        // UTF-16: a surrogate pair is needed above 0xFFFF.
	        if (c > 0xFFFF)
	            return 2;
	        return 1;
	    }
	    else
	    {
	        // UTF-32: always a single code unit.
	        static assert(C.sizeof == 4);
	        return 1;
	    }
	}

	/* =================== Validation ======================= */

	/***********************************
	Checks whether a string is well formed by decoding it from start to end.
	$(D S) can be an array of $(D char), $(D wchar), or $(D dchar). Malformed
	input is reported by $(D decode); use this to check untrusted input.
	*/
	@safe pure
	void validate(S)(in S s)
	{
	    immutable total = s.length;
	    size_t pos = 0;

	    // decode advances pos past each sequence it accepts.
	    while (pos < total)
	        decode(s, pos);
	}

	/* =================== Conversion to UTF8 ======================= */

	/*******************
	 * Encodes the single character c as UTF-8 into the caller-supplied
	 * buffer buf, without allocating.
	 *
	 * Params:
	 *      buf = destination; up to 4 elements are written, starting at buf[0]
	 *      c   = character to encode; the in-contract requires isValidDchar(c)
	 *
	 * Returns: the slice of buf that holds the encoded character (1 to 4 chars).
	 */
	@safe pure nothrow @nogc
	char[] toUTF8(char[] buf, dchar c)
	in
	{
	    assert(isValidDchar(c));
	}
	body
	{
	    if (c <= 0x7F)
	    {
	        buf[0] = cast(char) c;
	        return buf[0 .. 1];
	    }
	    else if (c <= 0x7FF)
	    {
	        // 110xxxxx 10xxxxxx
	        buf[0] = cast(char)(0xC0 | (c >> 6));
	        buf[1] = cast(char)(0x80 | (c & 0x3F));
	        return buf[0 .. 2];
	    }
	    else if (c <= 0xFFFF)
	    {
	        // 1110xxxx 10xxxxxx 10xxxxxx
	        buf[0] = cast(char)(0xE0 | (c >> 12));
	        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
	        buf[2] = cast(char)(0x80 | (c & 0x3F));
	        return buf[0 .. 3];
	    }
	    else if (c <= 0x10FFFF)
	    {
	        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	        buf[0] = cast(char)(0xF0 | (c >> 18));
	        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
	        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
	        buf[3] = cast(char)(0x80 | (c & 0x3F));
	        return buf[0 .. 4];
	    }
	    // Excluded by the in-contract; c is above 0x10FFFF here.
	    assert(0);
	}

	/*******************
	 * Converts string s to UTF-8 and returns the result as a string.
	 */
	// string overload: the input is already UTF-8, so it is handed back
	// unchanged; the in-contract runs validate over it.
	@safe pure nothrow
	string toUTF8(string s)
	in { validate(s); }
	body
	{
	    return s;
	}

	/** ditto */
	// UTF-16 input: the leading run of ASCII code units is copied directly;
	// from the first non-ASCII unit onward the rest of s is iterated as
	// dchar and re-encoded with encode.
	@trusted pure
	string toUTF8(in wchar[] s)
	{
	    immutable total = s.length;
	    char[] result;
	    size_t pos = 0;

	    result.length = total;

	    // fast path for ascii
	    while (pos < total && s[pos] <= 0x7F)
	    {
	        result[pos] = cast(char) s[pos];
	        ++pos;
	    }

	    if (pos < total)
	    {
	        // Keep only the ASCII prefix, then append the encoded remainder.
	        result.length = pos;
	        foreach (dchar ch; s[pos .. total])
	        {
	            encode(result, ch);
	        }
	    }

	    return cast(string) result;
	}

	/** ditto */
	// UTF-32 input: the leading run of ASCII values is copied directly;
	// from the first non-ASCII value onward every element is passed to
	// encode.
	@trusted pure
	string toUTF8(in dchar[] s)
	{
	    immutable total = s.length;
	    char[] result;
	    size_t pos = 0;

	    result.length = total;

	    // fast path for ascii
	    while (pos < total && s[pos] <= 0x7F)
	    {
	        result[pos] = cast(char) s[pos];
	        ++pos;
	    }

	    if (pos < total)
	    {
	        // Keep only the ASCII prefix, then append the encoded remainder.
	        result.length = pos;
	        foreach (dchar d; s[pos .. total])
	        {
	            encode(result, d);
	        }
	    }

	    return cast(string) result;
	}

	/* =================== Conversion to UTF16 ======================= */

	/****************
	 * Encodes the single character c as UTF-16 into the caller-supplied
	 * buffer buf, without allocating.
	 *
	 * Params:
	 *      buf = destination; one or two elements are written, starting at buf[0]
	 *      c   = character to encode; the in-contract requires isValidDchar(c)
	 *
	 * Returns: the slice of buf that holds the encoded character.
	 */
	@safe pure nothrow @nogc
	wchar[] toUTF16(wchar[] buf, dchar c)
	in
	{
	    assert(isValidDchar(c));
	}
	body
	{
	    if (c > 0xFFFF)
	    {
	        // Surrogate pair: split the offset above 0x10000 into 10-bit halves.
	        immutable uint offset = c - 0x10000;

	        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
	        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
	        return buf[0 .. 2];
	    }

	    buf[0] = cast(wchar) c;
	    return buf[0 .. 1];
	}

	/****************
	 * Encodes string s into UTF-16 and returns the encoded string.
	 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
	 * an LPWSTR or LPCWSTR argument.
	 */
	// UTF-8 input: ASCII code units are widened directly, everything else
	// goes through decode/encode.
	@trusted pure
	wstring toUTF16(in char[] s)
	{
	    wchar[] r;
	    immutable slen = s.length;

	    // Pre-allocate so the appends below can grow in place. The previous
	    // "r.length = slen; r.length = 0;" idiom initialised a buffer that
	    // the appends then did not reuse. The UTF-16 result never has more
	    // code units than the UTF-8 input.
	    reserve(r, slen);

	    for (size_t i = 0; i < slen; )
	    {
	        dchar c = s[i];
	        if (c <= 0x7F)
	        {
	            i++;
	            r ~= cast(wchar)c;
	        }
	        else
	        {
	            // decode advances i past the whole multi-unit sequence.
	            c = decode(s, i);
	            encode(r, c);
	        }
	    }
	    return cast(wstring)r;
	}

	alias const(wchar)* wptr;
	/** ditto */
	// Same conversion as toUTF16(in char[]), but a terminating zero code
	// unit is appended and a pointer to the first element is returned.
	@safe pure
	wptr toUTF16z(in char[] s)
	{
	    wchar[] r;
	    immutable slen = s.length;

	    // Pre-allocate room for the converted text plus the terminator so the
	    // appends below can grow in place. The previous
	    // "r.length = slen + 1; r.length = 0;" idiom initialised a buffer
	    // that the appends then did not reuse.
	    reserve(r, slen + 1);

	    for (size_t i = 0; i < slen; )
	    {
	        dchar c = s[i];
	        if (c <= 0x7F)
	        {
	            i++;
	            r ~= cast(wchar)c;
	        }
	        else
	        {
	            // decode advances i past the whole multi-unit sequence.
	            c = decode(s, i);
	            encode(r, c);
	        }
	    }
	    // r is never empty at this point, so &r[0] is always in bounds.
	    r ~= '\000';
	    return &r[0];
	}

	/** ditto */
	// wstring overload: the input is already UTF-16, so it is handed back
	// unchanged; the in-contract runs validate over it.
	@safe pure nothrow
	wstring toUTF16(wstring s)
	in { validate(s); }
	body
	{
	    return s;
	}

	/** ditto */
	// UTF-32 input: every element is passed to encode, which appends one or
	// two UTF-16 code units.
	@trusted pure nothrow
	wstring toUTF16(in dchar[] s)
	{
	    wchar[] r;
	    immutable slen = s.length;

	    // Pre-allocate at least one code unit per input character so the
	    // appends below can grow in place. The previous
	    // "r.length = slen; r.length = 0;" idiom initialised a buffer that
	    // the appends then did not reuse. Characters above 0xFFFF need two
	    // units, so the array may still grow beyond this.
	    reserve(r, slen);

	    for (size_t i = 0; i < slen; i++)
	    {
	        encode(r, s[i]);
	    }
	    return cast(wstring)r;
	}

	/* =================== Conversion to UTF32 ======================= */

	/*****
	 * Encodes string s into UTF-32 and returns the encoded string.
	 */
	// UTF-8 input: ASCII code units are widened directly; anything else is
	// handed to decode, which also advances the read position.
	@trusted pure
	dstring toUTF32(in char[] s)
	{
	    immutable total = s.length;
	    dchar[] result;
	    size_t written = 0;
	    size_t pos = 0;

	    result.length = total; // the result is never longer than s[]

	    while (pos < total)
	    {
	        dchar ch = s[pos];

	        if (ch < 0x80)
	            ++pos; // ascii, no need for decode
	        else
	            ch = decode(s, pos);

	        result[written] = ch;
	        ++written;
	    }

	    return cast(dstring) result[0 .. written];
	}

	/** ditto */
	// UTF-16 input: ASCII code units are widened directly; every other code
	// unit is handed to decode, which also advances the read position.
	@trusted pure
	dstring toUTF32(in wchar[] s)
	{
	    immutable total = s.length;
	    dchar[] result;
	    size_t written = 0;
	    size_t pos = 0;

	    result.length = total; // the result is never longer than s[]

	    while (pos < total)
	    {
	        dchar ch = s[pos];

	        if (ch < 0x80)
	            ++pos; // ascii, no need for decode
	        else
	            ch = decode(s, pos);

	        result[written] = ch;
	        ++written;
	    }

	    return cast(dstring) result[0 .. written];
	}

	/** ditto */
	// dstring overload: the input is already UTF-32, so it is handed back
	// unchanged; the in-contract runs validate over it.
	@safe pure nothrow
	dstring toUTF32(dstring s)
	in { validate(s); }
	body
	{
	    return s;
	}

	/* ================================ tests ================================== */

	unittest
	{
	    debug(utf) printf("utf.toUTF.unittest\n");

	    // Runs the same text through every conversion between the three
	    // encodings and compares each result with the matching literal.
	    static void roundTrip(string u8, wstring u16, dstring u32)
	    {
	        // Start from UTF-8.
	        auto w = toUTF16(u8);
	        assert(w == u16);
	        auto d = toUTF32(u8);
	        assert(d == u32);

	        // Start from UTF-16.
	        auto c = toUTF8(w);
	        assert(c == u8);
	        d = toUTF32(w);
	        assert(d == u32);

	        // Start from UTF-32.
	        c = toUTF8(d);
	        assert(c == u8);
	        w = toUTF16(d);
	        assert(w == u16);
	    }

	    // Pure ASCII.
	    roundTrip("hello", "hello"w, "hello"d);

	    // A character that needs three UTF-8 code units.
	    roundTrip("hel\u1234o", "hel\u1234o"w, "hel\u1234o"d);

	    // A character above 0xFFFF, i.e. a surrogate pair in UTF-16.
	    roundTrip("he\U000BAAAAllo", "he\U000BAAAAllo"w, "he\U000BAAAAllo"d);

	    // Buffer-based single-character overload.
	    wchar[2] buf;
	    auto ret = toUTF16(buf, '\U000BAAAA');
	    assert(ret == "\U000BAAAA");
	}