|  | // Written in the D programming language. | 
|  |  | 
|  | /** | 
|  | Classes and functions for handling and transcoding between various encodings. | 
|  |  | 
|  | For cases where the encoding is known at compile-time, functions are provided | 
|  | for arbitrary encoding and decoding of characters, arbitrary transcoding | 
|  | between strings of different type, as well as validation and sanitization. | 
|  |  | 
|  | Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1 | 
|  | (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, WINDOWS-1251 | 
|  | and WINDOWS-1252. | 
|  |  | 
|  | $(SCRIPT inhibitQuickIndex = 1;) | 
|  | $(DIVC quickindex, | 
|  | $(BOOKTABLE, | 
|  | $(TR $(TH Category) $(TH Functions)) | 
|  | $(TR $(TD Decode) $(TD | 
|  | $(LREF codePoints) | 
|  | $(LREF decode) | 
|  | $(LREF decodeReverse) | 
|  | $(LREF safeDecode) | 
|  | )) | 
|  | $(TR $(TD Conversion) $(TD | 
|  | $(LREF codeUnits) | 
|  | $(LREF sanitize) | 
|  | $(LREF transcode) | 
|  | )) | 
|  | $(TR $(TD Classification) $(TD | 
|  | $(LREF canEncode) | 
|  | $(LREF isValid) | 
|  | $(LREF isValidCodePoint) | 
|  | $(LREF isValidCodeUnit) | 
|  | )) | 
|  | $(TR $(TD BOM) $(TD | 
|  | $(LREF BOM) | 
|  | $(LREF BOMSeq) | 
|  | $(LREF getBOM) | 
|  | $(LREF utfBOM) | 
|  | )) | 
|  | $(TR $(TD Length & Index) $(TD | 
|  | $(LREF firstSequence) | 
|  | $(LREF encodedLength) | 
|  | $(LREF index) | 
|  | $(LREF lastSequence) | 
|  | $(LREF validLength) | 
|  | )) | 
|  | $(TR $(TD Encoding schemes) $(TD | 
|  | $(LREF encodingName) | 
|  | $(LREF EncodingScheme) | 
|  | $(LREF EncodingSchemeASCII) | 
|  | $(LREF EncodingSchemeLatin1) | 
|  | $(LREF EncodingSchemeLatin2) | 
|  | $(LREF EncodingSchemeUtf16Native) | 
|  | $(LREF EncodingSchemeUtf32Native) | 
|  | $(LREF EncodingSchemeUtf8) | 
|  | $(LREF EncodingSchemeWindows1250) | 
|  | $(LREF EncodingSchemeWindows1251) | 
|  | $(LREF EncodingSchemeWindows1252) | 
|  | )) | 
|  | $(TR $(TD Representation) $(TD | 
|  | $(LREF AsciiChar) | 
|  | $(LREF AsciiString) | 
|  | $(LREF Latin1Char) | 
|  | $(LREF Latin1String) | 
|  | $(LREF Latin2Char) | 
|  | $(LREF Latin2String) | 
|  | $(LREF Windows1250Char) | 
|  | $(LREF Windows1250String) | 
|  | $(LREF Windows1251Char) | 
|  | $(LREF Windows1251String) | 
|  | $(LREF Windows1252Char) | 
|  | $(LREF Windows1252String) | 
|  | )) | 
|  | $(TR $(TD Exceptions) $(TD | 
|  | $(LREF INVALID_SEQUENCE) | 
|  | $(LREF EncodingException) | 
|  | )) | 
|  | )) | 
|  |  | 
|  | For cases where the encoding is not known at compile-time, but is | 
|  | known at run-time, the abstract class $(LREF EncodingScheme) | 
|  | and its subclasses are provided.  To construct a run-time encoder/decoder, | 
|  | one does e.g. | 
|  |  | 
|  | ---------------------------------------------------- | 
|  | auto e = EncodingScheme.create("utf-8"); | 
|  | ---------------------------------------------------- | 
|  |  | 
|  | This library supplies $(LREF EncodingScheme) subclasses for ASCII, | 
|  | ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252, UTF-8, and (on little-endian architectures) | 
|  | UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE. | 
|  |  | 
|  | This library provides a mechanism whereby other modules may add $(LREF | 
|  | EncodingScheme) subclasses for any other encoding. | 
|  |  | 
|  | Copyright: Copyright Janice Caron 2008 - 2009. | 
|  | License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). | 
|  | Authors:   Janice Caron | 
|  | Source:    $(PHOBOSSRC std/encoding.d) | 
|  | */ | 
|  | /* | 
|  | Copyright Janice Caron 2008 - 2009. | 
|  | Distributed under the Boost Software License, Version 1.0. | 
|  | (See accompanying file LICENSE_1_0.txt or copy at | 
|  | http://www.boost.org/LICENSE_1_0.txt) | 
|  | */ | 
|  | module std.encoding; | 
|  |  | 
|  | import std.range.primitives; | 
|  | import std.traits; | 
|  | import std.typecons; | 
|  |  | 
// End-to-end test of the module: UTF-8 validation, sanitization, transcoding
// between char/wchar/dchar strings in both iteration directions, transcoding
// to the single-byte encodings, length counting, and encoding into a buffer.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // These are the three-byte encodings of the code points named in the
        // comments; the two-byte forms DF BE / DF BF encode U+07FE / U+07FF.
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings,
    // in the same order (the lengths are asserted equal below).
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // HELPER FUNCTIONS
    // we can probably do this better...

    // Maps the low four bits of n to an upper-case hexadecimal digit.
    static char toHexDigit(int n)
    {
        return "0123456789ABCDEF"[n & 0xF];
    }

    // Renders s as a quoted string for assertion messages; every code unit
    // outside 0x20 .. 0x7F is shown as a \xHH escape.
    static string makeReadable(string s)
    {
        string r = "\"";
        foreach (char c;s)
        {
            if (c >= 0x20 && c < 0x80)
            {
                r ~= c;
            }
            else
            {
                r ~= "\\x";
                r ~= toHexDigit(c >> 4);
                r ~= toHexDigit(c);
            }
        }
        r ~= "\"";
        return r;
    }

    // Transcodes s into r by walking the code points, and the code units of
    // each code point, back to front, prepending to the result as it goes.
    void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
    {
        static if (is(Src == Dst))
        {
            // Same encoding: hand the input back through the out parameter.
            // (This function returns void, so `return s;` would not compile
            // if this branch were ever instantiated.)
            r = s;
        }
        else static if (is(Src == AsciiChar))
        {
            transcodeReverse!(char,Dst)(cast(string) s,r);
        }
        else
        {
            foreach_reverse (d;codePoints(s))
            {
                foreach_reverse (c;codeUnits!(Dst)(d))
                {
                    r = c ~ r;
                }
            }
        }
    }

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, bad; invalidStrings)
    {
        string s = cast(string) bad;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Sanitized output is valid, so feed it to the transcoding tests too.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        s = "\u0402lu\u0403ou\u201D\u045C k\u0414\u044F";
        Windows1251String s51;
        transcode(s,s51);
        assert(s51 == cast(Windows1251Char[])[0x80, 'l', 'u', 0x81, 'o', 'u', 0x94, 0x9d, ' ', 'k', 0xc4, 0xff]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
|  |  | 
|  | //============================================================================= | 
|  |  | 
/**
 * Sentinel returned by `safeDecode` instead of a decoded code point.
 * Its numeric value is 0xFFFFFFFF.
 */
enum dchar INVALID_SEQUENCE = cast(dchar) uint.max;
|  |  | 
/*
 * Mixin template that supplies the encode / skip / decode entry points for
 * an encoder.  The scope it is mixed into must provide the code unit type
 * `E` and the function templates `encodeViaWrite`, `skipViaRead`,
 * `decodeViaRead`, `safeDecodeViaRead` and `decodeReverseViaRead`.
 *
 * The small templates below are themselves mixed into function bodies, where
 * they pick up the local variable they operate on (`s`, `array` or `dg`)
 * by name.
 */
template EncoderFunctions()
{
    // ---- Reading code units --------------------------------------------

    /// Consumes code units from the front of the slice `s`.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        /// Returns the first code unit without consuming it.
        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        /// Returns the first code unit and drops it from `s`.
        E read() @safe pure @nogc nothrow
        {
            E unit = s[0];
            s = s[1 .. $];
            return unit;
        }
    }

    /// Consumes code units from the back of the slice `s`.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        /// Returns the last code unit without consuming it.
        E peek() @safe pure @nogc nothrow
        {
            return s[$ - 1];
        }

        /// Returns the last code unit and drops it from `s`.
        E read() @safe pure @nogc nothrow
        {
            E unit = s[$ - 1];
            s = s[0 .. $ - 1];
            return unit;
        }
    }

    // ---- Writing code units --------------------------------------------

    /// Declares a fresh array `s` and appends every written unit to it.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    /// Stores each written unit in `array[0]`, then advances `array` by one.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1 .. $];
        }
    }

    /// Forwards every written unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- Adapters over the encoder-specific primitives ------------------

    /// Exposes `encodeViaWrite` under the name `encode`.
    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    /// Exposes `skipViaRead` under the name `skip`.
    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    /// Exposes `decodeViaRead` under the name `decode`.
    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    /// Exposes `safeDecodeViaRead` under the name `safeDecode`.
    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    /// Exposes `decodeReverseViaRead` under the name `decodeReverse`.
    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- Encoding to a particular destination ---------------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Decoding from a string ------------------------------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // The functions ultimately exposed to the user

    /// Encodes `c` and returns the code units in a newly built array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString enc;
        enc.encode(c);
        return enc.s;
    }

    /// Encodes `c` into the front of `array`; `array` is advanced past
    /// the units that were written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray enc;
        enc.encode(c);
    }

    /// Encodes `c`, passing each code unit to `dg`.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate enc;
        enc.encode(c);
    }

    /// Advances `s` via the encoder's `skipViaRead`.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString dec;
        dec.skip();
    }

    /// Decodes from the front of `s` via `decodeViaRead`, advancing `s`.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString dec;
        return dec.decode();
    }

    /// Decodes from the front of `s` via `safeDecodeViaRead`, advancing `s`.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString dec;
        return dec.safeDecode();
    }

    /// Decodes from the back of `s` via `decodeReverseViaRead`, shrinking `s`.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString dec;
        return dec.decodeReverse();
    }
}
|  |  | 
|  | //========================================================================= | 
|  |  | 
/*
 * foreach-able view over the code points of an encoded string.
 *
 * Iteration decodes directly from the member slice `s`, so `s` shrinks as
 * the loop progresses.  The constructor's in-contract asserts that the input
 * passes `isValid`.
 */
struct CodePoints(E)
{
    const(E)[] s;

    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        this.s = s;
    }

    /// Forward iteration: `foreach (dchar c; ...)`.
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar decoded = decode(s);
            if (auto rc = dg(decoded))
                return rc;
        }
        return 0;
    }

    /// Forward iteration with the code-unit offset of each code point.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length > 0)
        {
            immutable before = s.length;
            dchar decoded = decode(s);

            // Hand the delegate a copy so it cannot corrupt our offset.
            size_t reported = offset;
            if (auto rc = dg(reported, decoded))
                return rc;

            offset += before - s.length;
        }
        return 0;
    }

    /// Reverse iteration: `foreach_reverse (dchar c; ...)`.
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar decoded = decodeReverse(s);
            if (auto rc = dg(decoded))
                return rc;
        }
        return 0;
    }

    /// Reverse iteration with an index: the length of `s` remaining after
    /// the code point has been removed from its end.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length > 0)
        {
            dchar decoded = decodeReverse(s);
            size_t remaining = s.length;
            if (auto rc = dg(remaining, decoded))
                return rc;
        }
        return 0;
    }
}
|  |  | 
/*
 * foreach-able view over the code units that encode a single code point.
 *
 * The constructor encodes the code point once with `encode!(E)` and stores
 * the result in `s`; its in-contract asserts `isValidCodePoint(d)`.
 */
struct CodeUnits(E)
{
    E[] s;

    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    do
    {
        s = encode!(E)(d);
    }

    /// Forward iteration over the stored code units.
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }

    /// Reverse iteration over the stored code units.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }
}
|  |  | 
|  | //============================================================================= | 
|  |  | 
/*
 * Fallback for code unit types that have no dedicated specialization:
 * instantiating it is a compile-time error that names the offending type.
 */
template EncoderInstance(E)
{
    static assert(false, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
|  |  | 
/*
 * Shared implementation for encoders driven by lookup tables.  The scope
 * this is mixed into must supply:
 *
 *   E, EString       - code unit type and its string type
 *   m_charMapStart   - first code unit translated through charMap
 *   m_charMapEnd     - last code unit translated through charMap
 *   charMap          - decoded value for each unit in that range, indexed by
 *                      (unit - m_charMapStart); 0xFFFD marks an invalid unit
 *   bstMap           - (code point, code unit) pairs laid out as an implicit
 *                      binary search tree: the children of index i are at
 *                      2*i + 1 (smaller keys) and 2*i + 2 (larger keys)
 */
private template GenericEncoder()
{
    /// True if `c` can be represented in this encoding.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Values outside the mapped range (and below 0x100) encode as themselves.
        if (c < m_charMapStart || (c > m_charMapEnd && c < 0x100))
            return true;
        if (c >= 0xFFFD)
            return false;

        // Walk the implicit binary search tree.
        for (int node = 0; node < bstMap.length;
             node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2)
        {
            if (bstMap[node][0] == c)
                return true;
        }
        return false;
    }

    /// True unless `c` lies in the mapped range and charMap marks it 0xFFFD.
    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        return c < m_charMapStart
            || c > m_charMapEnd
            || charMap[c - m_charMapStart] != 0xFFFD;
    }

    /// Always 1; the in-contract asserts that `c` is encodable.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    /// Writes the single code unit for `c`, or '?' if `c` has none.
    void encodeViaWrite()(dchar c)
    {
        immutable bool passThrough =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);

        if (!passThrough)
        {
            if (c < 0xFFFD)
            {
                int node = 0;
                while (node < bstMap.length)
                {
                    if (bstMap[node][0] == c)
                    {
                        write(cast(E) bstMap[node][1]);
                        return;
                    }
                    // next BST index
                    node = bstMap[node][0] > c ? 2 * node + 1 : 2 * node + 2;
                }
            }
            // Either out of range or absent from the tree.
            c = '?';
        }
        write(cast(E) c);
    }

    /// Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    /// Reads one unit and translates it through charMap when it is in range.
    dchar decodeViaRead()()
    {
        E unit = read();
        if (unit < m_charMapStart || unit > m_charMapEnd)
            return unit;
        return charMap[unit - m_charMapStart];
    }

    /// As decodeViaRead, but yields INVALID_SEQUENCE where the result is 0xFFFD.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        dchar decoded = unit;
        if (unit >= m_charMapStart && unit <= m_charMapEnd)
            decoded = charMap[unit - m_charMapStart];
        return decoded == 0xFFFD ? INVALID_SEQUENCE : decoded;
    }

    /// Same translation as decodeViaRead; `read` determines the direction.
    dchar decodeReverseViaRead()()
    {
        E unit = read();
        if (unit < m_charMapStart || unit > m_charMapEnd)
            return unit;
        return charMap[unit - m_charMapStart];
    }

    /// The replacement for unencodable input: the single unit "?".
    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | //          ASCII | 
|  | //============================================================================= | 
|  |  | 
/** Defines an ASCII-encoded character. */
enum AsciiChar : ubyte { _init }
/**
Defines an ASCII-encoded string (as an array of $(D
immutable(AsciiChar))).
*/
alias AsciiString = immutable(AsciiChar)[];

/*
 * Encoder for ASCII: one code unit per character; only values below 0x80
 * are encodable or valid.
 */
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    /// True for code points below 0x80.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    /// True for code units below 0x80.
    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    /// Always 1; the in-contract asserts that `c` is encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    /// Writes `c` to `r`, substituting '?' when `c` is not encodable.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    /// Writes `c`, substituting '?' when `c` is not encodable.
    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    /// Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    /// The code unit read is returned as the code point, unchecked.
    dchar decodeViaRead()()
    {
        return read();
    }

    /// Returns INVALID_SEQUENCE for a code unit that fails canEncode.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    /// Same as decodeViaRead; `read` determines the direction.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    /// The replacement for unencodable input: the single unit "?".
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | //          ISO-8859-1 | 
|  | //============================================================================= | 
|  |  | 
/** Defines a Latin1-encoded character. */
enum Latin1Char : ubyte { _init }
/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
*/
alias Latin1String = immutable(Latin1Char)[];

/*
 * Encoder for ISO-8859-1: one code unit per character, covering the code
 * points below 0x100 with the code unit equal to the code point.
 */
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    /// True for code points below 0x100.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    /// Every code unit is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    /// Always 1; the in-contract asserts that `c` is encodable.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    /// Writes `c`, substituting '?' when `c` is not encodable.
    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    /// Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    /// The code unit read is returned as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    /// Identical to decodeViaRead.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    /// Same as decodeViaRead; `read` determines the direction.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    /// The replacement for unencodable input: the single unit "?".
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | //          ISO-8859-2 | 
|  | //============================================================================= | 
|  |  | 
/// Defines a Latin2-encoded character.
enum Latin2Char : ubyte { _init }

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];

/*
 * Encoder for ISO-8859-2.  All behaviour comes from GenericEncoder; this
 * template only supplies the name and the two lookup tables.
 */
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Code units in [m_charMapStart, m_charMapEnd] are translated through
    // charMap; everything outside that range maps to itself.
    private static immutable dchar m_charMapStart = 0xA1;
    private static immutable dchar m_charMapEnd = 0xFF;

    // Decoding table, indexed by (code unit - m_charMapStart).
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table of (code point, code unit) pairs.  GenericEncoder
    // searches it as an implicit binary search tree (children of index i are
    // at 2*i + 1 and 2*i + 2), so the ORDER of the entries is significant.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}
|  |  | 
|  | //============================================================================= | 
|  | //          WINDOWS-1250 | 
|  | //============================================================================= | 
|  |  | 
/// A single code unit of WINDOWS-1250 encoded text.
enum Windows1250Char : ubyte
{
    _init
}

/**
 * A WINDOWS-1250 encoded string, represented as an array of
 * $(D immutable(Windows1250Char)).
 */
alias Windows1250String = immutable(Windows1250Char)[];
|  |  | 
/*
 * Encoder description for WINDOWS-1250.
 *
 * Supplies the lookup data consumed by the `GenericEncoder` mixin (defined
 * elsewhere in this module), which refers to `m_charMapStart`,
 * `m_charMapEnd`, `charMap` and `bstMap` by name.
 */
private template EncoderInstance(CharType : Windows1250Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1250Char;
    alias EString = Windows1250String;

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "windows-1250";
    }

    // First and last code unit described by `charMap` (inclusive).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xFF;

    // One Unicode code point per code unit, starting at `m_charMapStart`:
    // 128 entries for 0x80 .. 0xFF. U+FFFD appears at positions such as
    // 0x81 that have no character assigned.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
        "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
        "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
        "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
        "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
        "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
        "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
        "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
        "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
        "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
        "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
        "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
        "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
        "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Reverse mapping: (Unicode code point, encoded byte) pairs.
    // NOTE(review): the element order is presumably the array layout of a
    // binary search tree expected by GenericEncoder - do not reorder.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
        tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
        tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
        tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
        tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
        tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
        tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
        tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
        tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
        tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
        tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
        tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
        tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
        tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
        tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
        tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
        tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
        tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
        tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
        tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
        tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
        tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
        tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
        tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
        tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
        tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
        tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
        tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
        tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
        tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
        tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
        tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
        tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
        tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
        tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
        tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
|  |  | 
|  | //============================================================================= | 
|  | //          WINDOWS-1251 | 
|  | //============================================================================= | 
|  |  | 
/// A single code unit of WINDOWS-1251 encoded text.
enum Windows1251Char : ubyte
{
    _init
}

/**
 * A WINDOWS-1251 encoded string, represented as an array of
 * $(D immutable(Windows1251Char)).
 */
alias Windows1251String = immutable(Windows1251Char)[];
|  |  | 
/*
 * Encoder description for WINDOWS-1251.
 *
 * Supplies the lookup data consumed by the `GenericEncoder` mixin (defined
 * elsewhere in this module), which refers to `m_charMapStart`,
 * `m_charMapEnd`, `charMap` and `bstMap` by name.
 */
private template EncoderInstance(CharType : Windows1251Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1251Char;
    alias EString = Windows1251String;

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "windows-1251";
    }

    // First and last code unit described by `charMap` (inclusive).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xFF;

    // One Unicode code point per code unit, starting at `m_charMapStart`:
    // 128 entries for 0x80 .. 0xFF. U+FFFD sits at 0x98, which has no
    // character assigned.
    private immutable wstring charMap =
        "\u0402\u0403\u201A\u0453\u201E\u2026\u2020\u2021"~
        "\u20AC\u2030\u0409\u2039\u040A\u040C\u040B\u040F"~
        "\u0452\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0459\u203A\u045A\u045C\u045B\u045F"~
        "\u00A0\u040E\u045E\u0408\u00A4\u0490\u00A6\u00A7"~
        "\u0401\u00A9\u0404\u00AB\u00AC\u00AD\u00AE\u0407"~
        "\u00B0\u00B1\u0406\u0456\u0491\u00B5\u00B6\u00B7"~
        "\u0451\u2116\u0454\u00BB\u0458\u0405\u0455\u0457"~
        "\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"~
        "\u0418\u0419\u041A\u041B\u041C\u041D\u041E\u041F"~
        "\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427"~
        "\u0428\u0429\u042A\u042B\u042C\u042D\u042E\u042F"~
        "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437"~
        "\u0438\u0439\u043A\u043B\u043C\u043D\u043E\u043F"~
        "\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447"~
        "\u0448\u0449\u044A\u044B\u044C\u044D\u044E\u044F";

    // Reverse mapping: (Unicode code point, encoded byte) pairs.
    // NOTE(review): the element order is presumably the array layout of a
    // binary search tree expected by GenericEncoder - do not reorder.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0432','\xE2'), tuple('\u0412','\xC2'), tuple('\u0453','\x83'),
        tuple('\u0401','\xA8'), tuple('\u0422','\xD2'), tuple('\u0442','\xF2'),
        tuple('\u2018','\x91'), tuple('\u00AD','\xAD'), tuple('\u0409','\x8A'),
        tuple('\u041A','\xCA'), tuple('\u042A','\xDA'), tuple('\u043A','\xEA'),
        tuple('\u044A','\xFA'), tuple('\u045B','\x9E'), tuple('\u2022','\x95'),
        tuple('\u00A7','\xA7'), tuple('\u00B5','\xB5'), tuple('\u0405','\xBD'),
        tuple('\u040E','\xA1'), tuple('\u0416','\xC6'), tuple('\u041E','\xCE'),
        tuple('\u0426','\xD6'), tuple('\u042E','\xDE'), tuple('\u0436','\xE6'),
        tuple('\u043E','\xEE'), tuple('\u0446','\xF6'), tuple('\u044E','\xFE'),
        tuple('\u0457','\xBF'), tuple('\u0490','\xA5'), tuple('\u201D','\x94'),
        tuple('\u203A','\x9B'), tuple('\u00A4','\xA4'), tuple('\u00AB','\xAB'),
        tuple('\u00B0','\xB0'), tuple('\u00B7','\xB7'), tuple('\u0403','\x81'),
        tuple('\u0407','\xAF'), tuple('\u040B','\x8E'), tuple('\u0410','\xC0'),
        tuple('\u0414','\xC4'), tuple('\u0418','\xC8'), tuple('\u041C','\xCC'),
        tuple('\u0420','\xD0'), tuple('\u0424','\xD4'), tuple('\u0428','\xD8'),
        tuple('\u042C','\xDC'), tuple('\u0430','\xE0'), tuple('\u0434','\xE4'),
        tuple('\u0438','\xE8'), tuple('\u043C','\xEC'), tuple('\u0440','\xF0'),
        tuple('\u0444','\xF4'), tuple('\u0448','\xF8'), tuple('\u044C','\xFC'),
        tuple('\u0451','\xB8'), tuple('\u0455','\xBE'), tuple('\u0459','\x9A'),
        tuple('\u045E','\xA2'), tuple('\u2013','\x96'), tuple('\u201A','\x82'),
        tuple('\u2020','\x86'), tuple('\u2030','\x89'), tuple('\u2116','\xB9'),
        tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A9','\xA9'),
        tuple('\u00AC','\xAC'), tuple('\u00AE','\xAE'), tuple('\u00B1','\xB1'),
        tuple('\u00B6','\xB6'), tuple('\u00BB','\xBB'), tuple('\u0402','\x80'),
        tuple('\u0404','\xAA'), tuple('\u0406','\xB2'), tuple('\u0408','\xA3'),
        tuple('\u040A','\x8C'), tuple('\u040C','\x8D'), tuple('\u040F','\x8F'),
        tuple('\u0411','\xC1'), tuple('\u0413','\xC3'), tuple('\u0415','\xC5'),
        tuple('\u0417','\xC7'), tuple('\u0419','\xC9'), tuple('\u041B','\xCB'),
        tuple('\u041D','\xCD'), tuple('\u041F','\xCF'), tuple('\u0421','\xD1'),
        tuple('\u0423','\xD3'), tuple('\u0425','\xD5'), tuple('\u0427','\xD7'),
        tuple('\u0429','\xD9'), tuple('\u042B','\xDB'), tuple('\u042D','\xDD'),
        tuple('\u042F','\xDF'), tuple('\u0431','\xE1'), tuple('\u0433','\xE3'),
        tuple('\u0435','\xE5'), tuple('\u0437','\xE7'), tuple('\u0439','\xE9'),
        tuple('\u043B','\xEB'), tuple('\u043D','\xED'), tuple('\u043F','\xEF'),
        tuple('\u0441','\xF1'), tuple('\u0443','\xF3'), tuple('\u0445','\xF5'),
        tuple('\u0447','\xF7'), tuple('\u0449','\xF9'), tuple('\u044B','\xFB'),
        tuple('\u044D','\xFD'), tuple('\u044F','\xFF'), tuple('\u0452','\x90'),
        tuple('\u0454','\xBA'), tuple('\u0456','\xB3'), tuple('\u0458','\xBC'),
        tuple('\u045A','\x9C'), tuple('\u045C','\x9D'), tuple('\u045F','\x9F'),
        tuple('\u0491','\xB4'), tuple('\u2014','\x97'), tuple('\u2019','\x92'),
        tuple('\u201C','\x93'), tuple('\u201E','\x84'), tuple('\u2021','\x87'),
        tuple('\u2026','\x85'), tuple('\u2039','\x8B'), tuple('\u20AC','\x88'),
        tuple('\u2122','\x99')
    ];

    mixin GenericEncoder!();
}
|  |  | 
|  | //============================================================================= | 
|  | //          WINDOWS-1252 | 
|  | //============================================================================= | 
|  |  | 
/// A single code unit of WINDOWS-1252 encoded text.
enum Windows1252Char : ubyte
{
    _init
}

/**
 * A WINDOWS-1252 encoded string, represented as an array of
 * $(D immutable(Windows1252Char)).
 */
alias Windows1252String = immutable(Windows1252Char)[];
|  |  | 
/*
 * Encoder description for WINDOWS-1252.
 *
 * Supplies the lookup data consumed by the `GenericEncoder` mixin (defined
 * elsewhere in this module), which refers to `m_charMapStart`,
 * `m_charMapEnd`, `charMap` and `bstMap` by name. Unlike the other
 * single-byte tables in this module, the map here stops at 0x9F.
 */
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "windows-1252";
    }

    // First and last code unit described by `charMap` (inclusive).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9F;

    // One Unicode code point per code unit, starting at `m_charMapStart`:
    // 32 entries for 0x80 .. 0x9F. U+FFFD appears at positions such as
    // 0x81 that have no character assigned.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Reverse mapping: (Unicode code point, encoded byte) pairs.
    // NOTE(review): the element order is presumably the array layout of a
    // binary search tree expected by GenericEncoder - do not reorder.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
|  |  | 
|  | //============================================================================= | 
|  | //          UTF-8 | 
|  | //============================================================================= | 
|  |  | 
/*
 * Encoder description for UTF-8.
 *
 * The `...ViaRead` / `...ViaWrite` members are zero-parameter templates
 * that call `read`, `write`, `peek` and `canRead`. None of those are
 * declared in this template; they must be available wherever the members
 * are instantiated (see `EncoderFunctions`, defined elsewhere in this
 * module).
 */
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "UTF-8";
    }

    /// Every valid code point is encodable in UTF-8.
    bool canEncode(dchar c) pure nothrow @nogc @safe
    {
        return isValidCodePoint(c);
    }

    /// Accepts every byte except 0xC0, 0xC1 and 0xF5 .. 0xFF.
    bool isValidCodeUnit(char c) pure nothrow @nogc @safe
    {
        if (c < 0xC0)
            return true;
        return c >= 0xC2 && c < 0xF5;
    }

    // Number of trail bytes announced by a byte >= 0x80, indexed by
    // (byte - 0x80). Continuation bytes (0x80 .. 0xBF) and 0xFF map to 0.
    immutable ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x80 .. 0x8F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0x90 .. 0x9F
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xA0 .. 0xAF
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,    // 0xB0 .. 0xBF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xC0 .. 0xCF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,    // 0xD0 .. 0xDF
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,    // 0xE0 .. 0xEF
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,    // 0xF0 .. 0xFF
    ];

    /// Looks up the trail-byte count for `c`, which must be >= 0x80.
    private int tails(char c) pure nothrow @nogc @safe
    in
    {
        assert(c >= 0x80);
    }
    do
    {
        immutable slot = c - 0x80;
        return tailTable[slot];
    }

    /// Number of code units (1 to 4) needed to encode `c`.
    size_t encodedLength(dchar c) pure nothrow @nogc @safe
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c >= 0x10000) return 4;
        if (c >= 0x800) return 3;
        return c >= 0x80 ? 2 : 1;
    }

    /// Emits the UTF-8 bytes of `c` through `write`, lead byte first.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            // Single byte: the code point itself.
            write(cast(char) c);
            return;
        }

        if (c < 0x800)
        {
            // Two bytes: 110xxxxx 10xxxxxx
            write(cast(char)(0xC0 + (c >> 6)));
            write(cast(char)(0x80 + (c & 0x3F)));
            return;
        }

        if (c < 0x10000)
        {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            write(cast(char)(0xE0 + (c >> 12)));
            write(cast(char)(0x80 + ((c >> 6) & 0x3F)));
            write(cast(char)(0x80 + (c & 0x3F)));
            return;
        }

        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        write(cast(char)(0xF0 + (c >> 18)));
        write(cast(char)(0x80 + ((c >> 12) & 0x3F)));
        write(cast(char)(0x80 + ((c >> 6) & 0x3F)));
        write(cast(char)(0x80 + (c & 0x3F)));
    }

    /// Consumes one encoded sequence via `read` without decoding it.
    void skipViaRead()()
    {
        auto lead = read();
        if (lead < 0xC0) return;

        // Discard as many trail bytes as the lead byte announces.
        int tailCount = tails(cast(char) lead);
        foreach (_; 0 .. tailCount)
        {
            read();
        }
    }

    /// Decodes one sequence via `read`; performs no validation.
    dchar decodeViaRead()()
    {
        dchar result = read();
        if (result < 0xC0) return result;

        int tailCount = tails(cast(char) result);
        // Keep only the payload bits of the lead byte.
        result &= (1 << (6 - tailCount)) - 1;
        foreach (_; 0 .. tailCount)
        {
            result = (result << 6) + (read() & 0x3F);
        }
        return result;
    }

    /**
     * Decodes one sequence via `read`, returning `INVALID_SEQUENCE` for
     * malformed input. Uses `canRead` and `peek` so that a byte which is
     * not a continuation byte is left unconsumed.
     */
    dchar safeDecodeViaRead()()
    {
        dchar result = read();
        if (result < 0x80) return result;

        int tailCount = tails(cast(char) result);
        // A stray continuation byte (or 0xFF), or input that ends right
        // after the lead byte.
        if (tailCount == 0 || !canRead) return INVALID_SEQUENCE;

        // The lead byte together with the first trail byte is enough to
        // classify the sequence; the verdict is only applied after the
        // trail bytes have been consumed.
        size_t next = peek();
        immutable malformed =
        (
                (result < 0xC2)                                 // lead 0xC0/0xC1: overlong 2-byte form
            ||  (result > 0xF4)                                 // lead byte beyond the Unicode range
            ||  (result == 0xE0 && ((next & 0xE0) == 0x80))     // overlong 3-byte form
            ||  (result == 0xED && ((next & 0xE0) == 0xA0))     // UTF-16 surrogate
            ||  (result == 0xF0 && ((next & 0xF0) == 0x80))     // overlong 4-byte form
            ||  (result == 0xF4 && ((next & 0xF0) >= 0x90))     // code point > 0x10FFFF
        );

        result &= (1 << (6 - tailCount)) - 1;
        foreach (_; 0 .. tailCount)
        {
            if (!canRead) return INVALID_SEQUENCE;
            next = peek();
            if ((next & 0xC0) != 0x80) return INVALID_SEQUENCE;
            result = (result << 6) + (read() & 0x3F);
        }

        if (malformed) return INVALID_SEQUENCE;
        return result;
    }

    /**
     * Decodes one sequence whose code units arrive last-to-first from
     * `read`; performs no validation.
     */
    dchar decodeReverseViaRead()()
    {
        dchar result = read();
        if (result < 0x80) return result;

        // First unit read is the final trail byte: keep its six payload bits.
        result &= 0x3F;
        size_t shift = 0;
        foreach (_; 0 .. 4)
        {
            shift += 6;
            auto unit = read();
            size_t tailCount = tails(cast(char) unit);
            // Trail bytes carry six bits; a lead byte carries fewer.
            immutable mask = tailCount == 0 ? 0x3F : (1 << (6 - tailCount)) - 1;
            result += ((unit & mask) << shift);
            // A non-zero count means the lead byte was reached.
            if (tailCount != 0) break;
        }
        return result;
    }

    /// Sequence used in place of malformed input: U+FFFD.
    @property EString replacementSequence() pure nothrow @nogc @safe
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | //          UTF-16 | 
|  | //============================================================================= | 
|  |  | 
/*
 * Encoder description for UTF-16.
 *
 * The `...ViaRead` / `...ViaWrite` members are zero-parameter templates
 * that call `read`, `write`, `peek` and `canRead`. None of those are
 * declared in this template; they must be available wherever the members
 * are instantiated (see `EncoderFunctions`, defined elsewhere in this
 * module).
 */
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "UTF-16";
    }

    /// Every valid code point is encodable in UTF-16.
    bool canEncode(dchar c) pure nothrow @nogc @safe
    {
        return isValidCodePoint(c);
    }

    /// Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c) pure nothrow @nogc @safe
    {
        return true;
    }

    /// One code unit below U+10000, otherwise two (a surrogate pair).
    size_t encodedLength(dchar c) pure nothrow @nogc @safe
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c < 0x10000)
            return 1;
        return 2;
    }

    /// Emits `c` through `write`, as one unit or as a surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c >= 0x10000)
        {
            // Split the 20-bit offset into high and low surrogate.
            size_t offset = c - 0x10000;
            write(cast(wchar)(0xD800 + (offset >> 10)));
            write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
        }
        else
        {
            write(cast(wchar) c);
        }
    }

    /// Consumes one encoded sequence via `read` without decoding it.
    void skipViaRead()()
    {
        immutable unit = read();
        immutable isSurrogate = unit >= 0xD800 && unit < 0xE000;
        // A surrogate is followed by its partner; consume that too.
        if (isSurrogate)
            read();
    }

    /// Decodes one sequence via `read`; performs no validation.
    dchar decodeViaRead()()
    {
        wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;

        wchar trail = read();
        lead &= 0x3FF;
        trail &= 0x3FF;
        return 0x10000 + (lead << 10) + trail;
    }

    /**
     * Decodes one sequence via `read`, returning `INVALID_SEQUENCE` for a
     * lone low surrogate, truncated input, or a high surrogate that is not
     * followed by a low surrogate (which is then left unconsumed).
     */
    dchar safeDecodeViaRead()()
    {
        wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar) lead;

        // Low surrogate first, or nothing left to pair with.
        if (lead >= 0xDC00 || !canRead) return INVALID_SEQUENCE;

        wchar trail = peek();
        if (trail < 0xDC00 || trail >= 0xE000) return INVALID_SEQUENCE;
        trail = read();

        lead &= 0x3FF;
        trail &= 0x3FF;
        return 0x10000 + (lead << 10) + trail;
    }

    /**
     * Decodes one sequence whose code units arrive last-to-first from
     * `read`; performs no validation.
     */
    dchar decodeReverseViaRead()()
    {
        // Reading backwards: the low surrogate comes first.
        wchar low = read();
        if (low < 0xD800 || low >= 0xE000) return cast(dchar) low;

        wchar high = read();
        low &= 0x3FF;
        high &= 0x3FF;
        return 0x10000 + (high << 10) + low;
    }

    /// Sequence used in place of malformed input: U+FFFD.
    @property EString replacementSequence() pure nothrow @nogc @safe
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | //          UTF-32 | 
|  | //============================================================================= | 
|  |  | 
/*
 * Encoder description for UTF-32.
 *
 * Each code point is exactly one code unit, so most members simply
 * forward to `read` / `write`, which must be available wherever the
 * `...ViaRead` / `...ViaWrite` templates are instantiated (see
 * `EncoderFunctions`, defined elsewhere in this module).
 */
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    /// Name under which this encoding is reported.
    @property string encodingName() pure nothrow @nogc @safe
    {
        return "UTF-32";
    }

    /// Every valid code point is encodable in UTF-32.
    bool canEncode(dchar c) pure nothrow @nogc @safe
    {
        return isValidCodePoint(c);
    }

    /// A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) pure nothrow @nogc @safe
    {
        return isValidCodePoint(c);
    }

    /// Always one code unit.
    size_t encodedLength(dchar c) pure nothrow @nogc @safe
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    /// Emits `c` unchanged through `write`.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    /// Consumes a single code unit via `read`.
    void skipViaRead()()
    {
        read();
    }

    /// Returns the next code unit from `read`; performs no validation.
    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    /// Returns the next code unit, or `INVALID_SEQUENCE` if it is not a
    /// valid code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!isValidCodePoint(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    /// Same as forward decoding: a single `read`.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    /// Sequence used in place of malformed input: U+FFFD.
    @property EString replacementSequence() pure nothrow @nogc @safe
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
|  |  | 
|  | //============================================================================= | 
|  | // Below are forwarding functions which expose the function to the user | 
|  |  | 
/**
 Tests whether `c` is a valid code point.

 Valid code points are U+0000 .. U+10FFFF with the surrogate block
 U+D800 .. U+DFFF removed. The non-character code points U+FFFE and U+FFFF
 are accepted: they are valid code points even though they are not valid
 characters.

 Supersedes:
 This function supersedes `std.utf.startsValidDchar()`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     c = the code point to be tested

 Returns: `true` if `c` is a valid code point, `false` otherwise
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    // Anything at or above 0x110000 is outside the Unicode range.
    if (c >= 0x110000)
        return false;

    // Inside the range, only the surrogate block is excluded.
    immutable isSurrogate = c >= 0xD800 && c < 0xE000;
    return !isSurrogate;
}
|  |  | 
/**
 Returns the name of the encoding associated with the code unit type `T`.

 There is no argument from which the encoding could be deduced, so `T`
 must be given explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!T;
    return Encoder.encodingName;
}

///
@safe unittest
{
    // Unicode transformation formats
    assert(encodingName!char == "UTF-8");
    assert(encodingName!wchar == "UTF-16");
    assert(encodingName!dchar == "UTF-32");

    // Single-byte encodings
    assert(encodingName!AsciiChar == "ASCII");
    assert(encodingName!Latin1Char == "ISO-8859-1");
    assert(encodingName!Latin2Char == "ISO-8859-2");
    assert(encodingName!Windows1250Char == "windows-1250");
    assert(encodingName!Windows1251Char == "windows-1251");
    assert(encodingName!Windows1252Char == "windows-1252");
}
|  |  | 
/**
 Returns true iff the code point `c` can be represented in the encoding
 identified by the code unit type `E`.

 There is no argument from which the encoding could be deduced, so `E`
 must be given explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!E;
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    assert( canEncode!Latin1Char('A'));
    assert( canEncode!Latin2Char('A'));

    // U+00A0 lies outside ASCII but inside both Latin repertoires.
    assert(!canEncode!AsciiChar('\u00A0'));
    assert( canEncode!Latin1Char('\u00A0'));
    assert( canEncode!Latin2Char('\u00A0'));

    assert( canEncode!Windows1250Char('\u20AC'));
    assert(!canEncode!Windows1250Char('\u20AD'));
    assert(!canEncode!Windows1250Char('\uFFFD'));

    assert( canEncode!Windows1251Char('\u0402'));
    assert(!canEncode!Windows1251Char('\u20AD'));
    assert(!canEncode!Windows1251Char('\uFFFD'));

    assert( canEncode!Windows1252Char('\u20AC'));
    assert(!canEncode!Windows1252Char('\u20AD'));
    assert(!canEncode!Windows1252Char('\uFFFD'));

    // Beyond the last Unicode code point.
    assert(!canEncode!char(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    // Search for the first code point that ASCII cannot represent;
    // an empty result means there is none.
    auto unencodable = "The quick brown fox"
        .byDchar
        .find!(ch => !canEncode!AsciiChar(ch));
    assert(unencodable.empty);
}
|  |  | 
/**
 Returns true if the code unit `c` is legal in its encoding. For example,
 the byte 0x80 is not legal in ASCII, because ASCII code units are always in
 the range 0x00 to 0x7F.

 The encoding is selected by the type of `c`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!E;
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    // UTF-8, UTF-16, UTF-32
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));

    // Single-byte encodings
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1251Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1251Char) 0x98));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
|  |  | 
/**
 Returns true if the whole of `s` is correctly encoded, i.e. if its valid
 prefix (see `validLength`) spans the entire string.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}

///
@system pure unittest
{
    assert( isValid("\u20AC100"));

    // Starts with a continuation byte, so it cannot be valid UTF-8.
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
|  |  | 
/**
 Returns the length, in code units, of the longest validly encoded prefix
 of `s`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     s = the string to be tested
 */
size_t validLength(E)(const(E)[] s)
{
    size_t consumed = 0;
    while (s.length > 0)
    {
        // safeDecode advances `s`; remember where it started so the size
        // of the decoded sequence can be added afterwards.
        immutable remainingBefore = s.length;
        if (EncoderInstance!E.safeDecode(s) == INVALID_SEQUENCE)
            break;
        consumed += remainingBefore - s.length;
    }
    return consumed;
}
|  |  | 
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     s = the string to be sanitized

 Returns: `s` itself if it is valid, otherwise a newly allocated string
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    alias Encoder = EncoderInstance!E;

    // Fast path: nothing to repair.
    immutable prefix = validLength(s);
    if (prefix == s.length) return s;

    auto repSeq = Encoder.replacementSequence;

    // Pass 1: compute an upper bound for the output length. Each invalid
    // sequence adds one replacement sequence on top of the input length;
    // the dropped code units are not subtracted, so this may overestimate.
    size_t capacity = s.length;
    for (const(E)[] rest = s[prefix .. $]; rest.length != 0; )
    {
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        capacity += repSeq.length;
        rest = rest[validLength(rest) .. $];
    }

    // Pass 2: copy the valid runs, inserting a replacement sequence in
    // place of every invalid one.
    E[] buffer = new E[capacity];
    buffer[0 .. prefix] = s[0 .. prefix];
    size_t pos = prefix;

    for (const(E)[] rest = s[prefix .. $]; rest.length != 0; )
    {
        // `rest` always starts at an invalid sequence here; skip it.
        immutable c = Encoder.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);

        immutable afterRep = pos + repSeq.length;
        buffer[pos .. afterRep] = repSeq[];

        // Copy the valid run that follows, up to the next invalid sequence.
        immutable run = validLength(rest);
        buffer[afterRep .. afterRep + run] = rest[0 .. run];
        pos = afterRep + run;
        rest = rest[run .. $];
    }

    // Trim the overestimate.
    return cast(immutable(E)[]) buffer[0 .. pos];
}

///
@system pure unittest
{
    // The truncated sequence F0 80 becomes a single U+FFFD (EF BF BD).
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
|  |  | 
/**
 Returns the length, in code units, of the first encoded sequence in `s`.

 The input to this function MUST be non-empty and MUST begin with a validly
 encoded sequence. This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
     s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Decode from a copy of the slice so `s` itself is left untouched.
    const(E)[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
do
{
    // Skip one sequence on a copy and measure how far the copy advanced.
    const(E)[] rest = s;
    EncoderInstance!E.skip(rest);
    return s.length - rest.length;
}

///
@system pure unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    assert(firstSequence("hel") == "h".length);
}
|  |  | 
|  | /** | 
|  | Returns the number of code units occupied by the last encoded sequence | 
|  | of a string. | 
|  |  | 
|  | The input to this function MUST be non-empty and validly encoded. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string to be sliced | 
|  | */ | 
|  | size_t lastSequence(E)(const(E)[] s) | 
|  | in | 
|  | { | 
|  |     assert(s.length != 0); | 
|  |     assert(isValid(s)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // decodeReverse trims the final sequence off `s`; the shrinkage is the answer. | 
|  |     immutable total = s.length; | 
|  |     EncoderInstance!(E).decodeReverse(s); | 
|  |     return total - s.length; | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system pure unittest | 
|  | { | 
|  |     assert(lastSequence("1000\u20AC") == "\u20AC".length); | 
|  |     assert(lastSequence("hellö") == "ö".length); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Returns the array index at which the (n+1)th code point begins, i.e. the | 
|  | number of code units occupied by the first `n` code points. | 
|  |  | 
|  | The input to this function MUST be validly encoded and `n` MUST NOT be | 
|  | negative. This is enforced by the function's in-contract. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.toUTFindex(). | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string to be counted | 
|  | n = the current code point index | 
|  | */ | 
|  | ptrdiff_t index(E)(const(E)[] s,int n) | 
|  | in | 
|  | { | 
|  |     assert(isValid(s)); | 
|  |     assert(n >= 0); | 
|  | } | 
|  | do | 
|  | { | 
|  |     immutable total = s.length; | 
|  |  | 
|  |     // Drop n code points from the front of the local slice. | 
|  |     size_t skipped = 0; | 
|  |     while (skipped < n) | 
|  |     { | 
|  |         EncoderInstance!(E).skip(s); | 
|  |         ++skipped; | 
|  |     } | 
|  |     return total - s.length; | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system pure unittest | 
|  | { | 
|  |     assert(index("\u20AC100",1) == 3); | 
|  |     assert(index("hällo",2) == 3); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Decodes a single code point. | 
|  |  | 
|  | One or more code units are removed from the start of the string, and the | 
|  | code point they represent is returned. | 
|  |  | 
|  | The input to this function MUST be non-empty and MUST begin with a validly | 
|  | encoded sequence. This is enforced by the function's in-contract. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.decode(), however, note that the | 
|  | function codePoints() supersedes it more conveniently. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string whose first code point is to be decoded | 
|  | */ | 
|  | dchar decode(S)(ref S s) | 
|  | in | 
|  | { | 
|  |     assert(s.length != 0); | 
|  |     // Validate on a copy; `s` itself must only be advanced by the body. | 
|  |     auto probe = s; | 
|  |     assert(safeDecode(probe) != INVALID_SEQUENCE); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // The encoding is selected by the element type of the string. | 
|  |     alias Unit = typeof(s[0]); | 
|  |     return EncoderInstance!(Unit).decode(s); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Decodes a single code point from the end of a string. | 
|  |  | 
|  | One or more code units are removed from the end of the string, and the | 
|  | code point they represent is returned. | 
|  |  | 
|  | The input to this function MUST be non-empty and validly encoded. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string whose last code point is to be decoded | 
|  | */ | 
|  | dchar decodeReverse(E)(ref const(E)[] s) | 
|  | in | 
|  | { | 
|  |     assert(s.length != 0); | 
|  |     assert(isValid(s)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     alias Codec = EncoderInstance!(E); | 
|  |     return Codec.decodeReverse(s); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Decodes a single code point. The input does not have to be valid. | 
|  |  | 
|  | One or more code units are removed from the start of the string, and the | 
|  | code point they represent is returned. | 
|  |  | 
|  | An invalidly encoded string is accepted as input. If an invalid sequence | 
|  | is found at the start of the string, it is removed and the value | 
|  | INVALID_SEQUENCE is returned. | 
|  |  | 
|  | The input MUST be non-empty. This is enforced by the function's in-contract. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string whose first code point is to be decoded | 
|  | */ | 
|  | dchar safeDecode(S)(ref S s) | 
|  | in | 
|  | { | 
|  |     assert(s.length != 0); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // The encoding is selected by the element type of the string. | 
|  |     alias Unit = typeof(s[0]); | 
|  |     return EncoderInstance!(Unit).safeDecode(s); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Returns the number of code units required to encode a single code point. | 
|  |  | 
|  | The input to this function MUST be a valid code point. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | The encoding cannot be deduced from the argument, so it must be specified | 
|  | explicitly as a template parameter. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | c = the code point to be encoded | 
|  | */ | 
|  | size_t encodedLength(E)(dchar c) | 
|  | in | 
|  | { | 
|  |     assert(isValidCodePoint(c)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     alias Codec = EncoderInstance!(E); | 
|  |     return Codec.encodedLength(c); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Encodes a single code point. | 
|  |  | 
|  | The code point is encoded into one or more code units, which are returned | 
|  | as an array. | 
|  |  | 
|  | The input to this function MUST be a valid code point. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | The encoding cannot be deduced from the argument, so it must be specified | 
|  | explicitly as a template parameter. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.encode(), however, note that the | 
|  | function codeUnits() supersedes it more conveniently. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | c = the code point to be encoded | 
|  | */ | 
|  | E[] encode(E)(dchar c) | 
|  | in | 
|  | { | 
|  |     assert(isValidCodePoint(c)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     alias Codec = EncoderInstance!(E); | 
|  |     return Codec.encode(c); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Encodes a single code point into an array. | 
|  |  | 
|  | The code point is encoded into one or more code units, which are stored | 
|  | at the start of a user-supplied array. | 
|  |  | 
|  | The input to this function MUST be a valid code point. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | The encoding cannot be deduced from the code point, so it must be specified | 
|  | explicitly as a template parameter. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.encode(), however, note that the | 
|  | function codeUnits() supersedes it more conveniently. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | c     = the code point to be encoded | 
|  | array = the destination array | 
|  |  | 
|  | Returns: | 
|  | the number of code units written to the array | 
|  | */ | 
|  | size_t encode(E)(dchar c, E[] array) | 
|  | in | 
|  | { | 
|  |     assert(isValidCodePoint(c)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // The encoder advances the slice past what it wrote; `array` is a local | 
|  |     // copy of the caller's slice, so the shrinkage is the count written. | 
|  |     immutable capacity = array.length; | 
|  |     EncoderInstance!(E).encode(c, array); | 
|  |     return capacity - array.length; | 
|  | } | 
|  |  | 
|  | /* | 
|  | Encodes the code point `c` in code units of type `E` (char, wchar or dchar, | 
|  | i.e. UTF-8, UTF-16 or UTF-32) and writes them to the output range `range`. | 
|  |  | 
|  | Every code unit is written with the free function `put`, so all three | 
|  | encodings treat the output range the same way. | 
|  |  | 
|  | Params: | 
|  | c     = the code point to be encoded | 
|  | range = the output range receiving the code units | 
|  |  | 
|  | Returns: the number of `E`s written. | 
|  | */ | 
|  | size_t encode(E, R)(dchar c, auto ref R range) | 
|  | if (isNativeOutputRange!(R, E)) | 
|  | { | 
|  |     static if (is(immutable E == immutable char)) | 
|  |     { | 
|  |         // UTF-8: one to four code units, chosen by magnitude. | 
|  |         if (c <= 0x7F) | 
|  |         { | 
|  |             put(range, cast(char) c); | 
|  |             return 1; | 
|  |         } | 
|  |         if (c <= 0x7FF) | 
|  |         { | 
|  |             put(range, cast(char)(0xC0 | (c >> 6))); | 
|  |             put(range, cast(char)(0x80 | (c & 0x3F))); | 
|  |             return 2; | 
|  |         } | 
|  |         if (c <= 0xFFFF) | 
|  |         { | 
|  |             put(range, cast(char)(0xE0 | (c >> 12))); | 
|  |             put(range, cast(char)(0x80 | ((c >> 6) & 0x3F))); | 
|  |             put(range, cast(char)(0x80 | (c & 0x3F))); | 
|  |             return 3; | 
|  |         } | 
|  |         if (c <= 0x10FFFF) | 
|  |         { | 
|  |             put(range, cast(char)(0xF0 | (c >> 18))); | 
|  |             put(range, cast(char)(0x80 | ((c >> 12) & 0x3F))); | 
|  |             put(range, cast(char)(0x80 | ((c >> 6) & 0x3F))); | 
|  |             put(range, cast(char)(0x80 | (c & 0x3F))); | 
|  |             return 4; | 
|  |         } | 
|  |         else | 
|  |         { | 
|  |             // Values above U+10FFFF are not code points. | 
|  |             assert(0); | 
|  |         } | 
|  |     } | 
|  |     else static if (is(immutable E == immutable wchar)) | 
|  |     { | 
|  |         // UTF-16: a single unit for the BMP, otherwise a surrogate pair. | 
|  |         if (c <= 0xFFFF) | 
|  |         { | 
|  |             put(range, cast(wchar) c); | 
|  |             return 1; | 
|  |         } | 
|  |         put(range, cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800)); | 
|  |         put(range, cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00)); | 
|  |         return 2; | 
|  |     } | 
|  |     else static if (is(immutable E == immutable dchar)) | 
|  |     { | 
|  |         // UTF-32: the code point is its own code unit. | 
|  |         put(range, c); | 
|  |         return 1; | 
|  |     } | 
|  |     else | 
|  |     { | 
|  |         static assert(0); | 
|  |     } | 
|  | } | 
|  |  | 
|  | @safe pure unittest | 
|  | { | 
|  |     import std.array; | 
|  |  | 
|  |     // An ASCII code point occupies exactly one code unit in every UTF | 
|  |     // encoding; all three writes go to the same char appender. | 
|  |     Appender!(char[]) sink; | 
|  |     assert(encode!(char)('T', sink) == 1); | 
|  |     assert(encode!(wchar)('T', sink) == 1); | 
|  |     assert(encode!(dchar)('T', sink) == 1); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Encodes a single code point to a delegate. | 
|  |  | 
|  | The code point is encoded into one or more code units, which are passed | 
|  | one at a time to the supplied delegate. | 
|  |  | 
|  | The input to this function MUST be a valid code point. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | The encoding cannot be deduced from the code point, so it must be specified | 
|  | explicitly as a template parameter. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.encode(), however, note that the | 
|  | function codeUnits() supersedes it more conveniently. | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | c  = the code point to be encoded | 
|  | dg = the delegate to invoke for each code unit | 
|  | */ | 
|  | void encode(E)(dchar c, void delegate(E) dg) | 
|  | in | 
|  | { | 
|  |     assert(isValidCodePoint(c)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     alias Codec = EncoderInstance!(E); | 
|  |     Codec.encode(c, dg); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Encodes the contents of `s` in units of type `Tgt`, writing the result to an | 
|  | output range. | 
|  |  | 
|  | `s` is decoded one code point at a time according to the encoding implied by | 
|  | `Src`, and every code point is then re-encoded as `Tgt`. Multi-unit sequences | 
|  | in `s` are therefore converted as a whole rather than unit by unit. | 
|  |  | 
|  | `s` MUST be validly encoded. | 
|  |  | 
|  | Returns: The number of `Tgt` elements written. | 
|  | Params: | 
|  | Tgt = Element type of `range`. | 
|  | s = Input array. | 
|  | range = Output range. | 
|  | */ | 
|  | size_t encode(Tgt, Src, R)(in Src[] s, R range) | 
|  | { | 
|  |     size_t result; | 
|  |  | 
|  |     // Iterating `s` directly would yield code units, so each unit of a | 
|  |     // multi-unit sequence would be re-encoded as if it were a code point. | 
|  |     // Decode from a local slice instead; `decode` advances it. | 
|  |     const(Src)[] rest = s; | 
|  |     while (rest.length != 0) | 
|  |     { | 
|  |         immutable dchar c = decode(rest); | 
|  |         result += encode!(Tgt)(c, range); | 
|  |     } | 
|  |     return result; | 
|  | } | 
|  |  | 
|  | /** | 
|  | Returns a foreachable struct which can bidirectionally iterate over all | 
|  | code points in a string. | 
|  |  | 
|  | The input to this function MUST be validly encoded. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | You can foreach either with or without an index. If an index is specified, | 
|  | it will be initialized at each iteration with the offset into the string at | 
|  | which the code point begins. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.decode(). | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = the string to be decoded | 
|  |  | 
|  | Example: | 
|  | -------------------------------------------------------- | 
|  | string s = "hello world"; | 
|  | foreach (c;codePoints(s)) | 
|  | { | 
|  | // do something with c (which will always be a dchar) | 
|  | } | 
|  | -------------------------------------------------------- | 
|  |  | 
|  | Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s) | 
|  | in that the latter will fall over on encountering U+FFFF. | 
|  | */ | 
|  | CodePoints!(E) codePoints(E)(immutable(E)[] s) | 
|  | in | 
|  | { | 
|  |     assert(isValid(s)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // Wrap the string; CodePoints supplies the iteration. | 
|  |     auto view = CodePoints!(E)(s); | 
|  |     return view; | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system unittest | 
|  | { | 
|  |     string s = "hello"; | 
|  |     string t; | 
|  |     foreach (c;codePoints(s)) | 
|  |     { | 
|  |         t ~= cast(char) c; | 
|  |     } | 
|  |     assert(s == t); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Returns a foreachable struct which can bidirectionally iterate over all | 
|  | code units in a code point. | 
|  |  | 
|  | The input to this function MUST be a valid code point. | 
|  | This is enforced by the function's in-contract. | 
|  |  | 
|  | The encoding cannot be deduced from the argument, so it must be specified | 
|  | explicitly in the template parameter. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.encode(). | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | c = the code point to be encoded | 
|  | */ | 
|  | CodeUnits!(E) codeUnits(E)(dchar c) | 
|  | in | 
|  | { | 
|  |     assert(isValidCodePoint(c)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     // Wrap the code point; CodeUnits supplies the iteration. | 
|  |     auto units = CodeUnits!(E)(c); | 
|  |     return units; | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system unittest | 
|  | { | 
|  |     char[] a; | 
|  |     foreach (c;codeUnits!(char)(cast(dchar)'\u20AC')) | 
|  |     { | 
|  |         a ~= c; | 
|  |     } | 
|  |     assert(a.length == 3); | 
|  |     assert(a[0] == 0xE2); | 
|  |     assert(a[1] == 0x82); | 
|  |     assert(a[2] == 0xAC); | 
|  | } | 
|  |  | 
|  | /** | 
|  | Convert a string from one encoding to another. | 
|  |  | 
|  | When source and destination share the same immutable element type, the | 
|  | destination simply refers to the source data; otherwise a new array is | 
|  | allocated and filled. | 
|  |  | 
|  | Supersedes: | 
|  | This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and | 
|  | std.utf.toUTF32() | 
|  | (but note that to!() supersedes it more conveniently). | 
|  |  | 
|  | Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250, | 
|  | WINDOWS-1251, WINDOWS-1252 | 
|  |  | 
|  | Params: | 
|  | s = Source string. $(B Must) be validly encoded. | 
|  | This is enforced by the function's in-contract. | 
|  | r = Destination string | 
|  |  | 
|  | See_Also: | 
|  | $(REF to, std,conv) | 
|  | */ | 
|  | void transcode(Src, Dst)(Src[] s, out Dst[] r) | 
|  | in | 
|  | { | 
|  |     assert(isValid(s)); | 
|  | } | 
|  | do | 
|  | { | 
|  |     static if (is(Src == Dst) && is(Src == immutable)) | 
|  |     { | 
|  |         // Identical immutable element types: share the data, no copy. | 
|  |         r = s; | 
|  |     } | 
|  |     else static if (is(immutable Src == immutable AsciiChar)) | 
|  |     { | 
|  |         // Re-enter with the ASCII units viewed as chars. | 
|  |         transcode(cast(const(char)[])s, r); | 
|  |     } | 
|  |     else | 
|  |     { | 
|  |         alias Unit = Unqual!Dst; | 
|  |  | 
|  |         // Number of free units demanded before encoding the next code point. | 
|  |         static if (is(immutable Dst == immutable wchar)) | 
|  |         { | 
|  |             immutable headroom = 2; | 
|  |         } | 
|  |         else static if (is(immutable Dst == immutable dchar)) | 
|  |         { | 
|  |             immutable headroom = 1; | 
|  |         } | 
|  |         else | 
|  |         { | 
|  |             immutable headroom = 6; | 
|  |         } | 
|  |  | 
|  |         // `storage` is the whole output array; `unused` is its not yet | 
|  |         // written tail, which the encoder advances as it writes. | 
|  |         auto storage = new Unit[s.length]; | 
|  |         auto unused = storage; | 
|  |  | 
|  |         while (s.length != 0) | 
|  |         { | 
|  |             if (unused.length < headroom) | 
|  |             { | 
|  |                 // Grow, then re-derive the tail because resizing may have | 
|  |                 // moved the array. | 
|  |                 immutable written = storage.length - unused.length; | 
|  |                 storage.length += s.length + headroom; | 
|  |                 unused = storage[written .. $]; | 
|  |             } | 
|  |             EncoderInstance!(Unit).encode(decode(s), unused); | 
|  |         } | 
|  |  | 
|  |         // Keep only the written part. | 
|  |         r = cast(Dst[]) storage[0 .. storage.length - unused.length]; | 
|  |     } | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system pure unittest | 
|  | { | 
|  |     wstring ws; | 
|  |     // transcode from UTF-8 to UTF-16 | 
|  |     transcode("hello world",ws); | 
|  |     assert(ws == "hello world"w); | 
|  |  | 
|  |     Latin1String ls; | 
|  |     // transcode from UTF-16 to ISO-8859-1 | 
|  |     transcode(ws, ls); | 
|  |     assert(ls == "hello world"); | 
|  | } | 
|  |  | 
|  | @system pure unittest | 
|  | { | 
|  |     import std.meta; | 
|  |     import std.range; | 
|  |  | 
|  |     // Round trip through every pair of supported string types; the text | 
|  |     // must survive unchanged. | 
|  |     { | 
|  |         import std.conv : to; | 
|  |  | 
|  |         string original = to!string(iota(0, 128, 1)); | 
|  |  | 
|  |         alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString, | 
|  |             Windows1250String, Windows1251String, Windows1252String, dstring, wstring); | 
|  |         foreach (S; Types) | 
|  |             foreach (D; Types) | 
|  |             { | 
|  |                 string roundTripped; | 
|  |                 S first; | 
|  |                 D second; | 
|  |                 transcode(original, first); | 
|  |                 transcode(first, second); | 
|  |                 transcode(second, roundTripped); | 
|  |                 assert(original == roundTripped); | 
|  |             } | 
|  |     } | 
|  |  | 
|  |     // Same round trip with non-ASCII text, restricted to the UTF types. | 
|  |     { | 
|  |         string original = "Příliš žluťoučký kůň úpěl ďábelské ódy."; | 
|  |         alias Types = AliasSeq!(string, dstring, wstring); | 
|  |         foreach (S; Types) | 
|  |             foreach (D; Types) | 
|  |             { | 
|  |                 string roundTripped; | 
|  |                 S first; | 
|  |                 D second; | 
|  |                 transcode(original, first); | 
|  |                 transcode(first, second); | 
|  |                 transcode(second, roundTripped); | 
|  |                 assert(original == roundTripped); | 
|  |             } | 
|  |     } | 
|  | } | 
|  |  | 
|  | @system unittest // mutable/const input/output | 
|  | { | 
|  |     import std.meta : AliasSeq; | 
|  |  | 
|  |     // Every input qualifier must work with every output qualifier. | 
|  |     static foreach (O; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char)) | 
|  |     {{ | 
|  |         O[] latin1; | 
|  |  | 
|  |         char[] fromMutable = "äbc".dup; | 
|  |         transcode(fromMutable, latin1); | 
|  |         assert(latin1 == [0xE4, 'b', 'c']); | 
|  |  | 
|  |         const char[] fromConst = "öbc"; | 
|  |         transcode(fromConst, latin1); | 
|  |         assert(latin1 == [0xF6, 'b', 'c']); | 
|  |  | 
|  |         immutable char[] fromImmutable = "übc"; | 
|  |         transcode(fromImmutable, latin1); | 
|  |         assert(latin1 == [0xFC, 'b', 'c']); | 
|  |     }} | 
|  |  | 
|  |     // Make sure that const/mutable input is copied. | 
|  |     static foreach (C; AliasSeq!(char, const char)) | 
|  |     {{ | 
|  |         C[] source = "foo".dup; | 
|  |         C[] copy; | 
|  |         transcode(source, copy); | 
|  |         assert(source == copy); | 
|  |         assert(source !is copy); | 
|  |     }} | 
|  |  | 
|  |     // But immutable input should not be copied. | 
|  |     string source = "foo"; | 
|  |     string alias_; | 
|  |     transcode(source, alias_); | 
|  |     assert(source is alias_); | 
|  | } | 
|  |  | 
|  | //============================================================================= | 
|  |  | 
|  | /** The base class for exceptions thrown by this module */ | 
|  | class EncodingException : Exception | 
|  | { | 
|  |     // Forwards the message to Exception. | 
|  |     this(string msg) @safe pure | 
|  |     { | 
|  |         super(msg); | 
|  |     } | 
|  | } | 
|  |  | 
|  | // EncodingException subclass; the constructor is private. | 
|  | class UnrecognizedEncodingException : EncodingException | 
|  | { | 
|  |     private this(string msg) @safe pure | 
|  |     { | 
|  |         super(msg); | 
|  |     } | 
|  | } | 
|  |  | 
|  | /** Abstract base class of all encoding schemes */ | 
|  | abstract class EncodingScheme | 
|  | { | 
|  |     import std.uni : toLower; | 
|  |  | 
|  |     /** | 
|  |      * Registers a subclass of EncodingScheme. | 
|  |      * | 
|  |      * This function allows user-defined subclasses of EncodingScheme to | 
|  |      * be declared in other modules. | 
|  |      * | 
|  |      * Params: | 
|  |      *     Klass = The subclass of EncodingScheme to register. | 
|  |      * | 
|  |      * Example: | 
|  |      * ---------------------------------------------- | 
|  |      * class Amiga1251 : EncodingScheme | 
|  |      * { | 
|  |      *     shared static this() | 
|  |      *     { | 
|  |      *         EncodingScheme.register!Amiga1251; | 
|  |      *     } | 
|  |      * } | 
|  |      * ---------------------------------------------- | 
|  |      */ | 
|  |     static void register(Klass:EncodingScheme)() | 
|  |     { | 
|  |         // A throw-away instance is only needed to query the names. | 
|  |         scope scheme = new Klass(); | 
|  |         foreach (encodingName;scheme.names()) | 
|  |         { | 
|  |             // Names are stored lower-cased; create() lower-cases its argument too. | 
|  |             supported[toLower(encodingName)] = () => new Klass(); | 
|  |         } | 
|  |     } | 
|  |  | 
|  |     deprecated("Please pass the EncodingScheme subclass as template argument instead.") | 
|  |     static void register(string className) | 
|  |     { | 
|  |         auto scheme = createByClassName(className); | 
|  |         foreach (encodingName;scheme.names()) | 
|  |         { | 
|  |             supportedFactories[toLower(encodingName)] = className; | 
|  |         } | 
|  |     } | 
|  |  | 
|  |     /* | 
|  |      * Instantiates the class with the given fully qualified name through | 
|  |      * runtime type information and returns it as an EncodingScheme. | 
|  |      * | 
|  |      * Throws: EncodingException if the class is not known to the runtime, | 
|  |      * cannot be created, or is not an EncodingScheme. | 
|  |      */ | 
|  |     private static EncodingScheme createByClassName(string className) | 
|  |     { | 
|  |         // ClassInfo.find returns null for an unknown name, so it must be | 
|  |         // checked before create() is called on it. | 
|  |         auto info = ClassInfo.find(className); | 
|  |         auto scheme = info is null ? null : cast(EncodingScheme) info.create(); | 
|  |         if (scheme is null) | 
|  |             throw new EncodingException("Unable to create class "~className); | 
|  |         return scheme; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Obtains a subclass of EncodingScheme which is capable of encoding | 
|  |      * and decoding the named encoding scheme. | 
|  |      * | 
|  |      * This function is only aware of EncodingSchemes which have been | 
|  |      * registered with the register() function. | 
|  |      * | 
|  |      * Example: | 
|  |      * --------------------------------------------------- | 
|  |      * auto scheme = EncodingScheme.create("Amiga-1251"); | 
|  |      * --------------------------------------------------- | 
|  |      */ | 
|  |     static EncodingScheme create(string encodingName) | 
|  |     { | 
|  |         static bool registerDefaultEncodings() | 
|  |         { | 
|  |             EncodingScheme.register!EncodingSchemeASCII; | 
|  |             EncodingScheme.register!EncodingSchemeLatin1; | 
|  |             EncodingScheme.register!EncodingSchemeLatin2; | 
|  |             EncodingScheme.register!EncodingSchemeWindows1250; | 
|  |             EncodingScheme.register!EncodingSchemeWindows1251; | 
|  |             EncodingScheme.register!EncodingSchemeWindows1252; | 
|  |             EncodingScheme.register!EncodingSchemeUtf8; | 
|  |             EncodingScheme.register!EncodingSchemeUtf16Native; | 
|  |             EncodingScheme.register!EncodingSchemeUtf32Native; | 
|  |             return true; | 
|  |         } | 
|  |  | 
|  |         // The built-in schemes are registered on first use. | 
|  |         static shared bool initialized; | 
|  |         import std.concurrency : initOnce; | 
|  |         initOnce!initialized(registerDefaultEncodings()); | 
|  |         encodingName = toLower(encodingName); | 
|  |  | 
|  |         // Schemes registered by type come with a factory function. | 
|  |         if (auto p = encodingName in supported) | 
|  |             return (*p)(); | 
|  |  | 
|  |         // Otherwise fall back to schemes registered by class name. | 
|  |         auto p = encodingName in supportedFactories; | 
|  |         if (p is null) | 
|  |             throw new EncodingException("Unrecognized Encoding: "~encodingName); | 
|  |         return createByClassName(*p); | 
|  |     } | 
|  |  | 
|  |     const | 
|  |     { | 
|  |         /** | 
|  |          * Returns the standard name of the encoding scheme | 
|  |          */ | 
|  |         abstract override string toString(); | 
|  |  | 
|  |         /** | 
|  |          * Returns an array of all known names for this encoding scheme | 
|  |          */ | 
|  |         abstract string[] names(); | 
|  |  | 
|  |         /** | 
|  |          * Returns true if the character c can be represented | 
|  |          * in this encoding scheme. | 
|  |          */ | 
|  |         abstract bool canEncode(dchar c); | 
|  |  | 
|  |         /** | 
|  |          * Returns the number of ubytes required to encode this code point. | 
|  |          * | 
|  |          * The input to this function MUST be a valid code point. | 
|  |          * | 
|  |          * Params: | 
|  |          *    c = the code point to be encoded | 
|  |          * | 
|  |          * Returns: | 
|  |          *    the number of ubytes required. | 
|  |          */ | 
|  |         abstract size_t encodedLength(dchar c); | 
|  |  | 
|  |         /** | 
|  |          * Encodes a single code point into a user-supplied, fixed-size buffer. | 
|  |          * | 
|  |          * This function encodes a single code point into one or more ubytes. | 
|  |          * The supplied buffer must be code unit aligned. | 
|  |          * (For example, UTF-16LE or UTF-16BE must be wchar-aligned, | 
|  |          * UTF-32LE or UTF-32BE must be dchar-aligned, etc.) | 
|  |          * | 
|  |          * The input to this function MUST be a valid code point. | 
|  |          * | 
|  |          * Params: | 
|  |          *    c      = the code point to be encoded | 
|  |          *    buffer = the destination array | 
|  |          * | 
|  |          * Returns: | 
|  |          *    the number of ubytes written. | 
|  |          */ | 
|  |         abstract size_t encode(dchar c, ubyte[] buffer); | 
|  |  | 
|  |         /** | 
|  |          * Decodes a single code point. | 
|  |          * | 
|  |          * This function removes one or more ubytes from the start of an array, | 
|  |          * and returns the decoded code point which those ubytes represent. | 
|  |          * | 
|  |          * The input to this function MUST be validly encoded. | 
|  |          * | 
|  |          * Params: | 
|  |          *    s = the array whose first code point is to be decoded | 
|  |          */ | 
|  |         abstract dchar decode(ref const(ubyte)[] s); | 
|  |  | 
|  |         /** | 
|  |          * Decodes a single code point. The input does not have to be valid. | 
|  |          * | 
|  |          * This function removes one or more ubytes from the start of an array, | 
|  |          * and returns the decoded code point which those ubytes represent. | 
|  |          * | 
|  |          * This function will accept an invalidly encoded array as input. | 
|  |          * If an invalid sequence is found at the start of the string, this | 
|  |          * function will remove it, and return the value INVALID_SEQUENCE. | 
|  |          * | 
|  |          * Params: | 
|  |          *    s = the array whose first code point is to be decoded | 
|  |          */ | 
|  |         abstract dchar safeDecode(ref const(ubyte)[] s); | 
|  |  | 
|  |         /** | 
|  |          * Returns the sequence of ubytes to be used to represent | 
|  |          * any character which cannot be represented in the encoding scheme. | 
|  |          * | 
|  |          * Normally this will be a representation of some substitution | 
|  |          * character, such as U+FFFD or '?'. | 
|  |          */ | 
|  |         abstract @property immutable(ubyte)[] replacementSequence(); | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Returns true if the array is encoded correctly | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the array to be tested | 
|  |      */ | 
|  |     bool isValid(const(ubyte)[] s) | 
|  |     { | 
|  |         while (s.length != 0) | 
|  |         { | 
|  |             if (safeDecode(s) == INVALID_SEQUENCE) | 
|  |                 return false; | 
|  |         } | 
|  |         return true; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Returns the length of the longest possible substring, starting from | 
|  |      * the first element, which is validly encoded. | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the array to be tested | 
|  |      */ | 
|  |     size_t validLength()(const(ubyte)[] s) | 
|  |     { | 
|  |         const(ubyte)[] r = s; | 
|  |         // t trails s: it is only advanced past sequences that decoded successfully. | 
|  |         const(ubyte)[] t = s; | 
|  |         while (s.length != 0) | 
|  |         { | 
|  |             if (safeDecode(s) == INVALID_SEQUENCE) break; | 
|  |             t = s; | 
|  |         } | 
|  |         return r.length - t.length; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Sanitizes an array by replacing malformed ubyte sequences with valid | 
|  |      * ubyte sequences. The result is guaranteed to be valid for this | 
|  |      * encoding scheme. | 
|  |      * | 
|  |      * If the input array is already valid, this function returns the | 
|  |      * original, otherwise it constructs a new array by replacing all illegal | 
|  |      * sequences with the encoding scheme's replacement sequence. | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the string to be sanitized | 
|  |      */ | 
|  |     immutable(ubyte)[] sanitize()(immutable(ubyte)[] s) | 
|  |     { | 
|  |         auto n = validLength(s); | 
|  |         if (n == s.length) return s; | 
|  |  | 
|  |         auto repSeq = replacementSequence; | 
|  |  | 
|  |         // Count how long the string needs to be. | 
|  |         // Overestimating is not a problem | 
|  |         auto len = s.length; | 
|  |         const(ubyte)[] t = s[n..$]; | 
|  |         while (t.length != 0) | 
|  |         { | 
|  |             immutable c = safeDecode(t); | 
|  |             assert(c == INVALID_SEQUENCE); | 
|  |             len += repSeq.length; | 
|  |             t = t[validLength(t)..$]; | 
|  |         } | 
|  |  | 
|  |         // Now do the write | 
|  |         ubyte[] array = new ubyte[len]; | 
|  |         array[0 .. n] = s[0 .. n]; | 
|  |         auto offset = n; | 
|  |  | 
|  |         t = s[n..$]; | 
|  |         while (t.length != 0) | 
|  |         { | 
|  |             // Each round drops one invalid sequence, emits the replacement, | 
|  |             // then copies the valid run that follows. | 
|  |             immutable c = safeDecode(t); | 
|  |             assert(c == INVALID_SEQUENCE); | 
|  |             array[offset .. offset+repSeq.length] = repSeq[]; | 
|  |             offset += repSeq.length; | 
|  |             n = validLength(t); | 
|  |             array[offset .. offset+n] = t[0 .. n]; | 
|  |             offset += n; | 
|  |             t = t[n..$]; | 
|  |         } | 
|  |         return cast(immutable(ubyte)[])array[0 .. offset]; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Returns the length of the first encoded sequence. | 
|  |      * | 
|  |      * The input to this function MUST be validly encoded. | 
|  |      * This is enforced by the function's in-contract. | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the array to be sliced | 
|  |      */ | 
|  |     size_t firstSequence()(const(ubyte)[] s) | 
|  |     in | 
|  |     { | 
|  |         assert(s.length != 0); | 
|  |         const(ubyte)[] u = s; | 
|  |         assert(safeDecode(u) != INVALID_SEQUENCE); | 
|  |     } | 
|  |     do | 
|  |     { | 
|  |         const(ubyte)[] t = s; | 
|  |         decode(s); | 
|  |         return t.length - s.length; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Returns the total number of code points encoded in a ubyte array. | 
|  |      * | 
|  |      * The input to this function MUST be validly encoded. | 
|  |      * This is enforced by the function's in-contract. | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the string to be counted | 
|  |      */ | 
|  |     size_t count()(const(ubyte)[] s) | 
|  |     in | 
|  |     { | 
|  |         assert(isValid(s)); | 
|  |     } | 
|  |     do | 
|  |     { | 
|  |         size_t n = 0; | 
|  |         while (s.length != 0) | 
|  |         { | 
|  |             decode(s); | 
|  |             ++n; | 
|  |         } | 
|  |         return n; | 
|  |     } | 
|  |  | 
|  |     /** | 
|  |      * Returns the array index at which the (n+1)th code point begins. | 
|  |      * | 
|  |      * The input to this function MUST be validly encoded. | 
|  |      * This is enforced by the function's in-contract. | 
|  |      * | 
|  |      * Params: | 
|  |      *    s = the string to be counted | 
|  |      *    n = the current code point index | 
|  |      */ | 
|  |     ptrdiff_t index()(const(ubyte)[] s, size_t n) | 
|  |     in | 
|  |     { | 
|  |         // n is unsigned, so only the encoding needs checking. | 
|  |         assert(isValid(s)); | 
|  |     } | 
|  |     do | 
|  |     { | 
|  |         const(ubyte)[] t = s; | 
|  |         for (size_t i=0; i<n; ++i) decode(s); | 
|  |         return t.length - s.length; | 
|  |     } | 
|  |  | 
|  |     // Lower-cased encoding name -> factory, filled by register!Klass. | 
|  |     __gshared EncodingScheme function()[string] supported; | 
|  |     // Lower-cased encoding name -> class name, filled by register(string). | 
|  |     __gshared string[string] supportedFactories; | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle ASCII | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "ANSI_X3.4-1968", | 
|  | "ANSI_X3.4-1986", | 
|  | "ASCII", | 
|  | "IBM367", | 
|  | "ISO646-US", | 
|  | "ISO_646.irv:1991", | 
|  | "US-ASCII", | 
|  | "cp367", | 
|  | "csASCII", | 
|  | "iso-ir-6", | 
|  | "us" | 
|  | */ | 
|  | class EncodingSchemeASCII : EncodingScheme | 
|  | { | 
|  |     /* // moved to std.internal.phobosinit | 
|  |     shared static this() | 
|  |     { | 
|  |         EncodingScheme.register("std.encoding.EncodingSchemeASCII"); | 
|  |     }*/ | 
|  |  | 
|  |     const | 
|  |     { | 
|  |         // Every name under which this scheme is registered. | 
|  |         override string[] names() @safe pure nothrow | 
|  |         { | 
|  |             return | 
|  |             [ | 
|  |                 "ANSI_X3.4-1968", | 
|  |                 "ANSI_X3.4-1986", | 
|  |                 "ASCII", | 
|  |                 "IBM367", | 
|  |                 "ISO646-US", | 
|  |                 "ISO_646.irv:1991", | 
|  |                 "US-ASCII", | 
|  |                 "cp367", | 
|  |                 "csASCII", | 
|  |                 "iso-ir-6", | 
|  |                 "us" | 
|  |             ]; | 
|  |         } | 
|  |  | 
|  |         // The standard name of the scheme. | 
|  |         override string toString() @safe pure nothrow @nogc | 
|  |         { | 
|  |             return "ASCII"; | 
|  |         } | 
|  |  | 
|  |         // The remaining members forward to the module-level templates, | 
|  |         // instantiated for AsciiChar. | 
|  |  | 
|  |         override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  |         { | 
|  |             return std.encoding.canEncode!(AsciiChar)(c); | 
|  |         } | 
|  |  | 
|  |         override size_t encodedLength(dchar c)  @safe pure nothrow @nogc | 
|  |         { | 
|  |             return std.encoding.encodedLength!(AsciiChar)(c); | 
|  |         } | 
|  |  | 
|  |         override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  |         { | 
|  |             // View the byte buffer as ASCII code units. | 
|  |             return std.encoding.encode(c, cast(AsciiChar[]) buffer); | 
|  |         } | 
|  |  | 
|  |         override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  |         { | 
|  |             // Decode through a typed view, then shorten `s` to what remains. | 
|  |             auto view = cast(const(AsciiChar)[]) s; | 
|  |             immutable decoded = std.encoding.decode(view); | 
|  |             s = s[s.length - view.length .. $]; | 
|  |             return decoded; | 
|  |         } | 
|  |  | 
|  |         override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  |         { | 
|  |             // Same as decode, but the input is allowed to be invalid. | 
|  |             auto view = cast(const(AsciiChar)[]) s; | 
|  |             immutable decoded = std.encoding.safeDecode(view); | 
|  |             s = s[s.length - view.length .. $]; | 
|  |             return decoded; | 
|  |         } | 
|  |  | 
|  |         // Unrepresentable characters are written as '?'. | 
|  |         override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  |         { | 
|  |             return cast(immutable(ubyte)[])"?"; | 
|  |         } | 
|  |     } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle Latin-1 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "CP819", | 
|  | "IBM819", | 
|  | "ISO-8859-1", | 
|  | "ISO_8859-1", | 
|  | "ISO_8859-1:1987", | 
|  | "csISOLatin1", | 
|  | "iso-ir-100", | 
|  | "l1", | 
|  | "latin1" | 
|  | */ | 
|  | class EncodingSchemeLatin1 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeLatin1") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ | 
|  | "CP819", "IBM819", "ISO-8859-1", "ISO_8859-1", "ISO_8859-1:1987", | 
|  | "csISOLatin1", "iso-ir-100", "l1", "latin1" | 
|  | ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "ISO-8859-1"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for Latin1Char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!Latin1Char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for Latin1Char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!Latin1Char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as Latin1Char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | Latin1Char[] target = cast(Latin1Char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Latin1Char)[] remaining = cast(const(Latin1Char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Latin1Char)[] remaining = cast(const(Latin1Char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of "?" as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"?"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle Latin-2 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "Latin 2", | 
|  | "ISO-8859-2", | 
|  | "ISO_8859-2", | 
|  | "ISO_8859-2:1999", | 
|  | "windows-28592" | 
|  | */ | 
|  | class EncodingSchemeLatin2 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeLatin2") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ | 
|  | "Latin 2", "ISO-8859-2", "ISO_8859-2", "ISO_8859-2:1999", | 
|  | "windows-28592" | 
|  | ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "ISO-8859-2"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for Latin2Char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!Latin2Char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for Latin2Char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!Latin2Char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as Latin2Char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | Latin2Char[] target = cast(Latin2Char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Latin2Char)[] remaining = cast(const(Latin2Char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Latin2Char)[] remaining = cast(const(Latin2Char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of "?" as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"?"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle Windows-1250 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "windows-1250" | 
|  | */ | 
|  | class EncodingSchemeWindows1250 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeWindows1250") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ "windows-1250" ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "windows-1250"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for Windows1250Char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!Windows1250Char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for Windows1250Char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!Windows1250Char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as Windows1250Char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | Windows1250Char[] target = cast(Windows1250Char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1250Char)[] remaining = cast(const(Windows1250Char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1250Char)[] remaining = cast(const(Windows1250Char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of "?" as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"?"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle Windows-1251 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "windows-1251" | 
|  | */ | 
|  | class EncodingSchemeWindows1251 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeWindows1251") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ "windows-1251" ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "windows-1251"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for Windows1251Char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!Windows1251Char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for Windows1251Char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!Windows1251Char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as Windows1251Char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | Windows1251Char[] target = cast(Windows1251Char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1251Char)[] remaining = cast(const(Windows1251Char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1251Char)[] remaining = cast(const(Windows1251Char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of "?" as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"?"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle Windows-1252 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "windows-1252" | 
|  | */ | 
|  | class EncodingSchemeWindows1252 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeWindows1252") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ "windows-1252" ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "windows-1252"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for Windows1252Char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!Windows1252Char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for Windows1252Char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!Windows1252Char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as Windows1252Char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | Windows1252Char[] target = cast(Windows1252Char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1252Char)[] remaining = cast(const(Windows1252Char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(Windows1252Char)[] remaining = cast(const(Windows1252Char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of "?" as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"?"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | @system unittest | 
|  | { | 
|  | // Names passed to EncodingScheme.create; row i of `valid` belongs to schemeNames[i]. | 
|  | static string[] schemeNames = | 
|  | [ | 
|  | "ASCII", | 
|  | "ISO-8859-1", | 
|  | "ISO-8859-2", | 
|  | "windows-1250", | 
|  | "windows-1251", | 
|  | "windows-1252" | 
|  | ]; | 
|  |  | 
|  | EncodingScheme[] schemes; | 
|  |  | 
|  | foreach (name; schemeNames) | 
|  | { | 
|  | schemes ~= EncodingScheme.create(name); | 
|  | } | 
|  |  | 
|  | // Each scheme under test is asserted below to encode a character into exactly one byte. | 
|  | ubyte[1] buffer; | 
|  | static dchar[][] valid = | 
|  | [ | 
|  | // Valid ASCII | 
|  | ['\u0001','\u0020','\u0040','\u0060','\u007F'], | 
|  | // Valid 8859-1 | 
|  | ['\u0001','\u0020','\u0070','\u00DA','\u00FF'], | 
|  | // Valid 8859-2 | 
|  | ['\u0020','\u00D7','\u00DF','\u010F','\u02D9'], | 
|  | // Valid 1250 | 
|  | ['\u0020','\u20AC','\u201E','\u2021','\u2039'], | 
|  | // Valid 1251 | 
|  | ['\u0402','\u00A4','\u0415','\u0439','\u044F'], | 
|  | // Valid 1252 | 
|  | ['\u20AC','\u0160','\u2019','\u2122','\u0178'], | 
|  | ]; | 
|  |  | 
|  | // One byte per scheme: every safeDecode call below takes `invalid` by ref, | 
|  | // and the final assertion checks that all six bytes were consumed. | 
|  | static const(ubyte)[] invalid = [0xA0,0xFF,0xFF,0x81,0x98,0x81]; | 
|  |  | 
|  | foreach (i, scheme; schemes) | 
|  | { | 
|  | // The message previously ran the scheme name straight onto the text; keep a separating space. | 
|  | assert(scheme.toString() == schemeNames[i], "Error in the name of encoding scheme " ~ schemeNames[i]); | 
|  | assert(!scheme.canEncode('\uFFFD')); | 
|  | assert(scheme.encodedLength('A') == 1); | 
|  | const(ubyte)[] encodeStr; | 
|  | dchar[] decStr; | 
|  | foreach (chr; valid[i]) | 
|  | { | 
|  | // Round-trip every sample character through encode and decode. | 
|  | assert(scheme.encode(chr, buffer) == 1); | 
|  | encodeStr ~= buffer; | 
|  | const(ubyte)[] buf = buffer; | 
|  | decStr ~= scheme.decode(buf); | 
|  | } | 
|  |  | 
|  | assert(scheme.isValid(encodeStr), "Not correctly encoded UTF => " ~ schemeNames[i]); | 
|  | assert(valid[i] == decStr, "Error encode/decode UTF8 <=> " ~ schemeNames[i]); | 
|  |  | 
|  | // The two ISO-8859 schemes are expected to decode their byte; the others must reject theirs. | 
|  | if (schemeNames[i] == "ISO-8859-1" || schemeNames[i] == "ISO-8859-2") | 
|  | { | 
|  | assert(scheme.safeDecode(invalid) != INVALID_SEQUENCE); | 
|  | } | 
|  | else | 
|  | { | 
|  | assert(scheme.safeDecode(invalid) == INVALID_SEQUENCE); | 
|  | } | 
|  | assert(scheme.replacementSequence() == cast(immutable(ubyte)[])"?"); | 
|  | } | 
|  | assert(invalid.length == 0); | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle UTF-8 | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "UTF-8" | 
|  | */ | 
|  | class EncodingSchemeUtf8 : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeUtf8") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ "UTF-8" ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return "UTF-8"; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for char. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!char(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for char. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!char(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as char[], and returns the result of std.encoding.encode. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | char[] target = cast(char[]) buffer; | 
|  | return std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(char)[] remaining = cast(const(char)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // Keep only the tail that decode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | { | 
|  | const(char)[] remaining = cast(const(char)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // Keep only the tail that safeDecode() left unconsumed. | 
|  | s = s[s.length - remaining.length .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the UTF-8 bytes of U+FFFD as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"\uFFFD"; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle UTF-16 in native byte order | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "UTF-16LE" (little-endian architecture only) | 
|  | "UTF-16BE" (big-endian architecture only) | 
|  | */ | 
|  | class EncodingSchemeUtf16Native : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // The single name of this scheme depends on the byte order of the target. | 
|  | version (LittleEndian) | 
|  | { | 
|  | enum string NAME = "UTF-16LE"; | 
|  | } | 
|  | version (BigEndian) | 
|  | { | 
|  | enum string NAME = "UTF-16BE"; | 
|  | } | 
|  |  | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ NAME ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return NAME; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for wchar. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!wchar(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for wchar. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!wchar(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as wchar[]; the code-unit count is scaled to bytes. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | wchar[] target = cast(wchar[]) buffer; | 
|  | return wchar.sizeof * std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | in | 
|  | { | 
|  | // The input must consist of whole 16-bit code units. | 
|  | assert(s.length % 2 == 0); | 
|  | } | 
|  | do | 
|  | { | 
|  | const(wchar)[] remaining = cast(const(wchar)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // remaining counts code units, s counts bytes. | 
|  | s = s[s.length - remaining.length * wchar.sizeof .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | in | 
|  | { | 
|  | // The input must consist of whole 16-bit code units. | 
|  | assert(s.length % 2 == 0); | 
|  | } | 
|  | do | 
|  | { | 
|  | const(wchar)[] remaining = cast(const(wchar)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // remaining counts code units, s counts bytes. | 
|  | s = s[s.length - remaining.length * wchar.sizeof .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of the wchar string U+FFFD as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"\uFFFD"w; | 
|  | } | 
|  | } | 
|  | } | 
|  | @system unittest | 
|  | { | 
|  | // Three 16-bit code units (0x019A, 0x019B, 0x019C) laid out in native byte order. | 
|  | version (LittleEndian) | 
|  | { | 
|  | auto scheme = EncodingScheme.create("utf-16le"); | 
|  | ubyte[6] raw = [0x9A,0x01, 0x9B,0x01, 0x9C,0x01]; | 
|  | } | 
|  | version (BigEndian) | 
|  | { | 
|  | auto scheme = EncodingScheme.create("utf-16be"); | 
|  | ubyte[6] raw = [0x01,0x9A, 0x01,0x9B, 0x01,0x9C]; | 
|  | } | 
|  | const(ubyte)[] pending = raw[]; | 
|  | const dchar first = scheme.safeDecode(pending); | 
|  | // The first code unit decodes to 0x19A and exactly two bytes are consumed. | 
|  | assert(first == 0x19A); | 
|  | assert(pending.length == 4); | 
|  | } | 
|  |  | 
|  | /** | 
|  | EncodingScheme to handle UTF-32 in native byte order | 
|  |  | 
|  | This scheme recognises the following names: | 
|  | "UTF-32LE" (little-endian architecture only) | 
|  | "UTF-32BE" (big-endian architecture only) | 
|  | */ | 
|  | class EncodingSchemeUtf32Native : EncodingScheme | 
|  | { | 
|  | // The static constructor that called | 
|  | // EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native") | 
|  | // was moved to std.internal.phobosinit. | 
|  |  | 
|  | const | 
|  | { | 
|  | // The single name of this scheme depends on the byte order of the target. | 
|  | version (LittleEndian) | 
|  | { | 
|  | enum string NAME = "UTF-32LE"; | 
|  | } | 
|  | version (BigEndian) | 
|  | { | 
|  | enum string NAME = "UTF-32BE"; | 
|  | } | 
|  |  | 
|  | // All names under which this scheme is known. | 
|  | override string[] names() @safe pure nothrow | 
|  | { | 
|  | return [ NAME ]; | 
|  | } | 
|  |  | 
|  | // The name reported for this scheme. | 
|  | override string toString() @safe pure nothrow @nogc | 
|  | { | 
|  | return NAME; | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time canEncode for dchar. | 
|  | override bool canEncode(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.canEncode!dchar(c); | 
|  | } | 
|  |  | 
|  | // Forwards to the compile-time encodedLength for dchar. | 
|  | override size_t encodedLength(dchar c) @safe pure nothrow @nogc | 
|  | { | 
|  | return std.encoding.encodedLength!dchar(c); | 
|  | } | 
|  |  | 
|  | // Encodes c into buffer, viewed as dchar[]; the code-unit count is scaled to bytes. | 
|  | override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc | 
|  | { | 
|  | dchar[] target = cast(dchar[]) buffer; | 
|  | return dchar.sizeof * std.encoding.encode(c, target); | 
|  | } | 
|  |  | 
|  | // Decodes one character from the front of s and advances s past it. | 
|  | override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | in | 
|  | { | 
|  | // The input must consist of whole 32-bit code units. | 
|  | assert(s.length % 4 == 0); | 
|  | } | 
|  | do | 
|  | { | 
|  | const(dchar)[] remaining = cast(const(dchar)[]) s; | 
|  | const dchar decoded = std.encoding.decode(remaining); | 
|  | // remaining counts code units, s counts bytes. | 
|  | s = s[s.length - remaining.length * dchar.sizeof .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Same shape as decode, but goes through std.encoding.safeDecode. | 
|  | override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc | 
|  | in | 
|  | { | 
|  | // The input must consist of whole 32-bit code units. | 
|  | assert(s.length % 4 == 0); | 
|  | } | 
|  | do | 
|  | { | 
|  | const(dchar)[] remaining = cast(const(dchar)[]) s; | 
|  | const dchar decoded = std.encoding.safeDecode(remaining); | 
|  | // remaining counts code units, s counts bytes. | 
|  | s = s[s.length - remaining.length * dchar.sizeof .. s.length]; | 
|  | return decoded; | 
|  | } | 
|  |  | 
|  | // Returns the bytes of the dchar string U+FFFD as this scheme's replacement sequence. | 
|  | override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc | 
|  | { | 
|  | return cast(immutable(ubyte)[])"\uFFFD"d; | 
|  | } | 
|  | } | 
|  | } | 
|  | @system unittest | 
|  | { | 
|  | // Three 32-bit code units (0x019A, 0x019B, 0x019C) laid out in native byte order. | 
|  | version (LittleEndian) | 
|  | { | 
|  | auto scheme = EncodingScheme.create("utf-32le"); | 
|  | ubyte[12] raw = [0x9A,0x01,0,0, 0x9B,0x01,0,0, 0x9C,0x01,0,0]; | 
|  | } | 
|  | version (BigEndian) | 
|  | { | 
|  | auto scheme = EncodingScheme.create("utf-32be"); | 
|  | ubyte[12] raw = [0,0,0x01,0x9A, 0,0,0x01,0x9B, 0,0,0x01,0x9C]; | 
|  | } | 
|  | const(ubyte)[] pending = raw[]; | 
|  | const dchar first = scheme.safeDecode(pending); | 
|  | // The first code unit decodes to 0x19A and exactly four bytes are consumed. | 
|  | assert(first == 0x19A); | 
|  | assert(pending.length == 8); | 
|  | } | 
|  |  | 
|  | //============================================================================= | 
|  |  | 
|  |  | 
|  | /** Definitions of common Byte Order Marks. | 
|  | The elements of the `enum` can be used as indices into `bomTable` to get | 
|  | the matching `BOMSeq`. | 
|  | */ | 
|  | enum BOM | 
|  | { | 
|  | none      = 0,  /// no BOM was found | 
|  | utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF] | 
|  | utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00] | 
|  | utf7      = 3,  /** [0x2B, 0x2F, 0x76, 0x38], | 
|  | [0x2B, 0x2F, 0x76, 0x39], | 
|  | [0x2B, 0x2F, 0x76, 0x2B], | 
|  | [0x2B, 0x2F, 0x76, 0x2F], | 
|  | [0x2B, 0x2F, 0x76, 0x38, 0x2D] | 
|  | */ | 
|  | utf1      = 8,  /// [0xF7, 0x64, 0x4C] | 
|  | utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73] | 
|  | scsu      = 10, /// [0x0E, 0xFE, 0xFF] | 
|  | bocu1     = 11, /// [0xFB, 0xEE, 0x28] | 
|  | gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33] | 
|  | utf8      = 13, /// [0xEF, 0xBB, 0xBF] | 
|  | utf16be   = 14, /// [0xFE, 0xFF] | 
|  | utf16le   = 15  /// [0xFF, 0xFE] | 
|  | } | 
|  |  | 
|  | /// The type stored inside `bomTable`: a `BOM` named `schema` paired with its byte `sequence`. | 
|  | alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence"); | 
|  |  | 
|  | /** Mapping of a byte sequence to $(B Byte Order Mark (BOM)). | 
|  | The order of the entries matters: `getBOM` walks the table from index 1 | 
|  | and returns the first entry whose sequence is a prefix of the input, so | 
|  | where one sequence is a prefix of another the longer one is listed first | 
|  | (UTF-32LE before UTF-16LE, the five-byte UTF-7 form before the | 
|  | four-byte one ending in `0x38`). | 
|  | */ | 
|  | immutable bomTable = [ | 
|  | BOMSeq(BOM.none, null), | 
|  | BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])), | 
|  | BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])), | 
|  | BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])), | 
|  | BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])), | 
|  | BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])), | 
|  | BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])), | 
|  | BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])), | 
|  | BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])), | 
|  | BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])), | 
|  | BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])), | 
|  | BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])), | 
|  | BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])), | 
|  | BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])), | 
|  | BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])), | 
|  | BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE])) | 
|  | ]; | 
|  |  | 
|  | /** Returns a `BOMSeq` for a given `input`. | 
|  | If no `BOM` is present, the `BOMSeq` for `BOM.none` is returned. | 
|  | The `BOM` sequence at the beginning of the range is not consumed from | 
|  | the passed range. If you pass a reference type range, make sure that | 
|  | `save` creates a deep copy. | 
|  |  | 
|  | Params: | 
|  | input = The sequence to check for the `BOM` | 
|  |  | 
|  | Returns: | 
|  | the found `BOMSeq` corresponding to the passed `input`. | 
|  | */ | 
|  | immutable(BOMSeq) getBOM(Range)(Range input) | 
|  | if (isForwardRange!Range && is(immutable ElementType!Range == immutable ubyte)) | 
|  | { | 
|  | import std.algorithm.searching : startsWith; | 
|  |  | 
|  | // Entry 0 is the BOM.none placeholder, so matching starts at index 1. | 
|  | // Every probe runs on a fresh `save`, leaving `input` itself untouched. | 
|  | foreach (i; 1 .. bomTable.length) | 
|  | { | 
|  | if (startsWith(input.save, bomTable[i].sequence)) | 
|  | { | 
|  | return bomTable[i]; | 
|  | } | 
|  | } | 
|  |  | 
|  | return bomTable[0]; | 
|  | } | 
|  |  | 
|  | /// | 
|  | @system unittest | 
|  | { | 
|  | import std.format : format; | 
|  |  | 
|  | // A UTF-32 string in native byte order that starts with the BOM code point. | 
|  | auto text = dchar(0x0000FEFF) ~ "Hello World"d; | 
|  | auto found = getBOM(cast(ubyte[]) text); | 
|  |  | 
|  | // The detected schema must match the byte order of the machine running the test. | 
|  | version (BigEndian) | 
|  | enum expected = BOM.utf32be; | 
|  | else | 
|  | enum expected = BOM.utf32le; | 
|  |  | 
|  | assert(found.schema == expected, format("%s", found.schema)); | 
|  | } | 
|  |  | 
|  | @system unittest | 
|  | { | 
|  | import std.format : format; | 
|  |  | 
|  | // Prefix a payload with every known sequence and check that getBOM finds it again. | 
|  | foreach (position, expected; bomTable) | 
|  | { | 
|  | auto data = expected.sequence ~ cast(ubyte[])"hello world"; | 
|  | auto detected = getBOM(data); | 
|  | assert(detected.schema == bomTable[position].schema); | 
|  |  | 
|  | // Indices 4 to 7 are the additional UTF-7 sequences; they all share BOM.utf7, | 
|  | // so the index and the enum value only agree outside that span. | 
|  | if (!(position >= 4 && position <= 7)) | 
|  | { | 
|  | assert(detected.schema == BOM.init + position); | 
|  | assert(detected.sequence == expected.sequence); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | @safe pure unittest | 
|  | { | 
|  | // A small non-array range over bytes, used to call getBOM with a user-defined range type. | 
|  | struct BOMInputRange | 
|  | { | 
|  | ubyte[] arr; | 
|  |  | 
|  | @property ubyte front() { return arr.front; } | 
|  |  | 
|  | @property bool empty() { return arr.empty; } | 
|  |  | 
|  | void popFront() { arr = arr[1 .. $]; } | 
|  |  | 
|  | @property typeof(this) save() { return this; } | 
|  | } | 
|  |  | 
|  | static assert( isInputRange!BOMInputRange); | 
|  | static assert(!isArray!BOMInputRange); | 
|  |  | 
|  | ubyte[] padding = [0,0,0,0]; | 
|  |  | 
|  | foreach (entry; bomTable[1 .. $]) | 
|  | { | 
|  | // A range holding exactly the sequence is recognised and is not consumed. | 
|  | { | 
|  | auto probe = BOMInputRange(entry.sequence.dup); | 
|  |  | 
|  | auto detected = getBOM(probe); | 
|  | assert(detected.schema == entry.schema); | 
|  | assert(probe.arr == entry.sequence); | 
|  | } | 
|  |  | 
|  | // Only the first byte of the sequence followed by zeros must not be taken for a BOM. | 
|  | { | 
|  | auto truncated = entry.sequence[0 .. 1].dup ~ padding; | 
|  | size_t lengthBefore = truncated.length; | 
|  | assert(lengthBefore - 4 < entry.sequence.length); | 
|  |  | 
|  | auto probe = BOMInputRange(truncated.dup); | 
|  | auto detected = getBOM(probe); | 
|  | assert(detected.schema == BOM.none); | 
|  | assert(truncated.length == lengthBefore); | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | /** Constant defining a fully decoded BOM: the code point U+FEFF. */ | 
|  | enum dchar utfBOM = 0xfeff; | 