| // Written in the D programming language. |
| |
| /** |
| * Encode and decode Uniform Resource Identifiers (URIs). |
| * URIs are used in internet transfer protocols. |
| * Valid URI characters consist of letters, digits, |
| * and the characters $(B ;/?:@&=+$,-_.!~*'()) |
| * Reserved URI characters are $(B ;/?:@&=+$,) |
| * Escape sequences consist of $(B %) followed by two hex digits. |
| * |
| * See_Also: |
| * $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br> |
| * $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia) |
| * Copyright: Copyright Digital Mars 2000 - 2009. |
| * License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). |
| * Authors: $(HTTP digitalmars.com, Walter Bright) |
| * Source: $(PHOBOSSRC std/_uri.d) |
| */ |
| /* Copyright Digital Mars 2000 - 2009. |
| * Distributed under the Boost Software License, Version 1.0. |
| * (See accompanying file LICENSE_1_0.txt or copy at |
| * http://www.boost.org/LICENSE_1_0.txt) |
| */ |
| module std.uri; |
| |
| //debug=uri; // uncomment to turn on debugging writefln's |
| debug(uri) import std.stdio; |
| import std.traits : isSomeChar; |
| |
/**
 * Exception type used by this module to report a URI that cannot be
 * encoded or decoded.
 *
 * It offers the two conventional exception constructors: one taking the
 * message followed by the source location, and one that additionally chains
 * a previously caught `Throwable`.
 */
class URIException : Exception
{
    /// Creates the exception from a message and the location it was raised at.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Creates the exception from a message and the `Throwable` it wraps.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }
}
| |
// Character classes used by the encoder and decoder. Each class is a single
// bit so that several classes can be OR-ed together into one mask.
private enum
{
    URI_Alpha    = 1 << 0, // ASCII letters
    URI_Reserved = 1 << 1, // ;/?:@&=+$,
    URI_Mark     = 1 << 2, // -_.!~*'()
    URI_Digit    = 1 << 3, // 0 .. 9
    URI_Hash     = 1 << 4, // '#'
}

// Upper-case hexadecimal digit for each nibble value 0 .. 15.
private immutable char[16] hex2ascii = "0123456789ABCDEF";

// Class bits (see the enum above) for every ASCII character, indexed by the
// character's code. The table is filled in by a function literal that is
// invoked right here in the initializer.
private immutable ubyte[128] uri_flags = ()
{
    ubyte[128] table; // ubyte.init is 0: no class assigned yet

    foreach (ch; "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
        table[ch] |= URI_Alpha;
    foreach (ch; "0123456789")
        table[ch] |= URI_Digit;
    foreach (ch; ";/?:@&=+$,")
        table[ch] |= URI_Reserved;
    foreach (ch; "-_.!~*'()")
        table[ch] |= URI_Mark;
    table['#'] |= URI_Hash;

    return table;
}();
| |
/*
 * Percent-encodes a UTF-32 string.
 *
 * Every code point that is an ASCII character whose class bits in
 * `uri_flags` intersect `unescapedSet` is copied to the result unchanged.
 * Every other code point is converted to its UTF-8 octets and each octet is
 * written as '%' followed by two upper-case hexadecimal digits.
 *
 * Params:
 *   str          = the text to encode, as UTF-32 code points
 *   unescapedSet = OR-ed `URI_*` class bits of the characters that must be
 *                  left as they are
 *
 * Returns:
 *   The encoded text.
 *
 * Throws:
 *   URIException if `str` contains a value above U+10FFFF, which has no
 *   UTF-8 representation.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    import std.array : appender;

    // The result is built in a growable, bounds-checked buffer. This avoids
    // manual pointer arithmetic and stack allocation inside the loop.
    auto result = appender!string();
    // Each code point produces at least one output character.
    result.reserve(str.length);

    foreach (dchar c; str)
    {
        // The range test comes first so that the table is only indexed with
        // ASCII values.
        if (c < uri_flags.length && (uri_flags[c] & unescapedSet))
        {
            result.put(cast(char) c);
            continue;
        }

        // Transform the code point into its UTF-8 octets.
        char[4] octets = void; // only octets[0 .. count] is ever read
        size_t count;

        if (c <= 0x7F)
        {
            octets[0] = cast(char) c;
            count = 1;
        }
        else if (c <= 0x7FF)
        {
            octets[0] = cast(char)(0xC0 | (c >> 6));
            octets[1] = cast(char)(0x80 | (c & 0x3F));
            count = 2;
        }
        else if (c <= 0xFFFF)
        {
            octets[0] = cast(char)(0xE0 | (c >> 12));
            octets[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[2] = cast(char)(0x80 | (c & 0x3F));
            count = 3;
        }
        else if (c <= 0x10FFFF)
        {
            octets[0] = cast(char)(0xF0 | (c >> 18));
            octets[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
            octets[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
            octets[3] = cast(char)(0x80 | (c & 0x3F));
            count = 4;
        }
        else
        {
            // Beyond the last Unicode code point: nothing valid to emit.
            throw new URIException("Undefined UTF-32 code point");
        }

        foreach (octet; octets[0 .. count])
        {
            result.put('%');
            result.put(hex2ascii[octet >> 4]);  // high nibble
            result.put(hex2ascii[octet & 15]);  // low nibble
        }
    }

    return result.data;
}
| |
/*
 * Converts one hexadecimal digit character to its numeric value.
 *
 * The input is not validated: anything up to '9' is treated as a decimal
 * digit, anything up to 'F' as an upper-case letter digit and everything
 * else as a lower-case letter digit. Callers check the character with
 * isHexDigit before calling.
 */
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';

    // Letter digits stand for the values 10 .. 15; pick the matching case.
    immutable dchar first = (c <= 'F') ? 'A' : 'a';
    return c - first + 10;
}
| |
/*
 * Decodes the percent-escapes of a URI into UTF-32 code points.
 *
 * A run of escapes ("%XX", or "%XX%XX..." forming one UTF-8 sequence) is
 * turned into the code point it spells. If that code point is an ASCII
 * character whose class bits in `uri_flags` intersect `reservedSet`, the
 * escape is instead copied to the result exactly as it was written.
 * Everything that is not part of an escape is copied through.
 *
 * Params:
 *   uri         = the text to decode
 *   reservedSet = OR-ed `URI_*` class bits of the characters whose escapes
 *                 must be kept
 *
 * Returns:
 *   The decoded text as UTF-32.
 *
 * Throws:
 *   URIException if an escape is truncated, is not followed by two
 *   hexadecimal digits, or does not form a well-shaped UTF-8 sequence.
 */
private dstring URI_Decode(Char)(in Char[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // The result never has more elements than the input: an escape run either
    // collapses to a single code point or is copied element for element.
    // A bounds-checked array is used so that any violation of that guarantee
    // is caught instead of silently writing out of range.
    auto R = new dchar[len];
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        // NOTE(review): code units outside of escapes are widened one by one,
        // without UTF decoding - presumably callers only pass ASCII there.
        dchar C = uri[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }

        // Start of the escape run, needed if it has to be copied verbatim.
        immutable start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2; // k now sits on the second hex digit

        if ((B & 0x80) == 0)
        {
            // Single octet: plain ASCII.
            C = B;
        }
        else
        {
            // Count the leading 1 bits of the first octet; that is the number
            // of octets (n) in the UTF-8 sequence.
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    // 10xxxxxx is a continuation octet, not a lead octet.
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 octets need three characters each.
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            for (uint j = 1; j != n; j++)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");
            C = cast(dchar) V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // Protected character: keep the escape as written (uri[start .. k + 1]).
            immutable width = (k + 1) - start;
            foreach (ii; 0 .. width)
                R[Rlen + ii] = uri[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= len); // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}
| |
/*************************************
 * Decodes a complete URI.
 *
 * Escape sequences are replaced by the characters they stand for, except
 * those that resolve to a reserved URI character or to '#': these are left
 * in their escaped form.
 *
 * Params:
 *   encodedURI = the URI text to decode
 *
 * Returns:
 *   The decoded URI as a UTF-8 string.
 *
 * Throws:
 *   $(LREF URIException) if `encodedURI` contains a malformed escape
 *   sequence.
 */

string decode(Char)(in Char[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // `encode` below is std.utf.encode: it appends one code point as UTF-8.
    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURI, URI_Reserved | URI_Hash))
        encode(utf8, codePoint);
    return utf8;
}
| |
/*******************************
 * Decodes a single URI component.
 *
 * Unlike $(LREF decode), no character is protected: every escape sequence is
 * replaced by the character it stands for.
 *
 * Params:
 *   encodedURIComponent = the component text to decode
 *
 * Returns:
 *   The decoded component as a UTF-8 string.
 *
 * Throws:
 *   $(LREF URIException) if `encodedURIComponent` contains a malformed
 *   escape sequence.
 */

string decodeComponent(Char)(in Char[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    // An empty reserved set means that nothing stays escaped.
    enum uint keepNothing = 0;

    // `encode` below is std.utf.encode: it appends one code point as UTF-8.
    char[] utf8;
    foreach (dchar codePoint; URI_Decode(encodedURIComponent, keepNothing))
        encode(utf8, codePoint);
    return utf8;
}
| |
/*****************************
 * Encodes a complete URI.
 *
 * Letters, digits, the marks -_.!~*'(), the reserved characters ;/?:@&=+$,
 * and '#' are kept as they are; every other character is replaced by the
 * escape sequences of its UTF-8 octets.
 *
 * Params:
 *   uri = the text to encode
 *
 * Returns:
 *   The encoded URI.
 */

string encode(Char)(in Char[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Classes of characters that are passed through unescaped.
    enum uint passThrough =
        URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uri), passThrough);
}
| |
/********************************
 * Encodes a single URI component.
 *
 * Only letters, digits and the marks -_.!~*'() are kept as they are. Every
 * other character - including the reserved characters and '#' - is replaced
 * by the escape sequences of its UTF-8 octets.
 *
 * Params:
 *   uriComponent = the text to encode
 *
 * Returns:
 *   The encoded component.
 */

string encodeComponent(Char)(in Char[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Classes of characters that are passed through unescaped.
    enum uint passThrough = URI_Alpha | URI_Digit | URI_Mark;

    return URI_Encode(toUTF32(uriComponent), passThrough);
}
| |
/* Joins the entries of an associative array into "key=value" pairs
 * separated by '&'. Keys and values are escaped with encodeComponent.
 *
 * Params:
 *  values = the key/value pairs to encode
 *
 * Returns:
 *  The encoded pairs, in the iteration order of the associative array, or
 *  an empty string if `values` is empty.
 */
package string urlEncode(in string[string] values)
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    enc.reserve(values.length * 128); // rough guess of the space per pair

    size_t written; // number of pairs emitted so far
    foreach (key, value; values)
    {
        // Separator goes between pairs, so not before the first one.
        if (written != 0)
            enc.put('&');
        enc.put(encodeComponent(key));
        enc.put('=');
        enc.put(encodeComponent(value));
        ++written;
    }
    return enc.data;
}
| |
@system unittest
{
    // @system: urlEncode goes through encodeComponent to URI_Encode, which is
    // not marked @safe.

    // No entries: empty result.
    string[string] nothing;
    assert(urlEncode(nothing) == "");

    // One entry: no separator.
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");

    // Two entries: either order is accepted.
    immutable joined = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(joined == "name1=value1&name2=value2" || joined == "name2=value2&name1=value1");
}
| |
/***************************
 * Checks whether `s` starts with a URL.
 *
 * The text must begin with `http://` or `https://` (compared without regard
 * to case). The URL then extends over all following letters, digits and the
 * characters `-_?=%&/+#~$.`, and must contain at least one `.` after the
 * prefix.
 *
 * Params:
 *   s = the text to inspect
 *
 * Returns:
 *   -1 if `s` does not start with a URL, otherwise the length `len` such
 *   that `s[0 .. len]` is the URL.
 */

ptrdiff_t uriLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    // Punctuation, other than '.', that may appear inside the URL.
    static bool isUrlSymbol(dchar ch) @nogc @safe pure nothrow
    {
        switch (ch)
        {
            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                return true;
            default:
                return false;
        }
    }

    if (s.length <= 4)
        return -1;

    // Skip the scheme prefix; there must be at least one character after it.
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Index of the most recent '.'; 0 means that none was seen (index 0 is
    // always inside the prefix).
    ptrdiff_t lastDot = 0;
    while (pos < s.length)
    {
        immutable ch = s[pos];
        if (ch == '.')
            lastDot = pos;
        else if (!isAlphaNum(ch) && !isUrlSymbol(ch))
            break; // first character that cannot belong to the URL
        ++pos;
    }

    return lastDot ? pos : -1;
}
| |
///
@safe unittest
{
    // The URL ends at the first character that cannot belong to it (the space).
    assert(uriLength("http://www.digitalmars.com/~fred/fredsRX.html#foo end!") == 49);

    // Text without a recognised prefix is rejected.
    assert(uriLength("no uri here") == -1);
    assert(uriLength("issue 14924") < 0);
}
| |
| |
/***************************
 * Checks whether `s` starts with an email address.
 *
 * The address must begin with a letter, continue with letters, digits,
 * `-`, `_` or `.` up to an `@`, and be followed by a domain made of letters,
 * digits, `-`, `_` and `.`. The domain must contain a `.`, and its last `.`
 * must be followed by exactly two or three characters.
 *
 * Params:
 *   s = the text to inspect; may be empty
 *
 * Returns:
 *   -1 if `s` does not start with an email address, otherwise the length
 *   `len` such that `s[0 .. len]` is the email address.
 *
 * References:
 *   RFC2822
 */
ptrdiff_t emailLength(Char)(in Char[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    // An empty text holds no address. The length test must come first so
    // that s[0] is never read from an empty array.
    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    // Scan the part in front of the '@'.
    ptrdiff_t i;
    for (i = 1; ; i++)
    {
        if (i == s.length)
            return -1; // ran out of text without finding '@'
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return -1;
        i++; // step over the '@'
        break;
    }

    // Scan the part past the '@'. lastdot stays 0 while no '.' has been
    // seen; index 0 always lies in front of the '@'.
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // i - lastdot counts the '.' itself, so 3 and 4 correspond to two and
    // three characters after the last '.'.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return -1;

    return i;
}
| |
///
@safe unittest
{
    // The address ends at the first character that cannot belong to it.
    assert(emailLength("my.e-mail@www.example-domain.com with garbage added") == 32);

    // Text that does not start with an address is rejected.
    assert(emailLength("no email address here") == -1);
    assert(emailLength("issue 14924") < 0);
}
| |
| |
@system unittest
{
    // @system: encode goes through URI_Encode, which is not marked @safe.
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    // Encoding and decoding a complete URI are inverse operations.
    auto result = encode(source);
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);
    result = decode(target);
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    // Escapes that spell multi-byte UTF-8 sequences survive a round trip.
    result = encode(decode("%E3%81%82%E3%81%82"));
    assert(result == "%E3%81%82%E3%81%82");

    // '+' is reserved and therefore escaped inside a component.
    result = encodeComponent("c++");
    assert(result == "c%2B%2B");

    // A large input made only of unescaped characters comes back unchanged.
    auto big = new char[10_000_000];
    big[] = 'A';
    result = encodeComponent(big);
    foreach (char c; result)
        assert(c == 'A');

    result = decode("%41%42%43");
    debug(uri) writeln(result);

    // The same round trips must work for every string type.
    import std.meta : AliasSeq;
    foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {
        import std.conv : to;

        StringType plain = source.to!StringType;
        string escapedPlain = encode(plain);
        assert(plain == source.to!StringType); // check that `plain` wasn't changed
        assert(escapedPlain == target);
        assert(plain == decode(escapedPlain).to!StringType);

        StringType escaped = target.to!StringType;
        string unescaped = decode(escaped);
        assert(escaped == target.to!StringType); // check that `escaped` wasn't changed
        assert(unescaped == source);
        assert(escaped == encode(unescaped).to!StringType);
    }
}