// blob: cdb3439c941c75b1941a08a6e98be6b17fc61d5d [file] [log] [blame]
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // Build a set from two half-open intervals given as flat bounds:
    // Latin lowercase and Cyrillic lowercase.
    auto letters = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach (cp; 'a'..'z'+1)
    {
        assert(letters[cp]);
    }
    // Cyrillic lowercase interval
    foreach (cp; 'а'..'я'+1)
    {
        assert(letters[cp]);
    }

    // No specific order is required, and intervals may intersect ...
    auto shuffled = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    // ... the resulting intervals are the same.
    assert(equal(shuffled.byInterval, letters.byInterval));

    // Exercise the range constructor: this(Range)(Range intervals)
    auto whitePieces = CodepointInterval(9812, 9818);
    auto blackPieces = CodepointInterval(9818, 9824);
    auto chess = CodepointSet([whitePieces, blackPieces]);
    foreach (cp; '♔'..'♟'+1)
    {
        assert(chess[cp]);
    }
}
pure @safe unittest
{
    import std.uni;

    auto gothicScript = unicode.Gothic;
    // U+10330, the Gothic letter ahsa, is a member
    immutable dchar ahsa = '\U00010330';
    assert(gothicScript[ahsa]);
    // ASCII characters are not part of Gothic
    assert(gothicScript['$'] == false);
}
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto lowerSet = unicode.LowerCase;
    auto upperSet = unicode.UpperCase;
    auto asciiSet = unicode.ASCII;

    // '&' intersects: nothing is both lowercase and uppercase
    auto both = lowerSet & upperSet;
    assert(both.empty);
    auto asciiLower = lowerSet & asciiSet;
    assert(equal(asciiLower.byCodepoint, iota('a', 'z'+1)));

    // '-' subtracts: drop all of the lowercase letters from ASCII
    auto asciiSansLower = asciiSet - lowerSet;
    assert(asciiSansLower.length == 128 - 26);

    // '~' keeps code points that belong to exactly one operand
    auto exactlyOne = lowerSet ~ asciiSet;
    assert(!exactlyOne['Δ']); // neither ASCII nor lowercase
    assert(exactlyOne['$']);  // ASCII but not lowercase
    assert(!exactlyOne['a']); // both ASCII and lowercase
    assert(exactlyOne['я']);  // lowercase but not ASCII

    // '|' unites: remove every cased letter from ASCII
    auto uncased = asciiSet - (lowerSet | upperSet);
    assert(uncased.length == 128 - 26*2);
}
pure @safe unittest
{
    import std.uni;

    // 'in' is an alternative spelling of the membership test
    auto cyrillic = unicode.Cyrillic;
    assert('я' in cyrillic);
    assert(('z' in cyrillic) == false);
}
pure @safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    // byCodepoint yields the members of the set one code point at a time;
    // for ASCII that is 0 up to (but not including) 0x80.
    // The comparison must be wrapped in assert: a bare call to equal()
    // discards its result and the test would verify nothing.
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}
pure @safe unittest
{
    import std.uni;
    import std.conv : to;
    import std.format : format;

    // This example originally used the Cyrillic script.
    // Unfortunately that is a pretty active range for changes,
    // and hence it broke in an update.
    // The Basic Latin block is used instead, as it is
    // unlikely to ever change.
    auto basicLatin = unicode.InBasic_latin;
    string plain = basicLatin.to!string;
    assert(plain == "[0..128)");

    // The '%s' and '%d' specs are equivalent to the to!string call above.
    assert(format("%d", basicLatin) == plain);

    // '%#x' and '%#X' print the bounds in hexadecimal
    assert(format("%#x", basicLatin) == "[0..0x80)");
    assert(format("%#X", basicLatin) == "[0..0X80)");
}
pure @safe unittest
{
    import std.uni;

    CodepointSet digitsAndCaps;
    // add() takes a half-open interval [a, b) and can be chained
    digitsAndCaps
        .add('0', '5')
        .add('A', 'Z'+1);
    // extend the digits up to and including '9'
    digitsAndCaps.add('5', '9'+1);

    assert(digitsAndCaps['0']);
    assert(digitsAndCaps['5']);
    assert(digitsAndCaps['9']);
    assert(digitsAndCaps['Z']);
}
pure @safe unittest
{
    import std.uni;

    auto ascii = unicode.ASCII;
    auto nonAscii = ascii.inverted;
    // a set united with its inverse covers all 0x110000 code points
    assert((ascii | nonAscii).length == 0x110000);
    // a set and its inverse have nothing in common
    assert((ascii & nonAscii).empty);
}
pure @safe unittest
{
    import std.uni;

    // a default-initialized set contains no code points
    CodepointSet nothing;
    assert(nothing.empty);
    assert(nothing.length == 0);
}
pure @safe unittest
{
    import std.uni;

    string equation = "2² = 4";
    auto isNumber = utfMatcher!char(unicode.Number);

    // match: advances past the code point on success ...
    assert(isNumber.match(equation));          // '2' is a number all right
    assert(equation == "² = 4");
    assert(isNumber.match(equation));          // so is the superscript '2'
    // ... and leaves the input alone on failure
    assert(isNumber.match(equation) == false); // space is not a number
    assert(equation == " = 4");

    // skip: same verdict, but a code point is skipped regardless
    assert(isNumber.skip(equation) == false);
    assert(equation == "= 4");

    // test: never changes its argument
    assert(isNumber.test(equation) == false);  // '=' is not a number
    assert(equation == "= 4");
}
@safe unittest
{
    import std.uni;
    import std.exception : collectException;

    auto asciiSet = unicode.ASCII;
    assert(asciiSet['A']);
    assert(asciiSet['~']);
    assert(asciiSet['\u00e0'] == false);
    // names are matched case-insensitively
    assert(asciiSet == unicode.ascII);
    assert(asciiSet['à'] == false);

    // underscores, '-' and whitespace in names are ignored as well
    auto latin1 = unicode.in_latin1_Supplement;
    assert(latin1['à']);
    assert(latin1['$'] == false);

    // Latin 1 Supplement is a block, hence the "In" prefix
    auto byString = unicode("In Latin 1 Supplement");
    assert(latin1 == byString);

    // a run-time lookup throws when no such set exists
    auto failure = collectException(unicode("InCyrilliac"));
    assert(failure !is null);
}
@safe unittest
{
    import std.uni;

    // going through .block is the explicit spelling of the "In" prefix
    auto viaBlock = unicode.block.Greek_and_Coptic;
    auto viaPrefix = unicode.InGreek_and_Coptic;
    assert(viaBlock == viaPrefix);
}
@safe unittest
{
    import std.uni;

    auto scriptSet = unicode.script.arabic;
    auto blockSet = unicode.block.arabic;

    // the script and the block overlap ...
    assert(blockSet['؁']);
    assert(scriptSet['؁']);
    // ... yet they are not the same set
    assert(blockSet != scriptSet);

    // the block is reachable through the "in" prefix,
    // the script through the bare name
    assert(blockSet == unicode.inArabic);
    assert(scriptSet == unicode.arabic);
}
@safe unittest
{
    import std.uni;

    // Here "L" names a Hangul syllable type (leading jamo),
    // not Letter as in the unicode.L short-cut
    auto leadingJamo = unicode.hangulSyllableType("L");
    // check that a run of leading jamo is present
    foreach (jamo; '\u1110'..'\u115F')
    {
        assert(leadingJamo[jamo]);
    }
    // the member-access spelling names the same set
    assert(leadingJamo == unicode.hangulSyllableType.L);
}
@safe unittest
{
    import std.uni;

    string pattern = "[a-zA-Z0-9]hello";
    auto alnum = unicode.parseSet(pattern);
    // spot-check a few members
    assert(alnum['a']);
    assert(alnum['A']);
    assert(alnum['9']);
    // the set expression has been consumed from the front of the pattern
    assert(pattern == "hello");
}
@safe unittest
{
    import std.uni;

    // The index must address a code unit inside the string. The string
    // here holds TWO spaces, so index 1 is the second space and its
    // grapheme is one code unit long. (With a one-character string,
    // index 1 is past the end and there is nothing to measure.)
    assert(graphemeStride("  ", 1) == 1);

    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); // \u030A takes 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
@safe pure unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;

    // Each string holds two Union Jack flags of Great Britain
    string narrow = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    wstring wide = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    dstring full = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

    // For strings the returned length counts code units, not code points.
    assert(narrow.popGrapheme() == 8);
    assert(wide.popGrapheme() == 4);
    assert(full.popGrapheme() == 2);

    // exactly one flag is left in each string
    assert(narrow == "\U0001F1EC\U0001F1E7");
    assert(wide == "\U0001F1EC\U0001F1E7");
    assert(full == "\U0001F1EC\U0001F1E7");

    // Non-random-access ranges work too, as long as the
    // character type is 32-bit.
    auto nonAlpha = "\r\nhello!"d.filter!(c => !isAlpha(c));
    // A Windows-style line ending is two code points in one grapheme.
    assert(nonAlpha.popGrapheme() == 2);
    assert(equal(nonAlpha, "!"d));
}
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;

    // "noël" spelled with e + combining diaeresis
    auto word = "noe\u0308l";
    assert(walkLength(word) == 5);     // 5 code points

    auto clusters = byGrapheme(word);
    assert(walkLength(clusters) == 4); // 4 graphemes

    // the first three graphemes carry the accent, the last one is plain 'l'
    assert(equal(take(clusters, 3), byGrapheme("noe\u0308")));
    assert(equal(drop(clusters, 3), byGrapheme("l")));
}
@safe unittest
{
    import std.uni;
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string word = "noe\u0308l"; // noël

    // split into graphemes, walk them backwards, then flatten the
    // graphemes to code points and build a string from those
    auto clusters = word.byGrapheme.array;
    string reversed = clusters.retro.byCodePoint.text;

    assert(reversed == "le\u0308on"); // lëon
}
@safe unittest
{
    import std.uni;

    auto cluster = Grapheme("A\u0302");
    assert(cluster[0] == 'A');
    assert(cluster.valid);

    // an ASCII tilde is not a combining mark, so writing it
    // into the second slot breaks the cluster
    cluster[1] = '~';
    assert(cluster[1] == '~');
    assert(cluster.valid == false);
}
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    auto cluster = Grapheme("A");
    assert(cluster.valid);

    // appending U+0301 keeps the cluster valid
    cluster ~= '\u0301';
    assert(equal(cluster[], "A\u0301"));
    assert(cluster.valid);

    // appending "B" does not: this is no longer a valid grapheme cluster
    cluster ~= "B";
    assert(cluster.valid == false);
    // the contents can still be useful, though
    assert(equal(cluster[], "A\u0301B"));
}
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string input = "ku\u0308hn";

    // note: decodeGrapheme takes its argument by ref
    auto head = decodeGrapheme(input);
    assert(head.length == 1);
    assert(head[0] == 'k');

    // the following grapheme is 2 characters long
    auto accented = decodeGrapheme(input);
    // slicing a grapheme gives a random-access range of dchar
    assert(equal(accented[], "u\u0308"));
    assert(accented.length == 2);
    static assert(isRandomAccessRange!(typeof(accented[])));

    // the usual range manipulation works on the slice
    assert(equal(accented[].filter!isMark(), "\u0308"));

    auto cluster = Grapheme("A");
    assert(cluster.valid);
    cluster ~= '\u0301';
    assert(equal(cluster[], "A\u0301"));
    assert(cluster.valid);
    cluster ~= "B";
    // no longer a valid grapheme cluster
    assert(cluster.valid == false);
    // but the contents can still be useful
    assert(equal(cluster[], "A\u0301B"));
}
@safe @nogc pure nothrow unittest
{
    import std.uni;

    assert("Август".sicmp("авгусТ") == 0);
    // Greek works as well, as long as no 1:M mapping is involved
    assert("ΌΎ".sicmp("όύ") == 0);

    // Greek small letter iota with dialytika and tonos,
    // once as a single code point and once as a sequence
    string single = "ΐ";
    string sequence = "\u03B9\u0308\u0301";
    // sicmp does not treat such a pair as equal ...
    assert(sicmp(single, sequence) != 0);
    // ... whereas icmp has no problem with it
    assert(icmp(single, sequence) == 0);
    assert("ΌΎ".icmp("όύ") == 0);
}
@safe @nogc pure nothrow unittest
{
    import std.uni;

    // these pairs differ in length yet compare equal
    assert("Rußland".icmp("Russland") == 0);
    assert("ᾩ -> \u1F70\u03B9".icmp("\u1F61\u03B9 -> ᾲ") == 0);
}
@safe @nogc nothrow pure unittest
{
    import std.uni;
    import std.utf : byDchar;

    // the same comparisons, fed with byDchar ranges instead of strings
    assert(icmp(byDchar("Rußland"), byDchar("Russland")) == 0);
    assert(icmp(byDchar("ᾩ -> \u1F70\u03B9"), byDchar("\u1F61\u03B9 -> ᾲ")) == 0);
}
@safe unittest
{
    import std.uni;

    // combining tilde
    assert(combiningClass('\u0303') == 230);
    // combining ring below
    assert(combiningClass('\u0325') == 220);
    // a simple consequence: within a sequence the tilde
    // should be placed after a ring below
}
@safe unittest
{
    import std.uni;

    // dchar.init is the result when the pair does not compose
    dchar noComposition = dchar.init;

    assert(compose('A', '\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    assert(compose('A', 'B') == noComposition);
    // the starter has to come first,
    // so the swapped pair does not compose
    assert(compose('\u0308', 'A') == noComposition);
}
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // dchar.init is the result when the pair does not compose
    dchar noComposition = dchar.init;

    assert(compose('A', '\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    assert(compose('A', 'B') == noComposition);
    // the starter has to come first,
    // so the swapped pair does not compose
    assert(compose('\u0308', 'A') == noComposition);

    // decompose returns a grapheme; slice it to compare the code points
    assert(equal(decompose('Ĉ')[], "C\u0302"));
    assert(equal(decompose('D')[], "D"));
    assert(equal(decompose('\uD4DC')[], "\u1111\u1171\u11B7"));
    assert(equal(decompose!Compatibility('¹')[], "1"));
}
@safe unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // one Hangul syllable breaks up into three jamo
    auto parts = decomposeHangul('\uD4DB');
    assert(equal(parts[], "\u1111\u1171\u11B6"));
}
@safe unittest
{
    import std.uni;

    // dchar.init is the result when the jamo do not compose
    dchar noSyllable = dchar.init;

    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // omitting the trailing (T) jamo, or passing a code point that
    // is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an unsuitable first or second argument yields no syllable
    assert(composeJamo('\u1111', 'A') == noSyllable);
    assert(composeJamo('A', '\u1171') == noSyllable);
}
@safe pure unittest
{
    import std.uni;

    // any encoding works
    wstring greeting = "Hello world";
    // the very same slice comes back for this input
    assert(normalize(greeting) is greeting);

    // A character whose 4 normalization forms all differ:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    string upsilonHook = "ϓ";
    assert(normalize!NFC(upsilonHook) == "\u03D3");
    assert(normalize!NFD(upsilonHook) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilonHook) == "\u038E");
    assert(normalize!NFKD(upsilonHook) == "\u03A5\u0301");
}
@safe unittest
{
    import std.uni;

    // e.g. Cyrillic is allowed in every normalization form ...
    dchar cyrillicYa = 'я';
    assert(allowedIn!NFC(cyrillicYa));
    assert(allowedIn!NFD(cyrillicYa));
    assert(allowedIn!NFKC(cyrillicYa));
    assert(allowedIn!NFKD(cyrillicYa));
    // ... and so is ASCII
    assert(allowedIn!NFC('Z'));
}
@safe pure unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // asUpperCase gives a range, so compare it element by element
    auto shouted = asUpperCase("hEllo");
    assert(equal(shouted, "HELLO"));
}
@safe pure unittest
{
    import std.uni;
    import std.algorithm.comparison : equal;

    // asCapitalized gives a range, so compare it element by element
    auto capitalized = asCapitalized("hEllo");
    assert(equal(capitalized, "Hello"));
}
@safe unittest
{
    import std.uni;
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // upper-case each character and collect the result in an appender
    auto sink = appender!(char[])();
    auto shouted = "hello".map!toUpper;
    copy(shouted, sink);
    assert(sink.data == "HELLO");
}