| // Written in the D programming language. |
| |
| /++ |
| $(P The $(D std.uni) module provides an implementation |
| of fundamental Unicode algorithms and data structures. |
| This doesn't include UTF encoding and decoding primitives, |
| see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) |
| for this functionality. ) |
| |
| $(SCRIPT inhibitQuickIndex = 1;) |
| $(BOOKTABLE, |
| $(TR $(TH Category) $(TH Functions)) |
| $(TR $(TD Decode) $(TD |
| $(LREF byCodePoint) |
| $(LREF byGrapheme) |
| $(LREF decodeGrapheme) |
| $(LREF graphemeStride) |
| )) |
| $(TR $(TD Comparison) $(TD |
| $(LREF icmp) |
| $(LREF sicmp) |
| )) |
| $(TR $(TD Classification) $(TD |
| $(LREF isAlpha) |
| $(LREF isAlphaNum) |
| $(LREF isCodepointSet) |
| $(LREF isControl) |
| $(LREF isFormat) |
| $(LREF isGraphical) |
| $(LREF isIntegralPair) |
| $(LREF isMark) |
| $(LREF isNonCharacter) |
| $(LREF isNumber) |
| $(LREF isPrivateUse) |
| $(LREF isPunctuation) |
| $(LREF isSpace) |
| $(LREF isSurrogate) |
| $(LREF isSurrogateHi) |
| $(LREF isSurrogateLo) |
| $(LREF isSymbol) |
| $(LREF isWhite) |
| )) |
| $(TR $(TD Normalization) $(TD |
| $(LREF NFC) |
| $(LREF NFD) |
| $(LREF NFKD) |
| $(LREF NormalizationForm) |
| $(LREF normalize) |
| )) |
| $(TR $(TD Decompose) $(TD |
| $(LREF decompose) |
| $(LREF decomposeHangul) |
| $(LREF UnicodeDecomposition) |
| )) |
| $(TR $(TD Compose) $(TD |
| $(LREF compose) |
| $(LREF composeJamo) |
| )) |
| $(TR $(TD Sets) $(TD |
| $(LREF CodepointInterval) |
| $(LREF CodepointSet) |
| $(LREF InversionList) |
| $(LREF unicode) |
| )) |
| $(TR $(TD Trie) $(TD |
| $(LREF codepointSetTrie) |
| $(LREF CodepointSetTrie) |
| $(LREF codepointTrie) |
| $(LREF CodepointTrie) |
| $(LREF toTrie) |
| $(LREF toDelegate) |
| )) |
| $(TR $(TD Casing) $(TD |
| $(LREF asCapitalized) |
| $(LREF asLowerCase) |
| $(LREF asUpperCase) |
| $(LREF isLower) |
| $(LREF isUpper) |
| $(LREF toLower) |
| $(LREF toLowerInPlace) |
| $(LREF toUpper) |
| $(LREF toUpperInPlace) |
| )) |
| $(TR $(TD Utf8Matcher) $(TD |
| $(LREF isUtfMatcher) |
| $(LREF MatcherConcept) |
| $(LREF utfMatcher) |
| )) |
| $(TR $(TD Separators) $(TD |
| $(LREF lineSep) |
| $(LREF nelSep) |
| $(LREF paraSep) |
| )) |
| $(TR $(TD Building blocks) $(TD |
| $(LREF allowedIn) |
| $(LREF combiningClass) |
| $(LREF Grapheme) |
| )) |
| ) |
| |
| $(P All primitives listed operate on Unicode characters and |
| sets of characters. For functions which operate on ASCII characters |
| and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). |
| For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms |
| used throughout this module see the $(S_LINK Terminology, terminology) section |
| below. |
| ) |
| $(P The focus of this module is the core needs of developing Unicode-aware |
| applications. To that effect it provides the following optimized primitives: |
| ) |
| $(UL |
| $(LI Character classification by category and common properties: |
| $(LREF isAlpha), $(LREF isWhite) and others. |
| ) |
| $(LI |
| Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). |
| ) |
| $(LI |
| Converting text to any of the four normalization forms via $(LREF normalize). |
| ) |
| $(LI |
| Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) |
| by user-perceived characters, that is by $(LREF Grapheme) clusters. |
| ) |
| $(LI |
| Decomposing and composing of individual character(s) according to canonical |
| or compatibility rules, see $(LREF compose) and $(LREF decompose), |
| including the specific version for Hangul syllables $(LREF composeJamo) |
| and $(LREF decomposeHangul). |
| ) |
| ) |
| $(P It's recognized that an application may need further enhancements |
| and extensions, such as less commonly known algorithms, |
| or tailoring existing ones for region specific needs. To help users |
| with building any extra functionality beyond the core primitives, |
| the module provides: |
| ) |
| $(UL |
| $(LI |
| $(LREF CodepointSet), a type for easy manipulation of sets of characters. |
| Besides the typical set algebra it provides an unusual feature: |
| a D source code generator for detection of $(CODEPOINTS) in this set. |
| This is a boon for meta-programming parser frameworks, |
| and is used internally to power classification in small |
| sets like $(LREF isWhite). |
| ) |
| $(LI |
| A way to construct optimal packed multi-stage tables also known as a |
| special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). |
| The functions $(LREF codepointTrie), $(LREF codepointSetTrie) |
| construct custom tries that map dchar to value. |
| The end result is a fast and predictable $(BIGOH 1) lookup that powers |
| functions like $(LREF isAlpha) and $(LREF combiningClass), |
| but for user-defined data sets. |
| ) |
| $(LI |
| A useful technique for Unicode-aware parsers that perform |
| character classification of encoded $(CODEPOINTS) |
| is to avoid unnecassary decoding at all costs. |
| $(LREF utfMatcher) provides an improvement over the usual workflow |
| of decode-classify-process, combining the decoding and classification |
| steps. By extracting necessary bits directly from encoded |
| $(S_LINK Code unit, code units) matchers achieve |
| significant performance improvements. See $(LREF MatcherConcept) for |
| the common interface of UTF matchers. |
| ) |
| $(LI |
| Generally useful building blocks for customized normalization: |
| $(LREF combiningClass) for querying combining class |
| and $(LREF allowedIn) for testing the Quick_Check |
| property of a given normalization form. |
| ) |
| $(LI |
| Access to a large selection of commonly used sets of $(CODEPOINTS). |
| $(S_LINK Unicode properties, Supported sets) include Script, |
| Block and General Category. The exact contents of a set can be |
| observed in the CLDR utility, on the |
| $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page |
| of the Unicode website. |
| See $(LREF unicode) for easy and (optionally) compile-time checked set |
| queries. |
| ) |
| ) |
| $(SECTION Synopsis) |
| --- |
| import std.uni; |
| void main() |
| { |
| // initialize code point sets using script/block or property name |
| // now 'set' contains code points from both scripts. |
| auto set = unicode("Cyrillic") | unicode("Armenian"); |
| // same thing but simpler and checked at compile-time |
| auto ascii = unicode.ASCII; |
| auto currency = unicode.Currency_Symbol; |
| |
| // easy set ops |
| auto a = set & ascii; |
| assert(a.empty); // as it has no intersection with ascii |
| a = set | ascii; |
| auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian |
| |
| // some properties of code point sets |
| assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 |
| // testing presence of a code point in a set |
| // is just fine, it is O(logN) |
| assert(!b['$']); |
| assert(!b['\u058F']); // Armenian dram sign |
| assert(b['¥']); |
| |
| // building fast lookup tables, these guarantee O(1) complexity |
| // 1-level Trie lookup table essentially a huge bit-set ~262Kb |
| auto oneTrie = toTrie!1(b); |
| // 2-level far more compact but typically slightly slower |
| auto twoTrie = toTrie!2(b); |
| // 3-level even smaller, and a bit slower yet |
| auto threeTrie = toTrie!3(b); |
| assert(oneTrie['£']); |
| assert(twoTrie['£']); |
| assert(threeTrie['£']); |
| |
| // build the trie with the most sensible trie level |
| // and bind it as a functor |
| auto cyrillicOrArmenian = toDelegate(set); |
| auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); |
| assert(balance == "ընկեր!"); |
| // compatible with bool delegate(dchar) |
| bool delegate(dchar) bindIt = cyrillicOrArmenian; |
| |
| // Normalization |
| string s = "Plain ascii (and not only), is always normalized!"; |
| assert(s is normalize(s));// is the same string |
| |
| string nonS = "A\u0308ffin"; // A ligature |
| auto nS = normalize(nonS); // to NFC, the W3C endorsed standard |
| assert(nS == "Äffin"); |
| assert(nS != nonS); |
| string composed = "Äffin"; |
| |
| assert(normalize!NFD(composed) == "A\u0308ffin"); |
| // to NFKD, compatibility decomposition useful for fuzzy matching/searching |
| assert(normalize!NFKD("2¹⁰") == "210"); |
| } |
| --- |
| $(SECTION Terminology |
| ) |
| $(P The following is a list of important Unicode notions |
| and definitions. Any conventions used specifically in this |
| module alone are marked as such. The descriptions are based on the formal |
| definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, |
| chapter three of The Unicode Standard Core Specification.) |
| ) |
| $(P $(DEF Abstract character) A unit of information used for the organization, |
| control, or representation of textual data. |
| Note that: |
| $(UL |
| $(LI When representing data, the nature of that data |
| is generally symbolic as opposed to some other |
| kind of data (for example, visual). |
| ) |
| $(LI An abstract character has no concrete form |
| and should not be confused with a $(S_LINK Glyph, glyph). |
| ) |
| $(LI An abstract character does not necessarily |
| correspond to what a user thinks of as a “character” |
| and should not be confused with a $(LREF Grapheme). |
| ) |
| $(LI The abstract characters encoded (see Encoded character) |
| are known as Unicode abstract characters. |
| ) |
| $(LI Abstract characters not directly |
| encoded by the Unicode Standard can often be |
| represented by the use of combining character sequences. |
| ) |
| ) |
| ) |
| $(P $(DEF Canonical decomposition) |
| The decomposition of a character or character sequence |
| that results from recursively applying the canonical |
| mappings found in the Unicode Character Database |
| and these described in Conjoining Jamo Behavior |
| (section 12 of |
| $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). |
| ) |
| $(P $(DEF Canonical composition) |
| The precise definition of the Canonical composition |
| is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, |
| Unicode Conformance) section 11. |
| Informally it's the process that does the reverse of the canonical |
| decomposition with the addition of certain rules |
| that e.g. prevent legacy characters from appearing in the composed result. |
| ) |
| $(P $(DEF Canonical equivalent) |
| Two character sequences are said to be canonical equivalents if |
| their full canonical decompositions are identical. |
| ) |
| $(P $(DEF Character) Typically differs by context. |
| For the purpose of this documentation the term $(I character) |
| implies $(I encoded character), that is, a code point having |
| an assigned abstract character (a symbolic meaning). |
| ) |
| $(P $(DEF Code point) Any value in the Unicode codespace; |
| that is, the range of integers from 0 to 10FFFF (hex). |
| Not all code points are assigned to encoded characters. |
| ) |
| $(P $(DEF Code unit) The minimal bit combination that can represent |
| a unit of encoded text for processing or interchange. |
| Depending on the encoding this could be: |
| 8-bit code units in the UTF-8 ($(D char)), |
| 16-bit code units in the UTF-16 ($(D wchar)), |
| and 32-bit code units in the UTF-32 ($(D dchar)). |
| $(I Note that in UTF-32, a code unit is a code point |
| and is represented by the D $(D dchar) type.) |
| ) |
| $(P $(DEF Combining character) A character with the General Category |
| of Combining Mark(M). |
| $(UL |
| $(LI All characters with non-zero canonical combining class |
| are combining characters, but the reverse is not the case: |
| there are combining characters with a zero combining class. |
| ) |
| $(LI These characters are not normally used in isolation |
| unless they are being described. They include such characters |
| as accents, diacritics, Hebrew points, Arabic vowel signs, |
| and Indic matras. |
| ) |
| ) |
| ) |
| $(P $(DEF Combining class) |
| A numerical value used by the Unicode Canonical Ordering Algorithm |
| to determine which sequences of combining marks are to be |
| considered canonically equivalent and which are not. |
| ) |
| $(P $(DEF Compatibility decomposition) |
| The decomposition of a character or character sequence that results |
| from recursively applying both the compatibility mappings and |
| the canonical mappings found in the Unicode Character Database, and those |
| described in Conjoining Jamo Behavior no characters |
| can be further decomposed. |
| ) |
| $(P $(DEF Compatibility equivalent) |
| Two character sequences are said to be compatibility |
| equivalents if their full compatibility decompositions are identical. |
| ) |
| $(P $(DEF Encoded character) An association (or mapping) |
| between an abstract character and a code point. |
| ) |
| $(P $(DEF Glyph) The actual, concrete image of a glyph representation |
| having been rasterized or otherwise imaged onto some display surface. |
| ) |
| $(P $(DEF Grapheme base) A character with the property |
| Grapheme_Base, or any standard Korean syllable block. |
| ) |
| $(P $(DEF Grapheme cluster) Defined as the text between |
| grapheme boundaries as specified by Unicode Standard Annex #29, |
| $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). |
| Important general properties of a grapheme: |
| $(UL |
| $(LI The grapheme cluster represents a horizontally segmentable |
| unit of text, consisting of some grapheme base (which may |
| consist of a Korean syllable) together with any number of |
| nonspacing marks applied to it. |
| ) |
| $(LI A grapheme cluster typically starts with a grapheme base |
| and then extends across any subsequent sequence of nonspacing marks. |
| A grapheme cluster is most directly relevant to text rendering and |
| processes such as cursor placement and text selection in editing, |
| but may also be relevant to comparison and searching. |
| ) |
| $(LI For many processes, a grapheme cluster behaves as if it was a |
| single character with the same properties as its grapheme base. |
| Effectively, nonspacing marks apply $(I graphically) to the base, |
| but do not change its properties. |
| ) |
| ) |
| $(P This module defines a number of primitives that work with graphemes: |
| $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). |
| All of them are using $(I extended grapheme) boundaries |
| as defined in the aforementioned standard annex. |
| ) |
| ) |
| $(P $(DEF Nonspacing mark) A combining character with the |
| General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). |
| ) |
| $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. |
| ) |
| $(SECTION Normalization |
| ) |
| $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) |
| or $(S_LINK Compatibility equivalent, compatibility equivalent) |
| characters in the Unicode Standard make it necessary to have a full, formal |
| definition of equivalence for Unicode strings. |
| String equivalence is determined by a process called normalization, |
| whereby strings are converted into forms which are compared |
| directly for identity. This is the primary goal of the normalization process, |
| see the function $(LREF normalize) to convert into any of |
| the four defined forms. |
| ) |
| $(P A very important attribute of the Unicode Normalization Forms |
| is that they must remain stable between versions of the Unicode Standard. |
| A Unicode string normalized to a particular Unicode Normalization Form |
| in one version of the standard is guaranteed to remain in that Normalization |
| Form for implementations of future versions of the standard. |
| ) |
| $(P The Unicode Standard specifies four normalization forms. |
| Informally, two of these forms are defined by maximal decomposition |
| of equivalent sequences, and two of these forms are defined |
| by maximal $(I composition) of equivalent sequences. |
| $(UL |
| $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, |
| canonical decomposition) of a character sequence.) |
| $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, |
| compatibility decomposition) of a character sequence.) |
| $(LI Normalization Form C (NFC): The canonical composition of the |
| $(S_LINK Canonical decomposition, canonical decomposition) |
| of a coded character sequence.) |
| $(LI Normalization Form KC (NFKC): The canonical composition |
| of the $(S_LINK Compatibility decomposition, |
| compatibility decomposition) of a character sequence) |
| ) |
| ) |
| $(P The choice of the normalization form depends on the particular use case. |
| NFC is the best form for general text, since it's more compatible with |
| strings converted from legacy encodings. NFKC is the preferred form for |
| identifiers, especially where there are security concerns. NFD and NFKD |
| are the most useful for internal processing. |
| ) |
| $(SECTION Construction of lookup tables |
| ) |
| $(P The Unicode standard describes a set of algorithms that |
| depend on having the ability to quickly look up various properties |
| of a code point. Given the the codespace of about 1 million $(CODEPOINTS), |
| it is not a trivial task to provide a space-efficient solution for |
| the multitude of properties. |
| ) |
| $(P Common approaches such as hash-tables or binary search over |
| sorted code point intervals (as in $(LREF InversionList)) are insufficient. |
| Hash-tables have enormous memory footprint and binary search |
| over intervals is not fast enough for some heavy-duty algorithms. |
| ) |
| $(P The recommended solution (see Unicode Implementation Guidelines) |
| is using multi-stage tables that are an implementation of the |
| $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer |
| keys and a fixed number of stages. For the remainder of the section |
| this will be called a fixed trie. The following describes a particular |
| implementation that is aimed for the speed of access at the expense |
| of ideal size savings. |
| ) |
| $(P Taking a 2-level Trie as an example the principle of operation is as follows. |
| Split the number of bits in a key (code point, 21 bits) into 2 components |
| (e.g. 15 and 8). The first is the number of bits in the index of the trie |
| and the other is number of bits in each page of the trie. |
| The layout of the trie is then an array of size 2^^bits-of-index followed |
| an array of memory chunks of size 2^^bits-of-page/bits-per-element. |
| ) |
| $(P The number of pages is variable (but not less then 1) |
| unlike the number of entries in the index. The slots of the index |
| all have to contain a number of a page that is present. The lookup is then |
| just a couple of operations - slice the upper bits, |
| lookup an index for these, take a page at this index and use |
| the lower bits as an offset within this page. |
| |
| Assuming that pages are laid out consequently |
| in one array at $(D pages), the pseudo-code is: |
| ) |
| --- |
| auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; |
| pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; |
| --- |
| $(P Where if $(D elemsPerPage) is a power of 2 the whole process is |
| a handful of simple instructions and 2 array reads. Subsequent levels |
| of the trie are introduced by recursing on this notion - the index array |
| is treated as values. The number of bits in index is then again |
| split into 2 parts, with pages over 'current-index' and the new 'upper-index'. |
| ) |
| |
| $(P For completeness a level 1 trie is simply an array. |
| The current implementation takes advantage of bit-packing values |
| when the range is known to be limited in advance (such as $(D bool)). |
| See also $(LREF BitPacked) for enforcing it manually. |
| The major size advantage however comes from the fact |
| that multiple $(B identical pages on every level are merged) by construction. |
| ) |
| $(P The process of constructing a trie is more involved and is hidden from |
| the user in a form of the convenience functions $(LREF codepointTrie), |
| $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). |
| In general a set or built-in AA with $(D dchar) type |
| can be turned into a trie. The trie object in this module |
| is read-only (immutable); it's effectively frozen after construction. |
| ) |
| $(SECTION Unicode properties |
| ) |
| $(P This is a full list of Unicode properties accessible through $(LREF unicode) |
| with specific helpers per category nested within. Consult the |
| $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) |
| when in doubt about the contents of a particular set. |
| ) |
| $(P General category sets listed below are only accessible with the |
| $(LREF unicode) shorthand accessor.) |
| $(BOOKTABLE $(B General category ), |
| $(TR $(TH Abb.) $(TH Long form) |
| $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form)) |
| $(TR $(TD L) $(TD Letter) |
| $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) |
| $(TR $(TD Ll) $(TD Lowercase_Letter) |
| $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) |
| $(TR $(TD Lm) $(TD Modifier_Letter) |
| $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) |
| $(TR $(TD Lo) $(TD Other_Letter) |
| $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) |
| $(TR $(TD Lt) $(TD Titlecase_Letter) |
| $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) |
| $(TR $(TD Lu) $(TD Uppercase_Letter) |
| $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) |
| $(TR $(TD M) $(TD Mark) |
| $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) |
| $(TR $(TD Mc) $(TD Spacing_Mark) |
| $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) |
| $(TR $(TD Me) $(TD Enclosing_Mark) |
| $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) |
| $(TR $(TD Mn) $(TD Nonspacing_Mark) |
| $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) |
| $(TR $(TD C) $(TD Other) |
| $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) |
| $(TR $(TD Cc) $(TD Control) $(TD Pf) |
| $(TD Final_Punctuation) $(TD -) $(TD Any)) |
| $(TR $(TD Cf) $(TD Format) |
| $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) |
| ) |
| $(P Sets for other commonly useful properties that are |
| accessible with $(LREF unicode):) |
| $(BOOKTABLE $(B Common binary properties), |
| $(TR $(TH Name) $(TH Name) $(TH Name)) |
| $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) |
| $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) |
| $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) |
| $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) |
| $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) |
| $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) |
| $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) |
| $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) |
| $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) |
| $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) |
| $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) |
| $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) |
| $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) |
| $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) |
| $(TR $(TD Hyphen) $(TD Other_Lowercase) ) |
| $(TR $(TD ID_Continue) $(TD Other_Math) ) |
| ) |
| $(P Below is the table with block names accepted by $(LREF unicode.block). |
| Note that the shorthand version $(LREF unicode) requires "In" |
| to be prepended to the names of blocks so as to disambiguate |
| scripts and blocks. |
| ) |
| $(BOOKTABLE $(B Blocks), |
| $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) |
| $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) |
| $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) |
| $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) |
| $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) |
| $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) |
| $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) |
| $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) |
| $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) |
| $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) |
| $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) |
| $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) |
| $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) |
| $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) |
| $(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya)) |
| $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) |
| $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) |
| $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) |
| $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) |
| $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) |
| $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) |
| $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) |
| $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) |
| $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) |
| $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) |
| $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) |
| $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) |
| $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) |
| $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) |
| $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) |
| $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) |
| $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) |
| $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) |
| $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) |
| $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) |
| $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) |
| $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) |
| $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) |
| $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) |
| $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) |
| $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) |
| $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation)) |
| $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) |
| $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) |
| $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) |
| $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) |
| $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) |
| $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) |
| $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) |
| $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) |
| $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) |
| $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) |
| $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) |
| $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) |
| $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) |
| $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) |
| $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) |
| $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) |
| $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) |
| $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) |
| $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) |
| $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) |
| $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) |
| $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) |
| $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) |
| $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) |
| $(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) |
| $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) |
| $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) |
| $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) |
| $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) |
| $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) |
| $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) |
| $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) |
| ) |
| $(P Below is the table with script names accepted by $(LREF unicode.script) |
| and by the shorthand version $(LREF unicode):) |
| $(BOOKTABLE $(B Scripts), |
| $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) |
| $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) |
| $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) |
| $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) |
| $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) |
| $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) |
| $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) |
| $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) |
| $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) |
| $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) |
| $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) |
| $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) |
| $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) |
| $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) |
| $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) |
| $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) |
| $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) |
| $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) |
| $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) |
| $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) |
| $(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa)) |
| $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) |
| $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) |
| $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) |
| $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) |
| $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) |
| $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) |
| $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) |
| $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) |
| $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) |
| $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) |
| $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) |
| $(TR $(TD Han) $(TD Ogham) $(TD Vai)) |
| $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) |
| ) |
| $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) |
| $(BOOKTABLE $(B Hangul syllable type), |
| $(TR $(TH Abb.) $(TH Long form)) |
| $(TR $(TD L) $(TD Leading_Jamo)) |
| $(TR $(TD LV) $(TD LV_Syllable)) |
| $(TR $(TD LVT) $(TD LVT_Syllable) ) |
| $(TR $(TD T) $(TD Trailing_Jamo)) |
| $(TR $(TD V) $(TD Vowel_Jamo)) |
| ) |
| References: |
| $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), |
| $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), |
| $(HTTP www.unicode.org, The Unicode Consortium), |
| $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), |
| $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) |
| $(HTTP www.unicode.org/uni2book/ch05.pdf, |
| Unicode Implementation Guidelines) |
| $(HTTP www.unicode.org/uni2book/ch03.pdf, |
| Unicode Conformance) |
| Trademarks: |
| Unicode(tm) is a trademark of Unicode, Inc. |
| |
| Copyright: Copyright 2013 - |
| License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). |
| Authors: Dmitry Olshansky |
| Source: $(PHOBOSSRC std/_uni.d) |
| Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) |
| |
| Macros: |
| |
| SECTION = <h3><a id="$1">$0</a></h3> |
| DEF = <div><a id="$1"><i>$0</i></a></div> |
| S_LINK = <a href="#$1">$+</a> |
| CODEPOINT = $(S_LINK Code point, code point) |
| CODEPOINTS = $(S_LINK Code point, code points) |
| CHARACTER = $(S_LINK Character, character) |
| CHARACTERS = $(S_LINK Character, characters) |
| CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) |
| +/ |
| module std.uni; |
| |
| import std.meta; // AliasSeq |
| import std.range.primitives; // back, ElementEncodingType, ElementType, empty, |
| // front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put, |
| // save |
| import std.traits; // isConvertibleToString, isIntegral, isSomeChar, |
| // isSomeString, Unqual |
| import std.exception;// : enforce; |
| import core.memory; //: pureMalloc, pureRealloc, pureFree; |
| import core.exception; // : onOutOfMemoryError; |
| static import std.ascii; |
| // debug = std_uni; |
| |
| debug(std_uni) import std.stdio; // writefln, writeln |
| |
| private: |
| |
| version (unittest) |
| { |
| private: |
| struct TestAliasedString |
| { |
| string get() @safe @nogc pure nothrow { return _s; } |
| alias get this; |
| @disable this(this); |
| string _s; |
| } |
| |
| bool testAliasedString(alias func, Args...)(string s, Args args) |
| { |
| import std.algorithm.comparison : equal; |
| auto a = func(TestAliasedString(s), args); |
| auto b = func(s, args); |
| static if (is(typeof(equal(a, b)))) |
| { |
| // For ranges, compare contents instead of object identity. |
| return equal(a, b); |
| } |
| else |
| { |
| return a == b; |
| } |
| } |
| } |
| |
| void copyBackwards(T,U)(T[] src, U[] dest) |
| { |
| assert(src.length == dest.length); |
| for (size_t i=src.length; i-- > 0; ) |
| dest[i] = src[i]; |
| } |
| |
| void copyForward(T,U)(T[] src, U[] dest) |
| { |
| assert(src.length == dest.length); |
| for (size_t i=0; i<src.length; i++) |
| dest[i] = src[i]; |
| } |
| |
| // TODO: update to reflect all major CPUs supporting unaligned reads |
| version (X86) |
| enum hasUnalignedReads = true; |
| else version (X86_64) |
| enum hasUnalignedReads = true; |
| else version (SystemZ) |
| enum hasUnalignedReads = true; |
| else |
| enum hasUnalignedReads = false; // better be safe then sorry |
| |
| public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. |
| public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. |
| public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. |
| |
| // test the intro example |
| @safe unittest |
| { |
| import std.algorithm.searching : find; |
| // initialize code point sets using script/block or property name |
| // set contains code points from both scripts. |
| auto set = unicode("Cyrillic") | unicode("Armenian"); |
| // or simpler and statically-checked look |
| auto ascii = unicode.ASCII; |
| auto currency = unicode.Currency_Symbol; |
| |
| // easy set ops |
| auto a = set & ascii; |
| assert(a.empty); // as it has no intersection with ascii |
| a = set | ascii; |
| auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian |
| |
| // some properties of code point sets |
| assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 |
| // testing presence of a code point in a set |
| // is just fine, it is O(logN) |
| assert(!b['$']); |
| assert(!b['\u058F']); // Armenian dram sign |
| assert(b['¥']); |
| |
| // building fast lookup tables, these guarantee O(1) complexity |
| // 1-level Trie lookup table essentially a huge bit-set ~262Kb |
| auto oneTrie = toTrie!1(b); |
| // 2-level far more compact but typically slightly slower |
| auto twoTrie = toTrie!2(b); |
| // 3-level even smaller, and a bit slower yet |
| auto threeTrie = toTrie!3(b); |
| assert(oneTrie['£']); |
| assert(twoTrie['£']); |
| assert(threeTrie['£']); |
| |
| // build the trie with the most sensible trie level |
| // and bind it as a functor |
| auto cyrillicOrArmenian = toDelegate(set); |
| auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); |
| assert(balance == "ընկեր!"); |
| // compatible with bool delegate(dchar) |
| bool delegate(dchar) bindIt = cyrillicOrArmenian; |
| |
| // Normalization |
| string s = "Plain ascii (and not only), is always normalized!"; |
| assert(s is normalize(s));// is the same string |
| |
| string nonS = "A\u0308ffin"; // A ligature |
| auto nS = normalize(nonS); // to NFC, the W3C endorsed standard |
| assert(nS == "Äffin"); |
| assert(nS != nonS); |
| string composed = "Äffin"; |
| |
| assert(normalize!NFD(composed) == "A\u0308ffin"); |
| // to NFKD, compatibility decomposition useful for fuzzy matching/searching |
| assert(normalize!NFKD("2¹⁰") == "210"); |
| } |
| |
| enum lastDchar = 0x10FFFF; |
| |
| auto force(T, F)(F from) |
| if (isIntegral!T && !is(T == F)) |
| { |
| assert(from <= T.max && from >= T.min); |
| return cast(T) from; |
| } |
| |
| auto force(T, F)(F from) |
| if (isBitPacked!T && !is(T == F)) |
| { |
| assert(from <= 2^^bitSizeOf!T-1); |
| return T(cast(TypeOfBitPacked!T) from); |
| } |
| |
| auto force(T, F)(F from) |
| if (is(T == F)) |
| { |
| return from; |
| } |
| |
| // repeat X times the bit-pattern in val assuming it's length is 'bits' |
| size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc |
| { |
| static if (times == 1) |
| return val; |
| else static if (bits == 1) |
| { |
| static if (times == size_t.sizeof*8) |
| return val ? size_t.max : 0; |
| else |
| return val ? (1 << times)-1 : 0; |
| } |
| else static if (times % 2) |
| return (replicateBits!(times-1, bits)(val)<<bits) | val; |
| else |
| return replicateBits!(times/2, bits*2)((val << bits) | val); |
| } |
| |
| @safe pure nothrow @nogc unittest // for replicate |
| { |
| import std.algorithm.iteration : sum, map; |
| import std.range : iota; |
| size_t m = 0b111; |
| size_t m2 = 0b01; |
| foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) |
| { |
| assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i))); |
| assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum()); |
| } |
| } |
| |
| // multiple arrays squashed into one memory block |
| struct MultiArray(Types...) |
| { |
| import std.range.primitives : isOutputRange; |
| this(size_t[] sizes...) @safe pure nothrow |
| { |
| assert(dim == sizes.length); |
| size_t full_size; |
| foreach (i, v; Types) |
| { |
| full_size += spaceFor!(bitSizeOf!v)(sizes[i]); |
| sz[i] = sizes[i]; |
| static if (i >= 1) |
| offsets[i] = offsets[i-1] + |
| spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]); |
| } |
| |
| storage = new size_t[full_size]; |
| } |
| |
| this(const(size_t)[] raw_offsets, |
| const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc |
| { |
| offsets[] = raw_offsets[]; |
| sz[] = raw_sizes[]; |
| storage = data; |
| } |
| |
| @property auto slice(size_t n)()inout pure nothrow @nogc |
| { |
| auto ptr = raw_ptr!n; |
| return packedArrayView!(Types[n])(ptr, sz[n]); |
| } |
| |
| @property auto ptr(size_t n)()inout pure nothrow @nogc |
| { |
| auto ptr = raw_ptr!n; |
| return inout(PackedPtr!(Types[n]))(ptr); |
| } |
| |
| template length(size_t n) |
| { |
| @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; } |
| |
| @property void length(size_t new_size) |
| { |
| if (new_size > sz[n]) |
| {// extend |
| size_t delta = (new_size - sz[n]); |
| sz[n] += delta; |
| delta = spaceFor!(bitSizeOf!(Types[n]))(delta); |
| storage.length += delta;// extend space at end |
| // raw_slice!x must follow resize as it could be moved! |
| // next stmts move all data past this array, last-one-goes-first |
| static if (n != dim-1) |
| { |
| auto start = raw_ptr!(n+1); |
| // len includes delta |
| size_t len = (storage.ptr+storage.length-start); |
| |
| copyBackwards(start[0 .. len-delta], start[delta .. len]); |
| |
| start[0 .. delta] = 0; |
| // offsets are used for raw_slice, ptr etc. |
| foreach (i; n+1 .. dim) |
| offsets[i] += delta; |
| } |
| } |
| else if (new_size < sz[n]) |
| {// shrink |
| size_t delta = (sz[n] - new_size); |
| sz[n] -= delta; |
| delta = spaceFor!(bitSizeOf!(Types[n]))(delta); |
| // move all data past this array, forward direction |
| static if (n != dim-1) |
| { |
| auto start = raw_ptr!(n+1); |
| size_t len = (storage.ptr+storage.length-start); |
| copyForward(start[0 .. len-delta], start[delta .. len]); |
| |
| // adjust offsets last, they affect raw_slice |
| foreach (i; n+1 .. dim) |
| offsets[i] -= delta; |
| } |
| storage.length -= delta; |
| } |
| // else - NOP |
| } |
| } |
| |
| @property size_t bytes(size_t n=size_t.max)() const @safe |
| { |
| static if (n == size_t.max) |
| return storage.length*size_t.sizeof; |
| else static if (n != Types.length-1) |
| return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof; |
| else |
| return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof; |
| } |
| |
| void store(OutRange)(scope OutRange sink) const |
| if (isOutputRange!(OutRange, char)) |
| { |
| import std.format : formattedWrite; |
| formattedWrite(sink, "[%( 0x%x, %)]", offsets[]); |
| formattedWrite(sink, ", [%( 0x%x, %)]", sz[]); |
| formattedWrite(sink, ", [%( 0x%x, %)]", storage); |
| } |
| |
| private: |
| import std.meta : staticMap; |
| @property auto raw_ptr(size_t n)()inout pure nothrow @nogc |
| { |
| static if (n == 0) |
| return storage.ptr; |
| else |
| { |
| return storage.ptr+offsets[n]; |
| } |
| } |
| enum dim = Types.length; |
| size_t[dim] offsets;// offset for level x |
| size_t[dim] sz;// size of level x |
| alias bitWidth = staticMap!(bitSizeOf, Types); |
| size_t[] storage; |
| } |
| |
| @system unittest |
| { |
| import std.conv : text; |
| enum dg = (){ |
| // sizes are: |
| // lvl0: 3, lvl1 : 2, lvl2: 1 |
| auto m = MultiArray!(int, ubyte, int)(3,2,1); |
| |
| static void check(size_t k, T)(ref T m, int n) |
| { |
| foreach (i; 0 .. n) |
| assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); |
| } |
| |
| static void checkB(size_t k, T)(ref T m, int n) |
| { |
| foreach (i; 0 .. n) |
| assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); |
| } |
| |
| static void fill(size_t k, T)(ref T m, int n) |
| { |
| foreach (i; 0 .. n) |
| m.slice!(k)[i] = force!ubyte(i+1); |
| } |
| |
| static void fillB(size_t k, T)(ref T m, int n) |
| { |
| foreach (i; 0 .. n) |
| m.slice!(k)[i] = force!ubyte(n-i); |
| } |
| |
| m.length!1 = 100; |
| fill!1(m, 100); |
| check!1(m, 100); |
| |
| m.length!0 = 220; |
| fill!0(m, 220); |
| check!1(m, 100); |
| check!0(m, 220); |
| |
| m.length!2 = 17; |
| fillB!2(m, 17); |
| checkB!2(m, 17); |
| check!0(m, 220); |
| check!1(m, 100); |
| |
| m.length!2 = 33; |
| checkB!2(m, 17); |
| fillB!2(m, 33); |
| checkB!2(m, 33); |
| check!0(m, 220); |
| check!1(m, 100); |
| |
| m.length!1 = 195; |
| fillB!1(m, 195); |
| checkB!1(m, 195); |
| checkB!2(m, 33); |
| check!0(m, 220); |
| |
| auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); |
| marr.length!0 = 15; |
| marr.length!1 = 30; |
| fill!1(marr, 30); |
| fill!0(marr, 15); |
| check!1(marr, 30); |
| check!0(marr, 15); |
| return 0; |
| }; |
| enum ct = dg(); |
| auto rt = dg(); |
| } |
| |
| @system unittest |
| {// more bitpacking tests |
| import std.conv : text; |
| |
| alias Bitty = |
| MultiArray!(BitPacked!(size_t, 3) |
| , BitPacked!(size_t, 4) |
| , BitPacked!(size_t, 3) |
| , BitPacked!(size_t, 6) |
| , bool); |
| alias fn1 = sliceBits!(13, 16); |
| alias fn2 = sliceBits!( 9, 13); |
| alias fn3 = sliceBits!( 6, 9); |
| alias fn4 = sliceBits!( 0, 6); |
| static void check(size_t lvl, MA)(ref MA arr){ |
| for (size_t i = 0; i< arr.length!lvl; i++) |
| assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); |
| } |
| |
| static void fillIdx(size_t lvl, MA)(ref MA arr){ |
| for (size_t i = 0; i< arr.length!lvl; i++) |
| arr.slice!(lvl)[i] = i; |
| } |
| Bitty m1; |
| |
| m1.length!4 = 10; |
| m1.length!3 = 2^^6; |
| m1.length!2 = 2^^3; |
| m1.length!1 = 2^^4; |
| m1.length!0 = 2^^3; |
| |
| m1.length!4 = 2^^16; |
| |
| for (size_t i = 0; i< m1.length!4; i++) |
| m1.slice!(4)[i] = i % 2; |
| |
| fillIdx!1(m1); |
| check!1(m1); |
| fillIdx!2(m1); |
| check!2(m1); |
| fillIdx!3(m1); |
| check!3(m1); |
| fillIdx!0(m1); |
| check!0(m1); |
| check!3(m1); |
| check!2(m1); |
| check!1(m1); |
| for (size_t i=0; i < 2^^16; i++) |
| { |
| m1.slice!(4)[i] = i % 2; |
| m1.slice!(0)[fn1(i)] = fn1(i); |
| m1.slice!(1)[fn2(i)] = fn2(i); |
| m1.slice!(2)[fn3(i)] = fn3(i); |
| m1.slice!(3)[fn4(i)] = fn4(i); |
| } |
| for (size_t i=0; i < 2^^16; i++) |
| { |
| assert(m1.slice!(4)[i] == i % 2); |
| assert(m1.slice!(0)[fn1(i)] == fn1(i)); |
| assert(m1.slice!(1)[fn2(i)] == fn2(i)); |
| assert(m1.slice!(2)[fn3(i)] == fn3(i)); |
| assert(m1.slice!(3)[fn4(i)] == fn4(i)); |
| } |
| } |
| |
| size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc |
| { |
| import std.math : nextPow2; |
| enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView |
| static if (bits > 8*size_t.sizeof) |
| { |
| static assert(bits % (size_t.sizeof*8) == 0); |
| return new_len * bits/(8*size_t.sizeof); |
| } |
| else |
| { |
| enum factor = size_t.sizeof*8/bits; |
| return (new_len+factor-1)/factor; // rounded up |
| } |
| } |
| |
| template isBitPackableType(T) |
| { |
| enum isBitPackableType = isBitPacked!T |
| || isIntegral!T || is(T == bool) || isSomeChar!T; |
| } |
| |
| //============================================================================ |
| template PackedArrayView(T) |
| if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) |
| && isBitPackableType!U) || isBitPackableType!T) |
| { |
| import std.math : nextPow2; |
| private enum bits = bitSizeOf!T; |
| alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); |
| } |
| |
| //unsafe and fast access to a chunk of RAM as if it contains packed values |
| template PackedPtr(T) |
| if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) |
| && isBitPackableType!U) || isBitPackableType!T) |
| { |
| import std.math : nextPow2; |
| private enum bits = bitSizeOf!T; |
| alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); |
| } |
| |
| struct PackedPtrImpl(T, size_t bits) |
| { |
| pure nothrow: |
| static assert(isPow2OrZero(bits)); |
| |
| this(inout(size_t)* ptr)inout @safe @nogc |
| { |
| origin = ptr; |
| } |
| |
| private T simpleIndex(size_t n) inout |
| { |
| immutable q = n / factor; |
| immutable r = n % factor; |
| return cast(T)((origin[q] >> bits*r) & mask); |
| } |
| |
| private void simpleWrite(TypeOfBitPacked!T val, size_t n) |
| in |
| { |
| static if (isIntegral!T) |
| assert(val <= mask); |
| } |
| body |
| { |
| immutable q = n / factor; |
| immutable r = n % factor; |
| immutable tgt_shift = bits*r; |
| immutable word = origin[q]; |
| origin[q] = (word & ~(mask << tgt_shift)) |
| | (cast(size_t) val << tgt_shift); |
| } |
| |
| static if (factor == bytesPerWord// can safely pack by byte |
| || factor == 1 // a whole word at a time |
| || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) |
| && hasUnalignedReads)) // this needs unaligned reads |
| { |
| static if (factor == bytesPerWord) |
| alias U = ubyte; |
| else static if (factor == bytesPerWord/2) |
| alias U = ushort; |
| else static if (factor == bytesPerWord/4) |
| alias U = uint; |
| else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) |
| alias U = ulong; |
| |
| T opIndex(size_t idx) inout |
| { |
| T ret; |
| version (LittleEndian) |
| ret = __ctfe ? simpleIndex(idx) : |
| cast(inout(T))(cast(U*) origin)[idx]; |
| else |
| ret = simpleIndex(idx); |
| return ret; |
| } |
| |
| static if (isBitPacked!T) // lack of user-defined implicit conversion |
| { |
| void opIndexAssign(T val, size_t idx) |
| { |
| return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); |
| } |
| } |
| |
| void opIndexAssign(TypeOfBitPacked!T val, size_t idx) |
| { |
| version (LittleEndian) |
| { |
| if (__ctfe) |
| simpleWrite(val, idx); |
| else |
| (cast(U*) origin)[idx] = cast(U) val; |
| } |
| else |
| simpleWrite(val, idx); |
| } |
| } |
| else |
| { |
| T opIndex(size_t n) inout |
| { |
| return simpleIndex(n); |
| } |
| |
| static if (isBitPacked!T) // lack of user-defined implicit conversion |
| { |
| void opIndexAssign(T val, size_t idx) |
| { |
| return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); |
| } |
| } |
| |
| void opIndexAssign(TypeOfBitPacked!T val, size_t n) |
| { |
| return simpleWrite(val, n); |
| } |
| } |
| |
| private: |
| // factor - number of elements in one machine word |
| enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; |
| enum bytesPerWord = size_t.sizeof; |
| size_t* origin; |
| } |
| |
| // data is packed only by power of two sized packs per word, |
| // thus avoiding mul/div overhead at the cost of ultimate packing |
| // this construct doesn't own memory, only provides access, see MultiArray for usage |
| struct PackedArrayViewImpl(T, size_t bits) |
| { |
| pure nothrow: |
| |
| this(inout(size_t)* origin, size_t offset, size_t items) inout @safe |
| { |
| ptr = inout(PackedPtr!(T))(origin); |
| ofs = offset; |
| limit = items; |
| } |
| |
| bool zeros(size_t s, size_t e) |
| in |
| { |
| assert(s <= e); |
| } |
| body |
| { |
| s += ofs; |
| e += ofs; |
| immutable pad_s = roundUp(s); |
| if ( s >= e) |
| { |
| foreach (i; s .. e) |
| if (ptr[i]) |
| return false; |
| return true; |
| } |
| immutable pad_e = roundDown(e); |
| size_t i; |
| for (i=s; i<pad_s; i++) |
| if (ptr[i]) |
| return false; |
| // all in between is x*factor elements |
| for (size_t j=i/factor; i<pad_e; i+=factor, j++) |
| if (ptr.origin[j]) |
| return false; |
| for (; i<e; i++) |
| if (ptr[i]) |
| return false; |
| return true; |
| } |
| |
| T opIndex(size_t idx) inout |
| in |
| { |
| assert(idx < limit); |
| } |
| body |
| { |
| return ptr[ofs + idx]; |
| } |
| |
| static if (isBitPacked!T) // lack of user-defined implicit conversion |
| { |
| void opIndexAssign(T val, size_t idx) |
| { |
| return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); |
| } |
| } |
| |
| void opIndexAssign(TypeOfBitPacked!T val, size_t idx) |
| in |
| { |
| assert(idx < limit); |
| } |
| body |
| { |
| ptr[ofs + idx] = val; |
| } |
| |
| static if (isBitPacked!T) // lack of user-defined implicit conversions |
| { |
| void opSliceAssign(T val, size_t start, size_t end) |
| { |
| opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); |
| } |
| } |
| |
| void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) |
| in |
| { |
| assert(start <= end); |
| assert(end <= limit); |
| } |
| body |
| { |
| // account for ofsetted view |
| start += ofs; |
| end += ofs; |
| // rounded to factor granularity |
| immutable pad_start = roundUp(start);// rounded up |
| if (pad_start >= end) //rounded up >= then end of slice |
| { |
| //nothing to gain, use per element assignment |
| foreach (i; start .. end) |
| ptr[i] = val; |
| return; |
| } |
| immutable pad_end = roundDown(end); // rounded down |
| size_t i; |
| for (i=start; i<pad_start; i++) |
| ptr[i] = val; |
| // all in between is x*factor elements |
| if (pad_start != pad_end) |
| { |
| immutable repval = replicateBits!(factor, bits)(val); |
| for (size_t j=i/factor; i<pad_end; i+=factor, j++) |
| ptr.origin[j] = repval;// so speed it up by factor |
| } |
| for (; i<end; i++) |
| ptr[i] = val; |
| } |
| |
| auto opSlice(size_t from, size_t to)inout |
| in |
| { |
| assert(from <= to); |
| assert(ofs + to <= limit); |
| } |
| body |
| { |
| return typeof(this)(ptr.origin, ofs + from, to - from); |
| } |
| |
| auto opSlice(){ return opSlice(0, length); } |
| |
| bool opEquals(T)(auto ref T arr) const |
| { |
| if (limit != arr.limit) |
| return false; |
| size_t s1 = ofs, s2 = arr.ofs; |
| size_t e1 = s1 + limit, e2 = s2 + limit; |
| if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) |
| { |
| return ptr.origin[s1/factor .. e1/factor] |
| == arr.ptr.origin[s2/factor .. e2/factor]; |
| } |
| for (size_t i=0;i<limit; i++) |
| if (this[i] != arr[i]) |
| return false; |
| return true; |
| } |
| |
| @property size_t length()const{ return limit; } |
| |
| private: |
| auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } |
| auto roundDown()(size_t val){ return val/factor*factor; } |
| // factor - number of elements in one machine word |
| enum factor = size_t.sizeof*8/bits; |
| PackedPtr!(T) ptr; |
| size_t ofs, limit; |
| } |
| |
| |
| private struct SliceOverIndexed(T) |
| { |
| enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); |
| enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; })); |
| auto opIndex(size_t idx)const |
| in |
| { |
| assert(idx < to - from); |
| } |
| body |
| { |
| return (*arr)[from+idx]; |
| } |
| |
| static if (assignableIndex) |
| void opIndexAssign(Item val, size_t idx) |
| in |
| { |
| assert(idx < to - from); |
| } |
| body |
| { |
| (*arr)[from+idx] = val; |
| } |
| |
| auto opSlice(size_t a, size_t b) |
| { |
| return typeof(this)(from+a, from+b, arr); |
| } |
| |
| // static if (assignableSlice) |
| void opSliceAssign(T)(T val, size_t start, size_t end) |
| { |
| (*arr)[start+from .. end+from] = val; |
| } |
| |
| auto opSlice() |
| { |
| return typeof(this)(from, to, arr); |
| } |
| |
| @property size_t length()const { return to-from;} |
| |
| auto opDollar()const { return length; } |
| |
| @property bool empty()const { return from == to; } |
| |
| @property auto front()const { return (*arr)[from]; } |
| |
| static if (assignableIndex) |
| @property void front(Item val) { (*arr)[from] = val; } |
| |
| @property auto back()const { return (*arr)[to-1]; } |
| |
| static if (assignableIndex) |
| @property void back(Item val) { (*arr)[to-1] = val; } |
| |
| @property auto save() inout { return this; } |
| |
| void popFront() { from++; } |
| |
| void popBack() { to--; } |
| |
| bool opEquals(T)(auto ref T arr) const |
| { |
| if (arr.length != length) |
| return false; |
| for (size_t i=0; i <length; i++) |
| if (this[i] != arr[i]) |
| return false; |
| return true; |
| } |
| private: |
| alias Item = typeof(T.init[0]); |
| size_t from, to; |
| T* arr; |
| } |
| |
| static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); |
| |
| SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) |
| if (is(Unqual!T == T)) |
| { |
| return SliceOverIndexed!(const(T))(a, b, x); |
| } |
| |
| // BUG? inout is out of reach |
| //...SliceOverIndexed.arr only parameters or stack based variables can be inout |
| SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x) |
| if (is(Unqual!T == T)) |
| { |
| return SliceOverIndexed!T(a, b, x); |
| } |
| |
| @system unittest |
| { |
| int[] idxArray = [2, 3, 5, 8, 13]; |
| auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray); |
| |
| assert(!sliced.empty); |
| assert(sliced.front == 2); |
| sliced.front = 1; |
| assert(sliced.front == 1); |
| assert(sliced.back == 13); |
| sliced.popFront(); |
| assert(sliced.front == 3); |
| assert(sliced.back == 13); |
| sliced.back = 11; |
| assert(sliced.back == 11); |
| sliced.popBack(); |
| |
| assert(sliced.front == 3); |
| assert(sliced[$-1] == 8); |
| sliced = sliced[]; |
| assert(sliced[0] == 3); |
| assert(sliced.back == 8); |
| sliced = sliced[1..$]; |
| assert(sliced.front == 5); |
| sliced = sliced[0..$-1]; |
| assert(sliced[$-1] == 5); |
| |
| int[] other = [2, 5]; |
| assert(sliced[] == sliceOverIndexed(1, 2, &other)); |
| sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1; |
| assert(idxArray[0 .. 2] == [-1, -1]); |
| uint[] nullArr = null; |
| auto nullSlice = sliceOverIndexed(0, 0, &idxArray); |
| assert(nullSlice.empty); |
| } |
| |
| private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow |
| { |
| return inout(PackedArrayView!T)(ptr, 0, items); |
| } |
| |
| |
| //============================================================================ |
| // Partially unrolled binary search using Shar's method |
| //============================================================================ |
| |
| string genUnrolledSwitchSearch(size_t size) @safe pure nothrow |
| { |
| import core.bitop : bsr; |
| import std.array : replace; |
| import std.conv : to; |
| assert(isPow2OrZero(size)); |
| string code = ` |
| import core.bitop : bsr; |
| auto power = bsr(m)+1; |
| switch (power){`; |
| size_t i = bsr(size); |
| foreach_reverse (val; 0 .. bsr(size)) |
| { |
| auto v = 2^^val; |
| code ~= ` |
| case pow: |
| if (pred(range[idx+m], needle)) |
| idx += m; |
| goto case; |
| `.replace("m", to!string(v)) |
| .replace("pow", to!string(i)); |
| i--; |
| } |
| code ~= ` |
| case 0: |
| if (pred(range[idx], needle)) |
| idx += 1; |
| goto default; |
| `; |
| code ~= ` |
| default: |
| }`; |
| return code; |
| } |
| |
| bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc |
| { |
| // See also: std.math.isPowerOf2() |
| return (sz & (sz-1)) == 0; |
| } |
| |
| size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle) |
| if (is(T : ElementType!Range)) |
| { |
| assert(isPow2OrZero(range.length)); |
| size_t idx = 0, m = range.length/2; |
| while (m != 0) |
| { |
| if (pred(range[idx+m], needle)) |
| idx += m; |
| m /= 2; |
| } |
| if (pred(range[idx], needle)) |
| idx += 1; |
| return idx; |
| } |
| |
| size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle) |
| if (is(T : ElementType!Range)) |
| { |
| assert(isPow2OrZero(range.length)); |
| size_t idx = 0, m = range.length/2; |
| enum max = 1 << 10; |
| while (m >= max) |
| { |
| if (pred(range[idx+m], needle)) |
| idx += m; |
| m /= 2; |
| } |
| mixin(genUnrolledSwitchSearch(max)); |
| return idx; |
| } |
| |
| template sharMethod(alias uniLowerBound) |
| { |
| size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle) |
| if (is(T : ElementType!Range)) |
| { |
| import std.functional : binaryFun; |
| import std.math : nextPow2, truncPow2; |
| alias pred = binaryFun!_pred; |
| if (range.length == 0) |
| return 0; |
| if (isPow2OrZero(range.length)) |
| return uniLowerBound!pred(range, needle); |
| size_t n = truncPow2(range.length); |
| if (pred(range[n-1], needle)) |
| {// search in another 2^^k area that fully covers the tail of range |
| size_t k = nextPow2(range.length - n + 1); |
| return range.length - k + uniLowerBound!pred(range[$-k..$], needle); |
| } |
| else |
| return uniLowerBound!pred(range[0 .. n], needle); |
| } |
| } |
| |
| alias sharLowerBound = sharMethod!uniformLowerBound; |
| alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound; |
| |
| @safe unittest |
| { |
| import std.array : array; |
| import std.range : assumeSorted, iota; |
| |
| auto stdLowerBound(T)(T[] range, T needle) |
| { |
| return assumeSorted(range).lowerBound(needle).length; |
| } |
| immutable MAX = 5*1173; |
| auto arr = array(iota(5, MAX, 5)); |
| assert(arr.length == MAX/5-1); |
| foreach (i; 0 .. MAX+5) |
| { |
| auto st = stdLowerBound(arr, i); |
| assert(st == sharLowerBound(arr, i)); |
| assert(st == sharSwitchLowerBound(arr, i)); |
| } |
| arr = []; |
| auto st = stdLowerBound(arr, 33); |
| assert(st == sharLowerBound(arr, 33)); |
| assert(st == sharSwitchLowerBound(arr, 33)); |
| } |
| //============================================================================ |
| |
| @safe |
| { |
| // hope to see simillar stuff in public interface... once Allocators are out |
| //@@@BUG moveFront and friends? dunno, for now it's POD-only |
| |
| @trusted size_t genericReplace(Policy=void, T, Range) |
| (ref T dest, size_t from, size_t to, Range stuff) |
| { |
| import std.algorithm.mutation : copy; |
| size_t delta = to - from; |
| size_t stuff_end = from+stuff.length; |
| if (stuff.length > delta) |
| {// replace increases length |
| delta = stuff.length - delta;// now, new is > old by delta |
| static if (is(Policy == void)) |
| dest.length = dest.length+delta;//@@@BUG lame @property |
| else |
| dest = Policy.realloc(dest, dest.length+delta); |
| copyBackwards(dest[to .. dest.length-delta], |
| dest[to+delta .. dest.length]); |
| copyForward(stuff, dest[from .. stuff_end]); |
| } |
| else if (stuff.length == delta) |
| { |
| copy(stuff, dest[from .. to]); |
| } |
| else |
| {// replace decreases length by delta |
| delta = delta - stuff.length; |
| copy(stuff, dest[from .. stuff_end]); |
| copyForward(dest[to .. dest.length], |
| dest[stuff_end .. dest.length-delta]); |
| static if (is(Policy == void)) |
| dest.length = dest.length - delta;//@@@BUG lame @property |
| else |
| dest = Policy.realloc(dest, dest.length-delta); |
| } |
| return stuff_end; |
| } |
| |
| |
| // Simple storage manipulation policy |
| @trusted private struct GcPolicy |
| { |
| import std.traits : isDynamicArray; |
| |
| static T[] dup(T)(const T[] arr) |
| { |
| return arr.dup; |
| } |
| |
| static T[] alloc(T)(size_t size) |
| { |
| return new T[size]; |
| } |
| |
| static T[] realloc(T)(T[] arr, size_t sz) |
| { |
| arr.length = sz; |
| return arr; |
| } |
| |
| static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) |
| { |
| replaceInPlace(dest, from, to, stuff); |
| } |
| |
| static void append(T, V)(ref T[] arr, V value) |
| if (!isInputRange!V) |
| { |
| arr ~= force!T(value); |
| } |
| |
| static void append(T, V)(ref T[] arr, V value) |
| if (isInputRange!V) |
| { |
| insertInPlace(arr, arr.length, value); |
| } |
| |
| static void destroy(T)(ref T arr) |
| if (isDynamicArray!T && is(Unqual!T == T)) |
| { |
| debug |
| { |
| arr[] = cast(typeof(T.init[0]))(0xdead_beef); |
| } |
| arr = null; |
| } |
| |
| static void destroy(T)(ref T arr) |
| if (isDynamicArray!T && !is(Unqual!T == T)) |
| { |
| arr = null; |
| } |
| } |
| |
| // ditto |
| @trusted struct ReallocPolicy |
| { |
| import std.range.primitives : hasLength; |
| |
| static T[] dup(T)(const T[] arr) |
| { |
| auto result = alloc!T(arr.length); |
| result[] = arr[]; |
| return result; |
| } |
| |
| static T[] alloc(T)(size_t size) |
| { |
| import core.stdc.stdlib : malloc; |
| import std.exception : enforce; |
| |
| import core.checkedint : mulu; |
| bool overflow; |
| size_t nbytes = mulu(size, T.sizeof, overflow); |
| if (overflow) assert(0); |
| |
| auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap"); |
| return ptr[0 .. size]; |
| } |
| |
| static T[] realloc(T)(T[] arr, size_t size) |
| { |
| import core.stdc.stdlib : realloc; |
| import std.exception : enforce; |
| if (!size) |
| { |
| destroy(arr); |
| return null; |
| } |
| |
| import core.checkedint : mulu; |
| bool overflow; |
| size_t nbytes = mulu(size, T.sizeof, overflow); |
| if (overflow) assert(0); |
| |
| auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap"); |
| return ptr[0 .. size]; |
| } |
| |
| static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) |
| { |
| genericReplace!(ReallocPolicy)(dest, from, to, stuff); |
| } |
| |
| static void append(T, V)(ref T[] arr, V value) |
| if (!isInputRange!V) |
| { |
| if (arr.length == size_t.max) assert(0); |
| arr = realloc(arr, arr.length+1); |
| arr[$-1] = force!T(value); |
| } |
| |
| @safe unittest |
| { |
| int[] arr; |
| ReallocPolicy.append(arr, 3); |
| |
| import std.algorithm.comparison : equal; |
| assert(equal(arr, [3])); |
| } |
| |
| static void append(T, V)(ref T[] arr, V value) |
| if (isInputRange!V && hasLength!V) |
| { |
| import core.checkedint : addu; |
| bool overflow; |
| size_t nelems = addu(arr.length, value.length, overflow); |
| if (overflow) assert(0); |
| |
| arr = realloc(arr, nelems); |
| |
| import std.algorithm.mutation : copy; |
| copy(value, arr[$-value.length..$]); |
| } |
| |
| @safe unittest |
| { |
| int[] arr; |
| ReallocPolicy.append(arr, [1,2,3]); |
| |
| import std.algorithm.comparison : equal; |
| assert(equal(arr, [1,2,3])); |
| } |
| |
| static void destroy(T)(ref T[] arr) |
| { |
| import core.stdc.stdlib : free; |
| if (arr.ptr) |
| free(arr.ptr); |
| arr = null; |
| } |
| } |
| |
| //build hack |
| alias _RealArray = CowArray!ReallocPolicy; |
| |
| @safe unittest |
| { |
| import std.algorithm.comparison : equal; |
| |
| with(ReallocPolicy) |
| { |
| bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result, |
| string file = __FILE__, size_t line = __LINE__) |
| { |
| { |
| replaceImpl(orig, from, to, toReplace); |
| scope(exit) destroy(orig); |
| if (!equal(orig, result)) |
| return false; |
| } |
| return true; |
| } |
| static T[] arr(T)(T[] args... ) |
| { |
| return dup(args); |
| } |
| |
| assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); |
| assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); |
| assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); |
| assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); |
| assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); |
| } |
| } |
| |
| /** |
| Tests if T is some kind a set of code points. Intended for template constraints. |
| */ |
| public template isCodepointSet(T) |
| { |
| static if (is(T dummy == InversionList!(Args), Args...)) |
| enum isCodepointSet = true; |
| else |
| enum isCodepointSet = false; |
| } |
| |
| /** |
| Tests if $(D T) is a pair of integers that implicitly convert to $(D V). |
| The following code must compile for any pair $(D T): |
| --- |
| (T x){ V a = x[0]; V b = x[1];} |
| --- |
| The following must not compile: |
| --- |
| (T x){ V c = x[2];} |
| --- |
| */ |
| public template isIntegralPair(T, V=uint) |
| { |
| enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) |
| && !is(typeof((T x){ V c = x[2]; })); |
| } |
| |
| |
| /** |
| The recommended default type for set of $(CODEPOINTS). |
| For details, see the current implementation: $(LREF InversionList). |
| */ |
| public alias CodepointSet = InversionList!GcPolicy; |
| |
| |
| //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin |
| // which relies on std.uni.isGraphical and this chain blows up with Forward reference error |
| // hence below doesn't seem to work |
| // public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); |
| |
| /** |
| The recommended type of $(REF Tuple, std,_typecons) |
| to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). |
| Any interval type should pass $(LREF isIntegralPair) trait. |
| */ |
| public struct CodepointInterval |
| { |
| pure: |
| uint[2] _tuple; |
| alias _tuple this; |
| |
| @safe pure nothrow @nogc: |
| |
| this(uint low, uint high) |
| { |
| _tuple[0] = low; |
| _tuple[1] = high; |
| } |
| bool opEquals(T)(T val) const |
| { |
| return this[0] == val[0] && this[1] == val[1]; |
| } |
| @property ref inout(uint) a() inout { return _tuple[0]; } |
| @property ref inout(uint) b() inout { return _tuple[1]; } |
| } |
| |
| /** |
| $(P |
| $(D InversionList) is a set of $(CODEPOINTS) |
| represented as an array of open-right [a, b$(RPAREN) |
| intervals (see $(LREF CodepointInterval) above). |
| The name comes from the way the representation reads left to right. |
| For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN), |
| plus a singular value 60 looks like this: |
| ) |
| --- |
| 10, 50, 60, 61, 80, 90 |
| --- |
| $(P |
| The way to read this is: start with negative meaning that all numbers |
| smaller then the next one are not present in this set (and positive |
| - the contrary). Then switch positive/negative after each |
| number passed from left to right. |
| ) |
| $(P This way negative spans until 10, then positive until 50, |
| then negative until 60, then positive until 61, and so on. |
| As seen this provides a space-efficient storage of highly redundant data |
| that comes in long runs. A description which Unicode $(CHARACTER) |
| properties fit nicely. The technique itself could be seen as a variation |
| on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). |
| ) |
| |
| $(P Sets are value types (just like $(D int) is) thus they |
| are never aliased. |
| ) |
| Example: |
| --- |
| auto a = CodepointSet('a', 'z'+1); |
| auto b = CodepointSet('A', 'Z'+1); |
| auto c = a; |
| a = a | b; |
| assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); |
| assert(a != c); |
| --- |
| $(P See also $(LREF unicode) for simpler construction of sets |
| from predefined ones. |
| ) |
| |
| $(P Memory usage is 8 bytes per each contiguous interval in a set. |
| The value semantics are achieved by using the |
| $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique |
| and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). |
| ) |
| |
| Note: |
| $(P It's not recommended to rely on the template parameters |
| or the exact type of a current $(CODEPOINT) set in $(D std.uni). |
| The type and parameters may change when the standard |
| allocators design is finalized. |
| Use $(LREF isCodepointSet) with templates or just stick with the default |
| alias $(LREF CodepointSet) throughout the whole code base. |
| ) |
| */ |
| @trusted public struct InversionList(SP=GcPolicy) |
| { |
| import std.range : assumeSorted; |
| |
| /** |
| Construct from another code point set of any type. |
| */ |
| this(Set)(Set set) pure |
| if (isCodepointSet!Set) |
| { |
| uint[] arr; |
| foreach (v; set.byInterval) |
| { |
| arr ~= v.a; |
| arr ~= v.b; |
| } |
| data = CowArray!(SP).reuse(arr); |
| } |
| |
| /** |
| Construct a set from a forward range of code point intervals. |
| */ |
| this(Range)(Range intervals) pure |
| if (isForwardRange!Range && isIntegralPair!(ElementType!Range)) |
| { |
| uint[] arr; |
| foreach (v; intervals) |
| { |
| SP.append(arr, v.a); |
| SP.append(arr, v.b); |
| } |
| data = CowArray!(SP).reuse(arr); |
| sanitize(); //enforce invariant: sort intervals etc. |
| } |
| |
| //helper function that avoids sanity check to be CTFE-friendly |
| private static fromIntervals(Range)(Range intervals) pure |
| { |
| import std.algorithm.iteration : map; |
| import std.range : roundRobin; |
| auto flattened = roundRobin(intervals.save.map!"a[0]"(), |
| intervals.save.map!"a[1]"()); |
| InversionList set; |
| set.data = CowArray!(SP)(flattened); |
| return set; |
| } |
| //ditto untill sort is CTFE-able |
| private static fromIntervals()(uint[] intervals...) pure |
| in |
| { |
| import std.conv : text; |
| assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); |
| for (uint i = 0; i < intervals.length; i += 2) |
| { |
| auto a = intervals[i], b = intervals[i+1]; |
| assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); |
| } |
| } |
| body |
| { |
| InversionList set; |
| set.data = CowArray!(SP)(intervals); |
| return set; |
| } |
| |
| /** |
| Construct a set from plain values of code point intervals. |
| */ |
| this()(uint[] intervals...) |
| in |
| { |
| import std.conv : text; |
| assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); |
| for (uint i = 0; i < intervals.length; i += 2) |
| { |
| auto a = intervals[i], b = intervals[i+1]; |
| assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); |
| } |
| } |
| body |
| { |
| data = CowArray!(SP)(intervals); |
| sanitize(); //enforce invariant: sort intervals etc. |
| } |
| |
| /// |
| @safe unittest |
| { |
| import std.algorithm.comparison : equal; |
| |
| auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1); |
| foreach (v; 'a'..'z'+1) |
| assert(set[v]); |
| // Cyrillic lowercase interval |
| foreach (v; 'а'..'я'+1) |
| assert(set[v]); |
| //specific order is not required, intervals may interesect |
| auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1); |
| //the same end result |
| assert(set2.byInterval.equal(set.byInterval)); |
| } |
| |
| /** |
| Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList). |
| |
| Example: |
| ----------- |
| import std.algorithm.comparison : equal; |
| import std.typecons : tuple; |
| |
| auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1); |
| |
| assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')])); |
| ----------- |
| */ |
| @property auto byInterval() |
| { |
| return Intervals!(typeof(data))(data); |
| } |
| |
| /** |
| Tests the presence of code point $(D val) in this set. |
| */ |
| bool opIndex(uint val) const |
| { |
| // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 |
| // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; |
| return sharSwitchLowerBound!"a <= b"(data[], val) & 1; |
| } |
| |
| /// |
| @safe unittest |
| { |
| auto gothic = unicode.Gothic; |
| // Gothic letter ahsa |
| assert(gothic['\U00010330']); |
| // no ascii in Gothic obviously |
| assert(!gothic['$']); |
| } |
| |
| |
| // Linear scan for $(D ch). Useful only for small sets. |
| // TODO: |
| // used internally in std.regex |
| // should be properly exposed in a public API ? |
| package auto scanFor()(dchar ch) const |
| { |
| immutable len = data.length; |
| for (size_t i = 0; i < len; i++) |
| if (ch < data[i]) |
| return i & 1; |
| return 0; |
| } |
| |
| /// Number of $(CODEPOINTS) in this set |
| @property size_t length() |
| { |
| size_t sum = 0; |
| foreach (iv; byInterval) |
| { |
| sum += iv.b - iv.a; |
| } |
| return sum; |
| } |
| |
| // bootstrap full set operations from 4 primitives (suitable as a template mixin): |
| // addInterval, skipUpTo, dropUpTo & byInterval iteration |
| //============================================================================ |
| public: |
| /** |
| $(P Sets support natural syntax for set algebra, namely: ) |
| $(BOOKTABLE , |
| $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) |
| $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) |
| $(TR $(TD |) $(TD a ∪ b) $(TD union) ) |
| $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) |
| $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) ) |
| ) |
| */ |
| This opBinary(string op, U)(U rhs) |
| if (isCodepointSet!U || is(U:dchar)) |
| { |
| static if (op == "&" || op == "|" || op == "~") |
| {// symmetric ops thus can swap arguments to reuse r-value |
| static if (is(U:dchar)) |
| { |
| auto tmp = this; |
| mixin("tmp "~op~"= rhs; "); |
| return tmp; |
| } |
| else |
| { |
| static if (is(Unqual!U == U)) |
| { |
| // try hard to reuse r-value |
| mixin("rhs "~op~"= this;"); |
| return rhs; |
| } |
| else |
| { |
| auto tmp = this; |
| mixin("tmp "~op~"= rhs;"); |
| return tmp; |
| } |
| } |
| } |
| else static if (op == "-") // anti-symmetric |
| { |
| auto tmp = this; |
| tmp -= rhs; |
| return tmp; |
| } |
| else |
| static assert(0, "no operator "~op~" defined for Set"); |
| } |
| |
| /// |
| @safe unittest |
| { |
| import std.algorithm.comparison : equal; |
| import std.range : iota; |
| |
| auto lower = unicode.LowerCase; |
| auto upper = unicode.UpperCase; |
| auto ascii = unicode.ASCII; |
| |
| assert((lower & upper).empty); // no intersection |
| auto lowerASCII = lower & ascii; |
| assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); |
| // throw away all of the lowercase ASCII |
| assert((ascii - lower).length == 128 - 26); |
| |
| auto onlyOneOf = lower ~ ascii; |
| assert(!onlyOneOf['Δ']); // not ASCII and not lowercase |
| assert(onlyOneOf['$']); // ASCII and not lowercase |
| assert(!onlyOneOf['a']); // ASCII and lowercase |
| assert(onlyOneOf['я']); // not ASCII but lowercase |
| |
| // throw away all cased letters from ASCII |
| auto noLetters = ascii - (lower | upper); |
| assert(noLetters.length == 128 - 26*2); |
| } |
| |
| /// The 'op=' versions of the above overloaded operators. |
| ref This opOpAssign(string op, U)(U rhs) |
| if (isCodepointSet!U || is(U:dchar)) |
| { |
| static if (op == "|") // union |
| { |
| static if (is(U:dchar)) |
| { |
| this.addInterval(rhs, rhs+1); |
| return this; |
| } |
| else |
| return this.add(rhs); |
| } |
| else static if (op == "&") // intersection |
| return this.intersect(rhs);// overloaded |
| else static if (op == "-") // set difference |
| return this.sub(rhs);// overloaded |
| else static if (op == "~") // symmetric set difference |
| { |
| auto copy = this & rhs; |
| this |= rhs; |
| this -= copy; |
| return this; |
| } |
| else |
| static assert(0, "no operator "~op~" defined for Set"); |
| } |
| |
| /** |
| Tests the presence of codepoint $(D ch) in this set, |
| the same as $(LREF opIndex). |
| */ |
| bool opBinaryRight(string op: "in", U)(U ch) const |
| if (is(U : dchar)) |
| { |
| return this[ch]; |
| } |
| |
| /// |
| @safe unittest |
| { |
| assert('я' in unicode.Cyrillic); |
| assert(!('z' in unicode.Cyrillic)); |
| } |
| |
| |
| |
| /** |
| * Obtains a set that is the inversion of this set. |
| * |
| * See_Also: $(LREF inverted) |
| */ |
| auto opUnary(string op: "!")() |
| { |
| return this.inverted; |
| } |
| |
| /** |
| A range that spans each $(CODEPOINT) in this set. |
| */ |
| @property auto byCodepoint() |
| { |
| @trusted static struct CodepointRange |
| { |
| this(This set) |
| { |
| r = set.byInterval; |
| if (!r.empty) |
| cur = r.front.a; |
| } |
| |
| @property dchar front() const |
| { |
| return cast(dchar) cur; |
| } |
| |
| @property bool empty() const |
| { |
| return r.empty; |
| } |
| |
| void popFront() |
| { |
| cur++; |
| while (cur >= r.front.b) |
| { |
| r.popFront(); |
| if (r.empty) |
| break; |
| cur = r.front.a; |
| } |
| } |
| private: |
| uint cur; |
| typeof(This.init.byInterval) r; |
| } |
| |
| return CodepointRange(this); |
| } |
| |
| /// |
| @safe unittest |
| { |
| import std.algorithm.comparison : equal; |
| import std.range : iota; |
| |
| auto set = unicode.ASCII; |
| set.byCodepoint.equal(iota(0, 0x80)); |
| } |
| |
| /** |
| $(P Obtain textual representation of this set in from of |
| open-right intervals and feed it to $(D sink). |
| ) |
| $(P Used by various standard formatting facilities such as |
| $(REF formattedWrite, std,_format), $(REF write, std,_stdio), |
| $(REF writef, std,_stdio), $(REF to, std,_conv) and others. |
| ) |
| Example: |
| --- |
| import std.conv; |
| assert(unicode.ASCII.to!string == "[0..128$(RPAREN)"); |
| --- |
| */ |
| |
| private import std.format : FormatSpec; |
| |
| /*************************************** |
| * Obtain a textual representation of this InversionList |
| * in form of open-right intervals. |
| * |
| * The formatting flag is applied individually to each value, for example: |
| * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals) |
| * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters) |
| * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters) |
| */ |
| void toString(Writer)(scope Writer sink, |
| FormatSpec!char fmt) /* const */ |
| { |
| import std.format : formatValue; |
| auto range = byInterval; |
| if (range.empty) |
| return; |
| |
| while (1) |
| { |
| auto i = range.front; |
| range.popFront(); |
| |
| put(sink, "["); |
| formatValue(sink, i.a, fmt); |
| put(sink, ".."); |
| formatValue(sink, i.b, fmt); |
| put(sink, ")"); |
| if (range.empty) return; |
| put(sink, " "); |
| } |
| } |
| |
| /// |
| @safe unittest |
| { |
| import std.conv : to; |
| import std.format : format; |
| import std.uni : unicode; |
| |
| assert(unicode.Cyrillic.to!string == |
| "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)"); |
| |
| // The specs '%s' and '%d' are equivalent to the to!string call above. |
| assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string); |
| |
| assert(format("%#x", unicode.Cyrillic) == |
| "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) " |
| ~"[0xa640..0xa698) [0xa69f..0xa6a0)"); |
| |
| assert(format("%#X", unicode.Cyrillic) == |
| "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) " |
| ~"[0XA640..0XA698) [0XA69F..0XA6A0)"); |
| } |
| |
| @safe unittest |
| { |
| import std.exception : assertThrown; |
| import std.format : format, FormatException; |
| assertThrown!FormatException(format("%a", unicode.ASCII)); |
| } |
| |
| |
| /** |
| Add an interval [a, b$(RPAREN) to this set. |
| */ |
| ref add()(uint a, uint b) |
| { |
| addInterval(a, b); |
| return this; |
| } |
| |
| /// |
| @safe unittest |
| { |
| CodepointSet someSet; |
| someSet.add('0', '5').add('A','Z'+1); |
| someSet.add('5', '9'+1); |
| assert(someSet['0']); |
| assert(someSet['5']); |
| assert(someSet['9']); |
| assert(someSet['Z']); |
| } |
| |
| private: |
| |
| package(std) // used from: std.regex.internal.parser |
| ref intersect(U)(U rhs) |
| if (isCodepointSet!U) |
| { |
| Marker mark; |
| foreach ( i; rhs.byInterval) |
| { |
| mark = this.dropUpTo(i.a, mark); |
| mark = this.skipUpTo(i.b, mark); |
| } |
| this.dropUpTo(uint.max, mark); |
| return this; |
| } |
| |
| ref intersect()(dchar ch) |
| { |
| foreach (i; byInterval) |
| if (i.a <= ch && ch < i.b) |
| return this = This.init.add(ch, ch+1); |
| this = This.init; |
| return this; |
| } |
| |
| @safe unittest |
| { |
| assert(unicode.Cyrillic.intersect('-').byInterval.empty); |
| } |
| |
| ref sub()(dchar ch) |
| { |
| return subChar(ch); |
| } |
| |
| // same as the above except that skip & drop parts are swapped |
| package(std) // used from: std.regex.internal.parser |
| ref sub(U)(U rhs) |
| if (isCodepointSet!U) |
| { |
| Marker mark; |
| foreach (i; rhs.byInterval) |
| { |
| mark = this.skipUpTo(i.a, mark); |
| mark = this.dropUpTo(i.b, mark); |
| } |
| return this; |
| } |
| |
| package(std) // used from: std.regex.internal.parse |
| ref add(U)(U rhs) |
| if (isCodepointSet!U) |
| { |
| Marker start; |
| foreach (i; rhs.byInterval) |
| { |
| start = addInterval(i.a, i.b, start); |
| } |
| return this; |
| } |
| |
| // end of mixin-able part |
| //============================================================================ |
| public: |
| /** |
| Obtains a set that is the inversion of this set. |
| |
| See the '!' $(LREF opUnary) for the same but using operators. |
| */ |
| @property auto inverted() |
| { |
| InversionList inversion = this; |
| if (inversion.data.length == 0) |
| { |
| inversion.addInterval(0, lastDchar+1); |
| return inversion; |
| } |
| if (inversion.data[0] != 0) |
| genericReplace(inversion.data, 0, 0, [0]); |
| else |
| genericReplace(inversion.data, 0, 1, cast(uint[]) null); |
| if (data[data.length-1] != lastDchar+1) |
| genericReplace(inversion.data, |
| inversion.data.length, inversion.data.length, [lastDchar+1]); |
| else |
| genericReplace(inversion.data, |
| inversion.data.length-1, inversion.data.length, cast(uint[]) null); |
| |
| return inversion; |
| } |
| |
| /// |
| @safe unittest |
| { |
| auto set = unicode.ASCII; |
| // union with the inverse gets all of the code points in the Unicode |
| assert((set | set.inverted).length == 0x110000); |
| // no intersection with the inverse |
| assert((set & set.inverted).empty); |
| } |
| |
| /** |
| Generates string with D source code of unary function with name of |
| $(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty |
| the code is adjusted to be a lambda function. |
| |
| The function generated tests if the $(CODEPOINT) passed |
| belongs to this set or not. The result is to be used with string mixin. |
| The intended usage area is aggressive optimization via meta programming |
| in parser generators and the like. |
| |
| Note: Use with care for relatively small or regular sets. It |
| could end up being slower then just using multi-staged tables. |
| |
| Example: |
| --- |
| import std.stdio; |
| |
| // construct set directly from [a, b$RPAREN intervals |
| auto set = CodepointSet(10, 12, 45, 65, 100, 200); |
| writeln(set); |
| writeln(set.toSourceCode("func")); |
| --- |
| |
| The above outputs something along the lines of: |
| --- |
| bool func(dchar ch) @safe pure nothrow @nogc |
| { |
| if (ch < 45) |
| { |
| if (ch == 10 || ch == 11) return true; |
| return false; |
| } |
| else if (ch < 65) return true; |
| else |
| { |
| if (ch < 100) return false; |
| if (ch < 200) return true; |
| return false; |
| } |
| } |
| --- |
| */ |
| string toSourceCode(string funcName="") |
| { |
| import std.algorithm.searching : countUntil; |
| import std.array : array; |
| import std.format : format; |
| enum maxBinary = 3; |
| static string linearScope(R)(R ivals, string indent) |
| { |
| string result = indent~"{\n"; |
| string deeper = indent~" "; |
| foreach (ival; ivals) |
| { |
| immutable span = ival[1] - ival[0]; |
| assert(span != 0); |
| if (span == 1) |
| { |
| result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); |
| } |
| else if (span == 2) |
| { |
| result ~= format("%sif (ch == %s || ch == %s) return true;\n", |
| deeper, ival[0], ival[0]+1); |
| } |
| else |
| { |
| if (ival[0] != 0) // dchar is unsigned and < 0 is useless |
| result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); |
| result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); |
| } |
| } |
| result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals |
| return result; |
| } |
| |
| static string binaryScope(R)(R ivals, string indent) |
| { |
| // time to do unrolled comparisons? |
| if (ivals.length < maxBinary) |
| return linearScope(ivals, indent); |
| else |
| return bisect(ivals, ivals.length/2, indent); |
| } |
| |
| // not used yet if/elsebinary search is far better with DMD as of 2.061 |
| // and GDC is doing fine job either way |
| static string switchScope(R)(R ivals, string indent) |
| { |
| string result = indent~"switch (ch){\n"; |
| string deeper = indent~" "; |
| foreach (ival; ivals) |
| { |
| if (ival[0]+1 == ival[1]) |
| { |
| result ~= format("%scase %s: return true;\n", |
| deeper, ival[0]); |
| } |
| else |
| { |
| result ~= format("%scase %s: .. case %s: return true;\n", |
| deeper, ival[0], ival[1]-1); |
| } |
| } |
| result ~= deeper~"default: return false;\n"~indent~"}\n"; |
| return result; |
| } |
| |
| static string bisect(R)(R range, size_t idx, string indent) |
| { |
| string deeper = indent ~ " "; |
| // bisect on one [a, b) interval at idx |
| string result = indent~"{\n"; |
| // less branch, < a |
| result ~= format("%sif (ch < %s)\n%s", |
| deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); |
| // middle point, >= a && < b |
| result ~= format("%selse if (ch < %s) return true;\n", |
| deeper, range[idx][1]); |
| // greater or equal branch, >= b |
| result ~= format("%selse\n%s", |
| deeper, binaryScope(range[idx+1..$], deeper)); |
| return result~indent~"}\n"; |
| } |
| |
| string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", |
| funcName.empty ? "function" : funcName); |
| auto range = byInterval.array(); |
| // special case first bisection to be on ASCII vs beyond |
| auto tillAscii = countUntil!"a[0] > 0x80"(range); |
| if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) |
| code ~= binaryScope(range, ""); |
| else |
| code ~= bisect(range, tillAscii, ""); |
| return code; |
| } |
| |
| /** |
| True if this set doesn't contain any $(CODEPOINTS). |
| */ |
| @property bool empty() const |
| { |
| return data.length == 0; |
| } |
| |
| /// |
| @safe unittest |
| { |
| CodepointSet emptySet; |
| assert(emptySet.length == 0); |
| assert(emptySet.empty); |
| } |
| |
| private: |
| alias This = typeof(this); |
| alias Marker = size_t; |
| |
| // a random-access range of integral pairs |
| static struct Intervals(Range) |
| { |
| this(Range sp) |
| { |
| slice = sp; |
| start = 0; |
| end = sp.length; |
| } |
| |
| this(Range sp, size_t s, size_t e) |
| { |
| slice = sp; |
| start = s; |
| end = e; |
| } |
| |
| @property auto front()const |
| { |
| immutable a = slice[start]; |
| immutable b = slice[start+1]; |
| return CodepointInterval(a, b); |
| } |
| |
| //may break sorted property - but we need std.sort to access it |
| //hence package protection attribute |
| package @property void front(CodepointInterval val) |
| { |
| slice[start] = val.a; |
| slice[start+1] = val.b; |
| } |
| |
| @property auto back()const |
| { |
| immutable a = slice[end-2]; |
| immutable b = slice[end-1]; |
| return CodepointInterval(a, b); |
| } |
| |
| //ditto about package |
| package @property void back(CodepointInterval val) |
| { |
| slice[end-2] = val.a; |
| slice[end-1] = val.b; |
| } |
| |
| void popFront() |
| { |
| start += 2; |
| } |
| |
| void popBack() |
| { |
| end -= 2; |
| } |
| |
| auto opIndex(size_t idx) const |
| { |
| immutable a = slice[start+idx*2]; |
| immutable b = slice[start+idx*2+1]; |
| return CodepointInterval(a, b); |
| } |
| |
| //ditto about package |
| package void opIndexAssign(CodepointInterval val, size_t idx) |
| { |
| slice[start+idx*2] = val.a; |
| slice[start+idx*2+1] = val.b; |
| } |
| |
| auto opSlice(size_t s, size_t e) |
| { |
| return Intervals(slice, s*2+start, e*2+start); |
| } |
| |
| @property size_t length()const { return slice.length/2; } |
| |
| @property bool empty()const { return start == end; } |
| |
| @property auto save(){ return this; } |
| private: |
| size_t start, end; |
| Range slice; |
| } |
| |
| // called after construction from intervals |
| // to make sure invariants hold |
| void sanitize() |
| { |
| import std.algorithm.comparison : max; |
| import std.algorithm.mutation : SwapStrategy; |
| import std.algorithm.sorting : sort; |
| if (data.length == 0) |
| return; |
| alias Ival = CodepointInterval; |
| //intervals wrapper for a _range_ over packed array |
| auto ivals = Intervals!(typeof(data[]))(data[]); |
| //@@@BUG@@@ can't use "a.a < b.a" see issue 12265 |
| sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals); |
| // what follows is a variation on stable remove |
| // differences: |
| // - predicate is binary, and is tested against |
| // the last kept element (at 'i'). |
| // - predicate mutates lhs (merges rhs into lhs) |
| size_t len = ivals.length; |
| size_t i = 0; |
| size_t j = 1; |
| while (j < len) |
| { |
| if (ivals[i].b >= ivals[j].a) |
| { |
| ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); |
| j++; |
| } |
| else //unmergable |
| { |
| // check if there is a hole after merges |
| // (in the best case we do 0 writes to ivals) |
| if (j != i+1) |
| ivals[i+1] = ivals[j]; //copy over |
| i++; |
| j++; |
| } |
| } |
| len = i + 1; |
| for (size_t k=0; k + 1 < len; k++) |
| { |
| assert(ivals[k].a < ivals[k].b); |
| assert(ivals[k].b < ivals[k+1].a); |
| } |
| data.length = len * 2; |
| } |
| |
| // special case for normal InversionList |
| ref subChar(dchar ch) |
| { |
| auto mark = skipUpTo(ch); |
| if (mark != data.length |
| && data[mark] == ch && data[mark-1] == ch) |
| { |
| // it has split, meaning that ch happens to be in one of intervals |
| data[mark] = data[mark]+1; |
| } |
| return this; |
| } |
| |
| // |
| Marker addInterval(int a, int b, Marker hint=Marker.init) |
| in |
| { |
| assert(a <= b); |
| } |
| body |
| { |
| import std.range : assumeSorted, SearchPolicy; |
| auto range = assumeSorted(data[]); |
| size_t pos; |
| size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; |
| if (a_idx == range.length) |
| { |
| // [---+++----++++----++++++] |
| // [ a b] |
| data.append(a, b); |
| return data.length-1; |
| } |
| size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; |
| uint[3] buf = void; |
| uint to_insert; |
| debug(std_uni) |
| { |
| writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); |
| } |
| if (b_idx == range.length) |
| { |
| // [-------++++++++----++++++-] |
| // [ s a b] |
| if (a_idx & 1)// a in positive |
| { |
| buf[0] = b; |
| to_insert = 1; |
| } |
| else// a in negative |
| { |
| buf[0] = a; |
| buf[1] = b; |
| to_insert = 2; |
| } |
| pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); |
| return pos - 1; |
| } |
| |
| uint top = data[b_idx]; |
| |
| debug(std_uni) |
| { |
| writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); |
| writefln("a=%s; b=%s; top=%s;", a, b, top); |
| } |
| if (a_idx & 1) |
| {// a in positive |
| if (b_idx & 1)// b in positive |
| { |
| // [-------++++++++----++++++-] |
| // [ s a b ] |
| buf[0] = top; |
| to_insert = 1; |
| } |
| else // b in negative |
| { |
| // [-------++++++++----++++++-] |
| // [ s a b ] |
| if (top == b) |
| { |
| assert(b_idx+1 < data.length); |
| buf[0] = data[b_idx+1]; |
| pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); |
| return pos - 1; |
| } |
| buf[0] = b; |
| buf[1] = top; |
| to_insert = 2; |
| } |
| } |
| else |
| { // a in negative |
| if (b_idx & 1) // b in positive |
| { |
| // [----------+++++----++++++-] |
| // [ a b ] |
| buf[0] = a; |
| buf[1] = top; |
| to_insert = 2; |
| } |
| else// b in negative |
| { |
| // [----------+++++----++++++-] |
| // [ a s b ] |
| if (top == b) |
| { |
| assert(b_idx+1 < data.length); |
| buf[0] = a; |
| buf[1] = data[b_idx+1]; |
| pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); |
| return pos - 1; |
| } |
| buf[0] = a; |
| buf[1] = b; |
| buf[2] = top; |
| to_insert = 3; |
| } |
| } |
| pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]); |
| debug(std_uni) |
| { |
| writefln("marker idx: %d; length=%d", pos, data[pos], data.length); |
| writeln("inserting ", buf[0 .. to_insert]); |
| } |
| return pos - 1; |
| } |
| |
| // |
| Marker dropUpTo(uint a, Marker pos=Marker.init) |
| in |
| { |
| assert(pos % 2 == 0); // at start of interval |
| } |
| body |
| { |
| auto range = assumeSorted!"a <= b"(data[pos .. data.length]); |
| if (range.empty) |
| return pos; |
| size_t idx = pos; |
| idx += range.lowerBound(a).length; |
| |
| debug(std_uni) |
| { |
| writeln("dropUpTo full length=", data.length); |
| writeln(pos,"~~~", idx); |
| } |
| if (idx == data.length) |
| return genericReplace(data, pos, idx, cast(uint[])[]); |
| if (idx & 1) |
| { // a in positive |
| //[--+++----++++++----+++++++------...] |
| // |<---si s a t |
| genericReplace(data, pos, idx, [a]); |
| } |
| else |
| { // a in negative |
| //[--+++----++++++----+++++++-------+++...] |
| // |<---si s a t |
| genericReplace(data, pos, idx, cast(uint[])[]); |
| } |
| return pos; |
| } |
| |
| // |
| Marker skipUpTo(uint a, Marker pos=Marker.init) |
| out(result) |
| { |
| assert(result % 2 == 0);// always start of interval |
| //(may be 0-width after-split) |
| } |
| body |
| { |
| assert(data.length % 2 == 0); |
| auto range = assumeSorted!"a <= b"(data[pos .. data.length]); |
| size_t idx = pos+range.lowerBound(a).length; |
| |
| if (idx >= data.length) // could have Marker point to recently removed stuff |
| return data.length; |
| |
| if (idx & 1)// inside of interval, check for split |
| { |
| |
| immutable top = data[idx]; |
| if (top == a)// no need to split, it's end |
| return idx+1; |
| immutable start = data[idx-1]; |
| if (a == start) |
| return idx-1; |
| // split it up |
| genericReplace(data, idx, idx+1, [a, a, top]); |
| return idx+1; // avoid odd index |
| } |
| return idx; |
| } |
| |
| CowArray!SP data; |
| } |
| |
| @system unittest |
| { |
| import std.conv : to; |
| assert(unicode.ASCII.to!string() == "[0..128)"); |
| } |
| |
| // pedantic version for ctfe, and aligned-access only architectures |
| @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc |
| { |
| idx *= 3; |
| version (LittleEndian) |
| return ptr[idx] + (cast(uint) ptr[idx+1]<<8) |
| + (cast(uint) ptr[idx+2]<<16); |
| else |
| return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) |
| + ptr[idx+2]; |
| } |
| |
| // ditto |
| @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc |
| { |
| idx *= 3; |
| version (LittleEndian) |
| { |
| ptr[idx] = val & 0xFF; |
| ptr[idx+1] = (val >> 8) & 0xFF; |
| ptr[idx+2] = (val >> 16) & 0xFF; |
| } |
| else |
| { |
| ptr[idx] = (val >> 16) & 0xFF; |
| ptr[idx+1] = (val >> 8) & 0xFF; |
| ptr[idx+2] = val & 0xFF; |
| } |
| } |
| |
| // unaligned x86-like read/write functions |
| @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc |
| { |
| uint* src = cast(uint*)(ptr+3*idx); |
| version (LittleEndian) |
| return *src & 0xFF_FFFF; |
| else |
| return *src >> 8; |
| } |
| |
| // ditto |
| @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc |
| { |
| uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx); |
| version (LittleEndian) |
| *dest = val | (*dest & 0xFF00_0000); |
| else |
| *dest = (val << 8) | (*dest & 0xFF); |
| } |
| |
| @system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc |
| { |
| static if (hasUnalignedReads) |
| return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx); |
| else |
| return safeRead24(ptr, idx); |
| } |
| |
| @system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc |
| { |
| static if (hasUnalignedReads) |
| return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx); |
| else |
| return safeWrite24(ptr, val, idx); |
| } |
| |
| struct CowArray(SP=GcPolicy) |
| { |
| import std.range.primitives : hasLength; |
| |
| @safe: |
| static auto reuse(uint[] arr) |
| { |
| CowArray cow; |
| cow.data = arr; |
| SP.append(cow.data, 1); |
| assert(cow.refCount == 1); |
| assert(cow.length == arr.length); |
| return cow; |
| } |
| |
| this(Range)(Range range) |
| if (isInputRange!Range && hasLength!Range) |
| { |
| import std.algorithm.mutation : copy; |
| length = range.length; |
| copy(range, data[0..$-1]); |
| } |
| |
| this(Range)(Range range) |
| if (isForwardRange!Range && !hasLength!Range) |
| { |
| import std.algorithm.mutation : copy; |
| import std.range.primitives : walkLength; |
| immutable len = walkLength(range.save); |
| length = len; |
| copy(range, data[0..$-1]); |
| } |
| |
| this(this) |
| { |
| if (!empty) |
| { |
| refCount = refCount + 1; |
| } |
| } |
| |
| ~this() |
| { |
| if (!empty) |
| { |
| immutable cnt = refCount; |
| if (cnt == 1) |
| SP.destroy(data); |
| else |
| refCount = cnt - 1; |
| } |
| } |
| |
| // no ref-count for empty U24 array |
| @property bool empty() const { return data.length == 0; } |
| |
| // report one less then actual size |
| @property size_t length() const |
| { |
| return data.length ? data.length - 1 : 0; |
| } |
| |
| //+ an extra slot for ref-count |
| @property void length(size_t len) |
| { |
| import std.algorithm.comparison : min; |
| import std.algorithm.mutation : copy; |
| if (len == 0) |
| { |
| if (!empty) |
| freeThisReference(); |
| return; |
| } |
| immutable total = len + 1; // including ref-count |
| if (empty) |
| { |
| data = SP.alloc!uint(total); |
| refCount = 1; |
| return; |
| } |
| immutable cur_cnt = refCount; |
| if (cur_cnt != 1) // have more references to this memory |
| { |
| refCount = cur_cnt - 1; |
| auto new_data = SP.alloc!uint(total); |
| // take shrinking into account |
| auto to_copy = min(total, data.length) - 1; |
| copy(data[0 .. to_copy], new_data[0 .. to_copy]); |
| data = new_data; // before setting refCount! |
| refCount = 1; |
| } |
| else // 'this' is the only reference |
| { |
| // use the realloc (hopefully in-place operation) |
| data = SP.realloc(data, total); |
| refCount = 1; // setup a ref-count in the new end of the array |
| } |
| } |
| |
| alias opDollar = length; |
| |
| uint opIndex()(size_t idx)const |
| { |
| return data[idx]; |
| } |
| |
| void opIndexAssign(uint val, size_t idx) |
| { |
| auto cnt = refCount; |
| if (cnt != 1) |
| dupThisReference(cnt); |
| data[idx] = val; |
| } |
| |
| // |
| auto opSlice(size_t from, size_t to) |
| { |
| if (!empty) |
| { |
| auto cnt = refCount; |
| if (cnt != 1) |
| dupThisReference(cnt); |
| } |
| return data[from .. to]; |
| |
| } |
| |
| // |
| auto opSlice(size_t from, size_t to) const |
| { |
| return data[from .. to]; |
| } |
| |
| // length slices before the ref count |
| auto opSlice() |
| { |
| return opSlice(0, length); |
| } |
| |
| // ditto |
| auto opSlice() const |
| { |
| return opSlice(0, length); |
| } |
| |
| void append(Range)(Range range) |
| if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint)) |
| { |
| size_t nl = length + range.length; |
| length = nl; |
| copy(range, this[nl-range.length .. nl]); |
| } |
| |
| void append()(uint[] val...) |
| { |
| length = length + val.length; |
| data[$-val.length-1 .. $-1] = val[]; |
| } |
| |
| bool opEquals()(auto const ref CowArray rhs)const |
| { |
| if (empty ^ rhs.empty) |
| return false; // one is empty and the other isn't |
| return empty || data[0..$-1] == rhs.data[0..$-1]; |
| } |
| |
| private: |
| // ref-count is right after the data |
| @property uint refCount() const |
| { |
| return data[$-1]; |
| } |
| |
| @property void refCount(uint cnt) |
| { |
| data[$-1] = cnt; |
| } |
| |
| void freeThisReference() |
| { |
| immutable count = refCount; |
| if (count != 1) // have more references to this memory |
| { |
| // dec shared ref-count |
| refCount = count - 1; |
| data = []; |
| } |
| else |
| SP.destroy(data); |
| assert(!data.ptr); |
| } |
| |
| void dupThisReference(uint count) |
| in |
| { |
| assert(!empty && count != 1 && count == refCount); |
| } |
| body |
| { |
| import std.algorithm.mutation : copy; |
| // dec shared ref-count |
| refCount = count - 1; |
| // copy to the new chunk of RAM |
| auto new_data = SP.alloc!uint(data.length); |
| // bit-blit old stuff except the counter |
| copy(data[0..$-1], new_data[0..$-1]); |
| data = new_data; // before setting refCount! |
| refCount = 1; // so that this updates the right one |
| } |
| |
| uint[] data; |
| } |
| |
| @safe unittest// Uint24 tests |
| { |
| import std.algorithm.comparison : equal; |
| import std.algorithm.mutation : copy; |
| import std.conv : text; |
| import std.range : iota, chain; |
| import std.range.primitives : isBidirectionalRange, isOutputRange; |
| void funcRef(T)(ref T u24) |
| { |
| u24.length = 2; |
| u24[1] = 1024; |
| T u24_c = u24; |
| assert(u24[1] == 1024); |
| u24.length = 0; |
| assert(u24.empty); |
| u24.append([1, 2]); |
| assert(equal(u24[], [1, 2])); |
| u24.append(111); |
| assert(equal(u24[], [1, 2, 111])); |
| assert(!u24_c.empty && u24_c[1] == 1024); |
| u24.length = 3; |
| copy(iota(0, 3), u24[]); |
| assert(equal(u24[], iota(0, 3))); |
| assert(u24_c[1] == 1024); |
| } |
| |
| void func2(T)(T u24) |
| { |
| T u24_2 = u24; |
| T u24_3; |
| u24_3 = u24_2; |
| assert(u24_2 == u24_3); |
| assert(equal(u24[], u24_2[])); |
| assert(equal(u24_2[], u24_3[])); |
| funcRef(u24_3); |
| |
| assert(equal(u24_3[], iota(0, 3))); |
| assert(!equal(u24_2[], u24_3[])); |
| assert(equal(u24_2[], u24[])); |
| u24_2 = u24_3; |
| assert(equal(u24_2[], iota(0, 3))); |
| // to test that passed arg is intact outside |
| // plus try out opEquals |
| u24 = u24_3; |
| u24 = T.init; |
| u24_3 = T.init; |
| assert(u24.empty); |
| assert(u24 == u24_3); |
| assert(u24 != u24_2); |
| } |
| |
| foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy)) |
| { |
| alias Range = typeof(CowArray!Policy.init[]); |
| alias U24A = CowArray!Policy; |
| static assert(isForwardRange!Range); |
| static assert(isBidirectionalRange!Range); |
| static assert(isOutputRange!(Range, uint)); |
| static assert(isRandomAccessRange!(Range)); |
| |
| auto arr = U24A([42u, 36, 100]); |
| assert(arr[0] == 42); |
| assert(arr[1] == 36); |
| arr[0] = 72; |
| arr[1] = 0xFE_FEFE; |
| assert(arr[0] == 72); |
| assert(arr[1] == 0xFE_FEFE); |
| assert(arr[2] == 100); |
| U24A arr2 = arr; |
| assert(arr2[0] == 72); |
| arr2[0] = 11; |
| // test COW-ness |
| assert(arr[0] == 72); |
| assert(arr2[0] == 11); |
| // set this to about 100M to stress-test COW memory management |
| foreach (v; 0 .. 10_000) |
| func2(arr); |
| assert(equal(arr[], [72, 0xFE_FEFE, 100])); |
| |
| auto r2 = U24A(iota(0, 100)); |
| assert(equal(r2[], iota(0, 100)), text(r2[])); |
| copy(iota(10, 170, 2), r2[10 .. 90]); |
| assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) |
| , text(r2[])); |
| } |
| } |
| |
| version (unittest) |
| { |
| private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); |
| } |
| |
| @safe unittest// core set primitives test |
| { |
| import std.conv : text; |
| foreach (CodeList; AllSets) |
| { |
| CodeList a; |
| //"plug a hole" test |
| a.add(10, 20).add(25, 30).add(15, 27); |
| assert(a == CodeList(10, 30), text(a)); |
| |
| auto x = CodeList.init; |
| x.add(10, 20).add(30, 40).add(50, 60); |
| |
| a = x; |
| a.add(20, 49);//[10, 49) [50, 60) |
| assert(a == CodeList(10, 49, 50 ,60)); |
| |
| a = x; |
| a.add(20, 50); |
| assert(a == CodeList(10, 60), text(a)); |
| |
| // simple unions, mostly edge effects |
| x = CodeList.init; |
| x.add(10, 20).add(40, 60); |
| |
| a = x; |
| a.add(10, 25); //[10, 25) [40, 60) |
| assert(a == CodeList(10, 25, 40, 60)); |
| |
| a = x; |
| a.add(5, 15); //[5, 20) [40, 60) |
| assert(a == CodeList(5, 20, 40, 60)); |
| |
| a = x; |
| a.add(0, 10); // [0, 20) [40, 60) |
| assert(a == CodeList(0, 20, 40, 60)); |
| |
| a = x; |
| a.add(0, 5); // prepand |
| assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); |
| |
| a = x; |
| a.add(5, 20); |
| assert(a == CodeList(5, 20, 40, 60)); |
| |
| a = x; |
| a.add(3, 37); |
| assert(a == CodeList(3, 37, 40, 60)); |
| |
| a = x; |
| a.add(37, 65); |
| assert(a == CodeList(10, 20, 37, 65)); |
| |
| // some tests on helpers for set intersection |
| x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); |
| a = x; |
| |
| auto m = a.skipUpTo(60); |
| a.dropUpTo(110, m); |
| assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); |
| |
| a = x; |
| a.dropUpTo(100); |
| assert(a == CodeList(100, 120), text(a.data[])); |
| |
| a = x; |
| m = a.skipUpTo(50); |
| a.dropUpTo(140, m); |
| assert(a == CodeList(10, 20, 40, 50), text(a.data[])); |
| a = x; |
| a.dropUpTo(60); |
| assert(a == CodeList(100, 120), text(a.data[])); |
| } |
| } |
| |
| |
| //test constructor to work with any order of intervals |
| @safe unittest |
| { |
| import std.algorithm.comparison : equal; |
| import std.conv : text, to; |
| import std.range : chain, iota; |
| import std.typecons : tuple; |
| //ensure constructor handles bad ordering and overlap |
| auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); |
| foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) |
| assert(ch in c1, to!string(ch)); |
| |
| //contiguos |
| assert(CodepointSet(1000, 1006, 1006, 1009) |
| .byInterval.equal([tuple(1000, 1009)])); |
| //contains |
| assert(CodepointSet(900, 1200, 1000, 1100) |
| .byInterval.equal([tuple(900, 1200)])); |
| //intersect left |
| assert(CodepointSet(900, 1100, 1000, 1200) |
| .byInterval.equal([tuple(900, 1200)])); |
| //intersect right |
| assert(CodepointSet(1000, 1200, 900, 1100) |
| .byInterval.equal([tuple(900, 1200)])); |
| |
| //ditto with extra items at end |
| assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) |
| .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); |
| assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) |
| .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); |
| |
| //"plug a hole" test |
| auto c2 = CodepointSet(20, 40, |
| 60, 80, 100, 140, 150, 200, |
| 40, 60, 80, 100, 140, 150 |
| ); |
| assert(c2.byInterval.equal([tuple(20, 200)])); |
| |
| auto c3 = CodepointSet( |
| 20, 40, 60, 80, 100, 140, 150, 200, |
| 0, 10, 15, 100, 10, 20, 200, 220); |
| assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); |
| } |
| |
| |
| @safe unittest |
| { // full set operations |
| import std.conv : text; |
| foreach (CodeList; AllSets) |
| { |
| CodeList a, b, c, d; |
| |
| //"plug a hole" |
| a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); |
| b.add(40, 60).add(80, 100).add(140, 150); |
| c = a | b; |
| d = b | a; |
| assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); |
| assert(c == d, text(c," vs ", d)); |
| |
| b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); |
| c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) |
| d = b | a; |
| assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); |
| assert(c == d, text(c," vs ", d)); |
| |
| b = CodeList.init.add(10, 20).add(30,100).add(145,200); |
| c = a | b;//[10, 140) [145, 200) |
| d = b | a; |
| assert(c == CodeList(10, 140, 145, 200)); |
| assert(c == d, text(c," vs ", d)); |
| |
| b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); |
| c = a | b;//[0, 140) [150, 220) |
| d = b | a; |
| assert(c == CodeList(0, 140, 150, 220)); |
| assert(c == d, text(c," vs ", d)); |
| |
| |
| a = CodeList.init.add(20, 40).add(60, 80); |
| b = CodeList.init.add(25, 35).add(65, 75); |
| c = a & b; |
| d = b & a; |
| assert(c == CodeList(25, 35, 65, 75), text(c)); |
| assert(c == d, text(c," vs ", d)); |
| |
| a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); |
| b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); |
| c = a & b; |
| d = b & a; |
| assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); |
| assert(c == d, text(c," vs ", d)); |
| |
| a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); |
| b = CodeList.init.add(10, 30).add(60, 120).add(135, 160); |
| c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) |
| d = b & a; |
| |
| assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); |
| assert(c == d, text(c, " vs ",d)); |
| assert((c & a) == c); |
| assert((d & b) == d); |
| assert((c & d) == d); |
| |
| b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); |
| c = a & b; |
| d = b & a; |
| assert(c == CodeList(150, 200), text(c)); |
| assert(c == d, text(c, " vs ",d)); |
| assert((c & a) == c); |
| assert((d & b) == d); |
| assert((c & d) == d); |
| |
| assert((a & a) == a); |
| assert((b & b) == b); |
| |
| a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); |
| b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); |
| c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) |
| d = b - a;// [40, 60) [80, 100) [200, 300) |
| assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); |
| assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); |
| assert(c - d == c, text(c-d, " vs ", c)); |
| assert(d - c == d, text(d-c, " vs ", d)); |
| assert(c - c == CodeList.init); |
| assert(d - d == CodeList.init); |
| |
| a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); |
| b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); |
| c = a - b;// [160, 190) |
| d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) |
| assert(c == CodeList(160, 190), text(c)); |
| assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); |
| assert(c - d == c, text(c-d, " vs ", c)); |
| assert(d - c == d, text(d-c, " vs ", d)); |
| assert(c - c == CodeList.init); |
| assert(d - d == CodeList.init); |
| |
| a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); |
| b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); |
| c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) |
| d = b ~ a; |
| assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), |
| text(c)); |
| assert(c == d, text(c, " vs ", d)); |
| } |
| } |
| |
| } |
| |
| @safe unittest// vs single dchar |
| { |
| import std.conv : text; |
| CodepointSet a = CodepointSet(10, 100, 120, 200); |
| assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); |
| assert((a & 'B') == CodepointSet(66, 67)); |
| } |
| |
| @safe unittest// iteration & opIndex |
| { |
| import std.algorithm.comparison : equal; |
| import std.conv : text; |
| import std.typecons : tuple, Tuple; |
| |
| foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) |
| { |
| auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; |
| auto a = CodeList('A','N','a', 'n'); |
| assert(equal(a.byInterval, |
| [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] |
| ), text(a.byInterval)); |
| |
| // same @@@BUG as in issue 8949 ? |
| version (bug8949) |
| { |
| import std.range : retro; |
| assert(equal(retro(a.byInterval), |
| [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] |
| ), text(retro(a.byInterval))); |
| } |
| auto achr = a.byCodepoint; |
| assert(equal(achr, arr), text(a.byCodepoint)); |
| foreach (ch; a.byCodepoint) |
| assert(a[ch]); |
| auto x = CodeList(100, 500, 600, 900, 1200, 1500); |
| assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); |
| foreach (ch; x.byCodepoint) |
| assert(x[ch]); |
| static if (is(CodeList == CodepointSet)) |
| { |
| auto y = CodeList(x.byInterval); |
| assert(equal(x.byInterval, y.byInterval)); |
| } |
| assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); |
| assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); |
| } |
| } |
| |
| //============================================================================ |
| // Generic Trie template and various ways to build it |
| //============================================================================ |
| |
| // debug helper to get a shortened array dump |
| auto arrayRepr(T)(T x) |
| { |
| import std.conv : text; |
| if (x.length > 32) |
| { |
| return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]); |
| } |
| else |
| return text(x); |
| } |
| |
| /** |
| Maps $(D Key) to a suitable integer index within the range of $(D size_t). |
| The mapping is constructed by applying predicates from $(D Prefix) left to right |
| and concatenating the resulting bits. |
| |
| The first (leftmost) predicate defines the most significant bits of |
| the resulting index. |
| */ |
| template mapTrieIndex(Prefix...) |
| { |
| size_t mapTrieIndex(Key)(Key key) |
| if (isValidPrefixForTrie!(Key, Prefix)) |
| { |
| alias p = Prefix; |
| size_t idx; |
| foreach (i, v; p[0..$-1]) |
| { |
| idx |= p[i](key); |
| idx <<= p[i+1].bitSize; |
| } |
| idx |= p[$-1](key); |
| return idx; |
| } |
| } |
| |
| /* |
| $(D TrieBuilder) is a type used for incremental construction |
| of $(LREF Trie)s. |
| |
| See $(LREF buildTrie) for generic helpers built on top of it. |
| */ |
| @trusted private struct TrieBuilder(Value, Key, Args...) |
| if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args)) |
| { |
| import std.exception : enforce; |
| |
| private: |
| // last index is not stored in table, it is used as an offset to values in a block. |
| static if (is(Value == bool))// always pack bool |
| alias V = BitPacked!(Value, 1); |
| else |
| alias V = Value; |
| static auto deduceMaxIndex(Preds...)() |
| { |
| size_t idx = 1; |
| foreach (v; Preds) |
| idx *= 2^^v.bitSize; |
| return idx; |
| } |
| |
| static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key |
| { |
| alias Prefix = Args[1..$]; |
| enum lastPageSize = 2^^Prefix[$-1].bitSize; |
| enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]); |
| enum roughedMaxIndex = |
| (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize; |
| // check warp around - if wrapped, use the default deduction rule |
| enum maxIndex = roughedMaxIndex < translatedMaxIndex ? |
| deduceMaxIndex!(Prefix)() : roughedMaxIndex; |
| } |
| else |
| { |
| alias Prefix = Args; |
| enum maxIndex = deduceMaxIndex!(Prefix)(); |
| } |
| |
| alias getIndex = mapTrieIndex!(Prefix); |
| |
| enum lastLevel = Prefix.length-1; |
| struct ConstructState |
| { |
| size_t idx_zeros, idx_ones; |
| } |
| // iteration over levels of Trie, each indexes its own level and thus a shortened domain |
| size_t[Prefix.length] indices; |
| // default filler value to use |
| Value defValue; |
| // this is a full-width index of next item |
| size_t curIndex; |
| // all-zeros page index, all-ones page index (+ indicator if there is such a page) |
| ConstructState[Prefix.length] state; |
| // the table being constructed |
| MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table; |
| |
| @disable this(); |
| |
| //shortcut for index variable at level 'level' |
| @property ref idx(size_t level)(){ return indices[level]; } |
| |
| // this function assumes no holes in the input so |
| // indices are going one by one |
| void addValue(size_t level, T)(T val, size_t numVals) |
| { |
| alias j = idx!level; |
| enum pageSize = 1 << Prefix[level].bitSize; |
| if (numVals == 0) |
| return; |
| auto ptr = table.slice!(level); |
| if (numVals == 1) |
| { |
| static if (level == Prefix.length-1) |
| ptr[j] = val; |
| else |
| {// can incur narrowing conversion |
| assert(j < ptr.length); |
| ptr[j] = force!(typeof(ptr[j]))(val); |
| } |
| j++; |
| if (j % pageSize == 0) |
| spillToNextPage!level(ptr); |
| return; |
| } |
| // longer row of values |
| // get to the next page boundary |
| immutable nextPB = (j + pageSize) & ~(pageSize-1); |
| immutable n = nextPB - j;// can fill right in this page |
| if (numVals < n) //fits in current page |
| { |
| ptr[j .. j+numVals] = val; |
| j += numVals; |
| return; |
| } |
| static if (level != 0)//on the first level it always fits |
| { |
| numVals -= n; |
| //write till the end of current page |
| ptr[j .. j+n] = val; |
| j += n; |
| //spill to the next page |
| spillToNextPage!level(ptr); |
| // page at once loop |
| if (state[level].idx_zeros != size_t.max && val == T.init) |
| { |
| alias NextIdx = typeof(table.slice!(level-1)[0]); |
| addValue!(level-1)(force!NextIdx(state[level].idx_zeros), |
| numVals/pageSize); |
| ptr = table.slice!level; //table structure might have changed |
| numVals %= pageSize; |
| } |
| else |
| { |
| while (numVals >= pageSize) |
| { |
| numVals -= pageSize; |
| ptr[j .. j+pageSize] = val; |
| j += pageSize; |
| spillToNextPage!level(ptr); |
| } |
| } |
| if (numVals) |
| { |
| // the leftovers, an incomplete page |
| ptr[j .. j+numVals] = val; |
| j += numVals; |
| } |
| } |
| } |
| |
| void spillToNextPage(size_t level, Slice)(ref Slice ptr) |
| { |
| // last level (i.e. topmost) has 1 "page" |
| // thus it need not to add a new page on upper level |
| static if (level != 0) |
| spillToNextPageImpl!(level)(ptr); |
| } |
| |
| // this can re-use the current page if duplicate or allocate a new one |
| // it also makes sure that previous levels point to the correct page in this level |
| void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr) |
| { |
| alias NextIdx = typeof(table.slice!(level-1)[0]); |
| NextIdx next_lvl_index; |
| enum pageSize = 1 << Prefix[level].bitSize; |
| assert(idx!level % pageSize == 0); |
| immutable last = idx!level-pageSize; |
| const slice = ptr[idx!level - pageSize .. idx!level]; |
| size_t j; |
| for (j=0; j<last; j+=pageSize) |
| { |
| if (ptr[j .. j+pageSize] == slice) |
| { |
| // get index to it, reuse ptr space for the next block |
| next_lvl_index = force!NextIdx(j/pageSize); |
| version (none) |
| { |
| import std.stdio : writefln, writeln; |
| writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]" |
| ,level |
| ,indices[level-1], pageSize, j, j+pageSize); |
| writeln("LEVEL(", level |
| , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize])); |
| writeln("LEVEL(", level |
| , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize])); |
| } |
| idx!level -= pageSize; // reuse this page, it is duplicate |
| break; |
| } |
| } |
| if (j == last) |
| { |
| L_allocate_page: |
| next_lvl_index = force!NextIdx(idx!level/pageSize - 1); |
| if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize)) |
| { |
| state[level].idx_zeros = next_lvl_index; |
| } |
| // allocate next page |
| version (none) |
| { |
| import std.stdio : writefln; |
| writefln("LEVEL(%s) page allocated: %s" |
| , level, arrayRepr(slice[0 .. pageSize])); |
| writefln("LEVEL(%s) index: %s ; page at this index %s" |
| , level |
| , next_lvl_index |
| , arrayRepr( |
| table.slice!(level) |
| [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize] |
| )); |
| } |
| table.length!level = table.length!level + pageSize; |
| } |
| L_know_index: |
| // for the previous level, values are indices to the pages in the current level |
| addValue!(level-1)(next_lvl_index, 1); |
| ptr = table.slice!level; //re-load the slice after moves |
| } |
| |
| // idx - full-width index to fill with v (full-width index != key) |
| // fills everything in the range of [curIndex, idx) with filler |
| void putAt(size_t idx, Value v) |
| { |
| assert(idx >= curIndex); |
| immutable numFillers = idx - curIndex; |
| addValue!lastLevel(defValue, numFillers); |
| addValue!lastLevel(v, 1); |
| curIndex = idx + 1; |
| } |
| |
| // ditto, but sets the range of [idxA, idxB) to v |
| void putRangeAt(size_t idxA, size_t idxB, Value v) |
| { |
| assert(idxA >= curIndex); |
| assert(idxB >= idxA); |
| size_t numFillers = idxA - curIndex; |
| addValue!lastLevel(defValue, numFillers); |
| addValue!lastLevel(v, idxB - idxA); |
| curIndex = idxB; // open-right |
| } |
| |
| enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~ |
| "duplicate key->value mapping"; |
| |
| public: |
| /** |
| Construct a builder, where $(D filler) is a value |
| to indicate empty slots (or "not found" condition). |
| */ |
| this(Value filler) |
| { |
| curIndex = 0; |
| defValue = filler; |
| // zeros-page index, ones-page index |
| foreach (ref v; state) |
| v = ConstructState(size_t.max, size_t.max); |
| table = typeof(table)(indices); |
| // one page per level is a bootstrap minimum |
| foreach (i, Pred; Prefix) |
| table.length!i = (1 << Pred.bitSize); |
| } |
| |
| /** |
| Put a value $(D v) into interval as |
| mapped by keys from $(D a) to $(D b). |
| All slots prior to $(D a) are filled with |
| the default filler. |
| */ |
| void putRange(Key a, Key b, Value v) |
| { |
| auto idxA = getIndex(a), idxB = getIndex(b); |
| // indexes of key should always grow |
| enforce(idxB >= idxA && idxA >= curIndex, errMsg); |
| putRangeAt(idxA, idxB, v); |
| } |
| |
| /** |
| Put a value $(D v) into slot mapped by $(D key). |
| All slots prior to $(D key) are filled with the |
| default filler. |
| */ |
| void putValue(Key key, Value v) |
| { |
| import std.conv : text; |
| auto idx = getIndex(key); |
| enforce(idx >= curIndex, text(errMsg, " ", idx)); |
| putAt(idx, v); |
| } |
| |
| /// Finishes construction of Trie, yielding an immutable Trie instance. |
| auto build() |
| { |
| static if (maxIndex != 0) // doesn't cover full range of size_t |
| { |
| assert(curIndex <= maxIndex); |
| addValue!lastLevel(defValue, maxIndex - curIndex); |
| } |
| else |
| { |
| if (curIndex != 0 // couldn't wrap around |
| || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty |
| { |
| addValue!lastLevel(defValue, size_t.max - curIndex); |
| addValue!lastLevel(defValue, 1); |
| } |
| // else curIndex already completed the full range of size_t by wrapping around |
| } |
| return Trie!(V, Key, maxIndex, Prefix)(table); |
| } |
| } |
| |
| /** |
| $(P A generic Trie data-structure for a fixed number of stages. |
| The design goal is optimal speed with smallest footprint size. |
| ) |
| $(P It's intentionally read-only and doesn't provide constructors. |
| To construct one use a special builder, |
| see $(LREF TrieBuilder) and $(LREF buildTrie). |
| ) |
| |
| */ |
| @trusted private struct Trie(Value, Key, Args...) |
| if (isValidPrefixForTrie!(Key, Args) |
| || (isValidPrefixForTrie!(Key, Args[1..$]) |
| && is(typeof(Args[0]) : size_t))) |
| { |
| import std.range.primitives : isOutputRange; |
| static if (is(typeof(Args[0]) : size_t)) |
| { |
| private enum maxIndex = Args[0]; |
| private enum hasBoundsCheck = true; |
| private alias Prefix = Args[1..$]; |
| } |
| else |
| { |
| private enum hasBoundsCheck = false; |
| private alias Prefix = Args; |
| } |
| |
| private this()(typeof(_table) table) |
| { |
| _table = table; |
| } |
| |
| // only for constant Tries constructed from precompiled tables |
| private this()(const(size_t)[] offsets, const(size_t)[] sizes, |
| const(size_t)[] data) const |
| { |
| _table = typeof(_table)(offsets, sizes, data); |
| } |
| |
| /** |
| $(P Lookup the $(D key) in this $(D Trie). ) |
| |
| $(P The lookup always succeeds if key fits the domain |
| provided during construction. The whole domain defined |
| is covered so instead of not found condition |
| the sentinel (filler) value could be used. ) |
| |
| $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to |
| define a domain of $(D Trie) keys and the sentinel value. ) |
| |
| Note: |
| Domain range-checking is only enabled in debug builds |
| and results in assertion failure. |
| */ |
| TypeOfBitPacked!Value opIndex()(Key key) const |
| { |
| static if (hasBoundsCheck) |
| assert(mapTrieIndex!Prefix(key) < maxIndex); |
| size_t idx; |
| alias p = Prefix; |
| idx = cast(size_t) p[0](key); |
| foreach (i, v; p[0..$-1]) |
| idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key)); |
| return _table.ptr!(p.length-1)[idx]; |
| } |
| |
| /// |
| @property size_t bytes(size_t n=size_t.max)() const |
| { |
| return _table.bytes!n; |
| } |
| |
| /// |
| @property size_t pages(size_t n)() const |
| { |
| return (bytes!n+2^^(Prefix[n].bitSize-1)) |
| /2^^Prefix[n].bitSize; |
| } |
| |
| /// |
| void store(OutRange)(scope OutRange sink) const |
| if (isOutputRange!(OutRange, char)) |
| { |
| _table.store(sink); |
| } |
| |
| private: |
| MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table; |
| } |
| |
| // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes' |
| // left-to-right, the most significant bits first |
| template GetBitSlicing(size_t top, sizes...) |
| { |
| static if (sizes.length > 0) |
| alias GetBitSlicing = |
| AliasSeq!(sliceBits!(top - sizes[0], top), |
| GetBitSlicing!(top - sizes[0], sizes[1..$])); |
| else |
| alias GetBitSlicing = AliasSeq!(); |
| } |
| |
| template callableWith(T) |
| { |
| template callableWith(alias Pred) |
| { |
| static if (!is(typeof(Pred(T.init)))) |
| enum callableWith = false; |
| else |
| { |
| alias Result = typeof(Pred(T.init)); |
| enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result)); |
| } |
| } |
| } |
| |
| /* |
| Check if $(D Prefix) is a valid set of predicates |
| for $(D Trie) template having $(D Key) as the type of keys. |
| This requires all predicates to be callable, take |
| single argument of type $(D Key) and return unsigned value. |
| */ |
| template isValidPrefixForTrie(Key, Prefix...) |
| { |
| import std.meta : allSatisfy; |
| enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws |
| } |
| |
| /* |
| Check if $(D Args) is a set of maximum key value followed by valid predicates |
| for $(D Trie) template having $(D Key) as the type of keys. |
| */ |
| template isValidArgsForTrie(Key, Args...) |
| { |
| static if (Args.length > 1) |
| { |
| enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args) |
| || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key)); |
| } |
| else |
| enum isValidArgsForTrie = isValidPrefixForTrie!Args; |
| } |
| |
| @property size_t sumOfIntegerTuple(ints...)() |
| { |
| size_t count=0; |
| foreach (v; ints) |
| count += v; |
| return count; |
| } |
| |
| /** |
| A shorthand for creating a custom multi-level fixed Trie |
| from a $(D CodepointSet). $(D sizes) are numbers of bits per level, |
| with the most significant bits used first. |
| |
| Note: The sum of $(D sizes) must be equal 21. |
| |
| See_Also: $(LREF toTrie), which is even simpler. |
| |
| Example: |
| --- |
| { |
| import std.stdio; |
| auto set = unicode("Number"); |
| auto trie = codepointSetTrie!(8, 5, 8)(set); |
| writeln("Input code points to test:"); |
| foreach (line; stdin.byLine) |
| { |
| int count=0; |
| foreach (dchar ch; line) |
| if (trie[ch])// is number |
| count++; |
| writefln("Contains %d number code points.", count); |
| } |
| } |
| --- |
|