| /* Make uname2c.h from various sources. |
| Copyright (C) 2005-2022 Free Software Foundation, Inc. |
| Contributed by Jakub Jelinek <jakub@redhat.com> |
| |
| This program is free software; you can redistribute it and/or modify it |
| under the terms of the GNU General Public License as published by the |
| Free Software Foundation; either version 3, or (at your option) any |
| later version. |
| |
| This program is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| GNU General Public License for more details. |
| |
| You should have received a copy of the GNU General Public License |
| along with this program; see the file COPYING3. If not see |
| <http://www.gnu.org/licenses/>. */ |
| |
| /* Run this program as |
| ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h |
| |
| This program generates 2 big arrays and 2 small ones. |
| The large ones are uname2c_dict, initialized by string literal |
| representing dictionary, and uname2c_tree, which is a space optimized |
| radix tree. |
| The format of the radix tree is: |
| byte 0 either 0x80 + (key[0] - ' ') (if key_len == 1) |
| or key_len (otherwise) |
| either of them ored with 0x40 if it has a codepoint |
| byte 1 LSB of offset into uname2c_dict for key (only if key_len > 1) |
| byte 2 MSB of offset into uname2c_dict for key (only if key_len > 1) |
| if key_len == 1, the above 2 bytes are omitted |
| byte 3 LSB of codepoint (only if it has a codepoint) |
| byte 4 middle byte of codepoint (ditto) |
| byte 5 MSB of codepoint (ditto), ored with 0x80 if node has children |
| ored with 0x40 if it doesn't have siblings |
| if it doesn't have a codepoint, the above 3 bytes are omitted |
| and we assume that the node has children |
| byte 6, 7, 8 uleb128 encoded offset to first child relative to the end |
| of the uleb128 (only if node has children) |
| byte 9 0xff (only if node doesn't have a codepoint and doesn't |
| have siblings) |
| |
| For prefixes of Unicode NR1 or NR2 rule generated names, on a node |
| representing end of the prefix codepoint is 0xd800 + index into |
| uname2c_generated array with indexes into uname2c_pairs array of |
| code points (low, high) of the ranges terminated by single 0. |
| 0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules. |
| */ |
| |
| #include <assert.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <stdint.h> |
| #include <ctype.h> |
| #include <limits.h> |
| #include <stdarg.h> |
| #include <stdbool.h> |
| #include <stdlib.h> |
| |
| #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) |
| |
| #define NUM_CODE_POINTS 0x110000 |
| #define MAX_CODE_POINT 0x10ffff |
| #define NO_VALUE 0xdc00 |
| #define GENERATED 0xd800 |
| |
| struct entry { const char *name; unsigned long codepoint; }; |
| static struct entry *entries; |
| static unsigned long num_allocated, num_entries; |
| |
| /* Unicode 15 Table 4-8. */ |
| struct generated { |
| const char *prefix; |
| /* max_high is a workaround for UnicodeData.txt inconsistencies |
| on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>" |
| entry is a few code points above the end of the range. */ |
| unsigned long low, high, max_high; |
| int idx, ok; |
| }; |
| static struct generated generated_ranges[] = |
| { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */ |
| { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */ |
| { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 }, |
| { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 }, |
| { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 }, |
| { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 }, |
| { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 }, |
| { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 }, |
| { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 }, |
| { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 }, |
| { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 } |
| }; |
| |
| struct node { |
| struct node *sibling, *child; |
| const char *key; |
| size_t key_len, key_idx, node_size, size_sum, child_off; |
| unsigned long codepoint; |
| bool in_dict; |
| }; |
| static struct node *root, **nodes; |
| static unsigned long num_nodes; |
| static size_t dict_size, tree_size, max_entry_len; |
| static char *dict; |
| static unsigned char *tree; |
| |
| /* Die! */ |
| |
| static void |
| fail (const char *s, ...) |
| { |
| va_list ap; |
| |
| va_start (ap, s); |
| vfprintf (stderr, s, ap); |
| va_end (ap); |
| fputc ('\n', stderr); |
| exit (1); |
| } |
| |
| static void * |
| xmalloc (size_t size) |
| { |
| void *ret = malloc (size); |
| |
| if (ret == NULL) |
| fail ("failed to allocate %ld bytes", (long) size); |
| return ret; |
| } |
| |
| static void * |
| xrealloc (void *p, size_t size) |
| { |
| void *ret = p ? realloc (p, size) : malloc (size); |
| |
| if (ret == NULL) |
| fail ("failed to allocate %ld bytes", (long) size); |
| return ret; |
| } |
| |
| static int |
| entrycmp (const void *p1, const void *p2) |
| { |
| const struct entry *e1 = (const struct entry *) p1; |
| const struct entry *e2 = (const struct entry *) p2; |
| int ret = strcmp (e1->name, e2->name); |
| |
| if (ret != 0) |
| return ret; |
| if (e1->codepoint < e2->codepoint) |
| return -1; |
| if (e1->codepoint > e2->codepoint) |
| return 1; |
| return 0; |
| } |
| |
| static int |
| nodecmp (const void *p1, const void *p2) |
| { |
| const struct node *n1 = *(const struct node *const *) p1; |
| const struct node *n2 = *(const struct node *const *) p2; |
| if (n1->key_len > n2->key_len) |
| return -1; |
| if (n1->key_len < n2->key_len) |
| return 1; |
| return memcmp (n1->key, n2->key, n1->key_len); |
| } |
| |
| /* Read UnicodeData.txt and fill in the 'decomp' table to be the |
| decompositions of characters for which both the character |
| decomposed and all the code points in the decomposition are valid |
| for some supported language version, and the 'all_decomp' table to |
| be the decompositions of all characters without those |
| constraints. */ |
| |
| static void |
| read_table (char *fname, bool aliases_p) |
| { |
| FILE *f = fopen (fname, "r"); |
| const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt"; |
| |
| if (!f) |
| fail ("opening %s", sname); |
| for (;;) |
| { |
| char line[256]; |
| unsigned long codepoint; |
| const char *name, *aname; |
| char *l; |
| size_t i; |
| |
| if (!fgets (line, sizeof (line), f)) |
| break; |
| codepoint = strtoul (line, &l, 16); |
| if (l == line && aliases_p) |
| { |
| /* NameAliased.txt can contain comments and empty lines. */ |
| if (*line == '#' || *line == '\n') |
| continue; |
| } |
| if (l == line || *l != ';') |
| fail ("parsing %s, reading code point", sname); |
| if (codepoint > MAX_CODE_POINT) |
| fail ("parsing %s, code point too large", sname); |
| |
| name = l + 1; |
| do { |
| ++l; |
| } while (*l != ';'); |
| |
| aname = NULL; |
| if (aliases_p) |
| { |
| /* Ignore figment and abbreviation aliases. */ |
| if (strcmp (l + 1, "correction\n") != 0 |
| && strcmp (l + 1, "control\n") != 0 |
| && strcmp (l + 1, "alternate\n") != 0) |
| continue; |
| i = ARRAY_SIZE (generated_ranges); |
| } |
| else |
| { |
| for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) |
| if (codepoint >= generated_ranges[i].low |
| && codepoint <= generated_ranges[i].max_high) |
| break; |
| if (i != ARRAY_SIZE (generated_ranges)) |
| { |
| if (*name == '<' && l[-1] == '>') |
| { |
| if (codepoint == generated_ranges[i].low |
| && l - name >= 9 |
| && memcmp (l - 8, ", First>", 8) == 0 |
| && generated_ranges[i].ok == 0) |
| { |
| generated_ranges[i].ok = INT_MAX - 1; |
| aname = generated_ranges[i].prefix; |
| codepoint = GENERATED + generated_ranges[i].idx; |
| } |
| /* Unfortunately, UnicodeData.txt isn't consistent |
| with the Table 4-8 range endpoints in 3 cases, |
| the ranges are longer there by a few codepoints. |
| So use the max_high hack to avoid verification |
| failures. */ |
| else if (codepoint == generated_ranges[i].max_high |
| && l - name >= 8 |
| && memcmp (l - 7, ", Last>", 7) == 0 |
| && generated_ranges[i].ok == INT_MAX - 1) |
| { |
| generated_ranges[i].ok = INT_MAX; |
| continue; |
| } |
| else |
| fail ("unexpected generated entry %lx %.*s", |
| codepoint, (int) (l - name), name); |
| } |
| else if (codepoint |
| == generated_ranges[i].low + generated_ranges[i].ok |
| && l - name == (strlen (generated_ranges[i].prefix) |
| + (name - 1 - line)) |
| && memcmp (name, generated_ranges[i].prefix, |
| strlen (generated_ranges[i].prefix)) == 0 |
| && memcmp (name + strlen (generated_ranges[i].prefix), |
| line, name - 1 - line) == 0) |
| { |
| ++generated_ranges[i].ok; |
| if (codepoint != generated_ranges[i].low) |
| continue; |
| aname = generated_ranges[i].prefix; |
| codepoint = GENERATED + generated_ranges[i].idx; |
| } |
| else |
| fail ("unexpected generated entry %lx %.*s", |
| codepoint, (int) (l - name), name); |
| if (aname == generated_ranges[i].prefix) |
| { |
| size_t j; |
| |
| /* Don't add an entry for a generated range where the |
| same prefix has been added already. */ |
| for (j = 0; j < i; ++j) |
| if (generated_ranges[j].idx == generated_ranges[i].idx |
| && generated_ranges[j].ok != 0) |
| break; |
| if (j < i) |
| continue; |
| } |
| } |
| else if (*name == '<' && l[-1] == '>') |
| continue; |
| } |
| |
| if (num_entries == num_allocated) |
| { |
| num_allocated = num_allocated ? 2 * num_allocated : 65536; |
| entries = (struct entry *) xrealloc (entries, num_allocated |
| * sizeof (entries[0])); |
| } |
| |
| if (aname == NULL) |
| { |
| char *a = (char *) xmalloc (l + 1 - name); |
| if (l - name > max_entry_len) |
| max_entry_len = l - name; |
| memcpy (a, name, l - name); |
| a[l - name] = '\0'; |
| aname = a; |
| } |
| entries[num_entries].name = aname; |
| entries[num_entries++].codepoint = codepoint; |
| } |
| if (ferror (f)) |
| fail ("reading %s", sname); |
| fclose (f); |
| } |
| |
| /* Assumes nodes are added from sorted array, so we never |
| add any node before existing one, only after it. */ |
| |
| static void |
| node_add (struct node **p, const char *key, size_t key_len, |
| unsigned long codepoint) |
| { |
| struct node *n; |
| size_t i; |
| |
| do |
| { |
| if (*p == NULL) |
| { |
| *p = n = (struct node *) xmalloc (sizeof (struct node)); |
| ++num_nodes; |
| assert (key_len); |
| n->sibling = NULL; |
| n->child = NULL; |
| n->key = key; |
| n->key_len = key_len; |
| n->codepoint = codepoint; |
| return; |
| } |
| n = *p; |
| for (i = 0; i < n->key_len && i < key_len; ++i) |
| if (n->key[i] != key[i]) |
| break; |
| if (i == 0) |
| { |
| p = &n->sibling; |
| continue; |
| } |
| if (i == n->key_len) |
| { |
| assert (key_len > n->key_len); |
| p = &n->child; |
| key += n->key_len; |
| key_len -= n->key_len; |
| continue; |
| } |
| /* Need to split the node. */ |
| assert (i < key_len); |
| n = (struct node *) xmalloc (sizeof (struct node)); |
| ++num_nodes; |
| n->sibling = NULL; |
| n->child = (*p)->child; |
| n->key = (*p)->key + i; |
| n->key_len = (*p)->key_len - i; |
| n->codepoint = (*p)->codepoint; |
| (*p)->child = n; |
| (*p)->key_len = i; |
| (*p)->codepoint = NO_VALUE; |
| key += i; |
| key_len -= i; |
| p = &n->sibling; |
| } |
| while (1); |
| } |
| |
| static void |
| append_nodes (struct node *n) |
| { |
| for (; n; n = n->sibling) |
| { |
| nodes[num_nodes++] = n; |
| append_nodes (n->child); |
| } |
| } |
| |
| static size_t |
| sizeof_uleb128 (size_t val) |
| { |
| size_t sz = 0; |
| do |
| { |
| val >>= 7; |
| sz += 1; |
| } |
| while (val != 0); |
| return sz; |
| } |
| |
| static void |
| size_nodes (struct node *n) |
| { |
| if (n->child) |
| size_nodes (n->child); |
| if (n->sibling) |
| size_nodes (n->sibling); |
| n->node_size = 1 + (n->key_len > 1) * 2; |
| if (n->codepoint != NO_VALUE) |
| n->node_size += 3; |
| else if (n->sibling == NULL) |
| ++n->node_size; |
| n->size_sum = 0; |
| n->child_off = 0; |
| if (n->sibling) |
| n->size_sum += n->sibling->size_sum; |
| if (n->child) |
| { |
| n->child_off = n->size_sum + (n->codepoint == NO_VALUE |
| && n->sibling == NULL); |
| n->node_size += sizeof_uleb128 (n->child_off); |
| } |
| n->size_sum += n->node_size; |
| if (n->child) |
| n->size_sum += n->child->size_sum; |
| tree_size += n->node_size; |
| } |
| |
| static void |
| write_uleb128 (unsigned char *p, size_t val) |
| { |
| unsigned char c; |
| do |
| { |
| c = val & 0x7f; |
| val >>= 7; |
| if (val) |
| c |= 0x80; |
| *p++ = c; |
| } |
| while (val); |
| } |
| |
| static void |
| write_nodes (struct node *n, size_t off) |
| { |
| for (; n; n = n->sibling) |
| { |
| assert (off < tree_size && tree[off] == 0); |
| if (n->key_len > 1) |
| { |
| assert (n->key_len < 64); |
| tree[off] = n->key_len; |
| } |
| else |
| tree[off] = (n->key[0] - ' ') | 0x80; |
| assert ((tree[off] & 0x40) == 0); |
| if (n->codepoint != NO_VALUE) |
| tree[off] |= 0x40; |
| off++; |
| if (n->key_len > 1) |
| { |
| tree[off++] = n->key_idx & 0xff; |
| tree[off++] = (n->key_idx >> 8) & 0xff; |
| } |
| if (n->codepoint != NO_VALUE) |
| { |
| assert (n->codepoint < (1L << 21)); |
| tree[off++] = n->codepoint & 0xff; |
| tree[off++] = (n->codepoint >> 8) & 0xff; |
| tree[off] = (n->codepoint >> 16) & 0xff; |
| if (n->child) |
| tree[off] |= 0x80; |
| if (!n->sibling) |
| tree[off] |= 0x40; |
| off++; |
| } |
| if (n->child) |
| { |
| write_uleb128 (&tree[off], n->child_off); |
| off += sizeof_uleb128 (n->child_off); |
| write_nodes (n->child, off + n->child_off); |
| } |
| if (n->codepoint == NO_VALUE |
| && n->sibling == NULL) |
| tree[off++] = 0xff; |
| } |
| assert (off <= tree_size); |
| } |
| |
| static void |
| build_radix_tree (void) |
| { |
| size_t i, j, k, key_idx; |
| |
| for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) |
| if (generated_ranges[i].ok == INT_MAX) |
| { |
| if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL) |
| break; |
| } |
| else if (generated_ranges[i].ok == (generated_ranges[i].high |
| - generated_ranges[i].low + 1)) |
| { |
| if (generated_ranges[i].max_high != generated_ranges[i].high) |
| break; |
| } |
| else |
| break; |
| if (i < ARRAY_SIZE (generated_ranges)) |
| fail ("uncovered generated range %s %lx %lx", |
| generated_ranges[i].prefix, generated_ranges[i].low, |
| generated_ranges[i].high); |
| /* Sort entries alphabetically, node_add relies on that. */ |
| qsort (entries, num_entries, sizeof (struct entry), entrycmp); |
| for (i = 1; i < num_entries; ++i) |
| if (i && strcmp (entries[i].name, entries[i - 1].name) == 0) |
| fail ("multiple entries for name %s", entries[i].name); |
| |
| for (i = 0; i < num_entries; ++i) |
| node_add (&root, entries[i].name, strlen (entries[i].name), |
| entries[i].codepoint); |
| |
| nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *)); |
| i = num_nodes; |
| num_nodes = 0; |
| append_nodes (root); |
| assert (num_nodes == i); |
| /* Sort node pointers by decreasing string length to handle substrings |
| right. */ |
| qsort (nodes, num_nodes, sizeof (struct node *), nodecmp); |
| if (nodes[0]->key_len >= 64) |
| /* We could actually encode even 64 and 65, as key_len 0 and 1 will |
| never appear in the multiple letter key encodings, so could subtract |
| 2. */ |
| fail ("can't encode key length %d >= 64, so need to split some radix " |
| "tree nodes to ensure length fits", nodes[0]->key_len); |
| |
| /* Verify a property charset.cc UAX44-LM2 matching relies on: |
| if - is at the end of key of some node, then all its siblings |
| start with alphanumeric characters. |
| Only 2 character names and 1 alias have - followed by space: |
| U+0F0A TIBETAN MARK BKA- SHOG YIG MGO |
| U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN |
| U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN |
| so the KA- in there will always be followed at least by SHOG |
| in the same node. |
| If this changes, charset.cc needs to change. */ |
| for (i = 0; i < num_nodes; ++i) |
| if (nodes[i]->key[nodes[i]->key_len - 1] == '-' |
| && nodes[i]->child) |
| { |
| struct node *n; |
| |
| for (n = nodes[i]->child; n; n = n->sibling) |
| if (n->key[0] == ' ') |
| fail ("node with key %.*s followed by node with key %.*s", |
| (int) nodes[i]->key_len, nodes[i]->key, |
| (int) n->key_len, n->key); |
| } |
| |
| /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but |
| fortunately num_nodes is < 64K and key_len < 64. */ |
| key_idx = 0; |
| for (i = 0; i < num_nodes; ++i) |
| { |
| nodes[i]->key_idx = SIZE_MAX; |
| nodes[i]->in_dict = false; |
| if (nodes[i]->key_len > 1) |
| { |
| for (j = 0; j < i; ++j) |
| /* Can't rely on memmem unfortunately. */ |
| if (nodes[j]->in_dict) |
| { |
| for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k) |
| if (nodes[j]->key[k] == nodes[i]->key[0] |
| && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, |
| nodes[i]->key_len - 1) == 0) |
| { |
| nodes[i]->key_idx = nodes[j]->key_idx + k; |
| j = i; |
| break; |
| } |
| if (j == i) |
| break; |
| for (; k < nodes[j]->key_len; ++k) |
| if (nodes[j]->key[k] == nodes[i]->key[0] |
| && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, |
| nodes[j]->key_len - 1 - k) == 0) |
| { |
| size_t l; |
| |
| for (l = j + 1; l < i; ++l) |
| if (nodes[l]->in_dict) |
| break; |
| if (l < i |
| && memcmp (nodes[l]->key, |
| nodes[i]->key + (nodes[j]->key_len - k), |
| nodes[i]->key_len |
| - (nodes[j]->key_len - k)) == 0) |
| { |
| nodes[i]->key_idx = nodes[j]->key_idx + k; |
| j = i; |
| } |
| else |
| j = l - 1; |
| break; |
| } |
| } |
| if (nodes[i]->key_idx == SIZE_MAX) |
| { |
| nodes[i]->key_idx = key_idx; |
| nodes[i]->in_dict = true; |
| key_idx += nodes[i]->key_len; |
| } |
| } |
| } |
| if (key_idx >= 65536) |
| /* We only use 2 bytes for offsets into the dictionary. |
| If it grows more, there is e.g. a possibility to replace |
| most often seen words or substrings in the dictionary |
| with characters other than [A-Z0-9 -] (say LETTER occurs |
| in the dictionary almost 197 times and so by using a |
| instead of LETTER we could save (6 - 1) * 197 bytes, |
| with some on the side table mapping 'a' to "LETTER". */ |
| fail ("too large dictionary %ld", (long) key_idx); |
| dict_size = key_idx; |
| |
| size_nodes (root); |
| |
| dict = (char *) xmalloc (dict_size + 1); |
| for (i = 0; i < num_nodes; ++i) |
| if (nodes[i]->in_dict) |
| memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len); |
| dict[dict_size] = '\0'; |
| |
| tree = (unsigned char *) xmalloc (tree_size); |
| memset (tree, 0, tree_size); |
| write_nodes (root, 0); |
| } |
| |
| /* Print out the huge copyright notice. */ |
| |
| static void |
| write_copyright (void) |
| { |
| static const char copyright[] = "\ |
| /* Unicode name to codepoint.\n\ |
| Copyright (C) 2005-2022 Free Software Foundation, Inc.\n\ |
| \n\ |
| This program is free software; you can redistribute it and/or modify it\n\ |
| under the terms of the GNU General Public License as published by the\n\ |
| Free Software Foundation; either version 3, or (at your option) any\n\ |
| later version.\n\ |
| \n\ |
| This program is distributed in the hope that it will be useful,\n\ |
| but WITHOUT ANY WARRANTY; without even the implied warranty of\n\ |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\ |
| GNU General Public License for more details.\n\ |
| \n\ |
| You should have received a copy of the GNU General Public License\n\ |
| along with this program; see the file COPYING3. If not see\n\ |
| <http://www.gnu.org/licenses/>.\n\ |
| \n\ |
| \n\ |
| Copyright (C) 1991-2021 Unicode, Inc. All rights reserved.\n\ |
| Distributed under the Terms of Use in\n\ |
| http://www.unicode.org/copyright.html.\n\ |
| \n\ |
| Permission is hereby granted, free of charge, to any person\n\ |
| obtaining a copy of the Unicode data files and any associated\n\ |
| documentation (the \"Data Files\") or Unicode software and any\n\ |
| associated documentation (the \"Software\") to deal in the Data Files\n\ |
| or Software without restriction, including without limitation the\n\ |
| rights to use, copy, modify, merge, publish, distribute, and/or\n\ |
| sell copies of the Data Files or Software, and to permit persons to\n\ |
| whom the Data Files or Software are furnished to do so, provided\n\ |
| that (a) the above copyright notice(s) and this permission notice\n\ |
| appear with all copies of the Data Files or Software, (b) both the\n\ |
| above copyright notice(s) and this permission notice appear in\n\ |
| associated documentation, and (c) there is clear notice in each\n\ |
| modified Data File or in the Software as well as in the\n\ |
| documentation associated with the Data File(s) or Software that the\n\ |
| data or software has been modified.\n\ |
| \n\ |
| THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\ |
| OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\ |
| WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\ |
| NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\ |
| COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\ |
| ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\ |
| DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\ |
| WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\ |
| ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\ |
| OF THE DATA FILES OR SOFTWARE.\n\ |
| \n\ |
| Except as contained in this notice, the name of a copyright holder\n\ |
| shall not be used in advertising or otherwise to promote the sale,\n\ |
| use or other dealings in these Data Files or Software without prior\n\ |
| written authorization of the copyright holder. */\n"; |
| |
| puts (copyright); |
| } |
| |
| static void |
| write_dict (void) |
| { |
| size_t i; |
| |
| printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1)); |
| for (i = 0; i < dict_size; i += 77) |
| printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : ""); |
| puts (""); |
| } |
| |
| static void |
| write_tree (void) |
| { |
| size_t i, j; |
| |
| printf ("static const unsigned char uname2c_tree[%ld] = {\n", |
| (long) tree_size); |
| for (i = 0, j = 0; i < tree_size; ++i) |
| { |
| printf ("%s0x%02x%s", j == 0 ? " " : "", tree[i], |
| i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", "); |
| if (j == 11) |
| j = 0; |
| else |
| ++j; |
| } |
| } |
| |
| static void |
| write_generated (void) |
| { |
| size_t i, j; |
| |
| puts ("static const cppchar_t uname2c_pairs[] = {"); |
| for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) |
| { |
| if (i == 0) |
| ; |
| else if (generated_ranges[i - 1].idx != generated_ranges[i].idx) |
| puts (", 0,"); |
| else |
| puts (","); |
| printf (" 0x%lx, 0x%lx /* %s */", |
| generated_ranges[i].low, |
| generated_ranges[i].high, |
| generated_ranges[i].prefix); |
| } |
| puts (", 0 };\n"); |
| |
| puts ("static const unsigned char uname2c_generated[] = {"); |
| for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i) |
| { |
| if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx) |
| printf ("%s %d /* %s */", i ? ",\n" : "", |
| ++j, generated_ranges[i].prefix); |
| j += 2; |
| } |
| puts (" };\n"); |
| } |
| |
| /* Main program. */ |
| |
| int |
| main (int argc, char **argv) |
| { |
| size_t i; |
| |
| if (argc != 3) |
| fail ("too few arguments to makeradixtree"); |
| for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) |
| if (!generated_ranges[i].max_high) |
| generated_ranges[i].max_high = generated_ranges[i].high; |
| read_table (argv[1], false); |
| read_table (argv[2], true); |
| build_radix_tree (); |
| |
| write_copyright (); |
| write_dict (); |
| write_tree (); |
| write_generated (); |
| printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len); |
| return 0; |
| } |