| #include <config.h> |
| |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <stdint.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include "uninorm.h" |
| |
| #include "allkeys_bin.h" |
| |
| typedef struct |
| { |
| uint8_t num_elements; |
| CollationElement elements[MAX_COLLATION_ELEMENTS]; |
| } CollationData; |
| |
| typedef struct |
| { |
| uint8_t offset; |
| CollationData *data; |
| } PageEntry; |
| |
| typedef struct |
| { |
| uint16_t count; |
| PageEntry *entries; |
| } Page; |
| |
| typedef struct TrieNode |
| { |
| char32_t codepoint; |
| CollationData *data; |
| struct TrieNode **children; |
| uint16_t num_children; |
| } TrieNode; |
| |
| typedef struct |
| { |
| Page *pages[NUM_PAGES]; |
| TrieNode *trie_root; |
| uint16_t max_variable_weight; |
| uint32_t num_singles; |
| uint32_t num_sequences; |
| uint32_t version; |
| } Database; |
| |
| typedef struct |
| { |
| CollationElement element; |
| bool variable_weight; |
| } CollationElementParsed; |
| |
| /* Dynamic byte buffer for building binary data */ |
| typedef struct |
| { |
| uint8_t *data; |
| size_t size; |
| size_t capacity; |
| } ByteBuffer; |
| |
| static ByteBuffer * |
| buffer_create (void) |
| { |
| ByteBuffer *buf = calloc (1, sizeof (ByteBuffer)); |
| buf->capacity = 1024 * 1024; |
| buf->data = malloc (buf->capacity); |
| return buf; |
| } |
| |
| static void |
| buffer_ensure_space (ByteBuffer *buf, size_t needed) |
| { |
| while (buf->size + needed > buf->capacity) |
| { |
| buf->capacity *= 2; |
| buf->data = realloc (buf->data, buf->capacity); |
| } |
| } |
| |
| static uint32_t |
| buffer_write_bytes (ByteBuffer *buf, const void *data, size_t len) |
| { |
| buffer_ensure_space (buf, len); |
| uint32_t offset = buf->size; |
| memcpy (buf->data + buf->size, data, len); |
| buf->size += len; |
| return offset; |
| } |
| |
| static uint32_t |
| buffer_write_u8 (ByteBuffer *buf, uint8_t val) |
| { |
| return buffer_write_bytes (buf, &val, 1); |
| } |
| |
| static void |
| buffer_write_u8_at (ByteBuffer *buf, uint32_t offset, uint8_t val) |
| { |
| memcpy (buf->data + offset, &val, 1); |
| } |
| |
| static uint32_t |
| buffer_write_u16 (ByteBuffer *buf, uint16_t val) |
| { |
| return buffer_write_bytes (buf, &val, 2); |
| } |
| |
| static uint32_t |
| buffer_write_u32 (ByteBuffer *buf, uint32_t val) |
| { |
| return buffer_write_bytes (buf, &val, 4); |
| } |
| |
| static void |
| buffer_write_u32_at (ByteBuffer *buf, uint32_t offset, uint32_t val) |
| { |
| memcpy (buf->data + offset, &val, 4); |
| } |
| |
| /* Parser functions */ |
| static int |
| parse_hex (const char *str, uint32_t *value) |
| { |
| char *endptr; |
| unsigned long val = strtoul (str, &endptr, 16); |
| if (endptr == str || val > 0x10FFFF) |
| return 0; |
| *value = (uint32_t) val; |
| return 1; |
| } |
| |
| static int |
| parse_collation_element (const char **str, CollationElementParsed *elem) |
| { |
| const char *s = *str; |
| while (*s && isspace (*s)) |
| s++; |
| if (*s != '[') |
| return 0; |
| s++; |
| |
| if (*s == '*') |
| { |
| elem->variable_weight = 1; |
| s++; |
| } |
| else if (*s == '.') |
| { |
| elem->variable_weight = 0; |
| s++; |
| } |
| else |
| { |
| return 0; |
| } |
| |
| char hex[5] = { 0 }; |
| for (int i = 0; i < 4 && isxdigit (*s); i++, s++) |
| hex[i] = *s; |
| elem->element.primary = (uint16_t) strtoul (hex, NULL, 16); |
| if (*s != '.') |
| return 0; |
| s++; |
| |
| memset (hex, 0, sizeof (hex)); |
| for (int i = 0; i < 4 && isxdigit (*s); i++, s++) |
| hex[i] = *s; |
| elem->element.secondary = (uint16_t) strtoul (hex, NULL, 16); |
| if (*s != '.') |
| return 0; |
| s++; |
| |
| memset (hex, 0, sizeof (hex)); |
| for (int i = 0; i < 4 && isxdigit (*s); i++, s++) |
| hex[i] = *s; |
| elem->element.tertiary = (uint8_t) strtoul (hex, NULL, 16); |
| if (*s != ']') |
| return 0; |
| s++; |
| |
| *str = s; |
| return 1; |
| } |
| |
| static uint32_t |
| parse_version (const char *line) |
| { |
| unsigned int major = 0, minor = 0, patch = 0; |
| if (sscanf (line, "@version %u.%u.%u", &major, &minor, &patch) == 3) |
| { |
| return major * 10000 + minor * 100 + patch; |
| } |
| return 0; |
| } |
| |
| /* Used to skip decomposable sequences. */ |
| int |
| check_codepoint_nondecomposable (char32_t codepoint) |
| { |
| size_t length; |
| char32_t *normalized; |
| |
| normalized = |
| u32_normalize (UNINORM_NFD, &codepoint, 1, NULL, &length); |
| |
| free (normalized); |
| |
| if (length > 1) |
| return 0; |
| return 1; |
| } |
| |
| /* Build database from allkeys.txt */ |
| static Database * |
| build_database (const char *filename) |
| { |
| FILE *fp = fopen (filename, "r"); |
| if (!fp) |
| { |
| perror ("Failed to open input file"); |
| return NULL; |
| } |
| |
| Database *db = calloc (1, sizeof (Database)); |
| db->trie_root = calloc (1, sizeof (TrieNode)); |
| |
| char line[4096]; |
| size_t line_num = 0; |
| |
| printf ("Parsing %s...\n", filename); |
| |
| while (fgets (line, sizeof (line), fp)) |
| { |
| line_num++; |
| const char *p = line; |
| |
| while (*p && isspace (*p)) |
| p++; |
| if (*p == '#' || *p == '\0') |
| continue; |
| |
| if (*p == '@') |
| { |
| if (strncmp (line, "@version", 8) == 0) |
| { |
| db->version = parse_version (line); |
| } |
| continue; |
| } |
| |
| /* Parse codepoints */ |
| char32_t codepoints[MAX_SEQUENCE_LENGTH]; |
| size_t num_codepoints = 0; |
| |
| while (*p && isxdigit (*p)) |
| { |
| char hex[7] = { 0 }; |
| int i = 0; |
| while (isxdigit (*p) && i < 6) |
| hex[i++] = *p++; |
| if (!parse_hex (hex, &codepoints[num_codepoints])) |
| break; |
| num_codepoints++; |
| if (num_codepoints >= MAX_SEQUENCE_LENGTH) |
| break; |
| while (*p && isspace (*p) && *p != ';') |
| p++; |
| } |
| |
| if (num_codepoints == 0) |
| continue; |
| |
| if (num_codepoints == 1 |
| && !check_codepoint_nondecomposable (codepoints[0])) |
| { |
| continue; |
| } |
| |
| while (*p && *p != ';') |
| p++; |
| if (*p != ';') |
| continue; |
| p++; |
| |
| /* Parse collation elements */ |
| CollationData *data = calloc (1, sizeof (CollationData)); |
| while (*p && *p != '#') |
| { |
| if (*p == '[') |
| { |
| CollationElementParsed elem; |
| if (parse_collation_element (&p, &elem)) |
| { |
| if (data->num_elements < MAX_COLLATION_ELEMENTS) |
| { |
| data->elements[data->num_elements++] = elem.element; |
| } |
| else |
| { |
| printf |
| ("parse error: maximum collation element sequence length exceeded\n"); |
| } |
| if (elem.variable_weight) |
| { |
| if (elem.element.primary > db->max_variable_weight) |
| db->max_variable_weight = elem.element.primary; |
| } |
| } |
| } |
| else |
| { |
| p++; |
| } |
| } |
| |
| if (data->num_elements == 0) |
| { |
| free (data); |
| continue; |
| } |
| |
| /* Insert into database. */ |
| if (num_codepoints == 1) |
| { |
| uint32_t page_num = codepoints[0] >> 8; |
| uint8_t offset = codepoints[0] & 0xFF; |
| |
| if (!db->pages[page_num]) |
| { |
| db->pages[page_num] = calloc (1, sizeof (Page)); |
| } |
| |
| Page *page = db->pages[page_num]; |
| page->entries = |
| realloc (page->entries, (page->count + 1) * sizeof (PageEntry)); |
| page->entries[page->count].offset = offset; |
| page->entries[page->count].data = data; |
| page->count++; |
| db->num_singles++; |
| } |
| else |
| { |
| /* Insert into trie. */ |
| TrieNode *node = db->trie_root; |
| for (size_t i = 0; i < num_codepoints; i++) |
| { |
| TrieNode *child = NULL; |
| for (uint16_t j = 0; j < node->num_children; j++) |
| { |
| if (node->children[j]->codepoint == codepoints[i]) |
| { |
| child = node->children[j]; |
| break; |
| } |
| } |
| if (!child) |
| { |
| child = calloc (1, sizeof (TrieNode)); |
| child->codepoint = codepoints[i]; |
| node->children = |
| realloc (node->children, |
| (node->num_children + 1) * sizeof (TrieNode *)); |
| node->children[node->num_children++] = child; |
| } |
| node = child; |
| } |
| node->data = data; |
| db->num_sequences++; |
| } |
| |
| if (line_num % 5000 == 0) |
| { |
| printf (" Processed %zu lines...\r", line_num); |
| fflush (stdout); |
| } |
| } |
| |
| fclose (fp); |
| printf ("\nParsing complete: %u singles, %u sequences\n", |
| db->num_singles, db->num_sequences); |
| |
| return db; |
| } |
| |
| /* Write collation data and return its offset */ |
| static uint32_t |
| write_collation_data (ByteBuffer *buf, CollationData *data) |
| { |
| uint8_t num_elements = data->num_elements; |
| uint32_t offset = buffer_write_u8 (buf, num_elements); |
| for (int i = 0; i < data->num_elements; i++) |
| { |
| buffer_write_u16 (buf, data->elements[i].primary); |
| uint16_t secondary = data->elements[i].secondary; |
| uint8_t secondary_write; |
| |
| /* Fit secondary weight in a single byte for 255 possible |
| secondary weights (0x0000 and 0x0020 - 0x011D) */ |
| |
| if (secondary == 0x00) |
| secondary_write = secondary; |
| else if (secondary <= 0x011D) |
| secondary_write = secondary - 0x1f; |
| else if (secondary <= 0x0127) |
| { |
| /* For larger weights, output an extra collation element. */ |
| secondary_write = 0xff; |
| buffer_write_u8 (buf, secondary_write); |
| buffer_write_u8 (buf, data->elements[i].tertiary); |
| |
| buffer_write_u8_at (buf, offset, ++num_elements); |
| |
| buffer_write_u16 (buf, 0x0000); |
| buffer_write_u8 (buf, secondary - 0x0100); |
| buffer_write_u8 (buf, 0x00); |
| |
| continue; |
| } |
| else |
| { |
| fprintf (stderr, "secondary weight too big: %4x\n", secondary); |
| exit (1); |
| } |
| |
| buffer_write_u8 (buf, secondary_write); |
| buffer_write_u8 (buf, data->elements[i].tertiary); |
| } |
| return offset; |
| } |
| |
| /* Write trie recursively and return its offset */ |
| static uint32_t |
| write_trie_node (ByteBuffer *buf, TrieNode *node) |
| { |
| uint32_t offset = buf->size; |
| |
| buffer_write_u32 (buf, node->codepoint); |
| |
| /* Reserve space for codepoint data. */ |
| uint32_t data_offset_pos = buf->size; |
| buffer_write_u32 (buf, 0); |
| |
| buffer_write_u16 (buf, node->num_children); |
| |
| /* Reserve space for child offsets */ |
| uint32_t children_offset_pos = buf->size; |
| for (uint16_t i = 0; i < node->num_children; i++) |
| { |
| buffer_write_u32 (buf, 0); /* Placeholder */ |
| } |
| |
| /* Write children and update offsets */ |
| for (uint16_t i = 0; i < node->num_children; i++) |
| { |
| uint32_t child_offset = write_trie_node (buf, node->children[i]); |
| buffer_write_u32_at (buf, children_offset_pos + i * 4, child_offset); |
| } |
| |
| /* Write collation data and update offset. */ |
| if (node->data) |
| { |
| uint32_t data_offset = write_collation_data (buf, node->data); |
| buffer_write_u32_at (buf, data_offset_pos, data_offset); |
| } |
| |
| return offset; |
| } |
| |
| /* Compare function for sorting page entries by offset */ |
| static int |
| compare_page_entries (const void *a, const void *b) |
| { |
| const PageEntry *ea = (const PageEntry *) a; |
| const PageEntry *eb = (const PageEntry *) b; |
| return (int) ea->offset - (int) eb->offset; |
| } |
| |
| /* Convert database to binary format */ |
| static ByteBuffer * |
| serialize_database (Database *db) |
| { |
| ByteBuffer *buf = buffer_create (); |
| |
| printf ("\nSerializing to binary format...\n"); |
| |
| // Sort all pages by offset to enable binary search |
| printf ("Sorting page entries...\n"); |
| for (uint32_t i = 0; i < NUM_PAGES; i++) |
| { |
| if (db->pages[i] && db->pages[i]->count > 0) |
| { |
| qsort (db->pages[i]->entries, db->pages[i]->count, |
| sizeof (PageEntry), compare_page_entries); |
| } |
| } |
| |
| /* Write header, leaving some placeholders to be filled in later. */ |
| buffer_write_bytes (buf, "UCADATA1", 8); |
| buffer_write_u32 (buf, db->version); |
| buffer_write_u16 (buf, db->max_variable_weight); |
| buffer_write_u32 (buf, db->num_singles); |
| buffer_write_u32 (buf, db->num_sequences); |
| uint32_t page_table_offset_pos = buf->size; |
| buffer_write_u32 (buf, 0); /* Placeholder for page_table_offset */ |
| uint32_t trie_offset_pos = buf->size; |
| buffer_write_u32 (buf, 0); /* Placeholder for trie_offset */ |
| |
| /* Write page table. */ |
| uint32_t page_table_offset = buf->size; |
| buffer_write_u32_at (buf, page_table_offset_pos, page_table_offset); |
| |
| uint32_t page_offset_positions[NUM_PAGES]; |
| for (uint32_t i = 0; i < NUM_PAGES; i++) |
| { |
| page_offset_positions[i] = buf->size; |
| buffer_write_u32 (buf, 0); // Placeholder |
| } |
| |
| /* Write page data (entries only, no collation data yet). |
| We'll track where to write collation data offsets. */ |
| typedef struct |
| { |
| uint32_t offset_position; /* Where to write the data_offset. */ |
| CollationData *data; /* The data to write later. */ |
| } PendingData; |
| |
| PendingData *pending = malloc (db->num_singles * sizeof (PendingData)); |
| uint32_t pending_count = 0; |
| |
| for (uint32_t i = 0; i < NUM_PAGES; i++) |
| { |
| if (!db->pages[i]) |
| continue; |
| |
| Page *page = db->pages[i]; |
| uint32_t page_offset = buf->size; |
| buffer_write_u32_at (buf, page_offset_positions[i], page_offset); |
| |
| buffer_write_u16 (buf, page->count); |
| |
| /* Write entries with placeholder data offsets. */ |
| for (uint16_t j = 0; j < page->count; j++) |
| { |
| buffer_write_u8 (buf, page->entries[j].offset); |
| |
| /* Remember where we need to write the data offset. */ |
| pending[pending_count].offset_position = buf->size; |
| pending[pending_count].data = page->entries[j].data; |
| pending_count++; |
| |
| buffer_write_u32 (buf, 0); /* Placeholder for data_offset. */ |
| } |
| } |
| |
| /* Now write all collation data and backfill offsets. */ |
| for (uint32_t i = 0; i < pending_count; i++) |
| { |
| uint32_t data_offset = write_collation_data (buf, pending[i].data); |
| buffer_write_u32_at (buf, pending[i].offset_position, data_offset); |
| } |
| |
| free (pending); |
| |
| /* Write trie. */ |
| uint32_t trie_offset = write_trie_node (buf, db->trie_root); |
| buffer_write_u32_at (buf, trie_offset_pos, trie_offset); |
| |
| printf ("Binary size: %zu bytes (%.2f MB)\n", buf->size, |
| buf->size / 1e6); |
| |
| return buf; |
| } |
| |
| /* Write as C source file */ |
| static void |
| write_c_source (ByteBuffer *buf, const char *output_file) |
| { |
| FILE *fp = fopen (output_file, "w"); |
| if (!fp) |
| { |
| perror ("Failed to open output file"); |
| return; |
| } |
| |
| printf ("Writing C source file: %s\n", output_file); |
| |
| fprintf (fp, "/* DO NOT EDIT:\n"); |
| fprintf (fp, " * Automatically generated from allkeys.txt\n"); |
| fprintf (fp, " */\n\n"); |
| |
| fprintf (fp, "#include <stdint.h>\n\n"); |
| |
| fprintf (fp, "static const size_t collation_data_size = %zuU;\n\n", buf->size); |
| |
| fprintf (fp, "static const uint8_t collation_data[] = {\n"); |
| |
| for (size_t i = 0; i < buf->size; i++) |
| { |
| if (i % 16 == 0) |
| { |
| if (i > 0) |
| fprintf (fp, "\n"); |
| fprintf (fp, " "); |
| } |
| fprintf (fp, "0x%02X", buf->data[i]); |
| if (i < buf->size - 1) |
| fprintf (fp, ","); |
| if (i % 16 != 15 && i < buf->size - 1) |
| fprintf (fp, " "); |
| } |
| |
| fprintf (fp, "\n};\n"); |
| |
| fclose (fp); |
| } |
| |
| /* Write as binary file */ |
| static void |
| write_binary_file (ByteBuffer *buf, const char *output_file) |
| { |
| FILE *fp = fopen (output_file, "wb"); |
| if (!fp) |
| { |
| perror ("Failed to open output file"); |
| return; |
| } |
| |
| printf ("Writing binary file: %s\n", output_file); |
| |
| fwrite (buf->data, 1, buf->size, fp); |
| fclose (fp); |
| } |
| |
| int |
| main (int argc, char *argv[]) |
| { |
| if (argc < 2 || argc > 4) |
| { |
| fprintf (stderr, "Usage: %s <allkeys.txt> [output.bin] [output.c]\n", |
| argv[0]); |
| return 1; |
| } |
| |
| const char *input_file = argv[1]; |
| const char *binary_file = argc >= 3 ? argv[2] : "allkeys.bin"; |
| const char *c_file = argc >= 4 ? argv[3] : NULL; |
| |
| Database *db = build_database (input_file); |
| if (!db) |
| return 1; |
| |
| ByteBuffer *buf = serialize_database (db); |
| |
| /* Always write binary file. */ |
| write_binary_file (buf, binary_file); |
| |
| /* Optionally write C source. */ |
| if (c_file) |
| { |
| write_c_source (buf, c_file); |
| } |
| |
| return 0; |
| } |