blob: 300758c583aa7f9c0aa383db2464803cd5b390eb [file] [log] [blame]
#include <config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <ctype.h>
#include "uninorm.h"
#include "allkeys_bin.h"
typedef struct
{
uint8_t num_elements;
CollationElement elements[MAX_COLLATION_ELEMENTS];
} CollationData;
typedef struct
{
uint8_t offset;
CollationData *data;
} PageEntry;
typedef struct
{
uint16_t count;
PageEntry *entries;
} Page;
typedef struct TrieNode
{
char32_t codepoint;
CollationData *data;
struct TrieNode **children;
uint16_t num_children;
} TrieNode;
typedef struct
{
Page *pages[NUM_PAGES];
TrieNode *trie_root;
uint16_t max_variable_weight;
uint32_t num_singles;
uint32_t num_sequences;
uint32_t version;
} Database;
typedef struct
{
CollationElement element;
bool variable_weight;
} CollationElementParsed;
/* Dynamic byte buffer for building binary data */
typedef struct
{
uint8_t *data;
size_t size;
size_t capacity;
} ByteBuffer;
static ByteBuffer *
buffer_create (void)
{
ByteBuffer *buf = calloc (1, sizeof (ByteBuffer));
buf->capacity = 1024 * 1024;
buf->data = malloc (buf->capacity);
return buf;
}
static void
buffer_ensure_space (ByteBuffer *buf, size_t needed)
{
while (buf->size + needed > buf->capacity)
{
buf->capacity *= 2;
buf->data = realloc (buf->data, buf->capacity);
}
}
static uint32_t
buffer_write_bytes (ByteBuffer *buf, const void *data, size_t len)
{
buffer_ensure_space (buf, len);
uint32_t offset = buf->size;
memcpy (buf->data + buf->size, data, len);
buf->size += len;
return offset;
}
static uint32_t
buffer_write_u8 (ByteBuffer *buf, uint8_t val)
{
return buffer_write_bytes (buf, &val, 1);
}
static void
buffer_write_u8_at (ByteBuffer *buf, uint32_t offset, uint8_t val)
{
memcpy (buf->data + offset, &val, 1);
}
static uint32_t
buffer_write_u16 (ByteBuffer *buf, uint16_t val)
{
return buffer_write_bytes (buf, &val, 2);
}
static uint32_t
buffer_write_u32 (ByteBuffer *buf, uint32_t val)
{
return buffer_write_bytes (buf, &val, 4);
}
static void
buffer_write_u32_at (ByteBuffer *buf, uint32_t offset, uint32_t val)
{
memcpy (buf->data + offset, &val, 4);
}
/* Parser functions */
static int
parse_hex (const char *str, uint32_t *value)
{
char *endptr;
unsigned long val = strtoul (str, &endptr, 16);
if (endptr == str || val > 0x10FFFF)
return 0;
*value = (uint32_t) val;
return 1;
}
static int
parse_collation_element (const char **str, CollationElementParsed *elem)
{
const char *s = *str;
while (*s && isspace (*s))
s++;
if (*s != '[')
return 0;
s++;
if (*s == '*')
{
elem->variable_weight = 1;
s++;
}
else if (*s == '.')
{
elem->variable_weight = 0;
s++;
}
else
{
return 0;
}
char hex[5] = { 0 };
for (int i = 0; i < 4 && isxdigit (*s); i++, s++)
hex[i] = *s;
elem->element.primary = (uint16_t) strtoul (hex, NULL, 16);
if (*s != '.')
return 0;
s++;
memset (hex, 0, sizeof (hex));
for (int i = 0; i < 4 && isxdigit (*s); i++, s++)
hex[i] = *s;
elem->element.secondary = (uint16_t) strtoul (hex, NULL, 16);
if (*s != '.')
return 0;
s++;
memset (hex, 0, sizeof (hex));
for (int i = 0; i < 4 && isxdigit (*s); i++, s++)
hex[i] = *s;
elem->element.tertiary = (uint8_t) strtoul (hex, NULL, 16);
if (*s != ']')
return 0;
s++;
*str = s;
return 1;
}
static uint32_t
parse_version (const char *line)
{
unsigned int major = 0, minor = 0, patch = 0;
if (sscanf (line, "@version %u.%u.%u", &major, &minor, &patch) == 3)
{
return major * 10000 + minor * 100 + patch;
}
return 0;
}
/* Used to skip decomposable sequences. */
int
check_codepoint_nondecomposable (char32_t codepoint)
{
size_t length;
char32_t *normalized;
normalized =
u32_normalize (UNINORM_NFD, &codepoint, 1, NULL, &length);
free (normalized);
if (length > 1)
return 0;
return 1;
}
/* Build database from allkeys.txt */
static Database *
build_database (const char *filename)
{
FILE *fp = fopen (filename, "r");
if (!fp)
{
perror ("Failed to open input file");
return NULL;
}
Database *db = calloc (1, sizeof (Database));
db->trie_root = calloc (1, sizeof (TrieNode));
char line[4096];
size_t line_num = 0;
printf ("Parsing %s...\n", filename);
while (fgets (line, sizeof (line), fp))
{
line_num++;
const char *p = line;
while (*p && isspace (*p))
p++;
if (*p == '#' || *p == '\0')
continue;
if (*p == '@')
{
if (strncmp (line, "@version", 8) == 0)
{
db->version = parse_version (line);
}
continue;
}
/* Parse codepoints */
char32_t codepoints[MAX_SEQUENCE_LENGTH];
size_t num_codepoints = 0;
while (*p && isxdigit (*p))
{
char hex[7] = { 0 };
int i = 0;
while (isxdigit (*p) && i < 6)
hex[i++] = *p++;
if (!parse_hex (hex, &codepoints[num_codepoints]))
break;
num_codepoints++;
if (num_codepoints >= MAX_SEQUENCE_LENGTH)
break;
while (*p && isspace (*p) && *p != ';')
p++;
}
if (num_codepoints == 0)
continue;
if (num_codepoints == 1
&& !check_codepoint_nondecomposable (codepoints[0]))
{
continue;
}
while (*p && *p != ';')
p++;
if (*p != ';')
continue;
p++;
/* Parse collation elements */
CollationData *data = calloc (1, sizeof (CollationData));
while (*p && *p != '#')
{
if (*p == '[')
{
CollationElementParsed elem;
if (parse_collation_element (&p, &elem))
{
if (data->num_elements < MAX_COLLATION_ELEMENTS)
{
data->elements[data->num_elements++] = elem.element;
}
else
{
printf
("parse error: maximum collation element sequence length exceeded\n");
}
if (elem.variable_weight)
{
if (elem.element.primary > db->max_variable_weight)
db->max_variable_weight = elem.element.primary;
}
}
}
else
{
p++;
}
}
if (data->num_elements == 0)
{
free (data);
continue;
}
/* Insert into database. */
if (num_codepoints == 1)
{
uint32_t page_num = codepoints[0] >> 8;
uint8_t offset = codepoints[0] & 0xFF;
if (!db->pages[page_num])
{
db->pages[page_num] = calloc (1, sizeof (Page));
}
Page *page = db->pages[page_num];
page->entries =
realloc (page->entries, (page->count + 1) * sizeof (PageEntry));
page->entries[page->count].offset = offset;
page->entries[page->count].data = data;
page->count++;
db->num_singles++;
}
else
{
/* Insert into trie. */
TrieNode *node = db->trie_root;
for (size_t i = 0; i < num_codepoints; i++)
{
TrieNode *child = NULL;
for (uint16_t j = 0; j < node->num_children; j++)
{
if (node->children[j]->codepoint == codepoints[i])
{
child = node->children[j];
break;
}
}
if (!child)
{
child = calloc (1, sizeof (TrieNode));
child->codepoint = codepoints[i];
node->children =
realloc (node->children,
(node->num_children + 1) * sizeof (TrieNode *));
node->children[node->num_children++] = child;
}
node = child;
}
node->data = data;
db->num_sequences++;
}
if (line_num % 5000 == 0)
{
printf (" Processed %zu lines...\r", line_num);
fflush (stdout);
}
}
fclose (fp);
printf ("\nParsing complete: %u singles, %u sequences\n",
db->num_singles, db->num_sequences);
return db;
}
/* Write collation data and return its offset */
static uint32_t
write_collation_data (ByteBuffer *buf, CollationData *data)
{
uint8_t num_elements = data->num_elements;
uint32_t offset = buffer_write_u8 (buf, num_elements);
for (int i = 0; i < data->num_elements; i++)
{
buffer_write_u16 (buf, data->elements[i].primary);
uint16_t secondary = data->elements[i].secondary;
uint8_t secondary_write;
/* Fit secondary weight in a single byte for 255 possible
secondary weights (0x0000 and 0x0020 - 0x011D) */
if (secondary == 0x00)
secondary_write = secondary;
else if (secondary <= 0x011D)
secondary_write = secondary - 0x1f;
else if (secondary <= 0x0127)
{
/* For larger weights, output an extra collation element. */
secondary_write = 0xff;
buffer_write_u8 (buf, secondary_write);
buffer_write_u8 (buf, data->elements[i].tertiary);
buffer_write_u8_at (buf, offset, ++num_elements);
buffer_write_u16 (buf, 0x0000);
buffer_write_u8 (buf, secondary - 0x0100);
buffer_write_u8 (buf, 0x00);
continue;
}
else
{
fprintf (stderr, "secondary weight too big: %4x\n", secondary);
exit (1);
}
buffer_write_u8 (buf, secondary_write);
buffer_write_u8 (buf, data->elements[i].tertiary);
}
return offset;
}
/* Write trie recursively and return its offset */
static uint32_t
write_trie_node (ByteBuffer *buf, TrieNode *node)
{
uint32_t offset = buf->size;
buffer_write_u32 (buf, node->codepoint);
/* Reserve space for codepoint data. */
uint32_t data_offset_pos = buf->size;
buffer_write_u32 (buf, 0);
buffer_write_u16 (buf, node->num_children);
/* Reserve space for child offsets */
uint32_t children_offset_pos = buf->size;
for (uint16_t i = 0; i < node->num_children; i++)
{
buffer_write_u32 (buf, 0); /* Placeholder */
}
/* Write children and update offsets */
for (uint16_t i = 0; i < node->num_children; i++)
{
uint32_t child_offset = write_trie_node (buf, node->children[i]);
buffer_write_u32_at (buf, children_offset_pos + i * 4, child_offset);
}
/* Write collation data and update offset. */
if (node->data)
{
uint32_t data_offset = write_collation_data (buf, node->data);
buffer_write_u32_at (buf, data_offset_pos, data_offset);
}
return offset;
}
/* Compare function for sorting page entries by offset */
static int
compare_page_entries (const void *a, const void *b)
{
const PageEntry *ea = (const PageEntry *) a;
const PageEntry *eb = (const PageEntry *) b;
return (int) ea->offset - (int) eb->offset;
}
/* Convert database to binary format */
static ByteBuffer *
serialize_database (Database *db)
{
ByteBuffer *buf = buffer_create ();
printf ("\nSerializing to binary format...\n");
// Sort all pages by offset to enable binary search
printf ("Sorting page entries...\n");
for (uint32_t i = 0; i < NUM_PAGES; i++)
{
if (db->pages[i] && db->pages[i]->count > 0)
{
qsort (db->pages[i]->entries, db->pages[i]->count,
sizeof (PageEntry), compare_page_entries);
}
}
/* Write header, leaving some placeholders to be filled in later. */
buffer_write_bytes (buf, "UCADATA1", 8);
buffer_write_u32 (buf, db->version);
buffer_write_u16 (buf, db->max_variable_weight);
buffer_write_u32 (buf, db->num_singles);
buffer_write_u32 (buf, db->num_sequences);
uint32_t page_table_offset_pos = buf->size;
buffer_write_u32 (buf, 0); /* Placeholder for page_table_offset */
uint32_t trie_offset_pos = buf->size;
buffer_write_u32 (buf, 0); /* Placeholder for trie_offset */
/* Write page table. */
uint32_t page_table_offset = buf->size;
buffer_write_u32_at (buf, page_table_offset_pos, page_table_offset);
uint32_t page_offset_positions[NUM_PAGES];
for (uint32_t i = 0; i < NUM_PAGES; i++)
{
page_offset_positions[i] = buf->size;
buffer_write_u32 (buf, 0); // Placeholder
}
/* Write page data (entries only, no collation data yet).
We'll track where to write collation data offsets. */
typedef struct
{
uint32_t offset_position; /* Where to write the data_offset. */
CollationData *data; /* The data to write later. */
} PendingData;
PendingData *pending = malloc (db->num_singles * sizeof (PendingData));
uint32_t pending_count = 0;
for (uint32_t i = 0; i < NUM_PAGES; i++)
{
if (!db->pages[i])
continue;
Page *page = db->pages[i];
uint32_t page_offset = buf->size;
buffer_write_u32_at (buf, page_offset_positions[i], page_offset);
buffer_write_u16 (buf, page->count);
/* Write entries with placeholder data offsets. */
for (uint16_t j = 0; j < page->count; j++)
{
buffer_write_u8 (buf, page->entries[j].offset);
/* Remember where we need to write the data offset. */
pending[pending_count].offset_position = buf->size;
pending[pending_count].data = page->entries[j].data;
pending_count++;
buffer_write_u32 (buf, 0); /* Placeholder for data_offset. */
}
}
/* Now write all collation data and backfill offsets. */
for (uint32_t i = 0; i < pending_count; i++)
{
uint32_t data_offset = write_collation_data (buf, pending[i].data);
buffer_write_u32_at (buf, pending[i].offset_position, data_offset);
}
free (pending);
/* Write trie. */
uint32_t trie_offset = write_trie_node (buf, db->trie_root);
buffer_write_u32_at (buf, trie_offset_pos, trie_offset);
printf ("Binary size: %zu bytes (%.2f MB)\n", buf->size,
buf->size / 1e6);
return buf;
}
/* Write as C source file */
static void
write_c_source (ByteBuffer *buf, const char *output_file)
{
FILE *fp = fopen (output_file, "w");
if (!fp)
{
perror ("Failed to open output file");
return;
}
printf ("Writing C source file: %s\n", output_file);
fprintf (fp, "/* DO NOT EDIT:\n");
fprintf (fp, " * Automatically generated from allkeys.txt\n");
fprintf (fp, " */\n\n");
fprintf (fp, "#include <stdint.h>\n\n");
fprintf (fp, "static const size_t collation_data_size = %zuU;\n\n", buf->size);
fprintf (fp, "static const uint8_t collation_data[] = {\n");
for (size_t i = 0; i < buf->size; i++)
{
if (i % 16 == 0)
{
if (i > 0)
fprintf (fp, "\n");
fprintf (fp, " ");
}
fprintf (fp, "0x%02X", buf->data[i]);
if (i < buf->size - 1)
fprintf (fp, ",");
if (i % 16 != 15 && i < buf->size - 1)
fprintf (fp, " ");
}
fprintf (fp, "\n};\n");
fclose (fp);
}
/* Write as binary file */
static void
write_binary_file (ByteBuffer *buf, const char *output_file)
{
FILE *fp = fopen (output_file, "wb");
if (!fp)
{
perror ("Failed to open output file");
return;
}
printf ("Writing binary file: %s\n", output_file);
fwrite (buf->data, 1, buf->size, fp);
fclose (fp);
}
int
main (int argc, char *argv[])
{
if (argc < 2 || argc > 4)
{
fprintf (stderr, "Usage: %s <allkeys.txt> [output.bin] [output.c]\n",
argv[0]);
return 1;
}
const char *input_file = argv[1];
const char *binary_file = argc >= 3 ? argv[2] : "allkeys.bin";
const char *c_file = argc >= 4 ? argv[3] : NULL;
Database *db = build_database (input_file);
if (!db)
return 1;
ByteBuffer *buf = serialize_database (db);
/* Always write binary file. */
write_binary_file (buf, binary_file);
/* Optionally write C source. */
if (c_file)
{
write_c_source (buf, c_file);
}
return 0;
}