| # Copyright (C) 2020-2025 Free Software Foundation, Inc. |
| |
| # This file is part of GCC. |
| |
| # GCC is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 3, or (at your option) any later |
| # version. |
| |
| # GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| # for more details. |
| |
| # You should have received a copy of the GNU General Public License |
| # along with GCC; see the file COPYING3. If not see |
| # <http://www.gnu.org/licenses/>. |
| |
| # Run this program as |
| # python ./make-rust-unicode.py UnicodeData.txt \ |
| # DerivedNormalizationProps.txt DerivedCoreProperties.txt \ |
| # > rust-unicode-data.h |
| |
| import sys |
| from typing import Tuple |
| |
| Codepoint = int |
| Range = Tuple[Codepoint, Codepoint] |
| |
| COPYRIGHT = ( |
| "// Copyright (C) 2020-2025 Free Software Foundation, Inc.\n" |
| "\n" |
| "// This file is part of GCC.\n" |
| "\n" |
| "// GCC is free software; you can redistribute it and/or modify it under\n" |
| "// the terms of the GNU General Public License as published by the Free\n" |
| "// Software Foundation; either version 3, or (at your option) any later\n" |
| "// version.\n" |
| "\n" |
| "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n" |
| "// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n" |
| "// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License\n" |
| "// for more details.\n" |
| "\n" |
| "// You should have received a copy of the GNU General Public License\n" |
| "// along with GCC; see the file COPYING3. If not see\n" |
| "// <http://www.gnu.org/licenses/>." |
| ) |
| |
| # Decomposition_Mapping table |
| decomposition_map: dict[Codepoint, list[Codepoint]] = {} |
| # Canonical_Combining_Class table |
| ccc_table: dict[Codepoint, int] = {} |
| # Ranges of codepoints with the Full_Composition_Exclusion property |
| composition_exclusion_ranges: list[Range] = [] |
| # Ranges of codepoints with the Full_Composition_Exclusion property |
| alphabetic_ranges: list[Range] = [] |
| # Ranges of codepoints with NFC_QC=No |
| nfc_qc_no_ranges: list[Range] = [] |
| # Ranges of codepoints with NFC_QC=Maybe |
| nfc_qc_maybe_ranges: list[Range] = [] |
| numeric_codepoints: list[Codepoint] = [] |
| |
| # Note that an element of range `[m, n]` (a list in python) represents [m, n) |
| |
| |
| def binary_search_ranges(ranges: list[Range], target: Codepoint) -> int: |
| low: int = 0 |
| high: int = len(ranges) - 1 |
| while low <= high: |
| mid = (low + high) // 2 |
| start, end = ranges[mid] |
| if start <= target <= end - 1: |
| return mid # target found. returns index. |
| elif target < start: |
| high = mid - 1 |
| else: |
| low = mid + 1 |
| # target not found. |
| return -1 |
| |
| |
| # Utility function to parse '<codepoint>...<codepoint>' or '<codepoint>' |
| def parse_codepoint_range(range_str: str) -> Range: |
| codepoint_range: list[str] = range_str.split("..") |
| assert len(codepoint_range) == 1 or len(codepoint_range) == 2, "Invalid format" |
| start_cp, end_cp = 0, 0 |
| if len(codepoint_range) == 1: |
| # m..n => [m, n+1) |
| start_cp = int(codepoint_range[0], 16) |
| end_cp = start_cp + 1 |
| else: |
| # m => [m, m+1) |
| start_cp = int(codepoint_range[0], 16) |
| end_cp = int(codepoint_range[1], 16) + 1 |
| return start_cp, end_cp |
| |
| |
| def read_unicode_data_txt(filepath: str) -> None: |
| def process_line(line: str) -> None: |
| rows = line.split(";") |
| if len(rows) != 15: |
| return |
| # Parse codepoint |
| cp = int(rows[0], 16) |
| # Parse general category |
| category = rows[2] |
| if category == "Nd" or category == "Nl" or category == "No": |
| numeric_codepoints.append(cp) |
| |
| # Parse CCC |
| ccc = int(rows[3], 10) |
| if ccc != 0: |
| ccc_table[cp] = ccc |
| # Parse decomposition mapping |
| # Ignore compatibility decomposition mapping because |
| # it is not required for **NFC** normalization. |
| if not rows[5].startswith("<"): |
| decomp_cp_strs = rows[5].split(" ") |
| decomp_cps = [] |
| for s in decomp_cp_strs: |
| if s == "": |
| continue |
| decomp_cps.append(int(s, 16)) |
| assert ( |
| len(decomp_cps) <= 2 |
| ), "Decomposition_Mapping must not contain more than 2 characters." |
| if len(decomp_cps) > 0: |
| decomposition_map[cp] = decomp_cps |
| |
| with open(filepath, "r", encoding="UTF-8") as file: |
| while line := file.readline(): |
| process_line(line.rstrip()) |
| |
| |
| def read_derived_norm_props_txt(filepath: str) -> None: |
| def process_line(line) -> None: |
| # Ignore comments |
| line = line.split("#")[0] |
| rows = line.split(";") |
| # Too few rows. Skipped. |
| if len(rows) < 2: |
| return |
| rows[0] = rows[0].lstrip().rstrip() |
| rows[1] = rows[1].lstrip().rstrip() |
| cp_range = parse_codepoint_range(rows[0]) |
| if rows[1] == "Full_Composition_Exclusion": |
| composition_exclusion_ranges.append(cp_range) |
| elif rows[1] == "NFC_QC": |
| assert len(rows) >= 3, "Too few rows for NFC_QC" |
| rows[2] = rows[2].lstrip().rstrip() |
| if rows[2] == "N": |
| nfc_qc_no_ranges.append(cp_range) |
| elif rows[2] == "M": |
| nfc_qc_maybe_ranges.append(cp_range) |
| else: |
| raise RuntimeError("Value of NFC_QC must be N or M") |
| |
| with open(filepath, "r", encoding="UTF-8") as file: |
| while line := file.readline(): |
| process_line(line.rstrip()) |
| |
| |
| def read_derived_core_props_txt(filepath: str) -> None: |
| def process_line(line: str) -> None: |
| # Ignore comments |
| line = line.split("#")[0] |
| rows = line.split(";") |
| # Too few rows. Skipped. |
| if len(rows) < 2: |
| return |
| rows[0] = rows[0].lstrip().rstrip() |
| rows[1] = rows[1].lstrip().rstrip() |
| if rows[1] != "Alphabetic": |
| return |
| cp_range: Range = parse_codepoint_range(rows[0]) |
| alphabetic_ranges.append(cp_range) |
| |
| with open(filepath, "r", encoding="UTF-8") as file: |
| while line := file.readline(): |
| process_line(line.rstrip()) |
| |
| |
| def write_decomposition() -> None: |
| print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {") |
| print(" // clang-format off") |
| for cp in sorted(decomposition_map): |
| print(" {{{:#06x}, ".format(cp), end="") |
| print("{", end="") |
| for decomp_cp in decomposition_map[cp]: |
| print("{:#06x}, ".format(decomp_cp), end="") |
| print("}},") |
| print(" // clang-format on") |
| print("};") |
| |
| |
| def write_recomposition() -> None: |
| print( |
| "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{" |
| ) |
| print(" // clang-format off") |
| for cp in decomposition_map: |
| if binary_search_ranges(composition_exclusion_ranges, cp) != -1: |
| continue |
| d1: Codepoint |
| d2: Codepoint |
| if len(decomposition_map[cp]) == 1: |
| d1 = decomposition_map[cp][0] |
| d2 = 0 |
| else: |
| d1 = decomposition_map[cp][0] |
| d2 = decomposition_map[cp][1] |
| print(" {{{{{:#06x}, {:#06x}}}, {:#06x}}},".format(d1, d2, cp)) |
| print(" // clang-format on") |
| print("}};") |
| |
| |
| def write_ccc() -> None: |
| print("const std::map<uint32_t, int32_t> CCC_TABLE = {") |
| print(" // clang-format off") |
| for cp in ccc_table: |
| print(" {{{:#06x}, {}}},".format(cp, ccc_table[cp])) |
| print(" // clang-format on") |
| print("};") |
| |
| |
| def write_alphabetic() -> None: |
| print( |
| "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{" |
| ) |
| print(" // clang-format off") |
| for r in alphabetic_ranges: |
| print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) |
| print(" // clang-format on") |
| print("}};") |
| |
| |
| def write_numeric() -> None: |
| print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{") |
| print(" // clang-format off") |
| for i, cp in enumerate(numeric_codepoints): |
| if i % 16 == 0: |
| print(" ", end="") |
| print("{:#06x}, ".format(cp), end="") |
| if i % 16 == 15: |
| print() |
| if i % 16 != 15: |
| print() |
| print(" // clang-format on") |
| print("}};") |
| |
| |
| def write_nfc_qc(): |
| print( |
| "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_NO_RANGES = {{{{".format( |
| len(nfc_qc_no_ranges) |
| ) |
| ) |
| print(" // clang-format off") |
| for r in nfc_qc_no_ranges: |
| print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) |
| print(" // clang-format on") |
| print("}};") |
| |
| print( |
| "const std::array<std::pair<uint32_t, uint32_t>, {}> NFC_QC_MAYBE_RANGES = {{{{".format( |
| len(nfc_qc_maybe_ranges) |
| ) |
| ) |
| print(" // clang-format off") |
| for r in nfc_qc_maybe_ranges: |
| print(" {{{:#06x}, {:#06x}}},".format(r[0], r[1])) |
| print(" // clang-format on") |
| print("}};") |
| |
| |
| def main() -> None: |
| if len(sys.argv) != 4: |
| print("too few arguments", file=sys.stderr) |
| exit(-1) |
| unicode_txt_path: str = sys.argv[1] |
| norm_props_txt_path: str = sys.argv[2] |
| core_props_txt_path: str = sys.argv[3] |
| |
| read_unicode_data_txt(unicode_txt_path) |
| read_derived_norm_props_txt(norm_props_txt_path) |
| read_derived_core_props_txt(core_props_txt_path) |
| |
| print(COPYRIGHT) |
| print() |
| |
| print('#include "rust-system.h"\n') |
| print("namespace Rust {\n") |
| print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges))) |
| print( |
| "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints)) |
| ) |
| |
| write_decomposition() |
| print() |
| write_recomposition() |
| print() |
| write_ccc() |
| print() |
| write_alphabetic() |
| print() |
| write_numeric() |
| print() |
| write_nfc_qc() |
| print() |
| |
| print("} // namespace Rust") |
| |
| |
| if __name__ == "__main__": |
| main() |