| #!/usr/bin/env python3 |
| # |
| # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py. |
| # |
| # This file is part of GCC. |
| # |
| # GCC is free software; you can redistribute it and/or modify it under |
| # the terms of the GNU General Public License as published by the Free |
| # Software Foundation; either version 3, or (at your option) any later |
| # version. |
| # |
| # GCC is distributed in the hope that it will be useful, but WITHOUT ANY |
| # WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| # for more details. |
| # |
| # You should have received a copy of the GNU General Public License |
| # along with GCC; see the file COPYING3. If not see |
| # <http://www.gnu.org/licenses/>. |
| |
| import sys |
| import os |
| |
# Exactly one command-line argument is required: the Unicode version for
# which the tables are to be generated.
if len(sys.argv) != 2:
    # Interpolate the program name so the usage line shows how the script was
    # invoked rather than a literal "%s".
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]
| |
# Convert a codepoint written in the "<Uxxxx>" notation output by the glibc
# tools into its integer value.  Raises ValueError if the "<U" ... ">" wrapper
# is missing or if the text inside it is not a valid base-16 number.
def parse_ucn(s):
    well_formed = s.startswith("<U") and s.endswith(">")
    if not well_formed:
        raise ValueError
    # Drop the leading "<U" and the trailing ">" to leave just the hex digits.
    hex_digits = s[2:-1]
    return int(hex_digits, 16)
| |
# Display width for every codepoint, indexed by codepoint.  Everything starts
# out as width 1; process_width overwrites the entries it is told about.
widths = [1] * (1 + 0x10FFFF)


# Process a line of width output from utf8_gen.py and update the global widths
# array.  A line names either a single codepoint or an inclusive range of
# codepoints, followed by the width to assign, for example:
#   <UA8FF> 0
#   <UA926>...<UA92D> 0
# Raises IndexError if the line has fewer than two fields, and ValueError if
# the width or a codepoint cannot be parsed or the range has too many parts.
def process_width(line):
    fields = line.split()
    codepoint_spec, width_text = fields[0], fields[1]
    width = int(width_text)

    bounds = codepoint_spec.split("...")
    if len(bounds) > 2:
        raise ValueError

    # For a single codepoint the first and last bound are the same element.
    first = parse_ucn(bounds[0])
    last = parse_ucn(bounds[-1])

    # The range in the input is inclusive, the slice end is exclusive.
    stop = last + 1
    widths[first:stop] = [width] * (stop - first)
| |
# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse this into the
# form we want it.
os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)

# Only the lines between the "WIDTH" and "END WIDTH" marker lines are handed
# to process_width; everything else in the file is skipped.
processing = False
# Use a context manager so the file is closed even if parsing raises.
with open("UTF-8", "r") as utf8_file:
    for line in utf8_file:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # The exception must be bound to a name to be reported;
                    # a malformed line is warned about and then ignored.
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
| |
# All bytes < 256 we treat as width 1.  The slice end is exclusive, so it has
# to be 256 for codepoint 255 (0xFF) to be included.
widths[0:256] = [1] * 256
| |
# Condense the list to contiguous ranges.  Each entry of all_ranges is a
# two-element list [last codepoint of the run, width shared by the run], and
# the entries are in increasing codepoint order.
cur_range = [-1, 1]
all_ranges = []
for i, width in enumerate(widths):
    if width == cur_range[1]:
        # Same width as the current run: extend the run to this codepoint.
        cur_range[0] = i
    else:
        # The width changed: close the current run and start a new one here.
        all_ranges.append(cur_range)
        cur_range = [i, width]
# The loop only emits a run when the following one begins, so the final run
# has to be emitted explicitly; otherwise the tail of the codepoint space
# would be missing from the tables.  The guard skips the initial placeholder
# when widths is empty.
if cur_range[0] >= 0:
    all_ranges.append(cur_range)
| |
# Lay out already-formatted array entries for printing: every row holds
# per_line entries, each row is opened by a newline and a space, and entries
# within a row are separated by a single space.
def _format_entries(entries, per_line):
    pieces = []
    for index, entry in enumerate(entries):
        pieces.append(" " if index % per_line else "\n ")
        pieces.append(entry)
    return "".join(pieces)


# Output the arrays for generated_cpp_wcwidth.h: first a comment recording
# how the tables were produced, then the last codepoint of every range, then
# the width of every range (same order in both arrays).
banner = ("/* Generated by contrib/unicode/gen_wcwidth.py, with the help of"
          " glibc's\n"
          " utf8_gen.py, using version %s of the Unicode standard. */"
          % unicode_version)
print(banner)

range_end_entries = ["0x%x," % r[0] for r in all_ranges]
print("\nstatic const cppchar_t wcwidth_range_ends[] = {"
      + _format_entries(range_end_entries, 8))
print("};\n")

range_width_entries = ["%d," % r[1] for r in all_ranges]
print("static const unsigned char wcwidth_widths[] = {"
      + _format_entries(range_width_entries, 24))
print("};")