contrib/unicode/gen_wcwidth.py - gcc - Git at Google

 #!/usr/bin/env python3
 #
 # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
 #
 # This file is part of GCC.
 #
 # GCC is free software; you can redistribute it and/or modify it under
 # the terms of the GNU General Public License as published by the Free
 # Software Foundation; either version 3, or (at your option) any later
 # version.
 #
 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
 # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 # for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with GCC; see the file COPYING3.  If not see
 # <http://www.gnu.org/licenses/>.  */

 import sys
 import os

 # Exactly one argument is required: the Unicode version, which is forwarded
 # to utf8_gen.py and echoed in the generated header comment.
 if len(sys.argv) != 2:
     # Substitute the program name; previously the literal "%s" was printed.
     print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
     sys.exit(1)
 unicode_version = sys.argv[1]

 # Convert a code point token as written by the glibc tools ("<U" followed by
 # hex digits and a closing ">") to its integer value.  Raises ValueError if
 # the delimiters are missing or the text between them is not hexadecimal.
 def parse_ucn(s):
     well_formed = s.startswith("<U") and s.endswith(">")
     if not well_formed:
         raise ValueError
     hex_digits = s[2:-1]
     return int(hex_digits, 16)

 # Apply one line of width data written by utf8_gen.py to the global table.
 # The table has one slot per code point (0 through 0x10FFFF), all initially 1.
 widths = [1] * (0x10FFFF + 1)
 def process_width(line):
     # Each line holds a code point or an inclusive range, then the width:
     # <UA8FF>	0
     # <UA926>...<UA92D>	0
     # A missing width field raises IndexError; a malformed width, code point
     # or range raises ValueError.
     fields = line.split()
     width = int(fields[1])
     endpoints = fields[0].split("...")
     if len(endpoints) not in (1, 2):
         raise ValueError
     # A lone code point is a range whose first and last members coincide.
     begin = parse_ucn(endpoints[0])
     end = parse_ucn(endpoints[-1]) + 1
     widths[begin:end] = [width] * (end - begin)

 # To keep things simple, we use glibc's utf8_gen.py as-is.  It only writes
 # to a file named UTF-8, which is not configurable, so we run it and then
 # parse the lines between the "WIDTH" and "END WIDTH" markers of that file.
 os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
 processing = False
 # Open via a context manager so the file is closed once parsing is done.
 with open("UTF-8", "r") as utf8_file:
     for line in utf8_file:
         if processing:
             if line == "END WIDTH\n":
                 processing = False
             else:
                 try:
                     process_width(line)
                 except (ValueError, IndexError) as e:
                     # The exception must be bound: the warning prints it, and
                     # without "as e" this handler itself raised NameError.
                     print(e, "warning: ignored unexpected line: %s" % line,
                           file=sys.stderr, end="")
         elif line == "WIDTH\n":
             processing = True

 # All values < 256 we treat as width 1.  The slice end is exclusive, so it
 # has to be 256 for index 255 to be covered as well.
 widths[0:256] = [1] * 256

 # Collapse the per-code-point table into runs of equal width.  Each entry of
 # all_ranges is [last code point of the run, width of the run].
 # NOTE(review): the run still open when the loop finishes is left in
 # cur_range and never appended to all_ranges; presumably the consumer of the
 # tables supplies a default beyond the last listed end -- confirm.
 range_end, range_width = -1, 1
 all_ranges = []
 for codepoint, w in enumerate(widths):
     if w != range_width:
         # The width changed: close the previous run and start a new one.
         all_ranges.append([range_end, range_width])
         range_width = w
     range_end = codepoint
 cur_range = [range_end, range_width]

 # Write the two parallel C arrays for generated_cpp_wcwidth.h to stdout:
 # the last code point of every run, then the width of every run.
 print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
       "with the help of glibc's")
 print("    utf8_gen.py, using version %s" % unicode_version,
       "of the Unicode standard.  */")
 print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
 for idx, (range_end, _) in enumerate(all_ranges):
     # Eight entries per row: a new indented row whenever idx % 8 == 0.
     print(" " if idx % 8 else "\n  ", end="")
     print("0x%x," % range_end, end="")
 print("\n};\n")
 print("static const unsigned char wcwidth_widths[] = {", end="")
 for idx, (_, range_width) in enumerate(all_ranges):
     # Twenty-four entries per row.
     print(" " if idx % 24 else "\n  ", end="")
     print("%d," % range_width, end="")
 print("\n};")
	#!/usr/bin/env python3
	#
	# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
	#
	# This file is part of GCC.
	#
	# GCC is free software; you can redistribute it and/or modify it under
	# the terms of the GNU General Public License as published by the Free
	# Software Foundation; either version 3, or (at your option) any later
	# version.
	#
	# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
	# WARRANTY; without even the implied warranty of MERCHANTABILITY or
	# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	# for more details.
	#
	# You should have received a copy of the GNU General Public License
	# along with GCC; see the file COPYING3. If not see
	# <http://www.gnu.org/licenses/>. */

	import sys
	import os

	# Exactly one argument is required: the Unicode version, which is forwarded
	# to utf8_gen.py and echoed in the generated header comment.
	if len(sys.argv) != 2:
	    # Substitute the program name; previously the literal "%s" was printed.
	    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
	    sys.exit(1)
	unicode_version = sys.argv[1]

	# Parse a codepoint in the format output by glibc tools: "<U", hex digits,
	# then ">".  Returns the integer value; raises ValueError if the delimiters
	# are missing or the digits are not hexadecimal.
	def parse_ucn(s):
	    if not (s.startswith("<U") and s.endswith(">")):
	        raise ValueError
	    # Strip the "<U" prefix and ">" suffix, leaving only the hex digits.
	    return int(s[2:-1], base=16)

	# Process a line of width output from utf8_gen.py and update the global
	# array, which holds one width per code point (0 through 0x10FFFF), all
	# initially 1.
	widths = [1] * (1 + 0x10FFFF)
	def process_width(line):
	    # Example lines:
	    # <UA8FF> 0
	    # <UA926>...<UA92D> 0
	    # A missing width field raises IndexError; a malformed width, code
	    # point or range raises ValueError.
	    s = line.split()
	    width = int(s[1])
	    r = s[0].split("...")
	    if len(r) == 1:
	        begin = parse_ucn(r[0])
	        end = begin + 1
	    elif len(r) == 2:
	        begin = parse_ucn(r[0])
	        # The range is inclusive, so the slice end is one past its last member.
	        end = parse_ucn(r[1]) + 1
	    else:
	        raise ValueError
	    widths[begin:end] = [width] * (end - begin)

	# To keep things simple, we use glibc's utf8_gen.py as-is.  It only writes
	# to a file named UTF-8, which is not configurable, so we run it and then
	# parse the lines between the "WIDTH" and "END WIDTH" markers of that file.
	os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
	processing = False
	# Open via a context manager so the file is closed once parsing is done.
	with open("UTF-8", "r") as utf8_file:
	    for line in utf8_file:
	        if processing:
	            if line == "END WIDTH\n":
	                processing = False
	            else:
	                try:
	                    process_width(line)
	                except (ValueError, IndexError) as e:
	                    # The exception must be bound: the warning prints it, and
	                    # without "as e" this handler itself raised NameError.
	                    print(e, "warning: ignored unexpected line: %s" % line,
	                          file=sys.stderr, end="")
	        elif line == "WIDTH\n":
	            processing = True

	# All values < 256 we treat as width 1.  The slice end is exclusive, so it
	# has to be 256 for index 255 to be covered as well.
	widths[0:256] = [1] * 256

	# Condense the list to contiguous ranges.  Each entry of all_ranges is
	# [last code point of the run, width of the run].
	# NOTE(review): the run still open when the loop finishes is left in
	# cur_range and never appended to all_ranges; presumably the consumer of the
	# tables supplies a default beyond the last listed end -- confirm.
	cur_range = [-1, 1]
	all_ranges = []
	for i, width in enumerate(widths):
	    if width == cur_range[1]:
	        # Same width as the current run: extend its end to this code point.
	        cur_range[0] = i
	    else:
	        # The width changed: close the current run and start a new one.
	        all_ranges.append(cur_range)
	        cur_range = [i, width]

	# Output the arrays for generated_cpp_wcwidth.h to stdout: the last code
	# point of every run, then the width of every run, as two parallel C arrays.
	print("/* Generated by contrib/unicode/gen_wcwidth.py,",
	      "with the help of glibc's")
	print(" utf8_gen.py, using version %s" % unicode_version,
	      "of the Unicode standard. */")
	print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
	for i, r in enumerate(all_ranges):
	    # Eight entries per row: start a new row whenever i % 8 == 0.
	    if i % 8:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    print("0x%x," % (r[0]), end="")
	print("\n};\n")
	print("static const unsigned char wcwidth_widths[] = {", end="")
	for i, r in enumerate(all_ranges):
	    # Twenty-four entries per row.
	    if i % 24:
	        print(" ", end="")
	    else:
	        print("\n ", end="")
	    print("%d," % r[1], end="")
	print("\n};")