# Source: contrib/unicode/from_glibc/unicode_utils.py - gcc - Git at Google

 # Utilities to generate Unicode data for glibc from upstream Unicode data.
 #
 # Copyright (C) 2014-2020 Free Software Foundation, Inc.
 # This file is part of the GNU C Library.
 #
 # The GNU C Library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 #
 # The GNU C Library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # Lesser General Public License for more details.
 #
 # You should have received a copy of the GNU Lesser General Public
 # License along with the GNU C Library; if not, see
 # <https://www.gnu.org/licenses/>.

 '''
 This module contains utilities used by the scripts to generate
 Unicode data for glibc from upstream Unicode data files.
 '''

 import sys
 import re


 # Common locale header.
 #
 # Text block whose lines all start with '%'.  It is presumably written at
 # the top of generated locale files; the code that emits it is not part of
 # this module.
 COMMENT_HEADER = """
 % This file is part of the GNU C Library and contains locale data.
 % The Free Software Foundation does not claim any copyright interest
 % in the locale data contained in this file.  The foregoing does not
 % affect the license of the GNU C Library as a whole.  It does not
 % exempt you from the conditions of the license if your use would
 % otherwise be governed by that license.
 """

 # Dictionary holding the entire contents of the UnicodeData.txt file.
 # It is filled by fill_attributes() (through fill_attribute()) and is
 # keyed by the integer code point.
 #
 # Contents of this dictionary look like this:
 #
 # {0: {'category': 'Cc',
 #      'title': None,
 #      'digit': '',
 #      'name': '<control>',
 #      'bidi': 'BN',
 #      'combining': '0',
 #      'comment': '',
 #      'oldname': 'NULL',
 #      'decomposition': '',
 #      'upper': None,
 #      'mirrored': 'N',
 #      'lower': None,
 #      'decdigit': '',
 #      'numeric': ''},
 #      …
 # }
 UNICODE_ATTRIBUTES = {}

 # Dictionary holding the entire contents of the DerivedCoreProperties.txt
 # file.  It is filled by fill_derived_core_properties() and maps an
 # integer code point to the list of property names seen for it.
 #
 # Contents of this dictionary look like this:
 #
 # {917504: ['Default_Ignorable_Code_Point'],
 #  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
 #  …
 # }
 DERIVED_CORE_PROPERTIES = {}

 # Dictionary holding the entire contents of the EastAsianWidths.txt file.
 # It is filled by fill_east_asian_widths() and maps an integer code point
 # to its width property string.
 #
 # Contents of this dictionary look like this:
 #
 # {0: 'N', … , 45430: 'W', …}
 EAST_ASIAN_WIDTHS = {}

 def fill_attribute(code_point, fields):
     '''Record one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

     code_point is the integer key the entry is stored under and fields
     is the list of semicolon-separated columns of the line.  Columns
     1 to 11 are stored unchanged as strings; the three case mapping
     columns 12 to 14 are parsed as hexadecimal integers, or stored as
     None when the column is empty.
     '''
     def hex_or_none(column):
         # An empty case mapping column means "no mapping".
         text = fields[column]
         return int(text, 16) if text else None

     record = {}
     record['name'] = fields[1]           # Character name
     record['category'] = fields[2]       # General category
     record['combining'] = fields[3]      # Canonical combining classes
     record['bidi'] = fields[4]           # Bidirectional category
     record['decomposition'] = fields[5]  # Character decomposition mapping
     record['decdigit'] = fields[6]       # Decimal digit value
     record['digit'] = fields[7]          # Digit value
     record['numeric'] = fields[8]        # Numeric value
     record['mirrored'] = fields[9]       # mirrored
     record['oldname'] = fields[10]       # Old Unicode 1.0 name
     record['comment'] = fields[11]       # comment
     record['upper'] = hex_or_none(12)    # Uppercase mapping
     record['lower'] = hex_or_none(13)    # Lowercase mapping
     record['title'] = hex_or_none(14)    # Titlecase mapping
     UNICODE_ATTRIBUTES[code_point] = record

 def fill_attributes(filename):
     '''Stores the entire contents of the UnicodeData.txt file
     in the UNICODE_ATTRIBUTES dictionary.

     A typical line for a single code point in UnicodeData.txt looks
     like this:

     0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

     Code point ranges are indicated by pairs of lines like this:

     4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
     9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

     Every code point of such a range gets the same attributes, with
     the name reduced to the part before the comma (“CJK Ideograph”).
     Lines of general category “Cs” (surrogates) are skipped.

     A line that does not have exactly 15 fields, or a “Last” line
     whose attributes differ from the preceding “First” line, is
     reported on stderr and terminates the program by raising
     SystemExit with status 1.
     '''
     with open(filename, mode='r') as unicode_data_file:
         # Fields of a pending “…, First>” line; empty when no range is open.
         fields_start = []
         for line in unicode_data_file:
             fields = line.strip().split(';')
             if len(fields) != 15:
                 sys.stderr.write(
                     'short line in file "%(f)s": %(l)s\n' %{
                     'f': filename, 'l': line})
                 # sys.exit rather than the bare exit(): the latter is only
                 # injected by the site module and may be missing.
                 sys.exit(1)
             if fields[2] == 'Cs':
                 # Surrogates are UTF-16 artefacts,
                 # not real characters. Ignore them.
                 fields_start = []
                 continue
             if fields[1].endswith(', First>'):
                 fields_start = fields
                 # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                 fields_start[1] = fields_start[1].split(',')[0][1:]
                 continue
             if fields[1].endswith(', Last>'):
                 fields[1] = fields[1].split(',')[0][1:]
                 # Apart from the code point itself, both ends of a range
                 # must carry identical attributes.
                 if fields[1:] != fields_start[1:]:
                     sys.stderr.write(
                         'broken code point range in file "%(f)s": %(l)s\n' %{
                             'f': filename, 'l': line})
                     sys.exit(1)
                 for code_point in range(
                         int(fields_start[0], 16),
                         int(fields[0], 16)+1):
                     fill_attribute(code_point, fields)
                 fields_start = []
                 continue
             fill_attribute(int(fields[0], 16), fields)
             fields_start = []

 def fill_derived_core_properties(filename):
     '''Load DerivedCoreProperties.txt into DERIVED_CORE_PROPERTIES.

     A data line names either a range of code points

     0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

     or a single code point

     00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

     followed by a property name.  The property name is appended to the
     list stored for every code point covered by the line.  Lines that
     do not have this shape (comments, blank lines) are ignored.
     '''
     line_format = re.compile(
         r'^(?P<codepoint1>[0-9A-F]{4,6})'
         + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
         + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
     with open(filename, mode='r') as derived_core_properties_file:
         for line in derived_core_properties_file:
             hit = line_format.match(line)
             if hit is None:
                 continue
             low, high, name = hit.group(
                 'codepoint1', 'codepoint2', 'property')
             first = int(low, 16)
             # A line without "..XXXX" describes a single code point.
             last = int(high, 16) if high else first
             for code_point in range(first, last + 1):
                 DERIVED_CORE_PROPERTIES.setdefault(
                     code_point, []).append(name)

 def fill_east_asian_widths(filename):
     '''Load EastAsianWidths.txt into EAST_ASIAN_WIDTHS.

     A data line names either a range of code points

     9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

     or a single code point

     A015;W           # Lm         YI SYLLABLE WU

     followed by the width property.  Every covered code point is mapped
     to that property string; a later line overwrites an earlier one.
     Lines that do not have this shape are ignored.
     '''
     line_format = re.compile(
         r'^(?P<codepoint1>[0-9A-F]{4,6})'
         +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
         +r'\s*;\s*(?P<property>[a-zA-Z]+)')
     with open(filename, mode='r') as east_asian_widths_file:
         for line in east_asian_widths_file:
             hit = line_format.match(line)
             if hit is None:
                 continue
             low, high, width = hit.group(
                 'codepoint1', 'codepoint2', 'property')
             first = int(low, 16)
             # A line without "..XXXX" describes a single code point.
             last = int(high, 16) if high else first
             for code_point in range(first, last + 1):
                 EAST_ASIAN_WIDTHS[code_point] = width

 def to_upper(code_point):
     '''Return the uppercase mapping of code_point.

     The code point itself is returned when its UNICODE_ATTRIBUTES entry
     has an empty name or no uppercase mapping.
     '''
     attributes = UNICODE_ATTRIBUTES[code_point]
     if attributes['name'] and attributes['upper']:
         return attributes['upper']
     return code_point

 def to_lower(code_point):
     '''Return the lowercase mapping of code_point.

     The code point itself is returned when its UNICODE_ATTRIBUTES entry
     has an empty name or no lowercase mapping.
     '''
     attributes = UNICODE_ATTRIBUTES[code_point]
     if attributes['name'] and attributes['lower']:
         return attributes['lower']
     return code_point

 def to_upper_turkish(code_point):
     '''Return the Turkish uppercase mapping of code_point.

     Identical to to_upper() except that U+0069 maps to U+0130.
     '''
     return 0x0130 if code_point == 0x0069 else to_upper(code_point)

 def to_lower_turkish(code_point):
     '''Return the Turkish lowercase mapping of code_point.

     Identical to to_lower() except that U+0049 maps to U+0131.
     '''
     return 0x0131 if code_point == 0x0049 else to_lower(code_point)

 def to_title(code_point):
     '''Return the titlecase mapping of code_point.

     The code point itself is returned when its UNICODE_ATTRIBUTES entry
     has an empty name or no titlecase mapping.
     '''
     attributes = UNICODE_ATTRIBUTES[code_point]
     if attributes['name'] and attributes['title']:
         return attributes['title']
     return code_point

 def is_upper(code_point):
     '''Tell whether code_point is an uppercase character.

     That is the case when it has a lowercase mapping different from
     itself, or when DerivedCoreProperties lists it as “Uppercase”.
     '''
     if to_lower(code_point) != code_point:
         return True
     return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())

 def is_lower(code_point):
     '''Tell whether code_point is a lowercase character.

     That is the case when it has an uppercase mapping different from
     itself, when it is U+00DF, or when DerivedCoreProperties lists it
     as “Lowercase”.
     '''
     # <U00DF> is lowercase, but without simple to_upper mapping.
     if to_upper(code_point) != code_point or code_point == 0x00DF:
         return True
     # Some characters carry the “Lowercase” property in
     # DerivedCoreProperties.txt without having a mapping to upper case.
     return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())

 def is_alpha(code_point):
     '''Tell whether code_point is an alphabetic character.

     True for every code point listed as “Alphabetic” in
     DerivedCoreProperties, and for every decimal digit (category “Nd”)
     outside the ASCII range U+0030..U+0039.
     '''
     if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
         return True
     # Consider all the non-ASCII digits as alphabetic.
     # ISO C 99 forbids us to have them in category “digit”,
     # but we want iswalnum to return true on them.
     ascii_digit = 0x0030 <= code_point <= 0x0039
     return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not ascii_digit)

 def is_digit(code_point):
     '''Checks whether the character with this code point is a digit.

     Returns True exactly for the ASCII digits U+0030..U+0039.
     '''
     # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
     # takes it away:
     # 7.25.2.1.5:
     #    The iswdigit function tests for any wide character that
     #    corresponds to a decimal-digit character (as defined in 5.2.1).
     # 5.2.1:
     #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
     #
     # The alternative (every named character of category 'Nd') is
     # therefore not used.  Note that with it, U+0BE7..U+0BEF and
     # U+1369..U+1371 are digit systems without a zero; <0> would have
     # to be added in front of them by hand.
     return 0x0030 <= code_point <= 0x0039

 def is_outdigit(code_point):
     '''Tell whether code_point is an outdigit, i.e. lies in the ASCII
     digit range U+0030..U+0039.'''
     return 0x0030 <= code_point <= 0x0039

 def is_blank(code_point):
     '''Tell whether code_point is a blank character.

     Blank are the tab character and every named character of category
     “Zs” whose decomposition does not mention '<noBreak>'.
     '''
     if code_point == 0x0009: # '\t'
         return True
     attributes = UNICODE_ATTRIBUTES[code_point]
     # Category Zs without mention of '<noBreak>'
     return (attributes['name']
             and attributes['category'] == 'Zs'
             and '<noBreak>' not in attributes['decomposition'])

 def is_space(code_point):
     '''Tell whether code_point is a space character.

     Space are the six ASCII white-space characters and every named
     character of category “Zl” or “Zp”, or of category “Zs” when its
     decomposition does not mention '<noBreak>'.
     '''
     # Don’t make U+00A0 a space. Non-breaking space means that all programs
     # should treat it like a punctuation character, not like a space.
     ascii_spaces = (
         0x0020, # ' '
         0x000C, # '\f'
         0x000A, # '\n'
         0x000D, # '\r'
         0x0009, # '\t'
         0x000B, # '\v'
     )
     if code_point in ascii_spaces:
         return True
     attributes = UNICODE_ATTRIBUTES[code_point]
     category = attributes['category']
     # Categories Zl, Zp, and Zs without mention of "<noBreak>"
     return (attributes['name']
             and (category in ['Zl', 'Zp']
                  or (category in ['Zs']
                      and '<noBreak>' not in attributes['decomposition'])))

 def is_cntrl(code_point):
     '''Tell whether code_point is a control character.

     Control characters are those named '<control>' and the named
     characters of category “Zl” or “Zp”.
     '''
     attributes = UNICODE_ATTRIBUTES[code_point]
     name = attributes['name']
     return (name
             and (name == '<control>'
                  or attributes['category'] in ['Zl', 'Zp']))

 def is_xdigit(code_point):
     '''Checks whether the character with this code point is
     a hexadecimal digit.

     Returns True exactly for the ASCII characters 0-9, A-F and a-f.
     '''
     # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
     # takes it away:
     # 7.25.2.1.12:
     #    The iswxdigit function tests for any wide character that
     #    corresponds to a hexadecimal-digit character (as defined
     #    in 6.4.4.1).
     # 6.4.4.1:
     #    hexadecimal-digit: one of
     #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
     return (0x0030 <= code_point <= 0x0039     # 0-9
             or 0x0041 <= code_point <= 0x0046  # A-F
             or 0x0061 <= code_point <= 0x0066) # a-f

 def is_graph(code_point):
     '''Tell whether code_point is a graphical character.

     Graphical are all named characters that are neither named
     '<control>' nor classified as space by is_space().
     '''
     name = UNICODE_ATTRIBUTES[code_point]['name']
     return (name
             and name != '<control>'
             and not is_space(code_point))

 def is_print(code_point):
     '''Tell whether code_point is a printable character.

     Printable are all named characters that are neither named
     '<control>' nor of category “Zl” or “Zp”.
     '''
     attributes = UNICODE_ATTRIBUTES[code_point]
     name = attributes['name']
     return (name
             and name != '<control>'
             and attributes['category'] not in ['Zl', 'Zp'])

 def is_punct(code_point):
     '''Checks whether the character with this code point is punctuation.

     A character is punctuation when is_graph() accepts it and neither
     is_alpha() nor is_digit() does.
     '''
     # The traditional POSIX definition of punctuation is every graphic,
     # non-alphanumeric character.  The alternative (named characters
     # whose general category starts with 'P') is deliberately not used.
     return (is_graph(code_point)
             and not is_alpha(code_point)
             and not is_digit(code_point))

 def is_combining(code_point):
     '''Tell whether code_point is a combining character, i.e. a named
     character of general category “Mn”, “Mc” or “Me”.'''
     # Up to Unicode 3.0.1 the Combining property came from PropList.txt,
     # where it was identical to the union of the general categories
     # "Mn", "Mc", "Me".  Unicode 3.1 dropped the property from that file,
     # so the general categories are used directly.
     attributes = UNICODE_ATTRIBUTES[code_point]
     return (attributes['name']
             and attributes['category'] in ['Mn', 'Mc', 'Me'])

 def is_combining_level3(code_point):
     '''Tell whether code_point is a combining level3 character: a
     combining character whose combining class is below 200.'''
     combining = is_combining(code_point)
     if not combining:
         return combining
     combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
     return 0 <= combining_class < 200

 def ucs_symbol(code_point):
     '''Return the UCS symbol string for a Unicode character.

     Code points below 0x10000 are written with four uppercase hex
     digits (<U0041>), all others with eight (<U0001F600>).
     '''
     digits = 4 if code_point < 0x10000 else 8
     return '<U{:0{}X}>'.format(code_point, digits)

 def ucs_symbol_range(code_point_low, code_point_high):
     '''Return the UCS symbol string for a code point range: the symbols
     of both ends joined by two dots.

     Example:

     <U0041>..<U005A>
     '''
     return '{}..{}'.format(
         ucs_symbol(code_point_low), ucs_symbol(code_point_high))

 def verifications():
     '''Tests whether the is_* functions observe the known restrictions.

     Walks over all code points in UNICODE_ATTRIBUTES in ascending order
     and writes one line to stderr for every violated restriction.
     Nothing is returned and no exception is raised for a violation.
     '''
     for code_point in sorted(UNICODE_ATTRIBUTES):
         sym = ucs_symbol(code_point)
         # Evaluate every classification once; the checks below only
         # combine these results.
         cased = is_lower(code_point) or is_upper(code_point)
         alpha = is_alpha(code_point)
         digit = is_digit(code_point)
         xdigit = is_xdigit(code_point)
         space = is_space(code_point)
         cntrl = is_cntrl(code_point)
         punct = is_punct(code_point)
         graph = is_graph(code_point)
         printable = is_print(code_point)
         is_space_char = code_point == 0x0020

         # toupper/tolower restriction: “Only characters specified for the
         # keywords lower and upper shall be specified.”
         for function, mapped in (('toupper', to_upper(code_point)),
                                  ('tolower', to_lower(code_point))):
             if mapped != code_point and not cased:
                 sys.stderr.write(
                     '%s is not upper|lower but %s(0x%04X) = 0x%04X\n'
                     % (sym, function, code_point, mapped))

         checks = (
             # alpha restriction: “Characters classified as either upper
             # or lower shall automatically belong to this class.”
             (cased and not alpha, 'is upper|lower but not alpha'),
             # alpha restriction: “No character specified for the keywords
             # cntrl, digit, punct or space shall be specified.”
             (alpha and cntrl, 'is alpha and cntrl'),
             (alpha and digit, 'is alpha and digit'),
             (alpha and punct, 'is alpha and punct'),
             (alpha and space, 'is alpha and space'),
             # space restriction: “No character specified for the keywords
             # upper, lower, alpha, digit, graph or xdigit shall be
             # specified.”  upper, lower, alpha already checked above.
             (space and digit, 'is space and digit'),
             (space and graph, 'is space and graph'),
             (space and xdigit, 'is space and xdigit'),
             # cntrl restriction: “No character specified for the keywords
             # upper, lower, alpha, digit, punct, graph, print or xdigit
             # shall be specified.”  upper, lower, alpha already checked.
             (cntrl and digit, 'is cntrl and digit'),
             (cntrl and punct, 'is cntrl and punct'),
             (cntrl and graph, 'is cntrl and graph'),
             (cntrl and printable, 'is cntrl and print'),
             (cntrl and xdigit, 'is cntrl and xdigit'),
             # punct restriction: “No character specified for the keywords
             # upper, lower, alpha, digit, cntrl, xdigit or as the <space>
             # character shall be specified.”  upper, lower, alpha, cntrl
             # already checked above.
             (punct and digit, 'is punct and digit'),
             (punct and xdigit, 'is punct and xdigit'),
             (punct and is_space_char, 'is punct'),
             # graph and print restrictions (“No character specified for
             # the keyword cntrl shall be specified.”) are already covered
             # by the cntrl checks above.
             #
             # graph - print relation: differ only in the <space>
             # character.  How is this possible if there are more than one
             # space character?!  I think susv2/xbd/locale.html should
             # speak of “space characters”, not “space character”.
             (printable and not (graph or space),
              'is print but not graph|<space>'),
             (not printable and (graph or is_space_char),
              'is graph|<space> but not print'),
         )
         for violated, description in checks:
             if violated:
                 sys.stderr.write('%s %s\n' % (sym, description))
	# Utilities to generate Unicode data for glibc from upstream Unicode data.
	#
	# Copyright (C) 2014-2020 Free Software Foundation, Inc.
	# This file is part of the GNU C Library.
	#
	# The GNU C Library is free software; you can redistribute it and/or
	# modify it under the terms of the GNU Lesser General Public
	# License as published by the Free Software Foundation; either
	# version 2.1 of the License, or (at your option) any later version.
	#
	# The GNU C Library is distributed in the hope that it will be useful,
	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# Lesser General Public License for more details.
	#
	# You should have received a copy of the GNU Lesser General Public
	# License along with the GNU C Library; if not, see
	# <https://www.gnu.org/licenses/>.

	'''
	This module contains utilities used by the scripts to generate
	Unicode data for glibc from upstream Unicode data files.
	'''

	import sys
	import re


	# Common locale header.
	#
	# Text block whose lines all start with '%'.  It is presumably written at
	# the top of generated locale files; the code that emits it is not part of
	# this module.
	COMMENT_HEADER = """
	% This file is part of the GNU C Library and contains locale data.
	% The Free Software Foundation does not claim any copyright interest
	% in the locale data contained in this file. The foregoing does not
	% affect the license of the GNU C Library as a whole. It does not
	% exempt you from the conditions of the license if your use would
	% otherwise be governed by that license.
	"""

	# Dictionary holding the entire contents of the UnicodeData.txt file.
	# It is filled by fill_attributes() (through fill_attribute()) and is
	# keyed by the integer code point.
	#
	# Contents of this dictionary look like this:
	#
	# {0: {'category': 'Cc',
	#      'title': None,
	#      'digit': '',
	#      'name': '<control>',
	#      'bidi': 'BN',
	#      'combining': '0',
	#      'comment': '',
	#      'oldname': 'NULL',
	#      'decomposition': '',
	#      'upper': None,
	#      'mirrored': 'N',
	#      'lower': None,
	#      'decdigit': '',
	#      'numeric': ''},
	#      …
	# }
	UNICODE_ATTRIBUTES = {}

	# Dictionary holding the entire contents of the DerivedCoreProperties.txt
	# file.  It is filled by fill_derived_core_properties() and maps an
	# integer code point to the list of property names seen for it.
	#
	# Contents of this dictionary look like this:
	#
	# {917504: ['Default_Ignorable_Code_Point'],
	#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
	#  …
	# }
	DERIVED_CORE_PROPERTIES = {}

	# Dictionary holding the entire contents of the EastAsianWidths.txt file.
	# It is filled by fill_east_asian_widths() and maps an integer code point
	# to its width property string.
	#
	# Contents of this dictionary look like this:
	#
	# {0: 'N', … , 45430: 'W', …}
	EAST_ASIAN_WIDTHS = {}

	def fill_attribute(code_point, fields):
	    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

	    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
	    in the UnicodeData.txt file.  Columns 1 to 11 are stored as strings;
	    the case mapping columns 12 to 14 are parsed as hexadecimal
	    integers, or stored as None when empty.
	    '''
	    UNICODE_ATTRIBUTES[code_point] = {
	        'name': fields[1],          # Character name
	        'category': fields[2],      # General category
	        'combining': fields[3],     # Canonical combining classes
	        'bidi': fields[4],          # Bidirectional category
	        'decomposition': fields[5], # Character decomposition mapping
	        'decdigit': fields[6],      # Decimal digit value
	        'digit': fields[7],         # Digit value
	        'numeric': fields[8],       # Numeric value
	        'mirrored': fields[9],      # mirrored
	        'oldname': fields[10],      # Old Unicode 1.0 name
	        'comment': fields[11],      # comment
	        # Uppercase mapping
	        'upper': int(fields[12], 16) if fields[12] else None,
	        # Lowercase mapping
	        'lower': int(fields[13], 16) if fields[13] else None,
	        # Titlecase mapping
	        'title': int(fields[14], 16) if fields[14] else None,
	    }

	def fill_attributes(filename):
	    '''Stores the entire contents of the UnicodeData.txt file
	    in the UNICODE_ATTRIBUTES dictionary.

	    A typical line for a single code point in UnicodeData.txt looks
	    like this:

	    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

	    Code point ranges are indicated by pairs of lines like this:

	    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
	    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

	    Lines of general category “Cs” (surrogates) are skipped.  A line
	    without exactly 15 fields, or a “Last” line whose attributes differ
	    from the preceding “First” line, is reported on stderr and
	    terminates the program by raising SystemExit with status 1.
	    '''
	    with open(filename, mode='r') as unicode_data_file:
	        # Fields of a pending “…, First>” line; empty when no range is open.
	        fields_start = []
	        for line in unicode_data_file:
	            fields = line.strip().split(';')
	            if len(fields) != 15:
	                sys.stderr.write(
	                    'short line in file "%(f)s": %(l)s\n' %{
	                        'f': filename, 'l': line})
	                # sys.exit rather than the bare exit(): the latter is only
	                # injected by the site module and may be missing.
	                sys.exit(1)
	            if fields[2] == 'Cs':
	                # Surrogates are UTF-16 artefacts,
	                # not real characters. Ignore them.
	                fields_start = []
	                continue
	            if fields[1].endswith(', First>'):
	                fields_start = fields
	                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
	                fields_start[1] = fields_start[1].split(',')[0][1:]
	                continue
	            if fields[1].endswith(', Last>'):
	                fields[1] = fields[1].split(',')[0][1:]
	                if fields[1:] != fields_start[1:]:
	                    sys.stderr.write(
	                        'broken code point range in file "%(f)s": %(l)s\n' %{
	                            'f': filename, 'l': line})
	                    sys.exit(1)
	                for code_point in range(
	                        int(fields_start[0], 16),
	                        int(fields[0], 16)+1):
	                    fill_attribute(code_point, fields)
	                fields_start = []
	                continue
	            fill_attribute(int(fields[0], 16), fields)
	            fields_start = []

	def fill_derived_core_properties(filename):
	    '''Stores the entire contents of the DerivedCoreProperties.txt file
	    in the DERIVED_CORE_PROPERTIES dictionary.

	    Lines in DerivedCoreProperties.txt are either a code point range like
	    this:

	    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

	    or a single code point like this:

	    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

	    The property name is appended to the list stored for every code
	    point covered by the line.  Lines of any other shape are ignored.
	    '''
	    with open(filename, mode='r') as derived_core_properties_file:
	        for line in derived_core_properties_file:
	            # Any amount of white space (including none) may surround
	            # the semicolon, hence \s* on both sides.
	            match = re.match(
	                r'^(?P<codepoint1>[0-9A-F]{4,6})'
	                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
	                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
	                line)
	            if not match:
	                continue
	            start = match.group('codepoint1')
	            end = match.group('codepoint2')
	            if not end:
	                # Single code point line.
	                end = start
	            prop = match.group('property')
	            for code_point in range(int(start, 16), int(end, 16)+1):
	                if code_point in DERIVED_CORE_PROPERTIES:
	                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
	                else:
	                    DERIVED_CORE_PROPERTIES[code_point] = [prop]

	def fill_east_asian_widths(filename):
	    '''Stores the entire contents of the EastAsianWidths.txt file
	    in the EAST_ASIAN_WIDTHS dictionary.

	    Lines in EastAsianWidths.txt are either a code point range like
	    this:

	    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

	    or a single code point like this:

	    A015;W           # Lm         YI SYLLABLE WU

	    Every covered code point is mapped to the width property string.
	    Lines of any other shape are ignored.
	    '''
	    with open(filename, mode='r') as east_asian_widths_file:
	        for line in east_asian_widths_file:
	            # The data lines have no white space around the semicolon,
	            # so the separators must be optional (\s*).
	            match = re.match(
	                r'^(?P<codepoint1>[0-9A-F]{4,6})'
	                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
	                + r'\s*;\s*(?P<property>[a-zA-Z]+)',
	                line)
	            if not match:
	                continue
	            start = match.group('codepoint1')
	            end = match.group('codepoint2')
	            if not end:
	                # Single code point line.
	                end = start
	            for code_point in range(int(start, 16), int(end, 16)+1):
	                EAST_ASIAN_WIDTHS[code_point] = match.group('property')

	def to_upper(code_point):
	    '''Returns the code point of the uppercase version
	    of the given code point.

	    The code point itself is returned when its entry has an empty
	    name or no uppercase mapping.
	    '''
	    if (UNICODE_ATTRIBUTES[code_point]['name']
	            and UNICODE_ATTRIBUTES[code_point]['upper']):
	        return UNICODE_ATTRIBUTES[code_point]['upper']
	    else:
	        return code_point

	def to_lower(code_point):
	    '''Returns the code point of the lowercase version
	    of the given code point.

	    The code point itself is returned when its entry has an empty
	    name or no lowercase mapping.
	    '''
	    if (UNICODE_ATTRIBUTES[code_point]['name']
	            and UNICODE_ATTRIBUTES[code_point]['lower']):
	        return UNICODE_ATTRIBUTES[code_point]['lower']
	    else:
	        return code_point

	def to_upper_turkish(code_point):
	    '''Returns the code point of the Turkish uppercase version
	    of the given code point.

	    Same as to_upper() except that U+0069 maps to U+0130.
	    '''
	    if code_point == 0x0069:
	        return 0x0130
	    return to_upper(code_point)

	def to_lower_turkish(code_point):
	    '''Returns the code point of the Turkish lowercase version
	    of the given code point.

	    Same as to_lower() except that U+0049 maps to U+0131.
	    '''
	    if code_point == 0x0049:
	        return 0x0131
	    return to_lower(code_point)

	def to_title(code_point):
	    '''Returns the code point of the titlecase version
	    of the given code point.

	    The code point itself is returned when its entry has an empty
	    name or no titlecase mapping.
	    '''
	    if (UNICODE_ATTRIBUTES[code_point]['name']
	            and UNICODE_ATTRIBUTES[code_point]['title']):
	        return UNICODE_ATTRIBUTES[code_point]['title']
	    else:
	        return code_point

	def is_upper(code_point):
	    '''Checks whether the character with this code point is uppercase.

	    True when the code point has a lowercase mapping different from
	    itself or is listed as “Uppercase” in DERIVED_CORE_PROPERTIES.
	    '''
	    return (to_lower(code_point) != code_point
	            or (code_point in DERIVED_CORE_PROPERTIES
	                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))

	def is_lower(code_point):
	    '''Checks whether the character with this code point is lowercase.

	    True when the code point has an uppercase mapping different from
	    itself, is U+00DF, or is listed as “Lowercase” in
	    DERIVED_CORE_PROPERTIES.
	    '''
	    # Some characters are defined as “Lowercase” in
	    # DerivedCoreProperties.txt but do not have a mapping to upper
	    # case, so the property is consulted as well.
	    return (to_upper(code_point) != code_point
	            # <U00DF> is lowercase, but without simple to_upper mapping.
	            or code_point == 0x00DF
	            or (code_point in DERIVED_CORE_PROPERTIES
	                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))

	def is_alpha(code_point):
	    '''Checks whether the character with this code point is alphabetic.

	    True for code points listed as “Alphabetic” in
	    DERIVED_CORE_PROPERTIES and for decimal digits (category “Nd”)
	    outside the ASCII range U+0030..U+0039.
	    '''
	    return ((code_point in DERIVED_CORE_PROPERTIES
	             and
	             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
	            or
	            # Consider all the non-ASCII digits as alphabetic.
	            # ISO C 99 forbids us to have them in category “digit”,
	            # but we want iswalnum to return true on them.
	            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
	             and not (code_point >= 0x0030 and code_point <= 0x0039)))

	def is_digit(code_point):
	    '''Checks whether the character with this code point is a digit.

	    Returns True exactly for the ASCII digits U+0030..U+0039.
	    '''
	    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
	    # takes it away:
	    # 7.25.2.1.5:
	    #    The iswdigit function tests for any wide character that
	    #    corresponds to a decimal-digit character (as defined in 5.2.1).
	    # 5.2.1:
	    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
	    #
	    # The alternative (every named character of category 'Nd') is
	    # therefore not used.  Note that with it, U+0BE7..U+0BEF and
	    # U+1369..U+1371 are digit systems without a zero; <0> would have
	    # to be added in front of them by hand.
	    return (code_point >= 0x0030 and code_point <= 0x0039)

	def is_outdigit(code_point):
	    '''Tell whether the character with this code point is an outdigit.

	    Only the code points U+0030..U+0039 qualify.
	    '''
	    first_digit, last_digit = 0x0030, 0x0039
	    return first_digit <= code_point <= last_digit

	def is_blank(code_point):
	    '''Tell whether the character with this code point is blank.

	    Blank characters are the tab (U+0009) and every named character of
	    category Zs whose decomposition does not mention '<noBreak>'.
	    '''
	    if code_point == 0x0009:  # '\t'
	        return True
	    attributes = UNICODE_ATTRIBUTES[code_point]
	    name = attributes['name']
	    if not name:
	        return name
	    # Category Zs without mention of '<noBreak>'
	    if attributes['category'] != 'Zs':
	        return False
	    return '<noBreak>' not in attributes['decomposition']

	def is_space(code_point):
	    '''Tell whether the character with this code point is a space.

	    Space characters are the six ASCII whitespace characters plus every
	    named character of category Zl or Zp, or of category Zs when its
	    decomposition does not mention '<noBreak>'.
	    '''
	    # Don’t make U+00A0 a space. Non-breaking space means that all programs
	    # should treat it like a punctuation character, not like a space.
	    ascii_whitespace = (
	        0x0020,  # ' '
	        0x000C,  # '\f'
	        0x000A,  # '\n'
	        0x000D,  # '\r'
	        0x0009,  # '\t'
	        0x000B,  # '\v'
	    )
	    if code_point in ascii_whitespace:
	        return True
	    attributes = UNICODE_ATTRIBUTES[code_point]
	    name = attributes['name']
	    if not name:
	        return name
	    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
	    category = attributes['category']
	    if category in ('Zl', 'Zp'):
	        return True
	    if category != 'Zs':
	        return False
	    return '<noBreak>' not in attributes['decomposition']

	def is_cntrl(code_point):
	    '''Tell whether the character with this code point is a control
	    character.

	    A named character is a control character when its name is
	    '<control>' or its category is Zl or Zp.
	    '''
	    attributes = UNICODE_ATTRIBUTES[code_point]
	    name = attributes['name']
	    if not name:
	        return name
	    if name == '<control>':
	        return True
	    return attributes['category'] in ('Zl', 'Zp')

	def is_xdigit(code_point):
	    '''Checks whether the character with this code point is
	    a hexadecimal digit

	    Only the ASCII characters 0-9, A-F and a-f qualify.

	    code_point: integer code point to classify.
	    Returns True or False.
	    '''
	    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
	    # takes it away:
	    # 7.25.2.1.12:
	    #    The iswxdigit function tests for any wide character that
	    #    corresponds to a hexadecimal-digit character (as defined
	    #    in 6.4.4.1).
	    # 6.4.4.1:
	    #    hexadecimal-digit: one of
	    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
	    return (0x0030 <= code_point <= 0x0039
	            or 0x0041 <= code_point <= 0x0046
	            or 0x0061 <= code_point <= 0x0066)

	def is_graph(code_point):
	    '''Tell whether the character with this code point is a graphical
	    character.

	    A named character is graphical when its name is not '<control>'
	    and is_space() rejects it.
	    '''
	    name = UNICODE_ATTRIBUTES[code_point]['name']
	    if not name:
	        return name
	    if name == '<control>':
	        return False
	    return not is_space(code_point)

	def is_print(code_point):
	    '''Tell whether the character with this code point is printable.

	    A named character is printable when its name is not '<control>'
	    and its category is neither Zl nor Zp.
	    '''
	    attributes = UNICODE_ATTRIBUTES[code_point]
	    name = attributes['name']
	    if not name:
	        return name
	    if name == '<control>':
	        return False
	    return attributes['category'] not in ('Zl', 'Zp')

	def is_punct(code_point):
	    '''Checks whether the character with this code point is punctuation

	    code_point: integer code point to classify.
	    Returns a true value when is_graph() accepts the code point while
	    is_alpha() and is_digit() both reject it, a false value otherwise.
	    '''
	    # The traditional POSIX definition of punctuation is every graphic,
	    # non-alphanumeric character.  (The alternative, not used, would be
	    # every named character whose general category starts with 'P'.)
	    return (is_graph(code_point)
	            and not is_alpha(code_point)
	            and not is_digit(code_point))

	def is_combining(code_point):
	    '''Tell whether the character with this code point is a combining
	    character.

	    A named character is combining when its general category is one of
	    Mn, Mc or Me.
	    '''
	    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
	    # file. In 3.0.1 it was identical to the union of the general categories
	    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
	    # PropList.txt file, so we take the latter definition.
	    attributes = UNICODE_ATTRIBUTES[code_point]
	    name = attributes['name']
	    if not name:
	        return name
	    return attributes['category'] in ('Mn', 'Mc', 'Me')

	def is_combining_level3(code_point):
	    '''Tell whether the character with this code point is a combining
	    level3 character.

	    That is the case when is_combining() accepts it and its
	    'combining' attribute, read as an integer, lies in 0..199.
	    '''
	    combining = is_combining(code_point)
	    if not combining:
	        return combining
	    combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
	    return 0 <= combining_class < 200

	def ucs_symbol(code_point):
	    '''Build the UCS symbol string for a Unicode character.

	    Code points below 0x10000 are rendered with four uppercase hex
	    digits (<U0041>), all others with eight (<U00010000>).
	    '''
	    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
	    return template.format(code_point)

	def ucs_symbol_range(code_point_low, code_point_high):
	    '''Build the UCS symbol string for a range of code points.

	    The symbols of both end points are joined by two dots, e.g.:

	    <U0041>..<U005A>
	    '''
	    end_points = (ucs_symbol(code_point_low), ucs_symbol(code_point_high))
	    return '..'.join(end_points)

	def verifications():
	    '''Tests whether the is_* functions observe the known restrictions

	    Walks every code point in UNICODE_ATTRIBUTES in ascending order and
	    writes one diagnostic line to stderr for each restriction on the
	    character classes that the code point violates.  Returns None.
	    '''
	    # Pairs of classes that must not share a character, in the order the
	    # diagnostics are emitted.
	    #
	    # alpha restriction: “No character specified for the keywords cntrl,
	    # digit, punct or space shall be specified.”
	    #
	    # space restriction: “No character specified for the keywords upper,
	    # lower, alpha, digit, graph or xdigit shall be specified.”
	    # upper, lower, alpha are covered by the alpha checks.
	    #
	    # cntrl restriction: “No character specified for the keywords upper,
	    # lower, alpha, digit, punct, graph, print or xdigit shall be
	    # specified.”  upper, lower, alpha are covered by the alpha checks.
	    #
	    # punct restriction: “No character specified for the keywords upper,
	    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
	    # be specified.”  upper, lower, alpha, cntrl are covered above; the
	    # <space> character is checked separately below.
	    #
	    # graph and print restrictions: “No character specified for the
	    # keyword cntrl shall be specified.”  Covered by the cntrl pairs.
	    exclusive_classes = (
	        ('alpha', 'cntrl'),
	        ('alpha', 'digit'),
	        ('alpha', 'punct'),
	        ('alpha', 'space'),
	        ('space', 'digit'),
	        ('space', 'graph'),
	        ('space', 'xdigit'),
	        ('cntrl', 'digit'),
	        ('cntrl', 'punct'),
	        ('cntrl', 'graph'),
	        ('cntrl', 'print'),
	        ('cntrl', 'xdigit'),
	        ('punct', 'digit'),
	        ('punct', 'xdigit'),
	    )
	    for code_point in sorted(UNICODE_ATTRIBUTES):
	        sym = ucs_symbol(code_point)
	        cased = is_lower(code_point) or is_upper(code_point)
	        # Evaluate every class predicate once per code point.
	        member = {
	            'alpha': is_alpha(code_point),
	            'cntrl': is_cntrl(code_point),
	            'digit': is_digit(code_point),
	            'graph': is_graph(code_point),
	            'print': is_print(code_point),
	            'punct': is_punct(code_point),
	            'space': is_space(code_point),
	            'xdigit': is_xdigit(code_point),
	        }
	        # toupper/tolower restriction: "Only characters specified for the
	        # keywords lower and upper shall be specified."
	        # The messages contain a literal backslash before '|'; it is
	        # written as '\\|' because '\|' is not a valid escape sequence.
	        for func, mapped in (('toupper', to_upper(code_point)),
	                             ('tolower', to_lower(code_point))):
	            if mapped != code_point and not cased:
	                sys.stderr.write(
	                    ('%(sym)s is not upper\\|lower '
	                     'but %(func)s(0x%(c)04X) = 0x%(uc)04X\n') % {
	                         'sym': sym,
	                         'func': func,
	                         'c': code_point,
	                         'uc': mapped})
	        # alpha restriction: "Characters classified as either upper or lower
	        # shall automatically belong to this class."
	        if cased and not member['alpha']:
	            sys.stderr.write('%(sym)s is upper\\|lower but not alpha\n' % {
	                'sym': sym})
	        for first, second in exclusive_classes:
	            if member[first] and member[second]:
	                sys.stderr.write('%(sym)s is %(a)s and %(b)s\n' % {
	                    'sym': sym, 'a': first, 'b': second})
	        # punct restriction, <space> character part.
	        if member['punct'] and code_point == 0x0020:
	            sys.stderr.write('%(sym)s is punct\n' % {'sym': sym})
	        # graph - print relation: differ only in the <space> character.
	        # How is this possible if there are more than one space character?!
	        # I think susv2/xbd/locale.html should speak of “space characters”,
	        # not “space character”.
	        # ucs_symbol() lives in this module, so its result is used
	        # directly; no 'unicode_utils.' qualifier is in scope here.
	        if (member['print']
	                and not (member['graph'] or member['space'])):
	            sys.stderr.write(
	                '%(sym)s is print but not graph\\|<space>\n' % {'sym': sym})
	        if (not member['print']
	                and (member['graph'] or code_point == 0x0020)):
	            sys.stderr.write(
	                '%(sym)s is graph\\|<space> but not print\n' % {'sym': sym})