| # chartables.pl - A perl program to generate tables for use by the |
| # Character class. |
| |
| # Copyright (C) 1998, 1999 Red Hat, Inc. |
| # |
| # This file is part of libjava. |
| # |
| # This software is copyrighted work licensed under the terms of the |
| # Libjava License. Please consult the file "LIBJAVA_LICENSE" for |
| # details. |
| |
| # This program requires a `unidata.txt' file of the form distributed |
| # on the Unicode 2.0 CD ROM. Or, get it more conveniently here: |
| # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt |
| # Version `2.1.8' of this file was last used to update the Character class. |
| |
| # Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 |
| # "The Java Language Specification", ISBN 0-201-63451-1 |
| # plus online API docs for JDK 1.2 beta from http://www.javasoft.com. |
| |
| # Usage: perl chartables.pl [-n] UnicodeData-VERSION.txt |
| # If this exits with nonzero status, then you must investigate the |
| # cause of the problem. |
| # Diagnostics and other information to stderr. |
| # This creates the new include/java-chartables.h and |
| # include/java-chardecomp.h files directly. |
| # With -n, the files are not created, but all processing |
| # still occurs. |
| |
| # Fields in the table. |
| $CODE = 0; |
| $NAME = 1; |
| $CATEGORY = 2; |
| $DECOMPOSITION = 5; |
| $DECIMAL = 6; |
| $DIGIT = 7; |
| $NUMERIC = 8; |
| $UPPERCASE = 12; |
| $LOWERCASE = 13; |
| $TITLECASE = 14; |
| |
| # A special case. |
| $TAMIL_DIGIT_ONE = 0x0be7; |
| $TAMIL_DIGIT_NINE = 0x0bef; |
| |
| # These are endpoints of legitimate gaps in the tables. |
| $CJK_IDEOGRAPH_END = 0x9fa5; |
| $HANGUL_END = 0xd7a3; |
| $HIGH_SURROGATE_END = 0xdb7f; |
| $PRIVATE_HIGH_SURROGATE_END = 0xdbff; |
| $LOW_SURROGATE_END = 0xdfff; |
| $PRIVATE_END = 0xf8ff; |
| |
| %title_to_upper = (); |
| %title_to_lower = (); |
| %numerics = (); |
| %name = (); |
| |
| @digit_start = (); |
| @digit_end = (); |
| |
| @space_start = (); |
| @space_end = (); |
| |
| # @letter_start = (); |
| # @letter_end = (); |
| |
| @all_start = (); |
| @all_end = (); |
| @all_cats = (); |
| |
| @upper_start = (); |
| @upper_end = (); |
| @upper_map = (); |
| %upper_anom = (); |
| |
| @lower_start = (); |
| @lower_end = (); |
| @lower_map = (); |
| %lower_anom = (); |
| |
| @attributes = (); |
| |
| # There are a few characters which actually need two attributes. |
| # These are special-cased. |
| $ROMAN_START = 0x2160; |
| $ROMAN_END = 0x217f; |
| %second_attributes = (); |
| |
| $prevcode = -1; |
| $status = 0; |
| |
| %category_map = |
| ( |
| 'Mn' => 'NON_SPACING_MARK', |
| 'Mc' => 'COMBINING_SPACING_MARK', |
| 'Me' => 'ENCLOSING_MARK', |
| 'Nd' => 'DECIMAL_DIGIT_NUMBER', |
| 'Nl' => 'LETTER_NUMBER', |
| 'No' => 'OTHER_NUMBER', |
| 'Zs' => 'SPACE_SEPARATOR', |
| 'Zl' => 'LINE_SEPARATOR', |
| 'Zp' => 'PARAGRAPH_SEPARATOR', |
| 'Cc' => 'CONTROL', |
| 'Cf' => 'FORMAT', |
| 'Cs' => 'SURROGATE', |
| 'Co' => 'PRIVATE_USE', |
| 'Cn' => 'UNASSIGNED', |
| 'Lu' => 'UPPERCASE_LETTER', |
| 'Ll' => 'LOWERCASE_LETTER', |
| 'Lt' => 'TITLECASE_LETTER', |
| 'Lm' => 'MODIFIER_LETTER', |
| 'Lo' => 'OTHER_LETTER', |
| 'Pc' => 'CONNECTOR_PUNCTUATION', |
| 'Pd' => 'DASH_PUNCTUATION', |
| 'Ps' => 'START_PUNCTUATION', |
| 'Pe' => 'END_PUNCTUATION', |
| 'Pi' => 'START_PUNCTUATION', |
| 'Pf' => 'END_PUNCTUATION', |
| 'Po' => 'OTHER_PUNCTUATION', |
| 'Sm' => 'MATH_SYMBOL', |
| 'Sc' => 'CURRENCY_SYMBOL', |
| 'Sk' => 'MODIFIER_SYMBOL', |
| 'So' => 'OTHER_SYMBOL' |
| ); |
| |
| # These maps characters to their decompositions. |
| %canonical_decomposition = (); |
| %full_decomposition = (); |
| |
| |
| # Handle `-n' and open output files. |
| local ($f1, $f2) = ('include/java-chartables.h', |
| 'include/java-chardecomp.h'); |
| if ($ARGV[0] eq '-n') |
| { |
| shift @ARGV; |
| $f1 = '/dev/null'; |
| $f2 = '/dev/null'; |
| } |
| |
| open (CHARTABLE, "> $f1"); |
| open (DECOMP, "> $f2"); |
| |
| # Process the Unicode file. |
| while (<>) |
| { |
| chop; |
| # Specify a limit for split so that we pick up trailing fields. |
| # We make the limit larger than we need, to catch the case where |
| # there are extra fields. |
| @fields = split (';', $_, 30); |
| # Convert code to number. |
| $ncode = hex ($fields[$CODE]); |
| |
| if ($#fields != 14) |
| { |
| print STDERR ("Entry for \\u", $fields[$CODE], |
| " has wrong number of fields: ", $#fields, "\n"); |
| } |
| |
| $name{$fields[$CODE]} = $fields[$NAME]; |
| |
| # If we've found a gap in the table, fill it in. |
| if ($ncode != $prevcode + 1) |
| { |
| &process_gap (*fields, $prevcode, $ncode); |
| } |
| |
| &process_char (*fields, $ncode); |
| |
| $prevcode = $ncode; |
| } |
| |
| if ($prevcode != 0xffff) |
| { |
| # Setting of `fields' parameter doesn't matter here. |
| &process_gap (*fields, $prevcode, 0x10000); |
| } |
| |
| print CHARTABLE "// java-chartables.h - Character tables for java.lang.Character -*- c++ -*-\n\n"; |
| print CHARTABLE "#ifndef __JAVA_CHARTABLES_H__\n"; |
| print CHARTABLE "#define __JAVA_CHARTABLES_H__\n\n"; |
| print CHARTABLE "// These tables are automatically generated by the chartables.pl\n"; |
| print CHARTABLE "// script. DO NOT EDIT the tables. Instead, fix the script\n"; |
| print CHARTABLE "// and run it again.\n\n"; |
| print CHARTABLE "// This file should only be included by natCharacter.cc\n\n"; |
| |
| |
| $bytes = 0; |
| |
| # Titlecase mapping tables. |
| if ($#title_to_lower != $#title_to_upper) |
| { |
| # If this fails we need to reimplement toTitleCase. |
| print STDERR "titlecase mappings have different sizes\n"; |
| $status = 1; |
| } |
| # Also ensure that the tables are entirely parallel. |
| foreach $key (sort keys %title_to_lower) |
| { |
| if (! defined $title_to_upper{$key}) |
| { |
| print STDERR "titlecase mappings have different entries\n"; |
| $status = 1; |
| } |
| } |
| &print_single_map ("title_to_lower_table", %title_to_lower); |
| &print_single_map ("title_to_upper_table", %title_to_upper); |
| |
| print CHARTABLE "#ifdef COMPACT_CHARACTER\n\n"; |
| |
| printf CHARTABLE "#define TAMIL_DIGIT_ONE 0x%04x\n\n", $TAMIL_DIGIT_ONE; |
| |
| # All numeric values. |
| &print_numerics; |
| |
| # Digits only. |
| &print_block ("digit_table", *digit_start, *digit_end); |
| |
| # Space characters. |
| &print_block ("space_table", *space_start, *space_end); |
| |
| # Letters. We used to generate a separate letter table. But this |
| # doesn't really seem worthwhile. Simply using `all_table' saves us |
| # about 800 bytes, and only adds 3 table probes to isLetter. |
| # &print_block ("letter_table", *letter_start, *letter_end); |
| |
| # Case tables. |
| &print_case_table ("upper", *upper_start, *upper_end, *upper_map, *upper_anom); |
| &print_case_table ("lower", *lower_start, *lower_end, *lower_map, *lower_anom); |
| |
| # Everything else. |
| &print_all_block (*all_start, *all_end, *all_cats); |
| |
| print CHARTABLE "#else /* COMPACT_CHARACTER */\n\n"; |
| |
| printf CHARTABLE "#define ROMAN_START 0x%04x\n", $ROMAN_START; |
| printf CHARTABLE "#define ROMAN_END 0x%04x\n\n", $ROMAN_END; |
| |
| &print_fast_tables (*all_start, *all_end, *all_cats, |
| *attributes, *second_attributes); |
| |
| print CHARTABLE "#endif /* COMPACT_CHARACTER */\n\n"; |
| |
| print CHARTABLE "#endif /* __JAVA_CHARTABLES_H__ */\n"; |
| |
| printf STDERR "Approximately %d bytes of data generated (compact case)\n", |
| $bytes; |
| |
| |
| # Now generate decomposition tables. |
| printf DECOMP "// java-chardecomp.h - Decomposition character tables -*- c++ -*-\n\n"; |
| printf DECOMP "#ifndef __JAVA_CHARDECOMP_H__\n"; |
| printf DECOMP "#define __JAVA_CHARDECOMP_H__\n\n"; |
| print DECOMP "// These tables are automatically generated by the chartables.pl\n"; |
| print DECOMP "// script. DO NOT EDIT the tables. Instead, fix the script\n"; |
| print DECOMP "// and run it again.\n\n"; |
| print DECOMP "// This file should only be included by natCollator.cc\n\n"; |
| |
| print DECOMP "struct decomp_entry\n{\n"; |
| print DECOMP " jchar key;\n"; |
| print DECOMP " const char *value;\n"; |
| print DECOMP "};\n\n"; |
| |
| &write_decompositions; |
| |
| printf DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n"; |
| |
| |
| close (CHARTABLE); |
| close (DECOMP); |
| |
| exit $status; |
| |
| |
| # Process a gap in the space. |
| sub process_gap |
| { |
| local (*fields, $prevcode, $ncode) = @_; |
| local (@gap_fields, $i); |
| |
| if ($ncode == $CJK_IDEOGRAPH_END |
| || $ncode == $HANGUL_END |
| || $ncode == $HIGH_SURROGATE_END |
| || $ncode == $PRIVATE_HIGH_SURROGATE_END |
| || $ncode == $LOW_SURROGATE_END |
| || $ncode == $PRIVATE_END) |
| { |
| # The characters in the gap we just found are known to |
| # have the same properties as the character at the end of |
| # the gap. |
| @gap_fields = @fields; |
| } |
| else |
| { |
| # This prints too much to be enabled. |
| # print STDERR "Gap found at \\u", $fields[$CODE], "\n"; |
| @gap_fields = ('', '', 'Cn', '', '', '', '', '', '', '', '', |
| '', '', '', ''); |
| } |
| |
| for ($i = $prevcode + 1; $i < $ncode; ++$i) |
| { |
| $gap_fields[$CODE] = sprintf ("%04x", $i); |
| $gap_fields[$NAME] = "CHARACTER " . $gap_fields[$CODE]; |
| &process_char (*gap_fields, $i); |
| } |
| } |
| |
| # Process a single character. |
| sub process_char |
| { |
| local (*fields, $ncode) = @_; |
| |
| if ($fields[$DECOMPOSITION] ne '') |
| { |
| &add_decomposition ($ncode, $fields[$DECOMPOSITION]); |
| } |
| |
| # If this is a titlecase character, mark it. |
| if ($fields[$CATEGORY] eq 'Lt') |
| { |
| $title_to_upper{$fields[$CODE]} = $fields[$UPPERCASE]; |
| $title_to_lower{$fields[$CODE]} = $fields[$LOWERCASE]; |
| } |
| else |
| { |
| # For upper and lower case mappings, we try to build compact |
| # tables that map range onto range. We specifically want to |
| # avoid titlecase characters. Java specifies a range check to |
| # make sure the character is not between 0x2000 and 0x2fff. |
| # We avoid that here because we need to generate table entries |
| # -- toLower and toUpper still work in that range. |
| if ($fields[$UPPERCASE] eq '' |
| && ($fields[$LOWERCASE] ne '' |
| || $fields[$NAME] =~ /CAPITAL (LETTER|LIGATURE)/)) |
| { |
| if ($fields[$LOWERCASE] ne '') |
| { |
| &update_case_block (*upper_start, *upper_end, *upper_map, |
| $fields[$CODE], $fields[$LOWERCASE]); |
| &set_attribute ($ncode, hex ($fields[$LOWERCASE])); |
| } |
| else |
| { |
| $upper_anom{$fields[$CODE]} = 1; |
| } |
| } |
| elsif ($fields[$LOWERCASE] ne '') |
| { |
| print STDERR ("Java missed upper case char \\u", |
| $fields[$CODE], "\n"); |
| } |
| elsif ($fields[$CATEGORY] eq 'Lu') |
| { |
| # This case is for letters which are marked as upper case |
| # but for which there is no lower case equivalent. For |
| # instance, LATIN LETTER YR. |
| } |
| |
| if ($fields[$LOWERCASE] eq '' |
| && ($fields[$UPPERCASE] ne '' |
| || $fields[$NAME] =~ /SMALL (LETTER|LIGATURE)/)) |
| { |
| if ($fields[$UPPERCASE] ne '') |
| { |
| &update_case_block (*lower_start, *lower_end, *lower_map, |
| $fields[$CODE], $fields[$UPPERCASE]); |
| &set_attribute ($ncode, hex ($fields[$UPPERCASE])); |
| } |
| else |
| { |
| $lower_anom{$fields[$CODE]} = 1; |
| } |
| } |
| elsif ($fields[$UPPERCASE] ne '') |
| { |
| print STDERR ("Java missed lower case char \\u", |
| $fields[$CODE], "\n"); |
| } |
| elsif ($fields[$CATEGORY] eq 'Ll') |
| { |
| # This case is for letters which are marked as lower case |
| # but for which there is no upper case equivalent. For |
| # instance, FEMININE ORDINAL INDICATOR. |
| } |
| } |
| |
| |
| # If we have a non-decimal numeric value, add it to the list. |
| if ($fields[$CATEGORY] eq 'Nd' |
| && ($ncode < 0x2000 || $ncode > 0x2fff) |
| && $fields[$NAME] =~ /DIGIT/) |
| { |
| # This is a digit character that is handled elsewhere. |
| } |
| elsif ($fields[$DIGIT] ne '' || $fields[$NUMERIC] ne '') |
| { |
| # Do a simple check. |
| if ($fields[$DECIMAL] ne '') |
| { |
| # This catches bugs in an earlier implementation of |
| # chartables.pl. Now it is here for historical interest |
| # only. |
| # print STDERR ("Character \u", $fields[$CODE], |
| # " would have been missed as digit\n"); |
| } |
| |
| local ($val) = $fields[$DIGIT]; |
| $val = $fields[$NUMERIC] if $val eq ''; |
| local ($ok) = 1; |
| |
| # If we have a value which is not a positive integer, then we |
| # set the value to -2 to make life easier for |
| # Character.getNumericValue. |
| if ($val !~ m/^[0-9]+$/) |
| { |
| if ($fields[$CATEGORY] ne 'Nl' |
| && $fields[$CATEGORY] ne 'No') |
| { |
| # This shows a few errors in the Unicode table. These |
| # characters have a missing Numeric field, and the `N' |
| # for the mirrored field shows up there instead. I |
| # reported these characters to errata@unicode.org on |
| # Thu Sep 10 1998. They said it will be fixed in the |
| # 2.1.6 release of the tables. |
| print STDERR ("Character \u", $fields[$CODE], |
| " has value but is not numeric; val = '", |
| $val, "'\n"); |
| # We skip these. |
| $ok = 0; |
| } |
| $val = "-2"; |
| } |
| |
| if ($ok) |
| { |
| $numerics{$fields[$CODE]} = $val; |
| &set_attribute ($ncode, $val); |
| } |
| } |
| |
| # We build a table that lists ranges of ordinary decimal values. |
| # At each step we make sure that the digits are in the correct |
| # order, with no holes, as this is assumed by Character. If this |
| # fails, reimplementation is required. This implementation |
| # dovetails nicely with the Java Spec, which has strange rules for |
| # what constitutes a decimal value. In particular the Unicode |
| # name must contain the word `DIGIT'. The spec doesn't directly |
| # say that digits must have type `Nd' (or that their value must an |
| # integer), but that can be inferred from the list of digits in |
| # the book(s). Currently the only Unicode characters whose name |
| # includes `DIGIT' which would not fit are the Tibetan "half" |
| # digits. |
| if ($fields[$CATEGORY] eq 'Nd') |
| { |
| if (($ncode < 0x2000 || $ncode > 0x2fff) |
| && $fields[$NAME] =~ /DIGIT/) |
| { |
| &update_digit_block (*digit_start, *digit_end, $fields[$CODE], |
| $fields[$DECIMAL]); |
| &set_attribute ($ncode, $fields[$DECIMAL]); |
| } |
| else |
| { |
| # If this fails then Character.getType will fail. We |
| # assume that things in `digit_table' are the only |
| # category `Nd' characters. |
| print STDERR ("Character \u", $fields[$CODE], |
| " is class Nd but not in digit table\n"); |
| $status = 1; |
| } |
| } |
| |
| # Keep track of space characters. |
| if ($fields[$CATEGORY] =~ /Z[slp]/) |
| { |
| &update_block (*space_start, *space_end, $fields[$CODE]); |
| } |
| |
| # Keep track of letters. |
| # if ($fields[$CATEGORY] =~ /L[ultmo]/) |
| # { |
| # &update_letter_block (*letter_start, *letter_end, $fields[$CODE], |
| # $fields[$CATEGORY]); |
| # } |
| |
| # Keep track of all characters. You might think we wouldn't have |
| # to do this for uppercase letters, or other characters we already |
| # "classify". The problem is that this classification is |
| # different. E.g., \u216f is uppercase by Java rules, but is a |
| # LETTER_NUMBER here. |
| &update_all_block (*all_start, *all_end, *all_cats, |
| $fields[$CODE], $fields[$CATEGORY]); |
| } |
| |
| |
| # Called to add a new decomposition. |
| sub add_decomposition |
| { |
| local ($ncode, $value) = @_; |
| local ($is_full) = 0; |
| local ($first) = 1; |
| local (@decomp) = (); |
| |
| foreach (split (' ', $value)) |
| { |
| if ($first && /^\<.*\>$/) |
| { |
| $is_full = 1; |
| } |
| else |
| { |
| push (@decomp, hex ($_)); |
| } |
| $first = 0; |
| } |
| |
| # We pack the value into a string because this means we can stick |
| # with Perl 4 features. |
| local ($s) = pack "I*", @decomp; |
| if ($is_full) |
| { |
| $full_decomposition{$ncode} = $s; |
| } |
| else |
| { |
| $canonical_decomposition{$ncode} = $s; |
| } |
| } |
| |
| # Write a single decomposition table. |
| sub write_single_decomposition |
| { |
| local ($name, $is_canon, %table) = @_; |
| |
| printf DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n"; |
| |
| local ($key, @expansion, $char); |
| local ($first_line) = 1; |
| |
| for ($key = 0; $key <= 65535; ++$key) |
| { |
| next if ! defined $table{$key}; |
| |
| printf DECOMP ",\n" |
| unless $first_line; |
| $first_line = 0; |
| |
| printf DECOMP " { 0x%04x, \"", $key; |
| |
| # We represent the expansion as a series of bytes, terminated |
| # with a double nul. This is ugly, but relatively |
| # space-efficient. Most expansions are short, but there are a |
| # few that are very long (e.g. \uFDFA). This means that if we |
| # chose a fixed-space representation we would waste a lot of |
| # space. |
| @expansion = unpack "I*", $table{$key}; |
| foreach $char (@expansion) |
| { |
| printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256); |
| } |
| |
| printf DECOMP "\" }"; |
| } |
| |
| printf DECOMP "\n};\n\n"; |
| } |
| |
| sub write_decompositions |
| { |
| &write_single_decomposition ('canonical', 1, %canonical_decomposition); |
| &write_single_decomposition ('full', 0, %full_decomposition); |
| } |
| |
| # We represent a block of characters with a pair of lists. This |
| # function updates the pair to account for the new character. Returns |
| # 1 if we added to the old block, 0 otherwise. |
| sub update_block |
| { |
| local (*start, *end, $char) = @_; |
| |
| local ($nchar) = hex ($char); |
| local ($count) = $#end; |
| if ($count >= 0 && $end[$count] == $nchar - 1) |
| { |
| ++$end[$count]; |
| return 1; |
| } |
| else |
| { |
| ++$count; |
| $start[$count] = $nchar; |
| $end[$count] = $nchar; |
| } |
| return 0; |
| } |
| |
| # Return true if we will be appending this character to the end of the |
| # existing block. |
| sub block_append_p |
| { |
| local (*end, $char) = @_; |
| return $#end >= 0 && $end[$#end] == $char - 1; |
| } |
| |
| # This updates the digit block. This table is much like an ordinary |
| # block, but it has an extra constraint. |
| sub update_digit_block |
| { |
| local (*start, *end, $char, $value) = @_; |
| |
| &update_block ($start, $end, $char); |
| local ($nchar) = hex ($char); |
| |
| # We want to make sure that the new digit's value is correct for |
| # its place in the block. However, we special-case Tamil digits, |
| # since Tamil does not have a digit `0'. |
| local ($count) = $#start; |
| if (($nchar < $TAMIL_DIGIT_ONE || $nchar > $TAMIL_DIGIT_NINE) |
| && $nchar - $start[$count] != $value) |
| { |
| # If this fails then Character.digit_value will be wrong. |
| print STDERR "Character \\u", $char, " violates digit constraint\n"; |
| $status = 1; |
| } |
| } |
| |
| # Update letter table. We could be smart about avoiding upper or |
| # lower case letters, but it is much simpler to just track them all. |
| sub update_letter_block |
| { |
| local (*start, *end, $char, $category) = @_; |
| |
| &update_block (*start, *end, $char); |
| } |
| |
| # Update `all' table. This table holds all the characters we don't |
| # already categorize for other reasons. FIXME: if a given type has |
| # very few characters, we should just inline the code. E.g., there is |
| # only one paragraph separator. |
| sub update_all_block |
| { |
| local (*start, *end, *cats, $char, $category) = @_; |
| |
| local ($nchar) = hex ($char); |
| local ($count) = $#end; |
| if ($count >= 0 |
| && $end[$count] == $nchar - 1 |
| && $cats[$count] eq $category) |
| { |
| ++$end[$count]; |
| } |
| else |
| { |
| ++$count; |
| $start[$count] = $nchar; |
| $end[$count] = $nchar; |
| $cats[$count] = $category; |
| } |
| } |
| |
| # Update a case table. We handle case tables specially because we |
| # want to map (e.g.) a block of uppercase characters directly onto the |
| # corresponding block of lowercase characters. Therefore we generate |
| # a new entry when the block would no longer map directly. |
| sub update_case_block |
| { |
| local (*start, *end, *map, $char, $mapchar) = @_; |
| |
| local ($nchar) = hex ($char); |
| local ($nmap) = hex ($mapchar); |
| |
| local ($count) = $#end; |
| if ($count >= 0 |
| && $end[$count] == $nchar - 1 |
| && $nchar - $start[$count] == $nmap - $map[$count]) |
| { |
| ++$end[$count]; |
| } |
| else |
| { |
| ++$count; |
| $start[$count] = $nchar; |
| $end[$count] = $nchar; |
| $map[$count] = $nmap; |
| } |
| } |
| |
| # Set the attribute value for the character. Each character can have |
| # only one attribute. |
| sub set_attribute |
| { |
| local ($ncode, $attr) = @_; |
| |
| if ($attributes{$ncode} ne '' && $attributes{$ncode} ne $attr) |
| { |
| if ($ncode >= $ROMAN_START && $ncode <= $ROMAN_END) |
| { |
| $second_attributes{$ncode} = $attr; |
| } |
| else |
| { |
| printf STDERR "character \\u%04x already has attribute\n", $ncode; |
| } |
| } |
| # Attributes can be interpreted as unsigned in some situations, |
| # so we check against 65535. This could cause errors -- we need |
| # to check the interpretation here. |
| elsif ($attr < -32768 || $attr > 65535) |
| { |
| printf STDERR "attribute out of range for character \\u%04x\n", $ncode; |
| } |
| else |
| { |
| $attributes{$ncode} = $attr; |
| } |
| } |
| |
| |
| # Print a block table. |
| sub print_block |
| { |
| local ($title, *start, *end) = @_; |
| |
| print CHARTABLE "static const jchar ", $title, "[][2] =\n"; |
| print CHARTABLE " {\n"; |
| |
| local ($i) = 0; |
| while ($i <= $#start) |
| { |
| print CHARTABLE " { "; |
| &print_char ($start[$i]); |
| print CHARTABLE ", "; |
| &print_char ($end[$i]); |
| print CHARTABLE " }"; |
| print CHARTABLE "," if ($i != $#start); |
| print CHARTABLE "\n"; |
| ++$i; |
| $bytes += 4; # Two bytes per char. |
| } |
| |
| print CHARTABLE " };\n\n"; |
| } |
| |
| # Print the numerics table. |
| sub print_numerics |
| { |
| local ($i, $key, $count, @keys); |
| |
| $i = 0; |
| @keys = sort keys %numerics; |
| $count = @keys; |
| |
| print CHARTABLE "static const jchar numeric_table[] =\n"; |
| print CHARTABLE " { "; |
| foreach $key (@keys) |
| { |
| &print_char (hex ($key)); |
| ++$i; |
| print CHARTABLE ", " if $i < $count; |
| # Print 5 per line. |
| print CHARTABLE "\n " if ($i % 5 == 0); |
| $bytes += 2; # One character. |
| } |
| print CHARTABLE " };\n\n"; |
| |
| print CHARTABLE "static const jshort numeric_value[] =\n"; |
| print CHARTABLE " { "; |
| $i = 0; |
| foreach $key (@keys) |
| { |
| print CHARTABLE $numerics{$key}; |
| if ($numerics{$key} > 32767 || $numerics{$key} < -32768) |
| { |
| # This means our generated type info is incorrect. We |
| # could just detect and work around this here, but I'm |
| # lazy. |
| print STDERR "numeric value won't fit in a short\n"; |
| $status = 1; |
| } |
| ++$i; |
| print CHARTABLE ", " if $i < $count; |
| # Print 10 per line. |
| print CHARTABLE "\n " if ($i % 10 == 0); |
| $bytes += 2; # One short. |
| } |
| print CHARTABLE " };\n\n"; |
| } |
| |
| # Print a table that maps one single letter onto another. It assumes |
| # the map is index by char code. |
| sub print_single_map |
| { |
| local ($title, %map) = @_; |
| |
| local (@keys) = sort keys %map; |
| $num = @keys; |
| print CHARTABLE "static const jchar ", $title, "[][2] =\n"; |
| print CHARTABLE " {\n"; |
| $i = 0; |
| for $key (@keys) |
| { |
| print CHARTABLE " { "; |
| &print_char (hex ($key)); |
| print CHARTABLE ", "; |
| &print_char (hex ($map{$key})); |
| print CHARTABLE " }"; |
| ++$i; |
| if ($i < $num) |
| { |
| print CHARTABLE ","; |
| } |
| else |
| { |
| print CHARTABLE " "; |
| } |
| print CHARTABLE " // ", $name{$key}, "\n"; |
| $bytes += 4; # Two bytes per char. |
| } |
| print CHARTABLE " };\n\n"; |
| } |
| |
| # Print the `all' block. |
| sub print_all_block |
| { |
| local (*start, *end, *cats) = @_; |
| |
| &print_block ("all_table", *start, *end); |
| |
| local ($i) = 0; |
| local ($sum) = 0; |
| while ($i <= $#start) |
| { |
| $sum += $end[$i] - $start[$i] + 1; |
| ++$i; |
| } |
| # We do this computation just to make sure it isn't cheaper to |
| # simply list all the characters individually. |
| printf STDERR ("all_table encodes %d characters in %d entries\n", |
| $sum, $#start + 1); |
| |
| print CHARTABLE "static const jbyte category_table[] =\n"; |
| print CHARTABLE " { "; |
| |
| $i = 0; |
| while ($i <= $#cats) |
| { |
| if ($i > 0 && $cats[$i] eq $cats[$i - 1]) |
| { |
| # This isn't an error. We can have a duplicate because |
| # two ranges are not adjacent while the intervening |
| # characters are left out of the table for other reasons. |
| # We could exploit this to make the table a little smaller. |
| # printf STDERR "Duplicate all entry at \\u%04x\n", $start[$i]; |
| } |
| print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; |
| print CHARTABLE ", " if ($i < $#cats); |
| ++$i; |
| print CHARTABLE "\n "; |
| ++$bytes; |
| } |
| print CHARTABLE " };\n\n"; |
| } |
| |
| # Print case table. |
| sub print_case_table |
| { |
| local ($title, *start, *end, *map, *anomalous) = @_; |
| |
| &print_block ($title . '_case_table', *start, *end); |
| |
| print CHARTABLE "static const jchar ", $title, "_case_map_table[] =\n"; |
| print CHARTABLE " { "; |
| |
| local ($i) = 0; |
| while ($i <= $#map) |
| { |
| &print_char ($map[$i]); |
| print CHARTABLE ", " if $i < $#map; |
| ++$i; |
| print CHARTABLE "\n " if $i % 5 == 0; |
| $bytes += 2; |
| } |
| print CHARTABLE " };\n"; |
| |
| |
| local ($key, @keys); |
| @keys = sort keys %anomalous; |
| |
| if ($title eq 'upper') |
| { |
| if ($#keys >= 0) |
| { |
| # If these are found we need to change Character.isUpperCase. |
| print STDERR "Found anomalous upper case characters\n"; |
| $status = 1; |
| } |
| } |
| else |
| { |
| print CHARTABLE "\n"; |
| print CHARTABLE "static const jchar ", $title, "_anomalous_table[] =\n"; |
| print CHARTABLE " { "; |
| $i = 0; |
| foreach $key (@keys) |
| { |
| &print_char (hex ($key)); |
| print CHARTABLE ", " if $i < $#keys; |
| ++$i; |
| print CHARTABLE "\n " if $i % 5 == 0; |
| $bytes += 2; |
| } |
| print CHARTABLE " };\n"; |
| } |
| |
| print CHARTABLE "\n"; |
| } |
| |
| # Print the type table and attributes table for the fast version. |
| sub print_fast_tables |
| { |
| local (*start, *end, *cats, *atts, *second_atts) = @_; |
| |
| print CHARTABLE "static const jbyte type_table[] =\n{ "; |
| |
| local ($i, $j); |
| for ($i = 0; $i <= $#cats; ++$i) |
| { |
| for ($j = $start[$i]; $j <= $end[$i]; ++$j) |
| { |
| print CHARTABLE 'java::lang::Character::', $category_map{$cats[$i]}; |
| print CHARTABLE "," if ($i < $#cats || $j < $end[$i]); |
| print CHARTABLE "\n "; |
| } |
| } |
| print CHARTABLE "\n };\n\n"; |
| |
| print CHARTABLE "static const jshort attribute_table[] =\n{ "; |
| for ($i = 0; $i <= 0xffff; ++$i) |
| { |
| $atts{$i} = 0 if ! defined $atts{$i}; |
| print CHARTABLE $atts{$i}; |
| print CHARTABLE ", " if $i < 0xffff; |
| print CHARTABLE "\n " if $i % 5 == 1; |
| } |
| print CHARTABLE "\n };\n\n"; |
| |
| print CHARTABLE "static const jshort secondary_attribute_table[] =\n{ "; |
| for ($i = $ROMAN_START; $i <= $ROMAN_END; ++$i) |
| { |
| print CHARTABLE $second_atts{$i}; |
| print CHARTABLE ", " if $i < $ROMAN_END; |
| print CHARTABLE "\n " if $i % 5 == 1; |
| } |
| print CHARTABLE "\n };\n\n"; |
| } |
| |
| # Print a character constant. |
| sub print_char |
| { |
| local ($ncode) = @_; |
| printf CHARTABLE "0x%04x", $ncode; |
| } |