| #!/usr/bin/perl -w |
| # unicode-decomp.pl - script to generate database for java.text.Collator |
| # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc. |
| # |
| # This file is part of libjava. |
| # |
| # This software is copyrighted work licensed under the terms of the |
| # Libjava License. Please consult the file "LIBJAVA_LICENSE" for |
| # details. |
| |
| # Code for reading UnicodeData.txt and generating the decomposition |
| # tables in include/java-chardecomp.h. For now, the relevant Unicode |
| # definition files are found in libjava/gnu/gcj/convert/. |
| # |
| # Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h> |
| # where <UnicodeData.txt> is obtained from www.unicode.org (named |
| # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h> |
| # is the final location of include/java-chardecomp.h. |
| # As of JDK 1.4, use Unicode version 3.0.0 for best results. |
| # |
| # If this exits with nonzero status, then you must investigate the |
| # cause of the problem. |
| # Diagnostics and other information to stderr. |
| # With -n, the files are not created, but all processing still occurs. |
| |
use strict;
use warnings;

# These map characters to their decompositions.  Keys are numeric
# character codes; values are the decomposition packed as 16-bit
# big-endian integers (pack "n*").  Both tables are read by the
# table-writing subroutines at the end of this file.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n' and validate the command line.
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    # Redirect the output only when an input file was actually named.
    # Assigning $ARGV[1] unconditionally would pad @ARGV to two
    # elements and let a bare `-n' slip past the usage check below.
    $ARGV[1] = '/dev/null' if @ARGV;
}
die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
    unless @ARGV == 2;

my ($unicode_file, $output_file) = @ARGV;

# Three-argument open: the file names come straight from the command
# line and must never be interpreted as an open mode or a pipe.
open(my $unicode_fh, '<', $unicode_file)
    or die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;    # Unbuffer the currently selected output handle.
my $count = 0;
print STDERR "Parsing attributes file";
LINE:
while (my $line = <$unicode_fh>)
{
    # One progress dot per 1000 input lines.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the character code and field 5 its decomposition.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Blank or truncated lines have no decomposition field at all.
    next LINE unless defined $decomp && $decomp ne '';

    my $code = hex($ch);

    # The table writer only emits keys 0 .. 0xffff, so anything above
    # that range would be stored and then never written.
    next LINE if $code > 0xFFFF;

    my $is_full = 0;
    my @units = ();
    foreach my $field (split ' ', $decomp)
    {
        # A <tag> marks the mapping as a non-canonical ("full") one.
        if ($field =~ /^<.*>$/)
        {
            $is_full = 1;
            next;
        }
        push @units, hex($field);
    }

    # pack "n" keeps only the low 16 bits of each value, so a larger
    # element would silently turn into a different character.  Report
    # the entry and leave it out rather than store a wrong mapping.
    if (grep { $_ > 0xFFFF } @units)
    {
        printf STDERR
            "\nSkipping U+%04X: decomposition does not fit in 16-bit units\n",
            $code;
        next LINE;
    }

    my $packed = pack "n*", @units;
    if ($is_full)
    {
        $full_decomposition{$code} = $packed;
    }
    else
    {
        $canonical_decomposition{$code} = $packed;
    }
}
close($unicode_fh) or die "Can't close Unicode attribute file: $!\n";

# Now generate decomposition tables.  DECOMP stays a package
# filehandle because the table-writing subroutines print to it by name.
open(DECOMP, '>', $output_file) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-

#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__


// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.

// This file should only be included by natCollator.cc

struct decomp_entry
{
jchar key;
const char *value;
};

EOF

# The subroutine is defined further down; the `&' form with an explicit
# empty argument list avoids a "called too early to check prototype"
# warning should that definition carry a prototype.
&write_decompositions();

print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

# Buffered write errors only surface at close time, so check it.
close(DECOMP) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;
| |
| |
# Write a single decomposition table to the DECOMP filehandle.
#
# Arguments:
#   $name     - prefix of the generated C array, which is emitted as
#               "<name>_decomposition".
#   $is_canon - accepted so that existing callers keep working; it is
#               not consulted when generating the table.
#   %table    - maps a numeric character code to its expansion, stored
#               as a string of 16-bit big-endian values (pack "n*").
#
# Only keys in the range 0 .. 0xffff are written, in ascending numeric
# order.  No prototype is declared: every call in this file passes a
# plain argument list, and the callers' `&' form would bypass one
# anyway.
sub write_single_decomposition
{
    my ($name, $is_canon, %table) = @_;

    # Walk just the keys that are present instead of probing all
    # 65536 possible slots.
    my @keys = sort { $a <=> $b }
               grep { $_ <= 0xffff && defined $table{$_} } keys %table;

    # We represent the expansion as a series of bytes: two per 16-bit
    # value, high byte first, written as \xNN escapes inside a C string
    # literal.  This is ugly, but relatively space-efficient.  Most
    # expansions are short, but there are a few that are very long
    # (e.g. \uFDFA).  This means that if we chose a fixed-space
    # representation we would waste a lot of space.
    #
    # NOTE(review): earlier commentary described the expansion as
    # "terminated with a double nul", but nothing here writes an
    # explicit terminator -- confirm what the reader in natCollator.cc
    # expects.
    my @entries;
    for my $key (@keys)
    {
        my $bytes = join '',
            map { sprintf("\\x%02x\\x%02x", $_ >> 8, $_ & 0xff) }
            unpack "n*", $table{$key};
        push @entries, sprintf(" { 0x%04x, \"%s\" }", $key, $bytes);
    }

    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
    print DECOMP join(",\n", @entries);
    print DECOMP "\n};\n\n";
}
| |
# Emit both decomposition tables to DECOMP: the canonical table first,
# then the full table.  Takes no arguments.
sub write_decompositions
{
    # Plain calls rather than the `&' form, which bypasses prototypes
    # and is reserved for the rare cases that need that behaviour.
    write_single_decomposition('canonical', 1, %canonical_decomposition);
    write_single_decomposition('full', 0, %full_decomposition);
}