blob: 8aeed152adf01c6c034defb74f1a57a4501c382b [file] [log] [blame]
#!/usr/bin/perl -w
# unicode-decomp.pl - script to generate database for java.text.Collator
# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
#
# This file is part of libjava.
#
# This software is copyrighted work licensed under the terms of the
# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
# details.
# Code for reading UnicodeData.txt and generating the character
# decomposition tables in include/java-chardecomp.h, which is included by
# natCollator.cc. For now, the relevant Unicode definition files
# are found in libjava/gnu/gcj/convert/.
#
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
# where <UnicodeData.txt> is obtained from www.unicode.org (named
# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
# is the final location of include/java-chardecomp.h.
# As of JDK 1.4, use Unicode version 3.0.0 for best results.
#
# If this exits with nonzero status, then you must investigate the
# cause of the problem.
# Diagnostics and other information to stderr.
# With -n, the files are not created, but all processing still occurs.
# These map each character code (as a number) to its decomposition, stored
# as a string of packed big-endian 16-bit values (pack "n*").  A
# decomposition whose field contains a <tag> is recorded in
# %full_decomposition; an untagged one in %canonical_decomposition.
my %canonical_decomposition = ();
my %full_decomposition = ();

# Handle `-n': the output name is replaced by /dev/null, so no file is
# created but all processing still occurs.  The replacement is made only
# when an input file follows the flag; a bare `-n' therefore reaches the
# usage message below instead of trying to open an undefined file name.
if (@ARGV && $ARGV[0] eq '-n')
{
    shift @ARGV;
    $ARGV[1] = '/dev/null' if @ARGV;
}
die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;

# Three-argument open: the file name is taken literally and is never
# parsed for mode characters or surrounding whitespace.
open (my $unicode, '<', $ARGV[0])
    || die "Can't open Unicode attribute file: $!\n";

# Process the Unicode file.
$| = 1;
my $count = 0;
print STDERR "Parsing attributes file";
while (my $line = <$unicode>)
{
    # Progress indicator: one dot for every 1000 lines read.
    print STDERR "." unless $count++ % 1000;
    chomp $line;
    $line =~ s/\r//g;

    # Field 0 is the code point in hex; field 5 is the decomposition.
    my ($ch, undef, undef, undef, undef, $decomp) = split /;/, $line;

    # Blank or truncated lines have no decomposition field at all; skip
    # them before touching $ch so that -w stays quiet.
    next unless defined $decomp && $decomp ne '';
    $ch = hex ($ch);

    my $is_full = 0;
    my @code_points = ();
    foreach my $token (split (' ', $decomp))
    {
        # A <tag> token marks the mapping as belonging to the full table;
        # the tag itself is not part of the expansion.
        if ($token =~ /^\<.*\>$/)
        {
            $is_full = 1;
            next;
        }
        push (@code_points, hex ($token));
    }

    my $packed = pack "n*", @code_points;
    if ($is_full)
    {
        $full_decomposition{$ch} = $packed;
    }
    else
    {
        $canonical_decomposition{$ch} = $packed;
    }
}
close ($unicode);
# Now generate decomposition tables.
# DECOMP stays a bareword handle because the table-writing subroutines
# print to it by name.  The three-argument form of open takes the output
# file name literally instead of parsing it for mode characters.
open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
print STDERR "\nGenerating tables\n";
print DECOMP <<EOF;
// java-chardecomp.h - Decomposition character tables -*- c++ -*-
#ifndef __JAVA_CHARDECOMP_H__
#define __JAVA_CHARDECOMP_H__
// These tables are automatically generated by the $0
// script. DO NOT EDIT the tables. Instead, fix the script
// and run it again.
// This file should only be included by natCollator.cc
struct decomp_entry
{
jchar key;
const char *value;
};
EOF
# The subroutine is defined further down and carries a prototype, so it is
# called with a leading `&' (which bypasses prototype checking) and an
# explicit empty argument list (so the current @_ is not passed along).
&write_decompositions ();
print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
# Buffered output is flushed at close, so a failed write (for example a
# full disk) is only reported here.  Treat it as fatal so that the script
# exits with nonzero status instead of announcing success.
close (DECOMP) or die "Can't close output file: $!\n";
print STDERR "Done\n";
exit;
# Write one decomposition table to the DECOMP handle as a C array named
# "${name}_decomposition".
#   $name     - prefix used for the name of the generated array.
#   $is_canon - accepted for the caller's convenience; not used here.
#   %table    - maps a character code to its expansion, packed with "n*".
# Only keys in the range 0 .. 0xffff are written, in ascending order.
sub write_single_decomposition($$%)
{
    my ($name, $is_canon, %table) = @_;

    # Each expansion becomes a C string literal holding two \xHH escapes
    # (high byte first) per 16-bit unit.  A variable-length string is used
    # because most expansions are short while a few are very long
    # (e.g. \uFDFA); a fixed-width layout would waste a lot of space.
    # NOTE(review): the original author describes the value as terminated
    # with a double nul; no terminator is emitted explicitly here, so
    # confirm the termination convention against the code that reads it.
    my @entries;
    foreach my $code (grep { defined $table{$_} } 0 .. 0xffff)
    {
        my $escaped = join '',
            map { sprintf "\\x%02x\\x%02x", $_ >> 8, $_ & 0xff }
            unpack ("n*", $table{$code});
        push @entries, sprintf (" { 0x%04x, \"", $code) . $escaped . "\" }";
    }

    # Entries are separated by ",\n"; there is no trailing comma.
    print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n",
        join (",\n", @entries),
        "\n};\n\n";
}
# Write both decomposition tables to the output: first the one named
# 'canonical', then the one named 'full'.  The value returned is whatever
# the last table writer returned.
sub write_decompositions()
{
    # One row per table: name prefix, is-canonical flag, source hash.
    my @tables = (
        [ 'canonical', 1, \%canonical_decomposition ],
        [ 'full',      0, \%full_decomposition ],
    );

    my $result;
    foreach my $spec (@tables)
    {
        my ($name, $is_canon, $table) = @$spec;
        $result = &write_single_decomposition ($name, $is_canon, %$table);
    }
    return $result;
}