libjava/scripts/unicode-decomp.pl - gcc - Git at Google

 #!/usr/bin/perl -w
 # unicode-decomp.pl - script to generate database for java.text.Collator
 # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
 #
 # This file is part of libjava.
 #
 # This software is copyrighted work licensed under the terms of the
 # Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
 # details.

 # Code for reading UnicodeData.txt and generating the character
 # decomposition tables in include/java-chardecomp.h.  For now, the
 # relevant Unicode definition files are found in libjava/gnu/gcj/convert/.
 #
 # Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
 #   where <UnicodeData.txt> is obtained from www.unicode.org (named
 #   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
 #   is the final location of include/java-chardecomp.h.
 #   As of JDK 1.4, use Unicode version 3.0.0 for best results.
 #
 # If this exits with nonzero status, then you must investigate the
 # cause of the problem.
 # Diagnostics and other information to stderr.
 # With -n, the files are not created, but all processing still occurs.

 # These map characters to their decompositions.  Keys are numeric code
 # points; each value is the expansion packed as a string of 16-bit
 # units.  Mappings that carry a <tag> are stored in %full_decomposition,
 # untagged ones in %canonical_decomposition.
 my %canonical_decomposition = ();
 my %full_decomposition = ();

 # Handle `-n': drop the flag and point the output at /dev/null, so that
 # all processing still occurs but no header file is kept.
 if (@ARGV && $ARGV[0] eq '-n')
 {
     shift @ARGV;
     $ARGV[1] = '/dev/null';
 }
 die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
     unless @ARGV == 2;

 # Three-argument open takes the file name literally, so blanks or
 # characters such as `>' and `|' in it cannot be read as part of the
 # open mode.  The UNICODE handle is read by the parsing loop below.
 open (UNICODE, '<', $ARGV[0])
     or die "Can't open Unicode attribute file: $!\n";

 # Process the Unicode file.  Every line is a record of `;'-separated
 # fields; only the first field (the code point, in hex) and the sixth
 # (the decomposition mapping) are used here.
 $| = 1;
 my $count = 0;
 print STDERR "Parsing attributes file";
 while (<UNICODE>)
 {
     # One progress dot for every 1000 lines read.
     print STDERR "." unless $count++ % 1000;
     chomp;
     s/\r//g;
     my ($ch, undef, undef, undef, undef, $decomp) = split ';';

     # Blank or truncated lines have no sixth field.  Skip them, and
     # lines whose mapping is empty, before touching either value so
     # that nothing undefined is converted or compared.
     next unless defined $decomp && $decomp ne '';
     $ch = hex($ch);

     # The mapping is a blank-separated list of hex code points.  A field
     # of the form <...> marks it as a "full" decomposition; the field
     # itself is not stored.
     my $is_full = 0;
     my @decomp = ();
     foreach my $field (split (' ', $decomp))
     {
         if ($field =~ /^\<.*\>$/)
         {
             $is_full = 1;
             next;
         }
         push (@decomp, hex ($field));
     }

     # Store the expansion as a string of big-endian 16-bit units.
     # NOTE(review): a code point above 0xffff does not fit in one such
     # unit -- confirm the input never contains one in a mapping.
     my $s = pack "n*", @decomp;
     if ($is_full)
     {
         $full_decomposition{$ch} = $s;
     }
     else
     {
         $canonical_decomposition{$ch} = $s;
     }
 }

 # Now generate decomposition tables.  The output path is opened with
 # three-argument open so that the name is taken literally and cannot
 # alter the open mode.
 open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
 print STDERR "\nGenerating tables\n";

 # Fixed preamble of the generated header.  The here-document
 # interpolates, so $0 expands to the name this script was run as.
 print DECOMP <<EOF;
 // java-chardecomp.h - Decomposition character tables -*- c++ -*-

 #ifndef __JAVA_CHARDECOMP_H__
 #define __JAVA_CHARDECOMP_H__


 // These tables are automatically generated by the $0
 // script.  DO NOT EDIT the tables.  Instead, fix the script
 // and run it again.

 // This file should only be included by natCollator.cc

 struct decomp_entry
 {
   jchar key;
   const char *value;
 };

 EOF

 &write_decompositions;

 print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

 # Buffered write errors only surface when the handle is closed, so a
 # failed close must stop the script with a nonzero status.
 close (DECOMP) or die "Can't close output file: $!\n";
 print STDERR "Done\n";
 exit;


 # Write one decomposition table to the DECOMP handle as a C array of
 # decomp_entry initializers.
 #   $name     - prefix of the array, which is named "${name}_decomposition".
 #   $is_canon - accepted from the caller but not consulted here.
 #   %table    - code point => expansion packed as 16-bit big-endian units.
 # Only keys in 0 .. 0xffff are written, in ascending order.
 sub write_single_decomposition($$%)
 {
     my ($label, $is_canon, %expansion_of) = @_;

     # Each expansion is written as a string of bytes, two per character
     # with the high byte first, instead of a fixed-width array: most
     # expansions are short but a few are very long (e.g. \uFDFA), so a
     # fixed-space representation would waste a lot of space.  No
     # explicit terminator is written; the closing quote directly
     # follows the last byte pair -- TODO confirm what the reader of the
     # table relies on.
     my @rows = ();
     foreach my $code (grep { defined $expansion_of{$_} } 0 .. 0xffff)
     {
         my $bytes = join ('',
                           map { sprintf ("\\x%02x\\x%02x", ($_ / 256), ($_ % 256)) }
                           unpack ("n*", $expansion_of{$code}));
         push (@rows, sprintf ("  { 0x%04x, \"", $code) . $bytes . "\" }");
     }

     print DECOMP "static const decomp_entry ${label}_decomposition[] =\n{\n";
     print DECOMP join (",\n", @rows);
     print DECOMP "\n};\n\n";
 }

 # Write both generated tables: the canonical one first, then the full
 # one.  Each row below is (array name prefix, is-canonical flag, table).
 sub write_decompositions()
 {
     my @tables = (
         [ 'canonical', 1, \%canonical_decomposition ],
         [ 'full',      0, \%full_decomposition ],
     );

     foreach my $spec (@tables)
     {
         my ($name, $is_canon, $map) = @$spec;
         &write_single_decomposition ($name, $is_canon, %$map);
     }
 }
	#!/usr/bin/perl -w
	# unicode-decomp.pl - script to generate database for java.text.Collator
	# Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
	#
	# This file is part of libjava.
	#
	# This software is copyrighted work licensed under the terms of the
	# Libjava License. Please consult the file "LIBJAVA_LICENSE" for
	# details.

	# Code for reading UnicodeData.txt and generating the character
	# decomposition tables in include/java-chardecomp.h. For now, the
	# relevant Unicode definition files are found in libjava/gnu/gcj/convert/.
	#
	# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
	# where <UnicodeData.txt> is obtained from www.unicode.org (named
	# UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
	# is the final location of include/java-chardecomp.h.
	# As of JDK 1.4, use Unicode version 3.0.0 for best results.
	#
	# If this exits with nonzero status, then you must investigate the
	# cause of the problem.
	# Diagnostics and other information to stderr.
	# With -n, the files are not created, but all processing still occurs.

	# These map characters to their decompositions. Keys are numeric code
	# points; each value is the expansion packed as a string of 16-bit
	# units. Mappings that carry a <tag> are stored in %full_decomposition,
	# untagged ones in %canonical_decomposition.
	my %canonical_decomposition = ();
	my %full_decomposition = ();

	# Handle `-n': drop the flag and point the output at /dev/null, so that
	# all processing still occurs but no header file is kept.
	if (@ARGV && $ARGV[0] eq '-n')
	{
	    shift @ARGV;
	    $ARGV[1] = '/dev/null';
	}
	die "Usage: $0 [-n] <UnicodeData.txt> <java-chardecomp.h>\n"
	    unless @ARGV == 2;

	# Three-argument open takes the file name literally, so blanks or
	# characters such as `>' and `|' in it cannot be read as part of the
	# open mode. The UNICODE handle is read by the parsing loop below.
	open (UNICODE, '<', $ARGV[0])
	    or die "Can't open Unicode attribute file: $!\n";

	# Process the Unicode file. Every line is a record of `;'-separated
	# fields; only the first field (the code point, in hex) and the sixth
	# (the decomposition mapping) are used here.
	$| = 1;
	my $count = 0;
	print STDERR "Parsing attributes file";
	while (<UNICODE>)
	{
	    # One progress dot for every 1000 lines read.
	    print STDERR "." unless $count++ % 1000;
	    chomp;
	    s/\r//g;
	    my ($ch, undef, undef, undef, undef, $decomp) = split ';';

	    # Blank or truncated lines have no sixth field. Skip them, and
	    # lines whose mapping is empty, before touching either value so
	    # that nothing undefined is converted or compared.
	    next unless defined $decomp && $decomp ne '';
	    $ch = hex($ch);

	    # The mapping is a blank-separated list of hex code points. A
	    # field of the form <...> marks it as a "full" decomposition; the
	    # field itself is not stored.
	    my $is_full = 0;
	    my @decomp = ();
	    foreach my $field (split (' ', $decomp))
	    {
	        if ($field =~ /^\<.*\>$/)
	        {
	            $is_full = 1;
	            next;
	        }
	        push (@decomp, hex ($field));
	    }

	    # Store the expansion as a string of big-endian 16-bit units.
	    # NOTE(review): a code point above 0xffff does not fit in one such
	    # unit -- confirm the input never contains one in a mapping.
	    my $s = pack "n*", @decomp;
	    if ($is_full)
	    {
	        $full_decomposition{$ch} = $s;
	    }
	    else
	    {
	        $canonical_decomposition{$ch} = $s;
	    }
	}

	# Now generate decomposition tables. The output path is opened with
	# three-argument open so that the name is taken literally and cannot
	# alter the open mode.
	open (DECOMP, '>', $ARGV[1]) or die "Can't open output file: $!\n";
	print STDERR "\nGenerating tables\n";

	# Fixed preamble of the generated header. The here-document
	# interpolates, so $0 expands to the name this script was run as.
	# The first line carries the editor mode marker in its -*- ... -*- form.
	print DECOMP <<EOF;
	// java-chardecomp.h - Decomposition character tables -*- c++ -*-

	#ifndef __JAVA_CHARDECOMP_H__
	#define __JAVA_CHARDECOMP_H__


	// These tables are automatically generated by the $0
	// script. DO NOT EDIT the tables. Instead, fix the script
	// and run it again.

	// This file should only be included by natCollator.cc

	struct decomp_entry
	{
	jchar key;
	const char *value;
	};

	EOF

	&write_decompositions;

	print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";

	# Buffered write errors only surface when the handle is closed, so a
	# failed close must stop the script with a nonzero status.
	close (DECOMP) or die "Can't close output file: $!\n";
	print STDERR "Done\n";
	exit;


	# Write one decomposition table to the DECOMP handle as a C array of
	# decomp_entry initializers.
	#   $name     - prefix of the array, which is named "${name}_decomposition".
	#   $is_canon - accepted from the caller but not consulted here.
	#   %table    - code point => expansion packed as 16-bit big-endian units.
	# Only keys in 0 .. 0xffff are written, in ascending order.
	sub write_single_decomposition($$%)
	{
	    my ($array_prefix, $is_canon, %packed_for) = @_;

	    # Each expansion is written as a string of bytes, two per
	    # character with the high byte first, instead of a fixed-width
	    # array: most expansions are short but a few are very long (e.g.
	    # \uFDFA), so a fixed-space representation would waste a lot of
	    # space. No explicit terminator is written; the closing quote
	    # directly follows the last byte pair -- TODO confirm what the
	    # reader of the table relies on.
	    my $body = '';
	    my $separator = '';
	    foreach my $code_point (0 .. 0xffff)
	    {
	        my $packed = $packed_for{$code_point};
	        next unless defined $packed;

	        $body .= $separator . sprintf (" { 0x%04x, \"", $code_point);
	        $body .= sprintf ("\\x%02x\\x%02x", ($_ / 256), ($_ % 256))
	            foreach unpack ("n*", $packed);
	        $body .= "\" }";

	        # Every entry after the first is preceded by a comma and newline.
	        $separator = ",\n";
	    }

	    print DECOMP "static const decomp_entry ${array_prefix}_decomposition[] =\n{\n";
	    print DECOMP $body;
	    print DECOMP "\n};\n\n";
	}

	# Write both generated tables: the canonical one first, then the full
	# one. The flag passed along is 1 for the canonical table and 0 for
	# the full table.
	sub write_decompositions()
	{
	    my %source_for = (
	        canonical => \%canonical_decomposition,
	        full      => \%full_decomposition,
	    );

	    foreach my $kind ('canonical', 'full')
	    {
	        my $is_canon = ($kind eq 'canonical') ? 1 : 0;
	        &write_single_decomposition ($kind, $is_canon, %{ $source_for{$kind} });
	    }
	}