Changeset View
Changeset View
Standalone View
Standalone View
tools/tools/locale/tools/utf8-rollup.pl
#!/usr/local/bin/perl -wC | #!/usr/local/bin/perl -wC | ||||
use strict; | use strict; | ||||
#use File::Copy; | |||||
#use XML::Parser; | |||||
use Tie::IxHash; | use Tie::IxHash; | ||||
#use Data::Dumper; | |||||
use Getopt::Long; | use Getopt::Long; | ||||
#use Digest::SHA qw(sha1_hex); | |||||
#require "charmaps.pm"; | |||||
if ($#ARGV != 1) { | if ($#ARGV != 1) { | ||||
print "Usage: $0 --cldr=<cldrdir> --etc=<etcdir>\n"; | print "Usage: $0 --unidir=<unidir> --etc=<etcdir>\n"; | ||||
exit(1); | exit(1); | ||||
} | } | ||||
my $CLDRDIR = undef; | my $UNIDIR = undef; | ||||
my $ETCDIR = undef; | my $ETCDIR = undef; | ||||
my $result = GetOptions ( | my $result = GetOptions ( | ||||
"cldr=s" => \$CLDRDIR, | "unidir=s" => \$UNIDIR, | ||||
"etc=s" => \$ETCDIR, | "etc=s" => \$ETCDIR, | ||||
); | ); | ||||
my @SECTIONS = ( | |||||
["en_US", "* 0x0000 - 0x007F Basic Latin\n" . | |||||
"* 0x0080 - 0x00FF Latin-1 Supplement\n" . | |||||
"* 0x0100 - 0x017F Latin Extended-A\n" . | |||||
"* 0x0180 - 0x024F Latin Extended-B\n" . | |||||
"* 0x0250 - 0x02AF IPA Extensions\n" . | |||||
"* 0x1D00 - 0x1D7F Phonetic Extensions\n" . | |||||
"* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" . | |||||
"* 0x1E00 - 0x1EFF Latin Extended Additional\n" . | |||||
"* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n". | |||||
"* 0x2C60 - 0x2C7F Latin Extended-C\n" . | |||||
"* 0xA720 - 0xA7FF Latin Extended-D\n" . | |||||
"* 0xAB30 - 0xAB6F Latin Extended-E\n" . | |||||
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n". | |||||
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"], | |||||
["el_GR", "* 0x0370 - 0x03FF Greek (No Coptic!)\n" . | |||||
"* 0x1F00 - 0x1FFF Greek Extended\n"], | |||||
["ru_RU", "* 0x0400 - 0x04FF Cyrillic\n" . | |||||
"* 0x0500 - 0x052F Cyrillic Supplementary\n" . | |||||
"* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" . | |||||
"* 0xA640 - 0xA69F Cyrillic Extended-B\n"], | |||||
["hy_AM", "* 0x0530 - 0x058F Armenian\n" . | |||||
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"], | |||||
["he_IL", "* 0x0590 - 0x05FF Hebrew\n" . | |||||
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"], | |||||
["ar_SA", "* 0x0600 - 0x06FF Arabic\n" . | |||||
"* 0x0750 - 0x074F Arabic Supplement\n" . | |||||
"* 0x08A0 - 0x08FF Arabic Extended-A\n" . | |||||
"* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" . | |||||
"* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"], | |||||
["hi_IN", "* 0x0900 - 0x097F Devanagari\n" . | |||||
"* 0xA8E0 - 0xA8FF Devanagari Extended\n"], | |||||
["bn_IN", "* 0x0900 - 0x097F Bengali\n"], | |||||
["pa_Guru_IN", "* 0x0A00 - 0x0A7F Gurmukhi\n"], | |||||
["gu_IN", "* 0x0A80 - 0x0AFF Gujarati\n"], | |||||
["or_IN", "* 0x0B00 - 0x0B7F Oriya\n"], | |||||
["ta_IN", "* 0x0B80 - 0x0BFF Tamil\n"], | |||||
["te_IN", "* 0x0C00 - 0x0C7F Telugu\n"], | |||||
["kn_IN", "* 0x0C80 - 0x0CFF Kannada\n"], | |||||
["ml_IN", "* 0x0D00 - 0x0D7F Malayalam\n"], | |||||
["si_LK", "* 0x0D80 - 0x0DFF Sinhala\n"], | |||||
["th_TH", "* 0x0E00 - 0x0E7F Thai\n"], | |||||
["lo_LA", "* 0x0E80 - 0x0EFF Lao\n"], | |||||
["bo_IN", "* 0x0F00 - 0x0FFF Tibetan\n"], | |||||
["my_MM", "* 0x1000 - 0x109F Myanmar\n" . | |||||
"* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" . | |||||
"* 0xAA60 - 0xAA7F Myanmar Extended-A\n"], | |||||
["ka_GE", "* 0x10A0 - 0x10FF Georgia\n" . | |||||
"* 0x2D00 - 0x2D2F Georgian Supplement\n"], | |||||
["ja_JP", "* 0x1100 - 0x11FF Hangul Jamo\n" . | |||||
"* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" . | |||||
"* 0x3040 - 0x309F Hiragana\n" . | |||||
"* 0x30A0 - 0x30FF Katakana\n" . | |||||
"* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" . | |||||
"* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" . | |||||
"* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" . | |||||
"* 0x3300 - 0x33FF CJK Compatibility\n" . | |||||
"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" . | |||||
"* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" . | |||||
"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" . | |||||
"* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" . | |||||
"* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" . | |||||
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"], | |||||
["am_ET", "* 0x1200 - 0x137F Ethiopic\n" . | |||||
"* 0x1380 - 0x139F Ethiopic Supplement\n" . | |||||
"* 0x2D80 - 0x2DDF Ethiopic Extended\n" . | |||||
"* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"], | |||||
["chr_US", "* 0x13A0 - 0x13FF Cherokee\n"], | |||||
["km_KH", "* 0x1780 - 0x17FF Khmer\n" . | |||||
"* 0x19E0 - 0x19FF Khmer Symbols\n"], | |||||
["shi_Tfng_MA", "* 0x2D30 - 0x2D2F Tifinagh\n"], | |||||
["ii_CN", "* 0xA000 - 0xA48F Yi Syllables\n" . | |||||
"* 0xA490 - 0xA4CF Yi Radicals\n"], | |||||
["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"], | |||||
["ko_KR", "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" . | |||||
"* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" . | |||||
"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" . | |||||
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"], | |||||
); | |||||
# ["zh_Hans_CN", "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" . | |||||
# "* 0x2F00 - 0x2FDF Rangxi Radicales\n" . | |||||
# "* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" . | |||||
# "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" . | |||||
# "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" . | |||||
# "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"], | |||||
my %seen = (); | my %seen = (); | ||||
my %pending_seen = (); | |||||
my %utf8map = (); | my %utf8map = (); | ||||
my %utf8aliases = (); | my %utf8aliases = (); | ||||
my $outfilename = "$ETCDIR/common.UTF-8.src"; | my $outfilename = "$ETCDIR/common.UTF-8.src"; | ||||
my $manual_file = "$ETCDIR/manual-input.UTF-8"; | |||||
my $stars = "**********************************************************************\n"; | my $stars = "**********************************************************************\n"; | ||||
get_utf8map("$CLDRDIR/posix/UTF-8.cm"); | get_utf8map("$UNIDIR/posix/UTF-8.cm"); | ||||
generate_header (); | generate_header (); | ||||
generate_sections (); | parse_unidata ("$UNIDIR/UnicodeData.txt"); | ||||
generate_footer (); | generate_footer (); | ||||
############################ | ############################ | ||||
sub get_utf8map { | sub get_utf8map { | ||||
my $file = shift; | my $file = shift; | ||||
open(FIN, $file); | open(FIN, $file); | ||||
my @lines = <FIN>; | my @lines = <FIN>; | ||||
close(FIN); | close(FIN); | ||||
chomp(@lines); | chomp(@lines); | ||||
my $prev_k = undef; | |||||
my $prev_v = ""; | |||||
my $incharmap = 0; | my $incharmap = 0; | ||||
foreach my $l (@lines) { | foreach my $l (@lines) { | ||||
$l =~ s/\r//; | $l =~ s/\r//; | ||||
next if ($l =~ /^\#/); | next if ($l =~ /^\#/); | ||||
next if ($l eq ""); | next if ($l eq ""); | ||||
if ($l eq "CHARMAP") { | if ($l eq "CHARMAP") { | ||||
$incharmap = 1; | $incharmap = 1; | ||||
next; | next; | ||||
} | } | ||||
next if (!$incharmap); | next if (!$incharmap); | ||||
last if ($l eq "END CHARMAP"); | last if ($l eq "END CHARMAP"); | ||||
$l =~ /^<([^\s]+)>\s+(.*)/; | $l =~ /^(<[^\s]+>)\s+(.*)/; | ||||
my $k = $1; | my $k = $2; | ||||
my $v = $2; | my $v = $1; | ||||
$k =~ s/_/ /g; # unicode char string | $k =~ s/\\x//g; # UTF-8 char code | ||||
$v =~ s/\\x//g; # UTF-8 char code | |||||
$utf8map{$k} = $v; | $utf8map{$k} = $v; | ||||
$utf8aliases{$k} = $prev_k if ($prev_v eq $v); | |||||
$prev_v = $v; | |||||
$prev_k = $k; | |||||
} | } | ||||
} | } | ||||
sub generate_header { | sub generate_header { | ||||
open(FOUT, ">", "$outfilename") | open(FOUT, ">", "$outfilename") | ||||
or die ("can't write to $outfilename\n"); | or die ("can't write to $outfilename\n"); | ||||
print FOUT <<EOF; | print FOUT <<EOF; | ||||
# Warning: Do not edit. This file is automatically generated from the | # Warning: Do not edit. This file is automatically generated from the | ||||
# tools in /usr/src/tools/tools/locale. The data is obtained from the | # tools in /usr/src/tools/tools/locale. The data is obtained from the | ||||
# CLDR project, obtained from http://cldr.unicode.org/ | # CLDR project, obtained from http://cldr.unicode.org/ | ||||
# ----------------------------------------------------------------------------- | # ----------------------------------------------------------------------------- | ||||
comment_char * | comment_char * | ||||
escape_char / | escape_char / | ||||
LC_CTYPE | LC_CTYPE | ||||
EOF | EOF | ||||
} | } | ||||
sub generate_footer { | sub generate_footer { | ||||
print FOUT "\nEND LC_CTYPE\n"; | print FOUT "\nEND LC_CTYPE\n"; | ||||
close (FOUT); | close (FOUT); | ||||
} | } | ||||
sub already_seen { | sub wctomb { | ||||
my $ucode = shift; | my $wc = hex(shift); | ||||
if (defined $seen{$ucode}) { | my $lead; | ||||
return 1; | my $len; | ||||
} | my $ret = ""; | ||||
$pending_seen{$ucode} = 1; | my $i; | ||||
return 0; | |||||
} | |||||
sub already_seen_RO { | if (($wc & ~0x7f) == 0) { | ||||
my $ucode = shift; | return sprintf "%02X", $wc; | ||||
if (defined $seen{$ucode}) { | } elsif (($wc & ~0x7ff) == 0) { | ||||
return 1; | $lead = 0xc0; | ||||
$len = 2; | |||||
} elsif (($wc & ~0xffff) == 0) { | |||||
$lead = 0xe0; | |||||
$len = 3; | |||||
} elsif ($wc >= 0 && $wc <= 0x10ffff) { | |||||
$lead = 0xf0; | |||||
$len = 4; | |||||
} | } | ||||
return 0; | |||||
} | |||||
sub merge_seen { | for ($i = $len - 1; $i > 0; $i--) { | ||||
foreach my $sn (keys %pending_seen) { | $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; | ||||
$seen{$sn} = 1; | $wc >>= 6; | ||||
} | } | ||||
%pending_seen = (); | $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; | ||||
} | |||||
sub initialize_lines { | return $ret; | ||||
my @result = (); | |||||
my $terr = shift; | |||||
my $n; | |||||
my $back2hex; | |||||
my @types = ("graph", "alpha"); | |||||
if ($terr eq "ja_JP") { | |||||
foreach my $T (@types) { | |||||
push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-3400>;/\n"; | |||||
for ($n = hex("3401"); $n <= hex("4DB4"); $n++) { | |||||
$back2hex=sprintf("%X", $n); | |||||
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" . | |||||
$back2hex . ">;/\n"; | |||||
} | } | ||||
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-4DB5>\n"; | |||||
push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-4E00>;/\n"; | |||||
for ($n = hex("4E01"); $n <= hex("9FCB"); $n++) { | |||||
$back2hex=sprintf("%X", $n); | |||||
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" . | |||||
$back2hex . ">;/\n"; | |||||
} | |||||
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-9FCC>\n"; | |||||
} | |||||
push @result, "merge\tnow\n"; | |||||
} | |||||
return @result; | |||||
} | |||||
sub compress_ctype { | sub parse_unidata { | ||||
my $territory = shift; | my $file = shift; | ||||
my $term; | my %data = (); | ||||
my $active = 0; | |||||
my $cat_loaded = 0; | |||||
my $lock_ID; | |||||
my $prev_ID; | |||||
my $curr_ID; | |||||
my $lock_name; | |||||
my $prev_name; | |||||
my $curr_name; | |||||
my $key_name; | |||||
my $category = ''; | |||||
my @lines = initialize_lines ($territory); | open(FIN, $file); | ||||
my @lines = <FIN>; | |||||
close(FIN); | |||||
chomp(@lines); | |||||
my $filename = "$CLDRDIR/posix/$territory.UTF-8.src"; | foreach my $l (@lines) { | ||||
if (! -f $filename) { | my @d = split(/;/, $l, -1); | ||||
print STDERR "Cannot open $filename\n"; | my $mb = wctomb($d[0]); | ||||
return; | my $cat; | ||||
# XXX There are code points present in UnicodeData.txt | |||||
# and missing from UTF-8.cm | |||||
next if !defined $utf8map{$mb}; | |||||
# Define the category | |||||
if ($d[2] =~ /^Lu/) { | |||||
$cat = "upper"; | |||||
} elsif ($d[2] =~ /^Ll/) { | |||||
$cat = "lower"; | |||||
} elsif ($d[2] =~ /^Nd/) { | |||||
$cat = "digit"; | |||||
} elsif ($d[2] =~ /^L/) { | |||||
$cat = "alpha"; | |||||
} elsif ($d[2] =~ /^P/) { | |||||
$cat = "punct"; | |||||
} elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) { | |||||
$cat = "graph"; | |||||
} elsif ($d[2] =~ /^C/) { | |||||
$cat = "cntrl"; | |||||
} elsif ($d[2] =~ /^Z/) { | |||||
$cat = "space"; | |||||
} | } | ||||
open(FIN, "$filename"); | $data{$cat}{$mb}{'wc'} = $d[0]; | ||||
print "Reading from $filename\n"; | |||||
while (<FIN>) { | # Check if it's a start or end of range | ||||
if (/^LC_CTYPE/../^END LC_CTYPE/) { | if ($d[1] =~ /First>$/) { | ||||
if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" && | $data{$cat}{$mb}{'start'} = 1; | ||||
$_ ne "*************\n" && $_ ne "\n") { | } elsif ($d[1] =~ /Last>$/) { | ||||
push @lines, $_; | $data{$cat}{$mb}{'end'} = 1; | ||||
} | } | ||||
# Check if there's upper/lower mapping | |||||
if ($d[12] ne "") { | |||||
$data{'toupper'}{$mb} = wctomb($d[12]); | |||||
} elsif ($d[13] ne "") { | |||||
$data{'tolower'}{$mb} = wctomb($d[13]); | |||||
} | } | ||||
} | } | ||||
close(FIN); | |||||
foreach my $line (@lines) { | my $first; | ||||
if ($line =~ m/^([a-z]{3,})\t/) { | my $inrange = 0; | ||||
$category = $1; | |||||
if ($category eq 'merge') { | # Now write out the categories | ||||
merge_seen; | foreach my $cat (sort keys (%data)) { | ||||
next; | print FOUT "$cat\t"; | ||||
} | $first = 1; | ||||
if ($category ne 'print') { | foreach my $mb (sort keys (%{$data{$cat}})) { | ||||
$cat_loaded = 1; | if ($first == 1) { | ||||
} | $first = 0; | ||||
} | } elsif ($inrange == 1) { | ||||
next if ($category eq 'print'); | # Safety belt | ||||
if ($category eq 'toupper' || $category eq 'tolower') { | die "broken range end wc=$data{$cat}{$mb}{'wc'}" | ||||
if ($line =~ m/<([-_A-Za-z0-9]+)>,/) { | if !defined $data{$cat}{$mb}{'end'}; | ||||
$key_name = $1; | print FOUT ";...;"; | ||||
$key_name =~ s/_/ /g; | $inrange = 0; | ||||
if (already_seen_RO (hex($utf8map{$key_name}))) { | |||||
next; | |||||
} | |||||
if ($cat_loaded) { print FOUT $category; } | |||||
$cat_loaded = 0; | |||||
$line =~ s/^[a-z]{3,}\t/\t/; | |||||
print FOUT $line; | |||||
} | |||||
next; | |||||
} | |||||
if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) { | |||||
$term = ($2 eq '') ? 1 : 0; | |||||
$curr_name = $1; | |||||
$key_name = $1; | |||||
$key_name =~ s/_/ /g; | |||||
$curr_ID = hex($utf8map{$key_name}); | |||||
if (already_seen ($curr_ID)) { | |||||
next; | |||||
} | |||||
if ($active) { | |||||
if ($curr_ID == $prev_ID + 1) { | |||||
$prev_ID = $curr_ID; | |||||
$prev_name = $curr_name; | |||||
} else { | } else { | ||||
if ($cat_loaded) { print FOUT $category; } | print FOUT ";/\n\t"; | ||||
$cat_loaded = 0; | |||||
if ($prev_ID == $lock_ID) { | |||||
print FOUT "\t<" . $prev_name . ">;/\n"; | |||||
} elsif ($prev_ID - 1 == $lock_ID) { | |||||
print FOUT "\t<" . $lock_name . ">;/\n"; | |||||
print FOUT "\t<" . $prev_name . ">;/\n"; | |||||
} else { | |||||
print FOUT "\t<" . $lock_name . | |||||
">;...;<" . $prev_name . ">;/\n"; | |||||
} | } | ||||
$lock_ID = $curr_ID; | |||||
$prev_ID = $curr_ID; | if ($cat eq "tolower" || $cat eq "toupper") { | ||||
$lock_name = $curr_name; | print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; | ||||
$prev_name = $curr_name; | |||||
} | |||||
} else { | } else { | ||||
$active = 1; | if (defined($data{$cat}{$mb}{'start'})) { | ||||
$lock_ID = $curr_ID; | $inrange = 1; | ||||
$prev_ID = $curr_ID; | |||||
$lock_name = $curr_name; | |||||
$prev_name = $curr_name; | |||||
} | } | ||||
if ($term) { | print FOUT "$utf8map{$mb}"; | ||||
if ($cat_loaded) { print FOUT $category; } | |||||
$cat_loaded = 0; | |||||
if ($curr_ID == $lock_ID) { | |||||
print FOUT "\t<" . $curr_name . ">\n"; | |||||
} elsif ($curr_ID == $lock_ID + 1) { | |||||
print FOUT "\t<" . $lock_name . ">;/\n"; | |||||
print FOUT "\t<" . $curr_name . ">\n"; | |||||
} else { | |||||
print FOUT "\t<" . $lock_name . | |||||
">;...;<" . $curr_name . ">\n"; | |||||
} | } | ||||
$active = 0; | |||||
} | } | ||||
} else { | |||||
print FOUT $line; | |||||
} | |||||
} | |||||
} | |||||
sub generate_sections { | |||||
foreach my $section (@SECTIONS ) { | |||||
print FOUT "\n"; | print FOUT "\n"; | ||||
print FOUT $stars; | |||||
print FOUT @$section[1]; | |||||
print FOUT $stars; | |||||
compress_ctype (@$section[0]); | |||||
merge_seen; | |||||
} | |||||
my @lines = (); | |||||
open(FIN, "$manual_file"); | |||||
print "Reading from $manual_file\n"; | |||||
while (<FIN>) { | |||||
push @lines, $_; | |||||
} | |||||
close(FIN); | |||||
foreach my $line (@lines) { | |||||
print FOUT $line; | |||||
} | } | ||||
} | } |