diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile
--- a/tools/tools/locale/Makefile
+++ b/tools/tools/locale/Makefile
@@ -168,7 +168,8 @@
KOI8-U \
SJIS \
US-ASCII \
- UTF-8
+ UTF-8 \
+ UTF-32
# CLDR files
CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip
@@ -211,9 +212,10 @@
ln -s -f ../posix ${.TARGET}
clean-posix:
rm -rf posix ${UNIDIR}/posix
-post-posixcm: ${UNIDIR}/posix
+${UNIDIR}/posix/xx_Comm_C.UTF-8.src: ${UNIDIR}/posix
perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \
--unidir=${UNIDIR}
+post-posixcm: ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
.for enc in ${ENCODINGS}
posixcm: build-tools posix/${enc}.cm
.ORDER: build-tools posix/${enc}.cm
diff --git a/tools/tools/locale/README b/tools/tools/locale/README
--- a/tools/tools/locale/README
+++ b/tools/tools/locale/README
@@ -19,7 +19,7 @@
Variables:
LOCALESRCDIR
Destination path for the generated locale files.
- Default: $DESTDIR/usr/src/share.
+ Default: ${SRCTOP}/share.
TMPDIR
Temporary directory.
Default: /tmp
@@ -29,7 +29,12 @@
Create a temporary directory for building.
make clean
- Clean up the obj directories.
+ Clean up the obj directories. Note that this does not
+ clean up the tools or the posix locale source files generated
+ from the CLDR files, because generating them takes a long
+ time and they do not change as long as the same CLDR files
+ are used. "make clean && make build" will
+ regenerate the locale source files for src/share/*def.
make cleandir
Remove the obj directories completely.
diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml
--- a/tools/tools/locale/etc/charmaps.xml
+++ b/tools/tools/locale/etc/charmaps.xml
@@ -195,395 +195,404 @@
+
-
-
-
-
+
+
+
+
+ cldr="MINUS_SIGN" unicode="HYPHEN-MINUS" />
+ cldr="EN_DASH" unicode="HYPHEN-MINUS" />
+ cldr="CYRILLIC_CAPITAL_LETTER_JE"
+ unicode="LATIN_CAPITAL_LETTER_J" />
+ cldr="CYRILLIC_SMALL_LETTER_JE" unicode="LATIN_SMALL_LETTER_J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_LJE" string="lj" />
+ cldr="CYRILLIC_SMALL_LETTER_LJE" string="lj" />
+ cldr="CYRILLIC_CAPITAL_LETTER_A" unicode="LATIN_CAPITAL_LETTER_A" />
+ cldr="CYRILLIC_SMALL_LETTER_A" unicode="LATIN_SMALL_LETTER_A" />
+ cldr="CYRILLIC_CAPITAL_LETTER_BE"
+ unicode="LATIN_CAPITAL_LETTER_B" />
+ cldr="CYRILLIC_SMALL_LETTER_BE" unicode="LATIN_SMALL_LETTER_B" />
+ cldr="CYRILLIC_CAPITAL_LETTER_VE"
+ unicode="LATIN_CAPITAL_LETTER_B" />
+ cldr="CYRILLIC_SMALL_LETTER_VE" unicode="LATIN_SMALL_LETTER_B" />
+ cldr="CYRILLIC_CAPITAL_LETTER_GHE"
+ unicode="LATIN_CAPITAL_LETTER_G" />
+ cldr="CYRILLIC_SMALL_LETTER_GHE" unicode="LATIN_SMALL_LETTER_G" />
+ cldr="CYRILLIC_CAPITAL_LETTER_DE" string="D" />
+ cldr="CYRILLIC_SMALL_LETTER_DE" string="d" />
+ cldr="CYRILLIC_CAPITAL_LETTER_IE"
+ unicode="LATIN_CAPITAL_LETTER_E" />
+ cldr="CYRILLIC_SMALL_LETTER_IE" unicode="LATIN_SMALL_LETTER_E" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ZHE" string="ZH" />
+ cldr="CYRILLIC_SMALL_LETTER_ZHE" string="zh" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ZE" string="z" />
+ cldr="CYRILLIC_SMALL_LETTER_ZE" string="z" />
+ cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
+ cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
+ cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
+ cldr="CYRILLIC_CAPITAL_LETTER_KA"
+ unicode="LATIN_CAPITAL_LETTER_K" />
+ cldr="CYRILLIC_SMALL_LETTER_KA" unicode="LATIN_SMALL_LETTER_K" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EL"
+ unicode="LATIN_CAPITAL_LETTER_L" />
+ cldr="CYRILLIC_SMALL_LETTER_EL" unicode="LATIN_SMALL_LETTER_L" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EM"
+ unicode="LATIN_CAPITAL_LETTER_M" />
+ cldr="CYRILLIC_SMALL_LETTER_EM" unicode="LATIN_SMALL_LETTER_M" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EN"
+ unicode="LATIN_CAPITAL_LETTER_H" />
+ cldr="CYRILLIC_SMALL_LETTER_EN" unicode="LATIN_SMALL_LETTER_H" />
+ cldr="CYRILLIC_CAPITAL_LETTER_O" unicode="LATIN_CAPITAL_LETTER_O" />
+ cldr="CYRILLIC_SMALL_LETTER_O" unicode="LATIN_SMALL_LETTER_O" />
+ cldr="CYRILLIC_CAPITAL_LETTER_PE"
+ unicode="LATIN_CAPITAL_LETTER_P" />
+ cldr="CYRILLIC_SMALL_LETTER_PE" unicode="LATIN_SMALL_LETTER_P" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ER"
+ unicode="LATIN_CAPITAL_LETTER_R" />
+ cldr="CYRILLIC_SMALL_LETTER_ER" unicode="LATIN_SMALL_LETTER_R" />
+ cldr="CYRILLIC_CAPITAL_LETTER_ES"
+ unicode="LATIN_CAPITAL_LETTER_C" />
+ cldr="CYRILLIC_SMALL_LETTER_ES" unicode="LATIN_SMALL_LETTER_C" />
+ cldr="CYRILLIC_CAPITAL_LETTER_TE"
+ unicode="LATIN_CAPITAL_LETTER_T" />
+ cldr="CYRILLIC_SMALL_LETTER_TE" unicode="LATIN_SMALL_LETTER_T" />
+ cldr="CYRILLIC_CAPITAL_LETTER_U" unicode="LATIN_CAPITAL_LETTER_U" />
+ cldr="CYRILLIC_SMALL_LETTER_U" unicode="LATIN_SMALL_LETTER_U" />
+ cldr="CYRILLIC_CAPITAL_LETTER_EF"
+ unicode="LATIN_CAPITAL_LETTER_F" />
+ cldr="CYRILLIC_SMALL_LETTER_EF" unicode="LATIN_SMALL_LETTER_F" />
+ cldr="CYRILLIC_CAPITAL_LETTER_HA"
+ unicode="LATIN_CAPITAL_LETTER_H" />
+ cldr="CYRILLIC_SMALL_LETTER_HA" unicode="LATIN_SMALL_LETTER_H" />
+ cldr="CYRILLIC_CAPITAL_LETTER_TSE"
+ unicode="LATIN_CAPITAL_LETTER_C" />
+ cldr="CYRILLIC_SMALL_LETTER_TSE" unicode="LATIN_SMALL_LETTER_C" />
+ cldr="CYRILLIC_CAPITAL_LETTER_CHE"
+ unicode="LATIN_CAPITAL_LETTER_C_WITH_CARON" />
+ cldr="CYRILLIC_SMALL_LETTER_CHE"
+ unicode="LATIN_SMALL_LETTER_C_WITH_CARON" />
+ cldr="CYRILLIC_CAPITAL_LETTER_SHA"
+ unicode="LATIN_CAPITAL_LETTER_S_WITH_CARON" />
+ cldr="CYRILLIC_SMALL_LETTER_SHA"
+ unicode="LATIN_SMALL_LETTER_S_WITH_CARON" />
+ cldr="CYRILLIC_CAPITAL_LETTER_SHCHA"
+ unicode="LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX" />
+ cldr="CYRILLIC_SMALL_LETTER_SHCHA"
+ unicode="LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_HARD_SIGN" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_HARD_SIGN" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_YERU" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_YERU" unicode="?" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_SOFT_SIGN" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_SOFT_SIGN" unicode="?" />
+ cldr="CYRILLIC_CAPITAL_LETTER_E"
+ unicode="LATIN_CAPITAL_LETTER_E_WITH_GRAVE" />
+ cldr="CYRILLIC_SMALL_LETTER_E"
+ unicode="LATIN_SMALL_LETTER_E_WITH_GRAVE" />
+ cldr="?CYRILLIC_CAPITAL_LETTER_YU" unicode="?" />
+ cldr="?CYRILLIC_SMALL_LETTER_YU" unicode="?" />
+ cldr="CYRILLIC_CAPITAL_LETTER_YA"
+ unicode="LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX" />
+ cldr="CYRILLIC_SMALL_LETTER_YA"
+ unicode="LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX" />
+ cldr="LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW"
+ unicode="LATIN_SMALL_LETTER_T" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
+ cldr="LATIN_SMALL_LETTER_C_WITH_CARON"
+ unicode="LATIN_SMALL_LETTER_C" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
+ cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
-
+
+
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_C" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_D" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_N" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_T" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_W" />
+ unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_Y" />
+ unicode="FULLWIDTH_DIGIT_ONE" />
+ unicode="FULLWIDTH_DIGIT_TWO" />
+ unicode="FULLWIDTH_DIGIT_THREE" />
+ unicode="FULLWIDTH_DIGIT_FOUR" />
+ unicode="FULLWIDTH_DIGIT_FIVE" />
+ unicode="FULLWIDTH_DIGIT_SIX" />
+ unicode="FULLWIDTH_DIGIT_SEVEN" />
+ unicode="FULLWIDTH_DIGIT_EIGHT" />
+ unicode="FULLWIDTH_DIGIT_NINE" />
+ unicode="FULLWIDTH_DIGIT_ZERO" />
-
+ unicode="IDEOGRAPHIC_SPACE" />
+
+ unicode="FULLWIDTH_SOLIDUS" />
+ unicode="FULLWIDTH_COMMA" />
-
+ unicode="FULLWIDTH_HYPHEN-MINUS" />
+
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E00" ucc="4E00" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E03" ucc="4E03" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E09" ucc="4E09" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0A" ucc="4E0A" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0B" ucc="4E0B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E0D" ucc="4E0D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E5D" ucc="4E5D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E8C" ucc="4E8C" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-4E94" ucc="4E94" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-516B" ucc="516B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-516D" ucc="516D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5206" ucc="5206" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-524D" ucc="524D" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5341" ucc="5341" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5348" ucc="5348" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5426" ucc="5426" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5468" ucc="5468" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-56DB" ucc="56DB" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-571F" ucc="571F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5B9A" ucc="5B9A" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5E74" ucc="5E74" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-5F8C" ucc="5F8C" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-65E5" ucc="65E5" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-65F6" ucc="65F6" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-661F" ucc="661F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-662F" ucc="662F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6642" ucc="6642" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-66DC" ucc="66DC" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6708" ucc="6708" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-671F" ucc="671F" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6728" ucc="6728" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-6C34" ucc="6C34" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-706B" ucc="706B" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-786E" ucc="786E" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-78BA" ucc="78BA" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-79D2" ucc="79D2" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-9031" ucc="9031" />
+ cldr="CJK_UNIFIED_IDEOGRAPH-91D1" ucc="91D1" />
+ cldr="HANGUL_SYLLABLE_GEUM" ucc="AE08" />
+ cldr="HANGUL_SYLLABLE_NYEON" ucc="B144" />
+ cldr="HANGUL_SYLLABLE_NI" ucc="B2C8" />
+ cldr="HANGUL_SYLLABLE_MOG" ucc="BAA9" />
+ cldr="HANGUL_SYLLABLE_BUN" ucc="BD84" />
+ cldr="HANGUL_SYLLABLE_SU" ucc="C218" />
+ cldr="HANGUL_SYLLABLE_SI" ucc="C2DC" />
+ cldr="HANGUL_SYLLABLE_A" ucc="C544" />
+ cldr="HANGUL_SYLLABLE_YE" ucc="C608" />
+ cldr="HANGUL_SYLLABLE_O" ucc="C624" />
+ cldr="HANGUL_SYLLABLE_YO" ucc="C694" />
+ cldr="HANGUL_SYLLABLE_WEOL" ucc="C6D4" />
+ cldr="HANGUL_SYLLABLE_IL" ucc="C77C" />
+ cldr="HANGUL_SYLLABLE_JEON" ucc="C804" />
+ cldr="HANGUL_SYLLABLE_CO" ucc="CD08" />
+ cldr="HANGUL_SYLLABLE_TO" ucc="D1A0" />
+ cldr="HANGUL_SYLLABLE_HWA" ucc="D654" />
+ cldr="HANGUL_SYLLABLE_HU" ucc="D6C4" />
+ cldr="ONE_DOT_LEADER" unicode="FULL_STOP" />
-
+
+ cldr="NO-BREAK_SPACE" unicode="SPACE" />
+ cldr="NARROW_NO-BREAK_SPACE" unicode="NO-BREAK_SPACE" />
+ cldr="RIGHT_SINGLE_QUOTATION_MARK" unicode="APOSTROPHE" />
-
-
-
-
+
+
+
-
diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl
--- a/tools/tools/locale/tools/cldr2def.pl
+++ b/tools/tools/locale/tools/cldr2def.pl
@@ -4,6 +4,7 @@
#
# Copyright 2009 Edwin Groothuis
# Copyright 2015 John Marino
+# Copyright 2020 Hiroki Sato
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -38,7 +39,6 @@
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";
-
if ($#ARGV < 2) {
print "Usage: $0 --unidir= --etc= --type=\n";
exit(1);
@@ -69,10 +69,11 @@
my %alternativemonths = ();
get_languages();
-my %utf8map = ();
-my %utf8aliases = ();
-get_unidata($UNIDIR);
-get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
+my %utfmap = ();
+$utfmap{'UTF-8'} = {};
+$utfmap{'UTF-32'} = {};
+get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
+get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});
get_encodings("$ETCDIR/charmaps");
my %keys = ();
@@ -334,25 +335,8 @@
############################
-sub get_unidata {
- my $directory = shift;
-
- open(FIN, "$directory/UnicodeData.txt")
- or die("Cannot open $directory/UnicodeData.txt");;
- my @lines = ;
- chomp(@lines);
- close(FIN);
-
- foreach my $l (@lines) {
- my @a = split(/;/, $l);
-
- $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name
- $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code
- }
-}
-
-sub get_utf8map {
- my $file = shift;
+sub get_utfmap {
+ my ($file, $db) = @_;
open(FIN, $file);
my @lines = ;
@@ -363,7 +347,7 @@
my $prev_v = "";
my $incharmap = 0;
foreach my $l (@lines) {
- $l =~ s/\r//;
+ chomp($l);
next if ($l =~ /^\#/);
next if ($l eq "");
@@ -378,17 +362,28 @@
$l =~ /^<([^\s]+)>\s+(.*)/;
my $k = $1;
my $v = $2;
- $k =~ s/_/ /g; # unicode char string
$v =~ s/\\x//g; # UTF-8 char code
- $utf8map{$k} = $v;
+ $db->{$k} = $v;
+# print STDERR "UTF $k = $v\n";
- $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
+ # XXX: no longer needed
+ # $db_alias->{$k} = $prev_k if ($prev_v eq $v);
$prev_v = $v;
$prev_k = $k;
}
}
+sub resolve_enc_addition {
+ # Concatenate the '+'-separated terms of $_[0], minus any 0x/0X prefix.
+ my @terms = split(/\+/, $_[0]);
+
+ foreach my $term (@terms) {
+ $term =~ s/^0[xX]//;
+ }
+ return join('', @terms);
+}
+
sub get_encodings {
my $dir = shift;
foreach my $e (sort(keys(%encodings))) {
@@ -403,14 +398,20 @@
chomp(@lines);
foreach my $l (@lines) {
$l =~ s/\r//;
- next if ($l =~ /^\#/);
next if ($l eq "");
my @a = split(" ", $l);
next if ($#a < 1);
- $a[0] =~ s/^0[xX]//; # local char code
- $a[1] =~ s/^0[xX]//; # unicode char code
- $convertors{$e}{uc($a[1])} = uc($a[0]);
+ next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/);
+ next if ($a[0] eq '' or $a[1] eq '');
+
+ $a[0] = resolve_enc_addition($a[0]); # local
+ $a[1] = resolve_enc_addition($a[1]); # UTF-32
+ my $u32 = sprintf("%08X", hex($a[1]));
+# print STDERR "$a[1] => $u32\n";
+
+ # Use UTF-32 as the indices.
+ $convertors{$e}{$u32} = uc($a[0]);
}
}
}
@@ -565,8 +566,75 @@
foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
next if ($enc eq $DEFENCODING);
- copy ("$TYPE.draft/$actfile.$DEFENCODING.src",
- "$TYPE.draft/$actfile.$enc.src");
+
+ open FIN, "<$TYPE.draft/$actfile.$DEFENCODING.src";
+ open FOUT, ">$TYPE.draft/$actfile.$enc.src";
+ my $order_start = 0;
+ my $print_p = 0;
+ #
+ # %c_elem: collation elements
+ #
+ # undef: not defined
+ # 1: defined
+ # 2: invalid in this encoding
+ #
+ my %c_elem = ();
+ while () { # XXX: this loop should be refactored.
+ chomp;
+ $print_p = 1;
+ if ($order_start) {
+ $order_start = 0 if (m/^order_end/);
+ if (m/^<([^>]+)>/) {
+ if (not defined $c_elem{$1}) {
+# print STDERR "$1:\n";
+
+ my $u32 = $utfmap{'UTF-32'}->{$1};
+ die "order, $1\n" if (not defined $u32);
+# print STDERR "u32 for $1 = $u32\n";
+ if (not defined $convertors{$enc}{$u32}) {
+# print STDERR "$1 - $u32 not defined in $enc\n";
+ $print_p = 0;
+ }
+ } elsif ($c_elem{$1} == 2) {
+# print STDERR "$1 is marked as invalid in $enc\n";
+ $print_p = 0;
+ }
+ }
+ } elsif (m/^collating-element/) {
+ my ($elem, $l);
+ if (m/<([^>]+)> from (.+)/) {
+ ($elem, $l) = ($1, $2);
+ }
+# print STDERR "$elem: enter ($print_p, $l,)\n";
+ while ($print_p and
+ defined $l and
+ $l =~ m/<([^>]+)>/g) {
+# print STDERR "$elem: $1\n";
+ my $u32 = $utfmap{'UTF-32'}->{$1};
+ die "collating-element, $1\n" if (not defined $u32);
+# print STDERR "u32 for $1 = $u32\n";
+ if (not $convertors{$enc}{$u32}) {
+# print STDERR "$1 - $u32 not defined in $enc\n";
+ $print_p = 0;
+# print STDERR "Mark $elem as invalid\n";
+ $c_elem{$elem} = 2;
+ }
+ }
+ if ($print_p) {
+# print STDERR "Add $elem\n";
+ $c_elem{$elem} = 1;
+ }
+ } elsif (m/^collating-symbol <([^>]+)>/) {
+# print STDERR "Add $1\n";
+ $c_elem{$1} = 1;
+ } elsif (m/^order_start/) {
+ $order_start = 1;
+ # do nothing
+ }
+ print FOUT $_, "\n" if ($print_p);
+ }
+ close FOUT;
+ close FIN;
$languages{$l}{$f}{data}{$c}{$enc} = $shex;
$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
}
@@ -626,11 +694,11 @@
$continue = ($line =~ /\/$/);
$line =~ s/\/$// if ($continue);
- while ($line =~ /_/) {
- $line =~
- s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
- }
- die "_ in data - $line" if ($line =~ /_/);
+# while ($line =~ /_/) {
+# $line =~
+# s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
+# }
+# die "_ in data - $line" if ($line =~ /_/);
$values{$l}{$f}{$c}{$k} .= $line;
last if (!$continue);
@@ -652,56 +720,52 @@
# Conversion to UTF-8 can be done from the Unicode name to
# the UTF-8 character code.
#
- $v = $utf8map{$s};
+ $v = $utfmap{'UTF-8'}->{$s};
die "Cannot convert $s in $e (charmap)" if (!defined $v);
} else {
#
# Conversion to these encodings can be done from the Unicode
# name to Unicode code to the encodings code.
#
- my $ucc = undef;
- $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
- $ucc = $ucd{name2code}{$utf8aliases{$s}}
- if (!defined $ucc
- && $utf8aliases{$s}
- && defined $ucd{name2code}{$utf8aliases{$s}});
-
- if (!defined $ucc) {
- if (defined $translations{$e}{$s}{hex}) {
- $v = $translations{$e}{$s}{hex};
- $ucc = 0;
- } elsif (defined $translations{$e}{$s}{ucc}) {
- $ucc = $translations{$e}{$s}{ucc};
+ # hex - hex or string attr
+ # unicode - unicode attr
+ # ucc - ucc attr
+ my $hex = $translations{$e}{$s}{hex};
+ my $ucc = $utfmap{'UTF-32'}->{$s};
+ my $ucc_attr = $translations{$e}{$s}{ucc};
+ my $unicode = $translations{$e}{$s}{unicode};
+
+ if (defined $hex) { # hex is in local encoding
+ $v = $hex;
+ } elsif (defined $unicode) { # unicode is in name
+ $v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
+ } elsif (defined $ucc_attr) { # ucc is in code point
+ if (defined $ucc) {
+# print STDERR "INFO: ucc=$ucc_attr ",
+# "overrides $ucc in UTF-32\n";
}
- }
-
- die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
- $v = $convertors{$e}{$ucc} if (!defined $v);
-
- $v = $translations{$e}{$s}{hex}
- if (!defined $v && defined $translations{$e}{$s}{hex});
-
- if (!defined $v && defined $translations{$e}{$s}{unicode}) {
- my $ucn = $translations{$e}{$s}{unicode};
- $ucc = $ucd{name2code}{$ucn}
- if (defined $ucd{name2code}{$ucn});
- $ucc = $ucd{name2code}{$utf8aliases{$ucn}}
- if (!defined $ucc
- && defined $ucd{name2code}{$utf8aliases{$ucn}});
+ # normalize
+ $ucc_attr = sprintf("%08X", hex($ucc_attr));
+# print STDERR "convert $ucc_attr into $e\n";
+ $v = $convertors{$e}{$ucc_attr};
+ } elsif (defined $ucc) {
+ # normalize
+ $ucc = sprintf("%08X", hex($ucc));
+# print STDERR "convert $ucc into $e\n";
$v = $convertors{$e}{$ucc};
}
-
- die "Cannot convert $s in $e (charmap)" if (!defined $v);
+ die "Cannot convert $s in $e" if (!defined $v);
}
+ # XXX: length = 8 is not supported yet.
+ $v =~ s/^[0]+//g;
+ $v = "0" . $v if (length($v) % 2);
return pack("C", hex($v)) if (length($v) == 2);
return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
if (length($v) == 4);
return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
hex(substr($v, 4, 2))) if (length($v) == 6);
- print STDERR "Cannot convert $e $s\n";
- return "length = " . length($v);
-
+ die "Cannot convert $s in $e (length = " . length($v) . "\n";
}
sub translate {