Index: head/tools/tools/locale/tools/charmaps.pm =================================================================== --- head/tools/tools/locale/tools/charmaps.pm (revision 352273) +++ head/tools/tools/locale/tools/charmaps.pm (revision 352274) @@ -1,160 +1,184 @@ #!/usr/local/bin/perl -w +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright 2009 Edwin Groothuis +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. # # $FreeBSD$ # use strict; use XML::Parser; use Data::Dumper; my %data = (); my %d = (); my $index = -1; sub get_xmldata { my $etcdir = shift; open(FIN, "$etcdir/charmaps.xml"); my @xml = ; chomp(@xml); close(FIN); my $xml = new XML::Parser(Handlers => { Start => \&h_start, End => \&h_end, Char => \&h_char }); $xml->parse(join("", @xml)); return %d; } sub h_start { my $expat = shift; my $element = shift; my @attrs = @_; my %attrs = (); while ($#attrs >= 0) { $attrs{$attrs[0]} = $attrs[1]; shift(@attrs); shift(@attrs); } $data{element}{++$index} = $element; if ($index == 2 && $data{element}{1} eq "languages" && $element eq "language") { my $name = $attrs{name}; my $countries = $attrs{countries}; my $encoding = $attrs{encoding}; my $family = $attrs{family}; my $f = defined $attrs{family} ? $attrs{family} : "x"; my $nc_link = $attrs{namecountry_link}; my $e_link = $attrs{encoding_link}; my $fallback = $attrs{fallback}; my $definitions = $attrs{definitions}; $d{L}{$name}{$f}{fallback} = $fallback; $d{L}{$name}{$f}{e_link} = $e_link; $d{L}{$name}{$f}{nc_link} = $nc_link; $d{L}{$name}{$f}{family} = $family; $d{L}{$name}{$f}{encoding} = $encoding; $d{L}{$name}{$f}{definitions} = $definitions; $d{L}{$name}{$f}{countries} = $countries; foreach my $c (split(" ", $countries)) { if (defined $encoding) { foreach my $e (split(" ", $encoding)) { $d{L}{$name}{$f}{data}{$c}{$e} = undef; $d{E}{$e} = 0; # not read } } $d{L}{$name}{$f}{data}{$c}{"UTF-8"} = undef; } return; } if ($index == 2 && $data{element}{1} eq "translations" && $element eq "translation") { foreach my $e (split(" ", $attrs{encoding})) { if (defined $attrs{hex}) { my $k = $attrs{cldr}; my $hs = $attrs{hex}; $d{T}{$e}{$k}{hex} = $hs; } if (defined $attrs{string}) { my $s = ""; for (my $i = 0; $i < length($attrs{string}); $i++) { $s .= sprintf("%02x", ord(substr($attrs{string}, $i, 1))); } $d{T}{$e}{$attrs{cldr}}{hex} = $s; } if (defined $attrs{unicode}) { my $k = $attrs{cldr}; my $uc = $attrs{unicode}; $d{T}{$e}{$k}{unicode} = $uc; } if (defined $attrs{ucc}) { my $k = $attrs{cldr}; my $uc = $attrs{ucc}; $d{T}{$e}{$k}{ucc} = $uc; } } return; } if ($index == 2 && $data{element}{1} eq "alternativemonths" && $element eq "language") { my $name = $attrs{name}; my $countries = $attrs{countries}; $data{fields}{name} = $name; $data{fields}{countries} = $countries; $data{fields}{text} = ""; return; } } sub h_end { my $expat = shift; my $element = shift; if ($index == "2") { if ($data{element}{1} eq "alternativemonths" && $data{element}{2} eq "language") { foreach my $c (split(/,/, $data{fields}{countries})) { my $m = $data{fields}{text}; $m =~ s/^[\t ]//g; $m =~ s/[\t ]$//g; $d{AM}{$data{fields}{name}}{$c} = $m; } $data{fields} = (); } } $index--; } sub h_char { my $expat = shift; my $string = shift; if ($index == "2") { if ($data{element}{1} eq "alternativemonths" && $data{element}{2} eq "language") { $data{fields}{text} .= $string; } } } #use Data::Dumper; #my %D = get_xmldata(); #print Dumper(%D); 1; Index: head/tools/tools/locale/tools/cldr2def.pl =================================================================== --- head/tools/tools/locale/tools/cldr2def.pl (revision 352273) +++ head/tools/tools/locale/tools/cldr2def.pl (revision 352274) @@ -1,1017 +1,1044 @@ #!/usr/local/bin/perl -wC + +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright 2009 Edwin Groothuis +# Copyright 2015 John Marino +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# # $FreeBSD$ use strict; use File::Copy; use XML::Parser; use Tie::IxHash; use Text::Iconv; #use Data::Dumper; use Getopt::Long; use Digest::SHA qw(sha1_hex); require "charmaps.pm"; if ($#ARGV < 2) { print "Usage: $0 --unidir= --etc= --type=\n"; exit(1); } my $DEFENCODING = "UTF-8"; my $UNIDIR = undef; my $ETCDIR = undef; my $TYPE = undef; my $result = GetOptions ( "unidir=s" => \$UNIDIR, "etc=s" => \$ETCDIR, "type=s" => \$TYPE, ); my %convertors = (); my %ucd = (); my %values = (); my %hashtable = (); my %languages = (); my %translations = (); my %encodings = (); my %alternativemonths = (); get_languages(); my %utf8map = (); my %utf8aliases = (); get_unidata($UNIDIR); get_utf8map("$UNIDIR/posix/$DEFENCODING.cm"); get_encodings("$ETCDIR/charmaps"); my %keys = (); tie(%keys, "Tie::IxHash"); tie(%hashtable, "Tie::IxHash"); my %FILESNAMES = ( "monetdef" => "LC_MONETARY", "timedef" => "LC_TIME", "msgdef" => "LC_MESSAGES", "numericdef" => "LC_NUMERIC", "colldef" => "LC_COLLATE", "ctypedef" => "LC_CTYPE" ); my %callback = ( mdorder => \&callback_mdorder, altmon => \&callback_altmon, cformat => \&callback_cformat, dformat => \&callback_dformat, dtformat => \&callback_dtformat, cbabmon => \&callback_abmon, cbampm => \&callback_ampm, data => undef, ); my %DESC = ( # numericdef "decimal_point" => "decimal_point", "thousands_sep" => "thousands_sep", "grouping" => "grouping", # monetdef "int_curr_symbol" => "int_curr_symbol (last character always " . "SPACE)", "currency_symbol" => "currency_symbol", "mon_decimal_point" => "mon_decimal_point", "mon_thousands_sep" => "mon_thousands_sep", "mon_grouping" => "mon_grouping", "positive_sign" => "positive_sign", "negative_sign" => "negative_sign", "int_frac_digits" => "int_frac_digits", "frac_digits" => "frac_digits", "p_cs_precedes" => "p_cs_precedes", "p_sep_by_space" => "p_sep_by_space", "n_cs_precedes" => "n_cs_precedes", "n_sep_by_space" => "n_sep_by_space", "p_sign_posn" => "p_sign_posn", "n_sign_posn" => "n_sign_posn", # msgdef "yesexpr" => "yesexpr", "noexpr" => "noexpr", "yesstr" => "yesstr", "nostr" => "nostr", # timedef "abmon" => "Short month names", "mon" => "Long month names (as in a date)", "abday" => "Short weekday names", "day" => "Long weekday names", "t_fmt" => "X_fmt", "d_fmt" => "x_fmt", "c_fmt" => "c_fmt", "am_pm" => "AM/PM", "d_t_fmt" => "date_fmt", "altmon" => "Long month names (without case ending)", "md_order" => "md_order", "t_fmt_ampm" => "ampm_fmt", ); if ($TYPE eq "colldef") { transform_collation(); make_makefile(); } if ($TYPE eq "ctypedef") { transform_ctypes(); make_makefile(); } if ($TYPE eq "numericdef") { %keys = ( "decimal_point" => "s", "thousands_sep" => "s", "grouping" => "ai", ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "monetdef") { %keys = ( "int_curr_symbol" => "s", "currency_symbol" => "s", "mon_decimal_point" => "s", "mon_thousands_sep" => "s", "mon_grouping" => "ai", "positive_sign" => "s", "negative_sign" => "s", "int_frac_digits" => "i", "frac_digits" => "i", "p_cs_precedes" => "i", "p_sep_by_space" => "i", "n_cs_precedes" => "i", "n_sep_by_space" => "i", "p_sign_posn" => "i", "n_sign_posn" => "i" ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "msgdef") { %keys = ( "yesexpr" => "s", "noexpr" => "s", "yesstr" => "s", "nostr" => "s" ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "timedef") { %keys = ( "abmon" => " "as", "abday" => "as", "day" => "as", "t_fmt" => "s", "d_fmt" => " " " " " " "s", ); get_fields(); print_fields(); make_makefile(); } sub callback_ampm { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; my $enc = $callback{data}{e}; if ($nl eq 'ru_RU') { if ($enc eq 'UTF-8') { $s = 'дп;пп'; } else { my $converter = Text::Iconv->new("utf-8", "$enc"); $s = $converter->convert("дп;пп"); } } return $s; } sub callback_cformat { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; if ($nl eq 'ko_KR') { $s =~ s/(> )(%p)/$1%A $2/; } $s =~ s/\.,/\./; $s =~ s/ %Z//; $s =~ s/ %z//; $s =~ s/^"%e\./%A %e/; $s =~ s/^"(%B %e, )/"%A, $1/; $s =~ s/^"(%e %B )/"%A $1/; return $s; }; sub callback_dformat { my $s = shift; $s =~ s/(%m(|[-.]))%e/$1%d/; $s =~ s/%e((|[-.])%m)/%d$1/; return $s; }; sub callback_dtformat { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; if ($nl eq 'ja_JP') { $s =~ s/(> )(%H)/$1%A $2/; } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') { if ($nl ne 'ko_KR') { $s =~ s/%m/%_m/; } $s =~ s/(> )(%p)/$1%A $2/; } $s =~ s/\.,/\./; $s =~ s/^"%e\./%A %e/; $s =~ s/^"(%B %e, )/"%A, $1/; $s =~ s/^"(%e %B )/"%A $1/; return $s; }; sub callback_mdorder { my $s = shift; return undef if (!defined $s); $s =~ s/[^dem]//g; $s =~ s/e/d/g; return $s; }; sub callback_altmon { # if the language/country is known in %alternative months then # return that, otherwise repeat mon my $s = shift; if (defined $alternativemonths{$callback{data}{l}}{$callback{data}{c}}) { my @altnames = split(";",$alternativemonths{$callback{data}{l}}{$callback{data}{c}}); my @cleaned; foreach (@altnames) { $_ =~ s/^\s+//; $_ =~ s/\s+$//; push @cleaned, $_; } return join(";",@cleaned); } return $s; } sub callback_abmon { # for specified CJK locales, pad result with a space to enable # columns to line up (style established in FreeBSD in 2001) my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_HK' || $nl eq 'zh_TW') { my @monthnames = split(";", $s); my @cleaned; foreach (@monthnames) { if ($_ =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ || ($_ =~ /^"/ && $_ !~ /^"(||)/)) { $_ =~ s/^"/"/; } push @cleaned, $_; } return join(";",@cleaned); } return $s; } ############################ sub get_unidata { my $directory = shift; open(FIN, "$directory/UnicodeData.txt") or die("Cannot open $directory/UnicodeData.txt");; my @lines = ; chomp(@lines); close(FIN); foreach my $l (@lines) { my @a = split(/;/, $l); $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code } } sub get_utf8map { my $file = shift; open(FIN, $file); my @lines = ; close(FIN); chomp(@lines); my $prev_k = undef; my $prev_v = ""; my $incharmap = 0; foreach my $l (@lines) { $l =~ s/\r//; next if ($l =~ /^\#/); next if ($l eq ""); if ($l eq "CHARMAP") { $incharmap = 1; next; } next if (!$incharmap); last if ($l eq "END CHARMAP"); $l =~ /^<([^\s]+)>\s+(.*)/; my $k = $1; my $v = $2; $k =~ s/_/ /g; # unicode char string $v =~ s/\\x//g; # UTF-8 char code $utf8map{$k} = $v; $utf8aliases{$k} = $prev_k if ($prev_v eq $v); $prev_v = $v; $prev_k = $k; } } sub get_encodings { my $dir = shift; foreach my $e (sort(keys(%encodings))) { if (!open(FIN, "$dir/$e.TXT")) { print "Cannot open charmap for $e\n"; next; } $encodings{$e} = 1; my @lines = ; close(FIN); chomp(@lines); foreach my $l (@lines) { $l =~ s/\r//; next if ($l =~ /^\#/); next if ($l eq ""); my @a = split(" ", $l); next if ($#a < 1); $a[0] =~ s/^0[xX]//; # local char code $a[1] =~ s/^0[xX]//; # unicode char code $convertors{$e}{uc($a[1])} = uc($a[0]); } } } sub get_languages { my %data = get_xmldata($ETCDIR); %languages = %{$data{L}}; %translations = %{$data{T}}; %alternativemonths = %{$data{AM}}; %encodings = %{$data{E}}; } sub transform_ctypes { # Add the C.UTF-8 $languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef; foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c if ($c ne "x"); my $actfile = $file; my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; if (! -f $filename) { print STDERR "Cannot open $filename\n"; next; } open(FIN, "$filename"); print "Reading from $filename for ${l}_${f}_${c}\n"; $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read my @lines; my $shex; my $uhex; while () { push @lines, $_; } close(FIN); $shex = sha1_hex(join("\n", @lines)); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src"); print FOUT @lines; close(FOUT); foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { next if ($enc eq $DEFENCODING); $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"; if (! -f $filename) { print STDERR "Cannot open $filename\n"; next; } @lines = (); open(FIN, "$filename"); while () { if ((/^comment_char\s/) || (/^escape_char\s/)){ push @lines, $_; } if (/^LC_CTYPE/../^END LC_CTYPE/) { push @lines, $_; } } close(FIN); $uhex = sha1_hex(join("\n", @lines) . $enc); $languages{$l}{$f}{data}{$c}{$enc} = $uhex; $hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$enc.src"); print FOUT <) { if ((/^comment_char\s/) || (/^escape_char\s/)){ push @lines, $_; } if (/^LC_COLLATE/../^END LC_COLLATE/) { $_ =~ s/[ ]+/ /g; push @lines, $_; } } close(FIN); $shex = sha1_hex(join("\n", @lines)); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src"); print FOUT <; chomp(@lines); close(FIN); my $continue = 0; foreach my $k (keys(%keys)) { foreach my $line (@lines) { $line =~ s/\r//; next if (!$continue && $line !~ /^$k\s/); if ($continue) { $line =~ s/^\s+//; } else { $line =~ s/^$k\s+//; } $values{$l}{$f}{$c}{$k} = "" if (!defined $values{$l}{$f}{$c}{$k}); $continue = ($line =~ /\/$/); $line =~ s/\/$// if ($continue); while ($line =~ /_/) { $line =~ s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; } die "_ in data - $line" if ($line =~ /_/); $values{$l}{$f}{$c}{$k} .= $line; last if (!$continue); } } } } } } sub decodecldr { my $e = shift; my $s = shift; my $v = undef; if ($e eq "UTF-8") { # # Conversion to UTF-8 can be done from the Unicode name to # the UTF-8 character code. # $v = $utf8map{$s}; die "Cannot convert $s in $e (charmap)" if (!defined $v); } else { # # Conversion to these encodings can be done from the Unicode # name to Unicode code to the encodings code. # my $ucc = undef; $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s}); $ucc = $ucd{name2code}{$utf8aliases{$s}} if (!defined $ucc && $utf8aliases{$s} && defined $ucd{name2code}{$utf8aliases{$s}}); if (!defined $ucc) { if (defined $translations{$e}{$s}{hex}) { $v = $translations{$e}{$s}{hex}; $ucc = 0; } elsif (defined $translations{$e}{$s}{ucc}) { $ucc = $translations{$e}{$s}{ucc}; } } die "Cannot convert $s in $e (ucd string)" if (!defined $ucc); $v = $convertors{$e}{$ucc} if (!defined $v); $v = $translations{$e}{$s}{hex} if (!defined $v && defined $translations{$e}{$s}{hex}); if (!defined $v && defined $translations{$e}{$s}{unicode}) { my $ucn = $translations{$e}{$s}{unicode}; $ucc = $ucd{name2code}{$ucn} if (defined $ucd{name2code}{$ucn}); $ucc = $ucd{name2code}{$utf8aliases{$ucn}} if (!defined $ucc && defined $ucd{name2code}{$utf8aliases{$ucn}}); $v = $convertors{$e}{$ucc}; } die "Cannot convert $s in $e (charmap)" if (!defined $v); } return pack("C", hex($v)) if (length($v) == 2); return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2))) if (length($v) == 4); return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)), hex(substr($v, 4, 2))) if (length($v) == 6); print STDERR "Cannot convert $e $s\n"; return "length = " . length($v); } sub translate { my $enc = shift; my $v = shift; return $translations{$enc}{$v} if (defined $translations{$enc}{$v}); return undef; } sub print_fields { foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") { print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") . "${c} - not read\n"; next; } my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c; print "Writing to $file in $enc\n"; if ($enc ne $DEFENCODING && !defined $convertors{$enc}) { print "Failed! Cannot convert to $enc.\n"; next; }; open(FOUT, ">$TYPE.draft/$file.$enc.new"); my $okay = 1; my $output = ""; print FOUT </) { $k = substr($g, 1); $g = $keys{$k}; } # Callback function if ($g =~ /^\(.*)/) { my $p1 = $1; $cm = $2; my $p3 = $3; my $rv = decodecldr($enc, $cm); # $rv = translate($enc, $cm) # if (!defined $rv); if (!defined $rv) { print STDERR "Could not convert $k ($cm) from $DEFENCODING to $enc\n"; $okay = 0; next; } $v = $p1 . $rv . $p3; } $output .= "$v\n"; next; } if ($g eq "as") { foreach my $v (split(/;/, $v)) { $v =~ s/^"//; $v =~ s/"$//; my $cm = ""; while ($v =~ /^(.*?)<(.*?)>(.*)/) { my $p1 = $1; $cm = $2; my $p3 = $3; my $rv = decodecldr($enc, $cm); # $rv = translate($enc, # $cm) # if (!defined $rv); if (!defined $rv) { print STDERR "Could not convert $k ($cm) from $DEFENCODING to $enc\n"; $okay = 0; next; } $v = $1 . $rv . $3; } $output .= "$v\n"; } next; } die("$k is '$g'"); } $languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output); $hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1; print FOUT "$output# EOF\n"; close(FOUT); if ($okay) { rename("$TYPE.draft/$file.$enc.new", "$TYPE.draft/$file.$enc.src"); } else { rename("$TYPE.draft/$file.$enc.new", "$TYPE.draft/$file.$enc.failed"); } } } } } } sub make_makefile { print "Creating Makefile for $TYPE\n"; my $SRCOUT; my $SRCOUT2; my $SRCOUT3 = ""; my $SRCOUT4 = ""; my $MAPLOC; if ($TYPE eq "colldef") { $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " . "-i \${.IMPSRC} \\\n" . "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " . "\${.OBJDIR}/\${.IMPSRC:T:R}"; $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . "locale/etc/final-maps\n"; $SRCOUT2 = "LC_COLLATE"; $SRCOUT3 = "" . ".for f t in \${LOCALES_MAPPED}\n" . "FILES+=\t\$t.LC_COLLATE\n" . "FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" . "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" . "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " . "-i \${.ALLSRC} \\\n" . "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" . "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" . ".endfor\n\n"; $SRCOUT4 = "## LOCALES_MAPPED\n"; } elsif ($TYPE eq "ctypedef") { $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " . "-w \${MAPLOC}/widths.txt \\\n" . "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " . "\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " . " || true"; $SRCOUT2 = "LC_CTYPE"; $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . "locale/etc/final-maps\n"; $SRCOUT3 = "## SYMPAIRS\n\n" . ".for s t in \${SYMPAIRS}\n" . "\${t:S/src\$/LC_CTYPE/}: " . "\$s\n" . "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " . "-w \${MAPLOC}/widths.txt \\\n" . "\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " . "\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " . " || true\n" . ".endfor\n\n"; } else { $SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}"; $SRCOUT2 = "out"; $MAPLOC = ""; } open(FOUT, ">$TYPE.draft/Makefile"); print FOUT < EOF } print FOUT < 0) { my $link = shift(@files); $link =~ s/_x_x//; # special case for C $link =~ s/_x_/_/; # strip family if none there foreach my $file (@files) { my @a = split(/_/, $file); my @b = split(/\./, $a[-1]); $file =~ s/_x_/_/; print FOUT "SAME+=\t\t$link $file\n"; undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]}); } } } foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING} && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") { print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") . "${c} - not read\n"; next; } foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) { my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c if ($c ne "x"); next if (!defined $languages{$l}{$f}{data}{$c}{$e}); print FOUT "LOCALES+=\t$file.$e\n"; } if (defined $languages{$l}{$f}{nc_link}) { foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) { my $file = $l . "_"; $file .= $f . "_" if ($f ne "x"); $file .= $c; print FOUT "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n"; } } if (defined $languages{$l}{$f}{e_link}) { foreach my $el (split(" ", $languages{$l}{$f}{e_link})) { my @a = split(/:/, $el); my $file = $l . "_"; $file .= $f . "_" if ($f ne "x"); $file .= $c; print FOUT "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n"; } } } } } print FOUT < EOF close(FOUT); } Index: head/tools/tools/locale/tools/extract-colldef.awk =================================================================== --- head/tools/tools/locale/tools/extract-colldef.awk (revision 352273) +++ head/tools/tools/locale/tools/extract-colldef.awk (revision 352274) @@ -1,12 +1,38 @@ +# +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright 2016 Baptiste Daroussin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# # $FreeBSD$ BEGIN { print "# Warning: Do not edit. This is automatically extracted" print "# from CLDR project data, obtained from http://cldr.unicode.org/" print "# -----------------------------------------------------------------------------" } $1 == "comment_char" { print } $1 == "escape_char" { print } $1 == "LC_COLLATE" { doprint = 1 } doprint == 1 { print } $1 == "END" && $2 == "LC_COLLATE" { exit 0 } Index: head/tools/tools/locale/tools/finalize =================================================================== --- head/tools/tools/locale/tools/finalize (revision 352273) +++ head/tools/tools/locale/tools/finalize (revision 352274) @@ -1,182 +1,207 @@ #!/bin/sh # +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright 2015 John Marino +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# # $FreeBSD$ # # This is a helper script for the Makefile in the parent directory. # When the localization definitions are generated in the draft area, # this script will copy base ones that others symlink to, and rearrange # the generate makefile to pull the LOCALES first. # set -e usage () { echo "finalize ' to package standard localization" echo "type must be one of { monetdef, msgdef, numericdef, timedef, colldef, ctypedef }" exit 1 } [ $# -ne 1 ] && usage [ $1 = "monetdef" -o $1 = "msgdef" -o $1 = "colldef" -o \ $1 = "numericdef" -o $1 = "timedef" -o $1 = "ctypedef" ] || usage self=$(realpath $0) base=$(dirname ${self}) old=${base}/../${1}.draft new=${base}/../${1} TEMP=/tmp/${1}.locales TEMP2=/tmp/${1}.hashes TEMP3=/tmp/${1}.symlinks TEMP4=/tmp/${1}.mapped FULLMAP=/tmp/utf8-map FULLEXTRACT=/tmp/extracted-names AWKCMD="/## PLACEHOLDER/ { \ while ( getline line < \"${TEMP}\" ) {print line} } \ /## SYMPAIRS/ { \ while ( getline line < \"${TEMP3}\" ) {print line} } \ /## LOCALES_MAPPED/ { \ while ( getline line < \"${TEMP4}\" ) {print line} } \ !/## / { print \$0 }" # Rename the sources with 3 components name into the POSIX version of the name using @modifier cd $old pwd for i in *_*_*.*.src; do if [ "$i" = "*_*_*.*.src" ]; then break fi oldname=${i%.*} nname=`echo $oldname | awk '{ split($0, a, "_"); print a[1]"_"a[3]"@"a[2];} '` mv -f ${oldname}.src ${nname}.src sed -i '' -e "s/${oldname}/${nname}/g" Makefile COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${nname}/g") done # For variable without @modifier ambiguity do not keep the @modifier for i in *@*.src; do if [ "$i" = "*@*.src" ]; then break fi oldname=${i%.*} shortname=${oldname%@*} if [ $(ls ${shortname}@* | wc -l) -eq 1 ] ; then mv -f $i ${shortname}.src sed -i '' -e "s/${oldname}/${shortname}/g" Makefile COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${shortname}/g") fi done # Rename the modifiers into non abbreviated version for i in *@Latn.src; do if [ "$i" = "*@Latn.src" ]; then break fi mv -f ${i} ${i%@*}@latin.src sed -i '' -e "s/${i%.*}/${i%@*}@latin/g" Makefile COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@latin/g") done for i in *@Cyrl.src; do if [ "$i" = "*@Cyrl.src" ]; then break fi mv -f ${i} ${i%@*}@cyrillic.src sed -i '' -e "s/${i%.*}/${i%@*}@cyrillic/g" Makefile COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@cyrillic/g") done # On locales with multiple modifiers rename the "default" version without the @modifier default_locales="sr_RS@cyrillic" for i in ${default_locales}; do localename=${i%@*} mod=${i#*@} for l in ${localename}.*@${mod}.src; do if [ "$l" = "${localename}.*@${mod}.src" ]; then break fi mv -f ${l} ${l%@*}.src sed -i '' -e "s/${l%.*}/${l%@*}/g" Makefile done done cd - grep '^LOCALES+' ${old}/Makefile > ${TEMP} if [ $1 = "ctypedef" ] then keep=$(cat ${TEMP} | awk '{ print $2 ".src" }') (cd ${old} && md5 -r ${keep} | sort) > ${TEMP2} keep=$(awk '{ if ($1 != last1) print $2; last1 = $1; }' ${TEMP2}) for original in ${keep} do cp ${old}/${original} ${new}/ done awk '{ if ($1 == last1) { print "SYMPAIRS+=\t" last2 " " $2 } \ else {last1 = $1; last2 = $2}}' ${TEMP2} > ${TEMP3} rm -f ${TEMP2} /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/UTF-8.cm \ > ${base}/../etc/final-maps/map.UTF-8 /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/eucCN.cm \ > ${base}/../etc/final-maps/map.eucCN /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/eucCN.cm \ > ${base}/../etc/final-maps/map.GB2312 CHARMAPS="ARMSCII-8 Big5 CP1131 CP1251 \ CP866 GBK ISCII-DEV ISO8859-1 \ ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \ ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \ PT154 SJIS US-ASCII eucJP eucKR" # GB18030 blows up, use pre-generate Illumos version for map in ${CHARMAPS} do encoding=${map} /usr/local/bin/perl ${base}/convert_map.pl \ ${base}/../etc/charmaps/${map}.TXT ${encoding} \ | /usr/bin/sed -E -e 's/ +/ /g' \ > ${base}/../etc/final-maps/map.${map} echo map ${map} converted. done elif [ $1 = "colldef" ] then awk -v tmp4=${TEMP4} '$1 == "SAME+=" && $0 !~ /legacy/ { orig=$2 dest=$3 gsub(/.*\./, "", orig) gsub(/.*\./, "", dest) if (orig != dest ) print "LOCALES_MAPPED+=\t"$2 " "$3 > tmp4 }' ${old}/Makefile for line in $(awk '{ print $3 }' ${TEMP4}); do sed -i '' "/^SAME.*$line$/d" ${old}/Makefile done echo "" >> ${TEMP4} for enc in ${COLLATIONS_SPECIAL}; do sed -i '' "/^.*${enc}$/d" ${TEMP4} echo "LOCALES+= ${enc}" >> ${TEMP4} done keep=$(cat ${TEMP} | awk '{ print $2 }') for original in ${keep} ${COLLATIONS_SPECIAL} do cp ${old}/${original}.src ${new}/ done else # below is everything but ctypedef keep=$(cat ${TEMP} | awk '{ print $2 }') for original in ${keep} do cp ${old}/${original}.src ${new}/ done fi grep -v '^LOCALES+' ${old}/Makefile | awk "${AWKCMD}" > ${new}/Makefile rm -f ${TEMP} ${TEMP3} ${TEMP4} Index: head/tools/tools/locale/tools/utf8-rollup.pl =================================================================== --- head/tools/tools/locale/tools/utf8-rollup.pl (revision 352273) +++ head/tools/tools/locale/tools/utf8-rollup.pl (revision 352274) @@ -1,192 +1,219 @@ #!/usr/local/bin/perl -wC + +# SPDX-License-Identifier: BSD-2-Clause-FreeBSD +# +# Copyright 2009 Edwin Groothuis +# Copyright 2015 John Marino +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# # $FreeBSD$ use strict; use Getopt::Long; if ($#ARGV != 0) { print "Usage: $0 --unidir=\n"; exit(1); } my $UNIDIR = undef; my $result = GetOptions ( "unidir=s" => \$UNIDIR ); my %utf8map = (); my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; get_utf8map("$UNIDIR/posix/UTF-8.cm"); generate_header (); parse_unidata ("$UNIDIR/UnicodeData.txt"); generate_footer (); ############################ sub get_utf8map { my $file = shift; open(FIN, $file); my @lines = ; close(FIN); chomp(@lines); my $incharmap = 0; foreach my $l (@lines) { $l =~ s/\r//; next if ($l =~ /^\#/); next if ($l eq ""); if ($l eq "CHARMAP") { $incharmap = 1; next; } next if (!$incharmap); last if ($l eq "END CHARMAP"); $l =~ /^(<[^\s]+>)\s+(.*)/; my $k = $2; my $v = $1; $k =~ s/\\x//g; # UTF-8 char code $utf8map{$k} = $v; } } sub generate_header { open(FOUT, ">", "$outfilename") or die ("can't write to $outfilename\n"); print FOUT <= 0 && $wc <= 0x10ffff) { $lead = 0xf0; $len = 4; } for ($i = $len - 1; $i > 0; $i--) { $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret; $wc >>= 6; } $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret; return $ret; } sub parse_unidata { my $file = shift; my %data = (); open(FIN, $file); my @lines = ; close(FIN); chomp(@lines); foreach my $l (@lines) { my @d = split(/;/, $l, -1); my $mb = wctomb($d[0]); my $cat; # XXX There are code points present in UnicodeData.txt # and missing from UTF-8.cm next if !defined $utf8map{$mb}; # Define the category if ($d[2] =~ /^Lu/) { $cat = "upper"; } elsif ($d[2] =~ /^Ll/) { $cat = "lower"; } elsif ($d[2] =~ /^Nd/) { $cat = "digit"; } elsif ($d[2] =~ /^L/) { $cat = "alpha"; } elsif ($d[2] =~ /^P/) { $cat = "punct"; } elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) { $cat = "graph"; } elsif ($d[2] =~ /^C/) { $cat = "cntrl"; } elsif ($d[2] =~ /^Z/) { $cat = "space"; } $data{$cat}{$mb}{'wc'} = $d[0]; # Check if it's a start or end of range if ($d[1] =~ /First>$/) { $data{$cat}{$mb}{'start'} = 1; } elsif ($d[1] =~ /Last>$/) { $data{$cat}{$mb}{'end'} = 1; } # Check if there's upper/lower mapping if ($d[12] ne "") { $data{'toupper'}{$mb} = wctomb($d[12]); } elsif ($d[13] ne "") { $data{'tolower'}{$mb} = wctomb($d[13]); } } my $first; my $inrange = 0; # Now write out the categories foreach my $cat (sort keys (%data)) { print FOUT "$cat\t"; $first = 1; foreach my $mb (sort keys (%{$data{$cat}})) { if ($first == 1) { $first = 0; } elsif ($inrange == 1) { # Safety belt die "broken range end wc=$data{$cat}{$mb}{'wc'}" if !defined $data{$cat}{$mb}{'end'}; print FOUT ";...;"; $inrange = 0; } else { print FOUT ";/\n\t"; } if ($cat eq "tolower" || $cat eq "toupper") { print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})"; } else { if (defined($data{$cat}{$mb}{'start'})) { $inrange = 1; } print FOUT "$utf8map{$mb}"; } } print FOUT "\n"; } }