Index: tools/tools/locale/Makefile =================================================================== --- tools/tools/locale/Makefile +++ tools/tools/locale/Makefile @@ -7,19 +7,30 @@ # # Modified by John Marino to suit DragonFly needs # +.if ${.CURDIR} == ${.OBJDIR} +.error Do make obj first. +.endif -.OBJDIR: . +LOCALESRCDIR?= ${DESTDIR}/usr/src/share +TMPDIR?= /tmp -.if !defined(UNIDIR) -.error UNIDIR is not set -.endif -PASSON= UNIDIR="${UNIDIR}" +BASEDIR= ${.CURDIR} +ETCDIR= ${BASEDIR}/etc +TOOLSDIR= ${BASEDIR}/tools +PATCHDIR= ${BASEDIR}/patch +UNIDIR= ${.OBJDIR:tA}/unicode -ETCDIR= ${.CURDIR}/etc +PKGS= openjdk8 \ + apache-ant \ + p5-XML-Parser \ + p5-Tie-IxHash \ + p5-Text-Iconv +tools-test: + pkg info -e ${PKGS} + @echo tools ok. KNOWN= monetdef numericdef msgdef colldef ctypedef # timedef TYPES?= ${KNOWN} -LOCALE_DESTDIR?= /tmp/generated-locales/ COLLATION_SPECIAL?= \ cs_CZ ISO8859-2 \ @@ -44,67 +55,81 @@ .for area enc in ${COLLATION_SPECIAL} COLLATIONS_SPECIAL_ENV+= ${area}.${enc} .endfor -PASSON+= COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" +SETENV= env -i \ + PATH="${PATH}" \ + TMPDIR="${TMPDIR}" \ + COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" \ + UNIDIR="${UNIDIR}" \ + BASEDIR="${BASEDIR}" \ + TOOLSDIR="${TOOLSDIR}" \ + ETCDIR="${ETCDIR}" -all: +all: posix build afterbuild +.ORDER: posix build afterbuild + +afterbuild: build + @echo "" + @find . -name *failed + .for t in ${TYPES} . if ${KNOWN:M${t}} - test -d ${t} || mkdir ${t} - make build-${t} +build: build-${t} +.ORDER: build-${t} afterbuild . endif .endfor - @echo "" - @find . -name *failed +diff: .for t in ${TYPES} +. if ${KNOWN:M${t}} +diff: diff-${t} +diff-${t}: + -/usr/bin/diff -ruN -x Makefile -x Makefile.depend \ + ${LOCALESRCDIR}/${t} ${t} +. endif +.endfor + +install: +.for t in ${TYPES} +. if ${KNOWN:M${t}} install: install-${t} install-${t}: -. 
if ${KNOWN:M${t}} - rm -rf ${.CURDIR}/${t}.draft - rm -f ${.CURDIR}/../../../share/${t}/Makefile - rm -f ${.CURDIR}/../../../share/${t}/*.src - mv ${.CURDIR}/${t}/* ${.CURDIR}/../../../share/${t}/ + cd ${LOCALESRCDIR}/${t} && \ + rm -f Makefile *.src && \ + install -c ${t}/* ${LOCALESRCDIR}/${t} . endif .endfor post-install: .for t in ${TYPES} . if ${KNOWN:M${t}} - (cd ${.CURDIR}/../../../share/${t} && \ - make && make install && make clean) + cd ${LOCALSRCDIR}/${t} && \ + make && make install && make clean . endif .endfor .for t in ${TYPES} -gen-${t}: - mkdir -p ${t} ${t}.draft - perl -I tools tools/cldr2def.pl \ - --unidir=$$(realpath ${UNIDIR}) \ - --etc=$$(realpath ${ETCDIR}) \ +CLEANDIRS+= ${t} ${t}.draft +${t}: + mkdir -p ${t} ${t}.draft && \ + perl -I ${TOOLSDIR} ${TOOLSDIR}/cldr2def.pl \ + --unidir=${UNIDIR:tA} \ + --etc=${ETCDIR:tA} \ --type=${t} -build-${t}: gen-${t} - env ${PASSON} tools/finalize ${t} +build-${t}: ${t} + ${SETENV} OUTBASEDIR="${.OBJDIR}/${t}" ${TOOLSDIR}/finalize ${t} .endfor -gen-ctypedef: ctype-rollup -static-colldef: gen-colldef +static-colldef: colldef build-colldef: static-colldef static-colldef: .for area enc in ${COLLATION_SPECIAL} - awk -f tools/extract-colldef.awk ${UNIDIR}/posix/${area}.${enc}.src > \ - colldef.draft/${area}.${enc}.src +colldef.draft/${area}.${enc}.src: posix/${area}.${enc}.src + awk -f ${TOOLSDIR}/extract-colldef.awk \ + ${.ALLSRC} > ${.TARGET} || (rm -f ${.TARGET} && false) .endfor -ctype-rollup: - perl -I tools tools/utf8-rollup.pl --unidir=$$(realpath ${UNIDIR}) - -clean: -.for t in ${TYPES} - rm -rf ${t} ${t}.draft -.endfor - BASE_LOCALES_OF_INTEREST?= \ af_ZA am_ET ar_AE ar_EG ar_JO ar_MA ar_QA ar_SA \ be_BY bg_BG ca_AD ca_ES ca_FR ca_IT \ @@ -145,31 +170,71 @@ US-ASCII \ UTF-8 -POSIX: posixsrc posixcol posixcm +# CLDR files +CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip +CLDRFILES_KEY= https://unicode.org/Public/cldr/35/keyboards.zip 
+CLDRFILES_TOOLS=https://unicode.org/Public/cldr/35/tools.zip +CLDRFILES_UCD= http://www.unicode.org/Public/zipped/latest/UCD.zip + +# fetch and extract targets +${UNIDIR}: + mkdir -p ${UNIDIR} +.for N in CORE KEY TOOLS UCD +${CLDRFILES_${N}:T}: + fetch ${CLDRFILES_${N}} +fetch: ${CLDRFILES_${N}:T} +extract-${CLDRFILES_${N}:T}:: ${CLDRFILES_${N}:T} ${UNIDIR} + cd ${UNIDIR} && unzip -o ../${CLDRFILES_${N}:T} +extract: extract-${CLDRFILES_${N}:T} +.endfor +patch:: +.if exists(${PATCHDIR}) + cd ${UNIDIR} && cat ${PATCHDIR}/patch-* | patch +.endif + .if !exists(${UNIDIR}/tools/java/cldr.jar) -.error check README about building cldr.jar +.ORDER: extract patch +build-tools: extract patch tools-test ${UNIDIR} + cd ${UNIDIR}/tools/java && ${SETENV} ant all jar +.else +build-tools: + @echo cldr.jar is ready. .endif + +JAVA_CLDR= java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar + +posix: posixcm post-posixcm posixsrc posixcol +.ORDER: posixcm post-posixcm posixsrc posixcol +${UNIDIR}/posix: + ln -s -f ../posix ${.TARGET} +clean-posix: + rm -rf posix ${UNIDIR}/posix +post-posixcm: ${UNIDIR}/posix + perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \ + --unidir=${UNIDIR} +.for enc in ${ENCODINGS} +posixcm: build-tools posix/${enc}.cm +.ORDER: build-tools posix/${enc}.cm +posix/${enc}.cm: + mkdir -p posix && \ + ${JAVA_CLDR} org.unicode.cldr.posix.GenerateCharmap \ + -d posix -c ${enc} +.endfor .for area in ${BASE_LOCALES_OF_INTEREST} -posixsrc: ${UNIDIR}/posix/${area}.UTF-8.src -${UNIDIR}/posix/${area}.UTF-8.src: - java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar \ - org.unicode.cldr.posix.GeneratePOSIX \ - -d ${UNIDIR}/posix -m ${area} -c UTF-8 +posixsrc: build-tools posix/${area}.UTF-8.src +.ORDER: build-tools posix/${area}.UTF-8.src +posix/${area}.UTF-8.src: + mkdir -p posix && \ + ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \ + -d posix -m ${area} -c UTF-8 .endfor .for area encoding in ${COLLATION_SPECIAL} -posixcol: 
${UNIDIR}/posix/${area}.${encoding}.src -${UNIDIR}/posix/${area}.${encoding}.src: - java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar \ - org.unicode.cldr.posix.GeneratePOSIX \ - -d ${UNIDIR}/posix -m ${area} -c ${encoding} +posixcol: build-tools posix/${area}.${encoding}.src +.ORDER: build-tools posix/${area}.${encoding}.src +posix/${area}.${encoding}.src: + mkdir -p posix && \ + ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \ + -d posix -m ${area} -c ${encoding} .endfor -.for enc in ${ENCODINGS} -posixcm: ${UNIDIR}/posix/${enc}.cm -${UNIDIR}/posix/${enc}.cm: - java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar \ - org.unicode.cldr.posix.GenerateCharmap \ - -d ${UNIDIR}/posix -c ${enc} -.endfor -clean-POSIX: - rm -f ${UNIDIR}/posix/* +.include Index: tools/tools/locale/README =================================================================== --- tools/tools/locale/README +++ tools/tools/locale/README @@ -1,32 +1,52 @@ # $FreeBSD$ -To generate the locales: +Files in this directory are used to generate locale source files +from files in CLDR (Unicode Common Locale Data Repository). -Tools needed: - java (openjdk >= 8) - perl - converters/p5-Text-Iconv - devel/apache-ant - devel/p5-Tie-IxHash - textproc/p5-XML-Parser +To generate the files, do the following: -1. Fetch CLDR data from: http://unicode.org/Public/cldr/. You need all of the -core.zip, keyboards.zip, and tools.zip. -2. Fetch unidata (UCD.zip) from http://www.unicode.org/Public/zipped/latest. -3. Extract: - mkdir -p ~/unicode - cd ~/unicode - unzip ~/core.zip - unzip ~/keyboards.zip - unzip ~/tools.zip - unzip ~/UCD.zip -4. Export variable: - UNIDIR=~/unicode; export UNIDIR -5. Build the CLDR tools: - cd $UNIDIR/tools/java - ant jar -6. Build POSIX data files from CLDR data: - make POSIX -7. 
Build and install new locale data: - make + cd /usr/src/tools/tools/locale + make obj (mandatory) + make -j16 (-jN recommended) + make diff (check if the changes are reasonable) make install + +"make" downloads the necessary files, builds them, and installs the +results into /usr/src/share/* as source files for locales. + +More details are as follows: + +Variables: + LOCALESRCDIR + Destination path for the generated locale files. + Default: $DESTDIR/usr/src/share. + TMPDIR + Temporary directory. + Default: /tmp + +Targets: + make obj + Create a temporary directory. + + make tools-test + Check if necessary tools are installed or not. + If something is missing, install it. + + make fetch + Download necessary files from CLDR. + + make build-tools + Build a tool to generate locale source files. + + make posix + Build POSIX locale source files. + + make build + Build locale files. + + make diff + Run diff(1) on the build results against $LOCALESRCDIR. + + make install + Install the build results into $LOCALESRCDIR.
+[EOF] Index: tools/tools/locale/patch/patch-UnicodeData.txt =================================================================== --- /dev/null +++ tools/tools/locale/patch/patch-UnicodeData.txt @@ -0,0 +1,29 @@ +--- UnicodeData.txt.orig 2020-06-29 14:05:49.483379000 +0900 ++++ UnicodeData.txt 2020-06-29 14:12:09.808622000 +0900 +@@ -12138,7 +12138,7 @@ + 33FE;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE;So;0;L; 0033 0031 65E5;;;;N;;;;; + 33FF;SQUARE GAL;So;0;ON; 0067 0061 006C;;;;N;;;;; + 3400;;Lo;0;L;;;;;N;;;;; +-4DBF;;Lo;0;L;;;;;N;;;;; ++4DB5;;Lo;0;L;;;;;N;;;;; + 4DC0;HEXAGRAM FOR THE CREATIVE HEAVEN;So;0;ON;;;;;N;;;;; + 4DC1;HEXAGRAM FOR THE RECEPTIVE EARTH;So;0;ON;;;;;N;;;;; + 4DC2;HEXAGRAM FOR DIFFICULTY AT THE BEGINNING;So;0;ON;;;;;N;;;;; +@@ -12204,7 +12204,7 @@ + 4DFE;HEXAGRAM FOR AFTER COMPLETION;So;0;ON;;;;;N;;;;; + 4DFF;HEXAGRAM FOR BEFORE COMPLETION;So;0;ON;;;;;N;;;;; + 4E00;;Lo;0;L;;;;;N;;;;; +-9FFC;;Lo;0;L;;;;;N;;;;; ++9FEF;;Lo;0;L;;;;;N;;;;; + A000;YI SYLLABLE IT;Lo;0;L;;;;;N;;;;; + A001;YI SYLLABLE IX;Lo;0;L;;;;;N;;;;; + A002;YI SYLLABLE I;Lo;0;L;;;;;N;;;;; +@@ -32901,7 +32901,7 @@ + 1FBF8;SEGMENTED DIGIT EIGHT;Nd;0;EN; 0038;8;8;8;N;;;;; + 1FBF9;SEGMENTED DIGIT NINE;Nd;0;EN; 0039;9;9;9;N;;;;; + 20000;;Lo;0;L;;;;;N;;;;; +-2A6DD;;Lo;0;L;;;;;N;;;;; ++2A6D6;;Lo;0;L;;;;;N;;;;; + 2A700;;Lo;0;L;;;;;N;;;;; + 2B734;;Lo;0;L;;;;;N;;;;; + 2B740;;Lo;0;L;;;;;N;;;;; Index: tools/tools/locale/tools/cldr2def.pl =================================================================== --- tools/tools/locale/tools/cldr2def.pl +++ tools/tools/locale/tools/cldr2def.pl @@ -460,6 +460,11 @@ foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { next if ($enc eq $DEFENCODING); $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"; + if ($file eq 'ja_JP') { + # Override $filename for ja_JP because + # its CTYPE is not compatible with UTF-8. + $filename = "$UNIDIR/posix/$file.eucJP.src"; + } if (! 
-f $filename) { print STDERR "Cannot open $filename\n"; next; Index: tools/tools/locale/tools/convert_map.pl =================================================================== --- tools/tools/locale/tools/convert_map.pl +++ tools/tools/locale/tools/convert_map.pl @@ -87,7 +87,7 @@ { my $file = shift; - open(UTF8, "$file") || die "open"; + open(UTF8, "$file") || die "$!: open: $file"; while () { next if (/^#/); @@ -158,7 +158,8 @@ $codeset = shift(@ARGV); my $max_mb; -load_utf8_cm("etc/final-maps/map.UTF-8"); +my $etcdir = (exists $ENV{'ETCDIR'}) ? $ENV{'ETCDIR'} : "etc"; +load_utf8_cm("${etcdir}/final-maps/map.UTF-8"); load_map($mf); Index: tools/tools/locale/tools/finalize =================================================================== --- tools/tools/locale/tools/finalize +++ tools/tools/locale/tools/finalize @@ -47,15 +47,21 @@ $1 = "numericdef" -o $1 = "timedef" -o $1 = "ctypedef" ] || usage self=$(realpath $0) -base=$(dirname ${self}) -old=${base}/../${1}.draft -new=${base}/../${1} -TEMP=/tmp/${1}.locales -TEMP2=/tmp/${1}.hashes -TEMP3=/tmp/${1}.symlinks -TEMP4=/tmp/${1}.mapped -FULLMAP=/tmp/utf8-map -FULLEXTRACT=/tmp/extracted-names +base=${BASEDIR:-$(dirname ${self})} +: ${ETCDIR:=${base}/../etc} +: ${TOOLSDIR:=${base}} +: ${OUTBASEDIR:=${base}/../${1}} +: ${OLD_DIR:=${OUTBASEDIR}.draft} +: ${NEW_DIR:=${OUTBASEDIR}} +old=${OLD_DIR} +new=${NEW_DIR} +: ${TMPDIR:=/tmp} +TEMP=${TMPDIR}/${1}.locales +TEMP2=${TMPDIR}/${1}.hashes +TEMP3=${TMPDIR}/${1}.symlinks +TEMP4=${TMPDIR}/${1}.mapped +FULLMAP=${TMPDIR}/utf8-map +FULLEXTRACT=${TMPDIR}/extracted-names AWKCMD="/## PLACEHOLDER/ { \ while ( getline line < \"${TEMP}\" ) {print line} } \ /## SYMPAIRS/ { \ @@ -65,6 +71,7 @@ !/## / { print \$0 }" # Rename the sources with 3 components name into the POSIX version of the name using @modifier +mkdir -p $old $new cd $old pwd for i in *_*_*.*.src; do @@ -142,13 +149,13 @@ rm -f ${TEMP2} /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/UTF-8.cm \ - > 
${base}/../etc/final-maps/map.UTF-8 + > ${ETCDIR}/final-maps/map.UTF-8 /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/eucCN.cm \ - > ${base}/../etc/final-maps/map.eucCN + > ${ETCDIR}/final-maps/map.eucCN /usr/bin/sed -E -e 's/[ ]+/ /g' \ ${UNIDIR}/posix/eucCN.cm \ - > ${base}/../etc/final-maps/map.GB2312 + > ${ETCDIR}/final-maps/map.GB2312 # GB18030 and Big5 are pre-generated from CLDR data CHARMAPS="ARMSCII-8 CP1131 CP1251 \ @@ -160,10 +167,11 @@ for map in ${CHARMAPS} do encoding=${map} - /usr/local/bin/perl ${base}/convert_map.pl \ - ${base}/../etc/charmaps/${map}.TXT ${encoding} \ + env ETCDIR="${ETCDIR}" \ + /usr/local/bin/perl ${TOOLSDIR}/convert_map.pl \ + ${ETCDIR}/charmaps/${map}.TXT ${encoding} \ | /usr/bin/sed -E -e 's/ +/ /g' \ - > ${base}/../etc/final-maps/map.${map} + > ${ETCDIR}/final-maps/map.${map} echo map ${map} converted. done Index: tools/tools/locale/tools/utf8-rollup.pl =================================================================== --- tools/tools/locale/tools/utf8-rollup.pl +++ tools/tools/locale/tools/utf8-rollup.pl @@ -30,6 +30,7 @@ use strict; use Getopt::Long; +use Encode qw(encode decode); if ($#ARGV != 0) { print "Usage: $0 --unidir=\n"; @@ -52,6 +53,23 @@ ############################ +sub utf8to32 { + my @kl = split /\\x/, $_[0]; + + shift @kl if ($kl[0] eq ''); + my $k = pack('H2' x scalar @kl, @kl); + my $ux = encode('UTF-32BE', decode('UTF-8', $k)); + my $u = uc(unpack('H*', $ux)); + # Remove BOM + $u =~ s/^0000FEFF//; + # Remove heading bytes of 0 + while ($u =~ m/^0/ and length($u) > 4) { + $u =~ s/^0//; + } + + return $u; +} + sub get_utf8map { my $file = shift; @@ -75,9 +93,10 @@ last if ($l eq "END CHARMAP"); $l =~ /^(<[^\s]+>)\s+(.*)/; - my $k = $2; + my $k = utf8to32($2); # UTF-8 char code my $v = $1; - $k =~ s/\\x//g; # UTF-8 char code + +# print STDERR "register: $k - $v\n"; $utf8map{$k} = $v; } } @@ -103,6 +122,7 @@ close (FOUT); } + sub wctomb { my $wc = hex(shift); my $lead; @@ -143,7 +163,7 @@ foreach my $l 
(@lines) { my @d = split(/;/, $l, -1); - my $mb = wctomb($d[0]); + my $mb = $d[0]; my $cat; # XXX There are code points present in UnicodeData.txt @@ -180,9 +200,9 @@ # Check if there's upper/lower mapping if ($d[12] ne "") { - $data{'toupper'}{$mb} = wctomb($d[12]); + $data{'toupper'}{$mb} = $d[12]; } elsif ($d[13] ne "") { - $data{'tolower'}{$mb} = wctomb($d[13]); + $data{'tolower'}{$mb} = $d[13]; } } @@ -193,7 +213,7 @@ foreach my $cat (sort keys (%data)) { print FOUT "$cat\t"; $first = 1; - foreach my $mb (sort keys (%{$data{$cat}})) { + foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) { if ($first == 1) { $first = 0; } elsif ($inrange == 1) {