diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile
index 27ff255d7f9a..92f890b2f4d3 100644
--- a/tools/tools/locale/Makefile
+++ b/tools/tools/locale/Makefile
@@ -1,258 +1,262 @@
 # $FreeBSD$
 
 # See https://wiki.freebsd.org/LocaleNewApproach
 # Taken from FreeBSD svn [base]/user/edwin/locale/cldr
 #
 # needs:
 # devel/p5-Tie-IxHash
 #
 # Modified by John Marino to suit DragonFly needs
 #
 
 .if ${.CURDIR} == ${.OBJDIR}
 .error Do make obj first.
 .endif
 
 LOCALESRCDIR?=	${SRCTOP}/share
 TMPDIR?=	/tmp
 
 BASEDIR=	${.CURDIR}
 ETCDIR=		${BASEDIR}/etc
 TOOLSDIR=	${BASEDIR}/tools
 PATCHDIR=	${BASEDIR}/patch
 UNIDIR=		${.OBJDIR:tA}/unicode
 
 PKGS=	openjdk8 \
 	apache-ant \
 	p5-XML-Parser \
 	p5-Tie-IxHash \
 	p5-Text-Iconv
 tools-test:
 	pkg info -e ${PKGS}
 	@echo tools ok.
 
 KNOWN=		monetdef numericdef msgdef colldef ctypedef # timedef
 TYPES?=		${KNOWN}
 
 COLLATION_SPECIAL?= \
 	cs_CZ ISO8859-2 \
 	da_DK ISO8859-1 \
 	da_DK ISO8859-15 \
 	hr_HR ISO8859-2 \
 	hu_HU ISO8859-2 \
 	nb_NO ISO8859-1 \
 	nb_NO ISO8859-15 \
 	sk_SK ISO8859-2 \
 	sr_Latn_RS ISO8859-2 \
 	sr_Cyrl_RS ISO8859-5 \
 	zh_Hans_CN GB2312 \
 	zh_Hans_CN eucCN \
 	zh_Hant_TW Big5 \
 	zh_Hans_CN GB18030 \
 	zh_Hans_CN GBK \
 	ja_JP eucJP \
 	nn_NO ISO8859-15 \
 	nn_NO ISO8859-1
 
 .for area enc in ${COLLATION_SPECIAL}
 COLLATIONS_SPECIAL_ENV+=	${area}.${enc}
 .endfor
 SETENV=	env -i \
 	PATH="${PATH}" \
 	TMPDIR="${TMPDIR}" \
 	COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" \
 	UNIDIR="${UNIDIR}" \
 	BASEDIR="${BASEDIR}" \
 	TOOLSDIR="${TOOLSDIR}" \
 	ETCDIR="${ETCDIR}"
 
 all:	posix build afterbuild
 .ORDER:	posix build afterbuild
 
+# The pattern is quoted so that find(1), not the shell, expands it.
 afterbuild: build
 	@echo ""
-	@find . -name *failed
+	@find . -name '*failed'
 
 .for t in ${TYPES}
 .  if ${KNOWN:M${t}}
 build: build-${t}
 .ORDER: build-${t} afterbuild
 .  endif
 .endfor
 
 diff:
 .for t in ${TYPES}
 .  if ${KNOWN:M${t}}
 diff: diff-${t}
 diff-${t}:
 	-/usr/bin/diff -ruN -x Makefile -x Makefile.depend \
 	    ${LOCALESRCDIR}/${t} ${t}
 .  endif
 .endfor
 
 install:
 .for t in ${TYPES}
 .  if ${KNOWN:M${t}}
 install: install-${t}
 install-${t}:
 	cd ${LOCALESRCDIR}/${t} && \
 	    rm -f Makefile *.src && \
 	    cd ${.OBJDIR} && \
 	    install -m 644 ${t}/* ${LOCALESRCDIR}/${t}
 .  endif
 .endfor
 
+# Use ${MAKE} so that the sub-makes are run by the same make program.
 post-install:
 .for t in ${TYPES}
 .  if ${KNOWN:M${t}}
-	cd ${LOCALSRCDIR}/${t} && \
-	    make && make install && make clean
+	cd ${LOCALESRCDIR}/${t} && \
+	    ${MAKE} && ${MAKE} install && ${MAKE} clean
 .  endif
 .endfor
 
 .for t in ${TYPES}
 CLEANDIRS+=	${t} ${t}.draft
 
 ${t}:
 	mkdir -p ${t} ${t}.draft && \
 	perl -I ${TOOLSDIR} ${TOOLSDIR}/cldr2def.pl \
 		--unidir=${UNIDIR:tA} \
 		--etc=${ETCDIR:tA} \
 		--type=${t}
 
 build-${t}: ${t}
 	${SETENV} OUTBASEDIR="${.OBJDIR}/${t}" ${TOOLSDIR}/finalize ${t}
 .endfor
 
 static-colldef: colldef
 build-colldef:	static-colldef
 
 static-colldef:
 .for area enc in ${COLLATION_SPECIAL}
 	awk -f ${TOOLSDIR}/extract-colldef.awk \
 	    posix/${area}.${enc}.src > colldef.draft/${area}.${enc}.src
 .endfor
 
 BASE_LOCALES_OF_INTEREST?= \
 	af_ZA am_ET ar_AE ar_EG ar_JO ar_MA ar_QA ar_SA \
 	be_BY bg_BG ca_AD ca_ES ca_FR ca_IT \
 	cs_CZ da_DK de_AT de_CH de_DE el_GR en_AU en_CA \
 	en_GB en_HK en_IE en_NZ en_PH en_SG en_US en_ZA \
 	es_AR es_CR es_ES es_MX et_EE eu_ES fi_FI fr_BE \
 	fr_CA fr_CH fr_FR ga_IE he_IL hi_IN hr_HR hu_HU hy_AM \
 	is_IS it_CH it_IT ja_JP ko_KR lt_LT lv_LV \
 	nb_NO nl_BE nl_NL nn_NO pl_PL pt_BR pt_PT ro_RO \
 	ru_RU se_FI se_NO sk_SK sl_SI sv_FI sv_SE tr_TR \
 	uk_UA \
 	kk_KZ mn_MN sr_Cyrl_RS sr_Latn_RS \
 	zh_Hans_CN zh_Hant_HK zh_Hant_TW \
 	bn_IN gu_IN or_IN ta_IN te_IN kn_IN ml_IN si_LK \
 	th_TH lo_LA bo_IN my_MM pa_Guru_IN ka_GE chr_US \
 	km_KH shi_Tfng_MA ii_CN vai_Vaii_LR vi_VN
 
 ENCODINGS=	Big5 \
 		CP1251 \
 		CP866 \
 		CP949 \
 		eucCN \
 		eucJP \
 		eucKR \
 		GB18030 \
 		GB2312 \
 		GBK \
 		ISO8859-1 \
 		ISO8859-13 \
 		ISO8859-15 \
 		ISO8859-2 \
 		ISO8859-5 \
 		ISO8859-7 \
 		ISO8859-9 \
 		KOI8-R \
 		KOI8-U \
 		SJIS \
 		US-ASCII \
-		UTF-8
+		UTF-8 \
+		UTF-32
 
 # CLDR files
 CLDRFILES_CORE=	https://unicode.org/Public/cldr/35/core.zip
 CLDRFILES_KEY=	https://unicode.org/Public/cldr/35/keyboards.zip
 CLDRFILES_TOOLS=https://unicode.org/Public/cldr/35/tools.zip
 CLDRFILES_UCD=	http://www.unicode.org/Public/zipped/latest/UCD.zip
 
 # fetch and extract targets
 ${UNIDIR}:
 	mkdir -p ${UNIDIR}
 .for N in CORE KEY TOOLS UCD
 ${CLDRFILES_${N}:T}:
 	fetch ${CLDRFILES_${N}}
 fetch: ${CLDRFILES_${N}:T}
 extract-${CLDRFILES_${N}:T}:: ${CLDRFILES_${N}:T} ${UNIDIR}
 	cd ${UNIDIR} && unzip -o ../${CLDRFILES_${N}:T}
 extract: extract-${CLDRFILES_${N}:T}
 .endfor
 	grep 'name="version"' ${UNIDIR}/tools/build.xml | \
 	    sed 's/.* value="//;s/".*//' > ${UNIDIR}/cldr-version
 patch::
 .if exists(${PATCHDIR})
 	cd ${UNIDIR} && cat ${PATCHDIR}/patch-* | patch
 .endif
 
 .if !exists(${UNIDIR}/tools/java/cldr.jar)
 .ORDER: extract patch
 build-tools: extract patch tools-test ${UNIDIR}
 	cd ${UNIDIR}/tools/java && ${SETENV} ant all jar
 .else
 build-tools:
 	@echo cldr.jar is ready.
 .endif
 
 JAVA_CLDR= java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar
 
 posix: posixcm post-posixcm posixsrc posixcol
 .ORDER: posixcm post-posixcm posixsrc posixcol
 ${UNIDIR}/posix:
 	ln -s -f ../posix ${.TARGET}
 clean-posix:
 	rm -rf posix ${UNIDIR}/posix
-post-posixcm: ${UNIDIR}/posix
+${UNIDIR}/posix/xx_Comm_C.UTF-8.src: ${UNIDIR}/posix
 	perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \
 	    --unidir=${UNIDIR}
+post-posixcm: ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
 .for enc in ${ENCODINGS}
 posixcm: build-tools posix/${enc}.cm
 .ORDER: build-tools posix/${enc}.cm
 posix/${enc}.cm:
 	mkdir -p posix && \
 	    ${JAVA_CLDR} org.unicode.cldr.posix.GenerateCharmap \
 		-d posix -c ${enc}
 .endfor
 
 .for area in ${BASE_LOCALES_OF_INTEREST}
 posixsrc: build-tools posix/${area}.UTF-8.src
 .ORDER: build-tools posix/${area}.UTF-8.src
 posix/${area}.UTF-8.src:
 	mkdir -p posix && \
 	    ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
 		-d posix -m ${area} -c UTF-8
 .endfor
 
 .for area encoding in ${COLLATION_SPECIAL}
 posixcol: build-tools posix/${area}.${encoding}.src
 .ORDER: build-tools posix/${area}.${encoding}.src
 posix/${area}.${encoding}.src:
 	mkdir -p posix && \
 	    ${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
 		-d posix -m ${area} -c ${encoding}
 .endfor
 
-# generate widths.txt using the data from libut8proc
+# generate widths.txt using the data from libutf8proc
 GETWIDTHS=${TOOLSDIR}/getwidths
 MKWIDTHS=${TOOLSDIR}/mkwidths.pl
 WIDTHS=	${ETCDIR}/final-maps/widths.txt
 
 U8CFLAGS!=pkgconf --cflags libutf8proc
 U8LIBS!=pkgconf --libs libutf8proc
 CFLAGS+=${U8CFLAGS}
 LDFLAGS+=${U8LIBS}
 
 CLEANFILES+=${TOOLSDIR}/getwidths
 
 widths: ${WIDTHS}
 
 ${WIDTHS}: posixcm ${GETWIDTHS}
 	${GETWIDTHS} | ${MKWIDTHS} ${.OBJDIR}/posix/UTF-8.cm ${.TARGET}
 
 .include <bsd.obj.mk>
diff --git a/tools/tools/locale/README b/tools/tools/locale/README
index 0b5ce24b51cd..380786929b7c 100644
--- a/tools/tools/locale/README
+++ b/tools/tools/locale/README
@@ -1,62 +1,67 @@
 # $FreeBSD$
 
 Files in this directory are used to generate locale source files
 from files in CLDR (Unicode Common Locale Data Repository).
 
 To generate the files, do the following:
 
  cd /usr/src/tools/tools/locale
  make obj	(mandatory)
  make -j16	(-jN recommended)
  make diff	(check if the changes are reasonable)
  make install
 
 "make" downloads the necessary files, build them, and install the
 results into /usr/src/share/* as source files for locales.
 
 More details are as follows:
 
 Variables:
 	LOCALESRCDIR
 		Destination path for the generated locale files.
-		Default: $DESTDIR/usr/src/share.
+		Default: ${SRCTOP}/share.
 	TMPDIR
 		Temporary directory.
 		Default: /tmp
 
 Targets:
 	make obj
 		Create a temporary directory for building.
 
 	make clean
-		Clean up the obj directories.
+		Clean up the obj directories.  Note that this does not
+		clean up tools or posix locale source files generated
+		from the CLDR files, because generating them takes a long
+		time and they do not change as long as the same CLDR files
+		are used.  "make clean && make build" will regenerate the
+		locale source files for src/share/*def.
 
 	make cleandir
 		Remove the obj directories completely.
 
 	make tools-test
 		Check if necessary tools are installed or not.
 		If something is missing, install them.
 
 	make fetch
 		Download necessary files from CLDR.
 
 	make build-tools
 		Build a tool to generate locale source files.
make posix Build POSIX locale source files. make build Build locale files. make diff Run diff(1) the build results against $LOCALESRCDIR. make install Install the build results into $LOCALESRCDIR. make widths Generate widths.txt. Requires pkgconf and utf8proc packages to be installed. [EOF] diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml index 78a344d6929e..52e80f2dee05 100644 --- a/tools/tools/locale/etc/charmaps.xml +++ b/tools/tools/locale/etc/charmaps.xml @@ -1,750 +1,759 @@ + - - - - + + + + + cldr="MINUS_SIGN" unicode="HYPHEN-MINUS" /> + cldr="EN_DASH" unicode="HYPHEN-MINUS" /> + cldr="CYRILLIC_CAPITAL_LETTER_JE" + unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_JE" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_LJE" string="lj" /> + cldr="CYRILLIC_SMALL_LETTER_LJE" string="lj" /> + cldr="CYRILLIC_CAPITAL_LETTER_A" unicode="LATIN_CAPITAL_LETTER_A" /> + cldr="CYRILLIC_SMALL_LETTER_A" unicode="LATIN_SMALL_LETTER_A" /> + cldr="CYRILLIC_CAPITAL_LETTER_BE" + unicode="LATIN_CAPITAL_LETTER_B" /> + cldr="CYRILLIC_SMALL_LETTER_BE" unicode="LATIN_SMALL_LETTER_B" /> + cldr="CYRILLIC_CAPITAL_LETTER_VE" + unicode="LATIN_CAPITAL_LETTER_B" /> + cldr="CYRILLIC_SMALL_LETTER_VE" unicode="LATIN_SMALL_LETTER_B" /> + cldr="CYRILLIC_CAPITAL_LETTER_GHE" + unicode="LATIN_CAPITAL_LETTER_G" /> + cldr="CYRILLIC_SMALL_LETTER_GHE" unicode="LATIN_SMALL_LETTER_G" /> + cldr="CYRILLIC_CAPITAL_LETTER_DE" string="D" /> + cldr="CYRILLIC_SMALL_LETTER_DE" string="d" /> + cldr="CYRILLIC_CAPITAL_LETTER_IE" + unicode="LATIN_CAPITAL_LETTER_E" /> + cldr="CYRILLIC_SMALL_LETTER_IE" unicode="LATIN_SMALL_LETTER_E" /> + cldr="CYRILLIC_CAPITAL_LETTER_ZHE" string="ZH" /> + cldr="CYRILLIC_SMALL_LETTER_ZHE" string="zh" /> + cldr="CYRILLIC_CAPITAL_LETTER_ZE" string="z" /> + cldr="CYRILLIC_SMALL_LETTER_ZE" string="z" /> + cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_I" 
unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_KA" + unicode="LATIN_CAPITAL_LETTER_K" /> + cldr="CYRILLIC_SMALL_LETTER_KA" unicode="LATIN_SMALL_LETTER_K" /> + cldr="CYRILLIC_CAPITAL_LETTER_EL" + unicode="LATIN_CAPITAL_LETTER_L" /> + cldr="CYRILLIC_SMALL_LETTER_EL" unicode="LATIN_SMALL_LETTER_L" /> + cldr="CYRILLIC_CAPITAL_LETTER_EM" + unicode="LATIN_CAPITAL_LETTER_M" /> + cldr="CYRILLIC_SMALL_LETTER_EM" unicode="LATIN_SMALL_LETTER_M" /> + cldr="CYRILLIC_CAPITAL_LETTER_EN" + unicode="LATIN_CAPITAL_LETTER_H" /> + cldr="CYRILLIC_SMALL_LETTER_EN" unicode="LATIN_SMALL_LETTER_H" /> + cldr="CYRILLIC_CAPITAL_LETTER_O" unicode="LATIN_CAPITAL_LETTER_O" /> + cldr="CYRILLIC_SMALL_LETTER_O" unicode="LATIN_SMALL_LETTER_O" /> + cldr="CYRILLIC_CAPITAL_LETTER_PE" + unicode="LATIN_CAPITAL_LETTER_P" /> + cldr="CYRILLIC_SMALL_LETTER_PE" unicode="LATIN_SMALL_LETTER_P" /> + cldr="CYRILLIC_CAPITAL_LETTER_ER" + unicode="LATIN_CAPITAL_LETTER_R" /> + cldr="CYRILLIC_SMALL_LETTER_ER" unicode="LATIN_SMALL_LETTER_R" /> + cldr="CYRILLIC_CAPITAL_LETTER_ES" + unicode="LATIN_CAPITAL_LETTER_C" /> + cldr="CYRILLIC_SMALL_LETTER_ES" unicode="LATIN_SMALL_LETTER_C" /> + cldr="CYRILLIC_CAPITAL_LETTER_TE" + unicode="LATIN_CAPITAL_LETTER_T" /> + cldr="CYRILLIC_SMALL_LETTER_TE" unicode="LATIN_SMALL_LETTER_T" /> + cldr="CYRILLIC_CAPITAL_LETTER_U" unicode="LATIN_CAPITAL_LETTER_U" /> + cldr="CYRILLIC_SMALL_LETTER_U" unicode="LATIN_SMALL_LETTER_U" /> + cldr="CYRILLIC_CAPITAL_LETTER_EF" + unicode="LATIN_CAPITAL_LETTER_F" /> + cldr="CYRILLIC_SMALL_LETTER_EF" unicode="LATIN_SMALL_LETTER_F" /> + cldr="CYRILLIC_CAPITAL_LETTER_HA" + unicode="LATIN_CAPITAL_LETTER_H" /> + cldr="CYRILLIC_SMALL_LETTER_HA" unicode="LATIN_SMALL_LETTER_H" /> + cldr="CYRILLIC_CAPITAL_LETTER_TSE" + unicode="LATIN_CAPITAL_LETTER_C" /> + cldr="CYRILLIC_SMALL_LETTER_TSE" 
unicode="LATIN_SMALL_LETTER_C" /> + cldr="CYRILLIC_CAPITAL_LETTER_CHE" + unicode="LATIN_CAPITAL_LETTER_C_WITH_CARON" /> + cldr="CYRILLIC_SMALL_LETTER_CHE" + unicode="LATIN_SMALL_LETTER_C_WITH_CARON" /> + cldr="CYRILLIC_CAPITAL_LETTER_SHA" + unicode="LATIN_CAPITAL_LETTER_S_WITH_CARON" /> + cldr="CYRILLIC_SMALL_LETTER_SHA" + unicode="LATIN_SMALL_LETTER_S_WITH_CARON" /> + cldr="CYRILLIC_CAPITAL_LETTER_SHCHA" + unicode="LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX" /> + cldr="CYRILLIC_SMALL_LETTER_SHCHA" + unicode="LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX" /> + cldr="?CYRILLIC_CAPITAL_LETTER_HARD_SIGN" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_HARD_SIGN" unicode="?" /> + cldr="?CYRILLIC_CAPITAL_LETTER_YERU" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_YERU" unicode="?" /> + cldr="?CYRILLIC_CAPITAL_LETTER_SOFT_SIGN" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_SOFT_SIGN" unicode="?" /> + cldr="CYRILLIC_CAPITAL_LETTER_E" + unicode="LATIN_CAPITAL_LETTER_E_WITH_GRAVE" /> + cldr="CYRILLIC_SMALL_LETTER_E" + unicode="LATIN_SMALL_LETTER_E_WITH_GRAVE" /> + cldr="?CYRILLIC_CAPITAL_LETTER_YU" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_YU" unicode="?" 
/> + cldr="CYRILLIC_CAPITAL_LETTER_YA" + unicode="LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX" /> + cldr="CYRILLIC_SMALL_LETTER_YA" + unicode="LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX" /> + cldr="LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW" + unicode="LATIN_SMALL_LETTER_T" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> + cldr="LATIN_SMALL_LETTER_C_WITH_CARON" + unicode="LATIN_SMALL_LETTER_C" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> - - - - - - - + + + + + + + - - + + + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_C" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_D" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_N" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_T" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_W" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_Y" /> + unicode="FULLWIDTH_DIGIT_ONE" /> + unicode="FULLWIDTH_DIGIT_TWO" /> + unicode="FULLWIDTH_DIGIT_THREE" /> + unicode="FULLWIDTH_DIGIT_FOUR" /> + unicode="FULLWIDTH_DIGIT_FIVE" /> + unicode="FULLWIDTH_DIGIT_SIX" /> + unicode="FULLWIDTH_DIGIT_SEVEN" /> + unicode="FULLWIDTH_DIGIT_EIGHT" /> + unicode="FULLWIDTH_DIGIT_NINE" /> + unicode="FULLWIDTH_DIGIT_ZERO" /> - + unicode="IDEOGRAPHIC_SPACE" /> + + unicode="FULLWIDTH_SOLIDUS" /> + unicode="FULLWIDTH_COMMA" /> - + unicode="FULLWIDTH_HYPHEN-MINUS" /> + + cldr="CJK_UNIFIED_IDEOGRAPH-4E00" ucc="4E00" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E03" ucc="4E03" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E09" ucc="4E09" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0A" ucc="4E0A" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0B" ucc="4E0B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0D" ucc="4E0D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E5D" ucc="4E5D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E8C" ucc="4E8C" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E94" ucc="4E94" /> + cldr="CJK_UNIFIED_IDEOGRAPH-516B" ucc="516B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-516D" ucc="516D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5206" ucc="5206" /> + cldr="CJK_UNIFIED_IDEOGRAPH-524D" ucc="524D" /> + 
cldr="CJK_UNIFIED_IDEOGRAPH-5341" ucc="5341" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5348" ucc="5348" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5426" ucc="5426" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5468" ucc="5468" /> + cldr="CJK_UNIFIED_IDEOGRAPH-56DB" ucc="56DB" /> + cldr="CJK_UNIFIED_IDEOGRAPH-571F" ucc="571F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5B9A" ucc="5B9A" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5E74" ucc="5E74" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5F8C" ucc="5F8C" /> + cldr="CJK_UNIFIED_IDEOGRAPH-65E5" ucc="65E5" /> + cldr="CJK_UNIFIED_IDEOGRAPH-65F6" ucc="65F6" /> + cldr="CJK_UNIFIED_IDEOGRAPH-661F" ucc="661F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-662F" ucc="662F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6642" ucc="6642" /> + cldr="CJK_UNIFIED_IDEOGRAPH-66DC" ucc="66DC" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6708" ucc="6708" /> + cldr="CJK_UNIFIED_IDEOGRAPH-671F" ucc="671F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6728" ucc="6728" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6C34" ucc="6C34" /> + cldr="CJK_UNIFIED_IDEOGRAPH-706B" ucc="706B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-786E" ucc="786E" /> + cldr="CJK_UNIFIED_IDEOGRAPH-78BA" ucc="78BA" /> + cldr="CJK_UNIFIED_IDEOGRAPH-79D2" ucc="79D2" /> + cldr="CJK_UNIFIED_IDEOGRAPH-9031" ucc="9031" /> + cldr="CJK_UNIFIED_IDEOGRAPH-91D1" ucc="91D1" /> + cldr="HANGUL_SYLLABLE_GEUM" ucc="AE08" /> + cldr="HANGUL_SYLLABLE_NYEON" ucc="B144" /> + cldr="HANGUL_SYLLABLE_NI" ucc="B2C8" /> + cldr="HANGUL_SYLLABLE_MOG" ucc="BAA9" /> + cldr="HANGUL_SYLLABLE_BUN" ucc="BD84" /> + cldr="HANGUL_SYLLABLE_SU" ucc="C218" /> + cldr="HANGUL_SYLLABLE_SI" ucc="C2DC" /> + cldr="HANGUL_SYLLABLE_A" ucc="C544" /> + cldr="HANGUL_SYLLABLE_YE" ucc="C608" /> + cldr="HANGUL_SYLLABLE_O" ucc="C624" /> + cldr="HANGUL_SYLLABLE_YO" ucc="C694" /> + cldr="HANGUL_SYLLABLE_WEOL" ucc="C6D4" /> + cldr="HANGUL_SYLLABLE_IL" ucc="C77C" /> + cldr="HANGUL_SYLLABLE_JEON" ucc="C804" /> + cldr="HANGUL_SYLLABLE_CO" ucc="CD08" /> + cldr="HANGUL_SYLLABLE_TO" ucc="D1A0" /> + cldr="HANGUL_SYLLABLE_HWA" ucc="D654" /> + cldr="HANGUL_SYLLABLE_HU" 
ucc="D6C4" /> + cldr="ONE_DOT_LEADER" unicode="FULL_STOP" /> - + + cldr="NO-BREAK_SPACE" unicode="SPACE" /> + cldr="NARROW_NO-BREAK_SPACE" unicode="NO-BREAK_SPACE" /> + cldr="RIGHT_SINGLE_QUOTATION_MARK" unicode="APOSTROPHE" /> - - - - + + + - "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> "; ""; ""; ""; ""; ""; ""; "

"; ""; ""; "

"; "

" ]]> "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> "; ""; ""; ""; ""; ""; "

"; "

"; ""; "

"; "

"; "" ]]> "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> "; ""; ""; ""; ""; ""; "

"; "

"; ""; "

"; "

"; "" ]]> "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> "; ""; ""; ""; ""; ""; ""; ""; ""; ""; ""; "" ]]> diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl index 8617ca81ca40..fd475db714a0 100755 --- a/tools/tools/locale/tools/cldr2def.pl +++ b/tools/tools/locale/tools/cldr2def.pl @@ -1,1068 +1,1132 @@ #!/usr/local/bin/perl -wC # SPDX-License-Identifier: BSD-2-Clause-FreeBSD # # Copyright 2009 Edwin Groothuis # Copyright 2015 John Marino +# Copyright 2020 Hiroki Sato # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. 
# # $FreeBSD$ use strict; use File::Copy; use XML::Parser; use Tie::IxHash; use Text::Iconv; #use Data::Dumper; use Getopt::Long; use Digest::SHA qw(sha1_hex); require "charmaps.pm"; - if ($#ARGV < 2) { print "Usage: $0 --unidir= --etc= --type=\n"; exit(1); } my $DEFENCODING = "UTF-8"; my $UNIDIR = undef; my $ETCDIR = undef; my $TYPE = undef; my $CLDR_VERSION = undef; my $result = GetOptions ( "unidir=s" => \$UNIDIR, "etc=s" => \$ETCDIR, "type=s" => \$TYPE, ); my %convertors = (); my %ucd = (); my %values = (); my %hashtable = (); my %languages = (); my %translations = (); my %encodings = (); my %alternativemonths = (); get_languages(); -my %utf8map = (); -my %utf8aliases = (); -get_unidata($UNIDIR); -get_utf8map("$UNIDIR/posix/$DEFENCODING.cm"); +my %utfmap = (); +$utfmap{'UTF-8'} = {}; +$utfmap{'UTF-32'} = {}; +get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'}); +get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'}); get_encodings("$ETCDIR/charmaps"); my %keys = (); tie(%keys, "Tie::IxHash"); tie(%hashtable, "Tie::IxHash"); my %FILESNAMES = ( "monetdef" => "LC_MONETARY", "timedef" => "LC_TIME", "msgdef" => "LC_MESSAGES", "numericdef" => "LC_NUMERIC", "colldef" => "LC_COLLATE", "ctypedef" => "LC_CTYPE" ); my %callback = ( mdorder => \&callback_mdorder, altmon => \&callback_altmon, cformat => \&callback_cformat, dformat => \&callback_dformat, dtformat => \&callback_dtformat, cbabmon => \&callback_abmon, cbampm => \&callback_ampm, data => undef, ); my %DESC = ( # numericdef "decimal_point" => "decimal_point", "thousands_sep" => "thousands_sep", "grouping" => "grouping", # monetdef "int_curr_symbol" => "int_curr_symbol (last character always " . 
"SPACE)", "currency_symbol" => "currency_symbol", "mon_decimal_point" => "mon_decimal_point", "mon_thousands_sep" => "mon_thousands_sep", "mon_grouping" => "mon_grouping", "positive_sign" => "positive_sign", "negative_sign" => "negative_sign", "int_frac_digits" => "int_frac_digits", "frac_digits" => "frac_digits", "p_cs_precedes" => "p_cs_precedes", "p_sep_by_space" => "p_sep_by_space", "n_cs_precedes" => "n_cs_precedes", "n_sep_by_space" => "n_sep_by_space", "p_sign_posn" => "p_sign_posn", "n_sign_posn" => "n_sign_posn", # msgdef "yesexpr" => "yesexpr", "noexpr" => "noexpr", "yesstr" => "yesstr", "nostr" => "nostr", # timedef "abmon" => "Short month names", "mon" => "Long month names (as in a date)", "abday" => "Short weekday names", "day" => "Long weekday names", "t_fmt" => "X_fmt", "d_fmt" => "x_fmt", "c_fmt" => "c_fmt", "am_pm" => "AM/PM", "d_t_fmt" => "date_fmt", "altmon" => "Long month names (without case ending)", "md_order" => "md_order", "t_fmt_ampm" => "ampm_fmt", ); if ($TYPE eq "colldef") { transform_collation(); make_makefile(); } if ($TYPE eq "ctypedef") { transform_ctypes(); make_makefile(); } if ($TYPE eq "numericdef") { %keys = ( "decimal_point" => "s", "thousands_sep" => "s", "grouping" => "ai", ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "monetdef") { %keys = ( "int_curr_symbol" => "s", "currency_symbol" => "s", "mon_decimal_point" => "s", "mon_thousands_sep" => "s", "mon_grouping" => "ai", "positive_sign" => "s", "negative_sign" => "s", "int_frac_digits" => "i", "frac_digits" => "i", "p_cs_precedes" => "i", "p_sep_by_space" => "i", "n_cs_precedes" => "i", "n_sep_by_space" => "i", "p_sign_posn" => "i", "n_sign_posn" => "i" ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "msgdef") { %keys = ( "yesexpr" => "s", "noexpr" => "s", "yesstr" => "s", "nostr" => "s" ); get_fields(); print_fields(); make_makefile(); } if ($TYPE eq "timedef") { %keys = ( "abmon" => " "as", "abday" => "as", "day" => "as", "t_fmt" => "s", 
"d_fmt" => " " " " " " "s", ); get_fields(); print_fields(); make_makefile(); } sub callback_ampm { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; my $enc = $callback{data}{e}; if ($nl eq 'ru_RU') { if ($enc eq 'UTF-8') { $s = 'дп;пп'; } else { my $converter = Text::Iconv->new("utf-8", "$enc"); $s = $converter->convert("дп;пп"); } } return $s; } sub callback_cformat { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; if ($nl eq 'ko_KR') { $s =~ s/(> )(%p)/$1%A $2/; } $s =~ s/\.,/\./; $s =~ s/ %Z//; $s =~ s/ %z//; $s =~ s/^"%e\./%A %e/; $s =~ s/^"(%B %e, )/"%A, $1/; $s =~ s/^"(%e %B )/"%A $1/; return $s; }; sub callback_dformat { my $s = shift; $s =~ s/(%m(|[-.]))%e/$1%d/; $s =~ s/%e((|[-.])%m)/%d$1/; return $s; }; sub callback_dtformat { my $s = shift; my $nl = $callback{data}{l} . "_" . $callback{data}{c}; if ($nl eq 'ja_JP') { $s =~ s/(> )(%H)/$1%A $2/; } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') { if ($nl ne 'ko_KR') { $s =~ s/%m/%_m/; } $s =~ s/(> )(%p)/$1%A $2/; } $s =~ s/\.,/\./; $s =~ s/^"%e\./%A %e/; $s =~ s/^"(%B %e, )/"%A, $1/; $s =~ s/^"(%e %B )/"%A $1/; return $s; }; sub callback_mdorder { my $s = shift; return undef if (!defined $s); $s =~ s/[^dem]//g; $s =~ s/e/d/g; return $s; }; sub callback_altmon { # if the language/country is known in %alternative months then # return that, otherwise repeat mon my $s = shift; if (defined $alternativemonths{$callback{data}{l}}{$callback{data}{c}}) { my @altnames = split(";",$alternativemonths{$callback{data}{l}}{$callback{data}{c}}); my @cleaned; foreach (@altnames) { $_ =~ s/^\s+//; $_ =~ s/\s+$//; push @cleaned, $_; } return join(";",@cleaned); } return $s; } sub callback_abmon { # for specified CJK locales, pad result with a space to enable # columns to line up (style established in FreeBSD in 2001) my $s = shift; my $nl = $callback{data}{l} . "_" . 
$callback{data}{c}; if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_HK' || $nl eq 'zh_TW') { my @monthnames = split(";", $s); my @cleaned; foreach (@monthnames) { if ($_ =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ || ($_ =~ /^"/ && $_ !~ /^"(||)/)) { $_ =~ s/^"/"/; } push @cleaned, $_; } return join(";",@cleaned); } return $s; } ############################ -sub get_unidata { - my $directory = shift; - - open(FIN, "$directory/UnicodeData.txt") - or die("Cannot open $directory/UnicodeData.txt");; - my @lines = ; - chomp(@lines); - close(FIN); - - foreach my $l (@lines) { - my @a = split(/;/, $l); - - $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name - $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code - } -} - -sub get_utf8map { - my $file = shift; +sub get_utfmap { + my ($file, $db) = @_; open(FIN, $file); my @lines = ; close(FIN); chomp(@lines); my $prev_k = undef; my $prev_v = ""; my $incharmap = 0; foreach my $l (@lines) { - $l =~ s/\r//; + chomp($l); next if ($l =~ /^\#/); next if ($l eq ""); if ($l eq "CHARMAP") { $incharmap = 1; next; } next if (!$incharmap); last if ($l eq "END CHARMAP"); $l =~ /^<([^\s]+)>\s+(.*)/; my $k = $1; my $v = $2; - $k =~ s/_/ /g; # unicode char string $v =~ s/\\x//g; # UTF-8 char code - $utf8map{$k} = $v; + $db->{$k} = $v; +# print STDERR "UTF $k = $v\n"; - $utf8aliases{$k} = $prev_k if ($prev_v eq $v); + # XXX: no longer needed + # $db_alias->{$k} = $prev_k if ($prev_v eq $v); $prev_v = $v; $prev_k = $k; } } +sub resolve_enc_addition { + my $ret = ''; + + foreach my $t (split(/\+/, $_[0])) { + $t =~ s/^0[xX]//; + $ret .= $t; + } + return $ret; +} + sub get_encodings { my $dir = shift; foreach my $e (sort(keys(%encodings))) { if (!open(FIN, "$dir/$e.TXT")) { print "Cannot open charmap for $e\n"; next; } $encodings{$e} = 1; my @lines = ; close(FIN); chomp(@lines); foreach my $l (@lines) { $l =~ s/\r//; - next if ($l =~ /^\#/); next if ($l eq ""); my @a = split(" ", $l); next if ($#a < 1); - $a[0] =~ 
s/^0[xX]//; # local char code - $a[1] =~ s/^0[xX]//; # unicode char code - $convertors{$e}{uc($a[1])} = uc($a[0]); + next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/); + next if ($a[0] eq '' or $a[1] eq ''); + + $a[0] = resolve_enc_addition($a[0]); # local + $a[1] = resolve_enc_addition($a[1]); # UTF-32 + my $u32 = sprintf("%08X", hex($a[1])); +# print STDERR "$a[1] => $u32\n"; + + # Use UTF-32 as the indices. + $convertors{$e}{$u32} = uc($a[0]); } } } sub get_languages { my %data = get_xmldata($ETCDIR); %languages = %{$data{L}}; %translations = %{$data{T}}; %alternativemonths = %{$data{AM}}; %encodings = %{$data{E}}; } sub transform_ctypes { # Add the C.UTF-8 $languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef; foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c if ($c ne "x"); my $actfile = $file; my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src"; if (! -f $filename) { print STDERR "Cannot open $filename\n"; next; } open(FIN, "$filename"); print "Reading from $filename for ${l}_${f}_${c}\n"; $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read my @lines; my $shex; my $uhex; while () { push @lines, $_; } close(FIN); $shex = sha1_hex(join("\n", @lines)); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src"); print FOUT @lines; close(FOUT); foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { next if ($enc eq $DEFENCODING); $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"; if ($file eq 'ja_JP') { # Override $filename for ja_JP because # its CTYPE is not compatible with UTF-8. $filename = "$UNIDIR/posix/$file.eucJP.src"; } if (! 
-f $filename) { print STDERR "Cannot open $filename\n"; next; } @lines = (); open(FIN, "$filename"); while () { if ((/^comment_char\s/) || (/^escape_char\s/)){ push @lines, $_; } if (/^LC_CTYPE/../^END LC_CTYPE/) { push @lines, $_; } } close(FIN); $uhex = sha1_hex(join("\n", @lines) . $enc); $languages{$l}{$f}{data}{$c}{$enc} = $uhex; $hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$enc.src"); print FOUT <) { if ((/^comment_char\s/) || (/^escape_char\s/)){ push @lines, $_; } if (/^LC_COLLATE/../^END LC_COLLATE/) { $_ =~ s/[ ]+/ /g; push @lines, $_; } } close(FIN); $shex = sha1_hex(join("\n", @lines)); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1; open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src"); print FOUT <$TYPE.draft/$actfile.$enc.src"; + my $order_start = 0; + my $print_p = 0; + # + # %c_elem: collation elements + # + # undef: not defined + # 1: defined + # 2: invalid in this encoding + # + my %c_elem = (); + while () { # XXX: this loop should be refactored. 
+ chomp; + $print_p = 1; + if ($order_start) { + $order_start = 0 if (m/^order_end/); + if (m/^<([^>]+)>/) { + if (not defined $c_elem{$1}) { +# print STDERR "$1:\n"; + + my $u32 = $utfmap{'UTF-32'}->{$1}; + die "order, $1\n" if (not defined $u32); +# print STDERR "u32 for $1 = $u32\n"; + if (not defined $convertors{$enc}{$u32}) { +# print STDERR "$1 - $u32 not defined in $enc\n"; + $print_p = 0; + } + } elsif ($c_elem{$1} == 2) { +# print STDERR "$1 is marked as invalid in $enc\n"; + $print_p = 0; + } + } + } elsif (m/^collating-element/) { + my ($elem, $l); + if (m/<([^>]+)> from (.+)/) { + ($elem, $l) = ($1, $2); + } +# print STDERR "$elem: enter ($print_p, $l,)\n"; + while ($print_p and + defined $l and + $l =~ m/<([^>]+)>/g) { +# print STDERR "$elem: $1\n"; + my $u32 = $utfmap{'UTF-32'}->{$1}; + die "collating-element, $1\n" if (not defined $u32); +# print STDERR "u32 for $1 = $u32\n"; + if (not $convertors{$enc}{$u32}) { +# print STDERR "$1 - $u32 not defined in $enc\n"; + $print_p = 0; +# print STDERR "Mark $elem as invalid\n"; + $c_elem{$elem} = 2; + } + } + if ($print_p) { +# print STDERR "Add $elem\n"; + $c_elem{$elem} = 1; + } + } elsif (m/^collating-symbol <([^>]+)>/) { +# print STDERR "Add $1\n"; + $c_elem{$1} = 1; + } elsif (m/^order_start/) { + $order_start = 1; + # do nothing + } + print FOUT $_, "\n" if ($print_p); + } + close FOUT; + close FIN; $languages{$l}{$f}{data}{$c}{$enc} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1; } } } } } sub get_fields { foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread my $file; $file = $l . "_"; $file .= $f . "_" if ($f ne "x"); $file .= $c; my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"; $filename = "$ETCDIR/$file.$DEFENCODING.src" if (! 
-f $filename); if (! -f $filename && defined $languages{$l}{$f}{fallback}) { $file = $languages{$l}{$f}{fallback}; $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"; } $filename = "$UNIDIR/posix/$file.$DEFENCODING.src" if (! -f $filename); if (! -f $filename) { print STDERR "Cannot open $file.$DEFENCODING.src or fallback\n"; next; } open(FIN, "$filename"); print "Reading from $filename for ${l}_${f}_${c}\n"; $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read my @lines = ; chomp(@lines); close(FIN); my $continue = 0; foreach my $k (keys(%keys)) { foreach my $line (@lines) { $line =~ s/\r//; next if (!$continue && $line !~ /^$k\s/); if ($continue) { $line =~ s/^\s+//; } else { $line =~ s/^$k\s+//; } $values{$l}{$f}{$c}{$k} = "" if (!defined $values{$l}{$f}{$c}{$k}); $continue = ($line =~ /\/$/); $line =~ s/\/$// if ($continue); - while ($line =~ /_/) { - $line =~ - s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; - } - die "_ in data - $line" if ($line =~ /_/); +# while ($line =~ /_/) { +# $line =~ +# s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; +# } +# die "_ in data - $line" if ($line =~ /_/); $values{$l}{$f}{$c}{$k} .= $line; last if (!$continue); } } } } } } sub decodecldr { my $e = shift; my $s = shift; my $v = undef; if ($e eq "UTF-8") { # # Conversion to UTF-8 can be done from the Unicode name to # the UTF-8 character code. # - $v = $utf8map{$s}; + $v = $utfmap{'UTF-8'}->{$s}; die "Cannot convert $s in $e (charmap)" if (!defined $v); } else { # # Conversion to these encodings can be done from the Unicode # name to Unicode code to the encodings code. 
# - my $ucc = undef; - $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s}); - $ucc = $ucd{name2code}{$utf8aliases{$s}} - if (!defined $ucc - && $utf8aliases{$s} - && defined $ucd{name2code}{$utf8aliases{$s}}); - - if (!defined $ucc) { - if (defined $translations{$e}{$s}{hex}) { - $v = $translations{$e}{$s}{hex}; - $ucc = 0; - } elsif (defined $translations{$e}{$s}{ucc}) { - $ucc = $translations{$e}{$s}{ucc}; + # hex - hex or string attr + # unicode - unicode attr + # ucc - ucc attr + my $hex = $translations{$e}{$s}{hex}; + my $ucc = $utfmap{'UTF-32'}->{$s}; + my $ucc_attr = $translations{$e}{$s}{ucc}; + my $unicode = $translations{$e}{$s}{unicode}; + + if (defined $hex) { # hex is in local encoding + $v = $hex; + } elsif (defined $unicode) { # unicode is in name + $v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}}; + } elsif (defined $ucc_attr) { # ucc is in code point + if (defined $ucc) { +# print STDERR "INFO: ucc=$ucc_attr ", +# "overrides $ucc in UTF-32\n"; } - } - - die "Cannot convert $s in $e (ucd string)" if (!defined $ucc); - $v = $convertors{$e}{$ucc} if (!defined $v); - - $v = $translations{$e}{$s}{hex} - if (!defined $v && defined $translations{$e}{$s}{hex}); - - if (!defined $v && defined $translations{$e}{$s}{unicode}) { - my $ucn = $translations{$e}{$s}{unicode}; - $ucc = $ucd{name2code}{$ucn} - if (defined $ucd{name2code}{$ucn}); - $ucc = $ucd{name2code}{$utf8aliases{$ucn}} - if (!defined $ucc - && defined $ucd{name2code}{$utf8aliases{$ucn}}); + # normalize + $ucc_attr = sprintf("%08X", hex($ucc_attr)); +# print STDERR "convert $ucc_attr into $e\n"; + $v = $convertors{$e}{$ucc_attr}; + } elsif (defined $ucc) { + # normalize + $ucc = sprintf("%08X", hex($ucc)); +# print STDERR "convert $ucc into $e\n"; $v = $convertors{$e}{$ucc}; } - - die "Cannot convert $s in $e (charmap)" if (!defined $v); + die "Cannot convert $s in $e" if (!defined $v); } + # XXX: length = 8 is not supported yet. + $v =~ s/^[0]+//g; + $v = "0" . 
$v if (length($v) % 2); return pack("C", hex($v)) if (length($v) == 2); return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2))) if (length($v) == 4); return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)), hex(substr($v, 4, 2))) if (length($v) == 6); - print STDERR "Cannot convert $e $s\n"; - return "length = " . length($v); - + die "Cannot convert $s in $e (length = " . length($v) . "\n"; } sub translate { my $enc = shift; my $v = shift; return $translations{$enc}{$v} if (defined $translations{$enc}{$v}); return undef; } sub print_fields { foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") { print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") . "${c} - not read\n"; next; } my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c; print "Writing to $file in $enc\n"; if ($enc ne $DEFENCODING && !defined $convertors{$enc}) { print "Failed! Cannot convert to $enc.\n"; next; }; open(FOUT, ">$TYPE.draft/$file.$enc.new"); my $okay = 1; my $output = ""; print FOUT </) { $k = substr($g, 1); $g = $keys{$k}; } # Callback function if ($g =~ /^\(.*)/) { my $p1 = $1; $cm = $2; my $p3 = $3; my $rv = decodecldr($enc, $cm); # $rv = translate($enc, $cm) # if (!defined $rv); if (!defined $rv) { print STDERR "Could not convert $k ($cm) from $DEFENCODING to $enc\n"; $okay = 0; next; } $v = $p1 . $rv . 
$p3; } $output .= "$v\n"; next; } if ($g eq "as") { foreach my $v (split(/;/, $v)) { $v =~ s/^"//; $v =~ s/"$//; my $cm = ""; while ($v =~ /^(.*?)<(.*?)>(.*)/) { my $p1 = $1; $cm = $2; my $p3 = $3; my $rv = decodecldr($enc, $cm); # $rv = translate($enc, # $cm) # if (!defined $rv); if (!defined $rv) { print STDERR "Could not convert $k ($cm) from $DEFENCODING to $enc\n"; $okay = 0; next; } $v = $1 . $rv . $3; } $output .= "$v\n"; } next; } die("$k is '$g'"); } $languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output); $hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1; print FOUT "$output# EOF\n"; close(FOUT); if ($okay) { rename("$TYPE.draft/$file.$enc.new", "$TYPE.draft/$file.$enc.src"); } else { rename("$TYPE.draft/$file.$enc.new", "$TYPE.draft/$file.$enc.failed"); } } } } } } sub make_makefile { print "Creating Makefile for $TYPE\n"; my $SRCOUT; my $SRCOUT2; my $SRCOUT3 = ""; my $SRCOUT4 = ""; my $MAPLOC; if ($TYPE eq "colldef") { # In future, we might want to try to put the CLDR version into # the .src files with some new syntax, instead of the makefile. $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " . "-i \${.IMPSRC} \\\n" . "\t-V \${CLDR_VERSION} \\\n" . "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " . "\${.OBJDIR}/\${.IMPSRC:T:R}"; $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . "locale/etc/final-maps\n"; $SRCOUT2 = "LC_COLLATE"; $SRCOUT3 = "" . ".for f t in \${LOCALES_MAPPED}\n" . "FILES+=\t\$t.LC_COLLATE\n" . "FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" . "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" . "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " . "-i \${.ALLSRC} \\\n" . "\t-V \${CLDR_VERSION} \\\n" . "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" . "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" . ".endfor\n\n"; $SRCOUT4 = "## LOCALES_MAPPED\n"; } elsif ($TYPE eq "ctypedef") { $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " . "-w \${MAPLOC}/widths.txt \\\n" . "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " . 
"\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " . " || true"; $SRCOUT2 = "LC_CTYPE"; $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" . "locale/etc/final-maps\n"; $SRCOUT3 = "## SYMPAIRS\n\n" . ".for s t in \${SYMPAIRS}\n" . "\${t:S/src\$/LC_CTYPE/}: " . "\$s\n" . "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " . "-w \${MAPLOC}/widths.txt \\\n" . "\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " . "\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " . " || true\n" . ".endfor\n\n"; } else { $SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}"; $SRCOUT2 = "out"; $MAPLOC = ""; } open(FOUT, ">$TYPE.draft/Makefile"); print FOUT < EOF } print FOUT < 0) { my $link = shift(@files); $link =~ s/_x_x//; # special case for C $link =~ s/_x_/_/; # strip family if none there foreach my $file (@files) { my @a = split(/_/, $file); my @b = split(/\./, $a[-1]); $file =~ s/_x_/_/; print FOUT "SAME+=\t\t$link $file\n"; undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]}); } } } foreach my $l (sort keys(%languages)) { foreach my $f (sort keys(%{$languages{$l}})) { foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) { next if (defined $languages{$l}{$f}{definitions} && $languages{$l}{$f}{definitions} !~ /$TYPE/); if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING} && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") { print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") . "${c} - not read\n"; next; } foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) { my $file = $l; $file .= "_" . $f if ($f ne "x"); $file .= "_" . $c if ($c ne "x"); next if (!defined $languages{$l}{$f}{data}{$c}{$e}); print FOUT "LOCALES+=\t$file.$e\n"; } if (defined $languages{$l}{$f}{nc_link}) { foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) { my $file = $l . "_"; $file .= $f . 
"_" if ($f ne "x"); $file .= $c; print FOUT "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n"; } } if (defined $languages{$l}{$f}{e_link}) { foreach my $el (split(" ", $languages{$l}{$f}{e_link})) { my @a = split(/:/, $el); my $file = $l . "_"; $file .= $f . "_" if ($f ne "x"); $file .= $c; print FOUT "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n"; } } } } } print FOUT < EOF close(FOUT); }