Index: lib/libc/locale/Makefile.inc =================================================================== --- lib/libc/locale/Makefile.inc +++ lib/libc/locale/Makefile.inc @@ -4,7 +4,7 @@ # locale sources .PATH: ${LIBC_SRCTOP}/${LIBC_ARCH}/locale ${LIBC_SRCTOP}/locale -SRCS+= ascii.c big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \ +SRCS+= big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \ gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \ ldpart.c lmessages.c lmonetary.c lnumeric.c localeconv.c mblen.c \ mbrlen.c \ Index: lib/libc/locale/Symbol.map =================================================================== --- lib/libc/locale/Symbol.map +++ lib/libc/locale/Symbol.map @@ -214,4 +214,7 @@ __detect_path_locale; __collate_load_error; __collate_range_cmp; + __collate_load_tables_l; + __collate_lookup; + }; Index: lib/libc/locale/ascii.c =================================================================== --- lib/libc/locale/ascii.c +++ lib/libc/locale/ascii.c @@ -1,192 +0,0 @@ -/*- - * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. - * Copyright (c) 1993 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Paul Borman at Krystal Technologies. - * - * Copyright (c) 2011 The FreeBSD Foundation - * All rights reserved. - * Portions of this software were developed by David Chisnall - * under sponsorship from the FreeBSD Foundation. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include -__FBSDID("$FreeBSD$"); - -#include -#include -#include -#include -#include -#include -#include -#include -#include "mblocal.h" - -static size_t _ascii_mbrtowc(wchar_t * __restrict, const char * __restrict, - size_t, mbstate_t * __restrict); -static int _ascii_mbsinit(const mbstate_t *); -static size_t _ascii_mbsnrtowcs(wchar_t * __restrict dst, - const char ** __restrict src, size_t nms, size_t len, - mbstate_t * __restrict ps __unused); -static size_t _ascii_wcrtomb(char * __restrict, wchar_t, - mbstate_t * __restrict); -static size_t _ascii_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, - size_t, size_t, mbstate_t * __restrict); - -int -_ascii_init(struct xlocale_ctype *l,_RuneLocale *rl) -{ - - l->__mbrtowc = _ascii_mbrtowc; - l->__mbsinit = _ascii_mbsinit; - l->__mbsnrtowcs = _ascii_mbsnrtowcs; - l->__wcrtomb = _ascii_wcrtomb; - l->__wcsnrtombs = _ascii_wcsnrtombs; - l->runes = rl; - l->__mb_cur_max = 1; - l->__mb_sb_limit = 128; - return(0); -} - -static int 
-_ascii_mbsinit(const mbstate_t *ps __unused) -{ - - /* - * Encoding is not state dependent - we are always in the - * initial state. - */ - return (1); -} - -static size_t -_ascii_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, - mbstate_t * __restrict ps __unused) -{ - - if (s == NULL) - /* Reset to initial shift state (no-op) */ - return (0); - if (n == 0) - /* Incomplete multibyte sequence */ - return ((size_t)-2); - if (*s & 0x80) { - errno = EILSEQ; - return ((size_t)-1); - } - if (pwc != NULL) - *pwc = (unsigned char)*s; - return (*s == '\0' ? 0 : 1); -} - -static size_t -_ascii_wcrtomb(char * __restrict s, wchar_t wc, - mbstate_t * __restrict ps __unused) -{ - - if (s == NULL) - /* Reset to initial shift state (no-op) */ - return (1); - if (wc < 0 || wc > 127) { - errno = EILSEQ; - return ((size_t)-1); - } - *s = (unsigned char)wc; - return (1); -} - -static size_t -_ascii_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, - size_t nms, size_t len, mbstate_t * __restrict ps __unused) -{ - const char *s; - size_t nchr; - - if (dst == NULL) { - for (s = *src; nms > 0 && *s != '\0'; s++, nms--) { - if (*s & 0x80) { - errno = EILSEQ; - return ((size_t)-1); - } - } - return (s - *src); - } - - s = *src; - nchr = 0; - while (len-- > 0 && nms-- > 0) { - if (*s & 0x80) { - errno = EILSEQ; - return ((size_t)-1); - } - if ((*dst++ = (unsigned char)*s++) == L'\0') { - *src = NULL; - return (nchr); - } - nchr++; - } - *src = s; - return (nchr); -} - -static size_t -_ascii_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, - size_t nwc, size_t len, mbstate_t * __restrict ps __unused) -{ - const wchar_t *s; - size_t nchr; - - if (dst == NULL) { - for (s = *src; nwc > 0 && *s != L'\0'; s++, nwc--) { - if (*s < 0 || *s > 127) { - errno = EILSEQ; - return ((size_t)-1); - } - } - return (s - *src); - } - - s = *src; - nchr = 0; - while (len-- > 0 && nwc-- > 0) { - if (*s < 0 || *s > 127) { - errno = EILSEQ; - return 
((size_t)-1); - } - if ((*dst++ = *s++) == '\0') { - *src = NULL; - return (nchr); - } - nchr++; - } - *src = s; - return (nchr); -} - Index: lib/libc/locale/big5.c =================================================================== --- lib/libc/locale/big5.c +++ lib/libc/locale/big5.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. @@ -19,11 +21,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* @@ -61,6 +59,12 @@ static int _BIG5_mbsinit(const mbstate_t *); static size_t _BIG5_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); +static size_t _BIG5_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _BIG5_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); typedef struct { wchar_t ch; @@ -72,6 +76,8 @@ l->__mbrtowc = _BIG5_mbrtowc; l->__wcrtomb = _BIG5_wcrtomb; + l->__mbsnrtowcs = _BIG5_mbsnrtowcs; + l->__wcsnrtombs = _BIG5_wcsnrtombs; l->__mbsinit = _BIG5_mbsinit; l->runes = rl; l->__mb_cur_max = 2; @@ -147,7 +153,7 @@ wc = (wc << 8) | (*s++ & 0xff); if (pwc != NULL) *pwc = wc; - return (2); + return (2); } else { if (pwc != NULL) *pwc = wc; @@ -178,3 +184,17 @@ *s = wc & 0xff; return (1); } + +static size_t +_BIG5_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _BIG5_mbrtowc)); +} + +static size_t +_BIG5_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _BIG5_wcrtomb)); +} Index: lib/libc/locale/collate.h =================================================================== --- lib/libc/locale/collate.h +++ lib/libc/locale/collate.h @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -40,42 +41,98 @@ #include #include "xlocale_private.h" -#define STR_LEN 10 -#define TABLE_SIZE 100 -#define COLLATE_VERSION "1.0\n" -#define COLLATE_VERSION1_2 "1.2\n" +/* + * Work around buildworld bootstrapping from older systems whose limits.h + * sets COLL_WEIGHTS_MAX to 0.
+ */ +#if COLL_WEIGHTS_MAX == 0 +#undef COLL_WEIGHTS_MAX +#define COLL_WEIGHTS_MAX 10 +#endif -struct __collate_st_char_pri { - int prim, sec; -}; -struct __collate_st_chain_pri { - u_char str[STR_LEN]; - int prim, sec; -}; +#define COLLATE_STR_LEN 24 /* should be 64-bit multiple */ +#define COLLATE_VERSION "BSD 1.0\n" -#define __collate_substitute_table (*__collate_substitute_table_ptr) -#define __collate_char_pri_table (*__collate_char_pri_table_ptr) +#define COLLATE_MAX_PRIORITY (0x7fffffff) /* max signed value */ +#define COLLATE_SUBST_PRIORITY (0x40000000) /* bit indicates subst table */ + +#define DIRECTIVE_UNDEF 0x00 +#define DIRECTIVE_FORWARD 0x01 +#define DIRECTIVE_BACKWARD 0x02 +#define DIRECTIVE_POSITION 0x04 +#define DIRECTIVE_UNDEFINED 0x08 /* special last weight for UNDEFINED */ + +#define DIRECTIVE_DIRECTION_MASK (DIRECTIVE_FORWARD | DIRECTIVE_BACKWARD) + +/* + * The collate file format is as follows: + * + * char version[COLLATE_STR_LEN]; // must be COLLATE_VERSION + * collate_info_t info; // see below, includes padding + * collate_char_t char_data[256]; // 8 bit char values + * collate_subst_t subst[*]; // 0 or more substitutions + * collate_chain_t chains[*]; // 0 or more chains + * collate_large_t large[*]; // extended char priorities + * + * Note that all structures must be 32-bit aligned, as each structure + * contains 32-bit member fields. The entire file is mmap'd, so it's + * critical that alignment be observed. It is not generally safe to + * use any 64-bit values in the structures.
+ */ + +typedef struct collate_info { + uint8_t directive_count; + uint8_t directive[COLL_WEIGHTS_MAX]; + int32_t pri_count[COLL_WEIGHTS_MAX]; + int32_t flags; + int32_t chain_count; + int32_t large_count; + int32_t subst_count[COLL_WEIGHTS_MAX]; + int32_t undef_pri[COLL_WEIGHTS_MAX]; +} collate_info_t; + +typedef struct collate_char { + int32_t pri[COLL_WEIGHTS_MAX]; +} collate_char_t; + +typedef struct collate_chain { + wchar_t str[COLLATE_STR_LEN]; + int32_t pri[COLL_WEIGHTS_MAX]; +} collate_chain_t; + +typedef struct collate_large { + int32_t val; + collate_char_t pri; +} collate_large_t; + +typedef struct collate_subst { + int32_t key; + int32_t pri[COLLATE_STR_LEN]; +} collate_subst_t; struct xlocale_collate { struct xlocale_component header; int __collate_load_error; - int __collate_substitute_nontrivial; + char * map; + size_t maplen; - u_char (*__collate_substitute_table_ptr)[UCHAR_MAX + 1][STR_LEN]; - struct __collate_st_char_pri (*__collate_char_pri_table_ptr)[UCHAR_MAX + 1]; - struct __collate_st_chain_pri *__collate_chain_pri_table; + collate_info_t *info; + collate_char_t *char_pri_table; + collate_large_t *large_pri_table; + collate_chain_t *chain_pri_table; + collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; }; - __BEGIN_DECLS -u_char *__collate_strdup(u_char *); -u_char *__collate_substitute(struct xlocale_collate *, const u_char *); int __collate_load_tables(const char *); -void __collate_lookup(struct xlocale_collate *, const u_char *, int *, int *, int *); -int __collate_range_cmp(struct xlocale_collate *, int, int); -#ifdef COLLATE_DEBUG -void __collate_print_tables(void); -#endif +int __collate_equiv_value(locale_t, const wchar_t *, size_t); +void _collate_lookup(struct xlocale_collate *,const wchar_t *, int *, int *, + int, const int **); +int __collate_range_cmp(struct xlocale_collate *, wchar_t, wchar_t); +size_t _collate_wxfrm(struct xlocale_collate *, const wchar_t *, wchar_t *, + size_t); +size_t _collate_sxfrm(struct xlocale_collate *, 
const wchar_t *, char *, + size_t); __END_DECLS #endif /* !_COLLATE_H_ */ Index: lib/libc/locale/collate.c =================================================================== --- lib/libc/locale/collate.c +++ lib/libc/locale/collate.c @@ -1,4 +1,6 @@ /*- + * Copyright 2014 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -28,50 +30,39 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * Adapted to xlocale by John Marino */ #include __FBSDID("$FreeBSD$"); #include "namespace.h" -#include #include #include #include +#include #include #include -#include +#include +#include +#include +#include #include "un-namespace.h" #include "collate.h" #include "setlocale.h" #include "ldpart.h" -#include "libc_private.h" - -/* - * To avoid modifying the original (single-threaded) code too much, we'll just - * define the old globals as fields inside the table. - * - * We also modify the collation table test functions to search the thread-local - * table first and the global table second.
- */ -#define __collate_substitute_nontrivial (table->__collate_substitute_nontrivial) -#define __collate_substitute_table_ptr (table->__collate_substitute_table_ptr) -#define __collate_char_pri_table_ptr (table->__collate_char_pri_table_ptr) -#define __collate_chain_pri_table (table->__collate_chain_pri_table) -int __collate_load_error; - - struct xlocale_collate __xlocale_global_collate = { - {{0}, "C"}, 1, 0 + {{0}, "C"}, 1, 0, 0, 0 }; - struct xlocale_collate __xlocale_C_collate = { - {{0}, "C"}, 1, 0 +struct xlocale_collate __xlocale_C_collate = { + {{0}, "C"}, 1, 0, 0, 0 }; -void __collate_err(int ex, const char *f) __dead2; +#include "libc_private.h" int __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); @@ -80,14 +71,14 @@ destruct_collate(void *t) { struct xlocale_collate *table = t; - if (__collate_chain_pri_table) { - free(__collate_chain_pri_table); + if (table->map && (table->maplen > 0)) { + (void) munmap(table->map, table->maplen); } free(t); } void * -__collate_load(const char *encoding, locale_t unused) +__collate_load(const char *encoding, __unused locale_t unused) { if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { return &__xlocale_C_collate; @@ -110,18 +101,19 @@ __collate_load_tables(const char *encoding) { int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate); - __collate_load_error = __xlocale_global_collate.__collate_load_error; return ret; } int __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) { - FILE *fp; - int i, saverr, chains; - uint32_t u32; - char strbuf[STR_LEN], buf[PATH_MAX]; - void *TMP_substitute_table, *TMP_char_pri_table, *TMP_chain_pri_table; + int i, chains, z; + char buf[PATH_MAX]; + char *TMP; + char *map; + collate_info_t *info; + struct stat sbuf; + int fd; /* 'encoding' must be already checked.
*/ if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { @@ -129,217 +121,588 @@ return (_LDP_CACHE); } - /* 'PathLocale' must be already set & checked. */ - /* Range checking not needed, encoding has fixed size */ - (void)strcpy(buf, _PathLocale); - (void)strcat(buf, "/"); - (void)strcat(buf, encoding); - (void)strcat(buf, "/LC_COLLATE"); - if ((fp = fopen(buf, "re")) == NULL) - return (_LDP_ERROR); + (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE", + _PathLocale, encoding); - if (fread(strbuf, sizeof(strbuf), 1, fp) != 1) { - saverr = errno; - (void)fclose(fp); - errno = saverr; + if ((fd = _open(buf, O_RDONLY | O_CLOEXEC)) < 0) + return (_LDP_ERROR); + if (_fstat(fd, &sbuf) < 0) { + (void) _close(fd); return (_LDP_ERROR); } - chains = -1; - if (strcmp(strbuf, COLLATE_VERSION) == 0) - chains = 0; - else if (strcmp(strbuf, COLLATE_VERSION1_2) == 0) - chains = 1; - if (chains < 0) { - (void)fclose(fp); - errno = EFTYPE; + if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (*info))) { + (void) _close(fd); + errno = EINVAL; + return (_LDP_ERROR); + } + map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + (void) _close(fd); + if ((TMP = map) == MAP_FAILED) { return (_LDP_ERROR); } - if (chains) { - if (fread(&u32, sizeof(u32), 1, fp) != 1) { - saverr = errno; - (void)fclose(fp); - errno = saverr; - return (_LDP_ERROR); - } - if ((chains = (int)ntohl(u32)) < 1) { - (void)fclose(fp); - errno = EFTYPE; - return (_LDP_ERROR); - } - } else - chains = TABLE_SIZE; - if ((TMP_substitute_table = - malloc(sizeof(__collate_substitute_table))) == NULL) { - saverr = errno; - (void)fclose(fp); - errno = saverr; + if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; return (_LDP_ERROR); } - if ((TMP_char_pri_table = - malloc(sizeof(__collate_char_pri_table))) == NULL) { - saverr = errno; - free(TMP_substitute_table); - (void)fclose(fp); - errno = saverr; + TMP += COLLATE_STR_LEN; + + info = (void *)TMP; 
+ TMP +=
sizeof (*info); + + if ((info->directive_count < 1) || + (info->directive_count >= COLL_WEIGHTS_MAX) || + ((chains = info->chain_count) < 0)) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; return (_LDP_ERROR); } - if ((TMP_chain_pri_table = - malloc(sizeof(*__collate_chain_pri_table) * chains)) == NULL) { - saverr = errno; - free(TMP_substitute_table); - free(TMP_char_pri_table); - (void)fclose(fp); - errno = saverr; + + i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + + (sizeof (collate_chain_t) * chains) + + (sizeof (collate_large_t) * info->large_count); + for (z = 0; z < (info->directive_count); z++) { + i += sizeof (collate_subst_t) * info->subst_count[z]; + } + if (i != (sbuf.st_size - (TMP - map))) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; return (_LDP_ERROR); } -#define FREAD(a, b, c, d) \ -{ \ - if (fread(a, b, c, d) != c) { \ - saverr = errno; \ - free(TMP_substitute_table); \ - free(TMP_char_pri_table); \ - free(TMP_chain_pri_table); \ - (void)fclose(d); \ - errno = saverr; \ - return (_LDP_ERROR); \ - } \ -} - - FREAD(TMP_substitute_table, sizeof(__collate_substitute_table), 1, fp); - FREAD(TMP_char_pri_table, sizeof(__collate_char_pri_table), 1, fp); - FREAD(TMP_chain_pri_table, - sizeof(*__collate_chain_pri_table), chains, fp); - (void)fclose(fp); - - if (__collate_substitute_table_ptr != NULL) - free(__collate_substitute_table_ptr); - __collate_substitute_table_ptr = TMP_substitute_table; - if (__collate_char_pri_table_ptr != NULL) - free(__collate_char_pri_table_ptr); - __collate_char_pri_table_ptr = TMP_char_pri_table; - for (i = 0; i < UCHAR_MAX + 1; i++) { - __collate_char_pri_table[i].prim = - ntohl(__collate_char_pri_table[i].prim); - __collate_char_pri_table[i].sec = - ntohl(__collate_char_pri_table[i].sec); - } - if (__collate_chain_pri_table != NULL) - free(__collate_chain_pri_table); - __collate_chain_pri_table = TMP_chain_pri_table; - for (i = 0; i < chains; i++) { - __collate_chain_pri_table[i].prim = -
ntohl(__collate_chain_pri_table[i].prim); - __collate_chain_pri_table[i].sec = - ntohl(__collate_chain_pri_table[i].sec); - } - __collate_substitute_nontrivial = 0; - for (i = 0; i < UCHAR_MAX + 1; i++) { - if (__collate_substitute_table[i][0] != i || - __collate_substitute_table[i][1] != 0) { - __collate_substitute_nontrivial = 1; - break; + /* Replace any prior mapping so destruct_collate() can unmap it. */ + if (table->map && (table->maplen > 0)) { + (void) munmap(table->map, table->maplen); + } + table->map = map; + table->maplen = sbuf.st_size; + table->char_pri_table = (void *)TMP; + TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); + + for (z = 0; z < info->directive_count; z++) { + if (info->subst_count[z] > 0) { + table->subst_table[z] = (void *)TMP; + TMP += info->subst_count[z] * sizeof (collate_subst_t); + } else { + table->subst_table[z] = NULL; } } + + if (chains > 0) { + table->chain_pri_table = (void *)TMP; + TMP += chains * sizeof (collate_chain_t); + } else + table->chain_pri_table = NULL; + if (info->large_count > 0) + table->large_pri_table = (void *)TMP; + else + table->large_pri_table = NULL; + + table->info = info; table->__collate_load_error = 0; return (_LDP_LOADED); } -u_char * -__collate_substitute(struct xlocale_collate *table, const u_char *s) +/* + * Note: for performance reasons, we have expanded bsearch here. This avoids + * function call overhead with each comparison.
+ */ + +static int32_t * +substsearch(struct xlocale_collate *table, const wchar_t key, int pass) +{ + collate_subst_t *p; + int n, idx; + + if (pass < 0 || pass >= table->info->directive_count) + return (NULL); + + /* Reject keys lacking the subst bit or indexing past the table. */ + n = table->info->subst_count[pass]; + idx = key & ~COLLATE_SUBST_PRIORITY; + if (!(key & COLLATE_SUBST_PRIORITY) || idx < 0 || idx >= n) + return (NULL); + + p = table->subst_table[pass] + idx; + return (p->pri); +} + +static collate_chain_t * +chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) { - int dest_len, len, nlen; - int delta = strlen(s); - u_char *dest_str = NULL; - - if (s == NULL || *s == '\0') - return (__collate_strdup("")); - delta += delta / 8; - dest_str = malloc(dest_len = delta); - if (dest_str == NULL) - __collate_err(EX_OSERR, __func__); - len = 0; - while (*s) { - nlen = len + strlen(__collate_substitute_table[*s]); - if (dest_len <= nlen) { - dest_str = reallocf(dest_str, dest_len = nlen + delta); - if (dest_str == NULL) - __collate_err(EX_OSERR, __func__); + int low; + int high; + int next, compar, l; + collate_chain_t *p; + collate_chain_t *tab; + + if (table->info->chain_count == 0) + return (NULL); + + low = 0; + high = table->info->chain_count - 1; + tab = table->chain_pri_table; + + while (low <= high) { + next = (low + high) / 2; + p = tab + next; + compar = *key - *p->str; + if (compar == 0) { + l = wcsnlen(p->str, COLLATE_STR_LEN); + compar = wcsncmp(key, p->str, l); + if (compar == 0) { + *len = l; + return (p); + } } - (void)strcpy(dest_str + len, __collate_substitute_table[*s++]); - len = nlen; + if (compar > 0) + low = next + 1; + else + high = next - 1; + } + return (NULL); +} + +static collate_large_t * +largesearch(struct xlocale_collate *table, const wchar_t key) +{ + int low = 0; + int high = table->info->large_count - 1; + int next, compar; + collate_large_t *p; + collate_large_t *tab = table->large_pri_table; + + if 
(table->info->large_count == 0) + return (NULL); + + while (low <= high) { + next = (low + high)
/ 2; + p = tab + next; + compar = key - p->val; + if (compar == 0) + return (p); + if (compar > 0) + low = next + 1; + else + high = next - 1; } - return (dest_str); + return (NULL); } void -__collate_lookup(struct xlocale_collate *table, const u_char *t, int *len, int *prim, int *sec) +_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, + int *pri, int which, const int **state) { - struct __collate_st_chain_pri *p2; + collate_chain_t *p2; + collate_large_t *match; + int p, l; + const int *sptr; + + /* + * If this is the "last" pass for the UNDEFINED, then + * we just return the priority itself. + */ + if (which >= table->info->directive_count) { + *pri = *t; + *len = 1; + *state = NULL; + return; + } + + /* + * If we have remaining substitution data from a previous + * call, consume it first. + */ + if ((sptr = *state) != NULL) { + *pri = *sptr; + sptr++; + *state = *sptr ? sptr : NULL; + *len = 0; + return; + } + /* No active substitutions */ *len = 1; - *prim = *sec = 0; - for (p2 = __collate_chain_pri_table; p2->str[0] != '\0'; p2++) { - if (*t == p2->str[0] && - strncmp(t, p2->str, strlen(p2->str)) == 0) { - *len = strlen(p2->str); - *prim = p2->prim; - *sec = p2->sec; - return; + + /* + * Check for composites such as diphthongs that collate as a + * single element (aka chains or collating-elements). + */ + if (((p2 = chainsearch(table, t, &l)) != NULL) && + ((p = p2->pri[which]) >= 0)) { + + *len = l; + *pri = p; + + } else if (*t <= UCHAR_MAX) { + + /* + * Character is a small (8-bit) character. + * We just look these up directly for speed. + */ + *pri = table->char_pri_table[*t].pri[which]; + + } else if ((table->info->large_count > 0) && + ((match = largesearch(table, *t)) != NULL)) { + + /* + * Character was found in the extended table. + */ + *pri = match->pri.pri[which]; + + } else { + /* + * Character lacks a specific definition.
+ */ + if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { + /* Mask off sign bit to prevent ordering confusion. */ + *pri = (*t & COLLATE_MAX_PRIORITY); + } else { + *pri = table->info->undef_pri[which]; } + /* No substitutions for undefined characters! */ + return; } - *prim = __collate_char_pri_table[*t].prim; - *sec = __collate_char_pri_table[*t].sec; + + /* + * Try substituting (expanding) the character. We are + * currently doing this *after* the chain compression. I + * think it should not matter, but this way might be slightly + * faster. + * + * We do this after the priority search, as this will help us + * to identify a single key value. In order for this to work, + * it's important that the priority assigned to a given element + * to be substituted be unique for that level. The localedef + * code ensures this for us. + */ + if ((sptr = substsearch(table, *pri, which)) != NULL) { + if ((*pri = *sptr) != 0) { + sptr++; + *state = *sptr ? sptr : NULL; + } + } + } -u_char * -__collate_strdup(u_char *s) +/* + * This is the meaty part of wcsxfrm & strxfrm. Note that it does + * NOT NULL terminate. That is left to the caller.
+ */ +size_t +_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, + size_t room) { - u_char *t = strdup(s); + int pri; + int len; + const wchar_t *t; + wchar_t *tr = NULL; + int direc; + int pass; + const int32_t *state; + size_t want = 0; + size_t need = 0; + + for (pass = 0; pass <= table->info->directive_count; pass++) { + + state = NULL; + + if (pass != 0) { + /* insert level separator from the previous pass */ + if (room) { + *xf++ = 1; + room--; + } + want++; + } + + /* special pass for undefined */ + if (pass == table->info->directive_count) { + direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; + } else { + direc = table->info->directive[pass]; + } + + t = src; + + if (direc & DIRECTIVE_BACKWARD) { + wchar_t *bp, *fp, c; + if (tr) + free(tr); + if ((tr = wcsdup(t)) == NULL) { + errno = ENOMEM; + goto fail; + } + bp = tr; + fp = tr + wcslen(tr) - 1; + while (bp < fp) { + c = *bp; + *bp++ = *fp; + *fp-- = c; + } + t = (const wchar_t *)tr; + } - if (t == NULL) - __collate_err(EX_OSERR, __func__); - return (t); + if (direc & DIRECTIVE_POSITION) { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + pri = COLLATE_MAX_PRIORITY; + } + if (room) { + *xf++ = pri; + room--; + } + want++; + need = want; + } + } else { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + continue; + } + if (room) { + *xf++ = pri; + room--; + } + want++; + need = want; + } + } + } + if (tr) + free(tr); + return (need); + +fail: + if (tr) + free(tr); + return ((size_t)(-1)); } -void -__collate_err(int ex, const char *f) +/* + * In the non-POSIX case, we transform each character into a string of + * characters representing the character's priority. Since char is usually + * signed, we are limited by 7 bits per byte. 
To avoid zero, we need to add + * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 + * bits per byte. + * + * It turns out that we sometimes have real priorities that are + * 31 bits wide. (But: be careful using priorities where the high + * order bit is set -- i.e. the priority is negative. The sort order + * may be surprising!) + * + * TODO: This would be a good area to optimize somewhat. It turns out + * that real priorities *except for the last UNDEFINED pass* are generally + * very small. We need the localedef code to precalculate the max + * priority for us, and ideally also give us a mask, and then we could + * severely limit what we expand to. + */ +#define XFRM_BYTES 6 +#define XFRM_OFFSET ('0') /* make all printable characters */ +#define XFRM_SHIFT 6 +#define XFRM_MASK ((1 << XFRM_SHIFT) - 1) +#define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ + +static int +xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) { - const char *s; - int serrno = errno; + /* we use unsigned to ensure zero fill on right shift */ + uint32_t val = (uint32_t)table->info->pri_count[pass]; + int nc = 0; + + while (val) { + *p = (pri & XFRM_MASK) + XFRM_OFFSET; + pri >>= XFRM_SHIFT; + val >>= XFRM_SHIFT; + p++; + nc++; + } + return (nc); +} - s = _getprogname(); - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, ": ", 2); - s = f; - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, ": ", 2); - s = strerror(serrno); - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, "\n", 1); - exit(ex); +size_t +_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, + size_t room) +{ + int pri; + int len; + const wchar_t *t; + wchar_t *tr = NULL; + int direc; + int pass; + const int32_t *state; + size_t want = 0; + size_t need = 0; + int b; + uint8_t buf[XFRM_BYTES]; + + for (pass = 0; pass <= table->info->directive_count; pass++) { + + state = NULL; + + if (pass != 0) { + /* insert
level separator from the previous pass */ + if (room) { + *xf++ = XFRM_SEP; + room--; + } + want++; + } + + /* special pass for undefined */ + if (pass == table->info->directive_count) { + direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; + } else { + direc = table->info->directive[pass]; + } + + t = src; + + if (direc & DIRECTIVE_BACKWARD) { + wchar_t *bp, *fp, c; + if (tr) + free(tr); + if ((tr = wcsdup(t)) == NULL) { + errno = ENOMEM; + goto fail; + } + bp = tr; + fp = tr + wcslen(tr) - 1; + while (bp < fp) { + c = *bp; + *bp++ = *fp; + *fp-- = c; + } + t = (const wchar_t *)tr; + } + + if (direc & DIRECTIVE_POSITION) { + while (*t || state) { + + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + pri = COLLATE_MAX_PRIORITY; + } + + b = xfrm(table, buf, pri, pass); + want += b; + if (room) { + while (b) { + b--; + if (room) { + *xf++ = buf[b]; + room--; + } + } + } + need = want; + } + } else { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + continue; + } + + b = xfrm(table, buf, pri, pass); + want += b; + if (room) { + + while (b) { + b--; + if (room) { + *xf++ = buf[b]; + room--; + } + } + } + need = want; + } + } + } + if (tr) + free(tr); + return (need); + +fail: + if (tr) + free(tr); + return ((size_t)(-1)); } -#ifdef COLLATE_DEBUG -void -__collate_print_tables() +/* + * __collate_equiv_value returns the primary collation value for the given + * collating symbol specified by str and len. Zero or negative is returned + * if the collating symbol was not found. This function is used by bracket + * code in the TRE regex library. 
+ */ +int +__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) { - int i; - struct __collate_st_chain_pri *p2; + int32_t e; - printf("Substitute table:\n"); - for (i = 0; i < UCHAR_MAX + 1; i++) - if (i != *__collate_substitute_table[i]) - printf("\t'%c' --> \"%s\"\n", i, - __collate_substitute_table[i]); - printf("Chain priority table:\n"); - for (p2 = __collate_chain_pri_table; p2->str[0] != '\0'; p2++) - printf("\t\"%s\" : %d %d\n", p2->str, p2->prim, p2->sec); - printf("Char priority table:\n"); - for (i = 0; i < UCHAR_MAX + 1; i++) - printf("\t'%c' : %d %d\n", i, __collate_char_pri_table[i].prim, - __collate_char_pri_table[i].sec); + if (len < 1 || len >= COLLATE_STR_LEN) + return (-1); + + FIX_LOCALE(locale); + struct xlocale_collate *table = + (struct xlocale_collate*)locale->components[XLC_COLLATE]; + + if (table->__collate_load_error) + return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); + + if (len == 1) { + e = -1; + if (*str <= UCHAR_MAX) + e = table->char_pri_table[*str].pri[0]; + else if (table->info->large_count > 0) { + collate_large_t *match_large; + match_large = largesearch(table, *str); + if (match_large) + e = match_large->pri.pri[0]; + } + if (e == 0) + return (1); + return (e > 0 ? e : 0); + } + if (table->info->chain_count > 0) { + wchar_t name[COLLATE_STR_LEN]; + collate_chain_t *match_chain; + int clen; + + wcsncpy (name, str, len); + name[len] = 0; + match_chain = chainsearch(table, name, &clen); + if (match_chain) { + e = match_chain->pri[0]; + if (e == 0) + return (1); + return (e < 0 ? 
-e : e); + } + } + return (0); } -#endif Index: lib/libc/locale/collcmp.c =================================================================== --- lib/libc/locale/collcmp.c +++ lib/libc/locale/collcmp.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include +#include #include #include "collate.h" @@ -40,13 +41,15 @@ * Compare two characters using collate */ -int __collate_range_cmp(struct xlocale_collate *table, int c1, int c2) +int __collate_range_cmp(struct xlocale_collate *table, wchar_t c1, wchar_t c2) { - static char s1[2], s2[2]; + wchar_t s1[2], s2[2]; s1[0] = c1; + s1[1] = 0; s2[0] = c2; + s2[1] = 0; struct _xlocale l = {{0}}; l.components[XLC_COLLATE] = (struct xlocale_component *)table; - return (strcoll_l(s1, s2, &l)); + return (wcscoll_l(s1, s2, &l)); } Index: lib/libc/locale/euc.c =================================================================== --- lib/libc/locale/euc.c +++ lib/libc/locale/euc.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. @@ -19,11 +21,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* @@ -56,17 +54,56 @@ extern int __mb_sb_limit; -static size_t _EUC_mbrtowc(wchar_t * __restrict, const char * __restrict, +static size_t _EUC_mbrtowc_impl(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); +static size_t _EUC_wcrtomb_impl(char * __restrict, wchar_t, + mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t); + +static size_t _EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); -static int _EUC_mbsinit(const mbstate_t *); -static size_t _EUC_wcrtomb(char * __restrict, wchar_t, +static size_t _EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +static size_t _EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +static size_t _EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); + +static size_t _EUC_CN_wcrtomb(char * __restrict, wchar_t, + mbstate_t * __restrict); +static size_t _EUC_JP_wcrtomb(char * __restrict, wchar_t, + mbstate_t * __restrict); +static size_t _EUC_KR_wcrtomb(char * __restrict, wchar_t, + mbstate_t * __restrict); +static size_t _EUC_TW_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); -typedef struct { - int count[4]; - wchar_t bits[4]; - wchar_t mask; -} _EucInfo; +static size_t _EUC_CN_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _EUC_JP_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _EUC_KR_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _EUC_TW_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); + +static size_t _EUC_CN_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t 
* __restrict); +static size_t _EUC_JP_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _EUC_KR_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _EUC_TW_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); + +static int _EUC_mbsinit(const mbstate_t *); typedef struct { wchar_t ch; @@ -74,94 +111,218 @@ int want; } _EucState; +static int +_EUC_mbsinit(const mbstate_t *ps) +{ + + return (ps == NULL || ((const _EucState *)ps)->want == 0); +} + +/* + * EUC-CN uses CS0, CS1 and CS2 (4 bytes). + */ int -_EUC_init(struct xlocale_ctype *l, _RuneLocale *rl) +_EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl) { - _EucInfo *ei; - int x, new__mb_cur_max; - char *v, *e; + l->__mbrtowc = _EUC_CN_mbrtowc; + l->__wcrtomb = _EUC_CN_wcrtomb; + l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs; + l->__wcsnrtombs = _EUC_CN_wcsnrtombs; + l->__mbsinit = _EUC_mbsinit; - if (rl->__variable == NULL) - return (EFTYPE); + l->runes = rl; + l->__mb_cur_max = 4; + l->__mb_sb_limit = 256; + return (0); +} - v = (char *)rl->__variable; +static size_t +_EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps) +{ + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); +} - while (*v == ' ' || *v == '\t') - ++v; +static size_t +_EUC_CN_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc)); +} - if ((ei = malloc(sizeof(_EucInfo))) == NULL) - return (errno == 0 ? 
ENOMEM : errno); +static size_t +_EUC_CN_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); +} - new__mb_cur_max = 0; - for (x = 0; x < 4; ++x) { - ei->count[x] = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EFTYPE); - } - if (new__mb_cur_max < ei->count[x]) - new__mb_cur_max = ei->count[x]; - while (*v == ' ' || *v == '\t') - ++v; - ei->bits[x] = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EFTYPE); - } - while (*v == ' ' || *v == '\t') - ++v; - } - ei->mask = (int)strtol(v, &e, 0); - if (v == e || !(v = e)) { - free(ei); - return (EFTYPE); - } - rl->__variable = ei; - rl->__variable_len = sizeof(_EucInfo); - l->runes = rl; - l->__mb_cur_max = new__mb_cur_max; - l->__mbrtowc = _EUC_mbrtowc; - l->__wcrtomb = _EUC_wcrtomb; +static size_t +_EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb)); +} + +/* + * EUC-KR uses only CS0 and CS1. 
+ */ +int +_EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl) +{ + l->__mbrtowc = _EUC_KR_mbrtowc; + l->__wcrtomb = _EUC_KR_wcrtomb; + l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs; + l->__wcsnrtombs = _EUC_KR_wcsnrtombs; l->__mbsinit = _EUC_mbsinit; - l->__mb_sb_limit = 256; + + l->runes = rl; + l->__mb_cur_max = 2; + l->__mb_sb_limit = 128; return (0); } -static int -_EUC_mbsinit(const mbstate_t *ps) +static size_t +_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps) { + return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0)); +} - return (ps == NULL || ((const _EucState *)ps)->want == 0); +static size_t +_EUC_KR_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc)); } -#define CEI ((_EucInfo *)(_CurrentRuneLocale->__variable)) +static size_t +_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); +} -#define _SS2 0x008e -#define _SS3 0x008f +static size_t +_EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb)); +} -#define GR_BITS 0x80808080 /* XXX: to be fixed */ +/* + * EUC-JP uses CS0, CS1, CS2, and CS3. 
+ */ +int +_EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl) +{ + l->__mbrtowc = _EUC_JP_mbrtowc; + l->__wcrtomb = _EUC_JP_wcrtomb; + l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs; + l->__wcsnrtombs = _EUC_JP_wcsnrtombs; + l->__mbsinit = _EUC_mbsinit; -static __inline int -_euc_set(u_int c) + l->runes = rl; + l->__mb_cur_max = 3; + l->__mb_sb_limit = 196; + return (0); +} + +static size_t +_EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps) { + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3)); +} - c &= 0xff; - return ((c & 0x80) ? c == _SS3 ? 3 : c == _SS2 ? 2 : 1 : 0); +static size_t +_EUC_JP_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc)); } static size_t -_EUC_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, +_EUC_JP_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) { + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); +} + +static size_t +_EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb)); +} + +/* + * EUC-TW uses CS0, CS1, and CS2. 
+ */ +int +_EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl) +{ + l->__mbrtowc = _EUC_TW_mbrtowc; + l->__wcrtomb = _EUC_TW_wcrtomb; + l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs; + l->__wcsnrtombs = _EUC_TW_wcsnrtombs; + l->__mbsinit = _EUC_mbsinit; + + l->runes = rl; + l->__mb_cur_max = 4; + l->__mb_sb_limit = 256; + return (0); +} + +static size_t +_EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps) +{ + return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0)); +} + +static size_t +_EUC_TW_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc)); +} + +static size_t +_EUC_TW_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps) +{ + return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); +} + +static size_t +_EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb)); +} + +/* + * Common EUC code. 
+ */ + +static size_t +_EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps, + uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) +{ _EucState *es; - int i, set, want; + int i, want; wchar_t wc; - const char *os; + unsigned char ch; es = (_EucState *)ps; - if (es->want < 0 || es->want > MB_CUR_MAX || es->set < 0 || - es->set > 3) { + if (es->want < 0 || es->want > MB_CUR_MAX) { errno = EINVAL; return ((size_t)-1); } @@ -176,58 +337,59 @@ /* Incomplete multibyte sequence */ return ((size_t)-2); - os = s; - if (es->want == 0) { - want = CEI->count[set = _euc_set(*s)]; - if (set == 2 || set == 3) { - --want; - if (--n == 0) { - /* Incomplete multibyte sequence */ - es->set = set; - es->want = want; - es->ch = 0; - return ((size_t)-2); - } - ++s; - if (*s == '\0') { - errno = EILSEQ; - return ((size_t)-1); - } + /* Fast path for plain ASCII (CS0) */ + if (((ch = (unsigned char)*s) & 0x80) == 0) { + if (pwc != NULL) + *pwc = ch; + return (ch != '\0' ? 1 : 0); } - wc = (unsigned char)*s++; + + if (ch >= 0xa1) { + /* CS1 */ + want = 2; + } else if (ch == cs2) { + want = cs2width; + } else if (ch == cs3) { + want = cs3width; + } else { + errno = EILSEQ; + return ((size_t)-1); + } + + wc = 0; /* new character: start from an empty accumulator */ + es->want = want; + es->ch = 0; } else { - set = es->set; want = es->want; wc = es->ch; } - for (i = (es->want == 0) ? 1 : 0; i < MIN(want, n); i++) { - if (*s == '\0') { - errno = EILSEQ; - return ((size_t)-1); - } - wc = (wc << 8) | (unsigned char)*s++; + + for (i = 0; i < MIN(want, n); i++) { + wc <<= 8; + wc |= (unsigned char)*s; /* bytes >= 0x80 must not sign-extend */ + s++; } if (i < want) { /* Incomplete multibyte sequence */ - es->set = set; es->want = want - i; es->ch = wc; return ((size_t)-2); } - wc = (wc & ~CEI->mask) | CEI->bits[set]; if (pwc != NULL) *pwc = wc; es->want = 0; - return (wc == L'\0' ?
0 : want); } static size_t -_EUC_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) +_EUC_wcrtomb_impl(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps, + uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) { _EucState *es; - wchar_t m, nm; int i, len; + wchar_t nm; es = (_EucState *)ps; @@ -240,34 +402,52 @@ /* Reset to initial shift state (no-op) */ return (1); - m = wc & CEI->mask; - nm = wc & ~m; + if ((wc & ~0x7f) == 0) { + /* Fast path for plain ASCII (CS0) */ + *s = (char)wc; + return (1); + } - if (m == CEI->bits[1]) { -CodeSet1: - /* Codeset 1: The first byte must have 0x80 in it. */ - i = len = CEI->count[1]; - while (i-- > 0) - *s++ = (nm >> (i << 3)) | 0x80; + /* Determine the "length" */ + if ((unsigned)wc > 0xffffff) { + len = 4; + } else if ((unsigned)wc > 0xffff) { + len = 3; + } else if ((unsigned)wc > 0xff) { + len = 2; } else { - if (m == CEI->bits[0]) - i = len = CEI->count[0]; - else if (m == CEI->bits[2]) { - i = len = CEI->count[2]; - *s++ = _SS2; - --i; - /* SS2 designates G2 into GR */ - nm |= GR_BITS; - } else if (m == CEI->bits[3]) { - i = len = CEI->count[3]; - *s++ = _SS3; - --i; - /* SS3 designates G3 into GR */ - nm |= GR_BITS; - } else - goto CodeSet1; /* Bletch */ - while (i-- > 0) - *s++ = (nm >> (i << 3)) & 0xff; + len = 1; + } + + if (len > MB_CUR_MAX) { + errno = EILSEQ; + return ((size_t)-1); + } + + /* This first check excludes CS1, which is implicitly valid. 
*/ + if ((wc < 0xa100) || (wc > 0xffff)) { + /* Check for valid CS2 or CS3 */ + nm = (wc >> ((len - 1) * 8)); + if (nm == cs2) { + if (len != cs2width) { + errno = EILSEQ; + return ((size_t)-1); + } + } else if (nm == cs3) { + if (len != cs3width) { + errno = EILSEQ; + return ((size_t)-1); + } + } else { + errno = EILSEQ; + return ((size_t)-1); + } + } + + /* Stash the bytes, least significant last */ + for (i = len - 1; i >= 0; i--) { + s[i] = (wc & 0xff); + wc >>= 8; } return (len); } Index: lib/libc/locale/gb18030.c =================================================================== --- lib/libc/locale/gb18030.c +++ lib/libc/locale/gb18030.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins * All rights reserved. * @@ -28,6 +30,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ + /* * PRC National Standard GB 18030-2000 encoding of Chinese text. 
* @@ -49,6 +52,13 @@ static int _GB18030_mbsinit(const mbstate_t *); static size_t _GB18030_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); +static size_t _GB18030_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _GB18030_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); + typedef struct { int count; @@ -62,6 +72,8 @@ l->__mbrtowc = _GB18030_mbrtowc; l->__wcrtomb = _GB18030_wcrtomb; l->__mbsinit = _GB18030_mbsinit; + l->__mbsnrtowcs = _GB18030_mbsnrtowcs; + l->__wcsnrtombs = _GB18030_wcsnrtombs; l->runes = rl; l->__mb_cur_max = 4; l->__mb_sb_limit = 128; @@ -222,3 +234,19 @@ errno = EILSEQ; return ((size_t)-1); } + +static size_t +_GB18030_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, size_t nms, size_t len, + mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc)); +} + +static size_t +_GB18030_wcsnrtombs(char * __restrict dst, + const wchar_t ** __restrict src, size_t nwc, size_t len, + mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb)); +} Index: lib/libc/locale/gb2312.c =================================================================== --- lib/libc/locale/gb2312.c +++ lib/libc/locale/gb2312.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2004 Tim J. Robbins. All rights reserved. * Copyright (c) 2003 David Xu * All rights reserved. 
@@ -45,6 +47,13 @@ static int _GB2312_mbsinit(const mbstate_t *); static size_t _GB2312_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); +static size_t _GB2312_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _GB2312_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); + typedef struct { int count; @@ -59,6 +68,8 @@ l->__mbrtowc = _GB2312_mbrtowc; l->__wcrtomb = _GB2312_wcrtomb; l->__mbsinit = _GB2312_mbsinit; + l->__mbsnrtowcs = _GB2312_mbsnrtowcs; + l->__wcsnrtombs = _GB2312_wcsnrtombs; l->__mb_cur_max = 2; l->__mb_sb_limit = 128; return (0); @@ -71,7 +82,7 @@ return (ps == NULL || ((const _GB2312State *)ps)->count == 0); } -static __inline int +static int _GB2312_check(const char *str, size_t n) { const u_char *s = (const u_char *)str; @@ -90,7 +101,7 @@ } else if (s[0] & 0x80) { /* Invalid multibyte sequence */ return (-1); - } + } return (1); } @@ -158,3 +169,19 @@ *s = wc & 0xff; return (1); } + +static size_t +_GB2312_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, size_t nms, size_t len, + mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB2312_mbrtowc)); +} + +static size_t +_GB2312_wcsnrtombs(char * __restrict dst, + const wchar_t ** __restrict src, size_t nwc, size_t len, + mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB2312_wcrtomb)); +} Index: lib/libc/locale/gbk.c =================================================================== --- lib/libc/locale/gbk.c +++ lib/libc/locale/gbk.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
@@ -54,6 +56,12 @@ static int _GBK_mbsinit(const mbstate_t *); static size_t _GBK_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); +static size_t _GBK_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _GBK_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); typedef struct { wchar_t ch; @@ -66,6 +74,8 @@ l->__mbrtowc = _GBK_mbrtowc; l->__wcrtomb = _GBK_wcrtomb; l->__mbsinit = _GBK_mbsinit; + l->__mbsnrtowcs = _GBK_mbsnrtowcs; + l->__wcsnrtombs = _GBK_wcsnrtombs; l->runes = rl; l->__mb_cur_max = 2; l->__mb_sb_limit = 128; @@ -79,7 +89,7 @@ return (ps == NULL || ((const _GBKState *)ps)->ch == 0); } -static __inline int +static int _gbk_check(u_int c) { @@ -140,7 +150,7 @@ wc = (wc << 8) | (*s++ & 0xff); if (pwc != NULL) *pwc = wc; - return (2); + return (2); } else { if (pwc != NULL) *pwc = wc; @@ -171,3 +181,17 @@ *s = wc & 0xff; return (1); } + +static size_t +_GBK_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, + size_t nms, size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GBK_mbrtowc)); +} + +static size_t +_GBK_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, + size_t nwc, size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GBK_wcrtomb)); +} Index: lib/libc/locale/mblocal.h =================================================================== --- lib/libc/locale/mblocal.h +++ lib/libc/locale/mblocal.h @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2004 Tim J. Robbins. * All rights reserved. * @@ -37,6 +39,8 @@ #include #include "xlocale_private.h" +#define SS2 0x008e +#define SS3 0x008f /* * Conversion function pointers for current encoding. @@ -62,18 +66,24 @@ * Rune initialization function prototypes. 
*/ int _none_init(struct xlocale_ctype *, _RuneLocale *); -int _ascii_init(struct xlocale_ctype *, _RuneLocale *); int _UTF8_init(struct xlocale_ctype *, _RuneLocale *); -int _EUC_init(struct xlocale_ctype *, _RuneLocale *); +int _EUC_CN_init(struct xlocale_ctype *, _RuneLocale *); +int _EUC_JP_init(struct xlocale_ctype *, _RuneLocale *); +int _EUC_KR_init(struct xlocale_ctype *, _RuneLocale *); +int _EUC_TW_init(struct xlocale_ctype *, _RuneLocale *); int _GB18030_init(struct xlocale_ctype *, _RuneLocale *); int _GB2312_init(struct xlocale_ctype *, _RuneLocale *); int _GBK_init(struct xlocale_ctype *, _RuneLocale *); int _BIG5_init(struct xlocale_ctype *, _RuneLocale *); int _MSKanji_init(struct xlocale_ctype *, _RuneLocale *); -extern size_t __mbsnrtowcs_std(wchar_t * __restrict, const char ** __restrict, - size_t, size_t, mbstate_t * __restrict); -extern size_t __wcsnrtombs_std(char * __restrict, const wchar_t ** __restrict, - size_t, size_t, mbstate_t * __restrict); +typedef size_t (*mbrtowc_pfn_t)(wchar_t * __restrict, + const char * __restrict, size_t, mbstate_t * __restrict); +typedef size_t (*wcrtomb_pfn_t)(char * __restrict, wchar_t, + mbstate_t * __restrict); +size_t __mbsnrtowcs_std(wchar_t * __restrict, const char ** __restrict, + size_t, size_t, mbstate_t * __restrict, mbrtowc_pfn_t); +size_t __wcsnrtombs_std(char * __restrict, const wchar_t ** __restrict, + size_t, size_t, mbstate_t * __restrict, wcrtomb_pfn_t); #endif /* _MBLOCAL_H_ */ Index: lib/libc/locale/mbsnrtowcs.c =================================================================== --- lib/libc/locale/mbsnrtowcs.c +++ lib/libc/locale/mbsnrtowcs.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. 
* * Copyright (c) 2011 The FreeBSD Foundation @@ -56,20 +58,20 @@ size_t __mbsnrtowcs_std(wchar_t * __restrict dst, const char ** __restrict src, - size_t nms, size_t len, mbstate_t * __restrict ps) + size_t nms, size_t len, mbstate_t * __restrict ps, + mbrtowc_pfn_t pmbrtowc) { const char *s; size_t nchr; wchar_t wc; size_t nb; - struct xlocale_ctype *ct = XLOCALE_CTYPE(__get_locale()); s = *src; nchr = 0; if (dst == NULL) { for (;;) { - if ((nb = ct->__mbrtowc(&wc, s, nms, ps)) == (size_t)-1) + if ((nb = pmbrtowc(&wc, s, nms, ps)) == (size_t)-1) /* Invalid sequence - mbrtowc() sets errno. */ return ((size_t)-1); else if (nb == 0 || nb == (size_t)-2) @@ -82,7 +84,7 @@ } while (len-- > 0) { - if ((nb = ct->__mbrtowc(dst, s, nms, ps)) == (size_t)-1) { + if ((nb = pmbrtowc(dst, s, nms, ps)) == (size_t)-1) { *src = s; return ((size_t)-1); } else if (nb == (size_t)-2) { Index: lib/libc/locale/mskanji.c =================================================================== --- lib/libc/locale/mskanji.c +++ lib/libc/locale/mskanji.c @@ -1,4 +1,6 @@ /* + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * * ja_JP.SJIS locale table for BSD4.4/rune @@ -28,14 +30,14 @@ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
*/ #if defined(LIBC_SCCS) && !defined(lint) @@ -59,6 +61,12 @@ static int _MSKanji_mbsinit(const mbstate_t *); static size_t _MSKanji_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); +static size_t _MSKanji_mbsnrtowcs(wchar_t * __restrict, + const char ** __restrict, size_t, size_t, + mbstate_t * __restrict); +static size_t _MSKanji_wcsnrtombs(char * __restrict, + const wchar_t ** __restrict, size_t, size_t, + mbstate_t * __restrict); typedef struct { wchar_t ch; @@ -70,6 +78,8 @@ l->__mbrtowc = _MSKanji_mbrtowc; l->__wcrtomb = _MSKanji_wcrtomb; + l->__mbsnrtowcs = _MSKanji_mbsnrtowcs; + l->__wcsnrtombs = _MSKanji_wcsnrtombs; l->__mbsinit = _MSKanji_mbsinit; l->runes = rl; l->__mb_cur_max = 2; @@ -163,3 +173,19 @@ *s++ = wc >> (i << 3); return (len); } + +static size_t +_MSKanji_mbsnrtowcs(wchar_t * __restrict dst, + const char ** __restrict src, size_t nms, + size_t len, mbstate_t * __restrict ps) +{ + return (__mbsnrtowcs_std(dst, src, nms, len, ps, _MSKanji_mbrtowc)); +} + +static size_t +_MSKanji_wcsnrtombs(char * __restrict dst, + const wchar_t ** __restrict src, size_t nwc, + size_t len, mbstate_t * __restrict ps) +{ + return (__wcsnrtombs_std(dst, src, nwc, len, ps, _MSKanji_wcrtomb)); +} Index: lib/libc/locale/none.c =================================================================== --- lib/libc/locale/none.c +++ lib/libc/locale/none.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
@@ -187,16 +189,6 @@ /* setup defaults */ -size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t, - mbstate_t * __restrict) = _none_mbrtowc; -int (*__mbsinit)(const mbstate_t *) = _none_mbsinit; -size_t (*__mbsnrtowcs)(wchar_t * __restrict, const char ** __restrict, - size_t, size_t, mbstate_t * __restrict) = _none_mbsnrtowcs; -size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict) = - _none_wcrtomb; -size_t (*__wcsnrtombs)(char * __restrict, const wchar_t ** __restrict, - size_t, size_t, mbstate_t * __restrict) = _none_wcsnrtombs; - struct xlocale_ctype __xlocale_global_ctype = { {{0}, "C"}, (_RuneLocale*)&_DefaultRuneLocale, Index: lib/libc/locale/rune.c =================================================================== --- lib/libc/locale/rune.c +++ lib/libc/locale/rune.c @@ -1,4 +1,6 @@ /*- + * Copyright 2014 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. 
* @@ -45,14 +47,15 @@ #include #include #include +#include +#include +#include #include "un-namespace.h" #include "runefile.h" -_RuneLocale *_Read_RuneMagi(FILE *); - _RuneLocale * -_Read_RuneMagi(FILE *fp) +_Read_RuneMagi(const char *fname) { char *fdata, *data; void *lastp; @@ -67,119 +70,77 @@ _FileRuneEntry *maplower_ext_ranges; _FileRuneEntry *mapupper_ext_ranges; int runetype_ext_len = 0; + int fd; - if (_fstat(fileno(fp), &sb) < 0) + if ((fd = _open(fname, O_RDONLY | O_CLOEXEC)) < 0) { + errno = EINVAL; return (NULL); + } - if ((size_t)sb.st_size < sizeof(_FileRuneLocale)) { - errno = EFTYPE; + if (_fstat(fd, &sb) < 0) { + (void) _close(fd); + errno = EINVAL; return (NULL); } - if ((fdata = malloc(sb.st_size)) == NULL) - return (NULL); - - errno = 0; - rewind(fp); /* Someone might have read the magic number once already */ - if (errno) { - saverr = errno; - free(fdata); - errno = saverr; + if ((size_t)sb.st_size < sizeof (_FileRuneLocale)) { + (void) _close(fd); + errno = EINVAL; return (NULL); } - if (fread(fdata, sb.st_size, 1, fp) != 1) { - saverr = errno; - free(fdata); - errno = saverr; + + fdata = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + (void) _close(fd); + if (fdata == MAP_FAILED) { /* mmap() reports failure as MAP_FAILED, not NULL */ + errno = EINVAL; return (NULL); } - frl = (_FileRuneLocale *)fdata; + frl = (_FileRuneLocale *)(void *)fdata; lastp = fdata + sb.st_size; variable = frl + 1; - if (memcmp(frl->magic, _FILE_RUNE_MAGIC_1, sizeof(frl->magic))) { - free(fdata); - errno = EFTYPE; - return (NULL); - } - - frl->variable_len = ntohl(frl->variable_len); - frl->runetype_ext_nranges = ntohl(frl->runetype_ext_nranges); - frl->maplower_ext_nranges = ntohl(frl->maplower_ext_nranges); - frl->mapupper_ext_nranges = ntohl(frl->mapupper_ext_nranges); - - for (x = 0; x < _CACHED_RUNES; ++x) { - frl->runetype[x] = ntohl(frl->runetype[x]); - frl->maplower[x] = ntohl(frl->maplower[x]); - frl->mapupper[x] = ntohl(frl->mapupper[x]); + if (memcmp(frl->magic, _FILE_RUNE_MAGIC_1, sizeof (frl->magic))) { + goto
invalid; } runetype_ext_ranges = (_FileRuneEntry *)variable; variable = runetype_ext_ranges + frl->runetype_ext_nranges; if (variable > lastp) { - free(fdata); - errno = EFTYPE; - return (NULL); + goto invalid; } maplower_ext_ranges = (_FileRuneEntry *)variable; variable = maplower_ext_ranges + frl->maplower_ext_nranges; if (variable > lastp) { - free(fdata); - errno = EFTYPE; - return (NULL); + goto invalid; } mapupper_ext_ranges = (_FileRuneEntry *)variable; variable = mapupper_ext_ranges + frl->mapupper_ext_nranges; if (variable > lastp) { - free(fdata); - errno = EFTYPE; - return (NULL); + goto invalid; } frr = runetype_ext_ranges; for (x = 0; x < frl->runetype_ext_nranges; ++x) { uint32_t *types; - frr[x].min = ntohl(frr[x].min); - frr[x].max = ntohl(frr[x].max); - frr[x].map = ntohl(frr[x].map); if (frr[x].map == 0) { int len = frr[x].max - frr[x].min + 1; types = variable; variable = types + len; runetype_ext_len += len; if (variable > lastp) { - free(fdata); - errno = EFTYPE; - return (NULL); + goto invalid; } - while (len-- > 0) - types[len] = ntohl(types[len]); } } - frr = maplower_ext_ranges; - for (x = 0; x < frl->maplower_ext_nranges; ++x) { - frr[x].min = ntohl(frr[x].min); - frr[x].max = ntohl(frr[x].max); - frr[x].map = ntohl(frr[x].map); - } - - frr = mapupper_ext_ranges; - for (x = 0; x < frl->mapupper_ext_nranges; ++x) { - frr[x].min = ntohl(frr[x].min); - frr[x].max = ntohl(frr[x].max); - frr[x].map = ntohl(frr[x].map); - } if ((char *)variable + frl->variable_len > (char *)lastp) { - free(fdata); - errno = EFTYPE; - return (NULL); + goto invalid; } /* @@ -192,7 +153,7 @@ frl->variable_len); if (data == NULL) { saverr = errno; - free(fdata); + munmap(fdata, sb.st_size); errno = saverr; return (NULL); } @@ -202,7 +163,6 @@ memcpy(rl->__magic, _RUNE_MAGIC_1, sizeof(rl->__magic)); memcpy(rl->__encoding, frl->encoding, sizeof(rl->__encoding)); - rl->__invalid_rune = 0; rl->__variable_len = frl->variable_len; rl->__runetype_ext.__nranges = 
frl->runetype_ext_nranges; @@ -265,7 +225,7 @@ } memcpy(rl->__variable, variable, rl->__variable_len); - free(fdata); + munmap(fdata, sb.st_size); /* * Go out and zero pointers that should be zero. @@ -283,4 +243,9 @@ rl->__mapupper_ext.__ranges = NULL; return (rl); + +invalid: + munmap(fdata, sb.st_size); + errno = EINVAL; + return (NULL); } Index: lib/libc/locale/setrunelocale.c =================================================================== --- lib/libc/locale/setrunelocale.c +++ lib/libc/locale/setrunelocale.c @@ -63,23 +63,15 @@ extern int __mb_sb_limit; -extern _RuneLocale *_Read_RuneMagi(FILE *); +extern _RuneLocale *_Read_RuneMagi(const char *); static int __setrunelocale(struct xlocale_ctype *l, const char *); -#define __collate_substitute_nontrivial (table->__collate_substitute_nontrivial) -#define __collate_substitute_table_ptr (table->__collate_substitute_table_ptr) -#define __collate_char_pri_table_ptr (table->__collate_char_pri_table_ptr) -#define __collate_chain_pri_table (table->__collate_chain_pri_table) - - static void destruct_ctype(void *v) { struct xlocale_ctype *l = v; - if (strcmp(l->runes->__encoding, "EUC") == 0) - free(l->runes->__variable); if (&_DefaultRuneLocale != l->runes) free(l->runes); free(l); @@ -95,12 +87,7 @@ static void free_runes(_RuneLocale *rl) { - - /* FIXME: The "EUC" check here is a hideous abstraction violation. 
*/ if ((rl != &_DefaultRuneLocale) && (rl)) { - if (strcmp(rl->__encoding, "EUC") == 0) { - free(rl->__variable); - } free(rl); } } @@ -108,10 +95,9 @@ static int __setrunelocale(struct xlocale_ctype *l, const char *encoding) { - FILE *fp; - char name[PATH_MAX]; _RuneLocale *rl; - int saverr, ret; + int ret; + char path[PATH_MAX]; struct xlocale_ctype saved = *l; /* @@ -124,37 +110,34 @@ } /* Range checking not needed, encoding length already checked before */ - (void) strcpy(name, _PathLocale); - (void) strcat(name, "/"); - (void) strcat(name, encoding); - (void) strcat(name, "/LC_CTYPE"); - - if ((fp = fopen(name, "re")) == NULL) - return (errno == 0 ? ENOENT : errno); - - if ((rl = _Read_RuneMagi(fp)) == NULL) { - saverr = (errno == 0 ? EFTYPE : errno); - (void)fclose(fp); - return (saverr); + (void) snprintf(path, sizeof (path), "%s/%s/LC_CTYPE", + _PathLocale, encoding); + + if ((rl = _Read_RuneMagi(path)) == NULL) { + errno = EINVAL; + return (errno); } - (void)fclose(fp); l->__mbrtowc = NULL; l->__mbsinit = NULL; - l->__mbsnrtowcs = __mbsnrtowcs_std; + l->__mbsnrtowcs = NULL; l->__wcrtomb = NULL; - l->__wcsnrtombs = __wcsnrtombs_std; + l->__wcsnrtombs = NULL; rl->__sputrune = NULL; rl->__sgetrune = NULL; if (strcmp(rl->__encoding, "NONE") == 0) ret = _none_init(l, rl); - else if (strcmp(rl->__encoding, "ASCII") == 0) - ret = _ascii_init(l, rl); else if (strcmp(rl->__encoding, "UTF-8") == 0) ret = _UTF8_init(l, rl); - else if (strcmp(rl->__encoding, "EUC") == 0) - ret = _EUC_init(l, rl); + else if (strcmp(rl->__encoding, "EUC-CN") == 0) + ret = _EUC_CN_init(l, rl); + else if (strcmp(rl->__encoding, "EUC-JP") == 0) + ret = _EUC_JP_init(l, rl); + else if (strcmp(rl->__encoding, "EUC-KR") == 0) + ret = _EUC_KR_init(l, rl); + else if (strcmp(rl->__encoding, "EUC-TW") == 0) + ret = _EUC_TW_init(l, rl); else if (strcmp(rl->__encoding, "GB18030") == 0) ret = _GB18030_init(l, rl); else if (strcmp(rl->__encoding, "GB2312") == 0) @@ -211,7 +194,7 @@ #endif void * 
-__ctype_load(const char *locale, locale_t unused) +__ctype_load(const char *locale, locale_t unused __unused) { struct xlocale_ctype *l = calloc(sizeof(struct xlocale_ctype), 1); Index: lib/libc/locale/utf8.c =================================================================== --- lib/libc/locale/utf8.c +++ lib/libc/locale/utf8.c @@ -1,4 +1,5 @@ /*- + * Copyright 2013 Garrett D'Amore * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins * All rights reserved. @@ -70,7 +71,7 @@ l->__mbsnrtowcs = _UTF8_mbsnrtowcs; l->__wcsnrtombs = _UTF8_wcsnrtombs; l->runes = rl; - l->__mb_cur_max = 6; + l->__mb_cur_max = 4; /* * UCS-4 encoding used as the internal representation, so * slots 0x0080-0x00FF are occuped and must be excluded @@ -145,6 +146,9 @@ mask = 0x07; want = 4; lbound = 0x10000; +#if 0 + /* These would be illegal in the UTF-8 space */ + } else if ((ch & 0xfc) == 0xf8) { mask = 0x03; want = 5; @@ -153,6 +157,7 @@ mask = 0x01; want = 6; lbound = 0x4000000; +#endif } else { /* * Malformed input; input is not UTF-8. @@ -173,6 +178,7 @@ wch = (unsigned char)*s++ & mask; else wch = us->ch; + for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { if ((*s & 0xc0) != 0x80) { /* @@ -199,13 +205,6 @@ errno = EILSEQ; return ((size_t)-1); } - if (wch >= 0xd800 && wch <= 0xdfff) { - /* - * Malformed input; invalid code points. 
- */ - errno = EILSEQ; - return ((size_t)-1); - } if (pwc != NULL) *pwc = wch; us->want = 0; @@ -331,12 +330,15 @@ } else if ((wc & ~0x1fffff) == 0) { lead = 0xf0; len = 4; +#if 0 + /* Again, 5 and 6 byte encodings are simply not permitted */ } else if ((wc & ~0x3ffffff) == 0) { lead = 0xf8; len = 5; } else if ((wc & ~0x7fffffff) == 0) { lead = 0xfc; len = 6; +#endif } else { errno = EILSEQ; return ((size_t)-1); Index: lib/libc/locale/wcsnrtombs.c =================================================================== --- lib/libc/locale/wcsnrtombs.c +++ lib/libc/locale/wcsnrtombs.c @@ -1,4 +1,6 @@ /*- + * Copyright 2013 Garrett D'Amore + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2002-2004 Tim J. Robbins. * All rights reserved. * @@ -57,21 +59,21 @@ size_t __wcsnrtombs_std(char * __restrict dst, const wchar_t ** __restrict src, - size_t nwc, size_t len, mbstate_t * __restrict ps) + size_t nwc, size_t len, mbstate_t * __restrict ps, + wcrtomb_pfn_t pwcrtomb) { mbstate_t mbsbak; char buf[MB_LEN_MAX]; const wchar_t *s; size_t nbytes; size_t nb; - struct xlocale_ctype *l = XLOCALE_CTYPE(__get_locale()); s = *src; nbytes = 0; if (dst == NULL) { while (nwc-- > 0) { - if ((nb = l->__wcrtomb(buf, *s, ps)) == (size_t)-1) + if ((nb = pwcrtomb(buf, *s, ps)) == (size_t)-1) /* Invalid character - wcrtomb() sets errno. */ return ((size_t)-1); else if (*s == L'\0') @@ -85,7 +87,7 @@ while (len > 0 && nwc-- > 0) { if (len > (size_t)MB_CUR_MAX) { /* Enough space to translate in-place. */ - if ((nb = l->__wcrtomb(dst, *s, ps)) == (size_t)-1) { + if ((nb = pwcrtomb(dst, *s, ps)) == (size_t)-1) { *src = s; return ((size_t)-1); } @@ -98,7 +100,7 @@ * character is too long for the buffer. 
*/ mbsbak = *ps; - if ((nb = l->__wcrtomb(buf, *s, ps)) == (size_t)-1) { + if ((nb = pwcrtomb(buf, *s, ps)) == (size_t)-1) { *src = s; return ((size_t)-1); } Index: lib/libc/regex/regcomp.c =================================================================== --- lib/libc/regex/regcomp.c +++ lib/libc/regex/regcomp.c @@ -38,6 +38,13 @@ * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 */ +/* + * This implementation currently only works with the C locale. + * It's definitely limited by UCHAR_MAX, but not even ISO-8859 charsets + * are working. The forced changing of locale to C for the comparison + * is considered a workaround until a better solution is found. + */ + #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; #endif /* LIBC_SCCS and not lint */ @@ -768,8 +775,9 @@ char c; wint_t start, finish; wint_t i; + locale_t loc = &__xlocale_C_locale; /* see note under license */ struct xlocale_collate *table = - (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; + (struct xlocale_collate*)loc->components[XLC_COLLATE]; /* classify what we've got */ switch ((MORE()) ? PEEK() : '\0') { Index: lib/libc/string/strcoll.c =================================================================== --- lib/libc/string/strcoll.c +++ lib/libc/string/strcoll.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -35,63 +36,82 @@ #include #include +#include +#include #include "collate.h" -#include +/* + * In order to properly handle multibyte locales, it's easiest to just + * convert to wide characters and then use wcscoll. However if an + * error occurs, we gracefully fall back to simple strcmp. Caller + * should check errno. 
+ */ int strcoll_l(const char *s, const char *s2, locale_t locale) { - int len, len2, prim, prim2, sec, sec2, ret, ret2; - const char *t, *t2; - char *tt, *tt2; + int ret; + wchar_t *t1 = NULL, *t2 = NULL; + wchar_t *w1 = NULL, *w2 = NULL; + const char *cs1, *cs2; + mbstate_t mbs1; + mbstate_t mbs2; + size_t sz1, sz2; + + memset(&mbs1, 0, sizeof (mbstate_t)); + memset(&mbs2, 0, sizeof (mbstate_t)); + + /* + * The mbsrtowcs_l function can set the src pointer to null upon + * failure, so it should act on a copy to avoid: + * - sending null pointer to strcmp + * - having strcoll/strcoll_l change *s or *s2 to null + */ + cs1 = s; + cs2 = s2; + FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; if (table->__collate_load_error) - return strcmp(s, s2); + goto error; - len = len2 = 1; - ret = ret2 = 0; - if (table->__collate_substitute_nontrivial) { - t = tt = __collate_substitute(table, s); - t2 = tt2 = __collate_substitute(table, s2); - } else { - tt = tt2 = NULL; - t = s; - t2 = s2; - } - while(*t && *t2) { - prim = prim2 = 0; - while(*t && !prim) { - __collate_lookup(table, t, &len, &prim, &sec); - t += len; - } - while(*t2 && !prim2) { - __collate_lookup(table, t2, &len2, &prim2, &sec2); - t2 += len2; - } - if(!prim || !prim2) - break; - if(prim != prim2) { - ret = prim - prim2; - goto end; - } - if(!ret2) - ret2 = sec - sec2; - } - if(!*t && *t2) - ret = -(int)((u_char)*t2); - else if(*t && !*t2) - ret = (u_char)*t; - else if(!*t && !*t2) - ret = ret2; - end: - free(tt); - free(tt2); + sz1 = strlen(s) + 1; + sz2 = strlen(s2) + 1; - return ret; + /* + * Simple assumption: conversion to wide format is strictly + * reducing, i.e. a single byte (or multibyte character) + * cannot result in multiple wide characters. 
+ */ + if ((t1 = malloc(sz1 * sizeof (wchar_t))) == NULL) + goto error; + w1 = t1; + if ((t2 = malloc(sz2 * sizeof (wchar_t))) == NULL) + goto error; + w2 = t2; + + if ((mbsrtowcs_l(w1, &cs1, sz1, &mbs1, locale)) == (size_t)-1) + goto error; + + if ((mbsrtowcs_l(w2, &cs2, sz2, &mbs2, locale)) == (size_t)-1) + goto error; + + ret = wcscoll_l(w1, w2, locale); + if (t1) + free(t1); + if (t2) + free(t2); + + return (ret); + +error: + if (t1) + free(t1); + if (t2) + free(t2); + return (strcmp(s, s2)); } int Index: lib/libc/string/strxfrm.c =================================================================== --- lib/libc/string/strxfrm.c +++ lib/libc/string/strxfrm.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -35,6 +36,8 @@ #include #include +#include +#include #include "collate.h" size_t @@ -48,9 +51,10 @@ size_t strxfrm_l(char * __restrict dest, const char * __restrict src, size_t len, locale_t locale) { - int prim, sec, l; size_t slen; - char *s, *ss; + size_t xlen; + wchar_t *wcs = NULL; + FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; @@ -58,32 +62,44 @@ if (!*src) { if (len > 0) *dest = '\0'; - return 0; + return (0); } + /* + * The conversion from multibyte to wide character strings is + * strictly reducing (one byte of an mbs cannot expand to more + * than one wide character.) 
+ */ + slen = strlen(src); + if (table->__collate_load_error) - return strlcpy(dest, src, len); + goto error; + + if ((wcs = malloc((slen + 1) * sizeof (wchar_t))) == NULL) + goto error; + + if (mbstowcs_l(wcs, src, slen + 1, locale) == (size_t)-1) + goto error; - slen = 0; - prim = sec = 0; - ss = s = __collate_substitute(table, src); - while (*s) { - while (*s && !prim) { - __collate_lookup(table, s, &l, &prim, &sec); - s += l; - } - if (prim) { - if (len > 1) { - *dest++ = (char)prim; - len--; - } - slen++; - prim = 0; - } + if ((xlen = _collate_sxfrm(table, wcs, dest, len)) == (size_t)-1) + goto error; + + if (wcs) + free(wcs); + + if (len > xlen) { + dest[xlen] = 0; + } else if (len) { + dest[len-1] = 0; } - free(ss); - if (len > 0) - *dest = '\0'; - return slen; + return (xlen); + +error: + /* errno should be set to ENOMEM if malloc failed */ + if (wcs) + free(wcs); + (void) strlcpy(dest, src, len); + + return (slen); } Index: lib/libc/string/wcsxfrm.c =================================================================== --- lib/libc/string/wcsxfrm.c +++ lib/libc/string/wcsxfrm.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -31,9 +32,6 @@ */ #include -#if 0 -__FBSDID("FreeBSD: src/lib/libc/string/strxfrm.c,v 1.15 2002/09/06 11:24:06 tjr Exp "); -#endif __FBSDID("$FreeBSD$"); #include @@ -41,18 +39,10 @@ #include #include "collate.h" -static char *__mbsdup(const wchar_t *); - -/* - * Placeholder wcsxfrm() implementation. See wcscoll.c for a description of - * the logic used. 
- */ size_t wcsxfrm_l(wchar_t * __restrict dest, const wchar_t * __restrict src, size_t len, locale_t locale) { - int prim, sec, l; size_t slen; - char *mbsrc, *s, *ss; FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; @@ -63,67 +53,33 @@ return (0); } - if (table->__collate_load_error || MB_CUR_MAX > 1) { - slen = wcslen(src); - if (len > 0) { - if (slen < len) - wcscpy(dest, src); - else { - wcsncpy(dest, src, len - 1); - dest[len - 1] = L'\0'; - } - } - return (slen); + if ((table->__collate_load_error) || + ((slen = _collate_wxfrm(table, src, dest, len)) == (size_t)-1)) { + goto error; } - mbsrc = __mbsdup(src); - slen = 0; - prim = sec = 0; - ss = s = __collate_substitute(table, mbsrc); - while (*s != '\0') { - while (*s != '\0' && prim == 0) { - __collate_lookup(table, s, &l, &prim, &sec); - s += l; - } - if (prim != 0) { - if (len > 1) { - *dest++ = (wchar_t)prim; - len--; - } - slen++; - prim = 0; - } + /* Add null termination at the correct location. */ + if (len > slen) { + dest[slen] = 0; + } else if (len) { + dest[len-1] = 0; } - free(ss); - free(mbsrc); - if (len != 0) - *dest = L'\0'; return (slen); + +error: + slen = wcslen(src); + if (slen < len) + (void) wcscpy(dest, src); + else { + (void) wcsncpy(dest, src, len - 1); + dest[len - 1] = L'\0'; + } + return (slen); } + size_t wcsxfrm(wchar_t * __restrict dest, const wchar_t * __restrict src, size_t len) { return wcsxfrm_l(dest, src, len, __get_locale()); } - -static char * -__mbsdup(const wchar_t *ws) -{ - static const mbstate_t initial; - mbstate_t st; - const wchar_t *wcp; - size_t len; - char *mbs; - - wcp = ws; - st = initial; - if ((len = wcsrtombs(NULL, &wcp, 0, &st)) == (size_t)-1) - return (NULL); - if ((mbs = malloc(len + 1)) == NULL) - return (NULL); - st = initial; - wcsrtombs(mbs, &ws, len + 1, &st); - - return (mbs); -}