diff --git a/contrib/libarchive/libarchive/archive_match.c b/contrib/libarchive/libarchive/archive_match.c index fc8a4ce8127b..3ab8eda36038 100644 --- a/contrib/libarchive/libarchive/archive_match.c +++ b/contrib/libarchive/libarchive/archive_match.c @@ -1,1874 +1,1874 @@ /*- * Copyright (c) 2003-2007 Tim Kientzle * Copyright (c) 2012 Michihiro NAKAJIMA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "archive_platform.h" #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_STRING_H #include #endif #include "archive.h" #include "archive_private.h" #include "archive_entry.h" #include "archive_getdate.h" #include "archive_pathmatch.h" #include "archive_rb.h" #include "archive_string.h" struct match { struct match *next; - int matches; + int matched; struct archive_mstring pattern; }; struct match_list { struct match *first; struct match **last; int count; int unmatched_count; struct match *unmatched_next; int unmatched_eof; }; struct match_file { struct archive_rb_node node; struct match_file *next; struct archive_mstring pathname; int flag; time_t mtime_sec; long mtime_nsec; time_t ctime_sec; long ctime_nsec; }; struct entry_list { struct match_file *first; struct match_file **last; int count; }; struct id_array { size_t size;/* Allocated size */ size_t count; int64_t *ids; }; #define PATTERN_IS_SET 1 #define TIME_IS_SET 2 #define ID_IS_SET 4 struct archive_match { struct archive archive; /* exclusion/inclusion set flag. */ int setflag; /* Recursively include directory content? */ int recursive_include; /* * Matching filename patterns. */ struct match_list exclusions; struct match_list inclusions; /* * Matching time stamps. */ time_t now; int newer_mtime_filter; time_t newer_mtime_sec; long newer_mtime_nsec; int newer_ctime_filter; time_t newer_ctime_sec; long newer_ctime_nsec; int older_mtime_filter; time_t older_mtime_sec; long older_mtime_nsec; int older_ctime_filter; time_t older_ctime_sec; long older_ctime_nsec; /* * Matching time stamps with its filename. */ struct archive_rb_tree exclusion_tree; struct entry_list exclusion_entry_list; /* * Matching file owners. */ struct id_array inclusion_uids; struct id_array inclusion_gids; struct match_list inclusion_unames; struct match_list inclusion_gnames; }; static int add_pattern_from_file(struct archive_match *, struct match_list *, int, const void *, int); static int add_entry(struct archive_match *, int, struct archive_entry *); static int add_owner_id(struct archive_match *, struct id_array *, int64_t); static int add_owner_name(struct archive_match *, struct match_list *, int, const void *); static int add_pattern_mbs(struct archive_match *, struct match_list *, const char *); static int add_pattern_wcs(struct archive_match *, struct match_list *, const wchar_t *); static int cmp_key_mbs(const struct archive_rb_node *, const void *); static int cmp_key_wcs(const struct archive_rb_node *, const void *); static int cmp_node_mbs(const struct archive_rb_node *, const struct archive_rb_node *); static int cmp_node_wcs(const struct archive_rb_node *, const struct archive_rb_node *); static void entry_list_add(struct entry_list *, struct match_file *); static void entry_list_free(struct entry_list *); static void entry_list_init(struct entry_list *); static int error_nomem(struct archive_match *); static void match_list_add(struct match_list *, struct match *); static void match_list_free(struct match_list *); static void match_list_init(struct match_list *); static int match_list_unmatched_inclusions_next(struct archive_match *, struct match_list *, int, const void **); static int match_owner_id(struct id_array *, int64_t); #if !defined(_WIN32) || defined(__CYGWIN__) static int match_owner_name_mbs(struct archive_match *, struct match_list *, const char *); #else static int match_owner_name_wcs(struct archive_match *, struct match_list *, const wchar_t *); #endif static int match_path_exclusion(struct archive_match *, struct match *, int, const void *); static int match_path_inclusion(struct archive_match *, struct match *, int, const void *); static int owner_excluded(struct archive_match *, struct archive_entry *); static int path_excluded(struct archive_match *, int, const void *); static int set_timefilter(struct archive_match *, int, time_t, long, time_t, long); static int set_timefilter_pathname_mbs(struct archive_match *, int, const char *); static int set_timefilter_pathname_wcs(struct archive_match *, int, const wchar_t *); static int set_timefilter_date(struct archive_match *, int, const char *); static int set_timefilter_date_w(struct archive_match *, int, const wchar_t *); static int time_excluded(struct archive_match *, struct archive_entry *); static int validate_time_flag(struct archive *, int, const char *); #define get_date __archive_get_date static const struct archive_rb_tree_ops rb_ops_mbs = { cmp_node_mbs, cmp_key_mbs }; static const struct archive_rb_tree_ops rb_ops_wcs = { cmp_node_wcs, cmp_key_wcs }; /* * The matching logic here needs to be re-thought. I started out to * try to mimic gtar's matching logic, but it's not entirely * consistent. In particular 'tar -t' and 'tar -x' interpret patterns * on the command line as anchored, but --exclude doesn't. */ static int error_nomem(struct archive_match *a) { archive_set_error(&(a->archive), ENOMEM, "No memory"); a->archive.state = ARCHIVE_STATE_FATAL; return (ARCHIVE_FATAL); } /* * Create an ARCHIVE_MATCH object. */ struct archive * archive_match_new(void) { struct archive_match *a; a = (struct archive_match *)calloc(1, sizeof(*a)); if (a == NULL) return (NULL); a->archive.magic = ARCHIVE_MATCH_MAGIC; a->archive.state = ARCHIVE_STATE_NEW; a->recursive_include = 1; match_list_init(&(a->inclusions)); match_list_init(&(a->exclusions)); __archive_rb_tree_init(&(a->exclusion_tree), &rb_ops_mbs); entry_list_init(&(a->exclusion_entry_list)); match_list_init(&(a->inclusion_unames)); match_list_init(&(a->inclusion_gnames)); time(&a->now); return (&(a->archive)); } /* * Free an ARCHIVE_MATCH object. */ int archive_match_free(struct archive *_a) { struct archive_match *a; if (_a == NULL) return (ARCHIVE_OK); archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_ANY | ARCHIVE_STATE_FATAL, "archive_match_free"); a = (struct archive_match *)_a; match_list_free(&(a->inclusions)); match_list_free(&(a->exclusions)); entry_list_free(&(a->exclusion_entry_list)); free(a->inclusion_uids.ids); free(a->inclusion_gids.ids); match_list_free(&(a->inclusion_unames)); match_list_free(&(a->inclusion_gnames)); free(a); return (ARCHIVE_OK); } /* * Convenience function to perform all exclusion tests. * * Returns 1 if archive entry is excluded. * Returns 0 if archive entry is not excluded. * Returns <0 if something error happened. */ int archive_match_excluded(struct archive *_a, struct archive_entry *entry) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_excluded_ae"); a = (struct archive_match *)_a; if (entry == NULL) { archive_set_error(&(a->archive), EINVAL, "entry is NULL"); return (ARCHIVE_FAILED); } r = 0; if (a->setflag & PATTERN_IS_SET) { #if defined(_WIN32) && !defined(__CYGWIN__) r = path_excluded(a, 0, archive_entry_pathname_w(entry)); #else r = path_excluded(a, 1, archive_entry_pathname(entry)); #endif if (r != 0) return (r); } if (a->setflag & TIME_IS_SET) { r = time_excluded(a, entry); if (r != 0) return (r); } if (a->setflag & ID_IS_SET) r = owner_excluded(a, entry); return (r); } /* * Utility functions to manage exclusion/inclusion patterns */ int archive_match_exclude_pattern(struct archive *_a, const char *pattern) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_exclude_pattern"); a = (struct archive_match *)_a; if (pattern == NULL || *pattern == '\0') { archive_set_error(&(a->archive), EINVAL, "pattern is empty"); return (ARCHIVE_FAILED); } if ((r = add_pattern_mbs(a, &(a->exclusions), pattern)) != ARCHIVE_OK) return (r); return (ARCHIVE_OK); } int archive_match_exclude_pattern_w(struct archive *_a, const wchar_t *pattern) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_exclude_pattern_w"); a = (struct archive_match *)_a; if (pattern == NULL || *pattern == L'\0') { archive_set_error(&(a->archive), EINVAL, "pattern is empty"); return (ARCHIVE_FAILED); } if ((r = add_pattern_wcs(a, &(a->exclusions), pattern)) != ARCHIVE_OK) return (r); return (ARCHIVE_OK); } int archive_match_exclude_pattern_from_file(struct archive *_a, const char *pathname, int nullSeparator) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_exclude_pattern_from_file"); a = (struct archive_match *)_a; return add_pattern_from_file(a, &(a->exclusions), 1, pathname, nullSeparator); } int archive_match_exclude_pattern_from_file_w(struct archive *_a, const wchar_t *pathname, int nullSeparator) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_exclude_pattern_from_file_w"); a = (struct archive_match *)_a; return add_pattern_from_file(a, &(a->exclusions), 0, pathname, nullSeparator); } int archive_match_include_pattern(struct archive *_a, const char *pattern) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_pattern"); a = (struct archive_match *)_a; if (pattern == NULL || *pattern == '\0') { archive_set_error(&(a->archive), EINVAL, "pattern is empty"); return (ARCHIVE_FAILED); } if ((r = add_pattern_mbs(a, &(a->inclusions), pattern)) != ARCHIVE_OK) return (r); return (ARCHIVE_OK); } int archive_match_include_pattern_w(struct archive *_a, const wchar_t *pattern) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_pattern_w"); a = (struct archive_match *)_a; if (pattern == NULL || *pattern == L'\0') { archive_set_error(&(a->archive), EINVAL, "pattern is empty"); return (ARCHIVE_FAILED); } if ((r = add_pattern_wcs(a, &(a->inclusions), pattern)) != ARCHIVE_OK) return (r); return (ARCHIVE_OK); } int archive_match_include_pattern_from_file(struct archive *_a, const char *pathname, int nullSeparator) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_pattern_from_file"); a = (struct archive_match *)_a; return add_pattern_from_file(a, &(a->inclusions), 1, pathname, nullSeparator); } int archive_match_include_pattern_from_file_w(struct archive *_a, const wchar_t *pathname, int nullSeparator) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_pattern_from_file_w"); a = (struct archive_match *)_a; return add_pattern_from_file(a, &(a->inclusions), 0, pathname, nullSeparator); } /* * Test functions for pathname patterns. * * Returns 1 if archive entry is excluded. * Returns 0 if archive entry is not excluded. * Returns <0 if something error happened. */ int archive_match_path_excluded(struct archive *_a, struct archive_entry *entry) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_path_excluded"); a = (struct archive_match *)_a; if (entry == NULL) { archive_set_error(&(a->archive), EINVAL, "entry is NULL"); return (ARCHIVE_FAILED); } /* If we don't have exclusion/inclusion pattern set at all, * the entry is always not excluded. */ if ((a->setflag & PATTERN_IS_SET) == 0) return (0); #if defined(_WIN32) && !defined(__CYGWIN__) return (path_excluded(a, 0, archive_entry_pathname_w(entry))); #else return (path_excluded(a, 1, archive_entry_pathname(entry))); #endif } /* * When recursive inclusion of directory content is enabled, * an inclusion pattern that matches a directory will also * include everything beneath that directory. Enabled by default. * * For compatibility with GNU tar, exclusion patterns always * match if a subset of the full patch matches (i.e., they are * are not rooted at the beginning of the path) and thus there * is no corresponding non-recursive exclusion mode. */ int archive_match_set_inclusion_recursion(struct archive *_a, int enabled) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_set_inclusion_recursion"); a = (struct archive_match *)_a; a->recursive_include = enabled; return (ARCHIVE_OK); } /* * Utility functions to get statistic information for inclusion patterns. */ int archive_match_path_unmatched_inclusions(struct archive *_a) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_unmatched_inclusions"); a = (struct archive_match *)_a; return (a->inclusions.unmatched_count); } int archive_match_path_unmatched_inclusions_next(struct archive *_a, const char **_p) { struct archive_match *a; const void *v; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_unmatched_inclusions_next"); a = (struct archive_match *)_a; r = match_list_unmatched_inclusions_next(a, &(a->inclusions), 1, &v); *_p = (const char *)v; return (r); } int archive_match_path_unmatched_inclusions_next_w(struct archive *_a, const wchar_t **_p) { struct archive_match *a; const void *v; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_unmatched_inclusions_next_w"); a = (struct archive_match *)_a; r = match_list_unmatched_inclusions_next(a, &(a->inclusions), 0, &v); *_p = (const wchar_t *)v; return (r); } /* * Add inclusion/exclusion patterns. */ static int add_pattern_mbs(struct archive_match *a, struct match_list *list, const char *pattern) { struct match *match; size_t len; match = calloc(1, sizeof(*match)); if (match == NULL) return (error_nomem(a)); /* Both "foo/" and "foo" should match "foo/bar". */ len = strlen(pattern); if (len && pattern[len - 1] == '/') --len; archive_mstring_copy_mbs_len(&(match->pattern), pattern, len); match_list_add(list, match); a->setflag |= PATTERN_IS_SET; return (ARCHIVE_OK); } static int add_pattern_wcs(struct archive_match *a, struct match_list *list, const wchar_t *pattern) { struct match *match; size_t len; match = calloc(1, sizeof(*match)); if (match == NULL) return (error_nomem(a)); /* Both "foo/" and "foo" should match "foo/bar". */ len = wcslen(pattern); if (len && pattern[len - 1] == L'/') --len; archive_mstring_copy_wcs_len(&(match->pattern), pattern, len); match_list_add(list, match); a->setflag |= PATTERN_IS_SET; return (ARCHIVE_OK); } static int add_pattern_from_file(struct archive_match *a, struct match_list *mlist, int mbs, const void *pathname, int nullSeparator) { struct archive *ar; struct archive_entry *ae; struct archive_string as; const void *buff; size_t size; int64_t offset; int r; ar = archive_read_new(); if (ar == NULL) { archive_set_error(&(a->archive), ENOMEM, "No memory"); return (ARCHIVE_FATAL); } r = archive_read_support_format_raw(ar); - r = archive_read_support_format_empty(ar); + if (r == ARCHIVE_OK) + r = archive_read_support_format_empty(ar); if (r != ARCHIVE_OK) { archive_copy_error(&(a->archive), ar); archive_read_free(ar); return (r); } if (mbs) r = archive_read_open_filename(ar, pathname, 512*20); else r = archive_read_open_filename_w(ar, pathname, 512*20); if (r != ARCHIVE_OK) { archive_copy_error(&(a->archive), ar); archive_read_free(ar); return (r); } r = archive_read_next_header(ar, &ae); if (r != ARCHIVE_OK) { archive_read_free(ar); if (r == ARCHIVE_EOF) { return (ARCHIVE_OK); } else { archive_copy_error(&(a->archive), ar); return (r); } } archive_string_init(&as); while ((r = archive_read_data_block(ar, &buff, &size, &offset)) == ARCHIVE_OK) { const char *b = (const char *)buff; while (size) { const char *s = (const char *)b; size_t length = 0; int found_separator = 0; while (length < size) { if (nullSeparator) { if (*b == '\0') { found_separator = 1; break; } } else { if (*b == 0x0d || *b == 0x0a) { found_separator = 1; break; } } b++; length++; } if (!found_separator) { archive_strncat(&as, s, length); /* Read next data block. */ break; } b++; size -= length + 1; archive_strncat(&as, s, length); /* If the line is not empty, add the pattern. */ if (archive_strlen(&as) > 0) { /* Add pattern. */ r = add_pattern_mbs(a, mlist, as.s); if (r != ARCHIVE_OK) { archive_read_free(ar); archive_string_free(&as); return (r); } archive_string_empty(&as); } } } /* If an error occurred, report it immediately. */ if (r < ARCHIVE_OK) { archive_copy_error(&(a->archive), ar); archive_read_free(ar); archive_string_free(&as); return (r); } /* If the line is not empty, add the pattern. */ if (r == ARCHIVE_EOF && archive_strlen(&as) > 0) { /* Add pattern. */ r = add_pattern_mbs(a, mlist, as.s); if (r != ARCHIVE_OK) { archive_read_free(ar); archive_string_free(&as); return (r); } } archive_read_free(ar); archive_string_free(&as); return (ARCHIVE_OK); } /* * Test if pathname is excluded by inclusion/exclusion patterns. */ static int path_excluded(struct archive_match *a, int mbs, const void *pathname) { struct match *match; struct match *matched; int r; if (a == NULL) return (0); /* Mark off any unmatched inclusions. */ /* In particular, if a filename does appear in the archive and * is explicitly included and excluded, then we don't report * it as missing even though we don't extract it. */ matched = NULL; for (match = a->inclusions.first; match != NULL; match = match->next){ - if (match->matches == 0 && + if (!match->matched && (r = match_path_inclusion(a, match, mbs, pathname)) != 0) { if (r < 0) return (r); a->inclusions.unmatched_count--; - match->matches++; + match->matched = 1; matched = match; } } /* Exclusions take priority */ for (match = a->exclusions.first; match != NULL; match = match->next){ r = match_path_exclusion(a, match, mbs, pathname); if (r) return (r); } /* It's not excluded and we found an inclusion above, so it's * included. */ if (matched != NULL) return (0); /* We didn't find an unmatched inclusion, check the remaining ones. */ for (match = a->inclusions.first; match != NULL; match = match->next){ /* We looked at previously-unmatched inclusions already. */ - if (match->matches > 0 && + if (match->matched && (r = match_path_inclusion(a, match, mbs, pathname)) != 0) { if (r < 0) return (r); - match->matches++; return (0); } } /* If there were inclusions, default is to exclude. */ if (a->inclusions.first != NULL) return (1); /* No explicit inclusions, default is to match. */ return (0); } /* * This is a little odd, but it matches the default behavior of * gtar. In particular, 'a*b' will match 'foo/a1111/222b/bar' * */ static int match_path_exclusion(struct archive_match *a, struct match *m, int mbs, const void *pn) { int flag = PATHMATCH_NO_ANCHOR_START | PATHMATCH_NO_ANCHOR_END; int r; if (mbs) { const char *p; r = archive_mstring_get_mbs(&(a->archive), &(m->pattern), &p); if (r == 0) return (archive_pathmatch(p, (const char *)pn, flag)); } else { const wchar_t *p; r = archive_mstring_get_wcs(&(a->archive), &(m->pattern), &p); if (r == 0) return (archive_pathmatch_w(p, (const wchar_t *)pn, flag)); } if (errno == ENOMEM) return (error_nomem(a)); return (0); } /* * Again, mimic gtar: inclusions are always anchored (have to match * the beginning of the path) even though exclusions are not anchored. */ static int match_path_inclusion(struct archive_match *a, struct match *m, int mbs, const void *pn) { /* Recursive operation requires only a prefix match. */ int flag = a->recursive_include ? PATHMATCH_NO_ANCHOR_END : 0; int r; if (mbs) { const char *p; r = archive_mstring_get_mbs(&(a->archive), &(m->pattern), &p); if (r == 0) return (archive_pathmatch(p, (const char *)pn, flag)); } else { const wchar_t *p; r = archive_mstring_get_wcs(&(a->archive), &(m->pattern), &p); if (r == 0) return (archive_pathmatch_w(p, (const wchar_t *)pn, flag)); } if (errno == ENOMEM) return (error_nomem(a)); return (0); } static void match_list_init(struct match_list *list) { list->first = NULL; list->last = &(list->first); list->count = 0; } static void match_list_free(struct match_list *list) { struct match *p, *q; for (p = list->first; p != NULL; ) { q = p; p = p->next; archive_mstring_clean(&(q->pattern)); free(q); } } static void match_list_add(struct match_list *list, struct match *m) { *list->last = m; list->last = &(m->next); list->count++; list->unmatched_count++; } static int match_list_unmatched_inclusions_next(struct archive_match *a, struct match_list *list, int mbs, const void **vp) { struct match *m; *vp = NULL; if (list->unmatched_eof) { list->unmatched_eof = 0; return (ARCHIVE_EOF); } if (list->unmatched_next == NULL) { if (list->unmatched_count == 0) return (ARCHIVE_EOF); list->unmatched_next = list->first; } for (m = list->unmatched_next; m != NULL; m = m->next) { int r; - if (m->matches) + if (m->matched) continue; if (mbs) { const char *p; r = archive_mstring_get_mbs(&(a->archive), &(m->pattern), &p); if (r < 0 && errno == ENOMEM) return (error_nomem(a)); if (p == NULL) p = ""; *vp = p; } else { const wchar_t *p; r = archive_mstring_get_wcs(&(a->archive), &(m->pattern), &p); if (r < 0 && errno == ENOMEM) return (error_nomem(a)); if (p == NULL) p = L""; *vp = p; } list->unmatched_next = m->next; if (list->unmatched_next == NULL) /* To return EOF next time. */ list->unmatched_eof = 1; return (ARCHIVE_OK); } list->unmatched_next = NULL; return (ARCHIVE_EOF); } /* * Utility functions to manage inclusion timestamps. */ int archive_match_include_time(struct archive *_a, int flag, time_t sec, long nsec) { int r; r = validate_time_flag(_a, flag, "archive_match_include_time"); if (r != ARCHIVE_OK) return (r); return set_timefilter((struct archive_match *)_a, flag, sec, nsec, sec, nsec); } int archive_match_include_date(struct archive *_a, int flag, const char *datestr) { int r; r = validate_time_flag(_a, flag, "archive_match_include_date"); if (r != ARCHIVE_OK) return (r); return set_timefilter_date((struct archive_match *)_a, flag, datestr); } int archive_match_include_date_w(struct archive *_a, int flag, const wchar_t *datestr) { int r; r = validate_time_flag(_a, flag, "archive_match_include_date_w"); if (r != ARCHIVE_OK) return (r); return set_timefilter_date_w((struct archive_match *)_a, flag, datestr); } int archive_match_include_file_time(struct archive *_a, int flag, const char *pathname) { int r; r = validate_time_flag(_a, flag, "archive_match_include_file_time"); if (r != ARCHIVE_OK) return (r); return set_timefilter_pathname_mbs((struct archive_match *)_a, flag, pathname); } int archive_match_include_file_time_w(struct archive *_a, int flag, const wchar_t *pathname) { int r; r = validate_time_flag(_a, flag, "archive_match_include_file_time_w"); if (r != ARCHIVE_OK) return (r); return set_timefilter_pathname_wcs((struct archive_match *)_a, flag, pathname); } int archive_match_exclude_entry(struct archive *_a, int flag, struct archive_entry *entry) { struct archive_match *a; int r; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_time_include_entry"); a = (struct archive_match *)_a; if (entry == NULL) { archive_set_error(&(a->archive), EINVAL, "entry is NULL"); return (ARCHIVE_FAILED); } r = validate_time_flag(_a, flag, "archive_match_exclude_entry"); if (r != ARCHIVE_OK) return (r); return (add_entry(a, flag, entry)); } /* * Test function for time stamps. * * Returns 1 if archive entry is excluded. * Returns 0 if archive entry is not excluded. * Returns <0 if something error happened. */ int archive_match_time_excluded(struct archive *_a, struct archive_entry *entry) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_time_excluded_ae"); a = (struct archive_match *)_a; if (entry == NULL) { archive_set_error(&(a->archive), EINVAL, "entry is NULL"); return (ARCHIVE_FAILED); } /* If we don't have inclusion time set at all, the entry is always * not excluded. */ if ((a->setflag & TIME_IS_SET) == 0) return (0); return (time_excluded(a, entry)); } static int validate_time_flag(struct archive *_a, int flag, const char *_fn) { archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, _fn); /* Check a type of time. */ if (flag & ((~(ARCHIVE_MATCH_MTIME | ARCHIVE_MATCH_CTIME)) & 0xff00)) { archive_set_error(_a, EINVAL, "Invalid time flag"); return (ARCHIVE_FAILED); } if ((flag & (ARCHIVE_MATCH_MTIME | ARCHIVE_MATCH_CTIME)) == 0) { archive_set_error(_a, EINVAL, "No time flag"); return (ARCHIVE_FAILED); } /* Check a type of comparison. */ if (flag & ((~(ARCHIVE_MATCH_NEWER | ARCHIVE_MATCH_OLDER | ARCHIVE_MATCH_EQUAL)) & 0x00ff)) { archive_set_error(_a, EINVAL, "Invalid comparison flag"); return (ARCHIVE_FAILED); } if ((flag & (ARCHIVE_MATCH_NEWER | ARCHIVE_MATCH_OLDER | ARCHIVE_MATCH_EQUAL)) == 0) { archive_set_error(_a, EINVAL, "No comparison flag"); return (ARCHIVE_FAILED); } return (ARCHIVE_OK); } #define JUST_EQUAL(t) (((t) & (ARCHIVE_MATCH_EQUAL |\ ARCHIVE_MATCH_NEWER | ARCHIVE_MATCH_OLDER)) == ARCHIVE_MATCH_EQUAL) static int set_timefilter(struct archive_match *a, int timetype, time_t mtime_sec, long mtime_nsec, time_t ctime_sec, long ctime_nsec) { if (timetype & ARCHIVE_MATCH_MTIME) { if ((timetype & ARCHIVE_MATCH_NEWER) || JUST_EQUAL(timetype)) { a->newer_mtime_filter = timetype; a->newer_mtime_sec = mtime_sec; a->newer_mtime_nsec = mtime_nsec; a->setflag |= TIME_IS_SET; } if ((timetype & ARCHIVE_MATCH_OLDER) || JUST_EQUAL(timetype)) { a->older_mtime_filter = timetype; a->older_mtime_sec = mtime_sec; a->older_mtime_nsec = mtime_nsec; a->setflag |= TIME_IS_SET; } } if (timetype & ARCHIVE_MATCH_CTIME) { if ((timetype & ARCHIVE_MATCH_NEWER) || JUST_EQUAL(timetype)) { a->newer_ctime_filter = timetype; a->newer_ctime_sec = ctime_sec; a->newer_ctime_nsec = ctime_nsec; a->setflag |= TIME_IS_SET; } if ((timetype & ARCHIVE_MATCH_OLDER) || JUST_EQUAL(timetype)) { a->older_ctime_filter = timetype; a->older_ctime_sec = ctime_sec; a->older_ctime_nsec = ctime_nsec; a->setflag |= TIME_IS_SET; } } return (ARCHIVE_OK); } static int set_timefilter_date(struct archive_match *a, int timetype, const char *datestr) { time_t t; if (datestr == NULL || *datestr == '\0') { archive_set_error(&(a->archive), EINVAL, "date is empty"); return (ARCHIVE_FAILED); } t = get_date(a->now, datestr); if (t == (time_t)-1) { archive_set_error(&(a->archive), EINVAL, "invalid date string"); return (ARCHIVE_FAILED); } return set_timefilter(a, timetype, t, 0, t, 0); } static int set_timefilter_date_w(struct archive_match *a, int timetype, const wchar_t *datestr) { struct archive_string as; time_t t; if (datestr == NULL || *datestr == L'\0') { archive_set_error(&(a->archive), EINVAL, "date is empty"); return (ARCHIVE_FAILED); } archive_string_init(&as); if (archive_string_append_from_wcs(&as, datestr, wcslen(datestr)) < 0) { archive_string_free(&as); if (errno == ENOMEM) return (error_nomem(a)); archive_set_error(&(a->archive), -1, "Failed to convert WCS to MBS"); return (ARCHIVE_FAILED); } t = get_date(a->now, as.s); archive_string_free(&as); if (t == (time_t)-1) { archive_set_error(&(a->archive), EINVAL, "invalid date string"); return (ARCHIVE_FAILED); } return set_timefilter(a, timetype, t, 0, t, 0); } #if defined(_WIN32) && !defined(__CYGWIN__) #define EPOC_TIME ARCHIVE_LITERAL_ULL(116444736000000000) static int set_timefilter_find_data(struct archive_match *a, int timetype, DWORD ftLastWriteTime_dwHighDateTime, DWORD ftLastWriteTime_dwLowDateTime, DWORD ftCreationTime_dwHighDateTime, DWORD ftCreationTime_dwLowDateTime) { ULARGE_INTEGER utc; time_t ctime_sec, mtime_sec; long ctime_ns, mtime_ns; utc.HighPart = ftCreationTime_dwHighDateTime; utc.LowPart = ftCreationTime_dwLowDateTime; if (utc.QuadPart >= EPOC_TIME) { utc.QuadPart -= EPOC_TIME; ctime_sec = (time_t)(utc.QuadPart / 10000000); ctime_ns = (long)(utc.QuadPart % 10000000) * 100; } else { ctime_sec = 0; ctime_ns = 0; } utc.HighPart = ftLastWriteTime_dwHighDateTime; utc.LowPart = ftLastWriteTime_dwLowDateTime; if (utc.QuadPart >= EPOC_TIME) { utc.QuadPart -= EPOC_TIME; mtime_sec = (time_t)(utc.QuadPart / 10000000); mtime_ns = (long)(utc.QuadPart % 10000000) * 100; } else { mtime_sec = 0; mtime_ns = 0; } return set_timefilter(a, timetype, mtime_sec, mtime_ns, ctime_sec, ctime_ns); } static int set_timefilter_pathname_mbs(struct archive_match *a, int timetype, const char *path) { /* NOTE: stat() on Windows cannot handle nano seconds. */ HANDLE h; WIN32_FIND_DATAA d; if (path == NULL || *path == '\0') { archive_set_error(&(a->archive), EINVAL, "pathname is empty"); return (ARCHIVE_FAILED); } h = FindFirstFileA(path, &d); if (h == INVALID_HANDLE_VALUE) { la_dosmaperr(GetLastError()); archive_set_error(&(a->archive), errno, "Failed to FindFirstFileA"); return (ARCHIVE_FAILED); } FindClose(h); return set_timefilter_find_data(a, timetype, d.ftLastWriteTime.dwHighDateTime, d.ftLastWriteTime.dwLowDateTime, d.ftCreationTime.dwHighDateTime, d.ftCreationTime.dwLowDateTime); } static int set_timefilter_pathname_wcs(struct archive_match *a, int timetype, const wchar_t *path) { HANDLE h; WIN32_FIND_DATAW d; if (path == NULL || *path == L'\0') { archive_set_error(&(a->archive), EINVAL, "pathname is empty"); return (ARCHIVE_FAILED); } h = FindFirstFileW(path, &d); if (h == INVALID_HANDLE_VALUE) { la_dosmaperr(GetLastError()); archive_set_error(&(a->archive), errno, "Failed to FindFirstFile"); return (ARCHIVE_FAILED); } FindClose(h); return set_timefilter_find_data(a, timetype, d.ftLastWriteTime.dwHighDateTime, d.ftLastWriteTime.dwLowDateTime, d.ftCreationTime.dwHighDateTime, d.ftCreationTime.dwLowDateTime); } #else /* _WIN32 && !__CYGWIN__ */ static int set_timefilter_stat(struct archive_match *a, int timetype, struct stat *st) { struct archive_entry *ae; time_t ctime_sec, mtime_sec; long ctime_ns, mtime_ns; ae = archive_entry_new(); if (ae == NULL) return (error_nomem(a)); archive_entry_copy_stat(ae, st); ctime_sec = archive_entry_ctime(ae); ctime_ns = archive_entry_ctime_nsec(ae); mtime_sec = archive_entry_mtime(ae); mtime_ns = archive_entry_mtime_nsec(ae); archive_entry_free(ae); return set_timefilter(a, timetype, mtime_sec, mtime_ns, ctime_sec, ctime_ns); } static int set_timefilter_pathname_mbs(struct archive_match *a, int timetype, const char *path) { struct stat st; if (path == NULL || *path == '\0') { archive_set_error(&(a->archive), EINVAL, "pathname is empty"); return (ARCHIVE_FAILED); } if (la_stat(path, &st) != 0) { archive_set_error(&(a->archive), errno, "Failed to stat()"); return (ARCHIVE_FAILED); } return (set_timefilter_stat(a, timetype, &st)); } static int set_timefilter_pathname_wcs(struct archive_match *a, int timetype, const wchar_t *path) { struct archive_string as; int r; if (path == NULL || *path == L'\0') { archive_set_error(&(a->archive), EINVAL, "pathname is empty"); return (ARCHIVE_FAILED); } /* Convert WCS filename to MBS filename. */ archive_string_init(&as); if (archive_string_append_from_wcs(&as, path, wcslen(path)) < 0) { archive_string_free(&as); if (errno == ENOMEM) return (error_nomem(a)); archive_set_error(&(a->archive), -1, "Failed to convert WCS to MBS"); return (ARCHIVE_FAILED); } r = set_timefilter_pathname_mbs(a, timetype, as.s); archive_string_free(&as); return (r); } #endif /* _WIN32 && !__CYGWIN__ */ /* * Call back functions for archive_rb. */ static int cmp_node_mbs(const struct archive_rb_node *n1, const struct archive_rb_node *n2) { struct match_file *f1 = (struct match_file *)(uintptr_t)n1; struct match_file *f2 = (struct match_file *)(uintptr_t)n2; const char *p1, *p2; archive_mstring_get_mbs(NULL, &(f1->pathname), &p1); archive_mstring_get_mbs(NULL, &(f2->pathname), &p2); if (p1 == NULL) return (1); if (p2 == NULL) return (-1); return (strcmp(p1, p2)); } static int cmp_key_mbs(const struct archive_rb_node *n, const void *key) { struct match_file *f = (struct match_file *)(uintptr_t)n; const char *p; archive_mstring_get_mbs(NULL, &(f->pathname), &p); if (p == NULL) return (-1); return (strcmp(p, (const char *)key)); } static int cmp_node_wcs(const struct archive_rb_node *n1, const struct archive_rb_node *n2) { struct match_file *f1 = (struct match_file *)(uintptr_t)n1; struct match_file *f2 = (struct match_file *)(uintptr_t)n2; const wchar_t *p1, *p2; archive_mstring_get_wcs(NULL, &(f1->pathname), &p1); archive_mstring_get_wcs(NULL, &(f2->pathname), &p2); if (p1 == NULL) return (1); if (p2 == NULL) return (-1); return (wcscmp(p1, p2)); } static int cmp_key_wcs(const struct archive_rb_node *n, const void *key) { struct match_file *f = (struct match_file *)(uintptr_t)n; const wchar_t *p; archive_mstring_get_wcs(NULL, &(f->pathname), &p); if (p == NULL) return (-1); return (wcscmp(p, (const wchar_t *)key)); } static void entry_list_init(struct entry_list *list) { list->first = NULL; list->last = &(list->first); list->count = 0; } static void entry_list_free(struct entry_list *list) { struct match_file *p, *q; for (p = list->first; p != NULL; ) { q = p; p = p->next; archive_mstring_clean(&(q->pathname)); free(q); } } static void entry_list_add(struct entry_list *list, struct match_file *file) { *list->last = file; list->last = &(file->next); list->count++; } static int add_entry(struct archive_match *a, int flag, struct archive_entry *entry) { struct match_file *f; const void *pathname; int r; f = calloc(1, sizeof(*f)); if (f == NULL) return (error_nomem(a)); #if defined(_WIN32) && !defined(__CYGWIN__) pathname = archive_entry_pathname_w(entry); if (pathname == NULL) { free(f); archive_set_error(&(a->archive), EINVAL, "pathname is NULL"); return (ARCHIVE_FAILED); } archive_mstring_copy_wcs(&(f->pathname), pathname); a->exclusion_tree.rbt_ops = &rb_ops_wcs; #else (void)rb_ops_wcs; pathname = archive_entry_pathname(entry); if (pathname == NULL) { free(f); archive_set_error(&(a->archive), EINVAL, "pathname is NULL"); return (ARCHIVE_FAILED); } archive_mstring_copy_mbs(&(f->pathname), pathname); a->exclusion_tree.rbt_ops = &rb_ops_mbs; #endif f->flag = flag; f->mtime_sec = archive_entry_mtime(entry); f->mtime_nsec = archive_entry_mtime_nsec(entry); f->ctime_sec = archive_entry_ctime(entry); f->ctime_nsec = archive_entry_ctime_nsec(entry); r = __archive_rb_tree_insert_node(&(a->exclusion_tree), &(f->node)); if (!r) { struct match_file *f2; /* Get the duplicated file. */ f2 = (struct match_file *)__archive_rb_tree_find_node( &(a->exclusion_tree), pathname); /* * We always overwrite comparison condition. * If you do not want to overwrite it, you should not * call archive_match_exclude_entry(). We cannot know * what behavior you really expect since overwriting * condition might be different with the flag. */ if (f2 != NULL) { f2->flag = f->flag; f2->mtime_sec = f->mtime_sec; f2->mtime_nsec = f->mtime_nsec; f2->ctime_sec = f->ctime_sec; f2->ctime_nsec = f->ctime_nsec; } /* Release the duplicated file. */ archive_mstring_clean(&(f->pathname)); free(f); return (ARCHIVE_OK); } entry_list_add(&(a->exclusion_entry_list), f); a->setflag |= TIME_IS_SET; return (ARCHIVE_OK); } /* * Test if entry is excluded by its timestamp. */ static int time_excluded(struct archive_match *a, struct archive_entry *entry) { struct match_file *f; const void *pathname; time_t sec; long nsec; /* * If this file/dir is excluded by a time comparison, skip it. */ if (a->newer_ctime_filter) { /* If ctime is not set, use mtime instead. */ if (archive_entry_ctime_is_set(entry)) sec = archive_entry_ctime(entry); else sec = archive_entry_mtime(entry); if (sec < a->newer_ctime_sec) return (1); /* Too old, skip it. */ if (sec == a->newer_ctime_sec) { if (archive_entry_ctime_is_set(entry)) nsec = archive_entry_ctime_nsec(entry); else nsec = archive_entry_mtime_nsec(entry); if (nsec < a->newer_ctime_nsec) return (1); /* Too old, skip it. */ if (nsec == a->newer_ctime_nsec && (a->newer_ctime_filter & ARCHIVE_MATCH_EQUAL) == 0) return (1); /* Equal, skip it. */ } } if (a->older_ctime_filter) { /* If ctime is not set, use mtime instead. */ if (archive_entry_ctime_is_set(entry)) sec = archive_entry_ctime(entry); else sec = archive_entry_mtime(entry); if (sec > a->older_ctime_sec) return (1); /* Too new, skip it. */ if (sec == a->older_ctime_sec) { if (archive_entry_ctime_is_set(entry)) nsec = archive_entry_ctime_nsec(entry); else nsec = archive_entry_mtime_nsec(entry); if (nsec > a->older_ctime_nsec) return (1); /* Too new, skip it. */ if (nsec == a->older_ctime_nsec && (a->older_ctime_filter & ARCHIVE_MATCH_EQUAL) == 0) return (1); /* Equal, skip it. */ } } if (a->newer_mtime_filter) { sec = archive_entry_mtime(entry); if (sec < a->newer_mtime_sec) return (1); /* Too old, skip it. */ if (sec == a->newer_mtime_sec) { nsec = archive_entry_mtime_nsec(entry); if (nsec < a->newer_mtime_nsec) return (1); /* Too old, skip it. */ if (nsec == a->newer_mtime_nsec && (a->newer_mtime_filter & ARCHIVE_MATCH_EQUAL) == 0) return (1); /* Equal, skip it. */ } } if (a->older_mtime_filter) { sec = archive_entry_mtime(entry); if (sec > a->older_mtime_sec) return (1); /* Too new, skip it. */ nsec = archive_entry_mtime_nsec(entry); if (sec == a->older_mtime_sec) { if (nsec > a->older_mtime_nsec) return (1); /* Too new, skip it. */ if (nsec == a->older_mtime_nsec && (a->older_mtime_filter & ARCHIVE_MATCH_EQUAL) == 0) return (1); /* Equal, skip it. */ } } /* If there is no exclusion list, include the file. */ if (a->exclusion_entry_list.count == 0) return (0); #if defined(_WIN32) && !defined(__CYGWIN__) pathname = archive_entry_pathname_w(entry); a->exclusion_tree.rbt_ops = &rb_ops_wcs; #else (void)rb_ops_wcs; pathname = archive_entry_pathname(entry); a->exclusion_tree.rbt_ops = &rb_ops_mbs; #endif if (pathname == NULL) return (0); f = (struct match_file *)__archive_rb_tree_find_node( &(a->exclusion_tree), pathname); /* If the file wasn't rejected, include it. */ if (f == NULL) return (0); if (f->flag & ARCHIVE_MATCH_CTIME) { sec = archive_entry_ctime(entry); if (f->ctime_sec > sec) { if (f->flag & ARCHIVE_MATCH_OLDER) return (1); } else if (f->ctime_sec < sec) { if (f->flag & ARCHIVE_MATCH_NEWER) return (1); } else { nsec = archive_entry_ctime_nsec(entry); if (f->ctime_nsec > nsec) { if (f->flag & ARCHIVE_MATCH_OLDER) return (1); } else if (f->ctime_nsec < nsec) { if (f->flag & ARCHIVE_MATCH_NEWER) return (1); } else if (f->flag & ARCHIVE_MATCH_EQUAL) return (1); } } if (f->flag & ARCHIVE_MATCH_MTIME) { sec = archive_entry_mtime(entry); if (f->mtime_sec > sec) { if (f->flag & ARCHIVE_MATCH_OLDER) return (1); } else if (f->mtime_sec < sec) { if (f->flag & ARCHIVE_MATCH_NEWER) return (1); } else { nsec = archive_entry_mtime_nsec(entry); if (f->mtime_nsec > nsec) { if (f->flag & ARCHIVE_MATCH_OLDER) return (1); } else if (f->mtime_nsec < nsec) { if (f->flag & ARCHIVE_MATCH_NEWER) return (1); } else if (f->flag & ARCHIVE_MATCH_EQUAL) return (1); } } return (0); } /* * Utility functions to manage inclusion owners */ int archive_match_include_uid(struct archive *_a, la_int64_t uid) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_uid"); a = (struct archive_match *)_a; return (add_owner_id(a, &(a->inclusion_uids), uid)); } int archive_match_include_gid(struct archive *_a, la_int64_t gid) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_gid"); a = (struct archive_match *)_a; return (add_owner_id(a, &(a->inclusion_gids), gid)); } int archive_match_include_uname(struct archive *_a, const char *uname) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_uname"); a = (struct archive_match *)_a; return (add_owner_name(a, &(a->inclusion_unames), 1, uname)); } int archive_match_include_uname_w(struct archive *_a, const wchar_t *uname) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_uname_w"); a = (struct archive_match *)_a; return (add_owner_name(a, &(a->inclusion_unames), 0, uname)); } int archive_match_include_gname(struct archive *_a, const char *gname) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_gname"); a = (struct archive_match *)_a; return (add_owner_name(a, &(a->inclusion_gnames), 1, gname)); } int archive_match_include_gname_w(struct archive *_a, const wchar_t *gname) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_include_gname_w"); a = (struct archive_match *)_a; return (add_owner_name(a, &(a->inclusion_gnames), 0, gname)); } /* * Test function for owner(uid, gid, uname, gname). * * Returns 1 if archive entry is excluded. * Returns 0 if archive entry is not excluded. * Returns <0 if something error happened. */ int archive_match_owner_excluded(struct archive *_a, struct archive_entry *entry) { struct archive_match *a; archive_check_magic(_a, ARCHIVE_MATCH_MAGIC, ARCHIVE_STATE_NEW, "archive_match_id_excluded_ae"); a = (struct archive_match *)_a; if (entry == NULL) { archive_set_error(&(a->archive), EINVAL, "entry is NULL"); return (ARCHIVE_FAILED); } /* If we don't have inclusion id set at all, the entry is always * not excluded. */ if ((a->setflag & ID_IS_SET) == 0) return (0); return (owner_excluded(a, entry)); } static int add_owner_id(struct archive_match *a, struct id_array *ids, int64_t id) { unsigned i; if (ids->count + 1 >= ids->size) { void *p; if (ids->size == 0) ids->size = 8; else ids->size *= 2; p = realloc(ids->ids, sizeof(*ids->ids) * ids->size); if (p == NULL) return (error_nomem(a)); ids->ids = (int64_t *)p; } /* Find an insert point. */ for (i = 0; i < ids->count; i++) { if (ids->ids[i] >= id) break; } /* Add owner id. */ if (i == ids->count) ids->ids[ids->count++] = id; else if (ids->ids[i] != id) { memmove(&(ids->ids[i+1]), &(ids->ids[i]), (ids->count - i) * sizeof(ids->ids[0])); ids->ids[i] = id; ids->count++; } a->setflag |= ID_IS_SET; return (ARCHIVE_OK); } static int match_owner_id(struct id_array *ids, int64_t id) { unsigned b, m, t; t = 0; b = (unsigned)ids->count; while (t < b) { m = (t + b)>>1; if (ids->ids[m] == id) return (1); if (ids->ids[m] < id) t = m + 1; else b = m; } return (0); } static int add_owner_name(struct archive_match *a, struct match_list *list, int mbs, const void *name) { struct match *match; match = calloc(1, sizeof(*match)); if (match == NULL) return (error_nomem(a)); if (mbs) archive_mstring_copy_mbs(&(match->pattern), name); else archive_mstring_copy_wcs(&(match->pattern), name); match_list_add(list, match); a->setflag |= ID_IS_SET; return (ARCHIVE_OK); } #if !defined(_WIN32) || defined(__CYGWIN__) static int match_owner_name_mbs(struct archive_match *a, struct match_list *list, const char *name) { struct match *m; const char *p; if (name == NULL || *name == '\0') return (0); for (m = list->first; m; m = m->next) { if (archive_mstring_get_mbs(&(a->archive), &(m->pattern), &p) < 0 && errno == ENOMEM) return (error_nomem(a)); if (p != NULL && strcmp(p, name) == 0) { - m->matches++; + m->matched = 1; return (1); } } return (0); } #else static int match_owner_name_wcs(struct archive_match *a, struct match_list *list, const wchar_t *name) { struct match *m; const wchar_t *p; if (name == NULL || *name == L'\0') return (0); for (m = list->first; m; m = m->next) { if (archive_mstring_get_wcs(&(a->archive), &(m->pattern), &p) < 0 && errno == ENOMEM) return (error_nomem(a)); if (p != NULL && wcscmp(p, name) == 0) { - m->matches++; + m->matched = 1; return (1); } } return (0); } #endif /* * Test if entry is excluded by uid, gid, uname or gname. */ static int owner_excluded(struct archive_match *a, struct archive_entry *entry) { int r; if (a->inclusion_uids.count) { if (!match_owner_id(&(a->inclusion_uids), archive_entry_uid(entry))) return (1); } if (a->inclusion_gids.count) { if (!match_owner_id(&(a->inclusion_gids), archive_entry_gid(entry))) return (1); } if (a->inclusion_unames.count) { #if defined(_WIN32) && !defined(__CYGWIN__) r = match_owner_name_wcs(a, &(a->inclusion_unames), archive_entry_uname_w(entry)); #else r = match_owner_name_mbs(a, &(a->inclusion_unames), archive_entry_uname(entry)); #endif if (!r) return (1); else if (r < 0) return (r); } if (a->inclusion_gnames.count) { #if defined(_WIN32) && !defined(__CYGWIN__) r = match_owner_name_wcs(a, &(a->inclusion_gnames), archive_entry_gname_w(entry)); #else r = match_owner_name_mbs(a, &(a->inclusion_gnames), archive_entry_gname(entry)); #endif if (!r) return (1); else if (r < 0) return (r); } return (0); } diff --git a/contrib/libarchive/libarchive/archive_read_support_format_lha.c b/contrib/libarchive/libarchive/archive_read_support_format_lha.c index 4d6290ac33bb..ae5a1d7d668e 100644 --- a/contrib/libarchive/libarchive/archive_read_support_format_lha.c +++ b/contrib/libarchive/libarchive/archive_read_support_format_lha.c @@ -1,2919 +1,2921 @@ /*- * Copyright (c) 2008-2014 Michihiro NAKAJIMA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "archive_platform.h" #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_LIMITS_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_STRING_H #include #endif #include "archive.h" #include "archive_entry.h" #include "archive_entry_locale.h" #include "archive_private.h" #include "archive_read_private.h" #include "archive_endian.h" #define MAXMATCH 256 /* Maximum match length. */ #define MINMATCH 3 /* Minimum match length. */ /* * Literal table format: * +0 +256 +510 * +---------------+-------------------------+ * | literal code | match length | * | 0 ... 255 | MINMATCH ... MAXMATCH | * +---------------+-------------------------+ * <--- LT_BITLEN_SIZE ---> */ /* Literal table size. */ #define LT_BITLEN_SIZE (UCHAR_MAX + 1 + MAXMATCH - MINMATCH + 1) /* Position table size. * Note: this used for both position table and pre literal table.*/ #define PT_BITLEN_SIZE (3 + 16) struct lzh_dec { /* Decoding status. */ int state; /* * Window to see last 8Ki(lh5),32Ki(lh6),64Ki(lh7) bytes of decoded * data. */ int w_size; int w_mask; /* Window buffer, which is a loop buffer. */ unsigned char *w_buff; /* The insert position to the window. */ int w_pos; /* The position where we can copy decoded code from the window. */ int copy_pos; /* The length how many bytes we can copy decoded code from * the window. */ int copy_len; /* * Bit stream reader. */ struct lzh_br { #define CACHE_TYPE uint64_t #define CACHE_BITS (8 * sizeof(CACHE_TYPE)) /* Cache buffer. */ CACHE_TYPE cache_buffer; /* Indicates how many bits avail in cache_buffer. */ int cache_avail; } br; /* * Huffman coding. */ struct huffman { int len_size; int len_avail; int len_bits; int freq[17]; unsigned char *bitlen; /* * Use a index table. It's faster than searching a huffman * coding tree, which is a binary tree. But a use of a large * index table causes L1 cache read miss many times. */ #define HTBL_BITS 10 int max_bits; int shift_bits; int tbl_bits; int tree_used; int tree_avail; /* Direct access table. */ uint16_t *tbl; /* Binary tree table for extra bits over the direct access. */ struct htree_t { uint16_t left; uint16_t right; } *tree; } lt, pt; int blocks_avail; int pos_pt_len_size; int pos_pt_len_bits; int literal_pt_len_size; int literal_pt_len_bits; int reading_position; int loop; int error; }; struct lzh_stream { const unsigned char *next_in; int avail_in; int64_t total_in; const unsigned char *ref_ptr; int avail_out; int64_t total_out; struct lzh_dec *ds; }; struct lha { /* entry_bytes_remaining is the number of bytes we expect. */ int64_t entry_offset; int64_t entry_bytes_remaining; int64_t entry_unconsumed; uint16_t entry_crc_calculated; size_t header_size; /* header size */ unsigned char level; /* header level */ char method[3]; /* compress type */ int64_t compsize; /* compressed data size */ int64_t origsize; /* original file size */ int setflag; #define BIRTHTIME_IS_SET 1 #define ATIME_IS_SET 2 #define UNIX_MODE_IS_SET 4 #define CRC_IS_SET 8 time_t birthtime; long birthtime_tv_nsec; time_t mtime; long mtime_tv_nsec; time_t atime; long atime_tv_nsec; mode_t mode; int64_t uid; int64_t gid; struct archive_string uname; struct archive_string gname; uint16_t header_crc; uint16_t crc; /* dirname and filename could be in different codepages */ struct archive_string_conv *sconv_dir; struct archive_string_conv *sconv_fname; struct archive_string_conv *opt_sconv; struct archive_string dirname; struct archive_string filename; struct archive_wstring ws; unsigned char dos_attr; /* Flag to mark progress that an archive was read their first header.*/ char found_first_header; /* Flag to mark that indicates an empty directory. */ char directory; /* Flags to mark progress of decompression. */ char decompress_init; char end_of_entry; char end_of_entry_cleanup; char entry_is_compressed; char format_name[64]; struct lzh_stream strm; }; /* * LHA header common member offset. */ #define H_METHOD_OFFSET 2 /* Compress type. */ #define H_ATTR_OFFSET 19 /* DOS attribute. */ #define H_LEVEL_OFFSET 20 /* Header Level. */ #define H_SIZE 22 /* Minimum header size. */ static int archive_read_format_lha_bid(struct archive_read *, int); static int archive_read_format_lha_options(struct archive_read *, const char *, const char *); static int archive_read_format_lha_read_header(struct archive_read *, struct archive_entry *); static int archive_read_format_lha_read_data(struct archive_read *, const void **, size_t *, int64_t *); static int archive_read_format_lha_read_data_skip(struct archive_read *); static int archive_read_format_lha_cleanup(struct archive_read *); static void lha_replace_path_separator(struct lha *, struct archive_entry *); static int lha_read_file_header_0(struct archive_read *, struct lha *); static int lha_read_file_header_1(struct archive_read *, struct lha *); static int lha_read_file_header_2(struct archive_read *, struct lha *); static int lha_read_file_header_3(struct archive_read *, struct lha *); static int lha_read_file_extended_header(struct archive_read *, struct lha *, uint16_t *, int, size_t, size_t *); static size_t lha_check_header_format(const void *); static int lha_skip_sfx(struct archive_read *); static time_t lha_dos_time(const unsigned char *); static time_t lha_win_time(uint64_t, long *); static unsigned char lha_calcsum(unsigned char, const void *, int, size_t); static int lha_parse_linkname(struct archive_wstring *, struct archive_wstring *); static int lha_read_data_none(struct archive_read *, const void **, size_t *, int64_t *); static int lha_read_data_lzh(struct archive_read *, const void **, size_t *, int64_t *); static void lha_crc16_init(void); static uint16_t lha_crc16(uint16_t, const void *, size_t); static int lzh_decode_init(struct lzh_stream *, const char *); static void lzh_decode_free(struct lzh_stream *); static int lzh_decode(struct lzh_stream *, int); static int lzh_br_fillup(struct lzh_stream *, struct lzh_br *); static int lzh_huffman_init(struct huffman *, size_t, int); static void lzh_huffman_free(struct huffman *); static int lzh_read_pt_bitlen(struct lzh_stream *, int start, int end); static int lzh_make_fake_table(struct huffman *, uint16_t); static int lzh_make_huffman_table(struct huffman *); static inline int lzh_decode_huffman(struct huffman *, unsigned); static int lzh_decode_huffman_tree(struct huffman *, unsigned, int); int archive_read_support_format_lha(struct archive *_a) { struct archive_read *a = (struct archive_read *)_a; struct lha *lha; int r; archive_check_magic(_a, ARCHIVE_READ_MAGIC, ARCHIVE_STATE_NEW, "archive_read_support_format_lha"); lha = (struct lha *)calloc(1, sizeof(*lha)); if (lha == NULL) { archive_set_error(&a->archive, ENOMEM, "Can't allocate lha data"); return (ARCHIVE_FATAL); } archive_string_init(&lha->ws); r = __archive_read_register_format(a, lha, "lha", archive_read_format_lha_bid, archive_read_format_lha_options, archive_read_format_lha_read_header, archive_read_format_lha_read_data, archive_read_format_lha_read_data_skip, NULL, archive_read_format_lha_cleanup, NULL, NULL); if (r != ARCHIVE_OK) free(lha); return (ARCHIVE_OK); } static size_t lha_check_header_format(const void *h) { const unsigned char *p = h; size_t next_skip_bytes; switch (p[H_METHOD_OFFSET+3]) { /* * "-lh0-" ... "-lh7-" "-lhd-" * "-lzs-" "-lz5-" */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case 'd': case 's': next_skip_bytes = 4; /* b0 == 0 means the end of an LHa archive file. */ if (p[0] == 0) break; if (p[H_METHOD_OFFSET] != '-' || p[H_METHOD_OFFSET+1] != 'l' || p[H_METHOD_OFFSET+4] != '-') break; if (p[H_METHOD_OFFSET+2] == 'h') { /* "-lh?-" */ if (p[H_METHOD_OFFSET+3] == 's') break; if (p[H_LEVEL_OFFSET] == 0) return (0); if (p[H_LEVEL_OFFSET] <= 3 && p[H_ATTR_OFFSET] == 0x20) return (0); } if (p[H_METHOD_OFFSET+2] == 'z') { /* LArc extensions: -lzs-,-lz4- and -lz5- */ if (p[H_LEVEL_OFFSET] != 0) break; if (p[H_METHOD_OFFSET+3] == 's' || p[H_METHOD_OFFSET+3] == '4' || p[H_METHOD_OFFSET+3] == '5') return (0); } break; case 'h': next_skip_bytes = 1; break; case 'z': next_skip_bytes = 1; break; case 'l': next_skip_bytes = 2; break; case '-': next_skip_bytes = 3; break; default : next_skip_bytes = 4; break; } return (next_skip_bytes); } static int archive_read_format_lha_bid(struct archive_read *a, int best_bid) { const char *p; const void *buff; ssize_t bytes_avail, offset, window; size_t next; /* If there's already a better bid than we can ever make, don't bother testing. */ if (best_bid > 30) return (-1); if ((p = __archive_read_ahead(a, H_SIZE, NULL)) == NULL) return (-1); if (lha_check_header_format(p) == 0) return (30); if (p[0] == 'M' && p[1] == 'Z') { /* PE file */ offset = 0; window = 4096; while (offset < (1024 * 20)) { buff = __archive_read_ahead(a, offset + window, &bytes_avail); if (buff == NULL) { /* Remaining bytes are less than window. */ window >>= 1; if (window < (H_SIZE + 3)) return (0); continue; } p = (const char *)buff + offset; while (p + H_SIZE < (const char *)buff + bytes_avail) { if ((next = lha_check_header_format(p)) == 0) return (30); p += next; } offset = p - (const char *)buff; } } return (0); } static int archive_read_format_lha_options(struct archive_read *a, const char *key, const char *val) { struct lha *lha; int ret = ARCHIVE_FAILED; lha = (struct lha *)(a->format->data); if (strcmp(key, "hdrcharset") == 0) { if (val == NULL || val[0] == 0) archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "lha: hdrcharset option needs a character-set name"); else { lha->opt_sconv = archive_string_conversion_from_charset( &a->archive, val, 0); if (lha->opt_sconv != NULL) ret = ARCHIVE_OK; else ret = ARCHIVE_FATAL; } return (ret); } /* Note: The "warn" return is just to inform the options * supervisor that we didn't handle it. It will generate * a suitable error if no one used this option. */ return (ARCHIVE_WARN); } static int lha_skip_sfx(struct archive_read *a) { const void *h; const char *p, *q; size_t next, skip; ssize_t bytes, window; window = 4096; for (;;) { h = __archive_read_ahead(a, window, &bytes); if (h == NULL) { /* Remaining bytes are less than window. */ window >>= 1; if (window < (H_SIZE + 3)) goto fatal; continue; } if (bytes < H_SIZE) goto fatal; p = h; q = p + bytes; /* * Scan ahead until we find something that looks * like the lha header. */ while (p + H_SIZE < q) { if ((next = lha_check_header_format(p)) == 0) { skip = p - (const char *)h; __archive_read_consume(a, skip); return (ARCHIVE_OK); } p += next; } skip = p - (const char *)h; __archive_read_consume(a, skip); } fatal: archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Couldn't find out LHa header"); return (ARCHIVE_FATAL); } static int truncated_error(struct archive_read *a) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Truncated LHa header"); return (ARCHIVE_FATAL); } static int archive_read_format_lha_read_header(struct archive_read *a, struct archive_entry *entry) { struct archive_wstring linkname; struct archive_wstring pathname; struct lha *lha; const unsigned char *p; const char *signature; int err; struct archive_mstring conv_buffer; const wchar_t *conv_buffer_p; lha_crc16_init(); a->archive.archive_format = ARCHIVE_FORMAT_LHA; if (a->archive.archive_format_name == NULL) a->archive.archive_format_name = "lha"; lha = (struct lha *)(a->format->data); lha->decompress_init = 0; lha->end_of_entry = 0; lha->end_of_entry_cleanup = 0; lha->entry_unconsumed = 0; if ((p = __archive_read_ahead(a, H_SIZE, NULL)) == NULL) { /* * LHa archiver added 0 to the tail of its archive file as * the mark of the end of the archive. */ signature = __archive_read_ahead(a, sizeof(signature[0]), NULL); if (signature == NULL || signature[0] == 0) return (ARCHIVE_EOF); return (truncated_error(a)); } signature = (const char *)p; if (lha->found_first_header == 0 && signature[0] == 'M' && signature[1] == 'Z') { /* This is an executable? Must be self-extracting... */ err = lha_skip_sfx(a); if (err < ARCHIVE_WARN) return (err); if ((p = __archive_read_ahead(a, sizeof(*p), NULL)) == NULL) return (truncated_error(a)); signature = (const char *)p; } /* signature[0] == 0 means the end of an LHa archive file. */ if (signature[0] == 0) return (ARCHIVE_EOF); /* * Check the header format and method type. */ if (lha_check_header_format(p) != 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Bad LHa file"); return (ARCHIVE_FATAL); } /* We've found the first header. */ lha->found_first_header = 1; /* Set a default value and common data */ lha->header_size = 0; lha->level = p[H_LEVEL_OFFSET]; lha->method[0] = p[H_METHOD_OFFSET+1]; lha->method[1] = p[H_METHOD_OFFSET+2]; lha->method[2] = p[H_METHOD_OFFSET+3]; if (memcmp(lha->method, "lhd", 3) == 0) lha->directory = 1; else lha->directory = 0; if (memcmp(lha->method, "lh0", 3) == 0 || memcmp(lha->method, "lz4", 3) == 0) lha->entry_is_compressed = 0; else lha->entry_is_compressed = 1; lha->compsize = 0; lha->origsize = 0; lha->setflag = 0; lha->birthtime = 0; lha->birthtime_tv_nsec = 0; lha->mtime = 0; lha->mtime_tv_nsec = 0; lha->atime = 0; lha->atime_tv_nsec = 0; lha->mode = (lha->directory)? 0777 : 0666; lha->uid = 0; lha->gid = 0; archive_string_empty(&lha->dirname); archive_string_empty(&lha->filename); lha->dos_attr = 0; if (lha->opt_sconv != NULL) { lha->sconv_dir = lha->opt_sconv; lha->sconv_fname = lha->opt_sconv; } else { lha->sconv_dir = NULL; lha->sconv_fname = NULL; } switch (p[H_LEVEL_OFFSET]) { case 0: err = lha_read_file_header_0(a, lha); break; case 1: err = lha_read_file_header_1(a, lha); break; case 2: err = lha_read_file_header_2(a, lha); break; case 3: err = lha_read_file_header_3(a, lha); break; default: archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Unsupported LHa header level %d", p[H_LEVEL_OFFSET]); err = ARCHIVE_FATAL; break; } if (err < ARCHIVE_WARN) return (err); if (!lha->directory && archive_strlen(&lha->filename) == 0) /* The filename has not been set */ return (truncated_error(a)); /* * Make a pathname from a dirname and a filename, after converting to Unicode. * This is because codepages might differ between dirname and filename. */ archive_string_init(&pathname); archive_string_init(&linkname); archive_string_init(&conv_buffer.aes_mbs); archive_string_init(&conv_buffer.aes_mbs_in_locale); archive_string_init(&conv_buffer.aes_utf8); archive_string_init(&conv_buffer.aes_wcs); if (0 != archive_mstring_copy_mbs_len_l(&conv_buffer, lha->dirname.s, lha->dirname.length, lha->sconv_dir)) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Pathname cannot be converted " "from %s to Unicode.", archive_string_conversion_charset_name(lha->sconv_dir)); err = ARCHIVE_FATAL; } else if (0 != archive_mstring_get_wcs(&a->archive, &conv_buffer, &conv_buffer_p)) err = ARCHIVE_FATAL; if (err == ARCHIVE_FATAL) { archive_mstring_clean(&conv_buffer); archive_wstring_free(&pathname); archive_wstring_free(&linkname); return (err); } archive_wstring_copy(&pathname, &conv_buffer.aes_wcs); archive_string_empty(&conv_buffer.aes_mbs); archive_string_empty(&conv_buffer.aes_mbs_in_locale); archive_string_empty(&conv_buffer.aes_utf8); archive_wstring_empty(&conv_buffer.aes_wcs); if (0 != archive_mstring_copy_mbs_len_l(&conv_buffer, lha->filename.s, lha->filename.length, lha->sconv_fname)) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Pathname cannot be converted " "from %s to Unicode.", archive_string_conversion_charset_name(lha->sconv_fname)); err = ARCHIVE_FATAL; } else if (0 != archive_mstring_get_wcs(&a->archive, &conv_buffer, &conv_buffer_p)) err = ARCHIVE_FATAL; if (err == ARCHIVE_FATAL) { archive_mstring_clean(&conv_buffer); archive_wstring_free(&pathname); archive_wstring_free(&linkname); return (err); } archive_wstring_concat(&pathname, &conv_buffer.aes_wcs); archive_mstring_clean(&conv_buffer); if ((lha->mode & AE_IFMT) == AE_IFLNK) { /* * Extract the symlink-name if it's included in the pathname. */ if (!lha_parse_linkname(&linkname, &pathname)) { /* We couldn't get the symlink-name. */ archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Unknown symlink-name"); archive_wstring_free(&pathname); archive_wstring_free(&linkname); return (ARCHIVE_FAILED); } } else { /* * Make sure a file-type is set. * The mode has been overridden if it is in the extended data. */ lha->mode = (lha->mode & ~AE_IFMT) | ((lha->directory)? AE_IFDIR: AE_IFREG); } if ((lha->setflag & UNIX_MODE_IS_SET) == 0 && (lha->dos_attr & 1) != 0) lha->mode &= ~(0222);/* read only. */ /* * Set basic file parameters. */ archive_entry_copy_pathname_w(entry, pathname.s); archive_wstring_free(&pathname); if (archive_strlen(&linkname) > 0) { archive_entry_copy_symlink_w(entry, linkname.s); } else archive_entry_set_symlink(entry, NULL); archive_wstring_free(&linkname); /* * When a header level is 0, there is a possibility that * a pathname and a symlink has '\' character, a directory * separator in DOS/Windows. So we should convert it to '/'. */ if (p[H_LEVEL_OFFSET] == 0) lha_replace_path_separator(lha, entry); archive_entry_set_mode(entry, lha->mode); archive_entry_set_uid(entry, lha->uid); archive_entry_set_gid(entry, lha->gid); if (archive_strlen(&lha->uname) > 0) archive_entry_set_uname(entry, lha->uname.s); if (archive_strlen(&lha->gname) > 0) archive_entry_set_gname(entry, lha->gname.s); if (lha->setflag & BIRTHTIME_IS_SET) { archive_entry_set_birthtime(entry, lha->birthtime, lha->birthtime_tv_nsec); archive_entry_set_ctime(entry, lha->birthtime, lha->birthtime_tv_nsec); } else { archive_entry_unset_birthtime(entry); archive_entry_unset_ctime(entry); } archive_entry_set_mtime(entry, lha->mtime, lha->mtime_tv_nsec); if (lha->setflag & ATIME_IS_SET) archive_entry_set_atime(entry, lha->atime, lha->atime_tv_nsec); else archive_entry_unset_atime(entry); if (lha->directory || archive_entry_symlink(entry) != NULL) archive_entry_unset_size(entry); else archive_entry_set_size(entry, lha->origsize); /* * Prepare variables used to read a file content. */ lha->entry_bytes_remaining = lha->compsize; if (lha->entry_bytes_remaining < 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid LHa entry size"); return (ARCHIVE_FATAL); } lha->entry_offset = 0; lha->entry_crc_calculated = 0; /* * This file does not have a content. */ if (lha->directory || lha->compsize == 0) lha->end_of_entry = 1; snprintf(lha->format_name, sizeof(lha->format_name), "lha -%c%c%c-", lha->method[0], lha->method[1], lha->method[2]); a->archive.archive_format_name = lha->format_name; return (err); } /* * Replace a DOS path separator '\' by a character '/'. * Some multi-byte character set have a character '\' in its second byte. */ static void lha_replace_path_separator(struct lha *lha, struct archive_entry *entry) { const wchar_t *wp; size_t i; if ((wp = archive_entry_pathname_w(entry)) != NULL) { archive_wstrcpy(&(lha->ws), wp); for (i = 0; i < archive_strlen(&(lha->ws)); i++) { if (lha->ws.s[i] == L'\\') lha->ws.s[i] = L'/'; } archive_entry_copy_pathname_w(entry, lha->ws.s); } if ((wp = archive_entry_symlink_w(entry)) != NULL) { archive_wstrcpy(&(lha->ws), wp); for (i = 0; i < archive_strlen(&(lha->ws)); i++) { if (lha->ws.s[i] == L'\\') lha->ws.s[i] = L'/'; } archive_entry_copy_symlink_w(entry, lha->ws.s); } } /* * Header 0 format * * +0 +1 +2 +7 +11 * +---------------+----------+----------------+-------------------+ * |header size(*1)|header sum|compression type|compressed size(*2)| * +---------------+----------+----------------+-------------------+ * <---------------------(*1)----------* * * +11 +15 +17 +19 +20 +21 * +-----------------+---------+---------+--------------+----------------+ * |uncompressed size|time(DOS)|date(DOS)|attribute(DOS)|header level(=0)| * +-----------------+---------+---------+--------------+----------------+ * *--------------------------------(*1)---------------------------------* * * +21 +22 +22+(*3) +22+(*3)+2 +22+(*3)+2+(*4) * +---------------+---------+----------+----------------+------------------+ * |name length(*3)|file name|file CRC16|extra header(*4)| compressed data | * +---------------+---------+----------+----------------+------------------+ * <--(*3)-> <------(*2)------> * *----------------------(*1)--------------------------> * */ #define H0_HEADER_SIZE_OFFSET 0 #define H0_HEADER_SUM_OFFSET 1 #define H0_COMP_SIZE_OFFSET 7 #define H0_ORIG_SIZE_OFFSET 11 #define H0_DOS_TIME_OFFSET 15 #define H0_NAME_LEN_OFFSET 21 #define H0_FILE_NAME_OFFSET 22 #define H0_FIXED_SIZE 24 static int lha_read_file_header_0(struct archive_read *a, struct lha *lha) { const unsigned char *p; int extdsize, namelen; unsigned char headersum, sum_calculated; if ((p = __archive_read_ahead(a, H0_FIXED_SIZE, NULL)) == NULL) return (truncated_error(a)); lha->header_size = p[H0_HEADER_SIZE_OFFSET] + 2; headersum = p[H0_HEADER_SUM_OFFSET]; lha->compsize = archive_le32dec(p + H0_COMP_SIZE_OFFSET); lha->origsize = archive_le32dec(p + H0_ORIG_SIZE_OFFSET); lha->mtime = lha_dos_time(p + H0_DOS_TIME_OFFSET); namelen = p[H0_NAME_LEN_OFFSET]; extdsize = (int)lha->header_size - H0_FIXED_SIZE - namelen; if ((namelen > 221 || extdsize < 0) && extdsize != -2) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid LHa header"); return (ARCHIVE_FATAL); } if ((p = __archive_read_ahead(a, lha->header_size, NULL)) == NULL) return (truncated_error(a)); archive_strncpy(&lha->filename, p + H0_FILE_NAME_OFFSET, namelen); /* When extdsize == -2, A CRC16 value is not present in the header. */ if (extdsize >= 0) { lha->crc = archive_le16dec(p + H0_FILE_NAME_OFFSET + namelen); lha->setflag |= CRC_IS_SET; } sum_calculated = lha_calcsum(0, p, 2, lha->header_size - 2); /* Read an extended header */ if (extdsize > 0) { /* This extended data is set by 'LHa for UNIX' only. * Maybe fixed size. */ p += H0_FILE_NAME_OFFSET + namelen + 2; if (p[0] == 'U' && extdsize == 12) { /* p[1] is a minor version. */ lha->mtime = archive_le32dec(&p[2]); lha->mode = archive_le16dec(&p[6]); lha->uid = archive_le16dec(&p[8]); lha->gid = archive_le16dec(&p[10]); lha->setflag |= UNIX_MODE_IS_SET; } } __archive_read_consume(a, lha->header_size); if (sum_calculated != headersum) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "LHa header sum error"); return (ARCHIVE_FATAL); } return (ARCHIVE_OK); } /* * Header 1 format * * +0 +1 +2 +7 +11 * +---------------+----------+----------------+-------------+ * |header size(*1)|header sum|compression type|skip size(*2)| * +---------------+----------+----------------+-------------+ * <---------------(*1)----------* * * +11 +15 +17 +19 +20 +21 * +-----------------+---------+---------+--------------+----------------+ * |uncompressed size|time(DOS)|date(DOS)|attribute(DOS)|header level(=1)| * +-----------------+---------+---------+--------------+----------------+ * *-------------------------------(*1)----------------------------------* * * +21 +22 +22+(*3) +22+(*3)+2 +22+(*3)+3 +22+(*3)+3+(*4) * +---------------+---------+----------+-----------+-----------+ * |name length(*3)|file name|file CRC16| creator |padding(*4)| * +---------------+---------+----------+-----------+-----------+ * <--(*3)-> * *----------------------------(*1)----------------------------* * * +22+(*3)+3+(*4) +22+(*3)+3+(*4)+2 +22+(*3)+3+(*4)+2+(*5) * +----------------+---------------------+------------------------+ * |next header size| extended header(*5) | compressed data | * +----------------+---------------------+------------------------+ * *------(*1)-----> <--------------------(*2)--------------------> */ #define H1_HEADER_SIZE_OFFSET 0 #define H1_HEADER_SUM_OFFSET 1 #define H1_COMP_SIZE_OFFSET 7 #define H1_ORIG_SIZE_OFFSET 11 #define H1_DOS_TIME_OFFSET 15 #define H1_NAME_LEN_OFFSET 21 #define H1_FILE_NAME_OFFSET 22 #define H1_FIXED_SIZE 27 static int lha_read_file_header_1(struct archive_read *a, struct lha *lha) { const unsigned char *p; size_t extdsize; int i, err, err2; int namelen, padding; unsigned char headersum, sum_calculated; err = ARCHIVE_OK; if ((p = __archive_read_ahead(a, H1_FIXED_SIZE, NULL)) == NULL) return (truncated_error(a)); lha->header_size = p[H1_HEADER_SIZE_OFFSET] + 2; headersum = p[H1_HEADER_SUM_OFFSET]; /* Note: An extended header size is included in a compsize. */ lha->compsize = archive_le32dec(p + H1_COMP_SIZE_OFFSET); lha->origsize = archive_le32dec(p + H1_ORIG_SIZE_OFFSET); lha->mtime = lha_dos_time(p + H1_DOS_TIME_OFFSET); namelen = p[H1_NAME_LEN_OFFSET]; /* Calculate a padding size. The result will be normally 0 only(?) */ padding = ((int)lha->header_size) - H1_FIXED_SIZE - namelen; if (namelen > 230 || padding < 0) goto invalid; if ((p = __archive_read_ahead(a, lha->header_size, NULL)) == NULL) return (truncated_error(a)); for (i = 0; i < namelen; i++) { if (p[i + H1_FILE_NAME_OFFSET] == 0xff) goto invalid;/* Invalid filename. */ } archive_strncpy(&lha->filename, p + H1_FILE_NAME_OFFSET, namelen); lha->crc = archive_le16dec(p + H1_FILE_NAME_OFFSET + namelen); lha->setflag |= CRC_IS_SET; sum_calculated = lha_calcsum(0, p, 2, lha->header_size - 2); /* Consume used bytes but not include `next header size' data * since it will be consumed in lha_read_file_extended_header(). */ __archive_read_consume(a, lha->header_size - 2); /* Read extended headers */ err2 = lha_read_file_extended_header(a, lha, NULL, 2, (size_t)(lha->compsize + 2), &extdsize); if (err2 < ARCHIVE_WARN) return (err2); if (err2 < err) err = err2; /* Get a real compressed file size. */ lha->compsize -= extdsize - 2; if (lha->compsize < 0) goto invalid; /* Invalid compressed file size */ if (sum_calculated != headersum) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "LHa header sum error"); return (ARCHIVE_FATAL); } return (err); invalid: archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid LHa header"); return (ARCHIVE_FATAL); } /* * Header 2 format * * +0 +2 +7 +11 +15 * +---------------+----------------+-------------------+-----------------+ * |header size(*1)|compression type|compressed size(*2)|uncompressed size| * +---------------+----------------+-------------------+-----------------+ * <--------------------------------(*1)---------------------------------* * * +15 +19 +20 +21 +23 +24 * +-----------------+------------+----------------+----------+-----------+ * |data/time(time_t)| 0x20 fixed |header level(=2)|file CRC16| creator | * +-----------------+------------+----------------+----------+-----------+ * *---------------------------------(*1)---------------------------------* * * +24 +26 +26+(*3) +26+(*3)+(*4) * +----------------+-------------------+-------------+-------------------+ * |next header size|extended header(*3)| padding(*4) | compressed data | * +----------------+-------------------+-------------+-------------------+ * *--------------------------(*1)-------------------> <------(*2)-------> * */ #define H2_HEADER_SIZE_OFFSET 0 #define H2_COMP_SIZE_OFFSET 7 #define H2_ORIG_SIZE_OFFSET 11 #define H2_TIME_OFFSET 15 #define H2_CRC_OFFSET 21 #define H2_FIXED_SIZE 24 static int lha_read_file_header_2(struct archive_read *a, struct lha *lha) { const unsigned char *p; size_t extdsize; int err, padding; uint16_t header_crc; if ((p = __archive_read_ahead(a, H2_FIXED_SIZE, NULL)) == NULL) return (truncated_error(a)); lha->header_size =archive_le16dec(p + H2_HEADER_SIZE_OFFSET); lha->compsize = archive_le32dec(p + H2_COMP_SIZE_OFFSET); lha->origsize = archive_le32dec(p + H2_ORIG_SIZE_OFFSET); lha->mtime = archive_le32dec(p + H2_TIME_OFFSET); lha->crc = archive_le16dec(p + H2_CRC_OFFSET); lha->setflag |= CRC_IS_SET; if (lha->header_size < H2_FIXED_SIZE) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid LHa header size"); return (ARCHIVE_FATAL); } header_crc = lha_crc16(0, p, H2_FIXED_SIZE); __archive_read_consume(a, H2_FIXED_SIZE); /* Read extended headers */ err = lha_read_file_extended_header(a, lha, &header_crc, 2, lha->header_size - H2_FIXED_SIZE, &extdsize); if (err < ARCHIVE_WARN) return (err); /* Calculate a padding size. The result will be normally 0 or 1. */ padding = (int)lha->header_size - (int)(H2_FIXED_SIZE + extdsize); if (padding > 0) { if ((p = __archive_read_ahead(a, padding, NULL)) == NULL) return (truncated_error(a)); header_crc = lha_crc16(header_crc, p, padding); __archive_read_consume(a, padding); } if (header_crc != lha->header_crc) { #ifndef DONT_FAIL_ON_CRC_ERROR archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "LHa header CRC error"); return (ARCHIVE_FATAL); #endif } return (err); } /* * Header 3 format * * +0 +2 +7 +11 +15 * +------------+----------------+-------------------+-----------------+ * | 0x04 fixed |compression type|compressed size(*2)|uncompressed size| * +------------+----------------+-------------------+-----------------+ * <-------------------------------(*1)-------------------------------* * * +15 +19 +20 +21 +23 +24 * +-----------------+------------+----------------+----------+-----------+ * |date/time(time_t)| 0x20 fixed |header level(=3)|file CRC16| creator | * +-----------------+------------+----------------+----------+-----------+ * *--------------------------------(*1)----------------------------------* * * +24 +28 +32 +32+(*3) * +---------------+----------------+-------------------+-----------------+ * |header size(*1)|next header size|extended header(*3)| compressed data | * +---------------+----------------+-------------------+-----------------+ * *------------------------(*1)-----------------------> <------(*2)-----> * */ #define H3_FIELD_LEN_OFFSET 0 #define H3_COMP_SIZE_OFFSET 7 #define H3_ORIG_SIZE_OFFSET 11 #define H3_TIME_OFFSET 15 #define H3_CRC_OFFSET 21 #define H3_HEADER_SIZE_OFFSET 24 #define H3_FIXED_SIZE 28 static int lha_read_file_header_3(struct archive_read *a, struct lha *lha) { const unsigned char *p; size_t extdsize; int err; uint16_t header_crc; if ((p = __archive_read_ahead(a, H3_FIXED_SIZE, NULL)) == NULL) return (truncated_error(a)); if (archive_le16dec(p + H3_FIELD_LEN_OFFSET) != 4) goto invalid; lha->header_size =archive_le32dec(p + H3_HEADER_SIZE_OFFSET); lha->compsize = archive_le32dec(p + H3_COMP_SIZE_OFFSET); lha->origsize = archive_le32dec(p + H3_ORIG_SIZE_OFFSET); lha->mtime = archive_le32dec(p + H3_TIME_OFFSET); lha->crc = archive_le16dec(p + H3_CRC_OFFSET); lha->setflag |= CRC_IS_SET; if (lha->header_size < H3_FIXED_SIZE + 4) goto invalid; header_crc = lha_crc16(0, p, H3_FIXED_SIZE); __archive_read_consume(a, H3_FIXED_SIZE); /* Read extended headers */ err = lha_read_file_extended_header(a, lha, &header_crc, 4, lha->header_size - H3_FIXED_SIZE, &extdsize); if (err < ARCHIVE_WARN) return (err); if (header_crc != lha->header_crc) { #ifndef DONT_FAIL_ON_CRC_ERROR archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "LHa header CRC error"); return (ARCHIVE_FATAL); #endif } return (err); invalid: archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid LHa header"); return (ARCHIVE_FATAL); } /* * Extended header format * * +0 +2 +3 -- used in header 1 and 2 * +0 +4 +5 -- used in header 3 * +--------------+---------+-------------------+--------------+-- * |ex-header size|header id| data |ex-header size| ....... * +--------------+---------+-------------------+--------------+-- * <-------------( ex-header size)------------> <-- next extended header --* * * If the ex-header size is zero, it is the make of the end of extended * headers. * */ static int lha_read_file_extended_header(struct archive_read *a, struct lha *lha, uint16_t *crc, int sizefield_length, size_t limitsize, size_t *total_size) { const void *h; const unsigned char *extdheader; size_t extdsize; size_t datasize; unsigned int i; unsigned char extdtype; #define EXT_HEADER_CRC 0x00 /* Header CRC and information*/ #define EXT_FILENAME 0x01 /* Filename */ #define EXT_DIRECTORY 0x02 /* Directory name */ #define EXT_DOS_ATTR 0x40 /* MS-DOS attribute */ #define EXT_TIMESTAMP 0x41 /* Windows time stamp */ #define EXT_FILESIZE 0x42 /* Large file size */ #define EXT_TIMEZONE 0x43 /* Time zone */ #define EXT_UTF16_FILENAME 0x44 /* UTF-16 filename */ #define EXT_UTF16_DIRECTORY 0x45 /* UTF-16 directory name */ #define EXT_CODEPAGE 0x46 /* Codepage */ #define EXT_UNIX_MODE 0x50 /* File permission */ #define EXT_UNIX_GID_UID 0x51 /* gid,uid */ #define EXT_UNIX_GNAME 0x52 /* Group name */ #define EXT_UNIX_UNAME 0x53 /* User name */ #define EXT_UNIX_MTIME 0x54 /* Modified time */ #define EXT_OS2_NEW_ATTR 0x7f /* new attribute(OS/2 only) */ #define EXT_NEW_ATTR 0xff /* new attribute */ *total_size = sizefield_length; for (;;) { /* Read an extended header size. */ if ((h = __archive_read_ahead(a, sizefield_length, NULL)) == NULL) return (truncated_error(a)); /* Check if the size is the zero indicates the end of the * extended header. */ if (sizefield_length == sizeof(uint16_t)) extdsize = archive_le16dec(h); else extdsize = archive_le32dec(h); if (extdsize == 0) { /* End of extended header */ if (crc != NULL) *crc = lha_crc16(*crc, h, sizefield_length); __archive_read_consume(a, sizefield_length); return (ARCHIVE_OK); } /* Sanity check to the extended header size. */ if (((uint64_t)*total_size + extdsize) > (uint64_t)limitsize || extdsize <= (size_t)sizefield_length) goto invalid; /* Read the extended header. */ if ((h = __archive_read_ahead(a, extdsize, NULL)) == NULL) return (truncated_error(a)); *total_size += extdsize; extdheader = (const unsigned char *)h; /* Get the extended header type. */ extdtype = extdheader[sizefield_length]; /* Calculate an extended data size. */ datasize = extdsize - (1 + sizefield_length); /* Skip an extended header size field and type field. */ extdheader += sizefield_length + 1; if (crc != NULL && extdtype != EXT_HEADER_CRC) *crc = lha_crc16(*crc, h, extdsize); switch (extdtype) { case EXT_HEADER_CRC: /* We only use a header CRC. Following data will not * be used. */ if (datasize >= 2) { lha->header_crc = archive_le16dec(extdheader); if (crc != NULL) { static const char zeros[2] = {0, 0}; *crc = lha_crc16(*crc, h, extdsize - datasize); /* CRC value itself as zero */ *crc = lha_crc16(*crc, zeros, 2); *crc = lha_crc16(*crc, extdheader+2, datasize - 2); } } break; case EXT_FILENAME: if (datasize == 0) { /* maybe directory header */ archive_string_empty(&lha->filename); break; } if (extdheader[0] == '\0') goto invalid; archive_strncpy(&lha->filename, (const char *)extdheader, datasize); break; case EXT_UTF16_FILENAME: if (datasize == 0) { /* maybe directory header */ archive_string_empty(&lha->filename); break; } else if (datasize & 1) { /* UTF-16 characters take always 2 or 4 bytes */ goto invalid; } if (extdheader[0] == '\0') goto invalid; archive_string_empty(&lha->filename); archive_array_append(&lha->filename, (const char *)extdheader, datasize); /* Setup a string conversion for a filename. */ lha->sconv_fname = archive_string_conversion_from_charset(&a->archive, "UTF-16LE", 1); if (lha->sconv_fname == NULL) return (ARCHIVE_FATAL); break; case EXT_DIRECTORY: if (datasize == 0 || extdheader[0] == '\0') /* no directory name data. exit this case. */ goto invalid; archive_strncpy(&lha->dirname, (const char *)extdheader, datasize); /* * Convert directory delimiter from 0xFF * to '/' for local system. */ for (i = 0; i < lha->dirname.length; i++) { if ((unsigned char)lha->dirname.s[i] == 0xFF) lha->dirname.s[i] = '/'; } /* Is last character directory separator? */ if (lha->dirname.s[lha->dirname.length-1] != '/') /* invalid directory data */ goto invalid; break; case EXT_UTF16_DIRECTORY: /* UTF-16 characters take always 2 or 4 bytes */ if (datasize == 0 || (datasize & 1) || extdheader[0] == '\0') { /* no directory name data. exit this case. */ goto invalid; } archive_string_empty(&lha->dirname); archive_array_append(&lha->dirname, (const char *)extdheader, datasize); lha->sconv_dir = archive_string_conversion_from_charset(&a->archive, "UTF-16LE", 1); if (lha->sconv_dir == NULL) return (ARCHIVE_FATAL); else { /* * Convert directory delimiter from 0xFFFF * to '/' for local system. */ uint16_t dirSep; uint16_t d = 1; if (archive_be16dec(&d) == 1) dirSep = 0x2F00; else dirSep = 0x002F; /* UTF-16LE character */ uint16_t *utf16name = (uint16_t *)lha->dirname.s; for (i = 0; i < lha->dirname.length / 2; i++) { if (utf16name[i] == 0xFFFF) { utf16name[i] = dirSep; } } /* Is last character directory separator? */ if (utf16name[lha->dirname.length / 2 - 1] != dirSep) { /* invalid directory data */ goto invalid; } } break; case EXT_DOS_ATTR: if (datasize == 2) lha->dos_attr = (unsigned char) (archive_le16dec(extdheader) & 0xff); break; case EXT_TIMESTAMP: if (datasize == (sizeof(uint64_t) * 3)) { lha->birthtime = lha_win_time( archive_le64dec(extdheader), &lha->birthtime_tv_nsec); extdheader += sizeof(uint64_t); lha->mtime = lha_win_time( archive_le64dec(extdheader), &lha->mtime_tv_nsec); extdheader += sizeof(uint64_t); lha->atime = lha_win_time( archive_le64dec(extdheader), &lha->atime_tv_nsec); lha->setflag |= BIRTHTIME_IS_SET | ATIME_IS_SET; } break; case EXT_FILESIZE: if (datasize == sizeof(uint64_t) * 2) { lha->compsize = archive_le64dec(extdheader); extdheader += sizeof(uint64_t); lha->origsize = archive_le64dec(extdheader); + if (lha->compsize < 0 || lha->origsize < 0) + goto invalid; } break; case EXT_CODEPAGE: /* Get an archived filename charset from codepage. * This overwrites the charset specified by * hdrcharset option. */ if (datasize == sizeof(uint32_t)) { struct archive_string cp; const char *charset; archive_string_init(&cp); switch (archive_le32dec(extdheader)) { case 65001: /* UTF-8 */ charset = "UTF-8"; break; default: archive_string_sprintf(&cp, "CP%d", (int)archive_le32dec(extdheader)); charset = cp.s; break; } lha->sconv_dir = archive_string_conversion_from_charset( &(a->archive), charset, 1); lha->sconv_fname = archive_string_conversion_from_charset( &(a->archive), charset, 1); archive_string_free(&cp); if (lha->sconv_dir == NULL) return (ARCHIVE_FATAL); if (lha->sconv_fname == NULL) return (ARCHIVE_FATAL); } break; case EXT_UNIX_MODE: if (datasize == sizeof(uint16_t)) { lha->mode = archive_le16dec(extdheader); lha->setflag |= UNIX_MODE_IS_SET; } break; case EXT_UNIX_GID_UID: if (datasize == (sizeof(uint16_t) * 2)) { lha->gid = archive_le16dec(extdheader); lha->uid = archive_le16dec(extdheader+2); } break; case EXT_UNIX_GNAME: if (datasize > 0) archive_strncpy(&lha->gname, (const char *)extdheader, datasize); break; case EXT_UNIX_UNAME: if (datasize > 0) archive_strncpy(&lha->uname, (const char *)extdheader, datasize); break; case EXT_UNIX_MTIME: if (datasize == sizeof(uint32_t)) lha->mtime = archive_le32dec(extdheader); break; case EXT_OS2_NEW_ATTR: /* This extended header is OS/2 depend. */ if (datasize == 16) { lha->dos_attr = (unsigned char) (archive_le16dec(extdheader) & 0xff); lha->mode = archive_le16dec(extdheader+2); lha->gid = archive_le16dec(extdheader+4); lha->uid = archive_le16dec(extdheader+6); lha->birthtime = archive_le32dec(extdheader+8); lha->atime = archive_le32dec(extdheader+12); lha->setflag |= UNIX_MODE_IS_SET | BIRTHTIME_IS_SET | ATIME_IS_SET; } break; case EXT_NEW_ATTR: if (datasize == 20) { lha->mode = (mode_t)archive_le32dec(extdheader); lha->gid = archive_le32dec(extdheader+4); lha->uid = archive_le32dec(extdheader+8); lha->birthtime = archive_le32dec(extdheader+12); lha->atime = archive_le32dec(extdheader+16); lha->setflag |= UNIX_MODE_IS_SET | BIRTHTIME_IS_SET | ATIME_IS_SET; } break; case EXT_TIMEZONE: /* Not supported */ break; default: break; } __archive_read_consume(a, extdsize); } invalid: archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Invalid extended LHa header"); return (ARCHIVE_FATAL); } static int lha_end_of_entry(struct archive_read *a) { struct lha *lha = (struct lha *)(a->format->data); int r = ARCHIVE_EOF; if (!lha->end_of_entry_cleanup) { if ((lha->setflag & CRC_IS_SET) && lha->crc != lha->entry_crc_calculated) { archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "LHa data CRC error"); r = ARCHIVE_WARN; } /* End-of-entry cleanup done. */ lha->end_of_entry_cleanup = 1; } return (r); } static int archive_read_format_lha_read_data(struct archive_read *a, const void **buff, size_t *size, int64_t *offset) { struct lha *lha = (struct lha *)(a->format->data); int r; if (lha->entry_unconsumed) { /* Consume as much as the decompressor actually used. */ __archive_read_consume(a, lha->entry_unconsumed); lha->entry_unconsumed = 0; } if (lha->end_of_entry) { *offset = lha->entry_offset; *size = 0; *buff = NULL; return (lha_end_of_entry(a)); } if (lha->entry_is_compressed) r = lha_read_data_lzh(a, buff, size, offset); else /* No compression. */ r = lha_read_data_none(a, buff, size, offset); return (r); } /* * Read a file content in no compression. * * Returns ARCHIVE_OK if successful, ARCHIVE_FATAL otherwise, sets * lha->end_of_entry if it consumes all of the data. */ static int lha_read_data_none(struct archive_read *a, const void **buff, size_t *size, int64_t *offset) { struct lha *lha = (struct lha *)(a->format->data); ssize_t bytes_avail; if (lha->entry_bytes_remaining == 0) { *buff = NULL; *size = 0; *offset = lha->entry_offset; lha->end_of_entry = 1; return (ARCHIVE_OK); } /* * Note: '1' here is a performance optimization. * Recall that the decompression layer returns a count of * available bytes; asking for more than that forces the * decompressor to combine reads by copying data. */ *buff = __archive_read_ahead(a, 1, &bytes_avail); if (bytes_avail <= 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Truncated LHa file data"); return (ARCHIVE_FATAL); } if (bytes_avail > lha->entry_bytes_remaining) bytes_avail = (ssize_t)lha->entry_bytes_remaining; lha->entry_crc_calculated = lha_crc16(lha->entry_crc_calculated, *buff, bytes_avail); *size = bytes_avail; *offset = lha->entry_offset; lha->entry_offset += bytes_avail; lha->entry_bytes_remaining -= bytes_avail; if (lha->entry_bytes_remaining == 0) lha->end_of_entry = 1; lha->entry_unconsumed = bytes_avail; return (ARCHIVE_OK); } /* * Read a file content in LZHUFF encoding. * * Returns ARCHIVE_OK if successful, returns ARCHIVE_WARN if compression is * unsupported, ARCHIVE_FATAL otherwise, sets lha->end_of_entry if it consumes * all of the data. */ static int lha_read_data_lzh(struct archive_read *a, const void **buff, size_t *size, int64_t *offset) { struct lha *lha = (struct lha *)(a->format->data); ssize_t bytes_avail; int r; /* If we haven't yet read any data, initialize the decompressor. */ if (!lha->decompress_init) { r = lzh_decode_init(&(lha->strm), lha->method); switch (r) { case ARCHIVE_OK: break; case ARCHIVE_FAILED: /* Unsupported compression. */ *buff = NULL; *size = 0; *offset = 0; archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Unsupported lzh compression method -%c%c%c-", lha->method[0], lha->method[1], lha->method[2]); /* We know compressed size; just skip it. */ archive_read_format_lha_read_data_skip(a); return (ARCHIVE_WARN); default: archive_set_error(&a->archive, ENOMEM, "Couldn't allocate memory " "for lzh decompression"); return (ARCHIVE_FATAL); } /* We've initialized decompression for this stream. */ lha->decompress_init = 1; lha->strm.avail_out = 0; lha->strm.total_out = 0; } /* * Note: '1' here is a performance optimization. * Recall that the decompression layer returns a count of * available bytes; asking for more than that forces the * decompressor to combine reads by copying data. */ lha->strm.next_in = __archive_read_ahead(a, 1, &bytes_avail); if (bytes_avail <= 0) { archive_set_error(&a->archive, ARCHIVE_ERRNO_FILE_FORMAT, "Truncated LHa file body"); return (ARCHIVE_FATAL); } if (bytes_avail > lha->entry_bytes_remaining) bytes_avail = (ssize_t)lha->entry_bytes_remaining; lha->strm.avail_in = (int)bytes_avail; lha->strm.total_in = 0; lha->strm.avail_out = 0; r = lzh_decode(&(lha->strm), bytes_avail == lha->entry_bytes_remaining); switch (r) { case ARCHIVE_OK: break; case ARCHIVE_EOF: lha->end_of_entry = 1; break; default: archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, "Bad lzh data"); return (ARCHIVE_FAILED); } lha->entry_unconsumed = lha->strm.total_in; lha->entry_bytes_remaining -= lha->strm.total_in; if (lha->strm.avail_out) { *offset = lha->entry_offset; *size = lha->strm.avail_out; *buff = lha->strm.ref_ptr; lha->entry_crc_calculated = lha_crc16(lha->entry_crc_calculated, *buff, *size); lha->entry_offset += *size; } else { *offset = lha->entry_offset; *size = 0; *buff = NULL; if (lha->end_of_entry) return (lha_end_of_entry(a)); } return (ARCHIVE_OK); } /* * Skip a file content. */ static int archive_read_format_lha_read_data_skip(struct archive_read *a) { struct lha *lha; int64_t bytes_skipped; lha = (struct lha *)(a->format->data); if (lha->entry_unconsumed) { /* Consume as much as the decompressor actually used. */ __archive_read_consume(a, lha->entry_unconsumed); lha->entry_unconsumed = 0; } /* if we've already read to end of data, we're done. */ if (lha->end_of_entry_cleanup) return (ARCHIVE_OK); /* * If the length is at the beginning, we can skip the * compressed data much more quickly. */ bytes_skipped = __archive_read_consume(a, lha->entry_bytes_remaining); if (bytes_skipped < 0) return (ARCHIVE_FATAL); /* This entry is finished and done. */ lha->end_of_entry_cleanup = lha->end_of_entry = 1; return (ARCHIVE_OK); } static int archive_read_format_lha_cleanup(struct archive_read *a) { struct lha *lha = (struct lha *)(a->format->data); lzh_decode_free(&(lha->strm)); archive_string_free(&(lha->dirname)); archive_string_free(&(lha->filename)); archive_string_free(&(lha->uname)); archive_string_free(&(lha->gname)); archive_wstring_free(&(lha->ws)); free(lha); (a->format->data) = NULL; return (ARCHIVE_OK); } /* * 'LHa for UNIX' utility has archived a symbolic-link name after * a pathname with '|' character. * This function extracts the symbolic-link name from the pathname. * * example. * 1. a symbolic-name is 'aaa/bb/cc' * 2. a filename is 'xxx/bbb' * then an archived pathname is 'xxx/bbb|aaa/bb/cc' */ static int lha_parse_linkname(struct archive_wstring *linkname, struct archive_wstring *pathname) { wchar_t * linkptr; size_t symlen; linkptr = wcschr(pathname->s, L'|'); if (linkptr != NULL) { symlen = wcslen(linkptr + 1); archive_wstrncpy(linkname, linkptr+1, symlen); *linkptr = 0; pathname->length = wcslen(pathname->s); return (1); } return (0); } /* Convert an MSDOS-style date/time into Unix-style time. */ static time_t lha_dos_time(const unsigned char *p) { int msTime, msDate; struct tm ts; msTime = archive_le16dec(p); msDate = archive_le16dec(p+2); memset(&ts, 0, sizeof(ts)); ts.tm_year = ((msDate >> 9) & 0x7f) + 80; /* Years since 1900. */ ts.tm_mon = ((msDate >> 5) & 0x0f) - 1; /* Month number. */ ts.tm_mday = msDate & 0x1f; /* Day of month. */ ts.tm_hour = (msTime >> 11) & 0x1f; ts.tm_min = (msTime >> 5) & 0x3f; ts.tm_sec = (msTime << 1) & 0x3e; ts.tm_isdst = -1; return (mktime(&ts)); } /* Convert an MS-Windows-style date/time into Unix-style time. */ static time_t lha_win_time(uint64_t wintime, long *ns) { #define EPOC_TIME ARCHIVE_LITERAL_ULL(116444736000000000) if (wintime >= EPOC_TIME) { wintime -= EPOC_TIME; /* 1970-01-01 00:00:00 (UTC) */ if (ns != NULL) *ns = (long)(wintime % 10000000) * 100; return (wintime / 10000000); } else { if (ns != NULL) *ns = 0; return (0); } } static unsigned char lha_calcsum(unsigned char sum, const void *pp, int offset, size_t size) { unsigned char const *p = (unsigned char const *)pp; p += offset; for (;size > 0; --size) sum += *p++; return (sum); } static uint16_t crc16tbl[2][256]; static void lha_crc16_init(void) { unsigned int i; static int crc16init = 0; if (crc16init) return; crc16init = 1; for (i = 0; i < 256; i++) { unsigned int j; uint16_t crc = (uint16_t)i; for (j = 8; j; j--) crc = (crc >> 1) ^ ((crc & 1) * 0xA001); crc16tbl[0][i] = crc; } for (i = 0; i < 256; i++) { crc16tbl[1][i] = (crc16tbl[0][i] >> 8) ^ crc16tbl[0][crc16tbl[0][i] & 0xff]; } } static uint16_t lha_crc16(uint16_t crc, const void *pp, size_t len) { const unsigned char *p = (const unsigned char *)pp; const uint16_t *buff; const union { uint32_t i; char c[4]; } u = { 0x01020304 }; if (len == 0) return crc; /* Process unaligned address. */ if (((uintptr_t)p) & (uintptr_t)0x1) { crc = (crc >> 8) ^ crc16tbl[0][(crc ^ *p++) & 0xff]; len--; } buff = (const uint16_t *)p; /* * Modern C compiler such as GCC does not unroll automatically yet * without unrolling pragma, and Clang is so. So we should * unroll this loop for its performance. */ for (;len >= 8; len -= 8) { /* This if statement expects compiler optimization will * remove the statement which will not be executed. */ #undef bswap16 #ifndef __has_builtin #define __has_builtin(x) 0 #endif #if defined(_MSC_VER) && _MSC_VER >= 1400 /* Visual Studio */ # define bswap16(x) _byteswap_ushort(x) #elif defined(__GNUC__) && ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ > 4) /* GCC 4.8 and later has __builtin_bswap16() */ # define bswap16(x) __builtin_bswap16(x) #elif defined(__clang__) && __has_builtin(__builtin_bswap16) /* Newer clang versions have __builtin_bswap16() */ # define bswap16(x) __builtin_bswap16(x) #else # define bswap16(x) ((((x) >> 8) & 0xff) | ((x) << 8)) #endif #define CRC16W do { \ if(u.c[0] == 1) { /* Big endian */ \ crc ^= bswap16(*buff); buff++; \ } else \ crc ^= *buff++; \ crc = crc16tbl[1][crc & 0xff] ^ crc16tbl[0][crc >> 8];\ } while (0) CRC16W; CRC16W; CRC16W; CRC16W; #undef CRC16W #undef bswap16 } p = (const unsigned char *)buff; for (;len; len--) { crc = (crc >> 8) ^ crc16tbl[0][(crc ^ *p++) & 0xff]; } return crc; } /* * Initialize LZHUF decoder. * * Returns ARCHIVE_OK if initialization was successful. * Returns ARCHIVE_FAILED if method is unsupported. * Returns ARCHIVE_FATAL if initialization failed; memory allocation * error occurred. */ static int lzh_decode_init(struct lzh_stream *strm, const char *method) { struct lzh_dec *ds; int w_bits, w_size; if (strm->ds == NULL) { strm->ds = calloc(1, sizeof(*strm->ds)); if (strm->ds == NULL) return (ARCHIVE_FATAL); } ds = strm->ds; ds->error = ARCHIVE_FAILED; if (method == NULL || method[0] != 'l' || method[1] != 'h') return (ARCHIVE_FAILED); switch (method[2]) { case '5': w_bits = 13;/* 8KiB for window */ break; case '6': w_bits = 15;/* 32KiB for window */ break; case '7': w_bits = 16;/* 64KiB for window */ break; default: return (ARCHIVE_FAILED);/* Not supported. */ } ds->error = ARCHIVE_FATAL; /* Expand a window size up to 128 KiB for decompressing process * performance whatever its original window size is. */ ds->w_size = 1U << 17; ds->w_mask = ds->w_size -1; if (ds->w_buff == NULL) { ds->w_buff = malloc(ds->w_size); if (ds->w_buff == NULL) return (ARCHIVE_FATAL); } w_size = 1U << w_bits; memset(ds->w_buff + ds->w_size - w_size, 0x20, w_size); ds->w_pos = 0; ds->state = 0; ds->pos_pt_len_size = w_bits + 1; ds->pos_pt_len_bits = (w_bits == 15 || w_bits == 16)? 5: 4; ds->literal_pt_len_size = PT_BITLEN_SIZE; ds->literal_pt_len_bits = 5; ds->br.cache_buffer = 0; ds->br.cache_avail = 0; if (lzh_huffman_init(&(ds->lt), LT_BITLEN_SIZE, 16) != ARCHIVE_OK) return (ARCHIVE_FATAL); ds->lt.len_bits = 9; if (lzh_huffman_init(&(ds->pt), PT_BITLEN_SIZE, 16) != ARCHIVE_OK) return (ARCHIVE_FATAL); ds->error = 0; return (ARCHIVE_OK); } /* * Release LZHUF decoder. */ static void lzh_decode_free(struct lzh_stream *strm) { if (strm->ds == NULL) return; free(strm->ds->w_buff); lzh_huffman_free(&(strm->ds->lt)); lzh_huffman_free(&(strm->ds->pt)); free(strm->ds); strm->ds = NULL; } /* * Bit stream reader. */ /* Check that the cache buffer has enough bits. */ #define lzh_br_has(br, n) ((br)->cache_avail >= n) /* Get compressed data by bit. */ #define lzh_br_bits(br, n) \ (((uint16_t)((br)->cache_buffer >> \ ((br)->cache_avail - (n)))) & cache_masks[n]) #define lzh_br_bits_forced(br, n) \ (((uint16_t)((br)->cache_buffer << \ ((n) - (br)->cache_avail))) & cache_masks[n]) /* Read ahead to make sure the cache buffer has enough compressed data we * will use. * True : completed, there is enough data in the cache buffer. * False : we met that strm->next_in is empty, we have to get following * bytes. */ #define lzh_br_read_ahead_0(strm, br, n) \ (lzh_br_has(br, (n)) || lzh_br_fillup(strm, br)) /* True : the cache buffer has some bits as much as we need. * False : there are no enough bits in the cache buffer to be used, * we have to get following bytes if we could. */ #define lzh_br_read_ahead(strm, br, n) \ (lzh_br_read_ahead_0((strm), (br), (n)) || lzh_br_has((br), (n))) /* Notify how many bits we consumed. */ #define lzh_br_consume(br, n) ((br)->cache_avail -= (n)) #define lzh_br_unconsume(br, n) ((br)->cache_avail += (n)) static const uint16_t cache_masks[] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF }; /* * Shift away used bits in the cache data and fill it up with following bits. * Call this when cache buffer does not have enough bits you need. * * Returns 1 if the cache buffer is full. * Returns 0 if the cache buffer is not full; input buffer is empty. */ static int lzh_br_fillup(struct lzh_stream *strm, struct lzh_br *br) { int n = CACHE_BITS - br->cache_avail; for (;;) { const int x = n >> 3; if (strm->avail_in >= x) { switch (x) { case 8: br->cache_buffer = ((uint64_t)strm->next_in[0]) << 56 | ((uint64_t)strm->next_in[1]) << 48 | ((uint64_t)strm->next_in[2]) << 40 | ((uint64_t)strm->next_in[3]) << 32 | ((uint32_t)strm->next_in[4]) << 24 | ((uint32_t)strm->next_in[5]) << 16 | ((uint32_t)strm->next_in[6]) << 8 | (uint32_t)strm->next_in[7]; strm->next_in += 8; strm->avail_in -= 8; br->cache_avail += 8 * 8; return (1); case 7: br->cache_buffer = (br->cache_buffer << 56) | ((uint64_t)strm->next_in[0]) << 48 | ((uint64_t)strm->next_in[1]) << 40 | ((uint64_t)strm->next_in[2]) << 32 | ((uint64_t)strm->next_in[3]) << 24 | ((uint64_t)strm->next_in[4]) << 16 | ((uint64_t)strm->next_in[5]) << 8 | (uint64_t)strm->next_in[6]; strm->next_in += 7; strm->avail_in -= 7; br->cache_avail += 7 * 8; return (1); case 6: br->cache_buffer = (br->cache_buffer << 48) | ((uint64_t)strm->next_in[0]) << 40 | ((uint64_t)strm->next_in[1]) << 32 | ((uint64_t)strm->next_in[2]) << 24 | ((uint64_t)strm->next_in[3]) << 16 | ((uint64_t)strm->next_in[4]) << 8 | (uint64_t)strm->next_in[5]; strm->next_in += 6; strm->avail_in -= 6; br->cache_avail += 6 * 8; return (1); case 0: /* We have enough compressed data in * the cache buffer.*/ return (1); default: break; } } if (strm->avail_in == 0) { /* There is not enough compressed data to fill up the * cache buffer. */ return (0); } br->cache_buffer = (br->cache_buffer << 8) | *strm->next_in++; strm->avail_in--; br->cache_avail += 8; n -= 8; } } /* * Decode LZHUF. * * 1. Returns ARCHIVE_OK if output buffer or input buffer are empty. * Please set available buffer and call this function again. * 2. Returns ARCHIVE_EOF if decompression has been completed. * 3. Returns ARCHIVE_FAILED if an error occurred; compressed data * is broken or you do not set 'last' flag properly. * 4. 'last' flag is very important, you must set 1 to the flag if there * is no input data. The lha compressed data format does not provide how * to know the compressed data is really finished. * Note: lha command utility check if the total size of output bytes is * reached the uncompressed size recorded in its header. it does not mind * that the decoding process is properly finished. * GNU ZIP can decompress another compressed file made by SCO LZH compress. * it handles EOF as null to fill read buffer with zero until the decoding * process meet 2 bytes of zeros at reading a size of a next chunk, so the * zeros are treated as the mark of the end of the data although the zeros * is dummy, not the file data. */ static int lzh_read_blocks(struct lzh_stream *, int); static int lzh_decode_blocks(struct lzh_stream *, int); #define ST_RD_BLOCK 0 #define ST_RD_PT_1 1 #define ST_RD_PT_2 2 #define ST_RD_PT_3 3 #define ST_RD_PT_4 4 #define ST_RD_LITERAL_1 5 #define ST_RD_LITERAL_2 6 #define ST_RD_LITERAL_3 7 #define ST_RD_POS_DATA_1 8 #define ST_GET_LITERAL 9 #define ST_GET_POS_1 10 #define ST_GET_POS_2 11 #define ST_COPY_DATA 12 static int lzh_decode(struct lzh_stream *strm, int last) { struct lzh_dec *ds = strm->ds; int avail_in; int r; if (ds->error) return (ds->error); avail_in = strm->avail_in; do { if (ds->state < ST_GET_LITERAL) r = lzh_read_blocks(strm, last); else r = lzh_decode_blocks(strm, last); } while (r == 100); strm->total_in += avail_in - strm->avail_in; return (r); } static void lzh_emit_window(struct lzh_stream *strm, size_t s) { strm->ref_ptr = strm->ds->w_buff; strm->avail_out = (int)s; strm->total_out += s; } static int lzh_read_blocks(struct lzh_stream *strm, int last) { struct lzh_dec *ds = strm->ds; struct lzh_br *br = &(ds->br); int c = 0, i; unsigned rbits; for (;;) { switch (ds->state) { case ST_RD_BLOCK: /* * Read a block number indicates how many blocks * we will handle. The block is composed of a * literal and a match, sometimes a literal only * in particular, there are no reference data at * the beginning of the decompression. */ if (!lzh_br_read_ahead_0(strm, br, 16)) { if (!last) /* We need following data. */ return (ARCHIVE_OK); if (lzh_br_has(br, 8)) { /* * It seems there are extra bits. * 1. Compressed data is broken. * 2. `last' flag does not properly * set. */ goto failed; } if (ds->w_pos > 0) { lzh_emit_window(strm, ds->w_pos); ds->w_pos = 0; return (ARCHIVE_OK); } /* End of compressed data; we have completely * handled all compressed data. */ return (ARCHIVE_EOF); } ds->blocks_avail = lzh_br_bits(br, 16); if (ds->blocks_avail == 0) goto failed; lzh_br_consume(br, 16); /* * Read a literal table compressed in huffman * coding. */ ds->pt.len_size = ds->literal_pt_len_size; ds->pt.len_bits = ds->literal_pt_len_bits; ds->reading_position = 0; /* FALL THROUGH */ case ST_RD_PT_1: /* Note: ST_RD_PT_1, ST_RD_PT_2 and ST_RD_PT_4 are * used in reading both a literal table and a * position table. */ if (!lzh_br_read_ahead(strm, br, ds->pt.len_bits)) { if (last) goto failed;/* Truncated data. */ ds->state = ST_RD_PT_1; return (ARCHIVE_OK); } ds->pt.len_avail = lzh_br_bits(br, ds->pt.len_bits); lzh_br_consume(br, ds->pt.len_bits); /* FALL THROUGH */ case ST_RD_PT_2: if (ds->pt.len_avail == 0) { /* There is no bitlen. */ if (!lzh_br_read_ahead(strm, br, ds->pt.len_bits)) { if (last) goto failed;/* Truncated data.*/ ds->state = ST_RD_PT_2; return (ARCHIVE_OK); } if (!lzh_make_fake_table(&(ds->pt), lzh_br_bits(br, ds->pt.len_bits))) goto failed;/* Invalid data. */ lzh_br_consume(br, ds->pt.len_bits); if (ds->reading_position) ds->state = ST_GET_LITERAL; else ds->state = ST_RD_LITERAL_1; break; } else if (ds->pt.len_avail > ds->pt.len_size) goto failed;/* Invalid data. */ ds->loop = 0; memset(ds->pt.freq, 0, sizeof(ds->pt.freq)); if (ds->pt.len_avail < 3 || ds->pt.len_size == ds->pos_pt_len_size) { ds->state = ST_RD_PT_4; break; } /* FALL THROUGH */ case ST_RD_PT_3: ds->loop = lzh_read_pt_bitlen(strm, ds->loop, 3); if (ds->loop < 3) { if (ds->loop < 0 || last) goto failed;/* Invalid data. */ /* Not completed, get following data. */ ds->state = ST_RD_PT_3; return (ARCHIVE_OK); } /* There are some null in bitlen of the literal. */ if (!lzh_br_read_ahead(strm, br, 2)) { if (last) goto failed;/* Truncated data. */ ds->state = ST_RD_PT_3; return (ARCHIVE_OK); } c = lzh_br_bits(br, 2); lzh_br_consume(br, 2); if (c > ds->pt.len_avail - 3) goto failed;/* Invalid data. */ for (i = 3; c-- > 0 ;) ds->pt.bitlen[i++] = 0; ds->loop = i; /* FALL THROUGH */ case ST_RD_PT_4: ds->loop = lzh_read_pt_bitlen(strm, ds->loop, ds->pt.len_avail); if (ds->loop < ds->pt.len_avail) { if (ds->loop < 0 || last) goto failed;/* Invalid data. */ /* Not completed, get following data. */ ds->state = ST_RD_PT_4; return (ARCHIVE_OK); } if (!lzh_make_huffman_table(&(ds->pt))) goto failed;/* Invalid data */ if (ds->reading_position) { ds->state = ST_GET_LITERAL; break; } /* FALL THROUGH */ case ST_RD_LITERAL_1: if (!lzh_br_read_ahead(strm, br, ds->lt.len_bits)) { if (last) goto failed;/* Truncated data. */ ds->state = ST_RD_LITERAL_1; return (ARCHIVE_OK); } ds->lt.len_avail = lzh_br_bits(br, ds->lt.len_bits); lzh_br_consume(br, ds->lt.len_bits); /* FALL THROUGH */ case ST_RD_LITERAL_2: if (ds->lt.len_avail == 0) { /* There is no bitlen. */ if (!lzh_br_read_ahead(strm, br, ds->lt.len_bits)) { if (last) goto failed;/* Truncated data.*/ ds->state = ST_RD_LITERAL_2; return (ARCHIVE_OK); } if (!lzh_make_fake_table(&(ds->lt), lzh_br_bits(br, ds->lt.len_bits))) goto failed;/* Invalid data */ lzh_br_consume(br, ds->lt.len_bits); ds->state = ST_RD_POS_DATA_1; break; } else if (ds->lt.len_avail > ds->lt.len_size) goto failed;/* Invalid data */ ds->loop = 0; memset(ds->lt.freq, 0, sizeof(ds->lt.freq)); /* FALL THROUGH */ case ST_RD_LITERAL_3: i = ds->loop; while (i < ds->lt.len_avail) { if (!lzh_br_read_ahead(strm, br, ds->pt.max_bits)) { if (last) goto failed;/* Truncated data.*/ ds->loop = i; ds->state = ST_RD_LITERAL_3; return (ARCHIVE_OK); } rbits = lzh_br_bits(br, ds->pt.max_bits); c = lzh_decode_huffman(&(ds->pt), rbits); if (c > 2) { /* Note: 'c' will never be more than * eighteen since it's limited by * PT_BITLEN_SIZE, which is being set * to ds->pt.len_size through * ds->literal_pt_len_size. */ lzh_br_consume(br, ds->pt.bitlen[c]); c -= 2; ds->lt.freq[c]++; ds->lt.bitlen[i++] = c; } else if (c == 0) { lzh_br_consume(br, ds->pt.bitlen[c]); ds->lt.bitlen[i++] = 0; } else { /* c == 1 or c == 2 */ int n = (c == 1)?4:9; if (!lzh_br_read_ahead(strm, br, ds->pt.bitlen[c] + n)) { if (last) /* Truncated data. */ goto failed; ds->loop = i; ds->state = ST_RD_LITERAL_3; return (ARCHIVE_OK); } lzh_br_consume(br, ds->pt.bitlen[c]); c = lzh_br_bits(br, n); lzh_br_consume(br, n); c += (n == 4)?3:20; if (i + c > ds->lt.len_avail) goto failed;/* Invalid data */ memset(&(ds->lt.bitlen[i]), 0, c); i += c; } } if (i > ds->lt.len_avail || !lzh_make_huffman_table(&(ds->lt))) goto failed;/* Invalid data */ /* FALL THROUGH */ case ST_RD_POS_DATA_1: /* * Read a position table compressed in huffman * coding. */ ds->pt.len_size = ds->pos_pt_len_size; ds->pt.len_bits = ds->pos_pt_len_bits; ds->reading_position = 1; ds->state = ST_RD_PT_1; break; case ST_GET_LITERAL: return (100); } } failed: return (ds->error = ARCHIVE_FAILED); } static int lzh_decode_blocks(struct lzh_stream *strm, int last) { struct lzh_dec *ds = strm->ds; struct lzh_br bre = ds->br; struct huffman *lt = &(ds->lt); struct huffman *pt = &(ds->pt); unsigned char *w_buff = ds->w_buff; unsigned char *lt_bitlen = lt->bitlen; unsigned char *pt_bitlen = pt->bitlen; int blocks_avail = ds->blocks_avail, c = 0; int copy_len = ds->copy_len, copy_pos = ds->copy_pos; int w_pos = ds->w_pos, w_mask = ds->w_mask, w_size = ds->w_size; int lt_max_bits = lt->max_bits, pt_max_bits = pt->max_bits; int state = ds->state; for (;;) { switch (state) { case ST_GET_LITERAL: for (;;) { if (blocks_avail == 0) { /* We have decoded all blocks. * Let's handle next blocks. */ ds->state = ST_RD_BLOCK; ds->br = bre; ds->blocks_avail = 0; ds->w_pos = w_pos; ds->copy_pos = 0; return (100); } /* lzh_br_read_ahead() always tries to fill the * cache buffer up. In specific situation we * are close to the end of the data, the cache * buffer will not be full and thus we have to * determine if the cache buffer has some bits * as much as we need after lzh_br_read_ahead() * failed. */ if (!lzh_br_read_ahead(strm, &bre, lt_max_bits)) { if (!last) goto next_data; /* Remaining bits are less than * maximum bits(lt.max_bits) but maybe * it still remains as much as we need, * so we should try to use it with * dummy bits. */ c = lzh_decode_huffman(lt, lzh_br_bits_forced(&bre, lt_max_bits)); lzh_br_consume(&bre, lt_bitlen[c]); if (!lzh_br_has(&bre, 0)) goto failed;/* Over read. */ } else { c = lzh_decode_huffman(lt, lzh_br_bits(&bre, lt_max_bits)); lzh_br_consume(&bre, lt_bitlen[c]); } blocks_avail--; if (c > UCHAR_MAX) /* Current block is a match data. */ break; /* * 'c' is exactly a literal code. */ /* Save a decoded code to reference it * afterward. */ w_buff[w_pos] = c; if (++w_pos >= w_size) { w_pos = 0; lzh_emit_window(strm, w_size); goto next_data; } } /* 'c' is the length of a match pattern we have * already extracted, which has be stored in * window(ds->w_buff). */ copy_len = c - (UCHAR_MAX + 1) + MINMATCH; /* FALL THROUGH */ case ST_GET_POS_1: /* * Get a reference position. */ if (!lzh_br_read_ahead(strm, &bre, pt_max_bits)) { if (!last) { state = ST_GET_POS_1; ds->copy_len = copy_len; goto next_data; } copy_pos = lzh_decode_huffman(pt, lzh_br_bits_forced(&bre, pt_max_bits)); lzh_br_consume(&bre, pt_bitlen[copy_pos]); if (!lzh_br_has(&bre, 0)) goto failed;/* Over read. */ } else { copy_pos = lzh_decode_huffman(pt, lzh_br_bits(&bre, pt_max_bits)); lzh_br_consume(&bre, pt_bitlen[copy_pos]); } /* FALL THROUGH */ case ST_GET_POS_2: if (copy_pos > 1) { /* We need an additional adjustment number to * the position. */ int p = copy_pos - 1; if (!lzh_br_read_ahead(strm, &bre, p)) { if (last) goto failed;/* Truncated data.*/ state = ST_GET_POS_2; ds->copy_len = copy_len; ds->copy_pos = copy_pos; goto next_data; } copy_pos = (1 << p) + lzh_br_bits(&bre, p); lzh_br_consume(&bre, p); } /* The position is actually a distance from the last * code we had extracted and thus we have to convert * it to a position of the window. */ copy_pos = (w_pos - copy_pos - 1) & w_mask; /* FALL THROUGH */ case ST_COPY_DATA: /* * Copy `copy_len' bytes as extracted data from * the window into the output buffer. */ for (;;) { int l; l = copy_len; if (copy_pos > w_pos) { if (l > w_size - copy_pos) l = w_size - copy_pos; } else { if (l > w_size - w_pos) l = w_size - w_pos; } if ((copy_pos + l < w_pos) || (w_pos + l < copy_pos)) { /* No overlap. */ memcpy(w_buff + w_pos, w_buff + copy_pos, l); } else { const unsigned char *s; unsigned char *d; int li; d = w_buff + w_pos; s = w_buff + copy_pos; for (li = 0; li < l-1;) { d[li] = s[li];li++; d[li] = s[li];li++; } if (li < l) d[li] = s[li]; } w_pos += l; if (w_pos == w_size) { w_pos = 0; lzh_emit_window(strm, w_size); if (copy_len <= l) state = ST_GET_LITERAL; else { state = ST_COPY_DATA; ds->copy_len = copy_len - l; ds->copy_pos = (copy_pos + l) & w_mask; } goto next_data; } if (copy_len <= l) /* A copy of current pattern ended. */ break; copy_len -= l; copy_pos = (copy_pos + l) & w_mask; } state = ST_GET_LITERAL; break; } } failed: return (ds->error = ARCHIVE_FAILED); next_data: ds->br = bre; ds->blocks_avail = blocks_avail; ds->state = state; ds->w_pos = w_pos; return (ARCHIVE_OK); } static int lzh_huffman_init(struct huffman *hf, size_t len_size, int tbl_bits) { int bits; if (hf->bitlen == NULL) { hf->bitlen = malloc(len_size * sizeof(hf->bitlen[0])); if (hf->bitlen == NULL) return (ARCHIVE_FATAL); } if (hf->tbl == NULL) { if (tbl_bits < HTBL_BITS) bits = tbl_bits; else bits = HTBL_BITS; hf->tbl = malloc(((size_t)1 << bits) * sizeof(hf->tbl[0])); if (hf->tbl == NULL) return (ARCHIVE_FATAL); } if (hf->tree == NULL && tbl_bits > HTBL_BITS) { hf->tree_avail = 1 << (tbl_bits - HTBL_BITS + 4); hf->tree = malloc(hf->tree_avail * sizeof(hf->tree[0])); if (hf->tree == NULL) return (ARCHIVE_FATAL); } hf->len_size = (int)len_size; hf->tbl_bits = tbl_bits; return (ARCHIVE_OK); } static void lzh_huffman_free(struct huffman *hf) { free(hf->bitlen); free(hf->tbl); free(hf->tree); } static const char bitlen_tbl[0x400] = { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 16, 0 }; static int lzh_read_pt_bitlen(struct lzh_stream *strm, int start, int end) { struct lzh_dec *ds = strm->ds; struct lzh_br *br = &(ds->br); int c, i; for (i = start; i < end; ) { /* * bit pattern the number we need * 000 -> 0 * 001 -> 1 * 010 -> 2 * ... * 110 -> 6 * 1110 -> 7 * 11110 -> 8 * ... * 1111111111110 -> 16 */ if (!lzh_br_read_ahead(strm, br, 3)) return (i); if ((c = lzh_br_bits(br, 3)) == 7) { if (!lzh_br_read_ahead(strm, br, 13)) return (i); c = bitlen_tbl[lzh_br_bits(br, 13) & 0x3FF]; if (c) lzh_br_consume(br, c - 3); else return (-1);/* Invalid data. */ } else lzh_br_consume(br, 3); ds->pt.bitlen[i++] = c; ds->pt.freq[c]++; } return (i); } static int lzh_make_fake_table(struct huffman *hf, uint16_t c) { if (c >= hf->len_size) return (0); hf->tbl[0] = c; hf->max_bits = 0; hf->shift_bits = 0; hf->bitlen[hf->tbl[0]] = 0; return (1); } /* * Make a huffman coding table. */ static int lzh_make_huffman_table(struct huffman *hf) { uint16_t *tbl; const unsigned char *bitlen; int bitptn[17], weight[17]; int i, maxbits = 0, ptn, tbl_size, w; int diffbits, len_avail; /* * Initialize bit patterns. */ ptn = 0; for (i = 1, w = 1 << 15; i <= 16; i++, w >>= 1) { bitptn[i] = ptn; weight[i] = w; if (hf->freq[i]) { ptn += hf->freq[i] * w; maxbits = i; } } if (ptn != 0x10000 || maxbits > hf->tbl_bits) return (0);/* Invalid */ hf->max_bits = maxbits; /* * Cut out extra bits which we won't house in the table. * This preparation reduces the same calculation in the for-loop * making the table. */ if (maxbits < 16) { int ebits = 16 - maxbits; for (i = 1; i <= maxbits; i++) { bitptn[i] >>= ebits; weight[i] >>= ebits; } } if (maxbits > HTBL_BITS) { unsigned htbl_max; uint16_t *p; diffbits = maxbits - HTBL_BITS; for (i = 1; i <= HTBL_BITS; i++) { bitptn[i] >>= diffbits; weight[i] >>= diffbits; } htbl_max = bitptn[HTBL_BITS] + weight[HTBL_BITS] * hf->freq[HTBL_BITS]; p = &(hf->tbl[htbl_max]); while (p < &hf->tbl[1U<shift_bits = diffbits; /* * Make the table. */ tbl_size = 1 << HTBL_BITS; tbl = hf->tbl; bitlen = hf->bitlen; len_avail = hf->len_avail; hf->tree_used = 0; for (i = 0; i < len_avail; i++) { uint16_t *p; int len, cnt; uint16_t bit; int extlen; struct htree_t *ht; if (bitlen[i] == 0) continue; /* Get a bit pattern */ len = bitlen[i]; ptn = bitptn[len]; cnt = weight[len]; if (len <= HTBL_BITS) { /* Calculate next bit pattern */ if ((bitptn[len] = ptn + cnt) > tbl_size) return (0);/* Invalid */ /* Update the table */ p = &(tbl[ptn]); if (cnt > 7) { uint16_t *pc; cnt -= 8; pc = &p[cnt]; pc[0] = (uint16_t)i; pc[1] = (uint16_t)i; pc[2] = (uint16_t)i; pc[3] = (uint16_t)i; pc[4] = (uint16_t)i; pc[5] = (uint16_t)i; pc[6] = (uint16_t)i; pc[7] = (uint16_t)i; if (cnt > 7) { cnt -= 8; memcpy(&p[cnt], pc, 8 * sizeof(uint16_t)); pc = &p[cnt]; while (cnt > 15) { cnt -= 16; memcpy(&p[cnt], pc, 16 * sizeof(uint16_t)); } } if (cnt) memcpy(p, pc, cnt * sizeof(uint16_t)); } else { while (cnt > 1) { p[--cnt] = (uint16_t)i; p[--cnt] = (uint16_t)i; } if (cnt) p[--cnt] = (uint16_t)i; } continue; } /* * A bit length is too big to be housed to a direct table, * so we use a tree model for its extra bits. */ bitptn[len] = ptn + cnt; bit = 1U << (diffbits -1); extlen = len - HTBL_BITS; p = &(tbl[ptn >> diffbits]); if (*p == 0) { *p = len_avail + hf->tree_used; ht = &(hf->tree[hf->tree_used++]); if (hf->tree_used > hf->tree_avail) return (0);/* Invalid */ ht->left = 0; ht->right = 0; } else { if (*p < len_avail || *p >= (len_avail + hf->tree_used)) return (0);/* Invalid */ ht = &(hf->tree[*p - len_avail]); } while (--extlen > 0) { if (ptn & bit) { if (ht->left < len_avail) { ht->left = len_avail + hf->tree_used; ht = &(hf->tree[hf->tree_used++]); if (hf->tree_used > hf->tree_avail) return (0);/* Invalid */ ht->left = 0; ht->right = 0; } else { ht = &(hf->tree[ht->left - len_avail]); } } else { if (ht->right < len_avail) { ht->right = len_avail + hf->tree_used; ht = &(hf->tree[hf->tree_used++]); if (hf->tree_used > hf->tree_avail) return (0);/* Invalid */ ht->left = 0; ht->right = 0; } else { ht = &(hf->tree[ht->right - len_avail]); } } bit >>= 1; } if (ptn & bit) { if (ht->left != 0) return (0);/* Invalid */ ht->left = (uint16_t)i; } else { if (ht->right != 0) return (0);/* Invalid */ ht->right = (uint16_t)i; } } return (1); } static int lzh_decode_huffman_tree(struct huffman *hf, unsigned rbits, int c) { struct htree_t *ht; int extlen; ht = hf->tree; extlen = hf->shift_bits; while (c >= hf->len_avail) { c -= hf->len_avail; if (extlen-- <= 0 || c >= hf->tree_used) return (0); if (rbits & (1U << extlen)) c = ht[c].left; else c = ht[c].right; } return (c); } static inline int lzh_decode_huffman(struct huffman *hf, unsigned rbits) { int c; /* * At first search an index table for a bit pattern. * If it fails, search a huffman tree for. */ c = hf->tbl[rbits >> hf->shift_bits]; if (c < hf->len_avail || hf->len_avail == 0) return (c); /* This bit pattern needs to be found out at a huffman tree. */ return (lzh_decode_huffman_tree(hf, rbits, c)); } diff --git a/contrib/libarchive/libarchive/archive_string.c b/contrib/libarchive/libarchive/archive_string.c index f39677ad7a26..be6c39600d72 100644 --- a/contrib/libarchive/libarchive/archive_string.c +++ b/contrib/libarchive/libarchive/archive_string.c @@ -1,4256 +1,4244 @@ /*- * Copyright (c) 2003-2011 Tim Kientzle * Copyright (c) 2011-2012 Michihiro NAKAJIMA * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "archive_platform.h" /* * Basic resizable string support, to simplify manipulating arbitrary-sized * strings while minimizing heap activity. * * In particular, the buffer used by a string object is only grown, it * never shrinks, so you can clear and reuse the same string object * without incurring additional memory allocations. */ #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_ICONV_H #include #endif #ifdef HAVE_LANGINFO_H #include #endif #ifdef HAVE_LOCALCHARSET_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_STRING_H #include #endif #ifdef HAVE_WCHAR_H #include #endif #if defined(_WIN32) && !defined(__CYGWIN__) #include #include #endif #include "archive_endian.h" #include "archive_private.h" #include "archive_string.h" #include "archive_string_composition.h" #if !defined(HAVE_WMEMCPY) && !defined(wmemcpy) #define wmemcpy(a,b,i) (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t)) #endif #if !defined(HAVE_WMEMMOVE) && !defined(wmemmove) #define wmemmove(a,b,i) (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t)) #endif #undef max #define max(a, b) ((a)>(b)?(a):(b)) struct archive_string_conv { struct archive_string_conv *next; char *from_charset; char *to_charset; unsigned from_cp; unsigned to_cp; /* Set 1 if from_charset and to_charset are the same. */ int same; int flag; #define SCONV_TO_CHARSET 1 /* MBS is being converted to specified * charset. */ #define SCONV_FROM_CHARSET (1<<1) /* MBS is being converted from * specified charset. */ #define SCONV_BEST_EFFORT (1<<2) /* Copy at least ASCII code. */ #define SCONV_WIN_CP (1<<3) /* Use Windows API for converting * MBS. */ #define SCONV_UTF8_LIBARCHIVE_2 (1<<4) /* Incorrect UTF-8 made by libarchive * 2.x in the wrong assumption. */ #define SCONV_NORMALIZATION_C (1<<6) /* Need normalization to be Form C. * Before UTF-8 characters are actually * processed. */ #define SCONV_NORMALIZATION_D (1<<7) /* Need normalization to be Form D. * Before UTF-8 characters are actually * processed. * Currently this only for MAC OS X. */ #define SCONV_TO_UTF8 (1<<8) /* "to charset" side is UTF-8. */ #define SCONV_FROM_UTF8 (1<<9) /* "from charset" side is UTF-8. */ #define SCONV_TO_UTF16BE (1<<10) /* "to charset" side is UTF-16BE. */ #define SCONV_FROM_UTF16BE (1<<11) /* "from charset" side is UTF-16BE. */ #define SCONV_TO_UTF16LE (1<<12) /* "to charset" side is UTF-16LE. */ #define SCONV_FROM_UTF16LE (1<<13) /* "from charset" side is UTF-16LE. */ #define SCONV_TO_UTF16 (SCONV_TO_UTF16BE | SCONV_TO_UTF16LE) #define SCONV_FROM_UTF16 (SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE) #if HAVE_ICONV iconv_t cd; iconv_t cd_w;/* Use at archive_mstring on * Windows. */ #endif /* A temporary buffer for normalization. */ struct archive_string utftmp; int (*converter[2])(struct archive_string *, const void *, size_t, struct archive_string_conv *); int nconverter; }; #define CP_C_LOCALE 0 /* "C" locale only for this file. */ #define CP_UTF16LE 1200 #define CP_UTF16BE 1201 #define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF) #define IS_LOW_SURROGATE_LA(uc) ((uc) >= 0xDC00 && (uc) <= 0xDFFF) #define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF) #define UNICODE_MAX 0x10FFFF #define UNICODE_R_CHAR 0xFFFD /* Replacement character. */ /* Set U+FFFD(Replacement character) in UTF-8. */ static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd}; static struct archive_string_conv *find_sconv_object(struct archive *, const char *, const char *); static void add_sconv_object(struct archive *, struct archive_string_conv *); static struct archive_string_conv *create_sconv_object(const char *, const char *, unsigned, int); static void free_sconv_object(struct archive_string_conv *); static struct archive_string_conv *get_sconv_object(struct archive *, const char *, const char *, int); static unsigned make_codepage_from_charset(const char *); static unsigned get_current_codepage(void); static unsigned get_current_oemcp(void); static size_t mbsnbytes(const void *, size_t); static size_t utf16nbytes(const void *, size_t); #if defined(_WIN32) && !defined(__CYGWIN__) static int archive_wstring_append_from_mbs_in_codepage( struct archive_wstring *, const char *, size_t, struct archive_string_conv *); static int archive_string_append_from_wcs_in_codepage(struct archive_string *, const wchar_t *, size_t, struct archive_string_conv *); static int is_big_endian(void); static int strncat_in_codepage(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int win_strncat_from_utf16be(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int win_strncat_from_utf16le(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int win_strncat_to_utf16be(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int win_strncat_to_utf16le(struct archive_string *, const void *, size_t, struct archive_string_conv *); #endif static int best_effort_strncat_from_utf16be(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int best_effort_strncat_from_utf16le(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int best_effort_strncat_to_utf16be(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int best_effort_strncat_to_utf16le(struct archive_string *, const void *, size_t, struct archive_string_conv *); #if defined(HAVE_ICONV) static int iconv_strncat_in_locale(struct archive_string *, const void *, size_t, struct archive_string_conv *); #endif static int best_effort_strncat_in_locale(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int _utf8_to_unicode(uint32_t *, const char *, size_t); static int utf8_to_unicode(uint32_t *, const char *, size_t); static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); static int cesu8_to_unicode(uint32_t *, const char *, size_t); static size_t unicode_to_utf8(char *, size_t, uint32_t); static int utf16_to_unicode(uint32_t *, const char *, size_t, int); static size_t unicode_to_utf16be(char *, size_t, uint32_t); static size_t unicode_to_utf16le(char *, size_t, uint32_t); static int strncat_from_utf8_libarchive2(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int strncat_from_utf8_to_utf8(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int archive_string_normalize_C(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int archive_string_normalize_D(struct archive_string *, const void *, size_t, struct archive_string_conv *); static int archive_string_append_unicode(struct archive_string *, const void *, size_t, struct archive_string_conv *); static struct archive_string * archive_string_append(struct archive_string *as, const char *p, size_t s) { if (archive_string_ensure(as, as->length + s + 1) == NULL) return (NULL); if (s) memmove(as->s + as->length, p, s); as->length += s; as->s[as->length] = 0; return (as); } static struct archive_wstring * archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) { if (archive_wstring_ensure(as, as->length + s + 1) == NULL) return (NULL); if (s) wmemmove(as->s + as->length, p, s); as->length += s; as->s[as->length] = 0; return (as); } struct archive_string * archive_array_append(struct archive_string *as, const char *p, size_t s) { return archive_string_append(as, p, s); } void archive_string_concat(struct archive_string *dest, struct archive_string *src) { if (archive_string_append(dest, src->s, src->length) == NULL) __archive_errx(1, "Out of memory"); } void archive_wstring_concat(struct archive_wstring *dest, struct archive_wstring *src) { if (archive_wstring_append(dest, src->s, src->length) == NULL) __archive_errx(1, "Out of memory"); } void archive_string_free(struct archive_string *as) { as->length = 0; as->buffer_length = 0; free(as->s); as->s = NULL; } void archive_wstring_free(struct archive_wstring *as) { as->length = 0; as->buffer_length = 0; free(as->s); as->s = NULL; } struct archive_wstring * archive_wstring_ensure(struct archive_wstring *as, size_t s) { return (struct archive_wstring *) archive_string_ensure((struct archive_string *)as, s * sizeof(wchar_t)); } /* Returns NULL on any allocation failure. */ struct archive_string * archive_string_ensure(struct archive_string *as, size_t s) { char *p; size_t new_length; /* If buffer is already big enough, don't reallocate. */ if (as->s && (s <= as->buffer_length)) return (as); /* * Growing the buffer at least exponentially ensures that * append operations are always linear in the number of * characters appended. Using a smaller growth rate for * larger buffers reduces memory waste somewhat at the cost of * a larger constant factor. */ if (as->buffer_length < 32) /* Start with a minimum 32-character buffer. */ new_length = 32; else if (as->buffer_length < 8192) /* Buffers under 8k are doubled for speed. */ new_length = as->buffer_length + as->buffer_length; else { /* Buffers 8k and over grow by at least 25% each time. */ new_length = as->buffer_length + as->buffer_length / 4; /* Be safe: If size wraps, fail. */ if (new_length < as->buffer_length) { /* On failure, wipe the string and return NULL. */ archive_string_free(as); errno = ENOMEM;/* Make sure errno has ENOMEM. */ return (NULL); } } /* * The computation above is a lower limit to how much we'll * grow the buffer. In any case, we have to grow it enough to * hold the request. */ if (new_length < s) new_length = s; /* Now we can reallocate the buffer. */ p = (char *)realloc(as->s, new_length); if (p == NULL) { /* On failure, wipe the string and return NULL. */ archive_string_free(as); errno = ENOMEM;/* Make sure errno has ENOMEM. */ return (NULL); } as->s = p; as->buffer_length = new_length; return (as); } /* * TODO: See if there's a way to avoid scanning * the source string twice. Then test to see * if it actually helps (remember that we're almost * always called with pretty short arguments, so * such an optimization might not help). */ struct archive_string * archive_strncat(struct archive_string *as, const void *_p, size_t n) { size_t s; const char *p, *pp; p = (const char *)_p; /* Like strlen(p), except won't examine positions beyond p[n]. */ s = 0; pp = p; while (s < n && *pp) { pp++; s++; } if ((as = archive_string_append(as, p, s)) == NULL) __archive_errx(1, "Out of memory"); return (as); } struct archive_wstring * archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n) { size_t s; const wchar_t *pp; /* Like strlen(p), except won't examine positions beyond p[n]. */ s = 0; pp = p; while (s < n && *pp) { pp++; s++; } if ((as = archive_wstring_append(as, p, s)) == NULL) __archive_errx(1, "Out of memory"); return (as); } struct archive_string * archive_strcat(struct archive_string *as, const void *p) { /* strcat is just strncat without an effective limit. * Assert that we'll never get called with a source * string over 16MB. * TODO: Review all uses of strcat in the source * and try to replace them with strncat(). */ return archive_strncat(as, p, 0x1000000); } struct archive_wstring * archive_wstrcat(struct archive_wstring *as, const wchar_t *p) { /* Ditto. */ return archive_wstrncat(as, p, 0x1000000); } struct archive_string * archive_strappend_char(struct archive_string *as, char c) { if ((as = archive_string_append(as, &c, 1)) == NULL) __archive_errx(1, "Out of memory"); return (as); } struct archive_wstring * archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c) { if ((as = archive_wstring_append(as, &c, 1)) == NULL) __archive_errx(1, "Out of memory"); return (as); } /* * Get the "current character set" name to use with iconv. * On FreeBSD, the empty character set name "" chooses * the correct character encoding for the current locale, * so this isn't necessary. * But iconv on Mac OS 10.6 doesn't seem to handle this correctly; * on that system, we have to explicitly call nl_langinfo() * to get the right name. Not sure about other platforms. * * NOTE: GNU libiconv does not recognize the character-set name * which some platform nl_langinfo(CODESET) returns, so we should * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv. */ static const char * default_iconv_charset(const char *charset) { if (charset != NULL && charset[0] != '\0') return charset; #if HAVE_LOCALE_CHARSET && !defined(__APPLE__) /* locale_charset() is broken on Mac OS */ return locale_charset(); #elif HAVE_NL_LANGINFO return nl_langinfo(CODESET); #else return ""; #endif } #if defined(_WIN32) && !defined(__CYGWIN__) /* * Convert MBS to WCS. * Note: returns -1 if conversion fails. */ int archive_wstring_append_from_mbs(struct archive_wstring *dest, const char *p, size_t len) { return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL); } static int archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, const char *s, size_t length, struct archive_string_conv *sc) { int count, ret = 0; UINT from_cp; if (sc != NULL) from_cp = sc->from_cp; else from_cp = get_current_codepage(); if (from_cp == CP_C_LOCALE) { /* * "C" locale special processing. */ wchar_t *ws; const unsigned char *mp; if (NULL == archive_wstring_ensure(dest, dest->length + length + 1)) return (-1); ws = dest->s + dest->length; mp = (const unsigned char *)s; count = 0; while (count < (int)length && *mp) { *ws++ = (wchar_t)*mp++; count++; } } else if (sc != NULL && (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { /* * Normalize UTF-8 and UTF-16BE and convert it directly * to UTF-16 as wchar_t. */ struct archive_string u16; int saved_flag = sc->flag;/* save current flag. */ if (is_big_endian()) sc->flag |= SCONV_TO_UTF16BE; else sc->flag |= SCONV_TO_UTF16LE; if (sc->flag & SCONV_FROM_UTF16) { /* * UTF-16BE/LE NFD ===> UTF-16 NFC * UTF-16BE/LE NFC ===> UTF-16 NFD */ count = (int)utf16nbytes(s, length); } else { /* * UTF-8 NFD ===> UTF-16 NFC * UTF-8 NFC ===> UTF-16 NFD */ count = (int)mbsnbytes(s, length); } u16.s = (char *)dest->s; u16.length = dest->length << 1;; u16.buffer_length = dest->buffer_length; if (sc->flag & SCONV_NORMALIZATION_C) ret = archive_string_normalize_C(&u16, s, count, sc); else ret = archive_string_normalize_D(&u16, s, count, sc); dest->s = (wchar_t *)u16.s; dest->length = u16.length >> 1; dest->buffer_length = u16.buffer_length; sc->flag = saved_flag;/* restore the saved flag. */ return (ret); } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { count = (int)utf16nbytes(s, length); count >>= 1; /* to be WCS length */ /* Allocate memory for WCS. */ if (NULL == archive_wstring_ensure(dest, dest->length + count + 1)) return (-1); wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) { uint16_t *u16 = (uint16_t *)(dest->s + dest->length); int b; for (b = 0; b < count; b++) { uint16_t val = archive_le16dec(u16+b); archive_be16enc(u16+b, val); } } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) { uint16_t *u16 = (uint16_t *)(dest->s + dest->length); int b; for (b = 0; b < count; b++) { uint16_t val = archive_be16dec(u16+b); archive_le16enc(u16+b, val); } } } else { DWORD mbflag; size_t buffsize; if (sc == NULL) mbflag = 0; else if (sc->flag & SCONV_FROM_CHARSET) { /* Do not trust the length which comes from * an archive file. */ length = mbsnbytes(s, length); mbflag = 0; } else mbflag = MB_PRECOMPOSED; mbflag |= MB_ERR_INVALID_CHARS; buffsize = dest->length + length + 1; do { /* Allocate memory for WCS. */ if (NULL == archive_wstring_ensure(dest, buffsize)) return (-1); /* Convert MBS to WCS. */ count = MultiByteToWideChar(from_cp, mbflag, s, (int)length, dest->s + dest->length, (int)(dest->buffer_length >> 1) -1); if (count == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER) { /* Expand the WCS buffer. */ buffsize = dest->buffer_length << 1; continue; } if (count == 0 && length != 0) ret = -1; break; } while (1); } dest->length += count; dest->s[dest->length] = L'\0'; return (ret); } #else /* * Convert MBS to WCS. * Note: returns -1 if conversion fails. */ int archive_wstring_append_from_mbs(struct archive_wstring *dest, const char *p, size_t len) { size_t r; int ret_val = 0; /* * No single byte will be more than one wide character, * so this length estimate will always be big enough. */ // size_t wcs_length = len; size_t mbs_length = len; const char *mbs = p; wchar_t *wcs; #if HAVE_MBRTOWC mbstate_t shift_state; memset(&shift_state, 0, sizeof(shift_state)); #endif /* * As we decided to have wcs_length == mbs_length == len * we can use len here instead of wcs_length */ if (NULL == archive_wstring_ensure(dest, dest->length + len + 1)) return (-1); wcs = dest->s + dest->length; /* * We cannot use mbsrtowcs/mbstowcs here because those may convert * extra MBS when strlen(p) > len and one wide character consists of * multi bytes. */ while (*mbs && mbs_length > 0) { /* * The buffer we allocated is always big enough. * Keep this code path in a comment if we decide to choose * smaller wcs_length in the future */ /* if (wcs_length == 0) { dest->length = wcs - dest->s; dest->s[dest->length] = L'\0'; wcs_length = mbs_length; if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1)) return (-1); wcs = dest->s + dest->length; } */ #if HAVE_MBRTOWC r = mbrtowc(wcs, mbs, mbs_length, &shift_state); #else r = mbtowc(wcs, mbs, mbs_length); #endif if (r == (size_t)-1 || r == (size_t)-2) { ret_val = -1; break; } if (r == 0 || r > mbs_length) break; wcs++; // wcs_length--; mbs += r; mbs_length -= r; } dest->length = wcs - dest->s; dest->s[dest->length] = L'\0'; return (ret_val); } #endif #if defined(_WIN32) && !defined(__CYGWIN__) /* * WCS ==> MBS. * Note: returns -1 if conversion fails. * * Win32 builds use WideCharToMultiByte from the Windows API. * (Maybe Cygwin should too? WideCharToMultiByte will know a * lot more about local character encodings than the wcrtomb() * wrapper is going to know.) */ int archive_string_append_from_wcs(struct archive_string *as, const wchar_t *w, size_t len) { return archive_string_append_from_wcs_in_codepage(as, w, len, NULL); } static int archive_string_append_from_wcs_in_codepage(struct archive_string *as, const wchar_t *ws, size_t len, struct archive_string_conv *sc) { BOOL defchar_used, *dp; int count, ret = 0; UINT to_cp; int wslen = (int)len; if (sc != NULL) to_cp = sc->to_cp; else to_cp = get_current_codepage(); if (to_cp == CP_C_LOCALE) { /* * "C" locale special processing. */ const wchar_t *wp = ws; char *p; if (NULL == archive_string_ensure(as, as->length + wslen +1)) return (-1); p = as->s + as->length; count = 0; defchar_used = 0; while (count < wslen && *wp) { if (*wp > 255) { *p++ = '?'; wp++; defchar_used = 1; } else *p++ = (char)*wp++; count++; } } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { uint16_t *u16; if (NULL == archive_string_ensure(as, as->length + len * 2 + 2)) return (-1); u16 = (uint16_t *)(as->s + as->length); count = 0; defchar_used = 0; if (sc->flag & SCONV_TO_UTF16BE) { while (count < (int)len && *ws) { archive_be16enc(u16+count, *ws); ws++; count++; } } else { while (count < (int)len && *ws) { archive_le16enc(u16+count, *ws); ws++; count++; } } count <<= 1; /* to be byte size */ } else { /* Make sure the MBS buffer has plenty to set. */ if (NULL == archive_string_ensure(as, as->length + len * 2 + 1)) return (-1); do { defchar_used = 0; if (to_cp == CP_UTF8 || sc == NULL) dp = NULL; else dp = &defchar_used; count = WideCharToMultiByte(to_cp, 0, ws, wslen, as->s + as->length, (int)as->buffer_length - (int)as->length - 1, NULL, dp); if (count == 0 && GetLastError() == ERROR_INSUFFICIENT_BUFFER) { /* Expand the MBS buffer and retry. */ if (NULL == archive_string_ensure(as, as->buffer_length + len)) return (-1); continue; } if (count == 0) ret = -1; break; } while (1); } as->length += count; as->s[as->length] = '\0'; return (defchar_used?-1:ret); } #elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) /* * Translates a wide character string into current locale character set * and appends to the archive_string. Note: returns -1 if conversion * fails. */ int archive_string_append_from_wcs(struct archive_string *as, const wchar_t *w, size_t len) { /* We cannot use the standard wcstombs() here because it * cannot tell us how big the output buffer should be. So * I've built a loop around wcrtomb() or wctomb() that * converts a character at a time and resizes the string as * needed. We prefer wcrtomb() when it's available because * it's thread-safe. */ int n, ret_val = 0; char *p; char *end; #if HAVE_WCRTOMB mbstate_t shift_state; memset(&shift_state, 0, sizeof(shift_state)); #else /* Clear the shift state before starting. */ wctomb(NULL, L'\0'); #endif /* * Allocate buffer for MBS. * We need this allocation here since it is possible that * as->s is still NULL. */ if (archive_string_ensure(as, as->length + len + 1) == NULL) return (-1); p = as->s + as->length; end = as->s + as->buffer_length - MB_CUR_MAX -1; while (*w != L'\0' && len > 0) { if (p >= end) { as->length = p - as->s; as->s[as->length] = '\0'; /* Re-allocate buffer for MBS. */ if (archive_string_ensure(as, as->length + max(len * 2, (size_t)MB_CUR_MAX) + 1) == NULL) return (-1); p = as->s + as->length; end = as->s + as->buffer_length - MB_CUR_MAX -1; } #if HAVE_WCRTOMB n = wcrtomb(p, *w++, &shift_state); #else n = wctomb(p, *w++); #endif if (n == -1) { if (errno == EILSEQ) { /* Skip an illegal wide char. */ *p++ = '?'; ret_val = -1; } else { ret_val = -1; break; } } else p += n; len--; } as->length = p - as->s; as->s[as->length] = '\0'; return (ret_val); } #else /* HAVE_WCTOMB || HAVE_WCRTOMB */ /* * TODO: Test if __STDC_ISO_10646__ is defined. * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion * one character at a time. If a non-Windows platform doesn't have * either of these, fall back to the built-in UTF8 conversion. */ int archive_string_append_from_wcs(struct archive_string *as, const wchar_t *w, size_t len) { (void)as;/* UNUSED */ (void)w;/* UNUSED */ (void)len;/* UNUSED */ errno = ENOSYS; return (-1); } #endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ /* * Find a string conversion object by a pair of 'from' charset name * and 'to' charset name from an archive object. * Return NULL if not found. */ static struct archive_string_conv * find_sconv_object(struct archive *a, const char *fc, const char *tc) { struct archive_string_conv *sc; if (a == NULL) return (NULL); for (sc = a->sconv; sc != NULL; sc = sc->next) { if (strcmp(sc->from_charset, fc) == 0 && strcmp(sc->to_charset, tc) == 0) break; } return (sc); } /* * Register a string object to an archive object. */ static void add_sconv_object(struct archive *a, struct archive_string_conv *sc) { struct archive_string_conv **psc; /* Add a new sconv to sconv list. */ psc = &(a->sconv); while (*psc != NULL) psc = &((*psc)->next); *psc = sc; } static void add_converter(struct archive_string_conv *sc, int (*converter) (struct archive_string *, const void *, size_t, struct archive_string_conv *)) { if (sc == NULL || sc->nconverter >= 2) __archive_errx(1, "Programming error"); sc->converter[sc->nconverter++] = converter; } static void setup_converter(struct archive_string_conv *sc) { /* Reset. */ sc->nconverter = 0; /* * Perform special sequence for the incorrect UTF-8 filenames * made by libarchive2.x. */ if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { add_converter(sc, strncat_from_utf8_libarchive2); return; } /* * Convert a string to UTF-16BE/LE. */ if (sc->flag & SCONV_TO_UTF16) { /* * If the current locale is UTF-8, we can translate * a UTF-8 string into a UTF-16BE string. */ if (sc->flag & SCONV_FROM_UTF8) { add_converter(sc, archive_string_append_unicode); return; } #if defined(_WIN32) && !defined(__CYGWIN__) if (sc->flag & SCONV_WIN_CP) { if (sc->flag & SCONV_TO_UTF16BE) add_converter(sc, win_strncat_to_utf16be); else add_converter(sc, win_strncat_to_utf16le); return; } #endif #if defined(HAVE_ICONV) if (sc->cd != (iconv_t)-1) { add_converter(sc, iconv_strncat_in_locale); return; } #endif if (sc->flag & SCONV_BEST_EFFORT) { if (sc->flag & SCONV_TO_UTF16BE) add_converter(sc, best_effort_strncat_to_utf16be); else add_converter(sc, best_effort_strncat_to_utf16le); } else /* Make sure we have no converter. */ sc->nconverter = 0; return; } /* * Convert a string from UTF-16BE/LE. */ if (sc->flag & SCONV_FROM_UTF16) { /* * At least we should normalize a UTF-16BE string. */ if (sc->flag & SCONV_NORMALIZATION_D) add_converter(sc,archive_string_normalize_D); else if (sc->flag & SCONV_NORMALIZATION_C) add_converter(sc, archive_string_normalize_C); if (sc->flag & SCONV_TO_UTF8) { /* * If the current locale is UTF-8, we can translate * a UTF-16BE/LE string into a UTF-8 string directly. */ if (!(sc->flag & (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) add_converter(sc, archive_string_append_unicode); return; } #if defined(_WIN32) && !defined(__CYGWIN__) if (sc->flag & SCONV_WIN_CP) { if (sc->flag & SCONV_FROM_UTF16BE) add_converter(sc, win_strncat_from_utf16be); else add_converter(sc, win_strncat_from_utf16le); return; } #endif #if defined(HAVE_ICONV) if (sc->cd != (iconv_t)-1) { add_converter(sc, iconv_strncat_in_locale); return; } #endif if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) add_converter(sc, best_effort_strncat_from_utf16be); else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) add_converter(sc, best_effort_strncat_from_utf16le); else /* Make sure we have no converter. */ sc->nconverter = 0; return; } if (sc->flag & SCONV_FROM_UTF8) { /* * At least we should normalize a UTF-8 string. */ if (sc->flag & SCONV_NORMALIZATION_D) add_converter(sc,archive_string_normalize_D); else if (sc->flag & SCONV_NORMALIZATION_C) add_converter(sc, archive_string_normalize_C); /* * Copy UTF-8 string with a check of CESU-8. * Apparently, iconv does not check surrogate pairs in UTF-8 * when both from-charset and to-charset are UTF-8, and then * we use our UTF-8 copy code. */ if (sc->flag & SCONV_TO_UTF8) { /* * If the current locale is UTF-8, we can translate * a UTF-16BE string into a UTF-8 string directly. */ if (!(sc->flag & (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) add_converter(sc, strncat_from_utf8_to_utf8); return; } } #if defined(_WIN32) && !defined(__CYGWIN__) /* * On Windows we can use Windows API for a string conversion. */ if (sc->flag & SCONV_WIN_CP) { add_converter(sc, strncat_in_codepage); return; } #endif #if HAVE_ICONV if (sc->cd != (iconv_t)-1) { add_converter(sc, iconv_strncat_in_locale); /* * iconv generally does not support UTF-8-MAC and so * we have to the output of iconv from NFC to NFD if * need. */ if ((sc->flag & SCONV_FROM_CHARSET) && (sc->flag & SCONV_TO_UTF8)) { if (sc->flag & SCONV_NORMALIZATION_D) add_converter(sc, archive_string_normalize_D); } return; } #endif /* * Try conversion in the best effort or no conversion. */ if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) add_converter(sc, best_effort_strncat_in_locale); else /* Make sure we have no converter. */ sc->nconverter = 0; } /* * Return canonicalized charset-name but this supports just UTF-8, UTF-16BE * and CP932 which are referenced in create_sconv_object(). */ static const char * canonical_charset_name(const char *charset) { char cs[16]; char *p; const char *s; if (charset == NULL || charset[0] == '\0' || strlen(charset) > 15) return (charset); /* Copy name to uppercase. */ p = cs; s = charset; while (*s) { char c = *s++; if (c >= 'a' && c <= 'z') c -= 'a' - 'A'; *p++ = c; } *p++ = '\0'; if (strcmp(cs, "UTF-8") == 0 || strcmp(cs, "UTF8") == 0) return ("UTF-8"); if (strcmp(cs, "UTF-16BE") == 0 || strcmp(cs, "UTF16BE") == 0) return ("UTF-16BE"); if (strcmp(cs, "UTF-16LE") == 0 || strcmp(cs, "UTF16LE") == 0) return ("UTF-16LE"); if (strcmp(cs, "CP932") == 0) return ("CP932"); return (charset); } /* * Create a string conversion object. */ static struct archive_string_conv * create_sconv_object(const char *fc, const char *tc, unsigned current_codepage, int flag) { struct archive_string_conv *sc; sc = calloc(1, sizeof(*sc)); if (sc == NULL) return (NULL); sc->next = NULL; sc->from_charset = strdup(fc); if (sc->from_charset == NULL) { free(sc); return (NULL); } sc->to_charset = strdup(tc); if (sc->to_charset == NULL) { free(sc->from_charset); free(sc); return (NULL); } archive_string_init(&sc->utftmp); if (flag & SCONV_TO_CHARSET) { /* * Convert characters from the current locale charset to * a specified charset. */ sc->from_cp = current_codepage; sc->to_cp = make_codepage_from_charset(tc); #if defined(_WIN32) && !defined(__CYGWIN__) if (IsValidCodePage(sc->to_cp)) flag |= SCONV_WIN_CP; #endif } else if (flag & SCONV_FROM_CHARSET) { /* * Convert characters from a specified charset to * the current locale charset. */ sc->to_cp = current_codepage; sc->from_cp = make_codepage_from_charset(fc); #if defined(_WIN32) && !defined(__CYGWIN__) if (IsValidCodePage(sc->from_cp)) flag |= SCONV_WIN_CP; #endif } /* * Check if "from charset" and "to charset" are the same. */ if (strcmp(fc, tc) == 0 || (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) sc->same = 1; else sc->same = 0; /* * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. */ if (strcmp(tc, "UTF-8") == 0) flag |= SCONV_TO_UTF8; else if (strcmp(tc, "UTF-16BE") == 0) flag |= SCONV_TO_UTF16BE; else if (strcmp(tc, "UTF-16LE") == 0) flag |= SCONV_TO_UTF16LE; if (strcmp(fc, "UTF-8") == 0) flag |= SCONV_FROM_UTF8; else if (strcmp(fc, "UTF-16BE") == 0) flag |= SCONV_FROM_UTF16BE; else if (strcmp(fc, "UTF-16LE") == 0) flag |= SCONV_FROM_UTF16LE; #if defined(_WIN32) && !defined(__CYGWIN__) if (sc->to_cp == CP_UTF8) flag |= SCONV_TO_UTF8; else if (sc->to_cp == CP_UTF16BE) flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; else if (sc->to_cp == CP_UTF16LE) flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; if (sc->from_cp == CP_UTF8) flag |= SCONV_FROM_UTF8; else if (sc->from_cp == CP_UTF16BE) flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; else if (sc->from_cp == CP_UTF16LE) flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; #endif /* * Set a flag for Unicode NFD. Usually iconv cannot correctly * handle it. So we have to translate NFD characters to NFC ones * ourselves before iconv handles. Another reason is to prevent * that the same sight of two filenames, one is NFC and other * is NFD, would be in its directory. * On Mac OS X, although its filesystem layer automatically * convert filenames to NFD, it would be useful for filename * comparing to find out the same filenames that we normalize * that to be NFD ourselves. */ if ((flag & SCONV_FROM_CHARSET) && (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { #if defined(__APPLE__) if (flag & SCONV_TO_UTF8) flag |= SCONV_NORMALIZATION_D; else #endif flag |= SCONV_NORMALIZATION_C; } #if defined(__APPLE__) /* * In case writing an archive file, make sure that a filename * going to be passed to iconv is a Unicode NFC string since * a filename in HFS Plus filesystem is a Unicode NFD one and * iconv cannot handle it with "UTF-8" charset. It is simpler * than a use of "UTF-8-MAC" charset. */ if ((flag & SCONV_TO_CHARSET) && (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) flag |= SCONV_NORMALIZATION_C; /* * In case reading an archive file. make sure that a filename * will be passed to users is a Unicode NFD string in order to * correctly compare the filename with other one which comes * from HFS Plus filesystem. */ if ((flag & SCONV_FROM_CHARSET) && !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && (flag & SCONV_TO_UTF8)) flag |= SCONV_NORMALIZATION_D; #endif #if defined(HAVE_ICONV) sc->cd_w = (iconv_t)-1; /* * Create an iconv object. */ if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || (flag & SCONV_WIN_CP)) { /* This case we won't use iconv. */ sc->cd = (iconv_t)-1; } else { sc->cd = iconv_open(tc, fc); if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { /* * Unfortunately, all of iconv implements do support * "CP932" character-set, so we should use "SJIS" * instead if iconv_open failed. */ if (strcmp(tc, "CP932") == 0) sc->cd = iconv_open("SJIS", fc); else if (strcmp(fc, "CP932") == 0) sc->cd = iconv_open(tc, "SJIS"); } #if defined(_WIN32) && !defined(__CYGWIN__) /* * archive_mstring on Windows directly convert multi-bytes * into archive_wstring in order not to depend on locale * so that you can do a I18N programming. This will be * used only in archive_mstring_copy_mbs_len_l so far. */ if (flag & SCONV_FROM_CHARSET) { sc->cd_w = iconv_open("UTF-8", fc); if (sc->cd_w == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { if (strcmp(fc, "CP932") == 0) sc->cd_w = iconv_open("UTF-8", "SJIS"); } } #endif /* _WIN32 && !__CYGWIN__ */ } #endif /* HAVE_ICONV */ sc->flag = flag; /* * Set up converters. */ setup_converter(sc); return (sc); } /* * Free a string conversion object. */ static void free_sconv_object(struct archive_string_conv *sc) { free(sc->from_charset); free(sc->to_charset); archive_string_free(&sc->utftmp); #if HAVE_ICONV if (sc->cd != (iconv_t)-1) iconv_close(sc->cd); if (sc->cd_w != (iconv_t)-1) iconv_close(sc->cd_w); #endif free(sc); } #if defined(_WIN32) && !defined(__CYGWIN__) # if defined(WINAPI_FAMILY_PARTITION) && !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) # define GetOEMCP() CP_OEMCP # endif static unsigned my_atoi(const char *p) { unsigned cp; cp = 0; while (*p) { if (*p >= '0' && *p <= '9') cp = cp * 10 + (*p - '0'); else return (-1); p++; } return (cp); } /* * Translate Charset name (as used by iconv) into CodePage (as used by Windows) * Return -1 if failed. * * Note: This translation code may be insufficient. */ static struct charset { const char *name; unsigned cp; } charsets[] = { /* MUST BE SORTED! */ {"ASCII", 1252}, {"ASMO-708", 708}, {"BIG5", 950}, {"CHINESE", 936}, {"CP367", 1252}, {"CP819", 1252}, {"CP1025", 21025}, {"DOS-720", 720}, {"DOS-862", 862}, {"EUC-CN", 51936}, {"EUC-JP", 51932}, {"EUC-KR", 949}, {"EUCCN", 51936}, {"EUCJP", 51932}, {"EUCKR", 949}, {"GB18030", 54936}, {"GB2312", 936}, {"HEBREW", 1255}, {"HZ-GB-2312", 52936}, {"IBM273", 20273}, {"IBM277", 20277}, {"IBM278", 20278}, {"IBM280", 20280}, {"IBM284", 20284}, {"IBM285", 20285}, {"IBM290", 20290}, {"IBM297", 20297}, {"IBM367", 1252}, {"IBM420", 20420}, {"IBM423", 20423}, {"IBM424", 20424}, {"IBM819", 1252}, {"IBM871", 20871}, {"IBM880", 20880}, {"IBM905", 20905}, {"IBM924", 20924}, {"ISO-8859-1", 28591}, {"ISO-8859-13", 28603}, {"ISO-8859-15", 28605}, {"ISO-8859-2", 28592}, {"ISO-8859-3", 28593}, {"ISO-8859-4", 28594}, {"ISO-8859-5", 28595}, {"ISO-8859-6", 28596}, {"ISO-8859-7", 28597}, {"ISO-8859-8", 28598}, {"ISO-8859-9", 28599}, {"ISO8859-1", 28591}, {"ISO8859-13", 28603}, {"ISO8859-15", 28605}, {"ISO8859-2", 28592}, {"ISO8859-3", 28593}, {"ISO8859-4", 28594}, {"ISO8859-5", 28595}, {"ISO8859-6", 28596}, {"ISO8859-7", 28597}, {"ISO8859-8", 28598}, {"ISO8859-9", 28599}, {"JOHAB", 1361}, {"KOI8-R", 20866}, {"KOI8-U", 21866}, {"KS_C_5601-1987", 949}, {"LATIN1", 1252}, {"LATIN2", 28592}, {"MACINTOSH", 10000}, {"SHIFT-JIS", 932}, {"SHIFT_JIS", 932}, {"SJIS", 932}, {"US", 1252}, {"US-ASCII", 1252}, {"UTF-16", 1200}, {"UTF-16BE", 1201}, {"UTF-16LE", 1200}, {"UTF-8", CP_UTF8}, {"X-EUROPA", 29001}, {"X-MAC-ARABIC", 10004}, {"X-MAC-CE", 10029}, {"X-MAC-CHINESEIMP", 10008}, {"X-MAC-CHINESETRAD", 10002}, {"X-MAC-CROATIAN", 10082}, {"X-MAC-CYRILLIC", 10007}, {"X-MAC-GREEK", 10006}, {"X-MAC-HEBREW", 10005}, {"X-MAC-ICELANDIC", 10079}, {"X-MAC-JAPANESE", 10001}, {"X-MAC-KOREAN", 10003}, {"X-MAC-ROMANIAN", 10010}, {"X-MAC-THAI", 10021}, {"X-MAC-TURKISH", 10081}, {"X-MAC-UKRAINIAN", 10017}, }; static unsigned make_codepage_from_charset(const char *charset) { char cs[16]; char *p; unsigned cp; int a, b; if (charset == NULL || strlen(charset) > 15) return -1; /* Copy name to uppercase. */ p = cs; while (*charset) { char c = *charset++; if (c >= 'a' && c <= 'z') c -= 'a' - 'A'; *p++ = c; } *p++ = '\0'; cp = -1; /* Look it up in the table first, so that we can easily * override CP367, which we map to 1252 instead of 367. */ a = 0; b = sizeof(charsets)/sizeof(charsets[0]); while (b > a) { int c = (b + a) / 2; int r = strcmp(charsets[c].name, cs); if (r < 0) a = c + 1; else if (r > 0) b = c; else return charsets[c].cp; } /* If it's not in the table, try to parse it. */ switch (*cs) { case 'C': if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { cp = my_atoi(cs + 2); } else if (strcmp(cs, "CP_ACP") == 0) cp = get_current_codepage(); else if (strcmp(cs, "CP_OEMCP") == 0) cp = get_current_oemcp(); break; case 'I': if (cs[1] == 'B' && cs[2] == 'M' && cs[3] >= '0' && cs[3] <= '9') { cp = my_atoi(cs + 3); } break; case 'W': if (strncmp(cs, "WINDOWS-", 8) == 0) { cp = my_atoi(cs + 8); if (cp != 874 && (cp < 1250 || cp > 1258)) cp = -1;/* This may invalid code. */ } break; } return (cp); } /* * Return ANSI Code Page of current locale set by setlocale(). */ static unsigned get_current_codepage(void) { char *locale, *p; unsigned cp; locale = setlocale(LC_CTYPE, NULL); if (locale == NULL) return (GetACP()); if (locale[0] == 'C' && locale[1] == '\0') return (CP_C_LOCALE); p = strrchr(locale, '.'); if (p == NULL) return (GetACP()); if ((strcmp(p+1, "utf8") == 0) || (strcmp(p+1, "UTF-8") == 0)) return CP_UTF8; cp = my_atoi(p+1); if ((int)cp <= 0) return (GetACP()); return (cp); } /* * Translation table between Locale Name and ACP/OEMCP. */ static struct { unsigned acp; unsigned ocp; const char *locale; } acp_ocp_map[] = { { 950, 950, "Chinese_Taiwan" }, { 936, 936, "Chinese_People's Republic of China" }, { 950, 950, "Chinese_Taiwan" }, { 1250, 852, "Czech_Czech Republic" }, { 1252, 850, "Danish_Denmark" }, { 1252, 850, "Dutch_Netherlands" }, { 1252, 850, "Dutch_Belgium" }, { 1252, 437, "English_United States" }, { 1252, 850, "English_Australia" }, { 1252, 850, "English_Canada" }, { 1252, 850, "English_New Zealand" }, { 1252, 850, "English_United Kingdom" }, { 1252, 437, "English_United States" }, { 1252, 850, "Finnish_Finland" }, { 1252, 850, "French_France" }, { 1252, 850, "French_Belgium" }, { 1252, 850, "French_Canada" }, { 1252, 850, "French_Switzerland" }, { 1252, 850, "German_Germany" }, { 1252, 850, "German_Austria" }, { 1252, 850, "German_Switzerland" }, { 1253, 737, "Greek_Greece" }, { 1250, 852, "Hungarian_Hungary" }, { 1252, 850, "Icelandic_Iceland" }, { 1252, 850, "Italian_Italy" }, { 1252, 850, "Italian_Switzerland" }, { 932, 932, "Japanese_Japan" }, { 949, 949, "Korean_Korea" }, { 1252, 850, "Norwegian (BokmOl)_Norway" }, { 1252, 850, "Norwegian (BokmOl)_Norway" }, { 1252, 850, "Norwegian-Nynorsk_Norway" }, { 1250, 852, "Polish_Poland" }, { 1252, 850, "Portuguese_Portugal" }, { 1252, 850, "Portuguese_Brazil" }, { 1251, 866, "Russian_Russia" }, { 1250, 852, "Slovak_Slovakia" }, { 1252, 850, "Spanish_Spain" }, { 1252, 850, "Spanish_Mexico" }, { 1252, 850, "Spanish_Spain" }, { 1252, 850, "Swedish_Sweden" }, { 1254, 857, "Turkish_Turkey" }, { 0, 0, NULL} }; /* * Return OEM Code Page of current locale set by setlocale(). */ static unsigned get_current_oemcp(void) { int i; char *locale, *p; size_t len; locale = setlocale(LC_CTYPE, NULL); if (locale == NULL) return (GetOEMCP()); if (locale[0] == 'C' && locale[1] == '\0') return (CP_C_LOCALE); p = strrchr(locale, '.'); if (p == NULL) return (GetOEMCP()); len = p - locale; for (i = 0; acp_ocp_map[i].acp; i++) { if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) return (acp_ocp_map[i].ocp); } return (GetOEMCP()); } #else /* * POSIX platform does not use CodePage. */ static unsigned get_current_codepage(void) { return (-1);/* Unknown */ } static unsigned make_codepage_from_charset(const char *charset) { (void)charset; /* UNUSED */ return (-1);/* Unknown */ } static unsigned get_current_oemcp(void) { return (-1);/* Unknown */ } #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ /* * Return a string conversion object. */ static struct archive_string_conv * get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) { struct archive_string_conv *sc; unsigned current_codepage; /* Check if we have made the sconv object. */ sc = find_sconv_object(a, fc, tc); if (sc != NULL) return (sc); if (a == NULL) current_codepage = get_current_codepage(); else current_codepage = a->current_codepage; sc = create_sconv_object(canonical_charset_name(fc), canonical_charset_name(tc), current_codepage, flag); if (sc == NULL) { if (a != NULL) archive_set_error(a, ENOMEM, "Could not allocate memory for " "a string conversion object"); return (NULL); } /* * If there is no converter for current string conversion object, * we cannot handle this conversion. */ if (sc->nconverter == 0) { if (a != NULL) { #if HAVE_ICONV archive_set_error(a, ARCHIVE_ERRNO_MISC, "iconv_open failed : Cannot handle ``%s''", (flag & SCONV_TO_CHARSET)?tc:fc); #else archive_set_error(a, ARCHIVE_ERRNO_MISC, "A character-set conversion not fully supported " "on this platform"); #endif } /* Failed; free a sconv object. */ free_sconv_object(sc); return (NULL); } /* * Success! */ if (a != NULL) add_sconv_object(a, sc); return (sc); } static const char * get_current_charset(struct archive *a) { const char *cur_charset; if (a == NULL) cur_charset = default_iconv_charset(""); else { cur_charset = default_iconv_charset(a->current_code); if (a->current_code == NULL) { a->current_code = strdup(cur_charset); a->current_codepage = get_current_codepage(); a->current_oemcp = get_current_oemcp(); } } return (cur_charset); } /* * Make and Return a string conversion object. * Return NULL if the platform does not support the specified conversion * and best_effort is 0. * If best_effort is set, A string conversion object must be returned * unless memory allocation for the object fails, but the conversion * might fail when non-ASCII code is found. */ struct archive_string_conv * archive_string_conversion_to_charset(struct archive *a, const char *charset, int best_effort) { int flag = SCONV_TO_CHARSET; if (best_effort) flag |= SCONV_BEST_EFFORT; return (get_sconv_object(a, get_current_charset(a), charset, flag)); } struct archive_string_conv * archive_string_conversion_from_charset(struct archive *a, const char *charset, int best_effort) { int flag = SCONV_FROM_CHARSET; if (best_effort) flag |= SCONV_BEST_EFFORT; return (get_sconv_object(a, charset, get_current_charset(a), flag)); } /* * archive_string_default_conversion_*_archive() are provided for Windows * platform because other archiver application use CP_OEMCP for * MultiByteToWideChar() and WideCharToMultiByte() for the filenames * in tar or zip files. But mbstowcs/wcstombs(CRT) usually use CP_ACP * unless you use setlocale(LC_ALL, ".OCP")(specify CP_OEMCP). * So we should make a string conversion between CP_ACP and CP_OEMCP * for compatibility. */ #if defined(_WIN32) && !defined(__CYGWIN__) struct archive_string_conv * archive_string_default_conversion_for_read(struct archive *a) { const char *cur_charset = get_current_charset(a); char oemcp[16]; /* NOTE: a check of cur_charset is unneeded but we need * that get_current_charset() has been surely called at * this time whatever C compiler optimized. */ if (cur_charset != NULL && (a->current_codepage == CP_C_LOCALE || a->current_codepage == a->current_oemcp)) return (NULL);/* no conversion. */ _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); /* Make sure a null termination must be set. */ oemcp[sizeof(oemcp)-1] = '\0'; return (get_sconv_object(a, oemcp, cur_charset, SCONV_FROM_CHARSET)); } struct archive_string_conv * archive_string_default_conversion_for_write(struct archive *a) { const char *cur_charset = get_current_charset(a); char oemcp[16]; /* NOTE: a check of cur_charset is unneeded but we need * that get_current_charset() has been surely called at * this time whatever C compiler optimized. */ if (cur_charset != NULL && (a->current_codepage == CP_C_LOCALE || a->current_codepage == a->current_oemcp)) return (NULL);/* no conversion. */ _snprintf(oemcp, sizeof(oemcp)-1, "CP%d", a->current_oemcp); /* Make sure a null termination must be set. */ oemcp[sizeof(oemcp)-1] = '\0'; return (get_sconv_object(a, cur_charset, oemcp, SCONV_TO_CHARSET)); } #else struct archive_string_conv * archive_string_default_conversion_for_read(struct archive *a) { (void)a; /* UNUSED */ return (NULL); } struct archive_string_conv * archive_string_default_conversion_for_write(struct archive *a) { (void)a; /* UNUSED */ return (NULL); } #endif /* * Dispose of all character conversion objects in the archive object. */ void archive_string_conversion_free(struct archive *a) { struct archive_string_conv *sc; struct archive_string_conv *sc_next; for (sc = a->sconv; sc != NULL; sc = sc_next) { sc_next = sc->next; free_sconv_object(sc); } a->sconv = NULL; free(a->current_code); a->current_code = NULL; } /* * Return a conversion charset name. */ const char * archive_string_conversion_charset_name(struct archive_string_conv *sc) { if (sc->flag & SCONV_TO_CHARSET) return (sc->to_charset); else return (sc->from_charset); } /* * Change the behavior of a string conversion. */ void archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt) { switch (opt) { /* * A filename in UTF-8 was made with libarchive 2.x in a wrong * assumption that wchar_t was Unicode. * This option enables simulating the assumption in order to read * that filename correctly. */ case SCONV_SET_OPT_UTF8_LIBARCHIVE2X: #if (defined(_WIN32) && !defined(__CYGWIN__)) \ || defined(__STDC_ISO_10646__) || defined(__APPLE__) /* * Nothing to do for it since wchar_t on these platforms * is really Unicode. */ (void)sc; /* UNUSED */ #else if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) { sc->flag |= SCONV_UTF8_LIBARCHIVE_2; /* Set up string converters. */ setup_converter(sc); } #endif break; case SCONV_SET_OPT_NORMALIZATION_C: if ((sc->flag & SCONV_NORMALIZATION_C) == 0) { sc->flag |= SCONV_NORMALIZATION_C; sc->flag &= ~SCONV_NORMALIZATION_D; /* Set up string converters. */ setup_converter(sc); } break; case SCONV_SET_OPT_NORMALIZATION_D: #if defined(HAVE_ICONV) /* * If iconv will take the string, do not change the * setting of the normalization. */ if (!(sc->flag & SCONV_WIN_CP) && (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) break; #endif if ((sc->flag & SCONV_NORMALIZATION_D) == 0) { sc->flag |= SCONV_NORMALIZATION_D; sc->flag &= ~SCONV_NORMALIZATION_C; /* Set up string converters. */ setup_converter(sc); } break; default: break; } } /* * * Copy one archive_string to another in locale conversion. * * archive_strncat_l(); * archive_strncpy_l(); * */ static size_t mbsnbytes(const void *_p, size_t n) { size_t s; const char *p, *pp; if (_p == NULL) return (0); p = (const char *)_p; /* Like strlen(p), except won't examine positions beyond p[n]. */ s = 0; pp = p; while (s < n && *pp) { pp++; s++; } return (s); } static size_t utf16nbytes(const void *_p, size_t n) { size_t s; const char *p, *pp; if (_p == NULL) return (0); p = (const char *)_p; /* Like strlen(p), except won't examine positions beyond p[n]. */ s = 0; pp = p; n >>= 1; while (s < n && (pp[0] || pp[1])) { pp += 2; s++; } return (s<<1); } int archive_strncpy_l(struct archive_string *as, const void *_p, size_t n, struct archive_string_conv *sc) { as->length = 0; return (archive_strncat_l(as, _p, n, sc)); } int archive_strncat_l(struct archive_string *as, const void *_p, size_t n, struct archive_string_conv *sc) { const void *s; size_t length = 0; int i, r = 0, r2; if (_p != NULL && n > 0) { if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) length = utf16nbytes(_p, n); else length = mbsnbytes(_p, n); } /* We must allocate memory even if there is no data for conversion * or copy. This simulates archive_string_append behavior. */ if (length == 0) { int tn = 1; if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) tn = 2; if (archive_string_ensure(as, as->length + tn) == NULL) return (-1); as->s[as->length] = 0; if (tn == 2) as->s[as->length+1] = 0; return (0); } /* * If sc is NULL, we just make a copy. */ if (sc == NULL) { if (archive_string_append(as, _p, length) == NULL) return (-1);/* No memory */ return (0); } s = _p; i = 0; if (sc->nconverter > 1) { sc->utftmp.length = 0; r2 = sc->converter[0](&(sc->utftmp), s, length, sc); if (r2 != 0 && errno == ENOMEM) return (r2); if (r > r2) r = r2; s = sc->utftmp.s; length = sc->utftmp.length; ++i; } r2 = sc->converter[i](as, s, length, sc); if (r > r2) r = r2; return (r); } #if HAVE_ICONV /* * Return -1 if conversion fails. */ static int iconv_strncat_in_locale(struct archive_string *as, const void *_p, size_t length, struct archive_string_conv *sc) { ICONV_CONST char *itp; size_t remaining; iconv_t cd; char *outp; size_t avail, bs; int return_value = 0; /* success */ int to_size, from_size; if (sc->flag & SCONV_TO_UTF16) to_size = 2; else to_size = 1; if (sc->flag & SCONV_FROM_UTF16) from_size = 2; else from_size = 1; if (archive_string_ensure(as, as->length + length*2+to_size) == NULL) return (-1); cd = sc->cd; itp = (char *)(uintptr_t)_p; remaining = length; outp = as->s + as->length; avail = as->buffer_length - as->length - to_size; while (remaining >= (size_t)from_size) { size_t result = iconv(cd, &itp, &remaining, &outp, &avail); if (result != (size_t)-1) break; /* Conversion completed. */ if (errno == EILSEQ || errno == EINVAL) { /* * If an output charset is UTF-8 or UTF-16BE/LE, * unknown character should be U+FFFD * (replacement character). */ if (sc->flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) { size_t rbytes; if (sc->flag & SCONV_TO_UTF8) rbytes = sizeof(utf8_replacement_char); else rbytes = 2; if (avail < rbytes) { as->length = outp - as->s; bs = as->buffer_length + (remaining * to_size) + rbytes; if (NULL == archive_string_ensure(as, bs)) return (-1); outp = as->s + as->length; avail = as->buffer_length - as->length - to_size; } if (sc->flag & SCONV_TO_UTF8) memcpy(outp, utf8_replacement_char, sizeof(utf8_replacement_char)); else if (sc->flag & SCONV_TO_UTF16BE) archive_be16enc(outp, UNICODE_R_CHAR); else archive_le16enc(outp, UNICODE_R_CHAR); outp += rbytes; avail -= rbytes; } else { /* Skip the illegal input bytes. */ *outp++ = '?'; avail--; } itp += from_size; remaining -= from_size; return_value = -1; /* failure */ } else { /* E2BIG no output buffer, * Increase an output buffer. */ as->length = outp - as->s; bs = as->buffer_length + remaining * 2; if (NULL == archive_string_ensure(as, bs)) return (-1); outp = as->s + as->length; avail = as->buffer_length - as->length - to_size; } } as->length = outp - as->s; as->s[as->length] = 0; if (to_size == 2) as->s[as->length+1] = 0; return (return_value); } #endif /* HAVE_ICONV */ #if defined(_WIN32) && !defined(__CYGWIN__) /* * Translate a string from a some CodePage to an another CodePage by * Windows APIs, and copy the result. Return -1 if conversion fails. */ static int strncat_in_codepage(struct archive_string *as, const void *_p, size_t length, struct archive_string_conv *sc) { const char *s = (const char *)_p; struct archive_wstring aws; size_t l; int r, saved_flag; archive_string_init(&aws); saved_flag = sc->flag; sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); sc->flag = saved_flag; if (r != 0) { archive_wstring_free(&aws); if (errno != ENOMEM) archive_string_append(as, s, length); return (-1); } l = as->length; r = archive_string_append_from_wcs_in_codepage( as, aws.s, aws.length, sc); if (r != 0 && errno != ENOMEM && l == as->length) archive_string_append(as, s, length); archive_wstring_free(&aws); return (r); } /* * Test whether MBS ==> WCS is okay. */ static int invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) { const char *p = (const char *)_p; unsigned codepage; DWORD mbflag = MB_ERR_INVALID_CHARS; if (sc->flag & SCONV_FROM_CHARSET) codepage = sc->to_cp; else codepage = sc->from_cp; if (codepage == CP_C_LOCALE) return (0); if (codepage != CP_UTF8) mbflag |= MB_PRECOMPOSED; if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) return (-1); /* Invalid */ return (0); /* Okay */ } #else /* * Test whether MBS ==> WCS is okay. */ static int invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) { const char *p = (const char *)_p; size_t r; #if HAVE_MBRTOWC mbstate_t shift_state; memset(&shift_state, 0, sizeof(shift_state)); #else /* Clear the shift state before starting. */ mbtowc(NULL, NULL, 0); #endif while (n) { wchar_t wc; #if HAVE_MBRTOWC r = mbrtowc(&wc, p, n, &shift_state); #else r = mbtowc(&wc, p, n); #endif if (r == (size_t)-1 || r == (size_t)-2) return (-1);/* Invalid. */ if (r == 0) break; p += r; n -= r; } (void)sc; /* UNUSED */ return (0); /* All Okey. */ } #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ /* * Basically returns -1 because we cannot make a conversion of charset * without iconv but in some cases this would return 0. * Returns 0 if all copied characters are ASCII. * Returns 0 if both from-locale and to-locale are the same and those * can be WCS with no error. */ static int best_effort_strncat_in_locale(struct archive_string *as, const void *_p, size_t length, struct archive_string_conv *sc) { size_t remaining; const uint8_t *itp; int return_value = 0; /* success */ /* * If both from-locale and to-locale is the same, this makes a copy. * And then this checks all copied MBS can be WCS if so returns 0. */ if (sc->same) { if (archive_string_append(as, _p, length) == NULL) return (-1);/* No memory */ return (invalid_mbs(_p, length, sc)); } /* * If a character is ASCII, this just copies it. If not, this * assigns '?' character instead but in UTF-8 locale this assigns * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, * a Replacement Character in Unicode. */ remaining = length; itp = (const uint8_t *)_p; while (*itp && remaining > 0) { if (*itp > 127) { // Non-ASCII: Substitute with suitable replacement if (sc->flag & SCONV_TO_UTF8) { if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { __archive_errx(1, "Out of memory"); } } else { archive_strappend_char(as, '?'); } return_value = -1; } else { archive_strappend_char(as, *itp); } ++itp; } return (return_value); } /* * Unicode conversion functions. * - UTF-8 <===> UTF-8 in removing surrogate pairs. * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. * - UTF-8 made by libarchive 2.x ===> UTF-8. * - UTF-16BE <===> UTF-8. * */ /* * Utility to convert a single UTF-8 sequence. * * Usually return used bytes, return used byte in negative value when * a unicode character is replaced with U+FFFD. * See also http://unicode.org/review/pr-121.html Public Review Issue #121 * Recommended Practice for Replacement Characters. */ static int _utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) { static const char utf8_count[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */ }; int ch, i; int cnt; uint32_t wc; /* Sanity check. */ if (n == 0) return (0); /* * Decode 1-4 bytes depending on the value of the first byte. */ ch = (unsigned char)*s; if (ch == 0) return (0); /* Standard: return 0 for end-of-string. */ cnt = utf8_count[ch]; /* Invalid sequence or there are not plenty bytes. */ if ((int)n < cnt) { cnt = (int)n; for (i = 1; i < cnt; i++) { if ((s[i] & 0xc0) != 0x80) { cnt = i; break; } } goto invalid_sequence; } /* Make a Unicode code point from a single UTF-8 sequence. */ switch (cnt) { case 1: /* 1 byte sequence. */ *pwc = ch & 0x7f; return (cnt); case 2: /* 2 bytes sequence. */ if ((s[1] & 0xc0) != 0x80) { cnt = 1; goto invalid_sequence; } *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); return (cnt); case 3: /* 3 bytes sequence. */ if ((s[1] & 0xc0) != 0x80) { cnt = 1; goto invalid_sequence; } if ((s[2] & 0xc0) != 0x80) { cnt = 2; goto invalid_sequence; } wc = ((ch & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f); if (wc < 0x800) goto invalid_sequence;/* Overlong sequence. */ break; case 4: /* 4 bytes sequence. */ if ((s[1] & 0xc0) != 0x80) { cnt = 1; goto invalid_sequence; } if ((s[2] & 0xc0) != 0x80) { cnt = 2; goto invalid_sequence; } if ((s[3] & 0xc0) != 0x80) { cnt = 3; goto invalid_sequence; } wc = ((ch & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f); if (wc < 0x10000) goto invalid_sequence;/* Overlong sequence. */ break; default: /* Others are all invalid sequence. */ if (ch == 0xc0 || ch == 0xc1) cnt = 2; else if (ch >= 0xf5 && ch <= 0xf7) cnt = 4; else if (ch >= 0xf8 && ch <= 0xfb) cnt = 5; else if (ch == 0xfc || ch == 0xfd) cnt = 6; else cnt = 1; if ((int)n < cnt) cnt = (int)n; for (i = 1; i < cnt; i++) { if ((s[i] & 0xc0) != 0x80) { cnt = i; break; } } goto invalid_sequence; } /* The code point larger than 0x10FFFF is not legal * Unicode values. */ if (wc > UNICODE_MAX) goto invalid_sequence; /* Correctly gets a Unicode, returns used bytes. */ *pwc = wc; return (cnt); invalid_sequence: *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ return (cnt * -1); } static int utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) { int cnt; cnt = _utf8_to_unicode(pwc, s, n); /* Any of Surrogate pair is not legal Unicode values. */ if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) return (-3); return (cnt); } static inline uint32_t combine_surrogate_pair(uint32_t uc, uint32_t uc2) { uc -= 0xD800; uc *= 0x400; uc += uc2 - 0xDC00; uc += 0x10000; return (uc); } /* * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in * removing surrogate pairs. * * CESU-8: The Compatibility Encoding Scheme for UTF-16. * * Usually return used bytes, return used byte in negative value when * a unicode character is replaced with U+FFFD. */ static int cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) { uint32_t wc = 0; int cnt; cnt = _utf8_to_unicode(&wc, s, n); if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { uint32_t wc2 = 0; if (n - 3 < 3) { /* Invalid byte sequence. */ goto invalid_sequence; } cnt = _utf8_to_unicode(&wc2, s+3, n-3); if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { /* Invalid byte sequence. */ goto invalid_sequence; } wc = combine_surrogate_pair(wc, wc2); cnt = 6; } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { /* Invalid byte sequence. */ goto invalid_sequence; } *pwc = wc; return (cnt); invalid_sequence: *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ if (cnt > 0) cnt *= -1; return (cnt); } /* * Convert a Unicode code point to a single UTF-8 sequence. * * NOTE:This function does not check if the Unicode is legal or not. * Please you definitely check it before calling this. */ static size_t unicode_to_utf8(char *p, size_t remaining, uint32_t uc) { char *_p = p; /* Invalid Unicode char maps to Replacement character */ if (uc > UNICODE_MAX) uc = UNICODE_R_CHAR; /* Translate code point to UTF8 */ if (uc <= 0x7f) { if (remaining == 0) return (0); *p++ = (char)uc; } else if (uc <= 0x7ff) { if (remaining < 2) return (0); *p++ = 0xc0 | ((uc >> 6) & 0x1f); *p++ = 0x80 | (uc & 0x3f); } else if (uc <= 0xffff) { if (remaining < 3) return (0); *p++ = 0xe0 | ((uc >> 12) & 0x0f); *p++ = 0x80 | ((uc >> 6) & 0x3f); *p++ = 0x80 | (uc & 0x3f); } else { if (remaining < 4) return (0); *p++ = 0xf0 | ((uc >> 18) & 0x07); *p++ = 0x80 | ((uc >> 12) & 0x3f); *p++ = 0x80 | ((uc >> 6) & 0x3f); *p++ = 0x80 | (uc & 0x3f); } return (p - _p); } static int utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n) { return (utf16_to_unicode(pwc, s, n, 1)); } static int utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n) { return (utf16_to_unicode(pwc, s, n, 0)); } static int utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) { const char *utf16 = s; unsigned uc; if (n == 0) return (0); if (n == 1) { /* set the Replacement Character instead. */ *pwc = UNICODE_R_CHAR; return (-1); } if (be) uc = archive_be16dec(utf16); else uc = archive_le16dec(utf16); utf16 += 2; /* If this is a surrogate pair, assemble the full code point.*/ if (IS_HIGH_SURROGATE_LA(uc)) { unsigned uc2; if (n >= 4) { if (be) uc2 = archive_be16dec(utf16); else uc2 = archive_le16dec(utf16); } else uc2 = 0; if (IS_LOW_SURROGATE_LA(uc2)) { uc = combine_surrogate_pair(uc, uc2); utf16 += 2; } else { /* Undescribed code point should be U+FFFD * (replacement character). */ *pwc = UNICODE_R_CHAR; return (-2); } } /* * Surrogate pair values(0xd800 through 0xdfff) are only * used by UTF-16, so, after above calculation, the code * must not be surrogate values, and Unicode has no codes * larger than 0x10ffff. Thus, those are not legal Unicode * values. */ if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { /* Undescribed code point should be U+FFFD * (replacement character). */ *pwc = UNICODE_R_CHAR; return (((int)(utf16 - s)) * -1); } *pwc = uc; return ((int)(utf16 - s)); } static size_t unicode_to_utf16be(char *p, size_t remaining, uint32_t uc) { char *utf16 = p; if (uc > 0xffff) { /* We have a code point that won't fit into a * wchar_t; convert it to a surrogate pair. */ if (remaining < 4) return (0); uc -= 0x10000; archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); return (4); } else { if (remaining < 2) return (0); archive_be16enc(utf16, uc); return (2); } } static size_t unicode_to_utf16le(char *p, size_t remaining, uint32_t uc) { char *utf16 = p; if (uc > 0xffff) { /* We have a code point that won't fit into a * wchar_t; convert it to a surrogate pair. */ if (remaining < 4) return (0); uc -= 0x10000; archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); return (4); } else { if (remaining < 2) return (0); archive_le16enc(utf16, uc); return (2); } } /* - * Copy UTF-8 string in checking surrogate pair. - * If any surrogate pair are found, it would be canonicalized. + * Append new UTF-8 string to existing UTF-8 string. + * Existing string is assumed to already be in proper form; + * the new string will have invalid sequences replaced and + * surrogate pairs canonicalized. */ static int -strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p, +strncat_from_utf8_to_utf8(struct archive_string *as, const void *_src, size_t len, struct archive_string_conv *sc) { - const char *s; - char *p, *endp; - int n, ret = 0; - + int ret = 0; + const char *src = _src; (void)sc; /* UNUSED */ + /* Pre-extend the destination */ if (archive_string_ensure(as, as->length + len + 1) == NULL) return (-1); - s = (const char *)_p; - p = as->s + as->length; - endp = as->s + as->buffer_length -1; - do { + /* Invariant: src points to the first UTF8 byte that hasn't + * been copied to the destination `as`. */ + for (;;) { + int n; uint32_t uc; - const char *ss = s; - size_t w; + const char *e = src; - /* - * Forward byte sequence until a conversion of that is needed. - */ - while ((n = utf8_to_unicode(&uc, s, len)) > 0) { - s += n; + /* Skip UTF-8 sequences until we reach end-of-string or + * a code point that needs conversion. */ + while ((n = utf8_to_unicode(&uc, e, len)) > 0) { + e += n; len -= n; } - if (ss < s) { - if (p + (s - ss) > endp) { - as->length = p - as->s; - if (archive_string_ensure(as, - as->buffer_length + len + 1) == NULL) - return (-1); - p = as->s + as->length; - endp = as->s + as->buffer_length -1; - } - - memcpy(p, ss, s - ss); - p += s - ss; + /* Copy the part that doesn't need conversion */ + if (e > src) { + if (archive_string_append(as, src, e - src) == NULL) + return (-1); + src = e; } - /* - * If n is negative, current byte sequence needs a replacement. - */ - if (n < 0) { + if (n == 0) { + /* We reached end-of-string */ + return (ret); + } else { + /* Next code point needs conversion */ + char t[4]; + size_t w; + + /* Try decoding a surrogate pair */ if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { - /* Current byte sequence may be CESU-8. */ - n = cesu8_to_unicode(&uc, s, len); + n = cesu8_to_unicode(&uc, src, len); } + /* Not a (valid) surrogate, so use a replacement char */ if (n < 0) { - ret = -1; - n *= -1;/* Use a replaced unicode character. */ - } - - /* Rebuild UTF-8 byte sequence. */ - while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) { - as->length = p - as->s; - if (archive_string_ensure(as, - as->buffer_length + len + 1) == NULL) - return (-1); - p = as->s + as->length; - endp = as->s + as->buffer_length -1; + ret = -1; /* Return -1 if we used any replacement */ + n *= -1; } - p += w; - s += n; + /* Consume converted code point */ + src += n; len -= n; + /* Convert and append new UTF-8 sequence. */ + w = unicode_to_utf8(t, sizeof(t), uc); + if (archive_string_append(as, t, w) == NULL) + return (-1); } - } while (n > 0); - as->length = p - as->s; - as->s[as->length] = '\0'; - return (ret); + } } static int archive_string_append_unicode(struct archive_string *as, const void *_p, size_t len, struct archive_string_conv *sc) { const char *s; char *p, *endp; uint32_t uc; size_t w; int n, ret = 0, ts, tm; int (*parse)(uint32_t *, const char *, size_t); size_t (*unparse)(char *, size_t, uint32_t); if (sc->flag & SCONV_TO_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; } else if (sc->flag & SCONV_TO_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; } else if (sc->flag & SCONV_TO_UTF8) { unparse = unicode_to_utf8; ts = 1; } else { /* * This case is going to be converted to another * character-set through iconv. */ if (sc->flag & SCONV_FROM_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; } else if (sc->flag & SCONV_FROM_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; } else { unparse = unicode_to_utf8; ts = 1; } } if (sc->flag & SCONV_FROM_UTF16BE) { parse = utf16be_to_unicode; tm = 1; } else if (sc->flag & SCONV_FROM_UTF16LE) { parse = utf16le_to_unicode; tm = 1; } else { parse = cesu8_to_unicode; tm = ts; } if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) return (-1); s = (const char *)_p; p = as->s + as->length; endp = as->s + as->buffer_length - ts; while ((n = parse(&uc, s, len)) != 0) { if (n < 0) { /* Use a replaced unicode character. */ n *= -1; ret = -1; } s += n; len -= n; while ((w = unparse(p, endp - p, uc)) == 0) { /* There is not enough output buffer so * we have to expand it. */ as->length = p - as->s; if (archive_string_ensure(as, as->buffer_length + len * tm + ts) == NULL) return (-1); p = as->s + as->length; endp = as->s + as->buffer_length - ts; } p += w; } as->length = p - as->s; as->s[as->length] = '\0'; if (ts == 2) as->s[as->length+1] = '\0'; return (ret); } /* * Following Constants for Hangul compositions this information comes from * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ */ #define HC_SBASE 0xAC00 #define HC_LBASE 0x1100 #define HC_VBASE 0x1161 #define HC_TBASE 0x11A7 #define HC_LCOUNT 19 #define HC_VCOUNT 21 #define HC_TCOUNT 28 #define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) #define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) static uint32_t get_nfc(uint32_t uc, uint32_t uc2) { int t, b; t = 0; b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; while (b >= t) { int m = (t + b) / 2; if (u_composition_table[m].cp1 < uc) t = m + 1; else if (u_composition_table[m].cp1 > uc) b = m - 1; else if (u_composition_table[m].cp2 < uc2) t = m + 1; else if (u_composition_table[m].cp2 > uc2) b = m - 1; else return (u_composition_table[m].nfc); } return (0); } #define FDC_MAX 10 /* The maximum number of Following Decomposable * Characters. */ /* * Update first code point. */ #define UPDATE_UC(new_uc) do { \ uc = new_uc; \ ucptr = NULL; \ } while (0) /* * Replace first code point with second code point. */ #define REPLACE_UC_WITH_UC2() do { \ uc = uc2; \ ucptr = uc2ptr; \ n = n2; \ } while (0) #define EXPAND_BUFFER() do { \ as->length = p - as->s; \ if (archive_string_ensure(as, \ as->buffer_length + len * tm + ts) == NULL)\ return (-1); \ p = as->s + as->length; \ endp = as->s + as->buffer_length - ts; \ } while (0) #define UNPARSE(p, endp, uc) do { \ while ((w = unparse(p, (endp) - (p), uc)) == 0) {\ EXPAND_BUFFER(); \ } \ p += w; \ } while (0) /* * Write first code point. * If the code point has not be changed from its original code, * this just copies it from its original buffer pointer. * If not, this converts it to UTF-8 byte sequence and copies it. */ #define WRITE_UC() do { \ if (ucptr) { \ if (p + n > endp) \ EXPAND_BUFFER(); \ switch (n) { \ case 4: \ *p++ = *ucptr++; \ /* FALL THROUGH */ \ case 3: \ *p++ = *ucptr++; \ /* FALL THROUGH */ \ case 2: \ *p++ = *ucptr++; \ /* FALL THROUGH */ \ case 1: \ *p++ = *ucptr; \ break; \ } \ ucptr = NULL; \ } else { \ UNPARSE(p, endp, uc); \ } \ } while (0) /* * Collect following decomposable code points. */ #define COLLECT_CPS(start) do { \ int _i; \ for (_i = start; _i < FDC_MAX ; _i++) { \ nx = parse(&ucx[_i], s, len); \ if (nx <= 0) \ break; \ cx = CCC(ucx[_i]); \ if (cl >= cx && cl != 228 && cx != 228)\ break; \ s += nx; \ len -= nx; \ cl = cx; \ ccx[_i] = cx; \ } \ if (_i >= FDC_MAX) { \ ret = -1; \ ucx_size = FDC_MAX; \ } else \ ucx_size = _i; \ } while (0) /* * Normalize UTF-8/UTF-16BE characters to Form C and copy the result. * * TODO: Convert composition exclusions, which are never converted * from NFC,NFD,NFKC and NFKD, to Form C. */ static int archive_string_normalize_C(struct archive_string *as, const void *_p, size_t len, struct archive_string_conv *sc) { const char *s = (const char *)_p; char *p, *endp; uint32_t uc, uc2; size_t w; int always_replace, n, n2, ret = 0, spair, ts, tm; int (*parse)(uint32_t *, const char *, size_t); size_t (*unparse)(char *, size_t, uint32_t); always_replace = 1; ts = 1;/* text size. */ if (sc->flag & SCONV_TO_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; if (sc->flag & SCONV_FROM_UTF16BE) always_replace = 0; } else if (sc->flag & SCONV_TO_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; if (sc->flag & SCONV_FROM_UTF16LE) always_replace = 0; } else if (sc->flag & SCONV_TO_UTF8) { unparse = unicode_to_utf8; if (sc->flag & SCONV_FROM_UTF8) always_replace = 0; } else { /* * This case is going to be converted to another * character-set through iconv. */ always_replace = 0; if (sc->flag & SCONV_FROM_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; } else if (sc->flag & SCONV_FROM_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; } else { unparse = unicode_to_utf8; } } if (sc->flag & SCONV_FROM_UTF16BE) { parse = utf16be_to_unicode; tm = 1; spair = 4;/* surrogate pair size in UTF-16. */ } else if (sc->flag & SCONV_FROM_UTF16LE) { parse = utf16le_to_unicode; tm = 1; spair = 4;/* surrogate pair size in UTF-16. */ } else { parse = cesu8_to_unicode; tm = ts; spair = 6;/* surrogate pair size in UTF-8. */ } if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) return (-1); p = as->s + as->length; endp = as->s + as->buffer_length - ts; while ((n = parse(&uc, s, len)) != 0) { const char *ucptr, *uc2ptr; if (n < 0) { /* Use a replaced unicode character. */ UNPARSE(p, endp, uc); s += n*-1; len -= n*-1; ret = -1; continue; } else if (n == spair || always_replace) /* uc is converted from a surrogate pair. * this should be treated as a changed code. */ ucptr = NULL; else ucptr = s; s += n; len -= n; /* Read second code point. */ while ((n2 = parse(&uc2, s, len)) > 0) { uint32_t ucx[FDC_MAX]; int ccx[FDC_MAX]; int cl, cx, i, nx, ucx_size; int LIndex,SIndex; uint32_t nfc; if (n2 == spair || always_replace) /* uc2 is converted from a surrogate pair. * this should be treated as a changed code. */ uc2ptr = NULL; else uc2ptr = s; s += n2; len -= n2; /* * If current second code point is out of decomposable * code points, finding compositions is unneeded. */ if (!IS_DECOMPOSABLE_BLOCK(uc2)) { WRITE_UC(); REPLACE_UC_WITH_UC2(); continue; } /* * Try to combine current code points. */ /* * We have to combine Hangul characters according to * http://uniicode.org/reports/tr15/#Hangul */ if (0 <= (LIndex = uc - HC_LBASE) && LIndex < HC_LCOUNT) { /* * Hangul Composition. * 1. Two current code points are L and V. */ int VIndex = uc2 - HC_VBASE; if (0 <= VIndex && VIndex < HC_VCOUNT) { /* Make syllable of form LV. */ UPDATE_UC(HC_SBASE + (LIndex * HC_VCOUNT + VIndex) * HC_TCOUNT); } else { WRITE_UC(); REPLACE_UC_WITH_UC2(); } continue; } else if (0 <= (SIndex = uc - HC_SBASE) && SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) { /* * Hangul Composition. * 2. Two current code points are LV and T. */ int TIndex = uc2 - HC_TBASE; if (0 < TIndex && TIndex < HC_TCOUNT) { /* Make syllable of form LVT. */ UPDATE_UC(uc + TIndex); } else { WRITE_UC(); REPLACE_UC_WITH_UC2(); } continue; } else if ((nfc = get_nfc(uc, uc2)) != 0) { /* A composition to current code points * is found. */ UPDATE_UC(nfc); continue; } else if ((cl = CCC(uc2)) == 0) { /* Clearly 'uc2' the second code point is not * a decomposable code. */ WRITE_UC(); REPLACE_UC_WITH_UC2(); continue; } /* * Collect following decomposable code points. */ cx = 0; ucx[0] = uc2; ccx[0] = cl; COLLECT_CPS(1); /* * Find a composed code in the collected code points. */ i = 1; while (i < ucx_size) { int j; if ((nfc = get_nfc(uc, ucx[i])) == 0) { i++; continue; } /* * nfc is composed of uc and ucx[i]. */ UPDATE_UC(nfc); /* * Remove ucx[i] by shifting * following code points. */ for (j = i; j+1 < ucx_size; j++) { ucx[j] = ucx[j+1]; ccx[j] = ccx[j+1]; } ucx_size --; /* * Collect following code points blocked * by ucx[i] the removed code point. */ if (ucx_size > 0 && i == ucx_size && nx > 0 && cx == cl) { cl = ccx[ucx_size-1]; COLLECT_CPS(ucx_size); } /* * Restart finding a composed code with * the updated uc from the top of the * collected code points. */ i = 0; } /* * Apparently the current code points are not * decomposed characters or already composed. */ WRITE_UC(); for (i = 0; i < ucx_size; i++) UNPARSE(p, endp, ucx[i]); /* * Flush out remaining canonical combining characters. */ if (nx > 0 && cx == cl && len > 0) { while ((nx = parse(&ucx[0], s, len)) > 0) { cx = CCC(ucx[0]); if (cl > cx) break; s += nx; len -= nx; cl = cx; UNPARSE(p, endp, ucx[0]); } } break; } if (n2 < 0) { WRITE_UC(); /* Use a replaced unicode character. */ UNPARSE(p, endp, uc2); s += n2*-1; len -= n2*-1; ret = -1; continue; } else if (n2 == 0) { WRITE_UC(); break; } } as->length = p - as->s; as->s[as->length] = '\0'; if (ts == 2) as->s[as->length+1] = '\0'; return (ret); } static int get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc) { int t, b; /* * These are not converted to NFD on Mac OS. */ if ((uc >= 0x2000 && uc <= 0x2FFF) || (uc >= 0xF900 && uc <= 0xFAFF) || (uc >= 0x2F800 && uc <= 0x2FAFF)) return (0); /* * Those code points are not converted to NFD on Mac OS. * I do not know the reason because it is undocumented. * NFC NFD * 1109A ==> 11099 110BA * 1109C ==> 1109B 110BA * 110AB ==> 110A5 110BA */ if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB) return (0); t = 0; b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1; while (b >= t) { int m = (t + b) / 2; if (u_decomposition_table[m].nfc < uc) t = m + 1; else if (u_decomposition_table[m].nfc > uc) b = m - 1; else { *cp1 = u_decomposition_table[m].cp1; *cp2 = u_decomposition_table[m].cp2; return (1); } } return (0); } #define REPLACE_UC_WITH(cp) do { \ uc = cp; \ ucptr = NULL; \ } while (0) /* * Normalize UTF-8 characters to Form D and copy the result. */ static int archive_string_normalize_D(struct archive_string *as, const void *_p, size_t len, struct archive_string_conv *sc) { const char *s = (const char *)_p; char *p, *endp; uint32_t uc, uc2; size_t w; int always_replace, n, n2, ret = 0, spair, ts, tm; int (*parse)(uint32_t *, const char *, size_t); size_t (*unparse)(char *, size_t, uint32_t); always_replace = 1; ts = 1;/* text size. */ if (sc->flag & SCONV_TO_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; if (sc->flag & SCONV_FROM_UTF16BE) always_replace = 0; } else if (sc->flag & SCONV_TO_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; if (sc->flag & SCONV_FROM_UTF16LE) always_replace = 0; } else if (sc->flag & SCONV_TO_UTF8) { unparse = unicode_to_utf8; if (sc->flag & SCONV_FROM_UTF8) always_replace = 0; } else { /* * This case is going to be converted to another * character-set through iconv. */ always_replace = 0; if (sc->flag & SCONV_FROM_UTF16BE) { unparse = unicode_to_utf16be; ts = 2; } else if (sc->flag & SCONV_FROM_UTF16LE) { unparse = unicode_to_utf16le; ts = 2; } else { unparse = unicode_to_utf8; } } if (sc->flag & SCONV_FROM_UTF16BE) { parse = utf16be_to_unicode; tm = 1; spair = 4;/* surrogate pair size in UTF-16. */ } else if (sc->flag & SCONV_FROM_UTF16LE) { parse = utf16le_to_unicode; tm = 1; spair = 4;/* surrogate pair size in UTF-16. */ } else { parse = cesu8_to_unicode; tm = ts; spair = 6;/* surrogate pair size in UTF-8. */ } if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) return (-1); p = as->s + as->length; endp = as->s + as->buffer_length - ts; while ((n = parse(&uc, s, len)) != 0) { const char *ucptr; uint32_t cp1, cp2; int SIndex; struct { uint32_t uc; int ccc; } fdc[FDC_MAX]; int fdi, fdj; int ccc; check_first_code: if (n < 0) { /* Use a replaced unicode character. */ UNPARSE(p, endp, uc); s += n*-1; len -= n*-1; ret = -1; continue; } else if (n == spair || always_replace) /* uc is converted from a surrogate pair. * this should be treated as a changed code. */ ucptr = NULL; else ucptr = s; s += n; len -= n; /* Hangul Decomposition. */ if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) { int L = HC_LBASE + SIndex / HC_NCOUNT; int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT; int T = HC_TBASE + SIndex % HC_TCOUNT; REPLACE_UC_WITH(L); WRITE_UC(); REPLACE_UC_WITH(V); WRITE_UC(); if (T != HC_TBASE) { REPLACE_UC_WITH(T); WRITE_UC(); } continue; } if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) { WRITE_UC(); continue; } fdi = 0; while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) { int k; for (k = fdi; k > 0; k--) fdc[k] = fdc[k-1]; fdc[0].ccc = CCC(cp2); fdc[0].uc = cp2; fdi++; REPLACE_UC_WITH(cp1); } /* Read following code points. */ while ((n2 = parse(&uc2, s, len)) > 0 && (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) { int j, k; s += n2; len -= n2; for (j = 0; j < fdi; j++) { if (fdc[j].ccc > ccc) break; } if (j < fdi) { for (k = fdi; k > j; k--) fdc[k] = fdc[k-1]; fdc[j].ccc = ccc; fdc[j].uc = uc2; } else { fdc[fdi].ccc = ccc; fdc[fdi].uc = uc2; } fdi++; } WRITE_UC(); for (fdj = 0; fdj < fdi; fdj++) { REPLACE_UC_WITH(fdc[fdj].uc); WRITE_UC(); } if (n2 == 0) break; REPLACE_UC_WITH(uc2); n = n2; goto check_first_code; } as->length = p - as->s; as->s[as->length] = '\0'; if (ts == 2) as->s[as->length+1] = '\0'; return (ret); } /* * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption * that WCS is Unicode. It is true for several platforms but some are false. * And then people who did not use UTF-8 locale on the non Unicode WCS * platform and made a tar file with libarchive(mostly bsdtar) 2.x. Those * now cannot get right filename from libarchive 3.x and later since we * fixed the wrong assumption and it is incompatible to older its versions. * So we provide special option, "compat-2x.x", for resolving it. * That option enable the string conversion of libarchive 2.x. * * Translates the wrong UTF-8 string made by libarchive 2.x into current * locale character set and appends to the archive_string. * Note: returns -1 if conversion fails. */ static int strncat_from_utf8_libarchive2(struct archive_string *as, const void *_p, size_t len, struct archive_string_conv *sc) { const char *s; int n; char *p; char *end; uint32_t unicode; #if HAVE_WCRTOMB mbstate_t shift_state; memset(&shift_state, 0, sizeof(shift_state)); #else /* Clear the shift state before starting. */ wctomb(NULL, L'\0'); #endif (void)sc; /* UNUSED */ /* * Allocate buffer for MBS. * We need this allocation here since it is possible that * as->s is still NULL. */ if (archive_string_ensure(as, as->length + len + 1) == NULL) return (-1); s = (const char *)_p; p = as->s + as->length; end = as->s + as->buffer_length - MB_CUR_MAX -1; while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { wchar_t wc; if (p >= end) { as->length = p - as->s; /* Re-allocate buffer for MBS. */ if (archive_string_ensure(as, as->length + max(len * 2, (size_t)MB_CUR_MAX) + 1) == NULL) return (-1); p = as->s + as->length; end = as->s + as->buffer_length - MB_CUR_MAX -1; } /* * As libarchive 2.x, translates the UTF-8 characters into * wide-characters in the assumption that WCS is Unicode. */ if (n < 0) { n *= -1; wc = L'?'; } else wc = (wchar_t)unicode; s += n; len -= n; /* * Translates the wide-character into the current locale MBS. */ #if HAVE_WCRTOMB n = (int)wcrtomb(p, wc, &shift_state); #else n = (int)wctomb(p, wc); #endif if (n == -1) return (-1); p += n; } as->length = p - as->s; as->s[as->length] = '\0'; return (0); } /* * Conversion functions between current locale dependent MBS and UTF-16BE. * strncat_from_utf16be() : UTF-16BE --> MBS * strncat_to_utf16be() : MBS --> UTF16BE */ #if defined(_WIN32) && !defined(__CYGWIN__) /* * Convert a UTF-16BE/LE string to current locale and copy the result. * Return -1 if conversion fails. */ static int win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc, int be) { struct archive_string tmp; const char *u16; int ll; BOOL defchar; char *mbs; size_t mbs_size, b; int ret = 0; bytes &= ~1; if (archive_string_ensure(as, as->length + bytes +1) == NULL) return (-1); mbs = as->s + as->length; mbs_size = as->buffer_length - as->length -1; if (sc->to_cp == CP_C_LOCALE) { /* * "C" locale special process. */ u16 = _p; ll = 0; for (b = 0; b < bytes; b += 2) { uint16_t val; if (be) val = archive_be16dec(u16+b); else val = archive_le16dec(u16+b); if (val > 255) { *mbs++ = '?'; ret = -1; } else *mbs++ = (char)(val&0xff); ll++; } as->length += ll; as->s[as->length] = '\0'; return (ret); } archive_string_init(&tmp); if (be) { if (is_big_endian()) { u16 = _p; } else { if (archive_string_ensure(&tmp, bytes+2) == NULL) return (-1); memcpy(tmp.s, _p, bytes); for (b = 0; b < bytes; b += 2) { uint16_t val = archive_be16dec(tmp.s+b); archive_le16enc(tmp.s+b, val); } u16 = tmp.s; } } else { if (!is_big_endian()) { u16 = _p; } else { if (archive_string_ensure(&tmp, bytes+2) == NULL) return (-1); memcpy(tmp.s, _p, bytes); for (b = 0; b < bytes; b += 2) { uint16_t val = archive_le16dec(tmp.s+b); archive_be16enc(tmp.s+b, val); } u16 = tmp.s; } } do { defchar = 0; ll = WideCharToMultiByte(sc->to_cp, 0, (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, NULL, &defchar); /* Exit loop if we succeeded */ if (ll != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { break; } /* Else expand buffer and loop to try again. */ ll = WideCharToMultiByte(sc->to_cp, 0, (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); if (archive_string_ensure(as, ll +1) == NULL) return (-1); mbs = as->s + as->length; mbs_size = as->buffer_length - as->length -1; } while (1); archive_string_free(&tmp); as->length += ll; as->s[as->length] = '\0'; if (ll == 0 || defchar) ret = -1; return (ret); } static int win_strncat_from_utf16be(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc) { return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); } static int win_strncat_from_utf16le(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc) { return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); } static int is_big_endian(void) { uint16_t d = 1; return (archive_be16dec(&d) == 1); } /* * Convert a current locale string to UTF-16BE/LE and copy the result. * Return -1 if conversion fails. */ static int win_strncat_to_utf16(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc, int bigendian) { const char *s = (const char *)_p; char *u16; size_t count, avail; if (archive_string_ensure(as16, as16->length + (length + 1) * 2) == NULL) return (-1); u16 = as16->s + as16->length; avail = as16->buffer_length - 2; if (sc->from_cp == CP_C_LOCALE) { /* * "C" locale special process. */ count = 0; while (count < length && *s) { if (bigendian) archive_be16enc(u16, *s); else archive_le16enc(u16, *s); u16 += 2; s++; count++; } as16->length += count << 1; as16->s[as16->length] = 0; as16->s[as16->length+1] = 0; return (0); } do { count = MultiByteToWideChar(sc->from_cp, MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); /* Exit loop if we succeeded */ if (count != 0 || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { break; } /* Expand buffer and try again */ count = MultiByteToWideChar(sc->from_cp, MB_PRECOMPOSED, s, (int)length, NULL, 0); if (archive_string_ensure(as16, (count +1) * 2) == NULL) return (-1); u16 = as16->s + as16->length; avail = as16->buffer_length - 2; } while (1); as16->length += count * 2; as16->s[as16->length] = 0; as16->s[as16->length+1] = 0; if (count == 0) return (-1); if (is_big_endian()) { if (!bigendian) { while (count > 0) { uint16_t v = archive_be16dec(u16); archive_le16enc(u16, v); u16 += 2; count--; } } } else { if (bigendian) { while (count > 0) { uint16_t v = archive_le16dec(u16); archive_be16enc(u16, v); u16 += 2; count--; } } } return (0); } static int win_strncat_to_utf16be(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc) { return (win_strncat_to_utf16(as16, _p, length, sc, 1)); } static int win_strncat_to_utf16le(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc) { return (win_strncat_to_utf16(as16, _p, length, sc, 0)); } #endif /* _WIN32 && !__CYGWIN__ */ /* * Do the best effort for conversions. * We cannot handle UTF-16BE character-set without such iconv, * but there is a chance if a string consists just ASCII code or * a current locale is UTF-8. */ /* * Convert a UTF-16BE string to current locale and copy the result. * Return -1 if conversion fails. */ static int best_effort_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc, int be) { const char *utf16 = (const char *)_p; char *mbs; uint32_t uc; int n, ret; (void)sc; /* UNUSED */ /* * Other case, we should do the best effort. * If all character are ASCII(<0x7f), we can convert it. * if not , we set a alternative character and return -1. */ ret = 0; if (archive_string_ensure(as, as->length + bytes +1) == NULL) return (-1); mbs = as->s + as->length; while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { if (n < 0) { n *= -1; ret = -1; } bytes -= n; utf16 += n; if (uc > 127) { /* We cannot handle it. */ *mbs++ = '?'; ret = -1; } else *mbs++ = (char)uc; } as->length = mbs - as->s; as->s[as->length] = '\0'; return (ret); } static int best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc) { return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); } static int best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, size_t bytes, struct archive_string_conv *sc) { return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); } /* * Convert a current locale string to UTF-16BE/LE and copy the result. * Return -1 if conversion fails. */ static int best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc, int bigendian) { const char *s = (const char *)_p; char *utf16; size_t remaining; int ret; (void)sc; /* UNUSED */ /* * Other case, we should do the best effort. * If all character are ASCII(<0x7f), we can convert it. * if not , we set a alternative character and return -1. */ ret = 0; remaining = length; if (archive_string_ensure(as16, as16->length + (length + 1) * 2) == NULL) return (-1); utf16 = as16->s + as16->length; while (remaining--) { unsigned c = *s++; if (c > 127) { /* We cannot handle it. */ c = UNICODE_R_CHAR; ret = -1; } if (bigendian) archive_be16enc(utf16, c); else archive_le16enc(utf16, c); utf16 += 2; } as16->length = utf16 - as16->s; as16->s[as16->length] = 0; as16->s[as16->length+1] = 0; return (ret); } static int best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc) { return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); } static int best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, size_t length, struct archive_string_conv *sc) { return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); } /* * Multistring operations. */ void archive_mstring_clean(struct archive_mstring *aes) { archive_wstring_free(&(aes->aes_wcs)); archive_string_free(&(aes->aes_mbs)); archive_string_free(&(aes->aes_utf8)); archive_string_free(&(aes->aes_mbs_in_locale)); aes->aes_set = 0; } void archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) { dest->aes_set = src->aes_set; archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); } int archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, const char **p) { struct archive_string_conv *sc; int r; /* If we already have a UTF8 form, return that immediately. */ if (aes->aes_set & AES_SET_UTF8) { *p = aes->aes_utf8.s; return (0); } *p = NULL; /* Try converting WCS to MBS first if MBS does not exist yet. */ if ((aes->aes_set & AES_SET_MBS) == 0) { const char *pm; /* unused */ archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ } if (aes->aes_set & AES_SET_MBS) { sc = archive_string_conversion_to_charset(a, "UTF-8", 1); if (sc == NULL) return (-1);/* Couldn't allocate memory for sc. */ r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s, aes->aes_mbs.length, sc); if (a == NULL) free_sconv_object(sc); if (r == 0) { aes->aes_set |= AES_SET_UTF8; *p = aes->aes_utf8.s; return (0);/* success. */ } else return (-1);/* failure. */ } return (0);/* success. */ } int archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, const char **p) { struct archive_string_conv *sc; int r, ret = 0; /* If we already have an MBS form, return that immediately. */ if (aes->aes_set & AES_SET_MBS) { *p = aes->aes_mbs.s; return (ret); } *p = NULL; /* If there's a WCS form, try converting with the native locale. */ if (aes->aes_set & AES_SET_WCS) { archive_string_empty(&(aes->aes_mbs)); r = archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s, aes->aes_wcs.length); *p = aes->aes_mbs.s; if (r == 0) { aes->aes_set |= AES_SET_MBS; return (ret); } else ret = -1; } /* If there's a UTF-8 form, try converting with the native locale. */ if (aes->aes_set & AES_SET_UTF8) { archive_string_empty(&(aes->aes_mbs)); sc = archive_string_conversion_from_charset(a, "UTF-8", 1); if (sc == NULL) return (-1);/* Couldn't allocate memory for sc. */ r = archive_strncpy_l(&(aes->aes_mbs), aes->aes_utf8.s, aes->aes_utf8.length, sc); if (a == NULL) free_sconv_object(sc); *p = aes->aes_mbs.s; if (r == 0) { aes->aes_set |= AES_SET_MBS; ret = 0;/* success; overwrite previous error. */ } else ret = -1;/* failure. */ } return (ret); } int archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, const wchar_t **wp) { int r, ret = 0; (void)a;/* UNUSED */ /* Return WCS form if we already have it. */ if (aes->aes_set & AES_SET_WCS) { *wp = aes->aes_wcs.s; return (ret); } *wp = NULL; /* Try converting UTF8 to MBS first if MBS does not exist yet. */ if ((aes->aes_set & AES_SET_MBS) == 0) { const char *p; /* unused */ archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */ } /* Try converting MBS to WCS using native locale. */ if (aes->aes_set & AES_SET_MBS) { archive_wstring_empty(&(aes->aes_wcs)); r = archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, aes->aes_mbs.length); if (r == 0) { aes->aes_set |= AES_SET_WCS; *wp = aes->aes_wcs.s; } else ret = -1;/* failure. */ } return (ret); } int archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes, const char **p, size_t *length, struct archive_string_conv *sc) { int ret = 0; #if defined(_WIN32) && !defined(__CYGWIN__) int r; /* * Internationalization programming on Windows must use Wide * characters because Windows platform cannot make locale UTF-8. */ if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { archive_string_empty(&(aes->aes_mbs_in_locale)); r = archive_string_append_from_wcs_in_codepage( &(aes->aes_mbs_in_locale), aes->aes_wcs.s, aes->aes_wcs.length, sc); if (r == 0) { *p = aes->aes_mbs_in_locale.s; if (length != NULL) *length = aes->aes_mbs_in_locale.length; return (0); } else if (errno == ENOMEM) return (-1); else ret = -1; } #endif /* If there is not an MBS form but there is a WCS or UTF8 form, try converting * with the native locale to be used for translating it to specified * character-set. */ if ((aes->aes_set & AES_SET_MBS) == 0) { const char *pm; /* unused */ archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ } /* If we already have an MBS form, use it to be translated to * specified character-set. */ if (aes->aes_set & AES_SET_MBS) { if (sc == NULL) { /* Conversion is unneeded. */ *p = aes->aes_mbs.s; if (length != NULL) *length = aes->aes_mbs.length; return (0); } ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), aes->aes_mbs.s, aes->aes_mbs.length, sc); *p = aes->aes_mbs_in_locale.s; if (length != NULL) *length = aes->aes_mbs_in_locale.length; } else { *p = NULL; if (length != NULL) *length = 0; } return (ret); } int archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) { if (mbs == NULL) { aes->aes_set = 0; return (0); } return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); } int archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, size_t len) { if (mbs == NULL) { aes->aes_set = 0; return (0); } aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ archive_strncpy(&(aes->aes_mbs), mbs, len); archive_string_empty(&(aes->aes_utf8)); archive_wstring_empty(&(aes->aes_wcs)); return (0); } int archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs) { return archive_mstring_copy_wcs_len(aes, wcs, wcs == NULL ? 0 : wcslen(wcs)); } int archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8) { if (utf8 == NULL) { aes->aes_set = 0; return (0); } aes->aes_set = AES_SET_UTF8; archive_string_empty(&(aes->aes_mbs)); archive_string_empty(&(aes->aes_wcs)); archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8)); return (int)strlen(utf8); } int archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, size_t len) { if (wcs == NULL) { aes->aes_set = 0; return (0); } aes->aes_set = AES_SET_WCS; /* Only WCS form set. */ archive_string_empty(&(aes->aes_mbs)); archive_string_empty(&(aes->aes_utf8)); archive_wstrncpy(&(aes->aes_wcs), wcs, len); return (0); } int archive_mstring_copy_mbs_len_l(struct archive_mstring *aes, const char *mbs, size_t len, struct archive_string_conv *sc) { int r; if (mbs == NULL) { aes->aes_set = 0; return (0); } archive_string_empty(&(aes->aes_mbs)); archive_wstring_empty(&(aes->aes_wcs)); archive_string_empty(&(aes->aes_utf8)); #if defined(_WIN32) && !defined(__CYGWIN__) /* * Internationalization programming on Windows must use Wide * characters because Windows platform cannot make locale UTF-8. */ if (sc == NULL) { if (archive_string_append(&(aes->aes_mbs), mbs, mbsnbytes(mbs, len)) == NULL) { aes->aes_set = 0; r = -1; } else { aes->aes_set = AES_SET_MBS; r = 0; } #if defined(HAVE_ICONV) } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { /* * This case happens only when MultiByteToWideChar() cannot * handle sc->from_cp, and we have to iconv in order to * translate character-set to wchar_t,UTF-16. */ iconv_t cd = sc->cd; unsigned from_cp; int flag; /* * Translate multi-bytes from some character-set to UTF-8. */ sc->cd = sc->cd_w; r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); sc->cd = cd; if (r != 0) { aes->aes_set = 0; return (r); } aes->aes_set = AES_SET_UTF8; /* * Append the UTF-8 string into wstring. */ flag = sc->flag; sc->flag &= ~(SCONV_NORMALIZATION_C | SCONV_TO_UTF16| SCONV_FROM_UTF16); from_cp = sc->from_cp; sc->from_cp = CP_UTF8; r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), aes->aes_utf8.s, aes->aes_utf8.length, sc); sc->flag = flag; sc->from_cp = from_cp; if (r == 0) aes->aes_set |= AES_SET_WCS; #endif } else { r = archive_wstring_append_from_mbs_in_codepage( &(aes->aes_wcs), mbs, len, sc); if (r == 0) aes->aes_set = AES_SET_WCS; else aes->aes_set = 0; } #else r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); if (r == 0) aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ else aes->aes_set = 0; #endif return (r); } /* * The 'update' form tries to proactively update all forms of * this string (WCS and MBS) and returns an error if any of * them fail. This is used by the 'pax' handler, for instance, * to detect and report character-conversion failures early while * still allowing clients to get potentially useful values from * the more tolerant lazy conversions. (get_mbs and get_wcs will * strive to give the user something useful, so you can get hopefully * usable values even if some of the character conversions are failing.) */ int archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, const char *utf8) { struct archive_string_conv *sc; int r; if (utf8 == NULL) { aes->aes_set = 0; return (0); /* Succeeded in clearing everything. */ } /* Save the UTF8 string. */ archive_strcpy(&(aes->aes_utf8), utf8); /* Empty the mbs and wcs strings. */ archive_string_empty(&(aes->aes_mbs)); archive_wstring_empty(&(aes->aes_wcs)); aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */ /* Try converting UTF-8 to MBS, return false on failure. */ sc = archive_string_conversion_from_charset(a, "UTF-8", 1); if (sc == NULL) return (-1);/* Couldn't allocate memory for sc. */ r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); #if defined(_WIN32) && !defined(__CYGWIN__) /* On failure, make an effort to convert UTF8 to WCS as the active code page * may not be able to represent all characters in the string */ if (r != 0) { if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0) aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; } #endif if (a == NULL) free_sconv_object(sc); if (r != 0) return (-1); aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */ /* Try converting MBS to WCS, return false on failure. */ if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, aes->aes_mbs.length)) return (-1); aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; /* All conversions succeeded. */ return (0); } diff --git a/contrib/libarchive/unzip/test/test_I.c b/contrib/libarchive/unzip/test/test_I.c index 5d31ce8d1611..d189edca1a5c 100644 --- a/contrib/libarchive/unzip/test/test_I.c +++ b/contrib/libarchive/unzip/test/test_I.c @@ -1,54 +1,62 @@ /* * Copyright (c) 2023 Aaron Lindros * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "test.h" #ifdef HAVE_LOCALE_H #include #endif /* Test I arg - file name encoding */ DEFINE_TEST(test_I) { const char *reffile = "test_I.zip"; + const char *lang; int r; #if HAVE_SETLOCALE if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) { skipping("en_US.UTF-8 locale not available on this system."); return; } #else skipping("setlocale() not available on this system."); #endif + lang = getenv("LANG"); + setenv("LANG", "en_US.UTF-8", 1); extract_reference_file(reffile); r = systemf("%s -I UTF-8 %s >test.out 2>test.err", testprog, reffile); assertEqualInt(0, r); assertNonEmptyFile("test.out"); assertEmptyFile("test.err"); assertTextFileContents("Hello, World!\n", "Γειά σου Κόσμε.txt"); + + if (lang == NULL) + unsetenv("LANG"); + else + setenv("LANG", lang, 1); }