Index: head/usr.bin/sort/coll.c =================================================================== --- head/usr.bin/sort/coll.c (revision 281122) +++ head/usr.bin/sort/coll.c (revision 281123) @@ -1,1302 +1,1302 @@ /*- * Copyright (C) 2009 Gabor Kovesdan * Copyright (C) 2012 Oleg Moskalenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include "coll.h" #include "vsort.h" struct key_specs *keys; size_t keys_num = 0; wint_t symbol_decimal_point = L'.'; /* there is no default thousands separator in collate rules: */ wint_t symbol_thousands_sep = 0; wint_t symbol_negative_sign = L'-'; wint_t symbol_positive_sign = L'+'; static int wstrcoll(struct key_value *kv1, struct key_value *kv2, size_t offset); static int gnumcoll(struct key_value*, struct key_value *, size_t offset); static int monthcoll(struct key_value*, struct key_value *, size_t offset); static int numcoll(struct key_value*, struct key_value *, size_t offset); static int hnumcoll(struct key_value*, struct key_value *, size_t offset); static int randomcoll(struct key_value*, struct key_value *, size_t offset); static int versioncoll(struct key_value*, struct key_value *, size_t offset); /* * Allocate keys array */ struct keys_array * keys_array_alloc(void) { struct keys_array *ka; size_t sz; sz = keys_array_size(); ka = sort_malloc(sz); memset(ka, 0, sz); return (ka); } /* - * Calculate whether we need key hint space + * Calculate whether we need key hint space */ static size_t key_hint_size(void) { return (need_hint ? sizeof(struct key_hint) : 0); } /* * Calculate keys array size */ size_t keys_array_size(void) { return (keys_num * (sizeof(struct key_value) + key_hint_size())); } /* * Clean data of keys array */ void clean_keys_array(const struct bwstring *s, struct keys_array *ka) { if (ka) { for (size_t i = 0; i < keys_num; ++i) if (ka->key[i].k && ka->key[i].k != s) bwsfree(ka->key[i].k); memset(ka, 0, keys_array_size()); } } /* * Set value of a key in the keys set */ void set_key_on_keys_array(struct keys_array *ka, struct bwstring *s, size_t ind) { if (ka && keys_num > ind) { struct key_value *kv; kv = &(ka->key[ind]); if (kv->k && kv->k != s) bwsfree(kv->k); kv->k = s; } } /* * Initialize a sort list item */ struct sort_list_item * sort_list_item_alloc(void) { struct sort_list_item *si; size_t sz; sz = sizeof(struct sort_list_item) + keys_array_size(); si = sort_malloc(sz); memset(si, 0, sz); return (si); } size_t sort_list_item_size(struct sort_list_item *si) { size_t ret = 0; if (si) { ret = sizeof(struct sort_list_item) + keys_array_size(); if (si->str) ret += bws_memsize(si->str); for (size_t i = 0; i < keys_num; ++i) { struct key_value *kv; kv = &(si->ka.key[i]); if (kv->k != si->str) ret += bws_memsize(kv->k); } } return (ret); } /* * Calculate key for a sort list item */ static void sort_list_item_make_key(struct sort_list_item *si) { preproc(si->str, &(si->ka)); } /* * Set value of a sort list item. * Return combined string and keys memory size. */ void sort_list_item_set(struct sort_list_item *si, struct bwstring *str) { if (si) { clean_keys_array(si->str, &(si->ka)); if (si->str) { if (si->str == str) { /* we are trying to reset the same string */ return; } else { bwsfree(si->str); si->str = NULL; } } si->str = str; sort_list_item_make_key(si); } } /* * De-allocate a sort list item object memory */ void sort_list_item_clean(struct sort_list_item *si) { if (si) { clean_keys_array(si->str, &(si->ka)); if (si->str) { bwsfree(si->str); si->str = NULL; } } } /* * Skip columns according to specs */ static size_t skip_cols_to_start(const struct bwstring *s, size_t cols, size_t start, bool skip_blanks, bool *empty_key) { if (cols < 1) return (BWSLEN(s) + 1); if (skip_blanks) while (start < BWSLEN(s) && iswblank(BWS_GET(s,start))) ++start; while (start < BWSLEN(s) && cols > 1) { --cols; ++start; } if (start >= BWSLEN(s)) *empty_key = true; return (start); } /* * Skip fields according to specs */ static size_t skip_fields_to_start(const struct bwstring *s, size_t fields, bool *empty_field) { if (fields < 2) { if (BWSLEN(s) == 0) *empty_field = true; return (0); } else if (!(sort_opts_vals.tflag)) { size_t cpos = 0; bool pb = true; while (cpos < BWSLEN(s)) { bool isblank; isblank = iswblank(BWS_GET(s, cpos)); if (isblank && !pb) { --fields; if (fields <= 1) return (cpos); } pb = isblank; ++cpos; } if (fields > 1) *empty_field = true; return (cpos); } else { size_t cpos = 0; while (cpos < BWSLEN(s)) { if (BWS_GET(s,cpos) == (wchar_t)sort_opts_vals.field_sep) { --fields; if (fields <= 1) return (cpos + 1); } ++cpos; } if (fields > 1) *empty_field = true; return (cpos); } } /* * Find fields start */ static void find_field_start(const struct bwstring *s, struct key_specs *ks, size_t *field_start, size_t *key_start, bool *empty_field, bool *empty_key) { *field_start = skip_fields_to_start(s, ks->f1, empty_field); if (!*empty_field) *key_start = skip_cols_to_start(s, ks->c1, *field_start, ks->pos1b, empty_key); else *empty_key = true; } /* * Find end key position */ static size_t find_field_end(const struct bwstring *s, struct key_specs *ks) { size_t f2, next_field_start, pos_end; bool empty_field, empty_key; pos_end = 0; next_field_start = 0; empty_field = false; empty_key = false; f2 = ks->f2; if (f2 == 0) return (BWSLEN(s) + 1); else { if (ks->c2 == 0) { next_field_start = skip_fields_to_start(s, f2 + 1, &empty_field); if ((next_field_start > 0) && sort_opts_vals.tflag && ((wchar_t)sort_opts_vals.field_sep == BWS_GET(s, next_field_start - 1))) --next_field_start; } else next_field_start = skip_fields_to_start(s, f2, &empty_field); } if (empty_field || (next_field_start >= BWSLEN(s))) return (BWSLEN(s) + 1); if (ks->c2) { pos_end = skip_cols_to_start(s, ks->c2, next_field_start, ks->pos2b, &empty_key); if (pos_end < BWSLEN(s)) ++pos_end; } else pos_end = next_field_start; return (pos_end); } /* * Cut a field according to the key specs */ static struct bwstring * cut_field(const struct bwstring *s, struct key_specs *ks) { struct bwstring *ret = NULL; if (s && ks) { size_t field_start, key_end, key_start, sz; bool empty_field, empty_key; field_start = 0; key_start = 0; empty_field = false; empty_key = false; find_field_start(s, ks, &field_start, &key_start, &empty_field, &empty_key); if (empty_key) sz = 0; else { key_end = find_field_end(s, ks); sz = (key_end < key_start) ? 0 : (key_end - key_start); } ret = bwsalloc(sz); if (sz) bwsnocpy(ret, s, key_start, sz); } else ret = bwsalloc(0); return (ret); } /* * Preprocesses a line applying the necessary transformations * specified by command line options and returns the preprocessed * string, which can be used to compare. */ int preproc(struct bwstring *s, struct keys_array *ka) { if (sort_opts_vals.kflag) for (size_t i = 0; i < keys_num; i++) { struct bwstring *key; struct key_specs *kspecs; struct sort_mods *sm; kspecs = &(keys[i]); key = cut_field(s, kspecs); sm = &(kspecs->sm); if (sm->dflag) key = dictionary_order(key); else if (sm->iflag) key = ignore_nonprinting(key); if (sm->fflag || sm->Mflag) key = ignore_case(key); set_key_on_keys_array(ka, key, i); } else { struct bwstring *ret = NULL; struct sort_mods *sm = default_sort_mods; if (sm->bflag) { if (ret == NULL) ret = bwsdup(s); ret = ignore_leading_blanks(ret); } if (sm->dflag) { if (ret == NULL) ret = bwsdup(s); ret = dictionary_order(ret); } else if (sm->iflag) { if (ret == NULL) ret = bwsdup(s); ret = ignore_nonprinting(ret); } if (sm->fflag || sm->Mflag) { if (ret == NULL) ret = bwsdup(s); ret = ignore_case(ret); } if (ret == NULL) set_key_on_keys_array(ka, s, 0); else set_key_on_keys_array(ka, ret, 0); } return 0; } cmpcoll_t get_sort_func(struct sort_mods *sm) { if (sm->nflag) return (numcoll); else if (sm->hflag) return (hnumcoll); else if (sm->gflag) return (gnumcoll); else if (sm->Mflag) return (monthcoll); else if (sm->Rflag) return (randomcoll); else if (sm->Vflag) return (versioncoll); else return (wstrcoll); } /* * Compares the given strings. Returns a positive number if * the first precedes the second, a negative number if the second is * the preceding one, and zero if they are equal. This function calls * the underlying collate functions, which done the actual comparison. */ int key_coll(struct keys_array *ps1, struct keys_array *ps2, size_t offset) { struct sort_mods *sm; int res = 0; for (size_t i = 0; i < keys_num; ++i) { sm = &(keys[i].sm); if (sm->rflag) res = sm->func(&(ps2->key[i]), &(ps1->key[i]), offset); else res = sm->func(&(ps1->key[i]), &(ps2->key[i]), offset); if (res) break; /* offset applies to only the first key */ offset = 0; } return (res); } /* * Compare two strings. * Plain symbol-by-symbol comparison. */ int top_level_str_coll(const struct bwstring *s1, const struct bwstring *s2) { if (default_sort_mods->rflag) { const struct bwstring *tmp; tmp = s1; s1 = s2; s2 = tmp; } return (bwscoll(s1, s2, 0)); } /* * Compare a string and a sort list item, according to the sort specs. */ int str_list_coll(struct bwstring *str1, struct sort_list_item **ss2) { struct keys_array *ka1; int ret = 0; ka1 = keys_array_alloc(); preproc(str1, ka1); sort_list_item_make_key(*ss2); if (debug_sort) { bwsprintf(stdout, str1, "; s1=<", ">"); bwsprintf(stdout, (*ss2)->str, ", s2=<", ">"); } ret = key_coll(ka1, &((*ss2)->ka), 0); if (debug_sort) printf("; cmp1=%d", ret); clean_keys_array(str1, ka1); sort_free(ka1); if ((ret == 0) && !(sort_opts_vals.sflag) && sort_opts_vals.complex_sort) { ret = top_level_str_coll(str1, ((*ss2)->str)); if (debug_sort) printf("; cmp2=%d", ret); } if (debug_sort) printf("\n"); return (ret); } /* * Compare two sort list items, according to the sort specs. */ int list_coll_offset(struct sort_list_item **ss1, struct sort_list_item **ss2, size_t offset) { int ret; ret = key_coll(&((*ss1)->ka), &((*ss2)->ka), offset); if (debug_sort) { if (offset) printf("; offset=%d", (int) offset); bwsprintf(stdout, ((*ss1)->str), "; s1=<", ">"); bwsprintf(stdout, ((*ss2)->str), ", s2=<", ">"); printf("; cmp1=%d\n", ret); } if (ret) return (ret); if (!(sort_opts_vals.sflag) && sort_opts_vals.complex_sort) { ret = top_level_str_coll(((*ss1)->str), ((*ss2)->str)); if (debug_sort) printf("; cmp2=%d\n", ret); } return (ret); } /* * Compare two sort list items, according to the sort specs. */ int list_coll(struct sort_list_item **ss1, struct sort_list_item **ss2) { return (list_coll_offset(ss1, ss2, 0)); } -#define LSCDEF(N) \ -static int \ -list_coll_##N(struct sort_list_item **ss1, struct sort_list_item **ss2) \ -{ \ - \ - return (list_coll_offset(ss1, ss2, N)); \ +#define LSCDEF(N) \ +static int \ +list_coll_##N(struct sort_list_item **ss1, struct sort_list_item **ss2) \ +{ \ + \ + return (list_coll_offset(ss1, ss2, N)); \ } LSCDEF(1) LSCDEF(2) LSCDEF(3) LSCDEF(4) LSCDEF(5) LSCDEF(6) LSCDEF(7) LSCDEF(8) LSCDEF(9) LSCDEF(10) LSCDEF(11) LSCDEF(12) LSCDEF(13) LSCDEF(14) LSCDEF(15) LSCDEF(16) LSCDEF(17) LSCDEF(18) LSCDEF(19) LSCDEF(20) listcoll_t get_list_call_func(size_t offset) { static const listcoll_t lsarray[] = { list_coll, list_coll_1, list_coll_2, list_coll_3, list_coll_4, list_coll_5, list_coll_6, list_coll_7, list_coll_8, list_coll_9, list_coll_10, list_coll_11, list_coll_12, list_coll_13, list_coll_14, list_coll_15, list_coll_16, list_coll_17, list_coll_18, list_coll_19, list_coll_20 }; if (offset <= 20) return (lsarray[offset]); return (list_coll); } /* * Compare two sort list items, only by their original string. */ int list_coll_by_str_only(struct sort_list_item **ss1, struct sort_list_item **ss2) { return (top_level_str_coll(((*ss1)->str), ((*ss2)->str))); } /* * Maximum size of a number in the string (before or after decimal point) */ #define MAX_NUM_SIZE (128) /* * Set suffix value */ static void setsuffix(wchar_t c, unsigned char *si) { switch (c){ case L'k': case L'K': *si = 1; break; case L'M': *si = 2; break; case L'G': *si = 3; break; case L'T': *si = 4; break; case L'P': *si = 5; break; case L'E': *si = 6; break; case L'Z': *si = 7; break; case L'Y': *si = 8; break; default: *si = 0; }; } /* * Read string s and parse the string into a fixed-decimal-point number. * sign equals -1 if the number is negative (explicit plus is not allowed, * according to GNU sort's "info sort". * The number part before decimal point is in the smain, after the decimal * point is in sfrac, tail is the pointer to the remainder of the string. */ static int read_number(struct bwstring *s0, int *sign, wchar_t *smain, size_t *main_len, wchar_t *sfrac, size_t *frac_len, unsigned char *si) { bwstring_iterator s; s = bws_begin(s0); /* always end the fraction with zero, even if we have no fraction */ sfrac[0] = 0; while (iswblank(bws_get_iter_value(s))) s = bws_iterator_inc(s, 1); if (bws_get_iter_value(s) == (wchar_t)symbol_negative_sign) { *sign = -1; s = bws_iterator_inc(s, 1); } // This is '0', not '\0', do not change this while (iswdigit(bws_get_iter_value(s)) && (bws_get_iter_value(s) == L'0')) s = bws_iterator_inc(s, 1); while (bws_get_iter_value(s) && *main_len < MAX_NUM_SIZE) { if (iswdigit(bws_get_iter_value(s))) { smain[*main_len] = bws_get_iter_value(s); s = bws_iterator_inc(s, 1); *main_len += 1; } else if (symbol_thousands_sep && (bws_get_iter_value(s) == (wchar_t)symbol_thousands_sep)) s = bws_iterator_inc(s, 1); else break; } smain[*main_len] = 0; if (bws_get_iter_value(s) == (wchar_t)symbol_decimal_point) { s = bws_iterator_inc(s, 1); while (iswdigit(bws_get_iter_value(s)) && *frac_len < MAX_NUM_SIZE) { sfrac[*frac_len] = bws_get_iter_value(s); s = bws_iterator_inc(s, 1); *frac_len += 1; } sfrac[*frac_len] = 0; while (*frac_len > 0 && sfrac[*frac_len - 1] == L'0') { --(*frac_len); sfrac[*frac_len] = L'\0'; } } setsuffix(bws_get_iter_value(s),si); if ((*main_len + *frac_len) == 0) *sign = 0; return (0); } /* * Implements string sort. */ static int wstrcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) { if (debug_sort) { if (offset) printf("; offset=%d\n", (int) offset); bwsprintf(stdout, kv1->k, "; k1=<", ">"); printf("(%zu)", BWSLEN(kv1->k)); bwsprintf(stdout, kv2->k, ", k2=<", ">"); printf("(%zu)", BWSLEN(kv2->k)); } return (bwscoll(kv1->k, kv2->k, offset)); } /* * Compare two suffixes */ static inline int cmpsuffix(unsigned char si1, unsigned char si2) { return ((char)si1 - (char)si2); } /* * Implements numeric sort for -n and -h. */ static int numcoll_impl(struct key_value *kv1, struct key_value *kv2, size_t offset __unused, bool use_suffix) { struct bwstring *s1, *s2; wchar_t sfrac1[MAX_NUM_SIZE + 1], sfrac2[MAX_NUM_SIZE + 1]; wchar_t smain1[MAX_NUM_SIZE + 1], smain2[MAX_NUM_SIZE + 1]; int cmp_res, sign1, sign2; size_t frac1, frac2, main1, main2; unsigned char SI1, SI2; bool e1, e2, key1_read, key2_read; s1 = kv1->k; s2 = kv2->k; sign1 = sign2 = 0; main1 = main2 = 0; frac1 = frac2 = 0; cmp_res = 0; key1_read = key2_read = false; if (debug_sort) { bwsprintf(stdout, s1, "; k1=<", ">"); bwsprintf(stdout, s2, ", k2=<", ">"); } if (s1 == s2) return (0); if (kv1->hint->status == HS_UNINITIALIZED) { /* read the number from the string */ read_number(s1, &sign1, smain1, &main1, sfrac1, &frac1, &SI1); key1_read = true; kv1->hint->v.nh.n1 = wcstoull(smain1, NULL, 10); if(main1 < 1 && frac1 < 1) kv1->hint->v.nh.empty=true; kv1->hint->v.nh.si = SI1; kv1->hint->status = (kv1->hint->v.nh.n1 != ULLONG_MAX) ? HS_INITIALIZED : HS_ERROR; kv1->hint->v.nh.neg = (sign1 < 0) ? true : false; } if (kv2->hint->status == HS_UNINITIALIZED) { /* read the number from the string */ read_number(s2, &sign2, smain2, &main2, sfrac2, &frac2,&SI2); key2_read = true; kv2->hint->v.nh.n1 = wcstoull(smain2, NULL, 10); if(main2 < 1 && frac2 < 1) kv2->hint->v.nh.empty=true; kv2->hint->v.nh.si = SI2; kv2->hint->status = (kv2->hint->v.nh.n1 != ULLONG_MAX) ? HS_INITIALIZED : HS_ERROR; kv2->hint->v.nh.neg = (sign2 < 0) ? true : false; } if (kv1->hint->status == HS_INITIALIZED && kv2->hint->status == HS_INITIALIZED) { unsigned long long n1, n2; bool neg1, neg2; e1 = kv1->hint->v.nh.empty; e2 = kv2->hint->v.nh.empty; if (e1 && e2) return (0); neg1 = kv1->hint->v.nh.neg; neg2 = kv2->hint->v.nh.neg; if (neg1 && !neg2) return (-1); if (neg2 && !neg1) return (+1); if (e1) return (neg2 ? +1 : -1); else if (e2) return (neg1 ? -1 : +1); if (use_suffix) { cmp_res = cmpsuffix(kv1->hint->v.nh.si, kv2->hint->v.nh.si); if (cmp_res) return (neg1 ? -cmp_res : cmp_res); } n1 = kv1->hint->v.nh.n1; n2 = kv2->hint->v.nh.n1; if (n1 < n2) return (neg1 ? +1 : -1); else if (n1 > n2) return (neg1 ? -1 : +1); } /* read the numbers from the strings */ if (!key1_read) read_number(s1, &sign1, smain1, &main1, sfrac1, &frac1, &SI1); if (!key2_read) read_number(s2, &sign2, smain2, &main2, sfrac2, &frac2, &SI2); e1 = ((main1 + frac1) == 0); e2 = ((main2 + frac2) == 0); if (e1 && e2) return (0); /* we know the result if the signs are different */ if (sign1 < 0 && sign2 >= 0) return (-1); if (sign1 >= 0 && sign2 < 0) return (+1); if (e1) return ((sign2 < 0) ? +1 : -1); else if (e2) return ((sign1 < 0) ? -1 : +1); if (use_suffix) { cmp_res = cmpsuffix(SI1, SI2); if (cmp_res) return ((sign1 < 0) ? -cmp_res : cmp_res); } /* if both numbers are empty assume that the strings are equal */ if (main1 < 1 && main2 < 1 && frac1 < 1 && frac2 < 1) return (0); /* * if the main part is of different size, we know the result * (because the leading zeros are removed) */ if (main1 < main2) cmp_res = -1; else if (main1 > main2) cmp_res = +1; /* if the sizes are equal then simple non-collate string compare gives the correct result */ else cmp_res = wcscmp(smain1, smain2); /* check fraction */ if (!cmp_res) cmp_res = wcscmp(sfrac1, sfrac2); if (!cmp_res) return (0); /* reverse result if the signs are negative */ if (sign1 < 0 && sign2 < 0) cmp_res = -cmp_res; return (cmp_res); } /* * Implements numeric sort (-n). */ static int numcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) { return (numcoll_impl(kv1, kv2, offset, false)); } /* * Implements 'human' numeric sort (-h). */ static int hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) { return (numcoll_impl(kv1, kv2, offset, true)); } /* * Implements random sort (-R). */ static int randomcoll(struct key_value *kv1, struct key_value *kv2, size_t offset __unused) { struct bwstring *s1, *s2; MD5_CTX ctx1, ctx2; char *b1, *b2; s1 = kv1->k; s2 = kv2->k; if (debug_sort) { bwsprintf(stdout, s1, "; k1=<", ">"); bwsprintf(stdout, s2, ", k2=<", ">"); } if (s1 == s2) return (0); memcpy(&ctx1,&md5_ctx,sizeof(MD5_CTX)); memcpy(&ctx2,&md5_ctx,sizeof(MD5_CTX)); MD5Update(&ctx1, bwsrawdata(s1), bwsrawlen(s1)); MD5Update(&ctx2, bwsrawdata(s2), bwsrawlen(s2)); b1 = MD5End(&ctx1, NULL); b2 = MD5End(&ctx2, NULL); if (b1 == NULL) { if (b2 == NULL) return (0); else { sort_free(b2); return (-1); } } else if (b2 == NULL) { sort_free(b1); return (+1); } else { int cmp_res; cmp_res = strcmp(b1,b2); sort_free(b1); sort_free(b2); if (!cmp_res) cmp_res = bwscoll(s1, s2, 0); return (cmp_res); } } /* * Implements version sort (-V). */ static int versioncoll(struct key_value *kv1, struct key_value *kv2, size_t offset __unused) { struct bwstring *s1, *s2; s1 = kv1->k; s2 = kv2->k; if (debug_sort) { bwsprintf(stdout, s1, "; k1=<", ">"); bwsprintf(stdout, s2, ", k2=<", ">"); } if (s1 == s2) return (0); return (vcmp(s1, s2)); } /* * Check for minus infinity */ static inline bool huge_minus(double d, int err1) { if (err1 == ERANGE) if (d == -HUGE_VAL || d == -HUGE_VALF || d == -HUGE_VALL) return (+1); return (0); } /* * Check for plus infinity */ static inline bool huge_plus(double d, int err1) { if (err1 == ERANGE) if (d == HUGE_VAL || d == HUGE_VALF || d == HUGE_VALL) return (+1); return (0); } /* * Check whether a function is a NAN */ static bool is_nan(double d) { return ((d == NAN) || (isnan(d))); } /* * Compare two NANs */ static int cmp_nans(double d1, double d2) { if (d1 < d2) return (-1); if (d2 > d2) return (+1); return (0); } /* * Implements general numeric sort (-g). */ static int gnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset __unused) { double d1, d2; int err1, err2; bool empty1, empty2, key1_read, key2_read; d1 = d2 = 0; err1 = err2 = 0; key1_read = key2_read = false; if (debug_sort) { bwsprintf(stdout, kv1->k, "; k1=<", ">"); bwsprintf(stdout, kv2->k, "; k2=<", ">"); } if (kv1->hint->status == HS_UNINITIALIZED) { errno = 0; d1 = bwstod(kv1->k, &empty1); err1 = errno; if (empty1) kv1->hint->v.gh.notnum = true; else if (err1 == 0) { kv1->hint->v.gh.d = d1; kv1->hint->v.gh.nan = is_nan(d1); kv1->hint->status = HS_INITIALIZED; } else kv1->hint->status = HS_ERROR; key1_read = true; } if (kv2->hint->status == HS_UNINITIALIZED) { errno = 0; d2 = bwstod(kv2->k, &empty2); err2 = errno; if (empty2) kv2->hint->v.gh.notnum = true; else if (err2 == 0) { kv2->hint->v.gh.d = d2; kv2->hint->v.gh.nan = is_nan(d2); kv2->hint->status = HS_INITIALIZED; } else kv2->hint->status = HS_ERROR; key2_read = true; } if (kv1->hint->status == HS_INITIALIZED && kv2->hint->status == HS_INITIALIZED) { if (kv1->hint->v.gh.notnum) return ((kv2->hint->v.gh.notnum) ? 0 : -1); else if (kv2->hint->v.gh.notnum) return (+1); if (kv1->hint->v.gh.nan) return ((kv2->hint->v.gh.nan) ? cmp_nans(kv1->hint->v.gh.d, kv2->hint->v.gh.d) : -1); else if (kv2->hint->v.gh.nan) return (+1); d1 = kv1->hint->v.gh.d; d2 = kv2->hint->v.gh.d; if (d1 < d2) return (-1); else if (d1 > d2) return (+1); else return (0); } if (!key1_read) { errno = 0; d1 = bwstod(kv1->k, &empty1); err1 = errno; } if (!key2_read) { errno = 0; d2 = bwstod(kv2->k, &empty2); err2 = errno; } /* Non-value case: */ if (empty1) return (empty2 ? 0 : -1); else if (empty2) return (+1); /* NAN case */ if (is_nan(d1)) return (is_nan(d2) ? cmp_nans(d1, d2) : -1); else if (is_nan(d2)) return (+1); /* Infinities */ if (err1 == ERANGE || err2 == ERANGE) { /* Minus infinity case */ if (huge_minus(d1, err1)) { if (huge_minus(d2, err2)) { if (d1 < d2) return (-1); if (d1 > d2) return (+1); return (0); } else return (-1); } else if (huge_minus(d2, err2)) { if (huge_minus(d1, err1)) { if (d1 < d2) return (-1); if (d1 > d2) return (+1); return (0); } else return (+1); } /* Plus infinity case */ if (huge_plus(d1, err1)) { if (huge_plus(d2, err2)) { if (d1 < d2) return (-1); if (d1 > d2) return (+1); return (0); } else return (+1); } else if (huge_plus(d2, err2)) { if (huge_plus(d1, err1)) { if (d1 < d2) return (-1); if (d1 > d2) return (+1); return (0); } else return (-1); } } if (d1 < d2) return (-1); if (d1 > d2) return (+1); return (0); } /* * Implements month sort (-M). */ static int monthcoll(struct key_value *kv1, struct key_value *kv2, size_t offset __unused) { int val1, val2; bool key1_read, key2_read; val1 = val2 = 0; key1_read = key2_read = false; if (debug_sort) { bwsprintf(stdout, kv1->k, "; k1=<", ">"); bwsprintf(stdout, kv2->k, "; k2=<", ">"); } if (kv1->hint->status == HS_UNINITIALIZED) { kv1->hint->v.Mh.m = bws_month_score(kv1->k); key1_read = true; kv1->hint->status = HS_INITIALIZED; } if (kv2->hint->status == HS_UNINITIALIZED) { kv2->hint->v.Mh.m = bws_month_score(kv2->k); key2_read = true; kv2->hint->status = HS_INITIALIZED; } if (kv1->hint->status == HS_INITIALIZED) { val1 = kv1->hint->v.Mh.m; key1_read = true; } if (kv2->hint->status == HS_INITIALIZED) { val2 = kv2->hint->v.Mh.m; key2_read = true; } if (!key1_read) val1 = bws_month_score(kv1->k); if (!key2_read) val2 = bws_month_score(kv2->k); if (val1 == val2) { return (0); } if (val1 < val2) return (-1); return (+1); } Index: head/usr.bin/sort/file.c =================================================================== --- head/usr.bin/sort/file.c (revision 281122) +++ head/usr.bin/sort/file.c (revision 281123) @@ -1,1633 +1,1633 @@ /*- * Copyright (C) 2009 Gabor Kovesdan * Copyright (C) 2012 Oleg Moskalenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #if defined(SORT_THREADS) #include #endif #include #include #include #include #include #include #include #include "coll.h" #include "file.h" #include "radixsort.h" unsigned long long free_memory = 1000000; unsigned long long available_free_memory = 1000000; bool use_mmap; const char *tmpdir = "/var/tmp"; const char *compress_program; size_t max_open_files = 16; /* * How much space we read from file at once */ #define READ_CHUNK (4096) /* * File reader structure */ struct file_reader { struct reader_buffer rb; FILE *file; char *fname; unsigned char *buffer; unsigned char *mmapaddr; unsigned char *mmapptr; size_t bsz; size_t cbsz; size_t mmapsize; size_t strbeg; int fd; char elsymb; }; /* * Structure to be used in file merge process. */ struct file_header { struct file_reader *fr; struct sort_list_item *si; /* current top line */ size_t file_pos; }; /* * List elements of "cleanable" files list. */ struct CLEANABLE_FILE { char *fn; LIST_ENTRY(CLEANABLE_FILE) files; }; /* * List header of "cleanable" files list. */ static LIST_HEAD(CLEANABLE_FILES,CLEANABLE_FILE) tmp_files; /* * Semaphore to protect the tmp file list. * We use semaphore here because it is signal-safe, according to POSIX. * And semaphore does not require pthread library. */ static sem_t tmp_files_sem; static void mt_sort(struct sort_list *list, int (*sort_func)(void *, size_t, size_t, int (*)(const void *, const void *)), const char* fn); /* * Init tmp files list */ void init_tmp_files(void) { LIST_INIT(&tmp_files); sem_init(&tmp_files_sem, 0, 1); } /* * Save name of a tmp file for signal cleanup */ void tmp_file_atexit(const char *tmp_file) { if (tmp_file) { sem_wait(&tmp_files_sem); struct CLEANABLE_FILE *item = sort_malloc(sizeof(struct CLEANABLE_FILE)); item->fn = sort_strdup(tmp_file); LIST_INSERT_HEAD(&tmp_files, item, files); sem_post(&tmp_files_sem); } } /* * Clear tmp files */ void clear_tmp_files(void) { struct CLEANABLE_FILE *item; sem_wait(&tmp_files_sem); LIST_FOREACH(item,&tmp_files,files) { if ((item) && (item->fn)) unlink(item->fn); } sem_post(&tmp_files_sem); } /* * Check whether a file is a temporary file */ static bool file_is_tmp(const char* fn) { struct CLEANABLE_FILE *item; bool ret = false; if (fn) { sem_wait(&tmp_files_sem); LIST_FOREACH(item,&tmp_files,files) { if ((item) && (item->fn)) if (strcmp(item->fn, fn) == 0) { ret = true; break; } } sem_post(&tmp_files_sem); } return (ret); } /* * Read zero-terminated line from a file */ char * read_file0_line(struct file0_reader *f0r) { size_t pos = 0; int c; if ((f0r->f == NULL) || feof(f0r->f)) return (NULL); if (f0r->current_line && f0r->current_sz > 0) f0r->current_line[0] = 0; while (!feof(f0r->f)) { c = fgetc(f0r->f); if (feof(f0r->f) || (c == -1)) break; if ((pos + 1) >= f0r->current_sz) { size_t newsz = (f0r->current_sz + 2) * 2; f0r->current_line = sort_realloc(f0r->current_line, newsz); f0r->current_sz = newsz; } f0r->current_line[pos] = (char)c; if (c == 0) break; else f0r->current_line[pos + 1] = 0; ++pos; } return f0r->current_line; } /* * Generate new temporary file name */ char * new_tmp_file_name(void) { static size_t tfcounter = 0; static const char *fn = ".bsdsort."; char *ret; size_t sz; sz = strlen(tmpdir) + 1 + strlen(fn) + 32 + 1; ret = sort_malloc(sz); sprintf(ret, "%s/%s%d.%lu", tmpdir, fn, (int) getpid(), (unsigned long)(tfcounter++)); tmp_file_atexit(ret); return (ret); } /* * Initialize file list */ void file_list_init(struct file_list *fl, bool tmp) { if (fl) { fl->count = 0; fl->sz = 0; fl->fns = NULL; fl->tmp = tmp; } } /* * Add a file name to the list */ void file_list_add(struct file_list *fl, char *fn, bool allocate) { if (fl && fn) { if (fl->count >= fl->sz || (fl->fns == NULL)) { fl->sz = (fl->sz) * 2 + 1; fl->fns = sort_realloc(fl->fns, fl->sz * sizeof(char *)); } fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn; fl->count += 1; } } /* * Populate file list from array of file names */ void file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate) { if (fl && argv) { int i; for (i = 0; i < argc; i++) file_list_add(fl, argv[i], allocate); } } /* * Clean file list data and delete the files, * if this is a list of temporary files */ void file_list_clean(struct file_list *fl) { if (fl) { if (fl->fns) { size_t i; for (i = 0; i < fl->count; i++) { if (fl->fns[i]) { if (fl->tmp) unlink(fl->fns[i]); sort_free(fl->fns[i]); fl->fns[i] = 0; } } sort_free(fl->fns); fl->fns = NULL; } fl->sz = 0; fl->count = 0; fl->tmp = false; } } /* * Init sort list */ void sort_list_init(struct sort_list *l) { if (l) { l->count = 0; l->size = 0; l->memsize = sizeof(struct sort_list); l->list = NULL; } } /* * Add string to sort list */ void sort_list_add(struct sort_list *l, struct bwstring *str) { if (l && str) { size_t indx = l->count; if ((l->list == NULL) || (indx >= l->size)) { size_t newsize = (l->size + 1) + 1024; l->list = sort_realloc(l->list, sizeof(struct sort_list_item*) * newsize); l->memsize += (newsize - l->size) * sizeof(struct sort_list_item*); l->size = newsize; } l->list[indx] = sort_list_item_alloc(); sort_list_item_set(l->list[indx], str); l->memsize += sort_list_item_size(l->list[indx]); l->count += 1; } } /* * Clean sort list data */ void sort_list_clean(struct sort_list *l) { if (l) { if (l->list) { size_t i; for (i = 0; i < l->count; i++) { struct sort_list_item *item; item = l->list[i]; if (item) { sort_list_item_clean(item); sort_free(item); l->list[i] = NULL; } } sort_free(l->list); l->list = NULL; } l->count = 0; l->size = 0; l->memsize = sizeof(struct sort_list); } } /* * Write sort list to file */ void sort_list_dump(struct sort_list *l, const char *fn) { if (l && fn) { FILE *f; f = openfile(fn, "w"); if (f == NULL) err(2, NULL); if (l->list) { size_t i; if (!(sort_opts_vals.uflag)) { for (i = 0; i < l->count; ++i) bwsfwrite(l->list[i]->str, f, sort_opts_vals.zflag); } else { struct sort_list_item *last_printed_item = NULL; struct sort_list_item *item; for (i = 0; i < l->count; ++i) { item = l->list[i]; if ((last_printed_item == NULL) || list_coll(&last_printed_item, &item)) { bwsfwrite(item->str, f, sort_opts_vals.zflag); last_printed_item = item; } } } } closefile(f, fn); } } /* * Checks if the given file is sorted. Stops at the first disorder, * prints the disordered line and returns 1. */ int check(const char *fn) { struct bwstring *s1, *s2, *s1disorder, *s2disorder; struct file_reader *fr; struct keys_array *ka1, *ka2; int res; size_t pos, posdisorder; s1 = s2 = s1disorder = s2disorder = NULL; ka1 = ka2 = NULL; fr = file_reader_init(fn); res = 0; pos = 1; posdisorder = 1; if (fr == NULL) { err(2, NULL); goto end; } s1 = file_reader_readline(fr); if (s1 == NULL) goto end; ka1 = keys_array_alloc(); preproc(s1, ka1); s2 = file_reader_readline(fr); if (s2 == NULL) goto end; ka2 = keys_array_alloc(); preproc(s2, ka2); for (;;) { if (debug_sort) { bwsprintf(stdout, s2, "s1=<", ">"); bwsprintf(stdout, s1, "s2=<", ">"); } int cmp = key_coll(ka2, ka1, 0); if (debug_sort) printf("; cmp1=%d", cmp); if (!cmp && sort_opts_vals.complex_sort && !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) { cmp = top_level_str_coll(s2, s1); if (debug_sort) printf("; cmp2=%d", cmp); } if (debug_sort) printf("\n"); if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) { if (!(sort_opts_vals.csilentflag)) { s2disorder = bwsdup(s2); posdisorder = pos; if (debug_sort) s1disorder = bwsdup(s1); } res = 1; goto end; } pos++; clean_keys_array(s1, ka1); sort_free(ka1); ka1 = ka2; ka2 = NULL; bwsfree(s1); s1 = s2; s2 = file_reader_readline(fr); if (s2 == NULL) goto end; ka2 = keys_array_alloc(); preproc(s2, ka2); } end: if (ka1) { clean_keys_array(s1, ka1); sort_free(ka1); } if (s1) bwsfree(s1); if (ka2) { clean_keys_array(s2, ka2); sort_free(ka2); } if (s2) bwsfree(s2); if ((fn == NULL) || (*fn == 0) || (strcmp(fn, "-") == 0)) { for (;;) { s2 = file_reader_readline(fr); if (s2 == NULL) break; bwsfree(s2); } } file_reader_free(fr); if (s2disorder) { bws_disorder_warnx(s2disorder, fn, posdisorder); if (s1disorder) { bws_disorder_warnx(s1disorder, fn, posdisorder); if (s1disorder != s2disorder) bwsfree(s1disorder); } bwsfree(s2disorder); s1disorder = NULL; s2disorder = NULL; } if (res) exit(res); return (0); } /* * Opens a file. If the given filename is "-", stdout will be * opened. */ FILE * openfile(const char *fn, const char *mode) { FILE *file; if (strcmp(fn, "-") == 0) { return ((mode && mode[0] == 'r') ? stdin : stdout); } else { mode_t orig_file_mask = 0; int is_tmp = file_is_tmp(fn); if (is_tmp && (mode[0] == 'w')) orig_file_mask = umask(S_IWGRP | S_IWOTH | S_IRGRP | S_IROTH); if (is_tmp && (compress_program != NULL)) { char *cmd; size_t cmdsz; cmdsz = strlen(fn) + 128; cmd = sort_malloc(cmdsz); fflush(stdout); if (mode[0] == 'r') snprintf(cmd, cmdsz - 1, "cat %s | %s -d", fn, compress_program); else if (mode[0] == 'w') snprintf(cmd, cmdsz - 1, "%s > %s", compress_program, fn); else err(2, "%s", getstr(7)); if ((file = popen(cmd, mode)) == NULL) err(2, NULL); sort_free(cmd); } else if ((file = fopen(fn, mode)) == NULL) err(2, NULL); if (is_tmp && (mode[0] == 'w')) umask(orig_file_mask); } return (file); } /* * Close file */ void closefile(FILE *f, const char *fn) { if (f == NULL) { ; } else if (f == stdin) { ; } else if (f == stdout) { fflush(f); } else { if (file_is_tmp(fn) && compress_program != NULL) { if(pclose(f)<0) err(2,NULL); } else fclose(f); } } /* * Reads a file into the internal buffer. */ struct file_reader * file_reader_init(const char *fsrc) { struct file_reader *ret; if (fsrc == NULL) fsrc = "-"; ret = sort_malloc(sizeof(struct file_reader)); memset(ret, 0, sizeof(struct file_reader)); ret->elsymb = '\n'; if (sort_opts_vals.zflag) ret->elsymb = 0; ret->fname = sort_strdup(fsrc); if (strcmp(fsrc, "-") && (compress_program == NULL) && use_mmap) { do { struct stat stat_buf; void *addr; size_t sz = 0; int fd, flags; flags = MAP_NOCORE | MAP_NOSYNC; addr = MAP_FAILED; fd = open(fsrc, O_RDONLY); if (fd < 0) err(2, NULL); if (fstat(fd, &stat_buf) < 0) { close(fd); break; } sz = stat_buf.st_size; #if defined(MAP_PREFAULT_READ) flags |= MAP_PREFAULT_READ; #endif addr = mmap(NULL, sz, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { close(fd); break; } ret->fd = fd; ret->mmapaddr = addr; ret->mmapsize = sz; ret->mmapptr = ret->mmapaddr; } while (0); } if (ret->mmapaddr == NULL) { ret->file = openfile(fsrc, "r"); if (ret->file == NULL) err(2, NULL); if (strcmp(fsrc, "-")) { ret->cbsz = READ_CHUNK; ret->buffer = sort_malloc(ret->cbsz); ret->bsz = 0; ret->strbeg = 0; ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file); if (ret->bsz == 0) { if (ferror(ret->file)) err(2, NULL); } } } return (ret); } struct bwstring * file_reader_readline(struct file_reader *fr) { struct bwstring *ret = NULL; if (fr->mmapaddr) { unsigned char *mmapend; mmapend = fr->mmapaddr + fr->mmapsize; if (fr->mmapptr >= mmapend) return (NULL); else { unsigned char *strend; size_t sz; sz = mmapend - fr->mmapptr; strend = memchr(fr->mmapptr, fr->elsymb, sz); if (strend == NULL) { ret = bwscsbdup(fr->mmapptr, sz); fr->mmapptr = mmapend; } else { ret = bwscsbdup(fr->mmapptr, strend - fr->mmapptr); fr->mmapptr = strend + 1; } } } else if (fr->file != stdin) { unsigned char *strend; size_t bsz1, remsz, search_start; search_start = 0; remsz = 0; strend = NULL; if (fr->bsz > fr->strbeg) remsz = fr->bsz - fr->strbeg; /* line read cycle */ for (;;) { if (remsz > search_start) strend = memchr(fr->buffer + fr->strbeg + search_start, fr->elsymb, remsz - search_start); else strend = NULL; if (strend) break; if (feof(fr->file)) break; if (fr->bsz != fr->cbsz) /* NOTREACHED */ err(2, "File read software error 1"); if (remsz > (READ_CHUNK >> 1)) { search_start = fr->cbsz - fr->strbeg; fr->cbsz += READ_CHUNK; fr->buffer = sort_realloc(fr->buffer, fr->cbsz); bsz1 = fread(fr->buffer + fr->bsz, 1, READ_CHUNK, fr->file); if (bsz1 == 0) { if (ferror(fr->file)) err(2, NULL); break; } fr->bsz += bsz1; remsz += bsz1; } else { if (remsz > 0 && fr->strbeg>0) bcopy(fr->buffer + fr->strbeg, fr->buffer, remsz); fr->strbeg = 0; search_start = remsz; bsz1 = fread(fr->buffer + remsz, 1, fr->cbsz - remsz, fr->file); if (bsz1 == 0) { if (ferror(fr->file)) err(2, NULL); break; } fr->bsz = remsz + bsz1; remsz = fr->bsz; } } if (strend == NULL) strend = fr->buffer + fr->bsz; if ((fr->buffer + fr->strbeg <= strend) && (fr->strbeg < fr->bsz) && (remsz>0)) ret = bwscsbdup(fr->buffer + fr->strbeg, strend - fr->buffer - fr->strbeg); fr->strbeg = (strend - fr->buffer) + 1; } else { size_t len = 0; ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag, &(fr->rb)); } return (ret); } static void file_reader_clean(struct file_reader *fr) { if (fr) { if (fr->mmapaddr) munmap(fr->mmapaddr, fr->mmapsize); if (fr->fd) close(fr->fd); if (fr->buffer) sort_free(fr->buffer); if (fr->file) if (fr->file != stdin) closefile(fr->file, fr->fname); if(fr->fname) sort_free(fr->fname); memset(fr, 0, sizeof(struct file_reader)); } } void file_reader_free(struct file_reader *fr) { if (fr) { file_reader_clean(fr); sort_free(fr); } } int procfile(const char *fsrc, struct sort_list *list, struct file_list *fl) { struct file_reader *fr; fr = file_reader_init(fsrc); if (fr == NULL) err(2, NULL); /* file browse cycle */ for (;;) { struct bwstring *bws; bws = file_reader_readline(fr); if (bws == NULL) break; sort_list_add(list, bws); if (list->memsize >= available_free_memory) { char *fn; fn = new_tmp_file_name(); sort_list_to_file(list, fn); file_list_add(fl, fn, false); sort_list_clean(list); } } file_reader_free(fr); return (0); } /* * Compare file headers. Files with EOF always go to the end of the list. */ static int file_header_cmp(struct file_header *f1, struct file_header *f2) { if (f1 == f2) return (0); else { if (f1->fr == NULL) { return ((f2->fr == NULL) ? 0 : +1); } else if (f2->fr == NULL) return (-1); else { int ret; ret = list_coll(&(f1->si), &(f2->si)); if (!ret) return ((f1->file_pos < f2->file_pos) ? -1 : +1); return (ret); } } } /* * Allocate and init file header structure */ static void file_header_init(struct file_header **fh, const char *fn, size_t file_pos) { if (fh && fn) { struct bwstring *line; *fh = sort_malloc(sizeof(struct file_header)); (*fh)->file_pos = file_pos; (*fh)->fr = file_reader_init(fn); if ((*fh)->fr == NULL) { perror(fn); err(2, "%s", getstr(8)); } line = file_reader_readline((*fh)->fr); if (line == NULL) { file_reader_free((*fh)->fr); (*fh)->fr = NULL; (*fh)->si = NULL; } else { (*fh)->si = sort_list_item_alloc(); sort_list_item_set((*fh)->si, line); } } } /* * Close file */ static void file_header_close(struct file_header **fh) { if (fh && *fh) { if ((*fh)->fr) { file_reader_free((*fh)->fr); (*fh)->fr = NULL; } if ((*fh)->si) { sort_list_item_clean((*fh)->si); sort_free((*fh)->si); (*fh)->si = NULL; } sort_free(*fh); *fh = NULL; } } /* * Swap two array elements */ static void file_header_swap(struct file_header **fh, size_t i1, size_t i2) { struct file_header *tmp; tmp = fh[i1]; fh[i1] = fh[i2]; fh[i2] = tmp; } /* heap algorithm ==>> */ /* * See heap sort algorithm * "Raises" last element to its right place */ static void file_header_heap_swim(struct file_header **fh, size_t indx) { if (indx > 0) { size_t parent_index; parent_index = (indx - 1) >> 1; if (file_header_cmp(fh[indx], fh[parent_index]) < 0) { /* swap child and parent and continue */ file_header_swap(fh, indx, parent_index); file_header_heap_swim(fh, parent_index); } } } /* * Sink the top element to its correct position */ static void file_header_heap_sink(struct file_header **fh, size_t indx, size_t size) { size_t left_child_index; size_t right_child_index; left_child_index = indx + indx + 1; right_child_index = left_child_index + 1; if (left_child_index < size) { size_t min_child_index; min_child_index = left_child_index; if ((right_child_index < size) && (file_header_cmp(fh[left_child_index], fh[right_child_index]) > 0)) min_child_index = right_child_index; if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) { file_header_swap(fh, indx, min_child_index); file_header_heap_sink(fh, min_child_index, size); } } } /* <<== heap algorithm */ /* * Adds element to the "left" end */ static void file_header_list_rearrange_from_header(struct file_header **fh, size_t size) { file_header_heap_sink(fh, 0, size); } /* * Adds element to the "right" end */ static void file_header_list_push(struct file_header *f, struct file_header **fh, size_t size) { fh[size++] = f; file_header_heap_swim(fh, size - 1); } struct last_printed { struct bwstring *str; }; /* * Prints the current line of the file */ static void file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp) { if (fh && fh->fr && f_out && fh->si && fh->si->str) { if (sort_opts_vals.uflag) { if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) { bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); if (lp->str) bwsfree(lp->str); lp->str = bwsdup(fh->si->str); } } else bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); } } /* * Read next line */ static void file_header_read_next(struct file_header *fh) { if (fh && fh->fr) { struct bwstring *tmp; tmp = file_reader_readline(fh->fr); if (tmp == NULL) { file_reader_free(fh->fr); fh->fr = NULL; if (fh->si) { sort_list_item_clean(fh->si); sort_free(fh->si); fh->si = NULL; } } else { if (fh->si == NULL) fh->si = sort_list_item_alloc(); sort_list_item_set(fh->si, tmp); } } } /* * Merge array of "files headers" */ static void file_headers_merge(size_t fnum, struct file_header **fh, FILE *f_out) { struct last_printed lp; size_t i; memset(&lp, 0, sizeof(lp)); /* - * construct the initial sort structure + * construct the initial sort structure */ for (i = 0; i < fnum; i++) file_header_list_push(fh[i], fh, i); while (fh[0]->fr) { /* unfinished files are always in front */ /* output the smallest line: */ file_header_print(fh[0], f_out, &lp); /* read a new line, if possible: */ file_header_read_next(fh[0]); /* re-arrange the list: */ file_header_list_rearrange_from_header(fh, fnum); } if (lp.str) bwsfree(lp.str); } /* * Merges the given files into the output file, which can be * stdout. */ static void merge_files_array(size_t argc, char **argv, const char *fn_out) { if (argv && fn_out) { struct file_header **fh; FILE *f_out; size_t i; f_out = openfile(fn_out, "w"); if (f_out == NULL) err(2, NULL); fh = sort_malloc((argc + 1) * sizeof(struct file_header *)); for (i = 0; i < argc; i++) file_header_init(fh + i, argv[i], (size_t) i); file_headers_merge(argc, fh, f_out); for (i = 0; i < argc; i++) file_header_close(fh + i); sort_free(fh); closefile(f_out, fn_out); } } /* * Shrinks the file list until its size smaller than max number of opened files */ static int shrink_file_list(struct file_list *fl) { if ((fl == NULL) || (size_t) (fl->count) < max_open_files) return (0); else { struct file_list new_fl; size_t indx = 0; file_list_init(&new_fl, true); while (indx < fl->count) { char *fnew; size_t num; num = fl->count - indx; fnew = new_tmp_file_name(); if ((size_t) num >= max_open_files) num = max_open_files - 1; merge_files_array(num, fl->fns + indx, fnew); if (fl->tmp) { size_t i; for (i = 0; i < num; i++) unlink(fl->fns[indx + i]); } file_list_add(&new_fl, fnew, false); indx += num; } fl->tmp = false; /* already taken care of */ file_list_clean(fl); fl->count = new_fl.count; fl->fns = new_fl.fns; fl->sz = new_fl.sz; fl->tmp = new_fl.tmp; return (1); } } /* * Merge list of files */ void merge_files(struct file_list *fl, const char *fn_out) { if (fl && fn_out) { while (shrink_file_list(fl)); merge_files_array(fl->count, fl->fns, fn_out); } } static const char * get_sort_method_name(int sm) { if (sm == SORT_MERGESORT) return "mergesort"; else if (sort_opts_vals.sort_method == SORT_RADIXSORT) return "radixsort"; else if (sort_opts_vals.sort_method == SORT_HEAPSORT) return "heapsort"; else return "quicksort"; } /* * Wrapper for qsort */ static int sort_qsort(void *list, size_t count, size_t elem_size, int (*cmp_func)(const void *, const void *)) { qsort(list, count, elem_size, cmp_func); return (0); } /* * Sort list of lines and writes it to the file */ void sort_list_to_file(struct sort_list *list, const char *outfile) { struct sort_mods *sm = &(keys[0].sm); if (!(sm->Mflag) && !(sm->Rflag) && !(sm->Vflag) && !(sm->Vflag) && !(sm->gflag) && !(sm->hflag) && !(sm->nflag)) { if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort) sort_opts_vals.sort_method = SORT_RADIXSORT; } else if (sort_opts_vals.sort_method == SORT_RADIXSORT) err(2, "%s", getstr(9)); /* * to handle stable sort and the unique cases in the * right order, we need stable basic algorithm */ if (sort_opts_vals.sflag) { switch (sort_opts_vals.sort_method){ case SORT_MERGESORT: break; case SORT_RADIXSORT: break; case SORT_DEFAULT: sort_opts_vals.sort_method = SORT_MERGESORT; break; default: errx(2, "%s", getstr(10)); }; } if (sort_opts_vals.sort_method == SORT_DEFAULT) sort_opts_vals.sort_method = DEFAULT_SORT_ALGORITHM; if (debug_sort) printf("sort_method=%s\n", get_sort_method_name(sort_opts_vals.sort_method)); switch (sort_opts_vals.sort_method){ case SORT_RADIXSORT: rxsort(list->list, list->count); sort_list_dump(list, outfile); break; case SORT_MERGESORT: mt_sort(list, mergesort, outfile); break; case SORT_HEAPSORT: mt_sort(list, heapsort, outfile); break; case SORT_QSORT: mt_sort(list, sort_qsort, outfile); break; default: mt_sort(list, DEFAULT_SORT_FUNC, outfile); break; } } /******************* MT SORT ************************/ #if defined(SORT_THREADS) /* semaphore to count threads */ static sem_t mtsem; /* current system sort function */ static int (*g_sort_func)(void *, size_t, size_t, int(*)(const void *, const void *)); /* * Sort cycle thread (in multi-threaded mode) */ static void* mt_sort_thread(void* arg) { struct sort_list *list = arg; g_sort_func(list->list, list->count, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); sem_post(&mtsem); return (arg); } /* * Compare sub-lists. Empty sub-lists always go to the end of the list. */ static int sub_list_cmp(struct sort_list *l1, struct sort_list *l2) { if (l1 == l2) return (0); else { if (l1->count == 0) { return ((l2->count == 0) ? 0 : +1); } else if (l2->count == 0) { return (-1); } else { int ret; ret = list_coll(&(l1->list[0]), &(l2->list[0])); if (!ret) return ((l1->sub_list_pos < l2->sub_list_pos) ? -1 : +1); return (ret); } } } /* * Swap two array elements */ static void sub_list_swap(struct sort_list **sl, size_t i1, size_t i2) { struct sort_list *tmp; tmp = sl[i1]; sl[i1] = sl[i2]; sl[i2] = tmp; } /* heap algorithm ==>> */ /* * See heap sort algorithm * "Raises" last element to its right place */ static void sub_list_swim(struct sort_list **sl, size_t indx) { if (indx > 0) { size_t parent_index; parent_index = (indx - 1) >> 1; if (sub_list_cmp(sl[indx], sl[parent_index]) < 0) { /* swap child and parent and continue */ sub_list_swap(sl, indx, parent_index); sub_list_swim(sl, parent_index); } } } /* * Sink the top element to its correct position */ static void sub_list_sink(struct sort_list **sl, size_t indx, size_t size) { size_t left_child_index; size_t right_child_index; left_child_index = indx + indx + 1; right_child_index = left_child_index + 1; if (left_child_index < size) { size_t min_child_index; min_child_index = left_child_index; if ((right_child_index < size) && (sub_list_cmp(sl[left_child_index], sl[right_child_index]) > 0)) min_child_index = right_child_index; if (sub_list_cmp(sl[indx], sl[min_child_index]) > 0) { sub_list_swap(sl, indx, min_child_index); sub_list_sink(sl, min_child_index, size); } } } /* <<== heap algorithm */ /* * Adds element to the "right" end */ static void sub_list_push(struct sort_list *s, struct sort_list **sl, size_t size) { sl[size++] = s; sub_list_swim(sl, size - 1); } struct last_printed_item { struct sort_list_item *item; }; /* * Prints the current line of the file */ static void sub_list_header_print(struct sort_list *sl, FILE *f_out, struct last_printed_item *lp) { if (sl && sl->count && f_out && sl->list[0]->str) { if (sort_opts_vals.uflag) { if ((lp->item == NULL) || (list_coll(&(lp->item), &(sl->list[0])))) { bwsfwrite(sl->list[0]->str, f_out, sort_opts_vals.zflag); lp->item = sl->list[0]; } } else bwsfwrite(sl->list[0]->str, f_out, sort_opts_vals.zflag); } } /* * Read next line */ static void sub_list_next(struct sort_list *sl) { if (sl && sl->count) { sl->list += 1; sl->count -= 1; } } /* * Merge sub-lists to a file */ static void merge_sub_lists(struct sort_list **sl, size_t n, FILE* f_out) { struct last_printed_item lp; size_t i; memset(&lp,0,sizeof(lp)); /* construct the initial list: */ for (i = 0; i < n; i++) sub_list_push(sl[i], sl, i); while (sl[0]->count) { /* unfinished lists are always in front */ /* output the smallest line: */ sub_list_header_print(sl[0], f_out, &lp); /* move to a new line, if possible: */ sub_list_next(sl[0]); /* re-arrange the list: */ sub_list_sink(sl, 0, n); } } /* * Merge sub-lists to a file */ static void merge_list_parts(struct sort_list **parts, size_t n, const char *fn) { FILE* f_out; f_out = openfile(fn,"w"); merge_sub_lists(parts, n, f_out); closefile(f_out, fn); } #endif /* defined(SORT_THREADS) */ /* * Multi-threaded sort algorithm "driver" */ static void mt_sort(struct sort_list *list, int(*sort_func)(void *, size_t, size_t, int(*)(const void *, const void *)), const char* fn) { #if defined(SORT_THREADS) if (nthreads < 2 || list->count < MT_SORT_THRESHOLD) { size_t nthreads_save = nthreads; nthreads = 1; #endif /* if single thread or small data, do simple sort */ sort_func(list->list, list->count, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); sort_list_dump(list, fn); #if defined(SORT_THREADS) nthreads = nthreads_save; } else { /* multi-threaded sort */ struct sort_list **parts; size_t avgsize, cstart, i; /* array of sub-lists */ parts = sort_malloc(sizeof(struct sort_list*) * nthreads); cstart = 0; avgsize = list->count / nthreads; /* set global system sort function */ g_sort_func = sort_func; /* set sublists */ for (i = 0; i < nthreads; ++i) { size_t sz = 0; parts[i] = sort_malloc(sizeof(struct sort_list)); parts[i]->list = list->list + cstart; parts[i]->memsize = 0; parts[i]->sub_list_pos = i; sz = (i == nthreads - 1) ? list->count - cstart : avgsize; parts[i]->count = sz; parts[i]->size = parts[i]->count; cstart += sz; } /* init threads counting semaphore */ sem_init(&mtsem, 0, 0); /* start threads */ for (i = 0; i < nthreads; ++i) { pthread_t pth; pthread_attr_t attr; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_DETACHED); for (;;) { int res = pthread_create(&pth, &attr, mt_sort_thread, parts[i]); if (res >= 0) break; if (errno == EAGAIN) { pthread_yield(); continue; } err(2, NULL); } pthread_attr_destroy(&attr); } /* wait for threads completion */ for (i = 0; i < nthreads; ++i) { sem_wait(&mtsem); } /* destroy the semaphore - we do not need it anymore */ sem_destroy(&mtsem); /* merge sorted sub-lists to the file */ merge_list_parts(parts, nthreads, fn); /* free sub-lists data */ for (i = 0; i < nthreads; ++i) { sort_free(parts[i]); } sort_free(parts); } #endif /* defined(SORT_THREADS) */ } Index: head/usr.bin/sort/file.h =================================================================== --- head/usr.bin/sort/file.h (revision 281122) +++ head/usr.bin/sort/file.h (revision 281123) @@ -1,138 +1,138 @@ /* $FreeBSD$ */ /*- * Copyright (C) 2009 Gabor Kovesdan * Copyright (C) 2012 Oleg Moskalenko * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if !defined(__SORT_FILE_H__) #define __SORT_FILE_H__ #include "coll.h" #include "sort.h" #define SORT_DEFAULT 0 #define SORT_QSORT 1 #define SORT_MERGESORT 2 #define SORT_HEAPSORT 3 #define SORT_RADIXSORT 4 #define DEFAULT_SORT_ALGORITHM SORT_HEAPSORT #define DEFAULT_SORT_FUNC heapsort /* * List of data to be sorted. */ struct sort_list { struct sort_list_item **list; unsigned long long memsize; size_t count; size_t size; size_t sub_list_pos; }; /* - * File reader object + * File reader object */ struct file_reader; /* * List of files to be sorted */ struct file_list { char **fns; size_t count; size_t sz; bool tmp; }; /* * Structure for zero-separated file reading (for input files list) */ struct file0_reader { char *current_line; FILE *f; size_t current_sz; }; /* memory */ /**/ extern unsigned long long free_memory; extern unsigned long long available_free_memory; /* Are we using mmap ? */ extern bool use_mmap; /* temporary file dir */ extern const char *tmpdir; /* * Max number of simultaneously open files (including the output file). */ extern size_t max_open_files; /* * Compress program */ extern const char* compress_program; /* funcs */ struct file_reader *file_reader_init(const char *fsrc); struct bwstring *file_reader_readline(struct file_reader *fr); void file_reader_free(struct file_reader *fr); char *read_file0_line(struct file0_reader *f0r); void init_tmp_files(void); void clear_tmp_files(void); char *new_tmp_file_name(void); void tmp_file_atexit(const char *tmp_file); void file_list_init(struct file_list *fl, bool tmp); void file_list_add(struct file_list *fl, char *fn, bool allocate); void file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate); void file_list_clean(struct file_list *fl); int check(const char *); void merge_files(struct file_list *fl, const char *fn_out); FILE *openfile(const char *, const char *); void closefile(FILE *, const char *); int procfile(const char *fn, struct sort_list *list, struct file_list *fl); void sort_list_init(struct sort_list *l); void sort_list_add(struct sort_list *l, struct bwstring *str); void sort_list_clean(struct sort_list *l); void sort_list_dump(struct sort_list *l, const char *fn); void sort_list_to_file(struct sort_list *list, const char *outfile); #endif /* __SORT_FILE_H__ */ Index: head/usr.bin/sort/radixsort.c =================================================================== --- head/usr.bin/sort/radixsort.c (revision 281122) +++ head/usr.bin/sort/radixsort.c (revision 281123) @@ -1,691 +1,690 @@ /*- * Copyright (C) 2012 Oleg Moskalenko * Copyright (C) 2012 Gabor Kovesdan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #if defined(SORT_THREADS) #include #include #endif #include #include #include #include #include #include "coll.h" #include "radixsort.h" #define DEFAULT_SORT_FUNC_RADIXSORT mergesort #define TINY_NODE(sl) ((sl)->tosort_num < 65) #define SMALL_NODE(sl) ((sl)->tosort_num < 5) /* are we sorting in reverse order ? */ static bool reverse_sort; /* sort sub-levels array size */ static const size_t slsz = 256 * sizeof(struct sort_level*); /* one sort level structure */ struct sort_level { struct sort_level **sublevels; struct sort_list_item **leaves; struct sort_list_item **sorted; struct sort_list_item **tosort; size_t leaves_num; size_t leaves_sz; size_t level; size_t real_sln; size_t start_position; size_t sln; size_t tosort_num; size_t tosort_sz; }; /* stack of sort levels ready to be sorted */ struct level_stack { struct level_stack *next; struct sort_level *sl; }; static struct level_stack *g_ls; #if defined(SORT_THREADS) /* stack guarding mutex */ static pthread_mutex_t g_ls_mutex; /* counter: how many items are left */ static size_t sort_left; /* guarding mutex */ static pthread_mutex_t sort_left_mutex; /* semaphore to count threads */ static sem_t mtsem; /* * Decrement items counter */ static inline void sort_left_dec(size_t n) { pthread_mutex_lock(&sort_left_mutex); sort_left -= n; pthread_mutex_unlock(&sort_left_mutex); } /* * Do we have something to sort ? */ static inline bool have_sort_left(void) { bool ret; pthread_mutex_lock(&sort_left_mutex); ret = (sort_left > 0); pthread_mutex_unlock(&sort_left_mutex); return (ret); } #else #define sort_left_dec(n) #endif /* SORT_THREADS */ /* * Push sort level to the stack */ static inline void -push_ls(struct sort_level* sl) +push_ls(struct sort_level *sl) { struct level_stack *new_ls; new_ls = sort_malloc(sizeof(struct level_stack)); new_ls->sl = sl; #if defined(SORT_THREADS) if (nthreads > 1) pthread_mutex_lock(&g_ls_mutex); #endif new_ls->next = g_ls; g_ls = new_ls; #if defined(SORT_THREADS) if (nthreads > 1) pthread_mutex_unlock(&g_ls_mutex); #endif } /* * Pop sort level from the stack (single-threaded style) */ static inline struct sort_level* pop_ls_st(void) { struct sort_level *sl; if (g_ls) { struct level_stack *saved_ls; sl = g_ls->sl; saved_ls = g_ls; g_ls = g_ls->next; sort_free(saved_ls); } else sl = NULL; return (sl); } #if defined(SORT_THREADS) /* * Pop sort level from the stack (multi-threaded style) */ static inline struct sort_level* pop_ls_mt(void) { struct level_stack *saved_ls; struct sort_level *sl; pthread_mutex_lock(&g_ls_mutex); if (g_ls) { sl = g_ls->sl; saved_ls = g_ls; g_ls = g_ls->next; } else { sl = NULL; saved_ls = NULL; } pthread_mutex_unlock(&g_ls_mutex); sort_free(saved_ls); return (sl); } #endif /* defined(SORT_THREADS) */ static void add_to_sublevel(struct sort_level *sl, struct sort_list_item *item, size_t indx) { struct sort_level *ssl; ssl = sl->sublevels[indx]; if (ssl == NULL) { ssl = sort_malloc(sizeof(struct sort_level)); memset(ssl, 0, sizeof(struct sort_level)); ssl->level = sl->level + 1; sl->sublevels[indx] = ssl; ++(sl->real_sln); } if (++(ssl->tosort_num) > ssl->tosort_sz) { ssl->tosort_sz = ssl->tosort_num + 128; ssl->tosort = sort_realloc(ssl->tosort, sizeof(struct sort_list_item*) * (ssl->tosort_sz)); } ssl->tosort[ssl->tosort_num - 1] = item; } static inline void add_leaf(struct sort_level *sl, struct sort_list_item *item) { - if (++(sl->leaves_num) > sl->leaves_sz) { sl->leaves_sz = sl->leaves_num + 128; sl->leaves = sort_realloc(sl->leaves, (sizeof(struct sort_list_item*) * (sl->leaves_sz))); } sl->leaves[sl->leaves_num - 1] = item; } static inline int get_wc_index(struct sort_list_item *sli, size_t level) { const struct bwstring *bws; bws = sli->ka.key[0].k; if ((BWSLEN(bws) > level)) return (unsigned char) BWS_GET(bws,level); return (-1); } static void place_item(struct sort_level *sl, size_t item) { struct sort_list_item *sli; int c; sli = sl->tosort[item]; c = get_wc_index(sli, sl->level); if (c == -1) add_leaf(sl, sli); else add_to_sublevel(sl, sli, c); } static void free_sort_level(struct sort_level *sl) { if (sl) { if (sl->leaves) sort_free(sl->leaves); if (sl->level > 0) sort_free(sl->tosort); if (sl->sublevels) { struct sort_level *slc; size_t sln; sln = sl->sln; for (size_t i = 0; i < sln; ++i) { slc = sl->sublevels[i]; if (slc) free_sort_level(slc); } sort_free(sl->sublevels); } sort_free(sl); } } static void run_sort_level_next(struct sort_level *sl) { struct sort_level *slc; size_t i, sln, tosort_num; if (sl->sublevels) { sort_free(sl->sublevels); sl->sublevels = NULL; } switch (sl->tosort_num){ case 0: goto end; case (1): sl->sorted[sl->start_position] = sl->tosort[0]; sort_left_dec(1); goto end; case (2): if (list_coll_offset(&(sl->tosort[0]), &(sl->tosort[1]), sl->level) > 0) { sl->sorted[sl->start_position++] = sl->tosort[1]; sl->sorted[sl->start_position] = sl->tosort[0]; } else { sl->sorted[sl->start_position++] = sl->tosort[0]; sl->sorted[sl->start_position] = sl->tosort[1]; } sort_left_dec(2); goto end; default: if (TINY_NODE(sl) || (sl->level > 15)) { listcoll_t func; func = get_list_call_func(sl->level); sl->leaves = sl->tosort; sl->leaves_num = sl->tosort_num; sl->leaves_sz = sl->leaves_num; sl->leaves = sort_realloc(sl->leaves, (sizeof(struct sort_list_item *) * (sl->leaves_sz))); sl->tosort = NULL; sl->tosort_num = 0; sl->tosort_sz = 0; sl->sln = 0; sl->real_sln = 0; if (sort_opts_vals.sflag) { if (mergesort(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) func) == -1) /* NOTREACHED */ err(2, "Radix sort error 3"); } else DEFAULT_SORT_FUNC_RADIXSORT(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) func); memcpy(sl->sorted + sl->start_position, sl->leaves, sl->leaves_num * sizeof(struct sort_list_item*)); sort_left_dec(sl->leaves_num); goto end; } else { sl->tosort_sz = sl->tosort_num; sl->tosort = sort_realloc(sl->tosort, sizeof(struct sort_list_item*) * (sl->tosort_sz)); } } sl->sln = 256; sl->sublevels = sort_malloc(slsz); memset(sl->sublevels, 0, slsz); sl->real_sln = 0; tosort_num = sl->tosort_num; for (i = 0; i < tosort_num; ++i) place_item(sl, i); sort_free(sl->tosort); sl->tosort = NULL; sl->tosort_num = 0; sl->tosort_sz = 0; if (sl->leaves_num > 1) { if (keys_num > 1) { if (sort_opts_vals.sflag) { mergesort(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); } else { DEFAULT_SORT_FUNC_RADIXSORT(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); } } else if (!sort_opts_vals.sflag && sort_opts_vals.complex_sort) { DEFAULT_SORT_FUNC_RADIXSORT(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll_by_str_only); } } sl->leaves_sz = sl->leaves_num; sl->leaves = sort_realloc(sl->leaves, (sizeof(struct sort_list_item *) * (sl->leaves_sz))); if (!reverse_sort) { memcpy(sl->sorted + sl->start_position, sl->leaves, sl->leaves_num * sizeof(struct sort_list_item*)); sl->start_position += sl->leaves_num; sort_left_dec(sl->leaves_num); sort_free(sl->leaves); sl->leaves = NULL; sl->leaves_num = 0; sl->leaves_sz = 0; sln = sl->sln; for (i = 0; i < sln; ++i) { slc = sl->sublevels[i]; if (slc) { slc->sorted = sl->sorted; slc->start_position = sl->start_position; sl->start_position += slc->tosort_num; if (SMALL_NODE(slc)) run_sort_level_next(slc); else push_ls(slc); sl->sublevels[i] = NULL; } } } else { size_t n; sln = sl->sln; for (i = 0; i < sln; ++i) { n = sln - i - 1; slc = sl->sublevels[n]; if (slc) { slc->sorted = sl->sorted; slc->start_position = sl->start_position; sl->start_position += slc->tosort_num; if (SMALL_NODE(slc)) run_sort_level_next(slc); else push_ls(slc); sl->sublevels[n] = NULL; } } memcpy(sl->sorted + sl->start_position, sl->leaves, sl->leaves_num * sizeof(struct sort_list_item*)); sort_left_dec(sl->leaves_num); } end: free_sort_level(sl); } /* * Single-threaded sort cycle */ static void run_sort_cycle_st(void) { struct sort_level *slc; for (;;) { slc = pop_ls_st(); if (slc == NULL) { break; } run_sort_level_next(slc); } } #if defined(SORT_THREADS) /* * Multi-threaded sort cycle */ static void run_sort_cycle_mt(void) { struct sort_level *slc; for (;;) { slc = pop_ls_mt(); if (slc == NULL) { if (have_sort_left()) { pthread_yield(); continue; } break; } run_sort_level_next(slc); } } /* * Sort cycle thread (in multi-threaded mode) */ static void* sort_thread(void* arg) { run_sort_cycle_mt(); sem_post(&mtsem); return (arg); } #endif /* defined(SORT_THREADS) */ static void run_top_sort_level(struct sort_level *sl) { struct sort_level *slc; reverse_sort = sort_opts_vals.kflag ? keys[0].sm.rflag : default_sort_mods->rflag; sl->start_position = 0; sl->sln = 256; sl->sublevels = sort_malloc(slsz); memset(sl->sublevels, 0, slsz); for (size_t i = 0; i < sl->tosort_num; ++i) place_item(sl, i); if (sl->leaves_num > 1) { if (keys_num > 1) { if (sort_opts_vals.sflag) { mergesort(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); } else { DEFAULT_SORT_FUNC_RADIXSORT(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll); } } else if (!sort_opts_vals.sflag && sort_opts_vals.complex_sort) { DEFAULT_SORT_FUNC_RADIXSORT(sl->leaves, sl->leaves_num, sizeof(struct sort_list_item *), (int(*)(const void *, const void *)) list_coll_by_str_only); } } if (!reverse_sort) { memcpy(sl->tosort + sl->start_position, sl->leaves, sl->leaves_num * sizeof(struct sort_list_item*)); sl->start_position += sl->leaves_num; sort_left_dec(sl->leaves_num); for (size_t i = 0; i < sl->sln; ++i) { slc = sl->sublevels[i]; if (slc) { slc->sorted = sl->tosort; slc->start_position = sl->start_position; sl->start_position += slc->tosort_num; push_ls(slc); sl->sublevels[i] = NULL; } } } else { size_t n; for (size_t i = 0; i < sl->sln; ++i) { n = sl->sln - i - 1; slc = sl->sublevels[n]; if (slc) { slc->sorted = sl->tosort; slc->start_position = sl->start_position; sl->start_position += slc->tosort_num; push_ls(slc); sl->sublevels[n] = NULL; } } memcpy(sl->tosort + sl->start_position, sl->leaves, sl->leaves_num * sizeof(struct sort_list_item*)); sort_left_dec(sl->leaves_num); } #if defined(SORT_THREADS) if (nthreads < 2) { #endif run_sort_cycle_st(); #if defined(SORT_THREADS) } else { size_t i; for(i = 0; i < nthreads; ++i) { pthread_attr_t attr; pthread_t pth; pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_DETACHED); for (;;) { int res = pthread_create(&pth, &attr, sort_thread, NULL); if (res >= 0) break; if (errno == EAGAIN) { pthread_yield(); continue; } err(2, NULL); } pthread_attr_destroy(&attr); } for(i = 0; i < nthreads; ++i) sem_wait(&mtsem); } #endif /* defined(SORT_THREADS) */ } static void run_sort(struct sort_list_item **base, size_t nmemb) { struct sort_level *sl; #if defined(SORT_THREADS) size_t nthreads_save = nthreads; if (nmemb < MT_SORT_THRESHOLD) nthreads = 1; if (nthreads > 1) { pthread_mutexattr_t mattr; pthread_mutexattr_init(&mattr); pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_ADAPTIVE_NP); pthread_mutex_init(&g_ls_mutex, &mattr); pthread_mutex_init(&sort_left_mutex, &mattr); pthread_mutexattr_destroy(&mattr); sem_init(&mtsem, 0, 0); } #endif sl = sort_malloc(sizeof(struct sort_level)); memset(sl, 0, sizeof(struct sort_level)); sl->tosort = base; sl->tosort_num = nmemb; sl->tosort_sz = nmemb; #if defined(SORT_THREADS) sort_left = nmemb; #endif run_top_sort_level(sl); free_sort_level(sl); #if defined(SORT_THREADS) if (nthreads > 1) { sem_destroy(&mtsem); pthread_mutex_destroy(&g_ls_mutex); pthread_mutex_destroy(&sort_left_mutex); } nthreads = nthreads_save; #endif } void rxsort(struct sort_list_item **base, size_t nmemb) { run_sort(base, nmemb); } Index: head/usr.bin/sort/sort.1.in =================================================================== --- head/usr.bin/sort/sort.1.in (revision 281122) +++ head/usr.bin/sort/sort.1.in (revision 281123) @@ -1,638 +1,638 @@ -.\" $OpenBSD: sort.1,v 1.31 2007/08/21 21:22:37 millert Exp $ +.\" $OpenBSD: sort.1,v 1.45 2015/03/19 13:51:10 jmc Exp $ .\" $FreeBSD$ .\" .\" Copyright (c) 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" the Institute of Electrical and Electronics Engineers, Inc. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" @(#)sort.1 8.1 (Berkeley) 6/6/93 .\" -.Dd July 3, 2012 +.Dd March 19 2015 .Dt SORT 1 .Os .Sh NAME .Nm sort .Nd sort or merge records (lines) of text and binary files .Sh SYNOPSIS .Nm sort .Bk -words .Op Fl bcCdfghiRMmnrsuVz .Sm off .Op Fl k\ \& Ar field1 Op , Ar field2 .Sm on .Op Fl S Ar memsize .Ek .Op Fl T Ar dir .Op Fl t Ar char .Op Fl o Ar output .Op Ar file ... .Nm sort .Fl Fl help .Nm sort .Fl Fl version .Sh DESCRIPTION The .Nm utility sorts text and binary files by lines. A line is a record separated from the subsequent record by a newline (default) or NUL \'\\0\' character (-z option). A record can contain any printable or unprintable characters. Comparisons are based on one or more sort keys extracted from each line of input, and are performed lexicographically, according to the current locale's collating rules and the specified command-line options that can tune the actual sorting behavior. By default, if keys are not given, .Nm uses entire lines for comparison. .Pp The command line options are as follows: .Bl -tag -width Ds .It Fl c, Fl Fl check, Fl C, Fl Fl check=silent|quiet Check that the single input file is sorted. If the file is not sorted, .Nm produces the appropriate error messages and exits with code 1, otherwise returns 0. If .Fl C or .Fl Fl check=silent is specified, .Nm produces no output. This is a "silent" version of .Fl c. .It Fl m , Fl Fl merge Merge only. The input files are assumed to be pre-sorted. If they are not sorted the output order is undefined. .It Fl o Ar output , Fl Fl output Ns = Ns Ar output Print the output to the .Ar output file instead of the standard output. .It Fl S Ar size, Fl Fl buffer-size Ns = Ns Ar size Use .Ar size for the maximum size of the memory buffer. Size modifiers %,b,K,M,G,T,P,E,Z,Y can be used. If a memory limit is not explicitly specified, .Nm takes up to about 90% of available memory. If the file size is too big to fit into the memory buffer, the temporary disk files are used to perform the sorting. .It Fl T Ar dir , Fl Fl temporary-directory Ns = Ns Ar dir Store temporary files in the directory .Ar dir . The default path is the value of the environment variable .Ev TMPDIR or .Pa /var/tmp if .Ev TMPDIR is not defined. .It Fl u , Fl Fl unique Unique keys. Suppress all lines that have a key that is equal to an already processed one. This option, similarly to .Fl s , implies a stable sort. If used with .Fl c or .Fl C , .Nm also checks that there are no lines with duplicate keys. .It Fl s Stable sort. This option maintains the original record order of records that have and equal key. This is a non-standard feature, but it is widely accepted and used. .It Fl Fl version Print the version and silently exits. .It Fl Fl help Print the help text and silently exits. .El .Pp The following options override the default ordering rules. When ordering options appear independently of key field specifications, they apply globally to all sort keys. When attached to a specific key (see .Fl k ) , the ordering options override all global ordering options for the key they are attached to. .Bl -tag -width indent .It Fl b, Fl Fl ignore-leading-blanks Ignore leading blank characters when comparing lines. .It Fl d , Fl Fl dictionary-order Consider only blank spaces and alphanumeric characters in comparisons. .It Fl f , Fl Fl ignore-case Convert all lowercase characters to their uppercase equivalent before comparison, that is, perform case-independent sorting. .It Fl g, Fl Fl general-numeric-sort, Fl Fl sort=general-numeric Sort by general numerical value. As opposed to .Fl n , -this option handles general floating points, which have a much -permissive format than those allowed by -. Fl n , +this option handles general floating points. +It has a more +permissive format than that allowed by +.Fl n but it has a significant performance drawback. .It Fl h, Fl Fl human-numeric-sort, Fl Fl sort=human-numeric Sort by numerical value, but take into account the SI suffix, if present. Sort first by numeric sign (negative, zero, or positive); then by SI suffix (either empty, or `k' or `K', or one of `MGTPEZY', in that order); and finally by numeric value. The SI suffix must immediately follow the number. For example, '12345K' sorts before '1M', because M is "larger" than K. This sort option is useful for sorting the output of a single invocation of 'df' command with .Fl h or .Fl H options (human-readable). .It Fl i , Fl Fl ignore-nonprinting Ignore all non-printable characters. .It Fl M, Fl Fl month-sort, Fl Fl sort=month Sort by month abbreviations. Unknown strings are considered smaller than the month names. .It Fl n , Fl Fl numeric-sort, Fl Fl sort=numeric Sort fields numerically by arithmetic value. Fields are supposed to have optional blanks in the beginning, an optional minus sign, zero or more digits (including decimal point and possible thousand separators). .It Fl R, Fl Fl random-sort, Fl Fl sort=random Sort by a random order. This is a random permutation of the inputs except that the equal keys sort together. It is implemented by hashing the input keys and sorting the hash values. The hash function is chosen randomly. The hash function is randomized by .Cm /dev/random content, or by file content if it is specified by .Fl Fl random-source . Even if multiple sort fields are specified, the same random hash function is used for all of them. .It Fl r , Fl Fl reverse Sort in reverse order. .It Fl V, Fl Fl version-sort Sort version numbers. The input lines are treated as file names in form PREFIX VERSION SUFFIX, where SUFFIX matches the regular expression "(\.([A-Za-z~][A-Za-z0-9~]*)?)*". The files are compared by their prefixes and versions (leading zeros are ignored in version numbers, see example below). If an input string does not match the pattern, then it is compared using the byte compare function. All string comparisons are performed in C locale, the locale environment setting is ignored. .Bl -tag -width indent .It Example: .It $ ls sort* | sort -V .It sort-1.022.tgz .It sort-1.23.tgz .It sort-1.23.1.tgz .It sort-1.024.tgz .It sort-1.024.003. .It sort-1.024.003.tgz .It sort-1.024.07.tgz .It sort-1.024.009.tgz .El .El .Pp The treatment of field separators can be altered using these options: .Bl -tag -width indent .It Fl b , Fl Fl ignore-leading-blanks Ignore leading blank space when determining the start and end of a restricted sort key (see .Fl k ). If .Fl b is specified before the first .Fl k option, it applies globally to all key specifications. Otherwise, .Fl b can be attached independently to each .Ar field argument of the key specifications. .Fl b . .It Xo -.Sm off -.Fl k\ \& Ar field1 Op , Ar field2 , Fl Fl key Ns = Ns Ar field1 Op , Ar field2 -.Sm on +.Fl k Ar field1 Ns Op , Ns Ar field2 , +.Fl Fl key Ns = Ns Ar field1 Ns Op , Ns Ar field2 .Xc Define a restricted sort key that has the starting position .Ar field1 , and optional ending position .Ar field2 of a key field. The .Fl k option may be specified multiple times, in which case subsequent keys are compared when earlier keys compare equal. The .Fl k option replaces the obsolete options .Cm \(pl Ns Ar pos1 and .Fl Ns Ar pos2 , but the old notation is also supported. .It Fl t Ar char , Fl Fl field-separator Ns = Ns Ar char Use .Ar char as a field separator character. The initial .Ar char is not considered to be part of a field when determining key offsets. Each occurrence of .Ar char is significant (for example, .Dq Ar charchar delimits an empty field). If .Fl t is not specified, the default field separator is a sequence of blank space characters, and consecutive blank spaces do .Em not delimit an empty field, however, the initial blank space .Em is considered part of a field when determining key offsets. To use NUL as field separator, use .Fl t \'\\0\'. .It Fl z , Fl Fl zero-terminated Use NUL as record separator. By default, records in the files are supposed to be separated by the newline characters. With this option, NUL (\'\\0\') is used as a record separator character. .El .Pp Other options: .Bl -tag -width indent .It Fl Fl batch-size Ns = Ns Ar num Specify maximum number of files that can be opened by .Nm at once. This option affects behavior when having many input files or using temporary files. The default value is 16. .It Fl Fl compress-program Ns = Ns Ar PROGRAM Use PROGRAM to compress temporary files. PROGRAM must compress standard input to standard output, when called without arguments. When called with argument .Fl d it must decompress standard input to standard output. If PROGRAM fails, .Nm must exit with error. An example of PROGRAM that can be used here is bzip2. .It Fl Fl random-source Ns = Ns Ar filename In random sort, the file content is used as the source of the 'seed' data for the hash function choice. Two invocations of random sort with the same seed data will use the same hash function and will produce the same result if the input is also identical. By default, file .Cm /dev/random is used. .It Fl Fl debug Print some extra information about the sorting process to the standard output. %%THREADS%%.It Fl Fl parallel %%THREADS%%Set the maximum number of execution threads. %%THREADS%%Default number equals to the number of CPUs. .It Fl Fl files0-from Ns = Ns Ar filename Take the input file list from the file -.Ar filename. +.Ar filename . The file names must be separated by NUL (like the output produced by the command "find ... -print0"). .It Fl Fl radixsort Try to use radix sort, if the sort specifications allow. The radix sort can only be used for trivial locales (C and POSIX), and it cannot be used for numeric or month sort. Radix sort is very fast and stable. .It Fl Fl mergesort Use mergesort. This is a universal algorithm that can always be used, but it is not always the fastest. .It Fl Fl qsort Try to use quick sort, if the sort specifications allow. This sort algorithm cannot be used with .Fl u and .Fl s . .It Fl Fl heapsort Try to use heap sort, if the sort specifications allow. This sort algorithm cannot be used with .Fl u and .Fl s . .It Fl Fl mmap Try to use file memory mapping system call. It may increase speed in some cases. .El .Pp The following operands are available: .Bl -tag -width indent .It Ar file The pathname of a file to be sorted, merged, or checked. If no .Ar file operands are specified, or if a .Ar file operand is .Fl , the standard input is used. .El .Pp A field is defined as a maximal sequence of characters other than the field separator and record separator (newline by default). Initial blank spaces are included in the field unless .Fl b has been specified; the first blank space of a sequence of blank spaces acts as the field separator and is included in the field (unless .Fl t is specified). For example, all blank spaces at the beginning of a line are considered to be part of the first field. .Pp Fields are specified by the .Sm off .Fl k\ \& Ar field1 Op , Ar field2 .Sm on command-line option. If .Ar field2 is missing, the end of the key defaults to the end of the line. .Pp The arguments .Ar field1 and .Ar field2 have the form .Em m.n .Em (m,n > 0) and can be followed by one or more of the modifiers .Cm b , d , f , i , .Cm n , g , M and .Cm r , which correspond to the options discussed above. When .Cm b is specified it applies only to .Ar field1 or .Ar field2 where it is specified while the rest of the modifiers apply to the whole key field regardless if they are specified only with .Ar field1 or .Ar field2 or both. A .Ar field1 position specified by .Em m.n is interpreted as the .Em n Ns th character from the beginning of the .Em m Ns th field. A missing .Em \&.n in .Ar field1 means .Ql \&.1 , indicating the first character of the .Em m Ns th field; if the .Fl b option is in effect, .Em n is counted from the first non-blank character in the .Em m Ns th field; .Em m Ns \&.1b refers to the first non-blank character in the .Em m Ns th field. .No 1\&. Ns Em n refers to the .Em n Ns th character from the beginning of the line; if .Em n is greater than the length of the line, the field is taken to be empty. .Pp .Em n Ns th positions are always counted from the field beginning, even if the field is shorter than the number of specified positions. Thus, the key can really start from a position in a subsequent field. .Pp A .Ar field2 position specified by .Em m.n is interpreted as the .Em n Ns th character (including separators) from the beginning of the .Em m Ns th field. A missing .Em \&.n indicates the last character of the .Em m Ns th field; .Em m = \&0 designates the end of a line. Thus the option .Fl k Ar v.x,w.y is synonymous with the obsolete option .Cm \(pl Ns Ar v-\&1.x-\&1 .Fl Ns Ar w-\&1.y ; when .Em y is omitted, .Fl k Ar v.x,w is synonymous with .Cm \(pl Ns Ar v-\&1.x-\&1 .Fl Ns Ar w\&.0 . The obsolete .Cm \(pl Ns Ar pos1 .Fl Ns Ar pos2 option is still supported, except for .Fl Ns Ar w\&.0b , which has no .Fl k equivalent. .Sh ENVIRONMENT .Bl -tag -width Fl .It Ev LC_COLLATE Locale settings to be used to determine the collation for sorting records. .It Ev LC_CTYPE Locale settings to be used to case conversion and classification of characters, that is, which characters are considered whitespaces, etc. .It Ev LC_MESSAGES Locale settings that determine the language of output messages that .Nm prints out. .It Ev LC_NUMERIC Locale settings that determine the number format used in numeric sort. .It Ev LC_TIME Locale settings that determine the month format used in month sort. .It Ev LC_ALL Locale settings that override all of the above locale settings. This environment variable can be used to set all these settings to the same value at once. .It Ev LANG Used as a last resort to determine different kinds of locale-specific behavior if neither the respective environment variable, nor .Ev LC_ALL are set. %%NLS%%.It Ev NLSPATH %%NLS%%Path to NLS catalogs. .It Ev TMPDIR Path to the directory in which temporary files will be stored. Note that .Ev TMPDIR may be overridden by the .Fl T option. .It Ev GNUSORT_NUMERIC_COMPATIBILITY If defined .Fl t will not override the locale numeric symbols, that is, thousand separators and decimal separators. By default, if we specify .Fl t with the same symbol as the thousand separator or decimal point, the symbol will be treated as the field separator. Older behavior was less definite; the symbol was treated as both field separator and numeric separator, simultaneously. This environment variable enables the old behavior. .El .Sh FILES .Bl -tag -width Pa -compact .It Pa /var/tmp/.bsdsort.PID.* Temporary files. .It Pa /dev/random Default seed file for the random sort. .El .Sh EXIT STATUS The .Nm utility shall exit with one of the following values: .Pp .Bl -tag -width flag -compact .It 0 Successfully sorted the input files or if used with .Fl c or .Fl C , the input file already met the sorting criteria. .It 1 On disorder (or non-uniqueness) with the .Fl c or .Fl C options. .It 2 An error occurred. .El .Sh SEE ALSO .Xr comm 1 , .Xr join 1 , .Xr uniq 1 .Sh STANDARDS The .Nm utility is compliant with the .St -p1003.1-2008 specification. .Pp The flags .Op Fl ghRMSsTVz are extensions to the POSIX specification. .Pp All long options are extensions to the specification, some of them are provided for compatibility with GNU versions and some of them are own extensions. .Pp The old key notations .Cm \(pl Ns Ar pos1 and .Fl Ns Ar pos2 come from older versions of .Nm and are still supported but their use is highly discouraged. .Sh HISTORY A .Nm command first appeared in .At v3 . .Sh AUTHORS .An Gabor Kovesdan Aq Mt gabor@FreeBSD.org , .Pp .An Oleg Moskalenko Aq Mt mom040267@gmail.com .Sh NOTES This implementation of .Nm has no limits on input line length (other than imposed by available memory) or any restrictions on bytes allowed within lines. .Pp The performance depends highly on locale settings, efficient choice of sort keys and key complexity. The fastest sort is with locale C, on whole lines, with option .Fl s. In general, locale C is the fastest, then single-byte locales follow and multi-byte locales as the slowest but the correct collation order is always respected. As for the key specification, the simpler to process the lines the faster the search will be. .Pp When sorting by arithmetic value, using .Fl n results in much better performance than .Fl g so its use is encouraged whenever possible. Index: head/usr.bin/sort/vsort.c =================================================================== --- head/usr.bin/sort/vsort.c (revision 281122) +++ head/usr.bin/sort/vsort.c (revision 281123) @@ -1,265 +1,261 @@ /*- * Copyright (C) 2012 Oleg Moskalenko * Copyright (C) 2012 Gabor Kovesdan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include "sort.h" #include "vsort.h" static inline bool isdigit_clocale(wchar_t c) { - return (c >= L'0' && c <= L'9'); } static inline bool isalpha_clocale(wchar_t c) { - return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z')); } static inline bool isalnum_clocale(wchar_t c) { - return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || (c >= L'0' && c <= L'9')); } /* * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$ * Set length of string before suffix. */ static void find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len) { wchar_t c; size_t clen; bool expect_alpha, sfx; sfx = false; expect_alpha = false; *len = 0; clen = 0; while ((si < se) && (c = bws_get_iter_value(si))) { if (expect_alpha) { expect_alpha = false; if (!isalpha_clocale(c) && (c != L'~')) sfx = false; } else if (c == L'.') { expect_alpha = true; if (!sfx) { sfx = true; *len = clen; } } else if (!isalnum_clocale(c) && (c != L'~')) sfx = false; si = bws_iterator_inc(si, 1); ++clen; } /* This code must be here to make the implementation compatible * with WORDING of GNU sort documentation. * But the GNU sort implementation is not following its own * documentation. GNU sort allows empty file extensions * (just dot with nothing after); but the regular expression in * their documentation does not allow empty file extensions. * We chose to make our implementation compatible with GNU sort * implementation. If they will ever fix their bug, this code * must be uncommented. Or they may choose to fix the info page, * then the code stays commented. * if (expect_alpha) sfx = false; */ if (!sfx) *len = clen; } static inline int cmp_chars(wchar_t c1, wchar_t c2) { - if (c1 == c2) return (0); if (c1 == L'~') return (-1); if (c2 == L'~') return (+1); if (isdigit_clocale(c1) || !c1) return ((isdigit_clocale(c2) || !c2) ? 0 : -1); if (isdigit_clocale(c2) || !c2) return (+1); if (isalpha_clocale(c1)) return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1); if (isalpha_clocale(c2)) return (+1); return ((int) c1 - (int) c2); } static int cmpversions(bwstring_iterator si1, bwstring_iterator se1, bwstring_iterator si2, bwstring_iterator se2) { int cmp, diff; while ((si1 < se1) || (si2 < se2)) { diff = 0; while (((si1 < se1) && !isdigit_clocale(bws_get_iter_value(si1))) || ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) { wchar_t c1, c2; c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0; c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0; cmp = cmp_chars(c1, c2); if (cmp) return (cmp); if (si1 < se1) si1 = bws_iterator_inc(si1, 1); if (si2 < se2) si2 = bws_iterator_inc(si2, 1); } while (bws_get_iter_value(si1) == L'0') si1 = bws_iterator_inc(si1, 1); while (bws_get_iter_value(si2) == L'0') si2 = bws_iterator_inc(si2, 1); while (isdigit_clocale(bws_get_iter_value(si1)) && isdigit_clocale(bws_get_iter_value(si2))) { if (!diff) diff = ((int)bws_get_iter_value(si1) - (int)bws_get_iter_value(si2)); si1 = bws_iterator_inc(si1, 1); si2 = bws_iterator_inc(si2, 1); } if (isdigit_clocale(bws_get_iter_value(si1))) return (1); if (isdigit_clocale(bws_get_iter_value(si2))) return (-1); if (diff) return (diff); } return (0); } /* * Compare two version strings */ int vcmp(struct bwstring *s1, struct bwstring *s2) { bwstring_iterator si1, si2; wchar_t c1, c2; size_t len1, len2, slen1, slen2; int cmp_bytes, cmp_res; if (s1 == s2) return (0); cmp_bytes = bwscmp(s1, s2, 0); if (cmp_bytes == 0) return (0); len1 = slen1 = BWSLEN(s1); len2 = slen2 = BWSLEN(s2); if (slen1 < 1) return (-1); if (slen2 < 1) return (+1); si1 = bws_begin(s1); si2 = bws_begin(s2); c1 = bws_get_iter_value(si1); c2 = bws_get_iter_value(si2); if (c1 == L'.' && (slen1 == 1)) return (-1); if (c2 == L'.' && (slen2 == 1)) return (+1); if (slen1 == 2 && c1 == L'.' && bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.') return (-1); if (slen2 == 2 && c2 == L'.' && bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.') return (+1); if (c1 == L'.' && c2 != L'.') return (-1); if (c1 != L'.' && c2 == L'.') return (+1); if (c1 == L'.' && c2 == L'.') { si1 = bws_iterator_inc(si1, 1); si2 = bws_iterator_inc(si2, 1); } find_suffix(si1, bws_end(s1), &len1); find_suffix(si2, bws_end(s2), &len2); if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0)) return (cmp_bytes); cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2, bws_iterator_inc(si2, len2)); if (cmp_res == 0) cmp_res = cmp_bytes; return (cmp_res); }