diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c index af1ed69c9482..e246a0d4adfc 100644 --- a/usr.bin/split/split.c +++ b/usr.bin/split/split.c @@ -1,445 +1,458 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1987, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1987, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif #ifndef lint static const char sccsid[] = "@(#)split.c 8.2 (Berkeley) 4/16/94"; #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEFLINE 1000 /* Default num lines per file. */ static off_t bytecnt; /* Byte count to split on. */ -static off_t chunks = 0; /* Chunks count to split into. */ +static long chunks; /* Chunks count to split into. */ static bool clobber = true; /* Whether to overwrite existing output files. */ static long numlines; /* Line count to split on. */ static int file_open; /* If a file open. */ static int ifd = -1, ofd = -1; /* Input/output file descriptors. */ static char fname[MAXPATHLEN]; /* File name prefix. */ static regex_t rgx; static int pflag; static bool dflag; static long sufflen = 2; /* File name suffix length. */ -static int autosfx = 1; /* Whether to auto-extend the suffix length. */ +static bool autosfx = true; /* Whether to auto-extend the suffix length. */ static void newfile(void); static void split1(void); static void split2(void); static void split3(void); static void usage(void) __dead2; int main(int argc, char **argv) { - const char *p; - char *ep; + char errbuf[64]; + const char *p, *errstr; int ch, error; setlocale(LC_ALL, ""); dflag = false; while ((ch = getopt(argc, argv, "0::1::2::3::4::5::6::7::8::9::a:b:cdl:n:p:")) != -1) switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': /* * Undocumented kludge: split was originally designed * to take a number after a dash. */ if (numlines != 0) usage(); numlines = ch - '0'; p = optarg ? optarg : ""; while (numlines >= 0 && *p >= '0' && *p <= '9') numlines = numlines * 10 + *p++ - '0'; if (numlines <= 0 || *p != '\0') - errx(EX_USAGE, "%c%s: illegal line count", ch, - optarg ? optarg : ""); + errx(EX_USAGE, "%c%s: line count is invalid", + ch, optarg ? optarg : ""); break; case 'a': /* Suffix length */ - if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep) - errx(EX_USAGE, - "%s: illegal suffix length", optarg); - autosfx = 0; + sufflen = strtonum(optarg, 0, INT_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: suffix length is %s", + optarg, errstr); + } + if (sufflen == 0) { + sufflen = 2; + autosfx = true; + } else { + autosfx = false; + } break; case 'b': /* Byte count. */ - errno = 0; - error = expand_number(optarg, &bytecnt); - if (error == -1) - errx(EX_USAGE, "%s: offset too large", optarg); + if (expand_number(optarg, &bytecnt) != 0) { + errx(EX_USAGE, "%s: byte count is invalid", + optarg); + } break; case 'c': /* Continue, don't overwrite output files. */ clobber = false; break; case 'd': /* Decimal suffix */ dflag = true; break; case 'l': /* Line count. */ if (numlines != 0) usage(); - if ((numlines = strtol(optarg, &ep, 10)) <= 0 || *ep) - errx(EX_USAGE, - "%s: illegal line count", optarg); + numlines = strtonum(optarg, 1, LONG_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: line count is %s", + optarg, errstr); + } break; case 'n': /* Chunks. */ - if (!isdigit((unsigned char)optarg[0]) || - (chunks = (size_t)strtoul(optarg, &ep, 10)) == 0 || - *ep != '\0') { - errx(EX_USAGE, "%s: illegal number of chunks", - optarg); + chunks = strtonum(optarg, 1, LONG_MAX, &errstr); + if (errstr != NULL) { + errx(EX_USAGE, "%s: number of chunks is %s", + optarg, errstr); } break; case 'p': /* pattern matching. */ - if (regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB) != 0) - errx(EX_USAGE, "%s: illegal regexp", optarg); + error = regcomp(&rgx, optarg, REG_EXTENDED|REG_NOSUB); + if (error != 0) { + regerror(error, &rgx, errbuf, sizeof(errbuf)); + errx(EX_USAGE, "%s: regex is invalid: %s", + optarg, errbuf); + } pflag = 1; break; default: usage(); } argv += optind; argc -= optind; if (argc > 0) { /* Input file. */ if (strcmp(*argv, "-") == 0) ifd = STDIN_FILENO; else if ((ifd = open(*argv, O_RDONLY, 0)) < 0) err(EX_NOINPUT, "%s", *argv); ++argv; --argc; } if (argc > 0) { /* File name prefix. */ - if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) - errx(EX_USAGE, "file name prefix is too long: %s", + if (strlcpy(fname, *argv, sizeof(fname)) >= sizeof(fname)) { + errx(EX_USAGE, "%s: file name prefix is too long", *argv); + } ++argv; --argc; } if (argc > 0) usage(); if (strlen(fname) + (unsigned long)sufflen >= sizeof(fname)) errx(EX_USAGE, "suffix is too long"); if (pflag && (numlines != 0 || bytecnt != 0 || chunks != 0)) usage(); if (numlines == 0) numlines = DEFLINE; else if (bytecnt != 0 || chunks != 0) usage(); - if (bytecnt && chunks) + if (bytecnt != 0 && chunks != 0) usage(); if (ifd == -1) /* Stdin by default. */ ifd = 0; - if (bytecnt) { + if (bytecnt != 0) { split1(); exit (0); - } else if (chunks) { + } else if (chunks != 0) { split3(); exit (0); } split2(); if (pflag) regfree(&rgx); exit(0); } /* * split1 -- * Split the input by bytes. */ static void split1(void) { static char bfr[MAXBSIZE]; off_t bcnt; char *C; ssize_t dist, len; int nfiles; nfiles = 0; for (bcnt = 0;;) switch ((len = read(ifd, bfr, sizeof(bfr)))) { case 0: exit(0); case -1: err(EX_IOERR, "read"); /* NOTREACHED */ default: if (!file_open) { - if (!chunks || (nfiles < chunks)) { + if (chunks == 0 || nfiles < chunks) { newfile(); nfiles++; } } if (bcnt + len >= bytecnt) { dist = bytecnt - bcnt; if (write(ofd, bfr, dist) != dist) err(EX_IOERR, "write"); len -= dist; for (C = bfr + dist; len >= bytecnt; - len -= bytecnt, C += bytecnt) { - if (!chunks || (nfiles < chunks)) { - newfile(); + len -= bytecnt, C += bytecnt) { + if (chunks == 0 || nfiles < chunks) { + newfile(); nfiles++; } - if (write(ofd, - C, bytecnt) != bytecnt) + if (write(ofd, C, bytecnt) != bytecnt) err(EX_IOERR, "write"); } if (len != 0) { - if (!chunks || (nfiles < chunks)) { - newfile(); + if (chunks == 0 || nfiles < chunks) { + newfile(); nfiles++; } if (write(ofd, C, len) != len) err(EX_IOERR, "write"); - } else + } else { file_open = 0; + } bcnt = len; } else { bcnt += len; if (write(ofd, bfr, len) != len) err(EX_IOERR, "write"); } } } /* * split2 -- * Split the input by lines. */ static void split2(void) { char *buf; size_t bufsize; ssize_t len; long lcnt = 0; FILE *infp; buf = NULL; bufsize = 0; /* Stick a stream on top of input file descriptor */ if ((infp = fdopen(ifd, "r")) == NULL) err(EX_NOINPUT, "fdopen"); /* Process input one line at a time */ while ((errno = 0, len = getline(&buf, &bufsize, infp)) > 0) { /* Check if we need to start a new file */ if (pflag) { regmatch_t pmatch; pmatch.rm_so = 0; pmatch.rm_eo = len - 1; if (regexec(&rgx, buf, 0, &pmatch, REG_STARTEND) == 0) newfile(); } else if (lcnt++ == numlines) { newfile(); lcnt = 1; } /* Open output file if needed */ if (!file_open) newfile(); /* Write out line */ if (write(ofd, buf, len) != len) err(EX_IOERR, "write"); } /* EOF or error? */ if ((len == -1 && errno != 0) || ferror(infp)) err(EX_IOERR, "read"); else exit(0); } /* * split3 -- * Split the input into specified number of chunks */ static void split3(void) { struct stat sb; if (fstat(ifd, &sb) == -1) { err(1, "stat"); /* NOTREACHED */ } if (chunks > sb.st_size) { errx(1, "can't split into more than %d files", (int)sb.st_size); /* NOTREACHED */ } bytecnt = sb.st_size / chunks; split1(); } /* * newfile -- * Open a new output file. */ static void newfile(void) { long i, maxfiles, tfnum; static long fnum; static char *fpnt; char beg, end; int pattlen; int flags = O_WRONLY | O_CREAT | O_TRUNC; if (!clobber) flags |= O_EXCL; if (ofd == -1) { if (fname[0] == '\0') { fname[0] = 'x'; fpnt = fname + 1; } else { fpnt = fname + strlen(fname); } } else if (close(ofd) != 0) err(1, "%s", fname); again: if (dflag) { beg = '0'; end = '9'; } else { beg = 'a'; end = 'z'; } pattlen = end - beg + 1; /* * If '-a' is not specified, then we automatically expand the * suffix length to accomodate splitting all input. We do this * by moving the suffix pointer (fpnt) forward and incrementing * sufflen by one, thereby yielding an additional two characters * and allowing all output files to sort such that 'cat *' yields * the input in order. I.e., the order is '... xyy xyz xzaaa * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. */ if (!dflag && autosfx && (fpnt[0] == 'y') && strspn(fpnt+1, "z") == strlen(fpnt+1)) { fpnt = fname + strlen(fname) - sufflen; fpnt[sufflen + 2] = '\0'; fpnt[0] = end; fpnt[1] = beg; /* Basename | Suffix * before: * x | yz * after: * xz | a.. */ fpnt++; sufflen++; /* Reset so we start back at all 'a's in our extended suffix. */ fnum = 0; } /* maxfiles = pattlen^sufflen, but don't use libm. */ for (maxfiles = 1, i = 0; i < sufflen; i++) if (LONG_MAX / pattlen < maxfiles) errx(EX_USAGE, "suffix is too long (max %ld)", i); else maxfiles *= pattlen; if (fnum == maxfiles) errx(EX_DATAERR, "too many files"); /* Generate suffix of sufflen letters */ tfnum = fnum; i = sufflen - 1; do { fpnt[i] = tfnum % pattlen + beg; tfnum /= pattlen; } while (i-- > 0); fpnt[sufflen] = '\0'; ++fnum; if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { if (!clobber && errno == EEXIST) goto again; err(EX_IOERR, "%s", fname); } file_open = 1; } static void usage(void) { (void)fprintf(stderr, "usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" " split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" " split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" " split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); exit(EX_USAGE); } diff --git a/usr.bin/split/tests/split_test.sh b/usr.bin/split/tests/split_test.sh index c9b87c01618c..48065719055a 100755 --- a/usr.bin/split/tests/split_test.sh +++ b/usr.bin/split/tests/split_test.sh @@ -1,255 +1,274 @@ # # SPDX-License-Identifier: BSD-2-Clause # # Copyright (c) 2022-2023 Klara Systems # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # # sys/param.h : ${MAXBSIZE:=65536} atf_test_case bytes bytes_body() { printf "aaaa" > foo-aa printf "bb\nc" > foo-ab printf "ccc\n" > foo-ac cat foo-* > foo atf_check split -b 4 foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac # MAXBSIZE is the default buffer size, so we'll split at just a little # bit past the buffer size to make sure that it still properly splits # even when it needs to read again to hit the limit. bsize=$((MAXBSIZE + 12)) rm foo-* foo jot -ns "" -b "a" ${bsize} > foo-aa jot -ns "" -b "b" ${bsize} > foo-ab jot -ns "" -b "c" 12 > foo-ac cat foo-* > foo atf_check split -b ${bsize} foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac } atf_test_case chunks chunks_body() { jot -ns "" -b "a" 4096 > foo jot -ns "" -b "b" 4096 >> foo jot -ns "" -b "c" 4104 >> foo chunks=3 jot -ns "" -b "a" 4096 > foo-aa jot -ns "" -b "b" 2 >> foo-aa jot -ns "" -b "b" 4094 > foo-ab jot -ns "" -b "c" 4 >> foo-ab jot -ns "" -b "c" 4100 > foo-ac atf_check split -n ${chunks} foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac } atf_test_case sensible_lines sensible_lines_body() { echo "The quick brown fox" > foo-aa echo "jumps over" > foo-ab echo "the lazy dog" > foo-ac cat foo-* > foo atf_check split -l 1 foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac # Try again, make sure that `-` uses stdin as documented. atf_check rm split-* atf_check -x 'split -l 1 - split- < foo' atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac # Finally, try with -l == 2; we should see a 2/1 split instead of the # previous 1/1/1. cat foo-aa foo-ab > foo-aa-ng cat foo-ac > foo-ab-ng atf_check rm split-* atf_check split -l 2 foo split- atf_check -o file:foo-aa-ng cat split-aa atf_check -o file:foo-ab-ng cat split-ab } atf_test_case long_lines long_lines_body() { # Test file lines will be: # a x MAXBSIZE # b x MAXBSIZE + c x MAXBSIZE # d x 1024 # # The historical split(1) implementation wouldn't grow its internal # buffer, so we'd end up with 2/3 split- files being wrong with -l 1. # Notably, split-aa would include most of the first two lines, split-ab # a tiny fraction of the second line, and split-ac the third line. # # Recent split(1) instead grows the buffer until we can either fit the # line or we run out of memory. jot -s "" -b "a" ${MAXBSIZE} > foo-aa jot -ns "" -b "b" ${MAXBSIZE} > foo-ab jot -s "" -b "c" ${MAXBSIZE} >> foo-ab jot -s "" -b "d" 1024 > foo-ac cat foo-* > foo atf_check split -l 1 foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab atf_check -o file:foo-ac cat split-ac } atf_test_case numeric_suffix numeric_suffix_body() { echo "The quick brown fox" > foo-00 echo "jumps over" > foo-01 echo "the lazy dog" > foo-02 cat foo-* > foo atf_check split -d -l 1 foo split- atf_check -o file:foo-00 cat split-00 atf_check -o file:foo-01 cat split-01 atf_check -o file:foo-02 cat split-02 } atf_test_case larger_suffix_length larger_suffix_length_body() { :> foo # Generate foo-000 through foo-009, then foo-010 and foo-011 for i in $(seq -w 0 11); do len=$((${i##0} + 1)) file="foo-0${i}" jot -s "" -b "a" ${len} > ${file} cat ${file} >> foo done atf_check split -a 3 -d -l 1 foo split- for i in $(seq -w 0 11); do srcfile="foo-0${i}" splitfile="split-0${i}" atf_check -o file:"${srcfile}" cat "${splitfile}" done } atf_test_case pattern pattern_body() { # Some fake yaml gives us a good realistic use-case for -p, as we can # split on top-level stanzas. cat < foo-aa cat: aa: true ab: true ac: true EOF cat < foo-ab dog: ba: true bb: true bc: true EOF cat foo-* > foo atf_check split -p "^[^[:space:]]+:" foo split- atf_check -o file:foo-aa cat split-aa atf_check -o file:foo-ab cat split-ab } atf_test_case autoextend autoextend_body() { seq $((26*25+1)) >input atf_check split -l1 input atf_check -o inline:"$((26*25))\n" cat xyz atf_check -o inline:"$((26*25+1))\n" cat xzaaa } +atf_test_case noautoextend +noautoextend_body() +{ + seq $((26*26)) >input + atf_check split -a2 -l1 input + atf_check -o inline:"$((26*26))\n" cat xzz +} + +atf_test_case reautoextend +reautoextend_body() +{ + seq $((26*25+1)) >input + atf_check split -a2 -a0 -l1 input + atf_check -o inline:"$((26*25))\n" cat xyz + atf_check -o inline:"$((26*25+1))\n" cat xzaaa +} + atf_test_case continue continue_body() { echo hello >input atf_check split input atf_check -o file:input cat xaa atf_check -s exit:1 -e ignore cat xab atf_check split -c input atf_check -o file:input cat xab } atf_test_case undocumented_kludge undocumented_kludge_body() { seq 5000 >input atf_check split -1000 input atf_check -o file:xae seq 4001 5000 atf_check split -d1000 input atf_check -o file:x04 seq 4001 5000 } atf_test_case duplicate_linecount duplicate_linecount_body() { atf_check -s exit:64 -e ignore split -5 -5 /dev/null atf_check -s exit:64 -e ignore split -l5 -5 /dev/null atf_check -s exit:64 -e ignore split -5 -l5 /dev/null atf_check -s exit:64 -e ignore split -l5 -l5 /dev/null } atf_init_test_cases() { atf_add_test_case bytes atf_add_test_case chunks atf_add_test_case sensible_lines atf_add_test_case long_lines atf_add_test_case numeric_suffix atf_add_test_case larger_suffix_length atf_add_test_case pattern atf_add_test_case autoextend + atf_add_test_case noautoextend + atf_add_test_case reautoextend atf_add_test_case continue atf_add_test_case undocumented_kludge atf_add_test_case duplicate_linecount }