Index: include/regex.h =================================================================== --- include/regex.h +++ include/regex.h @@ -69,6 +69,7 @@ #define REG_NOSPEC 0020 #define REG_PEND 0040 #define REG_DUMP 0200 +#define REG_POSIX 0400 /* only POSIX-compliant regex (libregex) */ /* regerror() flags */ #define REG_ENOSYS (-1) Index: lib/libc/regex/engine.c =================================================================== --- lib/libc/regex/engine.c +++ lib/libc/regex/engine.c @@ -104,7 +104,7 @@ static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags); #define MAX_RECURSION 100 #define BOL (OUT-1) #define EOL (BOL-1) @@ -114,6 +114,11 @@ #define EOW (BOL-5) #define BADCHAR (BOL-6) #define NONCHAR(c) ((c) <= OUT) +/* sflags */ +#define SNWBND 01 +#define SBOS 02 +#define SEOS 04 + #ifdef REDEBUG static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif @@ -181,6 +186,17 @@ if (stop < start) return(REG_INVARG); + /* Trivial zero-length match on empty sub */ + if (g->iflags & EMPTBR) { + if (nmatch > 0) { + pmatch[0].rm_so = pmatch[0].rm_eo = 0; + + for (i = 1; i < nmatch; i++) + pmatch[i].rm_so = pmatch[i].rm_eo = -1; + } + return(0); + } + /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { if (g->charjump != NULL && g->matchjump != NULL) { @@ -410,10 +426,14 @@ case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; + case OBOS: + case OEOS: case OBOL: case OEOL: case OBOW: 
case OEOW: + case OWBND: + case ONWBND: break; case OANY: case OANYOF: @@ -619,22 +639,45 @@ else return(NULL); break; + case OBOS: + break; + if (sp == m->beginp) + { /* yes */ } + else + return(NULL); + break; + case OEOS: + break; + if (sp == m->endp) + { /* yes */ } + else + return(NULL); + break; + case ONWBND: + if (sp > m->beginp && sp < m->endp && + ISWORD(*(sp-1)) == ISWORD(*sp)) + { /* yes */ } + else + return(NULL); + break; + case OWBND: case OBOW: if (sp < m->endp && ISWORD(*sp) && ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || (sp > m->offp && !ISWORD(*(sp-1))))) { /* yes */ } - else + else if (OP(s) == OBOW) return(NULL); - break; + /* FALLTHROUGH */ case OEOW: - if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || + if (OP(s) != OBOW && + ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags&REG_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } - else + else if (OP(s) != OBOW) return(NULL); break; case O_QUEST: @@ -678,7 +721,7 @@ ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return(NULL); - while (m->g->strip[ss] != SOP(O_BACK, i)) + while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) ss++; return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); case OQUEST_: /* to null or not */ @@ -767,6 +810,7 @@ states fresh = m->fresh; states empty = m->empty; states tmp = m->tmp; + sopno nxop; const char *p = start; wint_t c; wint_t lastc; /* previous c */ @@ -774,12 +818,13 @@ int i; const char *matchp; /* last p at which a match ended */ size_t clen; + int sflags = 0; AT("slow", start, stop, startst, stopst); CLEAR(st); SET1(st, startst); SP("sstart", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); + st = step(m->g, startst, stopst, st, NOTHING, st, sflags); if (fast) ASSIGN(fresh, st); matchp = NULL; @@ -820,7 +865,7 @@ } if (i != 0) { for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); + st = step(m->g, startst, stopst, st, 
flagch, st, sflags); SP("sboleol", st, c); } @@ -833,11 +878,24 @@ (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("sboweow", st, c); + if (p == m->beginp) + sflags |= SBOS; + if (p == m->endp) + sflags |= SEOS; + if (flagch != BOW && flagch != EOW && + lastc != OUT && c != OUT && ISWORD(lastc) == ISWORD(c)) + sflags |= SNWBND; + nxop = OP(m->g->strip[startst]); + /* Consume a match for BOW/EOW markers */ + if (flagch == BOW || flagch == EOW || + nxop == ONWBND || nxop == OBOS || nxop == OEOS) { + st = step(m->g, startst, stopst, st, flagch, st, sflags); + SP("sboweownbwnd", st, c); } + /* Don't match 0-length ops elsewhere */ + sflags = 0; + /* are we done? */ if (ISSET(st, stopst)) { if (fast) @@ -845,7 +903,7 @@ else matchp = p; } - if (EQ(st, empty) || p == stop || clen > stop - p) + if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ @@ -855,9 +913,9 @@ else ASSIGN(st, empty); assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); + st = step(m->g, startst, stopst, tmp, c, st, sflags); SP("saft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); + assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags), st)); p += clen; } @@ -891,7 +949,8 @@ sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ wint_t ch, /* character or NONCHAR code */ - states aft) /* states already known reachable after */ + states aft, /* states already known reachable after */ + int sflags) /* 0-length matching states*/ { cset *cs; sop s; @@ -920,12 +979,25 @@ if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; + case ONWBND: + if (sflags & SNWBND) + FWD(aft, bef, 1); + break; + case OBOS: + if (sflags & SBOS) + FWD(aft, bef, 1); + break; + case OEOS: + if (sflags & SEOS) + FWD(aft, bef, 1); + break; + case OWBND: 
case OBOW: if (ch == BOW) FWD(aft, bef, 1); - break; + /* FALLTHROUGH */ case OEOW: - if (ch == EOW) + if (OP(s) != OBOW && ch == EOW) FWD(aft, bef, 1); break; case OANY: Index: lib/libc/regex/regcomp.c =================================================================== --- lib/libc/regex/regcomp.c +++ lib/libc/regex/regcomp.c @@ -55,7 +55,9 @@ #include #include +#ifndef LIBREGEX #include "collate.h" +#endif #include "utils.h" #include "regex2.h" @@ -88,6 +90,7 @@ const char *next; /* next character in RE */ const char *end; /* end of string (-> NUL normally) */ int error; /* has an error been seen? */ + int gnuext; sop *strip; /* malloced strip */ sopno ssize; /* malloced strip size (allocated) */ sopno slen; /* malloced strip length (used) */ @@ -122,8 +125,11 @@ static bool p_simp_re(struct parse *p, struct branchc *bc); static int p_count(struct parse *p); static void p_bracket(struct parse *p); +static int p_range_cmp(wchar_t c1, wchar_t c2); static void p_b_term(struct parse *p, cset *cs); +static int p_b_pseudoclass(struct parse *p, char c); static void p_b_cclass(struct parse *p, cset *cs); +static void p_b_cclass_named(struct parse *p, cset *cs, const char[]); static void p_b_eclass(struct parse *p, cset *cs); static wint_t p_b_symbol(struct parse *p); static wint_t p_b_coll_elem(struct parse *p, wint_t endc); @@ -172,6 +178,7 @@ #define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a)) #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) +#define EATSPEC(a) (p->bre ? 
EATTWO('\\', a) : EAT(a)) #define NEXT() (p->next++) #define NEXT2() (p->next += 2) #define NEXTn(n) (p->next += (n)) @@ -278,14 +285,22 @@ p->pbegin[i] = 0; p->pend[i] = 0; } +#ifdef LIBREGEX + if (cflags&REG_POSIX) { + p->gnuext = false; + p->allowbranch = (cflags & REG_EXTENDED) != 0; + } else + p->gnuext = p->allowbranch = true; +#else + p->gnuext = false; + p->allowbranch = (cflags & REG_EXTENDED) != 0; +#endif if (cflags & REG_EXTENDED) { - p->allowbranch = true; p->bre = false; p->parse_expr = p_ere_exp; p->pre_parse = NULL; p->post_parse = NULL; } else { - p->allowbranch = false; p->bre = true; p->parse_expr = p_simp_re; p->pre_parse = p_bre_pre_parse; @@ -359,12 +374,19 @@ sopno pos; int count; int count2; +#ifdef LIBREGEX + int i; + int handled; +#endif sopno subno; int wascaret = 0; - assert(MORE()); /* caller should have ensured this */ c = GETNEXT(); + (void)bc; +#ifdef LIBREGEX + handled = 0; +#endif pos = HERE(); switch (c) { case '(': @@ -427,6 +449,59 @@ case '\\': (void)REQUIRE(MORE(), REG_EESCAPE); wc = WGETNEXT(); +#ifdef LIBREGEX + if (p->gnuext) { + handled = 1; + switch (wc) { + case '`': + EMIT(OBOS, 0); + break; + case '\'': + EMIT(OEOS, 0); + break; + case 'b': + EMIT(OWBND, 0); + break; + case 'B': + EMIT(ONWBND, 0); + break; + case 'W': + case 'w': + case 'S': + case 's': + p_b_pseudoclass(p, wc); + break; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + i = wc - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + default: + handled = 0; + } + /* Don't proceed to the POSIX bits if we've already handled it */ + if (handled) + break; + } +#endif switch (wc) { case '<': EMIT(OBOW, 0); 
@@ -532,8 +607,9 @@ { int nskip; + (void)bc; nskip = 0; - while (EAT('|')) + while (EATSPEC('|')) ++nskip; return (nskip); } @@ -585,8 +661,15 @@ p_branch_empty(struct parse *p, struct branchc *bc) { +#if defined(LIBREGEX) && defined(NOTYET) + if (bc->outer) + p->g->iflags |= EMPTBR; + return (true); +#else + (void)bc; SETERROR(REG_EMPTY); return (false); +#endif } /* @@ -678,7 +761,11 @@ } if (p->post_parse != NULL) p->post_parse(p, &bc); - (void) REQUIRE(HERE() != bc.start, REG_EMPTY); + (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY); +#ifdef LIBREGEX + if (HERE() == bc.start && !p_branch_empty(p, &bc)) + break; +#endif if (!p->allowbranch) break; /* @@ -705,106 +792,139 @@ p_simp_re(struct parse *p, struct branchc *bc) { int c; + int cc; /* convenient/control character */ int count; int count2; sopno pos; + bool handled; int i; wint_t wc; sopno subno; # define BACKSL (1<gnuext) { + handled = true; + switch (c) { + case BACKSL|'`': + EMIT(OBOS, 0); + break; + case BACKSL|'\'': + EMIT(OEOS, 0); + break; + case BACKSL|'b': + EMIT(OWBND, 0); + break; + case BACKSL|'B': + EMIT(ONWBND, 0); + break; + case BACKSL|'W': + case BACKSL|'w': + case BACKSL|'S': + case BACKSL|'s': + p_b_pseudoclass(p, cc); + break; + default: + handled = false; + } + } +#endif } - switch (c) { - case '.': - if (p->g->cflags&REG_NEWLINE) - nonnewline(p); - else - EMIT(OANY, 0); - break; - case '[': - p_bracket(p); - break; - case BACKSL|'<': - EMIT(OBOW, 0); - break; - case BACKSL|'>': - EMIT(OEOW, 0); - break; - case BACKSL|'{': - SETERROR(REG_BADRPT); - break; - case BACKSL|'(': - p->g->nsub++; - subno = p->g->nsub; - if (subno < NPAREN) - p->pbegin[subno] = HERE(); - EMIT(OLPAREN, subno); - /* the MORE here is an error heuristic */ - if (MORE() && !SEETWO('\\', ')')) - p_re(p, '\\', ')'); - if (subno < NPAREN) { - p->pend[subno] = HERE(); - assert(p->pend[subno] != 0); + if (!handled) { + switch (c) { + case '.': + if (p->g->cflags&REG_NEWLINE) + nonnewline(p); + else + EMIT(OANY, 0); 
+ break; + case '[': + p_bracket(p); + break; + case BACKSL|'<': + EMIT(OBOW, 0); + break; + case BACKSL|'>': + EMIT(OEOW, 0); + break; + case BACKSL|'{': + SETERROR(REG_BADRPT); + break; + case BACKSL|'(': + p->g->nsub++; + subno = p->g->nsub; + if (subno < NPAREN) + p->pbegin[subno] = HERE(); + EMIT(OLPAREN, subno); + /* the MORE here is an error heuristic */ + if (MORE() && !SEETWO('\\', ')')) + p_re(p, '\\', ')'); + if (subno < NPAREN) { + p->pend[subno] = HERE(); + assert(p->pend[subno] != 0); + } + EMIT(ORPAREN, subno); + (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); + break; + case BACKSL|')': /* should not get here -- must be user */ + SETERROR(REG_EPAREN); + break; + case BACKSL|'1': + case BACKSL|'2': + case BACKSL|'3': + case BACKSL|'4': + case BACKSL|'5': + case BACKSL|'6': + case BACKSL|'7': + case BACKSL|'8': + case BACKSL|'9': + i = (c&~BACKSL) - '0'; + assert(i < NPAREN); + if (p->pend[i] != 0) { + assert(i <= p->g->nsub); + EMIT(OBACK_, i); + assert(p->pbegin[i] != 0); + assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); + assert(OP(p->strip[p->pend[i]]) == ORPAREN); + (void) dupl(p, p->pbegin[i]+1, p->pend[i]); + EMIT(O_BACK, i); + } else + SETERROR(REG_ESUBREG); + p->g->backrefs = 1; + break; + case '*': + (void)REQUIRE(bc->nchain == 0, REG_BADRPT); + /* FALLTHROUGH */ + default: + p->next--; + wc = WGETNEXT(); + ordinary(p, wc); + break; } - EMIT(ORPAREN, subno); - (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN); - break; - case BACKSL|')': /* should not get here -- must be user */ - SETERROR(REG_EPAREN); - break; - case BACKSL|'1': - case BACKSL|'2': - case BACKSL|'3': - case BACKSL|'4': - case BACKSL|'5': - case BACKSL|'6': - case BACKSL|'7': - case BACKSL|'8': - case BACKSL|'9': - i = (c&~BACKSL) - '0'; - assert(i < NPAREN); - if (p->pend[i] != 0) { - assert(i <= p->g->nsub); - EMIT(OBACK_, i); - assert(p->pbegin[i] != 0); - assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); - assert(OP(p->strip[p->pend[i]]) == ORPAREN); - (void) dupl(p, p->pbegin[i]+1, 
p->pend[i]); - EMIT(O_BACK, i); - } else - SETERROR(REG_ESUBREG); - p->g->backrefs = 1; - break; - case '*': - /* - * Ordinary if used as the first character beyond BOL anchor of - * a (sub-)expression, counts as a bad repetition operator if it - * appears otherwise. - */ - (void)REQUIRE(bc->nchain == 0, REG_BADRPT); - /* FALLTHROUGH */ - default: - if (p->error != 0) - return (false); /* Definitely not $... */ - p->next--; - wc = WGETNEXT(); - ordinary(p, wc); - break; } - if (EAT('*')) { /* implemented as +? */ /* this case does not require the (y|) trick, noKLUDGE */ INSERT(OPLUS_, pos); ASTERN(O_PLUS, pos); INSERT(OQUEST_, pos); ASTERN(O_QUEST, pos); +#ifdef LIBREGEX + } else if (p->gnuext && EATTWO('\\', '?')) { + INSERT(OQUEST_, pos); + ASTERN(O_QUEST, pos); + } else if (p->gnuext && EATTWO('\\', '+')) { + INSERT(OPLUS_, pos); + ASTERN(O_PLUS, pos); +#endif } else if (EATTWO('\\', '{')) { count = p_count(p); if (EAT(',')) { @@ -899,6 +1019,23 @@ EMIT(OANYOF, (int)(cs - p->g->sets)); } +static int +p_range_cmp(wchar_t c1, wchar_t c2) +{ +#ifndef LIBREGEX + return __wcollate_range_cmp(c1, c2); +#else + /* Copied from libc/collate __wcollate_range_cmp */ + wchar_t s1[2], s2[2]; + + s1[0] = c1; + s1[1] = L'\0'; + s2[0] = c2; + s2[1] = L'\0'; + return (wcscoll(s1, s2)); +#endif +} + /* - p_b_term - parse one term of a bracketed character list == static void p_b_term(struct parse *p, cset *cs); @@ -909,9 +1046,10 @@ char c; wint_t start, finish; wint_t i; +#ifndef LIBREGEX struct xlocale_collate *table = (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE]; - +#endif /* classify what we've got */ switch ((MORE()) ? 
PEEK() : '\0') { case '[': @@ -958,15 +1096,18 @@ if (start == finish) CHadd(p, cs, start); else { +#ifndef LIBREGEX if (table->__collate_load_error || MB_CUR_MAX > 1) { +#else + if (MB_CUR_MAX > 1) { +#endif (void)REQUIRE(start <= finish, REG_ERANGE); CHaddrange(p, cs, start, finish); } else { - (void)REQUIRE(__wcollate_range_cmp(start, finish) <= 0, REG_ERANGE); + (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE); for (i = 0; i <= UCHAR_MAX; i++) { - if ( __wcollate_range_cmp(start, i) <= 0 - && __wcollate_range_cmp(i, finish) <= 0 - ) + if (p_range_cmp(start, i) <= 0 && + p_range_cmp(i, finish) <= 0 ) CHadd(p, cs, i); } } @@ -975,6 +1116,41 @@ } } +/* + - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S) + == static int p_b_pseudoclass(struct parse *p, char c) + */ +static int +p_b_pseudoclass(struct parse *p, char c) { + cset *cs; + + if ((cs = allocset(p)) == NULL) + return(0); + + if (p->g->cflags&REG_ICASE) + cs->icase = 1; + + switch (c) { + case 'W': + cs->invert = 1; + /* FALLTHROUGH */ + case 'w': + p_b_cclass_named(p, cs, "alnum"); + break; + case 'S': + cs->invert = 1; + /* FALLTHROUGH */ + case 's': + p_b_cclass_named(p, cs, "space"); + break; + default: + return(0); + } + + EMIT(OANYOF, (int)(cs - p->g->sets)); + return(1); +} + /* - p_b_cclass - parse a character-class name and deal with it == static void p_b_cclass(struct parse *p, cset *cs); @@ -984,7 +1160,6 @@ { const char *sp = p->next; size_t len; - wctype_t wct; char clname[16]; while (MORE() && isalpha((uch)PEEK())) @@ -996,6 +1171,17 @@ } memcpy(clname, sp, len); clname[len] = '\0'; + + p_b_cclass_named(p, cs, clname); +} +/* + - p_b_cclass_named - deal with a named character class + == static void p_b_cclass_named(struct parse *p, cset *cs, const char []); + */ +static void +p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) { + wctype_t wct; + if ((wct = wctype(clname)) == 0) { SETERROR(REG_ECTYPE); return; @@ -1624,12 +1810,14 @@ /* FALLTHROUGH */ case OBOW: /* 
things that break a sequence */ case OEOW: + case OWBND: + case ONWBND: case OBOL: case OEOL: case O_QUEST: case O_CH: case OEND: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1644,7 +1832,7 @@ newlen = 0; break; case OANY: - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1662,7 +1850,7 @@ break; case OANYOF: /* may or may not invalidate offset */ /* First, everything as OANY */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) { @@ -1685,7 +1873,7 @@ * save the last known good offset, in case the * must sequence doesn't occur later. */ - if (newlen > g->mlen) { /* ends one */ + if (newlen > (sopno)g->mlen) { /* ends one */ start = newstart; g->mlen = newlen; if (offset > -1) @@ -1777,6 +1965,8 @@ try++; case OBOW: case OEOW: + case OWBND: + case ONWBND: case OLPAREN: case ORPAREN: case OOR2: Index: lib/libc/regex/regex.3 =================================================================== --- lib/libc/regex/regex.3 +++ lib/libc/regex/regex.3 @@ -32,7 +32,7 @@ .\" @(#)regex.3 8.4 (Berkeley) 3/20/94 .\" $FreeBSD$ .\" -.Dd May 25, 2016 +.Dd April 15, 2017 .Dt REGEX 3 .Os .Sh NAME @@ -183,6 +183,17 @@ .St -p1003.2 , and should be used with caution in software intended to be portable to other systems. +.It Dv REG_POSIX +Compile only +.St -p1003.2 +compliant expressions. +This flag has no effect unless linking against +.Nm libregex . +This is an extension, +compatible with but not specified by +.St -p1003.2 , +and should be used with +caution in software intended to be portable to other systems. 
.El .Pp When successful, Index: lib/libc/regex/regex2.h =================================================================== --- lib/libc/regex/regex2.h +++ lib/libc/regex/regex2.h @@ -102,7 +102,10 @@ #define O_CH (18L<