Index: head/lib/libc/regex/engine.c =================================================================== --- head/lib/libc/regex/engine.c (revision 340834) +++ head/lib/libc/regex/engine.c (revision 340835) @@ -1,1093 +1,1135 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993, 1994 Henry Spencer. * Copyright (c) 1992, 1993, 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Henry Spencer. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)engine.c 8.5 (Berkeley) 3/20/94 */ #include __FBSDID("$FreeBSD$"); #include /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that * different state representations can be used without duplicating masses * of code. */ #ifdef SNAMES +#define stepback sstepback #define matcher smatcher #define walk swalk #define dissect sdissect #define backref sbackref #define step sstep #define print sprint #define at sat #define match smat #endif #ifdef LNAMES +#define stepback lstepback #define matcher lmatcher #define walk lwalk #define dissect ldissect #define backref lbackref #define step lstep #define print lprint #define at lat #define match lmat #endif #ifdef MNAMES +#define stepback mstepback #define matcher mmatcher #define walk mwalk #define dissect mdissect #define backref mbackref #define step mstep #define print mprint #define at mat #define match mmat #endif /* another structure passed up and down to avoid zillions of parameters */ struct match { struct re_guts *g; int eflags; regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ const char *offp; /* offsets work from here */ const char *beginp; /* start of string -- virtual NUL precedes */ const char *endp; /* end of string -- virtual NUL here */ const char *coldp; /* can be no match starting before here */ const char **lastpos; /* [nplus+1] */ STATEVARS; states st; /* current states */ states fresh; /* states for a fresh start */ states tmp; /* temporary */ states empty; /* empty set of states */ mbstate_t mbs; /* multibyte conversion state */ }; /* ========= begin header generated by ./mkh ========= */ #ifdef __cplusplus extern "C" { #endif /* === engine.c === */ static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); #define MAX_RECURSION 100 #define BOL (OUT-1) #define EOL (BOL-1) #define BOLEOL (BOL-2) #define NOTHING (BOL-3) #define BOW (BOL-4) #define EOW (BOL-5) #define BADCHAR (BOL-6) #define NONCHAR(c) ((c) <= OUT) #ifdef REDEBUG static void print(struct match *m, const char *caption, states st, int ch, FILE *d); #endif #ifdef REDEBUG static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst); #endif #ifdef REDEBUG static const char *pchar(int ch); #endif #ifdef __cplusplus } #endif /* ========= end header generated by ./mkh ========= */ #ifdef REDEBUG #define SP(t, s, c) print(m, t, s, c, stdout) #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } #else #define SP(t, s, c) /* nothing */ #define AT(t, p1, p2, s1, s2) /* nothing */ #define NOTE(s) /* nothing */ #endif /* + * Given a multibyte string pointed to by start, step back nchar characters + * from current position pointed to by cur. + */ +static const char * +stepback(const char *start, const char *cur, int nchar) +{ + const char *ret; + int wc, mbc; + mbstate_t mbs; + size_t clen; + + if (MB_CUR_MAX == 1) + return ((cur - nchar) > start ? cur - nchar : NULL); + + ret = cur; + for (wc = nchar; wc > 0; wc--) { + for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) { + if ((ret - mbc) < start) + return (NULL); + memset(&mbs, 0, sizeof(mbs)); + clen = mbrtowc(NULL, ret - mbc, mbc, &mbs); + if (clen != (size_t)-1 && clen != (size_t)-2) + break; + } + if (mbc > MB_CUR_MAX) + return (NULL); + ret -= mbc; + } + + return (ret); +} + +/* - matcher - the actual matching engine == static int matcher(struct re_guts *g, const char *string, \ == size_t nmatch, regmatch_t pmatch[], int eflags); */ static int /* 0 success, REG_NOMATCH failure */ matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags) { const char *endp; size_t i; struct match mv; struct match *m = &mv; const char *dp = NULL; const sopno gf = g->firststate+1; /* +1 for OEND */ const sopno gl = g->laststate; const char *start; const char *stop; /* Boyer-Moore algorithms variables */ const char *pp; int cj, mj; const char *mustfirst; const char *mustlast; int *matchjump; int *charjump; /* simplify the situation where possible */ if (g->cflags®_NOSUB) nmatch = 0; if (eflags®_STARTEND) { start = string + pmatch[0].rm_so; stop = string + pmatch[0].rm_eo; } else { start = string; stop = start + strlen(start); } if (stop < start) return(REG_INVARG); /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { if (g->charjump != NULL && g->matchjump != NULL) { mustfirst = g->must; mustlast = g->must + g->mlen - 1; charjump = g->charjump; matchjump = g->matchjump; pp = mustlast; for (dp = start+g->mlen-1; dp < stop;) { /* Fast skip non-matches */ while (dp < stop && charjump[(int)*dp]) dp += charjump[(int)*dp]; if (dp >= stop) break; /* Greedy matcher */ /* We depend on not being used for * for strings of length 1 */ while (*--dp == *--pp && pp != mustfirst); if (*dp == *pp) break; /* Jump to next possible match */ mj = matchjump[pp - mustfirst]; cj = charjump[(int)*dp]; dp += (cj < mj ? mj : cj); pp = mustlast; } if (pp != mustfirst) return(REG_NOMATCH); } else { for (dp = start; dp < stop; dp++) if (*dp == g->must[0] && stop - dp >= g->mlen && memcmp(dp, g->must, (size_t)g->mlen) == 0) break; if (dp == stop) /* we didn't find g->must */ return(REG_NOMATCH); } } /* match struct setup */ m->g = g; m->eflags = eflags; m->pmatch = NULL; m->lastpos = NULL; m->offp = string; m->beginp = start; m->endp = stop; STATESETUP(m, 4); SETUP(m->st); SETUP(m->fresh); SETUP(m->tmp); SETUP(m->empty); CLEAR(m->empty); ZAPSTATE(&m->mbs); /* Adjust start according to moffset, to speed things up */ - if (dp != NULL && g->moffset > -1) - start = ((dp - g->moffset) < start) ? start : dp - g->moffset; + if (dp != NULL && g->moffset > -1) { + const char *nstart; + nstart = stepback(start, dp, g->moffset); + if (nstart != NULL) + start = nstart; + } + SP("mloop", m->st, *start); /* this loop does only one repetition except for backrefs */ for (;;) { endp = walk(m, start, stop, gf, gl, true); if (endp == NULL) { /* a miss */ if (m->pmatch != NULL) free((char *)m->pmatch); if (m->lastpos != NULL) free((char *)m->lastpos); STATETEARDOWN(m); return(REG_NOMATCH); } if (nmatch == 0 && !g->backrefs) break; /* no further info needed */ /* where? */ assert(m->coldp != NULL); for (;;) { NOTE("finding start"); endp = walk(m, m->coldp, stop, gf, gl, false); if (endp != NULL) break; assert(m->coldp < m->endp); m->coldp += XMBRTOWC(NULL, m->coldp, m->endp - m->coldp, &m->mbs, 0); } if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ /* oh my, he wants the subexpressions... */ if (m->pmatch == NULL) m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * sizeof(regmatch_t)); if (m->pmatch == NULL) { STATETEARDOWN(m); return(REG_ESPACE); } for (i = 1; i <= m->g->nsub; i++) m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; if (!g->backrefs && !(m->eflags®_BACKR)) { NOTE("dissecting"); dp = dissect(m, m->coldp, endp, gf, gl); } else { if (g->nplus > 0 && m->lastpos == NULL) m->lastpos = malloc((g->nplus+1) * sizeof(const char *)); if (g->nplus > 0 && m->lastpos == NULL) { free(m->pmatch); STATETEARDOWN(m); return(REG_ESPACE); } NOTE("backref dissect"); dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); } if (dp != NULL) break; /* uh-oh... we couldn't find a subexpression-level match */ assert(g->backrefs); /* must be back references doing it */ assert(g->nplus == 0 || m->lastpos != NULL); for (;;) { if (dp != NULL || endp <= m->coldp) break; /* defeat */ NOTE("backoff"); endp = walk(m, m->coldp, endp-1, gf, gl, false); if (endp == NULL) break; /* defeat */ /* try it on a shorter possibility */ #ifndef NDEBUG for (i = 1; i <= m->g->nsub; i++) { assert(m->pmatch[i].rm_so == -1); assert(m->pmatch[i].rm_eo == -1); } #endif NOTE("backoff dissect"); dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0); } assert(dp == NULL || dp == endp); if (dp != NULL) /* found a shorter one */ break; /* despite initial appearances, there is no match here */ NOTE("false alarm"); /* recycle starting later */ start = m->coldp + XMBRTOWC(NULL, m->coldp, stop - m->coldp, &m->mbs, 0); assert(start <= stop); } /* fill in the details if requested */ if (nmatch > 0) { pmatch[0].rm_so = m->coldp - m->offp; pmatch[0].rm_eo = endp - m->offp; } if (nmatch > 1) { assert(m->pmatch != NULL); for (i = 1; i < nmatch; i++) if (i <= m->g->nsub) pmatch[i] = m->pmatch[i]; else { pmatch[i].rm_so = -1; pmatch[i].rm_eo = -1; } } if (m->pmatch != NULL) free((char *)m->pmatch); if (m->lastpos != NULL) free((char *)m->lastpos); STATETEARDOWN(m); return(0); } /* - dissect - figure out what matched what, no back references == static const char *dissect(struct match *m, const char *start, \ == const char *stop, sopno startst, sopno stopst); */ static const char * /* == stop (success) always */ dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst) { int i; sopno ss; /* start sop of current subRE */ sopno es; /* end sop of current subRE */ const char *sp; /* start of string matched by it */ const char *stp; /* string matched by it cannot pass here */ const char *rest; /* start of rest of string */ const char *tail; /* string unmatched by rest of RE */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ const char *ssp; /* start of string matched by subsubRE */ const char *sep; /* end of string matched by subsubRE */ const char *oldssp; /* previous ssp */ const char *dp __unused; AT("diss", start, stop, startst, stopst); sp = start; for (ss = startst; ss < stopst; ss = es) { /* identify end of subRE */ es = ss; switch (OP(m->g->strip[es])) { case OPLUS_: case OQUEST_: es += OPND(m->g->strip[es]); break; case OCH_: while (OP(m->g->strip[es]) != (sop)O_CH) es += OPND(m->g->strip[es]); break; } es++; /* figure out what it matched */ switch (OP(m->g->strip[ss])) { case OEND: assert(nope); break; case OCHAR: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBOL: case OEOL: case OBOW: case OEOW: break; case OANY: case OANYOF: sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBACK_: case O_BACK: assert(nope); break; /* cases where length of match is hard to find */ case OQUEST_: stp = stop; for (;;) { /* how long could this one be? */ rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; /* did innards match? */ if (walk(m, sp, rest, ssub, esub, false) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); } else /* no */ assert(sp == rest); sp = rest; break; case OPLUS_: stp = stop; for (;;) { /* how long could this one be? */ rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = es - 1; ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ sep = walk(m, ssp, rest, ssub, esub, false); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ ssp = sep; } if (sep == NULL) { /* last successful match */ sep = ssp; ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ assert(walk(m, ssp, sep, ssub, esub, false) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; break; case OCH_: stp = stop; for (;;) { /* how long could this one be? */ rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ stp = rest - 1; assert(stp >= sp); /* it did work */ } ssub = ss + 1; esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ if (walk(m, sp, rest, ssub, esub, false) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); esub++; assert(OP(m->g->strip[esub]) == OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); sp = rest; break; case O_PLUS: case O_QUEST: case OOR1: case OOR2: case O_CH: assert(nope); break; case OLPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_so = sp - m->offp; break; case ORPAREN: i = OPND(m->g->strip[ss]); assert(0 < i && i <= m->g->nsub); m->pmatch[i].rm_eo = sp - m->offp; break; default: /* uh oh */ assert(nope); break; } } assert(sp == stop); return(sp); } /* - backref - figure out what matched what, figuring in back references == static const char *backref(struct match *m, const char *start, \ == const char *stop, sopno startst, sopno stopst, sopno lev); */ static const char * /* == stop (success) or NULL (failure) */ backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, /* PLUS nesting level */ int rec) { int i; sopno ss; /* start sop of current subRE */ const char *sp; /* start of string matched by it */ sopno ssub; /* start sop of subsubRE */ sopno esub; /* end sop of subsubRE */ const char *ssp; /* start of string matched by subsubRE */ const char *dp; size_t len; int hard; sop s; regoff_t offsave; cset *cs; wint_t wc; AT("back", start, stop, startst, stopst); sp = start; /* get as far as we can with easy stuff */ hard = 0; for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: if (sp == stop) return(NULL); sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc != OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc == BADCHAR) return (NULL); break; case OANYOF: if (sp == stop) return (NULL); cs = &m->g->sets[OPND(s)]; sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); if (wc == BADCHAR || !CHIN(cs, wc)) return(NULL); break; case OBOL: if ((sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp > m->offp && sp < m->endp && *(sp-1) == '\n' && (m->g->cflags®_NEWLINE))) { /* yes */ } else return(NULL); break; case OEOL: if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) ) { /* yes */ } else return(NULL); break; case OBOW: if (sp < m->endp && ISWORD(*sp) && ((sp == m->beginp && !(m->eflags®_NOTBOL)) || (sp > m->offp && !ISWORD(*(sp-1))))) { /* yes */ } else return(NULL); break; case OEOW: if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || (sp < m->endp && *sp == '\n' && (m->g->cflags®_NEWLINE)) || (sp < m->endp && !ISWORD(*sp)) ) && (sp > m->beginp && ISWORD(*(sp-1))) ) { /* yes */ } else return(NULL); break; case O_QUEST: break; case OOR1: /* matches null but needs to skip */ ss++; s = m->g->strip[ss]; do { assert(OP(s) == OOR2); ss += OPND(s); } while (OP(s = m->g->strip[ss]) != (sop)O_CH); /* note that the ss++ gets us past the O_CH */ break; default: /* have to make a choice */ hard = 1; break; } if (!hard) { /* that was it! */ if (sp != stop) return(NULL); return(sp); } ss--; /* adjust for the for's final increment */ /* the hard stuff */ AT("hard", sp, stop, ss, stopst); s = m->g->strip[ss]; switch (OP(s)) { case OBACK_: /* the vilest depths */ i = OPND(s); assert(0 < i && i <= m->g->nsub); if (m->pmatch[i].rm_eo == -1) return(NULL); assert(m->pmatch[i].rm_so != -1); len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; if (len == 0 && rec++ > MAX_RECURSION) return(NULL); assert(stop - m->beginp >= len); if (sp > stop - len) return(NULL); /* not enough left to match */ ssp = m->offp + m->pmatch[i].rm_so; if (memcmp(sp, ssp, len) != 0) return(NULL); while (m->g->strip[ss] != (sop)SOP(O_BACK, i)) ss++; return(backref(m, sp+len, stop, ss+1, stopst, lev, rec)); case OQUEST_: /* to null or not */ dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); /* not */ return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec)); case OPLUS_: assert(m->lastpos != NULL); assert(lev+1 <= m->g->nplus); m->lastpos[lev+1] = sp; return(backref(m, sp, stop, ss+1, stopst, lev+1, rec)); case O_PLUS: if (sp == m->lastpos[lev]) /* last pass matched null */ return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); /* try another pass */ m->lastpos[lev] = sp; dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec); if (dp == NULL) return(backref(m, sp, stop, ss+1, stopst, lev-1, rec)); else return(dp); case OCH_: /* find the right one, if any */ ssub = ss + 1; esub = ss + OPND(s) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ dp = backref(m, sp, stop, ssub, esub, lev, rec); if (dp != NULL) return(dp); /* that one missed, try next one */ if (OP(m->g->strip[esub]) == (sop)O_CH) return(NULL); /* there is none */ esub++; assert(OP(m->g->strip[esub]) == (sop)OOR2); ssub = esub + 1; esub += OPND(m->g->strip[esub]); if (OP(m->g->strip[esub]) == (sop)OOR2) esub--; else assert(OP(m->g->strip[esub]) == O_CH); } /* NOTREACHED */ break; case OLPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_so; m->pmatch[i].rm_so = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_so = offsave; return(NULL); case ORPAREN: /* must undo assignment if rest fails */ i = OPND(s); assert(0 < i && i <= m->g->nsub); offsave = m->pmatch[i].rm_eo; m->pmatch[i].rm_eo = sp - m->offp; dp = backref(m, sp, stop, ss+1, stopst, lev, rec); if (dp != NULL) return(dp); m->pmatch[i].rm_eo = offsave; return(NULL); default: /* uh oh */ assert(nope); break; } /* "can't happen" */ assert(nope); /* NOTREACHED */ return "shut up gcc"; } /* - walk - step through the string either quickly or slowly == static const char *walk(struct match *m, const char *start, \ == const char *stop, sopno startst, sopno stopst, bool fast); */ static const char * /* where it ended, or NULL */ walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast) { states st = m->st; states fresh = m->fresh; states empty = m->empty; states tmp = m->tmp; const char *p = start; wint_t c; wint_t lastc; /* previous c */ wint_t flagch; int i; const char *matchp; /* last p at which a match ended */ size_t clen; AT("slow", start, stop, startst, stopst); CLEAR(st); SET1(st, startst); SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); if (fast) ASSIGN(fresh, st); matchp = NULL; if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) c = OUT; else { /* * XXX Wrong if the previous character was multi-byte. * Newline never is (in encodings supported by FreeBSD), * so this only breaks the ISWORD tests below. */ c = (uch)*(start - 1); } for (;;) { /* next character */ lastc = c; if (p == m->endp) { c = OUT; clen = 0; } else clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); if (fast && EQ(st, fresh)) matchp = p; /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; i = 0; if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || (lastc == OUT && !(m->eflags®_NOTBOL)) ) { flagch = BOL; i = m->g->nbol; } if ( (c == '\n' && m->g->cflags®_NEWLINE) || (c == OUT && !(m->eflags®_NOTEOL)) ) { flagch = (flagch == BOL) ? BOLEOL : EOL; i += m->g->neol; } if (i != 0) { for (; i > 0; i--) st = step(m->g, startst, stopst, st, flagch, st); SP("sboleol", st, c); } /* how about a word boundary? */ if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && (c != OUT && ISWORD(c)) ) { flagch = BOW; } if ( (lastc != OUT && ISWORD(lastc)) && (flagch == EOL || (c != OUT && !ISWORD(c))) ) { flagch = EOW; } if (flagch == BOW || flagch == EOW) { st = step(m->g, startst, stopst, st, flagch, st); SP("sboweow", st, c); } /* are we done? */ if (ISSET(st, stopst)) { if (fast) break; else matchp = p; } if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p)) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); if (fast) ASSIGN(st, fresh); else ASSIGN(st, empty); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); p += clen; } if (fast) { assert(matchp != NULL); m->coldp = matchp; if (ISSET(st, stopst)) return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); else return (NULL); } else return (matchp); } /* - step - map set of states reachable before char to set reachable after == static states step(struct re_guts *g, sopno start, sopno stop, \ == states bef, int ch, states aft); == #define BOL (OUT-1) == #define EOL (BOL-1) == #define BOLEOL (BOL-2) == #define NOTHING (BOL-3) == #define BOW (BOL-4) == #define EOW (BOL-5) == #define BADCHAR (BOL-6) == #define NONCHAR(c) ((c) <= OUT) */ static states step(struct re_guts *g, sopno start, /* start state within strip */ sopno stop, /* state after stop state within strip */ states bef, /* states reachable before */ wint_t ch, /* character or NONCHAR code */ states aft) /* states already known reachable after */ { cset *cs; sop s; sopno pc; onestate here; /* note, macros know this name */ sopno look; int i; for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { s = g->strip[pc]; switch (OP(s)) { case OEND: assert(pc == stop-1); break; case OCHAR: /* only characters can match */ assert(!NONCHAR(ch) || ch != OPND(s)); if (ch == OPND(s)) FWD(aft, bef, 1); break; case OBOL: if (ch == BOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OEOL: if (ch == EOL || ch == BOLEOL) FWD(aft, bef, 1); break; case OBOW: if (ch == BOW) FWD(aft, bef, 1); break; case OEOW: if (ch == EOW) FWD(aft, bef, 1); break; case OANY: if (!NONCHAR(ch)) FWD(aft, bef, 1); break; case OANYOF: cs = &g->sets[OPND(s)]; if (!NONCHAR(ch) && CHIN(cs, ch)) FWD(aft, bef, 1); break; case OBACK_: /* ignored here */ case O_BACK: FWD(aft, aft, 1); break; case OPLUS_: /* forward, this is just an empty */ FWD(aft, aft, 1); break; case O_PLUS: /* both forward and back */ FWD(aft, aft, 1); i = ISSETBACK(aft, OPND(s)); BACK(aft, aft, OPND(s)); if (!i && ISSETBACK(aft, OPND(s))) { /* oho, must reconsider loop body */ pc -= OPND(s) + 1; INIT(here, pc); } break; case OQUEST_: /* two branches, both forward */ FWD(aft, aft, 1); FWD(aft, aft, OPND(s)); break; case O_QUEST: /* just an empty */ FWD(aft, aft, 1); break; case OLPAREN: /* not significant here */ case ORPAREN: FWD(aft, aft, 1); break; case OCH_: /* mark the first two branches */ FWD(aft, aft, 1); assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); break; case OOR1: /* done a branch, find the O_CH */ if (ISSTATEIN(aft, here)) { for (look = 1; OP(s = g->strip[pc+look]) != (sop)O_CH; look += OPND(s)) assert(OP(s) == (sop)OOR2); FWD(aft, aft, look + 1); } break; case OOR2: /* propagate OCH_'s marking */ FWD(aft, aft, 1); if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) { assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2); FWD(aft, aft, OPND(s)); } break; case O_CH: /* just empty */ FWD(aft, aft, 1); break; default: /* ooooops... */ assert(nope); break; } } return(aft); } #ifdef REDEBUG /* - print - print a set of states == #ifdef REDEBUG == static void print(struct match *m, const char *caption, states st, \ == int ch, FILE *d); == #endif */ static void print(struct match *m, const char *caption, states st, int ch, FILE *d) { struct re_guts *g = m->g; sopno i; int first = 1; if (!(m->eflags®_TRACE)) return; fprintf(d, "%s", caption); if (ch != '\0') fprintf(d, " %s", pchar(ch)); for (i = 0; i < g->nstates; i++) if (ISSET(st, i)) { fprintf(d, "%s%d", (first) ? "\t" : ", ", i); first = 0; } fprintf(d, "\n"); } /* - at - print current situation == #ifdef REDEBUG == static void at(struct match *m, const char *title, const char *start, \ == const char *stop, sopno startst, sopno stopst); == #endif */ static void at( struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst) { if (!(m->eflags®_TRACE)) return; printf("%s %s-", title, pchar(*start)); printf("%s ", pchar(*stop)); printf("%ld-%ld\n", (long)startst, (long)stopst); } #ifndef PCHARDONE #define PCHARDONE /* never again */ /* - pchar - make a character printable == #ifdef REDEBUG == static const char *pchar(int ch); == #endif * * Is this identical to regchar() over in debug.c? Well, yes. But a * duplicate here avoids having a debugging-capable regexec.o tied to * a matching debug.o, and this is convenient. It all disappears in * the non-debug compilation anyway, so it doesn't matter much. */ static const char * /* -> representation */ pchar(int ch) { static char pbuf[10]; if (isprint((uch)ch) || ch == ' ') sprintf(pbuf, "%c", ch); else sprintf(pbuf, "\\%o", ch); return(pbuf); } #endif #endif +#undef stepback #undef matcher #undef walk #undef dissect #undef backref #undef step #undef print #undef at #undef match Index: head/lib/libc/tests/regex/Makefile =================================================================== --- head/lib/libc/tests/regex/Makefile (revision 340834) +++ head/lib/libc/tests/regex/Makefile (revision 340835) @@ -1,7 +1,10 @@ # $FreeBSD$ PACKAGE= tests +# local test cases +ATF_TESTS_SH+= multibyte + .include "Makefile.inc" .include "${.CURDIR:H}/Makefile.netbsd-tests" .include Index: head/lib/libc/tests/regex/multibyte.sh =================================================================== --- head/lib/libc/tests/regex/multibyte.sh (nonexistent) +++ head/lib/libc/tests/regex/multibyte.sh (revision 340835) @@ -0,0 +1,35 @@ +# $FreeBSD$ + +atf_test_case multibyte +multibyte_head() +{ + atf_set "descr" "Check matching multibyte characters (PR153502)" +} +multibyte_body() +{ + export LC_CTYPE="C.UTF-8" + + printf 'é' | atf_check -o "inline:é" \ + sed -ne '/^.$/p' + printf 'éé' | atf_check -o "inline:éé" \ + sed -ne '/^..$/p' + printf 'aéa' | atf_check -o "inline:aéa" \ + sed -ne '/a.a/p' + printf 'aéa'| atf_check -o "inline:aéa" \ + sed -ne '/a.*a/p' + printf 'aaéaa' | atf_check -o "inline:aaéaa" \ + sed -ne '/aa.aa/p' + printf 'aéaéa' | atf_check -o "inline:aéaéa" \ + sed -ne '/a.a.a/p' + printf 'éa' | atf_check -o "inline:éa" \ + sed -ne '/.a/p' + printf 'aéaa' | atf_check -o "inline:aéaa" \ + sed -ne '/a.aa/p' + printf 'éaé' | atf_check -o "inline:éaé" \ + sed -ne '/.a./p' +} + +atf_init_test_cases() +{ + atf_add_test_case multibyte +} Property changes on: head/lib/libc/tests/regex/multibyte.sh ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:executable ## -0,0 +1 ## +* \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property