| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277 |
- #include <stdlib.h>
- #include <stdio.h>
- #include <string.h>
- #include <setjmp.h>
- #include <limits.h>
- #include "regexp.h"
- #include "utf.h"
- #define emit regemit
- #define next regnext
- #define accept regaccept
- #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
- #define REPINF 255
- #ifndef REG_MAXPROG
- #define REG_MAXPROG (32 << 10)
- #endif
- #ifndef REG_MAXREC
- #define REG_MAXREC 1024
- #endif
- #ifndef REG_MAXSPAN
- #define REG_MAXSPAN 64
- #endif
- #ifndef REG_MAXCLASS
- #define REG_MAXCLASS 128
- #endif
- typedef struct Reclass Reclass;
- typedef struct Renode Renode;
- typedef struct Reinst Reinst;
- typedef struct Rethread Rethread;
- struct Reclass {
- Rune *end;
- Rune spans[REG_MAXSPAN];
- };
- struct Reprog {
- Reinst *start, *end;
- Reclass *cclass;
- int flags;
- int nsub;
- };
- struct cstate {
- Reprog *prog;
- Renode *pstart, *pend;
- const char *source;
- int ncclass;
- int nsub;
- Renode *sub[REG_MAXSUB];
- int lookahead;
- Rune yychar;
- Reclass *yycc;
- int yymin, yymax;
- const char *error;
- jmp_buf kaboom;
- Reclass cclass[REG_MAXCLASS];
- };
- static void die(struct cstate *g, const char *message)
- {
- g->error = message;
- longjmp(g->kaboom, 1);
- }
- static int canon(Rune c)
- {
- Rune u = toupperrune(c);
- if (c >= 128 && u < 128)
- return c;
- return u;
- }
- /* Scan */
- enum {
- L_CHAR = 256,
- L_CCLASS, /* character class */
- L_NCCLASS, /* negative character class */
- L_NC, /* "(?:" no capture */
- L_PLA, /* "(?=" positive lookahead */
- L_NLA, /* "(?!" negative lookahead */
- L_WORD, /* "\b" word boundary */
- L_NWORD, /* "\B" non-word boundary */
- L_REF, /* "\1" back-reference */
- L_COUNT, /* {M,N} */
- };
- static int hex(struct cstate *g, int c)
- {
- if (c >= '0' && c <= '9') return c - '0';
- if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
- if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
- die(g, "invalid escape sequence");
- return 0;
- }
- static int dec(struct cstate *g, int c)
- {
- if (c >= '0' && c <= '9') return c - '0';
- die(g, "invalid quantifier");
- return 0;
- }
- #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
- static int isunicodeletter(int c)
- {
- return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
- }
- static int nextrune(struct cstate *g)
- {
- if (!*g->source) {
- g->yychar = EOF;
- return 0;
- }
- g->source += chartorune(&g->yychar, g->source);
- if (g->yychar == '\\') {
- if (!*g->source)
- die(g, "unterminated escape sequence");
- g->source += chartorune(&g->yychar, g->source);
- switch (g->yychar) {
- case 'f': g->yychar = '\f'; return 0;
- case 'n': g->yychar = '\n'; return 0;
- case 'r': g->yychar = '\r'; return 0;
- case 't': g->yychar = '\t'; return 0;
- case 'v': g->yychar = '\v'; return 0;
- case 'c':
- if (!g->source[0])
- die(g, "unterminated escape sequence");
- g->yychar = (*g->source++) & 31;
- return 0;
- case 'x':
- if (!g->source[0] || !g->source[1])
- die(g, "unterminated escape sequence");
- g->yychar = hex(g, *g->source++) << 4;
- g->yychar += hex(g, *g->source++);
- if (g->yychar == 0) {
- g->yychar = '0';
- return 1;
- }
- return 0;
- case 'u':
- if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
- die(g, "unterminated escape sequence");
- g->yychar = hex(g, *g->source++) << 12;
- g->yychar += hex(g, *g->source++) << 8;
- g->yychar += hex(g, *g->source++) << 4;
- g->yychar += hex(g, *g->source++);
- if (g->yychar == 0) {
- g->yychar = '0';
- return 1;
- }
- return 0;
- case 0:
- g->yychar = '0';
- return 1;
- }
- if (strchr(ESCAPES, g->yychar))
- return 1;
- if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
- die(g, "invalid escape character");
- return 0;
- }
- return 0;
- }
- static int lexcount(struct cstate *g)
- {
- g->yychar = *g->source++;
- g->yymin = dec(g, g->yychar);
- g->yychar = *g->source++;
- while (g->yychar != ',' && g->yychar != '}') {
- g->yymin = g->yymin * 10 + dec(g, g->yychar);
- g->yychar = *g->source++;
- if (g->yymin >= REPINF)
- die(g, "numeric overflow");
- }
- if (g->yychar == ',') {
- g->yychar = *g->source++;
- if (g->yychar == '}') {
- g->yymax = REPINF;
- } else {
- g->yymax = dec(g, g->yychar);
- g->yychar = *g->source++;
- while (g->yychar != '}') {
- g->yymax = g->yymax * 10 + dec(g, g->yychar);
- g->yychar = *g->source++;
- if (g->yymax >= REPINF)
- die(g, "numeric overflow");
- }
- }
- } else {
- g->yymax = g->yymin;
- }
- return L_COUNT;
- }
- static void newcclass(struct cstate *g)
- {
- if (g->ncclass >= REG_MAXCLASS)
- die(g, "too many character classes");
- g->yycc = g->cclass + g->ncclass++;
- g->yycc->end = g->yycc->spans;
- }
- static void addrange(struct cstate *g, Rune a, Rune b)
- {
- Reclass *cc = g->yycc;
- Rune *p;
- if (a > b)
- die(g, "invalid character class range");
- /* extend existing ranges if they overlap */
- for (p = cc->spans; p < cc->end; p += 2) {
- /* completely inside old range */
- if (a >= p[0] && b <= p[1])
- return;
- /* completely swallows old range */
- if (a < p[0] && b >= p[1]) {
- p[0] = a;
- p[1] = b;
- return;
- }
- /* extend at start */
- if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
- p[0] = a;
- return;
- }
- /* extend at end */
- if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
- p[1] = b;
- return;
- }
- }
- if (cc->end + 2 >= cc->spans + nelem(cc->spans))
- die(g, "too many character class ranges");
- *cc->end++ = a;
- *cc->end++ = b;
- }
- static void addranges_d(struct cstate *g)
- {
- addrange(g, '0', '9');
- }
- static void addranges_D(struct cstate *g)
- {
- addrange(g, 0, '0'-1);
- addrange(g, '9'+1, 0xFFFF);
- }
- static void addranges_s(struct cstate *g)
- {
- addrange(g, 0x9, 0xD);
- addrange(g, 0x20, 0x20);
- addrange(g, 0xA0, 0xA0);
- addrange(g, 0x2028, 0x2029);
- addrange(g, 0xFEFF, 0xFEFF);
- }
- static void addranges_S(struct cstate *g)
- {
- addrange(g, 0, 0x9-1);
- addrange(g, 0xD+1, 0x20-1);
- addrange(g, 0x20+1, 0xA0-1);
- addrange(g, 0xA0+1, 0x2028-1);
- addrange(g, 0x2029+1, 0xFEFF-1);
- addrange(g, 0xFEFF+1, 0xFFFF);
- }
- static void addranges_w(struct cstate *g)
- {
- addrange(g, '0', '9');
- addrange(g, 'A', 'Z');
- addrange(g, '_', '_');
- addrange(g, 'a', 'z');
- }
- static void addranges_W(struct cstate *g)
- {
- addrange(g, 0, '0'-1);
- addrange(g, '9'+1, 'A'-1);
- addrange(g, 'Z'+1, '_'-1);
- addrange(g, '_'+1, 'a'-1);
- addrange(g, 'z'+1, 0xFFFF);
- }
- static int lexclass(struct cstate *g)
- {
- int type = L_CCLASS;
- int quoted, havesave, havedash;
- Rune save = 0;
- newcclass(g);
- quoted = nextrune(g);
- if (!quoted && g->yychar == '^') {
- type = L_NCCLASS;
- quoted = nextrune(g);
- }
- havesave = havedash = 0;
- for (;;) {
- if (g->yychar == EOF)
- die(g, "unterminated character class");
- if (!quoted && g->yychar == ']')
- break;
- if (!quoted && g->yychar == '-') {
- if (havesave) {
- if (havedash) {
- addrange(g, save, '-');
- havesave = havedash = 0;
- } else {
- havedash = 1;
- }
- } else {
- save = '-';
- havesave = 1;
- }
- } else if (quoted && strchr("DSWdsw", g->yychar)) {
- if (havesave) {
- addrange(g, save, save);
- if (havedash)
- addrange(g, '-', '-');
- }
- switch (g->yychar) {
- case 'd': addranges_d(g); break;
- case 's': addranges_s(g); break;
- case 'w': addranges_w(g); break;
- case 'D': addranges_D(g); break;
- case 'S': addranges_S(g); break;
- case 'W': addranges_W(g); break;
- }
- havesave = havedash = 0;
- } else {
- if (quoted) {
- if (g->yychar == 'b')
- g->yychar = '\b';
- else if (g->yychar == '0')
- g->yychar = 0;
- /* else identity escape */
- }
- if (havesave) {
- if (havedash) {
- addrange(g, save, g->yychar);
- havesave = havedash = 0;
- } else {
- addrange(g, save, save);
- save = g->yychar;
- }
- } else {
- save = g->yychar;
- havesave = 1;
- }
- }
- quoted = nextrune(g);
- }
- if (havesave) {
- addrange(g, save, save);
- if (havedash)
- addrange(g, '-', '-');
- }
- return type;
- }
- static int lex(struct cstate *g)
- {
- int quoted = nextrune(g);
- if (quoted) {
- switch (g->yychar) {
- case 'b': return L_WORD;
- case 'B': return L_NWORD;
- case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
- case 's': newcclass(g); addranges_s(g); return L_CCLASS;
- case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
- case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
- case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
- case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
- case '0': g->yychar = 0; return L_CHAR;
- }
- if (g->yychar >= '0' && g->yychar <= '9') {
- g->yychar -= '0';
- if (*g->source >= '0' && *g->source <= '9')
- g->yychar = g->yychar * 10 + *g->source++ - '0';
- return L_REF;
- }
- return L_CHAR;
- }
- switch (g->yychar) {
- case EOF:
- case '$': case ')': case '*': case '+':
- case '.': case '?': case '^': case '|':
- return g->yychar;
- }
- if (g->yychar == '{')
- return lexcount(g);
- if (g->yychar == '[')
- return lexclass(g);
- if (g->yychar == '(') {
- if (g->source[0] == '?') {
- if (g->source[1] == ':') {
- g->source += 2;
- return L_NC;
- }
- if (g->source[1] == '=') {
- g->source += 2;
- return L_PLA;
- }
- if (g->source[1] == '!') {
- g->source += 2;
- return L_NLA;
- }
- }
- return '(';
- }
- return L_CHAR;
- }
- /* Parse */
- enum {
- P_CAT, P_ALT, P_REP,
- P_BOL, P_EOL, P_WORD, P_NWORD,
- P_PAR, P_PLA, P_NLA,
- P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
- P_REF,
- };
- struct Renode {
- unsigned char type;
- unsigned char ng, m, n;
- Rune c;
- int cc;
- Renode *x;
- Renode *y;
- };
- static Renode *newnode(struct cstate *g, int type)
- {
- Renode *node = g->pend++;
- node->type = type;
- node->cc = -1;
- node->c = 0;
- node->ng = 0;
- node->m = 0;
- node->n = 0;
- node->x = node->y = NULL;
- return node;
- }
- static int empty(Renode *node)
- {
- if (!node) return 1;
- switch (node->type) {
- default: return 1;
- case P_CAT: return empty(node->x) && empty(node->y);
- case P_ALT: return empty(node->x) || empty(node->y);
- case P_REP: return empty(node->x) || node->m == 0;
- case P_PAR: return empty(node->x);
- case P_REF: return empty(node->x);
- case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
- }
- }
- static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
- {
- Renode *rep = newnode(g, P_REP);
- if (max == REPINF && empty(atom))
- die(g, "infinite loop matching the empty string");
- rep->ng = ng;
- rep->m = min;
- rep->n = max;
- rep->x = atom;
- return rep;
- }
- static void next(struct cstate *g)
- {
- g->lookahead = lex(g);
- }
- static int accept(struct cstate *g, int t)
- {
- if (g->lookahead == t) {
- next(g);
- return 1;
- }
- return 0;
- }
- static Renode *parsealt(struct cstate *g);
- static Renode *parseatom(struct cstate *g)
- {
- Renode *atom;
- if (g->lookahead == L_CHAR) {
- atom = newnode(g, P_CHAR);
- atom->c = g->yychar;
- next(g);
- return atom;
- }
- if (g->lookahead == L_CCLASS) {
- atom = newnode(g, P_CCLASS);
- atom->cc = g->yycc - g->cclass;
- next(g);
- return atom;
- }
- if (g->lookahead == L_NCCLASS) {
- atom = newnode(g, P_NCCLASS);
- atom->cc = g->yycc - g->cclass;
- next(g);
- return atom;
- }
- if (g->lookahead == L_REF) {
- atom = newnode(g, P_REF);
- if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
- die(g, "invalid back-reference");
- atom->n = g->yychar;
- atom->x = g->sub[g->yychar];
- next(g);
- return atom;
- }
- if (accept(g, '.'))
- return newnode(g, P_ANY);
- if (accept(g, '(')) {
- atom = newnode(g, P_PAR);
- if (g->nsub == REG_MAXSUB)
- die(g, "too many captures");
- atom->n = g->nsub++;
- atom->x = parsealt(g);
- g->sub[atom->n] = atom;
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_NC)) {
- atom = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_PLA)) {
- atom = newnode(g, P_PLA);
- atom->x = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- if (accept(g, L_NLA)) {
- atom = newnode(g, P_NLA);
- atom->x = parsealt(g);
- if (!accept(g, ')'))
- die(g, "unmatched '('");
- return atom;
- }
- die(g, "syntax error");
- return NULL;
- }
- static Renode *parserep(struct cstate *g)
- {
- Renode *atom;
- if (accept(g, '^')) return newnode(g, P_BOL);
- if (accept(g, '$')) return newnode(g, P_EOL);
- if (accept(g, L_WORD)) return newnode(g, P_WORD);
- if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
- atom = parseatom(g);
- if (g->lookahead == L_COUNT) {
- int min = g->yymin, max = g->yymax;
- next(g);
- if (max < min)
- die(g, "invalid quantifier");
- return newrep(g, atom, accept(g, '?'), min, max);
- }
- if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
- if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
- if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
- return atom;
- }
- static Renode *parsecat(struct cstate *g)
- {
- Renode *cat, *head, **tail;
- if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
- /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
- head = parserep(g);
- tail = &head;
- while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
- cat = newnode(g, P_CAT);
- cat->x = *tail;
- cat->y = parserep(g);
- *tail = cat;
- tail = &cat->y;
- }
- return head;
- }
- return NULL;
- }
- static Renode *parsealt(struct cstate *g)
- {
- Renode *alt, *x;
- alt = parsecat(g);
- while (accept(g, '|')) {
- x = alt;
- alt = newnode(g, P_ALT);
- alt->x = x;
- alt->y = parsecat(g);
- }
- return alt;
- }
- /* Compile */
- enum {
- I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
- I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
- I_BOL, I_EOL, I_WORD, I_NWORD,
- I_LPAR, I_RPAR
- };
- struct Reinst {
- unsigned char opcode;
- unsigned char n;
- Rune c;
- Reclass *cc;
- Reinst *x;
- Reinst *y;
- };
- static int count(struct cstate *g, Renode *node, int depth)
- {
- int min, max, n;
- if (!node) return 0;
- if (++depth > REG_MAXREC) die(g, "stack overflow");
- switch (node->type) {
- default: return 1;
- case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
- case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
- case P_REP:
- min = node->m;
- max = node->n;
- if (min == max) n = count(g, node->x, depth) * min;
- else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
- else n = count(g, node->x, depth) * (min + 1) + 2;
- if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
- return n;
- case P_PAR: return count(g, node->x, depth) + 2;
- case P_PLA: return count(g, node->x, depth) + 2;
- case P_NLA: return count(g, node->x, depth) + 2;
- }
- }
- static Reinst *emit(Reprog *prog, int opcode)
- {
- Reinst *inst = prog->end++;
- inst->opcode = opcode;
- inst->n = 0;
- inst->c = 0;
- inst->cc = NULL;
- inst->x = inst->y = NULL;
- return inst;
- }
- static void compile(Reprog *prog, Renode *node)
- {
- Reinst *inst, *split, *jump;
- int i;
- loop:
- if (!node)
- return;
- switch (node->type) {
- case P_CAT:
- compile(prog, node->x);
- node = node->y;
- goto loop;
- case P_ALT:
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- jump = emit(prog, I_JUMP);
- compile(prog, node->y);
- split->x = split + 1;
- split->y = jump + 1;
- jump->x = prog->end;
- break;
- case P_REP:
- inst = NULL; /* silence compiler warning. assert(node->m > 0). */
- for (i = 0; i < node->m; ++i) {
- inst = prog->end;
- compile(prog, node->x);
- }
- if (node->m == node->n)
- break;
- if (node->n < REPINF) {
- for (i = node->m; i < node->n; ++i) {
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- if (node->ng) {
- split->y = split + 1;
- split->x = prog->end;
- } else {
- split->x = split + 1;
- split->y = prog->end;
- }
- }
- } else if (node->m == 0) {
- split = emit(prog, I_SPLIT);
- compile(prog, node->x);
- jump = emit(prog, I_JUMP);
- if (node->ng) {
- split->y = split + 1;
- split->x = prog->end;
- } else {
- split->x = split + 1;
- split->y = prog->end;
- }
- jump->x = split;
- } else {
- split = emit(prog, I_SPLIT);
- if (node->ng) {
- split->y = inst;
- split->x = prog->end;
- } else {
- split->x = inst;
- split->y = prog->end;
- }
- }
- break;
- case P_BOL: emit(prog, I_BOL); break;
- case P_EOL: emit(prog, I_EOL); break;
- case P_WORD: emit(prog, I_WORD); break;
- case P_NWORD: emit(prog, I_NWORD); break;
- case P_PAR:
- inst = emit(prog, I_LPAR);
- inst->n = node->n;
- compile(prog, node->x);
- inst = emit(prog, I_RPAR);
- inst->n = node->n;
- break;
- case P_PLA:
- split = emit(prog, I_PLA);
- compile(prog, node->x);
- emit(prog, I_END);
- split->x = split + 1;
- split->y = prog->end;
- break;
- case P_NLA:
- split = emit(prog, I_NLA);
- compile(prog, node->x);
- emit(prog, I_END);
- split->x = split + 1;
- split->y = prog->end;
- break;
- case P_ANY:
- emit(prog, I_ANY);
- break;
- case P_CHAR:
- inst = emit(prog, I_CHAR);
- inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
- break;
- case P_CCLASS:
- inst = emit(prog, I_CCLASS);
- inst->cc = prog->cclass + node->cc;
- break;
- case P_NCCLASS:
- inst = emit(prog, I_NCCLASS);
- inst->cc = prog->cclass + node->cc;
- break;
- case P_REF:
- inst = emit(prog, I_REF);
- inst->n = node->n;
- break;
- }
- }
- #ifdef TEST
- static void dumpnode(struct cstate *g, Renode *node)
- {
- Rune *p;
- Reclass *cc;
- if (!node) { printf("Empty"); return; }
- switch (node->type) {
- case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
- case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
- case P_REP:
- printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
- dumpnode(g, node->x);
- printf(")");
- break;
- case P_BOL: printf("Bol"); break;
- case P_EOL: printf("Eol"); break;
- case P_WORD: printf("Word"); break;
- case P_NWORD: printf("NotWord"); break;
- case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
- case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
- case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
- case P_ANY: printf("Any"); break;
- case P_CHAR: printf("Char(%c)", node->c); break;
- case P_CCLASS:
- printf("Class(");
- cc = g->cclass + node->cc;
- for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
- printf(")");
- break;
- case P_NCCLASS:
- printf("NotClass(");
- cc = g->cclass + node->cc;
- for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
- printf(")");
- break;
- case P_REF: printf("Ref(%d)", node->n); break;
- }
- }
- static void dumpcclass(Reclass *cc) {
- Rune *p;
- for (p = cc->spans; p < cc->end; p += 2) {
- if (p[0] > 32 && p[0] < 127)
- printf(" %c", p[0]);
- else
- printf(" \\x%02x", p[0]);
- if (p[1] > 32 && p[1] < 127)
- printf("-%c", p[1]);
- else
- printf("-\\x%02x", p[1]);
- }
- putchar('\n');
- }
- static void dumpprog(Reprog *prog)
- {
- Reinst *inst;
- int i;
- for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
- printf("% 5d: ", i);
- switch (inst->opcode) {
- case I_END: puts("end"); break;
- case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
- case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
- case I_ANY: puts("any"); break;
- case I_ANYNL: puts("anynl"); break;
- case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
- case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
- case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
- case I_REF: printf("ref %d\n", inst->n); break;
- case I_BOL: puts("bol"); break;
- case I_EOL: puts("eol"); break;
- case I_WORD: puts("word"); break;
- case I_NWORD: puts("nword"); break;
- case I_LPAR: printf("lpar %d\n", inst->n); break;
- case I_RPAR: printf("rpar %d\n", inst->n); break;
- }
- }
- }
- #endif
- Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
- const char *pattern, int cflags, const char **errorp)
- {
- struct cstate g;
- Renode *node;
- Reinst *split, *jump;
- int i, n;
- g.pstart = NULL;
- g.prog = NULL;
- if (setjmp(g.kaboom)) {
- if (errorp) *errorp = g.error;
- alloc(ctx, g.pstart, 0);
- if (g.prog) {
- alloc(ctx, g.prog->cclass, 0);
- alloc(ctx, g.prog->start, 0);
- alloc(ctx, g.prog, 0);
- }
- return NULL;
- }
- g.prog = alloc(ctx, NULL, sizeof (Reprog));
- if (!g.prog)
- die(&g, "cannot allocate regular expression");
- g.prog->start = NULL;
- g.prog->cclass = NULL;
- n = strlen(pattern) * 2;
- if (n > REG_MAXPROG)
- die(&g, "program too large");
- if (n > 0) {
- g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
- if (!g.pstart)
- die(&g, "cannot allocate regular expression parse list");
- }
- g.source = pattern;
- g.ncclass = 0;
- g.nsub = 1;
- for (i = 0; i < REG_MAXSUB; ++i)
- g.sub[i] = 0;
- g.prog->flags = cflags;
- next(&g);
- node = parsealt(&g);
- if (g.lookahead == ')')
- die(&g, "unmatched ')'");
- if (g.lookahead != EOF)
- die(&g, "syntax error");
- #ifdef TEST
- dumpnode(&g, node);
- putchar('\n');
- #endif
- n = 6 + count(&g, node, 0);
- if (n < 0 || n > REG_MAXPROG)
- die(&g, "program too large");
- g.prog->nsub = g.nsub;
- g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
- if (!g.prog->start)
- die(&g, "cannot allocate regular expression instruction list");
- if (g.ncclass > 0) {
- g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
- if (!g.prog->cclass)
- die(&g, "cannot allocate regular expression character class list");
- memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
- for (i = 0; i < g.ncclass; ++i)
- g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
- }
- split = emit(g.prog, I_SPLIT);
- split->x = split + 3;
- split->y = split + 1;
- emit(g.prog, I_ANYNL);
- jump = emit(g.prog, I_JUMP);
- jump->x = split;
- emit(g.prog, I_LPAR);
- compile(g.prog, node);
- emit(g.prog, I_RPAR);
- emit(g.prog, I_END);
- #ifdef TEST
- dumpprog(g.prog);
- #endif
- alloc(ctx, g.pstart, 0);
- if (errorp) *errorp = NULL;
- return g.prog;
- }
- void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
- {
- if (prog) {
- if (prog->cclass)
- alloc(ctx, prog->cclass, 0);
- alloc(ctx, prog->start, 0);
- alloc(ctx, prog, 0);
- }
- }
- static void *default_alloc(void *ctx, void *p, int n)
- {
- if (n == 0) {
- free(p);
- return NULL;
- }
- return realloc(p, (size_t)n);
- }
- Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
- {
- return regcompx(default_alloc, NULL, pattern, cflags, errorp);
- }
- void regfree(Reprog *prog)
- {
- regfreex(default_alloc, NULL, prog);
- }
- /* Match */
- static int isnewline(int c)
- {
- return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
- }
- static int iswordchar(int c)
- {
- return c == '_' ||
- (c >= 'a' && c <= 'z') ||
- (c >= 'A' && c <= 'Z') ||
- (c >= '0' && c <= '9');
- }
- static int incclass(Reclass *cc, Rune c)
- {
- Rune *p;
- for (p = cc->spans; p < cc->end; p += 2)
- if (p[0] <= c && c <= p[1])
- return 1;
- return 0;
- }
- static int incclasscanon(Reclass *cc, Rune c)
- {
- Rune *p, r;
- for (p = cc->spans; p < cc->end; p += 2)
- for (r = p[0]; r <= p[1]; ++r)
- if (c == canon(r))
- return 1;
- return 0;
- }
- static int strncmpcanon(const char *a, const char *b, int n)
- {
- Rune ra, rb;
- int c;
- while (n--) {
- if (!*a) return -1;
- if (!*b) return 1;
- a += chartorune(&ra, a);
- b += chartorune(&rb, b);
- c = canon(ra) - canon(rb);
- if (c)
- return c;
- }
- return 0;
- }
- static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
- {
- Resub scratch;
- int result;
- int i;
- Rune c;
- /* stack overflow */
- if (depth > REG_MAXREC)
- return -1;
- for (;;) {
- switch (pc->opcode) {
- case I_END:
- return 0;
- case I_JUMP:
- pc = pc->x;
- break;
- case I_SPLIT:
- scratch = *out;
- result = match(pc->x, sp, bol, flags, &scratch, depth+1);
- if (result == -1)
- return -1;
- if (result == 0) {
- *out = scratch;
- return 0;
- }
- pc = pc->y;
- break;
- case I_PLA:
- result = match(pc->x, sp, bol, flags, out, depth+1);
- if (result == -1)
- return -1;
- if (result == 1)
- return 1;
- pc = pc->y;
- break;
- case I_NLA:
- scratch = *out;
- result = match(pc->x, sp, bol, flags, &scratch, depth+1);
- if (result == -1)
- return -1;
- if (result == 0)
- return 1;
- pc = pc->y;
- break;
- case I_ANYNL:
- if (!*sp) return 1;
- sp += chartorune(&c, sp);
- pc = pc + 1;
- break;
- case I_ANY:
- if (!*sp) return 1;
- sp += chartorune(&c, sp);
- if (isnewline(c))
- return 1;
- pc = pc + 1;
- break;
- case I_CHAR:
- if (!*sp) return 1;
- sp += chartorune(&c, sp);
- if (flags & REG_ICASE)
- c = canon(c);
- if (c != pc->c)
- return 1;
- pc = pc + 1;
- break;
- case I_CCLASS:
- if (!*sp) return 1;
- sp += chartorune(&c, sp);
- if (flags & REG_ICASE) {
- if (!incclasscanon(pc->cc, canon(c)))
- return 1;
- } else {
- if (!incclass(pc->cc, c))
- return 1;
- }
- pc = pc + 1;
- break;
- case I_NCCLASS:
- if (!*sp) return 1;
- sp += chartorune(&c, sp);
- if (flags & REG_ICASE) {
- if (incclasscanon(pc->cc, canon(c)))
- return 1;
- } else {
- if (incclass(pc->cc, c))
- return 1;
- }
- pc = pc + 1;
- break;
- case I_REF:
- i = out->sub[pc->n].ep - out->sub[pc->n].sp;
- if (flags & REG_ICASE) {
- if (strncmpcanon(sp, out->sub[pc->n].sp, i))
- return 1;
- } else {
- if (strncmp(sp, out->sub[pc->n].sp, i))
- return 1;
- }
- if (i > 0)
- sp += i;
- pc = pc + 1;
- break;
- case I_BOL:
- if (sp == bol && !(flags & REG_NOTBOL)) {
- pc = pc + 1;
- break;
- }
- if (flags & REG_NEWLINE) {
- if (sp > bol && isnewline(sp[-1])) {
- pc = pc + 1;
- break;
- }
- }
- return 1;
- case I_EOL:
- if (*sp == 0) {
- pc = pc + 1;
- break;
- }
- if (flags & REG_NEWLINE) {
- if (isnewline(*sp)) {
- pc = pc + 1;
- break;
- }
- }
- return 1;
- case I_WORD:
- i = sp > bol && iswordchar(sp[-1]);
- i ^= iswordchar(sp[0]);
- if (!i)
- return 1;
- pc = pc + 1;
- break;
- case I_NWORD:
- i = sp > bol && iswordchar(sp[-1]);
- i ^= iswordchar(sp[0]);
- if (i)
- return 1;
- pc = pc + 1;
- break;
- case I_LPAR:
- out->sub[pc->n].sp = sp;
- pc = pc + 1;
- break;
- case I_RPAR:
- out->sub[pc->n].ep = sp;
- pc = pc + 1;
- break;
- default:
- return 1;
- }
- }
- }
- int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
- {
- Resub scratch;
- int i;
- if (!sub)
- sub = &scratch;
- sub->nsub = prog->nsub;
- for (i = 0; i < REG_MAXSUB; ++i)
- sub->sub[i].sp = sub->sub[i].ep = NULL;
- return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
- }
- #ifdef TEST
- int main(int argc, char **argv)
- {
- const char *error;
- const char *s;
- Reprog *p;
- Resub m;
- int i;
- if (argc > 1) {
- p = regcomp(argv[1], 0, &error);
- if (!p) {
- fprintf(stderr, "regcomp: %s\n", error);
- return 1;
- }
- if (argc > 2) {
- s = argv[2];
- printf("nsub = %d\n", p->nsub);
- if (!regexec(p, s, &m, 0)) {
- for (i = 0; i < m.nsub; ++i) {
- int n = m.sub[i].ep - m.sub[i].sp;
- if (n > 0)
- printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
- else
- printf("match %d: n=0 ''\n", i);
- }
- } else {
- printf("no match\n");
- }
- }
- }
- return 0;
- }
- #endif
|