regexp.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277
  1. #include <stdlib.h>
  2. #include <stdio.h>
  3. #include <string.h>
  4. #include <setjmp.h>
  5. #include <limits.h>
  6. #include "regexp.h"
  7. #include "utf.h"
  8. #define emit regemit
  9. #define next regnext
  10. #define accept regaccept
  11. #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
  12. #define REPINF 255
  13. #ifndef REG_MAXPROG
  14. #define REG_MAXPROG (32 << 10)
  15. #endif
  16. #ifndef REG_MAXREC
  17. #define REG_MAXREC 1024
  18. #endif
  19. #ifndef REG_MAXSPAN
  20. #define REG_MAXSPAN 64
  21. #endif
  22. #ifndef REG_MAXCLASS
  23. #define REG_MAXCLASS 128
  24. #endif
  25. typedef struct Reclass Reclass;
  26. typedef struct Renode Renode;
  27. typedef struct Reinst Reinst;
  28. typedef struct Rethread Rethread;
  29. struct Reclass {
  30. Rune *end;
  31. Rune spans[REG_MAXSPAN];
  32. };
  33. struct Reprog {
  34. Reinst *start, *end;
  35. Reclass *cclass;
  36. int flags;
  37. int nsub;
  38. };
  39. struct cstate {
  40. Reprog *prog;
  41. Renode *pstart, *pend;
  42. const char *source;
  43. int ncclass;
  44. int nsub;
  45. Renode *sub[REG_MAXSUB];
  46. int lookahead;
  47. Rune yychar;
  48. Reclass *yycc;
  49. int yymin, yymax;
  50. const char *error;
  51. jmp_buf kaboom;
  52. Reclass cclass[REG_MAXCLASS];
  53. };
  54. static void die(struct cstate *g, const char *message)
  55. {
  56. g->error = message;
  57. longjmp(g->kaboom, 1);
  58. }
  59. static int canon(Rune c)
  60. {
  61. Rune u = toupperrune(c);
  62. if (c >= 128 && u < 128)
  63. return c;
  64. return u;
  65. }
  66. /* Scan */
  67. enum {
  68. L_CHAR = 256,
  69. L_CCLASS, /* character class */
  70. L_NCCLASS, /* negative character class */
  71. L_NC, /* "(?:" no capture */
  72. L_PLA, /* "(?=" positive lookahead */
  73. L_NLA, /* "(?!" negative lookahead */
  74. L_WORD, /* "\b" word boundary */
  75. L_NWORD, /* "\B" non-word boundary */
  76. L_REF, /* "\1" back-reference */
  77. L_COUNT, /* {M,N} */
  78. };
  79. static int hex(struct cstate *g, int c)
  80. {
  81. if (c >= '0' && c <= '9') return c - '0';
  82. if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
  83. if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
  84. die(g, "invalid escape sequence");
  85. return 0;
  86. }
  87. static int dec(struct cstate *g, int c)
  88. {
  89. if (c >= '0' && c <= '9') return c - '0';
  90. die(g, "invalid quantifier");
  91. return 0;
  92. }
  93. #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
  94. static int isunicodeletter(int c)
  95. {
  96. return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
  97. }
  98. static int nextrune(struct cstate *g)
  99. {
  100. if (!*g->source) {
  101. g->yychar = EOF;
  102. return 0;
  103. }
  104. g->source += chartorune(&g->yychar, g->source);
  105. if (g->yychar == '\\') {
  106. if (!*g->source)
  107. die(g, "unterminated escape sequence");
  108. g->source += chartorune(&g->yychar, g->source);
  109. switch (g->yychar) {
  110. case 'f': g->yychar = '\f'; return 0;
  111. case 'n': g->yychar = '\n'; return 0;
  112. case 'r': g->yychar = '\r'; return 0;
  113. case 't': g->yychar = '\t'; return 0;
  114. case 'v': g->yychar = '\v'; return 0;
  115. case 'c':
  116. if (!g->source[0])
  117. die(g, "unterminated escape sequence");
  118. g->yychar = (*g->source++) & 31;
  119. return 0;
  120. case 'x':
  121. if (!g->source[0] || !g->source[1])
  122. die(g, "unterminated escape sequence");
  123. g->yychar = hex(g, *g->source++) << 4;
  124. g->yychar += hex(g, *g->source++);
  125. if (g->yychar == 0) {
  126. g->yychar = '0';
  127. return 1;
  128. }
  129. return 0;
  130. case 'u':
  131. if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
  132. die(g, "unterminated escape sequence");
  133. g->yychar = hex(g, *g->source++) << 12;
  134. g->yychar += hex(g, *g->source++) << 8;
  135. g->yychar += hex(g, *g->source++) << 4;
  136. g->yychar += hex(g, *g->source++);
  137. if (g->yychar == 0) {
  138. g->yychar = '0';
  139. return 1;
  140. }
  141. return 0;
  142. case 0:
  143. g->yychar = '0';
  144. return 1;
  145. }
  146. if (strchr(ESCAPES, g->yychar))
  147. return 1;
  148. if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
  149. die(g, "invalid escape character");
  150. return 0;
  151. }
  152. return 0;
  153. }
  154. static int lexcount(struct cstate *g)
  155. {
  156. g->yychar = *g->source++;
  157. g->yymin = dec(g, g->yychar);
  158. g->yychar = *g->source++;
  159. while (g->yychar != ',' && g->yychar != '}') {
  160. g->yymin = g->yymin * 10 + dec(g, g->yychar);
  161. g->yychar = *g->source++;
  162. if (g->yymin >= REPINF)
  163. die(g, "numeric overflow");
  164. }
  165. if (g->yychar == ',') {
  166. g->yychar = *g->source++;
  167. if (g->yychar == '}') {
  168. g->yymax = REPINF;
  169. } else {
  170. g->yymax = dec(g, g->yychar);
  171. g->yychar = *g->source++;
  172. while (g->yychar != '}') {
  173. g->yymax = g->yymax * 10 + dec(g, g->yychar);
  174. g->yychar = *g->source++;
  175. if (g->yymax >= REPINF)
  176. die(g, "numeric overflow");
  177. }
  178. }
  179. } else {
  180. g->yymax = g->yymin;
  181. }
  182. return L_COUNT;
  183. }
  184. static void newcclass(struct cstate *g)
  185. {
  186. if (g->ncclass >= REG_MAXCLASS)
  187. die(g, "too many character classes");
  188. g->yycc = g->cclass + g->ncclass++;
  189. g->yycc->end = g->yycc->spans;
  190. }
  191. static void addrange(struct cstate *g, Rune a, Rune b)
  192. {
  193. Reclass *cc = g->yycc;
  194. Rune *p;
  195. if (a > b)
  196. die(g, "invalid character class range");
  197. /* extend existing ranges if they overlap */
  198. for (p = cc->spans; p < cc->end; p += 2) {
  199. /* completely inside old range */
  200. if (a >= p[0] && b <= p[1])
  201. return;
  202. /* completely swallows old range */
  203. if (a < p[0] && b >= p[1]) {
  204. p[0] = a;
  205. p[1] = b;
  206. return;
  207. }
  208. /* extend at start */
  209. if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
  210. p[0] = a;
  211. return;
  212. }
  213. /* extend at end */
  214. if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
  215. p[1] = b;
  216. return;
  217. }
  218. }
  219. if (cc->end + 2 >= cc->spans + nelem(cc->spans))
  220. die(g, "too many character class ranges");
  221. *cc->end++ = a;
  222. *cc->end++ = b;
  223. }
  224. static void addranges_d(struct cstate *g)
  225. {
  226. addrange(g, '0', '9');
  227. }
  228. static void addranges_D(struct cstate *g)
  229. {
  230. addrange(g, 0, '0'-1);
  231. addrange(g, '9'+1, 0xFFFF);
  232. }
  233. static void addranges_s(struct cstate *g)
  234. {
  235. addrange(g, 0x9, 0xD);
  236. addrange(g, 0x20, 0x20);
  237. addrange(g, 0xA0, 0xA0);
  238. addrange(g, 0x2028, 0x2029);
  239. addrange(g, 0xFEFF, 0xFEFF);
  240. }
  241. static void addranges_S(struct cstate *g)
  242. {
  243. addrange(g, 0, 0x9-1);
  244. addrange(g, 0xD+1, 0x20-1);
  245. addrange(g, 0x20+1, 0xA0-1);
  246. addrange(g, 0xA0+1, 0x2028-1);
  247. addrange(g, 0x2029+1, 0xFEFF-1);
  248. addrange(g, 0xFEFF+1, 0xFFFF);
  249. }
  250. static void addranges_w(struct cstate *g)
  251. {
  252. addrange(g, '0', '9');
  253. addrange(g, 'A', 'Z');
  254. addrange(g, '_', '_');
  255. addrange(g, 'a', 'z');
  256. }
  257. static void addranges_W(struct cstate *g)
  258. {
  259. addrange(g, 0, '0'-1);
  260. addrange(g, '9'+1, 'A'-1);
  261. addrange(g, 'Z'+1, '_'-1);
  262. addrange(g, '_'+1, 'a'-1);
  263. addrange(g, 'z'+1, 0xFFFF);
  264. }
  265. static int lexclass(struct cstate *g)
  266. {
  267. int type = L_CCLASS;
  268. int quoted, havesave, havedash;
  269. Rune save = 0;
  270. newcclass(g);
  271. quoted = nextrune(g);
  272. if (!quoted && g->yychar == '^') {
  273. type = L_NCCLASS;
  274. quoted = nextrune(g);
  275. }
  276. havesave = havedash = 0;
  277. for (;;) {
  278. if (g->yychar == EOF)
  279. die(g, "unterminated character class");
  280. if (!quoted && g->yychar == ']')
  281. break;
  282. if (!quoted && g->yychar == '-') {
  283. if (havesave) {
  284. if (havedash) {
  285. addrange(g, save, '-');
  286. havesave = havedash = 0;
  287. } else {
  288. havedash = 1;
  289. }
  290. } else {
  291. save = '-';
  292. havesave = 1;
  293. }
  294. } else if (quoted && strchr("DSWdsw", g->yychar)) {
  295. if (havesave) {
  296. addrange(g, save, save);
  297. if (havedash)
  298. addrange(g, '-', '-');
  299. }
  300. switch (g->yychar) {
  301. case 'd': addranges_d(g); break;
  302. case 's': addranges_s(g); break;
  303. case 'w': addranges_w(g); break;
  304. case 'D': addranges_D(g); break;
  305. case 'S': addranges_S(g); break;
  306. case 'W': addranges_W(g); break;
  307. }
  308. havesave = havedash = 0;
  309. } else {
  310. if (quoted) {
  311. if (g->yychar == 'b')
  312. g->yychar = '\b';
  313. else if (g->yychar == '0')
  314. g->yychar = 0;
  315. /* else identity escape */
  316. }
  317. if (havesave) {
  318. if (havedash) {
  319. addrange(g, save, g->yychar);
  320. havesave = havedash = 0;
  321. } else {
  322. addrange(g, save, save);
  323. save = g->yychar;
  324. }
  325. } else {
  326. save = g->yychar;
  327. havesave = 1;
  328. }
  329. }
  330. quoted = nextrune(g);
  331. }
  332. if (havesave) {
  333. addrange(g, save, save);
  334. if (havedash)
  335. addrange(g, '-', '-');
  336. }
  337. return type;
  338. }
  339. static int lex(struct cstate *g)
  340. {
  341. int quoted = nextrune(g);
  342. if (quoted) {
  343. switch (g->yychar) {
  344. case 'b': return L_WORD;
  345. case 'B': return L_NWORD;
  346. case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
  347. case 's': newcclass(g); addranges_s(g); return L_CCLASS;
  348. case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
  349. case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
  350. case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
  351. case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
  352. case '0': g->yychar = 0; return L_CHAR;
  353. }
  354. if (g->yychar >= '0' && g->yychar <= '9') {
  355. g->yychar -= '0';
  356. if (*g->source >= '0' && *g->source <= '9')
  357. g->yychar = g->yychar * 10 + *g->source++ - '0';
  358. return L_REF;
  359. }
  360. return L_CHAR;
  361. }
  362. switch (g->yychar) {
  363. case EOF:
  364. case '$': case ')': case '*': case '+':
  365. case '.': case '?': case '^': case '|':
  366. return g->yychar;
  367. }
  368. if (g->yychar == '{')
  369. return lexcount(g);
  370. if (g->yychar == '[')
  371. return lexclass(g);
  372. if (g->yychar == '(') {
  373. if (g->source[0] == '?') {
  374. if (g->source[1] == ':') {
  375. g->source += 2;
  376. return L_NC;
  377. }
  378. if (g->source[1] == '=') {
  379. g->source += 2;
  380. return L_PLA;
  381. }
  382. if (g->source[1] == '!') {
  383. g->source += 2;
  384. return L_NLA;
  385. }
  386. }
  387. return '(';
  388. }
  389. return L_CHAR;
  390. }
  391. /* Parse */
  392. enum {
  393. P_CAT, P_ALT, P_REP,
  394. P_BOL, P_EOL, P_WORD, P_NWORD,
  395. P_PAR, P_PLA, P_NLA,
  396. P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
  397. P_REF,
  398. };
  399. struct Renode {
  400. unsigned char type;
  401. unsigned char ng, m, n;
  402. Rune c;
  403. int cc;
  404. Renode *x;
  405. Renode *y;
  406. };
  407. static Renode *newnode(struct cstate *g, int type)
  408. {
  409. Renode *node = g->pend++;
  410. node->type = type;
  411. node->cc = -1;
  412. node->c = 0;
  413. node->ng = 0;
  414. node->m = 0;
  415. node->n = 0;
  416. node->x = node->y = NULL;
  417. return node;
  418. }
  419. static int empty(Renode *node)
  420. {
  421. if (!node) return 1;
  422. switch (node->type) {
  423. default: return 1;
  424. case P_CAT: return empty(node->x) && empty(node->y);
  425. case P_ALT: return empty(node->x) || empty(node->y);
  426. case P_REP: return empty(node->x) || node->m == 0;
  427. case P_PAR: return empty(node->x);
  428. case P_REF: return empty(node->x);
  429. case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
  430. }
  431. }
  432. static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
  433. {
  434. Renode *rep = newnode(g, P_REP);
  435. if (max == REPINF && empty(atom))
  436. die(g, "infinite loop matching the empty string");
  437. rep->ng = ng;
  438. rep->m = min;
  439. rep->n = max;
  440. rep->x = atom;
  441. return rep;
  442. }
  443. static void next(struct cstate *g)
  444. {
  445. g->lookahead = lex(g);
  446. }
  447. static int accept(struct cstate *g, int t)
  448. {
  449. if (g->lookahead == t) {
  450. next(g);
  451. return 1;
  452. }
  453. return 0;
  454. }
  455. static Renode *parsealt(struct cstate *g);
  456. static Renode *parseatom(struct cstate *g)
  457. {
  458. Renode *atom;
  459. if (g->lookahead == L_CHAR) {
  460. atom = newnode(g, P_CHAR);
  461. atom->c = g->yychar;
  462. next(g);
  463. return atom;
  464. }
  465. if (g->lookahead == L_CCLASS) {
  466. atom = newnode(g, P_CCLASS);
  467. atom->cc = g->yycc - g->cclass;
  468. next(g);
  469. return atom;
  470. }
  471. if (g->lookahead == L_NCCLASS) {
  472. atom = newnode(g, P_NCCLASS);
  473. atom->cc = g->yycc - g->cclass;
  474. next(g);
  475. return atom;
  476. }
  477. if (g->lookahead == L_REF) {
  478. atom = newnode(g, P_REF);
  479. if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
  480. die(g, "invalid back-reference");
  481. atom->n = g->yychar;
  482. atom->x = g->sub[g->yychar];
  483. next(g);
  484. return atom;
  485. }
  486. if (accept(g, '.'))
  487. return newnode(g, P_ANY);
  488. if (accept(g, '(')) {
  489. atom = newnode(g, P_PAR);
  490. if (g->nsub == REG_MAXSUB)
  491. die(g, "too many captures");
  492. atom->n = g->nsub++;
  493. atom->x = parsealt(g);
  494. g->sub[atom->n] = atom;
  495. if (!accept(g, ')'))
  496. die(g, "unmatched '('");
  497. return atom;
  498. }
  499. if (accept(g, L_NC)) {
  500. atom = parsealt(g);
  501. if (!accept(g, ')'))
  502. die(g, "unmatched '('");
  503. return atom;
  504. }
  505. if (accept(g, L_PLA)) {
  506. atom = newnode(g, P_PLA);
  507. atom->x = parsealt(g);
  508. if (!accept(g, ')'))
  509. die(g, "unmatched '('");
  510. return atom;
  511. }
  512. if (accept(g, L_NLA)) {
  513. atom = newnode(g, P_NLA);
  514. atom->x = parsealt(g);
  515. if (!accept(g, ')'))
  516. die(g, "unmatched '('");
  517. return atom;
  518. }
  519. die(g, "syntax error");
  520. return NULL;
  521. }
  522. static Renode *parserep(struct cstate *g)
  523. {
  524. Renode *atom;
  525. if (accept(g, '^')) return newnode(g, P_BOL);
  526. if (accept(g, '$')) return newnode(g, P_EOL);
  527. if (accept(g, L_WORD)) return newnode(g, P_WORD);
  528. if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
  529. atom = parseatom(g);
  530. if (g->lookahead == L_COUNT) {
  531. int min = g->yymin, max = g->yymax;
  532. next(g);
  533. if (max < min)
  534. die(g, "invalid quantifier");
  535. return newrep(g, atom, accept(g, '?'), min, max);
  536. }
  537. if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
  538. if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
  539. if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
  540. return atom;
  541. }
  542. static Renode *parsecat(struct cstate *g)
  543. {
  544. Renode *cat, *head, **tail;
  545. if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
  546. /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
  547. head = parserep(g);
  548. tail = &head;
  549. while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
  550. cat = newnode(g, P_CAT);
  551. cat->x = *tail;
  552. cat->y = parserep(g);
  553. *tail = cat;
  554. tail = &cat->y;
  555. }
  556. return head;
  557. }
  558. return NULL;
  559. }
  560. static Renode *parsealt(struct cstate *g)
  561. {
  562. Renode *alt, *x;
  563. alt = parsecat(g);
  564. while (accept(g, '|')) {
  565. x = alt;
  566. alt = newnode(g, P_ALT);
  567. alt->x = x;
  568. alt->y = parsecat(g);
  569. }
  570. return alt;
  571. }
  572. /* Compile */
  573. enum {
  574. I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
  575. I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
  576. I_BOL, I_EOL, I_WORD, I_NWORD,
  577. I_LPAR, I_RPAR
  578. };
  579. struct Reinst {
  580. unsigned char opcode;
  581. unsigned char n;
  582. Rune c;
  583. Reclass *cc;
  584. Reinst *x;
  585. Reinst *y;
  586. };
  587. static int count(struct cstate *g, Renode *node, int depth)
  588. {
  589. int min, max, n;
  590. if (!node) return 0;
  591. if (++depth > REG_MAXREC) die(g, "stack overflow");
  592. switch (node->type) {
  593. default: return 1;
  594. case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
  595. case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
  596. case P_REP:
  597. min = node->m;
  598. max = node->n;
  599. if (min == max) n = count(g, node->x, depth) * min;
  600. else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
  601. else n = count(g, node->x, depth) * (min + 1) + 2;
  602. if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
  603. return n;
  604. case P_PAR: return count(g, node->x, depth) + 2;
  605. case P_PLA: return count(g, node->x, depth) + 2;
  606. case P_NLA: return count(g, node->x, depth) + 2;
  607. }
  608. }
  609. static Reinst *emit(Reprog *prog, int opcode)
  610. {
  611. Reinst *inst = prog->end++;
  612. inst->opcode = opcode;
  613. inst->n = 0;
  614. inst->c = 0;
  615. inst->cc = NULL;
  616. inst->x = inst->y = NULL;
  617. return inst;
  618. }
  619. static void compile(Reprog *prog, Renode *node)
  620. {
  621. Reinst *inst, *split, *jump;
  622. int i;
  623. loop:
  624. if (!node)
  625. return;
  626. switch (node->type) {
  627. case P_CAT:
  628. compile(prog, node->x);
  629. node = node->y;
  630. goto loop;
  631. case P_ALT:
  632. split = emit(prog, I_SPLIT);
  633. compile(prog, node->x);
  634. jump = emit(prog, I_JUMP);
  635. compile(prog, node->y);
  636. split->x = split + 1;
  637. split->y = jump + 1;
  638. jump->x = prog->end;
  639. break;
  640. case P_REP:
  641. inst = NULL; /* silence compiler warning. assert(node->m > 0). */
  642. for (i = 0; i < node->m; ++i) {
  643. inst = prog->end;
  644. compile(prog, node->x);
  645. }
  646. if (node->m == node->n)
  647. break;
  648. if (node->n < REPINF) {
  649. for (i = node->m; i < node->n; ++i) {
  650. split = emit(prog, I_SPLIT);
  651. compile(prog, node->x);
  652. if (node->ng) {
  653. split->y = split + 1;
  654. split->x = prog->end;
  655. } else {
  656. split->x = split + 1;
  657. split->y = prog->end;
  658. }
  659. }
  660. } else if (node->m == 0) {
  661. split = emit(prog, I_SPLIT);
  662. compile(prog, node->x);
  663. jump = emit(prog, I_JUMP);
  664. if (node->ng) {
  665. split->y = split + 1;
  666. split->x = prog->end;
  667. } else {
  668. split->x = split + 1;
  669. split->y = prog->end;
  670. }
  671. jump->x = split;
  672. } else {
  673. split = emit(prog, I_SPLIT);
  674. if (node->ng) {
  675. split->y = inst;
  676. split->x = prog->end;
  677. } else {
  678. split->x = inst;
  679. split->y = prog->end;
  680. }
  681. }
  682. break;
  683. case P_BOL: emit(prog, I_BOL); break;
  684. case P_EOL: emit(prog, I_EOL); break;
  685. case P_WORD: emit(prog, I_WORD); break;
  686. case P_NWORD: emit(prog, I_NWORD); break;
  687. case P_PAR:
  688. inst = emit(prog, I_LPAR);
  689. inst->n = node->n;
  690. compile(prog, node->x);
  691. inst = emit(prog, I_RPAR);
  692. inst->n = node->n;
  693. break;
  694. case P_PLA:
  695. split = emit(prog, I_PLA);
  696. compile(prog, node->x);
  697. emit(prog, I_END);
  698. split->x = split + 1;
  699. split->y = prog->end;
  700. break;
  701. case P_NLA:
  702. split = emit(prog, I_NLA);
  703. compile(prog, node->x);
  704. emit(prog, I_END);
  705. split->x = split + 1;
  706. split->y = prog->end;
  707. break;
  708. case P_ANY:
  709. emit(prog, I_ANY);
  710. break;
  711. case P_CHAR:
  712. inst = emit(prog, I_CHAR);
  713. inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
  714. break;
  715. case P_CCLASS:
  716. inst = emit(prog, I_CCLASS);
  717. inst->cc = prog->cclass + node->cc;
  718. break;
  719. case P_NCCLASS:
  720. inst = emit(prog, I_NCCLASS);
  721. inst->cc = prog->cclass + node->cc;
  722. break;
  723. case P_REF:
  724. inst = emit(prog, I_REF);
  725. inst->n = node->n;
  726. break;
  727. }
  728. }
  729. #ifdef TEST
  730. static void dumpnode(struct cstate *g, Renode *node)
  731. {
  732. Rune *p;
  733. Reclass *cc;
  734. if (!node) { printf("Empty"); return; }
  735. switch (node->type) {
  736. case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
  737. case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
  738. case P_REP:
  739. printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
  740. dumpnode(g, node->x);
  741. printf(")");
  742. break;
  743. case P_BOL: printf("Bol"); break;
  744. case P_EOL: printf("Eol"); break;
  745. case P_WORD: printf("Word"); break;
  746. case P_NWORD: printf("NotWord"); break;
  747. case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
  748. case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
  749. case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
  750. case P_ANY: printf("Any"); break;
  751. case P_CHAR: printf("Char(%c)", node->c); break;
  752. case P_CCLASS:
  753. printf("Class(");
  754. cc = g->cclass + node->cc;
  755. for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
  756. printf(")");
  757. break;
  758. case P_NCCLASS:
  759. printf("NotClass(");
  760. cc = g->cclass + node->cc;
  761. for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
  762. printf(")");
  763. break;
  764. case P_REF: printf("Ref(%d)", node->n); break;
  765. }
  766. }
  767. static void dumpcclass(Reclass *cc) {
  768. Rune *p;
  769. for (p = cc->spans; p < cc->end; p += 2) {
  770. if (p[0] > 32 && p[0] < 127)
  771. printf(" %c", p[0]);
  772. else
  773. printf(" \\x%02x", p[0]);
  774. if (p[1] > 32 && p[1] < 127)
  775. printf("-%c", p[1]);
  776. else
  777. printf("-\\x%02x", p[1]);
  778. }
  779. putchar('\n');
  780. }
  781. static void dumpprog(Reprog *prog)
  782. {
  783. Reinst *inst;
  784. int i;
  785. for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
  786. printf("% 5d: ", i);
  787. switch (inst->opcode) {
  788. case I_END: puts("end"); break;
  789. case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
  790. case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
  791. case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
  792. case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
  793. case I_ANY: puts("any"); break;
  794. case I_ANYNL: puts("anynl"); break;
  795. case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
  796. case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
  797. case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
  798. case I_REF: printf("ref %d\n", inst->n); break;
  799. case I_BOL: puts("bol"); break;
  800. case I_EOL: puts("eol"); break;
  801. case I_WORD: puts("word"); break;
  802. case I_NWORD: puts("nword"); break;
  803. case I_LPAR: printf("lpar %d\n", inst->n); break;
  804. case I_RPAR: printf("rpar %d\n", inst->n); break;
  805. }
  806. }
  807. }
  808. #endif
  809. Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
  810. const char *pattern, int cflags, const char **errorp)
  811. {
  812. struct cstate g;
  813. Renode *node;
  814. Reinst *split, *jump;
  815. int i, n;
  816. g.pstart = NULL;
  817. g.prog = NULL;
  818. if (setjmp(g.kaboom)) {
  819. if (errorp) *errorp = g.error;
  820. alloc(ctx, g.pstart, 0);
  821. if (g.prog) {
  822. alloc(ctx, g.prog->cclass, 0);
  823. alloc(ctx, g.prog->start, 0);
  824. alloc(ctx, g.prog, 0);
  825. }
  826. return NULL;
  827. }
  828. g.prog = alloc(ctx, NULL, sizeof (Reprog));
  829. if (!g.prog)
  830. die(&g, "cannot allocate regular expression");
  831. g.prog->start = NULL;
  832. g.prog->cclass = NULL;
  833. n = strlen(pattern) * 2;
  834. if (n > REG_MAXPROG)
  835. die(&g, "program too large");
  836. if (n > 0) {
  837. g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
  838. if (!g.pstart)
  839. die(&g, "cannot allocate regular expression parse list");
  840. }
  841. g.source = pattern;
  842. g.ncclass = 0;
  843. g.nsub = 1;
  844. for (i = 0; i < REG_MAXSUB; ++i)
  845. g.sub[i] = 0;
  846. g.prog->flags = cflags;
  847. next(&g);
  848. node = parsealt(&g);
  849. if (g.lookahead == ')')
  850. die(&g, "unmatched ')'");
  851. if (g.lookahead != EOF)
  852. die(&g, "syntax error");
  853. #ifdef TEST
  854. dumpnode(&g, node);
  855. putchar('\n');
  856. #endif
  857. n = 6 + count(&g, node, 0);
  858. if (n < 0 || n > REG_MAXPROG)
  859. die(&g, "program too large");
  860. g.prog->nsub = g.nsub;
  861. g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
  862. if (!g.prog->start)
  863. die(&g, "cannot allocate regular expression instruction list");
  864. if (g.ncclass > 0) {
  865. g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
  866. if (!g.prog->cclass)
  867. die(&g, "cannot allocate regular expression character class list");
  868. memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
  869. for (i = 0; i < g.ncclass; ++i)
  870. g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
  871. }
  872. split = emit(g.prog, I_SPLIT);
  873. split->x = split + 3;
  874. split->y = split + 1;
  875. emit(g.prog, I_ANYNL);
  876. jump = emit(g.prog, I_JUMP);
  877. jump->x = split;
  878. emit(g.prog, I_LPAR);
  879. compile(g.prog, node);
  880. emit(g.prog, I_RPAR);
  881. emit(g.prog, I_END);
  882. #ifdef TEST
  883. dumpprog(g.prog);
  884. #endif
  885. alloc(ctx, g.pstart, 0);
  886. if (errorp) *errorp = NULL;
  887. return g.prog;
  888. }
  889. void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
  890. {
  891. if (prog) {
  892. if (prog->cclass)
  893. alloc(ctx, prog->cclass, 0);
  894. alloc(ctx, prog->start, 0);
  895. alloc(ctx, prog, 0);
  896. }
  897. }
  898. static void *default_alloc(void *ctx, void *p, int n)
  899. {
  900. if (n == 0) {
  901. free(p);
  902. return NULL;
  903. }
  904. return realloc(p, (size_t)n);
  905. }
  906. Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
  907. {
  908. return regcompx(default_alloc, NULL, pattern, cflags, errorp);
  909. }
  910. void regfree(Reprog *prog)
  911. {
  912. regfreex(default_alloc, NULL, prog);
  913. }
  914. /* Match */
  915. static int isnewline(int c)
  916. {
  917. return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
  918. }
  919. static int iswordchar(int c)
  920. {
  921. return c == '_' ||
  922. (c >= 'a' && c <= 'z') ||
  923. (c >= 'A' && c <= 'Z') ||
  924. (c >= '0' && c <= '9');
  925. }
  926. static int incclass(Reclass *cc, Rune c)
  927. {
  928. Rune *p;
  929. for (p = cc->spans; p < cc->end; p += 2)
  930. if (p[0] <= c && c <= p[1])
  931. return 1;
  932. return 0;
  933. }
  934. static int incclasscanon(Reclass *cc, Rune c)
  935. {
  936. Rune *p, r;
  937. for (p = cc->spans; p < cc->end; p += 2)
  938. for (r = p[0]; r <= p[1]; ++r)
  939. if (c == canon(r))
  940. return 1;
  941. return 0;
  942. }
  943. static int strncmpcanon(const char *a, const char *b, int n)
  944. {
  945. Rune ra, rb;
  946. int c;
  947. while (n--) {
  948. if (!*a) return -1;
  949. if (!*b) return 1;
  950. a += chartorune(&ra, a);
  951. b += chartorune(&rb, b);
  952. c = canon(ra) - canon(rb);
  953. if (c)
  954. return c;
  955. }
  956. return 0;
  957. }
  958. static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
  959. {
  960. Resub scratch;
  961. int result;
  962. int i;
  963. Rune c;
  964. /* stack overflow */
  965. if (depth > REG_MAXREC)
  966. return -1;
  967. for (;;) {
  968. switch (pc->opcode) {
  969. case I_END:
  970. return 0;
  971. case I_JUMP:
  972. pc = pc->x;
  973. break;
  974. case I_SPLIT:
  975. scratch = *out;
  976. result = match(pc->x, sp, bol, flags, &scratch, depth+1);
  977. if (result == -1)
  978. return -1;
  979. if (result == 0) {
  980. *out = scratch;
  981. return 0;
  982. }
  983. pc = pc->y;
  984. break;
  985. case I_PLA:
  986. result = match(pc->x, sp, bol, flags, out, depth+1);
  987. if (result == -1)
  988. return -1;
  989. if (result == 1)
  990. return 1;
  991. pc = pc->y;
  992. break;
  993. case I_NLA:
  994. scratch = *out;
  995. result = match(pc->x, sp, bol, flags, &scratch, depth+1);
  996. if (result == -1)
  997. return -1;
  998. if (result == 0)
  999. return 1;
  1000. pc = pc->y;
  1001. break;
  1002. case I_ANYNL:
  1003. if (!*sp) return 1;
  1004. sp += chartorune(&c, sp);
  1005. pc = pc + 1;
  1006. break;
  1007. case I_ANY:
  1008. if (!*sp) return 1;
  1009. sp += chartorune(&c, sp);
  1010. if (isnewline(c))
  1011. return 1;
  1012. pc = pc + 1;
  1013. break;
  1014. case I_CHAR:
  1015. if (!*sp) return 1;
  1016. sp += chartorune(&c, sp);
  1017. if (flags & REG_ICASE)
  1018. c = canon(c);
  1019. if (c != pc->c)
  1020. return 1;
  1021. pc = pc + 1;
  1022. break;
  1023. case I_CCLASS:
  1024. if (!*sp) return 1;
  1025. sp += chartorune(&c, sp);
  1026. if (flags & REG_ICASE) {
  1027. if (!incclasscanon(pc->cc, canon(c)))
  1028. return 1;
  1029. } else {
  1030. if (!incclass(pc->cc, c))
  1031. return 1;
  1032. }
  1033. pc = pc + 1;
  1034. break;
  1035. case I_NCCLASS:
  1036. if (!*sp) return 1;
  1037. sp += chartorune(&c, sp);
  1038. if (flags & REG_ICASE) {
  1039. if (incclasscanon(pc->cc, canon(c)))
  1040. return 1;
  1041. } else {
  1042. if (incclass(pc->cc, c))
  1043. return 1;
  1044. }
  1045. pc = pc + 1;
  1046. break;
  1047. case I_REF:
  1048. i = out->sub[pc->n].ep - out->sub[pc->n].sp;
  1049. if (flags & REG_ICASE) {
  1050. if (strncmpcanon(sp, out->sub[pc->n].sp, i))
  1051. return 1;
  1052. } else {
  1053. if (strncmp(sp, out->sub[pc->n].sp, i))
  1054. return 1;
  1055. }
  1056. if (i > 0)
  1057. sp += i;
  1058. pc = pc + 1;
  1059. break;
  1060. case I_BOL:
  1061. if (sp == bol && !(flags & REG_NOTBOL)) {
  1062. pc = pc + 1;
  1063. break;
  1064. }
  1065. if (flags & REG_NEWLINE) {
  1066. if (sp > bol && isnewline(sp[-1])) {
  1067. pc = pc + 1;
  1068. break;
  1069. }
  1070. }
  1071. return 1;
  1072. case I_EOL:
  1073. if (*sp == 0) {
  1074. pc = pc + 1;
  1075. break;
  1076. }
  1077. if (flags & REG_NEWLINE) {
  1078. if (isnewline(*sp)) {
  1079. pc = pc + 1;
  1080. break;
  1081. }
  1082. }
  1083. return 1;
  1084. case I_WORD:
  1085. i = sp > bol && iswordchar(sp[-1]);
  1086. i ^= iswordchar(sp[0]);
  1087. if (!i)
  1088. return 1;
  1089. pc = pc + 1;
  1090. break;
  1091. case I_NWORD:
  1092. i = sp > bol && iswordchar(sp[-1]);
  1093. i ^= iswordchar(sp[0]);
  1094. if (i)
  1095. return 1;
  1096. pc = pc + 1;
  1097. break;
  1098. case I_LPAR:
  1099. out->sub[pc->n].sp = sp;
  1100. pc = pc + 1;
  1101. break;
  1102. case I_RPAR:
  1103. out->sub[pc->n].ep = sp;
  1104. pc = pc + 1;
  1105. break;
  1106. default:
  1107. return 1;
  1108. }
  1109. }
  1110. }
  1111. int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
  1112. {
  1113. Resub scratch;
  1114. int i;
  1115. if (!sub)
  1116. sub = &scratch;
  1117. sub->nsub = prog->nsub;
  1118. for (i = 0; i < REG_MAXSUB; ++i)
  1119. sub->sub[i].sp = sub->sub[i].ep = NULL;
  1120. return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
  1121. }
  1122. #ifdef TEST
  1123. int main(int argc, char **argv)
  1124. {
  1125. const char *error;
  1126. const char *s;
  1127. Reprog *p;
  1128. Resub m;
  1129. int i;
  1130. if (argc > 1) {
  1131. p = regcomp(argv[1], 0, &error);
  1132. if (!p) {
  1133. fprintf(stderr, "regcomp: %s\n", error);
  1134. return 1;
  1135. }
  1136. if (argc > 2) {
  1137. s = argv[2];
  1138. printf("nsub = %d\n", p->nsub);
  1139. if (!regexec(p, s, &m, 0)) {
  1140. for (i = 0; i < m.nsub; ++i) {
  1141. int n = m.sub[i].ep - m.sub[i].sp;
  1142. if (n > 0)
  1143. printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
  1144. else
  1145. printf("match %d: n=0 ''\n", i);
  1146. }
  1147. } else {
  1148. printf("no match\n");
  1149. }
  1150. }
  1151. }
  1152. return 0;
  1153. }
  1154. #endif