Highlight things
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

864 linhas
23KB

  1. #ifdef __cplusplus
  2. # pragma GCC diagnostic ignored "-Wc++20-extensions"
  3. #endif
  4. #include "jeger.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  9. #if DEBUG
  10. # include <stdio.h>
  11. #endif
  12. #define JEGER_SOS_STATE 0
  13. #define JEGER_NSOS_STATE 1
  14. #define JEGER_INIT_STATE 2
  15. // ------------------
  16. // ### Char tests ###
  17. // ------------------
  18. static inline
  19. bool mystrchr(const char * const str, const char c){
  20. for (const char * s = str; *s != '\00'; s++) {
  21. if (*s == c) {
  22. return true;
  23. }
  24. }
  25. return false;
  26. }
  27. static inline
  28. bool is_quantifier(const char c) {
  29. return mystrchr("=?+*", c);
  30. }
  31. static inline
  32. bool is_hologram_escape(const char c) {
  33. return mystrchr("<>", c);
  34. }
  35. bool is_magic(const char c) {
  36. return is_quantifier(c)
  37. || mystrchr("\\[].^", c)
  38. ;
  39. }
  40. // -------------------
  41. // ### Match tests ###
  42. // -------------------
  43. bool is_sentinel(const match_t * const match) {
  44. return (match->position == -1)
  45. && (match->width == -1)
  46. ;
  47. }
  48. // -----------------
  49. // ### Char sets ###
  50. // -----------------
  51. #define JEGER_CHAR_SET_at "@"
  52. #define JEGER_CHAR_SET_underscore "_"
  53. #define JEGER_CHAR_SET_lower "abcdefghijklmnopqrstuwxyz"
  54. #define JEGER_CHAR_SET_upper "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  55. #define JEGER_CHAR_SET_digits "0123456789"
  56. #define JEGER_CHAR_SET_octal_digits "01234567"
  57. #define JEGER_CHAR_SET_lower_hex "abcdef"
  58. #define JEGER_CHAR_SET_upper_hex "ABCDEF"
  59. #define JEGER_CHAR_SET_oct_241_to_277 \
  60. "\241\242\243\244\245" \
  61. "\246\247\250\251\252" \
  62. "\253\254\255\256\257" \
  63. "\260\261\262\263\264" \
  64. "\265\266\267\270\271" \
  65. "\272\273\274\275\276" \
  66. "\277"
  67. #define JEGER_CHAR_SET_oct_300_to_337 \
  68. "\300\301\302\303\304" \
  69. "\305\306\307\310\311" \
  70. "\312\313\314\315\316" \
  71. "\317\320\321\322\323" \
  72. "\324\325\326\327\330" \
  73. "\331\332\333\334\335" \
  74. "\336\337"
  75. #define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
  76. #define JEGER_CHAR_SET_whitespace " " "\t\v\n"
  77. static const char JEGER_CHAR_symbol_chars[] =
  78. JEGER_CHAR_SET_underscore
  79. JEGER_CHAR_SET_lower
  80. JEGER_CHAR_SET_upper
  81. ;
  82. // ----------------------
  83. // ### Internal Types ###
  84. // ----------------------
  85. typedef struct {
  86. int in;
  87. char input;
  88. int to;
  89. int pattern_width;
  90. int match_width;
  91. } delta_t;
  92. typedef struct {
  93. int in;
  94. int to;
  95. int pattern_width;
  96. int match_width;
  97. } offshoot_t;
  98. enum {
  99. DO_CATCH = 0x00000001 << 0,
  100. IS_NEGATIVE = 0x00000001 << 1,
  101. IS_AT_THE_BEGINNING = 0x00000001 << 2,
  102. FORCE_START_OF_STRING = 0x00000001 << 3,
  103. DO_FORBID_START_OF_STRING = 0x00000001 << 4,
  104. INCREMENT_STATE = 0x00000001 << 5,
  105. };
  106. typedef struct {
  107. int flags;
  108. int state;
  109. int width;
  110. int match_width;
  111. char * whitelist;
  112. char * blacklist;
  113. } compiler_state;
  114. // ----------------------------------
  115. // ### Regex creation/destruction ###
  116. // ----------------------------------
  117. enum {
  118. ASSERTION_FAILURE = 0,
  119. ASSERTION_SUCCESS = 1,
  120. HALT_AND_CATCH_FIRE = INT_MIN,
  121. };
  122. #define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a))
  123. static
  124. void HOOK_ALL(const int from,
  125. const char * const str,
  126. const int to,
  127. const compiler_state * const cs,
  128. regex_t * regex) {
  129. for (const char * s = str; *s != '\0'; s++) {
  130. delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
  131. *delta = (delta_t){
  132. .in = cs->state + from,
  133. .input = *s,
  134. .to = ASSERT_HALT(to),
  135. .pattern_width = cs->width,
  136. .match_width = cs->match_width,
  137. };
  138. vector_push(&regex->delta_table,
  139. &delta);
  140. }
  141. }
  142. static
  143. void ABSOLUTE_OFFSHOOT(const int from,
  144. const int to,
  145. const int width,
  146. const int match_width,
  147. regex_t * regex) {
  148. offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
  149. *offshoot = (offshoot_t){
  150. .in = from,
  151. .to = to,
  152. .pattern_width = width,
  153. .match_width = match_width,
  154. };
  155. vector_push(&regex->catch_table,
  156. &offshoot);
  157. }
  158. static
  159. void OFFSHOOT(const int from,
  160. const int to,
  161. const int width,
  162. const int match_width,
  163. const compiler_state * cs,
  164. regex_t * regex) {
  165. ABSOLUTE_OFFSHOOT(cs->state + from, ASSERT_HALT(to), width, match_width, regex);
  166. }
  167. static
  168. int escape_1_to_1(const char c,
  169. const compiler_state * const cs) {
  170. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  171. switch (c) {
  172. case 't': {
  173. strcat(target_list, "\t");
  174. } return 1;
  175. case 'n': {
  176. strcat(target_list, "\n");
  177. } return 1;
  178. case 'r': {
  179. strcat(target_list, "\r");
  180. } return 1;
  181. case 'b': {
  182. strcat(target_list, "\b");
  183. } return 1;
  184. case '[': {
  185. strcat(target_list, "[");
  186. } return 1;
  187. case ']': {
  188. strcat(target_list, "]");
  189. } return 1;
  190. case '.': {
  191. strcat(target_list, ".");
  192. } return 1;
  193. case '^': {
  194. strcat(target_list, "^");
  195. } return 1;
  196. case '=': {
  197. strcat(target_list, "=");
  198. } return 1;
  199. case '?': {
  200. strcat(target_list, "?");
  201. } return 1;
  202. case '+': {
  203. strcat(target_list, "+");
  204. } return 1;
  205. case '*': {
  206. strcat(target_list, "*");
  207. } return 1;
  208. case '\\': {
  209. strcat(target_list, "\\");
  210. } return 1;
  211. }
  212. return 0;
  213. }
  214. static
  215. int escape_1_to_N(const char c,
  216. const compiler_state * const cs) {
  217. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  218. switch(c) {
  219. case 'i': {
  220. const char identifier_chars[] = JEGER_CHAR_SET_at
  221. JEGER_CHAR_SET_underscore
  222. JEGER_CHAR_SET_digits
  223. JEGER_CHAR_SET_oct_300_to_337
  224. ;
  225. strcpy(target_list, identifier_chars);
  226. return sizeof(identifier_chars)-1;
  227. };
  228. case 'I': {
  229. const char identifier_chars[] = JEGER_CHAR_SET_at
  230. JEGER_CHAR_SET_underscore
  231. JEGER_CHAR_SET_oct_300_to_337
  232. ;
  233. strcpy(target_list, identifier_chars);
  234. return sizeof(identifier_chars)-1;
  235. };
  236. case 'k': {
  237. const char keyword_chars[] = JEGER_CHAR_SET_at
  238. JEGER_CHAR_SET_underscore
  239. JEGER_CHAR_SET_digits
  240. JEGER_CHAR_SET_oct_300_to_337
  241. ;
  242. strcpy(target_list, keyword_chars);
  243. return sizeof(keyword_chars)-1;
  244. };
  245. case 'K': {
  246. const char keyword_chars[] = JEGER_CHAR_SET_at
  247. JEGER_CHAR_SET_underscore
  248. JEGER_CHAR_SET_oct_300_to_337
  249. ;
  250. strcpy(target_list, keyword_chars);
  251. return sizeof(keyword_chars)-1;
  252. };
  253. case 'f': {
  254. const char filename_chars[] = JEGER_CHAR_SET_at
  255. JEGER_CHAR_SET_digits
  256. JEGER_CHAR_SET_file_extra
  257. ;
  258. strcpy(target_list, filename_chars);
  259. return sizeof(filename_chars)-1;
  260. };
  261. case 'F': {
  262. const char filename_chars[] = JEGER_CHAR_SET_at
  263. JEGER_CHAR_SET_file_extra
  264. ;
  265. strcpy(target_list, filename_chars);
  266. return sizeof(filename_chars)-1;
  267. };
  268. case 'p': {
  269. const char printable_chars[] = JEGER_CHAR_SET_at
  270. JEGER_CHAR_SET_oct_241_to_277
  271. JEGER_CHAR_SET_oct_300_to_337
  272. ;
  273. strcpy(target_list, printable_chars);
  274. return sizeof(printable_chars)-1;
  275. };
  276. case 'P': {
  277. const char printable_chars[] = JEGER_CHAR_SET_at
  278. JEGER_CHAR_SET_oct_241_to_277
  279. JEGER_CHAR_SET_oct_300_to_337
  280. ;
  281. strcpy(target_list, printable_chars);
  282. return sizeof(printable_chars)-1;
  283. };
  284. case 's': {
  285. const char whitespace_chars[] = JEGER_CHAR_SET_whitespace;
  286. strcpy(target_list, whitespace_chars);
  287. return sizeof(whitespace_chars)-1;
  288. };
  289. case 'd': {
  290. const char digit_chars[] = JEGER_CHAR_SET_digits;
  291. strcpy(target_list, digit_chars);
  292. return sizeof(digit_chars)-1;
  293. };
  294. case 'x': {
  295. const char hex_chars[] = JEGER_CHAR_SET_digits
  296. JEGER_CHAR_SET_lower_hex
  297. JEGER_CHAR_SET_upper_hex
  298. ;
  299. strcpy(target_list, hex_chars);
  300. return sizeof(hex_chars)-1;
  301. };
  302. case 'o': {
  303. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  304. strcpy(target_list, oct_chars);
  305. return sizeof(oct_chars)-1;
  306. };
  307. case 'w': {
  308. const char word_chars[] = JEGER_CHAR_SET_underscore
  309. JEGER_CHAR_SET_digits
  310. JEGER_CHAR_SET_lower
  311. JEGER_CHAR_SET_upper
  312. ;
  313. strcpy(target_list, word_chars);
  314. return sizeof(word_chars)-1;
  315. };
  316. case 'h': {
  317. // #global JEGER_CHAR_symbol_chars
  318. strcpy(target_list, JEGER_CHAR_symbol_chars);
  319. return sizeof(JEGER_CHAR_symbol_chars)-1;
  320. };
  321. case 'a': {
  322. const char alpha_chars[] = JEGER_CHAR_SET_lower
  323. JEGER_CHAR_SET_upper
  324. ;
  325. strcpy(target_list, alpha_chars);
  326. return sizeof(alpha_chars)-1;
  327. };
  328. case 'l': {
  329. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  330. strcpy(target_list, lower_alpha_chars);
  331. return sizeof(lower_alpha_chars)-1;
  332. };
  333. case 'u': {
  334. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  335. strcpy(target_list, upper_alpha_chars);
  336. return sizeof(upper_alpha_chars)-1;
  337. };
  338. }
  339. return 0;
  340. }
  341. static inline
  342. int escape_to_negative(const char c,
  343. compiler_state * const cs) {
  344. switch (c) {
  345. case 'D': {
  346. const char digit_chars[] = JEGER_CHAR_SET_digits;
  347. strcpy(cs->blacklist, digit_chars);
  348. cs->flags |= IS_NEGATIVE;
  349. return sizeof(digit_chars)-1;
  350. };
  351. case 'X': {
  352. const char hex_chars[] = JEGER_CHAR_SET_digits
  353. JEGER_CHAR_SET_lower_hex
  354. JEGER_CHAR_SET_upper_hex
  355. ;
  356. strcpy(cs->blacklist, hex_chars);
  357. cs->flags |= IS_NEGATIVE;
  358. return sizeof(hex_chars)-1;
  359. };
  360. case 'O': {
  361. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  362. strcpy(cs->blacklist, oct_chars);
  363. cs->flags |= IS_NEGATIVE;
  364. return sizeof(oct_chars)-1;
  365. };
  366. case 'W': {
  367. const char word_chars[] = JEGER_CHAR_SET_underscore
  368. JEGER_CHAR_SET_digits
  369. JEGER_CHAR_SET_lower
  370. JEGER_CHAR_SET_upper
  371. ;
  372. strcpy(cs->blacklist, word_chars);
  373. cs->flags |= IS_NEGATIVE;
  374. return sizeof(word_chars)-1;
  375. };
  376. case 'L': {
  377. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  378. strcpy(cs->blacklist, lower_alpha_chars);
  379. cs->flags |= IS_NEGATIVE;
  380. return sizeof(lower_alpha_chars)-1;
  381. };
  382. case 'U': {
  383. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  384. strcpy(cs->blacklist, upper_alpha_chars);
  385. cs->flags |= IS_NEGATIVE;
  386. return sizeof(upper_alpha_chars)-1;
  387. };
  388. }
  389. return 0;
  390. }
  391. static inline
  392. int compile_dot(compiler_state * const cs) {
  393. cs->flags |= DO_CATCH;
  394. return true;
  395. }
  396. static inline
  397. int compile_escape(const char c,
  398. compiler_state * const cs) {
  399. return escape_1_to_1(c, cs)
  400. || escape_1_to_N(c, cs)
  401. || escape_to_negative(c, cs)
  402. ;
  403. }
  404. static
  405. int compile_range(const char * const range,
  406. compiler_state * const cs) {
  407. assert((range[0] == '[') && "Not a range.");
  408. const char * s;
  409. if (range[1] == '^') {
  410. cs->flags |= IS_NEGATIVE;
  411. s = range + 2;
  412. } else {
  413. s = range + 1;
  414. }
  415. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  416. for (; *s != ']'; s++) {
  417. assert((*s != '\0') && "Unclosed range.");
  418. char c = *s;
  419. if (c == '\\') {
  420. s += 1;
  421. assert(compile_escape(*s, cs) && "Unknown escape.");
  422. } else if (*(s+1) == '-') {
  423. char end = *(s+2);
  424. assert((c < end) && "Endless range.");
  425. for (char cc = c; cc < end+1; cc++) {
  426. strncat(target_list, &cc, 1);
  427. strncat(target_list, "\0", 1);
  428. }
  429. s += 2;
  430. } else {
  431. strncat(target_list, &c, 1);
  432. }
  433. }
  434. return ((s - range) + 1);
  435. }
  436. static
  437. void filter_blacklist(const char * whitelist,
  438. const char * blacklist,
  439. char * filtered) {
  440. for (; *blacklist != '\0'; blacklist++) {
  441. for (; *whitelist != '\0'; whitelist++) {
  442. if (*blacklist == *whitelist) {
  443. goto long_continue;
  444. }
  445. }
  446. strncat(filtered, blacklist, 1);
  447. long_continue:
  448. ;
  449. }
  450. }
  451. regex_t * regex_compile(const char * const pattern) {
  452. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  453. regex->str = strdup(pattern);
  454. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  455. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  456. char whitelist[64];
  457. char blacklist[64];
  458. static const int REGEX_PREVERSABLE_FLAGS = IS_AT_THE_BEGINNING
  459. | FORCE_START_OF_STRING
  460. | DO_FORBID_START_OF_STRING
  461. ;
  462. compiler_state cs = {
  463. .flags = IS_AT_THE_BEGINNING,
  464. .state = JEGER_INIT_STATE,
  465. .whitelist = whitelist,
  466. .blacklist = blacklist,
  467. };
  468. for (const char * s = pattern; *s != '\00';) {
  469. assert(!is_quantifier(*s) && "Pattern starts with quantifier.");
  470. // Reset the compiler
  471. whitelist[0] = '\0';
  472. blacklist[0] = '\0';
  473. cs.flags &= REGEX_PREVERSABLE_FLAGS;
  474. cs.width = 1;
  475. cs.match_width = 1;
  476. // Translate char
  477. switch (*s) {
  478. case '^': {
  479. ;
  480. } break;
  481. case '.': {
  482. compile_dot(&cs);
  483. s += 1;
  484. } break;
  485. case '\\': {
  486. s += 1;
  487. if (compile_escape(*s, &cs)) {
  488. s += 1;
  489. } else if (is_hologram_escape(*s)) {
  490. s -= 1;
  491. } else {
  492. assert("Unknown escape.");
  493. }
  494. } break;
  495. case '[': {
  496. s += compile_range(s, &cs);
  497. } break;
  498. default: { // Literal
  499. whitelist[0] = *s;
  500. whitelist[1] = '\0';
  501. s += 1;
  502. } break;
  503. }
  504. // Compile char
  505. switch (*s) {
  506. // holograms
  507. case '^': {
  508. whitelist[0] = '\n';
  509. whitelist[1] = '\0';
  510. HOOK_ALL(0, whitelist, 0, &cs, regex);
  511. if (cs.flags & IS_AT_THE_BEGINNING) {
  512. cs.flags |= FORCE_START_OF_STRING;
  513. } else {
  514. cs.flags |= INCREMENT_STATE;
  515. }
  516. s += 1;
  517. } break;
  518. case '\\': {
  519. if(is_hologram_escape(*(s+1))) {
  520. ++s;
  521. } else {
  522. goto DEFAULT;
  523. }
  524. switch(*s){
  525. case '<': {
  526. // XXX: make this legible
  527. if (cs.flags & IS_AT_THE_BEGINNING
  528. && !(cs.flags & DO_CATCH)
  529. && !(cs.flags & IS_NEGATIVE)
  530. && whitelist[0] == '\0') {
  531. // ---
  532. cs.flags |= INCREMENT_STATE;
  533. cs.flags |= DO_FORBID_START_OF_STRING;
  534. strcat(whitelist, JEGER_CHAR_symbol_chars);
  535. // ---
  536. ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex);
  537. ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex);
  538. HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex);
  539. // ---
  540. ++cs.state;
  541. cs.width = 0;
  542. cs.match_width = 0;
  543. HOOK_ALL(0, whitelist, +1, &cs, regex);
  544. cs.width = 1;
  545. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  546. // ---
  547. } else {
  548. HOOK_ALL(0, whitelist, +1, &cs, regex);
  549. if ((cs.flags & DO_CATCH)
  550. || (cs.flags & IS_NEGATIVE)) {
  551. OFFSHOOT(+1, +2, 1, 1, &cs, regex);
  552. } else {
  553. cs.flags |= INCREMENT_STATE;
  554. }
  555. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  556. }
  557. cs.flags |= IS_NEGATIVE;
  558. strcat(blacklist, JEGER_CHAR_symbol_chars);
  559. s += 1;
  560. } break;
  561. case '>': {
  562. HOOK_ALL(0, whitelist, +1, &cs, regex);
  563. cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
  564. strcat(blacklist, JEGER_CHAR_symbol_chars);
  565. OFFSHOOT(+1, +2, 0, 0, &cs, regex);
  566. ++cs.state;
  567. s += 1;
  568. } break;
  569. }
  570. } break;
  571. // quantifiers
  572. case '=':
  573. case '?': {
  574. HOOK_ALL(0, whitelist, +1, &cs, regex);
  575. if ((cs.flags & DO_CATCH)
  576. || (cs.flags & IS_NEGATIVE)) {
  577. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  578. }
  579. s += 1;
  580. } break;
  581. case '*': {
  582. HOOK_ALL(0, whitelist, 0, &cs, regex);
  583. if ((cs.flags & DO_CATCH)
  584. || (cs.flags & IS_NEGATIVE)) {
  585. OFFSHOOT(0, 0, 1, 1, &cs, regex);
  586. }
  587. s += 1;
  588. } break;
  589. case '+': {
  590. cs.flags |= INCREMENT_STATE;
  591. HOOK_ALL(0, whitelist, +1, &cs, regex);
  592. if ((cs.flags & DO_CATCH)
  593. || (cs.flags & IS_NEGATIVE)) {
  594. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  595. }
  596. HOOK_ALL(+1, whitelist, +1, &cs, regex);
  597. if ((cs.flags & DO_CATCH)
  598. || (cs.flags & IS_NEGATIVE)) {
  599. OFFSHOOT(+1, +1, 1, 1, &cs, regex);
  600. }
  601. s += 1;
  602. } break;
  603. DEFAULT:
  604. default: { // Literal
  605. cs.flags |= INCREMENT_STATE;
  606. HOOK_ALL(0, whitelist, +1, &cs, regex);
  607. if ((cs.flags & DO_CATCH)
  608. || (cs.flags & IS_NEGATIVE)) {
  609. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  610. }
  611. } break;
  612. }
  613. // Compile blacklist
  614. if (*blacklist) {
  615. char filtered_blacklist[64];
  616. filtered_blacklist[0] = '\0';
  617. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  618. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs, regex);
  619. }
  620. if (cs.flags & INCREMENT_STATE) {
  621. ++cs.state;
  622. }
  623. // Purge SOS flag
  624. cs.flags &= (~IS_AT_THE_BEGINNING);
  625. }
  626. // Init state hookups
  627. if (!(cs.flags & DO_FORBID_START_OF_STRING)) {
  628. ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  629. }
  630. if (cs.flags & FORCE_START_OF_STRING) {
  631. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex);
  632. } else {
  633. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  634. }
  635. regex->accepting_state = cs.state;
  636. return regex;
  637. }
  638. int regex_free(regex_t * const regex) {
  639. free(regex->str);
  640. vector_free(&regex->delta_table);
  641. vector_free(&regex->catch_table);
  642. free(regex);
  643. return 0;
  644. }
  645. // -----------------
  646. // ### Searching ###
  647. // -----------------
  648. static
  649. const offshoot_t * catch_table_lookup(const regex_t * const regex,
  650. const int * const state) {
  651. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  652. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  653. if (offshoot->in == *state) {
  654. return offshoot;
  655. }
  656. }
  657. return NULL;
  658. }
  659. static
  660. int regex_assert(const regex_t * const regex,
  661. const char * const string,
  662. int state,
  663. match_t * const match) {
  664. if (state == HALT_AND_CATCH_FIRE) {
  665. return HALT_AND_CATCH_FIRE;
  666. }
  667. bool last_stand = false;
  668. bool was_found;
  669. const char * s = string;
  670. LOOP: {
  671. was_found = false;
  672. if (*s == '\0') {
  673. last_stand = true;
  674. goto PERFORM_CATCH_LOOKUP;
  675. }
  676. // Jump search for the correct state
  677. const int jump = 10;
  678. size_t i = jump;
  679. while (i < regex->delta_table.element_count) {
  680. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  681. if (delta->in >= state) {
  682. break;
  683. }
  684. i += jump;
  685. }
  686. i -= jump;
  687. // Linear search finish up
  688. for (; i < regex->delta_table.element_count; i++) {
  689. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  690. if (delta->in > state) {
  691. break;
  692. }
  693. if ((delta->in == state)
  694. && (delta->input == *s)) {
  695. bool do_reset = false;
  696. was_found = true;
  697. if (!match->_pos_ptr && delta->match_width) {
  698. match->_pos_ptr = s;
  699. do_reset = true;
  700. }
  701. const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
  702. if(r == ASSERTION_SUCCESS){
  703. match->width += delta->match_width;
  704. return r;
  705. } else {
  706. if (r == ASSERTION_FAILURE) {
  707. was_found = false;
  708. }
  709. if (do_reset) {
  710. match->_pos_ptr = NULL;
  711. }
  712. }
  713. }
  714. }
  715. }
  716. PERFORM_CATCH_LOOKUP: {
  717. if (!was_found) {
  718. const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
  719. if (my_catch && (!my_catch->pattern_width || !last_stand)) {
  720. state = my_catch->to;
  721. s += my_catch->pattern_width;
  722. match->width += my_catch->match_width;
  723. goto LOOP;
  724. }
  725. }
  726. }
  727. return ((state == regex->accepting_state) ? ASSERTION_SUCCESS : ASSERTION_FAILURE);
  728. }
  729. match_t * regex_match(const regex_t * const regex,
  730. const char * const string,
  731. const bool is_start_of_string) {
  732. vector_t matches;
  733. vector_init(&matches, sizeof(match_t), 0);
  734. match_t * match = (match_t *)malloc(sizeof(match_t));
  735. /* Non-existent regex does not match anything.
  736. * Not to be confused with an empty regex.
  737. */
  738. if (regex == NULL) {
  739. goto FINISH;
  740. }
  741. // Find all matches
  742. {
  743. const char * s = string;
  744. int initial_state;
  745. do {
  746. initial_state = (int)(!(is_start_of_string && (s == string)));
  747. *match = (match_t){
  748. ._pos_ptr = NULL,
  749. .width = 0,
  750. };
  751. if (regex_assert(regex, s, initial_state, match) == 1) {
  752. //printf("true: %s\n", s);
  753. if (match->_pos_ptr) {
  754. match->position = (match->_pos_ptr - string);
  755. } else {
  756. match->position = (s - string);
  757. }
  758. vector_push(&matches, match);
  759. s += ((match->width > 0) ? match->width : 1);
  760. match = (match_t *)malloc(sizeof(match_t));
  761. } else {
  762. //printf("false: %s\n", s);
  763. ++s;
  764. }
  765. } while (*s != '\0');
  766. }
  767. FINISH:
  768. // Insert sentinel
  769. *match = (match_t){
  770. .position = -1,
  771. .width = -1,
  772. };
  773. vector_push(&matches, match);
  774. // Hide internal vector usage
  775. const size_t data_size = matches.element_size * matches.element_count;
  776. match_t * r = (match_t *)malloc(data_size);
  777. memcpy(r, matches.data, data_size);
  778. vector_free(&matches);
  779. return r;
  780. }
  781. bool regex_search(const regex_t * const regex,
  782. const char * const string) {
  783. match_t * m = regex_match(regex, string, true);
  784. const bool r = !is_sentinel(m);
  785. free(m);
  786. return r;
  787. }