You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

844 lines
22KB

  1. #ifdef __cplusplus
  2. # pragma GCC diagnostic ignored "-Wc++20-extensions"
  3. #endif
  4. #include "jeger.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  9. #if DEBUG
  10. # include <stdio.h>
  11. #endif
  // Entry state used when a match attempt begins at the start of the string;
  // regex_compile hooks it up to the pattern (or leaves it unhooked).
  #define JEGER_SOS_STATE 0
  // Entry state for attempts that do not begin at the start of the string
  // ("not start of string"); wired to HALT when the pattern is anchored.
  #define JEGER_NSOS_STATE 1
  // First state number the compiler hands out to the pattern itself.
  #define JEGER_INIT_STATE 2
  15. // ------------------
  16. // ### Char tests ###
  17. // ------------------
  /* Report whether `c` occurs in the NUL-terminated string `str`.
   * The terminator is never treated as a member, so looking for '\0'
   * always yields false (this is where it differs from strchr()).
   */
  static inline
  bool mystrchr(const char * const str, const char c) {
      const char * cursor = str;
      while (*cursor != '\00') {
          if (*cursor == c) {
              return true;
          }
          ++cursor;
      }
      return false;
  }
  /* True when `c` is one of the quantifier characters: '=', '?', '+' or '*'. */
  static inline
  bool is_quantifier(const char c) {
      switch (c) {
          case '=':
          case '?':
          case '+':
          case '*':
              return true;
          default:
              return false;
      }
  }
  /* True when `c` is '<' or '>', the characters that follow a backslash
   * to form a "hologram" escape in the pattern syntax.
   */
  static inline
  bool is_hologram_escape(const char c) {
      return (c == '<') || (c == '>');
  }
  /* True when `c` has a special meaning in a pattern: any quantifier,
   * or one of backslash, '[', ']', '.' and '^'.
   */
  bool is_magic(const char c) {
      if (is_quantifier(c)) {
          return true;
      }
      return mystrchr("\\[].^", c);
  }
  40. // -------------------
  41. // ### Match tests ###
  42. // -------------------
  43. static inline
  44. bool is_sentinel(const match_t * const match) {
  45. return (match->position == -1)
  46. && (match->width == -1)
  47. ;
  48. }
  49. // -----------------
  50. // ### Char sets ###
  51. // -----------------
  /* Building blocks for the predefined character classes. Each macro is a
   * string literal so that classes can be composed by plain literal
   * concatenation.
   */
  #define JEGER_CHAR_SET_at "@"
  #define JEGER_CHAR_SET_underscore "_"
  /* Full 26-letter alphabets ('v' / 'V' used to be missing). */
  #define JEGER_CHAR_SET_lower "abcdefghijklmnopqrstuvwxyz"
  #define JEGER_CHAR_SET_upper "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  #define JEGER_CHAR_SET_digits "0123456789"
  #define JEGER_CHAR_SET_octal_digits "01234567"
  #define JEGER_CHAR_SET_lower_hex "abcdef"
  #define JEGER_CHAR_SET_upper_hex "ABCDEF"
  /* Bytes 0241..0277 (octal), i.e. 161..191. */
  #define JEGER_CHAR_SET_oct_241_to_277 \
      "\241\242\243\244\245" \
      "\246\247\250\251\252" \
      "\253\254\255\256\257" \
      "\260\261\262\263\264" \
      "\265\266\267\270\271" \
      "\272\273\274\275\276" \
      "\277"
  /* Bytes 0300..0337 (octal), i.e. 192..223. */
  #define JEGER_CHAR_SET_oct_300_to_337 \
      "\300\301\302\303\304" \
      "\305\306\307\310\311" \
      "\312\313\314\315\316" \
      "\317\320\321\322\323" \
      "\324\325\326\327\330" \
      "\331\332\333\334\335" \
      "\336\337"
  #define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
  #define JEGER_CHAR_SET_whitespace " " "\t\v\n"
  /* Characters that may appear inside a "symbol": underscore plus all
   * ASCII letters.
   */
  static const char JEGER_CHAR_symbol_chars[] =
      JEGER_CHAR_SET_underscore
      JEGER_CHAR_SET_lower
      JEGER_CHAR_SET_upper
  ;
  83. // ----------------------
  84. // ### Internal Types ###
  85. // ----------------------
  /* One character-driven transition of the compiled automaton.
   * Records are created by HOOK_ALL and stored by pointer in the regex's
   * delta_table.
   */
  typedef struct {
      int in;            // state the transition leaves from
      char input;        // character that must be read for the transition to fire
      int to;            // destination state, or HALT_AND_CATCH_FIRE to reject
      int pattern_width; // how far regex_assert advances in the subject string
      int match_width;   // amount added to the match width when the path succeeds
  } delta_t;
  /* Fallback ("catch") transition: regex_assert consults these when no
   * delta_t fired for the current state. Records are created by
   * ABSOLUTE_OFFSHOOT and stored by pointer in the regex's catch_table.
   */
  typedef struct {
      int in;            // state the fallback applies to
      int to;            // destination state, or HALT_AND_CATCH_FIRE to reject
      int pattern_width; // how far regex_assert advances in the subject string
      int match_width;   // amount added to the match width when taken
  } offshoot_t;
  /* Bit flags kept in compiler_state.flags while a pattern is compiled. */
  enum {
      DO_CATCH                  = 1 << 0, // token also needs a fallback (offshoot) transition
      IS_NEGATIVE               = 1 << 1, // token is a negated set; characters go to the blacklist
      IS_AT_THE_BEGINNING       = 1 << 2, // still compiling the first token of the pattern
      FORCE_START_OF_STRING     = 1 << 3, // pattern is anchored with a leading '^'
      DO_FORBID_START_OF_STRING = 1 << 4, // suppress the default start-of-string hookup
      INCREMENT_STATE           = 1 << 5, // advance cs.state once the token is compiled
  };
  /* Mutable bookkeeping threaded through the pattern compiler. */
  typedef struct {
      int flags;        // combination of the DO_CATCH / IS_NEGATIVE / ... bits
      int state;        // base state; HOOK_ALL and OFFSHOOT add their offsets to it
      int width;        // copied into delta_t.pattern_width by HOOK_ALL
      int width2;       // copied into delta_t.match_width by HOOK_ALL
      char * whitelist; // buffer collecting the characters the token accepts
      char * blacklist; // buffer collecting the characters the token rejects
  } compiler_state;
  115. // ----------------------------------
  116. // ### Regex creation/destruction ###
  117. // ----------------------------------
  /* Pseudo state meaning "reject immediately"; never a real state number. */
  static const int HALT_AND_CATCH_FIRE = INT_MIN;
  /* Turn a state offset relative to `cs->state` into an absolute state,
   * passing HALT_AND_CATCH_FIRE through untouched. Requires a variable
   * named `cs` in scope. The argument is parenthesized so compound
   * expressions expand correctly; note that it is still evaluated twice.
   */
  #define ASSERT_HALT(a) (((a) == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + (a)))
  120. static
  121. void HOOK_ALL(const int from,
  122. const char * const str,
  123. const int to,
  124. const compiler_state * const cs,
  125. regex_t * regex) {
  126. for (const char * s = str; *s != '\0'; s++) {
  127. delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
  128. *delta = (delta_t){
  129. .in = cs->state + from,
  130. .input = *s,
  131. .to = ASSERT_HALT(to),
  132. .pattern_width = cs->width,
  133. .match_width = cs->width2,
  134. };
  135. vector_push(&regex->delta_table,
  136. &delta);
  137. }
  138. }
  139. static
  140. void ABSOLUTE_OFFSHOOT(const int from,
  141. const int to,
  142. const int width,
  143. const int match_width,
  144. regex_t * regex) {
  145. offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
  146. *offshoot = (offshoot_t){
  147. .in = from,
  148. .to = to,
  149. .pattern_width = width,
  150. .match_width = match_width,
  151. };
  152. vector_push(&regex->catch_table,
  153. &offshoot);
  154. }
  155. static
  156. void OFFSHOOT(const int from,
  157. const int to,
  158. const int width,
  159. const int match_width,
  160. const compiler_state * cs,
  161. regex_t * regex) {
  162. ABSOLUTE_OFFSHOOT(cs->state + from, ASSERT_HALT(to), width, match_width, regex);
  163. }
  164. static
  165. int escape_1_to_1(const char c,
  166. const compiler_state * const cs) {
  167. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  168. switch (c) {
  169. case 't': {
  170. strcat(target_list, "\t");
  171. } return 1;
  172. case 'n': {
  173. strcat(target_list, "\n");
  174. } return 1;
  175. case 'r': {
  176. strcat(target_list, "\r");
  177. } return 1;
  178. case 'b': {
  179. strcat(target_list, "\b");
  180. } return 1;
  181. case '[': {
  182. strcat(target_list, "[");
  183. } return 1;
  184. case ']': {
  185. strcat(target_list, "]");
  186. } return 1;
  187. case '.': {
  188. strcat(target_list, ".");
  189. } return 1;
  190. case '^': {
  191. strcat(target_list, "^");
  192. } return 1;
  193. case '=': {
  194. strcat(target_list, "=");
  195. } return 1;
  196. case '?': {
  197. strcat(target_list, "?");
  198. } return 1;
  199. case '+': {
  200. strcat(target_list, "+");
  201. } return 1;
  202. case '*': {
  203. strcat(target_list, "*");
  204. } return 1;
  205. case '\\': {
  206. strcat(target_list, "\\");
  207. } return 1;
  208. }
  209. return 0;
  210. }
  211. static
  212. int escape_1_to_N(const char c,
  213. const compiler_state * const cs) {
  214. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  215. switch(c) {
  216. case 'i': {
  217. const char identifier_chars[] = JEGER_CHAR_SET_at
  218. JEGER_CHAR_SET_underscore
  219. JEGER_CHAR_SET_digits
  220. JEGER_CHAR_SET_oct_300_to_337
  221. ;
  222. strcpy(target_list, identifier_chars);
  223. return sizeof(identifier_chars)-1;
  224. };
  225. case 'I': {
  226. const char identifier_chars[] = JEGER_CHAR_SET_at
  227. JEGER_CHAR_SET_underscore
  228. JEGER_CHAR_SET_oct_300_to_337
  229. ;
  230. strcpy(target_list, identifier_chars);
  231. return sizeof(identifier_chars)-1;
  232. };
  233. case 'k': {
  234. const char keyword_chars[] = JEGER_CHAR_SET_at
  235. JEGER_CHAR_SET_underscore
  236. JEGER_CHAR_SET_digits
  237. JEGER_CHAR_SET_oct_300_to_337
  238. ;
  239. strcpy(target_list, keyword_chars);
  240. return sizeof(keyword_chars)-1;
  241. };
  242. case 'K': {
  243. const char keyword_chars[] = JEGER_CHAR_SET_at
  244. JEGER_CHAR_SET_underscore
  245. JEGER_CHAR_SET_oct_300_to_337
  246. ;
  247. strcpy(target_list, keyword_chars);
  248. return sizeof(keyword_chars)-1;
  249. };
  250. case 'f': {
  251. const char filename_chars[] = JEGER_CHAR_SET_at
  252. JEGER_CHAR_SET_digits
  253. JEGER_CHAR_SET_file_extra
  254. ;
  255. strcpy(target_list, filename_chars);
  256. return sizeof(filename_chars)-1;
  257. };
  258. case 'F': {
  259. const char filename_chars[] = JEGER_CHAR_SET_at
  260. JEGER_CHAR_SET_file_extra
  261. ;
  262. strcpy(target_list, filename_chars);
  263. return sizeof(filename_chars)-1;
  264. };
  265. case 'p': {
  266. const char printable_chars[] = JEGER_CHAR_SET_at
  267. JEGER_CHAR_SET_oct_241_to_277
  268. JEGER_CHAR_SET_oct_300_to_337
  269. ;
  270. strcpy(target_list, printable_chars);
  271. return sizeof(printable_chars)-1;
  272. };
  273. case 'P': {
  274. const char printable_chars[] = JEGER_CHAR_SET_at
  275. JEGER_CHAR_SET_oct_241_to_277
  276. JEGER_CHAR_SET_oct_300_to_337
  277. ;
  278. strcpy(target_list, printable_chars);
  279. return sizeof(printable_chars)-1;
  280. };
  281. case 's': {
  282. const char whitespace_chars[] = JEGER_CHAR_SET_whitespace;
  283. strcpy(target_list, whitespace_chars);
  284. return sizeof(whitespace_chars)-1;
  285. };
  286. case 'd': {
  287. const char digit_chars[] = JEGER_CHAR_SET_digits;
  288. strcpy(target_list, digit_chars);
  289. return sizeof(digit_chars)-1;
  290. };
  291. case 'x': {
  292. const char hex_chars[] = JEGER_CHAR_SET_digits
  293. JEGER_CHAR_SET_lower_hex
  294. JEGER_CHAR_SET_upper_hex
  295. ;
  296. strcpy(target_list, hex_chars);
  297. return sizeof(hex_chars)-1;
  298. };
  299. case 'o': {
  300. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  301. strcpy(target_list, oct_chars);
  302. return sizeof(oct_chars)-1;
  303. };
  304. case 'w': {
  305. const char word_chars[] = JEGER_CHAR_SET_underscore
  306. JEGER_CHAR_SET_digits
  307. JEGER_CHAR_SET_lower
  308. JEGER_CHAR_SET_upper
  309. ;
  310. strcpy(target_list, word_chars);
  311. return sizeof(word_chars)-1;
  312. };
  313. case 'h': {
  314. // #global JEGER_CHAR_symbol_chars
  315. strcpy(target_list, JEGER_CHAR_symbol_chars);
  316. return sizeof(JEGER_CHAR_symbol_chars)-1;
  317. };
  318. case 'a': {
  319. const char alpha_chars[] = JEGER_CHAR_SET_lower
  320. JEGER_CHAR_SET_upper
  321. ;
  322. strcpy(target_list, alpha_chars);
  323. return sizeof(alpha_chars)-1;
  324. };
  325. case 'l': {
  326. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  327. strcpy(target_list, lower_alpha_chars);
  328. return sizeof(lower_alpha_chars)-1;
  329. };
  330. case 'u': {
  331. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  332. strcpy(target_list, upper_alpha_chars);
  333. return sizeof(upper_alpha_chars)-1;
  334. };
  335. }
  336. return 0;
  337. }
  338. static inline
  339. int escape_to_negative(const char c,
  340. compiler_state * const cs) {
  341. switch (c) {
  342. case 'D': {
  343. const char digit_chars[] = JEGER_CHAR_SET_digits;
  344. strcpy(cs->blacklist, digit_chars);
  345. cs->flags |= IS_NEGATIVE;
  346. return sizeof(digit_chars)-1;
  347. };
  348. case 'X': {
  349. const char hex_chars[] = JEGER_CHAR_SET_digits
  350. JEGER_CHAR_SET_lower_hex
  351. JEGER_CHAR_SET_upper_hex
  352. ;
  353. strcpy(cs->blacklist, hex_chars);
  354. cs->flags |= IS_NEGATIVE;
  355. return sizeof(hex_chars)-1;
  356. };
  357. case 'O': {
  358. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  359. strcpy(cs->blacklist, oct_chars);
  360. cs->flags |= IS_NEGATIVE;
  361. return sizeof(oct_chars)-1;
  362. };
  363. case 'W': {
  364. const char word_chars[] = JEGER_CHAR_SET_underscore
  365. JEGER_CHAR_SET_digits
  366. JEGER_CHAR_SET_lower
  367. JEGER_CHAR_SET_upper
  368. ;
  369. strcpy(cs->blacklist, word_chars);
  370. cs->flags |= IS_NEGATIVE;
  371. return sizeof(word_chars)-1;
  372. };
  373. case 'L': {
  374. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  375. strcpy(cs->blacklist, lower_alpha_chars);
  376. cs->flags |= IS_NEGATIVE;
  377. return sizeof(lower_alpha_chars)-1;
  378. };
  379. case 'U': {
  380. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  381. strcpy(cs->blacklist, upper_alpha_chars);
  382. cs->flags |= IS_NEGATIVE;
  383. return sizeof(upper_alpha_chars)-1;
  384. };
  385. }
  386. return 0;
  387. }
  388. static inline
  389. int compile_dot(compiler_state * const cs) {
  390. cs->flags |= DO_CATCH;
  391. return true;
  392. }
  393. static inline
  394. int compile_escape(const char c,
  395. compiler_state * const cs) {
  396. return escape_1_to_1(c, cs)
  397. || escape_1_to_N(c, cs)
  398. || escape_to_negative(c, cs)
  399. ;
  400. }
  401. static
  402. int compile_range(const char * const range,
  403. compiler_state * const cs) {
  404. assert((range[0] == '[') && "Not a range.");
  405. const char * s;
  406. if (range[1] == '^') {
  407. cs->flags |= IS_NEGATIVE;
  408. s = range + 2;
  409. } else {
  410. s = range + 1;
  411. }
  412. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  413. for (; *s != ']'; s++) {
  414. assert((*s != '\0') && "Unclosed range.");
  415. char c = *s;
  416. if (c == '\\') {
  417. s += 1;
  418. assert(compile_escape(*s, cs) && "Unknown escape.");
  419. } else if (*(s+1) == '-') {
  420. char end = *(s+2);
  421. assert((c < end) && "Endless range.");
  422. for (char cc = c; cc < end+1; cc++) {
  423. strncat(target_list, &cc, 1);
  424. strncat(target_list, "\0", 1);
  425. }
  426. s += 2;
  427. } else {
  428. strncat(target_list, &c, 1);
  429. }
  430. }
  431. return ((s - range) + 1);
  432. }
  /* Append to `filtered` every character of `blacklist` that does not
   * occur in `whitelist`.
   *
   * whitelist - NUL-terminated list of accepted characters
   * blacklist - NUL-terminated list of rejected characters
   * filtered  - NUL-terminated output string; results are appended to its
   *             current contents and it must have room for them
   *
   * The whole whitelist is consulted for each blacklist character.
   * Previously the whitelist cursor was never rewound, so after the first
   * miss every remaining blacklist character was let through unchecked.
   */
  static
  void filter_blacklist(const char * whitelist,
                        const char * blacklist,
                        char * filtered) {
      char * out = filtered + strlen(filtered);
      for (; *blacklist != '\0'; blacklist++) {
          // *blacklist is never '\0' here, so strchr cannot match the terminator.
          if (strchr(whitelist, *blacklist) == NULL) {
              *out++ = *blacklist;
          }
      }
      *out = '\0';
  }
  448. regex_t * regex_compile(const char * const pattern) {
  449. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  450. regex->str = strdup(pattern);
  451. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  452. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  453. char whitelist[64];
  454. char blacklist[64];
  455. compiler_state cs = {
  456. .flags = IS_AT_THE_BEGINNING,
  457. .state = JEGER_INIT_STATE,
  458. .whitelist = whitelist,
  459. .blacklist = blacklist,
  460. };
  461. for (const char * s = pattern; *s != '\00';) {
  462. assert(!is_quantifier(*s) && "Pattern starts with quantifier.");
  463. // Reset the compiler
  464. whitelist[0] = '\0';
  465. blacklist[0] = '\0';
  466. cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
  467. cs.width = 1;
  468. cs.width2 = 1;
  469. // Translate char
  470. switch (*s) {
  471. case '^': {
  472. ;
  473. } break;
  474. case '.': {
  475. compile_dot(&cs);
  476. s += 1;
  477. } break;
  478. case '\\': {
  479. s += 1;
  480. if (compile_escape(*s, &cs)) {
  481. s += 1;
  482. } else if (is_hologram_escape(*s)) {
  483. s -= 1;
  484. } else {
  485. assert("Unknown escape.");
  486. }
  487. } break;
  488. case '[': {
  489. s += compile_range(s, &cs);
  490. } break;
  491. default: { // Literal
  492. whitelist[0] = *s;
  493. whitelist[1] = '\0';
  494. s += 1;
  495. } break;
  496. }
  497. /* Ew */
  498. if (*s == '\\'
  499. && is_hologram_escape(*(s+1))) {
  500. ++s;
  501. }
  502. // Compile char
  503. switch (*s) {
  504. // holograms
  505. case '^': {
  506. whitelist[0] = '\n';
  507. whitelist[1] = '\0';
  508. HOOK_ALL(0, whitelist, 0, &cs, regex);
  509. if (cs.flags & IS_AT_THE_BEGINNING) {
  510. cs.flags |= FORCE_START_OF_STRING;
  511. } else {
  512. cs.flags |= INCREMENT_STATE;
  513. }
  514. s += 1;
  515. } break;
  516. case '<': {
  517. // XXX: make this legible
  518. if (cs.flags & IS_AT_THE_BEGINNING
  519. && !(cs.flags & DO_CATCH)
  520. && !(cs.flags & IS_NEGATIVE)
  521. && whitelist[0] == '\0') {
  522. // ---
  523. cs.flags |= INCREMENT_STATE;
  524. cs.flags |= DO_FORBID_START_OF_STRING;
  525. strcat(whitelist, JEGER_CHAR_symbol_chars);
  526. // ---
  527. ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex);
  528. ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex);
  529. HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex);
  530. // ---
  531. ++cs.state;
  532. cs.width = 0;
  533. cs.width2 = 0;
  534. HOOK_ALL(0, whitelist, +1, &cs, regex);
  535. cs.width = 1;
  536. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  537. // ---
  538. } else {
  539. HOOK_ALL(0, whitelist, +1, &cs, regex);
  540. if ((cs.flags & DO_CATCH)
  541. || (cs.flags & IS_NEGATIVE)) {
  542. OFFSHOOT(+1, +2, 1, 1, &cs, regex);
  543. } else {
  544. cs.flags |= INCREMENT_STATE;
  545. }
  546. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  547. }
  548. cs.flags |= IS_NEGATIVE;
  549. strcat(blacklist, JEGER_CHAR_symbol_chars);
  550. s += 1;
  551. } break;
  552. case '>': {
  553. HOOK_ALL(0, whitelist, +1, &cs, regex);
  554. cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
  555. strcat(blacklist, JEGER_CHAR_symbol_chars);
  556. OFFSHOOT(+1, +2, 0, 0, &cs, regex);
  557. ++cs.state;
  558. s += 1;
  559. } break;
  560. // quantifiers
  561. case '=':
  562. case '?': {
  563. HOOK_ALL(0, whitelist, +1, &cs, regex);
  564. if ((cs.flags & DO_CATCH)
  565. || (cs.flags & IS_NEGATIVE)) {
  566. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  567. }
  568. s += 1;
  569. } break;
  570. case '*': {
  571. HOOK_ALL(0, whitelist, 0, &cs, regex);
  572. if ((cs.flags & DO_CATCH)
  573. || (cs.flags & IS_NEGATIVE)) {
  574. OFFSHOOT(0, 0, 1, 1, &cs, regex);
  575. }
  576. s += 1;
  577. } break;
  578. case '+': {
  579. cs.flags |= INCREMENT_STATE;
  580. HOOK_ALL(0, whitelist, +1, &cs, regex);
  581. if ((cs.flags & DO_CATCH)
  582. || (cs.flags & IS_NEGATIVE)) {
  583. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  584. }
  585. HOOK_ALL(+1, whitelist, +1, &cs, regex);
  586. if ((cs.flags & DO_CATCH)
  587. || (cs.flags & IS_NEGATIVE)) {
  588. OFFSHOOT(+1, +1, 1, 1, &cs, regex);
  589. }
  590. s += 1;
  591. } break;
  592. default: { // Literal
  593. cs.flags |= INCREMENT_STATE;
  594. HOOK_ALL(0, whitelist, +1, &cs, regex);
  595. if ((cs.flags & DO_CATCH)
  596. || (cs.flags & IS_NEGATIVE)) {
  597. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  598. }
  599. } break;
  600. }
  601. // Compile blacklist
  602. if (*blacklist) {
  603. char filtered_blacklist[64];
  604. filtered_blacklist[0] = '\0';
  605. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  606. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs, regex);
  607. }
  608. if (cs.flags & INCREMENT_STATE) {
  609. ++cs.state;
  610. }
  611. cs.flags &= (~IS_AT_THE_BEGINNING);
  612. }
  613. // Init state hookups
  614. if (!(cs.flags & DO_FORBID_START_OF_STRING)) {
  615. ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  616. }
  617. if (cs.flags & FORCE_START_OF_STRING) {
  618. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex);
  619. } else {
  620. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  621. }
  622. regex->accepting_state = cs.state;
  623. return regex;
  624. }
  625. int regex_free(regex_t * const regex) {
  626. free(regex->str);
  627. vector_free(&regex->delta_table);
  628. vector_free(&regex->catch_table);
  629. free(regex);
  630. return 0;
  631. }
  632. // -----------------
  633. // ### Searching ###
  634. // -----------------
  635. static
  636. const offshoot_t * catch_table_lookup(const regex_t * const regex,
  637. const int * const state) {
  638. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  639. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  640. if (offshoot->in == *state) {
  641. return offshoot;
  642. }
  643. }
  644. return NULL;
  645. }
  /* Run the automaton of `regex` over `string`, starting in `state`.
   *
   * regex  - compiled regex providing the delta and catch tables
   * string - position in the subject string to start reading from
   * state  - state to start in; HALT_AND_CATCH_FIRE fails immediately
   * match  - accumulates the result: `width` grows by the match_width of
   *          each transition on the successful path and `_pos_ptr` is set
   *          to the first position reached by a transition with a non-zero
   *          match_width
   *
   * Returns true when the accepting state is reached.
   *
   * Each matching delta is explored by a recursive call; when no delta
   * applies to the current state and character, the state's fallback
   * (offshoot) is followed in place via `goto LOOP`.
   */
  static
  bool regex_assert(const regex_t * const regex,
                    const char * const string,
                    int state,
                    match_t * const match) {
      if (state == HALT_AND_CATCH_FIRE) {
          return false;
      }
      bool last_stand = false; // set once the end of the string has been reached
      bool was_found;          // whether any delta fired for the current state/char
      const char * s = string;
      LOOP: {
          was_found = false;
          if (*s == '\0') {
              // No input left: only fallbacks that consume nothing may still apply.
              last_stand = true;
              goto PERFORM_CATCH_LOOKUP;
          }
          // Jump search for the correct state
          // NOTE(review): both searches presume delta_table is ordered by
          // ascending `in` - confirm that the compiler guarantees this.
          const int jump = 10;
          size_t i = jump;
          while (i < regex->delta_table.element_count) {
              const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
              if (delta->in >= state) {
                  break;
              }
              i += jump;
          }
          // Step back one stride so the linear scan starts before the target.
          i -= jump;
          // Linear search finish up
          for (; i < regex->delta_table.element_count; i++) {
              const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
              if (delta->in > state) {
                  break;
              }
              if ((delta->in == state)
              &&  (delta->input == *s)) {
                  bool do_reset = false;
                  was_found = true;
                  // Tentatively record where the match starts; undone below
                  // if this branch does not lead to acceptance.
                  if (!match->_pos_ptr && delta->match_width) {
                      match->_pos_ptr = s;
                      do_reset = true;
                  }
                  const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
                  if(r){
                      match->width += delta->match_width;
                      return r;
                  } else if (do_reset) {
                      match->_pos_ptr = NULL;
                  }
              }
          }
      }
      PERFORM_CATCH_LOOKUP: {
          if (!was_found) {
              const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
              // At the end of the string only zero-width fallbacks are allowed.
              if (my_catch && (!my_catch->pattern_width || !last_stand)) {
                  state = my_catch->to;
                  s += my_catch->pattern_width;
                  match->width += my_catch->match_width;
                  goto LOOP;
              }
          }
      }
      return (state == regex->accepting_state);
  }
  711. match_t * regex_match(const regex_t * const regex,
  712. const char * const string,
  713. const bool is_start_of_string) {
  714. vector_t matches;
  715. vector_init(&matches, sizeof(match_t), 0);
  716. match_t * match = (match_t *)malloc(sizeof(match_t));
  717. /* Non-existent regex does not match anything.
  718. * Not to be confused with an empty regex.
  719. */
  720. if (regex == NULL) {
  721. goto FINISH;
  722. }
  723. // Find all matches
  724. {
  725. const char * s = string;
  726. int initial_state;
  727. do {
  728. initial_state = (int)(!(is_start_of_string && (s == string)));
  729. *match = (match_t){
  730. ._pos_ptr = NULL,
  731. .width = 0,
  732. };
  733. if (regex_assert(regex, s, initial_state, match)) {
  734. if (match->_pos_ptr) {
  735. match->position = (match->_pos_ptr - string);
  736. } else {
  737. match->position = (s - string);
  738. }
  739. vector_push(&matches, match);
  740. s += ((match->width > 0) ? match->width : 1);
  741. match = (match_t *)malloc(sizeof(match_t));
  742. } else {
  743. ++s;
  744. }
  745. } while (*s != '\0');
  746. }
  747. FINISH:
  748. // Insert sentinel
  749. *match = (match_t){
  750. .position = -1,
  751. .width = -1,
  752. };
  753. vector_push(&matches, match);
  754. // Hide internal vector usage
  755. const size_t data_size = matches.element_size * matches.element_count;
  756. match_t * r = (match_t *)malloc(data_size);
  757. memcpy(r, matches.data, data_size);
  758. vector_free(&matches);
  759. return r;
  760. }
  761. bool regex_search(const regex_t * const regex,
  762. const char * const string) {
  763. match_t * m = regex_match(regex, string, true);
  764. const bool r = !is_sentinel(m);
  765. free(m);
  766. return r;
  767. }