Highlight things
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

843 行
22KB

  1. #ifdef __cplusplus
  2. # pragma GCC diagnostic ignored "-Wc++20-extensions"
  3. #endif
  4. #include "jeger.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  9. #if DEBUG
  10. # include <stdio.h>
  11. #endif
  // Reserved state numbers of the compiled automaton.
  // regex_match() starts an attempt in JEGER_SOS_STATE when it begins at the
  // very start of the subject string and in JEGER_NSOS_STATE otherwise;
  // regex_compile() wires both to the pattern through the catch table.
  // JEGER_INIT_STATE is the first state generated from the pattern itself.
  #define JEGER_SOS_STATE 0
  #define JEGER_NSOS_STATE 1
  #define JEGER_INIT_STATE 2
  15. // ------------------
  16. // ### Char tests ###
  17. // ------------------
  // Tell whether `c` occurs in the NUL-terminated string `str`.
  // The terminator itself never counts as a hit, so asking for '\0' yields
  // false (this is where the helper differs from the standard strchr()).
  static inline
  bool mystrchr(const char * const str, const char c){
      const char * cursor = str;
      while (*cursor != '\0') {
          if (*cursor++ == c) {
              return true;
          }
      }
      return false;
  }
  // True for the quantifier characters '=', '?', '+' and '*'.
  static inline
  bool is_quantifier(const char c) {
      switch (c) {
          case '=':
          case '?':
          case '+':
          case '*':
              return true;
          default:
              return false;
      }
  }
  // True for the characters that form a "hologram" when escaped: '<' and '>'.
  static inline
  bool is_hologram_escape(const char c) {
      return (c == '<') || (c == '>');
  }
  // True when `c` has a special meaning in a pattern: any quantifier
  // ('=', '?', '+', '*') or one of '\\', '[', ']', '.', '^'.
  bool is_magic(const char c) {
      switch (c) {
          case '=':
          case '?':
          case '+':
          case '*':
          case '\\':
          case '[':
          case ']':
          case '.':
          case '^':
              return true;
          default:
              return false;
      }
  }
  40. // -------------------
  41. // ### Match tests ###
  42. // -------------------
  43. bool is_sentinel(const match_t * const match) {
  44. return (match->position == -1)
  45. && (match->width == -1)
  46. ;
  47. }
  48. // -----------------
  49. // ### Char sets ###
  50. // -----------------
  // Building blocks for the character classes. Each macro is a string literal
  // so that classes can be composed by plain literal concatenation.
  #define JEGER_CHAR_SET_at "@"
  #define JEGER_CHAR_SET_underscore "_"
  // Full alphabets: the previous literals skipped 'v' and 'V'.
  #define JEGER_CHAR_SET_lower "abcdefghijklmnopqrstuvwxyz"
  #define JEGER_CHAR_SET_upper "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  #define JEGER_CHAR_SET_digits "0123456789"
  #define JEGER_CHAR_SET_octal_digits "01234567"
  #define JEGER_CHAR_SET_lower_hex "abcdef"
  #define JEGER_CHAR_SET_upper_hex "ABCDEF"
  // Bytes 0241..0277 octal (0xA1..0xBF), 31 characters.
  #define JEGER_CHAR_SET_oct_241_to_277 \
      "\241\242\243\244\245" \
      "\246\247\250\251\252" \
      "\253\254\255\256\257" \
      "\260\261\262\263\264" \
      "\265\266\267\270\271" \
      "\272\273\274\275\276" \
      "\277"
  // Bytes 0300..0337 octal (0xC0..0xDF), 32 characters.
  #define JEGER_CHAR_SET_oct_300_to_337 \
      "\300\301\302\303\304" \
      "\305\306\307\310\311" \
      "\312\313\314\315\316" \
      "\317\320\321\322\323" \
      "\324\325\326\327\330" \
      "\331\332\333\334\335" \
      "\336\337"
  #define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
  #define JEGER_CHAR_SET_whitespace " " "\t\v\n"
  // Characters that may appear in a symbol: underscore plus both alphabets.
  // Used for the `\h` class and for the `\<` / `\>` word-boundary holograms.
  static const char JEGER_CHAR_symbol_chars[] =
      JEGER_CHAR_SET_underscore
      JEGER_CHAR_SET_lower
      JEGER_CHAR_SET_upper
  ;
  82. // ----------------------
  83. // ### Internal Types ###
  84. // ----------------------
  // One character-driven transition of the automaton.
  typedef struct {
      int in;             // state the transition leaves from
      char input;         // character that triggers the transition
      int to;             // state the transition leads to (may be HALT_AND_CATCH_FIRE)
      int pattern_width;  // how far regex_assert() advances in the subject string
      int match_width;    // amount added to the width of the reported match
  } delta_t;
  // Fallback ("catch") transition: regex_assert() consults these by source
  // state when no delta_t matched the current character.
  typedef struct {
      int in;             // state the fallback applies to
      int to;             // state to continue in
      int pattern_width;  // how far to advance in the subject string
      int match_width;    // amount added to the width of the reported match
  } offshoot_t;
  // Bit flags kept in compiler_state.flags while a pattern is compiled.
  enum {
      DO_CATCH                  = 1 << 0,  // set by compile_dot(): the atom also needs a catch-all offshoot
      IS_NEGATIVE               = 1 << 1,  // the atom is described by the blacklist
      IS_AT_THE_BEGINNING       = 1 << 2,  // no atom has been compiled yet
      FORCE_START_OF_STRING     = 1 << 3,  // pattern started with '^'
      DO_FORBID_START_OF_STRING = 1 << 4,  // pattern started with a `\<` hologram
      INCREMENT_STATE           = 1 << 5,  // bump the current state once the atom is compiled
  };
  // Working state of regex_compile() for the atom currently being translated.
  typedef struct {
      int flags;         // combination of the DO_CATCH .. INCREMENT_STATE bits
      int state;         // state number that relative offsets are added to
      int width;         // copied into delta_t.pattern_width by HOOK_ALL()
      int width2;        // copied into delta_t.match_width by HOOK_ALL()
      char * whitelist;  // NUL-terminated list of accepted characters
      char * blacklist;  // NUL-terminated list of rejected characters
  } compiler_state;
  114. // ----------------------------------
  115. // ### Regex creation/destruction ###
  116. // ----------------------------------
  // Sentinel target state: regex_assert() fails as soon as it is entered.
  static const int HALT_AND_CATCH_FIRE = INT_MIN;
  // Turn an offset relative to the current compiler state into an absolute
  // state number, passing HALT_AND_CATCH_FIRE through unchanged.
  // The argument is parenthesized so that compound expressions expand safely.
  // NOTE: relies on a variable named `cs` (with a `state` member) being in
  // scope at the expansion site, and evaluates `a` twice.
  #define ASSERT_HALT(a) (((a) == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + (a)))
  119. static
  120. void HOOK_ALL(const int from,
  121. const char * const str,
  122. const int to,
  123. const compiler_state * const cs,
  124. regex_t * regex) {
  125. for (const char * s = str; *s != '\0'; s++) {
  126. delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
  127. *delta = (delta_t){
  128. .in = cs->state + from,
  129. .input = *s,
  130. .to = ASSERT_HALT(to),
  131. .pattern_width = cs->width,
  132. .match_width = cs->width2,
  133. };
  134. vector_push(&regex->delta_table,
  135. &delta);
  136. }
  137. }
  138. static
  139. void ABSOLUTE_OFFSHOOT(const int from,
  140. const int to,
  141. const int width,
  142. const int match_width,
  143. regex_t * regex) {
  144. offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
  145. *offshoot = (offshoot_t){
  146. .in = from,
  147. .to = to,
  148. .pattern_width = width,
  149. .match_width = match_width,
  150. };
  151. vector_push(&regex->catch_table,
  152. &offshoot);
  153. }
  154. static
  155. void OFFSHOOT(const int from,
  156. const int to,
  157. const int width,
  158. const int match_width,
  159. const compiler_state * cs,
  160. regex_t * regex) {
  161. ABSOLUTE_OFFSHOOT(cs->state + from, ASSERT_HALT(to), width, match_width, regex);
  162. }
  163. static
  164. int escape_1_to_1(const char c,
  165. const compiler_state * const cs) {
  166. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  167. switch (c) {
  168. case 't': {
  169. strcat(target_list, "\t");
  170. } return 1;
  171. case 'n': {
  172. strcat(target_list, "\n");
  173. } return 1;
  174. case 'r': {
  175. strcat(target_list, "\r");
  176. } return 1;
  177. case 'b': {
  178. strcat(target_list, "\b");
  179. } return 1;
  180. case '[': {
  181. strcat(target_list, "[");
  182. } return 1;
  183. case ']': {
  184. strcat(target_list, "]");
  185. } return 1;
  186. case '.': {
  187. strcat(target_list, ".");
  188. } return 1;
  189. case '^': {
  190. strcat(target_list, "^");
  191. } return 1;
  192. case '=': {
  193. strcat(target_list, "=");
  194. } return 1;
  195. case '?': {
  196. strcat(target_list, "?");
  197. } return 1;
  198. case '+': {
  199. strcat(target_list, "+");
  200. } return 1;
  201. case '*': {
  202. strcat(target_list, "*");
  203. } return 1;
  204. case '\\': {
  205. strcat(target_list, "\\");
  206. } return 1;
  207. }
  208. return 0;
  209. }
  210. static
  211. int escape_1_to_N(const char c,
  212. const compiler_state * const cs) {
  213. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  214. switch(c) {
  215. case 'i': {
  216. const char identifier_chars[] = JEGER_CHAR_SET_at
  217. JEGER_CHAR_SET_underscore
  218. JEGER_CHAR_SET_digits
  219. JEGER_CHAR_SET_oct_300_to_337
  220. ;
  221. strcpy(target_list, identifier_chars);
  222. return sizeof(identifier_chars)-1;
  223. };
  224. case 'I': {
  225. const char identifier_chars[] = JEGER_CHAR_SET_at
  226. JEGER_CHAR_SET_underscore
  227. JEGER_CHAR_SET_oct_300_to_337
  228. ;
  229. strcpy(target_list, identifier_chars);
  230. return sizeof(identifier_chars)-1;
  231. };
  232. case 'k': {
  233. const char keyword_chars[] = JEGER_CHAR_SET_at
  234. JEGER_CHAR_SET_underscore
  235. JEGER_CHAR_SET_digits
  236. JEGER_CHAR_SET_oct_300_to_337
  237. ;
  238. strcpy(target_list, keyword_chars);
  239. return sizeof(keyword_chars)-1;
  240. };
  241. case 'K': {
  242. const char keyword_chars[] = JEGER_CHAR_SET_at
  243. JEGER_CHAR_SET_underscore
  244. JEGER_CHAR_SET_oct_300_to_337
  245. ;
  246. strcpy(target_list, keyword_chars);
  247. return sizeof(keyword_chars)-1;
  248. };
  249. case 'f': {
  250. const char filename_chars[] = JEGER_CHAR_SET_at
  251. JEGER_CHAR_SET_digits
  252. JEGER_CHAR_SET_file_extra
  253. ;
  254. strcpy(target_list, filename_chars);
  255. return sizeof(filename_chars)-1;
  256. };
  257. case 'F': {
  258. const char filename_chars[] = JEGER_CHAR_SET_at
  259. JEGER_CHAR_SET_file_extra
  260. ;
  261. strcpy(target_list, filename_chars);
  262. return sizeof(filename_chars)-1;
  263. };
  264. case 'p': {
  265. const char printable_chars[] = JEGER_CHAR_SET_at
  266. JEGER_CHAR_SET_oct_241_to_277
  267. JEGER_CHAR_SET_oct_300_to_337
  268. ;
  269. strcpy(target_list, printable_chars);
  270. return sizeof(printable_chars)-1;
  271. };
  272. case 'P': {
  273. const char printable_chars[] = JEGER_CHAR_SET_at
  274. JEGER_CHAR_SET_oct_241_to_277
  275. JEGER_CHAR_SET_oct_300_to_337
  276. ;
  277. strcpy(target_list, printable_chars);
  278. return sizeof(printable_chars)-1;
  279. };
  280. case 's': {
  281. const char whitespace_chars[] = JEGER_CHAR_SET_whitespace;
  282. strcpy(target_list, whitespace_chars);
  283. return sizeof(whitespace_chars)-1;
  284. };
  285. case 'd': {
  286. const char digit_chars[] = JEGER_CHAR_SET_digits;
  287. strcpy(target_list, digit_chars);
  288. return sizeof(digit_chars)-1;
  289. };
  290. case 'x': {
  291. const char hex_chars[] = JEGER_CHAR_SET_digits
  292. JEGER_CHAR_SET_lower_hex
  293. JEGER_CHAR_SET_upper_hex
  294. ;
  295. strcpy(target_list, hex_chars);
  296. return sizeof(hex_chars)-1;
  297. };
  298. case 'o': {
  299. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  300. strcpy(target_list, oct_chars);
  301. return sizeof(oct_chars)-1;
  302. };
  303. case 'w': {
  304. const char word_chars[] = JEGER_CHAR_SET_underscore
  305. JEGER_CHAR_SET_digits
  306. JEGER_CHAR_SET_lower
  307. JEGER_CHAR_SET_upper
  308. ;
  309. strcpy(target_list, word_chars);
  310. return sizeof(word_chars)-1;
  311. };
  312. case 'h': {
  313. // #global JEGER_CHAR_symbol_chars
  314. strcpy(target_list, JEGER_CHAR_symbol_chars);
  315. return sizeof(JEGER_CHAR_symbol_chars)-1;
  316. };
  317. case 'a': {
  318. const char alpha_chars[] = JEGER_CHAR_SET_lower
  319. JEGER_CHAR_SET_upper
  320. ;
  321. strcpy(target_list, alpha_chars);
  322. return sizeof(alpha_chars)-1;
  323. };
  324. case 'l': {
  325. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  326. strcpy(target_list, lower_alpha_chars);
  327. return sizeof(lower_alpha_chars)-1;
  328. };
  329. case 'u': {
  330. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  331. strcpy(target_list, upper_alpha_chars);
  332. return sizeof(upper_alpha_chars)-1;
  333. };
  334. }
  335. return 0;
  336. }
  337. static inline
  338. int escape_to_negative(const char c,
  339. compiler_state * const cs) {
  340. switch (c) {
  341. case 'D': {
  342. const char digit_chars[] = JEGER_CHAR_SET_digits;
  343. strcpy(cs->blacklist, digit_chars);
  344. cs->flags |= IS_NEGATIVE;
  345. return sizeof(digit_chars)-1;
  346. };
  347. case 'X': {
  348. const char hex_chars[] = JEGER_CHAR_SET_digits
  349. JEGER_CHAR_SET_lower_hex
  350. JEGER_CHAR_SET_upper_hex
  351. ;
  352. strcpy(cs->blacklist, hex_chars);
  353. cs->flags |= IS_NEGATIVE;
  354. return sizeof(hex_chars)-1;
  355. };
  356. case 'O': {
  357. const char oct_chars[] = JEGER_CHAR_SET_octal_digits;
  358. strcpy(cs->blacklist, oct_chars);
  359. cs->flags |= IS_NEGATIVE;
  360. return sizeof(oct_chars)-1;
  361. };
  362. case 'W': {
  363. const char word_chars[] = JEGER_CHAR_SET_underscore
  364. JEGER_CHAR_SET_digits
  365. JEGER_CHAR_SET_lower
  366. JEGER_CHAR_SET_upper
  367. ;
  368. strcpy(cs->blacklist, word_chars);
  369. cs->flags |= IS_NEGATIVE;
  370. return sizeof(word_chars)-1;
  371. };
  372. case 'L': {
  373. const char lower_alpha_chars[] = JEGER_CHAR_SET_lower;
  374. strcpy(cs->blacklist, lower_alpha_chars);
  375. cs->flags |= IS_NEGATIVE;
  376. return sizeof(lower_alpha_chars)-1;
  377. };
  378. case 'U': {
  379. const char upper_alpha_chars[] = JEGER_CHAR_SET_upper;
  380. strcpy(cs->blacklist, upper_alpha_chars);
  381. cs->flags |= IS_NEGATIVE;
  382. return sizeof(upper_alpha_chars)-1;
  383. };
  384. }
  385. return 0;
  386. }
  387. static inline
  388. int compile_dot(compiler_state * const cs) {
  389. cs->flags |= DO_CATCH;
  390. return true;
  391. }
  392. static inline
  393. int compile_escape(const char c,
  394. compiler_state * const cs) {
  395. return escape_1_to_1(c, cs)
  396. || escape_1_to_N(c, cs)
  397. || escape_to_negative(c, cs)
  398. ;
  399. }
  400. static
  401. int compile_range(const char * const range,
  402. compiler_state * const cs) {
  403. assert((range[0] == '[') && "Not a range.");
  404. const char * s;
  405. if (range[1] == '^') {
  406. cs->flags |= IS_NEGATIVE;
  407. s = range + 2;
  408. } else {
  409. s = range + 1;
  410. }
  411. char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
  412. for (; *s != ']'; s++) {
  413. assert((*s != '\0') && "Unclosed range.");
  414. char c = *s;
  415. if (c == '\\') {
  416. s += 1;
  417. assert(compile_escape(*s, cs) && "Unknown escape.");
  418. } else if (*(s+1) == '-') {
  419. char end = *(s+2);
  420. assert((c < end) && "Endless range.");
  421. for (char cc = c; cc < end+1; cc++) {
  422. strncat(target_list, &cc, 1);
  423. strncat(target_list, "\0", 1);
  424. }
  425. s += 2;
  426. } else {
  427. strncat(target_list, &c, 1);
  428. }
  429. }
  430. return ((s - range) + 1);
  431. }
  // Append to `filtered` every character of `blacklist` that does not occur
  // in `whitelist`, keeping the blacklist order.
  // `filtered` must already hold a NUL-terminated string and have room for the
  // appended characters.
  // Each blacklist character is checked against the whole whitelist; the
  // previous version never rewound its whitelist cursor, so only the first
  // character was filtered reliably.
  static
  void filter_blacklist(const char * whitelist,
                        const char * blacklist,
                        char * filtered) {
      for (; *blacklist != '\0'; blacklist++) {
          if (strchr(whitelist, *blacklist) == NULL) {
              strncat(filtered, blacklist, 1);
          }
      }
  }
  447. regex_t * regex_compile(const char * const pattern) {
  448. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  449. regex->str = strdup(pattern);
  450. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  451. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  452. char whitelist[64];
  453. char blacklist[64];
  454. compiler_state cs = {
  455. .flags = IS_AT_THE_BEGINNING,
  456. .state = JEGER_INIT_STATE,
  457. .whitelist = whitelist,
  458. .blacklist = blacklist,
  459. };
  460. for (const char * s = pattern; *s != '\00';) {
  461. assert(!is_quantifier(*s) && "Pattern starts with quantifier.");
  462. // Reset the compiler
  463. whitelist[0] = '\0';
  464. blacklist[0] = '\0';
  465. cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
  466. cs.width = 1;
  467. cs.width2 = 1;
  468. // Translate char
  469. switch (*s) {
  470. case '^': {
  471. ;
  472. } break;
  473. case '.': {
  474. compile_dot(&cs);
  475. s += 1;
  476. } break;
  477. case '\\': {
  478. s += 1;
  479. if (compile_escape(*s, &cs)) {
  480. s += 1;
  481. } else if (is_hologram_escape(*s)) {
  482. s -= 1;
  483. } else {
  484. assert("Unknown escape.");
  485. }
  486. } break;
  487. case '[': {
  488. s += compile_range(s, &cs);
  489. } break;
  490. default: { // Literal
  491. whitelist[0] = *s;
  492. whitelist[1] = '\0';
  493. s += 1;
  494. } break;
  495. }
  496. /* Ew */
  497. if (*s == '\\'
  498. && is_hologram_escape(*(s+1))) {
  499. ++s;
  500. }
  501. // Compile char
  502. switch (*s) {
  503. // holograms
  504. case '^': {
  505. whitelist[0] = '\n';
  506. whitelist[1] = '\0';
  507. HOOK_ALL(0, whitelist, 0, &cs, regex);
  508. if (cs.flags & IS_AT_THE_BEGINNING) {
  509. cs.flags |= FORCE_START_OF_STRING;
  510. } else {
  511. cs.flags |= INCREMENT_STATE;
  512. }
  513. s += 1;
  514. } break;
  515. case '<': {
  516. // XXX: make this legible
  517. if (cs.flags & IS_AT_THE_BEGINNING
  518. && !(cs.flags & DO_CATCH)
  519. && !(cs.flags & IS_NEGATIVE)
  520. && whitelist[0] == '\0') {
  521. // ---
  522. cs.flags |= INCREMENT_STATE;
  523. cs.flags |= DO_FORBID_START_OF_STRING;
  524. strcat(whitelist, JEGER_CHAR_symbol_chars);
  525. // ---
  526. ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex);
  527. ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex);
  528. HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex);
  529. // ---
  530. ++cs.state;
  531. cs.width = 0;
  532. cs.width2 = 0;
  533. HOOK_ALL(0, whitelist, +1, &cs, regex);
  534. cs.width = 1;
  535. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  536. // ---
  537. } else {
  538. HOOK_ALL(0, whitelist, +1, &cs, regex);
  539. if ((cs.flags & DO_CATCH)
  540. || (cs.flags & IS_NEGATIVE)) {
  541. OFFSHOOT(+1, +2, 1, 1, &cs, regex);
  542. } else {
  543. cs.flags |= INCREMENT_STATE;
  544. }
  545. OFFSHOOT(0, +1, 1, 0, &cs, regex);
  546. }
  547. cs.flags |= IS_NEGATIVE;
  548. strcat(blacklist, JEGER_CHAR_symbol_chars);
  549. s += 1;
  550. } break;
  551. case '>': {
  552. HOOK_ALL(0, whitelist, +1, &cs, regex);
  553. cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
  554. strcat(blacklist, JEGER_CHAR_symbol_chars);
  555. OFFSHOOT(+1, +2, 0, 0, &cs, regex);
  556. ++cs.state;
  557. s += 1;
  558. } break;
  559. // quantifiers
  560. case '=':
  561. case '?': {
  562. HOOK_ALL(0, whitelist, +1, &cs, regex);
  563. if ((cs.flags & DO_CATCH)
  564. || (cs.flags & IS_NEGATIVE)) {
  565. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  566. }
  567. s += 1;
  568. } break;
  569. case '*': {
  570. HOOK_ALL(0, whitelist, 0, &cs, regex);
  571. if ((cs.flags & DO_CATCH)
  572. || (cs.flags & IS_NEGATIVE)) {
  573. OFFSHOOT(0, 0, 1, 1, &cs, regex);
  574. }
  575. s += 1;
  576. } break;
  577. case '+': {
  578. cs.flags |= INCREMENT_STATE;
  579. HOOK_ALL(0, whitelist, +1, &cs, regex);
  580. if ((cs.flags & DO_CATCH)
  581. || (cs.flags & IS_NEGATIVE)) {
  582. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  583. }
  584. HOOK_ALL(+1, whitelist, +1, &cs, regex);
  585. if ((cs.flags & DO_CATCH)
  586. || (cs.flags & IS_NEGATIVE)) {
  587. OFFSHOOT(+1, +1, 1, 1, &cs, regex);
  588. }
  589. s += 1;
  590. } break;
  591. default: { // Literal
  592. cs.flags |= INCREMENT_STATE;
  593. HOOK_ALL(0, whitelist, +1, &cs, regex);
  594. if ((cs.flags & DO_CATCH)
  595. || (cs.flags & IS_NEGATIVE)) {
  596. OFFSHOOT(0, +1, 1, 1, &cs, regex);
  597. }
  598. } break;
  599. }
  600. // Compile blacklist
  601. if (*blacklist) {
  602. char filtered_blacklist[64];
  603. filtered_blacklist[0] = '\0';
  604. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  605. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs, regex);
  606. }
  607. if (cs.flags & INCREMENT_STATE) {
  608. ++cs.state;
  609. }
  610. cs.flags &= (~IS_AT_THE_BEGINNING);
  611. }
  612. // Init state hookups
  613. if (!(cs.flags & DO_FORBID_START_OF_STRING)) {
  614. ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  615. }
  616. if (cs.flags & FORCE_START_OF_STRING) {
  617. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex);
  618. } else {
  619. ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex);
  620. }
  621. regex->accepting_state = cs.state;
  622. return regex;
  623. }
  624. int regex_free(regex_t * const regex) {
  625. free(regex->str);
  626. vector_free(&regex->delta_table);
  627. vector_free(&regex->catch_table);
  628. free(regex);
  629. return 0;
  630. }
  631. // -----------------
  632. // ### Searching ###
  633. // -----------------
  634. static
  635. const offshoot_t * catch_table_lookup(const regex_t * const regex,
  636. const int * const state) {
  637. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  638. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  639. if (offshoot->in == *state) {
  640. return offshoot;
  641. }
  642. }
  643. return NULL;
  644. }
  // Run the automaton of `regex` over `string`, starting in `state`.
  //
  // Every transition matching the current character is tried recursively;
  // when none leads to success, the catch table supplies a fallback
  // transition for the current state. While doing so the function
  // accumulates the match width in `match->width` and records in
  // `match->_pos_ptr` the position where the first transition with a
  // non-zero match_width fired.
  //
  // Returns true when the accepting state is reached, false otherwise
  // (always false when `state` is HALT_AND_CATCH_FIRE).
  static
  bool regex_assert(const regex_t * const regex,
                    const char * const string,
                    int state,
                    match_t * const match) {
      if (state == HALT_AND_CATCH_FIRE) {
          return false;
      }
      bool last_stand = false;  // set once the end of the string was reached
      bool was_found;           // did any delta match the current character?
      const char * s = string;
      LOOP: {
          was_found = false;
          if (*s == '\0') {
              last_stand = true;
              goto PERFORM_CATCH_LOOKUP;
          }
          // Jump search for the correct state
          // NOTE(review): this relies on the delta table being ordered by
          // `in`; confirm that regex_compile() always pushes entries that way.
          const int jump = 10;
          size_t i = jump;
          while (i < regex->delta_table.element_count) {
              const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
              if (delta->in >= state) {
                  break;
              }
              i += jump;
          }
          // Step back one stride so the linear scan cannot miss earlier entries.
          i -= jump;
          // Linear search finish up
          for (; i < regex->delta_table.element_count; i++) {
              const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
              if (delta->in > state) {
                  break;
              }
              if ((delta->in == state)
              &&  (delta->input == *s)) {
                  bool do_reset = false;
                  was_found = true;
                  // Remember where the match starts; undone below if this
                  // branch turns out to be a dead end.
                  if (!match->_pos_ptr && delta->match_width) {
                      match->_pos_ptr = s;
                      do_reset = true;
                  }
                  const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
                  if(r){
                      match->width += delta->match_width;
                      return r;
                  } else if (do_reset) {
                      match->_pos_ptr = NULL;
                  }
              }
          }
      }
      PERFORM_CATCH_LOOKUP: {
          if (!was_found) {
              const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
              // At the end of the string only fallbacks that consume no
              // input may still be taken.
              if (my_catch && (!my_catch->pattern_width || !last_stand)) {
                  state = my_catch->to;
                  s += my_catch->pattern_width;
                  match->width += my_catch->match_width;
                  goto LOOP;
              }
          }
      }
      return (state == regex->accepting_state);
  }
  710. match_t * regex_match(const regex_t * const regex,
  711. const char * const string,
  712. const bool is_start_of_string) {
  713. vector_t matches;
  714. vector_init(&matches, sizeof(match_t), 0);
  715. match_t * match = (match_t *)malloc(sizeof(match_t));
  716. /* Non-existent regex does not match anything.
  717. * Not to be confused with an empty regex.
  718. */
  719. if (regex == NULL) {
  720. goto FINISH;
  721. }
  722. // Find all matches
  723. {
  724. const char * s = string;
  725. int initial_state;
  726. do {
  727. initial_state = (int)(!(is_start_of_string && (s == string)));
  728. *match = (match_t){
  729. ._pos_ptr = NULL,
  730. .width = 0,
  731. };
  732. if (regex_assert(regex, s, initial_state, match)) {
  733. if (match->_pos_ptr) {
  734. match->position = (match->_pos_ptr - string);
  735. } else {
  736. match->position = (s - string);
  737. }
  738. vector_push(&matches, match);
  739. s += ((match->width > 0) ? match->width : 1);
  740. match = (match_t *)malloc(sizeof(match_t));
  741. } else {
  742. ++s;
  743. }
  744. } while (*s != '\0');
  745. }
  746. FINISH:
  747. // Insert sentinel
  748. *match = (match_t){
  749. .position = -1,
  750. .width = -1,
  751. };
  752. vector_push(&matches, match);
  753. // Hide internal vector usage
  754. const size_t data_size = matches.element_size * matches.element_count;
  755. match_t * r = (match_t *)malloc(data_size);
  756. memcpy(r, matches.data, data_size);
  757. vector_free(&matches);
  758. return r;
  759. }
  760. bool regex_search(const regex_t * const regex,
  761. const char * const string) {
  762. match_t * m = regex_match(regex, string, true);
  763. const bool r = !is_sentinel(m);
  764. free(m);
  765. return r;
  766. }