Highlight things
Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

659 lines
18KB

  1. /* regex.c
  2. * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
  3. * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
  4. #include "regex.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  9. // ------------------
  10. // ### Char tests ###
  11. // ------------------
  /* Report whether `c` is one of the quantifier characters '+', '*', '?'
   * or '='. The NUL character is never treated as a quantifier. */
  static bool is_quantifier(const char c) {
      static const char quantifiers[] = "+*?=";
      /* strchr() would also find the terminator, so rule NUL out first. */
      return (c != '\0') && (strchr(quantifiers, c) != NULL);
  }
  /* Report whether `c` has a special meaning in a pattern: either a
   * quantifier (see is_quantifier) or one of '\\', '[', ']', '.', '^'.
   * The NUL character is never magic. */
  bool is_magic(const char c) {
      static const char other_magic[] = "\\[].^";
      if (is_quantifier(c)) {
          return true;
      }
      /* strchr() would also find the terminator, so rule NUL out first. */
      return (c != '\0') && (strchr(other_magic, c) != NULL);
  }
  31. // ----------------------
  32. // ### Internal Types ###
  33. // ----------------------
  /* One character-driven transition of the compiled automaton. */
  typedef struct {
      int in;     /* state the transition leaves from */
      char input; /* character that triggers the transition */
      int to;     /* state the transition leads to */
      int width;  /* added to the string position when the transition is followed */
  } delta_t;
  /* A transition that is not tied to a particular input character; it is
   * looked up by source state only. */
  typedef struct {
      int in;    /* state the offshoot leaves from */
      int to;    /* state the offshoot leads to */
      int width; /* how far the string position advances when it is taken */
  } offshoot_t;
  45. typedef struct {
  46. // XXX:
  47. // These should share a mask
  48. // Not even sure why they are pointers to begin with
  49. bool * do_catch;
  50. bool * is_negative;
  51. bool is_at_the_beginning;
  52. bool do_skip;
  53. // these might be obsolite but im leaving them for now
  54. bool * do_loop_hook;
  55. bool * do_follow_hook;
  56. bool * do_loop_shoot;
  57. bool * do_follow_shoot;
  58. // ---
  59. int * state;
  60. int * width;
  61. char * whitelist;
  62. char * blacklist;
  63. regex_t * regex;
  64. } compiler_state;
  65. // ----------------------------------
  66. // ### Regex creation/destruction ###
  67. // ----------------------------------
  68. #define HALT_AND_CATCH_FIRE INT_MIN
  69. static void HOOK_ALL( int from,
  70. const char * const str,
  71. int to,
  72. compiler_state * cs) {
  73. int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
  74. for (const char * s = str; *s != '\0'; s++) {
  75. delta_t * delta = malloc(sizeof(delta_t));
  76. delta->in = *cs->state + from;
  77. delta->input = *s;
  78. delta->to = hook_to;
  79. delta->width = *cs->width;
  80. vector_push(&cs->regex->delta_table,
  81. &delta);
  82. }
  83. }
  84. static void ABSOLUTE_OFFSHOOT(int from,
  85. int to,
  86. int width,
  87. compiler_state * cs) {
  88. offshoot_t * offshoot = malloc(sizeof(offshoot_t));
  89. offshoot->in = from;
  90. offshoot->to = to;
  91. offshoot->width = width;
  92. vector_push(&cs->regex->catch_table,
  93. &offshoot);
  94. }
  95. static void OFFSHOOT(int from,
  96. int to,
  97. int width,
  98. compiler_state * cs) {
  99. ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
  100. }
  101. static int escape_1_to_1(const char c, compiler_state * cs) {
  102. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  103. switch (c) {
  104. case 't': {
  105. strcat(target_list, "\t");
  106. } return 1;
  107. case 'n': {
  108. strcat(target_list, "\n");
  109. } return 1;
  110. case 'r': {
  111. strcat(target_list, "\r");
  112. } return 1;
  113. case 'b': {
  114. strcat(target_list, "\b");
  115. } return 1;
  116. case '[': {
  117. strcat(target_list, "[");
  118. } return 1;
  119. case ']': {
  120. strcat(target_list, "]");
  121. } return 1;
  122. case '.': {
  123. strcat(target_list, ".");
  124. } return 1;
  125. case '^': {
  126. strcat(target_list, "^");
  127. } return 1;
  128. case '=': {
  129. strcat(target_list, "=");
  130. } return 1;
  131. case '?': {
  132. strcat(target_list, "?");
  133. } return 1;
  134. case '+': {
  135. strcat(target_list, "+");
  136. } return 1;
  137. case '*': {
  138. strcat(target_list, "*");
  139. } return 1;
  140. case '\\': {
  141. strcat(target_list, "\\");
  142. } return 1;
  143. }
  144. return 0;
  145. }
  146. static int escape_1_to_N(const char c, compiler_state * cs) {
  147. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  148. switch(c) {
  149. case 'i': {
  150. const char identifier_chars[] = "@0123456789_"
  151. "\300\301\302\303\304"
  152. "\305\306\307\310\311"
  153. "\312\313\314\315\316"
  154. "\317\320\321\322\323"
  155. "\324\325\326\327\330"
  156. "\331\332\333\334\335"
  157. "\336\337";
  158. strcpy(target_list, identifier_chars);
  159. return sizeof(identifier_chars)-1;
  160. };
  161. case 'I': {
  162. const char identifier_chars[] = "@_"
  163. "\300\301\302\303\304"
  164. "\305\306\307\310\311"
  165. "\312\313\314\315\316"
  166. "\317\320\321\322\323"
  167. "\324\325\326\327\330"
  168. "\331\332\333\334\335"
  169. "\336\337";
  170. strcpy(target_list, identifier_chars);
  171. return sizeof(identifier_chars)-1;
  172. };
  173. case 'k': {
  174. const char keyword_chars[] = "@0123456789_"
  175. "\300\301\302\303\304"
  176. "\305\306\307\310\311"
  177. "\312\313\314\315\316"
  178. "\317\320\321\322\323"
  179. "\324\325\326\327\330"
  180. "\331\332\333\334\335"
  181. "\336\337";
  182. strcpy(target_list, keyword_chars);
  183. return sizeof(keyword_chars)-1;
  184. };
  185. case 'K': {
  186. const char keyword_chars[] = "@_"
  187. "\300\301\302\303\304"
  188. "\305\306\307\310\311"
  189. "\312\313\314\315\316"
  190. "\317\320\321\322\323"
  191. "\324\325\326\327\330"
  192. "\331\332\333\334\335"
  193. "\336\337";
  194. strcpy(target_list, keyword_chars);
  195. return sizeof(keyword_chars)-1;
  196. };
  197. case 'f': {
  198. const char filename_chars[] = "@0123456789/.-_+,#$%~=";
  199. strcpy(target_list, filename_chars);
  200. return sizeof(filename_chars)-1;
  201. };
  202. case 'F': {
  203. const char filename_chars[] = "@/.-_+,#$%~=";
  204. strcpy(target_list, filename_chars);
  205. return sizeof(filename_chars)-1;
  206. };
  207. case 'p': {
  208. const char printable_chars[] = "@"
  209. "\241\242\243\244\245"
  210. "\246\247\250\251\252"
  211. "\253\254\255\256\257"
  212. "\260\261\262\263\264"
  213. "\265\266\267\270\271"
  214. "\272\273\274\275\276"
  215. "\277"
  216. "\300\301\302\303\304"
  217. "\305\306\307\310\311"
  218. "\312\313\314\315\316"
  219. "\317\320\321\322\323"
  220. "\324\325\326\327\330"
  221. "\331\332\333\334\335"
  222. "\336\337";
  223. strcpy(target_list, printable_chars);
  224. return sizeof(printable_chars)-1;
  225. };
  226. case 'P': {
  227. const char printable_chars[] = "@"
  228. "\241\242\243\244\245"
  229. "\246\247\250\251\252"
  230. "\253\254\255\256\257"
  231. "\260\261\262\263\264"
  232. "\265\266\267\270\271"
  233. "\272\273\274\275\276"
  234. "\277"
  235. "\300\301\302\303\304"
  236. "\305\306\307\310\311"
  237. "\312\313\314\315\316"
  238. "\317\320\321\322\323"
  239. "\324\325\326\327\330"
  240. "\331\332\333\334\335"
  241. "\336\337";
  242. strcpy(target_list, printable_chars);
  243. return sizeof(printable_chars)-1;
  244. };
  245. case 's': {
  246. const char whitespace_chars[] = " \t\v\n";
  247. strcpy(target_list, whitespace_chars);
  248. return sizeof(whitespace_chars)-1;
  249. };
  250. case 'd': {
  251. const char digit_chars[] = "0123456789";
  252. strcpy(target_list, digit_chars);
  253. return sizeof(digit_chars)-1;
  254. };
  255. case 'x': {
  256. const char hex_chars[] = "0123456789"
  257. "abcdef"
  258. "ABCDEF";
  259. strcpy(target_list, hex_chars);
  260. return sizeof(hex_chars)-1;
  261. };
  262. case 'o': {
  263. const char oct_chars[] = "01234567";
  264. strcpy(target_list, oct_chars);
  265. return sizeof(oct_chars)-1;
  266. };
  267. case 'w': {
  268. const char word_chars[] = "0123456789"
  269. "abcdefghijklmnopqrstuwxyz"
  270. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  271. "_";
  272. strcpy(target_list, word_chars);
  273. return sizeof(word_chars)-1;
  274. };
  275. case 'h': {
  276. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  277. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  278. "_";
  279. strcpy(target_list, very_word_chars);
  280. return sizeof(very_word_chars)-1;
  281. };
  282. case 'a': {
  283. const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
  284. "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  285. strcpy(target_list, alpha_chars);
  286. return sizeof(alpha_chars)-1;
  287. };
  288. case 'l': {
  289. const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
  290. strcpy(target_list, lower_alpha_chars);
  291. return sizeof(lower_alpha_chars)-1;
  292. };
  293. case 'u': {
  294. const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  295. strcpy(target_list, upper_alpha_chars);
  296. return sizeof(upper_alpha_chars)-1;
  297. };
  298. }
  299. return 0;
  300. }
  301. static int escape_to_negative(const char c,
  302. compiler_state * cs) {
  303. switch (c) {
  304. case 'D': {
  305. const char digit_chars[] = "0123456789";
  306. strcpy(cs->blacklist, digit_chars);
  307. *cs->is_negative = true;
  308. return sizeof(digit_chars)-1;
  309. };
  310. }
  311. return 0;
  312. }
  313. static int escape_hologram(const char c, compiler_state * cs) {
  314. switch (c) {
  315. case '<': {
  316. if (cs->is_at_the_beginning) {
  317. ABSOLUTE_OFFSHOOT(0, 2, 0, cs);
  318. cs->do_skip = true;
  319. }
  320. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  321. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  322. "_";
  323. *cs->is_negative = true; // effectless currently; should be used to trigger the following lines in the main compile loop
  324. strcat(cs->blacklist, very_word_chars);
  325. HOOK_ALL(0, cs->blacklist, HALT_AND_CATCH_FIRE, cs);
  326. OFFSHOOT(0, 0, 1, cs);
  327. return sizeof(very_word_chars)-1;
  328. };
  329. case '>': {
  330. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  331. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  332. "_";
  333. *cs->is_negative = true;
  334. strcat(cs->blacklist, very_word_chars);
  335. return 1;
  336. }
  337. }
  338. return 0;
  339. }
  340. static int compile_dot(compiler_state * cs) {
  341. *cs->do_catch = true;
  342. return true;
  343. }
  344. static int compile_escape(const char c,
  345. compiler_state * cs) {
  346. return escape_1_to_1(c, cs)
  347. || escape_1_to_N(c, cs)
  348. || escape_to_negative(c, cs)
  349. || escape_hologram(c, cs)
  350. ;
  351. }
  352. static int compile_range(const char * const range,
  353. compiler_state * cs) {
  354. assert((range[0] == '[') && "Not a range.");
  355. const char * s;
  356. if (range[1] == '^') {
  357. *cs->is_negative = true;
  358. s = range + 2;
  359. } else {
  360. s = range + 1;
  361. }
  362. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  363. for (; *s != ']'; s++) {
  364. assert((*s != '\0') && "Unclosed range.");
  365. char c = *s;
  366. if (c == '\\') {
  367. s += 1;
  368. assert(compile_escape(*s, cs) && "Unknown escape.");
  369. } else if (*(s+1) == '-') {
  370. char end = *(s+2);
  371. assert((c < end) && "Endless range.");
  372. for (char cc = c; cc < end+1; cc++) {
  373. strncat(target_list, &cc, 1);
  374. strncat(target_list, "\0", 1);
  375. }
  376. s += 2;
  377. } else {
  378. strncat(target_list, &c, 1);
  379. }
  380. }
  381. return ((s - range) + 1);
  382. }
  /* Append to `filtered` every character of `blacklist` that does not occur
   * in `whitelist`, keeping the blacklist order.
   *
   * whitelist: NUL-terminated list of characters to leave out
   * blacklist: NUL-terminated list of candidate characters
   * filtered:  NUL-terminated output string; new characters are appended to
   *            its current content and it must be large enough for them
   *
   * The whitelist is searched from its start for every blacklist character.
   * The previous code advanced the whitelist pointer itself and never
   * rewound it, so only the first character was really filtered. */
  void filter_blacklist(const char * whitelist,
                        const char * blacklist,
                        char * filtered) {
      char * out = filtered + strlen(filtered);

      for (; *blacklist != '\0'; blacklist++) {
          /* *blacklist is never NUL here, so strchr() cannot match the
           * whitelist terminator. */
          if (strchr(whitelist, *blacklist) != NULL) {
              continue;
          }
          *out++ = *blacklist;
      }
      *out = '\0';
  }
  397. regex_t * regex_compile(const char * const pattern) {
  398. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  399. regex->str = strdup(pattern);
  400. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  401. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  402. int state = 2;
  403. // this is plain retarded
  404. bool do_catch;
  405. bool is_negative;
  406. bool do_loop_hook;
  407. bool do_follow_hook;
  408. bool do_loop_shoot;
  409. bool do_follow_shoot;
  410. int width;
  411. char whitelist[64];
  412. char blacklist[64];
  413. compiler_state cs = {
  414. .do_catch = &do_catch,
  415. .is_negative = &is_negative,
  416. .is_at_the_beginning = true,
  417. .do_skip = false,
  418. .state = &state,
  419. .width = &width,
  420. .whitelist = whitelist,
  421. .blacklist = blacklist,
  422. .regex = regex,
  423. };
  424. for (const char * s = pattern; *s != '\00';) {
  425. // Reset the compiler
  426. assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
  427. whitelist[0] = '\0';
  428. blacklist[0] = '\0';
  429. do_catch = false;
  430. is_negative = false;
  431. cs.do_skip = false;
  432. /**/
  433. do_loop_hook = false;
  434. do_follow_hook = false;
  435. do_loop_shoot = false;
  436. do_follow_shoot = false;
  437. /**/
  438. width = 1;
  439. // Translate char
  440. switch (*s) {
  441. case '^': {
  442. if (cs.is_at_the_beginning) {
  443. ABSOLUTE_OFFSHOOT(0, 2, 0, &cs);
  444. ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
  445. }
  446. whitelist[0] = '\n';
  447. whitelist[1] = '\0';
  448. HOOK_ALL(0, whitelist, 0, &cs);
  449. if (s != pattern) {
  450. state += 1;
  451. }
  452. cs.do_skip = true;
  453. } break;
  454. case '.': {
  455. compile_dot(&cs);
  456. } break;
  457. case '\\': {
  458. s += 1;
  459. assert(compile_escape(*s, &cs) && "Unknown escape.");
  460. } break;
  461. case '[': {
  462. s += compile_range(s, &cs) - 1;
  463. } break;
  464. default: {
  465. whitelist[0] = *s;
  466. whitelist[1] = '\0';
  467. } break;
  468. }
  469. s += 1;
  470. if (cs.do_skip) {
  471. goto long_continue;
  472. }
  473. // Compile with quantifier
  474. switch (*s) {
  475. case '=':
  476. case '?': {
  477. do_loop_hook = true;
  478. HOOK_ALL(0, whitelist, +1, &cs);
  479. if (do_catch || is_negative) {
  480. OFFSHOOT(0, +1, 1, &cs);
  481. }
  482. s += 1;
  483. } break;
  484. case '*': {
  485. HOOK_ALL(0, whitelist, 0, &cs);
  486. if (do_catch) {
  487. OFFSHOOT(0, +1, 1, &cs);
  488. } else if (is_negative) {
  489. OFFSHOOT(0, 0, 1, &cs);
  490. }
  491. s += 1;
  492. } break;
  493. case '+': {
  494. HOOK_ALL(0, whitelist, +1, &cs);
  495. if (do_catch || is_negative) {
  496. OFFSHOOT(0, +1, 1, &cs);
  497. }
  498. state += 1;
  499. HOOK_ALL(0, whitelist, 0, &cs);
  500. if (do_catch || is_negative) {
  501. OFFSHOOT(0, 0, 1, &cs);
  502. }
  503. s += 1;
  504. } break;
  505. default: { // Literal
  506. HOOK_ALL(0, whitelist, +1, &cs);
  507. if (do_catch || is_negative) {
  508. OFFSHOOT(0, +1, 1, &cs);
  509. }
  510. state += 1;
  511. } break;
  512. }
  513. // Compile blacklist
  514. if (*blacklist) {
  515. char filtered_blacklist[64];
  516. filtered_blacklist[0] = '\0';
  517. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  518. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
  519. }
  520. long_continue:
  521. cs.is_at_the_beginning = false;
  522. }
  523. regex->accepting_state = state;
  524. return regex;
  525. }
  526. int regex_free(regex_t * const regex) {
  527. free(regex->str);
  528. vector_free(&regex->delta_table);
  529. vector_free(&regex->catch_table);
  530. free(regex);
  531. return 0;
  532. }
  533. // -----------------
  534. // ### Searching ###
  535. // -----------------
  536. static int catch_(const regex_t * const regex,
  537. int * const state) {
  538. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  539. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  540. if (offshoot->in == *state) {
  541. *state = offshoot->to;
  542. return offshoot->width;
  543. }
  544. }
  545. return HALT_AND_CATCH_FIRE;
  546. }
  547. static int regex_assert(const regex_t * const regex,
  548. const char * const string,
  549. const int string_offset,
  550. int state,
  551. int width) { // XXX: im pretty sure this is actually redundant and the width should be calculated from string - s
  552. for (const char * s = (string + string_offset); *s != '\00';) {
  553. // XXX: this should be a jump search for the instate and then a linear
  554. // delta
  555. //int left = 0;
  556. //int right = regex->delta_table.element_count - 1;
  557. //int i;
  558. //while(left <= right) }
  559. for (size_t i = 0; i < regex->delta_table.element_count; i++) {
  560. //i = (left + right) / 2;
  561. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  562. if ((delta->in == state)
  563. && (delta->input == *s)) {
  564. int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
  565. if(r){
  566. return r;
  567. }
  568. }
  569. }
  570. const int catch_width = catch_(regex, &state);
  571. if ((catch_width != HALT_AND_CATCH_FIRE)
  572. && (state != HALT_AND_CATCH_FIRE)) {
  573. s += catch_width;
  574. continue;
  575. }
  576. // XXX: the extra catch might not be necessary if we were to compile to a simpler form
  577. catch_(regex, &state);
  578. return (state == regex->accepting_state) ? width : false;
  579. }
  580. return false;
  581. }
  582. int regex_match( regex_t * regex,
  583. const char * const string,
  584. const bool is_start_of_string,
  585. const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger
  586. if (regex == NULL) {
  587. return false;
  588. }
  589. if (string == NULL) {
  590. return true;
  591. }
  592. const int initial_state = (int)(!is_start_of_string);
  593. return regex_assert(regex, string, string_offset, initial_state, 0);
  594. }
  595. bool regex_search( regex_t * regex,
  596. const char * const string) {
  597. return (bool)regex_match(regex, string, true, 0);
  598. }