Highlight things
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

619 lines
17KB

  1. /* regex.c
  2. * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
  3. * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
  4. #include "regex.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  9. #include <stdbool.h>
  10. // ------------------
  11. // ### Char tests ###
  12. // ------------------
  13. static bool is_quantifier(const char c) {
  14. for (const char * s = "+*?="; *s != '\00'; s++) {
  15. if (*s == c) {
  16. return true;
  17. }
  18. }
  19. return false;
  20. }
  21. bool is_magic(const char c) {
  22. if (is_quantifier(c)) {
  23. return true;
  24. }
  25. for (const char * s = "\\[].^"; *s != '\00'; s++) {
  26. if (*s == c) {
  27. return true;
  28. }
  29. }
  30. return false;
  31. }
  32. // ----------------------
  33. // ### Internal Types ###
  34. // ----------------------
// One character-driven transition of the compiled automaton.
// Records are created by HOOK_ALL and scanned linearly by regex_assert.
typedef struct {
	int in;     // state the transition leaves from
	char input; // character that must be under the cursor for it to fire
	int to;     // state entered afterwards (HOOK_ALL may store HALT_AND_CATCH_FIRE)
	int width;  // added to the string offset when regex_assert follows the transition
} delta_t;
// Transition that catch_() selects by state alone, without looking at the
// input character. Records are created by ABSOLUTE_OFFSHOOT.
typedef struct {
	int in;    // state the offshoot applies to
	int to;    // state catch_() switches to
	int width; // value catch_() returns; regex_assert advances the cursor by it
} offshoot_t;
// Bundle of pointers into regex_compile's working variables, handed to the
// compile helpers so they can read and update them.
typedef struct {
	bool * do_catch;    // set by compile_dot; makes regex_compile emit an offshoot
	bool * is_negative; // set by "[^" ranges and \D; selects blacklist over whitelist
	// these might be obsolete but I'm leaving them for now
	// (none of the four is read by the code visible in this file)
	bool * do_loop_hook;
	bool * do_follow_hook;
	bool * do_loop_shoot;
	bool * do_follow_shoot;
	// ---
	int * state;        // current state number; HOOK_ALL/OFFSHOOT offsets are relative to it
	int * width;        // copied into every delta created by HOOK_ALL
	char * whitelist;   // NUL-terminated list of characters collected for the current atom
	char * blacklist;   // NUL-terminated list filled instead of whitelist while *is_negative is set
	regex_t * regex;    // regex whose delta/catch tables receive the generated records
} compiler_state;
  61. // ----------------------------------
  62. // ### Regex creation/destruction ###
  63. // ----------------------------------
  64. static int escape_1_to_1(const char c, compiler_state * cs) {
  65. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  66. switch (c) {
  67. case 't': {
  68. strcat(target_list, "\t");
  69. } return 1;
  70. case 'n': {
  71. strcat(target_list, "\n");
  72. } return 1;
  73. case 'r': {
  74. strcat(target_list, "\r");
  75. } return 1;
  76. case 'b': {
  77. strcat(target_list, "\b");
  78. } return 1;
  79. case '[': {
  80. strcat(target_list, "[");
  81. } return 1;
  82. case ']': {
  83. strcat(target_list, "]");
  84. } return 1;
  85. case '.': {
  86. strcat(target_list, ".");
  87. } return 1;
  88. case '^': {
  89. strcat(target_list, "^");
  90. } return 1;
  91. case '=': {
  92. strcat(target_list, "=");
  93. } return 1;
  94. case '?': {
  95. strcat(target_list, "?");
  96. } return 1;
  97. case '+': {
  98. strcat(target_list, "+");
  99. } return 1;
  100. case '*': {
  101. strcat(target_list, "*");
  102. } return 1;
  103. case '\\': {
  104. strcat(target_list, "\\");
  105. } return 1;
  106. }
  107. return 0;
  108. }
  109. static int escape_1_to_N(const char c, compiler_state * cs) {
  110. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  111. switch(c) {
  112. case 'i': {
  113. const char identifier_chars[] = "@0123456789_"
  114. "\300\301\302\303\304"
  115. "\305\306\307\310\311"
  116. "\312\313\314\315\316"
  117. "\317\320\321\322\323"
  118. "\324\325\326\327\330"
  119. "\331\332\333\334\335"
  120. "\336\337";
  121. strcpy(target_list, identifier_chars);
  122. return sizeof(identifier_chars)-1;
  123. };
  124. case 'I': {
  125. const char identifier_chars[] = "@_"
  126. "\300\301\302\303\304"
  127. "\305\306\307\310\311"
  128. "\312\313\314\315\316"
  129. "\317\320\321\322\323"
  130. "\324\325\326\327\330"
  131. "\331\332\333\334\335"
  132. "\336\337";
  133. strcpy(target_list, identifier_chars);
  134. return sizeof(identifier_chars)-1;
  135. };
  136. case 'k': {
  137. const char keyword_chars[] = "@0123456789_"
  138. "\300\301\302\303\304"
  139. "\305\306\307\310\311"
  140. "\312\313\314\315\316"
  141. "\317\320\321\322\323"
  142. "\324\325\326\327\330"
  143. "\331\332\333\334\335"
  144. "\336\337";
  145. strcpy(target_list, keyword_chars);
  146. return sizeof(keyword_chars)-1;
  147. };
  148. case 'K': {
  149. const char keyword_chars[] = "@_"
  150. "\300\301\302\303\304"
  151. "\305\306\307\310\311"
  152. "\312\313\314\315\316"
  153. "\317\320\321\322\323"
  154. "\324\325\326\327\330"
  155. "\331\332\333\334\335"
  156. "\336\337";
  157. strcpy(target_list, keyword_chars);
  158. return sizeof(keyword_chars)-1;
  159. };
  160. case 'f': {
  161. const char filename_chars[] = "@0123456789/.-_+,#$%~=";
  162. strcpy(target_list, filename_chars);
  163. return sizeof(filename_chars)-1;
  164. };
  165. case 'F': {
  166. const char filename_chars[] = "@/.-_+,#$%~=";
  167. strcpy(target_list, filename_chars);
  168. return sizeof(filename_chars)-1;
  169. };
  170. case 'p': {
  171. const char printable_chars[] = "@"
  172. "\241\242\243\244\245"
  173. "\246\247\250\251\252"
  174. "\253\254\255\256\257"
  175. "\260\261\262\263\264"
  176. "\265\266\267\270\271"
  177. "\272\273\274\275\276"
  178. "\277"
  179. "\300\301\302\303\304"
  180. "\305\306\307\310\311"
  181. "\312\313\314\315\316"
  182. "\317\320\321\322\323"
  183. "\324\325\326\327\330"
  184. "\331\332\333\334\335"
  185. "\336\337";
  186. strcpy(target_list, printable_chars);
  187. return sizeof(printable_chars)-1;
  188. };
  189. case 'P': {
  190. const char printable_chars[] = "@"
  191. "\241\242\243\244\245"
  192. "\246\247\250\251\252"
  193. "\253\254\255\256\257"
  194. "\260\261\262\263\264"
  195. "\265\266\267\270\271"
  196. "\272\273\274\275\276"
  197. "\277"
  198. "\300\301\302\303\304"
  199. "\305\306\307\310\311"
  200. "\312\313\314\315\316"
  201. "\317\320\321\322\323"
  202. "\324\325\326\327\330"
  203. "\331\332\333\334\335"
  204. "\336\337";
  205. strcpy(target_list, printable_chars);
  206. return sizeof(printable_chars)-1;
  207. };
  208. case 's': {
  209. const char whitespace_chars[] = " \t\v\n";
  210. strcpy(target_list, whitespace_chars);
  211. return sizeof(whitespace_chars)-1;
  212. };
  213. case 'd': {
  214. const char digit_chars[] = "0123456789";
  215. strcpy(target_list, digit_chars);
  216. return sizeof(digit_chars)-1;
  217. };
  218. case 'x': {
  219. const char hex_chars[] = "0123456789"
  220. "abcdef"
  221. "ABCDEF";
  222. strcpy(target_list, hex_chars);
  223. return sizeof(hex_chars)-1;
  224. };
  225. case 'o': {
  226. const char oct_chars[] = "01234567";
  227. strcpy(target_list, oct_chars);
  228. return sizeof(oct_chars)-1;
  229. };
  230. case 'w': {
  231. const char word_chars[] = "0123456789"
  232. "abcdefghijklmnopqrstuwxyz"
  233. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  234. "_";
  235. strcpy(target_list, word_chars);
  236. return sizeof(word_chars)-1;
  237. };
  238. case 'h': {
  239. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  240. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  241. "_";
  242. strcpy(target_list, very_word_chars);
  243. return sizeof(very_word_chars)-1;
  244. };
  245. case 'a': {
  246. const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
  247. "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  248. strcpy(target_list, alpha_chars);
  249. return sizeof(alpha_chars)-1;
  250. };
  251. case 'l': {
  252. const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
  253. strcpy(target_list, lower_alpha_chars);
  254. return sizeof(lower_alpha_chars)-1;
  255. };
  256. case 'u': {
  257. const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  258. strcpy(target_list, upper_alpha_chars);
  259. return sizeof(upper_alpha_chars)-1;
  260. };
  261. }
  262. return 0;
  263. }
  264. static int escape_to_negative(const char c,
  265. compiler_state * cs) {
  266. switch (c) {
  267. case 'D': {
  268. const char digit_chars[] = "0123456789";
  269. strcpy(cs->blacklist, digit_chars);
  270. *cs->is_negative = true;
  271. return sizeof(digit_chars)-1;
  272. };
  273. }
  274. return 0;
  275. }
  276. //static int compile_hologram(char * hologram, char * whitelist) {
  277. // if (hologram[0] == '\\') {
  278. // switch (hologram[1]) {
  279. // case '<': {
  280. // const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  281. // "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  282. // "_";
  283. // strcat(whitelist, very_word_chars);
  284. // is_negative = true;
  285. // HOOK_ALL(0, whitelist, 0)
  286. // } break;
  287. // }
  288. // }
  289. //}
  290. static int compile_dot(compiler_state * cs) {
  291. *cs->do_catch = true;
  292. return true;
  293. }
  294. static int compile_escape(const char c,
  295. compiler_state * cs) {
  296. return escape_1_to_1(c, cs)
  297. || escape_1_to_N(c, cs)
  298. || escape_to_negative(c, cs)
  299. //|| compile_hologram(*s, whitelist)
  300. ;
  301. }
  302. static int compile_range(const char * const range,
  303. compiler_state * cs) {
  304. assert((range[0] == '[') && "Not a range.");
  305. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  306. const char * s;
  307. if (range[1] == '^') {
  308. *cs->is_negative = true;
  309. s = range + 2;
  310. } else {
  311. s = range + 1;
  312. }
  313. for (; *s != ']'; s++) {
  314. assert((*s != '\0') && "Unclosed range.");
  315. char c = *s;
  316. if (c == '\\') {
  317. s += 1;
  318. assert(compile_escape(*s, cs) && "Unknown escape.");
  319. } else if (*(s+1) == '-') {
  320. char end = *(s+2);
  321. assert((c < end) && "Endless range.");
  322. for (char cc = c; cc < end+1; cc++) {
  323. strncat(target_list, &cc, 1);
  324. strncat(target_list, "\0", 1);
  325. }
  326. s += 2;
  327. } else {
  328. strncat(target_list, &c, 1);
  329. }
  330. }
  331. return ((s - range) + 1);
  332. }
  333. void filter_blacklist(const char * whitelist,
  334. const char * blacklist,
  335. char * filtered) {
  336. for (; *blacklist != '\0'; blacklist++) {
  337. for(; *whitelist != '\0'; whitelist++) {
  338. if (*blacklist == *whitelist) {
  339. goto long_continue;
  340. }
  341. }
  342. strncat(filtered, blacklist, 1);
  343. long_continue:;
  344. }
  345. }
// Sentinel state / width value. HOOK_ALL stores it as a transition target
// instead of a relative state, and catch_() returns it when no offshoot
// applies to the current state.
#define HALT_AND_CATCH_FIRE INT_MIN
  347. void HOOK_ALL( int from,
  348. const char * const str,
  349. int to,
  350. compiler_state * cs) {
  351. int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
  352. for (const char * s = str; *s != '\0'; s++) {
  353. delta_t * delta = malloc(sizeof(delta_t));
  354. delta->in = *cs->state + from;
  355. delta->input = *s;
  356. delta->to = hook_to;
  357. delta->width = *cs->width;
  358. vector_push(&cs->regex->delta_table,
  359. &delta);
  360. }
  361. }
  362. void ABSOLUTE_OFFSHOOT(int from,
  363. int to,
  364. int width,
  365. compiler_state * cs) {
  366. offshoot_t * offshoot = malloc(sizeof(offshoot_t));
  367. offshoot->in = from;
  368. offshoot->to = to;
  369. offshoot->width = width;
  370. vector_push(&cs->regex->catch_table,
  371. &offshoot);
  372. }
  373. void OFFSHOOT(int from,
  374. int to,
  375. int width,
  376. compiler_state * cs) {
  377. ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
  378. }
  379. regex_t * regex_compile(const char * const pattern) {
  380. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  381. regex->str = strdup(pattern);
  382. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  383. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  384. int state = 2;
  385. bool do_catch;
  386. bool is_negative;
  387. bool do_loop_hook;
  388. bool do_follow_hook;
  389. bool do_loop_shoot;
  390. bool do_follow_shoot;
  391. int width;
  392. char whitelist[64];
  393. char blacklist[64];
  394. compiler_state cs = {
  395. .do_catch = &do_catch,
  396. .is_negative = &is_negative,
  397. .state = &state,
  398. .width = &width,
  399. .whitelist = whitelist,
  400. .blacklist = blacklist,
  401. .regex = regex,
  402. };
  403. for (const char * s = pattern; *s != '\00';) {
  404. // Reset the compiler
  405. assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
  406. whitelist[0] = '\0';
  407. blacklist[0] = '\0';
  408. do_catch = false;
  409. is_negative = false;
  410. do_loop_hook = false;
  411. do_follow_hook = false;
  412. do_loop_shoot = false;
  413. do_follow_shoot = false;
  414. width = 1;
  415. // Translate char
  416. switch (*s) {
  417. case '^': {
  418. if (s == pattern) {
  419. ABSOLUTE_OFFSHOOT(0, 2, 0, &cs);
  420. ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
  421. }
  422. whitelist[0] = '\n';
  423. whitelist[1] = '\0';
  424. HOOK_ALL(0, whitelist, 0, &cs);
  425. if (s != pattern) {
  426. state += 1;
  427. }
  428. s += 1;
  429. goto long_continue;
  430. } break;
  431. case '.': {
  432. compile_dot(&cs);
  433. } break;
  434. case '\\': {
  435. s += 1;
  436. assert(compile_escape(*s, &cs) && "Unknown escape.");
  437. } break;
  438. case '[': {
  439. s += compile_range(s, &cs) - 1;
  440. } break;
  441. default: {
  442. whitelist[0] = *s;
  443. whitelist[1] = '\0';
  444. } break;
  445. }
  446. s += 1;
  447. // Compile with quantifier
  448. switch (*s) {
  449. case '=':
  450. case '?': {
  451. do_loop_hook = true;
  452. HOOK_ALL(0, whitelist, +1, &cs);
  453. if (do_catch || is_negative) {
  454. OFFSHOOT(0, +1, 1, &cs);
  455. }
  456. s += 1;
  457. } break;
  458. case '*': {
  459. HOOK_ALL(0, whitelist, 0, &cs);
  460. if (do_catch) {
  461. OFFSHOOT(0, +1, 1, &cs);
  462. } else if (is_negative) {
  463. OFFSHOOT(0, 0, 1, &cs);
  464. }
  465. s += 1;
  466. } break;
  467. case '+': {
  468. HOOK_ALL(0, whitelist, +1, &cs);
  469. if (do_catch || is_negative) {
  470. OFFSHOOT(0, +1, 1, &cs);
  471. }
  472. state += 1;
  473. HOOK_ALL(0, whitelist, 0, &cs);
  474. if (do_catch || is_negative) {
  475. OFFSHOOT(0, 0, 1, &cs);
  476. }
  477. s += 1;
  478. } break;
  479. default: { // Literal
  480. HOOK_ALL(0, whitelist, +1, &cs);
  481. if (do_catch || is_negative) {
  482. OFFSHOOT(0, +1, 1, &cs);
  483. }
  484. state += 1;
  485. } break;
  486. }
  487. // Compile blacklist
  488. if (*blacklist) {
  489. char filtered_blacklist[64];
  490. filtered_blacklist[0] = '\0';
  491. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  492. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
  493. }
  494. long_continue:;
  495. }
  496. regex->accepting_state = state;
  497. return regex;
  498. }
  499. int regex_free(regex_t * const regex) {
  500. free(regex->str);
  501. vector_free(&regex->delta_table);
  502. vector_free(&regex->catch_table);
  503. free(regex);
  504. return 0;
  505. }
  506. // -----------------
  507. // ### Searching ###
  508. // -----------------
  509. static int catch_(const regex_t * const regex,
  510. int * const state) {
  511. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  512. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  513. if (offshoot->in == *state) {
  514. *state = offshoot->to;
  515. return offshoot->width;
  516. }
  517. }
  518. return HALT_AND_CATCH_FIRE;
  519. }
// Runs the compiled automaton over `string`, starting at `string_offset`
// in state `state`.
//
// At each position every delta leaving the current state on the current
// character is tried recursively (with `width + 1`); the first non-zero
// result wins. If none succeeds, catch_() is asked for an offshoot: when it
// yields one that does not lead to HALT_AND_CATCH_FIRE, the cursor advances
// by its width and the loop continues in the new state. Otherwise the walk
// ends here and returns `width` if the state is the accepting state, and
// false (0) if not.
//
// Also returns false when the end of the string is reached.
// NOTE(review): a walk that ends in the accepting state with width == 0
// returns 0, which callers cannot tell apart from "no match".
static int regex_assert(const regex_t * const         regex,
                        const char    * const         string,
                        const int                     string_offset,
                              int                     state,
                              int                     width) { // XXX: probably redundant; the width could be derived from the cursor position instead
	for (const char * s = (string + string_offset); *s != '\00';) {
		// delta: try every transition that fits the current state and character
		for (size_t i = 0; i < regex->delta_table.element_count; i++) {
			const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
			if ((delta->in == state)
			&&  (delta->input == *s)) {
				int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
				if(r){
					return r;
				}
			}
		}

		// no delta led to a match: fall back to an offshoot (this may change `state`)
		const int catch_width = catch_(regex, &state);
		if ((catch_width != HALT_AND_CATCH_FIRE)
		&&  (state != HALT_AND_CATCH_FIRE)) {
			s += catch_width;
			continue;
		}

		return (state == regex->accepting_state) ? width : false;
	}

	return false;
}
  547. int regex_match( regex_t * regex,
  548. const char * const string,
  549. const bool is_start_of_string,
  550. const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger
  551. if (regex == NULL) {
  552. return false;
  553. }
  554. if (string == NULL) {
  555. return true;
  556. }
  557. const int initial_state = (int)(!is_start_of_string);
  558. return regex_assert(regex, string, string_offset, initial_state, 0);
  559. }
  560. bool regex_search( regex_t * regex,
  561. const char * const string) {
  562. return (bool)regex_match(regex, string, true, 0);
  563. }