Highlight things
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

619 líneas
17KB

/* regex.c
 * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
 * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
#include "regex.h"

#include <assert.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
// ------------------
// ### Char tests ###
// ------------------
  11. static bool is_quantifier(const char c) {
  12. for (const char * s = "+*?="; *s != '\00'; s++) {
  13. if (*s == c) {
  14. return true;
  15. }
  16. }
  17. return false;
  18. }
  19. bool is_magic(const char c) {
  20. if (is_quantifier(c)) {
  21. return true;
  22. }
  23. for (const char * s = "\\[].^"; *s != '\00'; s++) {
  24. if (*s == c) {
  25. return true;
  26. }
  27. }
  28. return false;
  29. }
// ----------------------
// ### Internal Types ###
// ----------------------
  33. typedef struct {
  34. int in;
  35. char input;
  36. int to;
  37. int width;
  38. } delta_t;
  39. typedef struct {
  40. int in;
  41. int to;
  42. int width;
  43. } offshoot_t;
  44. typedef struct {
  45. bool * do_catch;
  46. bool * is_negative;
  47. // these might be obsolite but im leaving them for now
  48. bool * do_loop_hook;
  49. bool * do_follow_hook;
  50. bool * do_loop_shoot;
  51. bool * do_follow_shoot;
  52. // ---
  53. int * state;
  54. int * width;
  55. char * whitelist;
  56. char * blacklist;
  57. regex_t * regex;
  58. } compiler_state;
// ----------------------------------
// ### Regex creation/destruction ###
// ----------------------------------
  62. static int escape_1_to_1(const char c, compiler_state * cs) {
  63. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  64. switch (c) {
  65. case 't': {
  66. strcat(target_list, "\t");
  67. } return 1;
  68. case 'n': {
  69. strcat(target_list, "\n");
  70. } return 1;
  71. case 'r': {
  72. strcat(target_list, "\r");
  73. } return 1;
  74. case 'b': {
  75. strcat(target_list, "\b");
  76. } return 1;
  77. case '[': {
  78. strcat(target_list, "[");
  79. } return 1;
  80. case ']': {
  81. strcat(target_list, "]");
  82. } return 1;
  83. case '.': {
  84. strcat(target_list, ".");
  85. } return 1;
  86. case '^': {
  87. strcat(target_list, "^");
  88. } return 1;
  89. case '=': {
  90. strcat(target_list, "=");
  91. } return 1;
  92. case '?': {
  93. strcat(target_list, "?");
  94. } return 1;
  95. case '+': {
  96. strcat(target_list, "+");
  97. } return 1;
  98. case '*': {
  99. strcat(target_list, "*");
  100. } return 1;
  101. case '\\': {
  102. strcat(target_list, "\\");
  103. } return 1;
  104. }
  105. return 0;
  106. }
  107. static int escape_1_to_N(const char c, compiler_state * cs) {
  108. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  109. switch(c) {
  110. case 'i': {
  111. const char identifier_chars[] = "@0123456789_"
  112. "\300\301\302\303\304"
  113. "\305\306\307\310\311"
  114. "\312\313\314\315\316"
  115. "\317\320\321\322\323"
  116. "\324\325\326\327\330"
  117. "\331\332\333\334\335"
  118. "\336\337";
  119. strcpy(target_list, identifier_chars);
  120. return sizeof(identifier_chars)-1;
  121. };
  122. case 'I': {
  123. const char identifier_chars[] = "@_"
  124. "\300\301\302\303\304"
  125. "\305\306\307\310\311"
  126. "\312\313\314\315\316"
  127. "\317\320\321\322\323"
  128. "\324\325\326\327\330"
  129. "\331\332\333\334\335"
  130. "\336\337";
  131. strcpy(target_list, identifier_chars);
  132. return sizeof(identifier_chars)-1;
  133. };
  134. case 'k': {
  135. const char keyword_chars[] = "@0123456789_"
  136. "\300\301\302\303\304"
  137. "\305\306\307\310\311"
  138. "\312\313\314\315\316"
  139. "\317\320\321\322\323"
  140. "\324\325\326\327\330"
  141. "\331\332\333\334\335"
  142. "\336\337";
  143. strcpy(target_list, keyword_chars);
  144. return sizeof(keyword_chars)-1;
  145. };
  146. case 'K': {
  147. const char keyword_chars[] = "@_"
  148. "\300\301\302\303\304"
  149. "\305\306\307\310\311"
  150. "\312\313\314\315\316"
  151. "\317\320\321\322\323"
  152. "\324\325\326\327\330"
  153. "\331\332\333\334\335"
  154. "\336\337";
  155. strcpy(target_list, keyword_chars);
  156. return sizeof(keyword_chars)-1;
  157. };
  158. case 'f': {
  159. const char filename_chars[] = "@0123456789/.-_+,#$%~=";
  160. strcpy(target_list, filename_chars);
  161. return sizeof(filename_chars)-1;
  162. };
  163. case 'F': {
  164. const char filename_chars[] = "@/.-_+,#$%~=";
  165. strcpy(target_list, filename_chars);
  166. return sizeof(filename_chars)-1;
  167. };
  168. case 'p': {
  169. const char printable_chars[] = "@"
  170. "\241\242\243\244\245"
  171. "\246\247\250\251\252"
  172. "\253\254\255\256\257"
  173. "\260\261\262\263\264"
  174. "\265\266\267\270\271"
  175. "\272\273\274\275\276"
  176. "\277"
  177. "\300\301\302\303\304"
  178. "\305\306\307\310\311"
  179. "\312\313\314\315\316"
  180. "\317\320\321\322\323"
  181. "\324\325\326\327\330"
  182. "\331\332\333\334\335"
  183. "\336\337";
  184. strcpy(target_list, printable_chars);
  185. return sizeof(printable_chars)-1;
  186. };
  187. case 'P': {
  188. const char printable_chars[] = "@"
  189. "\241\242\243\244\245"
  190. "\246\247\250\251\252"
  191. "\253\254\255\256\257"
  192. "\260\261\262\263\264"
  193. "\265\266\267\270\271"
  194. "\272\273\274\275\276"
  195. "\277"
  196. "\300\301\302\303\304"
  197. "\305\306\307\310\311"
  198. "\312\313\314\315\316"
  199. "\317\320\321\322\323"
  200. "\324\325\326\327\330"
  201. "\331\332\333\334\335"
  202. "\336\337";
  203. strcpy(target_list, printable_chars);
  204. return sizeof(printable_chars)-1;
  205. };
  206. case 's': {
  207. const char whitespace_chars[] = " \t\v\n";
  208. strcpy(target_list, whitespace_chars);
  209. return sizeof(whitespace_chars)-1;
  210. };
  211. case 'd': {
  212. const char digit_chars[] = "0123456789";
  213. strcpy(target_list, digit_chars);
  214. return sizeof(digit_chars)-1;
  215. };
  216. case 'x': {
  217. const char hex_chars[] = "0123456789"
  218. "abcdef"
  219. "ABCDEF";
  220. strcpy(target_list, hex_chars);
  221. return sizeof(hex_chars)-1;
  222. };
  223. case 'o': {
  224. const char oct_chars[] = "01234567";
  225. strcpy(target_list, oct_chars);
  226. return sizeof(oct_chars)-1;
  227. };
  228. case 'w': {
  229. const char word_chars[] = "0123456789"
  230. "abcdefghijklmnopqrstuwxyz"
  231. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  232. "_";
  233. strcpy(target_list, word_chars);
  234. return sizeof(word_chars)-1;
  235. };
  236. case 'h': {
  237. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  238. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  239. "_";
  240. strcpy(target_list, very_word_chars);
  241. return sizeof(very_word_chars)-1;
  242. };
  243. case 'a': {
  244. const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
  245. "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  246. strcpy(target_list, alpha_chars);
  247. return sizeof(alpha_chars)-1;
  248. };
  249. case 'l': {
  250. const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
  251. strcpy(target_list, lower_alpha_chars);
  252. return sizeof(lower_alpha_chars)-1;
  253. };
  254. case 'u': {
  255. const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  256. strcpy(target_list, upper_alpha_chars);
  257. return sizeof(upper_alpha_chars)-1;
  258. };
  259. }
  260. return 0;
  261. }
  262. static int escape_to_negative(const char c,
  263. compiler_state * cs) {
  264. switch (c) {
  265. case 'D': {
  266. const char digit_chars[] = "0123456789";
  267. strcpy(cs->blacklist, digit_chars);
  268. *cs->is_negative = true;
  269. return sizeof(digit_chars)-1;
  270. };
  271. }
  272. return 0;
  273. }
  274. //static int compile_hologram(char * hologram, char * whitelist) {
  275. // if (hologram[0] == '\\') {
  276. // switch (hologram[1]) {
  277. // case '<': {
  278. // const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  279. // "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  280. // "_";
  281. // strcat(whitelist, very_word_chars);
  282. // is_negative = true;
  283. // HOOK_ALL(0, whitelist, 0)
  284. // } break;
  285. // }
  286. // }
  287. //}
  288. static int compile_dot(compiler_state * cs) {
  289. *cs->do_catch = true;
  290. return true;
  291. }
  292. static int compile_escape(const char c,
  293. compiler_state * cs) {
  294. return escape_1_to_1(c, cs)
  295. || escape_1_to_N(c, cs)
  296. || escape_to_negative(c, cs)
  297. //|| compile_hologram(*s, whitelist)
  298. ;
  299. }
  300. static int compile_range(const char * const range,
  301. compiler_state * cs) {
  302. assert((range[0] == '[') && "Not a range.");
  303. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  304. const char * s;
  305. if (range[1] == '^') {
  306. *cs->is_negative = true;
  307. s = range + 2;
  308. } else {
  309. s = range + 1;
  310. }
  311. for (; *s != ']'; s++) {
  312. assert((*s != '\0') && "Unclosed range.");
  313. char c = *s;
  314. if (c == '\\') {
  315. s += 1;
  316. assert(compile_escape(*s, cs) && "Unknown escape.");
  317. } else if (*(s+1) == '-') {
  318. char end = *(s+2);
  319. assert((c < end) && "Endless range.");
  320. for (char cc = c; cc < end+1; cc++) {
  321. strncat(target_list, &cc, 1);
  322. strncat(target_list, "\0", 1);
  323. }
  324. s += 2;
  325. } else {
  326. strncat(target_list, &c, 1);
  327. }
  328. }
  329. return ((s - range) + 1);
  330. }
  331. void filter_blacklist(const char * const whitelist,
  332. const char * const blacklist,
  333. char * const filtered) {
  334. for (char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
  335. for(char * white_pointer = blacklist; *white_pointer != '\0'; white_pointer++) {
  336. if (*black_pointer == *white_pointer) {
  337. goto long_continue;
  338. }
  339. }
  340. strncat(filtered, black_pointer, 1);
  341. long_continue:
  342. }
  343. }
  344. #define HALT_AND_CATCH_FIRE INT_MIN
  345. void HOOK_ALL( int from,
  346. const char * const str,
  347. int to,
  348. compiler_state * cs) {
  349. int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
  350. for (const char * s = str; *s != '\0'; s++) {
  351. delta_t * delta = malloc(sizeof(delta_t));
  352. delta->in = *cs->state + from;
  353. delta->input = *s;
  354. delta->to = hook_to;
  355. delta->width = *cs->width;
  356. vector_push(&cs->regex->delta_table,
  357. &delta);
  358. }
  359. }
  360. void ABSOLUTE_OFFSHOOT(int from,
  361. int to,
  362. int width,
  363. compiler_state * cs) {
  364. offshoot_t * offshoot = malloc(sizeof(offshoot_t));
  365. offshoot->in = from;
  366. offshoot->to = to;
  367. offshoot->width = width;
  368. vector_push(&cs->regex->catch_table,
  369. &offshoot);
  370. }
  371. void OFFSHOOT(int from,
  372. int to,
  373. int width,
  374. compiler_state * cs) {
  375. ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
  376. }
  377. regex_t * regex_compile(const char * const pattern) {
  378. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  379. regex->str = strdup(pattern);
  380. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  381. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  382. int state = 2;
  383. bool do_catch;
  384. bool is_negative;
  385. bool do_loop_hook;
  386. bool do_follow_hook;
  387. bool do_loop_shoot;
  388. bool do_follow_shoot;
  389. int width;
  390. char whitelist[64];
  391. char blacklist[64];
  392. compiler_state cs = {
  393. .do_catch = &do_catch,
  394. .is_negative = &is_negative,
  395. .state = &state,
  396. .width = &width,
  397. .whitelist = whitelist,
  398. .blacklist = blacklist,
  399. .regex = regex,
  400. };
  401. for (const char * s = pattern; *s != '\00';) {
  402. // Reset the compiler
  403. assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
  404. whitelist[0] = '\0';
  405. blacklist[0] = '\0';
  406. do_catch = false;
  407. is_negative = false;
  408. do_loop_hook = false;
  409. do_follow_hook = false;
  410. do_loop_shoot = false;
  411. do_follow_shoot = false;
  412. width = 1;
  413. // Translate char
  414. switch (*s) {
  415. case '^': {
  416. if (s == pattern) {
  417. ABSOLUTE_OFFSHOOT(0, 2, 0, &cs);
  418. ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
  419. }
  420. whitelist[0] = '\n';
  421. whitelist[1] = '\0';
  422. HOOK_ALL(0, whitelist, 0, &cs);
  423. if (s != pattern) {
  424. state += 1;
  425. }
  426. s += 1;
  427. goto long_continue;
  428. } break;
  429. case '.': {
  430. compile_dot(&cs);
  431. } break;
  432. case '\\': {
  433. s += 1;
  434. assert(compile_escape(*s, &cs) && "Unknown escape.");
  435. } break;
  436. case '[': {
  437. s += compile_range(s, &cs) - 1;
  438. } break;
  439. default: {
  440. whitelist[0] = *s;
  441. whitelist[1] = '\0';
  442. } break;
  443. }
  444. s += 1;
  445. // Compile with quantifier
  446. switch (*s) {
  447. case '=':
  448. case '?': {
  449. do_loop_hook = true;
  450. HOOK_ALL(0, whitelist, +1, &cs);
  451. if (do_catch || is_negative) {
  452. OFFSHOOT(0, +1, 1, &cs);
  453. }
  454. s += 1;
  455. } break;
  456. case '*': {
  457. HOOK_ALL(0, whitelist, 0, &cs);
  458. if (do_catch) {
  459. OFFSHOOT(0, +1, 1, &cs);
  460. } else if (is_negative) {
  461. OFFSHOOT(0, 0, 1, &cs);
  462. }
  463. s += 1;
  464. } break;
  465. case '+': {
  466. HOOK_ALL(0, whitelist, +1, &cs);
  467. if (do_catch || is_negative) {
  468. OFFSHOOT(0, +1, 1, &cs);
  469. }
  470. state += 1;
  471. HOOK_ALL(0, whitelist, 0, &cs);
  472. if (do_catch || is_negative) {
  473. OFFSHOOT(0, 0, 1, &cs);
  474. }
  475. s += 1;
  476. } break;
  477. default: { // Literal
  478. HOOK_ALL(0, whitelist, +1, &cs);
  479. if (do_catch || is_negative) {
  480. OFFSHOOT(0, +1, 1, &cs);
  481. }
  482. state += 1;
  483. } break;
  484. }
  485. // Compile blacklist
  486. if (*blacklist) {
  487. char filtered_blacklist[64];
  488. filtered_blacklist[0] = '\0';
  489. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  490. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
  491. }
  492. long_continue:
  493. }
  494. regex->accepting_state = state;
  495. return regex;
  496. }
  497. int regex_free(regex_t * const regex) {
  498. free(regex->str);
  499. vector_free(&regex->delta_table);
  500. vector_free(&regex->catch_table);
  501. free(regex);
  502. return 0;
  503. }
// -----------------
// ### Searching ###
// -----------------
  507. static int catch_(const regex_t * const regex,
  508. int * const state) {
  509. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  510. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  511. if (offshoot->in == *state) {
  512. *state = offshoot->to;
  513. return offshoot->width;
  514. }
  515. }
  516. return HALT_AND_CATCH_FIRE;
  517. }
  518. static int regex_assert(const regex_t * const regex,
  519. const char * const string,
  520. const int string_offset,
  521. int state,
  522. int width) { // XXX: im pretty sure this is actually redundant and the width should be calculated from string - s
  523. for (const char * s = (string + string_offset); *s != '\00';) {
  524. // delta
  525. for (size_t i = 0; i < regex->delta_table.element_count; i++) {
  526. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  527. if ((delta->in == state)
  528. && (delta->input == *s)) {
  529. int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
  530. if(r){
  531. return r;
  532. }
  533. }
  534. }
  535. const int catch_width = catch_(regex, &state);
  536. if ((catch_width != HALT_AND_CATCH_FIRE)
  537. && (state != HALT_AND_CATCH_FIRE)) {
  538. s += catch_width;
  539. continue;
  540. }
  541. return (state == regex->accepting_state) ? width : false;
  542. }
  543. return false;
  544. }
  545. int regex_match( regex_t * regex,
  546. const char * const string,
  547. const bool is_start_of_string,
  548. const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger
  549. if (regex == NULL) {
  550. return false;
  551. }
  552. if (string == NULL) {
  553. return true;
  554. }
  555. const int initial_state = (int)(!is_start_of_string);
  556. return regex_assert(regex, string, string_offset, initial_state, 0);
  557. }
  558. bool regex_search( regex_t * regex,
  559. const char * const string) {
  560. return (bool)regex_match(regex, string, true, 0);
  561. }