Highlight things
25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

618 lines
17KB

  /* regex.c
   * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
   * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
  4. #include "regex.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. #include <stdlib.h>
  // ------------------
  // ### Char tests ###
  // ------------------
  /* Report whether `c` is one of the quantifier characters '+', '*', '?' or '='.
   * The NUL terminator is never a quantifier. */
  static bool is_quantifier(const char c) {
      static const char quantifiers[] = "+*?=";
      /* strchr() also matches the terminating NUL, so rule that out first. */
      if (c == '\0') {
          return false;
      }
      return strchr(quantifiers, c) != NULL;
  }
  /* Report whether `c` has a special meaning in a pattern: either a quantifier
   * (see is_quantifier) or one of '\\', '[', ']', '.', '^'. */
  bool is_magic(const char c) {
      static const char specials[] = "\\[].^";
      if (is_quantifier(c)) {
          return true;
      }
      /* sizeof - 1 skips the terminating NUL, which must not count as magic. */
      for (size_t i = 0; i < sizeof(specials) - 1; i++) {
          if (specials[i] == c) {
              return true;
          }
      }
      return false;
  }
  // ----------------------
  // ### Internal Types ###
  // ----------------------
  /* A character-driven transition of the compiled automaton. */
  typedef struct {
      int  in;     /* state the transition leaves from */
      char input;  /* character that triggers the transition */
      int  to;     /* state the transition enters */
      int  width;  /* amount added to the string offset when the transition is taken */
  } delta_t;
  /* A transition that is taken regardless of the current input character
   * (looked up by catch_ when no character transition succeeded). */
  typedef struct {
      int in;     /* state the offshoot leaves from */
      int to;     /* state the offshoot enters */
      int width;  /* number of input characters skipped when it is taken */
  } offshoot_t;
  45. typedef struct {
  46. bool * do_catch;
  47. bool * is_negative;
  48. // these might be obsolite but im leaving them for now
  49. bool * do_loop_hook;
  50. bool * do_follow_hook;
  51. bool * do_loop_shoot;
  52. bool * do_follow_shoot;
  53. // ---
  54. int * state;
  55. int * width;
  56. char * whitelist;
  57. char * blacklist;
  58. regex_t * regex;
  59. } compiler_state;
  // ----------------------------------
  // ### Regex creation/destruction ###
  // ----------------------------------
  63. static int escape_1_to_1(const char c, compiler_state * cs) {
  64. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  65. switch (c) {
  66. case 't': {
  67. strcat(target_list, "\t");
  68. } return 1;
  69. case 'n': {
  70. strcat(target_list, "\n");
  71. } return 1;
  72. case 'r': {
  73. strcat(target_list, "\r");
  74. } return 1;
  75. case 'b': {
  76. strcat(target_list, "\b");
  77. } return 1;
  78. case '[': {
  79. strcat(target_list, "[");
  80. } return 1;
  81. case ']': {
  82. strcat(target_list, "]");
  83. } return 1;
  84. case '.': {
  85. strcat(target_list, ".");
  86. } return 1;
  87. case '^': {
  88. strcat(target_list, "^");
  89. } return 1;
  90. case '=': {
  91. strcat(target_list, "=");
  92. } return 1;
  93. case '?': {
  94. strcat(target_list, "?");
  95. } return 1;
  96. case '+': {
  97. strcat(target_list, "+");
  98. } return 1;
  99. case '*': {
  100. strcat(target_list, "*");
  101. } return 1;
  102. case '\\': {
  103. strcat(target_list, "\\");
  104. } return 1;
  105. }
  106. return 0;
  107. }
  108. static int escape_1_to_N(const char c, compiler_state * cs) {
  109. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  110. switch(c) {
  111. case 'i': {
  112. const char identifier_chars[] = "@0123456789_"
  113. "\300\301\302\303\304"
  114. "\305\306\307\310\311"
  115. "\312\313\314\315\316"
  116. "\317\320\321\322\323"
  117. "\324\325\326\327\330"
  118. "\331\332\333\334\335"
  119. "\336\337";
  120. strcpy(target_list, identifier_chars);
  121. return sizeof(identifier_chars)-1;
  122. };
  123. case 'I': {
  124. const char identifier_chars[] = "@_"
  125. "\300\301\302\303\304"
  126. "\305\306\307\310\311"
  127. "\312\313\314\315\316"
  128. "\317\320\321\322\323"
  129. "\324\325\326\327\330"
  130. "\331\332\333\334\335"
  131. "\336\337";
  132. strcpy(target_list, identifier_chars);
  133. return sizeof(identifier_chars)-1;
  134. };
  135. case 'k': {
  136. const char keyword_chars[] = "@0123456789_"
  137. "\300\301\302\303\304"
  138. "\305\306\307\310\311"
  139. "\312\313\314\315\316"
  140. "\317\320\321\322\323"
  141. "\324\325\326\327\330"
  142. "\331\332\333\334\335"
  143. "\336\337";
  144. strcpy(target_list, keyword_chars);
  145. return sizeof(keyword_chars)-1;
  146. };
  147. case 'K': {
  148. const char keyword_chars[] = "@_"
  149. "\300\301\302\303\304"
  150. "\305\306\307\310\311"
  151. "\312\313\314\315\316"
  152. "\317\320\321\322\323"
  153. "\324\325\326\327\330"
  154. "\331\332\333\334\335"
  155. "\336\337";
  156. strcpy(target_list, keyword_chars);
  157. return sizeof(keyword_chars)-1;
  158. };
  159. case 'f': {
  160. const char filename_chars[] = "@0123456789/.-_+,#$%~=";
  161. strcpy(target_list, filename_chars);
  162. return sizeof(filename_chars)-1;
  163. };
  164. case 'F': {
  165. const char filename_chars[] = "@/.-_+,#$%~=";
  166. strcpy(target_list, filename_chars);
  167. return sizeof(filename_chars)-1;
  168. };
  169. case 'p': {
  170. const char printable_chars[] = "@"
  171. "\241\242\243\244\245"
  172. "\246\247\250\251\252"
  173. "\253\254\255\256\257"
  174. "\260\261\262\263\264"
  175. "\265\266\267\270\271"
  176. "\272\273\274\275\276"
  177. "\277"
  178. "\300\301\302\303\304"
  179. "\305\306\307\310\311"
  180. "\312\313\314\315\316"
  181. "\317\320\321\322\323"
  182. "\324\325\326\327\330"
  183. "\331\332\333\334\335"
  184. "\336\337";
  185. strcpy(target_list, printable_chars);
  186. return sizeof(printable_chars)-1;
  187. };
  188. case 'P': {
  189. const char printable_chars[] = "@"
  190. "\241\242\243\244\245"
  191. "\246\247\250\251\252"
  192. "\253\254\255\256\257"
  193. "\260\261\262\263\264"
  194. "\265\266\267\270\271"
  195. "\272\273\274\275\276"
  196. "\277"
  197. "\300\301\302\303\304"
  198. "\305\306\307\310\311"
  199. "\312\313\314\315\316"
  200. "\317\320\321\322\323"
  201. "\324\325\326\327\330"
  202. "\331\332\333\334\335"
  203. "\336\337";
  204. strcpy(target_list, printable_chars);
  205. return sizeof(printable_chars)-1;
  206. };
  207. case 's': {
  208. const char whitespace_chars[] = " \t\v\n";
  209. strcpy(target_list, whitespace_chars);
  210. return sizeof(whitespace_chars)-1;
  211. };
  212. case 'd': {
  213. const char digit_chars[] = "0123456789";
  214. strcpy(target_list, digit_chars);
  215. return sizeof(digit_chars)-1;
  216. };
  217. case 'x': {
  218. const char hex_chars[] = "0123456789"
  219. "abcdef"
  220. "ABCDEF";
  221. strcpy(target_list, hex_chars);
  222. return sizeof(hex_chars)-1;
  223. };
  224. case 'o': {
  225. const char oct_chars[] = "01234567";
  226. strcpy(target_list, oct_chars);
  227. return sizeof(oct_chars)-1;
  228. };
  229. case 'w': {
  230. const char word_chars[] = "0123456789"
  231. "abcdefghijklmnopqrstuwxyz"
  232. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  233. "_";
  234. strcpy(target_list, word_chars);
  235. return sizeof(word_chars)-1;
  236. };
  237. case 'h': {
  238. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  239. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  240. "_";
  241. strcpy(target_list, very_word_chars);
  242. return sizeof(very_word_chars)-1;
  243. };
  244. case 'a': {
  245. const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
  246. "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  247. strcpy(target_list, alpha_chars);
  248. return sizeof(alpha_chars)-1;
  249. };
  250. case 'l': {
  251. const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
  252. strcpy(target_list, lower_alpha_chars);
  253. return sizeof(lower_alpha_chars)-1;
  254. };
  255. case 'u': {
  256. const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  257. strcpy(target_list, upper_alpha_chars);
  258. return sizeof(upper_alpha_chars)-1;
  259. };
  260. }
  261. return 0;
  262. }
  263. static int escape_to_negative(const char c,
  264. compiler_state * cs) {
  265. switch (c) {
  266. case 'D': {
  267. const char digit_chars[] = "0123456789";
  268. strcpy(cs->blacklist, digit_chars);
  269. *cs->is_negative = true;
  270. return sizeof(digit_chars)-1;
  271. };
  272. }
  273. return 0;
  274. }
  //static int compile_hologram(char * hologram, char * whitelist) {
  // if (hologram[0] == '\\') {
  // switch (hologram[1]) {
  // case '<': {
  // const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  // "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  // "_";
  // strcat(whitelist, very_word_chars);
  // is_negative = true;
  // HOOK_ALL(0, whitelist, 0)
  // } break;
  // }
  // }
  //}
  289. static int compile_dot(compiler_state * cs) {
  290. *cs->do_catch = true;
  291. return true;
  292. }
  293. static int compile_escape(const char c,
  294. compiler_state * cs) {
  295. return escape_1_to_1(c, cs)
  296. || escape_1_to_N(c, cs)
  297. || escape_to_negative(c, cs)
  298. //|| compile_hologram(*s, whitelist)
  299. ;
  300. }
  301. static int compile_range(const char * const range,
  302. compiler_state * cs) {
  303. assert((range[0] == '[') && "Not a range.");
  304. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  305. const char * s;
  306. if (range[1] == '^') {
  307. *cs->is_negative = true;
  308. s = range + 2;
  309. } else {
  310. s = range + 1;
  311. }
  312. for (; *s != ']'; s++) {
  313. assert((*s != '\0') && "Unclosed range.");
  314. char c = *s;
  315. if (c == '\\') {
  316. s += 1;
  317. assert(compile_escape(*s, cs) && "Unknown escape.");
  318. } else if (*(s+1) == '-') {
  319. char end = *(s+2);
  320. assert((c < end) && "Endless range.");
  321. for (char cc = c; cc < end+1; cc++) {
  322. strncat(target_list, &cc, 1);
  323. strncat(target_list, "\0", 1);
  324. }
  325. s += 2;
  326. } else {
  327. strncat(target_list, &c, 1);
  328. }
  329. }
  330. return ((s - range) + 1);
  331. }
  /* Append to `filtered` every character of `blacklist` that does not occur
   * in `whitelist`, preserving the blacklist order.
   *
   * All three arguments are NUL-terminated strings. `filtered` is appended
   * to, so it must already hold a valid (possibly empty) string and have room
   * for strlen(blacklist) additional characters plus the terminator.
   *
   * Each blacklist character is checked against the WHOLE whitelist; the
   * whitelist is never consumed between checks. */
  void filter_blacklist(const char * whitelist,
                        const char * blacklist,
                        char * filtered) {
      char * out = filtered + strlen(filtered);

      for (const char * b = blacklist; *b != '\0'; b++) {
          if (strchr(whitelist, *b) == NULL) {
              *out++ = *b;
          }
      }
      *out = '\0';
  }
  345. #define HALT_AND_CATCH_FIRE INT_MIN
  346. void HOOK_ALL( int from,
  347. const char * const str,
  348. int to,
  349. compiler_state * cs) {
  350. int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
  351. for (const char * s = str; *s != '\0'; s++) {
  352. delta_t * delta = malloc(sizeof(delta_t));
  353. delta->in = *cs->state + from;
  354. delta->input = *s;
  355. delta->to = hook_to;
  356. delta->width = *cs->width;
  357. vector_push(&cs->regex->delta_table,
  358. &delta);
  359. }
  360. }
  361. void ABSOLUTE_OFFSHOOT(int from,
  362. int to,
  363. int width,
  364. compiler_state * cs) {
  365. offshoot_t * offshoot = malloc(sizeof(offshoot_t));
  366. offshoot->in = from;
  367. offshoot->to = to;
  368. offshoot->width = width;
  369. vector_push(&cs->regex->catch_table,
  370. &offshoot);
  371. }
  372. void OFFSHOOT(int from,
  373. int to,
  374. int width,
  375. compiler_state * cs) {
  376. ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
  377. }
  378. regex_t * regex_compile(const char * const pattern) {
  379. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  380. regex->str = strdup(pattern);
  381. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  382. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  383. int state = 2;
  384. bool do_catch;
  385. bool is_negative;
  386. bool do_loop_hook;
  387. bool do_follow_hook;
  388. bool do_loop_shoot;
  389. bool do_follow_shoot;
  390. int width;
  391. char whitelist[64];
  392. char blacklist[64];
  393. compiler_state cs = {
  394. .do_catch = &do_catch,
  395. .is_negative = &is_negative,
  396. .state = &state,
  397. .width = &width,
  398. .whitelist = whitelist,
  399. .blacklist = blacklist,
  400. .regex = regex,
  401. };
  402. for (const char * s = pattern; *s != '\00';) {
  403. // Reset the compiler
  404. assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
  405. whitelist[0] = '\0';
  406. blacklist[0] = '\0';
  407. do_catch = false;
  408. is_negative = false;
  409. do_loop_hook = false;
  410. do_follow_hook = false;
  411. do_loop_shoot = false;
  412. do_follow_shoot = false;
  413. width = 1;
  414. // Translate char
  415. switch (*s) {
  416. case '^': {
  417. if (s == pattern) {
  418. ABSOLUTE_OFFSHOOT(0, 2, 0, &cs);
  419. ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
  420. }
  421. whitelist[0] = '\n';
  422. whitelist[1] = '\0';
  423. HOOK_ALL(0, whitelist, 0, &cs);
  424. if (s != pattern) {
  425. state += 1;
  426. }
  427. s += 1;
  428. goto long_continue;
  429. } break;
  430. case '.': {
  431. compile_dot(&cs);
  432. } break;
  433. case '\\': {
  434. s += 1;
  435. assert(compile_escape(*s, &cs) && "Unknown escape.");
  436. } break;
  437. case '[': {
  438. s += compile_range(s, &cs) - 1;
  439. } break;
  440. default: {
  441. whitelist[0] = *s;
  442. whitelist[1] = '\0';
  443. } break;
  444. }
  445. s += 1;
  446. // Compile with quantifier
  447. switch (*s) {
  448. case '=':
  449. case '?': {
  450. do_loop_hook = true;
  451. HOOK_ALL(0, whitelist, +1, &cs);
  452. if (do_catch || is_negative) {
  453. OFFSHOOT(0, +1, 1, &cs);
  454. }
  455. s += 1;
  456. } break;
  457. case '*': {
  458. HOOK_ALL(0, whitelist, 0, &cs);
  459. if (do_catch) {
  460. OFFSHOOT(0, +1, 1, &cs);
  461. } else if (is_negative) {
  462. OFFSHOOT(0, 0, 1, &cs);
  463. }
  464. s += 1;
  465. } break;
  466. case '+': {
  467. HOOK_ALL(0, whitelist, +1, &cs);
  468. if (do_catch || is_negative) {
  469. OFFSHOOT(0, +1, 1, &cs);
  470. }
  471. state += 1;
  472. HOOK_ALL(0, whitelist, 0, &cs);
  473. if (do_catch || is_negative) {
  474. OFFSHOOT(0, 0, 1, &cs);
  475. }
  476. s += 1;
  477. } break;
  478. default: { // Literal
  479. HOOK_ALL(0, whitelist, +1, &cs);
  480. if (do_catch || is_negative) {
  481. OFFSHOOT(0, +1, 1, &cs);
  482. }
  483. state += 1;
  484. } break;
  485. }
  486. // Compile blacklist
  487. if (*blacklist) {
  488. char filtered_blacklist[64];
  489. filtered_blacklist[0] = '\0';
  490. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  491. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
  492. }
  493. long_continue:
  494. }
  495. regex->accepting_state = state;
  496. return regex;
  497. }
  498. int regex_free(regex_t * const regex) {
  499. free(regex->str);
  500. vector_free(&regex->delta_table);
  501. vector_free(&regex->catch_table);
  502. free(regex);
  503. return 0;
  504. }
  // -----------------
  // ### Searching ###
  // -----------------
  508. static int catch_(const regex_t * const regex,
  509. int * const state) {
  510. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  511. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  512. if (offshoot->in == *state) {
  513. *state = offshoot->to;
  514. return offshoot->width;
  515. }
  516. }
  517. return HALT_AND_CATCH_FIRE;
  518. }
  519. static int regex_assert(const regex_t * const regex,
  520. const char * const string,
  521. const int string_offset,
  522. int state,
  523. int width) { // XXX: im pretty sure this is actually redundant and the width should be calculated from string - s
  524. for (const char * s = (string + string_offset); *s != '\00';) {
  525. // delta
  526. for (size_t i = 0; i < regex->delta_table.element_count; i++) {
  527. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  528. if ((delta->in == state)
  529. && (delta->input == *s)) {
  530. int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
  531. if(r){
  532. return r;
  533. }
  534. }
  535. }
  536. const int catch_width = catch_(regex, &state);
  537. if ((catch_width != HALT_AND_CATCH_FIRE)
  538. && (state != HALT_AND_CATCH_FIRE)) {
  539. s += catch_width;
  540. continue;
  541. }
  542. return (state == regex->accepting_state) ? width : false;
  543. }
  544. return false;
  545. }
  546. int regex_match( regex_t * regex,
  547. const char * const string,
  548. const bool is_start_of_string,
  549. const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger
  550. if (regex == NULL) {
  551. return false;
  552. }
  553. if (string == NULL) {
  554. return true;
  555. }
  556. const int initial_state = (int)(!is_start_of_string);
  557. return regex_assert(regex, string, string_offset, initial_state, 0);
  558. }
  559. bool regex_search( regex_t * regex,
  560. const char * const string) {
  561. return (bool)regex_match(regex, string, true, 0);
  562. }