Highlight things
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

569 lines
15KB

  1. /* regex.c
  2. * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
  3. * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
  4. #include "regex.h"
  5. #include <assert.h>
  6. #include <string.h>
  7. #include <limits.h>
  8. // ------------------
  9. // ### Char tests ###
  10. // ------------------
  11. static bool is_quantifier(const char c) {
  12. for (const char * s = "+*?="; *s != '\00'; s++) {
  13. if (*s == c) {
  14. return true;
  15. }
  16. }
  17. return false;
  18. }
  19. bool is_magic(const char c) {
  20. if (is_quantifier(c)) {
  21. return true;
  22. }
  23. for (const char * s = "\\[]."; *s != '\00'; s++) {
  24. if (*s == c) {
  25. return true;
  26. }
  27. }
  28. return false;
  29. }
  30. // ----------------------
  31. // ### Internal Types ###
  32. // ----------------------
  33. typedef struct {
  34. int in;
  35. char input;
  36. int to;
  37. int width;
  38. } delta_t;
  39. typedef struct {
  40. int in;
  41. int to;
  42. } offshoot_t;
  43. typedef struct {
  44. bool * do_catch;
  45. bool * is_negative;
  46. int * state;
  47. int * width;
  48. char * whitelist;
  49. char * blacklist;
  50. regex_t * regex;
  51. } compiler_state;
  52. // ----------------------------------
  53. // ### Regex creation/destruction ###
  54. // ----------------------------------
  55. static int escape_1_to_1(const char c, compiler_state * cs) {
  56. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  57. switch (c) {
  58. case 't': {
  59. strcat(target_list, "\t");
  60. } return 1;
  61. case 'n': {
  62. strcat(target_list, "\n");
  63. } return 1;
  64. case 'r': {
  65. strcat(target_list, "\r");
  66. } return 1;
  67. case 'b': {
  68. strcat(target_list, "\b");
  69. } return 1;
  70. case '[': {
  71. strcat(target_list, "[");
  72. } return 1;
  73. case ']': {
  74. strcat(target_list, "]");
  75. } return 1;
  76. case '.': {
  77. strcat(target_list, ".");
  78. } return 1;
  79. case '=': {
  80. strcat(target_list, "=");
  81. } return 1;
  82. case '?': {
  83. strcat(target_list, "?");
  84. } return 1;
  85. case '+': {
  86. strcat(target_list, "+");
  87. } return 1;
  88. case '*': {
  89. strcat(target_list, "*");
  90. } return 1;
  91. case '\\': {
  92. strcat(target_list, "\\");
  93. } return 1;
  94. }
  95. return 0;
  96. }
  97. static int escape_1_to_N(const char c, compiler_state * cs) {
  98. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  99. switch(c) {
  100. case 'i': {
  101. const char identifier_chars[] = "@0123456789_"
  102. "\300\301\302\303\304"
  103. "\305\306\307\310\311"
  104. "\312\313\314\315\316"
  105. "\317\320\321\322\323"
  106. "\324\325\326\327\330"
  107. "\331\332\333\334\335"
  108. "\336\337";
  109. strcpy(target_list, identifier_chars);
  110. return sizeof(identifier_chars)-1;
  111. };
  112. case 'I': {
  113. const char identifier_chars[] = "@_"
  114. "\300\301\302\303\304"
  115. "\305\306\307\310\311"
  116. "\312\313\314\315\316"
  117. "\317\320\321\322\323"
  118. "\324\325\326\327\330"
  119. "\331\332\333\334\335"
  120. "\336\337";
  121. strcpy(target_list, identifier_chars);
  122. return sizeof(identifier_chars)-1;
  123. };
  124. case 'k': {
  125. const char keyword_chars[] = "@0123456789_"
  126. "\300\301\302\303\304"
  127. "\305\306\307\310\311"
  128. "\312\313\314\315\316"
  129. "\317\320\321\322\323"
  130. "\324\325\326\327\330"
  131. "\331\332\333\334\335"
  132. "\336\337";
  133. strcpy(target_list, keyword_chars);
  134. return sizeof(keyword_chars)-1;
  135. };
  136. case 'K': {
  137. const char keyword_chars[] = "@_"
  138. "\300\301\302\303\304"
  139. "\305\306\307\310\311"
  140. "\312\313\314\315\316"
  141. "\317\320\321\322\323"
  142. "\324\325\326\327\330"
  143. "\331\332\333\334\335"
  144. "\336\337";
  145. strcpy(target_list, keyword_chars);
  146. return sizeof(keyword_chars)-1;
  147. };
  148. case 'f': {
  149. const char filename_chars[] = "@0123456789/.-_+,#$%~=";
  150. strcpy(target_list, filename_chars);
  151. return sizeof(filename_chars)-1;
  152. };
  153. case 'F': {
  154. const char filename_chars[] = "@/.-_+,#$%~=";
  155. strcpy(target_list, filename_chars);
  156. return sizeof(filename_chars)-1;
  157. };
  158. case 'p': {
  159. const char printable_chars[] = "@"
  160. "\241\242\243\244\245"
  161. "\246\247\250\251\252"
  162. "\253\254\255\256\257"
  163. "\260\261\262\263\264"
  164. "\265\266\267\270\271"
  165. "\272\273\274\275\276"
  166. "\277"
  167. "\300\301\302\303\304"
  168. "\305\306\307\310\311"
  169. "\312\313\314\315\316"
  170. "\317\320\321\322\323"
  171. "\324\325\326\327\330"
  172. "\331\332\333\334\335"
  173. "\336\337";
  174. strcpy(target_list, printable_chars);
  175. return sizeof(printable_chars)-1;
  176. };
  177. case 'P': {
  178. const char printable_chars[] = "@"
  179. "\241\242\243\244\245"
  180. "\246\247\250\251\252"
  181. "\253\254\255\256\257"
  182. "\260\261\262\263\264"
  183. "\265\266\267\270\271"
  184. "\272\273\274\275\276"
  185. "\277"
  186. "\300\301\302\303\304"
  187. "\305\306\307\310\311"
  188. "\312\313\314\315\316"
  189. "\317\320\321\322\323"
  190. "\324\325\326\327\330"
  191. "\331\332\333\334\335"
  192. "\336\337";
  193. strcpy(target_list, printable_chars);
  194. return sizeof(printable_chars)-1;
  195. };
  196. case 's': {
  197. const char whitespace_chars[] = " \t\v\n";
  198. strcpy(target_list, whitespace_chars);
  199. return sizeof(whitespace_chars)-1;
  200. };
  201. case 'd': {
  202. const char digit_chars[] = "0123456789";
  203. strcpy(target_list, digit_chars);
  204. return sizeof(digit_chars)-1;
  205. };
  206. case 'x': {
  207. const char hex_chars[] = "0123456789"
  208. "abcdef"
  209. "ABCDEF";
  210. strcpy(target_list, hex_chars);
  211. return sizeof(hex_chars)-1;
  212. };
  213. case 'o': {
  214. const char oct_chars[] = "01234567";
  215. strcpy(target_list, oct_chars);
  216. return sizeof(oct_chars)-1;
  217. };
  218. case 'w': {
  219. const char word_chars[] = "0123456789"
  220. "abcdefghijklmnopqrstuwxyz"
  221. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  222. "_";
  223. strcpy(target_list, word_chars);
  224. return sizeof(word_chars)-1;
  225. };
  226. case 'h': {
  227. const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  228. "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  229. "_";
  230. strcpy(target_list, very_word_chars);
  231. return sizeof(very_word_chars)-1;
  232. };
  233. case 'a': {
  234. const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
  235. "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  236. strcpy(target_list, alpha_chars);
  237. return sizeof(alpha_chars)-1;
  238. };
  239. case 'l': {
  240. const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
  241. strcpy(target_list, lower_alpha_chars);
  242. return sizeof(lower_alpha_chars)-1;
  243. };
  244. case 'u': {
  245. const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
  246. strcpy(target_list, upper_alpha_chars);
  247. return sizeof(upper_alpha_chars)-1;
  248. };
  249. }
  250. return 0;
  251. }
  252. static int escape_to_negative(const char c,
  253. compiler_state * cs) {
  254. switch (c) {
  255. case 'D': {
  256. const char digit_chars[] = "0123456789";
  257. strcpy(cs->blacklist, digit_chars);
  258. *cs->is_negative = true;
  259. return sizeof(digit_chars)-1;
  260. };
  261. }
  262. return 0;
  263. }
  264. //static int compile_hologram(char * hologram, char * whitelist) {
  265. // if (hologram[0] == '\\') {
  266. // switch (hologram[1]) {
  267. // case '<': {
  268. // const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
  269. // "ABCDEFGHIJKLMNOPQRSTUWXYZ"
  270. // "_";
  271. // strcat(whitelist, very_word_chars);
  272. // is_negative = true;
  273. // HOOK_ALL(0, whitelist, 0)
  274. // } break;
  275. // }
  276. // }
  277. //}
  278. static int compile_dot(compiler_state * cs) {
  279. *cs->do_catch = true;
  280. return true;
  281. }
  282. static int compile_escape(const char c,
  283. compiler_state * cs) {
  284. return escape_1_to_1(c, cs)
  285. || escape_1_to_N(c, cs)
  286. || escape_to_negative(c, cs)
  287. //|| compile_hologram(*s, whitelist)
  288. ;
  289. }
  290. static int compile_range(const char * const range,
  291. compiler_state * cs) {
  292. assert((range[0] == '[') && "Not a range.");
  293. char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
  294. const char * s;
  295. if (range[1] == '^') {
  296. *cs->is_negative = true;
  297. s = range + 2;
  298. } else {
  299. s = range + 1;
  300. }
  301. for (; *s != ']'; s++) {
  302. assert((*s != '\0') && "Unclosed range.");
  303. char c = *s;
  304. if (c == '\\') {
  305. s += 1;
  306. assert(compile_escape(*s, cs) && "Unknown escape.");
  307. } else if (*(s+1) == '-') {
  308. char end = *(s+2);
  309. assert((c < end) && "Endless range.");
  310. for (char cc = c; cc < end+1; cc++) {
  311. strncat(target_list, &cc, 1);
  312. strncat(target_list, "\0", 1);
  313. }
  314. s += 2;
  315. } else {
  316. strncat(target_list, &c, 1);
  317. }
  318. }
  319. return ((s - range) + 1);
  320. }
  321. void filter_blacklist(const char * const whitelist,
  322. const char * const blacklist,
  323. char * const filtered) {
  324. for (char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
  325. for(char * white_pointer = blacklist; *white_pointer != '\0'; white_pointer++) {
  326. if (*black_pointer == *white_pointer) {
  327. goto long_continue;
  328. }
  329. }
  330. strncat(filtered, black_pointer, 1);
  331. long_continue:
  332. }
  333. }
  334. #define HALT_AND_CATCH_FIRE INT_MIN
  335. void HOOK_ALL( int from,
  336. const char * const str,
  337. int to,
  338. compiler_state * cs) {
  339. int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to);
  340. for (const char * s = str; *s != '\0'; s++) {
  341. delta_t * delta = malloc(sizeof(delta_t));
  342. delta->in = *cs->state + from;
  343. delta->input = *s;
  344. delta->to = hook_to;
  345. delta->width = *cs->width;
  346. vector_push(&cs->regex->delta_table,
  347. &delta);
  348. }
  349. }
  350. void OFFSHOOT(int from,
  351. int to,
  352. compiler_state * cs) {
  353. offshoot_t * offshoot = malloc(sizeof(offshoot_t));
  354. offshoot->in = *cs->state + from;
  355. offshoot->to = *cs->state + to;
  356. vector_push(&cs->regex->catch_table,
  357. &offshoot);
  358. }
  359. regex_t * regex_compile(const char * const pattern) {
  360. regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
  361. regex->str = strdup(pattern);
  362. vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
  363. vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
  364. int state = 0;
  365. bool do_catch;
  366. bool is_negative;
  367. int width;
  368. char whitelist[64];
  369. char blacklist[64];
  370. compiler_state cs = {
  371. .do_catch = &do_catch,
  372. .is_negative = &is_negative,
  373. .state = &state,
  374. .width = &width,
  375. .whitelist = whitelist,
  376. .blacklist = blacklist,
  377. .regex = regex,
  378. };
  379. for (const char * s = pattern; *s != '\00';) {
  380. // Reset the compiler
  381. assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
  382. whitelist[0] = '\00';
  383. blacklist[0] = '\00';
  384. do_catch = false;
  385. is_negative = false;
  386. width = 1;
  387. // Translate char
  388. switch (*s) {
  389. case '.': {
  390. compile_dot(&cs);
  391. } break;
  392. case '\\': {
  393. s += 1;
  394. assert(compile_escape(*s, &cs) && "Unknown escape.");
  395. } break;
  396. case '[': {
  397. s += compile_range(s, &cs) - 1;
  398. } break;
  399. default: {
  400. whitelist[0] = *s;
  401. whitelist[1] = '\00';
  402. } break;
  403. }
  404. s += 1;
  405. // Compile with quantifier
  406. switch (*s) {
  407. case '=':
  408. case '?': {
  409. HOOK_ALL(0, whitelist, +1, &cs);
  410. if (do_catch || is_negative) {
  411. OFFSHOOT(0, +1, &cs);
  412. }
  413. s += 1;
  414. } break;
  415. case '*': {
  416. HOOK_ALL(0, whitelist, 0, &cs);
  417. if (do_catch) {
  418. OFFSHOOT(0, +1, &cs);
  419. } else if (is_negative) {
  420. OFFSHOOT(0, 0, &cs);
  421. }
  422. s += 1;
  423. } break;
  424. case '+': {
  425. HOOK_ALL(0, whitelist, +1, &cs);
  426. if (do_catch || is_negative) {
  427. OFFSHOOT(0, +1, &cs);
  428. }
  429. state += 1;
  430. HOOK_ALL(0, whitelist, 0, &cs);
  431. if (do_catch || is_negative) {
  432. OFFSHOOT(0, 0, &cs);
  433. }
  434. s += 1;
  435. } break;
  436. default: { // Literal
  437. HOOK_ALL(0, whitelist, +1, &cs);
  438. if (do_catch || is_negative) {
  439. OFFSHOOT(0, +1, &cs);
  440. }
  441. state += 1;
  442. } break;
  443. }
  444. // Compile blacklist
  445. if (*blacklist) {
  446. char filtered_blacklist[64];
  447. filtered_blacklist[0] = '\0';
  448. filter_blacklist(whitelist, blacklist, filtered_blacklist);
  449. HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
  450. }
  451. }
  452. regex->accepting_state = state;
  453. return regex;
  454. }
  455. int regex_free(regex_t * const regex) {
  456. free(regex->str);
  457. vector_free(&regex->delta_table);
  458. vector_free(&regex->catch_table);
  459. free(regex);
  460. return 0;
  461. }
  462. // -----------------
  463. // ### Searching ###
  464. // -----------------
  465. static bool catch_(const regex_t * const regex,
  466. int * const state) {
  467. for (size_t i = 0; i < regex->catch_table.element_count; i++){
  468. const offshoot_t * const offshoot = *(offshoot_t**)vector_get(&regex->catch_table, i);
  469. if (offshoot->in == *state) {
  470. *state = offshoot->to;
  471. return true;
  472. }
  473. }
  474. return false;
  475. }
  476. static int regex_assert(const regex_t * const regex,
  477. const char * const string,
  478. int state,
  479. int width) {
  480. for (const char * s = string; *s != '\00'; s++) {
  481. // delta
  482. for (size_t i = 0; i < regex->delta_table.element_count; i++) {
  483. const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
  484. if ((delta->in == state)
  485. && (delta->input == *s)) {
  486. int r = regex_assert(regex, s + delta->width, delta->to, width + 1);
  487. if(r){
  488. return r;
  489. }
  490. }
  491. }
  492. if (catch_(regex, &state)) {
  493. width += 1;
  494. continue;
  495. }
  496. return (state == regex->accepting_state) ? width : false;
  497. }
  498. return false;
  499. }
  500. int regex_match( regex_t * regex,
  501. const char * const string) {
  502. if (regex == NULL) {
  503. return false;
  504. }
  505. if (string == NULL) {
  506. return true;
  507. }
  508. return regex_assert(regex, string, 0, 0);
  509. }
  510. bool regex_search( regex_t * regex,
  511. const char * const string) {
  512. return (bool)regex_match(regex, string);
  513. }