From 7c88282816e53efcc72462fb982296082b21a75e Mon Sep 17 00:00:00 2001 From: anon Date: Sat, 2 Sep 2023 17:39:52 +0200 Subject: [PATCH] . --- .gdbinit | 2 + .gitignore | 3 + Makefile | 10 + debug/stage1.gdb | 1 + debug/stage2.gdb | 3 + documentation/README.md | 18 ++ source/main.cpp | 136 +++++++++ source/regex.c | 718 +++++++++++++++++++++++++++++++++++++++++++++++ {src => source}/regex.h | 4 +- {src => source}/vector.c | 10 +- {src => source}/vector.h | 0 src/main.c | 76 ----- src/regex.c | 691 --------------------------------------------- 13 files changed, 896 insertions(+), 776 deletions(-) create mode 100644 .gdbinit create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 debug/stage1.gdb create mode 100644 debug/stage2.gdb create mode 100644 documentation/README.md create mode 100644 source/main.cpp create mode 100644 source/regex.c rename {src => source}/regex.h (69%) rename {src => source}/vector.c (79%) rename {src => source}/vector.h (100%) delete mode 100644 src/main.c delete mode 100644 src/regex.c diff --git a/.gdbinit b/.gdbinit new file mode 100644 index 0000000..8f046c7 --- /dev/null +++ b/.gdbinit @@ -0,0 +1,2 @@ +source debug/regex.pretty_print.py +source debug/stage1.gdb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ef00294 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +regtest +*.out +.gdb_history diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c6f94e4 --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +CXXFLAGS := -fuse-ld=mold -ggdb +OUT := regtest + +main: + g++ ${CXXFLAGS} source/main.cpp source/vector.c source/regex.c -o ${OUT} + +run: + ${OUT} + +test: run diff --git a/debug/stage1.gdb b/debug/stage1.gdb new file mode 100644 index 0000000..113bd0f --- /dev/null +++ b/debug/stage1.gdb @@ -0,0 +1 @@ +b regex_match diff --git a/debug/stage2.gdb b/debug/stage2.gdb new file mode 100644 index 0000000..340ea78 --- /dev/null +++ b/debug/stage2.gdb @@ -0,0 +1,3 @@ +break regex_assert +continue +print * regex diff --git a/documentation/README.md b/documentation/README.md new file mode 100644 index 0000000..0581e6e --- /dev/null +++ b/documentation/README.md @@ -0,0 +1,18 @@ +# Abstraction + +---------------------+ + | | + | | + | State register | + | | + | | + +---------------------+ + + + +---------------------------------+ + | State transition table | + +---------------------------------+ + + + +---------------------------------+ + | Fallback transition table | + +---------------------------------+ diff --git a/source/main.cpp b/source/main.cpp new file mode 100644 index 0000000..04db856 --- /dev/null +++ b/source/main.cpp @@ -0,0 +1,136 @@ +#include +#include "regex.h" + +static int test_counter = 0; +static int passed_tests = 0; +static int positives = 0; +static int positive_successes = 0; +static int negatives = 0; +static int negative_successes = 0; + +static void +TEST(const char * const what, + const char * const on, + const bool expect){ + + regex_t * r = regex_compile(what); + bool result = regex_search(r, on); + bool passed = (result == expect); + + expect ? ++positives : ++negatives; + + if (passed) { + printf("\033[32;1mSuccess\033[0;1m. - \033[0m"); + expect ? ++positive_successes : ++negative_successes; + } else { + printf("\033[31;1mFailiour\033[0;1m. - \033[0m"); + } + printf("'%12s'\033[1m vs \033[0m'%12s'\033[1m:\033[0m Result = %d, Expected = %d\n", what, on, result, expect); + if (passed) { + ++passed_tests; + } + + ++test_counter; +} + +signed main() { + TEST(R"del(abc)del","abc",true); + TEST(R"del(efg1)del","efg1",true); + TEST(R"del(nig)del","ger",false); + TEST(R"del(ss)del","sss",true); + TEST(R"del(sss)del","ss",false); + + puts(""); + + TEST(R"del(ab+c)del","abc",true); + TEST(R"del(ef+g1)del","effffg1",true); + TEST(R"del(efg1+)del","efg",false); + TEST(R"del(efg1+)del","efg1",true); + TEST(R"del(efg1+)del","efg11",true); + + puts(""); + + TEST(R"del(a+a)del","aaa",true); + TEST(R"del(a+a)del","aa",true); + TEST(R"del(a+a)del","a",false); + TEST(R"del(a+a)del","aaa",true); + TEST(R"del(a+\+)del","aaa",false); + + puts(""); + + TEST(R"del(ab*c)del","abc",true); + TEST(R"del(ef*g1)del","effffg1",true); + TEST(R"del(efg1*)del","efg",true); + TEST(R"del(efg1*)del","efg1",true); + TEST(R"del(efg1*)del","efg11",true); + + puts(""); + + TEST(R"del(ne.)del","net",true); + TEST(R"del(ne.)del","ne",false); + TEST(R"del(ne.+)del","neoo",true); + TEST(R"del(ne.*)del","neoo",true); + TEST(R"del(ne.*)del","ne",true); + + puts(""); + + TEST(R"del(ne.)del","ne\t",true); + TEST(R"del(ne\t)del","ne",false); + TEST(R"del(ne\t)del","ne\t",true); + TEST(R"del(ne\t)del","net",false); + TEST(R"del(ne)del","ne\t",true); + + puts(""); + + TEST(R"del(\sa)del"," a",true); + TEST(R"del(\sa)del"," a ",true); + TEST(R"del(\wi)del","hi",true); + TEST(R"del(\w+)del","asd",true); + TEST(R"del(\w*)del","",true); + + puts(""); + + TEST(R"del([A-Za-z]+)del","HelloWorld",true); + TEST(R"del([A-Za-z]+g)del","HelloWorldg",true); + TEST(R"del([A-Za-z]+g)del","g",false); + TEST(R"del([A-Za-z]*g)del","g",true); + TEST(R"del([A-Za-z]+1)del","1",false); + + puts(""); + + TEST(R"del(^\^)del","^^",true); + TEST(R"del(^\^)del"," ^",false); + TEST(R"del(^ \^)del"," ^",true); + TEST(R"del(^a*)del","asd",true); + TEST(R"del(^)del","",true); + + puts(""); + + TEST(R"del(\)del","test",true); + TEST(R"del(\)del","testa",false); + TEST(R"del(\)del","test",true); + + //TEST(R"del(\d{3})del","123",true); + //TEST(R"del(^\w+@\w+\.\w+$)del","example@email.com",true); + + //TEST(R"del(\b\w+\b)del","This is a test",true); + //TEST(R"del(^[A-Za-z]+\s\d+)del","OpenAI 123",true); + //TEST(R"del([0-9]{4}-[0-9]{2}-[0-9]{2})del","2023-08-22",true); + + //TEST(R"del(^[^abc]+$)del","def123",true); + //TEST(R"del(\b\d{5}\b)del","12345 67890",true); + //TEST(R"del(^[A-Z][a-z]+$)del","OpenAI",true); + + //TEST(R"del(\d{3}-\d{2}-\d{4})del","123-45-6789",true); + //TEST(R"del(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})del","192.168.1.1",true); + //TEST(R"del(^\w{8,12})del","Password123", false); + + if(test_counter == passed_tests) { + fputs("\033[32m", stdout); + } + printf("\nPassed %d out of %d tests.\033[0m\n", passed_tests, test_counter); + printf("\tPositives: %d/%d\n", positive_successes, positives); + printf("\tNegatives: %d/%d\n", negative_successes, negatives); +} diff --git a/source/regex.c b/source/regex.c new file mode 100644 index 0000000..c1cd4fd --- /dev/null +++ b/source/regex.c @@ -0,0 +1,718 @@ +/* XXX: + * as it turns out returning a range of match objects is a + * high profile performance issue regarding regex, especially when highlighting. + * now as it stands we search an array of tokens for every position on a string. + * which sounds ok, until one realizes that searching from any position revails a range, + * where (future) matches can or cannot be found. meaning we are computing the same thing + * repeatedly, practically resulting in a bruteforcing situation where instead of eliminating + * certain non-matches, we blindly hammer character by character. + */ + +#include "regex.h" + +#include +#include +#include +#include + +#define JEGER_INIT_STATE 2 + +// ------------------ +// ### Char tests ### +// ------------------ +static bool is_quantifier(const char c) { + for (const char * s = "+*?="; *s != '\00'; s++) { + if (*s == c) { + return true; + } + } + return false; +} + +bool is_magic(const char c) { + if (is_quantifier(c)) { + return true; + } + for (const char * s = "\\[].^"; *s != '\00'; s++) { + if (*s == c) { + return true; + } + } + return false; +} + +// ----------------- +// ### Char sets ### +// ----------------- +#define JEGER_CHAR_SET_at "@" +#define JEGER_CHAR_SET_underscore "_" +#define JEGER_CHAR_SET_lower "abcdefghijklmnopqrstuwxyz" +#define JEGER_CHAR_SET_upper "ABCDEFGHIJKLMNOPQRSTUWXYZ" +#define JEGER_CHAR_SET_digits "0123456789" +#define JEGER_CHAR_SET_octal_digits "01234567" +#define JEGER_CHAR_SET_lower_hex "abcdef" +#define JEGER_CHAR_SET_upper_hex "ABCDEF" +#define JEGER_CHAR_SET_oct_241_to_277 \ + "\241\242\243\244\245" \ + "\246\247\250\251\252" \ + "\253\254\255\256\257" \ + "\260\261\262\263\264" \ + "\265\266\267\270\271" \ + "\272\273\274\275\276" \ + "\277" +#define JEGER_CHAR_SET_oct_300_to_337 \ + "\300\301\302\303\304" \ + "\305\306\307\310\311" \ + "\312\313\314\315\316" \ + "\317\320\321\322\323" \ + "\324\325\326\327\330" \ + "\331\332\333\334\335" \ + "\336\337" +#define JEGER_CHAR_SET_file_extra "/.-_+,#$%~=" +#define JEGER_CHAR_SET_whitespace " \t\v\n" + +// ---------------------- +// ### Internal Types ### +// ---------------------- +typedef struct { + int in; + char input; + int to; + int width; + int match_width; +} delta_t; + +typedef struct { + int in; + int to; + int width; + int match_width; +} offshoot_t; + +enum { + DO_CATCH = 0x00000001 << 0, + IS_NEGATIVE = 0x00000001 << 1, + IS_AT_THE_BEGINNING = 0x00000001 << 2, + DO_SKIP = 0x00000001 << 3, + FORCE_START_OF_STRING = 0x00000001 << 4, +}; + +typedef struct { + // XXX: + int flags; +// these might be obsolite but im leaving them for now + bool do_loop_hook; + bool do_follow_hook; + bool do_loop_shoot; + bool do_follow_shoot; +// --- + int state; + int width; + char * whitelist; + char * blacklist; + regex_t * regex; +} compiler_state; + + + +// ---------------------------------- +// ### Regex creation/destruction ### +// ---------------------------------- +static const int HALT_AND_CATCH_FIRE = INT_MIN; + +#define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a)) + +static +void HOOK_ALL( int from, + const char * const str, + int to, + compiler_state * cs) { + for (const char * s = str; *s != '\0'; s++) { + delta_t * delta = (delta_t *)malloc(sizeof(delta_t)); + delta->in = cs->state + from; + delta->input = *s; + delta->to = ASSERT_HALT(to); + delta->width = cs->width; + vector_push(&cs->regex->delta_table, + &delta); + } +} + +static +void ABSOLUTE_OFFSHOOT(int from, + int to, + int width, + int match_width, + compiler_state * cs) { + offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t)); + offshoot->in = from; + offshoot->to = to; + offshoot->width = width; + offshoot->match_width = match_width; + vector_push(&cs->regex->catch_table, + &offshoot); +} + +static +void OFFSHOOT(int from, + int to, + int width, + int match_width, + compiler_state * cs) { + ABSOLUTE_OFFSHOOT(cs->state + from, ASSERT_HALT(to), width, match_width, cs); +} + +static +int escape_1_to_1(const char c, compiler_state * cs) { + char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist; + switch (c) { + case 't': { + strcat(target_list, "\t"); + } return 1; + case 'n': { + strcat(target_list, "\n"); + } return 1; + case 'r': { + strcat(target_list, "\r"); + } return 1; + case 'b': { + strcat(target_list, "\b"); + } return 1; + case '[': { + strcat(target_list, "["); + } return 1; + case ']': { + strcat(target_list, "]"); + } return 1; + case '.': { + strcat(target_list, "."); + } return 1; + case '^': { + strcat(target_list, "^"); + } return 1; + case '=': { + strcat(target_list, "="); + } return 1; + case '?': { + strcat(target_list, "?"); + } return 1; + case '+': { + strcat(target_list, "+"); + } return 1; + case '*': { + strcat(target_list, "*"); + } return 1; + case '\\': { + strcat(target_list, "\\"); + } return 1; + } + + return 0; +} + +static +int escape_1_to_N(const char c, compiler_state * cs) { + char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist; + switch(c) { + case 'i': { + const char identifier_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_digits + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, identifier_chars); + return sizeof(identifier_chars)-1; + }; + case 'I': { + const char identifier_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, identifier_chars); + return sizeof(identifier_chars)-1; + }; + case 'k': { + const char keyword_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_digits + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'K': { + const char keyword_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'f': { + const char filename_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_digits + JEGER_CHAR_SET_file_extra + ; + strcpy(target_list, filename_chars); + return sizeof(filename_chars)-1; + }; + case 'F': { + const char filename_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_file_extra + ; + strcpy(target_list, filename_chars); + return sizeof(filename_chars)-1; + }; + case 'p': { + const char printable_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_oct_241_to_277 + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, printable_chars); + return sizeof(printable_chars)-1; + }; + case 'P': { + const char printable_chars[] = JEGER_CHAR_SET_at + JEGER_CHAR_SET_oct_241_to_277 + JEGER_CHAR_SET_oct_300_to_337 + ; + strcpy(target_list, printable_chars); + return sizeof(printable_chars)-1; + }; + case 's': { + const char whitespace_chars[] = JEGER_CHAR_SET_whitespace; + strcpy(target_list, whitespace_chars); + return sizeof(whitespace_chars)-1; + }; + case 'd': { + const char digit_chars[] = JEGER_CHAR_SET_digits; + strcpy(target_list, digit_chars); + return sizeof(digit_chars)-1; + }; + case 'x': { + const char hex_chars[] = JEGER_CHAR_SET_digits + JEGER_CHAR_SET_lower_hex + JEGER_CHAR_SET_upper_hex + ; + strcpy(target_list, hex_chars); + return sizeof(hex_chars)-1; + }; + case 'o': { + const char oct_chars[] = JEGER_CHAR_SET_octal_digits; + strcpy(target_list, oct_chars); + return sizeof(oct_chars)-1; + }; + case 'w': { + const char word_chars[] = JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_digits + JEGER_CHAR_SET_lower + JEGER_CHAR_SET_upper + ; + strcpy(target_list, word_chars); + return sizeof(word_chars)-1; + }; + case 'h': { + const char very_word_chars[] = JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_lower + JEGER_CHAR_SET_upper + ; + strcpy(target_list, very_word_chars); + return sizeof(very_word_chars)-1; + }; + case 'a': { + const char alpha_chars[] = JEGER_CHAR_SET_lower + JEGER_CHAR_SET_upper + ; + strcpy(target_list, alpha_chars); + return sizeof(alpha_chars)-1; + }; + case 'l': { + const char lower_alpha_chars[] = JEGER_CHAR_SET_lower; + strcpy(target_list, lower_alpha_chars); + return sizeof(lower_alpha_chars)-1; + }; + case 'u': { + const char upper_alpha_chars[] = JEGER_CHAR_SET_upper; + strcpy(target_list, upper_alpha_chars); + return sizeof(upper_alpha_chars)-1; + }; + } + + return 0; +} + +static +int escape_to_negative(const char c, + compiler_state * const cs) { + switch (c) { + case 'D': { + const char digit_chars[] = JEGER_CHAR_SET_digits; + strcpy(cs->blacklist, digit_chars); + cs->flags |= IS_NEGATIVE; + return sizeof(digit_chars)-1; + }; + } + + return 0; +} + +static +int compile_hologram(const char c, + compiler_state * const cs) { + static + const char very_word_chars[] = JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_lower + JEGER_CHAR_SET_upper + ; + switch (c) { + case '^': { + cs->whitelist[0] = '\n'; + cs->whitelist[1] = '\0'; + HOOK_ALL(0, cs->whitelist, 0, cs); + cs->flags |= DO_SKIP; + if (cs->flags & IS_AT_THE_BEGINNING) { + cs->flags |= FORCE_START_OF_STRING; + } else { + cs->state += 1; + } + return 1; + }; + case '<': { + if (cs->flags & IS_AT_THE_BEGINNING) { + ABSOLUTE_OFFSHOOT(0, 3, 0, 0, cs); //XXX: figure out how to move this + } + cs->flags |= DO_SKIP; + cs->flags |= IS_NEGATIVE; + strcat(cs->blacklist, very_word_chars); + OFFSHOOT(0, 0, 1, 0, cs); //XXX: figure out how to move this + ++cs->state; + return sizeof(very_word_chars)-1; + }; + case '>': { + cs->flags |= DO_SKIP; + cs->flags |= IS_NEGATIVE; + strcat(cs->blacklist, very_word_chars); + OFFSHOOT(0, 1, 0, 0, cs); //XXX: figure out how to move this + ++cs->state; // XXX: the current bug arises from the state being increased before the blacklist is hooked + + return sizeof(very_word_chars)-1; + } + } + return 0; + +} + +static +int compile_dot(compiler_state * cs) { + cs->flags |= DO_CATCH; + return true; +} + +static +int compile_escape(const char c, + compiler_state * const cs) { + + return escape_1_to_1(c, cs) + || escape_1_to_N(c, cs) + || escape_to_negative(c, cs) + ; +} + +static +int compile_range(const char * const range, + compiler_state * const cs) { + assert((range[0] == '[') && "Not a range."); + + const char * s; + if (range[1] == '^') { + cs->flags |= IS_NEGATIVE; + s = range + 2; + } else { + s = range + 1; + } + + char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist; + + for (; *s != ']'; s++) { + assert((*s != '\0') && "Unclosed range."); + char c = *s; + if (c == '\\') { + s += 1; + assert(compile_escape(*s, cs) && "Unknown escape."); + } else if (*(s+1) == '-') { + char end = *(s+2); + assert((c < end) && "Endless range."); + for (char cc = c; cc < end+1; cc++) { + strncat(target_list, &cc, 1); + strncat(target_list, "\0", 1); + } + s += 2; + } else { + strncat(target_list, &c, 1); + } + } + + return ((s - range) + 1); +} + +void filter_blacklist(const char * whitelist, + const char * blacklist, + char * filtered) { + for (; *blacklist != '\0'; blacklist++) { + for(; *whitelist != '\0'; whitelist++) { + if (*blacklist == *whitelist) { + goto long_continue; + } + } + strncat(filtered, blacklist, 1); + long_continue: + ; + } +} + +regex_t * regex_compile(const char * const pattern) { + regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); + regex->str = strdup(pattern); + vector_init(®ex->delta_table, sizeof(delta_t*), 0UL); + vector_init(®ex->catch_table, sizeof(offshoot_t*), 0UL); + + // this is plain retarded + char whitelist[64]; + char blacklist[64]; + + compiler_state cs = { + .flags = IS_AT_THE_BEGINNING, + .state = JEGER_INIT_STATE, + .width = 0, + .whitelist = whitelist, + .blacklist = blacklist, + .regex = regex, + }; + + for (const char * s = pattern; *s != '\00';) { + // Reset the compiler + assert(!is_quantifier(*s) && "Pattern starts with quantifier."); + whitelist[0] = '\0'; + blacklist[0] = '\0'; + cs.flags &= IS_AT_THE_BEGINNING; + /**/ + cs.do_loop_hook = false; + cs.do_follow_hook = false; + cs.do_loop_shoot = false; + cs.do_follow_shoot = false; + /**/ + cs.width = 1; + + // Translate char + switch (*s) { + case '^': { + compile_hologram(*s, &cs); + } break; + case '.': { + compile_dot(&cs); + } break; + case '\\': { + s += 1; + assert((compile_escape(*s, &cs) || compile_hologram(*s, &cs)) && "Unknown escape."); + } break; + case '[': { + s += compile_range(s, &cs) - 1; + } break; + default: { // Literal + whitelist[0] = *s; + whitelist[1] = '\0'; + } break; + } + + s += 1; + + // Compile blacklist + if (*blacklist) { + char filtered_blacklist[64]; + filtered_blacklist[0] = '\0'; + filter_blacklist(whitelist, blacklist, filtered_blacklist); + HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs); + } + + if (cs.flags & DO_SKIP) { + goto long_continue; + } + + // Compile with quantifier + switch (*s) { + case '=': + case '?': { + HOOK_ALL(0, whitelist, +1, &cs); + if ((cs.flags & DO_CATCH) || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(0, +1, 1, 1, &cs); + } + s += 1; + } break; + case '*': { + HOOK_ALL(0, whitelist, 0, &cs); + if ((cs.flags & DO_CATCH) + || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(0, 0, 1, 1, &cs); + } + s += 1; + } break; + case '+': { + HOOK_ALL(0, whitelist, +1, &cs); + if ((cs.flags & DO_CATCH) + || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(0, +1, 1, 1, &cs); + } + ++cs.state; + HOOK_ALL(0, whitelist, 0, &cs); + if ((cs.flags & DO_CATCH) + || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(0, 0, 1, 1, &cs); + } + s += 1; + } break; + default: { // Literal + HOOK_ALL(0, whitelist, +1, &cs); + if ((cs.flags & DO_CATCH) + || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(0, +1, 1, 1, &cs); + } + ++cs.state; + } break; + } + + long_continue: + cs.flags &= !IS_AT_THE_BEGINNING; + } + + // Init state hookups + ABSOLUTE_OFFSHOOT(0, 2, 0, 0, &cs); + if (cs.flags & FORCE_START_OF_STRING) { + ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, &cs); + } else { + ABSOLUTE_OFFSHOOT(1, 2, 0, 0, &cs); + } + + regex->accepting_state = cs.state; + + return regex; +} + +int regex_free(regex_t * const regex) { + free(regex->str); + vector_free(®ex->delta_table); + vector_free(®ex->catch_table); + free(regex); + return 0; +} + + + +// ----------------- +// ### Searching ### +// ----------------- +static +const offshoot_t * catch_table_lookup(const regex_t * const regex, + const int * const state) { + for (size_t i = 0; i < regex->catch_table.element_count; i++){ + const offshoot_t * const offshoot = *(offshoot_t**)vector_get(®ex->catch_table, i); + if (offshoot->in == *state) { + return offshoot; + } + } + return NULL; +} + +static +bool regex_assert(const regex_t * const regex, + const char * const string, + int state, + match_t * const match) { + if (state == HALT_AND_CATCH_FIRE) { + return false; + } + + bool last_stand = false; + + const char * s = string; + LOOP: { + if (*s == '\0') { + last_stand = true; + goto PERFORM_CATCH_LOOKUP; + } + // Jump search for the correct state + const int jump = 10; + size_t i = jump; + while (i < regex->delta_table.element_count) { + const delta_t * const delta = *(delta_t**)vector_get(®ex->delta_table, i); + if (delta->in >= state) { + break; + } + i += jump; + } + i -= jump; + // Linear search finish up + for (; i < regex->delta_table.element_count; i++) { + const delta_t * const delta = *(delta_t**)vector_get(®ex->delta_table, i); + + if (delta->in > state) { + break; + } + + if ((delta->in == state) + && (delta->input == *s)) { + const int r = regex_assert(regex, s + delta->width, delta->to, match); + if(r){ + if ((match->position != -1) + && (delta->match_width)) { + match->position = (s - string); + } + match->width += delta->match_width; + return r; + } + } + } + } + + PERFORM_CATCH_LOOKUP: { + const offshoot_t * const my_catch = catch_table_lookup(regex, &state); + if (my_catch && (!my_catch->width || !last_stand)) { + state = my_catch->to; + s += my_catch->width; + match->width += my_catch->match_width; + goto LOOP; + } + } + + return (state == regex->accepting_state); +} + +match_t * regex_match(const regex_t * const regex, + const char * const string, + const bool is_start_of_string) { + if (regex == NULL) { + return NULL; + } + + match_t * match = (match_t *)malloc(sizeof(match_t)); + + if (string == NULL) { + match->position = -1; + match->width = 0; + return match; + } + + const int initial_state = (int)(!is_start_of_string); + + // XXX: this should be called in a loop, always restarting from the last char of the last match + if (regex_assert(regex, string, initial_state, match)) { + return match; + } else { + return NULL; + } +} + +bool regex_search(const regex_t * const regex, + const char * const string) { + + return (bool)regex_match(regex, string, true); +} diff --git a/src/regex.h b/source/regex.h similarity index 69% rename from src/regex.h rename to source/regex.h index 5e5fca8..2b049b1 100644 --- a/src/regex.h +++ b/source/regex.h @@ -21,8 +21,8 @@ typedef struct { extern regex_t * regex_compile(const char * const pattern); extern int regex_free(regex_t * const regex); -extern bool regex_search(regex_t * regex, const char * const string); -extern match_t * regex_match(regex_t * regex, const char * const string, const bool start_of_string, const int string_offset); +extern bool regex_search(const regex_t * const regex, const char * const string); +extern match_t * regex_match(const regex_t * const regex, const char * const string, const bool start_of_string); extern bool is_magic(const char c); diff --git a/src/vector.c b/source/vector.c similarity index 79% rename from src/vector.c rename to source/vector.c index 3cfe9f8..b14444f 100644 --- a/src/vector.c +++ b/source/vector.c @@ -1,7 +1,3 @@ -/* vector.c - * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams - * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */ - #include "vector.h" #include @@ -18,7 +14,7 @@ void vector_init(vector_t * vector, vector->element_size = element_size; vector->element_count = element_count; - vector->data = calloc(vector->element_count, vector->element_size); + vector->data = (char *)calloc(vector->element_count, vector->element_size); assert(vector->data); } @@ -29,8 +25,8 @@ void vector_push(vector_t * vector, vector->element_count += 1; - vector->data = realloc(vector->data, - vector->element_size * vector->element_count); + vector->data = (char *)realloc(vector->data, + vector->element_size * vector->element_count); assert(vector->data); diff --git a/src/vector.h b/source/vector.h similarity index 100% rename from src/vector.h rename to source/vector.h diff --git a/src/main.c b/src/main.c deleted file mode 100644 index b3ccf06..0000000 --- a/src/main.c +++ /dev/null @@ -1,76 +0,0 @@ -// @COMPILECMD g++ $@ -o regtest -O0 -ggdb -pg -fno-inline -#include -#include "regex.hpp" - -#define TEST(a, b, expected) do { \ - r = regex_compile(a); \ - bool result = regex_search(r, b); \ - bool passed = (result == expected); \ - if (passed) { printf("Success. - "); } else { printf("Failiour. - "); } \ - printf("%s vs %s: Result = %d, Expected = %d\n", #a, #b, result, expected); \ - ++num_tests; \ - if (passed) { ++passed_tests; } \ -} while(0) - -signed main() { - int num_tests = 0; - int passed_tests = 0; - regex_t * r; - - TEST(R"del(abc)del","abc",true); - TEST(R"del(efg1)del","efg1",true); - TEST(R"del(nig)del","ger",false); - - puts(""); - - TEST(R"del(ab+c)del","abc",true); - TEST(R"del(ef+g1)del","effffg1",true); - TEST(R"del(ni*g?)del","ngg",false); - - puts(""); - - TEST(R"del(ne.)del","net",true); - TEST(R"del(ne.)del","ne",false); - TEST(R"del(ne.+)del","neoo",true); - - puts(""); - - TEST(R"del(ne.)del","ne\t",true); - TEST(R"del(ne\t)del","ne",false); - TEST(R"del(ne\t)del","ne\t",true); - - puts(""); - - TEST(R"del(\sa)del"," a",true); - TEST(R"del(\wi)del","hi",true); - TEST(R"del(\w+)del","asd",true); - - puts(""); - - TEST(R"del([A-Za-z]+)del","HelloWorld",true); - TEST(R"del([A-Za-z]+g)del","HelloWorldg",true); - TEST(R"del([A-Za-z]+g)del","g",false); - - puts(""); - - TEST(R"del(a+a)del","aaa",true); - TEST(R"del(a+a)del","aa",true); - TEST(R"del(a+a)del","a",false); - - //++num_tests; TEST(R"del(\d{3})del","123",true); - //++num_tests; TEST(R"del(^\w+@\w+\.\w+$)del","example@email.com",true); - - //++num_tests; TEST(R"del(\b\w+\b)del","This is a test",true); - //++num_tests; TEST(R"del(^[A-Za-z]+\s\d+)del","OpenAI 123",true); - //++num_tests; TEST(R"del([0-9]{4}-[0-9]{2}-[0-9]{2})del","2023-08-22",true); - - //++num_tests; TEST(R"del(^[^abc]+$)del","def123",true); - //++num_tests; TEST(R"del(\b\d{5}\b)del","12345 67890",true); - //++num_tests; TEST(R"del(^[A-Z][a-z]+$)del","OpenAI",true); - - //++num_tests; TEST(R"del(\d{3}-\d{2}-\d{4})del","123-45-6789",true); - //++num_tests; TEST(R"del(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})del","192.168.1.1",true); - //++num_tests; TEST(R"del(^\w{8,12})del","Password123", false); - - printf("\nPassed %d out of %d tests.\n", passed_tests, num_tests); -} diff --git a/src/regex.c b/src/regex.c deleted file mode 100644 index 3c09bf7..0000000 --- a/src/regex.c +++ /dev/null @@ -1,691 +0,0 @@ -/* regex.c - * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams - * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */ - -/* XXX: - * as it turns out returning a range of match objects is a - * high profile performance issue regarding regex, especially when highlighting. - * now as it stands we search an array of tokens for every position on a string. - * which sounds ok, until one realizes that searching from any position revails a range, - * where (future) matches can or cannot be found. meaning we are computing the same thing - * repeatedly, practically resulting in a bruteforcing situation where instead of eliminating - * certain non-matches, we blindly hammer character by character. - */ - -/* XXX: - * the bigass char sets should be global and broken into parts -*/ - -#include "regex.h" - -#include -#include -#include -#include - -// ------------------ -// ### Char tests ### -// ------------------ -static bool is_quantifier(const char c) { - for (const char * s = "+*?="; *s != '\00'; s++) { - if (*s == c) { - return true; - } - } - return false; -} - -bool is_magic(const char c) { - if (is_quantifier(c)) { - return true; - } - for (const char * s = "\\[].^"; *s != '\00'; s++) { - if (*s == c) { - return true; - } - } - return false; -} - -// ---------------------- -// ### Internal Types ### -// ---------------------- -typedef struct { - int in; - char input; - int to; - int width; - int match_width; -} delta_t; - -typedef struct { - int in; - int to; - int width; - int match_width; -} offshoot_t; - -typedef struct { - // XXX: - // These should share a mask - // Not even sure why they are pointers to begin with - bool * do_catch; - bool * is_negative; - bool is_at_the_beginning; - bool do_skip; -// these might be obsolite but im leaving them for now - bool * do_loop_hook; - bool * do_follow_hook; - bool * do_loop_shoot; - bool * do_follow_shoot; -// --- - int * state; - int * width; - char * whitelist; - char * blacklist; - regex_t * regex; -} compiler_state; - - - -// ---------------------------------- -// ### Regex creation/destruction ### -// ---------------------------------- -#define HALT_AND_CATCH_FIRE INT_MIN - -static void HOOK_ALL( int from, - const char * const str, - int to, - compiler_state * cs) { - - int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to); - - - for (const char * s = str; *s != '\0'; s++) { - delta_t * delta = malloc(sizeof(delta_t)); - delta->in = *cs->state + from; - delta->input = *s; - delta->to = hook_to; - delta->width = *cs->width; - vector_push(&cs->regex->delta_table, - &delta); - } -} - -// XXX: align -static void ABSOLUTE_OFFSHOOT(int from, - int to, - int width, - int match_width, - compiler_state * cs) { - offshoot_t * offshoot = malloc(sizeof(offshoot_t)); - offshoot->in = from; - offshoot->to = to; - offshoot->width = width; - offshoot->match_width = match_width; - vector_push(&cs->regex->catch_table, - &offshoot); -} - -// XXX: align -static void OFFSHOOT(int from, - int to, - int width, - int match_width, - compiler_state * cs) { - ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, match_width, cs); -} - -static int escape_1_to_1(const char c, compiler_state * cs) { - char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; - switch (c) { - case 't': { - strcat(target_list, "\t"); - } return 1; - case 'n': { - strcat(target_list, "\n"); - } return 1; - case 'r': { - strcat(target_list, "\r"); - } return 1; - case 'b': { - strcat(target_list, "\b"); - } return 1; - case '[': { - strcat(target_list, "["); - } return 1; - case ']': { - strcat(target_list, "]"); - } return 1; - case '.': { - strcat(target_list, "."); - } return 1; - case '^': { - strcat(target_list, "^"); - } return 1; - case '=': { - strcat(target_list, "="); - } return 1; - case '?': { - strcat(target_list, "?"); - } return 1; - case '+': { - strcat(target_list, "+"); - } return 1; - case '*': { - strcat(target_list, "*"); - } return 1; - case '\\': { - strcat(target_list, "\\"); - } return 1; - } - - return 0; -} - -static int escape_1_to_N(const char c, compiler_state * cs) { - char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; - switch(c) { - case 'i': { - const char identifier_chars[] = "@0123456789_" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'I': { - const char identifier_chars[] = "@_" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'k': { - const char keyword_chars[] = "@0123456789_" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'K': { - const char keyword_chars[] = "@_" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'f': { - const char filename_chars[] = "@0123456789/.-_+,#$%~="; - strcpy(target_list, filename_chars); - return sizeof(filename_chars)-1; - }; - case 'F': { - const char filename_chars[] = "@/.-_+,#$%~="; - strcpy(target_list, filename_chars); - return sizeof(filename_chars)-1; - }; - case 'p': { - const char printable_chars[] = "@" - "\241\242\243\244\245" - "\246\247\250\251\252" - "\253\254\255\256\257" - "\260\261\262\263\264" - "\265\266\267\270\271" - "\272\273\274\275\276" - "\277" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, printable_chars); - return sizeof(printable_chars)-1; - }; - case 'P': { - const char printable_chars[] = "@" - "\241\242\243\244\245" - "\246\247\250\251\252" - "\253\254\255\256\257" - "\260\261\262\263\264" - "\265\266\267\270\271" - "\272\273\274\275\276" - "\277" - "\300\301\302\303\304" - "\305\306\307\310\311" - "\312\313\314\315\316" - "\317\320\321\322\323" - "\324\325\326\327\330" - "\331\332\333\334\335" - "\336\337"; - strcpy(target_list, printable_chars); - return sizeof(printable_chars)-1; - }; - case 's': { - const char whitespace_chars[] = " \t\v\n"; - strcpy(target_list, whitespace_chars); - return sizeof(whitespace_chars)-1; - }; - case 'd': { - const char digit_chars[] = "0123456789"; - strcpy(target_list, digit_chars); - return sizeof(digit_chars)-1; - }; - case 'x': { - const char hex_chars[] = "0123456789" - "abcdef" - "ABCDEF"; - strcpy(target_list, hex_chars); - return sizeof(hex_chars)-1; - }; - case 'o': { - const char oct_chars[] = "01234567"; - strcpy(target_list, oct_chars); - return sizeof(oct_chars)-1; - }; - case 'w': { - const char word_chars[] = "0123456789" - "abcdefghijklmnopqrstuwxyz" - "ABCDEFGHIJKLMNOPQRSTUWXYZ" - "_"; - strcpy(target_list, word_chars); - return sizeof(word_chars)-1; - }; - case 'h': { - const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" - "ABCDEFGHIJKLMNOPQRSTUWXYZ" - "_"; - strcpy(target_list, very_word_chars); - return sizeof(very_word_chars)-1; - }; - case 'a': { - const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz" - "ABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(target_list, alpha_chars); - return sizeof(alpha_chars)-1; - }; - case 'l': { - const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; - strcpy(target_list, lower_alpha_chars); - return sizeof(lower_alpha_chars)-1; - }; - case 'u': { - const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(target_list, upper_alpha_chars); - return sizeof(upper_alpha_chars)-1; - }; - } - - return 0; -} - -static int escape_to_negative(const char c, - compiler_state * cs) { - switch (c) { - case 'D': { - const char digit_chars[] = "0123456789"; - strcpy(cs->blacklist, digit_chars); - *cs->is_negative = true; - return sizeof(digit_chars)-1; - }; - } - - return 0; -} - -static int escape_hologram(const char c, compiler_state * cs) { - switch (c) { - case '<': { - if (cs->is_at_the_beginning) { - ABSOLUTE_OFFSHOOT(0, 2, 0, 0, cs); - cs->do_skip = true; - } - const char very_word_chars[] = "abcw"; - //const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" - // "ABCDEFGHIJKLMNOPQRSTUWXYZ" - // "_"; - *cs->is_negative = true; // effectless currently; should be used to trigger the following lines in the main compile loop - strcat(cs->blacklist, very_word_chars); - HOOK_ALL(0, cs->blacklist, HALT_AND_CATCH_FIRE, cs); - OFFSHOOT(0, 0, 1, 0, cs); - - return sizeof(very_word_chars)-1; - }; - case '>': { - const char very_word_chars[] = "abcw"; - //const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" - // "ABCDEFGHIJKLMNOPQRSTUWXYZ" - // "_"; - *cs->is_negative = true; - strcat(cs->blacklist, very_word_chars); - - return sizeof(very_word_chars)-1; - } - } - return 0; -} - -static int compile_dot(compiler_state * cs) { - *cs->do_catch = true; - return true; -} - -static int compile_escape(const char c, - compiler_state * cs) { - - return escape_1_to_1(c, cs) - || escape_1_to_N(c, cs) - || escape_to_negative(c, cs) - || escape_hologram(c, cs) - ; -} - -static int compile_range(const char * const range, - compiler_state * cs) { - assert((range[0] == '[') && "Not a range."); - - const char * s; - if (range[1] == '^') { - *cs->is_negative = true; - s = range + 2; - } else { - s = range + 1; - } - - char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; - - for (; *s != ']'; s++) { - assert((*s != '\0') && "Unclosed range."); - char c = *s; - if (c == '\\') { - s += 1; - assert(compile_escape(*s, cs) && "Unknown escape."); - } else if (*(s+1) == '-') { - char end = *(s+2); - assert((c < end) && "Endless range."); - for (char cc = c; cc < end+1; cc++) { - strncat(target_list, &cc, 1); - strncat(target_list, "\0", 1); - } - s += 2; - } else { - strncat(target_list, &c, 1); - } - } - - return ((s - range) + 1); -} - -void filter_blacklist(const char * whitelist, - const char * blacklist, - char * filtered) { - for (; *blacklist != '\0'; blacklist++) { - for(; *whitelist != '\0'; whitelist++) { - if (*blacklist == *whitelist) { - goto long_continue; - } - } - strncat(filtered, blacklist, 1); - long_continue: - ; - } -} - -regex_t * regex_compile(const char * const pattern) { - regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); - regex->str = strdup(pattern); - vector_init(®ex->delta_table, sizeof(delta_t*), 0UL); - vector_init(®ex->catch_table, sizeof(offshoot_t*), 0UL); - - int state = 2; - - // this is plain retarded - bool do_catch; - bool is_negative; - bool do_loop_hook; - bool do_follow_hook; - bool do_loop_shoot; - bool do_follow_shoot; - int width; - char whitelist[64]; - char blacklist[64]; - - compiler_state cs = { - .do_catch = &do_catch, - .is_negative = &is_negative, - .is_at_the_beginning = true, - .do_skip = false, - .state = &state, - .width = &width, - .whitelist = whitelist, - .blacklist = blacklist, - .regex = regex, - }; - - for (const char * s = pattern; *s != '\00';) { - // Reset the compiler - assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); - whitelist[0] = '\0'; - blacklist[0] = '\0'; - do_catch = false; - is_negative = false; - cs.do_skip = false; - /**/ - do_loop_hook = false; - do_follow_hook = false; - do_loop_shoot = false; - do_follow_shoot = false; - /**/ - width = 1; - - // Translate char - switch (*s) { - case '^': { - if (cs.is_at_the_beginning) { - ABSOLUTE_OFFSHOOT(0, 2, 0, 0, &cs); - ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, &cs); - } - whitelist[0] = '\n'; - whitelist[1] = '\0'; - HOOK_ALL(0, whitelist, 0, &cs); - if (s != pattern) { - state += 1; - } - cs.do_skip = true; - } break; - case '.': { - compile_dot(&cs); - } break; - case '\\': { - s += 1; - assert(compile_escape(*s, &cs) && "Unknown escape."); - } break; - case '[': { - s += compile_range(s, &cs) - 1; - } break; - default: { - whitelist[0] = *s; - whitelist[1] = '\0'; - } break; - } - - s += 1; - - if (cs.do_skip) { - goto long_continue; - } - - // Compile blacklist - if (*blacklist) { - char filtered_blacklist[64]; - filtered_blacklist[0] = '\0'; - filter_blacklist(whitelist, blacklist, filtered_blacklist); - HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs); - } - - // Compile with quantifier - switch (*s) { - case '=': - case '?': { - do_loop_hook = true; - HOOK_ALL(0, whitelist, +1, &cs); - if (do_catch || is_negative) { - OFFSHOOT(0, +1, 1, 1, &cs); - } - s += 1; - } break; - case '*': { - HOOK_ALL(0, whitelist, 0, &cs); - if (do_catch) { - OFFSHOOT(0, +1, 1, 1, &cs); - } else if (is_negative) { - OFFSHOOT(0, 0, 1, 1, &cs); - } - s += 1; - } break; - case '+': { - HOOK_ALL(0, whitelist, +1, &cs); - if (do_catch || is_negative) { - OFFSHOOT(0, +1, 1, 1, &cs); - } - state += 1; - HOOK_ALL(0, whitelist, 0, &cs); - if (do_catch || is_negative) { - OFFSHOOT(0, 0, 1, 1, &cs); - } - s += 1; - } break; - default: { // Literal - HOOK_ALL(0, whitelist, +1, &cs); - if (do_catch || is_negative) { - OFFSHOOT(0, +1, 1, 1, &cs); - } - state += 1; - } break; - } - - long_continue: - cs.is_at_the_beginning = false; - } - - regex->accepting_state = state; - - return regex; -} - -int regex_free(regex_t * const regex) { - free(regex->str); - vector_free(®ex->delta_table); - vector_free(®ex->catch_table); - free(regex); - return 0; -} - - - -// ----------------- -// ### Searching ### -// ----------------- -// XXX: rename; align -static offshoot_t * catch_(const regex_t * const regex, - int * const state) { - for (size_t i = 0; i < regex->catch_table.element_count; i++){ - const offshoot_t * const offshoot = *(offshoot_t**)vector_get(®ex->catch_table, i); - if (offshoot->in == *state) { - *state = offshoot->to; - return offshoot; - } - } - return NULL; -} - -static bool regex_assert(const regex_t * const regex, - const char * const string, - const int string_offset, // XXX: useless - int state, - match_t * const match) { - if (state == HALT_AND_CATCH_FIRE) { return false; } - for (const char * s = (string + string_offset); *s != '\00';) { - // XXX: this should be a jump search for the instate and then a linear - for (size_t i = 0; i < regex->delta_table.element_count; i++) { - const delta_t * const delta = *(delta_t**)vector_get(®ex->delta_table, i); - if ((delta->in == state) - && (delta->input == *s)) { - const int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, match); - if(r){ - if ((match->position != -1) - && (delta->match_width)) { - match->position = (s - string); - } - match->width += delta->match_width; - return r; - } - } - } - - const offshoot_t * const catch = catch_(regex, &state); - if ((catch) - && (state != HALT_AND_CATCH_FIRE)) { - s += catch->width; - match->width += catch->match_width; - continue; - } else { - break; - } - } - - return (state == regex->accepting_state); -} - -match_t * regex_match( regex_t * regex, - const char * const string, - const bool is_start_of_string, - const int string_offset) { // XXX: remove this useless piece of shit of a parameter nigger - if (regex == NULL) { - return NULL; - } - - match_t * m = (match_t *)malloc(sizeof(match_t)); - if (string == NULL) { - m->position = -1; - m->width = 0; - return m; - } - - const int initial_state = (int)(!is_start_of_string); - - // XXX: this should be called in a loop, always restarting from the last char of the last match - if(regex_assert(regex, string, string_offset, initial_state, m)) { - return m; - } else { - free(m); - return NULL; - } -} - -bool regex_search( regex_t * regex, - const char * const string) { - - return (bool)regex_match(regex, string, true, 0); -}