From 3912449f11e25068bb5169f8db748682fa97f26c Mon Sep 17 00:00:00 2001
From: anon <anon@anon.anon>
Date: Wed, 23 Aug 2023 18:38:12 +0200
Subject: [PATCH] new regex engine

---
 source/hl.h            |   1 -
 source/regex2.hpp      | 230 +++++++++++++++++++++++++++++++++++++++++++++++++
 tests/regex_tester.cpp |  76 ++++++++++++++++
 3 files changed, 306 insertions(+), 1 deletion(-)
 create mode 100644 source/regex2.hpp
 create mode 100644 tests/regex_tester.cpp

diff --git a/source/hl.h b/source/hl.h
index dcf2894..b64a77c 100644
--- a/source/hl.h
+++ b/source/hl.h
@@ -48,7 +48,6 @@ int token_table_top = 0;
 // --------------------------------
 // ### Constructors/Destructors ###
 // --------------------------------
-
 void new_display_mode(display_t * mode) {
 	HASH_ADD_STR(display_table,
 	             key,
diff --git a/source/regex2.hpp b/source/regex2.hpp
new file mode 100644
index 0000000..25badf3
--- /dev/null
+++ b/source/regex2.hpp
@@ -0,0 +1,230 @@
+#include <vector>
+#include <assert.h>
+#include <string.h>
+
+typedef struct { // one character-labelled edge of the automaton: (in, input) -> to
+	int in;     // state the edge leaves from
+	char input; // character that has to be read to take the edge
+	int to;     // state the edge leads to
+} delta_t;
+
+typedef struct { // edge that catch_() follows without looking at the input character
+	int in; // state the edge leaves from
+	int to; // state the edge leads to
+} offshoot_t;
+
+typedef struct { // compiled pattern as filled in by regex_compile()
+	char * str; // strdup()'d copy of the pattern text
+	std::vector<delta_t> delta_table; // character-labelled edges
+	std::vector<offshoot_t> catch_table; // catch-all edges, emitted for '.' tokens
+	int accepting_state; // state a successful match has to end in
+} regex_t;
+
+#define HALT_AND_CATCH_FIRE -1 // NOTE(review): not referenced anywhere else in this header; intended meaning unconfirmed
+
+#define HOOK_ALL(from, str, to) do { /* reads reg, state, do_catch of the caller */ \
+	for (const char * hook_c = (str); *hook_c != '\00'; hook_c++) { \
+		reg.delta_table.push_back(                     \
+			delta_t{state + (from), *hook_c, state + (to)} \
+		);                                             \
+	}                                                  \
+	if (do_catch) {                                    \
+		reg.catch_table.push_back(                     \
+			{state + (from), state + (to)}             \
+		);                                             \
+	}                                                  \
+} while (0) /* one delta edge per character of str, plus one catch edge when do_catch is set */
+
+#define EAT(n) do { \
+	s += n;         \
+} while (0) // advances the variable `s` of the enclosing scope by n
+
+bool is_quantifier(const char c){
+	// True exactly for the quantifier characters '+', '*' and '?'.
+	// strchr() also "finds" the terminating NUL of the set it searches,
+	// so a NUL argument is rejected before the lookup.
+	if (c == '\00') { return false; }
+
+	return strchr("+*?", c) != NULL;
+}
+
+
+int escape_1_to_1(const char c, char * whitelist) {
+	// Stores the single character denoted by the escape letter c in whitelist
+	// (NUL-terminated) and returns 1; returns 0 when c is not such an escape.
+	switch(c) {
+		case 't': strcpy(whitelist, "\t"); return 1;
+		case 'n': strcpy(whitelist, "\n"); return 1;
+		// An escaped metacharacter stands for itself, e.g. a literal dot.
+		case '\\': case '.': case '[': case ']': case '+': case '*': case '?':
+			whitelist[0] = c; whitelist[1] = '\00'; return 1;
+	}
+	return 0;
+}
+
+int escape_1_to_N(const char c, char * whitelist) {
+	// Copies the character set of class escape c into whitelist; returns its length, or 0 if c is no class.
+	switch(c) {
+		case 'd': {
+			const char digitchars[] = "0123456789";
+			strcpy(whitelist, digitchars);
+			return sizeof(digitchars)-1;
+		}
+		case 'w': { // NOTE(review): ASCII letters only -- no digits or '_', unlike the usual \w
+			const char wordchars[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+			strcpy(whitelist, wordchars);
+			return sizeof(wordchars)-1;
+		}
+		case 's': {
+			const char blankchars[] = " \t\v\n";
+			strcpy(whitelist, blankchars);
+			return sizeof(blankchars)-1;
+		}
+	}
+	return 0;
+}
+
+int compile_range(const char * const     range,
+                        char *       whitelist) {
+	// Appends each character matched by the bracket expression at range[0] to
+	// the NUL-terminated whitelist (caller sizes it; no bound is checked) and
+	// returns the expression's length including both brackets.
+	assert(range[0] == '[' && "Not a range.");
+	const char * s;
+	for (s = range+1; *s != ']'; s++) {
+		assert(*s != '\00' && "Unclosed range.");
+		const char c = *s;
+		// A '-' directly in front of the closing ']' is a literal, not a span.
+		if (*(s+1) == '-' && *(s+2) != ']') {
+			const char end = *(s+2);
+			assert(c < end && "Endless range.");
+			// int counter: a char counter would overflow when end is CHAR_MAX.
+			for (int cc = c; cc <= end; cc++) {
+				const char member = static_cast<char>(cc);
+				strncat(whitelist, &member, 1); // strncat re-terminates whitelist itself
+			}
+			s += 2; // rest on the span's last character; the loop's s++ steps past it
+		} else {
+			strncat(whitelist, &c, 1);
+		}
+	}
+	return ((s - range) + 1);
+}
+
+regex_t * regex_compile(const char * const pattern) { // builds a heap-allocated automaton from pattern
+	regex_t * r = new regex_t;
+	regex_t &reg = *r;
+	reg.str = strdup(pattern); // NOTE(review): nothing in this header frees str or the regex_t
+	// States are numbered in pattern order; `state` is the one edges are currently hooked from.
+	int state = 0;
+	// whitelist: characters the current token accepts; do_catch: the token is '.'.
+	char whitelist[64]; // NOTE(review): filled by strcpy/strncat in the helpers with no bounds check
+	bool do_catch;
+	for (const char * s = pattern; *s != '\00';) {
+		// Get token
+		assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); // looks at pattern[0], so it is the same check on every pass
+		whitelist[0] = '\00';
+		do_catch     = false;
+		switch (*s) {
+			case '.': {
+				do_catch = true;
+			} break;
+			case '\\': {
+				EAT(1); // skip the backslash; *s is now the escape letter
+				if(escape_1_to_1(*s, whitelist)
+				|| escape_1_to_N(*s, whitelist)){
+					;
+				} else {
+					assert(!"Unknown escape.");
+				}
+			} break;
+			case '[': {
+				EAT(compile_range(s, whitelist)-1); // stop on the ']'; the EAT(1) below steps past it
+			} break;
+			default: {
+				whitelist[0] = *s;
+				whitelist[1] = '\00';
+			} break;
+		}
+		// Step past the last character of the token.
+		EAT(1);
+
+		// Quantifier
+		switch (*s) {
+			case '?': { // NOTE(review): state is not advanced, so the next token is hooked from this same state -- verify
+				HOOK_ALL(0, whitelist, +1);
+				EAT(1);
+			} break;
+			case '*': { // self-loop on the current state
+				HOOK_ALL(0, whitelist,  0);
+				EAT(1);
+			} break;
+			case '+': { // one mandatory step, then a self-loop on the new state
+				HOOK_ALL(0, whitelist, +1);
+				state += 1;
+				HOOK_ALL(0, whitelist,  0);
+				EAT(1);
+			} break;
+			default: { // Literal: no quantifier, s is left where it is for the next pass
+				HOOK_ALL(0, whitelist, +1);
+				state += 1;
+			} break;
+		}
+	}
+	// The state construction ended on is the accepting one.
+	reg.accepting_state = state;
+
+	return r;
+}
+
+inline bool catch_(const regex_t * regex,
+                        int     & state) {
+	// Follows the first catch_table entry that leaves `state`, if there is one:
+	// `state` is overwritten with that entry's target and true is returned.
+	// Without such an entry `state` is left untouched and the result is false.
+	for (const offshoot_t &hop : regex->catch_table) {
+		if (hop.in != state) { continue; }
+		state = hop.to;
+		return true;
+	}
+	return false;
+}
+
+bool regex_assert(const regex_t * const  regex,
+                  const char    * const string,
+				        int              state) {
+	// Backtracking matcher: true when `string`, read from `state`, can be
+	// consumed completely and leave the automaton in its accepting state.
+	for (const char * cursor = string; *cursor != '\00'; cursor++) {
+		// Character-labelled edges come first; every edge that fits gets
+		// a recursive attempt at the rest of the input.
+		for (const delta_t &edge : regex->delta_table) {
+			const bool fits = (edge.in == state) && (edge.input == *cursor);
+			if (fits && regex_assert(regex, cursor+1, edge.to)) {
+				return true;
+			}
+		}
+
+		// None of them matched: only a catch-all edge out of `state`,
+		// which catch_() applies to `state` in place, keeps this branch
+		// alive for the next character.
+		if (!catch_(regex, state)) {
+			return false;
+		}
+	}
+
+	return (state == regex->accepting_state);
+}
+
+bool regex_search(      regex_t *        regex,
+                  const char    * const string) {
+	// Entry point: runs the compiled automaton over `string` from state 0.
+	// regex_assert() only succeeds once the whole string is consumed, so
+	// despite the name this is a full match, not a substring search.
+	//   regex  == NULL -> false
+	//   string == NULL -> true (tested only after the regex check)
+	if (!regex) { return false; }
+	if (!string) { return true; }
+
+	return regex_assert(regex, string, 0);
+}
diff --git a/tests/regex_tester.cpp b/tests/regex_tester.cpp
new file mode 100644
index 0000000..b3ccf06
--- /dev/null
+++ b/tests/regex_tester.cpp
@@ -0,0 +1,76 @@
+// @COMPILECMD g++ $@ -o regtest -O0 -ggdb -pg -fno-inline
+#include <stdio.h>
+#include "regex.hpp"
+
+#define TEST(a, b, expected) do { /* needs r, num_tests, passed_tests in the calling scope */ \
+	r = regex_compile(a); /* NOTE(review): the compiled regex is never released */ \
+	bool result = regex_search(r, b); \
+	bool passed = (result == (expected)); \
+	if (passed) { printf("Success.  - "); } else { printf("Failure.  - "); } \
+	printf("%s vs %s: Result = %d, Expected = %d\n", #a, #b, result, expected); \
+	++num_tests; \
+	if (passed) { ++passed_tests; } \
+} while(0)
+
+int main() {
+	int num_tests = 0;
+	int passed_tests = 0;
+	regex_t * r; // reassigned by every TEST
+	// Plain literals.
+	TEST(R"del(abc)del","abc",true);
+	TEST(R"del(efg1)del","efg1",true);
+	TEST(R"del(abc)del","xyz",false);
+
+	puts("");
+	// Quantifiers.
+	TEST(R"del(ab+c)del","abc",true);
+	TEST(R"del(ef+g1)del","effffg1",true);
+	TEST(R"del(ni*g?)del","ngg",false);
+
+	puts("");
+	// The '.' wildcard.
+	TEST(R"del(ne.)del","net",true);
+	TEST(R"del(ne.)del","ne",false);
+	TEST(R"del(ne.+)del","neoo",true);
+
+	puts("");
+	// Tab: matched by '.' and by its escape.
+	TEST(R"del(ne.)del","ne\t",true);
+	TEST(R"del(ne\t)del","ne",false);
+	TEST(R"del(ne\t)del","ne\t",true);
+
+	puts("");
+	// Class escapes.
+	TEST(R"del(\sa)del"," a",true);
+	TEST(R"del(\wi)del","hi",true);
+	TEST(R"del(\w+)del","asd",true);
+
+	puts("");
+	// Bracket ranges.
+	TEST(R"del([A-Za-z]+g)del","HelloWorldg",true);
+	TEST(R"del([A-Za-z]+)del","HelloWorld",true);
+	TEST(R"del([A-Za-z]+g)del","g",false);
+
+	puts("");
+	// '+' followed by the same literal: needs backtracking.
+	TEST(R"del(a+a)del","aaa",true);
+	TEST(R"del(a+a)del","aa",true);
+	TEST(R"del(a+a)del","a",false);
+	// Disabled cases. NOTE(review): TEST already bumps num_tests, so the "++num_tests;" prefixes would count twice if re-enabled.
+	//++num_tests; TEST(R"del(\d{3})del","123",true);
+	//++num_tests; TEST(R"del(^\w+@\w+\.\w+$)del","example@email.com",true);
+
+	//++num_tests; TEST(R"del(\b\w+\b)del","This is a test",true);
+	//++num_tests; TEST(R"del(^[A-Za-z]+\s\d+)del","OpenAI 123",true);
+	//++num_tests; TEST(R"del([0-9]{4}-[0-9]{2}-[0-9]{2})del","2023-08-22",true);
+
+	//++num_tests; TEST(R"del(^[^abc]+$)del","def123",true);
+	//++num_tests; TEST(R"del(\b\d{5}\b)del","12345 67890",true);
+	//++num_tests; TEST(R"del(^[A-Z][a-z]+$)del","OpenAI",true);
+
+	//++num_tests; TEST(R"del(\d{3}-\d{2}-\d{4})del","123-45-6789",true);
+	//++num_tests; TEST(R"del(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})del","192.168.1.1",true);
+	//++num_tests; TEST(R"del(^\w{8,12})del","Password123", false);
+
+	printf("\nPassed %d out of %d tests.\n", passed_tests, num_tests);
+}