pull in jeger

emils suggestions
2023-09-18 22:43:57 +02:00 · 2023-09-18 22:43:28 +02:00
6 changed files with 794 additions and 667 deletions
--- a/README.md
+++ b/README.md
@ -83,13 +83,13 @@ hl will read from stdin and write to stdout.
 ### Cli Options
 ```bash
 	-h          : display help message
-	-F <dir>    : syntax file look up directory
+	-I <dir>    : syntax file lookup directory
 	-s <syntax> : specify syntax to load
 ```

 ### Environment variables
 ```bash
-	HL_HOME	: default directory to load syntax files from
+	$HLPATH	: colon-separated list of directories searched for syntax script files
 ```

 ---
--- a/include/hl.h
+++ b/include/hl.h
@ -6,7 +6,8 @@
 #include <string.h>
 #include <stdbool.h>
 #include "chad.h"
-#include "regex.h"
+#include "vector.h"
+#include "jeger.h"

 // -------------------
 // ### Definitions ###
--- a/include/jeger.h
+++ b/include/jeger.h
@ -1,4 +1,5 @@
-#ifndef REGEX_H
+#ifndef JEGER_H
+#define JEGER_H

 #include <stdbool.h>

@ -13,12 +14,16 @@ typedef struct {
 	vector_t catch_table;	// <offshoot_t>
 } regex_t;

+typedef struct { // one entry of the array returned by regex_match()
+	int position; // offset of the match from the start of the searched string; -1 in the terminating sentinel entry
+	int width; // match width accumulated by the matcher; -1 in the terminating sentinel entry
+} match_t;
+
 extern regex_t * regex_compile(const char * const pattern);
 extern int       regex_free(regex_t * const regex);
-extern bool      regex_search(regex_t * regex, const char * const string);
-extern int       regex_match(regex_t * regex, const char * const string, const bool start_of_string, const int string_offset);
+extern bool      regex_search(const regex_t * const regex, const char * const string);
+extern match_t * regex_match(const regex_t * const regex, const char * const string, const bool start_of_string);

 extern bool is_magic(const char c);

-#define REGEX_H
 #endif
--- a/source/hl.c
+++ b/source/hl.c
@ -186,7 +186,7 @@ int token_fits(const token_t * const            token,
                     int     *       match_offset) {
  UNUSED(match_offset);
 	//return regex_match(pattern, to, string_offset, match_offset);
-	return regex_match(token->syntax, to, is_start_of_line, string_offset);
+	return (int)regex_match(token->syntax, to, is_start_of_line);
 }

 void render_string(const char * const string,
--- a/source/jeger.c
+++ b/source/jeger.c
@ -0,0 +1,780 @@
+#if __cplusplus
+# pragma GCC diagnostic ignored "-Wc++20-extensions"
+#endif
+
+#include "jeger.h"
+
+#include <assert.h>
+#include <string.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#define JEGER_INIT_STATE    2
+
+// ------------------
+// ### Char tests ###
+// ------------------
+// Membership test: does the NUL-terminated set `str` contain `c`?
+// The terminator is never treated as a member, so c == '\0' yields false.
+static inline
+bool mystrchr(const char * const str, const char c){
+	const char * cursor = str;
+	while (*cursor != '\0') {
+		if (*cursor == c) {
+			return true;
+		}
+		++cursor;
+	}
+	return false;
+}
+
+// True when `c` is one of the quantifier characters: = ? + *
+static inline
+bool is_quantifier(const char c) {
+	const bool is_member = mystrchr("=?+*", c);
+	return is_member;
+}
+
+// True when `c` is one of the characters that form a hologram escape
+// together with a preceding backslash: < >
+static inline
+bool is_hologram_escape(const char c) {
+	const bool is_member = mystrchr("<>", c);
+	return is_member;
+}
+
+// True when `c` is a quantifier or one of: \ [ ] . ^
+bool is_magic(const char c) {
+	if (is_quantifier(c)) {
+		return true;
+	}
+	return mystrchr("\\[].^", c);
+}
+
+// -----------------
+// ### Char sets ###
+// -----------------
+// Building blocks of the character classes. Adjacent string literals are
+// concatenated by the compiler, so a class is spelled by listing its parts.
+#define JEGER_CHAR_SET_at                "@"
+#define JEGER_CHAR_SET_underscore        "_"
+// Full 26-letter alphabets ('v' / 'V' included).
+#define JEGER_CHAR_SET_lower             "abcdefghijklmnopqrstuvwxyz"
+#define JEGER_CHAR_SET_upper             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define JEGER_CHAR_SET_digits            "0123456789"
+#define JEGER_CHAR_SET_octal_digits      "01234567"
+#define JEGER_CHAR_SET_lower_hex         "abcdef"
+#define JEGER_CHAR_SET_upper_hex         "ABCDEF"
+// Bytes with the octal values 241 through 277.
+#define JEGER_CHAR_SET_oct_241_to_277                           \
+			                             "\241\242\243\244\245" \
+			                             "\246\247\250\251\252" \
+			                             "\253\254\255\256\257" \
+			                             "\260\261\262\263\264" \
+			                             "\265\266\267\270\271" \
+			                             "\272\273\274\275\276" \
+			                             "\277"
+// Bytes with the octal values 300 through 337.
+#define JEGER_CHAR_SET_oct_300_to_337                           \
+                                         "\300\301\302\303\304" \
+                                         "\305\306\307\310\311" \
+                                         "\312\313\314\315\316" \
+                                         "\317\320\321\322\323" \
+                                         "\324\325\326\327\330" \
+                                         "\331\332\333\334\335" \
+                                         "\336\337"
+#define JEGER_CHAR_SET_file_extra        "/.-_+,#$%~="
+#define JEGER_CHAR_SET_whitespace        " \t\v\n"
+
+// Underscore plus all letters; used by the \h escape and the \< \> holograms.
+static const char JEGER_CHAR_very_word_chars[] = 
+                                   JEGER_CHAR_SET_underscore
+                                   JEGER_CHAR_SET_lower
+                                   JEGER_CHAR_SET_upper
+                                 ;
+
+// ----------------------
+// ### Internal Types ###
+// ----------------------
+typedef struct { // one character-consuming transition stored in regex->delta_table
+	int in; // state the transition leaves from
+	char input; // character that must equal the current input character for the transition to apply
+	int to; // state tried next; HALT_AND_CATCH_FIRE makes that branch fail
+	int pattern_width; // how far the matcher advances in the input string when it follows the transition
+	int match_width; // amount added to match->width when the branch succeeds
+} delta_t;
+
+typedef struct { // fallback transition stored in regex->catch_table; consulted when no delta of a state matched
+	int in; // state the fallback belongs to
+	int to; // state the matcher continues in
+	int pattern_width; // how far the matcher advances in the input string; 0 means nothing is consumed
+	int match_width; // amount added to match->width when the fallback is taken
+} offshoot_t;
+
+enum { // bit flags kept in compiler_state.flags
+	DO_CATCH              = 0x00000001 << 0, // set by compile_dot(); makes the element emit an OFFSHOOT
+	IS_NEGATIVE           = 0x00000001 << 1, // escapes and ranges collect into the blacklist instead of the whitelist
+	IS_AT_THE_BEGINNING   = 0x00000001 << 2, // set initially, cleared once the first pattern element is compiled
+	FORCE_START_OF_STRING = 0x00000001 << 3, // set when '^' is the first element; state 1 is then wired to HALT_AND_CATCH_FIRE
+	INCREMENT_STATE       = 0x00000001 << 4, // regex_compile() advances cs.state after the current element
+};
+
+typedef struct { // working state of regex_compile(), shared with its helpers
+	int       flags; // bitwise OR of the flag enum above
+	int       state; // state number that relative hooks (HOOK_ALL, OFFSHOOT) are based on
+	int       width; // pattern_width stored into the deltas created by HOOK_ALL
+	char    * whitelist; // NUL-terminated list of accepted characters for the current element
+	char    * blacklist; // NUL-terminated list of rejected characters for the current element
+} compiler_state;
+
+
+
+// ----------------------------------
+// ### Regex creation/destruction ###
+// ----------------------------------
+// Pseudo state: regex_assert() rejects a branch as soon as it is entered.
+static const int HALT_AND_CATCH_FIRE = INT_MIN;
+
+// Convert a state offset relative to cs->state into an absolute state, while
+// passing HALT_AND_CATCH_FIRE through untouched. Expects a pointer named `cs`
+// to be in scope at the point of use. The argument is parenthesised so that
+// expression arguments (e.g. a conditional) expand as intended.
+#define ASSERT_HALT(a) (((a) == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + (a)))
+
+// Create one delta per character of `str`, leading from state
+// (cs->state + from) to state (cs->state + to); HALT_AND_CATCH_FIRE as `to`
+// is kept absolute. Each delta carries pattern_width cs->width and
+// match_width 1. The pointer to every heap-allocated delta is pushed onto
+// regex->delta_table.
+static
+void HOOK_ALL(const int                         from,
+              const char           * const       str,
+              const int                           to,
+              const compiler_state * const        cs,
+					regex_t        *           regex) {
+	const int source_state = cs->state + from;
+	const int target_state = ASSERT_HALT(to);
+
+	const char * input = str;
+	while (*input != '\0') {
+		delta_t * transition = (delta_t *)malloc(sizeof(delta_t));
+		transition->in            = source_state;
+		transition->input         = *input;
+		transition->to            = target_state;
+		transition->pattern_width = cs->width;
+		transition->match_width   = 1;
+		vector_push(&regex->delta_table,
+		            &transition);
+		++input;
+	}
+}
+
+// Register a catch-table entry between two absolute states. The pointer to
+// the heap-allocated offshoot is pushed onto regex->catch_table.
+static
+void ABSOLUTE_OFFSHOOT(const int              from,
+                       const int                to,
+                       const int             width,
+                       const int       match_width,
+					         regex_t *       regex) {
+	offshoot_t * fallback = (offshoot_t *)malloc(sizeof(offshoot_t));
+	fallback->in            = from;
+	fallback->to            = to;
+	fallback->pattern_width = width;
+	fallback->match_width   = match_width;
+	vector_push(&regex->catch_table,
+	            &fallback);
+}
+
+// Same as ABSOLUTE_OFFSHOOT, but `from` and `to` are offsets relative to
+// cs->state (HALT_AND_CATCH_FIRE as `to` stays absolute).
+static
+void OFFSHOOT(const int                     from,
+              const int                       to,
+              const int                    width,
+              const int              match_width,
+              const compiler_state *          cs,
+					regex_t        *       regex) {
+	const int source_state = cs->state + from;
+	const int target_state = ASSERT_HALT(to);
+	ABSOLUTE_OFFSHOOT(source_state, target_state, width, match_width, regex);
+}
+
+// Handle an escape that stands for exactly one character. The character is
+// appended to the blacklist when IS_NEGATIVE is set, to the whitelist
+// otherwise. Returns 1 when `c` was recognised, 0 when it is not such an escape.
+static
+int escape_1_to_1(const char                    c,
+                  const compiler_state * const cs) {
+	char appended[2] = { '\0', '\0' };
+
+	switch (c) {
+		case 't': appended[0] = '\t'; break;
+		case 'n': appended[0] = '\n'; break;
+		case 'r': appended[0] = '\r'; break;
+		case 'b': appended[0] = '\b'; break;
+		// Escaped magic characters stand for themselves
+		case '[':
+		case ']':
+		case '.':
+		case '^':
+		case '=':
+		case '?':
+		case '+':
+		case '*':
+		case '\\':
+			appended[0] = c;
+			break;
+		default:
+			return 0;
+	}
+
+	char * const destination = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+	strcat(destination, appended);
+	return 1;
+}
+
+// Handle an escape that expands to a whole character class (\d, \w, ...).
+// The class is APPENDED to the blacklist when IS_NEGATIVE is set, to the
+// whitelist otherwise, so characters already collected by an enclosing
+// [...] range survive (escape_1_to_1 appends as well). At the top level the
+// lists are empty when this runs, so appending equals the former copy.
+// Returns the number of characters in the class, or 0 when `c` is not a
+// class escape.
+// NOTE(review): the destination capacity is not known here; the caller's
+// buffers must be large enough for the expanded class.
+static
+int escape_1_to_N(const char                    c,
+                  const compiler_state * const cs) {
+	static const struct {
+		char         escape;
+		const char * set;
+	} classes[] = {
+		{ 'i', JEGER_CHAR_SET_at JEGER_CHAR_SET_underscore JEGER_CHAR_SET_digits JEGER_CHAR_SET_oct_300_to_337 },
+		{ 'I', JEGER_CHAR_SET_at JEGER_CHAR_SET_underscore JEGER_CHAR_SET_oct_300_to_337 },
+		{ 'k', JEGER_CHAR_SET_at JEGER_CHAR_SET_underscore JEGER_CHAR_SET_digits JEGER_CHAR_SET_oct_300_to_337 },
+		{ 'K', JEGER_CHAR_SET_at JEGER_CHAR_SET_underscore JEGER_CHAR_SET_oct_300_to_337 },
+		{ 'f', JEGER_CHAR_SET_at JEGER_CHAR_SET_digits JEGER_CHAR_SET_file_extra },
+		{ 'F', JEGER_CHAR_SET_at JEGER_CHAR_SET_file_extra },
+		{ 'p', JEGER_CHAR_SET_at JEGER_CHAR_SET_oct_241_to_277 JEGER_CHAR_SET_oct_300_to_337 },
+		{ 'P', JEGER_CHAR_SET_at JEGER_CHAR_SET_oct_241_to_277 JEGER_CHAR_SET_oct_300_to_337 },
+		{ 's', JEGER_CHAR_SET_whitespace },
+		{ 'd', JEGER_CHAR_SET_digits },
+		{ 'x', JEGER_CHAR_SET_digits JEGER_CHAR_SET_lower_hex JEGER_CHAR_SET_upper_hex },
+		{ 'o', JEGER_CHAR_SET_octal_digits },
+		{ 'w', JEGER_CHAR_SET_underscore JEGER_CHAR_SET_digits JEGER_CHAR_SET_lower JEGER_CHAR_SET_upper },
+		{ 'h', JEGER_CHAR_very_word_chars },
+		{ 'a', JEGER_CHAR_SET_lower JEGER_CHAR_SET_upper },
+		{ 'l', JEGER_CHAR_SET_lower },
+		{ 'u', JEGER_CHAR_SET_upper },
+	};
+
+	char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+
+	for (size_t i = 0; i < (sizeof(classes) / sizeof(classes[0])); i++) {
+		if (classes[i].escape == c) {
+			strcat(target_list, classes[i].set);
+			return (int)strlen(classes[i].set);
+		}
+	}
+
+	return 0;
+}
+
+// Handle an upper-case escape that means "anything but this class"
+// (\D, \X, \O, \W, \L, \U). The class is APPENDED to the blacklist, so
+// blacklist entries collected earlier in a [...] range are kept, and
+// IS_NEGATIVE is raised. At the top level the blacklist is empty when this
+// runs, so appending equals the former copy.
+// Returns the number of characters in the class, or 0 when `c` is not such
+// an escape (the flags are left untouched in that case).
+static inline
+int escape_to_negative(const char                    c,
+	                         compiler_state * const cs) {
+	static const struct {
+		char         escape;
+		const char * set;
+	} classes[] = {
+		{ 'D', JEGER_CHAR_SET_digits },
+		{ 'X', JEGER_CHAR_SET_digits JEGER_CHAR_SET_lower_hex JEGER_CHAR_SET_upper_hex },
+		{ 'O', JEGER_CHAR_SET_octal_digits },
+		{ 'W', JEGER_CHAR_SET_underscore JEGER_CHAR_SET_digits JEGER_CHAR_SET_lower JEGER_CHAR_SET_upper },
+		{ 'L', JEGER_CHAR_SET_lower },
+		{ 'U', JEGER_CHAR_SET_upper },
+	};
+
+	for (size_t i = 0; i < (sizeof(classes) / sizeof(classes[0])); i++) {
+		if (classes[i].escape == c) {
+			strcat(cs->blacklist, classes[i].set);
+			cs->flags |= IS_NEGATIVE;
+			return (int)strlen(classes[i].set);
+		}
+	}
+
+	return 0;
+}
+
+// '.' handler: marks the current element with DO_CATCH and reports success (1).
+static inline
+int compile_dot(compiler_state * const cs) {
+	cs->flags = cs->flags | DO_CATCH;
+	return 1;
+}
+
+// Try the escape handlers in order (single character, character class,
+// negated class) and stop at the first one that recognises `c`.
+// Returns 1 when some handler accepted the escape, 0 otherwise.
+static inline
+int compile_escape(const char                    c,
+                         compiler_state * const cs) {
+	if (escape_1_to_1(c, cs)) {
+		return 1;
+	}
+	if (escape_1_to_N(c, cs)) {
+		return 1;
+	}
+	return (escape_to_negative(c, cs) != 0);
+}
+
+// Compile a bracket expression starting at range[0] == '['.
+// A leading '^' raises IS_NEGATIVE. Members are collected into the blacklist
+// when IS_NEGATIVE is set at that point, into the whitelist otherwise.
+// Supported members: backslash escapes, "a-z" style spans and literals.
+// A '-' that is directly followed by the closing ']' is a literal '-'.
+// Returns the number of pattern characters consumed including the closing
+// ']'; for an unclosed range (after the assert) the count stops at the
+// terminator so the caller never steps past the end of the pattern.
+// NOTE(review): the capacity of the target list is not known here; the
+// caller's buffers must be large enough for the expanded range.
+static
+int compile_range(const char           * const range,
+                        compiler_state * const    cs) {
+	assert((range[0] == '[') && "Not a range.");
+
+	const char * s;
+	if (range[1] == '^') {
+		cs->flags |= IS_NEGATIVE;
+		s = range + 2;
+	} else {
+		s = range + 1;
+	}
+
+	char * target_list = (cs->flags & IS_NEGATIVE) ? cs->blacklist : cs->whitelist;
+
+	for (; (*s != ']') && (*s != '\0'); s++) {
+		const char c = *s;
+		if (c == '\\') {
+			s += 1;
+			// Called outside of assert() so the escape is still compiled
+			// when NDEBUG removes the assertion.
+			const int is_known_escape = compile_escape(*s, cs);
+			assert(is_known_escape && "Unknown escape.");
+			(void)is_known_escape;
+			if (*s == '\0') {
+				break;
+			}
+		} else if ((s[1] == '-') && (s[2] != ']') && (s[2] != '\0')) {
+			const char end = s[2];
+			assert((c < end) && "Endless range.");
+			// int counter: a char counter could wrap when end is CHAR_MAX
+			for (int cc = c; cc <= end; cc++) {
+				const char member = (char)cc;
+				strncat(target_list, &member, 1);
+			}
+			s += 2;
+		} else {
+			strncat(target_list,   &c, 1);
+		}
+	}
+
+	assert((*s == ']') && "Unclosed range.");
+	if (*s != ']') {
+		return (int)(s - range);
+	}
+
+	return (int)((s - range) + 1);
+}
+
+// Append to `filtered` every character of `blacklist` that does not occur
+// anywhere in `whitelist`. `filtered` must already hold a NUL-terminated
+// string and have room for the appended characters.
+// Every blacklist character is checked against the WHOLE whitelist; the
+// lookup does not carry a cursor over from the previous character.
+static
+void filter_blacklist(const char * whitelist,
+                      const char * blacklist,
+                            char *  filtered) {
+	for (; *blacklist != '\0'; blacklist++) {
+		// *blacklist is never '\0' here, so strchr cannot match the terminator
+		if (strchr(whitelist, *blacklist) == NULL) {
+			strncat(filtered, blacklist, 1);
+		}
+	}
+}
+
+// Compile `pattern` into a newly allocated regex_t; release it with
+// regex_free(). Returns NULL when the regex object or the copy of the
+// pattern cannot be allocated.
+//
+// The pattern is processed one element at a time. Each round first
+// translates the element (literal, '.', escape or [...] range) into a
+// whitelist / blacklist of characters and then, depending on the character
+// that follows (hologram, quantifier or anything else), hooks those lists
+// into the delta and catch tables. States 0 and 1 are the two entry states:
+// regex_match() starts in 0 at the start of a string and in 1 elsewhere.
+regex_t * regex_compile(const char * const pattern) {
+	// Capacity of the character lists. The largest single class (\p) needs
+	// 64 characters plus the terminator, which did not fit into 64 bytes.
+	// NOTE(review): ranges are appended without a bounds check, so a very
+	// long [...] expression can still exceed this capacity.
+	enum { JEGER_CHAR_LIST_SIZE = 256 };
+
+	regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
+	if (regex == NULL) {
+		return NULL;
+	}
+	regex->str = strdup(pattern);
+	if (regex->str == NULL) {
+		free(regex);
+		return NULL;
+	}
+	vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
+	vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
+
+	char whitelist[JEGER_CHAR_LIST_SIZE];
+	char blacklist[JEGER_CHAR_LIST_SIZE];
+
+	compiler_state cs = {
+		.flags     = IS_AT_THE_BEGINNING,
+		.state     = JEGER_INIT_STATE,
+		.whitelist = whitelist,
+		.blacklist = blacklist,
+	};
+
+	for (const char * s = pattern; *s != '\00';) {
+		assert(!is_quantifier(*s) && "Pattern starts with quantifier.");
+		// Reset the compiler
+		whitelist[0] = '\0';
+		blacklist[0] = '\0';
+		cs.flags    &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
+		cs.width     = 1;
+
+		// Translate char
+		switch (*s) {
+			case '^': {
+				;
+			} break;
+			case '.': {
+				compile_dot(&cs);
+				s += 1;
+			} break;
+			case '\\': {
+				s += 1;
+				if (compile_escape(*s, &cs)) {
+					s += 1;
+				} else if (is_hologram_escape(*s)) {
+					;
+				} else {
+					// A bare string literal is always true; negate it so the
+					// assertion actually fires.
+					assert(!"Unknown escape.");
+				}
+			} break;
+			case '[': {
+				s += compile_range(s, &cs);
+			} break;
+			default: { // Literal
+				whitelist[0] =   *s;
+				whitelist[1] = '\0';
+				s += 1;
+			} break;
+		}
+
+		// Compile char
+		switch (*s) {
+			// holograms
+			case '^': {
+				whitelist[0] = '\n';
+				whitelist[1] = '\0';
+				HOOK_ALL(0, whitelist, 0, &cs, regex);
+				if (cs.flags & IS_AT_THE_BEGINNING) {
+					cs.flags |= FORCE_START_OF_STRING;
+				} else {
+					cs.flags |= INCREMENT_STATE;
+				}
+				s += 1;
+			} break;
+			case '<': {
+				cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
+				if (cs.flags & IS_AT_THE_BEGINNING) {
+					ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
+				}
+				strcat(blacklist, JEGER_CHAR_very_word_chars);
+				OFFSHOOT(0, 0, 1, 0, &cs, regex);
+				s += 1;
+			} break;
+			case '>': {
+				cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
+				strcat(blacklist, JEGER_CHAR_very_word_chars);
+				OFFSHOOT(0, 1, 0, 0, &cs, regex);
+				s += 1;
+			} break;
+			// quantifiers
+			case '=':
+			case '?': {
+				HOOK_ALL(0, whitelist, +1, &cs, regex);
+				if ((cs.flags & DO_CATCH)
+				||  (cs.flags & IS_NEGATIVE)) {
+					OFFSHOOT(0, +1, 1, 1, &cs, regex);
+				}
+				s += 1;
+			} break;
+			case '*': {
+				HOOK_ALL(0, whitelist,  0, &cs, regex);
+				if ((cs.flags & DO_CATCH)
+				||  (cs.flags & IS_NEGATIVE)) {
+					OFFSHOOT(0, 0, 1, 1, &cs, regex);
+				}
+				s += 1;
+			} break;
+			case '+': {
+				cs.flags |= INCREMENT_STATE;
+				HOOK_ALL(0, whitelist, +1, &cs, regex);
+				if ((cs.flags & DO_CATCH)
+				||  (cs.flags & IS_NEGATIVE)) {
+					OFFSHOOT(0, +1, 1, 1, &cs, regex);
+				}
+				HOOK_ALL(+1, whitelist, +1, &cs, regex);
+				if ((cs.flags & DO_CATCH)
+				||  (cs.flags & IS_NEGATIVE)) {
+					OFFSHOOT(+1, +1, 1, 1, &cs, regex);
+				}
+				s += 1;
+			} break;
+			default: { // Literal
+				cs.flags |= INCREMENT_STATE;
+				HOOK_ALL(0, whitelist, +1, &cs, regex);
+				if ((cs.flags & DO_CATCH)
+				||  (cs.flags & IS_NEGATIVE)) {
+					OFFSHOOT(0, +1, 1, 1, &cs, regex);
+				}
+			} break;
+		}
+
+		// Compile blacklist
+		if (*blacklist) {
+			char filtered_blacklist[JEGER_CHAR_LIST_SIZE];
+			filtered_blacklist[0] = '\0';
+			filter_blacklist(whitelist, blacklist, filtered_blacklist);
+			HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs, regex);
+		}
+
+		if (cs.flags & INCREMENT_STATE) {
+			++cs.state;
+		}
+
+		cs.flags &= (~IS_AT_THE_BEGINNING);
+	}
+
+	// Init state hookups
+	ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex);
+	if (cs.flags & FORCE_START_OF_STRING) {
+		ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex);
+	} else {
+		ABSOLUTE_OFFSHOOT(1,    JEGER_INIT_STATE, 0, 0, regex);
+	}
+
+	regex->accepting_state = cs.state;
+
+	return regex;
+}
+
+// Release a regex created by regex_compile(), including every delta and
+// offshoot the compiler allocated for it. Passing NULL is a no-op.
+// Always returns 0.
+// NOTE(review): assumes vector_free() releases only the vector's own storage
+// and not the objects its stored pointers refer to — confirm against vector.h.
+int regex_free(regex_t * const regex) {
+	if (regex == NULL) {
+		return 0;
+	}
+
+	// The tables hold pointers to individually malloc'd records
+	for (size_t i = 0; i < regex->delta_table.element_count; i++) {
+		free(*(delta_t**)vector_get(&regex->delta_table, i));
+	}
+	for (size_t i = 0; i < regex->catch_table.element_count; i++) {
+		free(*(offshoot_t**)vector_get(&regex->catch_table, i));
+	}
+
+	free(regex->str);
+	vector_free(&regex->delta_table);
+	vector_free(&regex->catch_table);
+	free(regex);
+	return 0;
+}
+
+
+
+// -----------------
+// ### Searching ###
+// -----------------
+// Return the first catch-table entry whose `in` equals *state, or NULL when
+// the state has no entry.
+static
+const offshoot_t * catch_table_lookup(const regex_t * const regex,
+                                      const int     * const state) {
+	const size_t entry_count = regex->catch_table.element_count;
+	size_t index = 0;
+	while (index < entry_count) {
+		const offshoot_t * const candidate = *(offshoot_t**)vector_get(&regex->catch_table, index);
+		if (candidate->in == *state) {
+			return candidate;
+		}
+		++index;
+	}
+	return NULL;
+}
+
+static // recursive matcher: true when `string` can be consumed from `state` up to regex->accepting_state
+bool regex_assert(const regex_t * const         regex, // compiled automaton (delta and catch tables)
+                  const char    * const        string, // input to match, starting at its first character
+                        int                     state, // state to start in; changed locally when a catch entry is followed
+                        match_t * const         match) { // receives position (if still -1) and accumulated width
+	if (state == HALT_AND_CATCH_FIRE) { // a transition into the halt state rejects this branch
+		return false;
+	}
+
+	bool last_stand = false; // becomes true once the end of the input has been reached
+	bool was_found; // true when some delta of `state` matched the current character
+
+	const char * s = string;
+	LOOP: {
+		was_found = false;
+		if (*s == '\0') {
+			last_stand = true;
+			goto PERFORM_CATCH_LOOKUP; // no character left to feed the delta table
+		}
+		// Jump search for the correct state
+		// NOTE(review): this and the linear scan below appear to rely on delta_table being ordered by `in` — confirm
+		const int jump = 10;
+		size_t i = jump;
+		while (i < regex->delta_table.element_count) {
+			const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
+			if (delta->in >= state) {
+				break;
+			}
+			i += jump;
+		}
+		i -= jump; // step back one stride so the scan starts before the first candidate
+		// Linear search finish up
+		for (; i < regex->delta_table.element_count; i++) {
+			const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
+
+			if (delta->in > state) {
+				break;
+			}
+
+			if ((delta->in == state) 
+			&&  (delta->input == *s)) {
+				was_found = true; // stays true even if the recursive attempt below fails
+				const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
+				if(r){
+					if (match->position == -1) { // only the first frame to return while position is still unset records it
+						match->position = (s - string);
+					}
+					match->width += delta->match_width; // widths are summed while the recursion unwinds
+					return r;
+				}
+			}
+		}
+	}
+
+	PERFORM_CATCH_LOOKUP: {
+		if (!was_found) { // the catch table is consulted only when no delta matched the character
+			const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
+			if (my_catch && (!my_catch->pattern_width || !last_stand)) { // at end of input only non-consuming entries may be taken
+				state = my_catch->to;
+				s += my_catch->pattern_width;
+				match->width += my_catch->match_width; // added right away; not undone if the branch fails later
+				goto LOOP;
+			}
+		}
+	}
+
+	return (state == regex->accepting_state); // nothing left to follow: succeed only in the accepting state
+}
+
+// Collect all matches of `regex` in `string`.
+// Returns a malloc'd array of match_t that the caller must free(); the array
+// is terminated by a sentinel entry with position == -1 and width == -1.
+// A NULL regex yields an array holding only the sentinel. Returns NULL when
+// the result array cannot be allocated.
+// `is_start_of_string` selects entry state 0 for the first attempt (at
+// `string` itself); every other attempt starts in entry state 1.
+match_t * regex_match(const regex_t * const              regex,
+                      const char    * const             string,
+                      const bool            is_start_of_string) {
+
+	vector_t matches;
+	vector_init(&matches, sizeof(match_t), 0);
+
+	// Scratch record: vector_push() copies the element, so a single stack
+	// instance replaces the per-match heap allocations that were never freed.
+	match_t match;
+
+	/* Non-existent regex does not match anything.
+	 * Not to be confused with an empty regex.
+	 */
+	if (regex != NULL) {
+		// Find all matches
+		const char * s = string;
+		do {
+			const int initial_state = (int)(!(is_start_of_string && (s == string)));
+
+			match = (match_t){
+				.position = -1,
+				.width    =  0,
+			};
+
+			int advance = 1;
+			if (regex_assert(regex, s, initial_state, &match)) {
+				match.position = (int)(s - string);
+
+				vector_push(&matches, &match);
+
+				if (match.width > 0) {
+					advance = match.width;
+				}
+			}
+
+			// Step forward one character at a time so the cursor can never
+			// move past the terminator (empty input, over-long width).
+			while ((advance > 0) && (*s != '\0')) {
+				++s;
+				--advance;
+			}
+		} while (*s != '\0');
+	}
+
+	// Insert sentinel
+	match = (match_t){
+		.position = -1,
+		.width    = -1,
+	};
+	vector_push(&matches, &match);
+
+	// Hide internal vector usage
+	const size_t data_size = matches.element_size * matches.element_count;
+	match_t * r = (match_t *)malloc(data_size);
+	if (r != NULL) {
+		memcpy(r, matches.data, data_size);
+	}
+	vector_free(&matches);
+
+	return r;
+}
+
+// Report whether `regex` matches anywhere in `string`. Runs regex_match()
+// with the start-of-string flag set and checks whether the first entry of
+// the result is something other than the sentinel.
+bool regex_search(const regex_t * const  regex,
+                  const char    * const string) {
+
+	match_t * const matches = regex_match(regex, string, true);
+	const bool found = !(matches->position == -1);
+	free(matches);
+
+	return found;
+}
--- a/source/regex.c
+++ b/source/regex.c
@ -1,659 +0,0 @@
-/* regex.c
- * Copyright 2023 Anon Anonson, Ognjen 'xolatile' Milan Robovic, Emil Williams
- * SPDX Identifier: GPL-3.0-only / NO WARRANTY / NO GUARANTEE */
-
-#include "regex.h"
-
-#include <assert.h>
-#include <string.h>
-#include <limits.h>
-#include <stdlib.h>
-
-// ------------------
-// ### Char tests ###
-// ------------------
-static bool is_quantifier(const char c) {
-	for (const char * s = "+*?="; *s != '\00'; s++) {
-		if (*s == c) {
-			return true;
-		}
-	}
-	return false;
-}
-
-bool is_magic(const char c) {
-	if (is_quantifier(c)) {
-		return true;
-	}
-	for (const char * s = "\\[].^"; *s != '\00'; s++) {
-		if (*s == c) {
-			return true;
-		}
-	}
-	return false;
-}
-
-// ----------------------
-// ### Internal Types ###
-// ----------------------
-typedef struct {
-	int in;
-	char input;
-	int to;
-	int width;
-} delta_t;
-
-typedef struct {
-	int in;
-	int to;
-	int width;
-} offshoot_t;
-
-typedef struct {
-	// XXX:
-	//  These should share a mask
-	//  Not even sure why they are pointers to begin with
-	bool    * do_catch;
-	bool    * is_negative;
-	bool      is_at_the_beginning;
-	bool      do_skip;
-// these might be obsolite but im leaving them for now
-	bool    * do_loop_hook;
-	bool    * do_follow_hook;
-	bool    * do_loop_shoot;
-	bool    * do_follow_shoot;
-// ---
-	int     * state;
-	int     * width;
-	char    * whitelist;
-	char    * blacklist;
-	regex_t * regex;
-} compiler_state;
-
-
-
-// ----------------------------------
-// ### Regex creation/destruction ###
-// ----------------------------------
-#define HALT_AND_CATCH_FIRE INT_MIN
-
-static void HOOK_ALL(      int              from,
-                     const char * const      str,
-                           int                to,
-                           compiler_state *   cs) {
-
-	int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to);
-
-
-	for (const char * s = str; *s != '\0'; s++) {
-		delta_t * delta = malloc(sizeof(delta_t));
-		delta->in    = *cs->state + from;
-		delta->input = *s;
-		delta->to    = hook_to;
-		delta->width = *cs->width;
-		vector_push(&cs->regex->delta_table,
-		            &delta);
-	}
-}
-
-static void ABSOLUTE_OFFSHOOT(int             from,
-                              int               to,
-                              int            width,
-                              compiler_state *  cs) {
-	offshoot_t * offshoot = malloc(sizeof(offshoot_t));
-	offshoot->in    = from; 
-	offshoot->to    = to;
-	offshoot->width = width;
-	vector_push(&cs->regex->catch_table,
-	            &offshoot);
-}
-
-static void OFFSHOOT(int             from,
-                     int               to,
-                     int            width,
-                     compiler_state *  cs) {
-	ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs);
-}
-
-static int escape_1_to_1(const char c, compiler_state * cs) {
-	char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
-	switch (c) {
-		case 't': {
-			strcat(target_list, "\t");
-		} return 1;
-		case 'n': {
-			strcat(target_list, "\n");
-		} return 1;
-		case 'r': {
-			strcat(target_list, "\r");
-		} return 1;
-		case 'b': {
-			strcat(target_list, "\b");
-		} return 1;
-		case '[': {
-			strcat(target_list, "[");
-		} return 1;
-		case ']': {
-			strcat(target_list, "]");
-		} return 1;
-		case '.': {
-			strcat(target_list, ".");
-		} return 1;
-		case '^': {
-			strcat(target_list, "^");
-		} return 1;
-		case '=': {
-			strcat(target_list, "=");
-		} return 1;
-		case '?': {
-			strcat(target_list, "?");
-		} return 1;
-		case '+': {
-			strcat(target_list, "+");
-		} return 1;
-		case '*': {
-			strcat(target_list, "*");
-		} return 1;
-		case '\\': {
-			strcat(target_list, "\\");
-		} return 1;
-	}
-
-	return 0;
-}
-
-static int escape_1_to_N(const char c, compiler_state * cs) {
-	char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
-	switch(c) {
-		case 'i': {
-			const char identifier_chars[] = "@0123456789_"
-			                                "\300\301\302\303\304"
-			                                "\305\306\307\310\311"
-			                                "\312\313\314\315\316"
-			                                "\317\320\321\322\323"
-			                                "\324\325\326\327\330"
-			                                "\331\332\333\334\335"
-			                                "\336\337";
-			strcpy(target_list, identifier_chars);
-			return sizeof(identifier_chars)-1;
-		};
-		case 'I': {
-			const char identifier_chars[] = "@_"
-			                                "\300\301\302\303\304"
-			                                "\305\306\307\310\311"
-			                                "\312\313\314\315\316"
-			                                "\317\320\321\322\323"
-			                                "\324\325\326\327\330"
-			                                "\331\332\333\334\335"
-			                                "\336\337";
-			strcpy(target_list, identifier_chars);
-			return sizeof(identifier_chars)-1;
-		};
-		case 'k': {
-			const char keyword_chars[] = "@0123456789_"
-			                             "\300\301\302\303\304"
-			                             "\305\306\307\310\311"
-			                             "\312\313\314\315\316"
-			                             "\317\320\321\322\323"
-			                             "\324\325\326\327\330"
-			                             "\331\332\333\334\335"
-			                             "\336\337";
-			strcpy(target_list, keyword_chars);
-			return sizeof(keyword_chars)-1;
-		};
-		case 'K': {
-			const char keyword_chars[] = "@_"
-			                             "\300\301\302\303\304"
-			                             "\305\306\307\310\311"
-			                             "\312\313\314\315\316"
-			                             "\317\320\321\322\323"
-			                             "\324\325\326\327\330"
-			                             "\331\332\333\334\335"
-			                             "\336\337";
-			strcpy(target_list, keyword_chars);
-			return sizeof(keyword_chars)-1;
-		};
-		case 'f': {
-			const char filename_chars[] = "@0123456789/.-_+,#$%~=";
-			strcpy(target_list, filename_chars);
-			return sizeof(filename_chars)-1;
-		};
-		case 'F': {
-			const char filename_chars[] = "@/.-_+,#$%~=";
-			strcpy(target_list, filename_chars);
-			return sizeof(filename_chars)-1;
-		};
-		case 'p': {
-			const char printable_chars[] = "@"
-			                               "\241\242\243\244\245"
-			                               "\246\247\250\251\252"
-			                               "\253\254\255\256\257"
-			                               "\260\261\262\263\264"
-			                               "\265\266\267\270\271"
-			                               "\272\273\274\275\276"
-			                               "\277"
-			                               "\300\301\302\303\304"
-			                               "\305\306\307\310\311"
-			                               "\312\313\314\315\316"
-			                               "\317\320\321\322\323"
-			                               "\324\325\326\327\330"
-			                               "\331\332\333\334\335"
-			                               "\336\337";
-			strcpy(target_list, printable_chars);
-			return sizeof(printable_chars)-1;
-		};
-		case 'P': {
-			const char printable_chars[] = "@"
-			                               "\241\242\243\244\245"
-			                               "\246\247\250\251\252"
-			                               "\253\254\255\256\257"
-			                               "\260\261\262\263\264"
-			                               "\265\266\267\270\271"
-			                               "\272\273\274\275\276"
-			                               "\277"
-			                               "\300\301\302\303\304"
-			                               "\305\306\307\310\311"
-			                               "\312\313\314\315\316"
-			                               "\317\320\321\322\323"
-			                               "\324\325\326\327\330"
-			                               "\331\332\333\334\335"
-			                               "\336\337";
-			strcpy(target_list, printable_chars);
-			return sizeof(printable_chars)-1;
-		};
-		case 's': {
-			const char whitespace_chars[] = " \t\v\n";
-			strcpy(target_list, whitespace_chars);
-			return sizeof(whitespace_chars)-1;
-		};
-		case 'd': {
-			const char digit_chars[] = "0123456789";
-			strcpy(target_list, digit_chars);
-			return sizeof(digit_chars)-1;
-		};
-		case 'x': {
-			const char hex_chars[] = "0123456789"
-			                         "abcdef"
-			                         "ABCDEF";
-			strcpy(target_list, hex_chars);
-			return sizeof(hex_chars)-1;
-		};
-		case 'o': {
-			const char oct_chars[] = "01234567";
-			strcpy(target_list, oct_chars);
-			return sizeof(oct_chars)-1;
-		};
-		case 'w': {
-			const char word_chars[] = "0123456789"
-			                          "abcdefghijklmnopqrstuwxyz"
-			                          "ABCDEFGHIJKLMNOPQRSTUWXYZ"
-			                          "_";
-			strcpy(target_list, word_chars);
-			return sizeof(word_chars)-1;
-		};
-		case 'h': {
-			const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
-			                               "ABCDEFGHIJKLMNOPQRSTUWXYZ"
-			                               "_";
-			strcpy(target_list, very_word_chars);
-			return sizeof(very_word_chars)-1;
-		};
-		case 'a': {
-			const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
-			                           "ABCDEFGHIJKLMNOPQRSTUWXYZ";
-			strcpy(target_list, alpha_chars);
-			return sizeof(alpha_chars)-1;
-		};
-		case 'l': {
-			const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
-			strcpy(target_list, lower_alpha_chars);
-			return sizeof(lower_alpha_chars)-1;
-		};
-		case 'u': {
-			const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
-			strcpy(target_list, upper_alpha_chars);
-			return sizeof(upper_alpha_chars)-1;
-		};
-	}
-
-	return 0;
-}
-
-// Handle escapes that expand to a negated character class.
-//
-// Only `\D` is recognised: the decimal digits are copied into the blacklist
-// of `cs` (overwriting its previous content) and the negative flag is set.
-//
-// Returns the number of characters copied, or 0 when `c` is not such an
-// escape.
-static int escape_to_negative(const char              c,
-	                                compiler_state * cs) {
-	const char digit_chars[] = "0123456789";
-
-	if (c != 'D') {
-		return 0;
-	}
-
-	strcpy(cs->blacklist, digit_chars);
-	*cs->is_negative = true;
-
-	return sizeof(digit_chars)-1;
-}
-
-// Compile the word-boundary escapes `\<` and `\>`.
-//
-// Both escapes mark the current element as negative and append every word
-// character (ASCII letters and underscore) to the blacklist of `cs`.
-// `\<` additionally emits its own hooks/offshoots through the HOOK_ALL and
-// OFFSHOOT macros and, when it is the first element of the pattern, asks
-// the main compile loop to skip the element (`do_skip`).
-//
-// Returns non-zero when `c` was handled here, 0 otherwise.
-static int escape_hologram(const char c, compiler_state * cs) {
-	// The full alphabet; an earlier revision of this literal was missing
-	// 'v'/'V', which made those letters count as non-word characters.
-	const char very_word_chars[] = "abcdefghijklmnopqrstuvwxyz"
-	                               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-	                               "_";
-
-	switch (c) {
-		case '<': {
-			if (cs->is_at_the_beginning) {
-				ABSOLUTE_OFFSHOOT(0, 2, 0, cs);
-				cs->do_skip = true;
-			}
-			// Effectless currently; should be used to trigger the blacklist
-			// handling in the main compile loop instead of the lines below.
-			*cs->is_negative = true;
-			strcat(cs->blacklist, very_word_chars);
-			HOOK_ALL(0, cs->blacklist, HALT_AND_CATCH_FIRE, cs);
-			OFFSHOOT(0, 0, 1, cs);
-
-			return sizeof(very_word_chars)-1;
-		};
-		case '>': {
-			*cs->is_negative = true;
-			strcat(cs->blacklist, very_word_chars);
-
-			return 1;
-		}
-	}
-	return 0;
-}
-
-// Compile the `.` metacharacter: raise the `do_catch` flag of the compiler
-// state. Always returns 1.
-static int compile_dot(compiler_state * cs) {
-	*(cs->do_catch) = true;
-
-	return 1;
-}
-
-// Compile the character following a backslash.
-//
-// The escape handlers are tried in a fixed order and the first one that
-// reports success wins; later handlers are not called.
-//
-// Returns 1 if some handler recognised `c`, 0 if the escape is unknown.
-static int compile_escape(const char                    c,
-                                compiler_state *       cs) {
-	if (escape_1_to_1(c, cs)) {
-		return 1;
-	}
-	if (escape_1_to_N(c, cs)) {
-		return 1;
-	}
-	if (escape_to_negative(c, cs)) {
-		return 1;
-	}
-	if (escape_hologram(c, cs)) {
-		return 1;
-	}
-
-	return 0;
-}
-
-// Compile a bracket expression such as `[abc]`, `[a-z]` or `[^0-9]`.
-//
-// `range` must point at the opening '['. Members are appended to the
-// whitelist of `cs`, or to its blacklist when the expression starts with
-// '^' (which also sets the negative flag). Backslash escapes inside the
-// brackets are delegated to compile_escape().
-//
-// Returns the number of pattern characters consumed, including both
-// brackets. Malformed input (unclosed range, reversed `x-y`, unknown
-// escape) trips an assert.
-static int compile_range(const char           * const range,
-                               compiler_state *          cs) {
-	assert((range[0] == '[') && "Not a range.");
-
-	const char * s;
-	if (range[1] == '^') {
-		*cs->is_negative = true;
-		s = range + 2;
-	} else {
-		s = range + 1;
-	}
-
-	char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
-
-	for (; *s != ']'; s++) {
-		assert((*s != '\0') && "Unclosed range.");
-		char c = *s;
-		if (c == '\\') {
-			s += 1;
-			// The call must stay outside of assert(): with NDEBUG defined the
-			// assert expression is not evaluated and the escape would never
-			// be compiled.
-			const int is_known_escape = compile_escape(*s, cs);
-			assert(is_known_escape && "Unknown escape.");
-			(void)is_known_escape;
-		} else if (*(s+1) == '-') {
-			char end = *(s+2);
-			assert((c < end) && "Endless range.");
-			// Count in int so that a range ending at CHAR_MAX cannot overflow
-			// the loop variable.
-			for (int cc = c; cc <= end; cc++) {
-				const char member = (char)cc;
-				strncat(target_list, &member, 1);
-			}
-			s += 2;
-		} else {
-			strncat(target_list,   &c, 1);
-		}
-	}
-
-	return ((s - range) + 1);
-}
-
-// Append to `filtered` every character of `blacklist` that does not occur
-// in `whitelist`, preserving the order of `blacklist`.
-//
-// `filtered` must already hold a NUL-terminated string and have room for
-// the appended characters. Each blacklist character is checked against the
-// whole whitelist; the previous implementation advanced the whitelist
-// pointer without rewinding it, so only the first character was checked
-// against the complete list.
-void filter_blacklist(const char * whitelist,
-                      const char * blacklist,
-                            char *  filtered) {
-	for (; *blacklist != '\0'; blacklist++) {
-		if (strchr(whitelist, *blacklist) == NULL) {
-			strncat(filtered, blacklist, 1);
-		}
-	}
-}
-
-// Compile `pattern` into a newly allocated regex_t.
-//
-// The pattern is processed one element at a time: the element (literal,
-// `.`, escape, bracket expression or `^`) is first translated into a
-// whitelist/blacklist and a set of flags, then the quantifier that follows
-// it (`=`/`?`, `*`, `+` or none) decides which transitions are emitted
-// through the HOOK_ALL / OFFSHOOT macros.
-//
-// Returns the compiled regex, which the caller releases with regex_free(),
-// or NULL if memory for it could not be allocated. Malformed patterns trip
-// an assert.
-regex_t * regex_compile(const char * const pattern) {
-	regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
-	if (regex == NULL) {
-		return NULL;
-	}
-	regex->str = strdup(pattern);
-	if (regex->str == NULL) {
-		free(regex);
-		return NULL;
-	}
-	vector_init(&regex->delta_table, sizeof(delta_t*), 0UL);
-	vector_init(&regex->catch_table, sizeof(offshoot_t*), 0UL);
-
-	// States 0 and 1 are the entry states selected by regex_match(), so the
-	// states generated for the pattern start at 2.
-	int state = 2;
-
-	// Per-element scratch variables. Some of them are shared with the
-	// helper functions through pointers stored in `cs`.
-	// NOTE(review): the do_*_hook / do_*_shoot flags are only written in the
-	// code visible here; confirm whether the macros read them.
-	bool do_catch;
-	bool is_negative;
-	bool do_loop_hook;
-	bool do_follow_hook;
-	bool do_loop_shoot;
-	bool do_follow_shoot;
-	int width;
-	char whitelist[64];
-	char blacklist[64];
-
-	compiler_state cs = {
-		.do_catch            = &do_catch,
-		.is_negative         = &is_negative,
-		.is_at_the_beginning = true,
-		.do_skip             = false,
-		.state               = &state,
-		.width               = &width,
-		.whitelist           = whitelist,
-		.blacklist           = blacklist,
-		.regex               = regex,
-	};
-
-	for (const char * s = pattern; *s != '\00';) {
-		// Reset the compiler
-		assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
-		whitelist[0]    =  '\0';
-		blacklist[0]    =  '\0';
-		do_catch        = false;
-		is_negative     = false;
-		cs.do_skip      = false;
-		/**/
-		do_loop_hook    = false;
-		do_follow_hook  = false;
-		do_loop_shoot   = false;
-		do_follow_shoot = false;
-		/**/
-		width        = 1;
-
-		// Translate char
-		switch (*s) {
-			case '^': {
-				if (cs.is_at_the_beginning) {
-					ABSOLUTE_OFFSHOOT(0,                   2, 0, &cs);
-					ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs);
-				}
-				whitelist[0] = '\n';
-				whitelist[1] = '\0';
-				HOOK_ALL(0, whitelist, 0, &cs);
-				if (s != pattern) {
-					state += 1;
-				}
-				cs.do_skip = true;
-			} break;
-			case '.': {
-				compile_dot(&cs);
-			} break;
-			case '\\': {
-				s += 1;
-				// The call must stay outside of assert(): with NDEBUG defined
-				// the assert expression is not evaluated and the escape would
-				// never be compiled.
-				const int is_known_escape = compile_escape(*s, &cs);
-				assert(is_known_escape && "Unknown escape.");
-				(void)is_known_escape;
-			} break;
-			case '[': {
-				// compile_range() reports the full length of the bracket
-				// expression; the common `s += 1` below accounts for the
-				// last character.
-				s += compile_range(s, &cs) - 1;
-			} break;
-			default: {
-				whitelist[0] =   *s;
-				whitelist[1] = '\0';
-			} break;
-		}
-
-		s += 1;
-
-		if (cs.do_skip) {
-			goto long_continue;
-		}
-
-		// Compile with quantifier
-		switch (*s) {
-			case '=':
-			case '?': {
-				do_loop_hook = true;
-				HOOK_ALL(0, whitelist, +1, &cs);
-				if (do_catch || is_negative) {
-					OFFSHOOT(0, +1, 1, &cs);
-				}
-				s += 1;
-			} break;
-			case '*': {
-				HOOK_ALL(0, whitelist,  0, &cs);
-				if (do_catch) {
-					OFFSHOOT(0, +1, 1, &cs);
-				} else if (is_negative) {
-					OFFSHOOT(0,  0, 1, &cs);
-				}
-				s += 1;
-			} break;
-			case '+': {
-				HOOK_ALL(0, whitelist, +1, &cs);
-				if (do_catch || is_negative) {
-					OFFSHOOT(0, +1, 1, &cs);
-				}
-				state += 1;
-				HOOK_ALL(0, whitelist,  0, &cs);
-				if (do_catch || is_negative) {
-					OFFSHOOT(0, 0, 1, &cs);
-				}
-				s += 1;
-			} break;
-			default: { // Literal
-				HOOK_ALL(0, whitelist, +1, &cs);
-				if (do_catch || is_negative) {
-					OFFSHOOT(0, +1, 1, &cs);
-				}
-				state += 1;
-			} break;
-		}
-
-		// Compile blacklist
-		if (*blacklist) {
-			char filtered_blacklist[64];
-			filtered_blacklist[0] = '\0';
-			filter_blacklist(whitelist, blacklist, filtered_blacklist);
-			HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
-		}
-
-		// Single definition of the label (a label may only be defined once
-		// per function); skipped elements also end the "beginning" phase.
-		long_continue:
-		cs.is_at_the_beginning = false;
-	}
-
-	regex->accepting_state = state;
-
-	return regex;
-}
-
-// Release a regex created by regex_compile(): its pattern copy, both
-// transition tables and the object itself.
-//
-// Passing NULL is allowed and does nothing, mirroring free(NULL).
-// NOTE(review): the tables store pointers; whether vector_free() also
-// releases the pointed-to entries is not visible here and should be
-// confirmed.
-//
-// Always returns 0.
-int regex_free(regex_t * const regex) {
-	if (regex == NULL) {
-		return 0;
-	}
-
-	free(regex->str);
-	vector_free(&regex->delta_table);
-	vector_free(&regex->catch_table);
-	free(regex);
-	return 0;
-}
-
-
-
-// -----------------
-// ### Searching ###
-// -----------------
-// Look up the first catch-table entry whose source state equals `*state`.
-//
-// On a hit `*state` is replaced by the entry's target state and the entry's
-// width is returned. Without a hit `*state` is left untouched and
-// HALT_AND_CATCH_FIRE is returned.
-static int catch_(const regex_t * const regex,
-                         int     * const state) {
-	size_t index = 0;
-
-	while (index < regex->catch_table.element_count) {
-		const offshoot_t * const entry = *(offshoot_t**)vector_get(&regex->catch_table, index);
-		if (entry->in == *state) {
-			*state = entry->to;
-			return entry->width;
-		}
-		index += 1;
-	}
-
-	return HALT_AND_CATCH_FIRE;
-}
-
-// Recursive matcher: run the automaton of `regex` over `string`, starting
-// at `string_offset` in state `state`.
-//
-// For the current character every delta-table entry that matches both the
-// state and the character is followed recursively, and the first non-zero
-// result is returned. If no such branch succeeds the catch table is
-// consulted for the current state; a usable catch advances the position by
-// its width and the loop continues with the new state. Otherwise the
-// function returns `width` when the (possibly caught once more) state is
-// the accepting state, and false if it is not.
-//
-// `width` is incremented by one per followed delta and is the value handed
-// back on success.
-//
-// NOTE(review): when the position is already at the terminating NUL the
-// loop body never runs and false is returned regardless of `state`; confirm
-// that this is intended for matches that end exactly at the end of string.
-static int regex_assert(const regex_t * const         regex,
-                        const char    * const        string,
-                        const int             string_offset,
-                              int                     state,
-                              int                     width) { // XXX: I'm pretty sure this is actually redundant and the width should be calculated from string - s
-	for (const char * s = (string + string_offset); *s != '\00';) {
-		// XXX: this should be a jump search for the instate and then a linear
-		// delta
-		//int left  = 0;
-		//int right = regex->delta_table.element_count - 1;
-		//int i;
-		//while(left <= right) }
-		for (size_t i = 0; i < regex->delta_table.element_count; i++) {
-			//i = (left + right) / 2;
-			const delta_t * const delta = *(delta_t**)vector_get(&regex->delta_table, i);
-			if ((delta->in == state) 
-			&&  (delta->input == *s)) {
-				// Follow this transition; only a non-zero result ends the search,
-				// otherwise the remaining deltas are tried.
-				int r = regex_assert(regex, string, (s - string) + delta->width, delta->to, width + 1);
-				if(r){
-					return r;
-				}
-			}
-		}
-
-		// No delta led to a match: try the catch table. catch_() rewrites
-		// `state` on a hit.
-		const int catch_width = catch_(regex, &state);
-		if ((catch_width != HALT_AND_CATCH_FIRE)
-		&&  (state != HALT_AND_CATCH_FIRE)) {
-			s += catch_width;
-			continue;
-		}
-
-		// XXX: the extra catch might not be necessary if we were to compile to a simpler form
-		catch_(regex, &state);
-		return (state == regex->accepting_state) ? width : false;
-	}
-
-	return false;
-}
-
-// Match `regex` against `string`, starting at `string_offset`.
-//
-// `is_start_of_string` selects the entry state of the automaton: state 0
-// when the text begins a string, state 1 otherwise.
-//
-// Returns the value produced by regex_assert() (0 when nothing matched).
-// A NULL `regex` yields 0 and a NULL `string` yields 1.
-int regex_match(      regex_t *                    regex,
-                const char    * const             string,
-                const bool            is_start_of_string,
-                const int                  string_offset) {	// XXX: this parameter looks redundant and should be removed
-	if (regex == NULL) {
-		return 0;
-	}
-
-	if (string == NULL) {
-		return 1;
-	}
-
-	const int initial_state = is_start_of_string ? 0 : 1;
-	const int result = regex_assert(regex, string, string_offset, initial_state, 0);
-
-	return result;
-}
-
-// Convenience wrapper around regex_match(): match from offset 0, treating
-// the text as the start of a string, and reduce the result to a boolean.
-bool regex_search(      regex_t *        regex,
-                  const char    * const string) {
-	const int match_result = regex_match(regex, string, true, 0);
-
-	return match_result != 0;
-}
Author	SHA1	Message	Date
anon	8a3adb7862	pull in jeger	2023-09-18 22:43:57 +02:00
anon	65930238a0	emils suggestions	2023-09-18 22:43:28 +02:00