From 9b54a3f3e8630e87584dd431d5f0f4f3a6a37ecc Mon Sep 17 00:00:00 2001 From: anon Date: Sat, 23 Sep 2023 17:06:44 +0200 Subject: [PATCH] i deserve a blowjob --- include/jeger.h | 6 ++- source/hl.c | 4 +- source/jeger.c | 148 ++++++++++++++++++++++++++++++++++++++++---------------- 3 files changed, 112 insertions(+), 46 deletions(-) diff --git a/include/jeger.h b/include/jeger.h index 5c6d622..97efd03 100644 --- a/include/jeger.h +++ b/include/jeger.h @@ -15,7 +15,10 @@ typedef struct { } regex_t; typedef struct { - int position; + union { + int position; + const char * _pos_ptr; + }; int width; } match_t; @@ -25,5 +28,6 @@ extern bool regex_search(const regex_t * const regex, const char * const st extern match_t * regex_match(const regex_t * const regex, const char * const string, const bool start_of_string); extern bool is_magic(const char c); +extern bool is_sentinel(const match_t * const match); #endif diff --git a/source/hl.c b/source/hl.c index 96b8343..6932d2d 100644 --- a/source/hl.c +++ b/source/hl.c @@ -195,7 +195,7 @@ void render_string(const char * const string, token_t * t = *(token_t**)vector_get(&token_table, i); match_t * match = regex_match(t->syntax, string, true); - if (match->position == -1) { + if (is_sentinel(match)) { free(match); continue; } @@ -212,7 +212,7 @@ void render_string(const char * const string, max = &sentinel; for (int h = 0; h < rrs; h++) { result_t * const current_result = r + h; - for (int j = 0; current_result->m[j].position != -1; j++) { + for (int j = 0; !is_sentinel(&(current_result->m[j])); j++) { if (current_result->m[j].position == (s - string)) { if (current_result->m[j].width > max->m->width) { current_result->i = j; diff --git a/source/jeger.c b/source/jeger.c index 5074182..343f639 100644 --- a/source/jeger.c +++ b/source/jeger.c @@ -8,8 +8,13 @@ #include #include #include +#if DEBUG +# include +#endif -#define JEGER_INIT_STATE 2 +#define JEGER_SOS_STATE 0 +#define JEGER_NSOS_STATE 1 +#define JEGER_INIT_STATE 2 // ------------------ // ### Char tests ### @@ -40,6 +45,15 @@ bool is_magic(const char c) { ; } +// ------------------- +// ### Match tests ### +// ------------------- +bool is_sentinel(const match_t * const match) { + return (match->position == -1) + && (match->width == -1) + ; +} + // ----------------- // ### Char sets ### // ----------------- @@ -52,13 +66,13 @@ bool is_magic(const char c) { #define JEGER_CHAR_SET_lower_hex "abcdef" #define JEGER_CHAR_SET_upper_hex "ABCDEF" #define JEGER_CHAR_SET_oct_241_to_277 \ - "\241\242\243\244\245" \ - "\246\247\250\251\252" \ - "\253\254\255\256\257" \ - "\260\261\262\263\264" \ - "\265\266\267\270\271" \ - "\272\273\274\275\276" \ - "\277" + "\241\242\243\244\245" \ + "\246\247\250\251\252" \ + "\253\254\255\256\257" \ + "\260\261\262\263\264" \ + "\265\266\267\270\271" \ + "\272\273\274\275\276" \ + "\277" #define JEGER_CHAR_SET_oct_300_to_337 \ "\300\301\302\303\304" \ "\305\306\307\310\311" \ @@ -68,13 +82,13 @@ bool is_magic(const char c) { "\331\332\333\334\335" \ "\336\337" #define JEGER_CHAR_SET_file_extra "/.-_+,#$%~=" -#define JEGER_CHAR_SET_whitespace " \t\v\n" +#define JEGER_CHAR_SET_whitespace " " "\t\v\n" -static const char JEGER_CHAR_very_word_chars[] = - JEGER_CHAR_SET_underscore - JEGER_CHAR_SET_lower - JEGER_CHAR_SET_upper - ; +static const char JEGER_CHAR_symbol_chars[] = + JEGER_CHAR_SET_underscore + JEGER_CHAR_SET_lower + JEGER_CHAR_SET_upper + ; // ---------------------- // ### Internal Types ### @@ -95,17 +109,19 @@ typedef struct { } offshoot_t; enum { - DO_CATCH = 0x00000001 << 0, - IS_NEGATIVE = 0x00000001 << 1, - IS_AT_THE_BEGINNING = 0x00000001 << 2, - FORCE_START_OF_STRING = 0x00000001 << 3, - INCREMENT_STATE = 0x00000001 << 4, + DO_CATCH = 0x00000001 << 0, + IS_NEGATIVE = 0x00000001 << 1, + IS_AT_THE_BEGINNING = 0x00000001 << 2, + FORCE_START_OF_STRING = 0x00000001 << 3, + DO_FORBID_START_OF_STRING = 0x00000001 << 4, + INCREMENT_STATE = 0x00000001 << 5, }; typedef struct { int flags; int state; int width; + int width2; char * whitelist; char * blacklist; } compiler_state; @@ -132,7 +148,7 @@ void HOOK_ALL(const int from, .input = *s, .to = ASSERT_HALT(to), .pattern_width = cs->width, - .match_width = 1, + .match_width = cs->width2, }; vector_push(®ex->delta_table, &delta); @@ -318,9 +334,9 @@ int escape_1_to_N(const char c, return sizeof(word_chars)-1; }; case 'h': { - // #global JEGER_CHAR_very_word_chars - strcpy(target_list, JEGER_CHAR_very_word_chars); - return sizeof(JEGER_CHAR_very_word_chars)-1; + // #global JEGER_CHAR_symbol_chars + strcpy(target_list, JEGER_CHAR_symbol_chars); + return sizeof(JEGER_CHAR_symbol_chars)-1; }; case 'a': { const char alpha_chars[] = JEGER_CHAR_SET_lower @@ -346,7 +362,7 @@ int escape_1_to_N(const char c, static inline int escape_to_negative(const char c, - compiler_state * const cs) { + compiler_state * const cs) { switch (c) { case 'D': { const char digit_chars[] = JEGER_CHAR_SET_digits; @@ -488,6 +504,7 @@ regex_t * regex_compile(const char * const pattern) { blacklist[0] = '\0'; cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING); cs.width = 1; + cs.width2 = 1; // Translate char switch (*s) { @@ -503,7 +520,7 @@ regex_t * regex_compile(const char * const pattern) { if (compile_escape(*s, &cs)) { s += 1; } else if (is_hologram_escape(*s)) { - ; + s -= 1; } else { assert("Unknown escape."); } @@ -518,6 +535,12 @@ regex_t * regex_compile(const char * const pattern) { } break; } + /* Ew */ + if (*s == '\\' + && is_hologram_escape(*(s+1))) { + ++s; + } + // Compile char switch (*s) { // holograms @@ -533,18 +556,47 @@ regex_t * regex_compile(const char * const pattern) { s += 1; } break; case '<': { - cs.flags |= IS_NEGATIVE | INCREMENT_STATE; - if (cs.flags & IS_AT_THE_BEGINNING) { - ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex); + // XXX: make this legible + if (cs.flags & IS_AT_THE_BEGINNING + && !(cs.flags & DO_CATCH) + && !(cs.flags & IS_NEGATIVE) + && whitelist[0] == '\0') { + // --- + cs.flags |= INCREMENT_STATE; + cs.flags |= DO_FORBID_START_OF_STRING; + strcat(whitelist, JEGER_CHAR_symbol_chars); + // --- + ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex); + ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex); + HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex); + // --- + ++cs.state; + cs.width = 0; + cs.width2 = 0; + HOOK_ALL(0, whitelist, +1, &cs, regex); + cs.width = 1; + OFFSHOOT(0, +1, 1, 0, &cs, regex); + // --- + } else { + HOOK_ALL(0, whitelist, +1, &cs, regex); + if ((cs.flags & DO_CATCH) + || (cs.flags & IS_NEGATIVE)) { + OFFSHOOT(+1, +2, 1, 1, &cs, regex); + } else { + cs.flags |= INCREMENT_STATE; + } + OFFSHOOT(0, +1, 1, 0, &cs, regex); } - strcat(blacklist, JEGER_CHAR_very_word_chars); - OFFSHOOT(0, 0, 1, 0, &cs, regex); + cs.flags |= IS_NEGATIVE; + strcat(blacklist, JEGER_CHAR_symbol_chars); s += 1; } break; case '>': { + HOOK_ALL(0, whitelist, +1, &cs, regex); cs.flags |= IS_NEGATIVE | INCREMENT_STATE; - strcat(blacklist, JEGER_CHAR_very_word_chars); - OFFSHOOT(0, 1, 0, 0, &cs, regex); + strcat(blacklist, JEGER_CHAR_symbol_chars); + OFFSHOOT(+1, +2, 0, 0, &cs, regex); + ++cs.state; s += 1; } break; // quantifiers @@ -605,11 +657,13 @@ regex_t * regex_compile(const char * const pattern) { } // Init state hookups - ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex); + if (!(cs.flags & DO_FORBID_START_OF_STRING)) { + ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex); + } if (cs.flags & FORCE_START_OF_STRING) { - ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex); + ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex); } else { - ABSOLUTE_OFFSHOOT(1, JEGER_INIT_STATE, 0, 0, regex); + ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex); } regex->accepting_state = cs.state; @@ -682,14 +736,18 @@ bool regex_assert(const regex_t * const regex, if ((delta->in == state) && (delta->input == *s)) { + bool do_reset = false; was_found = true; + if (!match->_pos_ptr && delta->match_width) { + match->_pos_ptr = s; + do_reset = true; + } const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match); if(r){ - if (match->position == -1) { - match->position = (s - string); - } match->width += delta->match_width; return r; + } else if (do_reset) { + match->_pos_ptr = NULL; } } } @@ -729,17 +787,21 @@ match_t * regex_match(const regex_t * const regex, // Find all matches { const char * s = string; + int initial_state; do { - int initial_state; initial_state = (int)(!(is_start_of_string && (s == string))); *match = (match_t){ - .position = -1, - .width = 0, + ._pos_ptr = NULL, + .width = 0, }; if (regex_assert(regex, s, initial_state, match)) { - match->position = (s - string); + if (match->_pos_ptr) { + match->position = (match->_pos_ptr - string); + } else { + match->position = (s - string); + } vector_push(&matches, match); @@ -773,7 +835,7 @@ bool regex_search(const regex_t * const regex, const char * const string) { match_t * m = regex_match(regex, string, true); - const bool r = (m->position != -1); + const bool r = !is_sentinel(m); free(m); return r;