This commit is contained in:
anon 2023-09-09 12:10:07 +02:00
parent c7af36dbb0
commit 9e8eb4f2e2

View File

@ -1,12 +1,4 @@
/* XXX: #pragma GCC diagnostic ignored "-Wc++20-extensions"
* as it turns out returning a range of match objects is a
* high profile performance issue regarding regex, especially when highlighting.
* now as it stands we search an array of tokens for every position on a string.
* which sounds ok, until one realizes that searching from any position reveals a range,
* where (future) matches can or cannot be found. meaning we are computing the same thing
* repeatedly, practically resulting in a brute-forcing situation where instead of eliminating
* certain non-matches, we blindly hammer character by character.
*/
#include "regex.h" #include "regex.h"
@ -89,14 +81,14 @@ typedef struct {
int in; int in;
char input; char input;
int to; int to;
int width; int pattern_width;
int match_width; int match_width;
} delta_t; } delta_t;
/* One row of the catch table: a fallback transition that regex_assert()
 * consults when no delta matched the current input character.
 */
typedef struct {
	int in;             // state the offshoot originates from
	int to;             // state the matcher jumps to when the offshoot is taken
	int pattern_width;  // how far the input cursor is advanced on taking it
	int match_width;    // amount added to the width of the match being built
} offshoot_t;
@ -133,26 +125,31 @@ void HOOK_ALL(const int from,
regex_t * regex) { regex_t * regex) {
for (const char * s = str; *s != '\0'; s++) { for (const char * s = str; *s != '\0'; s++) {
delta_t * delta = (delta_t *)malloc(sizeof(delta_t)); delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
delta->in = cs->state + from; *delta = (delta_t){
delta->input = *s; .in = cs->state + from,
delta->to = ASSERT_HALT(to); .input = *s,
delta->width = cs->width; .to = ASSERT_HALT(to),
.pattern_width = cs->width,
.match_width = 1,
};
vector_push(&regex->delta_table, vector_push(&regex->delta_table,
&delta); &delta);
} }
} }
/* Append a new offshoot (fallback transition) to the regex's catch table.
 *
 * from        - state the offshoot leaves from
 * to          - state the offshoot leads to
 * width       - stored as the offshoot's pattern_width
 * match_width - stored as the offshoot's match_width
 * regex       - regex whose catch_table receives the entry
 *
 * The record is heap allocated and the address of the pointer is handed to
 * vector_push(), i.e. the table presumably keeps the pointer, not a copy of
 * the record. The malloc() cast is kept on purpose in case this file is also
 * built as C++.
 */
static
void ABSOLUTE_OFFSHOOT(const int from,
                       const int to,
                       const int width,
                       const int match_width,
                       regex_t * regex) {
	offshoot_t * node = (offshoot_t *)malloc(sizeof(*node));

	node->in            = from;
	node->to            = to;
	node->pattern_width = width;
	node->match_width   = match_width;

	vector_push(&regex->catch_table, &node);
}
@ -361,7 +358,7 @@ int escape_to_negative(const char c,
} }
/* Compile step for a dot token (going by the name): raises the DO_CATCH flag
 * on the compiler state and always reports success.
 */
static inline
int compile_dot(compiler_state * const cs) {
	cs->flags = cs->flags | DO_CATCH;
	return true;
}
@ -371,9 +368,9 @@ int compile_escape(const char c,
compiler_state * const cs) { compiler_state * const cs) {
return escape_1_to_1(c, cs) return escape_1_to_1(c, cs)
|| escape_1_to_N(c, cs) || escape_1_to_N(c, cs)
|| escape_to_negative(c, cs) || escape_to_negative(c, cs)
; ;
} }
static static
@ -441,7 +438,6 @@ regex_t * regex_compile(const char * const pattern) {
compiler_state cs = { compiler_state cs = {
.flags = IS_AT_THE_BEGINNING, .flags = IS_AT_THE_BEGINNING,
.state = JEGER_INIT_STATE, .state = JEGER_INIT_STATE,
.width = 0,
.whitelist = whitelist, .whitelist = whitelist,
.blacklist = blacklist, .blacklist = blacklist,
}; };
@ -451,7 +447,7 @@ regex_t * regex_compile(const char * const pattern) {
// Reset the compiler // Reset the compiler
whitelist[0] = '\0'; whitelist[0] = '\0';
blacklist[0] = '\0'; blacklist[0] = '\0';
cs.flags &= IS_AT_THE_BEGINNING; cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
cs.width = 1; cs.width = 1;
// Translate char // Translate char
@ -566,7 +562,7 @@ regex_t * regex_compile(const char * const pattern) {
++cs.state; ++cs.state;
} }
cs.flags &= !(IS_AT_THE_BEGINNING); cs.flags &= (~IS_AT_THE_BEGINNING);
} }
// Init state hookups // Init state hookups
@ -648,10 +644,9 @@ bool regex_assert(const regex_t * const regex,
if ((delta->in == state) if ((delta->in == state)
&& (delta->input == *s)) { && (delta->input == *s)) {
was_found = true; was_found = true;
const int r = regex_assert(regex, s + delta->width, delta->to, match); const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
if(r){ if(r){
if ((match->position != -1) if (match->position == -1) {
&& (delta->match_width)) {
match->position = (s - string); match->position = (s - string);
} }
match->width += delta->match_width; match->width += delta->match_width;
@ -664,9 +659,9 @@ bool regex_assert(const regex_t * const regex,
PERFORM_CATCH_LOOKUP: { PERFORM_CATCH_LOOKUP: {
if (!was_found) { if (!was_found) {
const offshoot_t * const my_catch = catch_table_lookup(regex, &state); const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
if (my_catch && (!my_catch->width || !last_stand)) { if (my_catch && (!my_catch->pattern_width || !last_stand)) {
state = my_catch->to; state = my_catch->to;
s += my_catch->width; s += my_catch->pattern_width;
match->width += my_catch->match_width; match->width += my_catch->match_width;
goto LOOP; goto LOOP;
} }
@ -679,30 +674,68 @@ bool regex_assert(const regex_t * const regex,
match_t * regex_match(const regex_t * const regex, match_t * regex_match(const regex_t * const regex,
const char * const string, const char * const string,
const bool is_start_of_string) { const bool is_start_of_string) {
if (regex == NULL) {
return NULL; vector_t matches;
} vector_init(&matches, sizeof(match_t), 0);
match_t * match = (match_t *)malloc(sizeof(match_t)); match_t * match = (match_t *)malloc(sizeof(match_t));
if (string == NULL) { /* Non-existent regex does not match anything.
match->position = -1; * Not to be confused with an empty regex.
match->width = 0; */
return match; if (regex == NULL) {
goto FINISH;
} }
const int initial_state = (int)(!is_start_of_string); // Find all matches
{
const char * s = string;
do {
int initial_state;
initial_state = (int)(!(is_start_of_string && (s == string)));
// XXX: this should be called in a loop, always restarting from the last char of the last match *match = (match_t){
if (regex_assert(regex, string, initial_state, match)) { .position = -1,
return match; .width = 0,
} else { };
return NULL;
if (regex_assert(regex, s, initial_state, match)) {
match->position = (s - string);
vector_push(&matches, match);
s += ((match->width > 0) ? match->width : 1);
match = (match_t *)malloc(sizeof(match_t));
} else {
++s;
}
} while (*s != '\0');
} }
FINISH:
// Insert sentinel
*match = (match_t){
.position = -1,
.width = -1,
};
vector_push(&matches, match);
// Hide internal vector usage
const size_t data_size = matches.element_size * matches.element_count;
match_t * r = (match_t *)malloc(data_size);
memcpy(r, matches.data, data_size);
vector_free(&matches);
return r;
} }
bool regex_search(const regex_t * const regex, bool regex_search(const regex_t * const regex,
const char * const string) { const char * const string) {
return (bool)regex_match(regex, string, true); match_t * m = regex_match(regex, string, true);
const bool r = (m->position != -1);
free(m);
return r;
} }