it works
This commit is contained in:
parent
c7af36dbb0
commit
9e8eb4f2e2
133
source/regex.c
133
source/regex.c
@ -1,12 +1,4 @@
|
|||||||
/* XXX:
|
#pragma GCC diagnostic ignored "-Wc++20-extensions"
|
||||||
* as it turns out returning a range of match objects is a
|
|
||||||
* high profile performance issue regarding regex, especially when highlighting.
|
|
||||||
* now as it stands we search an array of tokens for every position on a string.
|
|
||||||
* which sounds ok, until one realizes that searching from any position reveals a range,
|
|
||||||
* where (future) matches can or cannot be found. meaning we are computing the same thing
|
|
||||||
* repeatedly, practically resulting in a brute-forcing situation where instead of eliminating
|
|
||||||
* certain non-matches, we blindly hammer character by character.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "regex.h"
|
#include "regex.h"
|
||||||
|
|
||||||
@ -89,14 +81,14 @@ typedef struct {
|
|||||||
int in;
|
int in;
|
||||||
char input;
|
char input;
|
||||||
int to;
|
int to;
|
||||||
int width;
|
int pattern_width;
|
||||||
int match_width;
|
int match_width;
|
||||||
} delta_t;
|
} delta_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int in;
|
int in;
|
||||||
int to;
|
int to;
|
||||||
int width;
|
int pattern_width;
|
||||||
int match_width;
|
int match_width;
|
||||||
} offshoot_t;
|
} offshoot_t;
|
||||||
|
|
||||||
@ -133,26 +125,31 @@ void HOOK_ALL(const int from,
|
|||||||
regex_t * regex) {
|
regex_t * regex) {
|
||||||
for (const char * s = str; *s != '\0'; s++) {
|
for (const char * s = str; *s != '\0'; s++) {
|
||||||
delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
|
delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
|
||||||
delta->in = cs->state + from;
|
*delta = (delta_t){
|
||||||
delta->input = *s;
|
.in = cs->state + from,
|
||||||
delta->to = ASSERT_HALT(to);
|
.input = *s,
|
||||||
delta->width = cs->width;
|
.to = ASSERT_HALT(to),
|
||||||
|
.pattern_width = cs->width,
|
||||||
|
.match_width = 1,
|
||||||
|
};
|
||||||
vector_push(&regex->delta_table,
|
vector_push(&regex->delta_table,
|
||||||
&delta);
|
&delta);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
void ABSOLUTE_OFFSHOOT(const int from,
|
void ABSOLUTE_OFFSHOOT(const int from,
|
||||||
const int to,
|
const int to,
|
||||||
const int width,
|
const int width,
|
||||||
const int match_width,
|
const int match_width,
|
||||||
regex_t * regex) {
|
regex_t * regex) {
|
||||||
offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
|
offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
|
||||||
offshoot->in = from;
|
*offshoot = (offshoot_t){
|
||||||
offshoot->to = to;
|
.in = from,
|
||||||
offshoot->width = width;
|
.to = to,
|
||||||
offshoot->match_width = match_width;
|
.pattern_width = width,
|
||||||
|
.match_width = match_width,
|
||||||
|
};
|
||||||
vector_push(&regex->catch_table,
|
vector_push(&regex->catch_table,
|
||||||
&offshoot);
|
&offshoot);
|
||||||
}
|
}
|
||||||
@ -361,7 +358,7 @@ int escape_to_negative(const char c,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline
|
static inline
|
||||||
int compile_dot(compiler_state * cs) {
|
int compile_dot(compiler_state * const cs) {
|
||||||
cs->flags |= DO_CATCH;
|
cs->flags |= DO_CATCH;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@ -371,9 +368,9 @@ int compile_escape(const char c,
|
|||||||
compiler_state * const cs) {
|
compiler_state * const cs) {
|
||||||
|
|
||||||
return escape_1_to_1(c, cs)
|
return escape_1_to_1(c, cs)
|
||||||
|| escape_1_to_N(c, cs)
|
|| escape_1_to_N(c, cs)
|
||||||
|| escape_to_negative(c, cs)
|
|| escape_to_negative(c, cs)
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
@ -441,7 +438,6 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
compiler_state cs = {
|
compiler_state cs = {
|
||||||
.flags = IS_AT_THE_BEGINNING,
|
.flags = IS_AT_THE_BEGINNING,
|
||||||
.state = JEGER_INIT_STATE,
|
.state = JEGER_INIT_STATE,
|
||||||
.width = 0,
|
|
||||||
.whitelist = whitelist,
|
.whitelist = whitelist,
|
||||||
.blacklist = blacklist,
|
.blacklist = blacklist,
|
||||||
};
|
};
|
||||||
@ -451,7 +447,7 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
// Reset the compiler
|
// Reset the compiler
|
||||||
whitelist[0] = '\0';
|
whitelist[0] = '\0';
|
||||||
blacklist[0] = '\0';
|
blacklist[0] = '\0';
|
||||||
cs.flags &= IS_AT_THE_BEGINNING;
|
cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
|
||||||
cs.width = 1;
|
cs.width = 1;
|
||||||
|
|
||||||
// Translate char
|
// Translate char
|
||||||
@ -566,7 +562,7 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
++cs.state;
|
++cs.state;
|
||||||
}
|
}
|
||||||
|
|
||||||
cs.flags &= !(IS_AT_THE_BEGINNING);
|
cs.flags &= (~IS_AT_THE_BEGINNING);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Init state hookups
|
// Init state hookups
|
||||||
@ -648,10 +644,9 @@ bool regex_assert(const regex_t * const regex,
|
|||||||
if ((delta->in == state)
|
if ((delta->in == state)
|
||||||
&& (delta->input == *s)) {
|
&& (delta->input == *s)) {
|
||||||
was_found = true;
|
was_found = true;
|
||||||
const int r = regex_assert(regex, s + delta->width, delta->to, match);
|
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
|
||||||
if(r){
|
if(r){
|
||||||
if ((match->position != -1)
|
if (match->position == -1) {
|
||||||
&& (delta->match_width)) {
|
|
||||||
match->position = (s - string);
|
match->position = (s - string);
|
||||||
}
|
}
|
||||||
match->width += delta->match_width;
|
match->width += delta->match_width;
|
||||||
@ -664,9 +659,9 @@ bool regex_assert(const regex_t * const regex,
|
|||||||
PERFORM_CATCH_LOOKUP: {
|
PERFORM_CATCH_LOOKUP: {
|
||||||
if (!was_found) {
|
if (!was_found) {
|
||||||
const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
|
const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
|
||||||
if (my_catch && (!my_catch->width || !last_stand)) {
|
if (my_catch && (!my_catch->pattern_width || !last_stand)) {
|
||||||
state = my_catch->to;
|
state = my_catch->to;
|
||||||
s += my_catch->width;
|
s += my_catch->pattern_width;
|
||||||
match->width += my_catch->match_width;
|
match->width += my_catch->match_width;
|
||||||
goto LOOP;
|
goto LOOP;
|
||||||
}
|
}
|
||||||
@ -679,30 +674,68 @@ bool regex_assert(const regex_t * const regex,
|
|||||||
match_t * regex_match(const regex_t * const regex,
|
match_t * regex_match(const regex_t * const regex,
|
||||||
const char * const string,
|
const char * const string,
|
||||||
const bool is_start_of_string) {
|
const bool is_start_of_string) {
|
||||||
if (regex == NULL) {
|
|
||||||
return NULL;
|
vector_t matches;
|
||||||
}
|
vector_init(&matches, sizeof(match_t), 0);
|
||||||
|
|
||||||
match_t * match = (match_t *)malloc(sizeof(match_t));
|
match_t * match = (match_t *)malloc(sizeof(match_t));
|
||||||
|
|
||||||
if (string == NULL) {
|
/* Non-existent regex does not match anything.
|
||||||
match->position = -1;
|
* Not to be confused with an empty regex.
|
||||||
match->width = 0;
|
*/
|
||||||
return match;
|
if (regex == NULL) {
|
||||||
|
goto FINISH;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int initial_state = (int)(!is_start_of_string);
|
// Find all matches
|
||||||
|
{
|
||||||
|
const char * s = string;
|
||||||
|
do {
|
||||||
|
int initial_state;
|
||||||
|
initial_state = (int)(!(is_start_of_string && (s == string)));
|
||||||
|
|
||||||
// XXX: this should be called in a loop, always restarting from the last char of the last match
|
*match = (match_t){
|
||||||
if (regex_assert(regex, string, initial_state, match)) {
|
.position = -1,
|
||||||
return match;
|
.width = 0,
|
||||||
} else {
|
};
|
||||||
return NULL;
|
|
||||||
|
if (regex_assert(regex, s, initial_state, match)) {
|
||||||
|
match->position = (s - string);
|
||||||
|
|
||||||
|
vector_push(&matches, match);
|
||||||
|
|
||||||
|
s += ((match->width > 0) ? match->width : 1);
|
||||||
|
match = (match_t *)malloc(sizeof(match_t));
|
||||||
|
} else {
|
||||||
|
++s;
|
||||||
|
}
|
||||||
|
} while (*s != '\0');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
FINISH:
|
||||||
|
|
||||||
|
// Insert sentinel
|
||||||
|
*match = (match_t){
|
||||||
|
.position = -1,
|
||||||
|
.width = -1,
|
||||||
|
};
|
||||||
|
vector_push(&matches, match);
|
||||||
|
|
||||||
|
// Hide internal vector usage
|
||||||
|
const size_t data_size = matches.element_size * matches.element_count;
|
||||||
|
match_t * r = (match_t *)malloc(data_size);
|
||||||
|
memcpy(r, matches.data, data_size);
|
||||||
|
vector_free(&matches);
|
||||||
|
|
||||||
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool regex_search(const regex_t * const regex,
|
bool regex_search(const regex_t * const regex,
|
||||||
const char * const string) {
|
const char * const string) {
|
||||||
|
|
||||||
return (bool)regex_match(regex, string, true);
|
match_t * m = regex_match(regex, string, true);
|
||||||
|
const bool r = (m->position != -1);
|
||||||
|
free(m);
|
||||||
|
|
||||||
|
return r;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user