|
|
@@ -50,8 +50,13 @@ typedef struct { |
|
|
|
} offshoot_t; |
|
|
|
|
|
|
|
typedef struct { |
|
|
|
// XXX: |
|
|
|
// These should share a mask |
|
|
|
// Not even sure why they are pointers to begin with |
|
|
|
bool * do_catch; |
|
|
|
bool * is_negative; |
|
|
|
bool is_at_the_beginning; |
|
|
|
bool do_skip; |
|
|
|
// These might be obsolete, but I'm leaving them for now |
|
|
|
bool * do_loop_hook; |
|
|
|
bool * do_follow_hook; |
|
|
@@ -70,6 +75,46 @@ typedef struct { |
|
|
|
// ---------------------------------- |
|
|
|
// ### Regex creation/destruction ### |
|
|
|
// ---------------------------------- |
|
|
|
#define HALT_AND_CATCH_FIRE INT_MIN |
|
|
|
|
|
|
|
static void HOOK_ALL( int from, |
|
|
|
const char * const str, |
|
|
|
int to, |
|
|
|
compiler_state * cs) { |
|
|
|
|
|
|
|
int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to); |
|
|
|
|
|
|
|
|
|
|
|
for (const char * s = str; *s != '\0'; s++) { |
|
|
|
delta_t * delta = malloc(sizeof(delta_t)); |
|
|
|
delta->in = *cs->state + from; |
|
|
|
delta->input = *s; |
|
|
|
delta->to = hook_to; |
|
|
|
delta->width = *cs->width; |
|
|
|
vector_push(&cs->regex->delta_table, |
|
|
|
&delta); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static void ABSOLUTE_OFFSHOOT(int from, |
|
|
|
int to, |
|
|
|
int width, |
|
|
|
compiler_state * cs) { |
|
|
|
offshoot_t * offshoot = malloc(sizeof(offshoot_t)); |
|
|
|
offshoot->in = from; |
|
|
|
offshoot->to = to; |
|
|
|
offshoot->width = width; |
|
|
|
vector_push(&cs->regex->catch_table, |
|
|
|
&offshoot); |
|
|
|
} |
|
|
|
|
|
|
|
static void OFFSHOOT(int from, |
|
|
|
int to, |
|
|
|
int width, |
|
|
|
compiler_state * cs) { |
|
|
|
ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs); |
|
|
|
} |
|
|
|
|
|
|
|
static int escape_1_to_1(const char c, compiler_state * cs) { |
|
|
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; |
|
|
|
switch (c) { |
|
|
@@ -288,20 +333,35 @@ static int escape_to_negative(const char c, |
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
//static int compile_hologram(char * hologram, char * whitelist) { |
|
|
|
// if (hologram[0] == '\\') { |
|
|
|
// switch (hologram[1]) { |
|
|
|
// case '<': { |
|
|
|
// const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" |
|
|
|
// "ABCDEFGHIJKLMNOPQRSTUWXYZ" |
|
|
|
// "_"; |
|
|
|
// strcat(whitelist, very_word_chars); |
|
|
|
// is_negative = true; |
|
|
|
// HOOK_ALL(0, whitelist, 0) |
|
|
|
// } break; |
|
|
|
// } |
|
|
|
// } |
|
|
|
//} |
|
|
|
/* Handles the '<' and '>' escapes.
 * NOTE(review): these look like word-boundary atoms - confirm.
 *
 * '<': if cs->is_at_the_beginning is set, registers
 *      ABSOLUTE_OFFSHOOT(0, 2, 0) and sets cs->do_skip. It then marks the
 *      token negative, appends the word-character table to cs->blacklist,
 *      hooks every blacklist character from the current state to
 *      HALT_AND_CATCH_FIRE, and registers OFFSHOOT(0, 0, 1).
 *      Returns the length of the word-character table.
 * '>': marks the token negative and appends the word-character table to
 *      cs->blacklist. Returns 1.
 * Any other character: nothing is touched; returns 0.
 *
 * The only caller visible here combines the result with `||`, so only
 * zero versus nonzero matters to it.
 *
 * cs->blacklist must have room for the appended table; strcat does not
 * check bounds.
 */
static int escape_hologram(const char c, compiler_state * cs) {
    // ASCII letters plus underscore. The previous tables omitted 'v' and
    // 'V', so those two letters were never treated as word characters.
    static const char very_word_chars[] = "abcdefghijklmnopqrstuvwxyz"
                                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                          "_";

    switch (c) {
        case '<': {
            if (cs->is_at_the_beginning) {
                ABSOLUTE_OFFSHOOT(0, 2, 0, cs);
                cs->do_skip = true;
            }

            *cs->is_negative = true; // effectless currently; should be used to trigger the following lines in the main compile loop
            strcat(cs->blacklist, very_word_chars);
            HOOK_ALL(0, cs->blacklist, HALT_AND_CATCH_FIRE, cs);
            OFFSHOOT(0, 0, 1, cs);

            return sizeof(very_word_chars)-1;
        }
        case '>': {
            *cs->is_negative = true;
            strcat(cs->blacklist, very_word_chars);

            return 1;
        }
        default:
            break;
    }

    return 0;
}
|
|
|
|
|
|
|
static int compile_dot(compiler_state * cs) { |
|
|
|
*cs->do_catch = true; |
|
|
@@ -311,10 +371,10 @@ static int compile_dot(compiler_state * cs) { |
|
|
|
static int compile_escape(const char c, |
|
|
|
compiler_state * cs) { |
|
|
|
|
|
|
|
return escape_1_to_1(c, cs) |
|
|
|
|| escape_1_to_N(c, cs) |
|
|
|
return escape_1_to_1(c, cs) |
|
|
|
|| escape_1_to_N(c, cs) |
|
|
|
|| escape_to_negative(c, cs) |
|
|
|
//|| compile_hologram(*s, whitelist) |
|
|
|
|| escape_hologram(c, cs) |
|
|
|
; |
|
|
|
} |
|
|
|
|
|
|
@@ -322,8 +382,6 @@ static int compile_range(const char * const range, |
|
|
|
compiler_state * cs) { |
|
|
|
assert((range[0] == '[') && "Not a range."); |
|
|
|
|
|
|
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; |
|
|
|
|
|
|
|
const char * s; |
|
|
|
if (range[1] == '^') { |
|
|
|
*cs->is_negative = true; |
|
|
@@ -331,6 +389,9 @@ static int compile_range(const char * const range, |
|
|
|
} else { |
|
|
|
s = range + 1; |
|
|
|
} |
|
|
|
|
|
|
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; |
|
|
|
|
|
|
|
for (; *s != ']'; s++) { |
|
|
|
assert((*s != '\0') && "Unclosed range."); |
|
|
|
char c = *s; |
|
|
@@ -363,50 +424,11 @@ void filter_blacklist(const char * whitelist, |
|
|
|
} |
|
|
|
} |
|
|
|
strncat(filtered, blacklist, 1); |
|
|
|
long_continue:; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
#define HALT_AND_CATCH_FIRE INT_MIN |
|
|
|
|
|
|
|
void HOOK_ALL( int from, |
|
|
|
const char * const str, |
|
|
|
int to, |
|
|
|
compiler_state * cs) { |
|
|
|
|
|
|
|
int hook_to = (to == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : ((*cs->state) + to); |
|
|
|
|
|
|
|
|
|
|
|
for (const char * s = str; *s != '\0'; s++) { |
|
|
|
delta_t * delta = malloc(sizeof(delta_t)); |
|
|
|
delta->in = *cs->state + from; |
|
|
|
delta->input = *s; |
|
|
|
delta->to = hook_to; |
|
|
|
delta->width = *cs->width; |
|
|
|
vector_push(&cs->regex->delta_table, |
|
|
|
&delta); |
|
|
|
long_continue: |
|
|
|
; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
void ABSOLUTE_OFFSHOOT(int from, |
|
|
|
int to, |
|
|
|
int width, |
|
|
|
compiler_state * cs) { |
|
|
|
offshoot_t * offshoot = malloc(sizeof(offshoot_t)); |
|
|
|
offshoot->in = from; |
|
|
|
offshoot->to = to; |
|
|
|
offshoot->width = width; |
|
|
|
vector_push(&cs->regex->catch_table, |
|
|
|
&offshoot); |
|
|
|
} |
|
|
|
|
|
|
|
void OFFSHOOT(int from, |
|
|
|
int to, |
|
|
|
int width, |
|
|
|
compiler_state * cs) { |
|
|
|
ABSOLUTE_OFFSHOOT(*cs->state + from, *cs->state + to, width, cs); |
|
|
|
} |
|
|
|
|
|
|
|
regex_t * regex_compile(const char * const pattern) { |
|
|
|
regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); |
|
|
|
regex->str = strdup(pattern); |
|
|
@@ -415,6 +437,7 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
|
|
|
|
int state = 2; |
|
|
|
|
|
|
|
// XXX: these locals exist only so that compiler_state can point at them; clumsy |
|
|
|
bool do_catch; |
|
|
|
bool is_negative; |
|
|
|
bool do_loop_hook; |
|
|
@@ -426,13 +449,15 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
char blacklist[64]; |
|
|
|
|
|
|
|
compiler_state cs = { |
|
|
|
.do_catch = &do_catch, |
|
|
|
.is_negative = &is_negative, |
|
|
|
.state = &state, |
|
|
|
.width = &width, |
|
|
|
.whitelist = whitelist, |
|
|
|
.blacklist = blacklist, |
|
|
|
.regex = regex, |
|
|
|
.do_catch = &do_catch, |
|
|
|
.is_negative = &is_negative, |
|
|
|
.is_at_the_beginning = true, |
|
|
|
.do_skip = false, |
|
|
|
.state = &state, |
|
|
|
.width = &width, |
|
|
|
.whitelist = whitelist, |
|
|
|
.blacklist = blacklist, |
|
|
|
.regex = regex, |
|
|
|
}; |
|
|
|
|
|
|
|
for (const char * s = pattern; *s != '\00';) { |
|
|
@@ -442,16 +467,19 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
blacklist[0] = '\0'; |
|
|
|
do_catch = false; |
|
|
|
is_negative = false; |
|
|
|
cs.do_skip = false; |
|
|
|
/**/ |
|
|
|
do_loop_hook = false; |
|
|
|
do_follow_hook = false; |
|
|
|
do_loop_shoot = false; |
|
|
|
do_follow_shoot = false; |
|
|
|
/**/ |
|
|
|
width = 1; |
|
|
|
|
|
|
|
// Translate char |
|
|
|
switch (*s) { |
|
|
|
case '^': { |
|
|
|
if (s == pattern) { |
|
|
|
if (cs.is_at_the_beginning) { |
|
|
|
ABSOLUTE_OFFSHOOT(0, 2, 0, &cs); |
|
|
|
ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, &cs); |
|
|
|
} |
|
|
@@ -461,8 +489,7 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
if (s != pattern) { |
|
|
|
state += 1; |
|
|
|
} |
|
|
|
s += 1; |
|
|
|
goto long_continue; |
|
|
|
cs.do_skip = true; |
|
|
|
} break; |
|
|
|
case '.': { |
|
|
|
compile_dot(&cs); |
|
|
@@ -479,9 +506,13 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
whitelist[1] = '\0'; |
|
|
|
} break; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
s += 1; |
|
|
|
|
|
|
|
if (cs.do_skip) { |
|
|
|
goto long_continue; |
|
|
|
} |
|
|
|
|
|
|
|
// Compile with quantifier |
|
|
|
switch (*s) { |
|
|
|
case '=': |
|
|
@@ -530,7 +561,9 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
filter_blacklist(whitelist, blacklist, filtered_blacklist); |
|
|
|
HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs); |
|
|
|
} |
|
|
|
|
|
|
|
long_continue: |
|
|
|
cs.is_at_the_beginning = false; |
|
|
|
} |
|
|
|
|
|
|
|
regex->accepting_state = state; |
|
|
@@ -569,8 +602,14 @@ static int regex_assert(const regex_t * const regex, |
|
|
|
int state, |
|
|
|
int width) { // XXX: im pretty sure this is actually redundant and the width should be calculated from string - s |
|
|
|
for (const char * s = (string + string_offset); *s != '\00';) { |
|
|
|
// XXX: this should be a jump search for the instate and then a linear |
|
|
|
// delta |
|
|
|
//int left = 0; |
|
|
|
//int right = regex->delta_table.element_count - 1; |
|
|
|
//int i; |
|
|
|
//while(left <= right) } |
|
|
|
for (size_t i = 0; i < regex->delta_table.element_count; i++) { |
|
|
|
//i = (left + right) / 2; |
|
|
|
const delta_t * const delta = *(delta_t**)vector_get(®ex->delta_table, i); |
|
|
|
if ((delta->in == state) |
|
|
|
&& (delta->input == *s)) { |
|
|
@@ -588,6 +627,8 @@ static int regex_assert(const regex_t * const regex, |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
// XXX: the extra catch might not be necessary if we were to compile to a simpler form |
|
|
|
catch_(regex, &state); |
|
|
|
return (state == regex->accepting_state) ? width : false; |
|
|
|
} |
|
|
|
|
|
|
|