|
|
@@ -8,8 +8,13 @@ |
|
|
|
#include <string.h> |
|
|
|
#include <limits.h> |
|
|
|
#include <stdlib.h> |
|
|
|
#if DEBUG |
|
|
|
# include <stdio.h> |
|
|
|
#endif |
|
|
|
|
|
|
|
// Reserved state numbers.
// regex_match() computes its starting state as 0 when is_start_of_string
// holds and the scan is at the first character of the string, and as 1
// otherwise. regex_compile() adds the offshoots that lead from states 0 and 1
// to JEGER_INIT_STATE.
// Each macro is defined exactly once.
#define JEGER_SOS_STATE   0
#define JEGER_NSOS_STATE  1
#define JEGER_INIT_STATE  2
|
|
|
|
|
|
|
// ------------------ |
|
|
|
// ### Char tests ### |
|
|
@@ -40,6 +45,15 @@ bool is_magic(const char c) { |
|
|
|
; |
|
|
|
} |
|
|
|
|
|
|
|
// ------------------- |
|
|
|
// ### Match tests ### |
|
|
|
// ------------------- |
|
|
|
bool is_sentinel(const match_t * const match) { |
|
|
|
return (match->position == -1) |
|
|
|
&& (match->width == -1) |
|
|
|
; |
|
|
|
} |
|
|
|
|
|
|
|
// ----------------- |
|
|
|
// ### Char sets ### |
|
|
|
// ----------------- |
|
|
@@ -52,13 +66,13 @@ bool is_magic(const char c) { |
|
|
|
// Hexadecimal letters, lower and upper case.
#define JEGER_CHAR_SET_lower_hex "abcdef"
#define JEGER_CHAR_SET_upper_hex "ABCDEF"
// The 31 consecutive byte values 0241 through 0277 (octal), spelled as
// adjacent string literals that the compiler concatenates into one string.
// The continuation lines appear once only: anything after the final "\277"
// would fall outside the macro.
#define JEGER_CHAR_SET_oct_241_to_277 \
    "\241\242\243\244\245" \
    "\246\247\250\251\252" \
    "\253\254\255\256\257" \
    "\260\261\262\263\264" \
    "\265\266\267\270\271" \
    "\272\273\274\275\276" \
    "\277"
|
|
|
#define JEGER_CHAR_SET_oct_300_to_337 \ |
|
|
|
"\300\301\302\303\304" \ |
|
|
|
"\305\306\307\310\311" \ |
|
|
@@ -68,13 +82,13 @@ bool is_magic(const char c) { |
|
|
|
"\331\332\333\334\335" \ |
|
|
|
"\336\337" |
|
|
|
// Punctuation listed for the "file" character set, beyond letters and digits.
#define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
// Space, horizontal tab, vertical tab and newline.
// Defined exactly once: redefining a macro with a different token sequence
// is invalid, even when (as here) both spellings yield the same string.
#define JEGER_CHAR_SET_whitespace " " "\t\v\n"
|
|
|
|
|
|
|
static const char JEGER_CHAR_very_word_chars[] = |
|
|
|
JEGER_CHAR_SET_underscore |
|
|
|
JEGER_CHAR_SET_lower |
|
|
|
JEGER_CHAR_SET_upper |
|
|
|
; |
|
|
|
static const char JEGER_CHAR_symbol_chars[] = |
|
|
|
JEGER_CHAR_SET_underscore |
|
|
|
JEGER_CHAR_SET_lower |
|
|
|
JEGER_CHAR_SET_upper |
|
|
|
; |
|
|
|
|
|
|
|
// ---------------------- |
|
|
|
// ### Internal Types ### |
|
|
@@ -95,17 +109,19 @@ typedef struct { |
|
|
|
} offshoot_t; |
|
|
|
|
|
|
|
// Flag bits kept in compiler_state.flags. Every constant owns a distinct bit,
// so flags can be combined with | and tested with &.
// Each enumerator is declared once; INCREMENT_STATE sits on bit 5 so that it
// does not collide with DO_FORBID_START_OF_STRING on bit 4.
enum {
    DO_CATCH                  = 0x00000001 << 0,
    IS_NEGATIVE               = 0x00000001 << 1,
    // These two survive the per-character flag reset in regex_compile().
    IS_AT_THE_BEGINNING       = 0x00000001 << 2,
    // When set, the not-at-start state is hooked to HALT_AND_CATCH_FIRE
    // instead of JEGER_INIT_STATE.
    FORCE_START_OF_STRING     = 0x00000001 << 3,
    // When set, the start-of-string state is not hooked to JEGER_INIT_STATE.
    DO_FORBID_START_OF_STRING = 0x00000001 << 4,
    INCREMENT_STATE           = 0x00000001 << 5,
};
|
|
|
|
|
|
|
// Working state that regex_compile() passes to its helpers.
typedef struct {
    int flags;        // bitwise OR of the flag constants (DO_CATCH, IS_NEGATIVE, ...)
    int state;        // current state number; its final value becomes regex->accepting_state
    int width;        // copied into each new delta's pattern_width by HOOK_ALL
    int width2;       // copied into each new delta's match_width by HOOK_ALL
    char *whitelist;  // NOTE(review): presumably the accepted-character list being built; confirm
    char *blacklist;  // NOTE(review): presumably the rejected-character list being built; confirm
} compiler_state;
|
|
@@ -132,7 +148,7 @@ void HOOK_ALL(const int from, |
|
|
|
.input = *s, |
|
|
|
.to = ASSERT_HALT(to), |
|
|
|
.pattern_width = cs->width, |
|
|
|
.match_width = 1, |
|
|
|
.match_width = cs->width2, |
|
|
|
}; |
|
|
|
vector_push(&regex->delta_table, |
|
|
|
&delta); |
|
|
@@ -318,9 +334,9 @@ int escape_1_to_N(const char c, |
|
|
|
return sizeof(word_chars)-1; |
|
|
|
}; |
|
|
|
case 'h': { |
|
|
|
// #global JEGER_CHAR_very_word_chars |
|
|
|
strcpy(target_list, JEGER_CHAR_very_word_chars); |
|
|
|
return sizeof(JEGER_CHAR_very_word_chars)-1; |
|
|
|
// #global JEGER_CHAR_symbol_chars |
|
|
|
strcpy(target_list, JEGER_CHAR_symbol_chars); |
|
|
|
return sizeof(JEGER_CHAR_symbol_chars)-1; |
|
|
|
}; |
|
|
|
case 'a': { |
|
|
|
const char alpha_chars[] = JEGER_CHAR_SET_lower |
|
|
@@ -346,7 +362,7 @@ int escape_1_to_N(const char c, |
|
|
|
|
|
|
|
static inline |
|
|
|
int escape_to_negative(const char c, |
|
|
|
compiler_state * const cs) { |
|
|
|
compiler_state * const cs) { |
|
|
|
switch (c) { |
|
|
|
case 'D': { |
|
|
|
const char digit_chars[] = JEGER_CHAR_SET_digits; |
|
|
@@ -488,6 +504,7 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
blacklist[0] = '\0'; |
|
|
|
cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING); |
|
|
|
cs.width = 1; |
|
|
|
cs.width2 = 1; |
|
|
|
|
|
|
|
// Translate char |
|
|
|
switch (*s) { |
|
|
@@ -503,7 +520,7 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
if (compile_escape(*s, &cs)) { |
|
|
|
s += 1; |
|
|
|
} else if (is_hologram_escape(*s)) { |
|
|
|
; |
|
|
|
s -= 1; |
|
|
|
} else { |
|
|
|
assert("Unknown escape."); |
|
|
|
} |
|
|
@@ -518,6 +535,12 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
} break; |
|
|
|
} |
|
|
|
|
|
|
|
/* Ew */ |
|
|
|
if (*s == '\\' |
|
|
|
&& is_hologram_escape(*(s+1))) { |
|
|
|
++s; |
|
|
|
} |
|
|
|
|
|
|
|
// Compile char |
|
|
|
switch (*s) { |
|
|
|
// holograms |
|
|
@@ -533,18 +556,47 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
case '<': { |
|
|
|
cs.flags |= IS_NEGATIVE | INCREMENT_STATE; |
|
|
|
if (cs.flags & IS_AT_THE_BEGINNING) { |
|
|
|
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex); |
|
|
|
// XXX: make this legible |
|
|
|
if (cs.flags & IS_AT_THE_BEGINNING |
|
|
|
&& !(cs.flags & DO_CATCH) |
|
|
|
&& !(cs.flags & IS_NEGATIVE) |
|
|
|
&& whitelist[0] == '\0') { |
|
|
|
// --- |
|
|
|
cs.flags |= INCREMENT_STATE; |
|
|
|
cs.flags |= DO_FORBID_START_OF_STRING; |
|
|
|
strcat(whitelist, JEGER_CHAR_symbol_chars); |
|
|
|
// --- |
|
|
|
ABSOLUTE_OFFSHOOT( JEGER_SOS_STATE, JEGER_INIT_STATE+1, 0, 0, regex); |
|
|
|
ABSOLUTE_OFFSHOOT(JEGER_INIT_STATE, JEGER_INIT_STATE+2, 1, 0, regex); |
|
|
|
HOOK_ALL(0, whitelist, HALT_AND_CATCH_FIRE, &cs, regex); |
|
|
|
// --- |
|
|
|
++cs.state; |
|
|
|
cs.width = 0; |
|
|
|
cs.width2 = 0; |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs, regex); |
|
|
|
cs.width = 1; |
|
|
|
OFFSHOOT(0, +1, 1, 0, &cs, regex); |
|
|
|
// --- |
|
|
|
} else { |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs, regex); |
|
|
|
if ((cs.flags & DO_CATCH) |
|
|
|
|| (cs.flags & IS_NEGATIVE)) { |
|
|
|
OFFSHOOT(+1, +2, 1, 1, &cs, regex); |
|
|
|
} else { |
|
|
|
cs.flags |= INCREMENT_STATE; |
|
|
|
} |
|
|
|
OFFSHOOT(0, +1, 1, 0, &cs, regex); |
|
|
|
} |
|
|
|
strcat(blacklist, JEGER_CHAR_very_word_chars); |
|
|
|
OFFSHOOT(0, 0, 1, 0, &cs, regex); |
|
|
|
cs.flags |= IS_NEGATIVE; |
|
|
|
strcat(blacklist, JEGER_CHAR_symbol_chars); |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
case '>': { |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs, regex); |
|
|
|
cs.flags |= IS_NEGATIVE | INCREMENT_STATE; |
|
|
|
strcat(blacklist, JEGER_CHAR_very_word_chars); |
|
|
|
OFFSHOOT(0, 1, 0, 0, &cs, regex); |
|
|
|
strcat(blacklist, JEGER_CHAR_symbol_chars); |
|
|
|
OFFSHOOT(+1, +2, 0, 0, &cs, regex); |
|
|
|
++cs.state; |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
// quantifiers |
|
|
@@ -605,11 +657,13 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
} |
|
|
|
|
|
|
|
// Init state hookups |
|
|
|
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE, 0, 0, regex); |
|
|
|
if (!(cs.flags & DO_FORBID_START_OF_STRING)) { |
|
|
|
ABSOLUTE_OFFSHOOT(JEGER_SOS_STATE, JEGER_INIT_STATE, 0, 0, regex); |
|
|
|
} |
|
|
|
if (cs.flags & FORCE_START_OF_STRING) { |
|
|
|
ABSOLUTE_OFFSHOOT(1, HALT_AND_CATCH_FIRE, 0, 0, regex); |
|
|
|
ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, HALT_AND_CATCH_FIRE, 0, 0, regex); |
|
|
|
} else { |
|
|
|
ABSOLUTE_OFFSHOOT(1, JEGER_INIT_STATE, 0, 0, regex); |
|
|
|
ABSOLUTE_OFFSHOOT(JEGER_NSOS_STATE, JEGER_INIT_STATE, 0, 0, regex); |
|
|
|
} |
|
|
|
|
|
|
|
regex->accepting_state = cs.state; |
|
|
@@ -682,14 +736,18 @@ bool regex_assert(const regex_t * const regex, |
|
|
|
|
|
|
|
if ((delta->in == state) |
|
|
|
&& (delta->input == *s)) { |
|
|
|
bool do_reset = false; |
|
|
|
was_found = true; |
|
|
|
if (!match->_pos_ptr && delta->match_width) { |
|
|
|
match->_pos_ptr = s; |
|
|
|
do_reset = true; |
|
|
|
} |
|
|
|
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match); |
|
|
|
if(r){ |
|
|
|
if (match->position == -1) { |
|
|
|
match->position = (s - string); |
|
|
|
} |
|
|
|
match->width += delta->match_width; |
|
|
|
return r; |
|
|
|
} else if (do_reset) { |
|
|
|
match->_pos_ptr = NULL; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
@@ -729,17 +787,21 @@ match_t * regex_match(const regex_t * const regex, |
|
|
|
// Find all matches |
|
|
|
{ |
|
|
|
const char * s = string; |
|
|
|
int initial_state; |
|
|
|
do { |
|
|
|
int initial_state; |
|
|
|
initial_state = (int)(!(is_start_of_string && (s == string))); |
|
|
|
|
|
|
|
*match = (match_t){ |
|
|
|
.position = -1, |
|
|
|
.width = 0, |
|
|
|
._pos_ptr = NULL, |
|
|
|
.width = 0, |
|
|
|
}; |
|
|
|
|
|
|
|
if (regex_assert(regex, s, initial_state, match)) { |
|
|
|
match->position = (s - string); |
|
|
|
if (match->_pos_ptr) { |
|
|
|
match->position = (match->_pos_ptr - string); |
|
|
|
} else { |
|
|
|
match->position = (s - string); |
|
|
|
} |
|
|
|
|
|
|
|
vector_push(&matches, match); |
|
|
|
|
|
|
@@ -773,7 +835,7 @@ bool regex_search(const regex_t * const regex, |
|
|
|
const char * const string) { |
|
|
|
|
|
|
|
match_t * m = regex_match(regex, string, true); |
|
|
|
const bool r = (m->position != -1); |
|
|
|
const bool r = !is_sentinel(m); |
|
|
|
free(m); |
|
|
|
|
|
|
|
return r; |
|
|
|