|
|
@@ -6,6 +6,7 @@ |
|
|
|
|
|
|
|
#include <assert.h> |
|
|
|
#include <string.h> |
|
|
|
#include <limits.h> |
|
|
|
|
|
|
|
// ------------------ |
|
|
|
// ### Char tests ### |
|
|
@@ -54,6 +55,7 @@ typedef struct { |
|
|
|
int * state; |
|
|
|
int * width; |
|
|
|
char * whitelist; |
|
|
|
char * blacklist; |
|
|
|
regex_t * regex; |
|
|
|
} compiler_state; |
|
|
|
|
|
|
@@ -62,50 +64,52 @@ typedef struct { |
|
|
|
// ---------------------------------- |
|
|
|
// ### Regex creation/destruction ### |
|
|
|
// ---------------------------------- |
|
|
|
static int escape_1_to_1(const char c, char * whitelist) { |
|
|
|
static int escape_1_to_1(const char c, compiler_state * cs) { |
|
|
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; |
|
|
|
switch (c) { |
|
|
|
case 't': { |
|
|
|
strcat(whitelist, "\t"); |
|
|
|
strcat(target_list, "\t"); |
|
|
|
} return 1; |
|
|
|
case 'n': { |
|
|
|
strcat(whitelist, "\n"); |
|
|
|
strcat(target_list, "\n"); |
|
|
|
} return 1; |
|
|
|
case 'r': { |
|
|
|
strcat(whitelist, "\r"); |
|
|
|
strcat(target_list, "\r"); |
|
|
|
} return 1; |
|
|
|
case 'b': { |
|
|
|
strcat(whitelist, "\b"); |
|
|
|
strcat(target_list, "\b"); |
|
|
|
} return 1; |
|
|
|
case '[': { |
|
|
|
strcat(whitelist, "["); |
|
|
|
strcat(target_list, "["); |
|
|
|
} return 1; |
|
|
|
case ']': { |
|
|
|
strcat(whitelist, "]"); |
|
|
|
strcat(target_list, "]"); |
|
|
|
} return 1; |
|
|
|
case '.': { |
|
|
|
strcat(whitelist, "."); |
|
|
|
strcat(target_list, "."); |
|
|
|
} return 1; |
|
|
|
case '=': { |
|
|
|
strcat(whitelist, "="); |
|
|
|
strcat(target_list, "="); |
|
|
|
} return 1; |
|
|
|
case '?': { |
|
|
|
strcat(whitelist, "?"); |
|
|
|
strcat(target_list, "?"); |
|
|
|
} return 1; |
|
|
|
case '+': { |
|
|
|
strcat(whitelist, "+"); |
|
|
|
strcat(target_list, "+"); |
|
|
|
} return 1; |
|
|
|
case '*': { |
|
|
|
strcat(whitelist, "*"); |
|
|
|
strcat(target_list, "*"); |
|
|
|
} return 1; |
|
|
|
case '\\': { |
|
|
|
strcat(whitelist, "\\"); |
|
|
|
strcat(target_list, "\\"); |
|
|
|
} return 1; |
|
|
|
} |
|
|
|
|
|
|
|
return 0; |
|
|
|
} |
|
|
|
|
|
|
|
static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
static int escape_1_to_N(const char c, compiler_state * cs) { |
|
|
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist; |
|
|
|
switch(c) { |
|
|
|
case 'i': { |
|
|
|
const char identifier_chars[] = "@0123456789_" |
|
|
@@ -116,7 +120,7 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, identifier_chars); |
|
|
|
strcpy(target_list, identifier_chars); |
|
|
|
return sizeof(identifier_chars)-1; |
|
|
|
}; |
|
|
|
case 'I': { |
|
|
@@ -128,7 +132,7 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, identifier_chars); |
|
|
|
strcpy(target_list, identifier_chars); |
|
|
|
return sizeof(identifier_chars)-1; |
|
|
|
}; |
|
|
|
case 'k': { |
|
|
@@ -140,7 +144,7 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, keyword_chars); |
|
|
|
strcpy(target_list, keyword_chars); |
|
|
|
return sizeof(keyword_chars)-1; |
|
|
|
}; |
|
|
|
case 'K': { |
|
|
@@ -152,17 +156,17 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, keyword_chars); |
|
|
|
strcpy(target_list, keyword_chars); |
|
|
|
return sizeof(keyword_chars)-1; |
|
|
|
}; |
|
|
|
case 'f': { |
|
|
|
const char filename_chars[] = "@0123456789/.-_+,#$%~="; |
|
|
|
strcpy(whitelist, filename_chars); |
|
|
|
strcpy(target_list, filename_chars); |
|
|
|
return sizeof(filename_chars)-1; |
|
|
|
}; |
|
|
|
case 'F': { |
|
|
|
const char filename_chars[] = "@/.-_+,#$%~="; |
|
|
|
strcpy(whitelist, filename_chars); |
|
|
|
strcpy(target_list, filename_chars); |
|
|
|
return sizeof(filename_chars)-1; |
|
|
|
}; |
|
|
|
case 'p': { |
|
|
@@ -181,7 +185,7 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, printable_chars); |
|
|
|
strcpy(target_list, printable_chars); |
|
|
|
return sizeof(printable_chars)-1; |
|
|
|
}; |
|
|
|
case 'P': { |
|
|
@@ -200,29 +204,29 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"\324\325\326\327\330" |
|
|
|
"\331\332\333\334\335" |
|
|
|
"\336\337"; |
|
|
|
strcpy(whitelist, printable_chars); |
|
|
|
strcpy(target_list, printable_chars); |
|
|
|
return sizeof(printable_chars)-1; |
|
|
|
}; |
|
|
|
case 's': { |
|
|
|
const char whitespace_chars[] = " \t\v\n"; |
|
|
|
strcpy(whitelist, whitespace_chars); |
|
|
|
strcpy(target_list, whitespace_chars); |
|
|
|
return sizeof(whitespace_chars)-1; |
|
|
|
}; |
|
|
|
case 'd': { |
|
|
|
const char digit_chars[] = "0123456789"; |
|
|
|
strcpy(whitelist, digit_chars); |
|
|
|
strcpy(target_list, digit_chars); |
|
|
|
return sizeof(digit_chars)-1; |
|
|
|
}; |
|
|
|
case 'x': { |
|
|
|
const char hex_chars[] = "0123456789" |
|
|
|
"abcdef" |
|
|
|
"ABCDEF"; |
|
|
|
strcpy(whitelist, hex_chars); |
|
|
|
strcpy(target_list, hex_chars); |
|
|
|
return sizeof(hex_chars)-1; |
|
|
|
}; |
|
|
|
case 'o': { |
|
|
|
const char oct_chars[] = "01234567"; |
|
|
|
strcpy(whitelist, oct_chars); |
|
|
|
strcpy(target_list, oct_chars); |
|
|
|
return sizeof(oct_chars)-1; |
|
|
|
}; |
|
|
|
case 'w': { |
|
|
@@ -230,30 +234,30 @@ static int escape_1_to_N(const char c, char * whitelist) { |
|
|
|
"abcdefghijklmnopqrstuwxyz" |
|
|
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ" |
|
|
|
"_"; |
|
|
|
strcpy(whitelist, word_chars); |
|
|
|
strcpy(target_list, word_chars); |
|
|
|
return sizeof(word_chars)-1; |
|
|
|
}; |
|
|
|
case 'h': { |
|
|
|
const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" |
|
|
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ" |
|
|
|
"_"; |
|
|
|
strcpy(whitelist, very_word_chars); |
|
|
|
strcpy(target_list, very_word_chars); |
|
|
|
return sizeof(very_word_chars)-1; |
|
|
|
}; |
|
|
|
case 'a': { |
|
|
|
const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz" |
|
|
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ"; |
|
|
|
strcpy(whitelist, alpha_chars); |
|
|
|
strcpy(target_list, alpha_chars); |
|
|
|
return sizeof(alpha_chars)-1; |
|
|
|
}; |
|
|
|
case 'l': { |
|
|
|
const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; |
|
|
|
strcpy(whitelist, lower_alpha_chars); |
|
|
|
strcpy(target_list, lower_alpha_chars); |
|
|
|
return sizeof(lower_alpha_chars)-1; |
|
|
|
}; |
|
|
|
case 'u': { |
|
|
|
const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; |
|
|
|
strcpy(whitelist, upper_alpha_chars); |
|
|
|
strcpy(target_list, upper_alpha_chars); |
|
|
|
return sizeof(upper_alpha_chars)-1; |
|
|
|
}; |
|
|
|
} |
|
|
@@ -291,34 +295,47 @@ static int compile_range(const char * const range, |
|
|
|
for (; *s != ']'; s++) { |
|
|
|
assert((*s != '\0') && "Unclosed range."); |
|
|
|
char c = *s; |
|
|
|
if (escape_1_to_1(c, whitelist) |
|
|
|
|| escape_1_to_N(c, whitelist)) { |
|
|
|
; |
|
|
|
if (c == '\\') { |
|
|
|
s += 1; |
|
|
|
assert(compile_escape(*s, cs) && "Unknown escape."); |
|
|
|
} else if (*(s+1) == '-') { |
|
|
|
char end = *(s+2); |
|
|
|
assert((c < end) && "Endless range."); |
|
|
|
for (char cc = c; cc < end+1; cc++) { |
|
|
|
strncat(whitelist, &cc, 1); |
|
|
|
strncat(whitelist, "\0", 1); |
|
|
|
strncat(target_list, &cc, 1); |
|
|
|
strncat(target_list, "\0", 1); |
|
|
|
} |
|
|
|
s += 2; |
|
|
|
} else { |
|
|
|
strncat(whitelist, &c, 1); |
|
|
|
strncat(whitelist, "\00", 1); |
|
|
|
strncat(target_list, &c, 1); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return ((s - range) + 1); |
|
|
|
} |
|
|
|
|
|
|
|
#define HALT_AND_CATCH_FIRE -1 |
|
|
|
/* Append to `filtered` every character of `blacklist` that does not also
 * occur in `whitelist`, preserving blacklist order (duplicates included).
 *
 * `filtered` must already hold a NUL-terminated string (possibly empty) and
 * have room for strlen(blacklist) further characters plus the terminator;
 * no bounds checking is performed here.
 *
 * @param whitelist  NUL-terminated set of explicitly allowed characters
 * @param blacklist  NUL-terminated set of rejected characters
 * @param filtered   destination string that is appended to
 */
void filter_blacklist(const char * const whitelist,
                      const char * const blacklist,
                            char * const filtered) {
    for (const char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
        /* A whitelisted character wins over the blacklist, so skip it.
         * black_pointer never points at '\0' here, so strchr cannot match
         * the whitelist's terminator by accident. */
        if (strchr(whitelist, *black_pointer) != NULL) {
            continue;
        }
        strncat(filtered, black_pointer, 1);
    }
}
|
|
|
|
|
|
|
void HOOK_ALL(int from, |
|
|
|
const char * const str, |
|
|
|
int to, |
|
|
|
compiler_state * cs) { |
|
|
|
#define HALT_AND_CATCH_FIRE INT_MIN |
|
|
|
|
|
|
|
int hook_to = (*cs->is_negative) ? HALT_AND_CATCH_FIRE : *cs->state + to; |
|
|
|
void HOOK_ALL( int from, |
|
|
|
const char * const str, |
|
|
|
int to, |
|
|
|
compiler_state * cs) { |
|
|
|
|
|
|
|
int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to); |
|
|
|
|
|
|
|
|
|
|
|
for (const char * s = str; *s != '\0'; s++) { |
|
|
@@ -330,18 +347,17 @@ void HOOK_ALL(int from, |
|
|
|
vector_push(&cs->regex->delta_table, |
|
|
|
&delta); |
|
|
|
} |
|
|
|
if (*cs->do_catch || *cs->is_negative) { |
|
|
|
offshoot_t * offshoot = malloc(sizeof(offshoot_t)); |
|
|
|
offshoot->in = *cs->state + from; |
|
|
|
offshoot->to = hook_to; |
|
|
|
vector_push(&cs->regex->catch_table, |
|
|
|
&offshoot); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* Advance the cursor `s` by `n` characters. `s` is not a macro parameter:
 * it must be a pointer variable visible at the point of expansion. */
#define EAT(n) do { s += (n); } while (0)
|
|
|
void OFFSHOOT(int from, |
|
|
|
int to, |
|
|
|
compiler_state * cs) { |
|
|
|
offshoot_t * offshoot = malloc(sizeof(offshoot_t)); |
|
|
|
offshoot->in = *cs->state + from; |
|
|
|
offshoot->to = *cs->state + to; |
|
|
|
vector_push(&cs->regex->catch_table, |
|
|
|
&offshoot); |
|
|
|
} |
|
|
|
|
|
|
|
regex_t * regex_compile(const char * const pattern) { |
|
|
|
regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); |
|
|
@@ -355,6 +371,7 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
bool is_negative; |
|
|
|
int width; |
|
|
|
char whitelist[64]; |
|
|
|
char blacklist[64]; |
|
|
|
|
|
|
|
compiler_state cs = { |
|
|
|
.do_catch = &do_catch, |
|
|
@@ -362,35 +379,30 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
.state = &state, |
|
|
|
.width = &width, |
|
|
|
.whitelist = whitelist, |
|
|
|
.blacklist = blacklist, |
|
|
|
.regex = regex, |
|
|
|
}; |
|
|
|
|
|
|
|
for (const char * s = pattern; *s != '\00';) { |
|
|
|
// Get token |
|
|
|
// Reset the compiler |
|
|
|
assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); |
|
|
|
whitelist[0] = '\00'; |
|
|
|
blacklist[0] = '\00'; |
|
|
|
do_catch = false; |
|
|
|
is_negative = false; |
|
|
|
width = 1; |
|
|
|
|
|
|
|
// Translate char |
|
|
|
switch (*s) { |
|
|
|
case '.': { |
|
|
|
do_catch = true; |
|
|
|
compile_dot(&cs); |
|
|
|
} break; |
|
|
|
case '\\': { |
|
|
|
//if (compile_hologram(*s, whitelist)) { |
|
|
|
// break; |
|
|
|
//} |
|
|
|
EAT(1); |
|
|
|
if(escape_1_to_1(*s, whitelist) |
|
|
|
|| escape_1_to_N(*s, whitelist)){ |
|
|
|
; |
|
|
|
} else { |
|
|
|
assert(!"Unknown escape."); |
|
|
|
} |
|
|
|
s += 1; |
|
|
|
assert(compile_escape(*s, &cs) && "Unknown escape."); |
|
|
|
} break; |
|
|
|
case '[': { |
|
|
|
EAT(compile_range(s, whitelist, &is_negative)-1); |
|
|
|
s += compile_range(s, &cs) - 1; |
|
|
|
} break; |
|
|
|
default: { |
|
|
|
whitelist[0] = *s; |
|
|
@@ -398,30 +410,55 @@ regex_t * regex_compile(const char * const pattern) { |
|
|
|
} break; |
|
|
|
} |
|
|
|
|
|
|
|
EAT(1); |
|
|
|
s += 1; |
|
|
|
|
|
|
|
// Get quantifier |
|
|
|
// Compile with quantifier |
|
|
|
switch (*s) { |
|
|
|
case '=': |
|
|
|
case '?': { |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs); |
|
|
|
EAT(1); |
|
|
|
if (do_catch || is_negative) { |
|
|
|
OFFSHOOT(0, +1, &cs); |
|
|
|
} |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
case '*': { |
|
|
|
HOOK_ALL(0, whitelist, 0, &cs); |
|
|
|
EAT(1); |
|
|
|
if (do_catch) { |
|
|
|
OFFSHOOT(0, +1, &cs); |
|
|
|
} else if (is_negative) { |
|
|
|
OFFSHOOT(0, 0, &cs); |
|
|
|
} |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
case '+': { |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs); |
|
|
|
if (do_catch || is_negative) { |
|
|
|
OFFSHOOT(0, +1, &cs); |
|
|
|
} |
|
|
|
state += 1; |
|
|
|
HOOK_ALL(0, whitelist, 0, &cs); |
|
|
|
EAT(1); |
|
|
|
if (do_catch || is_negative) { |
|
|
|
OFFSHOOT(0, 0, &cs); |
|
|
|
} |
|
|
|
s += 1; |
|
|
|
} break; |
|
|
|
default: { // Literal |
|
|
|
HOOK_ALL(0, whitelist, +1, &cs); |
|
|
|
if (do_catch || is_negative) { |
|
|
|
OFFSHOOT(0, +1, &cs); |
|
|
|
} |
|
|
|
state += 1; |
|
|
|
} break; |
|
|
|
} |
|
|
|
|
|
|
|
// Compile blacklist |
|
|
|
if (*blacklist) { |
|
|
|
char filtered_blacklist[64]; |
|
|
|
filtered_blacklist[0] = '\0'; |
|
|
|
filter_blacklist(whitelist, blacklist, filtered_blacklist); |
|
|
|
HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
regex->accepting_state = state; |
|
|
|