adopted whitelist/blacklist logic
This commit is contained in:
parent
6aa4c0035c
commit
291e115016
175
source/regex.c
175
source/regex.c
@ -6,6 +6,7 @@
|
|||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <limits.h>
|
||||||
|
|
||||||
// ------------------
|
// ------------------
|
||||||
// ### Char tests ###
|
// ### Char tests ###
|
||||||
@ -54,6 +55,7 @@ typedef struct {
|
|||||||
int * state;
|
int * state;
|
||||||
int * width;
|
int * width;
|
||||||
char * whitelist;
|
char * whitelist;
|
||||||
|
char * blacklist;
|
||||||
regex_t * regex;
|
regex_t * regex;
|
||||||
} compiler_state;
|
} compiler_state;
|
||||||
|
|
||||||
@ -62,50 +64,52 @@ typedef struct {
|
|||||||
// ----------------------------------
|
// ----------------------------------
|
||||||
// ### Regex creation/destruction ###
|
// ### Regex creation/destruction ###
|
||||||
// ----------------------------------
|
// ----------------------------------
|
||||||
static int escape_1_to_1(const char c, char * whitelist) {
|
static int escape_1_to_1(const char c, compiler_state * cs) {
|
||||||
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
|
||||||
switch (c) {
|
switch (c) {
|
||||||
case 't': {
|
case 't': {
|
||||||
strcat(whitelist, "\t");
|
strcat(target_list, "\t");
|
||||||
} return 1;
|
} return 1;
|
||||||
case 'n': {
|
case 'n': {
|
||||||
strcat(whitelist, "\n");
|
strcat(target_list, "\n");
|
||||||
} return 1;
|
} return 1;
|
||||||
case 'r': {
|
case 'r': {
|
||||||
strcat(whitelist, "\r");
|
strcat(target_list, "\r");
|
||||||
} return 1;
|
} return 1;
|
||||||
case 'b': {
|
case 'b': {
|
||||||
strcat(whitelist, "\b");
|
strcat(target_list, "\b");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '[': {
|
case '[': {
|
||||||
strcat(whitelist, "[");
|
strcat(target_list, "[");
|
||||||
} return 1;
|
} return 1;
|
||||||
case ']': {
|
case ']': {
|
||||||
strcat(whitelist, "]");
|
strcat(target_list, "]");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '.': {
|
case '.': {
|
||||||
strcat(whitelist, ".");
|
strcat(target_list, ".");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '=': {
|
case '=': {
|
||||||
strcat(whitelist, "=");
|
strcat(target_list, "=");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '?': {
|
case '?': {
|
||||||
strcat(whitelist, "?");
|
strcat(target_list, "?");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '+': {
|
case '+': {
|
||||||
strcat(whitelist, "+");
|
strcat(target_list, "+");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '*': {
|
case '*': {
|
||||||
strcat(whitelist, "*");
|
strcat(target_list, "*");
|
||||||
} return 1;
|
} return 1;
|
||||||
case '\\': {
|
case '\\': {
|
||||||
strcat(whitelist, "\\");
|
strcat(target_list, "\\");
|
||||||
} return 1;
|
} return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int escape_1_to_N(const char c, char * whitelist) {
|
static int escape_1_to_N(const char c, compiler_state * cs) {
|
||||||
|
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
|
||||||
switch(c) {
|
switch(c) {
|
||||||
case 'i': {
|
case 'i': {
|
||||||
const char identifier_chars[] = "@0123456789_"
|
const char identifier_chars[] = "@0123456789_"
|
||||||
@ -116,7 +120,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, identifier_chars);
|
strcpy(target_list, identifier_chars);
|
||||||
return sizeof(identifier_chars)-1;
|
return sizeof(identifier_chars)-1;
|
||||||
};
|
};
|
||||||
case 'I': {
|
case 'I': {
|
||||||
@ -128,7 +132,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, identifier_chars);
|
strcpy(target_list, identifier_chars);
|
||||||
return sizeof(identifier_chars)-1;
|
return sizeof(identifier_chars)-1;
|
||||||
};
|
};
|
||||||
case 'k': {
|
case 'k': {
|
||||||
@ -140,7 +144,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, keyword_chars);
|
strcpy(target_list, keyword_chars);
|
||||||
return sizeof(keyword_chars)-1;
|
return sizeof(keyword_chars)-1;
|
||||||
};
|
};
|
||||||
case 'K': {
|
case 'K': {
|
||||||
@ -152,17 +156,17 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, keyword_chars);
|
strcpy(target_list, keyword_chars);
|
||||||
return sizeof(keyword_chars)-1;
|
return sizeof(keyword_chars)-1;
|
||||||
};
|
};
|
||||||
case 'f': {
|
case 'f': {
|
||||||
const char filename_chars[] = "@0123456789/.-_+,#$%~=";
|
const char filename_chars[] = "@0123456789/.-_+,#$%~=";
|
||||||
strcpy(whitelist, filename_chars);
|
strcpy(target_list, filename_chars);
|
||||||
return sizeof(filename_chars)-1;
|
return sizeof(filename_chars)-1;
|
||||||
};
|
};
|
||||||
case 'F': {
|
case 'F': {
|
||||||
const char filename_chars[] = "@/.-_+,#$%~=";
|
const char filename_chars[] = "@/.-_+,#$%~=";
|
||||||
strcpy(whitelist, filename_chars);
|
strcpy(target_list, filename_chars);
|
||||||
return sizeof(filename_chars)-1;
|
return sizeof(filename_chars)-1;
|
||||||
};
|
};
|
||||||
case 'p': {
|
case 'p': {
|
||||||
@ -181,7 +185,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, printable_chars);
|
strcpy(target_list, printable_chars);
|
||||||
return sizeof(printable_chars)-1;
|
return sizeof(printable_chars)-1;
|
||||||
};
|
};
|
||||||
case 'P': {
|
case 'P': {
|
||||||
@ -200,29 +204,29 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"\324\325\326\327\330"
|
"\324\325\326\327\330"
|
||||||
"\331\332\333\334\335"
|
"\331\332\333\334\335"
|
||||||
"\336\337";
|
"\336\337";
|
||||||
strcpy(whitelist, printable_chars);
|
strcpy(target_list, printable_chars);
|
||||||
return sizeof(printable_chars)-1;
|
return sizeof(printable_chars)-1;
|
||||||
};
|
};
|
||||||
case 's': {
|
case 's': {
|
||||||
const char whitespace_chars[] = " \t\v\n";
|
const char whitespace_chars[] = " \t\v\n";
|
||||||
strcpy(whitelist, whitespace_chars);
|
strcpy(target_list, whitespace_chars);
|
||||||
return sizeof(whitespace_chars)-1;
|
return sizeof(whitespace_chars)-1;
|
||||||
};
|
};
|
||||||
case 'd': {
|
case 'd': {
|
||||||
const char digit_chars[] = "0123456789";
|
const char digit_chars[] = "0123456789";
|
||||||
strcpy(whitelist, digit_chars);
|
strcpy(target_list, digit_chars);
|
||||||
return sizeof(digit_chars)-1;
|
return sizeof(digit_chars)-1;
|
||||||
};
|
};
|
||||||
case 'x': {
|
case 'x': {
|
||||||
const char hex_chars[] = "0123456789"
|
const char hex_chars[] = "0123456789"
|
||||||
"abcdef"
|
"abcdef"
|
||||||
"ABCDEF";
|
"ABCDEF";
|
||||||
strcpy(whitelist, hex_chars);
|
strcpy(target_list, hex_chars);
|
||||||
return sizeof(hex_chars)-1;
|
return sizeof(hex_chars)-1;
|
||||||
};
|
};
|
||||||
case 'o': {
|
case 'o': {
|
||||||
const char oct_chars[] = "01234567";
|
const char oct_chars[] = "01234567";
|
||||||
strcpy(whitelist, oct_chars);
|
strcpy(target_list, oct_chars);
|
||||||
return sizeof(oct_chars)-1;
|
return sizeof(oct_chars)-1;
|
||||||
};
|
};
|
||||||
case 'w': {
|
case 'w': {
|
||||||
@ -230,30 +234,30 @@ static int escape_1_to_N(const char c, char * whitelist) {
|
|||||||
"abcdefghijklmnopqrstuwxyz"
|
"abcdefghijklmnopqrstuwxyz"
|
||||||
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
|
||||||
"_";
|
"_";
|
||||||
strcpy(whitelist, word_chars);
|
strcpy(target_list, word_chars);
|
||||||
return sizeof(word_chars)-1;
|
return sizeof(word_chars)-1;
|
||||||
};
|
};
|
||||||
case 'h': {
|
case 'h': {
|
||||||
const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
|
const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
|
||||||
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ"
|
||||||
"_";
|
"_";
|
||||||
strcpy(whitelist, very_word_chars);
|
strcpy(target_list, very_word_chars);
|
||||||
return sizeof(very_word_chars)-1;
|
return sizeof(very_word_chars)-1;
|
||||||
};
|
};
|
||||||
case 'a': {
|
case 'a': {
|
||||||
const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
|
const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
|
||||||
"ABCDEFGHIJKLMNOPQRSTUWXYZ";
|
"ABCDEFGHIJKLMNOPQRSTUWXYZ";
|
||||||
strcpy(whitelist, alpha_chars);
|
strcpy(target_list, alpha_chars);
|
||||||
return sizeof(alpha_chars)-1;
|
return sizeof(alpha_chars)-1;
|
||||||
};
|
};
|
||||||
case 'l': {
|
case 'l': {
|
||||||
const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
|
const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
|
||||||
strcpy(whitelist, lower_alpha_chars);
|
strcpy(target_list, lower_alpha_chars);
|
||||||
return sizeof(lower_alpha_chars)-1;
|
return sizeof(lower_alpha_chars)-1;
|
||||||
};
|
};
|
||||||
case 'u': {
|
case 'u': {
|
||||||
const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
|
const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
|
||||||
strcpy(whitelist, upper_alpha_chars);
|
strcpy(target_list, upper_alpha_chars);
|
||||||
return sizeof(upper_alpha_chars)-1;
|
return sizeof(upper_alpha_chars)-1;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -291,34 +295,47 @@ static int compile_range(const char * const range,
|
|||||||
for (; *s != ']'; s++) {
|
for (; *s != ']'; s++) {
|
||||||
assert((*s != '\0') && "Unclosed range.");
|
assert((*s != '\0') && "Unclosed range.");
|
||||||
char c = *s;
|
char c = *s;
|
||||||
if (escape_1_to_1(c, whitelist)
|
if (c == '\\') {
|
||||||
|| escape_1_to_N(c, whitelist)) {
|
s += 1;
|
||||||
;
|
assert(compile_escape(*s, cs) && "Unknown escape.");
|
||||||
} else if (*(s+1) == '-') {
|
} else if (*(s+1) == '-') {
|
||||||
char end = *(s+2);
|
char end = *(s+2);
|
||||||
assert((c < end) && "Endless range.");
|
assert((c < end) && "Endless range.");
|
||||||
for (char cc = c; cc < end+1; cc++) {
|
for (char cc = c; cc < end+1; cc++) {
|
||||||
strncat(whitelist, &cc, 1);
|
strncat(target_list, &cc, 1);
|
||||||
strncat(whitelist, "\0", 1);
|
strncat(target_list, "\0", 1);
|
||||||
}
|
}
|
||||||
s += 2;
|
s += 2;
|
||||||
} else {
|
} else {
|
||||||
strncat(whitelist, &c, 1);
|
strncat(target_list, &c, 1);
|
||||||
strncat(whitelist, "\00", 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ((s - range) + 1);
|
return ((s - range) + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HALT_AND_CATCH_FIRE -1
|
/* Append to `filtered` every character of `blacklist` that does not also
 * occur in `whitelist`.
 *
 * whitelist  NUL-terminated set of explicitly accepted characters.
 * blacklist  NUL-terminated set of rejected characters.
 * filtered   NUL-terminated output string; surviving characters are
 *            appended after its current contents, in blacklist order.
 *            The caller must provide room for the existing contents plus
 *            strlen(blacklist) + 1 bytes (no size is passed in).
 *
 * A character present in both lists is dropped from the output, so the
 * whitelist entry takes precedence over the blacklist entry.
 */
void filter_blacklist(const char * const whitelist,
                      const char * const blacklist,
                            char * const filtered) {
    /* Keep the append semantics of the strncat() based version. */
    char * out = filtered + strlen(filtered);

    for (const char * black_pointer = blacklist; *black_pointer != '\0'; black_pointer++) {
        /* The lookup must scan the whitelist; scanning the blacklist makes
         * every character match itself and leaves the result always empty. */
        if (strchr(whitelist, *black_pointer) == NULL) {
            *out++ = *black_pointer;
        }
    }

    *out = '\0';
}
|
||||||
|
|
||||||
void HOOK_ALL(int from,
|
#define HALT_AND_CATCH_FIRE INT_MIN
|
||||||
|
|
||||||
|
void HOOK_ALL( int from,
|
||||||
const char * const str,
|
const char * const str,
|
||||||
int to,
|
int to,
|
||||||
compiler_state * cs) {
|
compiler_state * cs) {
|
||||||
|
|
||||||
int hook_to = (*cs->is_negative) ? HALT_AND_CATCH_FIRE : *cs->state + to;
|
int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to);
|
||||||
|
|
||||||
|
|
||||||
for (const char * s = str; *s != '\0'; s++) {
|
for (const char * s = str; *s != '\0'; s++) {
|
||||||
@ -330,18 +347,17 @@ void HOOK_ALL(int from,
|
|||||||
vector_push(&cs->regex->delta_table,
|
vector_push(&cs->regex->delta_table,
|
||||||
&delta);
|
&delta);
|
||||||
}
|
}
|
||||||
if (*cs->do_catch || *cs->is_negative) {
|
|
||||||
offshoot_t * offshoot = malloc(sizeof(offshoot_t));
|
|
||||||
offshoot->in = *cs->state + from;
|
|
||||||
offshoot->to = hook_to;
|
|
||||||
vector_push(&cs->regex->catch_table,
|
|
||||||
&offshoot);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#define EAT(n) do { \
|
void OFFSHOOT(int from,
|
||||||
s += n; \
|
int to,
|
||||||
} while (0)
|
compiler_state * cs) {
|
||||||
|
offshoot_t * offshoot = malloc(sizeof(offshoot_t));
|
||||||
|
offshoot->in = *cs->state + from;
|
||||||
|
offshoot->to = *cs->state + to;
|
||||||
|
vector_push(&cs->regex->catch_table,
|
||||||
|
&offshoot);
|
||||||
|
}
|
||||||
|
|
||||||
regex_t * regex_compile(const char * const pattern) {
|
regex_t * regex_compile(const char * const pattern) {
|
||||||
regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
|
regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
|
||||||
@ -355,6 +371,7 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
bool is_negative;
|
bool is_negative;
|
||||||
int width;
|
int width;
|
||||||
char whitelist[64];
|
char whitelist[64];
|
||||||
|
char blacklist[64];
|
||||||
|
|
||||||
compiler_state cs = {
|
compiler_state cs = {
|
||||||
.do_catch = &do_catch,
|
.do_catch = &do_catch,
|
||||||
@ -362,35 +379,30 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
.state = &state,
|
.state = &state,
|
||||||
.width = &width,
|
.width = &width,
|
||||||
.whitelist = whitelist,
|
.whitelist = whitelist,
|
||||||
|
.blacklist = blacklist,
|
||||||
.regex = regex,
|
.regex = regex,
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const char * s = pattern; *s != '\00';) {
|
for (const char * s = pattern; *s != '\00';) {
|
||||||
// Get token
|
// Reset the compiler
|
||||||
assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
|
assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
|
||||||
whitelist[0] = '\00';
|
whitelist[0] = '\00';
|
||||||
|
blacklist[0] = '\00';
|
||||||
do_catch = false;
|
do_catch = false;
|
||||||
is_negative = false;
|
is_negative = false;
|
||||||
width = 1;
|
width = 1;
|
||||||
|
|
||||||
|
// Translate char
|
||||||
switch (*s) {
|
switch (*s) {
|
||||||
case '.': {
|
case '.': {
|
||||||
do_catch = true;
|
compile_dot(&cs);
|
||||||
} break;
|
} break;
|
||||||
case '\\': {
|
case '\\': {
|
||||||
//if (compile_hologram(*s, whitelist)) {
|
s += 1;
|
||||||
// break;
|
assert(compile_escape(*s, &cs) && "Unknown escape.");
|
||||||
//}
|
|
||||||
EAT(1);
|
|
||||||
if(escape_1_to_1(*s, whitelist)
|
|
||||||
|| escape_1_to_N(*s, whitelist)){
|
|
||||||
;
|
|
||||||
} else {
|
|
||||||
assert(!"Unknown escape.");
|
|
||||||
}
|
|
||||||
} break;
|
} break;
|
||||||
case '[': {
|
case '[': {
|
||||||
EAT(compile_range(s, whitelist, &is_negative)-1);
|
s += compile_range(s, &cs) - 1;
|
||||||
} break;
|
} break;
|
||||||
default: {
|
default: {
|
||||||
whitelist[0] = *s;
|
whitelist[0] = *s;
|
||||||
@ -398,30 +410,55 @@ regex_t * regex_compile(const char * const pattern) {
|
|||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
|
||||||
EAT(1);
|
s += 1;
|
||||||
|
|
||||||
// Get quantifier
|
// Compile with quantifier
|
||||||
switch (*s) {
|
switch (*s) {
|
||||||
case '=':
|
case '=':
|
||||||
case '?': {
|
case '?': {
|
||||||
HOOK_ALL(0, whitelist, +1, &cs);
|
HOOK_ALL(0, whitelist, +1, &cs);
|
||||||
EAT(1);
|
if (do_catch || is_negative) {
|
||||||
|
OFFSHOOT(0, +1, &cs);
|
||||||
|
}
|
||||||
|
s += 1;
|
||||||
} break;
|
} break;
|
||||||
case '*': {
|
case '*': {
|
||||||
HOOK_ALL(0, whitelist, 0, &cs);
|
HOOK_ALL(0, whitelist, 0, &cs);
|
||||||
EAT(1);
|
if (do_catch) {
|
||||||
|
OFFSHOOT(0, +1, &cs);
|
||||||
|
} else if (is_negative) {
|
||||||
|
OFFSHOOT(0, 0, &cs);
|
||||||
|
}
|
||||||
|
s += 1;
|
||||||
} break;
|
} break;
|
||||||
case '+': {
|
case '+': {
|
||||||
HOOK_ALL(0, whitelist, +1, &cs);
|
HOOK_ALL(0, whitelist, +1, &cs);
|
||||||
|
if (do_catch || is_negative) {
|
||||||
|
OFFSHOOT(0, +1, &cs);
|
||||||
|
}
|
||||||
state += 1;
|
state += 1;
|
||||||
HOOK_ALL(0, whitelist, 0, &cs);
|
HOOK_ALL(0, whitelist, 0, &cs);
|
||||||
EAT(1);
|
if (do_catch || is_negative) {
|
||||||
|
OFFSHOOT(0, 0, &cs);
|
||||||
|
}
|
||||||
|
s += 1;
|
||||||
} break;
|
} break;
|
||||||
default: { // Literal
|
default: { // Literal
|
||||||
HOOK_ALL(0, whitelist, +1, &cs);
|
HOOK_ALL(0, whitelist, +1, &cs);
|
||||||
|
if (do_catch || is_negative) {
|
||||||
|
OFFSHOOT(0, +1, &cs);
|
||||||
|
}
|
||||||
state += 1;
|
state += 1;
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compile blacklist
|
||||||
|
if (*blacklist) {
|
||||||
|
char filtered_blacklist[64];
|
||||||
|
filtered_blacklist[0] = '\0';
|
||||||
|
filter_blacklist(whitelist, blacklist, filtered_blacklist);
|
||||||
|
HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
regex->accepting_state = state;
|
regex->accepting_state = state;
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user