adopted whitelist/blacklist logic

This commit is contained in:
anon 2023-08-28 15:42:42 +02:00
parent 6aa4c0035c
commit 291e115016

View File

@ -6,6 +6,7 @@
#include <assert.h> #include <assert.h>
#include <string.h> #include <string.h>
#include <limits.h>
// ------------------ // ------------------
// ### Char tests ### // ### Char tests ###
@ -54,6 +55,7 @@ typedef struct {
int * state; int * state;
int * width; int * width;
char * whitelist; char * whitelist;
char * blacklist;
regex_t * regex; regex_t * regex;
} compiler_state; } compiler_state;
@ -62,50 +64,52 @@ typedef struct {
// ---------------------------------- // ----------------------------------
// ### Regex creation/destruction ### // ### Regex creation/destruction ###
// ---------------------------------- // ----------------------------------
/* Translate a single-character escape into the one literal character it
 * stands for and append that character to the active character list.
 *
 * The active list is cs->blacklist when *cs->is_negative is set and
 * cs->whitelist otherwise.
 *
 * c  - the character that followed the backslash in the pattern
 * cs - compiler state supplying the two lists and the negation flag
 *
 * Returns 1 when c is a known one-to-one escape (the list was extended),
 * 0 when it is not (nothing is written).
 */
static int escape_1_to_1(const char c, compiler_state * cs) {
	char * const destination = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
	const char * literal;

	switch (c) {
		/* Control characters */
		case 't':  literal = "\t"; break;
		case 'n':  literal = "\n"; break;
		case 'r':  literal = "\r"; break;
		case 'b':  literal = "\b"; break;
		/* Characters that are special in the pattern syntax and need
		 * escaping to be taken literally */
		case '[':  literal = "[";  break;
		case ']':  literal = "]";  break;
		case '.':  literal = ".";  break;
		case '=':  literal = "=";  break;
		case '?':  literal = "?";  break;
		case '+':  literal = "+";  break;
		case '*':  literal = "*";  break;
		case '\\': literal = "\\"; break;
		default:
			return 0;
	}

	strcat(destination, literal);
	return 1;
}
static int escape_1_to_N(const char c, char * whitelist) { static int escape_1_to_N(const char c, compiler_state * cs) {
char * target_list = (*cs->is_negative) ? cs->blacklist : cs->whitelist;
switch(c) { switch(c) {
case 'i': { case 'i': {
const char identifier_chars[] = "@0123456789_" const char identifier_chars[] = "@0123456789_"
@ -116,7 +120,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, identifier_chars); strcpy(target_list, identifier_chars);
return sizeof(identifier_chars)-1; return sizeof(identifier_chars)-1;
}; };
case 'I': { case 'I': {
@ -128,7 +132,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, identifier_chars); strcpy(target_list, identifier_chars);
return sizeof(identifier_chars)-1; return sizeof(identifier_chars)-1;
}; };
case 'k': { case 'k': {
@ -140,7 +144,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, keyword_chars); strcpy(target_list, keyword_chars);
return sizeof(keyword_chars)-1; return sizeof(keyword_chars)-1;
}; };
case 'K': { case 'K': {
@ -152,17 +156,17 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, keyword_chars); strcpy(target_list, keyword_chars);
return sizeof(keyword_chars)-1; return sizeof(keyword_chars)-1;
}; };
case 'f': { case 'f': {
const char filename_chars[] = "@0123456789/.-_+,#$%~="; const char filename_chars[] = "@0123456789/.-_+,#$%~=";
strcpy(whitelist, filename_chars); strcpy(target_list, filename_chars);
return sizeof(filename_chars)-1; return sizeof(filename_chars)-1;
}; };
case 'F': { case 'F': {
const char filename_chars[] = "@/.-_+,#$%~="; const char filename_chars[] = "@/.-_+,#$%~=";
strcpy(whitelist, filename_chars); strcpy(target_list, filename_chars);
return sizeof(filename_chars)-1; return sizeof(filename_chars)-1;
}; };
case 'p': { case 'p': {
@ -181,7 +185,7 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, printable_chars); strcpy(target_list, printable_chars);
return sizeof(printable_chars)-1; return sizeof(printable_chars)-1;
}; };
case 'P': { case 'P': {
@ -200,29 +204,29 @@ static int escape_1_to_N(const char c, char * whitelist) {
"\324\325\326\327\330" "\324\325\326\327\330"
"\331\332\333\334\335" "\331\332\333\334\335"
"\336\337"; "\336\337";
strcpy(whitelist, printable_chars); strcpy(target_list, printable_chars);
return sizeof(printable_chars)-1; return sizeof(printable_chars)-1;
}; };
case 's': { case 's': {
const char whitespace_chars[] = " \t\v\n"; const char whitespace_chars[] = " \t\v\n";
strcpy(whitelist, whitespace_chars); strcpy(target_list, whitespace_chars);
return sizeof(whitespace_chars)-1; return sizeof(whitespace_chars)-1;
}; };
case 'd': { case 'd': {
const char digit_chars[] = "0123456789"; const char digit_chars[] = "0123456789";
strcpy(whitelist, digit_chars); strcpy(target_list, digit_chars);
return sizeof(digit_chars)-1; return sizeof(digit_chars)-1;
}; };
case 'x': { case 'x': {
const char hex_chars[] = "0123456789" const char hex_chars[] = "0123456789"
"abcdef" "abcdef"
"ABCDEF"; "ABCDEF";
strcpy(whitelist, hex_chars); strcpy(target_list, hex_chars);
return sizeof(hex_chars)-1; return sizeof(hex_chars)-1;
}; };
case 'o': { case 'o': {
const char oct_chars[] = "01234567"; const char oct_chars[] = "01234567";
strcpy(whitelist, oct_chars); strcpy(target_list, oct_chars);
return sizeof(oct_chars)-1; return sizeof(oct_chars)-1;
}; };
case 'w': { case 'w': {
@ -230,30 +234,30 @@ static int escape_1_to_N(const char c, char * whitelist) {
"abcdefghijklmnopqrstuwxyz" "abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ" "ABCDEFGHIJKLMNOPQRSTUWXYZ"
"_"; "_";
strcpy(whitelist, word_chars); strcpy(target_list, word_chars);
return sizeof(word_chars)-1; return sizeof(word_chars)-1;
}; };
case 'h': { case 'h': {
const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz" const char very_word_chars[] = "abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ" "ABCDEFGHIJKLMNOPQRSTUWXYZ"
"_"; "_";
strcpy(whitelist, very_word_chars); strcpy(target_list, very_word_chars);
return sizeof(very_word_chars)-1; return sizeof(very_word_chars)-1;
}; };
case 'a': { case 'a': {
const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz" const char alpha_chars[] = "abcdefghijklmnopqrstuwxyz"
"ABCDEFGHIJKLMNOPQRSTUWXYZ"; "ABCDEFGHIJKLMNOPQRSTUWXYZ";
strcpy(whitelist, alpha_chars); strcpy(target_list, alpha_chars);
return sizeof(alpha_chars)-1; return sizeof(alpha_chars)-1;
}; };
case 'l': { case 'l': {
const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz";
strcpy(whitelist, lower_alpha_chars); strcpy(target_list, lower_alpha_chars);
return sizeof(lower_alpha_chars)-1; return sizeof(lower_alpha_chars)-1;
}; };
case 'u': { case 'u': {
const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ";
strcpy(whitelist, upper_alpha_chars); strcpy(target_list, upper_alpha_chars);
return sizeof(upper_alpha_chars)-1; return sizeof(upper_alpha_chars)-1;
}; };
} }
@ -291,34 +295,47 @@ static int compile_range(const char * const range,
for (; *s != ']'; s++) { for (; *s != ']'; s++) {
assert((*s != '\0') && "Unclosed range."); assert((*s != '\0') && "Unclosed range.");
char c = *s; char c = *s;
if (escape_1_to_1(c, whitelist) if (c == '\\') {
|| escape_1_to_N(c, whitelist)) { s += 1;
; assert(compile_escape(*s, cs) && "Unknown escape.");
} else if (*(s+1) == '-') { } else if (*(s+1) == '-') {
char end = *(s+2); char end = *(s+2);
assert((c < end) && "Endless range."); assert((c < end) && "Endless range.");
for (char cc = c; cc < end+1; cc++) { for (char cc = c; cc < end+1; cc++) {
strncat(whitelist, &cc, 1); strncat(target_list, &cc, 1);
strncat(whitelist, "\0", 1); strncat(target_list, "\0", 1);
} }
s += 2; s += 2;
} else { } else {
strncat(whitelist, &c, 1); strncat(target_list, &c, 1);
strncat(whitelist, "\00", 1);
} }
} }
return ((s - range) + 1); return ((s - range) + 1);
} }
/* Append to `filtered` every character of `blacklist` that does not also
 * occur in `whitelist`, preserving the order of `blacklist`.
 *
 * whitelist - NUL-terminated set of accepted characters
 * blacklist - NUL-terminated set of rejected characters
 * filtered  - NUL-terminated output string; results are appended to whatever
 *             it already holds, so the caller must pass an initialised
 *             string with room for strlen(blacklist) more characters
 *
 * The previous implementation compared the blacklist against itself, so
 * every character was considered whitelisted and nothing was ever emitted.
 */
void filter_blacklist(const char * const whitelist,
                      const char * const blacklist,
                            char * const filtered) {
	size_t length = strlen(filtered);

	for (const char * black = blacklist; *black != '\0'; black++) {
		/* *black is never '\0' here, so strchr cannot match the terminator */
		if (strchr(whitelist, *black) == NULL) {
			filtered[length++] = *black;
		}
	}

	filtered[length] = '\0';
}
void HOOK_ALL(int from, #define HALT_AND_CATCH_FIRE INT_MIN
void HOOK_ALL( int from,
const char * const str, const char * const str,
int to, int to,
compiler_state * cs) { compiler_state * cs) {
int hook_to = (*cs->is_negative) ? HALT_AND_CATCH_FIRE : *cs->state + to; int hook_to = (to == HALT_AND_CATCH_FIRE) ? -1 : ((*cs->state) + to);
for (const char * s = str; *s != '\0'; s++) { for (const char * s = str; *s != '\0'; s++) {
@ -330,18 +347,17 @@ void HOOK_ALL(int from,
vector_push(&cs->regex->delta_table, vector_push(&cs->regex->delta_table,
&delta); &delta);
} }
if (*cs->do_catch || *cs->is_negative) {
offshoot_t * offshoot = malloc(sizeof(offshoot_t));
offshoot->in = *cs->state + from;
offshoot->to = hook_to;
vector_push(&cs->regex->catch_table,
&offshoot);
}
} }
#define EAT(n) do { \ void OFFSHOOT(int from,
s += n; \ int to,
} while (0) compiler_state * cs) {
offshoot_t * offshoot = malloc(sizeof(offshoot_t));
offshoot->in = *cs->state + from;
offshoot->to = *cs->state + to;
vector_push(&cs->regex->catch_table,
&offshoot);
}
regex_t * regex_compile(const char * const pattern) { regex_t * regex_compile(const char * const pattern) {
regex_t * regex = (regex_t *)malloc(sizeof(regex_t)); regex_t * regex = (regex_t *)malloc(sizeof(regex_t));
@ -355,6 +371,7 @@ regex_t * regex_compile(const char * const pattern) {
bool is_negative; bool is_negative;
int width; int width;
char whitelist[64]; char whitelist[64];
char blacklist[64];
compiler_state cs = { compiler_state cs = {
.do_catch = &do_catch, .do_catch = &do_catch,
@ -362,35 +379,30 @@ regex_t * regex_compile(const char * const pattern) {
.state = &state, .state = &state,
.width = &width, .width = &width,
.whitelist = whitelist, .whitelist = whitelist,
.blacklist = blacklist,
.regex = regex, .regex = regex,
}; };
for (const char * s = pattern; *s != '\00';) { for (const char * s = pattern; *s != '\00';) {
// Get token // Reset the compiler
assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); assert(!is_quantifier(*pattern) && "Pattern starts with quantifier.");
whitelist[0] = '\00'; whitelist[0] = '\00';
blacklist[0] = '\00';
do_catch = false; do_catch = false;
is_negative = false; is_negative = false;
width = 1; width = 1;
// Translate char
switch (*s) { switch (*s) {
case '.': { case '.': {
do_catch = true; compile_dot(&cs);
} break; } break;
case '\\': { case '\\': {
//if (compile_hologram(*s, whitelist)) { s += 1;
// break; assert(compile_escape(*s, &cs) && "Unknown escape.");
//}
EAT(1);
if(escape_1_to_1(*s, whitelist)
|| escape_1_to_N(*s, whitelist)){
;
} else {
assert(!"Unknown escape.");
}
} break; } break;
case '[': { case '[': {
EAT(compile_range(s, whitelist, &is_negative)-1); s += compile_range(s, &cs) - 1;
} break; } break;
default: { default: {
whitelist[0] = *s; whitelist[0] = *s;
@ -398,30 +410,55 @@ regex_t * regex_compile(const char * const pattern) {
} break; } break;
} }
EAT(1); s += 1;
// Get quantifier // Compile with quantifier
switch (*s) { switch (*s) {
case '=': case '=':
case '?': { case '?': {
HOOK_ALL(0, whitelist, +1, &cs); HOOK_ALL(0, whitelist, +1, &cs);
EAT(1); if (do_catch || is_negative) {
OFFSHOOT(0, +1, &cs);
}
s += 1;
} break; } break;
case '*': { case '*': {
HOOK_ALL(0, whitelist, 0, &cs); HOOK_ALL(0, whitelist, 0, &cs);
EAT(1); if (do_catch) {
OFFSHOOT(0, +1, &cs);
} else if (is_negative) {
OFFSHOOT(0, 0, &cs);
}
s += 1;
} break; } break;
case '+': { case '+': {
HOOK_ALL(0, whitelist, +1, &cs); HOOK_ALL(0, whitelist, +1, &cs);
if (do_catch || is_negative) {
OFFSHOOT(0, +1, &cs);
}
state += 1; state += 1;
HOOK_ALL(0, whitelist, 0, &cs); HOOK_ALL(0, whitelist, 0, &cs);
EAT(1); if (do_catch || is_negative) {
OFFSHOOT(0, 0, &cs);
}
s += 1;
} break; } break;
default: { // Literal default: { // Literal
HOOK_ALL(0, whitelist, +1, &cs); HOOK_ALL(0, whitelist, +1, &cs);
if (do_catch || is_negative) {
OFFSHOOT(0, +1, &cs);
}
state += 1; state += 1;
} break; } break;
} }
// Compile blacklist
if (*blacklist) {
char filtered_blacklist[64];
filtered_blacklist[0] = '\0';
filter_blacklist(whitelist, blacklist, filtered_blacklist);
HOOK_ALL(0, filtered_blacklist, HALT_AND_CATCH_FIRE, &cs);
}
} }
regex->accepting_state = state; regex->accepting_state = state;