From 3073434501f65a2fc543a631735177bd35d17053 Mon Sep 17 00:00:00 2001 From: anon Date: Thu, 24 Aug 2023 01:59:59 +0200 Subject: [PATCH] symbolic regex integration --- source/regex.c | 477 ++++++++++++++++++++++++++++++++++-------------------- source/regex.h | 4 +- source/regex2.h | 329 ------------------------------------- source/regex2.hpp | 331 ------------------------------------- 4 files changed, 305 insertions(+), 836 deletions(-) delete mode 100644 source/regex2.h delete mode 100644 source/regex2.hpp diff --git a/source/regex.c b/source/regex.c index 3dcb16a..98741aa 100644 --- a/source/regex.c +++ b/source/regex.c @@ -1,211 +1,338 @@ -#include "regex.h" +#include +#include +#include "vector.h" -bool is_case_on = true; +typedef struct { + int in; + char input; + int to; +} delta_t; -static bool is_next_valid(const char * const s) { - return *(s + 1); -} +typedef struct { + int in; + int to; +} offshoot_t; -static bool char_in_range(const char start, - const char end, - const char character) { - if (start > end){ - return false; - } +typedef struct { + char * str; + vector_t delta_table; // + vector_t catch_table; // + int accepting_state; +} regex_t; - for (char c = start; c != end; c++) { - if (character == c) { +#define HALT_AND_CATCH_FIRE -1 + +#define HOOK_ALL(from, str, to) do { \ + for (char * s = str; *s != '\00'; s++) { \ + vector_push(regex->delta_table \ + (delta_t *){state + from, *s, state + to} \ + ); \ + } \ + if (do_catch) { \ + vector_push(regex->catch_table \ + (offshoot_t *){state + from, state + to} \ + ); \ + } \ +} while (0) + +#define EAT(n) do { \ + s += n; \ +} while (0) + +bool is_quantifier(const char c){ + for (const char * s = "+*?"; *s != '\00'; s++) { + if (*s == c) { return true; } } - return false; } -static bool is_word_separator(const char character) { - return (( isascii(character)) - && (!isalnum(character)) - && ( character != '_')); + +int escape_1_to_1(const char c, char * whitelist) { + switch(c) { + case 't': { + strcat(whitelist, "\t"); + } return 1; + case 'n': { + strcat(whitelist, "\n"); + } return 1; + case 'r': { + strcat(whitelist, "\r"); + } return 1; + case 'b': { + strcat(whitelist, "\b"); + } return 1; + case '[': { + strcat(whitelist, "["); + } return 1; + case ']': { + strcat(whitelist, "]"); + } return 1; + case '.': { + strcat(whitelist, "."); + } return 1; + case '?': { + strcat(whitelist, "?"); + } return 1; + case '+': { + strcat(whitelist, "+"); + } return 1; + case '*': { + strcat(whitelist, "*"); + } return 1; + case '\\': { + strcat(whitelist, "\\"); + } return 1; + } + + return 0; } -static bool magic(const char magic_char, const char to_enchant) { - switch(magic_char){ - // \i identifier character (see 'isident' option) - // \I like "\i", but excluding digits - // \k keyword character (see 'iskeyword' option) - // \K like "\k", but excluding digits - // \f file name character (see 'isfname' option) - // \F like "\f", but excluding digits - // \p printable character (see 'isprint' option) - // \P like "\p", but excluding digits +int escape_1_to_N(const char c, char * whitelist) { + switch(c) { + case 'i': { + const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, identifier_chars); + return sizeof(identifier_chars)-1; + }; + case 'I': { + const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, identifier_chars); + return sizeof(identifier_chars)-1; + }; + case 'k': { + const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'K': { + const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'f': { + const char filename_chars[] = "@0123456789/.-_+,#$%~="; + strcpy(whitelist, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'F': { + const char filename_chars[] = "@/.-_+,#$%~="; + strcpy(whitelist, keyword_chars); + return sizeof(keyword_chars)-1; + }; + case 'p': { + const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, printable_chars); + return sizeof(printable_chars)-1; + }; + case 'P': { + const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; + strcpy(whitelist, printable_chars); + return sizeof(printable_chars)-1; + }; case 's': { - return ((to_enchant == ' ') || (to_enchant == '\t')); - } - case 'S': { - return !((to_enchant == ' ') || (to_enchant == '\t')); - } - case 'd': { // [0-9] - return char_in_range('0', '9', to_enchant); + const char whitespace_chars[] = " \t\v\n"; + strcpy(whitelist, whitespace_chars); + return sizeof(whitespace_chars)-1; }; - case 'D': { // [^0-9] - return !char_in_range('0', '9', to_enchant); + case 'd': { + const char digit_chars[] = "0123456789"; + strcpy(whitelist, digit_chars); + return sizeof(digit_chars)-1; }; - case 'x': { // [0-9A-Fa-f] - return char_in_range('0', '9', to_enchant) || char_in_range('A', 'F', to_enchant) || char_in_range('a', 'f', to_enchant); + case 'x': { + const char hex_chars[] = "0123456789abcdefABCDEF"; + strcpy(whitelist, hex_chars); + return sizeof(hex_chars)-1; }; - case 'X': { // [^0-9A-Fa-f] - return !char_in_range('0', '9', to_enchant) && !char_in_range('A', 'F', to_enchant) && !char_in_range('a', 'f', to_enchant); + case 'o': { + const char oct_chars[] = "01234567"; + strcpy(whitelist, oct_chars); + return sizeof(oct_chars)-1; }; - case 'o': { // [0-7] - return char_in_range('0', '7', to_enchant); + case 'w': { + const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; + strcpy(whitelist, word_chars); + return sizeof(word_chars)-1; }; - case 'O': { // [^0-7] - return !char_in_range('0', '7', to_enchant); + case 'h': { + const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; + strcpy(whitelist, very_word_chars); + return sizeof(very_word_chars)-1; }; - case 'w': { // [0-9A-Za-z_] - return char_in_range('0', '9', to_enchant) || char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_'); + case 'a': { + const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ"; + strcpy(whitelist, alpha_chars); + return sizeof(alpha_chars)-1; }; - case 'W': { // [^0-9A-Za-z_] - return !(char_in_range('0', '9', to_enchant) || char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_')); + case 'l': { + const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; + strcpy(whitelist, lower_alpha_chars); + return sizeof(lower_alpha_chars)-1; }; - case 'h': { // [A-Za-z_] - return char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_'); - }; - case 'H': { // [^A-Za-z_] - return !(char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant) || (to_enchant == '_')); - }; - case 'a': { // [A-Za-z] - return char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant); - }; - case 'A': { // [A-Za-z] - return !(char_in_range('A', 'Z', to_enchant) || char_in_range('a', 'z', to_enchant)); - }; - case 'l': { // [a-z] - return char_in_range('a', 'z', to_enchant); - }; - case 'L': { // [^a-z] - return !(char_in_range('a', 'z', to_enchant)); - }; - case 'u': { // [A-Z] - return char_in_range('A', 'Z', to_enchant); - }; - case 'U': { // [^A-Z] - return !(char_in_range('A', 'Z', to_enchant)); + case 'u': { + const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; + strcpy(whitelist, upper_alpha_chars); + return sizeof(upper_alpha_chars)-1; }; } + return 0; +} + +int compile_range(const char * const range, + char * whitelist) { + assert(range[0] == '[' && "Not a range."); + + int r = 0; + const char * s; + for (s = range+1; *s != ']'; s++) { + assert(*s != '\00' && "Unclosed range."); + char c = *s; + if (escape_1_to_1(c, whitelist) + || escape_1_to_N(c, whitelist)) { + ; + } else if (*(s+1) == '-') { + char end = *(s+2); + assert(c < end && "Endless range."); + for (char cc = c; cc < end+1; cc++) { + strncat(whitelist, &cc, 1); + strncat(whitelist, "\00", 1); + } + s += 2; + } else { + ++r; + strncat(whitelist, &c, 1); + strncat(whitelist, "\00", 1); + } + } + + return ((s - range) + 1); +} + +regex_t * regex_compile(const char * const pattern) { + regex_t * r = new regex_t; + regex->str = strdup(pattern); + vector_init(regex->delta_table, sizeof(delta_t), 32); + vector_init(regex->catch_table, sizeof(offshoot_t), 16); + + int state = 0; + + char whitelist[64]; + bool do_catch; + for (const char * s = pattern; *s != '\00';) { + // Get token + assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); + whitelist[0] = '\00'; + do_catch = false; + switch (*s) { + case '.': { + do_catch = true; + } break; + case '\\': { + EAT(1); + if(escape_1_to_1(*s, whitelist) + || escape_1_to_N(*s, whitelist)){ + ; + } else { + assert(!"Unknown escape."); + } + } break; + case '[': { + EAT(compile_range(s, whitelist)-1); + } break; + default: { + whitelist[0] = *s; + whitelist[1] = '\00'; + } break; + } + + EAT(1); + + // Quantifier + switch (*s) { + case '?': { + HOOK_ALL(0, whitelist, +1); + EAT(1); + } break; + case '*': { + HOOK_ALL(0, whitelist, 0); + EAT(1); + } break; + case '+': { + HOOK_ALL(0, whitelist, +1); + state += 1; + HOOK_ALL(0, whitelist, 0); + EAT(1); + } break; + default: { // Literal + HOOK_ALL(0, whitelist, +1); + state += 1; + } break; + } + } + + regex->accepting_state = state; + + return r; +} + +int regex_free(regex_t * const regex) { + free(regex->str); + vector_free(regex->delta_table); + vector_free(regex->catch_table); + return 0; +} + +inline bool catch_(const regex_t * const regex, + int & state) { + + for (int i = 0; i < regex->catch_table->element_size; i++){ + const offshoot_t * const offshoot = (offshoot *)(vector_get(reg.catch_table, i)); + if (offshoot->in == state) { + state = offshoot->to; + return true; + } + } return false; } -int regex_match(const char * const pattern, - const char * const string_start, - const int string_offset, - int * match_offset_) { - const char * pattern_pointer = pattern; - const char * string_pointer = string_start + string_offset; - const char * const match_base = string_pointer; - int match_offset = 0; +bool regex_assert(const regex_t * const regex, + const char * const string, + int state) { - while (1488) { - // End of one of the arguments - if (!(*pattern_pointer)) { - break; - } - if (!(*string_pointer)) { - return false; - } - - // Escape character - if (*pattern_pointer == '\\') { - if (!is_next_valid(pattern_pointer)) { - return false; - } - - switch(*(pattern_pointer + 1)){ - case 't': { - if (*(string_pointer + 1) == '\t') { - pattern_pointer += 2; - string_pointer += 1; - } else { - return false; - } - } break; - case 'r': { - if (*(string_pointer + 1) == '\r') { - pattern_pointer += 2; - string_pointer += 1; - } else { - return false; - } - } break; - case 'e': { - if (*(string_pointer + 1) == '\033') { - pattern_pointer += 2; - string_pointer += 1; - } else { - return false; - } - } break; - case 'b': { - if (*(string_pointer + 1) == '\010') { - pattern_pointer += 2; - string_pointer += 1; - } else { - return false; - } - } break; - } - - if (*(pattern_pointer + 1) == '\\') { - if (*string_pointer == '\\') { - pattern_pointer += 2; - string_pointer += 1; - continue; + for (const char * s = string; *s != '\00'; s++) { + // delta + for (int i = 0; i < regex->delta_table.element_count; i++) { + const delta_t * const delta = (delta_t *)(vector_get(reg.delta_table, i); + if (delta->in == state) + && (delta->input == *s)) { + if(regex_assert(regex, s+1, delta->to)){ + return true; } } - - if (*(pattern_pointer + 1) == '<') { - if (is_word_separator(*string_pointer)) { - pattern_pointer += 2; - string_pointer += 1; - match_offset += 1; - continue; - } else if (string_pointer == string_start) { - pattern_pointer += 2; - continue; - } - } - - if (*(pattern_pointer + 1) == '>') { - if (is_word_separator(*string_pointer)) { - pattern_pointer += 2; - continue; - } - if (*(string_pointer + 1) == '\00') { - break; - } - } - - if (magic(*(pattern_pointer + 1), *string_pointer)) { - pattern_pointer += 2; - string_pointer += 1; - continue; - } - - return false; } - // Literal - if (*pattern_pointer != *string_pointer) { - return false; - } else { - ++pattern_pointer; - ++string_pointer; + if (catch_(regex, state)) { + continue; } + + return false; } - if (match_offset_) { - *match_offset_ = match_offset; - } - return (string_pointer - match_base) - match_offset; + return (state == regex->accepting_state); +} + +bool regex_search( regex_t * regex, + const char * const string) { + + if (regex == NULL) { + return false; + } + if (string == NULL) { + return true; + } + + return regex_assert(regex, string, 0); } diff --git a/source/regex.h b/source/regex.h index 11706f4..219c2ef 100644 --- a/source/regex.h +++ b/source/regex.h @@ -3,4 +3,6 @@ extern bool is_case_on; -extern int regex_match(const char * const pattern, const char * const string, const int string_offset, int * match_offset_); +extern regex_t * regex_compile(const char * const pattern); +extern int regex_match(const char * const pattern, const char * const string, const int string_offset, int * match_offset_); +extern int regex_free(regex_t * const regex); diff --git a/source/regex2.h b/source/regex2.h deleted file mode 100644 index 708c41f..0000000 --- a/source/regex2.h +++ /dev/null @@ -1,329 +0,0 @@ -#include -#include - -typedef struct { - int in; - char input; - int to; -} delta_t; - -typedef struct { - int in; - int to; -} offshoot_t; - -typedef struct { - char * str; - std::vector delta_table; - std::vector catch_table; - int accepting_state; -} regex_t; - -#define HALT_AND_CATCH_FIRE -1 - -#define HOOK_ALL(from, str, to) do { \ - for (char * s = str; *s != '\00'; s++) { \ - reg.delta_table.push_back( \ - delta_t{state + from, *s, state + to} \ - ); \ - } \ - if (do_catch) { \ - reg.catch_table.push_back( \ - {state + from, state + to} \ - ); \ - } \ -} while (0) - -#define EAT(n) do { \ - s += n; \ -} while (0) - -bool is_quantifier(const char c){ - for (const char * s = "+*?"; *s != '\00'; s++) { - if (*s == c) { - return true; - } - } - return false; -} - - -int escape_1_to_1(const char c, char * whitelist) { - switch(c) { - case 't': { - strcat(whitelist, "\t"); - } return 1; - case 'n': { - strcat(whitelist, "\n"); - } return 1; - case 'r': { - strcat(whitelist, "\r"); - } return 1; - case 'b': { - strcat(whitelist, "\b"); - } return 1; - case '[': { - strcat(whitelist, "["); - } return 1; - case ']': { - strcat(whitelist, "]"); - } return 1; - case '.': { - strcat(whitelist, "."); - } return 1; - case '?': { - strcat(whitelist, "?"); - } return 1; - case '+': { - strcat(whitelist, "+"); - } return 1; - case '*': { - strcat(whitelist, "*"); - } return 1; - case '\\': { - strcat(whitelist, "\\"); - } return 1; - } - - return 0; -} - -int escape_1_to_N(const char c, char * whitelist) { - switch(c) { - case 'i': { - const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'I': { - const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'k': { - const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'K': { - const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'f': { - const char filename_chars[] = "@0123456789/.-_+,#$%~="; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'F': { - const char filename_chars[] = "@/.-_+,#$%~="; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'p': { - const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, printable_chars); - return sizeof(printable_chars)-1; - }; - case 'P': { - const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, printable_chars); - return sizeof(printable_chars)-1; - }; - case 's': { - const char whitespace_chars[] = " \t\v\n"; - strcpy(whitelist, whitespace_chars); - return sizeof(whitespace_chars)-1; - }; - case 'd': { - const char digit_chars[] = "0123456789"; - strcpy(whitelist, digit_chars); - return sizeof(digit_chars)-1; - }; - case 'x': { - const char hex_chars[] = "0123456789abcdefABCDEF"; - strcpy(whitelist, hex_chars); - return sizeof(hex_chars)-1; - }; - case 'o': { - const char oct_chars[] = "01234567"; - strcpy(whitelist, oct_chars); - return sizeof(oct_chars)-1; - }; - case 'w': { - const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; - strcpy(whitelist, word_chars); - return sizeof(word_chars)-1; - }; - case 'h': { - const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; - strcpy(whitelist, very_word_chars); - return sizeof(very_word_chars)-1; - }; - case 'a': { - const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(whitelist, alpha_chars); - return sizeof(alpha_chars)-1; - }; - case 'l': { - const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; - strcpy(whitelist, lower_alpha_chars); - return sizeof(lower_alpha_chars)-1; - }; - case 'u': { - const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(whitelist, upper_alpha_chars); - return sizeof(upper_alpha_chars)-1; - }; - } - - return 0; -} - -int compile_range(const char * const range, - char * whitelist) { - assert(range[0] == '[' && "Not a range."); - - int r = 0; - const char * s; - for (s = range+1; *s != ']'; s++) { - assert(*s != '\00' && "Unclosed range."); - char c = *s; - if (escape_1_to_1(c, whitelist) - || escape_1_to_N(c, whitelist)) { - ; - } else if (*(s+1) == '-') { - char end = *(s+2); - assert(c < end && "Endless range."); - for (char cc = c; cc < end+1; cc++) { - strncat(whitelist, &cc, 1); - strncat(whitelist, "\00", 1); - } - s += 2; - } else { - ++r; - strncat(whitelist, &c, 1); - strncat(whitelist, "\00", 1); - } - } - - return ((s - range) + 1); -} - -regex_t * regex_compile(const char * const pattern) { - regex_t * r = new regex_t; - regex_t ® = *r; - reg.str = strdup(pattern); - - int state = 0; - - char whitelist[64]; - bool do_catch; - for (const char * s = pattern; *s != '\00';) { - // Get token - assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); - whitelist[0] = '\00'; - do_catch = false; - switch (*s) { - case '.': { - do_catch = true; - } break; - case '\\': { - EAT(1); - if(escape_1_to_1(*s, whitelist) - || escape_1_to_N(*s, whitelist)){ - ; - } else { - assert(!"Unknown escape."); - } - } break; - case '[': { - EAT(compile_range(s, whitelist)-1); - } break; - default: { - whitelist[0] = *s; - whitelist[1] = '\00'; - } break; - } - - EAT(1); - - // Quantifier - switch (*s) { - case '?': { - HOOK_ALL(0, whitelist, +1); - EAT(1); - } break; - case '*': { - HOOK_ALL(0, whitelist, 0); - EAT(1); - } break; - case '+': { - HOOK_ALL(0, whitelist, +1); - state += 1; - HOOK_ALL(0, whitelist, 0); - EAT(1); - } break; - default: { // Literal - HOOK_ALL(0, whitelist, +1); - state += 1; - } break; - } - } - - reg.accepting_state = state; - - return r; -} - -inline bool catch_(const regex_t * regex, - int & state) { - - const regex_t ® = *regex; - for (int i = 0; i < reg.catch_table.size(); i++){ - if (reg.catch_table[i].in == state) { - state = reg.catch_table[i].to; - return true; - } - } - return false; -} - -bool regex_assert(const regex_t * const regex, - const char * const string, - int state) { - - const regex_t ® = *regex; - for (const char * s = string; *s != '\00'; s++) { - // delta - for (int i = 0; i < reg.delta_table.size(); i++) { - if ((reg.delta_table[i].in == state) - && (reg.delta_table[i].input == *s)) { - if(regex_assert(regex, s+1, reg.delta_table[i].to)){ - return true; - } - } - } - - if (catch_(regex, state)) { - continue; - } - - return false; - } - - return (state == regex->accepting_state); -} - -bool regex_search( regex_t * regex, - const char * const string) { - - if (regex == NULL) { - return false; - } - if (string == NULL) { - return true; - } - - return regex_assert(regex, string, 0); -} diff --git a/source/regex2.hpp b/source/regex2.hpp deleted file mode 100644 index de3f4c9..0000000 --- a/source/regex2.hpp +++ /dev/null @@ -1,331 +0,0 @@ -#include -#include -#include "vector.h" - -typedef struct { - int in; - char input; - int to; -} delta_t; - -typedef struct { - int in; - int to; -} offshoot_t; - -typedef struct { - char * str; - vector_t delta_table; // - vector_t catch_table; // - int accepting_state; -} regex_t; - -#define HALT_AND_CATCH_FIRE -1 - -#define HOOK_ALL(from, str, to) do { \ - for (char * s = str; *s != '\00'; s++) { \ - vector_push(®.delta_table \ - (delta_t *){state + from, *s, state + to} \ - ); \ - } \ - if (do_catch) { \ - vector_push(®.catch_table \ - (offshoot_t *){state + from, state + to} \ - ); \ - } \ -} while (0) - -#define EAT(n) do { \ - s += n; \ -} while (0) - -bool is_quantifier(const char c){ - for (const char * s = "+*?"; *s != '\00'; s++) { - if (*s == c) { - return true; - } - } - return false; -} - - -int escape_1_to_1(const char c, char * whitelist) { - switch(c) { - case 't': { - strcat(whitelist, "\t"); - } return 1; - case 'n': { - strcat(whitelist, "\n"); - } return 1; - case 'r': { - strcat(whitelist, "\r"); - } return 1; - case 'b': { - strcat(whitelist, "\b"); - } return 1; - case '[': { - strcat(whitelist, "["); - } return 1; - case ']': { - strcat(whitelist, "]"); - } return 1; - case '.': { - strcat(whitelist, "."); - } return 1; - case '?': { - strcat(whitelist, "?"); - } return 1; - case '+': { - strcat(whitelist, "+"); - } return 1; - case '*': { - strcat(whitelist, "*"); - } return 1; - case '\\': { - strcat(whitelist, "\\"); - } return 1; - } - - return 0; -} - -int escape_1_to_N(const char c, char * whitelist) { - switch(c) { - case 'i': { - const char identifier_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'I': { - const char identifier_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, identifier_chars); - return sizeof(identifier_chars)-1; - }; - case 'k': { - const char keyword_chars[] = "@0123456789_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'K': { - const char keyword_chars[] = "@_\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'f': { - const char filename_chars[] = "@0123456789/.-_+,#$%~="; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'F': { - const char filename_chars[] = "@/.-_+,#$%~="; - strcpy(whitelist, keyword_chars); - return sizeof(keyword_chars)-1; - }; - case 'p': { - const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, printable_chars); - return sizeof(printable_chars)-1; - }; - case 'P': { - const char printable_chars[] = "@\241\242\243\244\245\246\247\250\251\252\253\254\255\256\257\260\261\262\263\264\265\266\267\270\271\272\273\274\275\276\277\300\301\302\303\304\305\306\307\310\311\312\313\314\315\316\317\320\321\322\323\324\325\326\327\330\331\332\333\334\335\336\337"; - strcpy(whitelist, printable_chars); - return sizeof(printable_chars)-1; - }; - case 's': { - const char whitespace_chars[] = " \t\v\n"; - strcpy(whitelist, whitespace_chars); - return sizeof(whitespace_chars)-1; - }; - case 'd': { - const char digit_chars[] = "0123456789"; - strcpy(whitelist, digit_chars); - return sizeof(digit_chars)-1; - }; - case 'x': { - const char hex_chars[] = "0123456789abcdefABCDEF"; - strcpy(whitelist, hex_chars); - return sizeof(hex_chars)-1; - }; - case 'o': { - const char oct_chars[] = "01234567"; - strcpy(whitelist, oct_chars); - return sizeof(oct_chars)-1; - }; - case 'w': { - const char word_chars[] = "0123456789abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; - strcpy(whitelist, word_chars); - return sizeof(word_chars)-1; - }; - case 'h': { - const char very_word_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ_"; - strcpy(whitelist, very_word_chars); - return sizeof(very_word_chars)-1; - }; - case 'a': { - const char alpha_chars[] = "abcdefghijklmnopqrstuwxyzABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(whitelist, alpha_chars); - return sizeof(alpha_chars)-1; - }; - case 'l': { - const char lower_alpha_chars[] = "abcdefghijklmnopqrstuwxyz"; - strcpy(whitelist, lower_alpha_chars); - return sizeof(lower_alpha_chars)-1; - }; - case 'u': { - const char upper_alpha_chars[] = "ABCDEFGHIJKLMNOPQRSTUWXYZ"; - strcpy(whitelist, upper_alpha_chars); - return sizeof(upper_alpha_chars)-1; - }; - } - - return 0; -} - -int compile_range(const char * const range, - char * whitelist) { - assert(range[0] == '[' && "Not a range."); - - int r = 0; - const char * s; - for (s = range+1; *s != ']'; s++) { - assert(*s != '\00' && "Unclosed range."); - char c = *s; - if (escape_1_to_1(c, whitelist) - || escape_1_to_N(c, whitelist)) { - ; - } else if (*(s+1) == '-') { - char end = *(s+2); - assert(c < end && "Endless range."); - for (char cc = c; cc < end+1; cc++) { - strncat(whitelist, &cc, 1); - strncat(whitelist, "\00", 1); - } - s += 2; - } else { - ++r; - strncat(whitelist, &c, 1); - strncat(whitelist, "\00", 1); - } - } - - return ((s - range) + 1); -} - -regex_t * regex_compile(const char * const pattern) { - regex_t * r = new regex_t; - regex_t ® = *r; - reg.str = strdup(pattern); - - int state = 0; - - char whitelist[64]; - bool do_catch; - for (const char * s = pattern; *s != '\00';) { - // Get token - assert(!is_quantifier(*pattern) && "Pattern starts with quantifier."); - whitelist[0] = '\00'; - do_catch = false; - switch (*s) { - case '.': { - do_catch = true; - } break; - case '\\': { - EAT(1); - if(escape_1_to_1(*s, whitelist) - || escape_1_to_N(*s, whitelist)){ - ; - } else { - assert(!"Unknown escape."); - } - } break; - case '[': { - EAT(compile_range(s, whitelist)-1); - } break; - default: { - whitelist[0] = *s; - whitelist[1] = '\00'; - } break; - } - - EAT(1); - - // Quantifier - switch (*s) { - case '?': { - HOOK_ALL(0, whitelist, +1); - EAT(1); - } break; - case '*': { - HOOK_ALL(0, whitelist, 0); - EAT(1); - } break; - case '+': { - HOOK_ALL(0, whitelist, +1); - state += 1; - HOOK_ALL(0, whitelist, 0); - EAT(1); - } break; - default: { // Literal - HOOK_ALL(0, whitelist, +1); - state += 1; - } break; - } - } - - reg.accepting_state = state; - - return r; -} - -inline bool catch_(const regex_t * regex, - int & state) { - - const regex_t ® = *regex; - for (int i = 0; i < reg.catch_table.size(); i++){ - if (reg.catch_table[i].in == state) { - state = reg.catch_table[i].to; - return true; - } - } - return false; -} - -bool regex_assert(const regex_t * const regex, - const char * const string, - int state) { - - const regex_t ® = *regex; - for (const char * s = string; *s != '\00'; s++) { - // delta - for (int i = 0; i < reg.delta_table.size(); i++) { - const delta_t * const delta = (delta_t *)(vector_get(reg.delta_table, i); - if (delta->in == state) - && (delta->input == *s)) { - if(regex_assert(regex, s+1, delta->to)){ - return true; - } - } - } - - if (catch_(regex, state)) { - continue; - } - - return false; - } - - return (state == regex->accepting_state); -} - -bool regex_search( regex_t * regex, - const char * const string) { - - if (regex == NULL) { - return false; - } - if (string == NULL) { - return true; - } - - return regex_assert(regex, string, 0); -}