Compare commits
5 Commits
7b6ba63819
...
3d56f3021b
Author | SHA1 | Date | |
---|---|---|---|
3d56f3021b | |||
8d642f14ce | |||
d9a148d825 | |||
900d7ecf7e | |||
84a5d503dc |
@ -1,11 +1,11 @@
|
||||
# Abstraction
|
||||
+---------------------+
|
||||
| |
|
||||
| |
|
||||
| State register |
|
||||
| |
|
||||
| |
|
||||
+---------------------+
|
||||
+---------------------+
|
||||
| |
|
||||
| |
|
||||
| State register |
|
||||
| |
|
||||
| |
|
||||
+---------------------+
|
||||
|
||||
|
||||
+---------------------------------+
|
||||
@ -16,3 +16,87 @@
|
||||
+---------------------------------+
|
||||
| Fallback transition table |
|
||||
+---------------------------------+
|
||||
|
||||
---
|
||||
State transition table look up
|
||||
+ success --> continue
|
||||
+ fail --> look up fallback table
|
||||
+ success --> continue
|
||||
+ fail --> return
|
||||
? EOS --> look up fallback table
|
||||
+ success --> is 0 width?
|
||||
+ success --> continue
|
||||
+ fail --> return
|
||||
+ fail --> return
|
||||
---
|
||||
##### HALT\_AND\_CATCH\_FIRE
|
||||
H&C is a special state signalling that we have hit a dead end.
|
||||
The reason why we need it, and why we can't just instantly quit, is backtracking.
|
||||
|
||||
---
|
||||
##### [^example]
|
||||
This is a negative range.
|
||||
```
|
||||
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
|
||||
```
|
||||
None of the characters in $myNegativeRange must be accepted.
|
||||
The way this is compiled is that we first hook all chars in $myNegativeRange to H&C,
|
||||
then define an OFFSHOOT of width 1.
|
||||
Put differently:
|
||||
if we read something illegal we abort this branch,
|
||||
if what we read was not illegal, we deduce that it must have been legal and we continue.
|
||||
|
||||
Handling "negatives" this way allows us to be "alphabet agnostic" in a sense.
|
||||
Many implementations will presume ASCII, with its fixed 7/8 bit width
|
||||
and create look up tables.
|
||||
Which is fast and cute, but this strategy becomes a giant memory hog
|
||||
if we ever wanted to use it on, say UTF-8 (from 256 te/c (table entries per char) to 4'294'967'295 te/c).
|
||||
|
||||
|
||||
#### .
|
||||
This is the dot operator.
|
||||
It matches any 1 char.
|
||||
|
||||
Similar to how negative ranges are implemented,
|
||||
it takes advantage of the fallback table.
|
||||
It simply ignores the state transition table and rather unconditionally hooks itself to the next state.
|
||||
|
||||
|
||||
#### ^
|
||||
This is the caret operator.
|
||||
It matches the SOS (start of the string).
|
||||
|
||||
For explanation purposes multilining (match '\n') is irrelevant.
|
||||
That behaves just like a literal.
|
||||
|
||||
What is more interesting is how SOS is recognized.
|
||||
Since `regex_assert()` is recursive, the current state is continuously passed along,
|
||||
however, at our first frame, it's not just always 0.
|
||||
`regex_match()` decides depending on the current position of the string.
|
||||
Basically, we have the first 2 states (0, 1) reserved and always missing from the state transition table.
|
||||
+ 0 - SOS
|
||||
+ 1 - !SOS
|
||||
Normally both are _hooked_ to state 2,
|
||||
and we pretend nothing has ever happened.
|
||||
But when the caret operator is compiled, it sets a special compiler flag FORCE\_START\_OF\_STRING,
|
||||
which forbids the hooking of state 1 to 2,
|
||||
therefore when `regex_match()` calls from, say, position 2,
|
||||
it passes in 1 as the starting state,
|
||||
no state transition table entry will be found since that's forbidden to begin with,
|
||||
no jumps are found(!),
|
||||
the machine checks whether the current state (1) is the accepting state (>=2)
|
||||
and finally returns failure.
|
||||
|
||||
|
||||
#### \<
|
||||
This is the SOW (start of word) operator.
|
||||
SOW must match:
|
||||
```
|
||||
^myword
|
||||
[^\h]myword
|
||||
```
|
||||
Not only that, this combination is key,
|
||||
either it has to be the start of the string
|
||||
or there has to be at least something which is not a symbol char.
|
||||
Without the last condition, "eexample" would match "\\\<example\\\>"
|
||||
as the iteration of `regex_match()` reaches "example".
|
||||
|
4
documentation/TODO.md
Normal file
4
documentation/TODO.md
Normal file
@ -0,0 +1,4 @@
|
||||
[ ] wchar\_t support
|
||||
[ ] UTF-8 support
|
||||
[ ] arbitrary memory support (this probably covers UTF-8 support)
|
||||
[ ] documentation that's not shit
|
@ -68,13 +68,13 @@ bool is_magic(const char c) {
|
||||
"\331\332\333\334\335" \
|
||||
"\336\337"
|
||||
#define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
|
||||
#define JEGER_CHAR_SET_whitespace " \t\v\n"
|
||||
#define JEGER_CHAR_SET_whitespace " " "\t\v\n"
|
||||
|
||||
static const char JEGER_CHAR_very_word_chars[] =
|
||||
JEGER_CHAR_SET_underscore
|
||||
JEGER_CHAR_SET_lower
|
||||
JEGER_CHAR_SET_upper
|
||||
;
|
||||
static const char JEGER_CHAR_symbol_chars[] =
|
||||
JEGER_CHAR_SET_underscore
|
||||
JEGER_CHAR_SET_lower
|
||||
JEGER_CHAR_SET_upper
|
||||
;
|
||||
|
||||
// ----------------------
|
||||
// ### Internal Types ###
|
||||
@ -318,9 +318,9 @@ int escape_1_to_N(const char c,
|
||||
return sizeof(word_chars)-1;
|
||||
};
|
||||
case 'h': {
|
||||
// #global JEGER_CHAR_very_word_chars
|
||||
strcpy(target_list, JEGER_CHAR_very_word_chars);
|
||||
return sizeof(JEGER_CHAR_very_word_chars)-1;
|
||||
// #global JEGER_CHAR_symbol_chars
|
||||
strcpy(target_list, JEGER_CHAR_symbol_chars);
|
||||
return sizeof(JEGER_CHAR_symbol_chars)-1;
|
||||
};
|
||||
case 'a': {
|
||||
const char alpha_chars[] = JEGER_CHAR_SET_lower
|
||||
@ -503,7 +503,7 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
if (compile_escape(*s, &cs)) {
|
||||
s += 1;
|
||||
} else if (is_hologram_escape(*s)) {
|
||||
;
|
||||
s -= 1;
|
||||
} else {
|
||||
assert("Unknown escape.");
|
||||
}
|
||||
@ -518,6 +518,12 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
} break;
|
||||
}
|
||||
|
||||
/* Ew */
|
||||
if (*s == '\\'
|
||||
&& is_hologram_escape(*(s+1))) {
|
||||
++s;
|
||||
}
|
||||
|
||||
// Compile char
|
||||
switch (*s) {
|
||||
// holograms
|
||||
@ -533,18 +539,28 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
s += 1;
|
||||
} break;
|
||||
case '<': {
|
||||
cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
|
||||
if (cs.flags & IS_AT_THE_BEGINNING) {
|
||||
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
|
||||
unsigned true_inc = 1;
|
||||
if ((cs.flags & DO_CATCH)
|
||||
|| (cs.flags & IS_NEGATIVE)) {
|
||||
OFFSHOOT(0, +1, 1, 1, &cs, regex);
|
||||
OFFSHOOT(+1, +2, 1, 1, &cs, regex);
|
||||
++true_inc;
|
||||
} else {
|
||||
cs.flags |= INCREMENT_STATE;
|
||||
}
|
||||
strcat(blacklist, JEGER_CHAR_very_word_chars);
|
||||
OFFSHOOT(0, 0, 1, 0, &cs, regex);
|
||||
cs.flags |= IS_NEGATIVE;
|
||||
if (cs.flags & IS_AT_THE_BEGINNING) {
|
||||
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE + true_inc, 0, 0, regex);
|
||||
}
|
||||
strcat(blacklist, JEGER_CHAR_symbol_chars);
|
||||
//OFFSHOOT(0 + (true_inc-1), +true_inc, 1, 0, &cs, regex);
|
||||
s += 1;
|
||||
} break;
|
||||
case '>': {
|
||||
HOOK_ALL(0, whitelist, 0, &cs, regex);
|
||||
cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
|
||||
strcat(blacklist, JEGER_CHAR_very_word_chars);
|
||||
OFFSHOOT(0, 1, 0, 0, &cs, regex);
|
||||
strcat(blacklist, JEGER_CHAR_symbol_chars);
|
||||
OFFSHOOT(+1, +2, 0, 0, &cs, regex);
|
||||
s += 1;
|
||||
} break;
|
||||
// quantifiers
|
||||
@ -729,8 +745,8 @@ match_t * regex_match(const regex_t * const regex,
|
||||
// Find all matches
|
||||
{
|
||||
const char * s = string;
|
||||
int initial_state;
|
||||
do {
|
||||
int initial_state;
|
||||
initial_state = (int)(!(is_start_of_string && (s == string)));
|
||||
|
||||
*match = (match_t){
|
||||
|
@ -95,15 +95,17 @@ signed main() {
|
||||
|
||||
TEST( R"del(\<test)del", "test", true);
|
||||
TEST( R"del(test\>)del", "test", true);
|
||||
TEST( R"del(\<test)del", "atest", false);
|
||||
TEST( R"del(\<test)del", "ttest", false);
|
||||
TEST( R"del(test\>)del", "testa", false);
|
||||
TEST(R"del(\<test\>)del", "test", true);
|
||||
|
||||
puts("");
|
||||
|
||||
TEST(R"del(\<int\>)del", "printf", false);
|
||||
TEST(R"del(\<print\>)del", " print ", true);
|
||||
TEST(R"del(\<print\>)del", "\nprint\n", true);
|
||||
TEST( R"del(\<int\>)del", "printf", false);
|
||||
TEST(R"del(.\<print\>.)del", " print ", true);
|
||||
TEST(R"del(.\<print\>.)del", "fprint", false);
|
||||
TEST(R"del(.\<print\>.)del", "printf", false);
|
||||
TEST(R"del(.\<print\>.)del", "fprintf", false);
|
||||
|
||||
if (test_counter == passed_tests) {
|
||||
fputs("\033[32m", stdout);
|
||||
|
@ -12,7 +12,7 @@ static int passed_tests2 = 0;
|
||||
|
||||
static
|
||||
void asprint_match_t( char * * destination,
|
||||
const match_t * const match){
|
||||
const match_t * const match) {
|
||||
if (match) {
|
||||
asprintf(destination, "%p {%d, %d}", (void *)match, match->position, match->width);
|
||||
} else {
|
||||
@ -21,11 +21,11 @@ void asprint_match_t( char * * destination,
|
||||
}
|
||||
|
||||
static
|
||||
void print_leader(const bool passed){
|
||||
void print_leader(const bool passed, const int n) {
|
||||
if (passed) {
|
||||
printf("\033[32;1mSuccess\033[0;1m. - \033[0m");
|
||||
printf("\033[32;1mSuccess\033[0m. %02d\033[1m - \033[0m", n);
|
||||
} else {
|
||||
printf("\033[31;1mFailiour\033[0;1m. - \033[0m");
|
||||
printf("\033[31;1mFailiour\033[0m. %02d\033[1m - \033[0m", n);
|
||||
}
|
||||
}
|
||||
|
||||
@ -39,9 +39,10 @@ void do_flush(void) {
|
||||
static
|
||||
void TEST(const char * const what,
|
||||
const char * const on,
|
||||
const bool expect){
|
||||
const bool expect) {
|
||||
|
||||
do_flush();
|
||||
++test_counter;
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
bool result = regex_search(r, on);
|
||||
@ -51,7 +52,7 @@ void TEST(const char * const what,
|
||||
|
||||
expect ? ++positives : ++negatives;
|
||||
|
||||
print_leader(passed);
|
||||
print_leader(passed, test_counter);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf("ed_what, "'%s'", what);
|
||||
@ -63,8 +64,6 @@ void TEST(const char * const what,
|
||||
++passed_tests;
|
||||
expect ? ++positive_successes : ++negative_successes;
|
||||
}
|
||||
|
||||
++test_counter;
|
||||
}
|
||||
|
||||
static
|
||||
@ -73,6 +72,7 @@ void TEST2(const char * const what,
|
||||
const match_t expect){
|
||||
|
||||
do_flush();
|
||||
++test_counter2;
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
match_t * result = regex_match(r, on, true);
|
||||
@ -80,7 +80,7 @@ void TEST2(const char * const what,
|
||||
&& result->width == expect.width
|
||||
);
|
||||
|
||||
print_leader(passed);
|
||||
print_leader(passed, test_counter2);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf("ed_what, "'%s'", what);
|
||||
@ -98,6 +98,4 @@ void TEST2(const char * const what,
|
||||
if (passed) {
|
||||
++passed_tests2;
|
||||
}
|
||||
|
||||
++test_counter2;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user