Compare commits

...

5 Commits

Author SHA1 Message Date
3d56f3021b solved the \< situation; \> WIP 2023-09-22 20:37:20 +02:00
8d642f14ce print test number 2023-09-22 20:36:42 +02:00
d9a148d825 slightly improoved tests 2023-09-22 20:36:33 +02:00
900d7ecf7e documentation of some value 2023-09-22 20:36:14 +02:00
84a5d503dc todo 2023-09-22 20:36:05 +02:00
5 changed files with 144 additions and 40 deletions

View File

@ -1,11 +1,11 @@
# Abstraction
+---------------------+
| |
| |
| State register |
| |
| |
+---------------------+
+---------------------+
| |
| |
| State register |
| |
| |
+---------------------+
+---------------------------------+
@ -16,3 +16,87 @@
+---------------------------------+
| Fallback transition table |
+---------------------------------+
---
State transition table look up
+ success --> continue
+ fail --> look up fallback table
+ success --> continue
+ fail --> return
? EOS --> look up fallback table
+ success --> is 0 width?
+ success --> continue
+ fail --> return
+ fail --> return
---
##### HALT\_AND\_CATCH\_FIRE
H&C is a special state signalling that we have hit a dead end.
The reason why we need it, and why we can't just instantly quit, is backtracking.
---
##### [^example]
This is a negative range.
```
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
```
None of the characters in $myNegativeRange must be accepted.
The way this is compiled is that we first hook all chars in $myNegativeRange to H&C,
then define an OFFSHOOT of width 1.
Put differently:
if we read something illegal we abort this branch,
if what we read was not illegal, we deduce that it must have been legal and we continue.
Handling "negatives" this way allows us to be "alphabet agnostic" in a sense.
Many implementations will presume ASCII, with its fixed 7/8 bit width
and create look up tables.
Which is fast and cute, but this strategy becomes a giant memory hog
if we ever wanted to use it on, say UTF-8 (from 256 te/c (table entries per char) to 4'294'967'295 te/c).
#### .
This is the dot operator.
It matches any 1 char.
Similarly to how negative ranges are implemented,
it takes advantage of the fallback table.
It simply ignores the state transition table and rather unconditionally hooks itself to the next state.
#### ^
This is the caret operator.
It matches the SOS (start of the string).
For explanation purposes multilining (match '\n') is irrelevant.
That behaves just like a literal.
What is more interesting is how SOS is recognized.
Since `regex_assert()` is recursive, the current state is continuously passed along;
however, at our first frame it's not just always 0.
`regex_match()` decides depending on the current position of the string.
Basically we have the first 2 states (0, 1) reserved and always missing from the state transition table.
+ 0 - SOS
+ 1 - !SOS
Normally both are _hooked_ to state 2,
and we pretend nothing has ever happened.
But when the caret operator is compiled, it sets a special compiler flag FORCE\_START\_OF\_STRING,
which forbids the hooking of state 1 to 2,
therefore when `regex_match()` calls from, say, position 2,
it passes in 1 as the starting state,
no state transition table entry will be found since that's forbidden to begin with,
no jumps are found(!),
the machine checks whether the current state (1) is the accepting state (>=2)
and finally returns failure.
#### \<
This is the SOW (start of word) operator.
SOW must match:
```
^myword
[^\h]myword
```
Not only that, this combination is key:
either it has to be the start of the string
or it has to be preceded by at least one char which is not a symbol char.
Without the last condition "eexample" would match "\\\<example\\\>"
as the iteration of `regex_match()` reaches "example".

4
documentation/TODO.md Normal file
View File

@ -0,0 +1,4 @@
[ ] wchar\_t support
[ ] UTF-8 support
[ ] arbitrary memory support (this probably covers UTF-8 support)
[ ] documentation that's not shit

View File

@ -68,13 +68,13 @@ bool is_magic(const char c) {
"\331\332\333\334\335" \
"\336\337"
#define JEGER_CHAR_SET_file_extra "/.-_+,#$%~="
#define JEGER_CHAR_SET_whitespace " \t\v\n"
#define JEGER_CHAR_SET_whitespace " " "\t\v\n"
static const char JEGER_CHAR_very_word_chars[] =
JEGER_CHAR_SET_underscore
JEGER_CHAR_SET_lower
JEGER_CHAR_SET_upper
;
static const char JEGER_CHAR_symbol_chars[] =
JEGER_CHAR_SET_underscore
JEGER_CHAR_SET_lower
JEGER_CHAR_SET_upper
;
// ----------------------
// ### Internal Types ###
@ -318,9 +318,9 @@ int escape_1_to_N(const char c,
return sizeof(word_chars)-1;
};
case 'h': {
// #global JEGER_CHAR_very_word_chars
strcpy(target_list, JEGER_CHAR_very_word_chars);
return sizeof(JEGER_CHAR_very_word_chars)-1;
// #global JEGER_CHAR_symbol_chars
strcpy(target_list, JEGER_CHAR_symbol_chars);
return sizeof(JEGER_CHAR_symbol_chars)-1;
};
case 'a': {
const char alpha_chars[] = JEGER_CHAR_SET_lower
@ -503,7 +503,7 @@ regex_t * regex_compile(const char * const pattern) {
if (compile_escape(*s, &cs)) {
s += 1;
} else if (is_hologram_escape(*s)) {
;
s -= 1;
} else {
assert("Unknown escape.");
}
@ -518,6 +518,12 @@ regex_t * regex_compile(const char * const pattern) {
} break;
}
/* Ew */
if (*s == '\\'
&& is_hologram_escape(*(s+1))) {
++s;
}
// Compile char
switch (*s) {
// holograms
@ -533,18 +539,28 @@ regex_t * regex_compile(const char * const pattern) {
s += 1;
} break;
case '<': {
cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
if (cs.flags & IS_AT_THE_BEGINNING) {
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE+1, 0, 0, regex);
unsigned true_inc = 1;
if ((cs.flags & DO_CATCH)
|| (cs.flags & IS_NEGATIVE)) {
OFFSHOOT(0, +1, 1, 1, &cs, regex);
OFFSHOOT(+1, +2, 1, 1, &cs, regex);
++true_inc;
} else {
cs.flags |= INCREMENT_STATE;
}
strcat(blacklist, JEGER_CHAR_very_word_chars);
OFFSHOOT(0, 0, 1, 0, &cs, regex);
cs.flags |= IS_NEGATIVE;
if (cs.flags & IS_AT_THE_BEGINNING) {
ABSOLUTE_OFFSHOOT(0, JEGER_INIT_STATE + true_inc, 0, 0, regex);
}
strcat(blacklist, JEGER_CHAR_symbol_chars);
//OFFSHOOT(0 + (true_inc-1), +true_inc, 1, 0, &cs, regex);
s += 1;
} break;
case '>': {
HOOK_ALL(0, whitelist, 0, &cs, regex);
cs.flags |= IS_NEGATIVE | INCREMENT_STATE;
strcat(blacklist, JEGER_CHAR_very_word_chars);
OFFSHOOT(0, 1, 0, 0, &cs, regex);
strcat(blacklist, JEGER_CHAR_symbol_chars);
OFFSHOOT(+1, +2, 0, 0, &cs, regex);
s += 1;
} break;
// quantifiers
@ -729,8 +745,8 @@ match_t * regex_match(const regex_t * const regex,
// Find all matches
{
const char * s = string;
int initial_state;
do {
int initial_state;
initial_state = (int)(!(is_start_of_string && (s == string)));
*match = (match_t){

View File

@ -95,15 +95,17 @@ signed main() {
TEST( R"del(\<test)del", "test", true);
TEST( R"del(test\>)del", "test", true);
TEST( R"del(\<test)del", "atest", false);
TEST( R"del(\<test)del", "ttest", false);
TEST( R"del(test\>)del", "testa", false);
TEST(R"del(\<test\>)del", "test", true);
puts("");
TEST(R"del(\<int\>)del", "printf", false);
TEST(R"del(\<print\>)del", " print ", true);
TEST(R"del(\<print\>)del", "\nprint\n", true);
TEST( R"del(\<int\>)del", "printf", false);
TEST(R"del(.\<print\>.)del", " print ", true);
TEST(R"del(.\<print\>.)del", "fprint", false);
TEST(R"del(.\<print\>.)del", "printf", false);
TEST(R"del(.\<print\>.)del", "fprintf", false);
if (test_counter == passed_tests) {
fputs("\033[32m", stdout);

View File

@ -12,7 +12,7 @@ static int passed_tests2 = 0;
static
void asprint_match_t( char * * destination,
const match_t * const match){
const match_t * const match) {
if (match) {
asprintf(destination, "%p {%d, %d}", (void *)match, match->position, match->width);
} else {
@ -21,11 +21,11 @@ void asprint_match_t( char * * destination,
}
static
void print_leader(const bool passed){
void print_leader(const bool passed, const int n) {
if (passed) {
printf("\033[32;1mSuccess\033[0;1m. - \033[0m");
printf("\033[32;1mSuccess\033[0m. %02d\033[1m - \033[0m", n);
} else {
printf("\033[31;1mFailiour\033[0;1m. - \033[0m");
printf("\033[31;1mFailiour\033[0m. %02d\033[1m - \033[0m", n);
}
}
@ -39,9 +39,10 @@ void do_flush(void) {
static
void TEST(const char * const what,
const char * const on,
const bool expect){
const bool expect) {
do_flush();
++test_counter;
regex_t * r = regex_compile(what);
bool result = regex_search(r, on);
@ -51,7 +52,7 @@ void TEST(const char * const what,
expect ? ++positives : ++negatives;
print_leader(passed);
print_leader(passed, test_counter);
char * quoted_what, * quoted_on;
asprintf(&quoted_what, "'%s'", what);
@ -63,8 +64,6 @@ void TEST(const char * const what,
++passed_tests;
expect ? ++positive_successes : ++negative_successes;
}
++test_counter;
}
static
@ -73,6 +72,7 @@ void TEST2(const char * const what,
const match_t expect){
do_flush();
++test_counter2;
regex_t * r = regex_compile(what);
match_t * result = regex_match(r, on, true);
@ -80,7 +80,7 @@ void TEST2(const char * const what,
&& result->width == expect.width
);
print_leader(passed);
print_leader(passed, test_counter2);
char * quoted_what, * quoted_on;
asprintf(&quoted_what, "'%s'", what);
@ -98,6 +98,4 @@ void TEST2(const char * const what,
if (passed) {
++passed_tests2;
}
++test_counter2;
}