Compare commits

...

4 Commits

Author SHA1 Message Date
e4106e1a69 . 2023-09-26 16:18:38 +02:00
99127233db backtracking fix; '\<test' == 'ttest' bug recreated 2023-09-26 16:04:45 +02:00
64fcc77fa1 running circles 2023-09-26 16:03:59 +02:00
0e99ecb8d8 yet more work on the docs 2023-09-26 14:42:47 +02:00
4 changed files with 63 additions and 38 deletions

View File

@ -9,20 +9,26 @@ match_t * regex_match(const regex_t * const regex, const char * const string, co
``` ```
Returns a sentinel terminated array of `match_t` objects. Returns a sentinel terminated array of `match_t` objects.
The sentinel object is defined as `(match_t){ .position = -1, .width = -1, };`. The sentinel object is defined as `(match_t){ .position = -1, .width = -1, };`.
The position and width of non-sentinel `match_t`s are guaranteed to be >= 0.
```C
bool is_sentinel(const match_t * const match);
```
This is the function you must use to check whether a `match_t` is a sentinel.
I.e. make this the break condition while looping over the results.
| Symbol | Meaning (TODO: fill in) | | Symbol | Meaning (TODO: fill in) |
| :----: | :---------------------: | | :----: | :---------------------: |
| . | | | . | |
| = | | | ? | One or zero of the previous token |
| + | | | = | Same as ? |
| * | | | * | Any number of the previous token |
| ? | | | + | One or more of the previous token |
| \\< | | | \\< | Start of word |
| \\> | | | \\> | End of word |
| ^ | | | ^ | Start of string |
| \t | | | \t | Tab |
| \n | | | \n | New line |
| \b | | | \b | |
| \i | | | \i | |
| \I | | | \I | |
@ -33,19 +39,19 @@ The position and width of non-sentinel `match_t`s is guaranteed to be => 0.
| \p | | | \p | |
| \P | | | \P | |
| \s | | | \s | |
| \d | | | \d | Digit char |
| \D | | | \D | Not digit char |
| \x | | | \x | Hex char|
| \X | | | \X | Not hex char |
| \o | | | \o | Octal char |
| \O | | | \O | Not octal char |
| \w | | | \w | Word char|
| \W | | | \W | Not word char|
| \h | | | \h | |
| \a | | | \a | Ascii letter |
| \l | | | \l | Lowercase ascii letter |
| \L | | | \L | Not (lowercase ascii letter) |
| \u | | | \u | Uppercase ascii letter |
| \U | | | \U | Not (uppercase ascii letter) |
| [\<range\>] | | | [\<range\>] | Any of \<range\> |
| [\^\<range\>] | | | [\^\<range\>] | None of \<range\> |

View File

@ -43,7 +43,7 @@ EOS ? --> look up fallback table
| Line | SOS | EOS | | Line | SOS | EOS |
| Word | SOW | EOW | | Word | SOW | EOW |
---
##### HALT\_AND\_CATCH\_FIRE ##### HALT\_AND\_CATCH\_FIRE
H&C is a special state signalling that we have hit a dead end. H&C is a special state signalling that we have hit a dead end.
The reason why we need it and can't just instantly quit is backtracking. The reason why we need it and can't just instantly quit is backtracking.
@ -54,8 +54,8 @@ This is a negative range.
``` ```
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'} let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
``` ```
None of the characters in $myNegativeRange must be accepted. None of the characters in `$myNegativeRange` must be accepted.
The way this is compiled is that we first hook all chars in $myNegativeRange to H&C, The way this is compiled is that we first hook all chars in `$myNegativeRange` to H&C,
then define an OFFSHOOT of width 1. then define an OFFSHOOT of width 1.
Put differently: Put differently:
if we read something illegal we abort this branch, if we read something illegal we abort this branch,
@ -79,7 +79,7 @@ It simply ignores the state transition table and rather unconditionally hooks it
#### ^ #### ^
This is the caret operator. This is the caret operator.
It matches the SOS (start of the string). It matches the SOS.
For explanation purposes multilining (match '\n') is irrelevant. For explanation purposes multilining (match '\n') is irrelevant.
That behaves just like a literal. That behaves just like a literal.
@ -111,7 +111,7 @@ SOW must match:
[^\h]myword [^\h]myword
``` ```
Not only that, this combination is key, Not only that, this combination is key,
either it has to be the start of the string either it has to be the SOS
or there has to be at least something which is not a symbol char. or there has to be at least something which is not a symbol char.
Without the last condition, "eexample" would match "\\\<example\\\>" Without the last condition, "eexample" would match "\\\<example\\\>"
as the iteration of `regex_match()` reaches "example". as the iteration of `regex_match()` reaches "example".

View File

@ -131,7 +131,11 @@ typedef struct {
// ---------------------------------- // ----------------------------------
// ### Regex creation/destruction ### // ### Regex creation/destruction ###
// ---------------------------------- // ----------------------------------
static const int HALT_AND_CATCH_FIRE = INT_MIN; enum {
ASSERTION_FAILURE = 0,
ASSERTION_SUCCESS = 1,
HALT_AND_CATCH_FIRE = INT_MIN,
};
#define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a)) #define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a))
@ -707,12 +711,12 @@ const offshoot_t * catch_table_lookup(const regex_t * const regex,
} }
static static
bool regex_assert(const regex_t * const regex, int regex_assert(const regex_t * const regex,
const char * const string, const char * const string,
int state, int state,
match_t * const match) { match_t * const match) {
if (state == HALT_AND_CATCH_FIRE) { if (state == HALT_AND_CATCH_FIRE) {
return false; return HALT_AND_CATCH_FIRE;
} }
bool last_stand = false; bool last_stand = false;
@ -753,15 +757,20 @@ bool regex_assert(const regex_t * const regex,
do_reset = true; do_reset = true;
} }
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match); const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
if(r){ if(r == ASSERTION_SUCCESS){
match->width += delta->match_width; match->width += delta->match_width;
return r; return r;
} else if (do_reset) { } else {
if (r == ASSERTION_FAILURE) {
was_found = false;
}
if (do_reset) {
match->_pos_ptr = NULL; match->_pos_ptr = NULL;
} }
} }
} }
} }
}
PERFORM_CATCH_LOOKUP: { PERFORM_CATCH_LOOKUP: {
if (!was_found) { if (!was_found) {
@ -775,7 +784,7 @@ bool regex_assert(const regex_t * const regex,
} }
} }
return (state == regex->accepting_state); return ((state == regex->accepting_state) ? ASSERTION_SUCCESS : ASSERTION_FAILURE);
} }
match_t * regex_match(const regex_t * const regex, match_t * regex_match(const regex_t * const regex,
@ -806,7 +815,8 @@ match_t * regex_match(const regex_t * const regex,
.width = 0, .width = 0,
}; };
if (regex_assert(regex, s, initial_state, match)) { if (regex_assert(regex, s, initial_state, match) == 1) {
//printf("true: %s\n", s);
if (match->_pos_ptr) { if (match->_pos_ptr) {
match->position = (match->_pos_ptr - string); match->position = (match->_pos_ptr - string);
} else { } else {
@ -818,6 +828,7 @@ match_t * regex_match(const regex_t * const regex,
s += ((match->width > 0) ? match->width : 1); s += ((match->width > 0) ? match->width : 1);
match = (match_t *)malloc(sizeof(match_t)); match = (match_t *)malloc(sizeof(match_t));
} else { } else {
//printf("false: %s\n", s);
++s; ++s;
} }
} while (*s != '\0'); } while (*s != '\0');

View File

@ -101,6 +101,14 @@ signed main() {
puts(""); puts("");
TEST( R"del(\<test)del", " test ", true);
TEST( R"del(test\>)del", " test ", true);
TEST( R"del(\<test)del", " ttest ", false);
TEST( R"del(test\>)del", "testa ", false);
TEST(R"del(\<test\>)del", " test ", true);
puts("");
TEST( R"del(\<int\>)del", "printf", false); TEST( R"del(\<int\>)del", "printf", false);
TEST(R"del(.\<print\>.)del", " print ", true); TEST(R"del(.\<print\>.)del", " print ", true);
TEST(R"del(.\<print\>.)del", "fprint", false); TEST(R"del(.\<print\>.)del", "fprint", false);