Compare commits
4 Commits
a1ac8cb319
...
e4106e1a69
Author | SHA1 | Date | |
---|---|---|---|
e4106e1a69 | |||
99127233db | |||
64fcc77fa1 | |||
0e99ecb8d8 |
56
README.md
56
README.md
@ -9,20 +9,26 @@ match_t * regex_match(const regex_t * const regex, const char * const string, co
|
||||
```
|
||||
Returns a sentinel terminated array of `match_t` objects.
|
||||
The sentinel object is defined as `(match_t){ .position = -1, .width = -1, };`.
|
||||
The position and width of non-sentinel `match_t`s are guaranteed to be >= 0.
|
||||
|
||||
```C
|
||||
bool is_sentinel(const match_t * const match);
|
||||
```
|
||||
This is the function you must use to check whether a `match_t` is a sentinel or not.
|
||||
I.e. make this the break condition while looping over the results.
|
||||
|
||||
|
||||
| Symbol | Meaning (TODO: fill in) |
|
||||
| :----: | :---------------------: |
|
||||
| . | |
|
||||
| = | |
|
||||
| + | |
|
||||
| * | |
|
||||
| ? | |
|
||||
| \\< | |
|
||||
| \\> | |
|
||||
| ^ | |
|
||||
| \t | |
|
||||
| \n | |
|
||||
| ? | One or zero of the previous token |
|
||||
| = | Same as ? |
|
||||
| * | Any number of the previous token |
|
||||
| + | One or more of the previous token |
|
||||
| \\< | Start of word |
|
||||
| \\> | End of word |
|
||||
| ^ | Start of string |
|
||||
| \t | Tab |
|
||||
| \n | New line |
|
||||
| \b | |
|
||||
| \i | |
|
||||
| \I | |
|
||||
@ -33,19 +39,19 @@ The position and width of non-sentinel `match_t`s is guaranteed to be => 0.
|
||||
| \p | |
|
||||
| \P | |
|
||||
| \s | |
|
||||
| \d | |
|
||||
| \D | |
|
||||
| \x | |
|
||||
| \X | |
|
||||
| \o | |
|
||||
| \O | |
|
||||
| \w | |
|
||||
| \W | |
|
||||
| \d | Digit char |
|
||||
| \D | Not digit char |
|
||||
| \x | Hex char|
|
||||
| \X | Not hex char |
|
||||
| \o | Octal char |
|
||||
| \O | Not octal char |
|
||||
| \w | Word char|
|
||||
| \W | Not word char|
|
||||
| \h | |
|
||||
| \a | |
|
||||
| \l | |
|
||||
| \L | |
|
||||
| \u | |
|
||||
| \U | |
|
||||
| [\<range\>] | |
|
||||
| [\^\<range\>] | |
|
||||
| \a | Ascii letter |
|
||||
| \l | Lowercase ascii letter |
|
||||
| \L | Not (lowercase ascii letter) |
|
||||
| \u | Uppercase ascii letter |
|
||||
| \U | Not (uppercase ascii letter) |
|
||||
| [\<range\>] | Any of \<range\> |
|
||||
| [\^\<range\>] | None of \<range\> |
|
||||
|
@ -43,7 +43,7 @@ EOS ? --> look up fallback table
|
||||
| Line | SOS | EOS |
|
||||
| Word | SOW | EOW |
|
||||
|
||||
---
|
||||
|
||||
##### HALT\_AND\_CATCH\_FIRE
|
||||
H&C is a special state signalling that we have hit a dead end.
|
||||
The reason why we need it and can't just instantly quit is backtracking.
|
||||
@ -54,8 +54,8 @@ This is a negative range.
|
||||
```
|
||||
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
|
||||
```
|
||||
None of the characters in $myNegativeRange must be accepted.
|
||||
The way this is a compiled is that we first hook all chars in $myNegativeRange to H&C,
|
||||
None of the characters in `$myNegativeRange` must be accepted.
|
||||
The way this is compiled is that we first hook all chars in `$myNegativeRange` to H&C,
|
||||
then define an OFFSHOOT of width 1.
|
||||
Put differently:
|
||||
if we read something illegal we abort this branch,
|
||||
@ -79,7 +79,7 @@ It simply ignores the state transition table and rather unconditionally hooks it
|
||||
|
||||
#### ^
|
||||
This is the caret operator.
|
||||
It matches the SOS (start of the string).
|
||||
It matches the SOS.
|
||||
|
||||
For explanation purposes multilining (match '\n') is irrelevant.
|
||||
That behaves just like a literal.
|
||||
@ -111,7 +111,7 @@ SOW must match:
|
||||
[^\h]myword
|
||||
```
|
||||
Not only that, this combination is key,
|
||||
either it has to be the start of the string
|
||||
either it has to be the SOS
|
||||
or there has to be at least something which is not a symbol char.
|
||||
Without the last condition "eexample" would match "\\\<example\\\>"
|
||||
as the iteration of `regex_match()` reaches "example".
|
||||
|
@ -131,7 +131,11 @@ typedef struct {
|
||||
// ----------------------------------
|
||||
// ### Regex creation/destruction ###
|
||||
// ----------------------------------
|
||||
static const int HALT_AND_CATCH_FIRE = INT_MIN;
|
||||
enum {
|
||||
ASSERTION_FAILURE = 0,
|
||||
ASSERTION_SUCCESS = 1,
|
||||
HALT_AND_CATCH_FIRE = INT_MIN,
|
||||
};
|
||||
|
||||
#define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a))
|
||||
|
||||
@ -707,12 +711,12 @@ const offshoot_t * catch_table_lookup(const regex_t * const regex,
|
||||
}
|
||||
|
||||
static
|
||||
bool regex_assert(const regex_t * const regex,
|
||||
int regex_assert(const regex_t * const regex,
|
||||
const char * const string,
|
||||
int state,
|
||||
match_t * const match) {
|
||||
if (state == HALT_AND_CATCH_FIRE) {
|
||||
return false;
|
||||
return HALT_AND_CATCH_FIRE;
|
||||
}
|
||||
|
||||
bool last_stand = false;
|
||||
@ -753,11 +757,16 @@ bool regex_assert(const regex_t * const regex,
|
||||
do_reset = true;
|
||||
}
|
||||
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
|
||||
if(r){
|
||||
if(r == ASSERTION_SUCCESS){
|
||||
match->width += delta->match_width;
|
||||
return r;
|
||||
} else if (do_reset) {
|
||||
match->_pos_ptr = NULL;
|
||||
} else {
|
||||
if (r == ASSERTION_FAILURE) {
|
||||
was_found = false;
|
||||
}
|
||||
if (do_reset) {
|
||||
match->_pos_ptr = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -775,7 +784,7 @@ bool regex_assert(const regex_t * const regex,
|
||||
}
|
||||
}
|
||||
|
||||
return (state == regex->accepting_state);
|
||||
return ((state == regex->accepting_state) ? ASSERTION_SUCCESS : ASSERTION_FAILURE);
|
||||
}
|
||||
|
||||
match_t * regex_match(const regex_t * const regex,
|
||||
@ -806,7 +815,8 @@ match_t * regex_match(const regex_t * const regex,
|
||||
.width = 0,
|
||||
};
|
||||
|
||||
if (regex_assert(regex, s, initial_state, match)) {
|
||||
if (regex_assert(regex, s, initial_state, match) == 1) {
|
||||
//printf("true: %s\n", s);
|
||||
if (match->_pos_ptr) {
|
||||
match->position = (match->_pos_ptr - string);
|
||||
} else {
|
||||
@ -818,6 +828,7 @@ match_t * regex_match(const regex_t * const regex,
|
||||
s += ((match->width > 0) ? match->width : 1);
|
||||
match = (match_t *)malloc(sizeof(match_t));
|
||||
} else {
|
||||
//printf("false: %s\n", s);
|
||||
++s;
|
||||
}
|
||||
} while (*s != '\0');
|
||||
|
@ -101,6 +101,14 @@ signed main() {
|
||||
|
||||
puts("");
|
||||
|
||||
TEST( R"del(\<test)del", " test ", true);
|
||||
TEST( R"del(test\>)del", " test ", true);
|
||||
TEST( R"del(\<test)del", " ttest ", false);
|
||||
TEST( R"del(test\>)del", "testa ", false);
|
||||
TEST(R"del(\<test\>)del", " test ", true);
|
||||
|
||||
puts("");
|
||||
|
||||
TEST( R"del(\<int\>)del", "printf", false);
|
||||
TEST(R"del(.\<print\>.)del", " print ", true);
|
||||
TEST(R"del(.\<print\>.)del", "fprint", false);
|
||||
|
Loading…
Reference in New Issue
Block a user