Compare commits
4 Commits
a1ac8cb319
...
e4106e1a69
Author | SHA1 | Date | |
---|---|---|---|
e4106e1a69 | |||
99127233db | |||
64fcc77fa1 | |||
0e99ecb8d8 |
56
README.md
56
README.md
@ -9,20 +9,26 @@ match_t * regex_match(const regex_t * const regex, const char * const string, co
|
|||||||
```
|
```
|
||||||
Returns a sentinel terminated array of `match_t` objects.
|
Returns a sentinel terminated array of `match_t` objects.
|
||||||
The sentinel object is defined as `(match_t){ .position = -1, .width = -1, };`.
|
The sentinel object is defined as `(match_t){ .position = -1, .width = -1, };`.
|
||||||
The position and width of non-sentinel `match_t`s are guaranteed to be >= 0.
|
|
||||||
|
```C
|
||||||
|
bool is_sentinel(const match_t * const match);
|
||||||
|
```
|
||||||
|
This is the function you must use to check whether a `match_t` is a sentinel or not.
|
||||||
|
I.e. make this the break condition while looping over the results.
|
||||||
|
|
||||||
|
|
||||||
| Symbol | Meaning (TODO: fill in) |
|
| Symbol | Meaning (TODO: fill in) |
|
||||||
| :----: | :---------------------: |
|
| :----: | :---------------------: |
|
||||||
| . | |
|
| . | |
|
||||||
| = | |
|
| ? | One or zero of the previous token |
|
||||||
| + | |
|
| = | Same as ? |
|
||||||
| * | |
|
| * | Any number of the previous token |
|
||||||
| ? | |
|
| + | One or more of the previous token |
|
||||||
| \\< | |
|
| \\< | Start of word |
|
||||||
| \\> | |
|
| \\> | End of word |
|
||||||
| ^ | |
|
| ^ | Start of string |
|
||||||
| \t | |
|
| \t | Tab |
|
||||||
| \n | |
|
| \n | New line |
|
||||||
| \b | |
|
| \b | |
|
||||||
| \i | |
|
| \i | |
|
||||||
| \I | |
|
| \I | |
|
||||||
@ -33,19 +39,19 @@ The position and width of non-sentinel `match_t`s is guaranteed to be => 0.
|
|||||||
| \p | |
|
| \p | |
|
||||||
| \P | |
|
| \P | |
|
||||||
| \s | |
|
| \s | |
|
||||||
| \d | |
|
| \d | Digit char |
|
||||||
| \D | |
|
| \D | Not digit char |
|
||||||
| \x | |
|
| \x | Hex char|
|
||||||
| \X | |
|
| \X | Not hex char |
|
||||||
| \o | |
|
| \o | Octal char |
|
||||||
| \O | |
|
| \O | Not octal char |
|
||||||
| \w | |
|
| \w | Word char|
|
||||||
| \W | |
|
| \W | Not word char|
|
||||||
| \h | |
|
| \h | |
|
||||||
| \a | |
|
| \a | Ascii letter |
|
||||||
| \l | |
|
| \l | Lowercase ascii letter |
|
||||||
| \L | |
|
| \L | Not (lowercase ascii letter) |
|
||||||
| \u | |
|
| \u | Uppercase ascii letter |
|
||||||
| \U | |
|
| \U | Not (uppercase ascii letter) |
|
||||||
| [\<range\>] | |
|
| [\<range\>] | Any of \<range\> |
|
||||||
| [\^\<range\>] | |
|
| [\^\<range\>] | None of \<range\> |
|
||||||
|
@ -43,7 +43,7 @@ EOS ? --> look up fallback table
|
|||||||
| Line | SOS | EOS |
|
| Line | SOS | EOS |
|
||||||
| Word | SOW | EOW |
|
| Word | SOW | EOW |
|
||||||
|
|
||||||
---
|
|
||||||
##### HALT\_AND\_CATCH\_FIRE
|
##### HALT\_AND\_CATCH\_FIRE
|
||||||
H&C is a special state signalling that we have hit a dead end.
|
H&C is a special state signalling that we have hit a dead end.
|
||||||
The reason why we need it and can't just instantly quit is backtracking.
|
The reason why we need it and can't just instantly quit is backtracking.
|
||||||
@ -54,8 +54,8 @@ This is a negative range.
|
|||||||
```
|
```
|
||||||
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
|
let myNegativeRange = {'e', 'x', 'a', 'm', 'p', 'l'}
|
||||||
```
|
```
|
||||||
None of the characters in $myNegativeRange must be accepted.
|
None of the characters in `$myNegativeRange` must be accepted.
|
||||||
The way this is compiled is that we first hook all chars in $myNegativeRange to H&C,
|
The way this is compiled is that we first hook all chars in `$myNegativeRange` to H&C,
|
||||||
then define an OFFSHOOT of width 1.
|
then define an OFFSHOOT of width 1.
|
||||||
Put differently:
|
Put differently:
|
||||||
if we read something illegal we abort this branch,
|
if we read something illegal we abort this branch,
|
||||||
@ -79,7 +79,7 @@ It simply ignores the state transition table and rather unconditionally hooks it
|
|||||||
|
|
||||||
#### ^
|
#### ^
|
||||||
This is the caret operator.
|
This is the caret operator.
|
||||||
It matches the SOS (start of the string).
|
It matches the SOS.
|
||||||
|
|
||||||
For explanation purposes multilining (match '\n') is irrelevant.
|
For explanation purposes multilining (match '\n') is irrelevant.
|
||||||
That behaves just like a literal.
|
That behaves just like a literal.
|
||||||
@ -111,7 +111,7 @@ SOW must match:
|
|||||||
[^\h]myword
|
[^\h]myword
|
||||||
```
|
```
|
||||||
Not only that, this combination is key,
|
Not only that, this combination is key,
|
||||||
either it has to be the start of the string
|
either it has to be the SOS
|
||||||
or there has to be at least something which is not a symbol char.
|
or there has to be at least something which is not a symbol char.
|
||||||
Without the last condition "eexample" would match "\\\<example\\\>"
|
Without the last condition "eexample" would match "\\\<example\\\>"
|
||||||
as the iteration of `regex_match()` reaches "example".
|
as the iteration of `regex_match()` reaches "example".
|
||||||
|
@ -131,7 +131,11 @@ typedef struct {
|
|||||||
// ----------------------------------
|
// ----------------------------------
|
||||||
// ### Regex creation/destruction ###
|
// ### Regex creation/destruction ###
|
||||||
// ----------------------------------
|
// ----------------------------------
|
||||||
static const int HALT_AND_CATCH_FIRE = INT_MIN;
|
enum {
|
||||||
|
ASSERTION_FAILURE = 0,
|
||||||
|
ASSERTION_SUCCESS = 1,
|
||||||
|
HALT_AND_CATCH_FIRE = INT_MIN,
|
||||||
|
};
|
||||||
|
|
||||||
#define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a))
|
#define ASSERT_HALT(a) ((a == HALT_AND_CATCH_FIRE) ? HALT_AND_CATCH_FIRE : (cs->state + a))
|
||||||
|
|
||||||
@ -707,12 +711,12 @@ const offshoot_t * catch_table_lookup(const regex_t * const regex,
|
|||||||
}
|
}
|
||||||
|
|
||||||
static
|
static
|
||||||
bool regex_assert(const regex_t * const regex,
|
int regex_assert(const regex_t * const regex,
|
||||||
const char * const string,
|
const char * const string,
|
||||||
int state,
|
int state,
|
||||||
match_t * const match) {
|
match_t * const match) {
|
||||||
if (state == HALT_AND_CATCH_FIRE) {
|
if (state == HALT_AND_CATCH_FIRE) {
|
||||||
return false;
|
return HALT_AND_CATCH_FIRE;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool last_stand = false;
|
bool last_stand = false;
|
||||||
@ -753,15 +757,20 @@ bool regex_assert(const regex_t * const regex,
|
|||||||
do_reset = true;
|
do_reset = true;
|
||||||
}
|
}
|
||||||
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
|
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
|
||||||
if(r){
|
if(r == ASSERTION_SUCCESS){
|
||||||
match->width += delta->match_width;
|
match->width += delta->match_width;
|
||||||
return r;
|
return r;
|
||||||
} else if (do_reset) {
|
} else {
|
||||||
|
if (r == ASSERTION_FAILURE) {
|
||||||
|
was_found = false;
|
||||||
|
}
|
||||||
|
if (do_reset) {
|
||||||
match->_pos_ptr = NULL;
|
match->_pos_ptr = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
PERFORM_CATCH_LOOKUP: {
|
PERFORM_CATCH_LOOKUP: {
|
||||||
if (!was_found) {
|
if (!was_found) {
|
||||||
@ -775,7 +784,7 @@ bool regex_assert(const regex_t * const regex,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return (state == regex->accepting_state);
|
return ((state == regex->accepting_state) ? ASSERTION_SUCCESS : ASSERTION_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
match_t * regex_match(const regex_t * const regex,
|
match_t * regex_match(const regex_t * const regex,
|
||||||
@ -806,7 +815,8 @@ match_t * regex_match(const regex_t * const regex,
|
|||||||
.width = 0,
|
.width = 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
if (regex_assert(regex, s, initial_state, match)) {
|
if (regex_assert(regex, s, initial_state, match) == 1) {
|
||||||
|
//printf("true: %s\n", s);
|
||||||
if (match->_pos_ptr) {
|
if (match->_pos_ptr) {
|
||||||
match->position = (match->_pos_ptr - string);
|
match->position = (match->_pos_ptr - string);
|
||||||
} else {
|
} else {
|
||||||
@ -818,6 +828,7 @@ match_t * regex_match(const regex_t * const regex,
|
|||||||
s += ((match->width > 0) ? match->width : 1);
|
s += ((match->width > 0) ? match->width : 1);
|
||||||
match = (match_t *)malloc(sizeof(match_t));
|
match = (match_t *)malloc(sizeof(match_t));
|
||||||
} else {
|
} else {
|
||||||
|
//printf("false: %s\n", s);
|
||||||
++s;
|
++s;
|
||||||
}
|
}
|
||||||
} while (*s != '\0');
|
} while (*s != '\0');
|
||||||
|
@ -101,6 +101,14 @@ signed main() {
|
|||||||
|
|
||||||
puts("");
|
puts("");
|
||||||
|
|
||||||
|
TEST( R"del(\<test)del", " test ", true);
|
||||||
|
TEST( R"del(test\>)del", " test ", true);
|
||||||
|
TEST( R"del(\<test)del", " ttest ", false);
|
||||||
|
TEST( R"del(test\>)del", "testa ", false);
|
||||||
|
TEST(R"del(\<test\>)del", " test ", true);
|
||||||
|
|
||||||
|
puts("");
|
||||||
|
|
||||||
TEST( R"del(\<int\>)del", "printf", false);
|
TEST( R"del(\<int\>)del", "printf", false);
|
||||||
TEST(R"del(.\<print\>.)del", " print ", true);
|
TEST(R"del(.\<print\>.)del", " print ", true);
|
||||||
TEST(R"del(.\<print\>.)del", "fprint", false);
|
TEST(R"del(.\<print\>.)del", "fprint", false);
|
||||||
|
Loading…
Reference in New Issue
Block a user