Compare commits
5 Commits
c7af36dbb0
...
e608101951
Author | SHA1 | Date | |
---|---|---|---|
e608101951 | |||
e72bc6ffe8 | |||
e4b9643993 | |||
3ac2b8c18b | |||
9e8eb4f2e2 |
2
.gdbinit
2
.gdbinit
@ -1,2 +1,2 @@
|
||||
source debug/regex.pretty_print.py
|
||||
source debug/stage1.gdb
|
||||
#source debug/stage1.gdb
|
||||
|
2
Makefile
2
Makefile
@ -2,7 +2,7 @@ CXXFLAGS := -fuse-ld=mold -ggdb -Wall -Wextra -Wpedantic
|
||||
OUT := regtest
|
||||
|
||||
main:
|
||||
g++ ${CXXFLAGS} source/main.cpp source/vector.c source/regex.c -o ${OUT}
|
||||
${CXX} ${CXXFLAGS} ${CPPFLAGS} source/main.cpp source/vector.c source/jeger.c -o ${OUT}
|
||||
|
||||
run:
|
||||
${OUT}
|
||||
|
1
debug/match_stage1.gdb
Normal file
1
debug/match_stage1.gdb
Normal file
@ -0,0 +1 @@
|
||||
b TEST2
|
@ -1,14 +1,6 @@
|
||||
/* XXX:
|
||||
* as it turns out returning a range of match objects is a
|
||||
* high profile performance issue regarding regex, especially when highlighting.
|
||||
* now as it stands we search an array of tokens for every position on a string.
|
||||
* which sounds ok, until one realizes that searching from any position reveals a range,
|
||||
* where (future) matches can or cannot be found. meaning we are computing the same thing
|
||||
* repeatedly, practically resulting in a bruteforcing situation where instead of eliminating
|
||||
* certain non-matches, we blindly hammer character by character.
|
||||
*/
|
||||
#pragma GCC diagnostic ignored "-Wc++20-extensions"
|
||||
|
||||
#include "regex.h"
|
||||
#include "jeger.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
@ -89,14 +81,14 @@ typedef struct {
|
||||
int in;
|
||||
char input;
|
||||
int to;
|
||||
int width;
|
||||
int pattern_width;
|
||||
int match_width;
|
||||
} delta_t;
|
||||
|
||||
typedef struct {
|
||||
int in;
|
||||
int to;
|
||||
int width;
|
||||
int pattern_width;
|
||||
int match_width;
|
||||
} offshoot_t;
|
||||
|
||||
@ -133,26 +125,31 @@ void HOOK_ALL(const int from,
|
||||
regex_t * regex) {
|
||||
for (const char * s = str; *s != '\0'; s++) {
|
||||
delta_t * delta = (delta_t *)malloc(sizeof(delta_t));
|
||||
delta->in = cs->state + from;
|
||||
delta->input = *s;
|
||||
delta->to = ASSERT_HALT(to);
|
||||
delta->width = cs->width;
|
||||
*delta = (delta_t){
|
||||
.in = cs->state + from,
|
||||
.input = *s,
|
||||
.to = ASSERT_HALT(to),
|
||||
.pattern_width = cs->width,
|
||||
.match_width = 1,
|
||||
};
|
||||
vector_push(&regex->delta_table,
|
||||
&delta);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void ABSOLUTE_OFFSHOOT(const int from,
|
||||
const int to,
|
||||
const int width,
|
||||
const int match_width,
|
||||
regex_t * regex) {
|
||||
void ABSOLUTE_OFFSHOOT(const int from,
|
||||
const int to,
|
||||
const int width,
|
||||
const int match_width,
|
||||
regex_t * regex) {
|
||||
offshoot_t * offshoot = (offshoot_t *)malloc(sizeof(offshoot_t));
|
||||
offshoot->in = from;
|
||||
offshoot->to = to;
|
||||
offshoot->width = width;
|
||||
offshoot->match_width = match_width;
|
||||
*offshoot = (offshoot_t){
|
||||
.in = from,
|
||||
.to = to,
|
||||
.pattern_width = width,
|
||||
.match_width = match_width,
|
||||
};
|
||||
vector_push(&regex->catch_table,
|
||||
&offshoot);
|
||||
}
|
||||
@ -361,7 +358,7 @@ int escape_to_negative(const char c,
|
||||
}
|
||||
|
||||
static inline
|
||||
int compile_dot(compiler_state * cs) {
|
||||
int compile_dot(compiler_state * const cs) {
|
||||
cs->flags |= DO_CATCH;
|
||||
return true;
|
||||
}
|
||||
@ -371,9 +368,9 @@ int compile_escape(const char c,
|
||||
compiler_state * const cs) {
|
||||
|
||||
return escape_1_to_1(c, cs)
|
||||
|| escape_1_to_N(c, cs)
|
||||
|| escape_to_negative(c, cs)
|
||||
;
|
||||
|| escape_1_to_N(c, cs)
|
||||
|| escape_to_negative(c, cs)
|
||||
;
|
||||
}
|
||||
|
||||
static
|
||||
@ -441,7 +438,6 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
compiler_state cs = {
|
||||
.flags = IS_AT_THE_BEGINNING,
|
||||
.state = JEGER_INIT_STATE,
|
||||
.width = 0,
|
||||
.whitelist = whitelist,
|
||||
.blacklist = blacklist,
|
||||
};
|
||||
@ -451,7 +447,7 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
// Reset the compiler
|
||||
whitelist[0] = '\0';
|
||||
blacklist[0] = '\0';
|
||||
cs.flags &= IS_AT_THE_BEGINNING;
|
||||
cs.flags &= (IS_AT_THE_BEGINNING | FORCE_START_OF_STRING);
|
||||
cs.width = 1;
|
||||
|
||||
// Translate char
|
||||
@ -566,7 +562,7 @@ regex_t * regex_compile(const char * const pattern) {
|
||||
++cs.state;
|
||||
}
|
||||
|
||||
cs.flags &= !(IS_AT_THE_BEGINNING);
|
||||
cs.flags &= (~IS_AT_THE_BEGINNING);
|
||||
}
|
||||
|
||||
// Init state hookups
|
||||
@ -648,10 +644,9 @@ bool regex_assert(const regex_t * const regex,
|
||||
if ((delta->in == state)
|
||||
&& (delta->input == *s)) {
|
||||
was_found = true;
|
||||
const int r = regex_assert(regex, s + delta->width, delta->to, match);
|
||||
const int r = regex_assert(regex, s + delta->pattern_width, delta->to, match);
|
||||
if(r){
|
||||
if ((match->position != -1)
|
||||
&& (delta->match_width)) {
|
||||
if (match->position == -1) {
|
||||
match->position = (s - string);
|
||||
}
|
||||
match->width += delta->match_width;
|
||||
@ -664,9 +659,9 @@ bool regex_assert(const regex_t * const regex,
|
||||
PERFORM_CATCH_LOOKUP: {
|
||||
if (!was_found) {
|
||||
const offshoot_t * const my_catch = catch_table_lookup(regex, &state);
|
||||
if (my_catch && (!my_catch->width || !last_stand)) {
|
||||
if (my_catch && (!my_catch->pattern_width || !last_stand)) {
|
||||
state = my_catch->to;
|
||||
s += my_catch->width;
|
||||
s += my_catch->pattern_width;
|
||||
match->width += my_catch->match_width;
|
||||
goto LOOP;
|
||||
}
|
||||
@ -679,30 +674,68 @@ bool regex_assert(const regex_t * const regex,
|
||||
match_t * regex_match(const regex_t * const regex,
|
||||
const char * const string,
|
||||
const bool is_start_of_string) {
|
||||
if (regex == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
vector_t matches;
|
||||
vector_init(&matches, sizeof(match_t), 0);
|
||||
|
||||
match_t * match = (match_t *)malloc(sizeof(match_t));
|
||||
|
||||
if (string == NULL) {
|
||||
match->position = -1;
|
||||
match->width = 0;
|
||||
return match;
|
||||
/* Non-existent regex does not match anything.
|
||||
* Not to be confused with an empty regex.
|
||||
*/
|
||||
if (regex == NULL) {
|
||||
goto FINISH;
|
||||
}
|
||||
|
||||
const int initial_state = (int)(!is_start_of_string);
|
||||
// Find all matches
|
||||
{
|
||||
const char * s = string;
|
||||
do {
|
||||
int initial_state;
|
||||
initial_state = (int)(!(is_start_of_string && (s == string)));
|
||||
|
||||
// XXX: this should be called in a loop, always restarting from the last char of the last match
|
||||
if (regex_assert(regex, string, initial_state, match)) {
|
||||
return match;
|
||||
} else {
|
||||
return NULL;
|
||||
*match = (match_t){
|
||||
.position = -1,
|
||||
.width = 0,
|
||||
};
|
||||
|
||||
if (regex_assert(regex, s, initial_state, match)) {
|
||||
match->position = (s - string);
|
||||
|
||||
vector_push(&matches, match);
|
||||
|
||||
s += ((match->width > 0) ? match->width : 1);
|
||||
match = (match_t *)malloc(sizeof(match_t));
|
||||
} else {
|
||||
++s;
|
||||
}
|
||||
} while (*s != '\0');
|
||||
}
|
||||
|
||||
FINISH:
|
||||
|
||||
// Insert sentinel
|
||||
*match = (match_t){
|
||||
.position = -1,
|
||||
.width = -1,
|
||||
};
|
||||
vector_push(&matches, match);
|
||||
|
||||
// Hide internal vector usage
|
||||
const size_t data_size = matches.element_size * matches.element_count;
|
||||
match_t * r = (match_t *)malloc(data_size);
|
||||
memcpy(r, matches.data, data_size);
|
||||
vector_free(&matches);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
bool regex_search(const regex_t * const regex,
|
||||
const char * const string) {
|
||||
|
||||
return (bool)regex_match(regex, string, true);
|
||||
match_t * m = regex_match(regex, string, true);
|
||||
const bool r = (m->position != -1);
|
||||
free(m);
|
||||
|
||||
return r;
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
#ifndef REGEX_H
|
||||
#define REGEX_H
|
||||
#ifndef JEGER_H
|
||||
#define JEGER_H
|
||||
|
||||
#include <stdbool.h>
|
||||
|
134
source/main.cpp
134
source/main.cpp
@ -1,98 +1,8 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "regex.h"
|
||||
|
||||
static int test_counter = 0;
|
||||
static int passed_tests = 0;
|
||||
static int positives = 0;
|
||||
static int positive_successes = 0;
|
||||
static int negatives = 0;
|
||||
static int negative_successes = 0;
|
||||
|
||||
static int test_counter2 = 0;
|
||||
static int passed_tests2 = 0;
|
||||
|
||||
static
|
||||
void asprint_match_t( char * * destination,
|
||||
const match_t * const match){
|
||||
if (match) {
|
||||
asprintf(destination, "%p {%d, %d}", (void *)match, match->position, match->width);
|
||||
} else {
|
||||
asprintf(destination, "0x0 {N/A, N/A}");
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void print_leader(const bool passed){
|
||||
if (passed) {
|
||||
printf("\033[32;1mSuccess\033[0;1m. - \033[0m");
|
||||
} else {
|
||||
printf("\033[31;1mFailiour\033[0;1m. - \033[0m");
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void TEST(const char * const what,
|
||||
const char * const on,
|
||||
const bool expect){
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
bool result = regex_search(r, on);
|
||||
bool passed = (result == expect);
|
||||
|
||||
passed && expect ? ++positive_successes : ++negative_successes;
|
||||
|
||||
print_leader(passed);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf(&quoted_what, "'%s'", what);
|
||||
asprintf(&quoted_on, "'%s'", on);
|
||||
printf("%14s\033[1m vs \033[0m%14s\033[1m:\033[0m Result = %d, Expected = %d\n", quoted_what, quoted_on, result, expect);
|
||||
free(quoted_what);
|
||||
free(quoted_on);
|
||||
if (passed) {
|
||||
++passed_tests;
|
||||
}
|
||||
|
||||
++test_counter;
|
||||
}
|
||||
|
||||
static
|
||||
void TEST2(const char * const what,
|
||||
const char * const on,
|
||||
const match_t expect){
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
match_t * result = regex_match(r, on, true);
|
||||
bool passed = (
|
||||
( result
|
||||
&& result->position == expect.position
|
||||
&& result->width == expect.width
|
||||
)
|
||||
||
|
||||
expect.position == -1
|
||||
);
|
||||
|
||||
print_leader(passed);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf(&quoted_what, "'%s'", what);
|
||||
asprintf(&quoted_on, "'%s'", on);
|
||||
char * result_string, * expect_string;
|
||||
asprint_match_t(&result_string, result);
|
||||
asprint_match_t(&expect_string, &expect);
|
||||
printf("%14s\033[1m vs \033[0m%14s\033[1m:\033[0m\n\t%s\n\t%s\n", quoted_what, quoted_on, result_string, expect_string);
|
||||
free(quoted_what);
|
||||
free(quoted_on);
|
||||
free(result_string);
|
||||
free(expect_string);
|
||||
if (passed) {
|
||||
++passed_tests2;
|
||||
}
|
||||
|
||||
++test_counter2;
|
||||
}
|
||||
#include "test.hpp"
|
||||
|
||||
signed main() {
|
||||
TEST( R"del(abc)del", "abc", true);
|
||||
@ -189,20 +99,48 @@ signed main() {
|
||||
TEST( R"del(test\>)del", "testa", false);
|
||||
TEST(R"del(\<test\>)del", "test", true);
|
||||
|
||||
if(test_counter == passed_tests) {
|
||||
if (test_counter == passed_tests) {
|
||||
fputs("\033[32m", stdout);
|
||||
} else {
|
||||
fputs("\033[31m", stdout);
|
||||
}
|
||||
printf("\nPassed %d out of %d tests.\033[0m\n", passed_tests, test_counter);
|
||||
printf("\tPositives: %d/%d\n", positive_successes, positives);
|
||||
printf("\tNegatives: %d/%d\n", negative_successes, negatives);
|
||||
|
||||
puts("");
|
||||
|
||||
puts("");
|
||||
|
||||
TEST2( R"del(abc)del", "abc", match_t{ 0, 3});
|
||||
TEST2(R"del(efg1)del", "efg1", match_t{ 0, 4});
|
||||
TEST2( R"del(nig)del", "ger", match_t{-1, 0});
|
||||
TEST2( R"del(ss)del", "sss", match_t{ 0, 2});
|
||||
TEST2( R"del(sss)del", "ss", match_t{-1, 0});
|
||||
TEST2( R"del(abc)del", "abc", match_t{ 0, strlen("abc")});
|
||||
TEST2(R"del(efg1)del", "efg1", match_t{ 0, strlen("efg1")});
|
||||
TEST2( R"del(nig)del", "ger", match_t{-1, -1});
|
||||
TEST2( R"del(ss)del", "sss", match_t{ 0, 2});
|
||||
TEST2( R"del(sss)del", "ss", match_t{-1, -1});
|
||||
|
||||
puts("");
|
||||
puts("");
|
||||
|
||||
TEST2( R"del(ab+c)del", "abc", match_t{ 0, strlen("abc")});
|
||||
TEST2(R"del(ef+g1)del", "effffg1", match_t{ 0, strlen("effffg1")});
|
||||
TEST2(R"del(efg1+)del", "efg", match_t{-1, -1});
|
||||
TEST2(R"del(efg1+)del", "efg1", match_t{ 0, strlen("efg1")});
|
||||
TEST2(R"del(efg1+)del", "efg11", match_t{ 0, strlen("efg11")});
|
||||
|
||||
puts("");
|
||||
puts("");
|
||||
|
||||
TEST2( R"del(a+a)del", " aaa", match_t{ 1, strlen("aaa")});
|
||||
TEST2( R"del(a+a)del", " aa", match_t{ 1, strlen("aa")});
|
||||
TEST2( R"del(a+a)del", " a", match_t{-1, -1});
|
||||
TEST2( R"del(a+a)del", " aaa", match_t{ 3, strlen("aaa")});
|
||||
TEST2(R"del(a+\+)del", "aaa+", match_t{ 0, strlen("aaa+")});
|
||||
|
||||
if(test_counter2 == passed_tests2) {
|
||||
fputs("\033[32m", stdout);
|
||||
} else {
|
||||
fputs("\033[31m", stdout);
|
||||
}
|
||||
printf("\nPassed %d out of %d tests.\033[0m\n", passed_tests2, test_counter2);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
103
source/test.hpp
Normal file
103
source/test.hpp
Normal file
@ -0,0 +1,103 @@
|
||||
#include "jeger.h"
|
||||
|
||||
static int test_counter = 0;
|
||||
static int passed_tests = 0;
|
||||
static int positives = 0;
|
||||
static int positive_successes = 0;
|
||||
static int negatives = 0;
|
||||
static int negative_successes = 0;
|
||||
|
||||
static int test_counter2 = 0;
|
||||
static int passed_tests2 = 0;
|
||||
|
||||
static
|
||||
void asprint_match_t( char * * destination,
|
||||
const match_t * const match){
|
||||
if (match) {
|
||||
asprintf(destination, "%p {%d, %d}", (void *)match, match->position, match->width);
|
||||
} else {
|
||||
asprintf(destination, "0x000000000000 {N/A, N/A}");
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void print_leader(const bool passed){
|
||||
if (passed) {
|
||||
printf("\033[32;1mSuccess\033[0;1m. - \033[0m");
|
||||
} else {
|
||||
printf("\033[31;1mFailiour\033[0;1m. - \033[0m");
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void do_flush(void) {
|
||||
if(!(test_counter % 5)) {
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void TEST(const char * const what,
|
||||
const char * const on,
|
||||
const bool expect){
|
||||
|
||||
do_flush();
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
bool result = regex_search(r, on);
|
||||
regex_free(r);
|
||||
|
||||
bool passed = (result == expect);
|
||||
|
||||
expect ? ++positives : ++negatives;
|
||||
|
||||
print_leader(passed);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf(&quoted_what, "'%s'", what);
|
||||
asprintf(&quoted_on, "'%s'", on);
|
||||
printf("%14s\033[1m vs \033[0m%14s\033[1m:\033[0m Result = %d, Expected = %d\n", quoted_what, quoted_on, result, expect);
|
||||
free(quoted_what);
|
||||
free(quoted_on);
|
||||
if (passed) {
|
||||
++passed_tests;
|
||||
expect ? ++positive_successes : ++negative_successes;
|
||||
}
|
||||
|
||||
++test_counter;
|
||||
}
|
||||
|
||||
static
|
||||
void TEST2(const char * const what,
|
||||
const char * const on,
|
||||
const match_t expect){
|
||||
|
||||
do_flush();
|
||||
|
||||
regex_t * r = regex_compile(what);
|
||||
match_t * result = regex_match(r, on, true);
|
||||
bool passed = (result->position == expect.position
|
||||
&& result->width == expect.width
|
||||
);
|
||||
|
||||
print_leader(passed);
|
||||
|
||||
char * quoted_what, * quoted_on;
|
||||
asprintf(&quoted_what, "'%s'", what);
|
||||
asprintf(&quoted_on, "'%s'", on);
|
||||
char * result_string, * expect_string;
|
||||
asprint_match_t(&result_string, result);
|
||||
asprint_match_t(&expect_string, &expect);
|
||||
printf("%s\033[1m vs \033[0m%s\033[1m:\033[0m\n\tResult = %s\n\tExpected = %s\n", quoted_what, quoted_on, result_string, expect_string);
|
||||
free(quoted_what);
|
||||
free(quoted_on);
|
||||
free(result_string);
|
||||
free(expect_string);
|
||||
free(result);
|
||||
|
||||
if (passed) {
|
||||
++passed_tests2;
|
||||
}
|
||||
|
||||
++test_counter2;
|
||||
}
|
Loading…
Reference in New Issue
Block a user