/// xolatilization/xyntax.h

///                   _
/// __  ___   _ _ __ | |_ __ ___  __
/// \ \/ / | | | '_ \| __/ _` \ \/ /
///  >  <| |_| | | | | || (_| |>  <
/// /_/\_\\__, |_| |_|\__\__,_/_/\_\
///       |___/
///
/// Copyright (c) 1997 - Ognjen 'xolatile' Milan Robovic
///
/// xolatile@chud.cyou - xyntax - Tiny, unsafe and somewhat insane unity header for generic syntax definition.
///
/// This program is free software, free as in freedom and as in free beer, you can redistribute it and/or modify it under the terms of the GNU
/// General Public License as published by the Free Software Foundation, either version 3 of the License, or any later version if you wish...
///
/// This program is distributed in the hope that it will be useful, but it is probably not, and without any warranty, without even the implied
/// warranty of merchantability or fitness for a particular purpose, because it is pointless. Please see the GNU (Geenoo) General Public License
/// for more details, if you dare, it is a lot of text that nobody wants to read...

/// Description
///
/// Xyntax, the most minimal text parser that can deal with syntax highlighting that I could come up with, and the most generic name for it
/// too. It's rather slow, but if you're creating a heavy duty program, you'd use a heavy duty library for it. This library has only one
/// header, so it's easy to incorporate it into existing projects. If you want to see how it's used, check out the simple examples below, if
/// you want a more robust example, check out my other programs, Xarbon and Xighlight.
///
/// To start, include this header file, there's no macro for including the implementation (like in stb libraries), since this is meant for
/// projects that have only one C source file, and one or more C header files. Then declare a global or local variable of type
/// 'syntax_structure *' (the structure is defined below), initialize it, define the rules, then select rules inside your main loop, do
/// whatever you wanted with them, and finally deinitialize the structure. It's simple, I'll provide minimal examples below.

/// Single syntax definition, stored as parallel arrays, where the same index across all arrays describes one rule. If you want to parse
/// multiple languages simultaneously, use an array of these structures.
///
/// syntax_structure * syntax = null;

typedef struct {
	/// Number of rules defined so far, maximum is set with 'limit' below, if 'limit' is 0, rule arrays are allocated dynamically.
	natural count;
	/// Preallocation limit for rules, hardcode it if you don't want rule arrays to be allocated dynamically.
	natural limit;
	/// Per rule, true if matching begins on any single character from 'begin' string, false if whole 'begin' string must match.
	boolean * enrange;
	/// Per rule, true if matching ends on any single character from 'end' string, false if whole 'end' string must match.
	boolean * derange;
	/// Per rule, set of characters or full string that starts the match, interpreted according to 'enrange' above.
	character * * begin;
	/// Per rule, set of characters or full string that ends the match, interpreted according to 'derange' above.
	character * * end;
	/// Per rule, escape character, it skips one cycle in selection loop, then matching for 'end' string continues.
	character * escape;
	/// Per rule, colour for matched array of characters, can be anything, enumerated, literal, hardcoded...
	natural * colour;
	/// Per rule, effect for matched array of characters, can be anything, enumerated, literal, hardcoded...
	natural * effect;
} syntax_structure;

/// Create syntax structure, call this before any other function that takes it as an argument. Pass 0 as 'limit' if you want dynamic array of
/// rules, or pass the maximum number of rules in order to preallocate all rule arrays up front.
///
/// syntax = syntax_initialize (0);

static syntax_structure * syntax_initialize (natural limit) {
	/// NOTE(review): 'count' and the rule array pointers aren't assigned in this function, this presumably relies on 'allocate' returning
	/// zeroed memory, confirm against its definition.
	syntax_structure * result = allocate (sizeof (* result));

	result->limit = limit;

	/// Dynamic mode, there's nothing to preallocate.
	if (limit == 0) {
		return (result);
	}

	/// Fixed mode, every rule array gets room for exactly 'limit' rules.
	result->enrange = allocate (limit * sizeof (* result->enrange));
	result->derange = allocate (limit * sizeof (* result->derange));
	result->begin   = allocate (limit * sizeof (* result->begin));
	result->end     = allocate (limit * sizeof (* result->end));
	result->escape  = allocate (limit * sizeof (* result->escape));
	result->colour  = allocate (limit * sizeof (* result->colour));
	result->effect  = allocate (limit * sizeof (* result->effect));

	return (result);
}

/// Release everything owned by syntax structure after you're done with it, in order to avoid memory leaks. Return value is whatever the
/// 'deallocate' call on the structure itself returned, assign it back to your variable as shown below.
///
/// syntax = syntax_deinitialize (syntax);

static syntax_structure * syntax_deinitialize (syntax_structure * syntax) {
	natural rule = 0;

	/// Strings of every defined rule are separate allocations, release them before the arrays that hold them.
	while (rule < syntax->count) {
		syntax->begin [rule] = deallocate (syntax->begin [rule]);
		syntax->end   [rule] = deallocate (syntax->end   [rule]);

		++rule;
	}

	/// Now release rule arrays themselves.
	syntax->enrange = deallocate (syntax->enrange);
	syntax->derange = deallocate (syntax->derange);
	syntax->begin   = deallocate (syntax->begin);
	syntax->end     = deallocate (syntax->end);
	syntax->escape  = deallocate (syntax->escape);
	syntax->colour  = deallocate (syntax->colour);
	syntax->effect  = deallocate (syntax->effect);

	/// And lastly the structure.
	syntax = deallocate (syntax);

	return (syntax);
}

/// Define single syntax rule, which will be appended to the rule arrays of syntax structure, return value is index of that rule.
///
/// Both 'begin' and 'end' strings are copied into the structure, so you keep ownership of the strings you've passed. If the structure was
/// initialized with non-zero limit, exactly that many rules can be defined, defining one more is fatal failure, while limit of 0 means that
/// rule arrays grow by one rule on each call. Passing null pointer as 'syntax', 'begin' or 'end' is fatal failure too.
///
/// Take a look at a few simple examples of defining some simplified rules of C programming language.
/// Two examples below show how to define multiline comments and strings, since these have priority, both enrange and derange are false.
///
/// syntax_define (syntax, false, false, "/*", "*/", '\\', 1, 0);
/// syntax_define (syntax, false, false, "\"", "\"", '\\', 2, 0);
///
/// Now we're defining syntax rule for one keyword, static, notice that end string contains separator characters because derange is true.
///
/// syntax_define (syntax, false, true, "static", "()[]{}.,:;<=>+*-/%!&~^?| \t\r\n", '\0', 3, 0);
///
/// You can define brackets and operator characters separately, or if you want to, you can define some of them separately again.
///
/// syntax_define (syntax, true, false, "()[]{}", "", '\0', 4, 0);
/// syntax_define (syntax, true, false, ".,:;<=>+*-/%!&~^?|", "", '\0', 5, 0);
///
/// And lastly, we can define number selection like this below, by setting both enrange and derange as true.
///
/// syntax_define (syntax, true, true, "0123456789", "()[]{}.,:;<=>+*-/%!&~^?| \t\r\n", '\0', 6, 0);
///
/// I hope this is pretty clear, if you want to select a number, you start by matching any of digits provided above, and you end matching that
/// number by any character from 'end' string, if you want to support floating point numbers, you'd exclude '.' character, or alternatively add
/// letters f, u, l and whatever else your language supports (like in C/C++). However, this approach is too weak for detecting syntax errors,
/// you shouldn't use this library for robust linter or parser.

static natural syntax_define (syntax_structure * syntax, boolean enrange, boolean derange, character * begin, character * end, character escape,
                              natural colour, natural effect) {
	/// Validate everything before touching the structure, so failed call never leaves 'count' pointing at half defined rule.
	fatal_failure (syntax == null, "syntax_define: Syntax structure is null pointer.");
	fatal_failure (begin  == null, "syntax_define: Begin string is null pointer.");
	fatal_failure (end    == null, "syntax_define: End string is null pointer.");

	/// Limit only applies to preallocated rule arrays, limit of 0 means dynamic growth, and all 'limit' slots are usable.
	fatal_failure ((syntax->limit != 0) && (syntax->count >= syntax->limit), "syntax_define: Reached the hardcoded limit.");

	natural current = syntax->count;

	++syntax->count;

	/// Dynamic mode, grow every rule array so it can hold the new rule.
	if (syntax->limit == 0) {
		syntax->enrange = reallocate (syntax->enrange, syntax->count * sizeof (* syntax->enrange));
		syntax->derange = reallocate (syntax->derange, syntax->count * sizeof (* syntax->derange));
		syntax->begin   = reallocate (syntax->begin,   syntax->count * sizeof (* syntax->begin));
		syntax->end     = reallocate (syntax->end,     syntax->count * sizeof (* syntax->end));
		syntax->escape  = reallocate (syntax->escape,  syntax->count * sizeof (* syntax->escape));
		syntax->colour  = reallocate (syntax->colour,  syntax->count * sizeof (* syntax->colour));
		syntax->effect  = reallocate (syntax->effect,  syntax->count * sizeof (* syntax->effect));
	}

	/// Rule owns private copies of both strings, one extra character is reserved for null terminator.
	syntax->begin [current] = allocate ((string_length (begin) + 1) * sizeof (* * syntax->begin));
	syntax->end   [current] = allocate ((string_length (end)   + 1) * sizeof (* * syntax->end));

	syntax->enrange [current] = enrange;
	syntax->derange [current] = derange;
	syntax->escape  [current] = escape;
	syntax->colour  [current] = colour;
	syntax->effect  [current] = effect;

	string_copy (syntax->begin [current], begin);
	string_copy (syntax->end   [current], end);

	return (current);
}

/// After all syntax definitions have been defined, call this function inside your main loop, return value is index of selected rule.
///
/// Rules are tried in order of definition and the first one that matches at the start of 'string' wins. Number of selected characters is
/// always written to 'length'. If no rule matches, 'length' is set to 1 and 'syntax->count' is returned. If input ends before the end of
/// selected rule is found, 'length' covers everything up to, but not including, null terminator, so the loop below stops cleanly.
///
/// Now, imagine that 'buffer' is file you've loaded into memory, you have declared natural numbers 'offset', 'length' and 'select', and you've
/// properly initialized syntax structure 'syntax', defined its rules for wanted language(s), simple main loop would look like this:
///
/// for (offset = 0; buffer [offset] != '\0'; offset += length) {
///         /// Notice that we're not incrementing 'offset', we're increasing it by 'length'.
///         select = syntax_select (syntax, & buffer [offset], & length);
///         if (select >= syntax->count) {
///                 /// Syntax definition is incomplete, unknown sequence has been detected, either print nothing, or print default.
///         } else {
///                 /// Print string of 'length', at '& buffer [offset]', using 'syntax->colour [select]' and 'syntax->effect [select]'.
///                 /// Strings here aren't null terminated, you want to print sized string.
///         }
/// }

static natural syntax_select (syntax_structure * syntax, character * string, natural * length) {
	natural offset = 0;
	natural subset = 0;
	natural select = 0;

	/// First pass, find the first rule whose 'begin' matches at the start of 'string'.
	for (; select != syntax->count; ++select) {
		caliber begin_length = string_length (syntax->begin [select]);

		if (syntax->enrange [select] == false) {
			if (syntax->derange [select] == false) {
				/// Whole 'begin' string must match.
				if (string_compare_limit (string, syntax->begin [select], begin_length) == true) {
					break;
				}
			} else {
				/// Whole 'begin' string must match, and it must be followed by one of 'end' characters, think keywords.
				if ((string_compare_limit    (string, syntax->begin [select], begin_length)         == true)
				&&  (character_compare_array (string [offset + begin_length], syntax->end [select]) == true)) {
					break;
				}
			}
		} else {
			/// Any single character from 'begin' string starts the match.
			for (subset = 0; subset != begin_length; ++subset) {
				if (string [offset] == syntax->begin [select] [subset]) {
					goto selected;
				}
			}
		}
	}

	selected:

	/// No rule matched, report single unknown character.
	if (select >= syntax->count) {
		* length = 1;

		return (syntax->count);
	}

	/// Rule matched on empty input, there's nothing to select, and 'string [1]' must not be read.
	if (string [0] == character_null) {
		* length = 0;

		return (select);
	}

	caliber end_length = string_length (syntax->end [select]);

	/// Rules with empty 'end' string select exactly one character, no matter if they're deranged or not.
	if (end_length == 0) {
		* length = 1;

		return (select);
	}

	/// Second pass, scan for the end of the rule. Loop stops on null terminator before anything else is tested, so terminator is never
	/// mistaken for escape character (escape is '\0' for rules that have no escape character), and nothing past it is ever read.
	///
	/// NOTE(review): Scan starts at offset 1, not after the whole 'begin' string, so input like "/*/" closes "/*" ... "*/" rule.
	for (offset = 1; string [offset] != character_null; ++offset) {
		if (string [offset] == syntax->escape [select]) {
			/// Skip escaped character, unless escape character itself was the last character of input.
			++offset;

			if (string [offset] == character_null) {
				break;
			}

			continue;
		}

		if (syntax->derange [select] == true) {
			/// Any single character from 'end' string ends the match, that character isn't part of selection.
			for (subset = 0; subset != end_length; ++subset) {
				if (string [offset] == syntax->end [select] [subset]) {
					* length = offset;

					return (select);
				}
			}
		} else {
			/// Whole 'end' string must match, and it's part of selection.
			if (string_compare_limit (& string [offset], syntax->end [select], end_length)) {
				* length = offset + end_length;

				return (select);
			}
		}
	}

	/// Input ended before the end of the rule was found, select everything up to null terminator.
	* length = offset;

	return (select);
}