feat: first commit

2026-01-25 10:20:10 +01:00 · 2026-01-25 10:20:10 +01:00 · 701734097e
commit 701734097e
13 changed files with 2655 additions and 0 deletions
--- a/lexer.c
+++ b/lexer.c
@ -0,0 +1,410 @@
+#include "lexer.h"
+#include <stdbool.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+static void add_token(lexer *l, token_type type, usize len)
+{
+	token *t = malloc(sizeof(token));
+	t->type = type;
+	t->lexeme_len = len;
+	t->lexeme = l->source + l->index;
+	t->position.row = l->row;
+	t->position.column = l->column;
+
+	if (!l->tokens) {
+		l->tokens = t;
+		l->tail = t;
+	} else {
+		l->tail->next = t;
+		l->tail = t;
+	}
+}
+
+static void add_error(lexer *l, char *msg)
+{
+	token *t = malloc(sizeof(token));
+	t->type = TOKEN_ERROR;
+	t->lexeme_len = strlen(msg);
+	t->lexeme = msg;
+	t->position.row = l->row;
+	t->position.column = l->column;
+
+	if (!l->tokens) {
+		l->tokens = t;
+		l->tail = t;
+	} else {
+		l->tail->next = t;
+		l->tail = t;
+	}
+}
+
+static void parse_number(lexer *l)
+{
+	char c = l->source[l->index];
+	/* Is the number a float? */
+	bool f = false;
+	usize len = 0;
+
+	while (isdigit(c)) {
+		/* If a dot is found, and the character after it is a digit, this is a float. */
+		if (l->source[l->index+1] == '.' && isdigit(l->source[l->index+2])) {
+			f = true;
+			len += 3;
+			l->index += 3;
+		} else {
+			len += 1;
+			l->index += 1;
+		}
+		c = l->source[l->index];
+	}
+	l->index -= len;
+	if (f) {
+		add_token(l, TOKEN_FLOAT, len);
+	} else {
+		add_token(l, TOKEN_INTEGER, len);
+	}
+	l->index += len;
+}
+
+static void parse_identifier(lexer *l)
+{
+	char c = l->source[l->index];
+	usize len = 0;
+
+	while (isalnum(c) || c == '_') {
+		len += 1;
+		l->index += 1;
+		c = l->source[l->index];
+	}
+	l->index -= len;
+
+	//token_type keyword = trie_get(keywords, l->source + l->index, len);
+	if (0) {
+		//add_token(l, keyword, len);
+	} else {
+		add_token(l, TOKEN_IDENTIFIER, len);
+	}
+	l->index += len;
+}
+
+static void parse_string(lexer *l)
+{
+	char c = l->source[l->index];
+	usize len = 0;
+
+	while (c != '"') {
+		if (c == '\0' || c == '\n') {
+			l->index -= len;
+			add_error(l, "unclosed string literal.");
+			l->index += len;
+			return;
+		}
+		len += 1;
+		l->index += 1;
+		c = l->source[l->index];
+	}
+	l->index -= len;
+	add_token(l, TOKEN_STRING, len);
+	l->index += len + 1;
+}
+
+static bool parse_special(lexer *l)
+{
+	switch (l->source[l->index]) {
+	case '+':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_PLUS_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '+') {
+			add_token(l, TOKEN_PLUS_PLUS, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_PLUS, 1);
+			l->index += 1;
+		}
+		return true;
+	case '-':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_MINUS_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '-') {
+			add_token(l, TOKEN_MINUS_MINUS, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '>') {
+			add_token(l, TOKEN_ARROW, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_MINUS, 1);
+			l->index += 1;
+		}
+		return true;
+	case '/':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_SLASH_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_SLASH, 1);
+			l->index += 1;
+		}
+		return true;
+	case '*':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_STAR_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_STAR, 1);
+			l->index += 1;
+		}
+		return true;
+	case '%':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_PERC_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_PERC, 1);
+			l->index += 1;
+		}
+		return true;
+	case '&':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_AND_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '&') {
+			add_token(l, TOKEN_DOUBLE_AND, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_AND, 1);
+			l->index += 1;
+		}
+		return true;
+	case '^':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_HAT_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_HAT, 1);
+			l->index += 1;
+		}
+		return true;
+	case '|':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_PIPE_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '|') {
+			add_token(l, TOKEN_OR, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_PIPE, 1);
+			l->index += 1;
+		}
+		return true;
+	case '=':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_DOUBLE_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_EQ, 1);
+			l->index += 1;
+		}
+		return true;
+	case '>':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_GREATER_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '>') {
+			if (l->source[l->index+2] == '=') {
+				add_token(l, TOKEN_RSHIFT_EQ, 3);
+				l->index += 3;
+				return true;
+			}
+			add_token(l, TOKEN_RSHIFT, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_GREATER_THAN, 1);
+			l->index += 1;
+		}
+		return true;
+	case '<':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_LESS_EQ, 2);
+			l->index += 2;
+		} else if (l->source[l->index+1] == '<') {
+			if (l->source[l->index+2] == '=') {
+				add_token(l, TOKEN_LSHIFT_EQ, 3);
+				l->index += 3;
+				return true;
+			}
+			add_token(l, TOKEN_LSHIFT, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_LESS_THAN, 1);
+			l->index += 1;
+		}
+		return true;
+	case '!':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_NOT_EQ, 2);
+			l->index += 2;
+		}  else {
+			add_token(l, TOKEN_BANG, 1);
+			l->index += 1;
+		}
+		return true;
+	case ':':
+		add_token(l, TOKEN_COLON, 1);
+		l->index += 1;
+		return true;
+	case ';':
+		add_token(l, TOKEN_SEMICOLON, 1);
+		l->index += 1;
+		return true;
+	case '.':
+		add_token(l, TOKEN_DOT, 1);
+		l->index += 1;
+		return true;
+	case ',':
+		add_token(l, TOKEN_COMMA, 1);
+		l->index += 1;
+		return true;
+	case '(':
+		add_token(l, TOKEN_LPAREN, 1);
+		l->index += 1;
+		return true;
+	case ')':
+		add_token(l, TOKEN_RPAREN, 1);
+		l->index += 1;
+		return true;
+	case '[':
+		add_token(l, TOKEN_LSQUARE, 1);
+		l->index += 1;
+		return true;
+	case ']':
+		add_token(l, TOKEN_RSQUARE, 1);
+		l->index += 1;
+		return true;
+	case '{':
+		add_token(l, TOKEN_LCURLY, 1);
+		l->index += 1;
+		return true;
+	case '}':
+		add_token(l, TOKEN_RCURLY, 1);
+		l->index += 1;
+		return true;
+	case '\'':
+		if (l->source[l->index+1] == '\\') {
+			if (l->source[l->index+3] != '\'') {
+				add_error(l, "unclosed character literal.");
+				l->index += 1;
+				return true;
+			}
+			l->index += 1;
+			add_token(l, TOKEN_CHAR, 2);
+			l->index += 3;
+			return true;
+		} else {
+			if (l->source[l->index+2] != '\'') {
+				add_error(l, "unclosed character literal.");
+				l->index += 1;
+				return true;
+			}
+			l->index += 1;
+			add_token(l, TOKEN_CHAR, 1);
+			l->index += 2;
+			return true;
+		}
+	default:
+		return false;
+	}
+}
+
+static void parse(lexer *l)
+{
+	char c;
+
+	while (l->index <= l->size) {
+		c = l->source[l->index];
+		l->column += 1;
+
+		if (c == '\n') {
+			l->index += 1;
+			l->row += 1;
+			l->column = 0;
+			continue;
+		}
+
+		usize head = l->index;
+
+		if (c == '/' && l->source[l->index+1] == '/') {
+			while (l->source[l->index] != '\n') {
+				l->index += 1;
+			}
+			l->column += (l->index - head - 1);
+		} else if (c == '/' && l->source[l->index+1] == '*') {
+			l->index += 2;
+			while (l->index < l->size) {
+				if (l->source[l->index] == '*' && l->source[l->index+1] == '/') {
+					l->index += 2;
+					break;
+				}
+				if (l->source[l->index] == '\n') {
+					l->row += 1;
+					l->column = 0;
+				}
+				l->index += 1;
+			}
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (isspace(c)) {
+			l->index += 1;
+			continue;
+		}
+
+
+		if (parse_special(l)) {
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (isdigit(c)) {
+			parse_number(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (isalpha(c)) {
+			parse_identifier(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (c == '"') {
+			l->index += 1;
+			parse_string(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		l->index += 1;
+	}
+}
+
+lexer *lexer_init(char *source, usize size)
+{
+	lexer *lex = malloc(sizeof(lexer));
+	lex->column = 1;
+	lex->row = 1;
+	lex->index = 0;
+	lex->size = size;
+	lex->tokens = 0;
+	lex->tail = 0;
+	lex->source = source;
+
+	parse(lex);
+
+	return lex;
+}