implemented the lexer

This commit is contained in:
Lorenzo Torres 2025-11-30 12:58:35 +01:00
commit f1675bca76
13 changed files with 1303 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
**/*.o
**/*~
cc

24
LICENSE Normal file
View file

@ -0,0 +1,24 @@
Copyright (c) 2025, Lorenzo Torres
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

55
Makefile Normal file
View file

@ -0,0 +1,55 @@
# cc - C compiler
# See LICENSE file for copyright and license details.

include config.mk

SRC = cc.c utils.c lexer.c
# lexer.h included so `make dist` ships a buildable tree
HDR = config.def.h utils.h lexer.h
OBJ = ${SRC:.c=.o}

all: options cc

options:
	@echo cc build options:
	@echo "CFLAGS  = ${CFLAGS}"
	@echo "LDFLAGS = ${LDFLAGS}"
	@echo "CC      = ${CC}"

.c.o:
	${CC} -c ${CFLAGS} $<

${OBJ}: config.h config.mk

config.h:
	cp config.def.h $@

cc: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS}

clean:
	rm -f cc ${OBJ} cc-${VERSION}.tar.gz

dist: clean
	mkdir -p cc-${VERSION}
	cp -R LICENSE Makefile README config.mk \
		cc.1 ${HDR} ${SRC} cc-${VERSION}
	tar -cf cc-${VERSION}.tar cc-${VERSION}
	gzip cc-${VERSION}.tar
	rm -rf cc-${VERSION}

install: all
	mkdir -p ${DESTDIR}${PREFIX}/bin
	cp -f cc ${DESTDIR}${PREFIX}/bin
	chmod 755 ${DESTDIR}${PREFIX}/bin/cc
	mkdir -p ${DESTDIR}${MANPREFIX}/man1
	sed "s/VERSION/${VERSION}/g" < cc.1 > ${DESTDIR}${MANPREFIX}/man1/cc.1
	chmod 644 ${DESTDIR}${MANPREFIX}/man1/cc.1

uninstall:
	rm -f ${DESTDIR}${PREFIX}/bin/cc \
		${DESTDIR}${MANPREFIX}/man1/cc.1

.PHONY: all options clean dist install uninstall

33
README Normal file
View file

@ -0,0 +1,33 @@
cc - simple C compiler
======================
cc is a small C compiler, following the unix philosophy,
trying to be as small as possible while providing
a reliable tool.
Requirements
------------
In order to build cc you need... a computer
Installation
------------
Edit config.mk to match your local setup (cc is installed into
the /usr namespace by default).
Afterwards enter the following command to build and install cc (if
necessary as root):
    make clean install
Running cc
----------
At this stage cc reads test.c from the current directory and
runs the lexer over it.
Configuration
-------------
The configuration of cc is done by creating a custom config.h
and (re)compiling the source code.

24
cc.c Normal file
View file

@ -0,0 +1,24 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
#include "lexer.h"
int main(void)
{
FILE *fp = fopen("test.c", "r");
usize size = 0;
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
char *src = malloc(size+1);
fread(src, size, 1, fp);
fclose(fp);
src[size] = '\0';
arena a = arena_init(0x1000 * 0x1000 * 64);
lexer *l = lexer_init(src, size, &a);
arena_deinit(a);
return 0;
}

4
config.def.h Normal file
View file

@ -0,0 +1,4 @@
#ifndef CONFIG_H
#define CONFIG_H
#endif

4
config.h Normal file
View file

@ -0,0 +1,4 @@
#ifndef CONFIG_H
#define CONFIG_H
#endif

27
config.mk Normal file
View file

@ -0,0 +1,27 @@
# cc version
VERSION = 0.1
# Customize below to fit your system
# paths
PREFIX = /usr
MANPREFIX = ${PREFIX}/share/man
# OpenBSD (uncomment)
#MANPREFIX = ${PREFIX}/man
# includes and libs
INCS = -I.
LIBS =
# flags
CPPFLAGS = -DVERSION=\"${VERSION}\"
CFLAGS := -std=c99 -pedantic -Wall -O0 ${INCS} ${CPPFLAGS}
CFLAGS := ${CFLAGS} -g
LDFLAGS = ${LIBS}
# Solaris
#CFLAGS = -fast ${INCS} -DVERSION=\"${VERSION}\"
#LDFLAGS = ${LIBS}
# compiler and linker
CC = cc

404
lexer.c Normal file
View file

@ -0,0 +1,404 @@
#include "lexer.h"
#include <stdbool.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
trie_node *keywords;
/*
 * Append a token of `type`, `len` bytes long starting at the lexer's
 * current index, to the token list.
 * NOTE(review): arena_alloc may return NULL on exhaustion — unchecked here,
 * as in the rest of the file.
 */
static void add_token(lexer *l, token_type type, usize len)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = type;
	t->lexeme_len = len;
	t->lexeme = l->source + l->index;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Append an error token whose lexeme is the (static) message string.
 * The position is the lexer's current row/column.
 */
static void add_error(lexer *l, char *msg)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = TOKEN_ERROR;
	t->lexeme_len = strlen(msg);
	t->lexeme = msg;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Lex an integer or float literal starting at the current index.
 * A '.' is consumed only when surrounded by digits, so member access
 * like `p.x` is not mistaken for a float.
 * ctype arguments are cast to unsigned char (CERT STR37-C): plain char
 * may be negative, which is UB for isdigit().
 */
static void parse_number(lexer *l)
{
	bool is_float = false;
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	while (isdigit(c)) {
		/* digit '.' digit → consume all three and mark as float */
		if (l->source[l->index+1] == '.'
		    && isdigit((unsigned char)l->source[l->index+2])) {
			is_float = true;
			len += 3;
			l->index += 3;
		} else {
			len += 1;
			l->index += 1;
		}
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so add_token records the lexeme start, then skip past it */
	l->index -= len;
	add_token(l, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, len);
	l->index += len;
}
/*
 * Lex an identifier or keyword starting at the current index.
 * trie_get returns 0 (TOKEN_ERROR) for non-keywords, so a nonzero
 * result is the keyword's token type.
 */
static void parse_identifier(lexer *l)
{
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	/* cast for ctype: plain char may be negative (CERT STR37-C) */
	while (isalnum(c) || c == '_') {
		len += 1;
		l->index += 1;
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so the token's lexeme points at the first character */
	l->index -= len;
	token_type keyword = trie_get(keywords, l->source + l->index, len);
	add_token(l, keyword ? keyword : TOKEN_IDENTIFIER, len);
	l->index += len;
}
/*
 * Lex a string literal.  Called with the index on the first character
 * after the opening '"'.  The character following a backslash is
 * skipped so an escaped quote (\") does not terminate the literal.
 */
static void parse_string(lexer *l)
{
	usize len = 0;
	char c = l->source[l->index];
	while (c != '"') {
		if (c == '\0' || c == '\n') {
			l->index -= len;
			add_error(l, "unclosed string literal.");
			l->index += len;
			return;
		}
		/* include the escaped character, e.g. \" or \\ */
		if (c == '\\' && l->source[l->index+1] != '\0'
		    && l->source[l->index+1] != '\n') {
			len += 1;
			l->index += 1;
		}
		len += 1;
		l->index += 1;
		c = l->source[l->index];
	}
	l->index -= len;
	add_token(l, TOKEN_STRING, len);
	l->index += len + 1;	/* skip the lexeme plus the closing quote */
}
static bool parse_special(lexer *l)
{
switch (l->source[l->index]) {
case '+':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PLUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '+') {
add_token(l, TOKEN_PLUS_PLUS, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PLUS, 1);
l->index += 1;
}
return true;
case '-':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_MINUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '-') {
add_token(l, TOKEN_MINUS_MINUS, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
add_token(l, TOKEN_ARROW, 2);
l->index += 2;
} else {
add_token(l, TOKEN_MINUS, 1);
l->index += 1;
}
return true;
case '/':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_SLASH_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_SLASH, 1);
l->index += 1;
}
return true;
case '*':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_STAR_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_STAR, 1);
l->index += 1;
}
return true;
case '%':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PERC_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PERC, 1);
l->index += 1;
}
return true;
case '&':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_AND_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '&') {
add_token(l, TOKEN_DOUBLE_AND, 2);
l->index += 2;
} else {
add_token(l, TOKEN_AND, 1);
l->index += 1;
}
return true;
case '^':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_HAT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_HAT, 1);
l->index += 1;
}
return true;
case '|':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PIPE_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '|') {
add_token(l, TOKEN_OR, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PIPE, 1);
l->index += 1;
}
return true;
case '=':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_DOUBLE_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_EQ, 1);
l->index += 1;
}
return true;
case '>':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_GREATER_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_RSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_RSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_GREATER_THAN, 1);
l->index += 1;
}
return true;
case '<':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_LESS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '<') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_LSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_LSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_LESS_THAN, 1);
l->index += 1;
}
return true;
case '!':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_NOT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_BANG, 1);
l->index += 1;
}
return true;
case ':':
add_token(l, TOKEN_COLON, 1);
l->index += 1;
return true;
case ';':
add_token(l, TOKEN_SEMICOLON, 1);
l->index += 1;
return true;
case '.':
add_token(l, TOKEN_DOT, 1);
l->index += 1;
return true;
case ',':
add_token(l, TOKEN_COMMA, 1);
l->index += 1;
return true;
case '(':
add_token(l, TOKEN_LPAREN, 1);
l->index += 1;
return true;
case ')':
add_token(l, TOKEN_RPAREN, 1);
l->index += 1;
return true;
case '[':
add_token(l, TOKEN_LSQUARE, 1);
l->index += 1;
return true;
case ']':
add_token(l, TOKEN_RSQUARE, 1);
l->index += 1;
return true;
case '{':
add_token(l, TOKEN_LCURLY, 1);
l->index += 1;
return true;
case '}':
add_token(l, TOKEN_RCURLY, 1);
l->index += 1;
return true;
case '\'':
if (l->source[l->index+1] == '\\') {
if (l->source[l->index+3] != '\'') {
add_error(l, "unclosed character literal.");
l->index += 1;
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 2);
l->index += 3;
return true;
} else {
if (l->source[l->index+2] != '\'') {
add_error(l, "unclosed character literal.");
l->index += 1;
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 1);
l->index += 2;
return true;
}
default:
return false;
}
}
/*
 * Main scanning loop: walk the source, track row/column, and dispatch
 * to the specialised scanners.  The bound is strict (<) so the
 * terminating NUL at source[size] is never processed.
 */
static void parse(lexer *l)
{
	while (l->index < l->size) {
		char c = l->source[l->index];
		l->column += 1;
		if (c == '\n') {
			l->index += 1;
			l->row += 1;
			l->column = 0;
			continue;
		}
		if (isspace((unsigned char)c)) {
			l->index += 1;
			continue;
		}
		usize head = l->index;
		if (parse_special(l)) {
			/* one column was already counted; add the rest */
			l->column += (l->index - head - 1);
			continue;
		}
		if (isdigit((unsigned char)c)) {
			parse_number(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* '_' starts an identifier too (parse_identifier accepts it) */
		if (isalpha((unsigned char)c) || c == '_') {
			parse_identifier(l);
			l->column += (l->index - head - 1);
			continue;
		}
		if (c == '"') {
			l->index += 1;	/* skip the opening quote */
			parse_string(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* unrecognised character: skip it silently */
		l->index += 1;
	}
}
/*
 * Create a lexer over `source` (`size` bytes, must be NUL-terminated —
 * parse_string relies on the '\0'), build the keyword trie and run the
 * scanner.  All memory comes from `arena`, which the caller owns.
 */
lexer *lexer_init(char *source, usize size, arena *arena)
{
	lexer *lex = arena_alloc(arena, sizeof(lexer));
	lex->column = 0;
	lex->row = 0;
	lex->index = 0;
	lex->size = size;
	lex->tokens = 0;
	lex->tail = 0;
	lex->allocator = arena;
	lex->source = source;
	keywords = arena_alloc(arena, sizeof(trie_node));
	/* arena memory is uninitialized; trie_insert relies on NULL children */
	memset(keywords, 0, sizeof(trie_node));
	trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE);
	trie_insert(keywords, lex->allocator, "for", TOKEN_FOR);
	trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO);
	trie_insert(keywords, lex->allocator, "if", TOKEN_IF);
	trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE);
	trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH);
	trie_insert(keywords, lex->allocator, "case", TOKEN_CASE);
	trie_insert(keywords, lex->allocator, "do", TOKEN_DO);
	trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER);
	trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE);
	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
	parse(lex);
	return lex;
}

95
lexer.h Normal file
View file

@ -0,0 +1,95 @@
#ifndef LEXER_H
#define LEXER_H
#include "utils.h"
typedef enum {
TOKEN_ERROR,
TOKEN_END,
TOKEN_PLUS, // +
TOKEN_PLUS_PLUS, // ++
TOKEN_MINUS, // -
TOKEN_MINUS_MINUS, // --
TOKEN_SLASH, // /
TOKEN_PERC, // %
TOKEN_STAR, // *
TOKEN_AND, // &
TOKEN_HAT, // ^
TOKEN_PIPE, // |
TOKEN_EQ, // =
TOKEN_ARROW, // ->
TOKEN_LSHIFT, // <<
TOKEN_RSHIFT, // >>
TOKEN_DOUBLE_EQ, // ==
TOKEN_LESS_THAN, // <
TOKEN_GREATER_THAN, // >
TOKEN_LESS_EQ, // <=
TOKEN_GREATER_EQ, // >=
TOKEN_NOT_EQ, // !=
TOKEN_PLUS_EQ, // +=
TOKEN_MINUS_EQ, // -=
TOKEN_STAR_EQ, // *=
TOKEN_SLASH_EQ, // /=
TOKEN_AND_EQ, // &=
TOKEN_HAT_EQ, // ^=
TOKEN_PIPE_EQ, // |=
TOKEN_PERC_EQ, // %=
TOKEN_LSHIFT_EQ, // <<=
TOKEN_RSHIFT_EQ, // >>=
TOKEN_OR, // ||
TOKEN_DOUBLE_AND, // &&
TOKEN_COLON, // :
TOKEN_SEMICOLON, // ;
TOKEN_DOT, // .
TOKEN_BANG, // !
TOKEN_COMMA, // ,
TOKEN_LPAREN, // (
TOKEN_RPAREN, // )
TOKEN_LSQUARE, // [
TOKEN_RSQUARE, // ]
TOKEN_LCURLY, // {
TOKEN_RCURLY, // }
TOKEN_INTEGER,
TOKEN_FLOAT,
TOKEN_IDENTIFIER,
TOKEN_STRING,
TOKEN_CHAR,
TOKEN_WHILE,
TOKEN_FOR,
TOKEN_GOTO,
TOKEN_IF,
TOKEN_ELSE,
TOKEN_SWITCH,
TOKEN_CASE,
TOKEN_DO,
TOKEN_DEFER,
TOKEN_MODULE,
TOKEN_STATIC,
TOKEN_CONST,
TOKEN_EXTERN,
TOKEN_VOLATILE
} token_type;
/* One lexed token.  Tokens form a singly linked list in lexing order. */
typedef struct _token {
token_type type;
/* row/column where the lexeme starts (0-based, set in lexer.c) */
source_pos position;
/* points into the lexer's source buffer (or a static error message);
 * NOT NUL-terminated — use lexeme_len */
char *lexeme;
usize lexeme_len;
struct _token *next;
} token;
/* Lexer state; all token memory is drawn from `allocator`. */
typedef struct {
usize column, row, index, size;
char *source;
/* head and tail of the token list */
token *tokens;
token *tail;
arena *allocator;
} lexer;
/* Lex `source` (`size` bytes; assumed NUL-terminated — TODO confirm at
 * call sites) using memory from `arena`.  Returns the populated lexer. */
lexer *lexer_init(char *source, usize size, arena *arena);
#endif

482
test.c Normal file
View file

@ -0,0 +1,482 @@
#include "lexer.h"
#include <stdbool.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
static const char *token_type_str[] = {
[TOKEN_ERROR] = "TOKEN_ERROR",
[TOKEN_END] = "TOKEN_END",
[TOKEN_PLUS] = "TOKEN_PLUS",
[TOKEN_PLUS_PLUS] = "TOKEN_PLUS_PLUS",
[TOKEN_MINUS] = "TOKEN_MINUS",
[TOKEN_MINUS_MINUS] = "TOKEN_MINUS_MINUS",
[TOKEN_SLASH] = "TOKEN_SLASH",
[TOKEN_PERC] = "TOKEN_PERC",
[TOKEN_STAR] = "TOKEN_STAR",
[TOKEN_AND] = "TOKEN_AND",
[TOKEN_HAT] = "TOKEN_HAT",
[TOKEN_PIPE] = "TOKEN_PIPE",
[TOKEN_EQ] = "TOKEN_EQ",
[TOKEN_ARROW] = "TOKEN_ARROW",
[TOKEN_LSHIFT] = "TOKEN_LSHIFT",
[TOKEN_RSHIFT] = "TOKEN_RSHIFT",
[TOKEN_DOUBLE_EQ] = "TOKEN_DOUBLE_EQ",
[TOKEN_LESS_THAN] = "TOKEN_LESS_THAN",
[TOKEN_GREATER_THAN] = "TOKEN_GREATER_THAN",
[TOKEN_LESS_EQ] = "TOKEN_LESS_EQ",
[TOKEN_GREATER_EQ] = "TOKEN_GREATER_EQ",
[TOKEN_NOT_EQ] = "TOKEN_NOT_EQ",
[TOKEN_PLUS_EQ] = "TOKEN_PLUS_EQ",
[TOKEN_MINUS_EQ] = "TOKEN_MINUS_EQ",
[TOKEN_STAR_EQ] = "TOKEN_STAR_EQ",
[TOKEN_SLASH_EQ] = "TOKEN_SLASH_EQ",
[TOKEN_AND_EQ] = "TOKEN_AND_EQ",
[TOKEN_HAT_EQ] = "TOKEN_HAT_EQ",
[TOKEN_PIPE_EQ] = "TOKEN_PIPE_EQ",
[TOKEN_PERC_EQ] = "TOKEN_PERC_EQ",
[TOKEN_LSHIFT_EQ] = "TOKEN_LSHIFT_EQ",
[TOKEN_RSHIFT_EQ] = "TOKEN_RSHIFT_EQ",
[TOKEN_OR] = "TOKEN_OR",
[TOKEN_DOUBLE_AND] = "TOKEN_DOUBLE_AND",
[TOKEN_COLON] = "TOKEN_COLON",
[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
[TOKEN_DOT] = "TOKEN_DOT",
[TOKEN_BANG] = "TOKEN_BANG",
[TOKEN_COMMA] = "TOKEN_COMMA",
[TOKEN_LPAREN] = "TOKEN_LPAREN",
[TOKEN_RPAREN] = "TOKEN_RPAREN",
[TOKEN_LSQUARE] = "TOKEN_LSQUARE",
[TOKEN_RSQUARE] = "TOKEN_RSQUARE",
[TOKEN_LCURLY] = "TOKEN_LCURLY",
[TOKEN_RCURLY] = "TOKEN_RCURLY",
[TOKEN_INTEGER] = "TOKEN_INTEGER",
[TOKEN_FLOAT] = "TOKEN_FLOAT",
[TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
[TOKEN_STRING] = "TOKEN_STRING",
[TOKEN_CHAR] = "TOKEN_CHAR",
[TOKEN_WHILE] = "TOKEN_WHILE",
[TOKEN_FOR] = "TOKEN_FOR",
[TOKEN_GOTO] = "TOKEN_GOTO",
[TOKEN_IF] = "TOKEN_IF",
[TOKEN_ELSE] = "TOKEN_ELSE",
[TOKEN_SWITCH] = "TOKEN_SWITCH",
[TOKEN_CASE] = "TOKEN_CASE",
[TOKEN_DO] = "TOKEN_DO",
[TOKEN_DEFER] = "TOKEN_DEFER",
[TOKEN_MODULE] = "TOKEN_MODULE",
[TOKEN_STATIC] = "TOKEN_STATIC",
[TOKEN_CONST] = "TOKEN_CONST",
[TOKEN_EXTERN] = "TOKEN_EXTERN",
[TOKEN_VOLATILE] = "TOKEN_VOLATILE",
};
trie_node *keywords;
/* Print a token's type name and its (non-NUL-terminated) lexeme to stdout. */
void lexer_print_token(token *t)
{
	printf("%s: %.*s", token_type_str[t->type],
	       (int)t->lexeme_len, t->lexeme);
}
/*
 * Append a token of `type`, `len` bytes long starting at the lexer's
 * current index, to the token list.
 */
static void add_token(lexer *l, token_type type, usize len)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = type;
	t->lexeme_len = len;
	t->lexeme = l->source + l->index;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/* Append an error token whose lexeme is the (static) message string. */
static void add_error(lexer *l, char *msg)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = TOKEN_ERROR;
	t->lexeme_len = strlen(msg);
	t->lexeme = msg;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Lex an integer or float literal.  A '.' is consumed only when
 * surrounded by digits.  ctype arguments are cast to unsigned char
 * (CERT STR37-C): plain char may be negative, which is UB for isdigit().
 */
static void parse_number(lexer *l)
{
	bool is_float = false;
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	while (isdigit(c)) {
		/* digit '.' digit → consume all three and mark as float */
		if (l->source[l->index+1] == '.'
		    && isdigit((unsigned char)l->source[l->index+2])) {
			is_float = true;
			len += 3;
			l->index += 3;
		} else {
			len += 1;
			l->index += 1;
		}
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so add_token records the lexeme start, then skip past it */
	l->index -= len;
	add_token(l, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, len);
	l->index += len;
}
/*
 * Lex an identifier or keyword.  trie_get returns 0 (TOKEN_ERROR) for
 * non-keywords, so a nonzero result is the keyword's token type.
 */
static void parse_identifier(lexer *l)
{
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	/* cast for ctype: plain char may be negative (CERT STR37-C) */
	while (isalnum(c) || c == '_') {
		len += 1;
		l->index += 1;
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so the token's lexeme points at the first character */
	l->index -= len;
	token_type keyword = trie_get(keywords, l->source + l->index, len);
	add_token(l, keyword ? keyword : TOKEN_IDENTIFIER, len);
	l->index += len;
}
/*
 * Lex a string literal.  Called with the index on the first character
 * after the opening '"'.  The character following a backslash is
 * skipped so an escaped quote (\") does not terminate the literal.
 * (The stray debug printf from the original was removed.)
 */
static void parse_string(lexer *l)
{
	usize len = 0;
	char c = l->source[l->index];
	while (c != '"') {
		if (c == '\0' || c == '\n') {
			l->index -= len;
			add_error(l, "unclosed string literal.");
			l->index += len;
			return;
		}
		/* include the escaped character, e.g. \" or \\ */
		if (c == '\\' && l->source[l->index+1] != '\0'
		    && l->source[l->index+1] != '\n') {
			len += 1;
			l->index += 1;
		}
		len += 1;
		l->index += 1;
		c = l->source[l->index];
	}
	l->index -= len;
	add_token(l, TOKEN_STRING, len);
	l->index += len + 1;	/* skip the lexeme plus the closing quote */
}
static bool parse_special(lexer *l)
{
switch (l->source[l->index]) {
case '+':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PLUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '+') {
add_token(l, TOKEN_PLUS_PLUS, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PLUS, 1);
l->index += 1;
}
return true;
case '-':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_MINUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '-') {
add_token(l, TOKEN_MINUS_MINUS, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
add_token(l, TOKEN_ARROW, 2);
l->index += 2;
} else {
add_token(l, TOKEN_MINUS, 1);
l->index += 1;
}
return true;
case '/':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_SLASH_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_SLASH, 1);
l->index += 1;
}
return true;
case '*':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_STAR_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_STAR, 1);
l->index += 1;
}
return true;
case '%':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PERC_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PERC, 1);
l->index += 1;
}
return true;
case '&':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_AND_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '&') {
add_token(l, TOKEN_DOUBLE_AND, 2);
l->index += 2;
} else {
add_token(l, TOKEN_AND, 1);
l->index += 1;
}
return true;
case '^':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_HAT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_HAT, 1);
l->index += 1;
}
return true;
case '|':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PIPE_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '|') {
add_token(l, TOKEN_OR, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PIPE, 1);
l->index += 1;
}
return true;
case '=':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_DOUBLE_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_EQ, 1);
l->index += 1;
}
return true;
case '>':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_GREATER_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_RSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_RSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_GREATER_THAN, 1);
l->index += 1;
}
return true;
case '<':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_LESS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '<') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_LSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_LSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_LESS_THAN, 1);
l->index += 1;
}
return true;
case '!':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_NOT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_BANG, 1);
l->index += 1;
}
return true;
case ':':
add_token(l, TOKEN_COLON, 1);
l->index += 1;
return true;
case ';':
add_token(l, TOKEN_SEMICOLON, 1);
l->index += 1;
return true;
case '.':
add_token(l, TOKEN_DOT, 1);
l->index += 1;
return true;
case ',':
add_token(l, TOKEN_COMMA, 1);
l->index += 1;
return true;
case '(':
add_token(l, TOKEN_LPAREN, 1);
l->index += 1;
return true;
case ')':
add_token(l, TOKEN_RPAREN, 1);
l->index += 1;
return true;
case '[':
add_token(l, TOKEN_LSQUARE, 1);
l->index += 1;
return true;
case ']':
add_token(l, TOKEN_RSQUARE, 1);
l->index += 1;
return true;
case '{':
add_token(l, TOKEN_LCURLY, 1);
l->index += 1;
return true;
case '}':
add_token(l, TOKEN_RCURLY, 1);
l->index += 1;
return true;
case '\'':
if (l->source[l->index+1] == '\\') {
if (l->source[l->index+3] != '\'') {
add_error(l, "unclosed character literal.");
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 2);
l->index += 3;
return true;
} else {
if (l->source[l->index+2] != '\'') {
add_error(l, "unclosed character literal.");
return false;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 1);
l->index += 2;
return true;
}
default:
return false;
}
}
/*
 * Main scanning loop: walk the source, track row/column, and dispatch
 * to the specialised scanners.  The bound is strict (<) so the
 * terminating NUL at source[size] is never processed.
 */
static void parse(lexer *l)
{
	while (l->index < l->size) {
		char c = l->source[l->index];
		l->column += 1;
		if (c == '\n') {
			l->index += 1;
			l->row += 1;
			l->column = 0;
			continue;
		}
		if (isspace((unsigned char)c)) {
			l->index += 1;
			continue;
		}
		usize head = l->index;
		if (parse_special(l)) {
			/* one column was already counted; add the rest */
			l->column += (l->index - head - 1);
			continue;
		}
		if (isdigit((unsigned char)c)) {
			parse_number(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* '_' starts an identifier too (parse_identifier accepts it) */
		if (isalpha((unsigned char)c) || c == '_') {
			parse_identifier(l);
			l->column += (l->index - head - 1);
			continue;
		}
		if (c == '"') {
			l->index += 1;	/* skip the opening quote */
			parse_string(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* unrecognised character: skip it silently */
		l->index += 1;
	}
}
/*
 * Create a lexer over `source` (`size` bytes, must be NUL-terminated —
 * parse_string relies on the '\0'), build the keyword trie and run the
 * scanner.  All memory comes from `arena`, which the caller owns.
 */
lexer *lexer_init(char *source, usize size, arena *arena)
{
	lexer *lex = arena_alloc(arena, sizeof(lexer));
	lex->column = 0;
	lex->row = 0;
	lex->index = 0;
	lex->size = size;
	lex->tokens = 0;
	lex->tail = 0;
	lex->allocator = arena;
	lex->source = source;
	keywords = arena_alloc(arena, sizeof(trie_node));
	/* arena memory is uninitialized; trie_insert relies on NULL children */
	memset(keywords, 0, sizeof(trie_node));
	trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE);
	trie_insert(keywords, lex->allocator, "for", TOKEN_FOR);
	trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO);
	trie_insert(keywords, lex->allocator, "if", TOKEN_IF);
	trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE);
	trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH);
	trie_insert(keywords, lex->allocator, "case", TOKEN_CASE);
	trie_insert(keywords, lex->allocator, "do", TOKEN_DO);
	trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER);
	trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE);
	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
	parse(lex);
	return lex;
}

87
utils.c Normal file
View file

@ -0,0 +1,87 @@
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*
 * Insert `key` → `value` into the trie rooted at `root`.
 * Intermediate nodes are allocated from `a` and zero-initialized.
 * Index with unsigned char: plain char may be negative, which would
 * index children[] out of bounds.
 */
void trie_insert(trie_node *root, arena *a, char *key, uint16_t value)
{
	trie_node *node = root;
	while (*key) {
		unsigned char idx = (unsigned char)*key;
		if (!node->children[idx]) {
			node->children[idx] = arena_alloc(a, sizeof(trie_node));
			memset(node->children[idx], 0x0, sizeof(trie_node));
		}
		node = node->children[idx];
		key++;
	}
	node->value = value;
}
/*
 * Look up the first `len` bytes of `key` in the trie.
 * Returns the stored value, or 0 when the key is absent.
 * Index with unsigned char: plain char may be negative, which would
 * index children[] out of bounds.
 */
uint16_t trie_get(trie_node *root, char *key, usize len)
{
	trie_node *node = root;
	for (usize i = 0; i < len; i++) {
		unsigned char idx = (unsigned char)key[i];
		if (!node->children[idx]) {
			return 0;
		}
		node = node->children[idx];
	}
	return node->value;
}
#ifndef DEFAULT_ALIGNMENT
#define DEFAULT_ALIGNMENT (2 * sizeof(void *))
#endif
/*
 * Round `ptr` up to the next multiple of `align`.
 * `align` must be a power of two (DEFAULT_ALIGNMENT is).
 */
static usize align_forward(usize ptr, usize align) {
	uintptr_t mask = (uintptr_t)align - 1;
	return (usize)(((uintptr_t)ptr + mask) & ~mask);
}
/*
 * Allocate a fresh arena of `size` bytes.  A failed allocation is
 * signalled by `.memory == NULL`; the caller must check (see utils.h).
 */
arena arena_init(usize size)
{
	arena a;
	a.capacity = size;
	a.position = 0;
	a.memory = malloc(size);
	return a;
}
/*
 * Bump-allocate `size` bytes from `a`, aligned to DEFAULT_ALIGNMENT.
 * Returns NULL when the arena does not have enough space left.
 */
void *arena_alloc(arena *a, usize size) {
	uintptr_t base = (uintptr_t)a->memory + a->position;
	usize padding = align_forward(base, DEFAULT_ALIGNMENT) - base;
	if (a->position + padding + size > a->capacity)
		return NULL;
	void *out = (unsigned char *)a->memory + a->position + padding;
	a->position += size + padding;
	return out;
}
/* Capture the arena's current allocation position for later rollback. */
snapshot arena_snapshot(arena a)
{
return a.position;
}
/* Roll the arena back to `s`; memory allocated after it is reusable. */
void arena_reset_to_snapshot(arena *a, snapshot s)
{
a->position = s;
}
/* Reset the arena to empty (position 0); capacity is kept. */
void arena_reset(arena *a)
{
arena_reset_to_snapshot(a, 0);
}
/* Release the arena's backing memory.  The arena must not be used after. */
void arena_deinit(arena a)
{
free(a.memory);
}

61
utils.h Normal file
View file

@ -0,0 +1,61 @@
#ifndef UTILS_H
#define UTILS_H
#include <stdint.h>
#include <stddef.h>

/* Fixed-width shorthand types. */
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef size_t usize;
typedef float f32;
typedef double f64;

/* Linear (bump) allocator. */
typedef struct {
	usize capacity;   /* total bytes in `memory` */
	usize position;   /* bytes handed out so far */
	void* memory;
} arena;
typedef usize snapshot;
/*
 * NOTE(ernesto): faulty initialization is signalled by arena.memory
 * being NULL.  It is the responsibility of the caller to check for
 * faulty initialization.
 */
arena arena_init(usize size);
/*
 * Returns NULL on unsuccessful allocation.
 * In this implementation an allocation is only unsuccessful if the arena
 * does not have enough memory to allocate the requested space.
 */
void *arena_alloc(arena *a, usize size);
snapshot arena_snapshot(arena a);
void arena_reset_to_snapshot(arena *a, snapshot s);
void arena_reset(arena *a);
/* This call should never fail; also, do we even care if it does? */
void arena_deinit(arena a);

/* 256-way trie mapping byte strings to small nonzero values (0 = absent). */
typedef struct _trie_node {
	uint16_t value;
	struct _trie_node *children[256];
} trie_node;
void trie_insert(trie_node *root, arena *a, char *key, uint16_t value);
uint16_t trie_get(trie_node *root, char *key, usize len);

/* Row/column location of a token in the source text. */
typedef struct {
	usize row, column;
} source_pos;
#endif