From f1675bca760f1967a0a263aaba81d54a8668d0f5 Mon Sep 17 00:00:00 2001 From: Lorenzo Torres Date: Sun, 30 Nov 2025 12:58:35 +0100 Subject: [PATCH] implemented the lexer --- .gitignore | 3 + LICENSE | 24 +++ Makefile | 55 ++++++ README | 33 ++++ cc.c | 24 +++ config.def.h | 4 + config.h | 4 + config.mk | 27 +++ lexer.c | 404 ++++++++++++++++++++++++++++++++++++++++++ lexer.h | 95 ++++++++++ test.c | 482 +++++++++++++++++++++++++++++++++++++++++++++++++++ utils.c | 87 ++++++++++ utils.h | 61 +++++++ 13 files changed, 1303 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README create mode 100644 cc.c create mode 100644 config.def.h create mode 100644 config.h create mode 100644 config.mk create mode 100644 lexer.c create mode 100644 lexer.h create mode 100644 test.c create mode 100644 utils.c create mode 100644 utils.h diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..db88bbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +**/*.o +**/*~ +cc diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c71ceed --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2025, Lorenzo Torres +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY ''AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..92dc81f --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@ +# cc - C compiler +# See LICENSE file for copyright and license details. + +include config.mk + +SRC = cc.c utils.c lexer.c +HDR = config.def.h utils.h +OBJ = ${SRC:.c=.o} + +all: options cc + +options: + @echo cc build options: + @echo "CFLAGS = ${CFLAGS}" + @echo "LDFLAGS = ${LDFLAGS}" + @echo "CC = ${CC}" + +.c.o: + ${CC} -c ${CFLAGS} $< + +${OBJ}: config.h config.mk + +config.h: + cp config.def.h $@ + +users.h: + cp users.def.h $@ + +cc: ${OBJ} + ${CC} -o $@ ${OBJ} ${LDFLAGS} + +clean: + rm -f cc ${OBJ} cc-${VERSION}.tar.gz + +dist: clean + mkdir -p cc-${VERSION} + cp -R LICENSE Makefile README config.mk\ + cc.1 ${HDR} ${SRC} cc-${VERSION} + tar -cf cc-${VERSION}.tar cc-${VERSION} + gzip cc-${VERSION}.tar + rm -rf cc-${VERSION} + +install: all + mkdir -p ${DESTDIR}${PREFIX}/bin + cp -f cc ${DESTDIR}${PREFIX}/bin + chmod 755 ${DESTDIR}${PREFIX}/bin/cc + mkdir -p ${DESTDIR}${MANPREFIX}/man1 + sed "s/VERSION/${VERSION}/g" < cc.1 > ${DESTDIR}${MANPREFIX}/man1/cc.1 + chmod 644 ${DESTDIR}${MANPREFIX}/man1/cc.1 + +uninstall: + rm -f ${DESTDIR}${PREFIX}/bin/cc\ + ${DESTDIR}${MANPREFIX}/man1/cc.1 + +.PHONY: all options clean dist install uninstall diff --git a/README 
b/README
new file mode 100644
index 0000000..17ccbb6
--- /dev/null
+++ b/README
@@ -0,0 +1,33 @@
+sis - simple imap server
+============================
+sis is an IMAP server, following the unix philosophy,
+trying to be as small as possible while providing
+a reliable service.
+
+
+Requirements
+------------
+In order to build sis you need... a computer
+
+
+Installation
+------------
+Edit config.mk to match your local setup (sis is installed into
+the /usr/local namespace by default).
+
+Afterwards enter the following command to build and install sis (if
+necessary as root):
+
+    make clean install
+
+
+Running sis
+-----------
+By default, sis runs in daemon mode, if you want to avoid detaching use the -d option
+    sis -d
+
+
+Configuration
+-------------
+The configuration of sis is done by creating a custom config.h
+and (re)compiling the source code.
diff --git a/cc.c b/cc.c
new file mode 100644
index 0000000..9b59dd4
--- /dev/null
+++ b/cc.c
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "utils.h"
+#include "lexer.h"
+
+/*
+ * Loads test.c into a NUL-terminated buffer and runs the lexer over it.
+ * Returns 0 on success, or 1 when the file cannot be read or memory
+ * cannot be allocated.
+ */
+int main(void)
+{
+	FILE *fp = fopen("test.c", "r");
+	if (!fp) {
+		perror("test.c");
+		return 1;
+	}
+
+	/* Seek to the end to learn the file size; ftell returns -1 on error. */
+	long end = -1;
+	if (fseek(fp, 0, SEEK_END) == 0) {
+		end = ftell(fp);
+	}
+	if (end < 0 || fseek(fp, 0, SEEK_SET) != 0) {
+		perror("test.c");
+		fclose(fp);
+		return 1;
+	}
+
+	char *src = malloc((usize)end + 1);
+	if (!src) {
+		perror("malloc");
+		fclose(fp);
+		return 1;
+	}
+	/* Use the byte count fread actually delivered, not the ftell estimate. */
+	usize size = fread(src, 1, (usize)end, fp);
+	if (ferror(fp)) {
+		perror("test.c");
+		free(src);
+		fclose(fp);
+		return 1;
+	}
+	fclose(fp);
+	src[size] = '\0';
+
+	arena a = arena_init(0x1000 * 0x1000 * 64);
+	if (!a.memory) {
+		fputs("cc: cannot allocate arena\n", stderr);
+		free(src);
+		return 1;
+	}
+	lexer *l = lexer_init(src, size, &a);
+	(void)l; /* tokens are not consumed yet */
+
+	arena_deinit(a);
+	free(src);
+
+	return 0;
+}
diff --git a/config.def.h b/config.def.h
new file mode 100644
index 0000000..184290d
--- /dev/null
+++ b/config.def.h
@@ -0,0 +1,4 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#endif
diff --git a/config.h b/config.h
new file mode 100644
index 0000000..184290d
--- /dev/null
+++ b/config.h
@@ -0,0 +1,4 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+#endif
diff --git a/config.mk b/config.mk
new file mode 100644
index 0000000..c797027
--- /dev/null
+++ b/config.mk
@@ -0,0 +1,27 @@
+# cc version
+VERSION = 0.1
+
+# Customize below to fit your system
+
+# paths
+PREFIX = /usr
+MANPREFIX = 
${PREFIX}/share/man
+
+# OpenBSD (uncomment)
+#MANPREFIX = ${PREFIX}/man
+
+# includes and libs
+INCS = -I.
+LIBS =
+# flags
+CPPFLAGS = -DVERSION=\"${VERSION}\"
+CFLAGS := -std=c99 -pedantic -Wall -O0 ${INCS} ${CPPFLAGS}
+CFLAGS := ${CFLAGS} -g
+LDFLAGS = ${LIBS}
+
+# Solaris
+#CFLAGS = -fast ${INCS} -DVERSION=\"${VERSION}\"
+#LDFLAGS = ${LIBS}
+
+# compiler and linker
+CC = cc
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..b87bc96
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,417 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+trie_node *keywords;
+
+/*
+ * Appends a token of the given type to the lexer's token list. The lexeme
+ * is the len bytes of source starting at the current index; the position
+ * is the lexer's current row and column.
+ */
+static void add_token(lexer *l, token_type type, usize len)
+{
+	token *t = arena_alloc(l->allocator, sizeof(token));
+	t->type = type;
+	t->lexeme_len = len;
+	t->lexeme = l->source + l->index;
+	t->position.row = l->row;
+	t->position.column = l->column;
+	/* The new token is always the list tail, so it has no successor. */
+	t->next = NULL;
+
+	if (!l->tokens) {
+		l->tokens = t;
+		l->tail = t;
+	} else {
+		l->tail->next = t;
+		l->tail = t;
+	}
+}
+
+/*
+ * Appends a TOKEN_ERROR token whose lexeme is msg itself (not a slice of
+ * the source), positioned at the lexer's current row and column.
+ */
+static void add_error(lexer *l, char *msg)
+{
+	token *t = arena_alloc(l->allocator, sizeof(token));
+	t->type = TOKEN_ERROR;
+	t->lexeme_len = strlen(msg);
+	t->lexeme = msg;
+	t->position.row = l->row;
+	t->position.column = l->column;
+	/* The new token is always the list tail, so it has no successor. */
+	t->next = NULL;
+
+	if (!l->tokens) {
+		l->tokens = t;
+		l->tail = t;
+	} else {
+		l->tail->next = t;
+		l->tail = t;
+	}
+}
+
+static void parse_number(lexer *l)
+{
+	char c = l->source[l->index];
+	/* Is the number a float? */
+	bool f = false;
+	usize len = 0;
+
+	while (isdigit(c)) {
+		/* If a dot is found, and the character after it is a digit, this is a float. */
+		if (l->source[l->index+1] == '.'
&& isdigit(l->source[l->index+2])) { + f = true; + len += 3; + l->index += 3; + } else { + len += 1; + l->index += 1; + } + c = l->source[l->index]; + } + l->index -= len; + if (f) { + add_token(l, TOKEN_FLOAT, len); + } else { + add_token(l, TOKEN_INTEGER, len); + } + l->index += len; +} + +static void parse_identifier(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (isalnum(c) || c == '_') { + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + token_type keyword = trie_get(keywords, l->source + l->index, len); + if (keyword) { + add_token(l, keyword, len); + } else { + add_token(l, TOKEN_IDENTIFIER, len); + } + l->index += len; +} + +static void parse_string(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (c != '"') { + if (c == '\0' || c == '\n') { + l->index -= len; + add_error(l, "unclosed string literal."); + l->index += len; + return; + } + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + add_token(l, TOKEN_STRING, len); + l->index += len + 1; +} + +static bool parse_special(lexer *l) +{ + switch (l->source[l->index]) { + case '+': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PLUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '+') { + add_token(l, TOKEN_PLUS_PLUS, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PLUS, 1); + l->index += 1; + } + return true; + case '-': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_MINUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '-') { + add_token(l, TOKEN_MINUS_MINUS, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + add_token(l, TOKEN_ARROW, 2); + l->index += 2; + } else { + add_token(l, TOKEN_MINUS, 1); + l->index += 1; + } + return true; + case '/': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_SLASH_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_SLASH, 1); + l->index += 1; + } + return true; + case '*': + if 
(l->source[l->index+1] == '=') { + add_token(l, TOKEN_STAR_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_STAR, 1); + l->index += 1; + } + return true; + case '%': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PERC_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PERC, 1); + l->index += 1; + } + return true; + case '&': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_AND_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '&') { + add_token(l, TOKEN_DOUBLE_AND, 2); + l->index += 2; + } else { + add_token(l, TOKEN_AND, 1); + l->index += 1; + } + return true; + case '^': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_HAT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_HAT, 1); + l->index += 1; + } + return true; + case '|': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PIPE_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '|') { + add_token(l, TOKEN_OR, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PIPE, 1); + l->index += 1; + } + return true; + case '=': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_DOUBLE_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_EQ, 1); + l->index += 1; + } + return true; + case '>': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_GREATER_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_RSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_RSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_GREATER_THAN, 1); + l->index += 1; + } + return true; + case '<': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_LESS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '<') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_LSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_LSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_LESS_THAN, 1); + 
l->index += 1;
+		}
+		return true;
+	case '!':
+		if (l->source[l->index+1] == '=') {
+			add_token(l, TOKEN_NOT_EQ, 2);
+			l->index += 2;
+		} else {
+			add_token(l, TOKEN_BANG, 1);
+			l->index += 1;
+		}
+		return true;
+	case ':':
+		add_token(l, TOKEN_COLON, 1);
+		l->index += 1;
+		return true;
+	case ';':
+		add_token(l, TOKEN_SEMICOLON, 1);
+		l->index += 1;
+		return true;
+	case '.':
+		add_token(l, TOKEN_DOT, 1);
+		l->index += 1;
+		return true;
+	case ',':
+		add_token(l, TOKEN_COMMA, 1);
+		l->index += 1;
+		return true;
+	case '(':
+		add_token(l, TOKEN_LPAREN, 1);
+		l->index += 1;
+		return true;
+	case ')':
+		add_token(l, TOKEN_RPAREN, 1);
+		l->index += 1;
+		return true;
+	case '[':
+		add_token(l, TOKEN_LSQUARE, 1);
+		l->index += 1;
+		return true;
+	case ']':
+		add_token(l, TOKEN_RSQUARE, 1);
+		l->index += 1;
+		return true;
+	case '{':
+		add_token(l, TOKEN_LCURLY, 1);
+		l->index += 1;
+		return true;
+	case '}':
+		add_token(l, TOKEN_RCURLY, 1);
+		l->index += 1;
+		return true;
+	case '\'': /* stop at a NUL before looking further ahead than index+1 */
+		if (l->source[l->index+1] == '\\') {
+			if (l->source[l->index+2] == '\0' || l->source[l->index+3] != '\'') {
+				add_error(l, "unclosed character literal.");
+				l->index += 1;
+				return true;
+			}
+			l->index += 1;
+			add_token(l, TOKEN_CHAR, 2);
+			l->index += 3;
+			return true;
+		} else {
+			if (l->source[l->index+1] == '\0' || l->source[l->index+2] != '\'') {
+				add_error(l, "unclosed character literal.");
+				l->index += 1;
+				return true;
+			}
+			l->index += 1;
+			add_token(l, TOKEN_CHAR, 1);
+			l->index += 2;
+			return true;
+		}
+	default:
+		return false;
+	}
+}
+/* Walks the source, skipping whitespace and dispatching on each token's first char. */
+static void parse(lexer *l)
+{
+	char c;
+
+	while (l->index < l->size) { /* source[size] is not input; do not scan it */
+		c = l->source[l->index];
+		l->column += 1;
+
+		if (c == '\n') {
+			l->index += 1;
+			l->row += 1;
+			l->column = 0;
+			continue;
+		}
+
+		if (isspace((unsigned char)c)) { /* ctype needs an unsigned char value */
+			l->index += 1;
+			continue;
+		}
+
+		usize head = l->index;
+
+		if (parse_special(l)) {
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (isdigit((unsigned char)c)) {
+			parse_number(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (isalpha((unsigned char)c) || c == '_') { /* identifiers may start with '_' */
+			parse_identifier(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		if (c == '"') {
+			l->index += 1;
+			parse_string(l);
+			l->column += (l->index - head - 1);
+			continue;
+		}
+
+		l->index += 1;
+	}
+}
+/* Allocates a lexer and the keyword trie from arena, then tokenizes source. */
+lexer *lexer_init(char *source, usize size, arena *arena)
+{
+	lexer *lex = arena_alloc(arena, sizeof(lexer));
+	lex->column = 0;
+	lex->row = 0;
+	lex->index = 0;
+	lex->size = size;
+	lex->tokens = 0;
+	lex->tail = 0;
+	lex->allocator = arena;
+	lex->source = source;
+
+	keywords = arena_alloc(arena, sizeof(trie_node));
+	memset(keywords, 0, sizeof(trie_node)); /* trie_insert reads root->children */
+	trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE);
+	trie_insert(keywords, lex->allocator, "for", TOKEN_FOR);
+	trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO);
+	trie_insert(keywords, lex->allocator, "if", TOKEN_IF);
+	trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE);
+	trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH);
+	trie_insert(keywords, lex->allocator, "case", TOKEN_CASE);
+	trie_insert(keywords, lex->allocator, "do", TOKEN_DO);
+	trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER);
+	trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE);
+	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
+	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
+	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
+	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
+	parse(lex);
+
+	return lex;
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..04961f8
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,95 @@
+#ifndef LEXER_H
+#define LEXER_H
+
+#include "utils.h"
+
+typedef enum {
+	TOKEN_ERROR,
+	TOKEN_END,
+
+	TOKEN_PLUS, // +
+	TOKEN_PLUS_PLUS, // ++
+	TOKEN_MINUS, // -
+	TOKEN_MINUS_MINUS, // --
+	TOKEN_SLASH, // /
+	TOKEN_PERC, // %
+	TOKEN_STAR, // *
+	TOKEN_AND, // &
+	TOKEN_HAT, // ^
+	TOKEN_PIPE, // |
+	TOKEN_EQ, // =
+	TOKEN_ARROW, // ->
+	TOKEN_LSHIFT, // <<
+	TOKEN_RSHIFT, // >>
+	TOKEN_DOUBLE_EQ, // ==
+	TOKEN_LESS_THAN, // <
+
TOKEN_GREATER_THAN, // > + TOKEN_LESS_EQ, // <= + TOKEN_GREATER_EQ, // >= + TOKEN_NOT_EQ, // != + TOKEN_PLUS_EQ, // += + TOKEN_MINUS_EQ, // -= + TOKEN_STAR_EQ, // *= + TOKEN_SLASH_EQ, // /= + TOKEN_AND_EQ, // &= + TOKEN_HAT_EQ, // ^= + TOKEN_PIPE_EQ, // |= + TOKEN_PERC_EQ, // %= + TOKEN_LSHIFT_EQ, // <<= + TOKEN_RSHIFT_EQ, // >>= + TOKEN_OR, // || + TOKEN_DOUBLE_AND, // && + TOKEN_COLON, // : + TOKEN_SEMICOLON, // ; + TOKEN_DOT, // . + TOKEN_BANG, // ! + TOKEN_COMMA, // , + TOKEN_LPAREN, // ( + TOKEN_RPAREN, // ) + TOKEN_LSQUARE, // [ + TOKEN_RSQUARE, // ] + TOKEN_LCURLY, // { + TOKEN_RCURLY, // } + + TOKEN_INTEGER, + TOKEN_FLOAT, + TOKEN_IDENTIFIER, + TOKEN_STRING, + TOKEN_CHAR, + + TOKEN_WHILE, + TOKEN_FOR, + TOKEN_GOTO, + TOKEN_IF, + TOKEN_ELSE, + TOKEN_SWITCH, + TOKEN_CASE, + TOKEN_DO, + TOKEN_DEFER, + TOKEN_MODULE, + + TOKEN_STATIC, + TOKEN_CONST, + TOKEN_EXTERN, + TOKEN_VOLATILE +} token_type; + +typedef struct _token { + token_type type; + source_pos position; + char *lexeme; + usize lexeme_len; + struct _token *next; +} token; + +typedef struct { + usize column, row, index, size; + char *source; + token *tokens; + token *tail; + arena *allocator; +} lexer; + +lexer *lexer_init(char *source, usize size, arena *arena); + +#endif diff --git a/test.c b/test.c new file mode 100644 index 0000000..27839c1 --- /dev/null +++ b/test.c @@ -0,0 +1,482 @@ +#include "lexer.h" +#include +#include +#include +#include + +static const char *token_type_str[] = { + [TOKEN_ERROR] = "TOKEN_ERROR", + [TOKEN_END] = "TOKEN_END", + + [TOKEN_PLUS] = "TOKEN_PLUS", + [TOKEN_PLUS_PLUS] = "TOKEN_PLUS_PLUS", + [TOKEN_MINUS] = "TOKEN_MINUS", + [TOKEN_MINUS_MINUS] = "TOKEN_MINUS_MINUS", + [TOKEN_SLASH] = "TOKEN_SLASH", + [TOKEN_PERC] = "TOKEN_PERC", + [TOKEN_STAR] = "TOKEN_STAR", + [TOKEN_AND] = "TOKEN_AND", + [TOKEN_HAT] = "TOKEN_HAT", + [TOKEN_PIPE] = "TOKEN_PIPE", + [TOKEN_EQ] = "TOKEN_EQ", + [TOKEN_ARROW] = "TOKEN_ARROW", + [TOKEN_LSHIFT] = "TOKEN_LSHIFT", + [TOKEN_RSHIFT] = 
"TOKEN_RSHIFT", + [TOKEN_DOUBLE_EQ] = "TOKEN_DOUBLE_EQ", + [TOKEN_LESS_THAN] = "TOKEN_LESS_THAN", + [TOKEN_GREATER_THAN] = "TOKEN_GREATER_THAN", + [TOKEN_LESS_EQ] = "TOKEN_LESS_EQ", + [TOKEN_GREATER_EQ] = "TOKEN_GREATER_EQ", + [TOKEN_NOT_EQ] = "TOKEN_NOT_EQ", + [TOKEN_PLUS_EQ] = "TOKEN_PLUS_EQ", + [TOKEN_MINUS_EQ] = "TOKEN_MINUS_EQ", + [TOKEN_STAR_EQ] = "TOKEN_STAR_EQ", + [TOKEN_SLASH_EQ] = "TOKEN_SLASH_EQ", + [TOKEN_AND_EQ] = "TOKEN_AND_EQ", + [TOKEN_HAT_EQ] = "TOKEN_HAT_EQ", + [TOKEN_PIPE_EQ] = "TOKEN_PIPE_EQ", + [TOKEN_PERC_EQ] = "TOKEN_PERC_EQ", + [TOKEN_LSHIFT_EQ] = "TOKEN_LSHIFT_EQ", + [TOKEN_RSHIFT_EQ] = "TOKEN_RSHIFT_EQ", + [TOKEN_OR] = "TOKEN_OR", + [TOKEN_DOUBLE_AND] = "TOKEN_DOUBLE_AND", + [TOKEN_COLON] = "TOKEN_COLON", + [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", + [TOKEN_DOT] = "TOKEN_DOT", + [TOKEN_BANG] = "TOKEN_BANG", + [TOKEN_COMMA] = "TOKEN_COMMA", + [TOKEN_LPAREN] = "TOKEN_LPAREN", + [TOKEN_RPAREN] = "TOKEN_RPAREN", + [TOKEN_LSQUARE] = "TOKEN_LSQUARE", + [TOKEN_RSQUARE] = "TOKEN_RSQUARE", + [TOKEN_LCURLY] = "TOKEN_LCURLY", + [TOKEN_RCURLY] = "TOKEN_RCURLY", + + [TOKEN_INTEGER] = "TOKEN_INTEGER", + [TOKEN_FLOAT] = "TOKEN_FLOAT", + [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER", + [TOKEN_STRING] = "TOKEN_STRING", + [TOKEN_CHAR] = "TOKEN_CHAR", + + [TOKEN_WHILE] = "TOKEN_WHILE", + [TOKEN_FOR] = "TOKEN_FOR", + [TOKEN_GOTO] = "TOKEN_GOTO", + [TOKEN_IF] = "TOKEN_IF", + [TOKEN_ELSE] = "TOKEN_ELSE", + [TOKEN_SWITCH] = "TOKEN_SWITCH", + [TOKEN_CASE] = "TOKEN_CASE", + [TOKEN_DO] = "TOKEN_DO", + [TOKEN_DEFER] = "TOKEN_DEFER", + [TOKEN_MODULE] = "TOKEN_MODULE", + + [TOKEN_STATIC] = "TOKEN_STATIC", + [TOKEN_CONST] = "TOKEN_CONST", + [TOKEN_EXTERN] = "TOKEN_EXTERN", + [TOKEN_VOLATILE] = "TOKEN_VOLATILE", +}; + +trie_node *keywords; + +void lexer_print_token(token *t) +{ + printf("%s: ", token_type_str[t->type]); + for (usize i=0; i < t->lexeme_len; i++) { + printf("%c", t->lexeme[i]); + } +} + +static void add_token(lexer *l, token_type type, usize len) +{ + token *t = 
arena_alloc(l->allocator, sizeof(token)); + t->type = type; + t->lexeme_len = len; + t->lexeme = l->source + l->index; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void add_error(lexer *l, char *msg) +{ + token *t = arena_alloc(l->allocator, sizeof(token)); + t->type = TOKEN_ERROR; + t->lexeme_len = strlen(msg); + t->lexeme = msg; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void parse_number(lexer *l) +{ + char c = l->source[l->index]; + /* Is the number a float? */ + bool f = false; + usize len = 0; + + while (isdigit(c)) { + /* If a dot is found, and the character after it is a digit, this is a float. */ + if (l->source[l->index+1] == '.' && isdigit(l->source[l->index+2])) { + f = true; + len += 3; + l->index += 3; + } else { + len += 1; + l->index += 1; + } + c = l->source[l->index]; + } + l->index -= len; + if (f) { + add_token(l, TOKEN_FLOAT, len); + } else { + add_token(l, TOKEN_INTEGER, len); + } + l->index += len; +} + +static void parse_identifier(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (isalnum(c) || c == '_') { + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + token_type keyword = trie_get(keywords, l->source + l->index, len); + if (keyword) { + add_token(l, keyword, len); + } else { + add_token(l, TOKEN_IDENTIFIER, len); + } + l->index += len; +} + +static void parse_string(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (c != '"') { + if (c == '\0' || c == '\n') { + printf("%c", c); + l->index -= len; + add_error(l, "unclosed string literal."); + l->index += len; + return; + } + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + add_token(l, TOKEN_STRING, len); + l->index 
+= len + 1; +} + +static bool parse_special(lexer *l) +{ + switch (l->source[l->index]) { + case '+': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PLUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '+') { + add_token(l, TOKEN_PLUS_PLUS, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PLUS, 1); + l->index += 1; + } + return true; + case '-': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_MINUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '-') { + add_token(l, TOKEN_MINUS_MINUS, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + add_token(l, TOKEN_ARROW, 2); + l->index += 2; + } else { + add_token(l, TOKEN_MINUS, 1); + l->index += 1; + } + return true; + case '/': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_SLASH_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_SLASH, 1); + l->index += 1; + } + return true; + case '*': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_STAR_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_STAR, 1); + l->index += 1; + } + return true; + case '%': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PERC_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PERC, 1); + l->index += 1; + } + return true; + case '&': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_AND_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '&') { + add_token(l, TOKEN_DOUBLE_AND, 2); + l->index += 2; + } else { + add_token(l, TOKEN_AND, 1); + l->index += 1; + } + return true; + case '^': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_HAT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_HAT, 1); + l->index += 1; + } + return true; + case '|': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PIPE_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '|') { + add_token(l, TOKEN_OR, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PIPE, 1); + l->index += 1; + } + return 
true; + case '=': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_DOUBLE_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_EQ, 1); + l->index += 1; + } + return true; + case '>': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_GREATER_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_RSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_RSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_GREATER_THAN, 1); + l->index += 1; + } + return true; + case '<': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_LESS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '<') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_LSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_LSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_LESS_THAN, 1); + l->index += 1; + } + return true; + case '!': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_NOT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_BANG, 1); + l->index += 1; + } + return true; + case ':': + add_token(l, TOKEN_COLON, 1); + l->index += 1; + return true; + case ';': + add_token(l, TOKEN_SEMICOLON, 1); + l->index += 1; + return true; + case '.': + add_token(l, TOKEN_DOT, 1); + l->index += 1; + return true; + case ',': + add_token(l, TOKEN_COMMA, 1); + l->index += 1; + return true; + case '(': + add_token(l, TOKEN_LPAREN, 1); + l->index += 1; + return true; + case ')': + add_token(l, TOKEN_RPAREN, 1); + l->index += 1; + return true; + case '[': + add_token(l, TOKEN_LSQUARE, 1); + l->index += 1; + return true; + case ']': + add_token(l, TOKEN_RSQUARE, 1); + l->index += 1; + return true; + case '{': + add_token(l, TOKEN_LCURLY, 1); + l->index += 1; + return true; + case '}': + add_token(l, TOKEN_RCURLY, 1); + l->index += 1; + return true; + case '\'': + if (l->source[l->index+1] == '\\') { + if (l->source[l->index+3] != 
'\'') { + add_error(l, "unclosed character literal."); + return true; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 2); + l->index += 3; + return true; + } else { + if (l->source[l->index+2] != '\'') { + add_error(l, "unclosed character literal."); + return false; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 1); + l->index += 2; + return true; + } + default: + return false; + } +} + +static void parse(lexer *l) +{ + char c; + + while (l->index <= l->size) { + c = l->source[l->index]; + l->column += 1; + + if (c == '\n') { + l->index += 1; + l->row += 1; + l->column = 0; + continue; + } + + if (isspace(c)) { + l->index += 1; + continue; + } + + usize head = l->index; + + if (parse_special(l)) { + l->column += (l->index - head - 1); + continue; + } + + if (isdigit(c)) { + parse_number(l); + l->column += (l->index - head - 1); + continue; + } + + if (isalpha(c)) { + parse_identifier(l); + l->column += (l->index - head - 1); + continue; + } + + if (c == '"') { + l->index += 1; + parse_string(l); + l->column += (l->index - head - 1); + continue; + } + + l->index += 1; + } +} + +lexer *lexer_init(char *source, usize size, arena *arena) +{ + lexer *lex = arena_alloc(arena, sizeof(lexer)); + lex->column = 0; + lex->row = 0; + lex->index = 0; + lex->size = size; + lex->tokens = 0; + lex->tail = 0; + lex->allocator = arena; + lex->source = source; + + keywords = arena_alloc(arena, sizeof(trie_node)); + trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE); + trie_insert(keywords, lex->allocator, "for", TOKEN_FOR); + trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO); + trie_insert(keywords, lex->allocator, "if", TOKEN_IF); + trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE); + trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH); + trie_insert(keywords, lex->allocator, "case", TOKEN_CASE); + trie_insert(keywords, lex->allocator, "do", TOKEN_DO); + trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER); + trie_insert(keywords, 
lex->allocator, "module", TOKEN_MODULE);
+	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
+	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
+	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
+	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
+
+	parse(lex);
+
+	return lex;
+}
diff --git a/utils.c b/utils.c
new file mode 100644
index 0000000..4080730
--- /dev/null
+++ b/utils.c
@@ -0,0 +1,115 @@
+#include "utils.h"
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Stores value at the node reached by following the NUL-terminated key
+ * from root, allocating missing nodes from arena a.
+ */
+void trie_insert(trie_node *root, arena *a, char *key, uint16_t value)
+{
+	trie_node *node = root;
+	while (*key) {
+		/* Plain char may be negative; index children by its unsigned value. */
+		usize slot = (unsigned char)*key;
+		if (!node->children[slot]) {
+			node->children[slot] = arena_alloc(a, sizeof(trie_node));
+			memset(node->children[slot], 0x0, sizeof(trie_node));
+		}
+		node = node->children[slot];
+
+		key++;
+	}
+
+	node->value = value;
+}
+
+/*
+ * Follows the first len bytes of key from root. Returns the value stored
+ * at the final node, or 0 when the path does not exist.
+ */
+uint16_t trie_get(trie_node *root, char *key, usize len)
+{
+	trie_node *node = root;
+	for (usize i=0; i < len; i++) {
+		/* Same unsigned indexing as trie_insert. */
+		usize slot = (unsigned char)key[i];
+		if (!node->children[slot]) {
+			return 0;
+		}
+		node = node->children[slot];
+	}
+
+	return node->value;
+}
+
+#ifndef DEFAULT_ALIGNMENT
+#define DEFAULT_ALIGNMENT (2 * sizeof(void *))
+#endif
+
+/*
+ * Rounds ptr up to the next multiple of align. The mask arithmetic is only
+ * valid when align is a power of two.
+ */
+static usize align_forward(usize ptr, usize align) {
+	uintptr_t p = ptr;
+	uintptr_t a = (uintptr_t)align;
+	uintptr_t modulo = p & (a - 1);
+
+	if (modulo != 0) {
+		p += a - modulo;
+	}
+	return (usize)p;
+}
+
+/*
+ * Creates an arena backed by a malloc'd buffer of size bytes. The memory
+ * field is NULL when malloc fails.
+ */
+arena arena_init(usize size)
+{
+	return (arena){
+		.capacity = size,
+		.position = 0,
+		.memory = malloc(size),
+	};
+}
+
+/*
+ * Returns size bytes aligned to DEFAULT_ALIGNMENT, or NULL when the arena
+ * has no backing memory or not enough space left. The bytes are not zeroed.
+ */
+void *arena_alloc(arena *a, usize size) {
+	if (!a->memory || a->position > a->capacity) return NULL;
+	uintptr_t current_addr = (uintptr_t)a->memory + a->position;
+	uintptr_t padding = align_forward(current_addr, DEFAULT_ALIGNMENT) - current_addr;
+	/* Compare against the space left so a huge size cannot wrap the sum. */
+	usize available = a->capacity - a->position;
+	if (padding > available || size > available - padding) return NULL;
+	void *ret = (unsigned char *)a->memory + a->position + padding;
+	a->position += (size + padding);
+
+	return ret;
+}
+
+/* Returns the arena's current position for a later arena_reset_to_snapshot. */
+snapshot arena_snapshot(arena a)
+{
+	return a.position;
+}
+
+void arena_reset_to_snapshot(arena *a, snapshot s)
+{
+	a->position = s; /* s is stored as-is; it is not checked against capacity */
+}
+/* Rewinds the arena to position 0; the backing buffer is kept. */
+void arena_reset(arena *a)
+{
+	arena_reset_to_snapshot(a, 0);
+}
+/* Frees the backing buffer. The arena is passed by value, so the caller's copy keeps a stale pointer. */
+void arena_deinit(arena a)
+{
+	free(a.memory);
+}
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..455e9e3
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,61 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#include
+#include
+#include
+/* Short aliases for the fixed-width integer types. */
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t i8;
+typedef int16_t i16;
+typedef int32_t i32;
+typedef int64_t i64;
+
+typedef size_t usize;
+
+typedef float f32;
+typedef double f64;
+/* Bump allocator over a single buffer. */
+typedef struct {
+	usize capacity; /* size in bytes requested from arena_init */
+	usize position; /* offset at which the next allocation starts */
+	void* memory; /* backing buffer; null when initialization failed */
+} arena;
+/* A saved arena position, as returned by arena_snapshot. */
+typedef usize snapshot;
+
+/*
+ * NOTE(ernesto): faulty initialization is signaled by arena.memory
+ * being null. It is the responsibility of the caller to check for faulty
+ * initialization.
+ */
+arena arena_init(usize size);
+/*
+ * Returns null on unsuccessful allocation.
+ * In this implementation an allocation is only unsuccessful if the arena
+ * does not have enough memory to allocate the requested space.
+ */
+void *arena_alloc(arena *a, usize size);
+snapshot arena_snapshot(arena a);
+void arena_reset_to_snapshot(arena *a, snapshot s);
+void arena_reset(arena *a);
+/* This call should never fail; also, do we even care if it does? */
+void arena_deinit(arena a);
+/* Trie node with one child slot per index 0..255. */
+typedef struct _trie_node {
+	uint16_t value; /* value stored for the key that ends at this node */
+	struct _trie_node *children[256];
+} trie_node;
+/* trie_insert takes a NUL-terminated key; trie_get takes an explicit length. */
+void trie_insert(trie_node *root, arena *a, char *key, uint16_t value);
+uint16_t trie_get(trie_node *root, char *key, usize len);
+/* Row and column of a token, as recorded by the lexer. */
+typedef struct {
+	usize row, column;
+} source_pos;
+
+#endif