commit f1675bca760f1967a0a263aaba81d54a8668d0f5 Author: Lorenzo Torres Date: Sun Nov 30 12:58:35 2025 +0100 implemented the lexer
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..db88bbc --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@
**/*.o
**/*~
cc
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c71ceed --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@
Copyright (c) 2025, Lorenzo Torres
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..92dc81f --- /dev/null +++ b/Makefile @@ -0,0 +1,55 @@
# cc - C compiler
# See LICENSE file for copyright and license details.

include config.mk

SRC = cc.c utils.c lexer.c
HDR = config.def.h utils.h
OBJ = ${SRC:.c=.o}

all: options cc

options:
	@echo cc build options:
	@echo "CFLAGS = ${CFLAGS}"
	@echo "LDFLAGS = ${LDFLAGS}"
	@echo "CC = ${CC}"

.c.o:
	${CC} -c ${CFLAGS} $<

${OBJ}: config.h config.mk

config.h:
	cp config.def.h $@

users.h:
	cp users.def.h $@

cc: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS}

clean:
	rm -f cc ${OBJ} cc-${VERSION}.tar.gz

dist: clean
	mkdir -p cc-${VERSION}
	cp -R LICENSE Makefile README config.mk\
		cc.1 ${HDR} ${SRC} cc-${VERSION}
	tar -cf cc-${VERSION}.tar cc-${VERSION}
	gzip cc-${VERSION}.tar
	rm -rf cc-${VERSION}

install: all
	mkdir -p ${DESTDIR}${PREFIX}/bin
	cp -f cc ${DESTDIR}${PREFIX}/bin
	chmod 755 ${DESTDIR}${PREFIX}/bin/cc
	mkdir -p ${DESTDIR}${MANPREFIX}/man1
	sed "s/VERSION/${VERSION}/g" < cc.1 > ${DESTDIR}${MANPREFIX}/man1/cc.1
	chmod 644 ${DESTDIR}${MANPREFIX}/man1/cc.1

uninstall:
	rm -f ${DESTDIR}${PREFIX}/bin/cc\
		${DESTDIR}${MANPREFIX}/man1/cc.1

.PHONY: all options clean dist install uninstall
diff --git a/README b/README new file mode 100644 index 0000000..17ccbb6 --- /dev/null +++ b/README @@ -0,0 +1,33 @@
cc - C compiler
============================
cc is a small C compiler, following the unix philosophy,
trying to be as small as possible while staying
reliable.


Requirements
------------
In order to build cc you need... a computer


Installation
------------
Edit config.mk to match your local setup (cc is installed into
the /usr namespace by default).
+ +Afterwards enter the following command to build and install sis (if +necessary as root): + + make clean install + + +Running sis +----------- +By default, sis runs in daemon mode, if you want to avoid detaching use the -d option + sis -d + + +Configuration +------------- +The configuration of sis is done by creating a custom config.h +and (re)compiling the source code. diff --git a/cc.c b/cc.c new file mode 100644 index 0000000..9b59dd4 --- /dev/null +++ b/cc.c @@ -0,0 +1,24 @@ +#include +#include +#include "utils.h" +#include "lexer.h" + +int main(void) +{ + FILE *fp = fopen("test.c", "r"); + usize size = 0; + fseek(fp, 0, SEEK_END); + size = ftell(fp); + fseek(fp, 0, SEEK_SET); + char *src = malloc(size+1); + fread(src, size, 1, fp); + fclose(fp); + src[size] = '\0'; + + arena a = arena_init(0x1000 * 0x1000 * 64); + lexer *l = lexer_init(src, size, &a); + + arena_deinit(a); + + return 0; +} diff --git a/config.def.h b/config.def.h new file mode 100644 index 0000000..184290d --- /dev/null +++ b/config.def.h @@ -0,0 +1,4 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#endif diff --git a/config.h b/config.h new file mode 100644 index 0000000..184290d --- /dev/null +++ b/config.h @@ -0,0 +1,4 @@ +#ifndef CONFIG_H +#define CONFIG_H + +#endif diff --git a/config.mk b/config.mk new file mode 100644 index 0000000..c797027 --- /dev/null +++ b/config.mk @@ -0,0 +1,27 @@ +# cc version +VERSION = 0.1 + +# Customize below to fit your system + +# paths +PREFIX = /usr +MANPREFIX = ${PREFIX}/share/man + +# OpenBSD (uncomment) +#MANPREFIX = ${PREFIX}/man + +# includes and libs +INCS = -I. 
+LIBS = +# flags +CPPFLAGS = -DVERSION=\"${VERSION}\" +CFLAGS := -std=c99 -pedantic -Wall -O0 ${INCS} ${CPPFLAGS} +CFLAGS := ${CFLAGS} -g +LDFLAGS = ${LIBS} + +# Solaris +#CFLAGS = -fast ${INCS} -DVERSION=\"${VERSION}\" +#LDFLAGS = ${LIBS} + +# compiler and linker +CC = cc diff --git a/lexer.c b/lexer.c new file mode 100644 index 0000000..b87bc96 --- /dev/null +++ b/lexer.c @@ -0,0 +1,404 @@ +#include "lexer.h" +#include +#include +#include +#include + +trie_node *keywords; + +static void add_token(lexer *l, token_type type, usize len) +{ + token *t = arena_alloc(l->allocator, sizeof(token)); + t->type = type; + t->lexeme_len = len; + t->lexeme = l->source + l->index; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void add_error(lexer *l, char *msg) +{ + token *t = arena_alloc(l->allocator, sizeof(token)); + t->type = TOKEN_ERROR; + t->lexeme_len = strlen(msg); + t->lexeme = msg; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void parse_number(lexer *l) +{ + char c = l->source[l->index]; + /* Is the number a float? */ + bool f = false; + usize len = 0; + + while (isdigit(c)) { + /* If a dot is found, and the character after it is a digit, this is a float. */ + if (l->source[l->index+1] == '.' 
&& isdigit(l->source[l->index+2])) { + f = true; + len += 3; + l->index += 3; + } else { + len += 1; + l->index += 1; + } + c = l->source[l->index]; + } + l->index -= len; + if (f) { + add_token(l, TOKEN_FLOAT, len); + } else { + add_token(l, TOKEN_INTEGER, len); + } + l->index += len; +} + +static void parse_identifier(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (isalnum(c) || c == '_') { + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + token_type keyword = trie_get(keywords, l->source + l->index, len); + if (keyword) { + add_token(l, keyword, len); + } else { + add_token(l, TOKEN_IDENTIFIER, len); + } + l->index += len; +} + +static void parse_string(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (c != '"') { + if (c == '\0' || c == '\n') { + l->index -= len; + add_error(l, "unclosed string literal."); + l->index += len; + return; + } + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + add_token(l, TOKEN_STRING, len); + l->index += len + 1; +} + +static bool parse_special(lexer *l) +{ + switch (l->source[l->index]) { + case '+': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PLUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '+') { + add_token(l, TOKEN_PLUS_PLUS, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PLUS, 1); + l->index += 1; + } + return true; + case '-': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_MINUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '-') { + add_token(l, TOKEN_MINUS_MINUS, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + add_token(l, TOKEN_ARROW, 2); + l->index += 2; + } else { + add_token(l, TOKEN_MINUS, 1); + l->index += 1; + } + return true; + case '/': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_SLASH_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_SLASH, 1); + l->index += 1; + } + return true; + case '*': + if 
(l->source[l->index+1] == '=') { + add_token(l, TOKEN_STAR_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_STAR, 1); + l->index += 1; + } + return true; + case '%': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PERC_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PERC, 1); + l->index += 1; + } + return true; + case '&': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_AND_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '&') { + add_token(l, TOKEN_DOUBLE_AND, 2); + l->index += 2; + } else { + add_token(l, TOKEN_AND, 1); + l->index += 1; + } + return true; + case '^': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_HAT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_HAT, 1); + l->index += 1; + } + return true; + case '|': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PIPE_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '|') { + add_token(l, TOKEN_OR, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PIPE, 1); + l->index += 1; + } + return true; + case '=': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_DOUBLE_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_EQ, 1); + l->index += 1; + } + return true; + case '>': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_GREATER_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_RSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_RSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_GREATER_THAN, 1); + l->index += 1; + } + return true; + case '<': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_LESS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '<') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_LSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_LSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_LESS_THAN, 1); + 
l->index += 1; + } + return true; + case '!': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_NOT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_BANG, 1); + l->index += 1; + } + return true; + case ':': + add_token(l, TOKEN_COLON, 1); + l->index += 1; + return true; + case ';': + add_token(l, TOKEN_SEMICOLON, 1); + l->index += 1; + return true; + case '.': + add_token(l, TOKEN_DOT, 1); + l->index += 1; + return true; + case ',': + add_token(l, TOKEN_COMMA, 1); + l->index += 1; + return true; + case '(': + add_token(l, TOKEN_LPAREN, 1); + l->index += 1; + return true; + case ')': + add_token(l, TOKEN_RPAREN, 1); + l->index += 1; + return true; + case '[': + add_token(l, TOKEN_LSQUARE, 1); + l->index += 1; + return true; + case ']': + add_token(l, TOKEN_RSQUARE, 1); + l->index += 1; + return true; + case '{': + add_token(l, TOKEN_LCURLY, 1); + l->index += 1; + return true; + case '}': + add_token(l, TOKEN_RCURLY, 1); + l->index += 1; + return true; + case '\'': + if (l->source[l->index+1] == '\\') { + if (l->source[l->index+3] != '\'') { + add_error(l, "unclosed character literal."); + l->index += 1; + return true; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 2); + l->index += 3; + return true; + } else { + if (l->source[l->index+2] != '\'') { + add_error(l, "unclosed character literal."); + l->index += 1; + return true; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 1); + l->index += 2; + return true; + } + default: + return false; + } +} + +static void parse(lexer *l) +{ + char c; + + while (l->index <= l->size) { + c = l->source[l->index]; + l->column += 1; + + if (c == '\n') { + l->index += 1; + l->row += 1; + l->column = 0; + continue; + } + + if (isspace(c)) { + l->index += 1; + continue; + } + + usize head = l->index; + + if (parse_special(l)) { + l->column += (l->index - head - 1); + continue; + } + + if (isdigit(c)) { + parse_number(l); + l->column += (l->index - head - 1); + continue; + } + + if (isalpha(c)) { + parse_identifier(l); 
+ l->column += (l->index - head - 1); + continue; + } + + if (c == '"') { + l->index += 1; + parse_string(l); + l->column += (l->index - head - 1); + continue; + } + + l->index += 1; + } +} + +lexer *lexer_init(char *source, usize size, arena *arena) +{ + lexer *lex = arena_alloc(arena, sizeof(lexer)); + lex->column = 0; + lex->row = 0; + lex->index = 0; + lex->size = size; + lex->tokens = 0; + lex->tail = 0; + lex->allocator = arena; + lex->source = source; + + keywords = arena_alloc(arena, sizeof(trie_node)); + trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE); + trie_insert(keywords, lex->allocator, "for", TOKEN_FOR); + trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO); + trie_insert(keywords, lex->allocator, "if", TOKEN_IF); + trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE); + trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH); + trie_insert(keywords, lex->allocator, "case", TOKEN_CASE); + trie_insert(keywords, lex->allocator, "do", TOKEN_DO); + trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER); + trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE); + trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC); + trie_insert(keywords, lex->allocator, "const", TOKEN_CONST); + trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN); + trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE); + + parse(lex); + + return lex; +} diff --git a/lexer.h b/lexer.h new file mode 100644 index 0000000..04961f8 --- /dev/null +++ b/lexer.h @@ -0,0 +1,95 @@ +#ifndef LEXER_H +#define LEXER_H + +#include "utils.h" + +typedef enum { + TOKEN_ERROR, + TOKEN_END, + + TOKEN_PLUS, // + + TOKEN_PLUS_PLUS, // ++ + TOKEN_MINUS, // - + TOKEN_MINUS_MINUS, // -- + TOKEN_SLASH, // / + TOKEN_PERC, // % + TOKEN_STAR, // * + TOKEN_AND, // & + TOKEN_HAT, // ^ + TOKEN_PIPE, // | + TOKEN_EQ, // = + TOKEN_ARROW, // -> + TOKEN_LSHIFT, // << + TOKEN_RSHIFT, // >> + TOKEN_DOUBLE_EQ, // == + TOKEN_LESS_THAN, // < + 
TOKEN_GREATER_THAN, // > + TOKEN_LESS_EQ, // <= + TOKEN_GREATER_EQ, // >= + TOKEN_NOT_EQ, // != + TOKEN_PLUS_EQ, // += + TOKEN_MINUS_EQ, // -= + TOKEN_STAR_EQ, // *= + TOKEN_SLASH_EQ, // /= + TOKEN_AND_EQ, // &= + TOKEN_HAT_EQ, // ^= + TOKEN_PIPE_EQ, // |= + TOKEN_PERC_EQ, // %= + TOKEN_LSHIFT_EQ, // <<= + TOKEN_RSHIFT_EQ, // >>= + TOKEN_OR, // || + TOKEN_DOUBLE_AND, // && + TOKEN_COLON, // : + TOKEN_SEMICOLON, // ; + TOKEN_DOT, // . + TOKEN_BANG, // ! + TOKEN_COMMA, // , + TOKEN_LPAREN, // ( + TOKEN_RPAREN, // ) + TOKEN_LSQUARE, // [ + TOKEN_RSQUARE, // ] + TOKEN_LCURLY, // { + TOKEN_RCURLY, // } + + TOKEN_INTEGER, + TOKEN_FLOAT, + TOKEN_IDENTIFIER, + TOKEN_STRING, + TOKEN_CHAR, + + TOKEN_WHILE, + TOKEN_FOR, + TOKEN_GOTO, + TOKEN_IF, + TOKEN_ELSE, + TOKEN_SWITCH, + TOKEN_CASE, + TOKEN_DO, + TOKEN_DEFER, + TOKEN_MODULE, + + TOKEN_STATIC, + TOKEN_CONST, + TOKEN_EXTERN, + TOKEN_VOLATILE +} token_type; + +typedef struct _token { + token_type type; + source_pos position; + char *lexeme; + usize lexeme_len; + struct _token *next; +} token; + +typedef struct { + usize column, row, index, size; + char *source; + token *tokens; + token *tail; + arena *allocator; +} lexer; + +lexer *lexer_init(char *source, usize size, arena *arena); + +#endif diff --git a/test.c b/test.c new file mode 100644 index 0000000..27839c1 --- /dev/null +++ b/test.c @@ -0,0 +1,482 @@ +#include "lexer.h" +#include +#include +#include +#include + +static const char *token_type_str[] = { + [TOKEN_ERROR] = "TOKEN_ERROR", + [TOKEN_END] = "TOKEN_END", + + [TOKEN_PLUS] = "TOKEN_PLUS", + [TOKEN_PLUS_PLUS] = "TOKEN_PLUS_PLUS", + [TOKEN_MINUS] = "TOKEN_MINUS", + [TOKEN_MINUS_MINUS] = "TOKEN_MINUS_MINUS", + [TOKEN_SLASH] = "TOKEN_SLASH", + [TOKEN_PERC] = "TOKEN_PERC", + [TOKEN_STAR] = "TOKEN_STAR", + [TOKEN_AND] = "TOKEN_AND", + [TOKEN_HAT] = "TOKEN_HAT", + [TOKEN_PIPE] = "TOKEN_PIPE", + [TOKEN_EQ] = "TOKEN_EQ", + [TOKEN_ARROW] = "TOKEN_ARROW", + [TOKEN_LSHIFT] = "TOKEN_LSHIFT", + [TOKEN_RSHIFT] = 
"TOKEN_RSHIFT", + [TOKEN_DOUBLE_EQ] = "TOKEN_DOUBLE_EQ", + [TOKEN_LESS_THAN] = "TOKEN_LESS_THAN", + [TOKEN_GREATER_THAN] = "TOKEN_GREATER_THAN", + [TOKEN_LESS_EQ] = "TOKEN_LESS_EQ", + [TOKEN_GREATER_EQ] = "TOKEN_GREATER_EQ", + [TOKEN_NOT_EQ] = "TOKEN_NOT_EQ", + [TOKEN_PLUS_EQ] = "TOKEN_PLUS_EQ", + [TOKEN_MINUS_EQ] = "TOKEN_MINUS_EQ", + [TOKEN_STAR_EQ] = "TOKEN_STAR_EQ", + [TOKEN_SLASH_EQ] = "TOKEN_SLASH_EQ", + [TOKEN_AND_EQ] = "TOKEN_AND_EQ", + [TOKEN_HAT_EQ] = "TOKEN_HAT_EQ", + [TOKEN_PIPE_EQ] = "TOKEN_PIPE_EQ", + [TOKEN_PERC_EQ] = "TOKEN_PERC_EQ", + [TOKEN_LSHIFT_EQ] = "TOKEN_LSHIFT_EQ", + [TOKEN_RSHIFT_EQ] = "TOKEN_RSHIFT_EQ", + [TOKEN_OR] = "TOKEN_OR", + [TOKEN_DOUBLE_AND] = "TOKEN_DOUBLE_AND", + [TOKEN_COLON] = "TOKEN_COLON", + [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", + [TOKEN_DOT] = "TOKEN_DOT", + [TOKEN_BANG] = "TOKEN_BANG", + [TOKEN_COMMA] = "TOKEN_COMMA", + [TOKEN_LPAREN] = "TOKEN_LPAREN", + [TOKEN_RPAREN] = "TOKEN_RPAREN", + [TOKEN_LSQUARE] = "TOKEN_LSQUARE", + [TOKEN_RSQUARE] = "TOKEN_RSQUARE", + [TOKEN_LCURLY] = "TOKEN_LCURLY", + [TOKEN_RCURLY] = "TOKEN_RCURLY", + + [TOKEN_INTEGER] = "TOKEN_INTEGER", + [TOKEN_FLOAT] = "TOKEN_FLOAT", + [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER", + [TOKEN_STRING] = "TOKEN_STRING", + [TOKEN_CHAR] = "TOKEN_CHAR", + + [TOKEN_WHILE] = "TOKEN_WHILE", + [TOKEN_FOR] = "TOKEN_FOR", + [TOKEN_GOTO] = "TOKEN_GOTO", + [TOKEN_IF] = "TOKEN_IF", + [TOKEN_ELSE] = "TOKEN_ELSE", + [TOKEN_SWITCH] = "TOKEN_SWITCH", + [TOKEN_CASE] = "TOKEN_CASE", + [TOKEN_DO] = "TOKEN_DO", + [TOKEN_DEFER] = "TOKEN_DEFER", + [TOKEN_MODULE] = "TOKEN_MODULE", + + [TOKEN_STATIC] = "TOKEN_STATIC", + [TOKEN_CONST] = "TOKEN_CONST", + [TOKEN_EXTERN] = "TOKEN_EXTERN", + [TOKEN_VOLATILE] = "TOKEN_VOLATILE", +}; + +trie_node *keywords; + +void lexer_print_token(token *t) +{ + printf("%s: ", token_type_str[t->type]); + for (usize i=0; i < t->lexeme_len; i++) { + printf("%c", t->lexeme[i]); + } +} + +static void add_token(lexer *l, token_type type, usize len) +{ + token *t = 
arena_alloc(l->allocator, sizeof(token)); + t->type = type; + t->lexeme_len = len; + t->lexeme = l->source + l->index; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void add_error(lexer *l, char *msg) +{ + token *t = arena_alloc(l->allocator, sizeof(token)); + t->type = TOKEN_ERROR; + t->lexeme_len = strlen(msg); + t->lexeme = msg; + t->position.row = l->row; + t->position.column = l->column; + + if (!l->tokens) { + l->tokens = t; + l->tail = t; + } else { + l->tail->next = t; + l->tail = t; + } +} + +static void parse_number(lexer *l) +{ + char c = l->source[l->index]; + /* Is the number a float? */ + bool f = false; + usize len = 0; + + while (isdigit(c)) { + /* If a dot is found, and the character after it is a digit, this is a float. */ + if (l->source[l->index+1] == '.' && isdigit(l->source[l->index+2])) { + f = true; + len += 3; + l->index += 3; + } else { + len += 1; + l->index += 1; + } + c = l->source[l->index]; + } + l->index -= len; + if (f) { + add_token(l, TOKEN_FLOAT, len); + } else { + add_token(l, TOKEN_INTEGER, len); + } + l->index += len; +} + +static void parse_identifier(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (isalnum(c) || c == '_') { + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + token_type keyword = trie_get(keywords, l->source + l->index, len); + if (keyword) { + add_token(l, keyword, len); + } else { + add_token(l, TOKEN_IDENTIFIER, len); + } + l->index += len; +} + +static void parse_string(lexer *l) +{ + char c = l->source[l->index]; + usize len = 0; + + while (c != '"') { + if (c == '\0' || c == '\n') { + printf("%c", c); + l->index -= len; + add_error(l, "unclosed string literal."); + l->index += len; + return; + } + len += 1; + l->index += 1; + c = l->source[l->index]; + } + l->index -= len; + add_token(l, TOKEN_STRING, len); + l->index 
+= len + 1; +} + +static bool parse_special(lexer *l) +{ + switch (l->source[l->index]) { + case '+': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PLUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '+') { + add_token(l, TOKEN_PLUS_PLUS, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PLUS, 1); + l->index += 1; + } + return true; + case '-': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_MINUS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '-') { + add_token(l, TOKEN_MINUS_MINUS, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + add_token(l, TOKEN_ARROW, 2); + l->index += 2; + } else { + add_token(l, TOKEN_MINUS, 1); + l->index += 1; + } + return true; + case '/': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_SLASH_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_SLASH, 1); + l->index += 1; + } + return true; + case '*': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_STAR_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_STAR, 1); + l->index += 1; + } + return true; + case '%': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PERC_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PERC, 1); + l->index += 1; + } + return true; + case '&': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_AND_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '&') { + add_token(l, TOKEN_DOUBLE_AND, 2); + l->index += 2; + } else { + add_token(l, TOKEN_AND, 1); + l->index += 1; + } + return true; + case '^': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_HAT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_HAT, 1); + l->index += 1; + } + return true; + case '|': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_PIPE_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '|') { + add_token(l, TOKEN_OR, 2); + l->index += 2; + } else { + add_token(l, TOKEN_PIPE, 1); + l->index += 1; + } + return 
true; + case '=': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_DOUBLE_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_EQ, 1); + l->index += 1; + } + return true; + case '>': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_GREATER_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '>') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_RSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_RSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_GREATER_THAN, 1); + l->index += 1; + } + return true; + case '<': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_LESS_EQ, 2); + l->index += 2; + } else if (l->source[l->index+1] == '<') { + if (l->source[l->index+2] == '=') { + add_token(l, TOKEN_LSHIFT_EQ, 3); + l->index += 3; + return true; + } + add_token(l, TOKEN_LSHIFT, 2); + l->index += 2; + } else { + add_token(l, TOKEN_LESS_THAN, 1); + l->index += 1; + } + return true; + case '!': + if (l->source[l->index+1] == '=') { + add_token(l, TOKEN_NOT_EQ, 2); + l->index += 2; + } else { + add_token(l, TOKEN_BANG, 1); + l->index += 1; + } + return true; + case ':': + add_token(l, TOKEN_COLON, 1); + l->index += 1; + return true; + case ';': + add_token(l, TOKEN_SEMICOLON, 1); + l->index += 1; + return true; + case '.': + add_token(l, TOKEN_DOT, 1); + l->index += 1; + return true; + case ',': + add_token(l, TOKEN_COMMA, 1); + l->index += 1; + return true; + case '(': + add_token(l, TOKEN_LPAREN, 1); + l->index += 1; + return true; + case ')': + add_token(l, TOKEN_RPAREN, 1); + l->index += 1; + return true; + case '[': + add_token(l, TOKEN_LSQUARE, 1); + l->index += 1; + return true; + case ']': + add_token(l, TOKEN_RSQUARE, 1); + l->index += 1; + return true; + case '{': + add_token(l, TOKEN_LCURLY, 1); + l->index += 1; + return true; + case '}': + add_token(l, TOKEN_RCURLY, 1); + l->index += 1; + return true; + case '\'': + if (l->source[l->index+1] == '\\') { + if (l->source[l->index+3] != 
'\'') { + add_error(l, "unclosed character literal."); + return true; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 2); + l->index += 3; + return true; + } else { + if (l->source[l->index+2] != '\'') { + add_error(l, "unclosed character literal."); + return false; + } + l->index += 1; + add_token(l, TOKEN_CHAR, 1); + l->index += 2; + return true; + } + default: + return false; + } +} + +static void parse(lexer *l) +{ + char c; + + while (l->index <= l->size) { + c = l->source[l->index]; + l->column += 1; + + if (c == '\n') { + l->index += 1; + l->row += 1; + l->column = 0; + continue; + } + + if (isspace(c)) { + l->index += 1; + continue; + } + + usize head = l->index; + + if (parse_special(l)) { + l->column += (l->index - head - 1); + continue; + } + + if (isdigit(c)) { + parse_number(l); + l->column += (l->index - head - 1); + continue; + } + + if (isalpha(c)) { + parse_identifier(l); + l->column += (l->index - head - 1); + continue; + } + + if (c == '"') { + l->index += 1; + parse_string(l); + l->column += (l->index - head - 1); + continue; + } + + l->index += 1; + } +} + +lexer *lexer_init(char *source, usize size, arena *arena) +{ + lexer *lex = arena_alloc(arena, sizeof(lexer)); + lex->column = 0; + lex->row = 0; + lex->index = 0; + lex->size = size; + lex->tokens = 0; + lex->tail = 0; + lex->allocator = arena; + lex->source = source; + + keywords = arena_alloc(arena, sizeof(trie_node)); + trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE); + trie_insert(keywords, lex->allocator, "for", TOKEN_FOR); + trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO); + trie_insert(keywords, lex->allocator, "if", TOKEN_IF); + trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE); + trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH); + trie_insert(keywords, lex->allocator, "case", TOKEN_CASE); + trie_insert(keywords, lex->allocator, "do", TOKEN_DO); + trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER); + trie_insert(keywords, 
lex->allocator, "module", TOKEN_MODULE); + trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC); + trie_insert(keywords, lex->allocator, "const", TOKEN_CONST); + trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN); + trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE); + + parse(lex); + + return lex; +} diff --git a/utils.c b/utils.c new file mode 100644 index 0000000..4080730 --- /dev/null +++ b/utils.c @@ -0,0 +1,87 @@ +#include "utils.h" +#include +#include +#include + +void trie_insert(trie_node *root, arena *a, char *key, uint16_t value) +{ + trie_node *node = root; + while (*key) { + if (!node->children[(usize)*key]) { + node->children[(usize)*key] = arena_alloc(a, sizeof(trie_node)); + memset(node->children[(usize)*key], 0x0, sizeof(trie_node)); + } + node = node->children[(usize)*key]; + + key++; + } + + node->value = value; +} + +uint16_t trie_get(trie_node *root, char *key, usize len) +{ + trie_node *node = root; + for (usize i=0; i < len; i++) { + if (!node->children[(usize)(key[i])]) { + return 0; + } + node = node->children[(usize)(key[i])]; + } + + return node->value; +} + +#ifndef DEFAULT_ALIGNMENT +#define DEFAULT_ALIGNMENT (2 * sizeof(void *)) +#endif + +static usize align_forward(usize ptr, usize align) { + uintptr_t p = ptr; + uintptr_t a = (uintptr_t)align; + uintptr_t modulo = p & (a - 1); + + if (modulo != 0) { + p += a - modulo; + } + return (usize)p; +} + +arena arena_init(usize size) +{ + return (arena){ + .capacity = size, + .position = 0, + .memory = malloc(size), + }; +} + +void *arena_alloc(arena *a, usize size) { + uintptr_t current_addr = (uintptr_t)a->memory + a->position; + uintptr_t padding = align_forward(current_addr, DEFAULT_ALIGNMENT) - current_addr; + if (a->position + padding + size > a->capacity) return NULL; + void *ret = (unsigned char *)a->memory + a->position + padding; + a->position += (size + padding); + + return ret; +} + +snapshot arena_snapshot(arena a) +{ + return a.position; +} + 
+void arena_reset_to_snapshot(arena *a, snapshot s) +{ + a->position = s; +} + +void arena_reset(arena *a) +{ + arena_reset_to_snapshot(a, 0); +} + +void arena_deinit(arena a) +{ + free(a.memory); +} diff --git a/utils.h b/utils.h new file mode 100644 index 0000000..455e9e3 --- /dev/null +++ b/utils.h @@ -0,0 +1,61 @@ +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t i8; +typedef int16_t i16; +typedef int32_t i32; +typedef int64_t i64; + +typedef size_t usize; + +typedef float f32; +typedef double f64; + +typedef struct { + usize capacity; + usize position; + void* memory; +} arena; + +typedef usize snapshot; + +/* + * NOTE(ernesto): faulty initialization is signalided by the arena.memory + * being null. It is the responsability of the caller to check for fulty + * initialization. + */ +arena arena_init(usize size); +/* + * Returns null on unsuccessfull allocation. + * In this implemention an allocation is only unsuccessfull if the arena + * does not have enough memory to allocate the requested space + */ +void *arena_alloc(arena *a, usize size); +snapshot arena_snapshot(arena a); +void arena_reset_to_snapshot(arena *a, snapshot s); +void arena_reset(arena *a); +/* This call should never fail, also, do we even care if it does? */ +void arena_deinit(arena a); + +typedef struct _trie_node { + uint16_t value; + struct _trie_node *children[256]; +} trie_node; + +void trie_insert(trie_node *root, arena *a, char *key, uint16_t value); +uint16_t trie_get(trie_node *root, char *key, usize len); + +typedef struct { + usize row, column; +} source_pos; + +#endif