implemented the lexer

This commit is contained in:
Lorenzo Torres 2025-11-30 12:58:35 +01:00
commit f1675bca76
13 changed files with 1303 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
**/*.o
**/*~
cc

24
LICENSE Normal file
View file

@ -0,0 +1,24 @@
Copyright (c) 2025, Lorenzo Torres
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ''AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

55
Makefile Normal file
View file

@ -0,0 +1,55 @@
# cc - C compiler
# See LICENSE file for copyright and license details.

include config.mk

SRC = cc.c utils.c lexer.c
# lexer.h included so `make dist` ships a buildable tree
HDR = config.def.h utils.h lexer.h
OBJ = ${SRC:.c=.o}

all: options cc

options:
	@echo cc build options:
	@echo "CFLAGS  = ${CFLAGS}"
	@echo "LDFLAGS = ${LDFLAGS}"
	@echo "CC      = ${CC}"

.c.o:
	${CC} -c ${CFLAGS} $<

${OBJ}: config.h config.mk

config.h:
	cp config.def.h $@

cc: ${OBJ}
	${CC} -o $@ ${OBJ} ${LDFLAGS}

clean:
	rm -f cc ${OBJ} cc-${VERSION}.tar.gz

dist: clean
	mkdir -p cc-${VERSION}
	cp -R LICENSE Makefile README config.mk \
		cc.1 ${HDR} ${SRC} cc-${VERSION}
	tar -cf cc-${VERSION}.tar cc-${VERSION}
	gzip cc-${VERSION}.tar
	rm -rf cc-${VERSION}

install: all
	mkdir -p ${DESTDIR}${PREFIX}/bin
	cp -f cc ${DESTDIR}${PREFIX}/bin
	chmod 755 ${DESTDIR}${PREFIX}/bin/cc
	mkdir -p ${DESTDIR}${MANPREFIX}/man1
	sed "s/VERSION/${VERSION}/g" < cc.1 > ${DESTDIR}${MANPREFIX}/man1/cc.1
	chmod 644 ${DESTDIR}${MANPREFIX}/man1/cc.1

uninstall:
	rm -f ${DESTDIR}${PREFIX}/bin/cc \
		${DESTDIR}${MANPREFIX}/man1/cc.1

.PHONY: all options clean dist install uninstall

33
README Normal file
View file

@ -0,0 +1,33 @@
cc - simple C compiler
======================
cc is a small C compiler, following the unix philosophy,
trying to be as small as possible while providing
a reliable tool.
Requirements
------------
In order to build cc you need... a computer
Installation
------------
Edit config.mk to match your local setup (cc is installed into
the /usr namespace by default).
Afterwards enter the following command to build and install cc (if
necessary as root):
    make clean install
Running cc
----------
At this stage cc reads test.c from the current directory and
runs the lexer over it.
Configuration
-------------
The configuration of cc is done by creating a custom config.h
and (re)compiling the source code.

24
cc.c Normal file
View file

@ -0,0 +1,24 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
#include "lexer.h"
int main(void)
{
FILE *fp = fopen("test.c", "r");
usize size = 0;
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
char *src = malloc(size+1);
fread(src, size, 1, fp);
fclose(fp);
src[size] = '\0';
arena a = arena_init(0x1000 * 0x1000 * 64);
lexer *l = lexer_init(src, size, &a);
arena_deinit(a);
return 0;
}

4
config.def.h Normal file
View file

@ -0,0 +1,4 @@
#ifndef CONFIG_H
#define CONFIG_H
#endif

4
config.h Normal file
View file

@ -0,0 +1,4 @@
#ifndef CONFIG_H
#define CONFIG_H
#endif

27
config.mk Normal file
View file

@ -0,0 +1,27 @@
# cc version
VERSION = 0.1
# Customize below to fit your system
# paths
PREFIX = /usr
MANPREFIX = ${PREFIX}/share/man
# OpenBSD (uncomment)
#MANPREFIX = ${PREFIX}/man
# includes and libs
INCS = -I.
LIBS =
# flags
CPPFLAGS = -DVERSION=\"${VERSION}\"
CFLAGS := -std=c99 -pedantic -Wall -O0 ${INCS} ${CPPFLAGS}
CFLAGS := ${CFLAGS} -g
LDFLAGS = ${LIBS}
# Solaris
#CFLAGS = -fast ${INCS} -DVERSION=\"${VERSION}\"
#LDFLAGS = ${LIBS}
# compiler and linker
CC = cc

404
lexer.c Normal file
View file

@ -0,0 +1,404 @@
#include "lexer.h"
#include <stdbool.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
trie_node *keywords;
/*
 * Append a token of `type`, `len` bytes long starting at the lexer's
 * current index, to the token list.
 * NOTE(review): arena_alloc may return NULL on exhaustion — unchecked here,
 * as in the rest of the file.
 */
static void add_token(lexer *l, token_type type, usize len)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = type;
	t->lexeme_len = len;
	t->lexeme = l->source + l->index;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Append an error token whose lexeme is the (static) message string.
 * The position is the lexer's current row/column.
 */
static void add_error(lexer *l, char *msg)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = TOKEN_ERROR;
	t->lexeme_len = strlen(msg);
	t->lexeme = msg;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Lex an integer or float literal starting at the current index.
 * A '.' is consumed only when surrounded by digits, so member access
 * like `p.x` is not mistaken for a float.
 * ctype arguments are cast to unsigned char (CERT STR37-C): plain char
 * may be negative, which is UB for isdigit().
 */
static void parse_number(lexer *l)
{
	bool is_float = false;
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	while (isdigit(c)) {
		/* digit '.' digit → consume all three and mark as float */
		if (l->source[l->index+1] == '.'
		    && isdigit((unsigned char)l->source[l->index+2])) {
			is_float = true;
			len += 3;
			l->index += 3;
		} else {
			len += 1;
			l->index += 1;
		}
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so add_token records the lexeme start, then skip past it */
	l->index -= len;
	add_token(l, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, len);
	l->index += len;
}
/*
 * Lex an identifier or keyword starting at the current index.
 * trie_get returns 0 (TOKEN_ERROR) for non-keywords, so a nonzero
 * result is the keyword's token type.
 */
static void parse_identifier(lexer *l)
{
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	/* cast for ctype: plain char may be negative (CERT STR37-C) */
	while (isalnum(c) || c == '_') {
		len += 1;
		l->index += 1;
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so the token's lexeme points at the first character */
	l->index -= len;
	token_type keyword = trie_get(keywords, l->source + l->index, len);
	add_token(l, keyword ? keyword : TOKEN_IDENTIFIER, len);
	l->index += len;
}
/*
 * Lex a string literal.  Called with the index on the first character
 * after the opening '"'.  The character following a backslash is
 * skipped so an escaped quote (\") does not terminate the literal.
 */
static void parse_string(lexer *l)
{
	usize len = 0;
	char c = l->source[l->index];
	while (c != '"') {
		if (c == '\0' || c == '\n') {
			l->index -= len;
			add_error(l, "unclosed string literal.");
			l->index += len;
			return;
		}
		/* include the escaped character, e.g. \" or \\ */
		if (c == '\\' && l->source[l->index+1] != '\0'
		    && l->source[l->index+1] != '\n') {
			len += 1;
			l->index += 1;
		}
		len += 1;
		l->index += 1;
		c = l->source[l->index];
	}
	l->index -= len;
	add_token(l, TOKEN_STRING, len);
	l->index += len + 1;	/* skip the lexeme plus the closing quote */
}
static bool parse_special(lexer *l)
{
switch (l->source[l->index]) {
case '+':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PLUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '+') {
add_token(l, TOKEN_PLUS_PLUS, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PLUS, 1);
l->index += 1;
}
return true;
case '-':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_MINUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '-') {
add_token(l, TOKEN_MINUS_MINUS, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
add_token(l, TOKEN_ARROW, 2);
l->index += 2;
} else {
add_token(l, TOKEN_MINUS, 1);
l->index += 1;
}
return true;
case '/':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_SLASH_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_SLASH, 1);
l->index += 1;
}
return true;
case '*':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_STAR_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_STAR, 1);
l->index += 1;
}
return true;
case '%':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PERC_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PERC, 1);
l->index += 1;
}
return true;
case '&':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_AND_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '&') {
add_token(l, TOKEN_DOUBLE_AND, 2);
l->index += 2;
} else {
add_token(l, TOKEN_AND, 1);
l->index += 1;
}
return true;
case '^':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_HAT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_HAT, 1);
l->index += 1;
}
return true;
case '|':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PIPE_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '|') {
add_token(l, TOKEN_OR, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PIPE, 1);
l->index += 1;
}
return true;
case '=':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_DOUBLE_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_EQ, 1);
l->index += 1;
}
return true;
case '>':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_GREATER_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_RSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_RSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_GREATER_THAN, 1);
l->index += 1;
}
return true;
case '<':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_LESS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '<') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_LSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_LSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_LESS_THAN, 1);
l->index += 1;
}
return true;
case '!':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_NOT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_BANG, 1);
l->index += 1;
}
return true;
case ':':
add_token(l, TOKEN_COLON, 1);
l->index += 1;
return true;
case ';':
add_token(l, TOKEN_SEMICOLON, 1);
l->index += 1;
return true;
case '.':
add_token(l, TOKEN_DOT, 1);
l->index += 1;
return true;
case ',':
add_token(l, TOKEN_COMMA, 1);
l->index += 1;
return true;
case '(':
add_token(l, TOKEN_LPAREN, 1);
l->index += 1;
return true;
case ')':
add_token(l, TOKEN_RPAREN, 1);
l->index += 1;
return true;
case '[':
add_token(l, TOKEN_LSQUARE, 1);
l->index += 1;
return true;
case ']':
add_token(l, TOKEN_RSQUARE, 1);
l->index += 1;
return true;
case '{':
add_token(l, TOKEN_LCURLY, 1);
l->index += 1;
return true;
case '}':
add_token(l, TOKEN_RCURLY, 1);
l->index += 1;
return true;
case '\'':
if (l->source[l->index+1] == '\\') {
if (l->source[l->index+3] != '\'') {
add_error(l, "unclosed character literal.");
l->index += 1;
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 2);
l->index += 3;
return true;
} else {
if (l->source[l->index+2] != '\'') {
add_error(l, "unclosed character literal.");
l->index += 1;
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 1);
l->index += 2;
return true;
}
default:
return false;
}
}
/*
 * Main scanning loop: walk the source, track row/column, and dispatch
 * to the specialised scanners.  The bound is strict (<) so the
 * terminating NUL at source[size] is never processed.
 */
static void parse(lexer *l)
{
	while (l->index < l->size) {
		char c = l->source[l->index];
		l->column += 1;
		if (c == '\n') {
			l->index += 1;
			l->row += 1;
			l->column = 0;
			continue;
		}
		if (isspace((unsigned char)c)) {
			l->index += 1;
			continue;
		}
		usize head = l->index;
		if (parse_special(l)) {
			/* one column was already counted; add the rest */
			l->column += (l->index - head - 1);
			continue;
		}
		if (isdigit((unsigned char)c)) {
			parse_number(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* '_' starts an identifier too (parse_identifier accepts it) */
		if (isalpha((unsigned char)c) || c == '_') {
			parse_identifier(l);
			l->column += (l->index - head - 1);
			continue;
		}
		if (c == '"') {
			l->index += 1;	/* skip the opening quote */
			parse_string(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* unrecognised character: skip it silently */
		l->index += 1;
	}
}
/*
 * Create a lexer over `source` (`size` bytes, must be NUL-terminated —
 * parse_string relies on the '\0'), build the keyword trie and run the
 * scanner.  All memory comes from `arena`, which the caller owns.
 */
lexer *lexer_init(char *source, usize size, arena *arena)
{
	lexer *lex = arena_alloc(arena, sizeof(lexer));
	lex->column = 0;
	lex->row = 0;
	lex->index = 0;
	lex->size = size;
	lex->tokens = 0;
	lex->tail = 0;
	lex->allocator = arena;
	lex->source = source;
	keywords = arena_alloc(arena, sizeof(trie_node));
	/* arena memory is uninitialized; trie_insert relies on NULL children */
	memset(keywords, 0, sizeof(trie_node));
	trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE);
	trie_insert(keywords, lex->allocator, "for", TOKEN_FOR);
	trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO);
	trie_insert(keywords, lex->allocator, "if", TOKEN_IF);
	trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE);
	trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH);
	trie_insert(keywords, lex->allocator, "case", TOKEN_CASE);
	trie_insert(keywords, lex->allocator, "do", TOKEN_DO);
	trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER);
	trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE);
	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
	parse(lex);
	return lex;
}

95
lexer.h Normal file
View file

@ -0,0 +1,95 @@
#ifndef LEXER_H
#define LEXER_H
#include "utils.h"
typedef enum {
TOKEN_ERROR,
TOKEN_END,
TOKEN_PLUS, // +
TOKEN_PLUS_PLUS, // ++
TOKEN_MINUS, // -
TOKEN_MINUS_MINUS, // --
TOKEN_SLASH, // /
TOKEN_PERC, // %
TOKEN_STAR, // *
TOKEN_AND, // &
TOKEN_HAT, // ^
TOKEN_PIPE, // |
TOKEN_EQ, // =
TOKEN_ARROW, // ->
TOKEN_LSHIFT, // <<
TOKEN_RSHIFT, // >>
TOKEN_DOUBLE_EQ, // ==
TOKEN_LESS_THAN, // <
TOKEN_GREATER_THAN, // >
TOKEN_LESS_EQ, // <=
TOKEN_GREATER_EQ, // >=
TOKEN_NOT_EQ, // !=
TOKEN_PLUS_EQ, // +=
TOKEN_MINUS_EQ, // -=
TOKEN_STAR_EQ, // *=
TOKEN_SLASH_EQ, // /=
TOKEN_AND_EQ, // &=
TOKEN_HAT_EQ, // ^=
TOKEN_PIPE_EQ, // |=
TOKEN_PERC_EQ, // %=
TOKEN_LSHIFT_EQ, // <<=
TOKEN_RSHIFT_EQ, // >>=
TOKEN_OR, // ||
TOKEN_DOUBLE_AND, // &&
TOKEN_COLON, // :
TOKEN_SEMICOLON, // ;
TOKEN_DOT, // .
TOKEN_BANG, // !
TOKEN_COMMA, // ,
TOKEN_LPAREN, // (
TOKEN_RPAREN, // )
TOKEN_LSQUARE, // [
TOKEN_RSQUARE, // ]
TOKEN_LCURLY, // {
TOKEN_RCURLY, // }
TOKEN_INTEGER,
TOKEN_FLOAT,
TOKEN_IDENTIFIER,
TOKEN_STRING,
TOKEN_CHAR,
TOKEN_WHILE,
TOKEN_FOR,
TOKEN_GOTO,
TOKEN_IF,
TOKEN_ELSE,
TOKEN_SWITCH,
TOKEN_CASE,
TOKEN_DO,
TOKEN_DEFER,
TOKEN_MODULE,
TOKEN_STATIC,
TOKEN_CONST,
TOKEN_EXTERN,
TOKEN_VOLATILE
} token_type;
/* One lexed token.  Tokens form a singly linked list in lexing order. */
typedef struct _token {
token_type type;
/* row/column where the lexeme starts (0-based, set in lexer.c) */
source_pos position;
/* points into the lexer's source buffer (or a static error message);
 * NOT NUL-terminated — use lexeme_len */
char *lexeme;
usize lexeme_len;
struct _token *next;
} token;
/* Lexer state; all token memory is drawn from `allocator`. */
typedef struct {
usize column, row, index, size;
char *source;
/* head and tail of the token list */
token *tokens;
token *tail;
arena *allocator;
} lexer;
/* Lex `source` (`size` bytes; assumed NUL-terminated — TODO confirm at
 * call sites) using memory from `arena`.  Returns the populated lexer. */
lexer *lexer_init(char *source, usize size, arena *arena);
#endif

482
test.c Normal file
View file

@ -0,0 +1,482 @@
#include "lexer.h"
#include <stdbool.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
static const char *token_type_str[] = {
[TOKEN_ERROR] = "TOKEN_ERROR",
[TOKEN_END] = "TOKEN_END",
[TOKEN_PLUS] = "TOKEN_PLUS",
[TOKEN_PLUS_PLUS] = "TOKEN_PLUS_PLUS",
[TOKEN_MINUS] = "TOKEN_MINUS",
[TOKEN_MINUS_MINUS] = "TOKEN_MINUS_MINUS",
[TOKEN_SLASH] = "TOKEN_SLASH",
[TOKEN_PERC] = "TOKEN_PERC",
[TOKEN_STAR] = "TOKEN_STAR",
[TOKEN_AND] = "TOKEN_AND",
[TOKEN_HAT] = "TOKEN_HAT",
[TOKEN_PIPE] = "TOKEN_PIPE",
[TOKEN_EQ] = "TOKEN_EQ",
[TOKEN_ARROW] = "TOKEN_ARROW",
[TOKEN_LSHIFT] = "TOKEN_LSHIFT",
[TOKEN_RSHIFT] = "TOKEN_RSHIFT",
[TOKEN_DOUBLE_EQ] = "TOKEN_DOUBLE_EQ",
[TOKEN_LESS_THAN] = "TOKEN_LESS_THAN",
[TOKEN_GREATER_THAN] = "TOKEN_GREATER_THAN",
[TOKEN_LESS_EQ] = "TOKEN_LESS_EQ",
[TOKEN_GREATER_EQ] = "TOKEN_GREATER_EQ",
[TOKEN_NOT_EQ] = "TOKEN_NOT_EQ",
[TOKEN_PLUS_EQ] = "TOKEN_PLUS_EQ",
[TOKEN_MINUS_EQ] = "TOKEN_MINUS_EQ",
[TOKEN_STAR_EQ] = "TOKEN_STAR_EQ",
[TOKEN_SLASH_EQ] = "TOKEN_SLASH_EQ",
[TOKEN_AND_EQ] = "TOKEN_AND_EQ",
[TOKEN_HAT_EQ] = "TOKEN_HAT_EQ",
[TOKEN_PIPE_EQ] = "TOKEN_PIPE_EQ",
[TOKEN_PERC_EQ] = "TOKEN_PERC_EQ",
[TOKEN_LSHIFT_EQ] = "TOKEN_LSHIFT_EQ",
[TOKEN_RSHIFT_EQ] = "TOKEN_RSHIFT_EQ",
[TOKEN_OR] = "TOKEN_OR",
[TOKEN_DOUBLE_AND] = "TOKEN_DOUBLE_AND",
[TOKEN_COLON] = "TOKEN_COLON",
[TOKEN_SEMICOLON] = "TOKEN_SEMICOLON",
[TOKEN_DOT] = "TOKEN_DOT",
[TOKEN_BANG] = "TOKEN_BANG",
[TOKEN_COMMA] = "TOKEN_COMMA",
[TOKEN_LPAREN] = "TOKEN_LPAREN",
[TOKEN_RPAREN] = "TOKEN_RPAREN",
[TOKEN_LSQUARE] = "TOKEN_LSQUARE",
[TOKEN_RSQUARE] = "TOKEN_RSQUARE",
[TOKEN_LCURLY] = "TOKEN_LCURLY",
[TOKEN_RCURLY] = "TOKEN_RCURLY",
[TOKEN_INTEGER] = "TOKEN_INTEGER",
[TOKEN_FLOAT] = "TOKEN_FLOAT",
[TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER",
[TOKEN_STRING] = "TOKEN_STRING",
[TOKEN_CHAR] = "TOKEN_CHAR",
[TOKEN_WHILE] = "TOKEN_WHILE",
[TOKEN_FOR] = "TOKEN_FOR",
[TOKEN_GOTO] = "TOKEN_GOTO",
[TOKEN_IF] = "TOKEN_IF",
[TOKEN_ELSE] = "TOKEN_ELSE",
[TOKEN_SWITCH] = "TOKEN_SWITCH",
[TOKEN_CASE] = "TOKEN_CASE",
[TOKEN_DO] = "TOKEN_DO",
[TOKEN_DEFER] = "TOKEN_DEFER",
[TOKEN_MODULE] = "TOKEN_MODULE",
[TOKEN_STATIC] = "TOKEN_STATIC",
[TOKEN_CONST] = "TOKEN_CONST",
[TOKEN_EXTERN] = "TOKEN_EXTERN",
[TOKEN_VOLATILE] = "TOKEN_VOLATILE",
};
trie_node *keywords;
/* Print a token's type name and its (non-NUL-terminated) lexeme to stdout. */
void lexer_print_token(token *t)
{
	printf("%s: %.*s", token_type_str[t->type],
	       (int)t->lexeme_len, t->lexeme);
}
/*
 * Append a token of `type`, `len` bytes long starting at the lexer's
 * current index, to the token list.
 */
static void add_token(lexer *l, token_type type, usize len)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = type;
	t->lexeme_len = len;
	t->lexeme = l->source + l->index;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/* Append an error token whose lexeme is the (static) message string. */
static void add_error(lexer *l, char *msg)
{
	token *t = arena_alloc(l->allocator, sizeof(token));
	t->type = TOKEN_ERROR;
	t->lexeme_len = strlen(msg);
	t->lexeme = msg;
	t->position.row = l->row;
	t->position.column = l->column;
	t->next = 0;	/* arena memory is uninitialized; terminate the list */
	if (!l->tokens) {
		l->tokens = t;
		l->tail = t;
	} else {
		l->tail->next = t;
		l->tail = t;
	}
}
/*
 * Lex an integer or float literal.  A '.' is consumed only when
 * surrounded by digits.  ctype arguments are cast to unsigned char
 * (CERT STR37-C): plain char may be negative, which is UB for isdigit().
 */
static void parse_number(lexer *l)
{
	bool is_float = false;
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	while (isdigit(c)) {
		/* digit '.' digit → consume all three and mark as float */
		if (l->source[l->index+1] == '.'
		    && isdigit((unsigned char)l->source[l->index+2])) {
			is_float = true;
			len += 3;
			l->index += 3;
		} else {
			len += 1;
			l->index += 1;
		}
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so add_token records the lexeme start, then skip past it */
	l->index -= len;
	add_token(l, is_float ? TOKEN_FLOAT : TOKEN_INTEGER, len);
	l->index += len;
}
/*
 * Lex an identifier or keyword.  trie_get returns 0 (TOKEN_ERROR) for
 * non-keywords, so a nonzero result is the keyword's token type.
 */
static void parse_identifier(lexer *l)
{
	usize len = 0;
	unsigned char c = (unsigned char)l->source[l->index];
	/* cast for ctype: plain char may be negative (CERT STR37-C) */
	while (isalnum(c) || c == '_') {
		len += 1;
		l->index += 1;
		c = (unsigned char)l->source[l->index];
	}
	/* rewind so the token's lexeme points at the first character */
	l->index -= len;
	token_type keyword = trie_get(keywords, l->source + l->index, len);
	add_token(l, keyword ? keyword : TOKEN_IDENTIFIER, len);
	l->index += len;
}
/*
 * Lex a string literal.  Called with the index on the first character
 * after the opening '"'.  The character following a backslash is
 * skipped so an escaped quote (\") does not terminate the literal.
 * (The stray debug printf from the original was removed.)
 */
static void parse_string(lexer *l)
{
	usize len = 0;
	char c = l->source[l->index];
	while (c != '"') {
		if (c == '\0' || c == '\n') {
			l->index -= len;
			add_error(l, "unclosed string literal.");
			l->index += len;
			return;
		}
		/* include the escaped character, e.g. \" or \\ */
		if (c == '\\' && l->source[l->index+1] != '\0'
		    && l->source[l->index+1] != '\n') {
			len += 1;
			l->index += 1;
		}
		len += 1;
		l->index += 1;
		c = l->source[l->index];
	}
	l->index -= len;
	add_token(l, TOKEN_STRING, len);
	l->index += len + 1;	/* skip the lexeme plus the closing quote */
}
static bool parse_special(lexer *l)
{
switch (l->source[l->index]) {
case '+':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PLUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '+') {
add_token(l, TOKEN_PLUS_PLUS, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PLUS, 1);
l->index += 1;
}
return true;
case '-':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_MINUS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '-') {
add_token(l, TOKEN_MINUS_MINUS, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
add_token(l, TOKEN_ARROW, 2);
l->index += 2;
} else {
add_token(l, TOKEN_MINUS, 1);
l->index += 1;
}
return true;
case '/':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_SLASH_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_SLASH, 1);
l->index += 1;
}
return true;
case '*':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_STAR_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_STAR, 1);
l->index += 1;
}
return true;
case '%':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PERC_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PERC, 1);
l->index += 1;
}
return true;
case '&':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_AND_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '&') {
add_token(l, TOKEN_DOUBLE_AND, 2);
l->index += 2;
} else {
add_token(l, TOKEN_AND, 1);
l->index += 1;
}
return true;
case '^':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_HAT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_HAT, 1);
l->index += 1;
}
return true;
case '|':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_PIPE_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '|') {
add_token(l, TOKEN_OR, 2);
l->index += 2;
} else {
add_token(l, TOKEN_PIPE, 1);
l->index += 1;
}
return true;
case '=':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_DOUBLE_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_EQ, 1);
l->index += 1;
}
return true;
case '>':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_GREATER_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '>') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_RSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_RSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_GREATER_THAN, 1);
l->index += 1;
}
return true;
case '<':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_LESS_EQ, 2);
l->index += 2;
} else if (l->source[l->index+1] == '<') {
if (l->source[l->index+2] == '=') {
add_token(l, TOKEN_LSHIFT_EQ, 3);
l->index += 3;
return true;
}
add_token(l, TOKEN_LSHIFT, 2);
l->index += 2;
} else {
add_token(l, TOKEN_LESS_THAN, 1);
l->index += 1;
}
return true;
case '!':
if (l->source[l->index+1] == '=') {
add_token(l, TOKEN_NOT_EQ, 2);
l->index += 2;
} else {
add_token(l, TOKEN_BANG, 1);
l->index += 1;
}
return true;
case ':':
add_token(l, TOKEN_COLON, 1);
l->index += 1;
return true;
case ';':
add_token(l, TOKEN_SEMICOLON, 1);
l->index += 1;
return true;
case '.':
add_token(l, TOKEN_DOT, 1);
l->index += 1;
return true;
case ',':
add_token(l, TOKEN_COMMA, 1);
l->index += 1;
return true;
case '(':
add_token(l, TOKEN_LPAREN, 1);
l->index += 1;
return true;
case ')':
add_token(l, TOKEN_RPAREN, 1);
l->index += 1;
return true;
case '[':
add_token(l, TOKEN_LSQUARE, 1);
l->index += 1;
return true;
case ']':
add_token(l, TOKEN_RSQUARE, 1);
l->index += 1;
return true;
case '{':
add_token(l, TOKEN_LCURLY, 1);
l->index += 1;
return true;
case '}':
add_token(l, TOKEN_RCURLY, 1);
l->index += 1;
return true;
case '\'':
if (l->source[l->index+1] == '\\') {
if (l->source[l->index+3] != '\'') {
add_error(l, "unclosed character literal.");
return true;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 2);
l->index += 3;
return true;
} else {
if (l->source[l->index+2] != '\'') {
add_error(l, "unclosed character literal.");
return false;
}
l->index += 1;
add_token(l, TOKEN_CHAR, 1);
l->index += 2;
return true;
}
default:
return false;
}
}
/*
 * Main scanning loop: walk the source, track row/column, and dispatch
 * to the specialised scanners.  The bound is strict (<) so the
 * terminating NUL at source[size] is never processed.
 */
static void parse(lexer *l)
{
	while (l->index < l->size) {
		char c = l->source[l->index];
		l->column += 1;
		if (c == '\n') {
			l->index += 1;
			l->row += 1;
			l->column = 0;
			continue;
		}
		if (isspace((unsigned char)c)) {
			l->index += 1;
			continue;
		}
		usize head = l->index;
		if (parse_special(l)) {
			/* one column was already counted; add the rest */
			l->column += (l->index - head - 1);
			continue;
		}
		if (isdigit((unsigned char)c)) {
			parse_number(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* '_' starts an identifier too (parse_identifier accepts it) */
		if (isalpha((unsigned char)c) || c == '_') {
			parse_identifier(l);
			l->column += (l->index - head - 1);
			continue;
		}
		if (c == '"') {
			l->index += 1;	/* skip the opening quote */
			parse_string(l);
			l->column += (l->index - head - 1);
			continue;
		}
		/* unrecognised character: skip it silently */
		l->index += 1;
	}
}
/*
 * Create a lexer over `source` (`size` bytes, must be NUL-terminated —
 * parse_string relies on the '\0'), build the keyword trie and run the
 * scanner.  All memory comes from `arena`, which the caller owns.
 */
lexer *lexer_init(char *source, usize size, arena *arena)
{
	lexer *lex = arena_alloc(arena, sizeof(lexer));
	lex->column = 0;
	lex->row = 0;
	lex->index = 0;
	lex->size = size;
	lex->tokens = 0;
	lex->tail = 0;
	lex->allocator = arena;
	lex->source = source;
	keywords = arena_alloc(arena, sizeof(trie_node));
	/* arena memory is uninitialized; trie_insert relies on NULL children */
	memset(keywords, 0, sizeof(trie_node));
	trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE);
	trie_insert(keywords, lex->allocator, "for", TOKEN_FOR);
	trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO);
	trie_insert(keywords, lex->allocator, "if", TOKEN_IF);
	trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE);
	trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH);
	trie_insert(keywords, lex->allocator, "case", TOKEN_CASE);
	trie_insert(keywords, lex->allocator, "do", TOKEN_DO);
	trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER);
	trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE);
	trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC);
	trie_insert(keywords, lex->allocator, "const", TOKEN_CONST);
	trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
	parse(lex);
	return lex;
}

87
utils.c Normal file
View file

@ -0,0 +1,87 @@
#include "utils.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*
 * Insert `key` → `value` into the trie rooted at `root`.
 * Intermediate nodes are allocated from `a` and zero-initialized.
 * Index with unsigned char: plain char may be negative, which would
 * index children[] out of bounds.
 */
void trie_insert(trie_node *root, arena *a, char *key, uint16_t value)
{
	trie_node *node = root;
	while (*key) {
		unsigned char idx = (unsigned char)*key;
		if (!node->children[idx]) {
			node->children[idx] = arena_alloc(a, sizeof(trie_node));
			memset(node->children[idx], 0x0, sizeof(trie_node));
		}
		node = node->children[idx];
		key++;
	}
	node->value = value;
}
/*
 * Look up the first `len` bytes of `key` in the trie.
 * Returns the stored value, or 0 when the key is absent.
 * Index with unsigned char: plain char may be negative, which would
 * index children[] out of bounds.
 */
uint16_t trie_get(trie_node *root, char *key, usize len)
{
	trie_node *node = root;
	for (usize i = 0; i < len; i++) {
		unsigned char idx = (unsigned char)key[i];
		if (!node->children[idx]) {
			return 0;
		}
		node = node->children[idx];
	}
	return node->value;
}
#ifndef DEFAULT_ALIGNMENT
#define DEFAULT_ALIGNMENT (2 * sizeof(void *))
#endif
/*
 * Round `ptr` up to the next multiple of `align`.
 * `align` must be a power of two (DEFAULT_ALIGNMENT is).
 */
static usize align_forward(usize ptr, usize align) {
	uintptr_t mask = (uintptr_t)align - 1;
	return (usize)(((uintptr_t)ptr + mask) & ~mask);
}
/*
 * Allocate a fresh arena of `size` bytes.  A failed allocation is
 * signalled by `.memory == NULL`; the caller must check (see utils.h).
 */
arena arena_init(usize size)
{
	arena a;
	a.capacity = size;
	a.position = 0;
	a.memory = malloc(size);
	return a;
}
/*
 * Bump-allocate `size` bytes from `a`, aligned to DEFAULT_ALIGNMENT.
 * Returns NULL when the arena does not have enough space left.
 */
void *arena_alloc(arena *a, usize size) {
	uintptr_t base = (uintptr_t)a->memory + a->position;
	usize padding = align_forward(base, DEFAULT_ALIGNMENT) - base;
	if (a->position + padding + size > a->capacity)
		return NULL;
	void *out = (unsigned char *)a->memory + a->position + padding;
	a->position += size + padding;
	return out;
}
/* Capture the arena's current allocation position for later rollback. */
snapshot arena_snapshot(arena a)
{
return a.position;
}
/* Roll the arena back to `s`; memory allocated after it is reusable. */
void arena_reset_to_snapshot(arena *a, snapshot s)
{
a->position = s;
}
/* Reset the arena to empty (position 0); capacity is kept. */
void arena_reset(arena *a)
{
arena_reset_to_snapshot(a, 0);
}
/* Release the arena's backing memory.  The arena must not be used after. */
void arena_deinit(arena a)
{
free(a.memory);
}

61
utils.h Normal file
View file

@ -0,0 +1,61 @@
#ifndef UTILS_H
#define UTILS_H
#include <stdint.h>
#include <stddef.h>

/* Fixed-width shorthand types. */
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef size_t usize;
typedef float f32;
typedef double f64;

/* Linear (bump) allocator. */
typedef struct {
	usize capacity;   /* total bytes in `memory` */
	usize position;   /* bytes handed out so far */
	void* memory;
} arena;
typedef usize snapshot;
/*
 * NOTE(ernesto): faulty initialization is signalled by arena.memory
 * being NULL.  It is the responsibility of the caller to check for
 * faulty initialization.
 */
arena arena_init(usize size);
/*
 * Returns NULL on unsuccessful allocation.
 * In this implementation an allocation is only unsuccessful if the arena
 * does not have enough memory to allocate the requested space.
 */
void *arena_alloc(arena *a, usize size);
snapshot arena_snapshot(arena a);
void arena_reset_to_snapshot(arena *a, snapshot s);
void arena_reset(arena *a);
/* This call should never fail; also, do we even care if it does? */
void arena_deinit(arena a);

/* 256-way trie mapping byte strings to small nonzero values (0 = absent). */
typedef struct _trie_node {
	uint16_t value;
	struct _trie_node *children[256];
} trie_node;
void trie_insert(trie_node *root, arena *a, char *key, uint16_t value);
uint16_t trie_get(trie_node *root, char *key, usize len);

/* Row/column location of a token in the source text. */
typedef struct {
	usize row, column;
} source_pos;
#endif