From abf1d7c0661ccff5939d21f83edf9a8bb757bac2 Mon Sep 17 00:00:00 2001 From: Lorenzo Torres Date: Sun, 30 Nov 2025 21:56:39 +0100 Subject: [PATCH] implemented basic parsing --- Makefile | 4 +- cc.c | 117 ++++++++++++++ lexer.c | 5 +- lexer.h | 1 + parser.c | 231 ++++++++++++++++++++++++++ parser.h | 145 +++++++++++++++++ sema.h | 54 +++++++ test.c | 483 +------------------------------------------------------ utils.c | 31 ++++ utils.h | 3 + 10 files changed, 588 insertions(+), 486 deletions(-) create mode 100644 parser.c create mode 100644 parser.h create mode 100644 sema.h diff --git a/Makefile b/Makefile index 92dc81f..8ee851d 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ include config.mk -SRC = cc.c utils.c lexer.c -HDR = config.def.h utils.h +SRC = cc.c utils.c lexer.c parser.c +HDR = config.def.h utils.h lexer.h parser.h sema.h OBJ = ${SRC:.c=.o} all: options cc diff --git a/cc.c b/cc.c index 9b59dd4..dd478a7 100644 --- a/cc.c +++ b/cc.c @@ -2,6 +2,121 @@ #include #include "utils.h" #include "lexer.h" +#include "parser.h" + +// Helper to print indentation +void print_indent(int depth) { + for (int i = 0; i < depth; i++) printf(" "); +} + +// Helper to convert Binary Op enum to string +const char* get_op_str(binary_op op) { + switch(op) { + case OP_PLUS: return "+"; + case OP_MINUS: return "-"; + case OP_DIV: return "/"; + case OP_MUL: return "*"; + case OP_EQ: return "=="; + case OP_ASSIGN: return "="; + case OP_AND: return "&&"; + case OP_OR: return "||"; + case OP_NEQ: return "!="; + case OP_GT: return ">"; + case OP_LT: return "<"; + case OP_GE: return ">="; + case OP_LE: return "<="; + case OP_BOR: return "|"; + case OP_BAND: return "&"; + case OP_BXOR: return "^"; + case OP_MOD: return "%"; + case OP_PLUS_EQ: return "+="; + case OP_MINUS_EQ: return "-="; + case OP_DIV_EQ: return "/="; + case OP_MUL_EQ: return "*="; + default: return "?"; + } +} + +const char *get_uop_str(unary_op op) { + switch (op) { + case UOP_INCR: return "++"; + case UOP_MINUS: 
return "-"; + case UOP_DECR: return "--"; + case UOP_DEREF: return "*"; + case UOP_REF: return "&"; + case UOP_NOT: return "!"; + default: return "?"; + } +} + +void print_ast(ast_node *node, int depth) { + if (!node) return; + + print_indent(depth); + + switch (node->type) { + case NODE_INTEGER: + printf("Integer: %lu\n", node->expr.integer); + break; + case NODE_FLOAT: + printf("Float: %f\n", node->expr.flt); + break; + case NODE_CHAR: + printf("Char: '%c'\n", node->expr.ch); + break; + case NODE_STRING: + printf("String: \"%.*s\"\n", (int)node->expr.string.len, node->expr.string.start); + break; + case NODE_IDENTIFIER: + printf("Identifier: %.*s\n", (int)node->expr.string.len, node->expr.string.start); + break; + case NODE_BINARY: + printf("BinaryOp (%s)\n", get_op_str(node->expr.binary.operator)); + print_ast(node->expr.binary.left, depth + 1); + print_ast(node->expr.binary.right, depth + 1); + break; + case NODE_UNARY: + printf("UnaryOp (%s)\n", get_uop_str(node->expr.unary.operator)); + print_ast(node->expr.unary.right, depth + 1); + break; + case NODE_TERNARY: + printf("Ternary (? 
:)\n"); + print_indent(depth + 1); printf("Condition:\n"); + print_ast(node->expr.ternary.condition, depth + 2); + print_indent(depth + 1); printf("Then:\n"); + print_ast(node->expr.ternary.then, depth + 2); + print_indent(depth + 1); printf("Else:\n"); + print_ast(node->expr.ternary.otherwise, depth + 2); + break; + case NODE_UNIT: + case NODE_COMPOUND: + printf("Unit/Block:\n"); + ast_node *current = node; + while (current && (current->type == NODE_UNIT || current->type == NODE_COMPOUND)) { + print_ast(current->expr.unit_node.expr, depth + 1); + current = current->expr.unit_node.next; + } + break; + case NODE_IF: + printf("IfStmt (Fields missing in struct)\n"); + break; + case NODE_WHILE: + printf("WhileStmt (Fields missing in struct)\n"); + break; + case NODE_VAR_DECL: + printf("VarDecl (Fields missing in struct)\n"); + break; + case NODE_FUNCTION_DEF: + printf("FunctionDef (Fields missing in struct)\n"); + break; + case NODE_RETURN: + printf("Return (Fields missing in struct)\n"); + break; + default: + printf("Unknown Node Type: %d\n", node->type); + break; + } +} int main(void) { @@ -17,6 +132,8 @@ int main(void) arena a = arena_init(0x1000 * 0x1000 * 64); lexer *l = lexer_init(src, size, &a); + parser *p = parser_init(l, &a); + print_ast(p->ast, 0); arena_deinit(a); diff --git a/lexer.c b/lexer.c index b0e9d86..7891cfd 100644 --- a/lexer.c +++ b/lexer.c @@ -381,8 +381,8 @@ static void parse(lexer *l) lexer *lexer_init(char *source, usize size, arena *arena) { lexer *lex = arena_alloc(arena, sizeof(lexer)); - lex->column = 0; - lex->row = 0; + lex->column = 1; + lex->row = 1; lex->index = 0; lex->size = size; lex->tokens = 0; @@ -403,6 +403,7 @@ lexer *lexer_init(char *source, usize size, arena *arena) trie_insert(keywords, lex->allocator, "case", TOKEN_CASE); trie_insert(keywords, lex->allocator, "do", TOKEN_DO); trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER); + trie_insert(keywords, lex->allocator, "return", TOKEN_RETURN); trie_insert(keywords, 
lex->allocator, "module", TOKEN_MODULE); trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC); trie_insert(keywords, lex->allocator, "const", TOKEN_CONST); diff --git a/lexer.h b/lexer.h index c27bdd7..f3879f4 100644 --- a/lexer.h +++ b/lexer.h @@ -67,6 +67,7 @@ typedef enum { TOKEN_DO, TOKEN_DEFER, TOKEN_MODULE, + TOKEN_RETURN, TOKEN_STATIC, TOKEN_CONST, diff --git a/parser.c b/parser.c new file mode 100644 index 0000000..9ace5e0 --- /dev/null +++ b/parser.c @@ -0,0 +1,231 @@ +#include "parser.h" +#include +#include + +ast_node *parse_expression(parser *p); + +static void advance(parser *p) +{ + p->previous = p->tokens; + if (p->tokens) + p->tokens = p->tokens->next; +} + +static token *peek(parser *p) +{ + return p->tokens; +} + +static bool match_peek(parser *p, token_type type) +{ + if (p->tokens) { + return p->tokens->type == type; + } else { + return false; + } +} + +static bool match(parser *p, token_type type) +{ + if (p->tokens) { + if (p->tokens->type == type) { + advance(p); + return true; + } + } + return false; +} + +static void parser_sync(parser *p) +{ + advance(p); + + while (p->tokens) { + if (p->previous->type == TOKEN_SEMICOLON || p->previous->type == TOKEN_RCURLY) { + return; + } + + switch (p->tokens->type) { + case TOKEN_STRUCT: + case TOKEN_ENUM: + case TOKEN_IF: + case TOKEN_WHILE: + case TOKEN_FOR: + case TOKEN_DO: + case TOKEN_RETURN: + case TOKEN_SWITCH: + return; + default: + advance(p); + } + } +} + +static void error(parser *p, char *msg) +{ + printf("\x1b[31m\x1b[1merror\x1b[0m\x1b[1m:%ld:%ld:\x1b[0m %s\n", p->previous->position.row, p->previous->position.column, msg); + parser_sync(p); +} + +static ast_node *parse_factor(parser *p) +{ + token *t = peek(p); + if (match(p, TOKEN_INTEGER)) { + ast_node *node = arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_INTEGER; + node->expr.integer = parse_int(t->lexeme, t->lexeme_len); + return node; + } else if (match(p, TOKEN_FLOAT)) { + ast_node *node = 
arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_FLOAT; + node->expr.flt = parse_float(t->lexeme, t->lexeme_len); + return node; + } else if (match(p, TOKEN_IDENTIFIER)) { + ast_node *node = arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_IDENTIFIER; + node->expr.string.start = t->lexeme; + node->expr.string.len = t->lexeme_len; + return node; + } else if (match(p, TOKEN_STRING)) { + ast_node *node = arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_STRING; + node->expr.string.start = t->lexeme; + node->expr.string.len = t->lexeme_len; + return node; + } else if (match(p, TOKEN_CHAR)) { + ast_node *node = arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_CHAR; + if (t->lexeme_len == 2) { + char c; + switch (t->lexeme[1]) { + case 'n': c = '\n'; break; + case 't': c = '\t'; break; + case 'r': c = '\r'; break; + case '0': c = '\0'; break; + case '\\': c = '\\'; break; + case '\'': c = '\''; break; + default: + error(p, "invalid escape code."); + return NULL; + } + node->expr.ch = c; + } else { + node->expr.ch = *(t->lexeme); + } + return node; + } else if (match(p, TOKEN_LPAREN)) { + ast_node *node = parse_expression(p); + if (!match(p, TOKEN_RPAREN)) { + error(p, "unclosed parenthesis"); + return NULL; + } + + return node; + } + + return NULL; +} + +ast_node *parse_unary(parser *p) +{ + if (match(p, TOKEN_PLUS_PLUS) || match(p, TOKEN_MINUS) || match(p, TOKEN_MINUS_MINUS) || match(p, TOKEN_STAR) || match(p, TOKEN_AND) || match(p, TOKEN_BANG)) { + unary_op op; + switch (p->previous->type) { + case TOKEN_PLUS_PLUS: + op = UOP_INCR; + break; + case TOKEN_MINUS: + op = UOP_MINUS; + break; + case TOKEN_MINUS_MINUS: + op = UOP_DECR; + break; + case TOKEN_STAR: + op = UOP_DEREF; + break; + case TOKEN_AND: + op = UOP_REF; + break; + case TOKEN_BANG: + op = UOP_NOT; + break; + default: + goto end; + } + + ast_node *node = arena_alloc(p->allocator, sizeof(ast_node)); + node->type = NODE_UNARY; + 
node->expr.unary.operator = op;
+		node->expr.unary.right = parse_unary(p); /* recurse at unary level, not parse_expression: otherwise -a * b parses as -(a*b) */
+
+		return node;
+	}
+
+end:
+	return parse_factor(p);
+}
+
+ast_node *parse_term(parser *p)
+{
+	ast_node *left = parse_unary(p);
+
+	while (match_peek(p, TOKEN_STAR) || match_peek(p, TOKEN_SLASH)) {
+		binary_op op = peek(p)->type == TOKEN_STAR ? OP_MUL : OP_DIV;
+		advance(p);
+		ast_node *right = parse_unary(p); /* was parse_factor: RHS must accept unary operators too (a * -b) */
+		ast_node *node = arena_alloc(p->allocator, sizeof(ast_node));
+		node->type = NODE_BINARY;
+		node->expr.binary.left = left;
+		node->expr.binary.right = right;
+		node->expr.binary.operator = op;
+		left = node;
+	}
+
+	return left;
+}
+
+ast_node *parse_expression(parser *p)
+{
+	ast_node *left = parse_term(p);
+
+	while (match_peek(p, TOKEN_PLUS) || match_peek(p, TOKEN_MINUS)) {
+		binary_op op = peek(p)->type == TOKEN_PLUS ? OP_PLUS : OP_MINUS;
+		advance(p);
+		ast_node *right = parse_term(p);
+		ast_node *node = arena_alloc(p->allocator, sizeof(ast_node));
+		node->type = NODE_BINARY;
+		node->expr.binary.left = left;
+		node->expr.binary.right = right;
+		node->expr.binary.operator = op;
+		left = node;
+	}
+
+	return left;
+}
+
+static void parse(parser *p)
+{
+	p->ast = arena_alloc(p->allocator, sizeof(ast_node));
+	p->ast->type = NODE_UNIT; p->ast->expr.unit_node.next = NULL; /* arena memory is not guaranteed zeroed */
+	p->ast->expr.unit_node.expr = parse_expression(p);
+	ast_node *tail = p->ast;
+	ast_node *expr = parse_expression(p);
+	while (expr) {
+		tail->expr.unit_node.next = arena_alloc(p->allocator, sizeof(ast_node));
+		tail->expr.unit_node.next->expr.unit_node.expr = expr;
+		tail = tail->expr.unit_node.next;
+		tail->type = NODE_UNIT; tail->expr.unit_node.next = NULL; /* terminate list so print_ast's while(current) walk is safe */
+		expr = parse_expression(p);
+	}
+}
+
+parser *parser_init(lexer *l, arena *allocator)
+{
+	parser *p = arena_alloc(allocator, sizeof(parser));
+	p->tokens = l->tokens;
+	p->allocator = allocator; p->previous = NULL; p->ast = NULL; /* initialize all fields: arena may hand back non-zero memory */
+
+	parse(p);
+
+	return p;
+}
diff --git a/parser.h b/parser.h
new file mode 100644
index 0000000..fc44856
--- /dev/null
+++ b/parser.h
@@ -0,0 +1,145 @@
+#ifndef PARSER_H
+#define PARSER_H
+
+#include "lexer.h"
+#include 
"utils.h" + +typedef enum { + OP_PLUS, // + + OP_MINUS, // - + OP_DIV, // / + OP_MUL, // * + OP_EQ, // == + OP_ASSIGN, // = + OP_AND, // && + OP_OR, // || + OP_NEQ, // != + OP_GT, // > + OP_LT, // < + OP_GE, // >= + OP_LE, // <= + OP_BOR, // | + OP_BAND, // & + OP_BXOR, // ^ + OP_MOD, // % + OP_PLUS_EQ, // += + OP_MINUS_EQ, // -= + OP_DIV_EQ, // /= + OP_MUL_EQ, // *= + OP_BOR_EQ, // |= + OP_BAND_EQ, // &= + OP_BXOR_EQ, // ^= + OP_MOD_EQ, // %= +} binary_op; + +typedef enum { + UOP_INCR, // ++ + UOP_MINUS, // - + UOP_DECR, // -- + UOP_DEREF, // * + UOP_REF, // & + UOP_NOT, // ! +} unary_op; + +typedef enum { + LAYOUT_AUTO, + LAYOUT_PACKED, + LAYOUT_EXTERN +} struct_layout; + +typedef struct { + char *type_name; + usize type_len; + char *name; + usize name_len; + usize offset; +} member; + +typedef struct { + char *name; + usize name_len; + member *params; +} function_decl; + +typedef enum { + NODE_IDENTIFIER, + NODE_INTEGER, + NODE_FLOAT, + NODE_STRING, + NODE_CHAR, + NODE_TERNARY, + NODE_CAST, + NODE_ARRAY_SUBSCRIPT, + NODE_ACCESS, + NODE_ACCESS_PTR, + NODE_CALL, + NODE_POSTFIX, + NODE_UNARY, + NODE_BINARY, + NODE_GOTO, + NODE_BREAK, + NODE_CASE, + NODE_SWITCH, + NODE_FOR, + NODE_DO, + NODE_WHILE, + NODE_IF, + NODE_RETURN, + NODE_COMPOUND, + NODE_TYPEDEF, + NODE_ENUM, + NODE_STRUCT, + NODE_UNION, + NODE_VAR_DECL, + NODE_FUNCTION_DEF, + NODE_FUNCTION_DECL, + NODE_UNIT, + NODE_AS, +} node_type; + +typedef struct _ast_node { + node_type type; + union { + struct { + struct _ast_node *left; + struct _ast_node *right; + binary_op operator; + } binary; + struct { + struct _ast_node *right; + unary_op operator; + } unary; + u64 integer; + f64 flt; // float + struct { + char *start; + usize len; + } string; + char ch; // char; + struct { + struct _ast_node *condition; + struct _ast_node *then; + struct _ast_node *otherwise; + } ternary; + struct { + struct _ast_node *value; + char *type; + usize type_len; + } cast; + struct { + struct _ast_node *expr; + struct _ast_node 
*next;
+	} unit_node;
+	} expr;
+} ast_node;
+
+typedef struct {
+	token *tokens;
+	token *previous;
+	ast_node *ast;
+	arena *allocator;
+} parser;
+
+parser *parser_init(lexer *l, arena *allocator);
+
+#endif
diff --git a/sema.h b/sema.h
new file mode 100644
index 0000000..07f2e26
--- /dev/null
+++ b/sema.h
@@ -0,0 +1,54 @@
+#ifndef SEMA_H
+#define SEMA_H /* NOTE(review): presumably utils.h and parser.h must be included first (u8, u16, usize, bool, struct_layout, member, function_decl) — confirm */
+
+typedef enum {
+	TYPE_VOID,
+	TYPE_PTR,
+	TYPE_I8,
+	TYPE_I16,
+	TYPE_I32,
+	TYPE_I64,
+	TYPE_U8,
+	TYPE_U16,
+	TYPE_U32,
+	TYPE_U64,
+	TYPE_STRUCT,
+	TYPE_UNION,
+	TYPE_ENUM,
+} type_tag;
+
+typedef struct _type {
+	type_tag tag;
+	union {
+		u8 integer;
+		u8 flt; // float
+		struct {
+			bool is_const;
+			bool is_volatile;
+			u16 alignment;
+			struct _type *child; /* must be a pointer: by-value self-reference of an incomplete type is illegal C */
+		} ptr;
+		struct {
+			usize len;
+			struct _type *child; /* must be a pointer: by-value self-reference of an incomplete type is illegal C */
+		} array;
+		struct {
+			struct_layout layout;
+			char *name;
+			usize name_len;
+			usize alignment;
+			member *members;
+			function_decl *decls;
+		} structure;
+		struct {
+			struct_layout layout;
+			char *name;
+			usize name_len;
+			usize alignment;
+			member *members;
+			function_decl *decls;
+		} enumeration; /* was "enum": a C keyword cannot be a member name */
+	} data;
+} type;
+
+#endif
diff --git a/test.c b/test.c
index 27839c1..dbfa1f4 100644
--- a/test.c
+++ b/test.c
@@ -1,482 +1 @@
-#include "lexer.h"
-#include 
-#include 
-#include 
-#include 
-
-static const char *token_type_str[] = {
-	[TOKEN_ERROR] = "TOKEN_ERROR",
-	[TOKEN_END] = "TOKEN_END",
-
-	[TOKEN_PLUS] = "TOKEN_PLUS",
-	[TOKEN_PLUS_PLUS] = "TOKEN_PLUS_PLUS",
-	[TOKEN_MINUS] = "TOKEN_MINUS",
-	[TOKEN_MINUS_MINUS] = "TOKEN_MINUS_MINUS",
-	[TOKEN_SLASH] = "TOKEN_SLASH",
-	[TOKEN_PERC] = "TOKEN_PERC",
-	[TOKEN_STAR] = "TOKEN_STAR",
-	[TOKEN_AND] = "TOKEN_AND",
-	[TOKEN_HAT] = "TOKEN_HAT",
-	[TOKEN_PIPE] = "TOKEN_PIPE",
-	[TOKEN_EQ] = "TOKEN_EQ",
-	[TOKEN_ARROW] = "TOKEN_ARROW",
-	[TOKEN_LSHIFT] = "TOKEN_LSHIFT",
-	[TOKEN_RSHIFT] = "TOKEN_RSHIFT",
-	[TOKEN_DOUBLE_EQ] = "TOKEN_DOUBLE_EQ",
-	[TOKEN_LESS_THAN] = "TOKEN_LESS_THAN",
-	[TOKEN_GREATER_THAN] = "TOKEN_GREATER_THAN",
-	[TOKEN_LESS_EQ] = "TOKEN_LESS_EQ", 
- [TOKEN_GREATER_EQ] = "TOKEN_GREATER_EQ", - [TOKEN_NOT_EQ] = "TOKEN_NOT_EQ", - [TOKEN_PLUS_EQ] = "TOKEN_PLUS_EQ", - [TOKEN_MINUS_EQ] = "TOKEN_MINUS_EQ", - [TOKEN_STAR_EQ] = "TOKEN_STAR_EQ", - [TOKEN_SLASH_EQ] = "TOKEN_SLASH_EQ", - [TOKEN_AND_EQ] = "TOKEN_AND_EQ", - [TOKEN_HAT_EQ] = "TOKEN_HAT_EQ", - [TOKEN_PIPE_EQ] = "TOKEN_PIPE_EQ", - [TOKEN_PERC_EQ] = "TOKEN_PERC_EQ", - [TOKEN_LSHIFT_EQ] = "TOKEN_LSHIFT_EQ", - [TOKEN_RSHIFT_EQ] = "TOKEN_RSHIFT_EQ", - [TOKEN_OR] = "TOKEN_OR", - [TOKEN_DOUBLE_AND] = "TOKEN_DOUBLE_AND", - [TOKEN_COLON] = "TOKEN_COLON", - [TOKEN_SEMICOLON] = "TOKEN_SEMICOLON", - [TOKEN_DOT] = "TOKEN_DOT", - [TOKEN_BANG] = "TOKEN_BANG", - [TOKEN_COMMA] = "TOKEN_COMMA", - [TOKEN_LPAREN] = "TOKEN_LPAREN", - [TOKEN_RPAREN] = "TOKEN_RPAREN", - [TOKEN_LSQUARE] = "TOKEN_LSQUARE", - [TOKEN_RSQUARE] = "TOKEN_RSQUARE", - [TOKEN_LCURLY] = "TOKEN_LCURLY", - [TOKEN_RCURLY] = "TOKEN_RCURLY", - - [TOKEN_INTEGER] = "TOKEN_INTEGER", - [TOKEN_FLOAT] = "TOKEN_FLOAT", - [TOKEN_IDENTIFIER] = "TOKEN_IDENTIFIER", - [TOKEN_STRING] = "TOKEN_STRING", - [TOKEN_CHAR] = "TOKEN_CHAR", - - [TOKEN_WHILE] = "TOKEN_WHILE", - [TOKEN_FOR] = "TOKEN_FOR", - [TOKEN_GOTO] = "TOKEN_GOTO", - [TOKEN_IF] = "TOKEN_IF", - [TOKEN_ELSE] = "TOKEN_ELSE", - [TOKEN_SWITCH] = "TOKEN_SWITCH", - [TOKEN_CASE] = "TOKEN_CASE", - [TOKEN_DO] = "TOKEN_DO", - [TOKEN_DEFER] = "TOKEN_DEFER", - [TOKEN_MODULE] = "TOKEN_MODULE", - - [TOKEN_STATIC] = "TOKEN_STATIC", - [TOKEN_CONST] = "TOKEN_CONST", - [TOKEN_EXTERN] = "TOKEN_EXTERN", - [TOKEN_VOLATILE] = "TOKEN_VOLATILE", -}; - -trie_node *keywords; - -void lexer_print_token(token *t) -{ - printf("%s: ", token_type_str[t->type]); - for (usize i=0; i < t->lexeme_len; i++) { - printf("%c", t->lexeme[i]); - } -} - -static void add_token(lexer *l, token_type type, usize len) -{ - token *t = arena_alloc(l->allocator, sizeof(token)); - t->type = type; - t->lexeme_len = len; - t->lexeme = l->source + l->index; - t->position.row = l->row; - t->position.column = l->column; - 
- if (!l->tokens) { - l->tokens = t; - l->tail = t; - } else { - l->tail->next = t; - l->tail = t; - } -} - -static void add_error(lexer *l, char *msg) -{ - token *t = arena_alloc(l->allocator, sizeof(token)); - t->type = TOKEN_ERROR; - t->lexeme_len = strlen(msg); - t->lexeme = msg; - t->position.row = l->row; - t->position.column = l->column; - - if (!l->tokens) { - l->tokens = t; - l->tail = t; - } else { - l->tail->next = t; - l->tail = t; - } -} - -static void parse_number(lexer *l) -{ - char c = l->source[l->index]; - /* Is the number a float? */ - bool f = false; - usize len = 0; - - while (isdigit(c)) { - /* If a dot is found, and the character after it is a digit, this is a float. */ - if (l->source[l->index+1] == '.' && isdigit(l->source[l->index+2])) { - f = true; - len += 3; - l->index += 3; - } else { - len += 1; - l->index += 1; - } - c = l->source[l->index]; - } - l->index -= len; - if (f) { - add_token(l, TOKEN_FLOAT, len); - } else { - add_token(l, TOKEN_INTEGER, len); - } - l->index += len; -} - -static void parse_identifier(lexer *l) -{ - char c = l->source[l->index]; - usize len = 0; - - while (isalnum(c) || c == '_') { - len += 1; - l->index += 1; - c = l->source[l->index]; - } - l->index -= len; - token_type keyword = trie_get(keywords, l->source + l->index, len); - if (keyword) { - add_token(l, keyword, len); - } else { - add_token(l, TOKEN_IDENTIFIER, len); - } - l->index += len; -} - -static void parse_string(lexer *l) -{ - char c = l->source[l->index]; - usize len = 0; - - while (c != '"') { - if (c == '\0' || c == '\n') { - printf("%c", c); - l->index -= len; - add_error(l, "unclosed string literal."); - l->index += len; - return; - } - len += 1; - l->index += 1; - c = l->source[l->index]; - } - l->index -= len; - add_token(l, TOKEN_STRING, len); - l->index += len + 1; -} - -static bool parse_special(lexer *l) -{ - switch (l->source[l->index]) { - case '+': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_PLUS_EQ, 2); - l->index 
+= 2; - } else if (l->source[l->index+1] == '+') { - add_token(l, TOKEN_PLUS_PLUS, 2); - l->index += 2; - } else { - add_token(l, TOKEN_PLUS, 1); - l->index += 1; - } - return true; - case '-': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_MINUS_EQ, 2); - l->index += 2; - } else if (l->source[l->index+1] == '-') { - add_token(l, TOKEN_MINUS_MINUS, 2); - l->index += 2; - } else if (l->source[l->index+1] == '>') { - add_token(l, TOKEN_ARROW, 2); - l->index += 2; - } else { - add_token(l, TOKEN_MINUS, 1); - l->index += 1; - } - return true; - case '/': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_SLASH_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_SLASH, 1); - l->index += 1; - } - return true; - case '*': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_STAR_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_STAR, 1); - l->index += 1; - } - return true; - case '%': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_PERC_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_PERC, 1); - l->index += 1; - } - return true; - case '&': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_AND_EQ, 2); - l->index += 2; - } else if (l->source[l->index+1] == '&') { - add_token(l, TOKEN_DOUBLE_AND, 2); - l->index += 2; - } else { - add_token(l, TOKEN_AND, 1); - l->index += 1; - } - return true; - case '^': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_HAT_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_HAT, 1); - l->index += 1; - } - return true; - case '|': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_PIPE_EQ, 2); - l->index += 2; - } else if (l->source[l->index+1] == '|') { - add_token(l, TOKEN_OR, 2); - l->index += 2; - } else { - add_token(l, TOKEN_PIPE, 1); - l->index += 1; - } - return true; - case '=': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_DOUBLE_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_EQ, 1); - l->index += 1; - } - return true; - 
case '>': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_GREATER_EQ, 2); - l->index += 2; - } else if (l->source[l->index+1] == '>') { - if (l->source[l->index+2] == '=') { - add_token(l, TOKEN_RSHIFT_EQ, 3); - l->index += 3; - return true; - } - add_token(l, TOKEN_RSHIFT, 2); - l->index += 2; - } else { - add_token(l, TOKEN_GREATER_THAN, 1); - l->index += 1; - } - return true; - case '<': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_LESS_EQ, 2); - l->index += 2; - } else if (l->source[l->index+1] == '<') { - if (l->source[l->index+2] == '=') { - add_token(l, TOKEN_LSHIFT_EQ, 3); - l->index += 3; - return true; - } - add_token(l, TOKEN_LSHIFT, 2); - l->index += 2; - } else { - add_token(l, TOKEN_LESS_THAN, 1); - l->index += 1; - } - return true; - case '!': - if (l->source[l->index+1] == '=') { - add_token(l, TOKEN_NOT_EQ, 2); - l->index += 2; - } else { - add_token(l, TOKEN_BANG, 1); - l->index += 1; - } - return true; - case ':': - add_token(l, TOKEN_COLON, 1); - l->index += 1; - return true; - case ';': - add_token(l, TOKEN_SEMICOLON, 1); - l->index += 1; - return true; - case '.': - add_token(l, TOKEN_DOT, 1); - l->index += 1; - return true; - case ',': - add_token(l, TOKEN_COMMA, 1); - l->index += 1; - return true; - case '(': - add_token(l, TOKEN_LPAREN, 1); - l->index += 1; - return true; - case ')': - add_token(l, TOKEN_RPAREN, 1); - l->index += 1; - return true; - case '[': - add_token(l, TOKEN_LSQUARE, 1); - l->index += 1; - return true; - case ']': - add_token(l, TOKEN_RSQUARE, 1); - l->index += 1; - return true; - case '{': - add_token(l, TOKEN_LCURLY, 1); - l->index += 1; - return true; - case '}': - add_token(l, TOKEN_RCURLY, 1); - l->index += 1; - return true; - case '\'': - if (l->source[l->index+1] == '\\') { - if (l->source[l->index+3] != '\'') { - add_error(l, "unclosed character literal."); - return true; - } - l->index += 1; - add_token(l, TOKEN_CHAR, 2); - l->index += 3; - return true; - } else { - if 
(l->source[l->index+2] != '\'') { - add_error(l, "unclosed character literal."); - return false; - } - l->index += 1; - add_token(l, TOKEN_CHAR, 1); - l->index += 2; - return true; - } - default: - return false; - } -} - -static void parse(lexer *l) -{ - char c; - - while (l->index <= l->size) { - c = l->source[l->index]; - l->column += 1; - - if (c == '\n') { - l->index += 1; - l->row += 1; - l->column = 0; - continue; - } - - if (isspace(c)) { - l->index += 1; - continue; - } - - usize head = l->index; - - if (parse_special(l)) { - l->column += (l->index - head - 1); - continue; - } - - if (isdigit(c)) { - parse_number(l); - l->column += (l->index - head - 1); - continue; - } - - if (isalpha(c)) { - parse_identifier(l); - l->column += (l->index - head - 1); - continue; - } - - if (c == '"') { - l->index += 1; - parse_string(l); - l->column += (l->index - head - 1); - continue; - } - - l->index += 1; - } -} - -lexer *lexer_init(char *source, usize size, arena *arena) -{ - lexer *lex = arena_alloc(arena, sizeof(lexer)); - lex->column = 0; - lex->row = 0; - lex->index = 0; - lex->size = size; - lex->tokens = 0; - lex->tail = 0; - lex->allocator = arena; - lex->source = source; - - keywords = arena_alloc(arena, sizeof(trie_node)); - trie_insert(keywords, lex->allocator, "while", TOKEN_WHILE); - trie_insert(keywords, lex->allocator, "for", TOKEN_FOR); - trie_insert(keywords, lex->allocator, "goto", TOKEN_GOTO); - trie_insert(keywords, lex->allocator, "if", TOKEN_IF); - trie_insert(keywords, lex->allocator, "else", TOKEN_ELSE); - trie_insert(keywords, lex->allocator, "switch", TOKEN_SWITCH); - trie_insert(keywords, lex->allocator, "case", TOKEN_CASE); - trie_insert(keywords, lex->allocator, "do", TOKEN_DO); - trie_insert(keywords, lex->allocator, "defer", TOKEN_DEFER); - trie_insert(keywords, lex->allocator, "module", TOKEN_MODULE); - trie_insert(keywords, lex->allocator, "static", TOKEN_STATIC); - trie_insert(keywords, lex->allocator, "const", TOKEN_CONST); - 
trie_insert(keywords, lex->allocator, "extern", TOKEN_EXTERN);
-	trie_insert(keywords, lex->allocator, "volatile", TOKEN_VOLATILE);
-
-	parse(lex);
-
-	return lex;
-}
+hello+3-(--ciao)
diff --git a/utils.c b/utils.c
index 4080730..c8ee4ed 100644
--- a/utils.c
+++ b/utils.c
@@ -3,6 +3,37 @@
 #include 
 #include 
 
+u64 parse_int(char *s, usize len)
+{
+	u64 int_part = 0;
+	for (usize i=0; i < len; i++) {
+		int_part = (int_part * 10) + (s[i] - '0');
+	}
+
+	return int_part;
+}
+
+f64 parse_float(char *s, usize len)
+{
+	usize point_pos = len; /* position of '.', or len if none */
+
+	for (usize i=0; i < len; i++) {
+		if (s[i] == '.') {
+			point_pos = i;
+			break;
+		}
+	}
+
+	f64 value = (f64)parse_int(s, point_pos); /* integer part only: feeding '.' to parse_int poisoned the result ('.'-'0' == -2) */
+	f64 scale = 1.0;
+	for (usize i = point_pos + 1; i < len; i++) {
+		scale /= 10.0;
+		value += (f64)(s[i] - '0') * scale;
+	}
+
+	return value;
+}
+
+
 void trie_insert(trie_node *root, arena *a, char *key, uint16_t value)
 {
 	trie_node *node = root;
diff --git a/utils.h b/utils.h
index 455e9e3..6eac1de 100644
--- a/utils.h
+++ b/utils.h
@@ -20,6 +20,9 @@ typedef size_t usize;
 typedef float f32;
 typedef double f64;
 
+u64 parse_int(char *s, usize len);
+f64 parse_float(char *s, usize len);
+
 typedef struct {
 	usize capacity;
 	usize position;