From 6322f2f93ca52588f022c5bdcc61194a6be08aa5 Mon Sep 17 00:00:00 2001
From: MCorange
Date: Wed, 24 Jul 2024 21:06:48 +0300
Subject: [PATCH] Uwu nya lexer has bween made :3

---
 src/include/dyn_arr.h   |  10 +-
 src/include/loc.h       |  13 ++
 src/include/token.h     |  41 +++++++
 src/include/tokeniser.h |  16 +++
 src/main.c              |  16 +++
 src/token.c             |  72 +++++++++++++
 src/tokeniser.c         | 220 ++++++++++++++++++++++++++++++++++++++++
 test.mcl                |   6 +
 8 files changed, 389 insertions(+), 5 deletions(-)
 create mode 100644 src/include/loc.h
 create mode 100644 src/include/token.h
 create mode 100644 src/token.c
 create mode 100644 test.mcl

diff --git a/src/include/dyn_arr.h b/src/include/dyn_arr.h
index 66c1097..6355b05 100644
--- a/src/include/dyn_arr.h
+++ b/src/include/dyn_arr.h
@@ -9,21 +9,21 @@ typedef struct mcl_da_##name##_s { \
     size_t count; \
     size_t capacity; \
 } mcl_da_##name##_t; \
-type mcl_da_##name##_pop(mcl_da_##name##_t* da); \
+type* mcl_da_##name##_pop(mcl_da_##name##_t* da); \
 void mcl_da_##name##_push(mcl_da_##name##_t* da, type item); \
 void mcl_da_##name##_free(mcl_da_##name##_t* da); \
 
 #define DEFINE_DA_IMPL(name, type) \
-type mcl_da_##name##_pop(mcl_da_##name##_t* da) { \
+type* mcl_da_##name##_pop(mcl_da_##name##_t* da) { \
     if (da->capacity <= 0 || da->count <= 0) \
         return NULL; \
     if (da->count < da->capacity / 2) { \
         da->capacity /= 2; \
         da->items = realloc(da->items, \
-            da->capacity * sizeof(type)); \
+                            da->capacity * sizeof(type)); \
         assert(da->items && "Out of memory"); \
     } \
-    return da->items[(da->count--) - 1]; \
+    return &da->items[(da->count--) - 1]; \
 } \
 void mcl_da_##name##_push(mcl_da_##name##_t* da, type item) { \
     if (da->capacity <= da->count) { \
@@ -33,7 +33,7 @@ void mcl_da_##name##_push(mcl_da_##name##_t* da, type item) { \
             da->capacity *= 2; \
         } \
         da->items = realloc(da->items, \
-            da->capacity * sizeof(type)); \
+                            da->capacity * sizeof(type)); \
         assert(da->items && "Out of memory"); \
     } \
     da->items[da->count++] = item; \
diff --git a/src/include/loc.h b/src/include/loc.h
new file mode 100644
index 0000000..ae822e9
--- /dev/null
+++ b/src/include/loc.h
@@ -0,0 +1,13 @@
+#ifndef _H_MCL_LOC
+#define _H_MCL_LOC
+
+#include <stddef.h>
+
+// A position in a source file.
+typedef struct loc_s {
+    char* file;  // path as handed to the tokeniser (the pointer is not copied)
+    size_t col;  // column within the line
+    size_t line; // line number
+} loc_t;
+
+#endif
diff --git a/src/include/token.h b/src/include/token.h
new file mode 100644
index 0000000..a697c19
--- /dev/null
+++ b/src/include/token.h
@@ -0,0 +1,41 @@
+
+#ifndef _H_MCL_TOKEN
+#define _H_MCL_TOKEN
+
+#include "loc.h"
+typedef enum token_type_e {
+    TT_IDENT,   // identifier
+    TT_STR,     // "*"
+    TT_CHR,     // '*'
+    TT_CURLY_R, // }
+    TT_CURLY_L, // {
+    TT_BRACK_R, // ]
+    TT_BRACK_L, // [
+    TT_PAREN_R, // )
+    TT_PAREN_L, // (
+    TT_COLON,   // :
+    TT_SEMI,    // ;
+    TT_COMMA,   // ,
+    TT_DOT,     // .
+    TT_AMP,     // &
+    TT_STAR,    // *
+    TT_PLUS,    // +
+    TT_DASH,    // -
+    TT_FSLASH,  // /
+    TT_BAR,     // |
+    TT_EQ,      // =
+    TT_LT,      // <
+    TT_GT,      // >
+} token_type_t;
+
+
+typedef struct token_s {
+    token_type_t type;
+    char* text; // heap text for TT_IDENT/TT_STR/TT_CHR, NULL for punctuation
+    loc_t loc;
+} token_t;
+
+// Printable form of a token; the result must be treated as read-only.
+char* token_to_string(token_t* tt);
+
+#endif
diff --git a/src/include/tokeniser.h b/src/include/tokeniser.h
index 7bc4266..fbfc703 100644
--- a/src/include/tokeniser.h
+++ b/src/include/tokeniser.h
@@ -1,5 +1,21 @@
 #ifndef _H_MCL_TOKENSIER
 #define _H_MCL_TOKENSIER
 
+#include <assert.h>
+#include <stdlib.h>
+
+#include "dyn_arr.h"
+#include "loc.h"
+#include "token.h"
+
+DEFINE_DA(token, token_t)
+
+typedef struct tokeniser_s {
+    mcl_da_token_t tokens;
+    loc_t loc;
+} tokeniser_t;
+
+// Tokenise the file at the given path; returns NULL on failure.
+tokeniser_t* tokenise(char* file);
 
 #endif
diff --git a/src/main.c b/src/main.c
index 56d547b..777442d 100644
--- a/src/main.c
+++ b/src/main.c
@@ -3,15 +3,31 @@
 #include <stdio.h>
 #include "cliargs.h"
 #include "dyn_arr.h"
+#include "token.h"
+#include "tokeniser.h"
 
 int main(int argc, char** argv) {
     cliargs_t* cliargs = parse_cliargs(argc, argv);
 
     printf("Hewo world :33\n");
     printf("Output file: %s\n", cliargs->output);
+
     MCL_DA_FOR_IN(char*, &cliargs->input, file, {
         printf("Input file: %s\n", file);
+        tokeniser_t* tokeniser = tokenise(file);
+        if (!tokeniser) {
+            printf("Failed to tokenise\n");
+            return 1;
+        }
+        MCL_DA_FOR_IN(token_t, &tokeniser->tokens, token, {
+            printf("%s:%zu:%zu: %s\n",
+                token.loc.file,
+                token.loc.line,
+                token.loc.col,
+                token_to_string(&token));
+        });
     });
+
 
     return 0;
 }
diff --git a/src/token.c b/src/token.c
new file mode 100644
index 0000000..3de450a
--- /dev/null
+++ b/src/token.c
@@ -0,0 +1,72 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "token.h"
+
+/*
+ * Render a token as printable text.
+ *
+ * Identifiers return t->text itself; string and character literals return a
+ * freshly malloc'd buffer (or a placeholder literal if that allocation
+ * fails); every other token returns a string literal. Because ownership
+ * differs per token type, treat the result as read-only.
+ */
+char* token_to_string(token_t* t) {
+    char* buf = NULL;
+    size_t len = 0;
+    switch(t->type) {
+        case TT_IDENT:
+            return t->text;
+        case TT_STR:
+            len = strlen(t->text) + 3; // two quotes and the terminator
+            buf = malloc(len);
+            if (!buf) return "<out of memory>";
+            snprintf(buf, len, "\"%s\"", t->text);
+            return buf;
+        case TT_CHR:
+            buf = malloc(4); // quote, character, quote, terminator
+            if (!buf) return "<out of memory>";
+            snprintf(buf, 4, "'%c'", *t->text);
+            return buf;
+        case TT_CURLY_R:
+            return "}";
+        case TT_CURLY_L:
+            return "{";
+        case TT_BRACK_R:
+            return "]";
+        case TT_BRACK_L:
+            return "[";
+        case TT_PAREN_R:
+            return ")";
+        case TT_PAREN_L:
+            return "(";
+        case TT_COLON:
+            return ":";
+        case TT_SEMI:
+            return ";";
+        case TT_COMMA:
+            return ",";
+        case TT_DOT:
+            return ".";
+        case TT_AMP:
+            return "&";
+        case TT_STAR:
+            return "*";
+        case TT_PLUS:
+            return "+";
+        case TT_DASH:
+            return "-";
+        case TT_FSLASH:
+            return "/";
+        case TT_BAR:
+            return "|";
+        case TT_EQ:
+            return "=";
+        case TT_LT:
+            return "<";
+        case TT_GT:
+            return ">";
+    }
+    // Unknown type: a non-void function must not fall off its end.
+    return "<unknown>";
+}
diff --git a/src/tokeniser.c b/src/tokeniser.c
index 4885b56..802dc63 100644
--- a/src/tokeniser.c
+++ b/src/tokeniser.c
@@ -1,4 +1,224 @@
+#include <stdio.h>
+#include <stdlib.h>
 #include "tokeniser.h"
+#include "dyn_arr.h"
+#include "token.h"
+
+DEFINE_DA_IMPL(token, token_t)
+
+#define TZ_TOK_PUSH(_loc, _type, _text) mcl_da_token_push(&tz->tokens, (token_t){.loc = (_loc), .type = (_type), .text=(_text)})
+
+// Initial capacity of the buffer that collects a token's text.
+#define TZ_TEXT_INIT_CAP 64
+
+// True for characters that may start an identifier: [A-Za-z_].
+static int tz_is_ident_start(int c) {
+    return (c >= 'A' && c <= 'Z') ||
+           (c >= 'a' && c <= 'z') ||
+           (c == '_');
+}
+
+// True for characters that may continue an identifier: [A-Za-z0-9_].
+static int tz_is_ident_char(int c) {
+    return tz_is_ident_start(c) || (c >= '0' && c <= '9');
+}
+
+// Map the character after a backslash to the character it denotes,
+// or return -1 if the lexer does not know the escape.
+static int tz_unescape(int c) {
+    switch (c) {
+        case 'n':  return '\n';
+        case 't':  return '\t';
+        case 'r':  return '\r';
+        case '\\': return '\\';
+        case '\'': return '\'';
+        case '"':  return '"';
+        default:   return -1;
+    }
+}
+
+// Allocate an empty, NUL-terminated text buffer. Returns NULL on OOM.
+static char* tz_text_new(size_t* len, size_t* cap) {
+    *len = 0;
+    *cap = TZ_TEXT_INIT_CAP;
+    return calloc(*cap, 1);
+}
+
+// Append c to *buf, doubling the buffer when it is full and keeping it
+// NUL-terminated. Returns 0 on success; on OOM returns -1 and leaves
+// *buf valid so the caller can still free it.
+static int tz_text_push(char** buf, size_t* len, size_t* cap, char c) {
+    if (*len + 2 > *cap) { // need room for c and the terminator
+        size_t new_cap = *cap * 2;
+        char* grown = realloc(*buf, new_cap);
+        if (!grown) return -1;
+        *buf = grown;
+        *cap = new_cap;
+    }
+    (*buf)[(*len)++] = c;
+    (*buf)[*len] = '\0';
+    return 0;
+}
+
+/*
+ * Read `file` and split it into tokens.
+ *
+ * Returns a heap-allocated tokeniser whose tokens array holds the tokens in
+ * source order, or NULL (after printing a message) if the file cannot be
+ * opened, memory runs out, or a string/character literal is malformed.
+ * Each token's loc is the 1-based line and column of its first character;
+ * loc.file aliases `file`, so `file` must outlive the result. The text of
+ * identifier, string and character tokens is malloc'd and NUL-terminated;
+ * punctuation tokens have a NULL text.
+ */
+tokeniser_t* tokenise(char* file) {
+    FILE* f = fopen(file, "r");
+    if (!f) {
+        printf("Could not open file %s\n", file);
+        return NULL;
+    }
+
+    // calloc: the token array and the column counter must start zeroed.
+    tokeniser_t* tz = calloc(1, sizeof(tokeniser_t));
+    if (!tz) {
+        printf("ERROR: Out of memory\n");
+        fclose(f);
+        return NULL;
+    }
+    tz->loc.file = file;
+    tz->loc.line = 1;
+
+    char* buf = NULL; // text being collected; ours until pushed as a token
+    size_t len = 0;
+    size_t cap = 0;
+    int c; // int rather than char so EOF cannot collide with a data byte
+    while ((c = fgetc(f)) != EOF) {
+        tz->loc.col++;
+        loc_t start = tz->loc; // position of the token that begins at c
+        switch (c) {
+            case ' ':
+            case '\t':
+            case '\r': break;
+
+            case '\n': {
+                tz->loc.col = 0;
+                tz->loc.line++;
+            } break;
+
+            case '"': {
+                int closed = 0;
+                buf = tz_text_new(&len, &cap);
+                if (!buf) goto oom;
+                while ((c = fgetc(f)) != EOF) {
+                    int escaped = (c == '\\');
+                    if (escaped) {
+                        tz->loc.col++;
+                        c = fgetc(f);
+                    }
+                    if (c == EOF) break;
+                    if (c == '\n') {
+                        printf("ERROR: Newline in string\n");
+                        goto fail;
+                    }
+                    tz->loc.col++;
+                    if (escaped) {
+                        // Unknown escapes stand for the escaped character.
+                        int esc = tz_unescape(c);
+                        if (esc >= 0) c = esc;
+                    } else if (c == '"') {
+                        closed = 1;
+                        break;
+                    }
+                    if (tz_text_push(&buf, &len, &cap, (char)c) != 0) goto oom;
+                }
+                if (!closed) {
+                    printf("ERROR: Unterminated string\n");
+                    goto fail;
+                }
+                TZ_TOK_PUSH(start, TT_STR, buf);
+                buf = NULL; // the token owns the text now
+            } break;
+
+            case '\'': {
+                int ch = fgetc(f);
+                size_t width = 2; // columns consumed after the opening quote
+                if (ch == '\\') {
+                    int raw = fgetc(f);
+                    ch = (raw == EOF) ? EOF : tz_unescape(raw);
+                    if (raw != EOF && ch < 0) {
+                        printf("ERROR: Unknown escape: \\%c\n", raw);
+                        goto fail;
+                    }
+                    width = 3;
+                } else if (ch == '\n' || ch == '\'') {
+                    ch = EOF; // empty or broken literal, reported below
+                }
+                // The literal must be closed by a quote right after its character.
+                if (ch == EOF || fgetc(f) != '\'') {
+                    printf("ERROR: Malformed character literal\n");
+                    goto fail;
+                }
+                tz->loc.col += width;
+                buf = tz_text_new(&len, &cap);
+                if (!buf || tz_text_push(&buf, &len, &cap, (char)ch) != 0) goto oom;
+                TZ_TOK_PUSH(start, TT_CHR, buf);
+                buf = NULL;
+            } break;
+
+            case '}': TZ_TOK_PUSH(start, TT_CURLY_R, NULL); break;
+            case '{': TZ_TOK_PUSH(start, TT_CURLY_L, NULL); break;
+            case ']': TZ_TOK_PUSH(start, TT_BRACK_R, NULL); break;
+            case '[': TZ_TOK_PUSH(start, TT_BRACK_L, NULL); break;
+            case ')': TZ_TOK_PUSH(start, TT_PAREN_R, NULL); break;
+            case '(': TZ_TOK_PUSH(start, TT_PAREN_L, NULL); break;
+            case ':': TZ_TOK_PUSH(start, TT_COLON, NULL); break;
+            case ';': TZ_TOK_PUSH(start, TT_SEMI, NULL); break;
+            case ',': TZ_TOK_PUSH(start, TT_COMMA, NULL); break;
+            case '.': TZ_TOK_PUSH(start, TT_DOT, NULL); break;
+            case '&': TZ_TOK_PUSH(start, TT_AMP, NULL); break;
+            case '*': TZ_TOK_PUSH(start, TT_STAR, NULL); break;
+            case '+': TZ_TOK_PUSH(start, TT_PLUS, NULL); break;
+            case '-': TZ_TOK_PUSH(start, TT_DASH, NULL); break;
+            case '/': TZ_TOK_PUSH(start, TT_FSLASH, NULL); break;
+            case '|': TZ_TOK_PUSH(start, TT_BAR, NULL); break;
+            case '=': TZ_TOK_PUSH(start, TT_EQ, NULL); break;
+            case '<': TZ_TOK_PUSH(start, TT_LT, NULL); break;
+            case '>': TZ_TOK_PUSH(start, TT_GT, NULL); break;
+
+            default: {
+                // Characters outside the token set (digits, '!', ...) are skipped.
+                if (!tz_is_ident_start(c)) break;
+                buf = tz_text_new(&len, &cap);
+                if (!buf || tz_text_push(&buf, &len, &cap, (char)c) != 0) goto oom;
+                while ((c = fgetc(f)) != EOF) {
+                    if (!tz_is_ident_char(c)) {
+                        ungetc(c, f);
+                        break;
+                    }
+                    tz->loc.col++;
+                    if (tz_text_push(&buf, &len, &cap, (char)c) != 0) goto oom;
+                }
+                TZ_TOK_PUSH(start, TT_IDENT, buf);
+                buf = NULL;
+            } break;
+        }
+    }
+    fclose(f);
+    return tz;
+
+oom:
+    printf("ERROR: Out of memory\n");
+fail:
+    // Release everything built so far, including the text of pushed tokens.
+    free(buf);
+    for (size_t i = 0; i < tz->tokens.count; i++) {
+        free(tz->tokens.items[i].text);
+    }
+    free(tz->tokens.items);
+    free(tz);
+    fclose(f);
+    return NULL;
+}
 
 
 
diff --git a/test.mcl b/test.mcl
new file mode 100644
index 0000000..25593cb
--- /dev/null
+++ b/test.mcl
@@ -0,0 +1,6 @@
+
+
+
+main :: fn(argc: i32, argv: string[]) -> i32 {
+    println!("Hello world!\n");
+}