/* mclangc/src/tokeniser.c */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tokeniser.h"
#include "dyn_arr.h"
#include "logger.h"
#include "token.h"
/* Instantiate the dynamic-array implementation for token_t elements.
 * NOTE(review): presumably this is what provides mcl_da_token_push used
 * below; the macro lives in dyn_arr.h, which is not visible here. */
DEFINE_DA_IMPL(token, token_t)

/* Append a token to the current tokeniser's token list.
 * Relies on a variable named `tz` (pointer to the tokeniser) being in scope
 * at the expansion site. */
#define TZ_TOK_PUSH(_loc, _type, _text)        \
    mcl_da_token_push(&tz->tokens,             \
                      (token_t){               \
                          .loc = (_loc),       \
                          .type = (_type),     \
                          .text = (_text)      \
                      })
/* Reserved words of the language, mapped to the token type emitted for each.
 * The table is read-only: the spellings are string literals, so they are
 * held through pointers-to-const and the array itself is const. */
static const struct {
    const char *k;   /* keyword spelling as written in source text */
    token_type_t v;  /* token type produced when an identifier matches k */
} KEYWORDS[] = {
    {"fn",       TT_KW_FN},
    {"return",   TT_KW_RETURN},
    {"for",      TT_KW_FOR},
    {"if",       TT_KW_IF},
    {"else",     TT_KW_ELSE},
    {"enum",     TT_KW_ENUM},
    {"struct",   TT_KW_STRUCT},
    {"while",    TT_KW_WHILE},
    {"break",    TT_KW_BREAK},
    {"continue", TT_KW_CONTINUE}
};
tokeniser_t* tokenise(char* file) {
tokeniser_t* tz = malloc(sizeof(tokeniser_t));
tz->loc.file = file;
tz->loc.line = 1;
tz->loc.col = 1;
FILE* f = fopen(file, "r");
if (!f) {
printf("Could not open file %s\n", file);
return NULL;
}
char c;
while ((c = fgetc(f)) != EOF) {
switch(c) {
case '\t':
case '\r':
case ' ':{
tz->loc.col += 1;
} break;
case '\n': {
tz->loc.col = 1;
tz->loc.line++;
} break;
case '"': {
loc_t loc = tz->loc;
int size = 256;
int i = 0;
char* buf = malloc(size * sizeof(char));
while ((c = fgetc(f)) != EOF) {
tz->loc.col++;
if (size <= strlen(buf) - 1) {
buf = realloc(buf, size *= 2);
}
if (c == '\n') {
mcl_log_loc(ERROR, &tz->loc, "No newlines in strings");
return NULL;
}
if (c == '"') break;
if (c == '\\') {
switch (c = fgetc(f)) {
case 'n': c = '\n'; break;
case '\\': break;
}
}
buf[i++] = c;
}
TZ_TOK_PUSH(loc, TT_STR, buf);
} break;
case '\'': {
loc_t loc = tz->loc;
bool escape = false;
char c = fgetc(f);
tz->loc.col += 1;
char* buf = malloc(1 * sizeof(char));
if (c == '\\') {
c = fgetc(f);
tz->loc.col += 1;
switch (c) {
case 'n': *buf = '\n'; break;
default:
mcl_log_loc(ERROR, &tz->loc, "Unknown escape: \\%c\n", c);
return NULL;
}
} else {
tz->loc.col += 1;
*buf = c;
}
c = fgetc(f);
if (c != '\'') {
mcl_log_loc(ERROR, &tz->loc, "Expected \"'\" found \"%c\"", c);
return NULL;
}
TZ_TOK_PUSH(loc, TT_CHR, buf);
}
case 'a': case 'A':
case 'b': case 'B':
case 'c': case 'C':
case 'd': case 'D':
case 'e': case 'E':
case 'f': case 'F':
case 'g': case 'G':
case 'h': case 'H':
case 'i': case 'I':
case 'j': case 'J':
case 'k': case 'K':
case 'l': case 'L':
case 'm': case 'M':
case 'n': case 'N':
case 'o': case 'O':
case 'p': case 'P':
case 'q': case 'Q':
case 'r': case 'R':
case 's': case 'S':
case 't': case 'T':
case 'u': case 'U':
case 'v': case 'V':
case 'w': case 'W':
case 'y': case 'Y':
case 'z': case 'Z':
case '_': {
loc_t loc = tz->loc;
int size = 256;
int i = 1;
char* buf = malloc(size * sizeof(char));
buf[0] = c;
while ((c = fgetc(f)) != EOF) {
if (!( (c >= 'A' && c <= 'Z') ||
(c >= 'a' && c <= 'z') ||
(c >= '0' && c <= '9') ||
(c == '_')
)) {
tz->loc.col++;
ungetc(c, f);
break;
};
if (size <= strlen(buf) - 1) {
buf = realloc(buf, size *= 2);
}
tz->loc.col++;
buf[i++] = c;
}
bool found = false;
for (int i = 0; i < sizeof(KEYWORDS)/sizeof(KEYWORDS[0]); i++) {
if (strcmp(buf, KEYWORDS[i].k) == 0) {
TZ_TOK_PUSH(loc, KEYWORDS[i].v, NULL);
found = true;
}
}
if (!found) TZ_TOK_PUSH(loc, TT_IDENT, buf);
} break;
case '}': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '{': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_L, NULL);
tz->loc.col++;
} break;
case ']': {
TZ_TOK_PUSH(tz->loc, TT_BRACK_R, NULL);
tz->loc.col++;
} break;
case '[': {
TZ_TOK_PUSH(tz->loc, TT_BRACK_L, NULL);
tz->loc.col++;
} break;
case ')': {
TZ_TOK_PUSH(tz->loc, TT_PAREN_R, NULL);
tz->loc.col++;
} break;
case '(': {
TZ_TOK_PUSH(tz->loc, TT_PAREN_L, NULL);
tz->loc.col++;
} break;
case ':': {
TZ_TOK_PUSH(tz->loc, TT_COLON, NULL);
tz->loc.col++;
} break;
case ';': {
TZ_TOK_PUSH(tz->loc, TT_SEMI, NULL);
tz->loc.col++;
} break;
case ',': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '.': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '&': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '*': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '+': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '-': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '/': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '|': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '=': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '<': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
case '>': {
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
tz->loc.col++;
} break;
}
}
return tz;
}