256 lines
7.7 KiB
C
256 lines
7.7 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "tokeniser.h"
|
|
#include "dyn_arr.h"
|
|
#include "logger.h"
|
|
#include "token.h"
|
|
|
|
DEFINE_DA_IMPL(token, token_t)
|
|
|
|
#define TZ_TOK_PUSH(_loc, _type, _text) mcl_da_token_push(&tz->tokens, (token_t){.loc = (_loc), .type = (_type), .text=(_text)})
|
|
|
|
static struct {
|
|
char* k;
|
|
token_type_t v;
|
|
} KEYWORDS[] = {
|
|
{"fn", TT_KW_FN },
|
|
{"return", TT_KW_RETURN},
|
|
{"for", TT_KW_FOR},
|
|
{"if", TT_KW_IF},
|
|
{"else", TT_KW_ELSE},
|
|
{"enum", TT_KW_ENUM},
|
|
{"struct", TT_KW_STRUCT},
|
|
{"while", TT_KW_WHILE},
|
|
{"break", TT_KW_BREAK},
|
|
{"continue", TT_KW_CONTINUE}
|
|
};
|
|
|
|
tokeniser_t* tokenise(char* file) {
|
|
tokeniser_t* tz = malloc(sizeof(tokeniser_t));
|
|
tz->loc.file = file;
|
|
tz->loc.line = 1;
|
|
tz->loc.col = 1;
|
|
FILE* f = fopen(file, "r");
|
|
if (!f) {
|
|
printf("Could not open file %s\n", file);
|
|
return NULL;
|
|
}
|
|
|
|
char c;
|
|
while ((c = fgetc(f)) != EOF) {
|
|
switch(c) {
|
|
case '\t':
|
|
case '\r':
|
|
case ' ':{
|
|
tz->loc.col += 1;
|
|
} break;
|
|
|
|
case '\n': {
|
|
tz->loc.col = 1;
|
|
tz->loc.line++;
|
|
} break;
|
|
|
|
case '"': {
|
|
loc_t loc = tz->loc;
|
|
int size = 256;
|
|
int i = 0;
|
|
char* buf = malloc(size * sizeof(char));
|
|
while ((c = fgetc(f)) != EOF) {
|
|
tz->loc.col++;
|
|
if (size <= strlen(buf) - 1) {
|
|
buf = realloc(buf, size *= 2);
|
|
}
|
|
if (c == '\n') {
|
|
mcl_log_loc(ERROR, &tz->loc, "No newlines in strings");
|
|
return NULL;
|
|
}
|
|
|
|
if (c == '"') break;
|
|
if (c == '\\') {
|
|
switch (c = fgetc(f)) {
|
|
case 'n': c = '\n'; break;
|
|
case '\\': break;
|
|
}
|
|
}
|
|
|
|
buf[i++] = c;
|
|
}
|
|
TZ_TOK_PUSH(loc, TT_STR, buf);
|
|
} break;
|
|
|
|
case '\'': {
|
|
loc_t loc = tz->loc;
|
|
bool escape = false;
|
|
char c = fgetc(f);
|
|
tz->loc.col += 1;
|
|
char* buf = malloc(1 * sizeof(char));
|
|
if (c == '\\') {
|
|
c = fgetc(f);
|
|
tz->loc.col += 1;
|
|
switch (c) {
|
|
case 'n': *buf = '\n'; break;
|
|
default:
|
|
mcl_log_loc(ERROR, &tz->loc, "Unknown escape: \\%c\n", c);
|
|
return NULL;
|
|
}
|
|
|
|
} else {
|
|
tz->loc.col += 1;
|
|
*buf = c;
|
|
}
|
|
|
|
c = fgetc(f);
|
|
if (c != '\'') {
|
|
mcl_log_loc(ERROR, &tz->loc, "Expected \"'\" found \"%c\"", c);
|
|
return NULL;
|
|
}
|
|
TZ_TOK_PUSH(loc, TT_CHR, buf);
|
|
|
|
}
|
|
|
|
|
|
case 'a': case 'A':
|
|
case 'b': case 'B':
|
|
case 'c': case 'C':
|
|
case 'd': case 'D':
|
|
case 'e': case 'E':
|
|
case 'f': case 'F':
|
|
case 'g': case 'G':
|
|
case 'h': case 'H':
|
|
case 'i': case 'I':
|
|
case 'j': case 'J':
|
|
case 'k': case 'K':
|
|
case 'l': case 'L':
|
|
case 'm': case 'M':
|
|
case 'n': case 'N':
|
|
case 'o': case 'O':
|
|
case 'p': case 'P':
|
|
case 'q': case 'Q':
|
|
case 'r': case 'R':
|
|
case 's': case 'S':
|
|
case 't': case 'T':
|
|
case 'u': case 'U':
|
|
case 'v': case 'V':
|
|
case 'w': case 'W':
|
|
case 'y': case 'Y':
|
|
case 'z': case 'Z':
|
|
case '_': {
|
|
loc_t loc = tz->loc;
|
|
int size = 256;
|
|
int i = 1;
|
|
char* buf = malloc(size * sizeof(char));
|
|
buf[0] = c;
|
|
while ((c = fgetc(f)) != EOF) {
|
|
if (!( (c >= 'A' && c <= 'Z') ||
|
|
(c >= 'a' && c <= 'z') ||
|
|
(c >= '0' && c <= '9') ||
|
|
(c == '_')
|
|
)) {
|
|
tz->loc.col++;
|
|
ungetc(c, f);
|
|
break;
|
|
};
|
|
if (size <= strlen(buf) - 1) {
|
|
buf = realloc(buf, size *= 2);
|
|
}
|
|
|
|
tz->loc.col++;
|
|
buf[i++] = c;
|
|
}
|
|
bool found = false;
|
|
for (int i = 0; i < sizeof(KEYWORDS)/sizeof(KEYWORDS[0]); i++) {
|
|
if (strcmp(buf, KEYWORDS[i].k) == 0) {
|
|
TZ_TOK_PUSH(loc, KEYWORDS[i].v, NULL);
|
|
found = true;
|
|
}
|
|
}
|
|
|
|
if (!found) TZ_TOK_PUSH(loc, TT_IDENT, buf);
|
|
|
|
} break;
|
|
|
|
case '}': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '{': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_L, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case ']': {
|
|
TZ_TOK_PUSH(tz->loc, TT_BRACK_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '[': {
|
|
TZ_TOK_PUSH(tz->loc, TT_BRACK_L, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case ')': {
|
|
TZ_TOK_PUSH(tz->loc, TT_PAREN_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '(': {
|
|
TZ_TOK_PUSH(tz->loc, TT_PAREN_L, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case ':': {
|
|
TZ_TOK_PUSH(tz->loc, TT_COLON, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case ';': {
|
|
TZ_TOK_PUSH(tz->loc, TT_SEMI, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case ',': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '.': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '&': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '*': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '+': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '-': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '/': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '|': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '=': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '<': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
case '>': {
|
|
TZ_TOK_PUSH(tz->loc, TT_CURLY_R, NULL);
|
|
tz->loc.col++;
|
|
} break;
|
|
}
|
|
}
|
|
return tz;
|
|
}
|
|
|
|
|