This commit is contained in:
2024-12-21 03:22:07 +02:00
commit 54b6df5862
31 changed files with 2217 additions and 0 deletions

292
src/tokeniser/mod.rs Normal file
View File

@@ -0,0 +1,292 @@
use std::{collections::HashMap, fmt::Display};
use anyhow::bail;
use parse_int::parse;
use crate::{common::{loc::LocIncr, Loc}, error, lerror};
pub mod tokentype;
use tokentype::*;
/// A single lexed token: its kind together with a source location.
#[derive(Debug, Clone)]
pub struct Token {
// Location handed to `Token::new`; the token stores its own clone of it.
loc: Loc,
// Kind of the token, including any payload (identifier text, number, ...).
tt: TokenType,
}
impl Display for Token {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}: {:?}", self.loc(), self.tt())
}
}
impl Token {
fn new(tt: TokenType, loc: &Loc) -> Self {
Self {
tt, loc: loc.clone()
}
}
pub fn loc(&self) -> &Loc {
&self.loc
}
pub fn tt(&self) -> &TokenType {
&self.tt
}
}
pub fn tokenise(s: &str) -> anyhow::Result<Vec<Token>> {
let mut loc = Loc::default();
let mut tokens = Vec::new();
let chars: Vec<_> = s.chars().collect();
let mut chars = chars.iter().peekable();
while let Some(c) = chars.next() {
loc.inc_col();
match c {
' ' | '\t' => (),
'/' if chars.peek() == Some(&&'/') => {
let mut buf = String::new();
chars.next();
while let Some(c) = chars.next_if(|c| !matches!(c, '\n' | '\r')) {
loc.inc_col();
buf.push(*c);
}
// tokens.push(Token::new(TokenType::Comment(Comment::Line(buf.clone())), &loc));
},
'/' if chars.peek() == Some(&&'*') => {
let mut buf = String::new();
chars.next();
while let Some(c) = chars.peek() {
if matches!(c, '\n' | '\r') {
loc.inc_line();
} else {
loc.inc_col();
}
let c = *chars.next().expect("Unreachable");
if c == '*' && matches!(chars.peek(), Some(&&'/') | None) {
chars.next();
break;
}
buf.push(c);
}
// tokens.push(Token::new(TokenType::Comment(Comment::Line(buf.clone())), &loc));
}
'\n' => loc.inc_line(),
'"' | '\'' |
'c' if *c != 'c' || chars.peek() == Some(&&'"') => {
let str_typ = *c;
let mut sc = *c;
if *c == 'c' {
sc = '"';
chars.peek();
}
let mut last = '\0';
let mut buf = String::new();
while let Some(c) = chars.next_if(|v| **v != '\n') {
loc.inc_col();
if *c == sc && last != '\\' {
break;
}
buf.push(*c);
last = *c;
}
match str_typ {
'"' => {
tokens.push(Token::new(TokenType::string(&buf, false), &loc));
}
'c' => {
tokens.push(Token::new(TokenType::string(&buf, true), &loc));
}
'\'' => {
let buf = buf
.replace("\\n", "\n")
.replace("\\r", "\r");
if buf.len() > 1 {
lerror!(&loc, "Chars can only have 1 byte");
bail!("")
}
tokens.push(Token::new(TokenType::char(buf.chars().nth(0).unwrap()), &loc));
}
_ => unreachable!()
}
}
'a'..='z' | 'A'..='Z' | '_' => {
let mut buf = String::new();
buf.push(*c);
while let Some(c) = chars.next_if(|v| matches!(**v, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')) {
loc.inc_col();
buf.push(*c);
}
if let Some(kw) = TokenType::from_str(&buf) {
tokens.push(Token::new(kw, &loc));
continue;
}
tokens.push(Token::new(TokenType::ident(&buf), &loc));
buf.clear();
},
'+' | '-' | '0'..='9'
// Checks if its a number an not an operator in disguise
if matches!(c, '0'..='9') || matches!(chars.peek(), Some('0'..='9')) => {
let mut buf = String::new();
buf.push(*c);
let signed = *c == '-';
let mut radix = 10;
match chars.peek() {
Some(v) => {
match v {
'x' => radix = 16,
'b' => radix = 2,
'o' => radix = 8,
_ => (),
}
},
None => {
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
}
}
while let Some(c) = chars.next_if(|v| matches!(**v, '0'..='9' | '.' | 'a'..='f' | 'A'..='F')) {
loc.inc_col();
buf.push(*c);
}
match radix {
2 => {
if buf.strip_prefix("0b").expect("Unreachable")
.chars().filter(|v| !matches!(v, '0' | '1')).collect::<Vec<_>>().len() > 0 {
lerror!(&loc, "Invalid character in binary number");
bail!("")
}
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
}
8 => {
if buf.strip_prefix("0o").expect("Unreachable")
.chars().filter(|v| !matches!(v, '0'..='7')).collect::<Vec<_>>().len() > 0 {
lerror!(&loc, "Invalid character in octal number");
bail!("")
}
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, false), &loc));
}
10 => {
if buf.chars().filter(|v| !matches!(v, '0'..='9' | '.')).collect::<Vec<_>>().len() > 0 {
lerror!(&loc, "Invalid character in decimal number");
bail!("")
}
if buf.contains(".") {
if buf.chars().filter(|v| *v == '.').collect::<Vec<_>>().len() > 1 {
lerror!(&loc, "Floats cant have more than 1 dot");
}
todo!()
}
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
}
16 => {
if buf.strip_prefix("0x").expect("Unreachable")
.chars().filter(|v| !matches!(v, '0'..='9' | 'a'..='f' | 'A'..='F')).collect::<Vec<_>>().len() > 0 {
lerror!(&loc, "Invalid character in hex number");
bail!("")
}
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, false), &loc));
}
_ => unreachable!()
}
buf.clear();
},
_ => {
let mut buf = String::new();
buf.push(*c);
while let Some(c) = chars.peek() {
if let None = TokenType::from_str(&format!("{buf}{c}")) {
break;
}
if let Some(c) = chars.next() {
buf.push(*c);
}
}
if let Some(tt) = TokenType::from_str(&buf) {
tokens.push(Token::new(tt, &loc));
} else {
lerror!(&loc, "Unknown token: {buf}");
}
}
}
}
tokens.reverse();
Ok(tokens)
}
// Lookup table from source spelling to token type for every keyword,
// delimiter and punctuation token.
// Forward lookup (`TokenType::from_str`) is a single hash lookup, so it is fast
// for normal tokenisation. Reverse lookup (`TokenType::to_str`, used for things
// like error messages) walks the entries until one matches, so it is slower.
// NOTE(review): "::" appears twice below (Pathsep and Pathaccess). A map holds
// one value per key, so only one of the two can survive — presumably the later
// entry (Pathaccess), leaving Pathsep without a spelling. Confirm which variant
// the parser expects and drop the other.
lazy_static::lazy_static!(
static ref TT: HashMap<&'static str, TokenType> = [
("fn", TokenType::Keyword(Keyword::Fn)),
("if", TokenType::Keyword(Keyword::If)),
("else", TokenType::Keyword(Keyword::Else)),
("struct", TokenType::Keyword(Keyword::Struct)),
("enum", TokenType::Keyword(Keyword::Enum)),
("type", TokenType::Keyword(Keyword::Type)),
("while", TokenType::Keyword(Keyword::While)),
("for", TokenType::Keyword(Keyword::For)),
("break", TokenType::Keyword(Keyword::Break)),
("continue", TokenType::Keyword(Keyword::Continue)),
("let", TokenType::Keyword(Keyword::Let)),
("const", TokenType::Keyword(Keyword::Const)),
("mut", TokenType::Keyword(Keyword::Mut)),
("static", TokenType::Keyword(Keyword::Static)),
("true", TokenType::Keyword(Keyword::True)),
("false", TokenType::Keyword(Keyword::False)),
("include", TokenType::Keyword(Keyword::Include)),
("extern", TokenType::Keyword(Keyword::Extern)),
("return", TokenType::Keyword(Keyword::Return)),
("loop", TokenType::Keyword(Keyword::Loop)),
("as", TokenType::Keyword(Keyword::As)),
("{", TokenType::Delim(Delimiter::CurlyL)),
("}", TokenType::Delim(Delimiter::CurlyR)),
("[", TokenType::Delim(Delimiter::SquareL)),
("]", TokenType::Delim(Delimiter::SquareR)),
("(", TokenType::Delim(Delimiter::ParenL)),
(")", TokenType::Delim(Delimiter::ParenR)),
(";", TokenType::Punct(Punctuation::Semi)),
(":", TokenType::Punct(Punctuation::Colon)),
("::", TokenType::Punct(Punctuation::Pathsep)),
("->", TokenType::Punct(Punctuation::Arrow)),
("=>", TokenType::Punct(Punctuation::FatArrow)),
("+", TokenType::Punct(Punctuation::Plus)),
("-", TokenType::Punct(Punctuation::Minus)),
(",", TokenType::Punct(Punctuation::Comma)),
("&", TokenType::Punct(Punctuation::Ampersand)),
("*", TokenType::Punct(Punctuation::Star)),
("!", TokenType::Punct(Punctuation::Not)),
("/", TokenType::Punct(Punctuation::Div)),
("%", TokenType::Punct(Punctuation::Mod)),
("<<", TokenType::Punct(Punctuation::Shl)),
(">>", TokenType::Punct(Punctuation::Shr)),
("&&", TokenType::Punct(Punctuation::AndAnd)),
("||", TokenType::Punct(Punctuation::OrOr)),
("|", TokenType::Punct(Punctuation::Or)),
(">", TokenType::Punct(Punctuation::Gt)),
("<", TokenType::Punct(Punctuation::Lt)),
(">=", TokenType::Punct(Punctuation::Ge)),
("<=", TokenType::Punct(Punctuation::Le)),
("^", TokenType::Punct(Punctuation::Xor)),
("+=", TokenType::Punct(Punctuation::AddEq)),
("-=", TokenType::Punct(Punctuation::SubEq)),
("/=", TokenType::Punct(Punctuation::DivEq)),
("*=", TokenType::Punct(Punctuation::MulEq)),
("%=", TokenType::Punct(Punctuation::ModEq)),
("<<=", TokenType::Punct(Punctuation::ShlEq)),
(">>=", TokenType::Punct(Punctuation::ShrEq)),
("&=", TokenType::Punct(Punctuation::AndEq)),
("|=", TokenType::Punct(Punctuation::OrEq)),
("^=", TokenType::Punct(Punctuation::XorEq)),
("=", TokenType::Punct(Punctuation::Eq)),
("==", TokenType::Punct(Punctuation::EqEq)),
("!=", TokenType::Punct(Punctuation::Neq)),
(".", TokenType::Punct(Punctuation::Fieldaccess)),
("::", TokenType::Punct(Punctuation::Pathaccess)),
].into();
);

215
src/tokeniser/tokentype.rs Normal file
View File

@@ -0,0 +1,215 @@
use core::panic;
use std::fmt::Display;
/// An identifier, holding its name exactly as written in the source.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Ident(pub String);

/// Prints the bare identifier name.
///
/// `Display` is implemented rather than `ToString` directly: `to_string()`
/// keeps working through the standard blanket impl, and the identifier can
/// additionally be used in `format!`/`write!` without an intermediate `String`.
impl Display for Ident {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.0.fmt(f)
    }
}
/// An integer literal token.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Number {
// Value of the literal as a machine word.
pub val: usize,
// Radix the literal was written in; `TokenType::to_str` handles 2, 8, 10 and
// 16 and panics on anything else.
pub base: u8,
// When set on a base-10 number, `TokenType::to_str` prints `val` reinterpreted
// as `isize`; `to_str` asserts that it is unset for bases 2, 8 and 16.
pub signed: bool,
}
/// A string literal token.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct TString {
// Text between the quotes, as stored by `TokenType::string`.
pub val: String,
// True for `c"..."` literals; `TokenType::to_str` then shows a trailing `\0`.
pub cstr: bool,
}
/// A character literal token wrapping the `char` it stands for.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Char(char);

/// Unwraps the inner `char`.
///
/// Written as `From<Char> for char` instead of a hand-rolled `Into`: the
/// standard blanket impl still provides `Char::into()`, and `char::from(..)`
/// becomes available as well.
impl From<Char> for char {
    fn from(value: Char) -> Self {
        value.0
    }
}

/// Wraps a `char` into a `Char` token payload.
impl From<char> for Char {
    fn from(value: char) -> Self {
        Char(value)
    }
}
/// Reserved words of the language. The spelling of each keyword lives in the
/// tokeniser's lookup table (`super::TT`).
// Variant order feeds the derived `PartialOrd`/`Ord`; reordering changes comparisons.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Keyword {
Fn, If, Else, Struct, Enum,
Type, While, For, Break, Continue,
Let, Const, Mut, Static,
True, False, Include, Extern, Return,
As, Loop
}
/// Bracketing tokens; the `L`/`R` suffix marks the opening/closing side
/// (`{`/`}`, `[`/`]`, `(`/`)` in the tokeniser's lookup table).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Delimiter {
// `{` and `}`
CurlyL, CurlyR,
// `[` and `]`
SquareL, SquareR,
// `(` and `)`
ParenL, ParenR,
}
/// Operator and separator tokens. Their spellings live in the tokeniser's
/// lookup table (`super::TT`).
// Variant order feeds the derived `PartialOrd`/`Ord`/`Hash`; keep it stable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Punctuation {
    Semi,
    Colon,
    Pathsep,
    Comma,
    Arrow,
    FatArrow,
    Plus,
    Minus,
    Ampersand,
    Star,
    Div,
    Mod,
    Shl,
    Shr,
    AndAnd,
    OrOr,
    Or,
    Xor,
    Not,
    AddEq,
    SubEq,
    DivEq,
    MulEq,
    ModEq,
    ShlEq,
    ShrEq,
    AndEq,
    OrEq,
    XorEq,
    Eq,
    EqEq,
    Fieldaccess,
    Pathaccess,
    Lt,
    Gt,
    Le,
    Ge,
    Neq,
}

impl Punctuation {
    /// Returns the precedence pair of a binary operator, or `None` for
    /// punctuation that has no entry (`;`, `,`, `!`, `.`, `::`, arrows, ...).
    ///
    /// The pairs are consecutive odd/even numbers, `(1, 2)` for the assignment
    /// family up to `(21, 22)` for `||`.
    // NOTE(review): presumably these are left/right binding powers for a
    // Pratt-style expression parser — confirm against the parser, including
    // whether a larger number is meant to bind tighter.
    pub fn precedence(&self) -> Option<(usize, usize)> {
        // Position of the operator group in the table, lowest numbers first.
        let tier: usize = match self {
            Self::Eq
            | Self::AddEq
            | Self::SubEq
            | Self::MulEq
            | Self::DivEq
            | Self::ModEq
            | Self::ShlEq
            | Self::ShrEq
            | Self::AndEq
            | Self::OrEq
            | Self::XorEq => 0,
            Self::EqEq | Self::Neq => 1,
            Self::Star | Self::Div | Self::Mod => 2,
            Self::Plus | Self::Minus => 3,
            Self::Shl | Self::Shr => 4,
            Self::Lt | Self::Gt | Self::Le | Self::Ge => 5,
            Self::Ampersand => 6,
            Self::Xor => 7,
            Self::Or => 8,
            Self::AndAnd => 9,
            Self::OrOr => 10,
            _ => return None,
        };
        Some((2 * tier + 1, 2 * tier + 2))
    }
}
/// The kind of a token together with its payload.
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)]
pub enum TokenType {
// Identifier name.
Ident(Ident),
// Integer literal (value, radix, signedness).
Number(Number),
// String literal, plain or C string.
String(TString),
// Character literal.
Char(Char),
// Reserved word.
Keyword(Keyword),
// Opening or closing bracket.
Delim(Delimiter),
// Operator or separator.
Punct(Punctuation),
// Comment text.
Comment(Comment),
}
/// Text of a source comment, tagged by comment style.
// NOTE(review): `tokenise` currently skips comments instead of emitting this
// token (the pushes there are commented out) — confirm whether it is still needed.
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)]
pub enum Comment {
// Single-line comment.
Line(String),
// Block comment.
Block(String)
}
impl TokenType {
pub fn unwrap_ident(&self) -> Ident {
match self {
Self::Ident(i) => i.clone(),
_ => panic!("Expected {}, got {self}", Self::ident(""))
}
}
pub fn ident(s: &str) -> Self {
Self::Ident(Ident(s.to_string()))
}
pub fn number(val: usize, base: u8, signed: bool) -> Self {
Self::Number(Number { val, base, signed })
}
pub fn string(s: &str, cstr: bool) -> Self{
Self::String(TString { val: s.to_string(), cstr })
}
pub fn char(v: char) -> Self {
Self::Char(Char(v))
}
pub fn from_str(s: &str) -> Option<Self> {
super::TT.get(s).cloned()
}
pub fn to_str(&self) -> String {
for (k, v) in super::TT.iter() {
if v == self {
return k.to_string();
}
}
match self {
TokenType::Ident(s) => {
return format!("Ident(\"{}\")", s.to_string());
},
TokenType::Number(num) => {
match num.base {
2 => {
assert!(!num.signed, "base 2 (binary) numbers physically cannot be signed");
format!("{:#b}", num.val)
}
8 => {
assert!(!num.signed, "base 8 (octal) numbers physically cannot be signed");
format!("{:#o}", num.val)
}
10 => {
if num.signed {
format!("{}", num.val as isize)
} else {
format!("{}", num.val)
}
}
16 => {
assert!(!num.signed, "base 16 (hex) numbers physically cannot be signed");
format!("{:#x}", num.val)
}
_ => panic!("Invalid base for number, {}", num.base),
}
},
TokenType::String(s) => {
if s.cstr {
format!("\"{}\\0\"", s.val)
} else {
format!("\"{}\"", s.val)
}
},
TokenType::Char(c) => {
format!("'{}'", c.0)
}
_ => unreachable!("Unreachable, did you add a new token and forget to add reverse lookup?"),
}
}
}
/// Displays a token type using its `to_str` rendering.
impl Display for TokenType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rendered = self.to_str();
        f.write_str(&rendered)
    }
}