Initial
This commit is contained in:
292
src/tokeniser/mod.rs
Normal file
292
src/tokeniser/mod.rs
Normal file
@@ -0,0 +1,292 @@
|
||||
use std::{collections::HashMap, fmt::Display};
|
||||
use anyhow::bail;
|
||||
use parse_int::parse;
|
||||
use crate::{common::{loc::LocIncr, Loc}, error, lerror};
|
||||
|
||||
pub mod tokentype;
|
||||
use tokentype::*;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Token {
|
||||
loc: Loc,
|
||||
tt: TokenType,
|
||||
}
|
||||
|
||||
impl Display for Token {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}: {:?}", self.loc(), self.tt())
|
||||
}
|
||||
}
|
||||
|
||||
impl Token {
|
||||
fn new(tt: TokenType, loc: &Loc) -> Self {
|
||||
Self {
|
||||
tt, loc: loc.clone()
|
||||
}
|
||||
}
|
||||
pub fn loc(&self) -> &Loc {
|
||||
&self.loc
|
||||
}
|
||||
pub fn tt(&self) -> &TokenType {
|
||||
&self.tt
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn tokenise(s: &str) -> anyhow::Result<Vec<Token>> {
|
||||
let mut loc = Loc::default();
|
||||
let mut tokens = Vec::new();
|
||||
let chars: Vec<_> = s.chars().collect();
|
||||
let mut chars = chars.iter().peekable();
|
||||
while let Some(c) = chars.next() {
|
||||
loc.inc_col();
|
||||
match c {
|
||||
' ' | '\t' => (),
|
||||
'/' if chars.peek() == Some(&&'/') => {
|
||||
let mut buf = String::new();
|
||||
chars.next();
|
||||
while let Some(c) = chars.next_if(|c| !matches!(c, '\n' | '\r')) {
|
||||
loc.inc_col();
|
||||
buf.push(*c);
|
||||
}
|
||||
// tokens.push(Token::new(TokenType::Comment(Comment::Line(buf.clone())), &loc));
|
||||
},
|
||||
'/' if chars.peek() == Some(&&'*') => {
|
||||
let mut buf = String::new();
|
||||
chars.next();
|
||||
while let Some(c) = chars.peek() {
|
||||
if matches!(c, '\n' | '\r') {
|
||||
loc.inc_line();
|
||||
} else {
|
||||
loc.inc_col();
|
||||
}
|
||||
let c = *chars.next().expect("Unreachable");
|
||||
if c == '*' && matches!(chars.peek(), Some(&&'/') | None) {
|
||||
chars.next();
|
||||
break;
|
||||
}
|
||||
buf.push(c);
|
||||
}
|
||||
// tokens.push(Token::new(TokenType::Comment(Comment::Line(buf.clone())), &loc));
|
||||
}
|
||||
'\n' => loc.inc_line(),
|
||||
'"' | '\'' |
|
||||
'c' if *c != 'c' || chars.peek() == Some(&&'"') => {
|
||||
let str_typ = *c;
|
||||
let mut sc = *c;
|
||||
if *c == 'c' {
|
||||
sc = '"';
|
||||
chars.peek();
|
||||
}
|
||||
let mut last = '\0';
|
||||
let mut buf = String::new();
|
||||
while let Some(c) = chars.next_if(|v| **v != '\n') {
|
||||
loc.inc_col();
|
||||
if *c == sc && last != '\\' {
|
||||
break;
|
||||
}
|
||||
buf.push(*c);
|
||||
last = *c;
|
||||
}
|
||||
|
||||
match str_typ {
|
||||
'"' => {
|
||||
tokens.push(Token::new(TokenType::string(&buf, false), &loc));
|
||||
}
|
||||
'c' => {
|
||||
tokens.push(Token::new(TokenType::string(&buf, true), &loc));
|
||||
}
|
||||
'\'' => {
|
||||
let buf = buf
|
||||
.replace("\\n", "\n")
|
||||
.replace("\\r", "\r");
|
||||
if buf.len() > 1 {
|
||||
lerror!(&loc, "Chars can only have 1 byte");
|
||||
bail!("")
|
||||
}
|
||||
tokens.push(Token::new(TokenType::char(buf.chars().nth(0).unwrap()), &loc));
|
||||
}
|
||||
_ => unreachable!()
|
||||
}
|
||||
}
|
||||
'a'..='z' | 'A'..='Z' | '_' => {
|
||||
let mut buf = String::new();
|
||||
buf.push(*c);
|
||||
while let Some(c) = chars.next_if(|v| matches!(**v, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')) {
|
||||
loc.inc_col();
|
||||
buf.push(*c);
|
||||
}
|
||||
if let Some(kw) = TokenType::from_str(&buf) {
|
||||
tokens.push(Token::new(kw, &loc));
|
||||
continue;
|
||||
}
|
||||
tokens.push(Token::new(TokenType::ident(&buf), &loc));
|
||||
buf.clear();
|
||||
},
|
||||
|
||||
'+' | '-' | '0'..='9'
|
||||
// Checks if its a number an not an operator in disguise
|
||||
if matches!(c, '0'..='9') || matches!(chars.peek(), Some('0'..='9')) => {
|
||||
let mut buf = String::new();
|
||||
buf.push(*c);
|
||||
let signed = *c == '-';
|
||||
let mut radix = 10;
|
||||
match chars.peek() {
|
||||
Some(v) => {
|
||||
match v {
|
||||
'x' => radix = 16,
|
||||
'b' => radix = 2,
|
||||
'o' => radix = 8,
|
||||
_ => (),
|
||||
}
|
||||
},
|
||||
None => {
|
||||
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
|
||||
}
|
||||
}
|
||||
while let Some(c) = chars.next_if(|v| matches!(**v, '0'..='9' | '.' | 'a'..='f' | 'A'..='F')) {
|
||||
loc.inc_col();
|
||||
buf.push(*c);
|
||||
}
|
||||
match radix {
|
||||
2 => {
|
||||
if buf.strip_prefix("0b").expect("Unreachable")
|
||||
.chars().filter(|v| !matches!(v, '0' | '1')).collect::<Vec<_>>().len() > 0 {
|
||||
lerror!(&loc, "Invalid character in binary number");
|
||||
bail!("")
|
||||
}
|
||||
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
|
||||
}
|
||||
8 => {
|
||||
if buf.strip_prefix("0o").expect("Unreachable")
|
||||
.chars().filter(|v| !matches!(v, '0'..='7')).collect::<Vec<_>>().len() > 0 {
|
||||
lerror!(&loc, "Invalid character in octal number");
|
||||
bail!("")
|
||||
}
|
||||
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, false), &loc));
|
||||
}
|
||||
10 => {
|
||||
if buf.chars().filter(|v| !matches!(v, '0'..='9' | '.')).collect::<Vec<_>>().len() > 0 {
|
||||
lerror!(&loc, "Invalid character in decimal number");
|
||||
bail!("")
|
||||
}
|
||||
if buf.contains(".") {
|
||||
if buf.chars().filter(|v| *v == '.').collect::<Vec<_>>().len() > 1 {
|
||||
lerror!(&loc, "Floats cant have more than 1 dot");
|
||||
}
|
||||
todo!()
|
||||
}
|
||||
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, signed), &loc));
|
||||
}
|
||||
16 => {
|
||||
if buf.strip_prefix("0x").expect("Unreachable")
|
||||
.chars().filter(|v| !matches!(v, '0'..='9' | 'a'..='f' | 'A'..='F')).collect::<Vec<_>>().len() > 0 {
|
||||
lerror!(&loc, "Invalid character in hex number");
|
||||
bail!("")
|
||||
}
|
||||
tokens.push(Token::new(TokenType::number(parse(&buf).unwrap(), radix, false), &loc));
|
||||
}
|
||||
_ => unreachable!()
|
||||
}
|
||||
|
||||
buf.clear();
|
||||
},
|
||||
|
||||
_ => {
|
||||
let mut buf = String::new();
|
||||
buf.push(*c);
|
||||
while let Some(c) = chars.peek() {
|
||||
if let None = TokenType::from_str(&format!("{buf}{c}")) {
|
||||
break;
|
||||
}
|
||||
if let Some(c) = chars.next() {
|
||||
buf.push(*c);
|
||||
}
|
||||
}
|
||||
if let Some(tt) = TokenType::from_str(&buf) {
|
||||
tokens.push(Token::new(tt, &loc));
|
||||
} else {
|
||||
lerror!(&loc, "Unknown token: {buf}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
tokens.reverse();
|
||||
Ok(tokens)
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Lookup table for all tokens, fast for normal tokenisation,
|
||||
// but slower for reveres lookup (for like error messages)
|
||||
lazy_static::lazy_static!(
|
||||
static ref TT: HashMap<&'static str, TokenType> = [
|
||||
("fn", TokenType::Keyword(Keyword::Fn)),
|
||||
("if", TokenType::Keyword(Keyword::If)),
|
||||
("else", TokenType::Keyword(Keyword::Else)),
|
||||
("struct", TokenType::Keyword(Keyword::Struct)),
|
||||
("enum", TokenType::Keyword(Keyword::Enum)),
|
||||
("type", TokenType::Keyword(Keyword::Type)),
|
||||
("while", TokenType::Keyword(Keyword::While)),
|
||||
("for", TokenType::Keyword(Keyword::For)),
|
||||
("break", TokenType::Keyword(Keyword::Break)),
|
||||
("continue", TokenType::Keyword(Keyword::Continue)),
|
||||
("let", TokenType::Keyword(Keyword::Let)),
|
||||
("const", TokenType::Keyword(Keyword::Const)),
|
||||
("mut", TokenType::Keyword(Keyword::Mut)),
|
||||
("static", TokenType::Keyword(Keyword::Static)),
|
||||
("true", TokenType::Keyword(Keyword::True)),
|
||||
("false", TokenType::Keyword(Keyword::False)),
|
||||
("include", TokenType::Keyword(Keyword::Include)),
|
||||
("extern", TokenType::Keyword(Keyword::Extern)),
|
||||
("return", TokenType::Keyword(Keyword::Return)),
|
||||
("loop", TokenType::Keyword(Keyword::Loop)),
|
||||
("as", TokenType::Keyword(Keyword::As)),
|
||||
("{", TokenType::Delim(Delimiter::CurlyL)),
|
||||
("}", TokenType::Delim(Delimiter::CurlyR)),
|
||||
("[", TokenType::Delim(Delimiter::SquareL)),
|
||||
("]", TokenType::Delim(Delimiter::SquareR)),
|
||||
("(", TokenType::Delim(Delimiter::ParenL)),
|
||||
(")", TokenType::Delim(Delimiter::ParenR)),
|
||||
(";", TokenType::Punct(Punctuation::Semi)),
|
||||
(":", TokenType::Punct(Punctuation::Colon)),
|
||||
("::", TokenType::Punct(Punctuation::Pathsep)),
|
||||
("->", TokenType::Punct(Punctuation::Arrow)),
|
||||
("=>", TokenType::Punct(Punctuation::FatArrow)),
|
||||
("+", TokenType::Punct(Punctuation::Plus)),
|
||||
("-", TokenType::Punct(Punctuation::Minus)),
|
||||
(",", TokenType::Punct(Punctuation::Comma)),
|
||||
("&", TokenType::Punct(Punctuation::Ampersand)),
|
||||
("*", TokenType::Punct(Punctuation::Star)),
|
||||
("!", TokenType::Punct(Punctuation::Not)),
|
||||
("/", TokenType::Punct(Punctuation::Div)),
|
||||
("%", TokenType::Punct(Punctuation::Mod)),
|
||||
("<<", TokenType::Punct(Punctuation::Shl)),
|
||||
(">>", TokenType::Punct(Punctuation::Shr)),
|
||||
("&&", TokenType::Punct(Punctuation::AndAnd)),
|
||||
("||", TokenType::Punct(Punctuation::OrOr)),
|
||||
("|", TokenType::Punct(Punctuation::Or)),
|
||||
(">", TokenType::Punct(Punctuation::Gt)),
|
||||
("<", TokenType::Punct(Punctuation::Lt)),
|
||||
(">=", TokenType::Punct(Punctuation::Ge)),
|
||||
("<=", TokenType::Punct(Punctuation::Le)),
|
||||
("^", TokenType::Punct(Punctuation::Xor)),
|
||||
("+=", TokenType::Punct(Punctuation::AddEq)),
|
||||
("-=", TokenType::Punct(Punctuation::SubEq)),
|
||||
("/=", TokenType::Punct(Punctuation::DivEq)),
|
||||
("*=", TokenType::Punct(Punctuation::MulEq)),
|
||||
("%=", TokenType::Punct(Punctuation::ModEq)),
|
||||
("<<=", TokenType::Punct(Punctuation::ShlEq)),
|
||||
(">>=", TokenType::Punct(Punctuation::ShrEq)),
|
||||
("&=", TokenType::Punct(Punctuation::AndEq)),
|
||||
("|=", TokenType::Punct(Punctuation::OrEq)),
|
||||
("^=", TokenType::Punct(Punctuation::XorEq)),
|
||||
("=", TokenType::Punct(Punctuation::Eq)),
|
||||
("==", TokenType::Punct(Punctuation::EqEq)),
|
||||
("!=", TokenType::Punct(Punctuation::Neq)),
|
||||
(".", TokenType::Punct(Punctuation::Fieldaccess)),
|
||||
("::", TokenType::Punct(Punctuation::Pathaccess)),
|
||||
].into();
|
||||
);
|
||||
|
||||
|
||||
215
src/tokeniser/tokentype.rs
Normal file
215
src/tokeniser/tokentype.rs
Normal file
@@ -0,0 +1,215 @@
|
||||
use core::panic;
|
||||
use std::fmt::Display;
|
||||
|
||||
/// An identifier, stored as the name it was written with.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Ident(pub String);

/// Formats the identifier as its bare name.
///
/// `Display` is implemented instead of `ToString` directly: `to_string()`
/// keeps working (and keeps returning the bare name) through the standard
/// blanket impl, and the type can additionally be used with `{}`.
impl Display for Ident {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.0)
    }
}
|
||||
|
||||
/// An integer literal.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Number {
    /// The literal's value.
    pub val: usize,
    /// Radix the literal was written in; `TokenType::to_str` accepts 2, 8, 10 and 16.
    pub base: u8,
    /// Whether the literal is signed; `TokenType::to_str` only allows this for base 10.
    pub signed: bool,
}
|
||||
|
||||
/// A string literal.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)]
pub struct TString {
    /// The text between the quotes.
    pub val: String,
    /// `true` for C strings (`c"..."`); `TokenType::to_str` renders these with a trailing `\0`.
    pub cstr: bool,
}
|
||||
|
||||
/// A character literal.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Char(char);

/// Unwraps the contained character.
///
/// Written as `From<Char> for char` rather than a hand-rolled `Into<char>`:
/// the standard blanket impl still provides `Into<char> for Char`, and
/// `char::from(c)` now works too.
impl From<Char> for char {
    fn from(value: Char) -> Self {
        value.0
    }
}

/// Wraps a plain character.
impl From<char> for Char {
    fn from(value: char) -> Self {
        Char(value)
    }
}
|
||||
|
||||
/// Reserved words of the language.
///
/// The variant order is significant: it defines the derived ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Keyword {
    Fn,
    If,
    Else,
    Struct,
    Enum,
    Type,
    While,
    For,
    Break,
    Continue,
    Let,
    Const,
    Mut,
    Static,
    True,
    False,
    Include,
    Extern,
    Return,
    As,
    Loop,
}
|
||||
|
||||
/// Bracketing tokens; `L` is the opening and `R` the closing half of a pair.
///
/// The variant order is significant: it defines the derived ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Delimiter {
    /// `{`
    CurlyL,
    /// `}`
    CurlyR,
    /// `[`
    SquareL,
    /// `]`
    SquareR,
    /// `(`
    ParenL,
    /// `)`
    ParenR,
}
|
||||
|
||||
/// Operators and other punctuation tokens.
///
/// The variant order is significant: it defines the derived ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Punctuation {
    Semi,
    Colon,
    Pathsep,
    Comma,
    Arrow,
    FatArrow,
    Plus,
    Minus,
    Ampersand,
    Star,
    Div,
    Mod,
    Shl,
    Shr,
    AndAnd,
    OrOr,
    Or,
    Xor,
    Not,
    AddEq,
    SubEq,
    DivEq,
    MulEq,
    ModEq,
    ShlEq,
    ShrEq,
    AndEq,
    OrEq,
    XorEq,
    Eq,
    EqEq,
    Fieldaccess,
    Pathaccess,
    Lt,
    Gt,
    Le,
    Ge,
    Neq,
}

impl Punctuation {
    /// Returns a pair of precedence numbers for binary and assignment
    /// operators, or `None` for every other punctuation token.
    ///
    /// Operators are grouped into tiers numbered from 1; tier `n` yields
    /// `(2n - 1, 2n)`.
    ///
    /// NOTE(review): the tier order (assignment, equality, `* / %`, `+ -`,
    /// shifts, comparisons, `&`, `^`, `|`, `&&`, `||`) does not follow the
    /// conventional C-style ordering in either direction — verify against
    /// the parser that consumes these numbers.
    pub fn precedence(&self) -> Option<(usize, usize)> {
        let tier: usize = match self {
            Self::AddEq
            | Self::SubEq
            | Self::DivEq
            | Self::MulEq
            | Self::ModEq
            | Self::ShlEq
            | Self::ShrEq
            | Self::AndEq
            | Self::OrEq
            | Self::XorEq
            | Self::Eq => 1,
            Self::EqEq | Self::Neq => 2,
            Self::Div | Self::Star | Self::Mod => 3,
            Self::Plus | Self::Minus => 4,
            Self::Shl | Self::Shr => 5,
            Self::Lt | Self::Gt | Self::Le | Self::Ge => 6,
            Self::Ampersand => 7,
            Self::Xor => 8,
            Self::Or => 9,
            Self::AndAnd => 10,
            Self::OrOr => 11,
            _ => return None,
        };
        Some((2 * tier - 1, 2 * tier))
    }
}
|
||||
|
||||
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)]
|
||||
pub enum TokenType {
|
||||
Ident(Ident),
|
||||
Number(Number),
|
||||
String(TString),
|
||||
Char(Char),
|
||||
Keyword(Keyword),
|
||||
Delim(Delimiter),
|
||||
Punct(Punctuation),
|
||||
Comment(Comment),
|
||||
}
|
||||
|
||||
/// The text of a comment, tagged with the comment style.
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)]
pub enum Comment {
    /// A line comment.
    Line(String),
    /// A block comment.
    Block(String),
}
|
||||
|
||||
impl TokenType {
|
||||
pub fn unwrap_ident(&self) -> Ident {
|
||||
match self {
|
||||
Self::Ident(i) => i.clone(),
|
||||
_ => panic!("Expected {}, got {self}", Self::ident(""))
|
||||
}
|
||||
}
|
||||
pub fn ident(s: &str) -> Self {
|
||||
Self::Ident(Ident(s.to_string()))
|
||||
}
|
||||
pub fn number(val: usize, base: u8, signed: bool) -> Self {
|
||||
Self::Number(Number { val, base, signed })
|
||||
}
|
||||
pub fn string(s: &str, cstr: bool) -> Self{
|
||||
Self::String(TString { val: s.to_string(), cstr })
|
||||
}
|
||||
pub fn char(v: char) -> Self {
|
||||
Self::Char(Char(v))
|
||||
}
|
||||
pub fn from_str(s: &str) -> Option<Self> {
|
||||
super::TT.get(s).cloned()
|
||||
}
|
||||
pub fn to_str(&self) -> String {
|
||||
for (k, v) in super::TT.iter() {
|
||||
if v == self {
|
||||
return k.to_string();
|
||||
}
|
||||
}
|
||||
|
||||
match self {
|
||||
TokenType::Ident(s) => {
|
||||
return format!("Ident(\"{}\")", s.to_string());
|
||||
},
|
||||
TokenType::Number(num) => {
|
||||
match num.base {
|
||||
2 => {
|
||||
assert!(!num.signed, "base 2 (binary) numbers physically cannot be signed");
|
||||
format!("{:#b}", num.val)
|
||||
}
|
||||
8 => {
|
||||
assert!(!num.signed, "base 8 (octal) numbers physically cannot be signed");
|
||||
format!("{:#o}", num.val)
|
||||
}
|
||||
10 => {
|
||||
if num.signed {
|
||||
format!("{}", num.val as isize)
|
||||
} else {
|
||||
format!("{}", num.val)
|
||||
}
|
||||
}
|
||||
16 => {
|
||||
assert!(!num.signed, "base 16 (hex) numbers physically cannot be signed");
|
||||
format!("{:#x}", num.val)
|
||||
}
|
||||
_ => panic!("Invalid base for number, {}", num.base),
|
||||
}
|
||||
},
|
||||
TokenType::String(s) => {
|
||||
if s.cstr {
|
||||
format!("\"{}\\0\"", s.val)
|
||||
} else {
|
||||
format!("\"{}\"", s.val)
|
||||
}
|
||||
},
|
||||
TokenType::Char(c) => {
|
||||
format!("'{}'", c.0)
|
||||
}
|
||||
_ => unreachable!("Unreachable, did you add a new token and forget to add reverse lookup?"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for TokenType {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.to_str())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user