361 lines
12 KiB
Rust
361 lines
12 KiB
Rust
|
|
use crate::{
|
||
|
|
syn::{error::*, span::*, token::*},
|
||
|
|
util::LazyString,
|
||
|
|
};
|
||
|
|
use lazy_static::lazy_static;
|
||
|
|
use maplit::hashmap;
|
||
|
|
use std::{collections::HashMap, mem, str::Chars};
|
||
|
|
|
||
|
|
/// Characters that may begin an identifier: the ASCII letters and `_`.
///
/// Digits are deliberately absent; they may only appear after the first
/// character (see `IDENT_CHARS`).
const IDENT_START_CHARS: &[char] = &[
    // Lowercase letters.
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // Uppercase letters.
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // Underscore.
    '_',
];
|
||
|
|
/// Characters that may appear after the first character of an identifier:
/// the ASCII digits, the ASCII letters and `_`.
const IDENT_CHARS: &[char] = &[
    // Digits.
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // Lowercase letters.
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // Uppercase letters.
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    // Underscore.
    '_',
];
|
||
|
|
|
||
|
|
/// Digits accepted in a decimal number literal (also the only characters that
/// may start any number literal).
const DEC_NUM_CHARS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
|
||
|
|
|
||
|
|
/// Digits accepted after the `0x` / `0X` prefix of a hexadecimal literal, in
/// either letter case.
const HEX_NUM_CHARS: &[char] = &[
    // Decimal digits.
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    // Lowercase hex digits.
    'a', 'b', 'c', 'd', 'e', 'f',
    // Uppercase hex digits.
    'A', 'B', 'C', 'D', 'E', 'F',
];
|
||
|
|
|
||
|
|
/// Characters that may open a string literal; the same character must close it.
const STR_QUOTE_CHARS: &[char] = &['"', '\''];
|
||
|
|
|
||
|
|
/// Characters that may appear in an operator. A maximal run of these is lexed
/// as one candidate operator and then looked up in the operator table.
const OP_CHARS: &[char] = &['=', '+', '*', '-', '/', '>', '<', '~', '!', '%', '^'];
|
||
|
|
|
||
|
|
lazy_static! {
|
||
|
|
static ref OPS: HashMap<&'static str, TokenKind> = hashmap! {
|
||
|
|
"=" => TokenKind::Eq,
|
||
|
|
"->" => TokenKind::Arrow,
|
||
|
|
};
|
||
|
|
|
||
|
|
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
|
||
|
|
"return" => TokenKind::KwReturn,
|
||
|
|
};
|
||
|
|
}
|
||
|
|
|
||
|
|
pub struct Lexer<'t> {
|
||
|
|
chars: Chars<'t>,
|
||
|
|
text: &'t str,
|
||
|
|
start: Pos,
|
||
|
|
end: Pos,
|
||
|
|
}
|
||
|
|
|
||
|
|
impl<'t> Lexer<'t> {
|
||
|
|
/// Creates a new lexer that tokenizes the given text.
|
||
|
|
pub fn new(text: &'t str) -> Self {
|
||
|
|
// load the first position into the start/end position trackers
|
||
|
|
let pos = if let Some(c) = text.chars().next() {
|
||
|
|
Pos::from_char(c, 0, 0, 0, 0)
|
||
|
|
} else {
|
||
|
|
Default::default()
|
||
|
|
};
|
||
|
|
|
||
|
|
Lexer {
|
||
|
|
chars: text.chars(),
|
||
|
|
text,
|
||
|
|
start: pos,
|
||
|
|
end: pos,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Gets whether this lexer has reached the EOF.
|
||
|
|
pub fn is_eof(&self) -> bool {
|
||
|
|
self.chars.clone().next().is_none()
|
||
|
|
}
|
||
|
|
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
// Character advancement
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
fn curr_char(&self) -> Option<char> {
|
||
|
|
self.chars.clone().next()
|
||
|
|
}
|
||
|
|
|
||
|
|
fn adv_char(&mut self) -> Option<char> {
|
||
|
|
let c = self.chars.next()?;
|
||
|
|
self.end = self.end.next_char(c);
|
||
|
|
Some(c)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn skip_whitespace(&mut self) {
|
||
|
|
self.match_while(|c| c.is_whitespace());
|
||
|
|
}
|
||
|
|
|
||
|
|
fn catchup(&mut self) -> Span {
|
||
|
|
let start = mem::replace(&mut self.start, self.end);
|
||
|
|
Span {
|
||
|
|
start,
|
||
|
|
end: self.end,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
fn make_token(&mut self, kind: TokenKind) -> Token {
|
||
|
|
let span = self.catchup();
|
||
|
|
Token::new(kind, span)
|
||
|
|
}
|
||
|
|
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
// Tokens
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
pub fn next_token(&mut self) -> Result<Option<Token>> {
|
||
|
|
self.skip_whitespace();
|
||
|
|
|
||
|
|
let curr = if let Some(curr) = self.curr_char() {
|
||
|
|
curr
|
||
|
|
} else {
|
||
|
|
return Ok(None);
|
||
|
|
};
|
||
|
|
|
||
|
|
let token = match curr {
|
||
|
|
c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
|
||
|
|
c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
|
||
|
|
'"' | '\'' => self.next_str()?,
|
||
|
|
':' => self.next_sym()?,
|
||
|
|
'(' => self.next_char_token('(', TokenKind::LParen)?,
|
||
|
|
')' => self.next_char_token(')', TokenKind::RParen)?,
|
||
|
|
'{' => self.next_char_token('{', TokenKind::LBrace)?,
|
||
|
|
'}' => self.next_char_token('}', TokenKind::RBrace)?,
|
||
|
|
'[' => self.next_char_token('[', TokenKind::LBracket)?,
|
||
|
|
']' => self.next_char_token(']', TokenKind::RBracket)?,
|
||
|
|
',' => self.next_char_token(',', TokenKind::Comma)?,
|
||
|
|
c if OP_CHARS.contains(&c) => self.next_op()?,
|
||
|
|
c => return Err(Error::Unexpected {
|
||
|
|
what: format!("character {}", c.escape_debug()),
|
||
|
|
span: self.span(),
|
||
|
|
})
|
||
|
|
};
|
||
|
|
Ok(Some(token))
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_ident_or_kw(&mut self) -> Result<Token> {
|
||
|
|
let ident = self.expect_ident("identifier")?;
|
||
|
|
let kind = KEYWORDS.get(ident).copied()
|
||
|
|
.unwrap_or(TokenKind::Ident);
|
||
|
|
Ok(self.make_token(kind))
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_num(&mut self) -> Result<Token> {
|
||
|
|
let first = self.expect_any(DEC_NUM_CHARS, "number")?;
|
||
|
|
let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) {
|
||
|
|
self.adv_char().unwrap();
|
||
|
|
self.expect_any(HEX_NUM_CHARS, "hex number")?;
|
||
|
|
HEX_NUM_CHARS
|
||
|
|
} else {
|
||
|
|
DEC_NUM_CHARS
|
||
|
|
};
|
||
|
|
|
||
|
|
self.match_while(|c| alphabet.contains(&c));
|
||
|
|
Ok(self.make_token(TokenKind::Num))
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_str(&mut self) -> Result<Token> {
|
||
|
|
let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?;
|
||
|
|
while let Some(c) = self.match_where(|curr| curr != start_char) {
|
||
|
|
if c == '\\' {
|
||
|
|
// Match escapes
|
||
|
|
self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
self.expect_char(start_char, "end of string")?;
|
||
|
|
Ok(self.make_token(TokenKind::Str))
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_sym(&mut self) -> Result<Token> {
|
||
|
|
self.expect_char(':', "symbol")?;
|
||
|
|
self.expect_ident("symbol")?;
|
||
|
|
Ok(self.make_token(TokenKind::Sym))
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_op(&mut self) -> Result<Token> {
|
||
|
|
self.expect_any(OP_CHARS, "operator")?;
|
||
|
|
let op_text = self.match_while(|c| OP_CHARS.contains(&c));
|
||
|
|
if let Some(kind) = OPS.get(op_text).copied() {
|
||
|
|
Ok(self.make_token(kind))
|
||
|
|
} else {
|
||
|
|
Err(Error::Unknown {
|
||
|
|
what: format!("operator {}", op_text.escape_debug()),
|
||
|
|
span: self.span(),
|
||
|
|
})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result<Token> {
|
||
|
|
self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?;
|
||
|
|
Ok(self.make_token(kind))
|
||
|
|
}
|
||
|
|
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
// Character pattern matching
|
||
|
|
////////////////////////////////////////////////////////////////////////////////
|
||
|
|
fn match_where<P>(&mut self, p: P) -> Option<char>
|
||
|
|
where
|
||
|
|
P: Fn(char) -> bool,
|
||
|
|
{
|
||
|
|
if (p)(self.curr_char()?) {
|
||
|
|
self.adv_char()
|
||
|
|
} else {
|
||
|
|
None
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
fn match_while<P>(&mut self, p: P) -> &str
|
||
|
|
where
|
||
|
|
P: Fn(char) -> bool + Copy,
|
||
|
|
{
|
||
|
|
while self.match_where(p).is_some() {}
|
||
|
|
self.text_at(self.text)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn expect_where<P>(&mut self, p: P, expected: impl ToString) -> Result<char>
|
||
|
|
where
|
||
|
|
P: Fn(char) -> bool,
|
||
|
|
{
|
||
|
|
// Check EOF
|
||
|
|
self.curr_char().ok_or_else(|| Error::ExpectedGot {
|
||
|
|
expected: expected.to_string(),
|
||
|
|
got: "EOF".to_string(),
|
||
|
|
span: self.span(),
|
||
|
|
})?;
|
||
|
|
|
||
|
|
// Match
|
||
|
|
self.match_where(p).ok_or_else(|| Error::ExpectedGot {
|
||
|
|
expected: expected.to_string(),
|
||
|
|
got: format!("{} character", self.curr_char().unwrap().escape_debug()),
|
||
|
|
span: self.span(),
|
||
|
|
})
|
||
|
|
}
|
||
|
|
|
||
|
|
fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> {
|
||
|
|
self.expect_where(|curr| curr == c, expected)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> {
|
||
|
|
self.expect_where(|curr| chars.contains(&curr), expected)
|
||
|
|
}
|
||
|
|
|
||
|
|
fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> {
|
||
|
|
self.expect_any(IDENT_START_CHARS, expected)?;
|
||
|
|
Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr)))
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
impl Spanned for Lexer<'_> {
|
||
|
|
fn span(&self) -> Span {
|
||
|
|
Span {
|
||
|
|
start: self.start,
|
||
|
|
end: self.end,
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[cfg(test)]
mod test {
    use super::*;

    /// Lexes the first token of `$text` and checks its kind and its text.
    /// The two-argument form expects the token to cover the entire input.
    macro_rules! test_token {
        ($text:expr, $token_kind:expr) => {{
            test_token!($text, $token_kind, $text);
        }};

        ($text:expr, $token_kind:expr, $token_text:expr) => {{
            let text = $text;
            let mut lexer = Lexer::new(text);
            let token = lexer.next_token().expect("token").expect("token");
            assert_eq!(token.kind(), $token_kind);
            assert_eq!(token.text_at(text), $token_text);
        }};
    }

    /// Empty and whitespace-only inputs yield no token and leave the lexer at EOF.
    #[test]
    fn test_next_token_eof() {
        for &input in &["", " ", " \n \n \n\r\n\t "] {
            let mut lexer = Lexer::new(input);
            assert!(matches!(lexer.next_token(), Ok(None)));
            assert!(lexer.is_eof());
        }
    }

    /// Identifiers lex as `Ident`; keyword matching is case-sensitive.
    #[test]
    fn test_ident_token() {
        for &src in &["ident", "OtherIdent", "other_ident", "ident1234", "RETURN"] {
            test_token!(src, TokenKind::Ident);
        }
    }

    #[test]
    fn test_keywords() {
        test_token!("return", TokenKind::KwReturn);
    }

    /// Decimal literals and hex literals with either prefix case and digit case.
    #[test]
    fn test_num_token() {
        let decimal = ["1234", "4321", "123498765", "432156789"];
        for &src in &decimal {
            test_token!(src, TokenKind::Num);
        }

        let hex = [
            "0xdcbaBEEF",
            "0xabcdFEED",
            "0XdcbaBEEF",
            "0XabcdFEED",
            "0X123456789DCBAbeef",
            "0xABCDfeed192837465",
        ];
        for &src in &hex {
            test_token!(src, TokenKind::Num);
        }
    }

    /// Both quote styles, with and without an escape sequence.
    #[test]
    fn test_str_token() {
        let sources = [
            r#""this is a string""#,
            r#"'this is a string'"#,
            r#"'this is a string\nwith escapes'"#,
            r#""this is a string\nwith escapes""#,
        ];
        for &src in &sources {
            test_token!(src, TokenKind::Str);
        }
    }

    #[test]
    fn test_sym_token() {
        for &src in &[":symbol", ":OtherSymbol", ":other_symbol", ":symbol1234"] {
            test_token!(src, TokenKind::Sym);
        }
    }

    #[test]
    fn test_single_char_symbols() {
        let cases = [
            ("(", TokenKind::LParen),
            (")", TokenKind::RParen),
            ("{", TokenKind::LBrace),
            ("}", TokenKind::RBrace),
            ("[", TokenKind::LBracket),
            ("]", TokenKind::RBracket),
            (",", TokenKind::Comma),
        ];
        for &(src, kind) in &cases {
            test_token!(src, kind);
        }
    }

    #[test]
    fn test_op_tokens() {
        let cases = [("=", TokenKind::Eq), ("->", TokenKind::Arrow)];
        for &(src, kind) in &cases {
            test_token!(src, kind);
        }
    }

    /// `expect_char` consumes exactly one matching character per call.
    #[test]
    fn test_expect_char() {
        let mut lexer = Lexer::new("asdf");
        for want in "asdf".chars() {
            let got = lexer.expect_char(want, want.to_string());
            assert!(matches!(got, Ok(c) if c == want));
        }
        assert!(lexer.is_eof());
    }
}
|