use crate::syn::{error::*, span::*, token::*};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use std::str::Chars;

pub struct Lexer<'t> {
    text: &'t str,
    pos: Pos,
}

impl<'t> Lexer<'t> {
    /// Creates a new lexer that tokenizes the given text.
    pub fn new(text: &'t str) -> Self {
        // Load the initial position from the first character, if any.
        let pos = if let Some(c) = text.chars().next() {
            Pos::from_char(c, 0, 0, 0, 0)
        } else {
            Default::default()
        };

        Lexer { text, pos }
    }

    /// Gets whether this lexer has reached the EOF.
    pub fn is_eof(&self) -> bool {
        self.chars().next().is_none()
    }

    ////////////////////////////////////////////////////////////////////////////////
    // Character advancement
    ////////////////////////////////////////////////////////////////////////////////

    // Returns an iterator over the characters remaining at the current position.
    // The iterator borrows from `'t` (the input text), not from `self`, so it
    // stays usable while the lexer's position advances.
    fn chars(&self) -> Chars<'t> {
        self.pos_text().chars()
    }

    // The unlexed remainder of the input.
    fn pos_text(&self) -> &'t str {
        &self.text[self.pos.byte..]
    }

    // The character at the current position, if any.
    fn curr_char(&self) -> Option<char> {
        self.chars().next()
    }

    // Skips insignificant whitespace. Newlines are significant (they lex as
    // `Newline` tokens), so stop at them even though they are whitespace.
    fn skip_whitespace(&mut self) {
        while let Some(c) = self.curr_char() {
            if !c.is_whitespace() || c == '\n' {
                break;
            } else {
                self.adv_char();
            }
        }
    }

    // Advances past the current character, updating the position tracker.
    fn adv_char(&mut self) -> Option<char> {
        let c = self.curr_char()?;
        self.pos = self.pos.next_char(c);
        Some(c)
    }

    ////////////////////////////////////////////////////////////////////////////////
    // Tokens
    ////////////////////////////////////////////////////////////////////////////////

    /// Lexes the next token, or returns `Ok(None)` at EOF.
    pub fn next_token(&mut self) -> Result<Option<Token>> {
        // Constants and statics
        //
        // The whole alternation is anchored and matched leftmost-first, so
        // longer operators (`==`, `->`, ...) must precede their prefixes
        // (`=`, `-`, ...) and `hex_num` must precede `dec_num`. The `\b`
        // keeps `return` from matching the front of a longer identifier.
        lazy_static! {
            static ref REGEX: Regex = RegexBuilder::new(r#"
                ^(?:
                    (?P<kw_return>return\b)
                    |(?P<ident>[a-zA-Z_][a-zA-Z0-9_]*)
                    |(?P<sym>:[a-zA-Z_][a-zA-Z0-9_]*)
                    |(?P<hex_num>0[xX][0-9a-fA-F]+)
                    |(?P<dec_num>[0-9]+)
                    |(?P<dq_str>"([^\\"]|\\[ntr0"'])*")
                    |(?P<sq_str>'([^\\']|\\[ntr0"'])*')

                    |(?P<arrow>->)
                    |(?P<eqeq>==)
                    |(?P<bangeq>!=)
                    |(?P<lteq><=)
                    |(?P<gteq>>=)
                    |(?P<lt><)
                    |(?P<gt>>)

                    |(?P<eq>=)
                    |(?P<plus>\+)
                    |(?P<minus>-)
                    |(?P<splat>\*)
                    |(?P<fslash>/)
                    |(?P<bang>!)
                    |(?P<lparen>\()
                    |(?P<rparen>\))
                    |(?P<lbracket>\[)
                    |(?P<rbracket>\])
                    |(?P<lbrace>\{)
                    |(?P<rbrace>\})
                    |(?P<comma>,)
                    |(?P<eol>;)
                    |(?P<newline>\n)
                )
            "#).ignore_whitespace(true)
                .build()
                .unwrap();
        }

        // Maps each named capture group to its token kind. Only one branch of
        // the alternation can participate in a match, so at most one of these
        // names is captured.
        const CAPTURES: &[(&str, TokenKind)] = &[
            ("kw_return", TokenKind::KwReturn),

            ("ident", TokenKind::Ident),
            ("sym", TokenKind::Sym),
            ("dec_num", TokenKind::Num),
            ("hex_num", TokenKind::Num),
            ("dq_str", TokenKind::Str),
            ("sq_str", TokenKind::Str),

            ("lparen", TokenKind::LParen),
            ("rparen", TokenKind::RParen),
            ("lbracket", TokenKind::LBracket),
            ("rbracket", TokenKind::RBracket),
            ("lbrace", TokenKind::LBrace),
            ("rbrace", TokenKind::RBrace),
            ("comma", TokenKind::Comma),
            ("plus", TokenKind::Plus),
            ("minus", TokenKind::Minus),
            ("splat", TokenKind::Splat),
            ("fslash", TokenKind::FSlash),
            ("bang", TokenKind::Bang),

            ("arrow", TokenKind::Arrow),
            ("eqeq", TokenKind::EqEq),
            ("bangeq", TokenKind::BangEq),
            ("lteq", TokenKind::LtEq),
            ("gteq", TokenKind::GtEq),
            ("lt", TokenKind::Lt),
            ("gt", TokenKind::Gt),

            ("eq", TokenKind::Eq),
            ("eol", TokenKind::Eol),
            ("newline", TokenKind::Newline),
        ];

        self.skip_whitespace();

        if self.curr_char().is_none() {
            return Ok(None);
        }

        let caps = REGEX.captures(self.pos_text())
            .ok_or_else(|| Error::Unexpected {
                what: format!("character {}", self.curr_char().unwrap()),
                pos: self.pos,
            })?;

        // Get the first (and only) named capture that participated in the match.
        let capture_kind = CAPTURES.iter()
            .filter_map(|(name, kind)|
                caps.name(name)
                    .map(|cap| (cap, kind)))
            .next();

        let (token_text, kind) = if let Some((capture, kind)) = capture_kind {
            (capture.as_str(), *kind)
        } else {
            return Err(Error::Unexpected {
                what: format!("character {}", self.curr_char().unwrap()),
                pos: self.pos,
            });
        };

        // Consume the token's text and record its span.
        let start = self.pos;
        self.pos.adv_str(token_text);
        let end = self.pos;

        Ok(Some(Token::new(kind, Span { start, end })))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Lexes `$text` and asserts each expected (kind, text) pair in order,
    /// then asserts the lexer is at EOF.
    macro_rules! test_token {
        ($text:expr, $($token_kind:expr, $token_text:expr),+ $(,)?) => {{
            let text = $text;
            let mut lexer = Lexer::new(text);

            $(
                let token = lexer.next_token().expect("no lex error").expect("a token");
                assert_eq!(token.kind(), $token_kind);
                assert_eq!(token.text_at(text), $token_text);
            )+

            assert!(lexer.is_eof());
        }};

        ($text:expr, $token_kind:expr) => {{
            test_token!($text, $token_kind, $text);
        }};
    }

    #[test]
    fn test_next_token_eof() {
        let mut lexer = Lexer::new("");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());

        let mut lexer = Lexer::new(" ");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());

        let mut lexer = Lexer::new(" \t \r \r\r\t\t ");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());
    }
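
    // A minimal usage sketch of the public API: pull tokens until `next_token`
    // returns `Ok(None)`. Relies only on `Lexer::new`, `next_token`, `is_eof`,
    // and `Token::kind`, all exercised by the other tests here.
    #[test]
    fn test_drive_to_eof() {
        let mut lexer = Lexer::new("return x;");
        let mut kinds = Vec::new();
        while let Some(token) = lexer.next_token().expect("no lex error") {
            kinds.push(token.kind());
        }
        assert!(lexer.is_eof());
        assert_eq!(kinds, [TokenKind::KwReturn, TokenKind::Ident, TokenKind::Eol]);
    }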

    #[test]
    fn test_ident_token() {
        test_token!(
            "ident OtherIdent other_ident ident1234 RETURN",
            TokenKind::Ident, "ident",
            TokenKind::Ident, "OtherIdent",
            TokenKind::Ident, "other_ident",
            TokenKind::Ident, "ident1234",
            TokenKind::Ident, "RETURN",
        );
    }

    #[test]
    fn test_keywords() {
        test_token!("return", TokenKind::KwReturn);
    }
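
    // A sketch of the keyword/identifier boundary: `return\b` in the pattern
    // above cannot match the front of a longer word, so this lexes as a
    // single identifier rather than `return` plus a trailing fragment.
    #[test]
    fn test_keyword_prefix_is_ident() {
        test_token!("returning", TokenKind::Ident);
    }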

    #[test]
    fn test_num_token() {
        test_token!(
            "1234 4321 123498765 432156789 0xdcbaBEEF 0xabcdFEED 0XdcbaBEEF 0XabcdFEED 0X123456789DCBAbeef 0xABCDfeed192837465",
            TokenKind::Num, "1234",
            TokenKind::Num, "4321",
            TokenKind::Num, "123498765",
            TokenKind::Num, "432156789",
            TokenKind::Num, "0xdcbaBEEF",
            TokenKind::Num, "0xabcdFEED",
            TokenKind::Num, "0XdcbaBEEF",
            TokenKind::Num, "0XabcdFEED",
            TokenKind::Num, "0X123456789DCBAbeef",
            TokenKind::Num, "0xABCDfeed192837465",
        );
    }
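
    // Alternation order is significant under leftmost-first matching:
    // `hex_num` is tried before `dec_num`, so `0x10` lexes as one hex literal
    // rather than a decimal `0` followed by something else. A small sketch:
    #[test]
    fn test_num_alternation_order() {
        test_token!("0x10 10",
            TokenKind::Num, "0x10",
            TokenKind::Num, "10",
        );
    }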

    #[test]
    fn test_str_token() {
        test_token!(r#""this is a string""#, TokenKind::Str);
        test_token!(r#"'this is a string'"#, TokenKind::Str);
        test_token!(r#"'this is a string\nwith escapes'"#, TokenKind::Str);
        test_token!(r#""this is a string\nwith escapes""#, TokenKind::Str);
    }
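
    // The `sq_str` class excludes `'`, so a greedy match cannot run across a
    // closing quote; adjacent single-quoted strings lex as two tokens.
    #[test]
    fn test_adjacent_sq_strings() {
        test_token!("'a' 'b'",
            TokenKind::Str, "'a'",
            TokenKind::Str, "'b'",
        );
    }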

    #[test]
    fn test_sym_token() {
        test_token!(":symbol :OtherSymbol :other_symbol :symbol1234",
            TokenKind::Sym, ":symbol",
            TokenKind::Sym, ":OtherSymbol",
            TokenKind::Sym, ":other_symbol",
            TokenKind::Sym, ":symbol1234",
        );
    }

    #[test]
    fn test_eol() {
        test_token!("\n;",
            TokenKind::Newline, "\n",
            TokenKind::Eol, ";",
        );
    }
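
    // `skip_whitespace` deliberately stops at `\n`, so a newline survives as a
    // token while the blanks and tabs around it are discarded.
    #[test]
    fn test_newline_survives_whitespace() {
        test_token!("  \t\n  x",
            TokenKind::Newline, "\n",
            TokenKind::Ident, "x",
        );
    }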

    #[test]
    fn test_symbols() {
        test_token!("(", TokenKind::LParen);
        test_token!(")", TokenKind::RParen);
        test_token!("{", TokenKind::LBrace);
        test_token!("}", TokenKind::RBrace);
        test_token!("[", TokenKind::LBracket);
        test_token!("]", TokenKind::RBracket);
        test_token!(",", TokenKind::Comma);
        test_token!("+", TokenKind::Plus);
        test_token!("-", TokenKind::Minus);
        test_token!("*", TokenKind::Splat);
        test_token!("/", TokenKind::FSlash);
        test_token!("!", TokenKind::Bang);

        test_token!("=", TokenKind::Eq);
        test_token!("!=", TokenKind::BangEq);
        test_token!("==", TokenKind::EqEq);
        test_token!("<=", TokenKind::LtEq);
        test_token!(">=", TokenKind::GtEq);
        test_token!("<", TokenKind::Lt);
        test_token!(">", TokenKind::Gt);
        test_token!("->", TokenKind::Arrow);
    }
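
    // A hedged end-to-end sketch mixing several token classes, to show how a
    // realistic expression walks through `next_token`; it uses only kinds
    // asserted elsewhere in this module.
    #[test]
    fn test_mixed_token_stream() {
        test_token!("x = 1 + 0x2A -> y;",
            TokenKind::Ident, "x",
            TokenKind::Eq, "=",
            TokenKind::Num, "1",
            TokenKind::Plus, "+",
            TokenKind::Num, "0x2A",
            TokenKind::Arrow, "->",
            TokenKind::Ident, "y",
            TokenKind::Eol, ";",
        );
    }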
}