use crate::syn::{error::*, span::*, token::*}; use lazy_static::lazy_static; use regex::{Regex, RegexBuilder}; use std::str::Chars; pub struct Lexer<'t> { text: &'t str, pos: Pos, } impl<'t> Lexer<'t> { /// Creates a new lexer that tokenizes the given text. pub fn new(text: &'t str) -> Self { // load the first position into the start/end position trackers let pos = if let Some(c) = text.chars().next() { Pos::from_char(c, 0, 0, 0, 0) } else { Default::default() }; Lexer { text, pos, } } /// Gets whether this lexer has reached the EOF. pub fn is_eof(&self) -> bool { self.chars().next().is_none() } //////////////////////////////////////////////////////////////////////////////// // Character advancement //////////////////////////////////////////////////////////////////////////////// fn chars(&'t self) -> Chars<'t> { self.pos_text().chars() } fn pos_text(&self) -> &str { &self.text[self.pos.byte..] } fn curr_char(&self) -> Option { self.chars().next() } fn skip_whitespace(&mut self) { while let Some(c) = self.curr_char() { if !c.is_whitespace() || c == '\n' { break; } else { self.adv_char(); } } } fn adv_char(&mut self) -> Option { let c = self.curr_char()?; self.pos = self.pos.next_char(c); Some(c) } //////////////////////////////////////////////////////////////////////////////// // Tokens //////////////////////////////////////////////////////////////////////////////// pub fn next_token(&mut self) -> Result> { // Constants and statics lazy_static! { static ref REGEX: Regex = RegexBuilder::new(r#" ^(?Preturn) |(?P[a-zA-Z_][a-zA-Z0-9_]*) |(?P:[a-zA-Z_][a-zA-Z0-9_]*) |(?P0[xX][0-9a-fA-F]+) |(?P[0-9]+) |(?P"([^\\"]|\\[ntr0"'])*") |(?P'([^\\"]|\\[ntr0"'])*') |(?P->) |(?P==) |(?P!=) |(?P<=) |(?P>=) |(?P<) |(?P>) |(?P=) |(?P\+) |(?P-) |(?P\*) |(?P/) |(?P!) |(?P\() |(?P\)) |(?P\[) |(?P\]) |(?P\{) |(?P\}) |(?P%\{) |(?P,) |(?P;) |(?P\n) "#).ignore_whitespace(true) .build() .unwrap(); } const CAPTURES: &[(&str, TokenKind)] = &[ ("kw_return", TokenKind::KwReturn), ("ident", TokenKind::Ident), ("sym", TokenKind::Sym), ("dec_num", TokenKind::Num), ("hex_num", TokenKind::Num), ("dq_str", TokenKind::Str), ("sq_str", TokenKind::Str), ("lparen", TokenKind::LParen), ("rparen", TokenKind::RParen), ("lbracket", TokenKind::LBracket), ("rbracket", TokenKind::RBracket), ("lbrace", TokenKind::LBrace), ("rbrace", TokenKind::RBrace), ("objbrace", TokenKind::ObjBrace), ("comma", TokenKind::Comma), ("plus", TokenKind::Plus), ("minus", TokenKind::Minus), ("splat", TokenKind::Splat), ("fslash", TokenKind::FSlash), ("bang", TokenKind::Bang), ("arrow", TokenKind::Arrow), ("eqeq", TokenKind::EqEq), ("bangeq", TokenKind::BangEq), ("lteq", TokenKind::LtEq), ("gteq", TokenKind::GtEq), ("lt", TokenKind::Lt), ("gt", TokenKind::Gt), ("eq", TokenKind::Eq), ("eol", TokenKind::Eol), ("newline", TokenKind::Newline), ]; self.skip_whitespace(); if self.curr_char().is_none() { return Ok(None); }; let caps = REGEX.captures(&self.text[self.pos.byte..]) .ok_or_else(|| Error::Unexpected { what: "EOF".to_string(), pos: self.pos, })?; // Get first capture let capture_kind = CAPTURES.iter() .filter_map(|(name, kind)| caps.name(name) .map(|cap| (cap, kind))) .next(); let (token_text, kind) = if let Some((capture, kind)) = capture_kind { (capture.as_str(), *kind) } else { return Err( Error::Unexpected { what: format!("character {}", (&self.text[self.pos.byte..]).chars().next().unwrap()), pos: self.pos, } ); }; let start = self.pos; self.pos.adv_str(token_text); let end = self.pos; Ok(Some(Token::new(kind, Span { start, end }))) } } #[cfg(test)] mod test { use super::*; macro_rules! test_token { ($text:expr, $($token_kind:expr, $token_text:expr),+ $(,)?) => {{ let text = $text; let mut lexer = Lexer::new(text); $( let token = lexer.next_token().expect("token").expect("token"); assert_eq!(token.kind(), $token_kind); assert_eq!(token.text_at(text), $token_text); )+ assert!(lexer.is_eof()); }}; ($text:expr, $token_kind:expr) => {{ test_token!($text, $token_kind, $text); }}; } #[test] fn test_next_token_eof() { let mut lexer = Lexer::new(""); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); let mut lexer = Lexer::new(" "); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); let mut lexer = Lexer::new(" \t \r \r\r\t\t "); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); } #[test] fn test_ident_token() { test_token!( "ident OtherIdent other_ident ident1234 RETURN", TokenKind::Ident, "ident", TokenKind::Ident, "OtherIdent", TokenKind::Ident, "other_ident", TokenKind::Ident, "ident1234", TokenKind::Ident, "RETURN", ); } #[test] fn test_keywords() { test_token!("return", TokenKind::KwReturn); } #[test] fn test_num_token() { test_token!( "1234 4321 123498765 432156789 0xdcbaBEEF 0xabcdFEED 0XdcbaBEEF 0XabcdFEED 0X123456789DCBAbeef 0xABCDfeed192837465", TokenKind::Num, "1234", TokenKind::Num, "4321", TokenKind::Num, "123498765", TokenKind::Num, "432156789", TokenKind::Num, "0xdcbaBEEF", TokenKind::Num, "0xabcdFEED", TokenKind::Num, "0XdcbaBEEF", TokenKind::Num, "0XabcdFEED", TokenKind::Num, "0X123456789DCBAbeef", TokenKind::Num, "0xABCDfeed192837465", ); } #[test] fn test_str_token() { test_token!(r#""this is a string""#, TokenKind::Str); test_token!(r#"'this is a string'"#, TokenKind::Str); test_token!(r#"'this is a string\nwith escapes'"#, TokenKind::Str); test_token!(r#""this is a string\nwith escapes""#, TokenKind::Str); } #[test] fn test_sym_token() { test_token!(":symbol :OtherSymbol :other_symbol :symbol1234", TokenKind::Sym, ":symbol", TokenKind::Sym, ":OtherSymbol", TokenKind::Sym, ":other_symbol", TokenKind::Sym, ":symbol1234", ); } #[test] fn test_eol() { test_token!("\n;", TokenKind::Newline, "\n", TokenKind::Eol, ";" ); } #[test] fn test_symbols() { test_token!("(", TokenKind::LParen); test_token!(")", TokenKind::RParen); test_token!("{", TokenKind::LBrace); test_token!("}", TokenKind::RBrace); test_token!("%{", TokenKind::ObjBrace); test_token!("[", TokenKind::LBracket); test_token!("]", TokenKind::RBracket); test_token!(",", TokenKind::Comma); test_token!("+", TokenKind::Plus); test_token!("-", TokenKind::Minus); test_token!("*", TokenKind::Splat); test_token!("/", TokenKind::FSlash); test_token!("!", TokenKind::Bang); test_token!("=", TokenKind::Eq); test_token!("!=", TokenKind::BangEq); test_token!("==", TokenKind::EqEq); test_token!("<=", TokenKind::LtEq); test_token!(">=", TokenKind::GtEq); test_token!("<", TokenKind::Lt); test_token!(">", TokenKind::Gt); test_token!("->", TokenKind::Arrow); } }