use crate::{
    syn::{error::*, span::*, token::*},
    util::LazyString,
};
use lazy_static::lazy_static;
use maplit::hashmap;
use std::{collections::HashMap, mem, str::Chars};

/// Characters that may start an identifier: ASCII letters and `_`.
const IDENT_START_CHARS: &[char] = &[
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
    't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_',
];

/// Characters that may continue an identifier: ASCII letters, digits, and `_`.
const IDENT_CHARS: &[char] = &[
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
    'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B',
    'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
    'V', 'W', 'X', 'Y', 'Z', '_',
];

/// Decimal digit characters.
const DEC_NUM_CHARS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/// Hexadecimal digit characters, both cases.
const HEX_NUM_CHARS: &[char] = &[
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C',
    'D', 'E', 'F',
];

/// Quote characters that may delimit a string literal.
const STR_QUOTE_CHARS: &[char] = &['"', '\''];

/// Characters that may appear in an operator token.
const OP_CHARS: &[char] = &['=', '+', '*', '-', '/', '>', '<', '~', '!', '%', '^'];

lazy_static! {
    /// Operator spellings mapped to their token kinds.
    static ref OPS: HashMap<&'static str, TokenKind> = hashmap! {
        "=" => TokenKind::Eq,
        "->" => TokenKind::Arrow,
    };

    /// Reserved words mapped to their keyword token kinds.
    static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
        "return" => TokenKind::KwReturn,
    };
}

/// A hand-written lexer over a borrowed source string.
pub struct Lexer<'t> {
    // Remaining, unconsumed characters of `text`.
    chars: Chars<'t>,
    // The full source text being tokenized.
    text: &'t str,
    // Start position of the token currently being scanned.
    start: Pos,
    // Position one past the last consumed character.
    end: Pos,
}

impl<'t> Lexer<'t> {
    /// Creates a new lexer that tokenizes the given text.
    pub fn new(text: &'t str) -> Self {
        // load the first position into the start/end position trackers
        let pos = if let Some(c) = text.chars().next() {
            Pos::from_char(c, 0, 0, 0, 0)
        } else {
            Default::default()
        };
        Lexer {
            chars: text.chars(),
            text,
            start: pos,
            end: pos,
        }
    }

    /// Gets whether this lexer has reached the EOF.
    pub fn is_eof(&self) -> bool {
        self.chars.clone().next().is_none()
    }

    ////////////////////////////////////////////////////////////////////////////
    // Character advancement
    ////////////////////////////////////////////////////////////////////////////

    /// Peeks at the current character without consuming it.
    fn curr_char(&self) -> Option<char> {
        self.chars.clone().next()
    }

    /// Consumes the current character, advancing the end position tracker.
    fn adv_char(&mut self) -> Option<char> {
        let c = self.chars.next()?;
        self.end = self.end.next_char(c);
        Some(c)
    }

    /// Skips over any run of whitespace characters.
    fn skip_whitespace(&mut self) {
        self.match_while(|c| c.is_whitespace());
    }

    /// Moves `start` up to `end`, returning the span that was covered.
    fn catchup(&mut self) -> Span {
        let start = mem::replace(&mut self.start, self.end);
        Span {
            start,
            end: self.end,
        }
    }

    /// Builds a token of `kind` spanning everything consumed since the last
    /// catchup, and catches the start position up.
    fn make_token(&mut self, kind: TokenKind) -> Token {
        let span = self.catchup();
        Token::new(kind, span)
    }

    ////////////////////////////////////////////////////////////////////////////
    // Tokens
    ////////////////////////////////////////////////////////////////////////////

    /// Lexes the next token, returning `Ok(None)` at EOF.
    pub fn next_token(&mut self) -> Result<Option<Token>> {
        self.skip_whitespace();
        let curr = if let Some(curr) = self.curr_char() {
            curr
        } else {
            return Ok(None);
        };
        // Dispatch on the first character of the token.
        let token = match curr {
            c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
            c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
            '"' | '\'' => self.next_str()?,
            ':' => self.next_sym()?,
            '(' => self.next_char_token('(', TokenKind::LParen)?,
            ')' => self.next_char_token(')', TokenKind::RParen)?,
            '{' => self.next_char_token('{', TokenKind::LBrace)?,
            '}' => self.next_char_token('}', TokenKind::RBrace)?,
            '[' => self.next_char_token('[', TokenKind::LBracket)?,
            ']' => self.next_char_token(']', TokenKind::RBracket)?,
            ',' => self.next_char_token(',', TokenKind::Comma)?,
            c if OP_CHARS.contains(&c) => self.next_op()?,
            c => {
                return Err(Error::Unexpected {
                    what: format!("character {}", c.escape_debug()),
                    span: self.span(),
                })
            }
        };
        Ok(Some(token))
    }

    /// Lexes an identifier, or a keyword if the spelling is reserved.
    fn next_ident_or_kw(&mut self) -> Result<Token> {
        let ident = self.expect_ident("identifier")?;
        let kind = KEYWORDS.get(ident).copied().unwrap_or(TokenKind::Ident);
        Ok(self.make_token(kind))
    }

    fn
next_num(&mut self) -> Result { let first = self.expect_any(DEC_NUM_CHARS, "number")?; let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) { self.adv_char().unwrap(); self.expect_any(HEX_NUM_CHARS, "hex number")?; HEX_NUM_CHARS } else { DEC_NUM_CHARS }; self.match_while(|c| alphabet.contains(&c)); Ok(self.make_token(TokenKind::Num)) } fn next_str(&mut self) -> Result { let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?; while let Some(c) = self.match_where(|curr| curr != start_char) { if c == '\\' { // Match escapes self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?; } } self.expect_char(start_char, "end of string")?; Ok(self.make_token(TokenKind::Str)) } fn next_sym(&mut self) -> Result { self.expect_char(':', "symbol")?; self.expect_ident("symbol")?; Ok(self.make_token(TokenKind::Sym)) } fn next_op(&mut self) -> Result { self.expect_any(OP_CHARS, "operator")?; let op_text = self.match_while(|c| OP_CHARS.contains(&c)); if let Some(kind) = OPS.get(op_text).copied() { Ok(self.make_token(kind)) } else { Err(Error::Unknown { what: format!("operator {}", op_text.escape_debug()), span: self.span(), }) } } fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result { self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?; Ok(self.make_token(kind)) } //////////////////////////////////////////////////////////////////////////////// // Character pattern matching //////////////////////////////////////////////////////////////////////////////// fn match_where

(&mut self, p: P) -> Option where P: Fn(char) -> bool, { if (p)(self.curr_char()?) { self.adv_char() } else { None } } fn match_while

(&mut self, p: P) -> &str where P: Fn(char) -> bool + Copy, { while self.match_where(p).is_some() {} self.text_at(self.text) } fn expect_where

(&mut self, p: P, expected: impl ToString) -> Result where P: Fn(char) -> bool, { // Check EOF self.curr_char().ok_or_else(|| Error::ExpectedGot { expected: expected.to_string(), got: "EOF".to_string(), span: self.span(), })?; // Match self.match_where(p).ok_or_else(|| Error::ExpectedGot { expected: expected.to_string(), got: format!("{} character", self.curr_char().unwrap().escape_debug()), span: self.span(), }) } fn expect_char(&mut self, c: char, expected: impl ToString) -> Result { self.expect_where(|curr| curr == c, expected) } fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result { self.expect_where(|curr| chars.contains(&curr), expected) } fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> { self.expect_any(IDENT_START_CHARS, expected)?; Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr))) } } impl Spanned for Lexer<'_> { fn span(&self) -> Span { Span { start: self.start, end: self.end, } } } #[cfg(test)] mod test { use super::*; #[test] fn test_next_token_eof() { let mut lexer = Lexer::new(""); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); let mut lexer = Lexer::new(" "); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); let mut lexer = Lexer::new(" \n \n \n\r\n\t "); assert!(matches!(lexer.next_token(), Ok(None))); assert!(lexer.is_eof()); } macro_rules! 
test_token { ($text:expr, $token_kind:expr, $token_text:expr) => {{ let text = $text; let mut lexer = Lexer::new(text); let token = lexer.next_token().expect("token").expect("token"); assert_eq!(token.kind(), $token_kind); assert_eq!(token.text_at(text), $token_text); }}; ($text:expr, $token_kind:expr) => {{ test_token!($text, $token_kind, $text); }}; } #[test] fn test_ident_token() { test_token!("ident", TokenKind::Ident); test_token!("OtherIdent", TokenKind::Ident); test_token!("other_ident", TokenKind::Ident); test_token!("ident1234", TokenKind::Ident); test_token!("RETURN", TokenKind::Ident); } #[test] fn test_keywords() { test_token!("return", TokenKind::KwReturn); } #[test] fn test_num_token() { test_token!("1234", TokenKind::Num); test_token!("4321", TokenKind::Num); test_token!("123498765", TokenKind::Num); test_token!("432156789", TokenKind::Num); test_token!("0xdcbaBEEF", TokenKind::Num); test_token!("0xabcdFEED", TokenKind::Num); test_token!("0XdcbaBEEF", TokenKind::Num); test_token!("0XabcdFEED", TokenKind::Num); test_token!("0X123456789DCBAbeef", TokenKind::Num); test_token!("0xABCDfeed192837465", TokenKind::Num); } #[test] fn test_str_token() { test_token!(r#""this is a string""#, TokenKind::Str); test_token!(r#"'this is a string'"#, TokenKind::Str); test_token!(r#"'this is a string\nwith escapes'"#, TokenKind::Str); test_token!(r#""this is a string\nwith escapes""#, TokenKind::Str); } #[test] fn test_sym_token() { test_token!(":symbol", TokenKind::Sym); test_token!(":OtherSymbol", TokenKind::Sym); test_token!(":other_symbol", TokenKind::Sym); test_token!(":symbol1234", TokenKind::Sym); } #[test] fn test_single_char_symbols() { test_token!("(", TokenKind::LParen); test_token!(")", TokenKind::RParen); test_token!("{", TokenKind::LBrace); test_token!("}", TokenKind::RBrace); test_token!("[", TokenKind::LBracket); test_token!("]", TokenKind::RBracket); test_token!(",", TokenKind::Comma); } #[test] fn test_op_tokens() { test_token!("=", TokenKind::Eq); 
test_token!("->", TokenKind::Arrow); } #[test] fn test_expect_char() { let mut lexer = Lexer::new("asdf"); assert!(matches!(lexer.expect_char('a', "a"), Ok('a'))); assert!(matches!(lexer.expect_char('s', "s"), Ok('s'))); assert!(matches!(lexer.expect_char('d', "d"), Ok('d'))); assert!(matches!(lexer.expect_char('f', "f"), Ok('f'))); assert!(lexer.is_eof()); } }