diff --git a/Cargo.lock b/Cargo.lock index b3d3405..a158ba5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,14 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. +[[package]] +name = "aho-corasick" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" +dependencies = [ + "memchr", +] + [[package]] name = "doc-comment" version = "0.3.3" @@ -18,12 +27,19 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" +[[package]] +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" + [[package]] name = "not-python" version = "0.1.0" dependencies = [ "lazy_static", "maplit", + "regex", "snafu", ] @@ -45,6 +61,24 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "regex" +version = "1.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + +[[package]] +name = "regex-syntax" +version = "0.6.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" + [[package]] name = "snafu" version = "0.6.6" @@ -77,6 +111,15 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + [[package]] name = "unicode-xid" version = "0.2.0" diff --git a/Cargo.toml b/Cargo.toml index 21e1da6..653beca 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -10,3 +10,4 @@ edition = "2018" snafu = "0.6.6" lazy_static = "1.4.0" maplit = "1.0.2" +regex = "1.3.7" diff --git a/src/syn/error.rs b/src/syn/error.rs index f43b214..db71a92 100644 --- a/src/syn/error.rs +++ b/src/syn/error.rs @@ -7,29 +7,19 @@ pub enum Error { ExpectedGot { expected: String, got: String, - span: Span, + pos: Pos, }, #[snafu(display("unexpected {}", what))] Unexpected { what: String, - span: Span, + pos: Pos, }, #[snafu(display("unknown {}", what))] Unknown { what: String, - span: Span, - } -} - -impl Spanned for Error { - fn span(&self) -> Span { - match self { - Error::ExpectedGot { span, .. } - | Error::Unknown { span, .. } - | Error::Unexpected { span, .. } => { *span } - } + pos: Pos, } } diff --git a/src/syn/lexer.rs b/src/syn/lexer.rs index 5504e3e..73de156 100644 --- a/src/syn/lexer.rs +++ b/src/syn/lexer.rs @@ -4,6 +4,7 @@ use crate::{ }; use lazy_static::lazy_static; use maplit::hashmap; +use regex::{Regex, RegexBuilder}; use std::{collections::HashMap, mem, str::Chars}; const IDENT_START_CHARS: &'static [char] = &[ @@ -38,13 +39,12 @@ lazy_static! { static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! { "return" => TokenKind::KwReturn, }; + } pub struct Lexer<'t> { - chars: Chars<'t>, text: &'t str, - start: Pos, - end: Pos, + pos: Pos, } impl<'t> Lexer<'t> { @@ -58,52 +58,94 @@ impl<'t> Lexer<'t> { }; Lexer { - chars: text.chars(), text, - start: pos, - end: pos, + pos, } } /// Gets whether this lexer has reached the EOF. 
pub fn is_eof(&self) -> bool { - self.chars.clone().next().is_none() + self.chars().next().is_none() } //////////////////////////////////////////////////////////////////////////////// // Character advancement //////////////////////////////////////////////////////////////////////////////// - fn curr_char(&self) -> Option<char> { - self.chars.clone().next() + fn chars(&'t self) -> Chars<'t> { + self.pos_text().chars() } - fn adv_char(&mut self) -> Option<char> { - let c = self.chars.next()?; - self.end = self.end.next_char(c); - Some(c) + fn pos_text(&self) -> &str { + &self.text[self.pos.byte..] + } + + fn curr_char(&self) -> Option<char> { + self.chars().next() } fn skip_whitespace(&mut self) { - self.match_while(|c| c.is_whitespace()); - } - - fn catchup(&mut self) -> Span { - let start = mem::replace(&mut self.start, self.end); - Span { - start, - end: self.end, + while let Some(c) = self.curr_char() { + if !c.is_whitespace() { + break; + } else { + self.adv_char(); + } } } - fn make_token(&mut self, kind: TokenKind) -> Token { - let span = self.catchup(); - Token::new(kind, span) + fn adv_char(&mut self) -> Option<char> { + let c = self.curr_char()?; + self.pos = self.pos.next_char(c); + Some(c) } //////////////////////////////////////////////////////////////////////////////// // Tokens //////////////////////////////////////////////////////////////////////////////// pub fn next_token(&mut self) -> Result<Option<Token>> { + // Constants and statics + lazy_static!
{ + static ref REGEX: Regex = RegexBuilder::new(r#" + ^(?P<kw_return>return) + |(?P<ident>[a-zA-Z_][a-zA-Z0-9_]*) + |(?P<sym>:[a-zA-Z_][a-zA-Z0-9_]*) + |(?P<hex_num>0[xX][0-9a-fA-F]+) + |(?P<dec_num>[0-9]+) + |(?P<lparen>\() + |(?P<rparen>\)) + |(?P<lbracket>\[) + |(?P<rbracket>\]) + |(?P<lbrace>\{) + |(?P<rbrace>\}) + |(?P<comma>,) + |(?P<arrow>->) + |(?P<eq>=) + |(?P<dq_str>"([^\\"]|\\[ntr0"'])*") + |(?P<sq_str>'([^\\"]|\\[ntr0"'])*') + "#).ignore_whitespace(true) + .build() + .unwrap(); + } + + const CAPTURES: &[(&str, TokenKind)] = &[ + ("kw_return", TokenKind::KwReturn), + ("ident", TokenKind::Ident), + ("sym", TokenKind::Sym), + ("dec_num", TokenKind::Num), + ("hex_num", TokenKind::Num), + ("dq_str", TokenKind::Str), + ("sq_str", TokenKind::Str), + ("lparen", TokenKind::LParen), + ("rparen", TokenKind::RParen), + ("lbracket", TokenKind::LBracket), + ("rbracket", TokenKind::RBracket), + ("lbrace", TokenKind::LBrace), + ("rbrace", TokenKind::RBrace), + ("comma", TokenKind::Comma), + ("arrow", TokenKind::Arrow), + ("eq", TokenKind::Eq), + ]; + self.skip_whitespace(); let curr = if let Some(curr) = self.curr_char() { @@ -112,145 +154,35 @@ impl<'t> Lexer<'t> { return Ok(None); }; - let token = match curr { - c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?, - c if DEC_NUM_CHARS.contains(&c) => self.next_num()?, - '"' | '\'' => self.next_str()?, - ':' => self.next_sym()?, - '(' => self.next_char_token('(', TokenKind::LParen)?, - ')' => self.next_char_token(')', TokenKind::RParen)?, - '{' => self.next_char_token('{', TokenKind::LBrace)?, - '}' => self.next_char_token('}', TokenKind::RBrace)?, - '[' => self.next_char_token('[', TokenKind::LBracket)?, - ']' => self.next_char_token(']', TokenKind::RBracket)?, - ',' => self.next_char_token(',', TokenKind::Comma)?, - c if OP_CHARS.contains(&c) => self.next_op()?, - c => return Err(Error::Unexpected { - what: format!("character {}", c.escape_debug()), - span: self.span(), - }) - }; - Ok(Some(token)) - } + let caps = REGEX.captures(&self.text[self.pos.byte..]) + .ok_or_else(|| Error::Unexpected { + what: "EOF".to_string(), + pos:
self.pos, + })?; - fn next_ident_or_kw(&mut self) -> Result<Token> { - let ident = self.expect_ident("identifier")?; - let kind = KEYWORDS.get(ident).copied() - .unwrap_or(TokenKind::Ident); - Ok(self.make_token(kind)) - } + // Get first capture + let capture_kind = CAPTURES.iter() + .filter_map(|(name, kind)| + caps.name(name) + .map(|cap| (cap, kind))) + .next(); - fn next_num(&mut self) -> Result<Token> { - let first = self.expect_any(DEC_NUM_CHARS, "number")?; - let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) { - self.adv_char().unwrap(); - self.expect_any(HEX_NUM_CHARS, "hex number")?; - HEX_NUM_CHARS + let (token_text, kind) = if let Some((capture, kind)) = capture_kind { + (capture.as_str(), *kind) } else { - DEC_NUM_CHARS + return Err( + Error::Unexpected { + what: format!("character {}", (&self.text[self.pos.byte..]).chars().next().unwrap()), + pos: self.pos, + } + ); }; - self.match_while(|c| alphabet.contains(&c)); - Ok(self.make_token(TokenKind::Num)) - } + let start = self.pos; + self.pos.adv_str(token_text); + let end = self.pos; - fn next_str(&mut self) -> Result<Token> { - let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?; - while let Some(c) = self.match_where(|curr| curr != start_char) { - if c == '\\' { - // Match escapes - self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?; - } - } - self.expect_char(start_char, "end of string")?; - Ok(self.make_token(TokenKind::Str)) - } - - fn next_sym(&mut self) -> Result<Token> { - self.expect_char(':', "symbol")?; - self.expect_ident("symbol")?; - Ok(self.make_token(TokenKind::Sym)) - } - - fn next_op(&mut self) -> Result<Token> { - self.expect_any(OP_CHARS, "operator")?; - let op_text = self.match_while(|c| OP_CHARS.contains(&c)); - if let Some(kind) = OPS.get(op_text).copied() { - Ok(self.make_token(kind)) - } else { - Err(Error::Unknown { - what: format!("operator {}", op_text.escape_debug()), - span: self.span(), - }) - } - } - - fn next_char_token(&mut self, c:
char, kind: TokenKind) -> Result<Token> { - self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?; - Ok(self.make_token(kind)) - } - - //////////////////////////////////////////////////////////////////////////////// - // Character pattern matching - //////////////////////////////////////////////////////////////////////////////// - fn match_where<P>
(&mut self, p: P) -> Option<char> - where - P: Fn(char) -> bool, - { - if (p)(self.curr_char()?) { - self.adv_char() - } else { - None - } - } - - fn match_while<P>
(&mut self, p: P) -> &str - where - P: Fn(char) -> bool + Copy, - { - while self.match_where(p).is_some() {} - self.text_at(self.text) - } - - fn expect_where<P>
(&mut self, p: P, expected: impl ToString) -> Result<char> - where - P: Fn(char) -> bool, - { - // Check EOF - self.curr_char().ok_or_else(|| Error::ExpectedGot { - expected: expected.to_string(), - got: "EOF".to_string(), - span: self.span(), - })?; - - // Match - self.match_where(p).ok_or_else(|| Error::ExpectedGot { - expected: expected.to_string(), - got: format!("{} character", self.curr_char().unwrap().escape_debug()), - span: self.span(), - }) - } - - fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> { - self.expect_where(|curr| curr == c, expected) - } - - fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> { - self.expect_where(|curr| chars.contains(&curr), expected) - } - - fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> { - self.expect_any(IDENT_START_CHARS, expected)?; - Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr))) - } -} - -impl Spanned for Lexer<'_> { - fn span(&self) -> Span { - Span { - start: self.start, - end: self.end, - } + Ok(Some(Token::new(kind, Span { start, end }))) } } @@ -347,14 +279,4 @@ mod test { test_token!("=", TokenKind::Eq); test_token!("->", TokenKind::Arrow); } - - #[test] - fn test_expect_char() { - let mut lexer = Lexer::new("asdf"); - assert!(matches!(lexer.expect_char('a', "a"), Ok('a'))); - assert!(matches!(lexer.expect_char('s', "s"), Ok('s'))); - assert!(matches!(lexer.expect_char('d', "d"), Ok('d'))); - assert!(matches!(lexer.expect_char('f', "f"), Ok('f'))); - assert!(lexer.is_eof()); - } } diff --git a/src/syn/span.rs b/src/syn/span.rs index c744e86..b32ebc8 100644 --- a/src/syn/span.rs +++ b/src/syn/span.rs @@ -36,6 +36,16 @@ impl Pos { } } + pub fn adv_char(&mut self, c: char) { + *self = self.next_char(c); + } + + pub fn adv_str(&mut self, s: &str) { + for c in s.chars() { + self.adv_char(c); + } + } + pub fn min(self, other: Self) -> Self { if self.byte < other.byte { self diff --git a/src/syn/token.rs b/src/syn/token.rs index de9e7d3..4e58716
100644 --- a/src/syn/token.rs +++ b/src/syn/token.rs @@ -48,6 +48,7 @@ impl Display for TokenKind { } } +#[derive(Debug, Clone, Copy)] pub struct Token { kind: TokenKind, span: Span,