Move lexer to use regex instead of hand-rolled lexing

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2020-04-27 19:50:46 -04:00
parent 58421a0469
commit ce97d90f9d
6 changed files with 147 additions and 180 deletions

View File

@@ -7,29 +7,19 @@ pub enum Error {
ExpectedGot {
expected: String,
got: String,
span: Span,
pos: Pos,
},
#[snafu(display("unexpected {}", what))]
Unexpected {
what: String,
span: Span,
pos: Pos,
},
#[snafu(display("unknown {}", what))]
Unknown {
what: String,
span: Span,
}
}
impl Spanned for Error {
fn span(&self) -> Span {
match self {
Error::ExpectedGot { span, .. }
| Error::Unknown { span, .. }
| Error::Unexpected { span, .. } => { *span }
}
pos: Pos,
}
}

View File

@@ -4,6 +4,7 @@ use crate::{
};
use lazy_static::lazy_static;
use maplit::hashmap;
use regex::{Regex, RegexBuilder};
use std::{collections::HashMap, mem, str::Chars};
const IDENT_START_CHARS: &'static [char] = &[
@@ -38,13 +39,12 @@ lazy_static! {
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
"return" => TokenKind::KwReturn,
};
}
pub struct Lexer<'t> {
chars: Chars<'t>,
text: &'t str,
start: Pos,
end: Pos,
pos: Pos,
}
impl<'t> Lexer<'t> {
@@ -58,52 +58,94 @@ impl<'t> Lexer<'t> {
};
Lexer {
chars: text.chars(),
text,
start: pos,
end: pos,
pos,
}
}
/// Gets whether this lexer has reached the EOF.
pub fn is_eof(&self) -> bool {
self.chars.clone().next().is_none()
self.chars().next().is_none()
}
////////////////////////////////////////////////////////////////////////////////
// Character advancement
////////////////////////////////////////////////////////////////////////////////
fn curr_char(&self) -> Option<char> {
self.chars.clone().next()
fn chars(&'t self) -> Chars<'t> {
self.pos_text().chars()
}
fn adv_char(&mut self) -> Option<char> {
let c = self.chars.next()?;
self.end = self.end.next_char(c);
Some(c)
fn pos_text(&self) -> &str {
&self.text[self.pos.byte..]
}
fn curr_char(&self) -> Option<char> {
self.chars().next()
}
fn skip_whitespace(&mut self) {
self.match_while(|c| c.is_whitespace());
}
fn catchup(&mut self) -> Span {
let start = mem::replace(&mut self.start, self.end);
Span {
start,
end: self.end,
while let Some(c) = self.curr_char() {
if !c.is_whitespace() {
break;
} else {
self.adv_char();
}
}
}
fn make_token(&mut self, kind: TokenKind) -> Token {
let span = self.catchup();
Token::new(kind, span)
fn adv_char(&mut self) -> Option<char> {
let c = self.curr_char()?;
self.pos = self.pos.next_char(c);
Some(c)
}
////////////////////////////////////////////////////////////////////////////////
// Tokens
////////////////////////////////////////////////////////////////////////////////
pub fn next_token(&mut self) -> Result<Option<Token>> {
// Constants and statics
lazy_static! {
    // Matches exactly one token at the very start of the remaining input.
    //
    // The whole alternation is wrapped in `^(?: ... )`. Alternation binds
    // looser than concatenation, so a bare leading `^` anchors only the first
    // alternative and lets every other alternative match further into the
    // text, silently skipping whatever precedes the match.
    //
    // Other details:
    // * `return\b` keeps the keyword from matching the prefix of a longer
    //   identifier such as `returned`.
    // * Each string form excludes its own quote character from the body, and
    //   `\\` is accepted as an escape alongside n, t, r, 0 and both quotes.
    static ref REGEX: Regex = RegexBuilder::new(r#"
        ^(?:
            (?P<kw_return>return\b)
            |(?P<ident>[a-zA-Z_][a-zA-Z0-9_]*)
            |(?P<sym>:[a-zA-Z_][a-zA-Z0-9_]*)
            |(?P<hex_num>0[xX][0-9a-fA-F]+)
            |(?P<dec_num>[0-9]+)
            |(?P<lparen>\()
            |(?P<rparen>\))
            |(?P<lbracket>\[)
            |(?P<rbracket>\])
            |(?P<lbrace>\{)
            |(?P<rbrace>\})
            |(?P<comma>,)
            |(?P<arrow>->)
            |(?P<eq>=)
            |(?P<dq_str>"([^\\"]|\\[ntr0"'\\])*")
            |(?P<sq_str>'([^\\']|\\[ntr0"'\\])*')
        )
    "#).ignore_whitespace(true)
        .build()
        .expect("token regex is a valid pattern");
}
// Pairs each named capture group of the token regex with the `TokenKind`
// produced when that group participates in a match. The table is scanned in
// order and the first group that is present determines the token kind.
// Both number groups map to `Num` and both string groups map to `Str`.
const CAPTURES: &[(&str, TokenKind)] = &[
("kw_return", TokenKind::KwReturn),
("ident", TokenKind::Ident),
("sym", TokenKind::Sym),
("dec_num", TokenKind::Num),
("hex_num", TokenKind::Num),
("dq_str", TokenKind::Str),
("sq_str", TokenKind::Str),
("lparen", TokenKind::LParen),
("rparen", TokenKind::RParen),
("lbracket", TokenKind::LBracket),
("rbracket", TokenKind::RBracket),
("lbrace", TokenKind::LBrace),
("rbrace", TokenKind::RBrace),
("comma", TokenKind::Comma),
("arrow", TokenKind::Arrow),
("eq", TokenKind::Eq),
];
self.skip_whitespace();
let curr = if let Some(curr) = self.curr_char() {
@@ -112,145 +154,35 @@ impl<'t> Lexer<'t> {
return Ok(None);
};
let token = match curr {
c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
'"' | '\'' => self.next_str()?,
':' => self.next_sym()?,
'(' => self.next_char_token('(', TokenKind::LParen)?,
')' => self.next_char_token(')', TokenKind::RParen)?,
'{' => self.next_char_token('{', TokenKind::LBrace)?,
'}' => self.next_char_token('}', TokenKind::RBrace)?,
'[' => self.next_char_token('[', TokenKind::LBracket)?,
']' => self.next_char_token(']', TokenKind::RBracket)?,
',' => self.next_char_token(',', TokenKind::Comma)?,
c if OP_CHARS.contains(&c) => self.next_op()?,
c => return Err(Error::Unexpected {
what: format!("character {}", c.escape_debug()),
span: self.span(),
})
};
Ok(Some(token))
}
// No token pattern matched the remaining input. Report the character that is
// actually at the cursor; "EOF" is only accurate when nothing is left to read.
let caps = REGEX.captures(&self.text[self.pos.byte..])
    .ok_or_else(|| Error::Unexpected {
        what: match self.text[self.pos.byte..].chars().next() {
            Some(c) => format!("character {}", c),
            None => "EOF".to_string(),
        },
        pos: self.pos,
    })?;
fn next_ident_or_kw(&mut self) -> Result<Token> {
let ident = self.expect_ident("identifier")?;
let kind = KEYWORDS.get(ident).copied()
.unwrap_or(TokenKind::Ident);
Ok(self.make_token(kind))
}
// Get first capture
let capture_kind = CAPTURES.iter()
.filter_map(|(name, kind)|
caps.name(name)
.map(|cap| (cap, kind)))
.next();
fn next_num(&mut self) -> Result<Token> {
let first = self.expect_any(DEC_NUM_CHARS, "number")?;
let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) {
self.adv_char().unwrap();
self.expect_any(HEX_NUM_CHARS, "hex number")?;
HEX_NUM_CHARS
let (token_text, kind) = if let Some((capture, kind)) = capture_kind {
(capture.as_str(), *kind)
} else {
DEC_NUM_CHARS
return Err(
Error::Unexpected {
what: format!("character {}", (&self.text[self.pos.byte..]).chars().next().unwrap()),
pos: self.pos,
}
);
};
self.match_while(|c| alphabet.contains(&c));
Ok(self.make_token(TokenKind::Num))
}
let start = self.pos;
self.pos.adv_str(token_text);
let end = self.pos;
fn next_str(&mut self) -> Result<Token> {
let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?;
while let Some(c) = self.match_where(|curr| curr != start_char) {
if c == '\\' {
// Match escapes
self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?;
}
}
self.expect_char(start_char, "end of string")?;
Ok(self.make_token(TokenKind::Str))
}
fn next_sym(&mut self) -> Result<Token> {
self.expect_char(':', "symbol")?;
self.expect_ident("symbol")?;
Ok(self.make_token(TokenKind::Sym))
}
fn next_op(&mut self) -> Result<Token> {
self.expect_any(OP_CHARS, "operator")?;
let op_text = self.match_while(|c| OP_CHARS.contains(&c));
if let Some(kind) = OPS.get(op_text).copied() {
Ok(self.make_token(kind))
} else {
Err(Error::Unknown {
what: format!("operator {}", op_text.escape_debug()),
span: self.span(),
})
}
}
fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result<Token> {
self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?;
Ok(self.make_token(kind))
}
////////////////////////////////////////////////////////////////////////////////
// Character pattern matching
////////////////////////////////////////////////////////////////////////////////
fn match_where<P>(&mut self, p: P) -> Option<char>
where
P: Fn(char) -> bool,
{
if (p)(self.curr_char()?) {
self.adv_char()
} else {
None
}
}
fn match_while<P>(&mut self, p: P) -> &str
where
P: Fn(char) -> bool + Copy,
{
while self.match_where(p).is_some() {}
self.text_at(self.text)
}
fn expect_where<P>(&mut self, p: P, expected: impl ToString) -> Result<char>
where
P: Fn(char) -> bool,
{
// Check EOF
self.curr_char().ok_or_else(|| Error::ExpectedGot {
expected: expected.to_string(),
got: "EOF".to_string(),
span: self.span(),
})?;
// Match
self.match_where(p).ok_or_else(|| Error::ExpectedGot {
expected: expected.to_string(),
got: format!("{} character", self.curr_char().unwrap().escape_debug()),
span: self.span(),
})
}
fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> {
self.expect_where(|curr| curr == c, expected)
}
fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> {
self.expect_where(|curr| chars.contains(&curr), expected)
}
fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> {
self.expect_any(IDENT_START_CHARS, expected)?;
Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr)))
}
}
impl Spanned for Lexer<'_> {
fn span(&self) -> Span {
Span {
start: self.start,
end: self.end,
}
Ok(Some(Token::new(kind, Span { start, end })))
}
}
@@ -347,14 +279,4 @@ mod test {
test_token!("=", TokenKind::Eq);
test_token!("->", TokenKind::Arrow);
}
#[test]
fn test_expect_char() {
let mut lexer = Lexer::new("asdf");
assert!(matches!(lexer.expect_char('a', "a"), Ok('a')));
assert!(matches!(lexer.expect_char('s', "s"), Ok('s')));
assert!(matches!(lexer.expect_char('d', "d"), Ok('d')));
assert!(matches!(lexer.expect_char('f', "f"), Ok('f')));
assert!(lexer.is_eof());
}
}

View File

@@ -36,6 +36,16 @@ impl Pos {
}
}
/// Advances this position in place past the character `c`.
///
/// In-place counterpart of `next_char`: the position computed by
/// `next_char(c)` is stored back into `self`.
pub fn adv_char(&mut self, c: char) {
    let advanced = self.next_char(c);
    *self = advanced;
}
/// Advances this position in place past every character of `s`, in order.
///
/// Equivalent to calling `adv_char` once per character; an empty string
/// leaves the position unchanged.
pub fn adv_str(&mut self, s: &str) {
    s.chars().for_each(|ch| self.adv_char(ch));
}
pub fn min(self, other: Self) -> Self {
if self.byte < other.byte {
self

View File

@@ -48,6 +48,7 @@ impl Display for TokenKind {
}
}
#[derive(Debug, Clone, Copy)]
pub struct Token {
kind: TokenKind,
span: Span,