Move lexer to use regex instead of hand-rolled lexing
Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
43
Cargo.lock
generated
43
Cargo.lock
generated
@@ -1,5 +1,14 @@
|
|||||||
# This file is automatically @generated by Cargo.
|
# This file is automatically @generated by Cargo.
|
||||||
# It is not intended for manual editing.
|
# It is not intended for manual editing.
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "0.7.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "doc-comment"
|
name = "doc-comment"
|
||||||
version = "0.3.3"
|
version = "0.3.3"
|
||||||
@@ -18,12 +27,19 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.3.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "not-python"
|
name = "not-python"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"maplit",
|
"maplit",
|
||||||
|
"regex",
|
||||||
"snafu",
|
"snafu",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -45,6 +61,24 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.3.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
"thread_local",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.6.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "snafu"
|
name = "snafu"
|
||||||
version = "0.6.6"
|
version = "0.6.6"
|
||||||
@@ -77,6 +111,15 @@ dependencies = [
|
|||||||
"unicode-xid",
|
"unicode-xid",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thread_local"
|
||||||
|
version = "1.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-xid"
|
name = "unicode-xid"
|
||||||
version = "0.2.0"
|
version = "0.2.0"
|
||||||
|
|||||||
@@ -10,3 +10,4 @@ edition = "2018"
|
|||||||
snafu = "0.6.6"
|
snafu = "0.6.6"
|
||||||
lazy_static = "1.4.0"
|
lazy_static = "1.4.0"
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
|
regex = "1.3.7"
|
||||||
|
|||||||
@@ -7,29 +7,19 @@ pub enum Error {
|
|||||||
ExpectedGot {
|
ExpectedGot {
|
||||||
expected: String,
|
expected: String,
|
||||||
got: String,
|
got: String,
|
||||||
span: Span,
|
pos: Pos,
|
||||||
},
|
},
|
||||||
|
|
||||||
#[snafu(display("unexpected {}", what))]
|
#[snafu(display("unexpected {}", what))]
|
||||||
Unexpected {
|
Unexpected {
|
||||||
what: String,
|
what: String,
|
||||||
span: Span,
|
pos: Pos,
|
||||||
},
|
},
|
||||||
|
|
||||||
#[snafu(display("unknown {}", what))]
|
#[snafu(display("unknown {}", what))]
|
||||||
Unknown {
|
Unknown {
|
||||||
what: String,
|
what: String,
|
||||||
span: Span,
|
pos: Pos,
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Spanned for Error {
|
|
||||||
fn span(&self) -> Span {
|
|
||||||
match self {
|
|
||||||
Error::ExpectedGot { span, .. }
|
|
||||||
| Error::Unknown { span, .. }
|
|
||||||
| Error::Unexpected { span, .. } => { *span }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
254
src/syn/lexer.rs
254
src/syn/lexer.rs
@@ -4,6 +4,7 @@ use crate::{
|
|||||||
};
|
};
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use maplit::hashmap;
|
use maplit::hashmap;
|
||||||
|
use regex::{Regex, RegexBuilder};
|
||||||
use std::{collections::HashMap, mem, str::Chars};
|
use std::{collections::HashMap, mem, str::Chars};
|
||||||
|
|
||||||
const IDENT_START_CHARS: &'static [char] = &[
|
const IDENT_START_CHARS: &'static [char] = &[
|
||||||
@@ -38,13 +39,12 @@ lazy_static! {
|
|||||||
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
|
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
|
||||||
"return" => TokenKind::KwReturn,
|
"return" => TokenKind::KwReturn,
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Lexer<'t> {
|
pub struct Lexer<'t> {
|
||||||
chars: Chars<'t>,
|
|
||||||
text: &'t str,
|
text: &'t str,
|
||||||
start: Pos,
|
pos: Pos,
|
||||||
end: Pos,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Lexer<'t> {
|
impl<'t> Lexer<'t> {
|
||||||
@@ -58,52 +58,94 @@ impl<'t> Lexer<'t> {
|
|||||||
};
|
};
|
||||||
|
|
||||||
Lexer {
|
Lexer {
|
||||||
chars: text.chars(),
|
|
||||||
text,
|
text,
|
||||||
start: pos,
|
pos,
|
||||||
end: pos,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Gets whether this lexer has reached the EOF.
|
/// Gets whether this lexer has reached the EOF.
|
||||||
pub fn is_eof(&self) -> bool {
|
pub fn is_eof(&self) -> bool {
|
||||||
self.chars.clone().next().is_none()
|
self.chars().next().is_none()
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Character advancement
|
// Character advancement
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
fn curr_char(&self) -> Option<char> {
|
fn chars(&'t self) -> Chars<'t> {
|
||||||
self.chars.clone().next()
|
self.pos_text().chars()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn adv_char(&mut self) -> Option<char> {
|
fn pos_text(&self) -> &str {
|
||||||
let c = self.chars.next()?;
|
&self.text[self.pos.byte..]
|
||||||
self.end = self.end.next_char(c);
|
}
|
||||||
Some(c)
|
|
||||||
|
fn curr_char(&self) -> Option<char> {
|
||||||
|
self.chars().next()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip_whitespace(&mut self) {
|
fn skip_whitespace(&mut self) {
|
||||||
self.match_while(|c| c.is_whitespace());
|
while let Some(c) = self.curr_char() {
|
||||||
|
if !c.is_whitespace() {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
self.adv_char();
|
||||||
}
|
}
|
||||||
|
|
||||||
fn catchup(&mut self) -> Span {
|
|
||||||
let start = mem::replace(&mut self.start, self.end);
|
|
||||||
Span {
|
|
||||||
start,
|
|
||||||
end: self.end,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn make_token(&mut self, kind: TokenKind) -> Token {
|
fn adv_char(&mut self) -> Option<char> {
|
||||||
let span = self.catchup();
|
let c = self.curr_char()?;
|
||||||
Token::new(kind, span)
|
self.pos = self.pos.next_char(c);
|
||||||
|
Some(c)
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Tokens
|
// Tokens
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
pub fn next_token(&mut self) -> Result<Option<Token>> {
|
pub fn next_token(&mut self) -> Result<Option<Token>> {
|
||||||
|
// Constants and statics
|
||||||
|
lazy_static! {
    /// Matches exactly one token at the very start of the remaining input.
    ///
    /// All alternatives live inside a single `^(?: ... )` group. Alternation
    /// binds more loosely than `^`, so without the group only the first
    /// alternative is anchored and every other token could match further
    /// along in the text, skipping over characters that start no token.
    /// With the group in place, a failed match means the text at the
    /// current position does not begin any known token.
    ///
    /// Details worth knowing when editing the pattern:
    /// * `return` carries a trailing `\b` so identifiers that merely start
    ///   with the keyword (e.g. `returned`) fall through to `ident`.
    /// * `hex_num` is listed before `dec_num`; otherwise `0x1F` would lex as
    ///   the decimal `0` followed by an identifier.
    /// * Each string form excludes only a backslash and its own delimiter
    ///   from the unescaped body, and accepts `\\` as an escape alongside
    ///   `\n \t \r \0 \" \'`.
    /// * Verbose mode is enabled, so whitespace in the pattern is ignored
    ///   and an unescaped `#` would start a comment.
    static ref REGEX: Regex = RegexBuilder::new(r#"
        ^(?:
            (?P<kw_return>return\b)
            |(?P<ident>[a-zA-Z_][a-zA-Z0-9_]*)
            |(?P<sym>:[a-zA-Z_][a-zA-Z0-9_]*)
            |(?P<hex_num>0[xX][0-9a-fA-F]+)
            |(?P<dec_num>[0-9]+)
            |(?P<lparen>\()
            |(?P<rparen>\))
            |(?P<lbracket>\[)
            |(?P<rbracket>\])
            |(?P<lbrace>\{)
            |(?P<rbrace>\})
            |(?P<comma>,)
            |(?P<arrow>->)
            |(?P<eq>=)
            |(?P<dq_str>"([^\\"]|\\[ntr0"'\\])*")
            |(?P<sq_str>'([^\\']|\\[ntr0"'\\])*')
        )
        "#).ignore_whitespace(true)
        .build()
        .expect("token regex is a fixed pattern and must compile");
}
|
||||||
|
|
||||||
|
const CAPTURES: &[(&str, TokenKind)] = &[
|
||||||
|
("kw_return", TokenKind::KwReturn),
|
||||||
|
("ident", TokenKind::Ident),
|
||||||
|
("sym", TokenKind::Sym),
|
||||||
|
("dec_num", TokenKind::Num),
|
||||||
|
("hex_num", TokenKind::Num),
|
||||||
|
("dq_str", TokenKind::Str),
|
||||||
|
("sq_str", TokenKind::Str),
|
||||||
|
("lparen", TokenKind::LParen),
|
||||||
|
("rparen", TokenKind::RParen),
|
||||||
|
("lbracket", TokenKind::LBracket),
|
||||||
|
("rbracket", TokenKind::RBracket),
|
||||||
|
("lbrace", TokenKind::LBrace),
|
||||||
|
("rbrace", TokenKind::RBrace),
|
||||||
|
("comma", TokenKind::Comma),
|
||||||
|
("arrow", TokenKind::Arrow),
|
||||||
|
("eq", TokenKind::Eq),
|
||||||
|
];
|
||||||
|
|
||||||
self.skip_whitespace();
|
self.skip_whitespace();
|
||||||
|
|
||||||
let curr = if let Some(curr) = self.curr_char() {
|
let curr = if let Some(curr) = self.curr_char() {
|
||||||
@@ -112,145 +154,35 @@ impl<'t> Lexer<'t> {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
|
|
||||||
let token = match curr {
|
let caps = REGEX.captures(&self.text[self.pos.byte..])
|
||||||
c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
|
.ok_or_else(|| Error::Unexpected {
|
||||||
c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
|
what: "EOF".to_string(),
|
||||||
'"' | '\'' => self.next_str()?,
|
pos: self.pos,
|
||||||
':' => self.next_sym()?,
|
|
||||||
'(' => self.next_char_token('(', TokenKind::LParen)?,
|
|
||||||
')' => self.next_char_token(')', TokenKind::RParen)?,
|
|
||||||
'{' => self.next_char_token('{', TokenKind::LBrace)?,
|
|
||||||
'}' => self.next_char_token('}', TokenKind::RBrace)?,
|
|
||||||
'[' => self.next_char_token('[', TokenKind::LBracket)?,
|
|
||||||
']' => self.next_char_token(']', TokenKind::RBracket)?,
|
|
||||||
',' => self.next_char_token(',', TokenKind::Comma)?,
|
|
||||||
c if OP_CHARS.contains(&c) => self.next_op()?,
|
|
||||||
c => return Err(Error::Unexpected {
|
|
||||||
what: format!("character {}", c.escape_debug()),
|
|
||||||
span: self.span(),
|
|
||||||
})
|
|
||||||
};
|
|
||||||
Ok(Some(token))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_ident_or_kw(&mut self) -> Result<Token> {
|
|
||||||
let ident = self.expect_ident("identifier")?;
|
|
||||||
let kind = KEYWORDS.get(ident).copied()
|
|
||||||
.unwrap_or(TokenKind::Ident);
|
|
||||||
Ok(self.make_token(kind))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_num(&mut self) -> Result<Token> {
|
|
||||||
let first = self.expect_any(DEC_NUM_CHARS, "number")?;
|
|
||||||
let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) {
|
|
||||||
self.adv_char().unwrap();
|
|
||||||
self.expect_any(HEX_NUM_CHARS, "hex number")?;
|
|
||||||
HEX_NUM_CHARS
|
|
||||||
} else {
|
|
||||||
DEC_NUM_CHARS
|
|
||||||
};
|
|
||||||
|
|
||||||
self.match_while(|c| alphabet.contains(&c));
|
|
||||||
Ok(self.make_token(TokenKind::Num))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_str(&mut self) -> Result<Token> {
|
|
||||||
let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?;
|
|
||||||
while let Some(c) = self.match_where(|curr| curr != start_char) {
|
|
||||||
if c == '\\' {
|
|
||||||
// Match escapes
|
|
||||||
self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.expect_char(start_char, "end of string")?;
|
|
||||||
Ok(self.make_token(TokenKind::Str))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_sym(&mut self) -> Result<Token> {
|
|
||||||
self.expect_char(':', "symbol")?;
|
|
||||||
self.expect_ident("symbol")?;
|
|
||||||
Ok(self.make_token(TokenKind::Sym))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_op(&mut self) -> Result<Token> {
|
|
||||||
self.expect_any(OP_CHARS, "operator")?;
|
|
||||||
let op_text = self.match_while(|c| OP_CHARS.contains(&c));
|
|
||||||
if let Some(kind) = OPS.get(op_text).copied() {
|
|
||||||
Ok(self.make_token(kind))
|
|
||||||
} else {
|
|
||||||
Err(Error::Unknown {
|
|
||||||
what: format!("operator {}", op_text.escape_debug()),
|
|
||||||
span: self.span(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result<Token> {
|
|
||||||
self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?;
|
|
||||||
Ok(self.make_token(kind))
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
// Character pattern matching
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
|
||||||
fn match_where<P>(&mut self, p: P) -> Option<char>
|
|
||||||
where
|
|
||||||
P: Fn(char) -> bool,
|
|
||||||
{
|
|
||||||
if (p)(self.curr_char()?) {
|
|
||||||
self.adv_char()
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn match_while<P>(&mut self, p: P) -> &str
|
|
||||||
where
|
|
||||||
P: Fn(char) -> bool + Copy,
|
|
||||||
{
|
|
||||||
while self.match_where(p).is_some() {}
|
|
||||||
self.text_at(self.text)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn expect_where<P>(&mut self, p: P, expected: impl ToString) -> Result<char>
|
|
||||||
where
|
|
||||||
P: Fn(char) -> bool,
|
|
||||||
{
|
|
||||||
// Check EOF
|
|
||||||
self.curr_char().ok_or_else(|| Error::ExpectedGot {
|
|
||||||
expected: expected.to_string(),
|
|
||||||
got: "EOF".to_string(),
|
|
||||||
span: self.span(),
|
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
// Match
|
// Get first capture
|
||||||
self.match_where(p).ok_or_else(|| Error::ExpectedGot {
|
let capture_kind = CAPTURES.iter()
|
||||||
expected: expected.to_string(),
|
.filter_map(|(name, kind)|
|
||||||
got: format!("{} character", self.curr_char().unwrap().escape_debug()),
|
caps.name(name)
|
||||||
span: self.span(),
|
.map(|cap| (cap, kind)))
|
||||||
})
|
.next();
|
||||||
}
|
|
||||||
|
|
||||||
fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> {
|
let (token_text, kind) = if let Some((capture, kind)) = capture_kind {
|
||||||
self.expect_where(|curr| curr == c, expected)
|
(capture.as_str(), *kind)
|
||||||
|
} else {
|
||||||
|
return Err(
|
||||||
|
Error::Unexpected {
|
||||||
|
what: format!("character {}", (&self.text[self.pos.byte..]).chars().next().unwrap()),
|
||||||
|
pos: self.pos,
|
||||||
}
|
}
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> {
|
let start = self.pos;
|
||||||
self.expect_where(|curr| chars.contains(&curr), expected)
|
self.pos.adv_str(token_text);
|
||||||
}
|
let end = self.pos;
|
||||||
|
|
||||||
fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> {
|
Ok(Some(Token::new(kind, Span { start, end })))
|
||||||
self.expect_any(IDENT_START_CHARS, expected)?;
|
|
||||||
Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Spanned for Lexer<'_> {
|
|
||||||
fn span(&self) -> Span {
|
|
||||||
Span {
|
|
||||||
start: self.start,
|
|
||||||
end: self.end,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -347,14 +279,4 @@ mod test {
|
|||||||
test_token!("=", TokenKind::Eq);
|
test_token!("=", TokenKind::Eq);
|
||||||
test_token!("->", TokenKind::Arrow);
|
test_token!("->", TokenKind::Arrow);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_expect_char() {
|
|
||||||
let mut lexer = Lexer::new("asdf");
|
|
||||||
assert!(matches!(lexer.expect_char('a', "a"), Ok('a')));
|
|
||||||
assert!(matches!(lexer.expect_char('s', "s"), Ok('s')));
|
|
||||||
assert!(matches!(lexer.expect_char('d', "d"), Ok('d')));
|
|
||||||
assert!(matches!(lexer.expect_char('f', "f"), Ok('f')));
|
|
||||||
assert!(lexer.is_eof());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -36,6 +36,16 @@ impl Pos {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn adv_char(&mut self, c: char) {
|
||||||
|
*self = self.next_char(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn adv_str(&mut self, s: &str) {
|
||||||
|
for c in s.chars() {
|
||||||
|
self.adv_char(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn min(self, other: Self) -> Self {
|
pub fn min(self, other: Self) -> Self {
|
||||||
if self.byte < other.byte {
|
if self.byte < other.byte {
|
||||||
self
|
self
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ impl Display for TokenKind {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct Token {
|
pub struct Token {
|
||||||
kind: TokenKind,
|
kind: TokenKind,
|
||||||
span: Span,
|
span: Span,
|
||||||
|
|||||||
Reference in New Issue
Block a user