Add parser and AST; remove the keyword/operator tables and the maplit dependency from the lexer

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2020-04-27 20:17:16 -04:00
parent ce97d90f9d
commit 28d29c2270
8 changed files with 340 additions and 56 deletions

7
Cargo.lock generated
View File

@@ -21,12 +21,6 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.3.3" version = "2.3.3"
@@ -38,7 +32,6 @@ name = "not-python"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"lazy_static", "lazy_static",
"maplit",
"regex", "regex",
"snafu", "snafu",
] ]

View File

@@ -9,5 +9,4 @@ edition = "2018"
[dependencies] [dependencies]
snafu = "0.6.6" snafu = "0.6.6"
lazy_static = "1.4.0" lazy_static = "1.4.0"
maplit = "1.0.2"
regex = "1.3.7" regex = "1.3.7"

52
src/syn/ast.rs Normal file
View File

@@ -0,0 +1,52 @@
use crate::syn::{op::*, span::*};
/// An expression node in the abstract syntax tree.
///
/// `Bin` and `Un` are boxed because they recursively contain `Expr`
/// values; without the indirection the enum would be infinitely sized.
pub enum Expr {
    /// An atomic expression (identifier, literal, or collection).
    Base(BaseExpr),
    /// A binary (two-operand) expression.
    Bin(Box<BinExpr>),
    /// A unary (one-operand) expression.
    Un(Box<UnExpr>),
}
impl From<UnExpr> for Expr {
fn from(un: UnExpr) -> Self {
Expr::Un(Box::new(un))
}
}
impl From<BinExpr> for Expr {
fn from(bin: BinExpr) -> Self {
Expr::Bin(Box::new(bin))
}
}
impl From<BaseExpr> for Expr {
fn from(base: BaseExpr) -> Self {
Expr::Base(base)
}
}
/// A binary expression: `lhs op rhs`.
///
/// NOTE(review): unlike `UnExpr` and `BaseExpr`, this node carries no
/// `span` field — confirm whether the overall span should be derived from
/// `lhs`/`rhs` spans or stored here explicitly.
pub struct BinExpr {
    pub lhs: Expr,
    pub op: BinOp,
    pub rhs: Expr,
}
/// A unary expression `op expr`, together with the source span covering
/// the whole expression.
pub struct UnExpr {
    pub op: UnOp,
    pub expr: Expr,
    pub span: Span,
}
/// The payload of an atomic expression.
///
/// Leaf variants (`Ident`, `Num`, `Str`, `Sym`) carry no text of their
/// own; presumably the lexeme is recovered from the owning `BaseExpr`'s
/// span — TODO confirm once evaluation exists.
pub enum BaseExprKind {
    Ident,
    Num,
    Str,
    Sym,
    /// A bracketed list of expressions, e.g. `[a, b]`.
    List(Vec<Expr>),
    /// A brace-delimited collection of key/value pairs.
    Object(Vec<(Expr, Expr)>),
    /// A parenthesized sequence of expressions.
    Tuple(Vec<Expr>),
}
/// An atomic expression together with its source span.
pub struct BaseExpr {
    pub kind: BaseExprKind,
    pub span: Span,
}

View File

@@ -1,46 +1,7 @@
use crate::{ use crate::syn::{error::*, span::*, token::*};
syn::{error::*, span::*, token::*},
util::LazyString,
};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use maplit::hashmap;
use regex::{Regex, RegexBuilder}; use regex::{Regex, RegexBuilder};
use std::{collections::HashMap, mem, str::Chars}; use std::str::Chars;
const IDENT_START_CHARS: &'static [char] = &[
'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_',
];
const IDENT_CHARS: &'static [char] = &[
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B',
'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
'V', 'W', 'X', 'Y', 'Z', '_',
];
const DEC_NUM_CHARS: &'static [char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
const HEX_NUM_CHARS: &'static [char] = &[
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C',
'D', 'E', 'F',
];
const STR_QUOTE_CHARS: &'static [char] = &['"', '\''];
const OP_CHARS: &'static [char] = &['=', '+', '*', '-', '/', '>', '<', '~', '!', '%', '^'];
lazy_static! {
static ref OPS: HashMap<&'static str, TokenKind> = hashmap! {
"=" => TokenKind::Eq,
"->" => TokenKind::Arrow,
};
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
"return" => TokenKind::KwReturn,
};
}
pub struct Lexer<'t> { pub struct Lexer<'t> {
text: &'t str, text: &'t str,
@@ -120,6 +81,10 @@ impl<'t> Lexer<'t> {
|(?P<comma>,) |(?P<comma>,)
|(?P<arrow>->) |(?P<arrow>->)
|(?P<eq>=) |(?P<eq>=)
|(?P<plus>\+)
|(?P<minus>-)
|(?P<splat>\*)
|(?P<fslash>/)
|(?P<dq_str>"([^\\"]|\\[ntr0"'])*") |(?P<dq_str>"([^\\"]|\\[ntr0"'])*")
|(?P<sq_str>'([^\\"]|\\[ntr0"'])*') |(?P<sq_str>'([^\\"]|\\[ntr0"'])*')
"#).ignore_whitespace(true) "#).ignore_whitespace(true)
@@ -129,12 +94,14 @@ impl<'t> Lexer<'t> {
const CAPTURES: &[(&str, TokenKind)] = &[ const CAPTURES: &[(&str, TokenKind)] = &[
("kw_return", TokenKind::KwReturn), ("kw_return", TokenKind::KwReturn),
("ident", TokenKind::Ident), ("ident", TokenKind::Ident),
("sym", TokenKind::Sym), ("sym", TokenKind::Sym),
("dec_num", TokenKind::Num), ("dec_num", TokenKind::Num),
("hex_num", TokenKind::Num), ("hex_num", TokenKind::Num),
("dq_str", TokenKind::Str), ("dq_str", TokenKind::Str),
("sq_str", TokenKind::Str), ("sq_str", TokenKind::Str),
("lparen", TokenKind::LParen), ("lparen", TokenKind::LParen),
("rparen", TokenKind::RParen), ("rparen", TokenKind::RParen),
("lbracket", TokenKind::LBracket), ("lbracket", TokenKind::LBracket),
@@ -142,15 +109,18 @@ impl<'t> Lexer<'t> {
("lbrace", TokenKind::LBrace), ("lbrace", TokenKind::LBrace),
("rbrace", TokenKind::RBrace), ("rbrace", TokenKind::RBrace),
("comma", TokenKind::Comma), ("comma", TokenKind::Comma),
("plus", TokenKind::Plus),
("minus", TokenKind::Minus),
("splat", TokenKind::Splat),
("fslash", TokenKind::FSlash),
("arrow", TokenKind::Arrow), ("arrow", TokenKind::Arrow),
("eq", TokenKind::Eq), ("eq", TokenKind::Eq),
]; ];
self.skip_whitespace(); self.skip_whitespace();
let curr = if let Some(curr) = self.curr_char() { if self.curr_char().is_none() {
curr
} else {
return Ok(None); return Ok(None);
}; };
@@ -272,6 +242,10 @@ mod test {
test_token!("[", TokenKind::LBracket); test_token!("[", TokenKind::LBracket);
test_token!("]", TokenKind::RBracket); test_token!("]", TokenKind::RBracket);
test_token!(",", TokenKind::Comma); test_token!(",", TokenKind::Comma);
test_token!("+", TokenKind::Plus);
test_token!("-", TokenKind::Minus);
test_token!("*", TokenKind::Splat);
test_token!("/", TokenKind::FSlash);
} }
#[test] #[test]

View File

@@ -1,4 +1,7 @@
pub mod ast;
pub mod error; pub mod error;
pub mod lexer; pub mod lexer;
pub mod op;
pub mod parser;
pub mod span; pub mod span;
pub mod token; pub mod token;

6
src/syn/op.rs Normal file
View File

@@ -0,0 +1,6 @@
/// Unary operators. Currently an empty placeholder — variants are TODO
/// (the parser has a matching "TODO unary tokens" note).
pub enum UnOp {
}
/// Binary operators. Currently an empty placeholder — variants are TODO.
pub enum BinOp {
}

249
src/syn/parser.rs Normal file
View File

@@ -0,0 +1,249 @@
#![allow(dead_code)]
use crate::syn::{ast::*, error::*, lexer::Lexer, span::*, token::*};
use std::{convert::TryFrom, mem};
/// Token kinds that may begin any expression, including the openers of
/// grouped/collection forms. Used for lookahead dispatch.
///
/// (`'static` is implied on const references and was redundant here.)
const EXPR_START: &[TokenKind] = &[
    TokenKind::Ident,
    TokenKind::Num,
    TokenKind::Str,
    TokenKind::Sym,
    TokenKind::LParen,
    TokenKind::LBracket,
    TokenKind::LBrace,
    // TODO unary tokens
];
/// Token kinds that may begin a "base" (atomic, single-token) expression.
///
/// (`'static` is implied on const references and was redundant here.)
const VALUE_EXPR_START: &[TokenKind] = &[
    TokenKind::Ident,
    TokenKind::Num,
    TokenKind::Str,
    TokenKind::Sym,
];
/// A single-token-lookahead recursive-descent parser over a `Lexer`.
pub struct Parser<'t> {
    lexer: Lexer<'t>,
    // The one-token lookahead; `None` once the lexer is exhausted.
    curr_token: Option<Token>,
}
impl<'t> Parser<'t> {
    /// Creates a parser over `lexer`, priming `curr_token` with the first
    /// token so one-token lookahead is always available.
    pub fn new(lexer: Lexer<'t>) -> Result<Self> {
        let mut parser = Parser {
            lexer,
            curr_token: None,
        };
        parser.adv_token()?;
        Ok(parser)
    }

    /// True when the underlying lexer has exhausted its input.
    pub fn is_eof(&self) -> bool {
        self.lexer.is_eof()
    }

    /// The start position of the current lookahead token (or of the
    /// default span at EOF).
    pub fn pos(&self) -> Pos {
        self.span().start
    }

    ////////////////////////////////////////////////////////////////////////////////
    // Parsing functions
    ////////////////////////////////////////////////////////////////////////////////

    /// Parses the next expression. Binary expressions are the entry point
    /// of the precedence chain.
    pub fn next_expr(&mut self) -> Result<Expr> {
        self.next_bin_expr()
    }

    /// Parses a binary expression. Only the LHS is parsed so far; operator
    /// handling is still unimplemented.
    fn next_bin_expr(&mut self) -> Result<Expr> {
        let lhs = self.next_un_expr()?;
        todo!()
    }

    /// Parses a unary expression (unimplemented).
    fn next_un_expr(&mut self) -> Result<Expr> {
        todo!()
    }

    /// Parses a "base" (atomic) expression: identifier, number, string, or
    /// symbol. Only the token's span is stored; the lexeme is recovered
    /// from the span.
    fn next_base_expr(&mut self) -> Result<Expr> {
        let token =
            self.expect_token_where(|t| VALUE_EXPR_START.contains(&t.kind()), "base expression")?;
        let expr: Expr = match token.kind() {
            TokenKind::Ident => BaseExpr {
                kind: BaseExprKind::Ident,
                span: token.span(),
            }
            .into(),
            TokenKind::Num => BaseExpr {
                kind: BaseExprKind::Num,
                span: token.span(),
            }
            .into(),
            TokenKind::Str => BaseExpr {
                kind: BaseExprKind::Str,
                span: token.span(),
            }
            .into(),
            TokenKind::Sym => BaseExpr {
                kind: BaseExprKind::Sym,
                span: token.span(),
            }
            .into(),
            // expect_token_where already restricted the kind to VALUE_EXPR_START.
            _ => unreachable!(),
        };
        Ok(expr)
    }

    /// Parses a bracketed, comma-separated list expression, e.g. `[a, b]`.
    /// A trailing comma before `]` is accepted.
    fn next_list(&mut self) -> Result<Expr> {
        let start_token = self.expect_token_where(
            |t| t.kind() == TokenKind::LBracket,
            "start of list (left bracket)",
        )?;
        let mut list_items = Vec::new();
        // BUGFIX: the loop previously terminated on RBrace (`}`) instead of
        // RBracket (`]`), discarded each parsed element instead of pushing
        // it, and never consumed the comma separators.
        while !matches!(
            self.curr_token.map(|t| t.kind()),
            Some(TokenKind::RBracket) | None
        ) {
            list_items.push(self.next_expr()?);
            // If no comma follows the element, the next token must close
            // the list (or it's an error caught by expect below).
            if self
                .match_token_where(|t| t.kind() == TokenKind::Comma)?
                .is_none()
            {
                break;
            }
        }
        let end_token = self.expect_token_where(
            |t| t.kind() == TokenKind::RBracket,
            "end of list (right bracket)",
        )?;
        let expr = BaseExpr {
            kind: BaseExprKind::List(list_items),
            span: start_token.span().union(end_token.span()),
        };
        Ok(expr.into())
    }

    ////////////////////////////////////////////////////////////////////////////////
    // Token matching functions
    ////////////////////////////////////////////////////////////////////////////////

    /// Advances to the next token, returning the token that was current
    /// before the advance.
    fn adv_token(&mut self) -> Result<Option<Token>> {
        let next_token = self.lexer.next_token()?;
        Ok(mem::replace(&mut self.curr_token, next_token))
    }

    /// Consumes and returns the current token if it satisfies `pred`;
    /// otherwise leaves the parser untouched and returns `Ok(None)`.
    fn match_token_where<P>(&mut self, pred: P) -> Result<Option<Token>>
    where
        P: Fn(Token) -> bool,
    {
        match self.curr_token {
            Some(curr) if (pred)(curr) => self.adv_token(),
            _ => Ok(None),
        }
    }

    /// Like `match_token_where`, but a failed match becomes an
    /// `Error::ExpectedGot` describing `expected` versus the actual token
    /// kind (or "EOF").
    fn expect_token_where<P>(&mut self, pred: P, expected: impl ToString) -> Result<Token>
    where
        P: Fn(Token) -> bool,
    {
        self.match_token_where(pred)?
            .ok_or_else(|| Error::ExpectedGot {
                expected: expected.to_string(),
                got: self
                    .curr_token
                    .map(|token| token.kind().to_string())
                    .unwrap_or_else(|| "EOF".to_string()),
                pos: self.pos(),
            })
    }
}
impl<'t> Spanned for Parser<'t> {
    /// The span of the current lookahead token, or the default (empty)
    /// span at end of input.
    fn span(&self) -> Span {
        self.curr_token
            .as_ref()
            .map(Spanned::span)
            // Idiom: unwrap_or_default() instead of unwrap_or(Span::default()),
            // which constructed the default eagerly.
            .unwrap_or_default()
    }
}
impl<'t> TryFrom<Lexer<'t>> for Parser<'t> {
    type Error = Error;

    /// Builds a parser from an existing lexer; fails if priming the first
    /// lookahead token fails.
    fn try_from(lexer: Lexer<'t>) -> Result<Self> {
        Self::new(lexer)
    }
}
impl<'t> TryFrom<&'t str> for Parser<'t> {
    type Error = Error;

    /// Convenience: lexes `text` and builds a parser over it in one step.
    fn try_from(text: &'t str) -> Result<Self> {
        let lexer = Lexer::new(text);
        Self::new(lexer)
    }
}
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_adv_token() {
        // Token kinds expected from the input, in lexing order.
        const EXPECTED: &[TokenKind] = &[
            TokenKind::Num,
            TokenKind::Ident,
            TokenKind::Sym,
            TokenKind::Str,
        ];
        let mut parser = Parser::try_from("1 ident :sym 'string'").unwrap();
        for &expected in EXPECTED {
            // adv_token returns the token that was current before advancing.
            let previous = parser.adv_token().unwrap();
            assert_eq!(previous.unwrap().kind(), expected);
        }
        assert!(parser.is_eof());
    }

    #[test]
    fn test_match_token_where() {
        let mut parser = Parser::try_from("1 ident :sym 'string'").unwrap();
        let kinds = [
            TokenKind::Num,
            TokenKind::Ident,
            TokenKind::Sym,
            TokenKind::Str,
        ];
        for kind in kinds.iter().copied() {
            // A matching predicate consumes the current token...
            assert!(matches!(
                parser.match_token_where(|token| token.kind() == kind),
                Ok(Some(_))
            ));
            // ...and the same predicate then fails against the next token.
            assert!(matches!(
                parser.match_token_where(|token| token.kind() == kind),
                Ok(None)
            ));
        }
        assert!(parser.is_eof());
    }

    #[test]
    fn test_base_expr() {
        let mut parser = Parser::try_from("1").unwrap();
        let parsed = parser.next_base_expr();
        assert!(matches!(
            parsed,
            Ok(Expr::Base(BaseExpr {
                kind: BaseExprKind::Num,
                ..
            }))
        ));
    }
}

View File

@@ -3,6 +3,8 @@ use std::fmt::{Display, Formatter, self};
#[derive(Debug, Clone, Copy, PartialEq, Eq)] #[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind { pub enum TokenKind {
KwReturn,
Ident, Ident,
Num, Num,
Str, Str,
@@ -18,14 +20,18 @@ pub enum TokenKind {
Eq, Eq,
Arrow, Arrow,
Plus,
KwReturn, Minus,
Splat,
FSlash,
} }
impl Display for TokenKind { impl Display for TokenKind {
fn fmt(&self, fmt: &mut Formatter) -> fmt::Result { fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
use TokenKind::*; use TokenKind::*;
let s = match self { let s = match self {
KwReturn => "return keyword",
Ident => "identifier", Ident => "identifier",
Num => "number", Num => "number",
Str => "string", Str => "string",
@@ -41,8 +47,10 @@ impl Display for TokenKind {
Eq => "equals", Eq => "equals",
Arrow => "arrow", Arrow => "arrow",
Plus => "plus",
KwReturn => "return keyword", Minus => "minus",
Splat => "splat (or times)",
FSlash => "fslash (or divide)",
}; };
Display::fmt(s, fmt) Display::fmt(s, fmt)
} }