Files
not-python-rust/src/parser.rs

802 lines
23 KiB
Rust
Raw Normal View History

use common_macros::hash_map;
use thiserror::Error;
use std::collections::HashMap;
use std::fmt::{self, Display};
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use crate::ast::*;
use crate::token::{Token, TokenKind};
////////////////////////////////////////////////////////////////////////////////
// ParseError
////////////////////////////////////////////////////////////////////////////////
/// Error value produced by the lexer and the parser.
///
/// `Display` is implemented by hand for this type, so no `#[error(...)]`
/// attribute is used on it.
#[derive(Error, Debug)]
pub struct ParseError {
/// Human-readable description of what went wrong.
pub message: String,
/// Line number recorded at the moment the error was created.
pub line: usize,
/// Path that was handed to the lexer together with the source text.
pub path: PathBuf,
}
/// Shorthand for results whose error type is [`ParseError`].
pub type Result<T> = std::result::Result<T, ParseError>;
impl Display for ParseError {
    /// Renders the error as `in <path> at line <line>: <message>`.
    ///
    /// The path is rendered with `Path::display`, which substitutes U+FFFD for
    /// any bytes that are not valid Unicode. Formatting an error must never
    /// panic, and `to_str().unwrap()` would do exactly that for a non-UTF-8
    /// path.
    fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
        write!(
            fmt,
            "in {} at line {}: {}",
            self.path.display(),
            self.line,
            self.message
        )
    }
}
////////////////////////////////////////////////////////////////////////////////
// Constants
////////////////////////////////////////////////////////////////////////////////
/// Characters skipped between tokens. A newline is not listed here; the lexer
/// decides separately whether a newline is skipped or becomes a token.
const WHITESPACE: &str = " \t\r";
/// Characters that may begin a name or keyword.
const NAME_START_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
/// Characters that may continue a name. Besides the start characters this
/// includes digits and `-`.
const NAME_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789-";
/// Characters that may begin a number literal.
const NUMBER_START_CHARS: &str = "0123456789";
/// Characters that may continue a number literal; `.` is accepted any number of times.
const NUMBER_CHARS: &str = "0123456789.";
/// Characters that open a string literal; the same character closes it.
const STRING_START_CHARS: &str = "'\"";
/// Characters accepted directly after a backslash inside a string literal.
const STRING_ESCAPES: &str = "nrt\\\"'";
////////////////////////////////////////////////////////////////////////////////
// Lexer
////////////////////////////////////////////////////////////////////////////////
/// Tokenizer that walks `text` one character at a time and produces `Token`s.
#[derive(Debug)]
pub struct Lexer {
/// Current line number; starts at 1 and is incremented whenever a `'\n'` is consumed.
line: usize,
/// One more than the byte offset of the current character
/// (the current character is read from `text[index - 1..]`).
index: usize,
/// Byte offset at which the lexeme currently being scanned begins.
start: usize,
/// The complete source text.
text: String,
/// Path copied into every `ParseError` this lexer creates.
path: PathBuf,
/// Closing delimiters that are still expected, innermost last.
paren_stack: Vec<char>,
/// Set to `true` the first time `error` is called.
was_error: bool,
}
impl Lexer {
/// Creates a lexer positioned at the first character of `text`.
///
/// `path` is only stored so that errors can report it; nothing is read from disk.
pub fn new(text: String, path: &dyn AsRef<Path>) -> Self {
    let path = path.as_ref().into();
    Self {
        text,
        path,
        // line numbers are 1-based
        line: 1,
        // `index` is kept one past the byte offset of the current character
        index: 1,
        start: 0,
        paren_stack: Vec::new(),
        was_error: false,
    }
}
/// Returns `true` once every character of `text` has been consumed.
///
/// `index` is kept one past the byte offset of the current character
/// (`current` reads from `text[index - 1..]`), so the input is exhausted only
/// when `index - 1 == text.len()`, i.e. `index > text.len()`. Comparing with
/// `>=` reported end-of-input while the final character was still unread,
/// which silently dropped the last character of every input.
pub fn is_eof(&self) -> bool {
    self.index > self.text.len()
}
/// Returns the source text from `start` up to, but not including, the current character.
pub fn lexeme(&self) -> &str {
    // `index` is one past the current byte offset, so the current offset is `index - 1`
    let end = self.index - 1;
    &self.text[self.start..end]
}
/// Reports whether this lexer has created at least one error via `error`.
pub fn was_error(&self) -> bool {
self.was_error
}
/// Returns the character at the current position, or `'\0'` when `is_eof` is true.
fn current(&self) -> char {
    if self.is_eof() {
        '\0'
    } else {
        let rest = &self.text[self.index - 1..];
        rest.chars().next().unwrap()
    }
}
/// Reports whether newlines are currently insignificant.
///
/// That is the case while at least one delimiter is open and the innermost
/// open one is not a brace (i.e. the innermost expected closer is not `'}'`).
fn ignore_newlines(&self) -> bool {
    matches!(self.paren_stack.last(), Some(&closer) if closer != '}')
}
/// Builds a `ParseError` for the current line and records that an error happened.
///
/// The error is only constructed here; returning it is up to the caller.
fn error(&mut self, message: impl ToString) -> ParseError {
    let err = ParseError {
        message: message.to_string(),
        line: self.line,
        path: self.path.clone(),
    };
    self.was_error = true;
    err
}
/// Moves past the current character; does nothing when `is_eof` is true.
///
/// Consuming a `'\n'` bumps the line counter. `index` grows by the UTF-8
/// length of the character so that it always stays on a character boundary.
fn advance(&mut self) {
    if self.is_eof() {
        return;
    }
    let c = self.current();
    if c == '\n' {
        self.line += 1;
    }
    self.index += c.len_utf8();
}
/// Consumes the current character if it equals `c`.
///
/// Returns whether the character matched.
fn mat(&mut self, c: char) -> bool {
    let matched = self.current() == c;
    if matched {
        self.advance();
    }
    matched
}
/// Skips everything that does not belong to a token and marks the start of the next lexeme.
///
/// Skipped are the `WHITESPACE` characters, newlines while `ignore_newlines`
/// is true, and `#` comments running to the end of the line.
fn skip_whitespace(&mut self) {
    loop {
        let c = self.current();
        if c == '#' {
            // comment: consume the '#', the rest of the line, and the newline itself
            self.advance();
            while !self.is_eof() && self.current() != '\n' {
                self.advance();
            }
            // NOTE(review): the newline is consumed even when newlines are
            // significant, so a comment trailing a statement also removes that
            // statement's end-of-line token - confirm this is intended.
            self.mat('\n');
        } else if WHITESPACE.contains(c) || (c == '\n' && self.ignore_newlines()) {
            self.advance();
        } else {
            break;
        }
    }
    self.start = self.index - 1;
}
/// Packages the current lexeme as a token of the given kind.
///
/// The token carries the current line number. Afterwards `start` is moved to
/// the current position so the next lexeme begins empty.
fn make_token(&mut self, kind: TokenKind) -> Token {
    let text = self.lexeme().to_string();
    let line = self.line;
    self.start = self.index - 1;
    Token { line, text, kind }
}
pub fn next(&mut self) -> Result<Token> {
self.skip_whitespace();
if self.is_eof() {
return Ok(self.make_token(TokenKind::Eof));
} else if NAME_START_CHARS.contains(self.current()) {
return Ok(self.name());
} else if NUMBER_START_CHARS.contains(self.current()) {
return Ok(self.number());
} else if STRING_START_CHARS.contains(self.current()) {
return self.string();
} else if self.mat('+') {
return Ok(self.make_token(TokenKind::Plus));
} else if self.mat('-') {
if self.mat('>') {
return Ok(self.make_token(TokenKind::Arrow));
} else {
return Ok(self.make_token(TokenKind::Minus));
}
} else if self.mat('*') {
return Ok(self.make_token(TokenKind::Star));
} else if self.mat('/') {
return Ok(self.make_token(TokenKind::Slash));
} else if self.mat('&') {
if self.mat('&') {
return Ok(self.make_token(TokenKind::And));
}
} else if self.mat('|') {
if self.mat('|') {
return Ok(self.make_token(TokenKind::Or));
}
} else if self.mat('!') {
if self.mat('=') {
return Ok(self.make_token(TokenKind::BangEq));
} else {
return Ok(self.make_token(TokenKind::Bang));
}
} else if self.mat('=') {
if self.mat('=') {
return Ok(self.make_token(TokenKind::EqEq));
} else {
return Ok(self.make_token(TokenKind::Eq));
}
} else if self.mat('<') {
if self.mat('=') {
return Ok(self.make_token(TokenKind::LessEq));
} else {
return Ok(self.make_token(TokenKind::Less));
}
} else if self.mat('>') {
if self.mat('=') {
return Ok(self.make_token(TokenKind::GreaterEq));
} else {
return Ok(self.make_token(TokenKind::Greater));
}
} else if self.mat('(') {
self.paren_stack.push(')');
return Ok(self.make_token(TokenKind::LParen));
} else if self.mat(')') {
return match self.paren_stack.last() {
None => Err(self.error("')' has unmatched '('")),
Some(')') => {
self.paren_stack.pop();
Ok(self.make_token(TokenKind::RParen))
}
Some(c) => Err(self.error(format!("mismatched ')' (expected {:?})", c))),
};
} else if self.mat('{') {
self.paren_stack.push('}');
return Ok(self.make_token(TokenKind::LBrace));
} else if self.mat('}') {
return match self.paren_stack.last() {
None => Err(self.error("'}' has unmatched '{'")),
Some('}') => {
self.paren_stack.pop();
Ok(self.make_token(TokenKind::RBrace))
}
Some(c) => Err(self.error(format!("mismatched '}}' (expected {:?})", c))),
};
} else if self.mat('[') {
self.paren_stack.push(']');
return Ok(self.make_token(TokenKind::LBracket));
} else if self.mat(']') {
return match self.paren_stack.last() {
None => Err(self.error("']' has unmatched '['")),
Some(']') => {
self.paren_stack.pop();
Ok(self.make_token(TokenKind::RBracket))
}
Some(c) => Err(self.error(format!("mismatched ']' (expected {:?})", c))),
};
} else if self.mat('.') {
return Ok(self.make_token(TokenKind::Dot));
} else if self.mat(',') {
return Ok(self.make_token(TokenKind::Comma));
} else if self.mat(':') {
return Ok(self.make_token(TokenKind::Colon));
} else if self.mat('\n') {
assert!(!self.ignore_newlines());
// fix the line number since it will have already advanced when we make the token
self.line -= 1;
let token = self.make_token(TokenKind::Eol);
self.line += 1;
return Ok(token);
} else if self.mat(';') {
return Ok(self.make_token(TokenKind::Eol));
}
Err(self.error(format!("unexpected character: {:?}", self.current())))
}
/// Scans a name starting at the current character and emits it.
///
/// The lexeme runs for as long as characters are in `NAME_CHARS`. If it is
/// spelled exactly like a keyword, the keyword's token kind is used;
/// otherwise the token is a `Name`.
fn name(&mut self) -> Token {
    // built on first use and shared by every lexer afterwards
    static KEYWORDS: OnceLock<HashMap<&'static str, TokenKind>> = OnceLock::new();

    while NAME_CHARS.contains(self.current()) {
        self.advance();
    }

    let table = KEYWORDS.get_or_init(|| {
        hash_map! {
            "return" => TokenKind::Return,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "nil" => TokenKind::Nil,
        }
    });
    let kind = table
        .get(self.lexeme())
        .copied()
        .unwrap_or(TokenKind::Name);
    self.make_token(kind)
}
/// Scans a number literal and emits it as a `Number` token.
///
/// Every character in `NUMBER_CHARS` is accepted; the text is not validated
/// here, so a lexeme with several dots is still emitted as one token.
fn number(&mut self) -> Token {
    loop {
        if !NUMBER_CHARS.contains(self.current()) {
            break;
        }
        self.advance();
    }
    self.make_token(TokenKind::Number)
}
/// Scans a string literal and emits it as a `String` token.
///
/// The current character is the opening quote and the same character ends
/// the literal. The lexeme keeps both quotes and all escape sequences
/// verbatim; no unescaping happens here.
///
/// # Errors
///
/// Fails when a backslash is followed by a character outside
/// `STRING_ESCAPES`, or when the input ends before the closing quote.
fn string(&mut self) -> Result<Token> {
    let quote = self.current();
    self.advance();

    loop {
        let c = self.current();
        if c == quote || self.is_eof() {
            break;
        }
        self.advance();
        if c == '\\' {
            // the character after a backslash must be a known escape
            let escaped = self.current();
            if !STRING_ESCAPES.contains(escaped) {
                return Err(self.error(format!("unknown string escape {:?}", escaped)));
            }
            self.advance();
        }
    }

    if self.current() != quote {
        return Err(self.error("unterminated string"));
    }
    self.advance();
    Ok(self.make_token(TokenKind::String))
}
}
////////////////////////////////////////////////////////////////////////////////
// Parser
////////////////////////////////////////////////////////////////////////////////
/// Tries to consume the current token as one of several kinds.
///
/// Expands to `$self.mat(a)? || $self.mat(b)? || ...`, so kinds are tried in
/// the order given and the rest are skipped once one matches. Because of the
/// `?`, it can only be used inside a function that can propagate the error
/// returned by `mat`.
macro_rules! mat {
($self:expr, $($op:expr),+ $(,)?) => {
$($self.mat($op)?)||+
};
}
/// Requires the current token to be one of several kinds.
///
/// Evaluates to `Ok` holding a clone of the token that was just consumed
/// (`$self.prev`) when any kind matches, and to `Err($self.error($message))`
/// otherwise. Like `mat!`, it uses `?` internally.
macro_rules! expect {
($self:expr, $message:expr, $($kind:expr),+ $(,)?) => {{
if mat!($self, $($kind),+) {
Ok($self.prev.clone().unwrap())
} else {
Err($self.error($message))
}
}};
}
/// Generates the parser method `$name` for one level of binary operators.
///
/// The generated method parses an operand with `$next`. Then, for as long as
/// the current token is one of the listed operator kinds, it consumes the
/// operator, parses another `$next` operand, and wraps the expression built
/// so far as the `lhs` of a new `BinaryExpr` - so operators on the same level
/// group to the left.
macro_rules! bin_expr {
($name:ident, $next:ident, $($op:expr),+ $(,)?) => {
fn $name(&mut self) -> Result<ExprP> {
let mut expr = self.$next()?;
while $(self.mat($op)?)||+ {
// the operator token that `mat` just consumed
let op = self.prev.clone().unwrap();
let rhs = self.$next()?;
expr = Box::new(BinaryExpr {lhs: expr, op, rhs});
}
Ok(expr)
}
};
}
/// Parser that pulls tokens from a `Lexer` and keeps two of them buffered ahead.
pub struct Parser {
/// Source of tokens.
lexer: Lexer,
/// The token consumed most recently; `None` until the first `advance`.
prev: Option<Token>,
/// The token currently being examined.
current: Token,
/// The token after `current`, available for one extra token of lookahead.
next: Token,
/// Set to `true` the first time the parser's own `error` is called.
was_error: bool,
}
impl Parser {
/// Creates a parser for `text` and pre-loads the first two tokens.
///
/// # Errors
///
/// Fails if the lexer cannot produce either of those two tokens.
pub fn new(text: String, path: &dyn AsRef<Path>) -> Result<Self> {
    let mut lexer = Lexer::new(text, path);
    // fill the lookahead window: `current` first, then `next`
    let current = lexer.next()?;
    let next = lexer.next()?;
    Ok(Self {
        lexer,
        prev: None,
        current,
        next,
        was_error: false,
    })
}
/// Parses statements until `is_eof` reports the end of input.
///
/// Statements that failed to parse are reported by `stmt` and simply missing
/// from the returned list; use `was_error` to find out whether that happened.
///
/// # Errors
///
/// Fails only for errors that `stmt` itself propagates.
pub fn parse_all(&mut self) -> Result<Vec<StmtP>> {
    let mut program = Vec::new();
    while !self.is_eof() {
        // `stmt` yields `None` for blank input or a statement it had to skip
        program.extend(self.stmt()?);
    }
    Ok(program)
}
//
// Properties
//
/// Line number the lexer is currently at. The lexer has already scanned past
/// `current` and `next`, so this may be later than the line of `current`.
fn line(&self) -> usize {
self.lexer.line
}
/// Path the lexer was created with.
fn path(&self) -> &Path {
&self.lexer.path
}
fn is_eof(&self) -> bool {
self.lexer.is_eof()
}
/// Reports whether either the parser or its lexer has created an error.
pub fn was_error(&self) -> bool {
self.was_error || self.lexer.was_error()
}
//
// Parser primitives
//
/// Shifts the token window forward by one: `current` becomes `prev`, `next`
/// becomes `current`, and a fresh token is pulled from the lexer into `next`.
///
/// # Errors
///
/// Propagates the lexer's error. In that case `prev` and `current` have
/// already been shifted and `next` still holds its old value.
fn advance(&mut self) -> Result<()> {
    let consumed = std::mem::replace(&mut self.current, self.next.clone());
    self.prev = Some(consumed);
    self.next = self.lexer.next()?;
    Ok(())
}
/// Reports whether the current token has kind `what`, without consuming it.
fn check(&self, what: TokenKind) -> bool {
self.current.kind == what
}
/// Consumes the current token if it has kind `what`.
///
/// Returns whether it matched; a non-matching token is left in place.
///
/// # Errors
///
/// Propagates the error from `advance` when the token matched.
fn mat(&mut self, what: TokenKind) -> Result<bool> {
    let matched = self.check(what);
    if matched {
        self.advance()?;
    }
    Ok(matched)
}
/// Consumes the current token, which must have kind `what`, and returns a
/// reference to it.
///
/// # Errors
///
/// If the token has a different kind, the error consists of `message`
/// followed by a note showing the kind and text that were found instead.
/// Errors from `mat` are propagated as well.
fn expect(&mut self, message: impl Display, what: TokenKind) -> Result<&Token> {
    if !self.mat(what)? {
        let detail = format!(
            "{message} (NOTE: got {:?} {:?})",
            self.current.kind, self.current.text
        );
        return Err(self.error(detail));
    }
    // `mat` succeeded, so `advance` has stored the matched token in `prev`
    Ok(self.prev.as_ref().unwrap())
}
/// Builds a `ParseError` and records that the parser hit an error.
///
/// The line is taken from `line()`, i.e. from the lexer's position, which
/// has already moved past the buffered tokens.
fn error(&mut self, message: impl ToString) -> ParseError {
    let err = ParseError {
        message: message.to_string(),
        line: self.line(),
        path: self.path().into(),
    };
    self.was_error = true;
    err
}
/// Skips tokens after a parse error until a plausible statement start.
///
/// Stops at `return`, `if`, `{`, or when `is_eof` becomes true; the stopping
/// token itself is not consumed.
///
/// # Errors
///
/// Propagates errors raised while advancing.
fn synchronize(&mut self) -> Result<()> {
    while !self.is_eof()
        && !matches!(
            self.current.kind,
            TokenKind::Return | TokenKind::If | TokenKind::LBrace
        )
    {
        self.advance()?;
    }
    Ok(())
}
//
// Statements
//
/// Parses one statement, recovering from a syntax error if necessary.
///
/// Returns `Ok(None)` when only end-of-line tokens were left, or when the
/// statement was malformed. In the latter case the error is printed to
/// stderr and tokens are skipped via `synchronize`.
///
/// # Errors
///
/// Fails only if skipping end-of-lines or synchronizing fails.
fn stmt(&mut self) -> Result<Option<StmtP>> {
    // blank lines carry no meaning here
    while self.mat(TokenKind::Eol)? {}

    if self.is_eof() {
        return Ok(None);
    }

    match self.stmt_wrapped() {
        Ok(parsed) => return Ok(Some(parsed)),
        Err(err) => eprintln!("{}", err),
    }
    self.synchronize()?;
    Ok(None)
}
/// Parses one statement without any error recovery.
///
/// Dispatches on the leading token(s): `return`, `if`, a `{` block, a name
/// directly followed by `=` (assignment), or otherwise an expression
/// statement. An expression that is a `GetExpr` and is followed by `=` is
/// turned into a `SetStmt`.
///
/// # Errors
///
/// Returns the first syntax error encountered.
fn stmt_wrapped(&mut self) -> Result<StmtP> {
    if self.mat(TokenKind::Return)? {
        return self.return_stmt();
    }
    if self.mat(TokenKind::If)? {
        return self.if_stmt();
    }
    if self.mat(TokenKind::LBrace)? {
        // `block_stmt` picks up the `{` just consumed and parses through the `}`
        let block: StmtP = Box::new(self.block_stmt()?);
        return Ok(block);
    }
    if self.check(TokenKind::Name) && self.next.kind == TokenKind::Eq {
        return self.assign_stmt();
    }

    let expr = self.expr()?;
    let is_field_access = expr.as_any_ref().downcast_ref::<GetExpr>().is_some();
    let stmt: StmtP = if is_field_access && self.mat(TokenKind::Eq)? {
        // unpack the GetExpr and turn it into a SetStmt instead
        let target = expr.as_any().downcast::<GetExpr>().unwrap();
        let rhs = self.expr()?;
        Box::new(SetStmt {
            expr: target.expr,
            name: target.name,
            rhs,
        })
    } else {
        Box::new(ExprStmt { expr })
    };

    expect!(
        self,
        "expect end of line after expression",
        TokenKind::Eol,
        TokenKind::Eof,
    )?;
    Ok(stmt)
}
/// Parses the rest of a `return` statement; the `return` keyword has already
/// been consumed and is taken from `prev`.
///
/// The value is optional: it is absent when the keyword is directly followed
/// by an end-of-line, a `}`, or the end of input. The statement must be
/// terminated by an end-of-line or end of input, unless a `}` follows, which
/// is left for the enclosing block.
///
/// # Errors
///
/// Fails if the value expression is malformed or the terminator is missing.
fn return_stmt(&mut self) -> Result<StmtP> {
    let return_kw = self.prev.clone().unwrap();

    // `Eof` is an accepted terminator below, so it must also count as "no
    // value" here; otherwise a bare `return` at the very end of the input
    // would try to parse `Eof` as an expression and fail.
    let has_value = !self.check(TokenKind::Eol)
        && !self.check(TokenKind::RBrace)
        && !self.check(TokenKind::Eof);
    let expr = if has_value {
        Some(self.expr()?)
    } else {
        None
    };

    if !self.check(TokenKind::RBrace) {
        expect!(
            self,
            "expected end of line after return statement",
            TokenKind::Eol,
            TokenKind::Eof,
        )?;
    }
    Ok(Box::new(ReturnStmt { return_kw, expr }))
}
/// Parses the rest of an `if` statement; the `if` keyword has already been
/// consumed and is taken from `prev`.
///
/// The else branch is stored as a list of statements: empty when there is no
/// `else`, a single nested `if` statement for `else if`, or the statements
/// of the else block.
///
/// # Errors
///
/// Fails if the condition, a `{`, or either block is malformed.
fn if_stmt(&mut self) -> Result<StmtP> {
    let if_kw = self.prev.clone().unwrap();
    let condition = self.expr()?;
    self.expect("expect '{' after 'if' condition", TokenKind::LBrace)?;
    let then_branch = self.block_stmt()?;

    let else_branch = if !self.mat(TokenKind::Else)? {
        Vec::new()
    } else if self.mat(TokenKind::If)? {
        vec![self.if_stmt()?]
    } else {
        self.expect("expect '{' after else statement", TokenKind::LBrace)?;
        self.block()?
    };

    Ok(Box::new(IfStmt {
        if_kw,
        condition,
        then_branch,
        else_branch,
    }))
}
/// Parses a brace-delimited block into a `BlockStmt`, keeping both brace tokens.
///
/// The opening `{` must already have been consumed by the caller.
///
/// # Panics
///
/// Panics if `prev` is not a `{` on entry, or not a `}` after the body was parsed.
///
/// # Errors
///
/// Propagates errors from `block`.
fn block_stmt(&mut self) -> Result<BlockStmt> {
    let open = self.prev.clone().unwrap();
    assert_eq!(open.kind, TokenKind::LBrace);

    let body = self.block()?;

    let close = self.prev.clone().unwrap();
    assert_eq!(close.kind, TokenKind::RBrace);

    Ok(BlockStmt {
        lbrace: open,
        stmts: body,
        rbrace: close,
    })
}
/// Parses the statements of a block up to and including the closing `}`.
///
/// The opening `{` must already have been consumed. Parsing of the body
/// stops early when `stmt` yields no statement.
///
/// # Errors
///
/// Fails if the closing `}` is missing or a statement propagates an error.
fn block(&mut self) -> Result<Vec<StmtP>> {
    let mut body = Vec::new();

    // The stmt rule skips past EOLs too. However, if there is nothing *except*
    // EOLs left in the block, we want to know about that ahead of time rather
    // than let the statement rule handle it, so leading EOLs are consumed here.
    while self.mat(TokenKind::Eol)? {}

    while !self.check(TokenKind::RBrace) && !self.is_eof() {
        let Some(parsed) = self.stmt()? else {
            break;
        };
        body.push(parsed);
        // same reasoning as above, for the EOLs that follow each statement
        while self.mat(TokenKind::Eol)? {}
    }

    self.expect("expect '}' after statement block", TokenKind::RBrace)?;
    Ok(body)
}
/// Parses `name = expression` into an `AssignStmt`.
///
/// The statement must be terminated by an end-of-line or end of input,
/// unless a `}` follows, which is left for the enclosing block.
///
/// # Errors
///
/// Fails if the name, the `=`, the expression, or the terminator is missing
/// or malformed.
fn assign_stmt(&mut self) -> Result<StmtP> {
    let lhs = self
        .expect("expect name for assign statement", TokenKind::Name)?
        .clone();
    self.expect("expect '=' after name", TokenKind::Eq)?;
    let rhs = self.expr()?;

    let closes_block = self.check(TokenKind::RBrace);
    if !closes_block {
        expect!(
            self,
            "expected end of line after assign statement",
            TokenKind::Eol,
            TokenKind::Eof
        )?;
    }
    Ok(Box::new(AssignStmt { lhs, rhs }))
}
//
// Expressions
//
/// Parses a complete expression by entering the binary-operator chain at its
/// first level, `logical_or_expr`.
fn expr(&mut self) -> Result<ExprP> {
self.logical_or_expr()
}
// Binary operator levels. Each invocation defines one parser method whose
// operands are parsed by the method named second, so the levels are listed
// from loosest binding (first) to tightest binding (last).
// `||`
bin_expr!(logical_or_expr, logical_and_expr, TokenKind::Or);
// `&&`
bin_expr!(logical_and_expr, equality_expr, TokenKind::And);
// `!=` and `==`
bin_expr!(
equality_expr,
compare_expr,
TokenKind::BangEq,
TokenKind::EqEq
);
// `<`, `<=`, `>` and `>=`
bin_expr!(
compare_expr,
binary_term,
TokenKind::Less,
TokenKind::LessEq,
TokenKind::Greater,
TokenKind::GreaterEq
);
// `+` and `-`
bin_expr!(
binary_term,
binary_factor,
TokenKind::Plus,
TokenKind::Minus
);
// `*` and `/`; operands are unary expressions
bin_expr!(binary_factor, unary_expr, TokenKind::Star, TokenKind::Slash);
/// Parses a prefix-operator expression (`!`, `-` or `+`), or falls through
/// to a call expression when no prefix operator is present.
///
/// Prefix operators nest: the operand is itself parsed as a unary expression.
///
/// # Errors
///
/// Propagates errors from the operand.
fn unary_expr(&mut self) -> Result<ExprP> {
    let has_prefix = mat!(self, TokenKind::Bang, TokenKind::Minus, TokenKind::Plus);
    if !has_prefix {
        return self.call_expr();
    }
    let op = self.prev.clone().unwrap();
    let expr = self.unary_expr()?;
    Ok(Box::new(UnaryExpr { op, expr }))
}
/// Parses a primary expression followed by any number of call suffixes
/// `(...)` and member accesses `.name`, applied left to right.
///
/// # Errors
///
/// Fails if the primary expression or any suffix is malformed.
fn call_expr(&mut self) -> Result<ExprP> {
    let mut expr = self.primary_expr()?;
    loop {
        if self.mat(TokenKind::LParen)? {
            expr = self.finish_call_expr(expr)?;
            continue;
        }
        if !self.mat(TokenKind::Dot)? {
            // neither suffix follows: the expression is complete
            return Ok(expr);
        }
        let name = self
            .expect("expect name after '.'", TokenKind::Name)?
            .clone();
        expr = Box::new(GetExpr { expr, name });
    }
}
/// Parses the argument list of a call whose `(` has already been consumed
/// and wraps `callee` in a `CallExpr`.
///
/// Arguments are comma-separated; a trailing comma before `)` is allowed.
///
/// # Errors
///
/// Fails if an argument is malformed or the closing `)` is missing.
fn finish_call_expr(&mut self, callee: ExprP) -> Result<ExprP> {
    let mut args = Vec::new();
    // Re-checking for ')' at the top of every round covers both the empty
    // argument list and a trailing comma.
    while !self.check(TokenKind::RParen) {
        args.push(self.expr()?);
        if !self.mat(TokenKind::Comma)? {
            break;
        }
    }
    let rparen = self
        .expect("expect ')' after function arguments", TokenKind::RParen)?
        .clone();
    Ok(Box::new(CallExpr {
        expr: callee,
        args,
        rparen,
    }))
}
/// Parses the smallest expression units: a name, a literal (number, string,
/// `true`, `false`, `nil`), or something introduced by `(`.
///
/// After `(` the input is treated as a function definition when the next
/// token is `)`, or when it is a name directly followed by `)`, `:` or `,`.
/// Anything else is a parenthesized expression. Note that `(x)` therefore
/// takes the function-definition path.
///
/// # Errors
///
/// Fails with "unexpected token" if the current token cannot start a primary
/// expression, and propagates errors from the nested rules.
fn primary_expr(&mut self) -> Result<ExprP> {
    let is_atom = mat!(
        self,
        TokenKind::Name,
        TokenKind::Number,
        TokenKind::String,
        TokenKind::True,
        TokenKind::False,
        TokenKind::Nil
    );
    if is_atom {
        let token = self.prev.clone().unwrap();
        return Ok(Box::new(PrimaryExpr { token }));
    }

    if !self.mat(TokenKind::LParen)? {
        let message = format!("unexpected token {:?}", self.current.kind);
        return Err(self.error(message));
    }

    // check if we're defining a function
    let starts_params = self.check(TokenKind::RParen)
        || (self.check(TokenKind::Name)
            && (self.next.kind == TokenKind::RParen
                || self.next.kind == TokenKind::Colon
                || self.next.kind == TokenKind::Comma));
    if starts_params {
        return self.finish_function_expr();
    }

    let inner = self.expr()?;
    self.expect("expect ')' after expression", TokenKind::RParen)?;
    Ok(inner)
}
/// Parses a function definition whose opening `(` has already been consumed.
///
/// Grammar handled here: comma-separated parameters (a trailing comma is
/// allowed), `)`, an optional `-> expression` return type, and a `{ ... }` body.
///
/// # Errors
///
/// Fails if any of those parts is missing or malformed.
fn finish_function_expr(&mut self) -> Result<ExprP> {
    let lparen = self.prev.clone().unwrap();

    let mut params = Vec::new();
    // Re-checking for ')' each round covers both an empty parameter list and
    // a trailing comma.
    while !self.check(TokenKind::RParen) {
        self.parse_param(&mut params)?;
        if !self.mat(TokenKind::Comma)? {
            break;
        }
    }
    self.expect(
        "expect ')' after function definition parameters",
        TokenKind::RParen,
    )?;

    let return_type = if self.mat(TokenKind::Arrow)? {
        Some(self.expr()?)
    } else {
        None
    };

    self.expect("expect '{' after function signature", TokenKind::LBrace)?;
    let body = self.block()?;
    // `block` ends by consuming the closing brace, which is now in `prev`
    let rbrace = self.prev.clone().unwrap();

    Ok(Box::new(FunctionExpr {
        lparen,
        params,
        return_type,
        body,
        rbrace,
    }))
}
/// Parses one function parameter, `name` or `name: expression`, and appends
/// it to `params` together with its optional type expression.
///
/// # Errors
///
/// Fails if the name is missing or the type expression is malformed.
fn parse_param(&mut self, params: &mut Vec<(Token, Option<ExprP>)>) -> Result<()> {
    let name = self
        .expect("expect name after function declaration", TokenKind::Name)?
        .clone();
    let ty = if self.mat(TokenKind::Colon)? {
        Some(self.expr()?)
    } else {
        None
    };
    params.push((name, ty));
    Ok(())
}
}