From 58421a0469dfade4fc21c14eaf1710c6ccda7524 Mon Sep 17 00:00:00 2001
From: Alek Ratzloff
Date: Mon, 27 Apr 2020 12:42:17 -0400
Subject: [PATCH] Initial commit with lexer

Signed-off-by: Alek Ratzloff
---
 .gitignore       |   1 +
 Cargo.lock       |  84 +++++++++++
 Cargo.toml       |  12 ++
 src/main.rs      |   6 +
 src/syn/error.rs |  36 +++++
 src/syn/lexer.rs | 360 +++++++++++++++++++++++++++++++++++++++++++++++
 src/syn/mod.rs   |   4 +
 src/syn/span.rs  | 150 ++++++++++++++++++++
 src/syn/token.rs |  68 +++++++++
 src/util.rs      |  38 +++++
 10 files changed, 759 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/main.rs
 create mode 100644 src/syn/error.rs
 create mode 100644 src/syn/lexer.rs
 create mode 100644 src/syn/mod.rs
 create mode 100644 src/syn/span.rs
 create mode 100644 src/syn/token.rs
 create mode 100644 src/util.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..b3d3405
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,84 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+[[package]]
+name = "doc-comment"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "maplit"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
+
+[[package]]
+name = "not-python"
+version = "0.1.0"
+dependencies = [
+ "lazy_static",
+ "maplit",
+ "snafu",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df246d292ff63439fea9bc8c0a270bed0e390d5ebd4db4ba15aba81111b5abe3"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bdc6c187c65bca4260c9011c9e3132efe4909da44726bad24cf7572ae338d7f"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "snafu"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1ec0ae2ed980f26e1ad62e717feb01df90731df56887b5391a2c79f9f6805be"
+dependencies = [
+ "doc-comment",
+ "snafu-derive",
+]
+
+[[package]]
+name = "snafu-derive"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ec32ba84a7a86aeb0bc32fd0c46d31b0285599f68ea72e87eff6127889d99e1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "410a7488c0a728c7ceb4ad59b9567eb4053d02e8cc7f5c0e0eeeb39518369213"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..21e1da6
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "not-python"
+version = "0.1.0"
+authors = ["Alek Ratzloff"]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+snafu = "0.6.6"
+lazy_static = "1.4.0"
+maplit = "1.0.2"
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..c082f31
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,6 @@
+mod syn;
+mod util;
+
+fn main() {
+    println!("Hello, world!");
+}
diff --git a/src/syn/error.rs b/src/syn/error.rs
new file mode 100644
index 0000000..f43b214
--- /dev/null
+++ b/src/syn/error.rs
@@ -0,0 +1,36 @@
+use crate::syn::span::*;
+use snafu::Snafu;
+
+#[derive(Debug, Snafu)]
+pub enum Error {
+    #[snafu(display("expected {}, but got {} instead", expected, got))]
+    ExpectedGot {
+        expected: String,
+        got: String,
+        span: Span,
+    },
+
+    #[snafu(display("unexpected {}", what))]
+    Unexpected {
+        what: String,
+        span: Span,
+    },
+
+    #[snafu(display("unknown {}", what))]
+    Unknown {
+        what: String,
+        span: Span,
+    }
+}
+
+impl Spanned for Error {
+    fn span(&self) -> Span {
+        match self {
+            Error::ExpectedGot { span, .. }
+            | Error::Unknown { span, .. }
+            | Error::Unexpected { span, .. } => { *span }
+        }
+    }
+}
+
+pub type Result<T, E = Error> = std::result::Result<T, E>;
diff --git a/src/syn/lexer.rs b/src/syn/lexer.rs
new file mode 100644
index 0000000..5504e3e
--- /dev/null
+++ b/src/syn/lexer.rs
@@ -0,0 +1,360 @@
+use crate::{
+    syn::{error::*, span::*, token::*},
+    util::LazyString,
+};
+use lazy_static::lazy_static;
+use maplit::hashmap;
+use std::{collections::HashMap, mem, str::Chars};
+
+const IDENT_START_CHARS: &'static [char] = &[
+    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
+    't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
+    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_',
+];
+const IDENT_CHARS: &'static [char] = &[
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
+    'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B',
+    'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U',
+    'V', 'W', 'X', 'Y', 'Z', '_',
+];
+
+const DEC_NUM_CHARS: &'static [char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];
+
+const HEX_NUM_CHARS: &'static [char] = &[
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C',
+    'D', 'E', 'F',
+];
+
+const STR_QUOTE_CHARS: &'static [char] = &['"', '\''];
+
+const OP_CHARS: &'static [char] = &['=', '+', '*', '-', '/', '>', '<', '~', '!', '%', '^'];
+
+lazy_static! {
+    static ref OPS: HashMap<&'static str, TokenKind> = hashmap! {
+        "=" => TokenKind::Eq,
+        "->" => TokenKind::Arrow,
+    };
+
+    static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
+        "return" => TokenKind::KwReturn,
+    };
+}
+
+pub struct Lexer<'t> {
+    chars: Chars<'t>,
+    text: &'t str,
+    start: Pos,
+    end: Pos,
+}
+
+impl<'t> Lexer<'t> {
+    /// Creates a new lexer that tokenizes the given text.
+    pub fn new(text: &'t str) -> Self {
+        // load the first position into the start/end position trackers
+        let pos = if let Some(c) = text.chars().next() {
+            Pos::from_char(c, 0, 0, 0, 0)
+        } else {
+            Default::default()
+        };
+
+        Lexer {
+            chars: text.chars(),
+            text,
+            start: pos,
+            end: pos,
+        }
+    }
+
+    /// Gets whether this lexer has reached the EOF.
+    pub fn is_eof(&self) -> bool {
+        self.chars.clone().next().is_none()
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Character advancement
+    ////////////////////////////////////////////////////////////////////////////////
+    fn curr_char(&self) -> Option<char> {
+        self.chars.clone().next()
+    }
+
+    fn adv_char(&mut self) -> Option<char> {
+        let c = self.chars.next()?;
+        self.end = self.end.next_char(c);
+        Some(c)
+    }
+
+    fn skip_whitespace(&mut self) {
+        self.match_while(|c| c.is_whitespace());
+    }
+
+    fn catchup(&mut self) -> Span {
+        let start = mem::replace(&mut self.start, self.end);
+        Span {
+            start,
+            end: self.end,
+        }
+    }
+
+    fn make_token(&mut self, kind: TokenKind) -> Token {
+        let span = self.catchup();
+        Token::new(kind, span)
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Tokens
+    ////////////////////////////////////////////////////////////////////////////////
+    pub fn next_token(&mut self) -> Result<Option<Token>> {
+        self.skip_whitespace();
+
+        let curr = if let Some(curr) = self.curr_char() {
+            curr
+        } else {
+            return Ok(None);
+        };
+
+        let token = match curr {
+            c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
+            c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
+            '"' | '\'' => self.next_str()?,
+            ':' => self.next_sym()?,
+            '(' => self.next_char_token('(', TokenKind::LParen)?,
+            ')' => self.next_char_token(')', TokenKind::RParen)?,
+            '{' => self.next_char_token('{', TokenKind::LBrace)?,
+            '}' => self.next_char_token('}', TokenKind::RBrace)?,
+            '[' => self.next_char_token('[', TokenKind::LBracket)?,
+            ']' => self.next_char_token(']', TokenKind::RBracket)?,
+            ',' => self.next_char_token(',', TokenKind::Comma)?,
+            c if OP_CHARS.contains(&c) => self.next_op()?,
+            c => return Err(Error::Unexpected {
+                what: format!("character {}", c.escape_debug()),
+                span: self.span(),
+            })
+        };
+        Ok(Some(token))
+    }
+
+    fn next_ident_or_kw(&mut self) -> Result<Token> {
+        let ident = self.expect_ident("identifier")?;
+        let kind = KEYWORDS.get(ident).copied()
+            .unwrap_or(TokenKind::Ident);
+        Ok(self.make_token(kind))
+    }
+
+    fn next_num(&mut self) -> Result<Token> {
+        let first = self.expect_any(DEC_NUM_CHARS, "number")?;
+        let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) {
+            self.adv_char().unwrap();
+            self.expect_any(HEX_NUM_CHARS, "hex number")?;
+            HEX_NUM_CHARS
+        } else {
+            DEC_NUM_CHARS
+        };
+
+        self.match_while(|c| alphabet.contains(&c));
+        Ok(self.make_token(TokenKind::Num))
+    }
+
+    fn next_str(&mut self) -> Result<Token> {
+        let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?;
+        while let Some(c) = self.match_where(|curr| curr != start_char) {
+            if c == '\\' {
+                // Match escapes
+                self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?;
+            }
+        }
+        self.expect_char(start_char, "end of string")?;
+        Ok(self.make_token(TokenKind::Str))
+    }
+
+    fn next_sym(&mut self) -> Result<Token> {
+        self.expect_char(':', "symbol")?;
+        self.expect_ident("symbol")?;
+        Ok(self.make_token(TokenKind::Sym))
+    }
+
+    fn next_op(&mut self) -> Result<Token> {
+        self.expect_any(OP_CHARS, "operator")?;
+        let op_text = self.match_while(|c| OP_CHARS.contains(&c));
+        if let Some(kind) = OPS.get(op_text).copied() {
+            Ok(self.make_token(kind))
+        } else {
+            Err(Error::Unknown {
+                what: format!("operator {}", op_text.escape_debug()),
+                span: self.span(),
+            })
+        }
+    }
+
+    fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result<Token> {
+        self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?;
+        Ok(self.make_token(kind))
+    }
+
+    ////////////////////////////////////////////////////////////////////////////////
+    // Character pattern matching
+    ////////////////////////////////////////////////////////////////////////////////
+    fn match_where<P>(&mut self, p: P) -> Option<char>
+    where
+        P: Fn(char) -> bool,
+    {
+        if (p)(self.curr_char()?) {
+            self.adv_char()
+        } else {
+            None
+        }
+    }
+
+    fn match_while<P>(&mut self, p: P) -> &str
+    where
+        P: Fn(char) -> bool + Copy,
+    {
+        while self.match_where(p).is_some() {}
+        self.text_at(self.text)
+    }
+
+    fn expect_where<P>(&mut self, p: P, expected: impl ToString) -> Result<char>
+    where
+        P: Fn(char) -> bool,
+    {
+        // Check EOF
+        self.curr_char().ok_or_else(|| Error::ExpectedGot {
+            expected: expected.to_string(),
+            got: "EOF".to_string(),
+            span: self.span(),
+        })?;
+
+        // Match
+        self.match_where(p).ok_or_else(|| Error::ExpectedGot {
+            expected: expected.to_string(),
+            got: format!("{} character", self.curr_char().unwrap().escape_debug()),
+            span: self.span(),
+        })
+    }
+
+    fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> {
+        self.expect_where(|curr| curr == c, expected)
+    }
+
+    fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> {
+        self.expect_where(|curr| chars.contains(&curr), expected)
+    }
+
+    fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> {
+        self.expect_any(IDENT_START_CHARS, expected)?;
+        Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr)))
+    }
+}
+
+impl Spanned for Lexer<'_> {
+    fn span(&self) -> Span {
+        Span {
+            start: self.start,
+            end: self.end,
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_next_token_eof() {
+        let mut lexer = Lexer::new("");
+        assert!(matches!(lexer.next_token(), Ok(None)));
+        assert!(lexer.is_eof());
+
+        let mut lexer = Lexer::new(" ");
+        assert!(matches!(lexer.next_token(), Ok(None)));
+        assert!(lexer.is_eof());
+
+        let mut lexer = Lexer::new(" \n \n \n\r\n\t ");
+        assert!(matches!(lexer.next_token(), Ok(None)));
+        assert!(lexer.is_eof());
+    }
+
+    macro_rules! test_token {
+        ($text:expr, $token_kind:expr, $token_text:expr) => {{
+            let text = $text;
+            let mut lexer = Lexer::new(text);
+            let token = lexer.next_token().expect("token").expect("token");
+            assert_eq!(token.kind(), $token_kind);
+            assert_eq!(token.text_at(text), $token_text);
+        }};
+
+        ($text:expr, $token_kind:expr) => {{
+            test_token!($text, $token_kind, $text);
+        }};
+    }
+
+    #[test]
+    fn test_ident_token() {
+        test_token!("ident", TokenKind::Ident);
+        test_token!("OtherIdent", TokenKind::Ident);
+        test_token!("other_ident", TokenKind::Ident);
+        test_token!("ident1234", TokenKind::Ident);
+        test_token!("RETURN", TokenKind::Ident);
+    }
+
+    #[test]
+    fn test_keywords() {
+        test_token!("return", TokenKind::KwReturn);
+    }
+
+    #[test]
+    fn test_num_token() {
+        test_token!("1234", TokenKind::Num);
+        test_token!("4321", TokenKind::Num);
+        test_token!("123498765", TokenKind::Num);
+        test_token!("432156789", TokenKind::Num);
+        test_token!("0xdcbaBEEF", TokenKind::Num);
+        test_token!("0xabcdFEED", TokenKind::Num);
+        test_token!("0XdcbaBEEF", TokenKind::Num);
+        test_token!("0XabcdFEED", TokenKind::Num);
+        test_token!("0X123456789DCBAbeef", TokenKind::Num);
+        test_token!("0xABCDfeed192837465", TokenKind::Num);
+    }
+
+    #[test]
+    fn test_str_token() {
+        test_token!(r#""this is a string""#, TokenKind::Str);
+        test_token!(r#"'this is a string'"#, TokenKind::Str);
+        test_token!(r#"'this is a string\nwith escapes'"#, TokenKind::Str);
+        test_token!(r#""this is a string\nwith escapes""#, TokenKind::Str);
+    }
+
+    #[test]
+    fn test_sym_token() {
+        test_token!(":symbol", TokenKind::Sym);
+        test_token!(":OtherSymbol", TokenKind::Sym);
+        test_token!(":other_symbol", TokenKind::Sym);
+        test_token!(":symbol1234", TokenKind::Sym);
+    }
+
+    #[test]
+    fn test_single_char_symbols() {
+        test_token!("(", TokenKind::LParen);
+        test_token!(")", TokenKind::RParen);
+        test_token!("{", TokenKind::LBrace);
+        test_token!("}", TokenKind::RBrace);
+        test_token!("[", TokenKind::LBracket);
+        test_token!("]", TokenKind::RBracket);
+        test_token!(",", TokenKind::Comma);
+    }
+
+    #[test]
+    fn test_op_tokens() {
+        test_token!("=", TokenKind::Eq);
+        test_token!("->", TokenKind::Arrow);
+    }
+
+    #[test]
+    fn test_expect_char() {
+        let mut lexer = Lexer::new("asdf");
+        assert!(matches!(lexer.expect_char('a', "a"), Ok('a')));
+        assert!(matches!(lexer.expect_char('s', "s"), Ok('s')));
+        assert!(matches!(lexer.expect_char('d', "d"), Ok('d')));
+        assert!(matches!(lexer.expect_char('f', "f"), Ok('f')));
+        assert!(lexer.is_eof());
+    }
+}
diff --git a/src/syn/mod.rs b/src/syn/mod.rs
new file mode 100644
index 0000000..9959f63
--- /dev/null
+++ b/src/syn/mod.rs
@@ -0,0 +1,4 @@
+pub mod error;
+pub mod lexer;
+pub mod span;
+pub mod token;
diff --git a/src/syn/span.rs b/src/syn/span.rs
new file mode 100644
index 0000000..c744e86
--- /dev/null
+++ b/src/syn/span.rs
@@ -0,0 +1,150 @@
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct Pos {
+    pub source: usize,
+    pub line: usize,
+    pub col: usize,
+    pub byte: usize,
+    pub len: usize,
+}
+
+impl Default for Pos {
+    fn default() -> Self {
+        Pos {
+            source: 0,
+            line: 0,
+            col: 0,
+            byte: 0,
+            len: 1,
+        }
+    }
+}
+
+impl Pos {
+    pub fn from_char(c: char, source: usize, line: usize, col: usize, byte: usize) -> Self {
+        Pos {
+            source, line, col, byte, len: c.len_utf8(),
+        }
+    }
+
+    pub fn next_char(&self, c: char) -> Self {
+        Pos {
+            source: self.source + 1,
+            line: self.line,
+            col: self.col + 1,
+            byte: self.byte + self.len,
+            len: c.len_utf8(),
+        }
+    }
+
+    pub fn min(self, other: Self) -> Self {
+        if self.byte < other.byte {
+            self
+        } else {
+            other
+        }
+    }
+
+    pub fn max(self, other: Self) -> Self {
+        if self.byte > other.byte {
+            self
+        } else {
+            other
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
+pub struct Span {
+    pub start: Pos,
+    pub end: Pos,
+}
+
+impl Span {
+    pub fn union(self, other: Self) -> Self {
+        let start = self.start.min(other.start);
+        let end = self.end.max(other.end);
+        Span { start, end }
+    }
+}
+
+pub trait Spanned {
+    fn span(&self) -> Span;
+
+    fn text_at<'t>(&self, text: &'t str) -> &'t str {
+        let Span { start, end } = self.span();
+        &text[start.byte .. end.byte]
+    }
+}
+
+impl Spanned for Span {
+    fn span(&self) -> Span {
+        *self
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_pos_min() {
+        let small = Pos::default();
+        let large = Pos {
+            source: 1,
+            byte: 1,
+            ..Default::default()
+        };
+
+        assert_eq!(small.min(large), small);
+        assert_eq!(large.min(small), small);
+    }
+
+    #[test]
+    fn test_pos_max() {
+        let small = Pos::default();
+        let large = Pos {
+            source: 1,
+            byte: 1,
+            ..Default::default()
+        };
+
+        assert_eq!(small.max(large), large);
+        assert_eq!(large.max(small), large);
+    }
+
+    #[test]
+    fn test_span_union() {
+        let first = Span {
+            start: Pos::default(),
+            end: Pos {
+                source: 15,
+                col: 15,
+                byte: 15,
+                ..Default::default()
+            }
+        };
+
+        let second = Span {
+            start: Pos {
+                source: 25,
+                col: 25,
+                byte: 25,
+                ..Default::default()
+            },
+            end: Pos {
+                source: 27,
+                col: 27,
+                byte: 27,
+                ..Default::default()
+            }
+        };
+
+        let expected = Span {
+            start: first.start,
+            end: second.end,
+        };
+
+        assert_eq!(first.union(second), expected);
+        assert_eq!(second.union(first), expected);
+    }
+}
diff --git a/src/syn/token.rs b/src/syn/token.rs
new file mode 100644
index 0000000..de9e7d3
--- /dev/null
+++ b/src/syn/token.rs
@@ -0,0 +1,68 @@
+use crate::syn::span::*;
+use std::fmt::{Display, Formatter, self};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenKind {
+    Ident,
+    Num,
+    Str,
+    Sym,
+
+    LParen,
+    RParen,
+    LBrace,
+    RBrace,
+    LBracket,
+    RBracket,
+    Comma,
+
+    Eq,
+    Arrow,
+
+    KwReturn,
+}
+
+impl Display for TokenKind {
+    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
+        use TokenKind::*;
+        let s = match self {
+            Ident => "identifier",
+            Num => "number",
+            Str => "string",
+            Sym => "symbol",
+
+            LParen => "left paren",
+            RParen => "right paren",
+            LBrace => "left brace",
+            RBrace => "right brace",
+            LBracket => "left bracket",
+            RBracket => "right bracket",
+            Comma => "comma",
+
+            Eq => "equals",
+            Arrow => "arrow",
+
+            KwReturn => "return keyword",
+        };
+        Display::fmt(s, fmt)
+    }
+}
+
+pub struct Token {
+    kind: TokenKind,
+    span: Span,
+}
+
+impl Token {
+    pub fn new(kind: TokenKind, span: Span) -> Self {
+        Token { kind, span, }
+    }
+
+    pub fn kind(&self) -> TokenKind {
+        self.kind
+    }
+}
+
+impl Spanned for Token {
+    fn span(&self) -> Span { self.span }
+}
diff --git a/src/util.rs b/src/util.rs
new file mode 100644
index 0000000..a50e422
--- /dev/null
+++ b/src/util.rs
@@ -0,0 +1,38 @@
+use std::{
+    fmt::{Display, Formatter, self},
+};
+
+pub struct LazyString<'f, F>
+    where F: Fn() -> String + 'f
+{
+    source: F,
+    _lifetime: std::marker::PhantomData<&'f ()>,
+}
+
+impl<'f, F> LazyString<'f, F>
+    where F: Fn() -> String + 'f
+{
+    pub fn new(source: F) -> Self {
+        LazyString {
+            source,
+            _lifetime: Default::default(),
+        }
+    }
+}
+
+impl<'f, F> Display for LazyString<'f, F>
+    where F: Fn() -> String + 'f
+{
+    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
+        let s = (self.source)();
+        Display::fmt(&s, fmt)
+    }
+}
+
+#[test]
+fn test_lazy_string() {
+    let i = 10;
+
+    let lzstr = LazyString::new(|| format!("the value is {}", i));
+    assert_eq!(lzstr.to_string(), "the value is 10");
+}