Initial commit with lexer

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2020-04-27 12:42:17 -04:00
commit 58421a0469
10 changed files with 759 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
/target

84
Cargo.lock generated Normal file
View File

@@ -0,0 +1,84 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "doc-comment"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "maplit"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "not-python"
version = "0.1.0"
dependencies = [
"lazy_static",
"maplit",
"snafu",
]
[[package]]
name = "proc-macro2"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df246d292ff63439fea9bc8c0a270bed0e390d5ebd4db4ba15aba81111b5abe3"
dependencies = [
"unicode-xid",
]
[[package]]
name = "quote"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2bdc6c187c65bca4260c9011c9e3132efe4909da44726bad24cf7572ae338d7f"
dependencies = [
"proc-macro2",
]
[[package]]
name = "snafu"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1ec0ae2ed980f26e1ad62e717feb01df90731df56887b5391a2c79f9f6805be"
dependencies = [
"doc-comment",
"snafu-derive",
]
[[package]]
name = "snafu-derive"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ec32ba84a7a86aeb0bc32fd0c46d31b0285599f68ea72e87eff6127889d99e1"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "syn"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "410a7488c0a728c7ceb4ad59b9567eb4053d02e8cc7f5c0e0eeeb39518369213"
dependencies = [
"proc-macro2",
"quote",
"unicode-xid",
]
[[package]]
name = "unicode-xid"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"

12
Cargo.toml Normal file
View File

@@ -0,0 +1,12 @@
[package]
name = "not-python"
version = "0.1.0"
authors = ["Alek Ratzloff <alekratz@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
snafu = "0.6.6"
lazy_static = "1.4.0"
maplit = "1.0.2"

6
src/main.rs Normal file
View File

@@ -0,0 +1,6 @@
mod syn;
mod util;
/// Program entry point; at this stage it only prints a greeting.
fn main() {
println!("Hello, world!");
}

36
src/syn/error.rs Normal file
View File

@@ -0,0 +1,36 @@
use crate::syn::span::*;
use snafu::Snafu;
/// Errors reported for problems found in source text.
///
/// Every variant carries the [`Span`] the problem refers to; the `Display`
/// text is produced by the `snafu(display(...))` attribute on each variant.
#[derive(Debug, Snafu)]
pub enum Error {
/// Something specific was expected, but something else was found.
#[snafu(display("expected {}, but got {} instead", expected, got))]
ExpectedGot {
expected: String,
got: String,
span: Span,
},
/// Something was found where it is not allowed.
#[snafu(display("unexpected {}", what))]
Unexpected {
what: String,
span: Span,
},
/// Something was found that is not recognized.
#[snafu(display("unknown {}", what))]
Unknown {
what: String,
span: Span,
}
}
impl Spanned for Error {
fn span(&self) -> Span {
match self {
Error::ExpectedGot { span, .. }
| Error::Unknown { span, .. }
| Error::Unexpected { span, .. } => { *span }
}
}
}
/// Result alias whose error type defaults to this module's [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;

360
src/syn/lexer.rs Normal file
View File

@@ -0,0 +1,360 @@
use crate::{
syn::{error::*, span::*, token::*},
util::LazyString,
};
use lazy_static::lazy_static;
use maplit::hashmap;
use std::{collections::HashMap, mem, str::Chars};
/// Characters that may begin an identifier: ASCII letters and the underscore.
const IDENT_START_CHARS: &[char] = &[
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '_',
];

/// Characters that may continue an identifier: ASCII digits, letters and the underscore.
const IDENT_CHARS: &[char] = &[
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '_',
];

/// Decimal digits.
const DEC_NUM_CHARS: &[char] = &['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'];

/// Hexadecimal digits, in both lower and upper case.
const HEX_NUM_CHARS: &[char] = &[
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'a', 'b', 'c', 'd', 'e', 'f',
    'A', 'B', 'C', 'D', 'E', 'F',
];

/// Characters that open (and close) a string literal.
const STR_QUOTE_CHARS: &[char] = &['"', '\''];

/// Characters that operators are built from.
const OP_CHARS: &[char] = &['=', '+', '*', '-', '/', '>', '<', '~', '!', '%', '^'];
lazy_static! {
// Operator spellings the lexer recognizes, mapped to their token kinds.
// `next_op` reports any other run of operator characters as unknown.
static ref OPS: HashMap<&'static str, TokenKind> = hashmap! {
"=" => TokenKind::Eq,
"->" => TokenKind::Arrow,
};
// Reserved words mapped to their keyword token kinds; `next_ident_or_kw`
// treats every identifier not listed here as `TokenKind::Ident`.
static ref KEYWORDS: HashMap<&'static str, TokenKind> = hashmap! {
"return" => TokenKind::KwReturn,
};
}
/// Tokenizer over a borrowed source string.
pub struct Lexer<'t> {
/// Iterator over the characters that have not been consumed yet.
chars: Chars<'t>,
/// The complete source text being tokenized.
text: &'t str,
/// Start of the pending span; `catchup` moves it up to `end`.
start: Pos,
/// Position tracker that `adv_char` advances for every consumed character.
end: Pos,
}
impl<'t> Lexer<'t> {
/// Creates a new lexer that tokenizes the given text.
pub fn new(text: &'t str) -> Self {
// load the first position into the start/end position trackers
let pos = if let Some(c) = text.chars().next() {
Pos::from_char(c, 0, 0, 0, 0)
} else {
Default::default()
};
Lexer {
chars: text.chars(),
text,
start: pos,
end: pos,
}
}
/// Gets whether this lexer has reached the EOF.
pub fn is_eof(&self) -> bool {
self.chars.clone().next().is_none()
}
////////////////////////////////////////////////////////////////////////////////
// Character advancement
////////////////////////////////////////////////////////////////////////////////
fn curr_char(&self) -> Option<char> {
self.chars.clone().next()
}
fn adv_char(&mut self) -> Option<char> {
let c = self.chars.next()?;
self.end = self.end.next_char(c);
Some(c)
}
fn skip_whitespace(&mut self) {
self.match_while(|c| c.is_whitespace());
}
fn catchup(&mut self) -> Span {
let start = mem::replace(&mut self.start, self.end);
Span {
start,
end: self.end,
}
}
fn make_token(&mut self, kind: TokenKind) -> Token {
let span = self.catchup();
Token::new(kind, span)
}
////////////////////////////////////////////////////////////////////////////////
// Tokens
////////////////////////////////////////////////////////////////////////////////
pub fn next_token(&mut self) -> Result<Option<Token>> {
self.skip_whitespace();
let curr = if let Some(curr) = self.curr_char() {
curr
} else {
return Ok(None);
};
let token = match curr {
c if IDENT_START_CHARS.contains(&c) => self.next_ident_or_kw()?,
c if DEC_NUM_CHARS.contains(&c) => self.next_num()?,
'"' | '\'' => self.next_str()?,
':' => self.next_sym()?,
'(' => self.next_char_token('(', TokenKind::LParen)?,
')' => self.next_char_token(')', TokenKind::RParen)?,
'{' => self.next_char_token('{', TokenKind::LBrace)?,
'}' => self.next_char_token('}', TokenKind::RBrace)?,
'[' => self.next_char_token('[', TokenKind::LBracket)?,
']' => self.next_char_token(']', TokenKind::RBracket)?,
',' => self.next_char_token(',', TokenKind::Comma)?,
c if OP_CHARS.contains(&c) => self.next_op()?,
c => return Err(Error::Unexpected {
what: format!("character {}", c.escape_debug()),
span: self.span(),
})
};
Ok(Some(token))
}
fn next_ident_or_kw(&mut self) -> Result<Token> {
let ident = self.expect_ident("identifier")?;
let kind = KEYWORDS.get(ident).copied()
.unwrap_or(TokenKind::Ident);
Ok(self.make_token(kind))
}
fn next_num(&mut self) -> Result<Token> {
let first = self.expect_any(DEC_NUM_CHARS, "number")?;
let alphabet = if first == '0' && matches!(self.curr_char(), Some('x') | Some('X')) {
self.adv_char().unwrap();
self.expect_any(HEX_NUM_CHARS, "hex number")?;
HEX_NUM_CHARS
} else {
DEC_NUM_CHARS
};
self.match_while(|c| alphabet.contains(&c));
Ok(self.make_token(TokenKind::Num))
}
fn next_str(&mut self) -> Result<Token> {
let start_char = self.expect_any(STR_QUOTE_CHARS, "string")?;
while let Some(c) = self.match_where(|curr| curr != start_char) {
if c == '\\' {
// Match escapes
self.expect_any(&['n', 't', 'r', '\\', '\'', '\"', '0'], "escape character")?;
}
}
self.expect_char(start_char, "end of string")?;
Ok(self.make_token(TokenKind::Str))
}
fn next_sym(&mut self) -> Result<Token> {
self.expect_char(':', "symbol")?;
self.expect_ident("symbol")?;
Ok(self.make_token(TokenKind::Sym))
}
fn next_op(&mut self) -> Result<Token> {
self.expect_any(OP_CHARS, "operator")?;
let op_text = self.match_while(|c| OP_CHARS.contains(&c));
if let Some(kind) = OPS.get(op_text).copied() {
Ok(self.make_token(kind))
} else {
Err(Error::Unknown {
what: format!("operator {}", op_text.escape_debug()),
span: self.span(),
})
}
}
fn next_char_token(&mut self, c: char, kind: TokenKind) -> Result<Token> {
self.expect_char(c, LazyString::new(|| format!("{} token", kind)))?;
Ok(self.make_token(kind))
}
////////////////////////////////////////////////////////////////////////////////
// Character pattern matching
////////////////////////////////////////////////////////////////////////////////
fn match_where<P>(&mut self, p: P) -> Option<char>
where
P: Fn(char) -> bool,
{
if (p)(self.curr_char()?) {
self.adv_char()
} else {
None
}
}
fn match_while<P>(&mut self, p: P) -> &str
where
P: Fn(char) -> bool + Copy,
{
while self.match_where(p).is_some() {}
self.text_at(self.text)
}
fn expect_where<P>(&mut self, p: P, expected: impl ToString) -> Result<char>
where
P: Fn(char) -> bool,
{
// Check EOF
self.curr_char().ok_or_else(|| Error::ExpectedGot {
expected: expected.to_string(),
got: "EOF".to_string(),
span: self.span(),
})?;
// Match
self.match_where(p).ok_or_else(|| Error::ExpectedGot {
expected: expected.to_string(),
got: format!("{} character", self.curr_char().unwrap().escape_debug()),
span: self.span(),
})
}
fn expect_char(&mut self, c: char, expected: impl ToString) -> Result<char> {
self.expect_where(|curr| curr == c, expected)
}
fn expect_any(&mut self, chars: &[char], expected: impl ToString) -> Result<char> {
self.expect_where(|curr| chars.contains(&curr), expected)
}
fn expect_ident(&mut self, expected: impl ToString) -> Result<&str> {
self.expect_any(IDENT_START_CHARS, expected)?;
Ok(self.match_while(|curr| IDENT_CHARS.contains(&curr)))
}
}
impl Spanned for Lexer<'_> {
    /// Returns the pending span: everything consumed since the last token was made.
    fn span(&self) -> Span {
        let (start, end) = (self.start, self.end);
        Span { start, end }
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Inputs that contain no tokens must yield `Ok(None)` and leave the lexer at EOF.
    #[test]
    fn test_next_token_eof() {
        for text in &["", " ", " \n \n \n\r\n\t "] {
            let mut lexer = Lexer::new(*text);
            assert!(matches!(lexer.next_token(), Ok(None)));
            assert!(lexer.is_eof());
        }
    }

    /// Lexes the first token of `$text` and checks its kind and the source
    /// text it covers. The two-argument form expects the token to cover the
    /// entire input.
    macro_rules! test_token {
        ($text:expr, $token_kind:expr, $token_text:expr) => {{
            let source = $text;
            let mut lexer = Lexer::new(source);
            let first = lexer.next_token().expect("token").expect("token");
            assert_eq!(first.kind(), $token_kind);
            assert_eq!(first.text_at(source), $token_text);
        }};
        ($text:expr, $token_kind:expr) => {{
            test_token!($text, $token_kind, $text);
        }};
    }

    #[test]
    fn test_ident_token() {
        // "RETURN" checks that keyword lookup is case-sensitive.
        let idents = ["ident", "OtherIdent", "other_ident", "ident1234", "RETURN"];
        for text in idents.iter() {
            test_token!(*text, TokenKind::Ident);
        }
    }

    #[test]
    fn test_keywords() {
        test_token!("return", TokenKind::KwReturn);
    }

    #[test]
    fn test_num_token() {
        let numbers = [
            "1234",
            "4321",
            "123498765",
            "432156789",
            "0xdcbaBEEF",
            "0xabcdFEED",
            "0XdcbaBEEF",
            "0XabcdFEED",
            "0X123456789DCBAbeef",
            "0xABCDfeed192837465",
        ];
        for text in numbers.iter() {
            test_token!(*text, TokenKind::Num);
        }
    }

    #[test]
    fn test_str_token() {
        let strings = [
            r#""this is a string""#,
            r#"'this is a string'"#,
            r#"'this is a string\nwith escapes'"#,
            r#""this is a string\nwith escapes""#,
        ];
        for text in strings.iter() {
            test_token!(*text, TokenKind::Str);
        }
    }

    #[test]
    fn test_sym_token() {
        let symbols = [":symbol", ":OtherSymbol", ":other_symbol", ":symbol1234"];
        for text in symbols.iter() {
            test_token!(*text, TokenKind::Sym);
        }
    }

    #[test]
    fn test_single_char_symbols() {
        let cases = [
            ("(", TokenKind::LParen),
            (")", TokenKind::RParen),
            ("{", TokenKind::LBrace),
            ("}", TokenKind::RBrace),
            ("[", TokenKind::LBracket),
            ("]", TokenKind::RBracket),
            (",", TokenKind::Comma),
        ];
        for &(text, kind) in cases.iter() {
            test_token!(text, kind);
        }
    }

    #[test]
    fn test_op_tokens() {
        let cases = [("=", TokenKind::Eq), ("->", TokenKind::Arrow)];
        for &(text, kind) in cases.iter() {
            test_token!(text, kind);
        }
    }

    #[test]
    fn test_expect_char() {
        let mut lexer = Lexer::new("asdf");
        let steps = [('a', "a"), ('s', "s"), ('d', "d"), ('f', "f")];
        for &(wanted, label) in steps.iter() {
            assert!(matches!(lexer.expect_char(wanted, label), Ok(got) if got == wanted));
        }
        assert!(lexer.is_eof());
    }
}

4
src/syn/mod.rs Normal file
View File

@@ -0,0 +1,4 @@
pub mod error;
pub mod lexer;
pub mod span;
pub mod token;

150
src/syn/span.rs Normal file
View File

@@ -0,0 +1,150 @@
/// A location in source text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Pos {
    /// Running character counter; `next_char` bumps it once per character.
    pub source: usize,
    /// Zero-based line number.
    pub line: usize,
    /// Zero-based column within the current line, counted in characters.
    pub col: usize,
    /// Byte offset into the source text.
    pub byte: usize,
    /// UTF-8 width, in bytes, of the character this position was last built from.
    pub len: usize,
}

impl Default for Pos {
    /// The start of the text, with a nominal character width of one byte.
    fn default() -> Self {
        Pos {
            source: 0,
            line: 0,
            col: 0,
            byte: 0,
            len: 1,
        }
    }
}

impl Pos {
    /// Builds a position from explicit counters, taking `len` from the UTF-8
    /// width of `c`.
    pub fn from_char(c: char, source: usize, line: usize, col: usize, byte: usize) -> Self {
        Pos {
            source,
            line,
            col,
            byte,
            len: c.len_utf8(),
        }
    }

    /// Returns the position immediately after `c`, where `c` is the character
    /// being stepped over.
    ///
    /// The byte offset grows by the UTF-8 width of `c` itself, so offsets stay
    /// on character boundaries for text that mixes character widths. A
    /// newline moves to the start of the next line; any other character
    /// advances the column by one.
    pub fn next_char(&self, c: char) -> Self {
        let width = c.len_utf8();
        let (line, col) = if c == '\n' {
            (self.line + 1, 0)
        } else {
            (self.line, self.col + 1)
        };
        Pos {
            source: self.source + 1,
            line,
            col,
            byte: self.byte + width,
            len: width,
        }
    }

    /// Returns whichever position has the smaller byte offset (`other` on a tie).
    pub fn min(self, other: Self) -> Self {
        if self.byte < other.byte {
            self
        } else {
            other
        }
    }

    /// Returns whichever position has the larger byte offset (`other` on a tie).
    pub fn max(self, other: Self) -> Self {
        if self.byte > other.byte {
            self
        } else {
            other
        }
    }
}
/// A region of source text delimited by a start and an end position.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Span {
    /// Where the region begins.
    pub start: Pos,
    /// Where the region ends.
    pub end: Pos,
}

impl Span {
    /// Returns the span running from the lower of the two starts to the
    /// higher of the two ends, compared by byte offset.
    pub fn union(self, other: Self) -> Self {
        Span {
            start: self.start.min(other.start),
            end: self.end.max(other.end),
        }
    }
}
pub trait Spanned {
fn span(&self) -> Span;
fn text_at<'t>(&self, text: &'t str) -> &'t str {
let Span { start, end } = self.span();
&text[start.byte .. end.byte]
}
}
impl Spanned for Span {
/// A span covers exactly itself.
fn span(&self) -> Span {
*self
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a position with the given counters, leaving every other field at its default.
    fn pos(source: usize, col: usize, byte: usize) -> Pos {
        Pos {
            source,
            col,
            byte,
            ..Default::default()
        }
    }

    #[test]
    fn test_pos_min() {
        let (small, large) = (Pos::default(), pos(1, 0, 1));
        assert_eq!(small.min(large), small);
        assert_eq!(large.min(small), small);
    }

    #[test]
    fn test_pos_max() {
        let (small, large) = (Pos::default(), pos(1, 0, 1));
        assert_eq!(small.max(large), large);
        assert_eq!(large.max(small), large);
    }

    #[test]
    fn test_span_union() {
        let first = Span {
            start: Pos::default(),
            end: pos(15, 15, 15),
        };
        let second = Span {
            start: pos(25, 25, 25),
            end: pos(27, 27, 27),
        };
        // The union must be the same regardless of argument order.
        let expected = Span {
            start: first.start,
            end: second.end,
        };
        assert_eq!(first.union(second), expected);
        assert_eq!(second.union(first), expected);
    }
}

68
src/syn/token.rs Normal file
View File

@@ -0,0 +1,68 @@
use crate::syn::span::*;
use std::fmt::{Display, Formatter, self};
/// The category of a token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenKind {
    /// Identifier.
    Ident,
    /// Number.
    Num,
    /// String.
    Str,
    /// Symbol.
    Sym,
    /// Left paren.
    LParen,
    /// Right paren.
    RParen,
    /// Left brace.
    LBrace,
    /// Right brace.
    RBrace,
    /// Left bracket.
    LBracket,
    /// Right bracket.
    RBracket,
    /// Comma.
    Comma,
    /// Equals.
    Eq,
    /// Arrow.
    Arrow,
    /// The `return` keyword.
    KwReturn,
}

impl Display for TokenKind {
    /// Writes the human-readable name of the kind, honoring width and
    /// alignment flags the same way a string does.
    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
        let label = match *self {
            TokenKind::Ident => "identifier",
            TokenKind::Num => "number",
            TokenKind::Str => "string",
            TokenKind::Sym => "symbol",
            TokenKind::LParen => "left paren",
            TokenKind::RParen => "right paren",
            TokenKind::LBrace => "left brace",
            TokenKind::RBrace => "right brace",
            TokenKind::LBracket => "left bracket",
            TokenKind::RBracket => "right bracket",
            TokenKind::Comma => "comma",
            TokenKind::Eq => "equals",
            TokenKind::Arrow => "arrow",
            TokenKind::KwReturn => "return keyword",
        };
        // `pad` is what `str`'s own `Display` impl delegates to.
        fmt.pad(label)
    }
}
/// A token: its kind together with the span of source text it covers.
///
/// Derives the common comparison and debugging traits so tokens can be
/// printed in diagnostics and compared directly in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    kind: TokenKind,
    span: Span,
}

impl Token {
    /// Creates a token of the given kind covering `span`.
    pub fn new(kind: TokenKind, span: Span) -> Self {
        Token { kind, span }
    }

    /// Returns the kind of this token.
    pub fn kind(&self) -> TokenKind {
        self.kind
    }
}
impl Spanned for Token {
/// Returns the span this token was created with.
fn span(&self) -> Span { self.span }
}

38
src/util.rs Normal file
View File

@@ -0,0 +1,38 @@
use std::{
fmt::{Display, Formatter, self},
};
/// A string that is computed on demand.
///
/// The wrapped closure is invoked every time the value is displayed, so the
/// cost of building the string is only paid if it is actually shown.
pub struct LazyString<'f, F: Fn() -> String + 'f> {
    /// Closure that produces the text.
    source: F,
    /// Zero-sized marker that ties the `'f` lifetime parameter to the type.
    _lifetime: std::marker::PhantomData<dyn Fn() + 'f>,
}

impl<'f, F: Fn() -> String + 'f> LazyString<'f, F> {
    /// Wraps `source`; the closure is not called until the value is displayed.
    pub fn new(source: F) -> Self {
        let _lifetime = std::marker::PhantomData;
        LazyString { source, _lifetime }
    }
}

impl<'f, F: Fn() -> String + 'f> Display for LazyString<'f, F> {
    /// Runs the closure and writes its result, honoring width and alignment
    /// flags the same way a string does.
    fn fmt(&self, fmt: &mut Formatter) -> fmt::Result {
        let produce = &self.source;
        let rendered: String = produce();
        // `pad` is what the string `Display` impls delegate to.
        fmt.pad(&rendered)
    }
}
/// A `LazyString` displays whatever its closure produces.
#[test]
fn test_lazy_string() {
    let value = 10;
    let lazy = LazyString::new(|| format!("the value is {}", value));
    let shown = lazy.to_string();
    assert_eq!(shown, "the value is 10");
}