not-python-old.2020-08-27/src/syn/lexer.rs

use crate::syn::{error::*, span::*, token::*};
use lazy_static::lazy_static;
use regex::{Regex, RegexBuilder};
use std::str::Chars;

/// A lexer over a borrowed source string. `pos` tracks how far into `text`
/// the lexer has advanced (at least as a byte offset).
pub struct Lexer<'t> {
    text: &'t str,
    pos: Pos,
}
impl<'t> Lexer<'t> {
    /// Creates a new lexer that tokenizes the given text.
    pub fn new(text: &'t str) -> Self {
        // load the first position into the start/end position trackers
        let pos = if let Some(c) = text.chars().next() {
            Pos::from_char(c, 0, 0, 0, 0)
        } else {
            Default::default()
        };
        Lexer { text, pos }
    }

    /// Gets whether this lexer has reached the EOF.
    pub fn is_eof(&self) -> bool {
        self.chars().next().is_none()
    }
    ////////////////////////////////////////////////////////////////////////////////
    // Character advancement
    ////////////////////////////////////////////////////////////////////////////////

    /// An iterator over the characters remaining after the current position.
    fn chars(&self) -> Chars<'_> {
        self.pos_text().chars()
    }

    /// The not-yet-lexed tail of the input text.
    fn pos_text(&self) -> &str {
        &self.text[self.pos.byte..]
    }

    /// The character at the current position, if any.
    fn curr_char(&self) -> Option<char> {
        self.chars().next()
    }
    /// Advances past any whitespace at the current position.
    fn skip_whitespace(&mut self) {
        while let Some(c) = self.curr_char() {
            if !c.is_whitespace() {
                break;
            } else {
                self.adv_char();
            }
        }
    }

    /// Advances past the current character, returning it (or `None` at EOF).
    fn adv_char(&mut self) -> Option<char> {
        let c = self.curr_char()?;
        self.pos = self.pos.next_char(c);
        Some(c)
    }
    ////////////////////////////////////////////////////////////////////////////////
    // Tokens
    ////////////////////////////////////////////////////////////////////////////////

    /// Lexes the next token, returning `Ok(None)` once the input is exhausted.
    pub fn next_token(&mut self) -> Result<Option<Token>> {
        // Constants and statics
        lazy_static! {
            // The whole alternation is anchored to the start of the remaining
            // input. Earlier alternatives win, so the `return` keyword is tried
            // before identifiers and `->` before `-`.
            static ref REGEX: Regex = RegexBuilder::new(r#"
                ^(?:(?P<kw_return>return)
                |(?P<ident>[a-zA-Z_][a-zA-Z0-9_]*)
                |(?P<sym>:[a-zA-Z_][a-zA-Z0-9_]*)
                |(?P<hex_num>0[xX][0-9a-fA-F]+)
                |(?P<dec_num>[0-9]+)
                |(?P<lparen>\()
                |(?P<rparen>\))
                |(?P<lbracket>\[)
                |(?P<rbracket>\])
                |(?P<lbrace>\{)
                |(?P<rbrace>\})
                |(?P<comma>,)
                |(?P<arrow>->)
                |(?P<eq>=)
                |(?P<plus>\+)
                |(?P<minus>-)
                |(?P<splat>\*)
                |(?P<fslash>/)
                |(?P<dq_str>"([^\\"]|\\[ntr0"'])*")
                |(?P<sq_str>'([^\\']|\\[ntr0"'])*')
                )"#)
                .ignore_whitespace(true)
                .build()
                .unwrap();
        }
        // Maps each named capture group above to the kind of token it produces.
        const CAPTURES: &[(&str, TokenKind)] = &[
            ("kw_return", TokenKind::KwReturn),
            ("ident", TokenKind::Ident),
            ("sym", TokenKind::Sym),
            ("dec_num", TokenKind::Num),
            ("hex_num", TokenKind::Num),
            ("dq_str", TokenKind::Str),
            ("sq_str", TokenKind::Str),
            ("lparen", TokenKind::LParen),
            ("rparen", TokenKind::RParen),
            ("lbracket", TokenKind::LBracket),
            ("rbracket", TokenKind::RBracket),
            ("lbrace", TokenKind::LBrace),
            ("rbrace", TokenKind::RBrace),
            ("comma", TokenKind::Comma),
            ("plus", TokenKind::Plus),
            ("minus", TokenKind::Minus),
            ("splat", TokenKind::Splat),
            ("fslash", TokenKind::FSlash),
            ("arrow", TokenKind::Arrow),
            ("eq", TokenKind::Eq),
        ];

        self.skip_whitespace();
        if self.curr_char().is_none() {
            return Ok(None);
        }
        // A failed match here means the current character cannot start any
        // token (EOF was already handled above).
        let caps = REGEX.captures(&self.text[self.pos.byte..])
            .ok_or_else(|| Error::Unexpected {
                what: format!(
                    "character {}",
                    (&self.text[self.pos.byte..]).chars().next().unwrap()
                ),
                pos: self.pos,
            })?;
        // Get the first named group that participated in the match.
        let capture_kind = CAPTURES.iter()
            .filter_map(|(name, kind)| caps.name(name).map(|cap| (cap, kind)))
            .next();
        let (token_text, kind) = if let Some((capture, kind)) = capture_kind {
            (capture.as_str(), *kind)
        } else {
            return Err(Error::Unexpected {
                what: format!(
                    "character {}",
                    (&self.text[self.pos.byte..]).chars().next().unwrap()
                ),
                pos: self.pos,
            });
        };
        let start = self.pos;
        self.pos.adv_str(token_text);
        let end = self.pos;
        Ok(Some(Token::new(kind, Span { start, end })))
    }
}
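
// Illustrative sketch, not part of the original file: a convenience helper that
// drains a `Lexer` into a `Vec<Token>`, stopping at EOF or on the first error.
// It relies only on the `Lexer::new`/`next_token` API defined above and on the
// same `Result` alias that `next_token` returns.
#[allow(dead_code)]
pub fn tokenize(text: &str) -> Result<Vec<Token>> {
    let mut lexer = Lexer::new(text);
    let mut tokens = Vec::new();
    // `next_token` yields `Ok(None)` once the input is exhausted.
    while let Some(token) = lexer.next_token()? {
        tokens.push(token);
    }
    Ok(tokens)
}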
#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn test_next_token_eof() {
        let mut lexer = Lexer::new("");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());

        let mut lexer = Lexer::new(" ");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());

        let mut lexer = Lexer::new(" \n \n \n\r\n\t ");
        assert!(matches!(lexer.next_token(), Ok(None)));
        assert!(lexer.is_eof());
    }
    macro_rules! test_token {
        ($text:expr, $token_kind:expr, $token_text:expr) => {{
            let text = $text;
            let mut lexer = Lexer::new(text);
            let token = lexer.next_token().expect("token").expect("token");
            assert_eq!(token.kind(), $token_kind);
            assert_eq!(token.text_at(text), $token_text);
        }};
        ($text:expr, $token_kind:expr) => {{
            test_token!($text, $token_kind, $text);
        }};
    }
    #[test]
    fn test_ident_token() {
        test_token!("ident", TokenKind::Ident);
        test_token!("OtherIdent", TokenKind::Ident);
        test_token!("other_ident", TokenKind::Ident);
        test_token!("ident1234", TokenKind::Ident);
        test_token!("RETURN", TokenKind::Ident);
    }

    #[test]
    fn test_keywords() {
        test_token!("return", TokenKind::KwReturn);
    }

    #[test]
    fn test_num_token() {
        test_token!("1234", TokenKind::Num);
        test_token!("4321", TokenKind::Num);
        test_token!("123498765", TokenKind::Num);
        test_token!("432156789", TokenKind::Num);
        test_token!("0xdcbaBEEF", TokenKind::Num);
        test_token!("0xabcdFEED", TokenKind::Num);
        test_token!("0XdcbaBEEF", TokenKind::Num);
        test_token!("0XabcdFEED", TokenKind::Num);
        test_token!("0X123456789DCBAbeef", TokenKind::Num);
        test_token!("0xABCDfeed192837465", TokenKind::Num);
    }

    #[test]
    fn test_str_token() {
        test_token!(r#""this is a string""#, TokenKind::Str);
        test_token!(r#"'this is a string'"#, TokenKind::Str);
        test_token!(r#"'this is a string\nwith escapes'"#, TokenKind::Str);
        test_token!(r#""this is a string\nwith escapes""#, TokenKind::Str);
    }

    #[test]
    fn test_sym_token() {
        test_token!(":symbol", TokenKind::Sym);
        test_token!(":OtherSymbol", TokenKind::Sym);
        test_token!(":other_symbol", TokenKind::Sym);
        test_token!(":symbol1234", TokenKind::Sym);
    }

    #[test]
    fn test_single_char_symbols() {
        test_token!("(", TokenKind::LParen);
        test_token!(")", TokenKind::RParen);
        test_token!("{", TokenKind::LBrace);
        test_token!("}", TokenKind::RBrace);
        test_token!("[", TokenKind::LBracket);
        test_token!("]", TokenKind::RBracket);
        test_token!(",", TokenKind::Comma);
        test_token!("+", TokenKind::Plus);
        test_token!("-", TokenKind::Minus);
        test_token!("*", TokenKind::Splat);
        test_token!("/", TokenKind::FSlash);
    }

    #[test]
    fn test_op_tokens() {
        test_token!("=", TokenKind::Eq);
        test_token!("->", TokenKind::Arrow);
    }
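
    // Illustrative addition, not in the original file: drives the lexer over a
    // multi-token input to show `next_token` being called in a loop. It assumes
    // only the Token/TokenKind API already exercised by the tests above.
    #[test]
    fn test_token_sequence() {
        let text = "x = 1 + 0x2a";
        let mut lexer = Lexer::new(text);
        let mut kinds = Vec::new();
        while let Some(token) = lexer.next_token().expect("token") {
            kinds.push(token.kind());
        }
        assert_eq!(
            kinds,
            vec![
                TokenKind::Ident,
                TokenKind::Eq,
                TokenKind::Num,
                TokenKind::Plus,
                TokenKind::Num,
            ]
        );
        assert!(lexer.is_eof());
    }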
}