Add binary and hex number parsing

Signed-off-by: Alek Ratzloff <alekratz@gmail.com>
This commit is contained in:
2024-09-26 10:03:54 -07:00
parent cd9617d2fd
commit 1dd058ae18
4 changed files with 120 additions and 5 deletions

View File

@@ -43,6 +43,7 @@ const NAME_START_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV
const NAME_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789-";
const NUMBER_START_CHARS: &str = "0123456789";
const NUMBER_CHARS: &str = "0123456789.";
const NUMBER_HEX_CHARS: &str = "0123456789ABCDEFabcdef";
const STRING_START_CHARS: &str = "'\"";
const STRING_ESCAPES: &str = "nrt\\\"'";
@@ -62,7 +63,7 @@ pub struct Lexer {
}
impl Lexer {
pub fn new(text: String, path: &dyn AsRef<Path>) -> Self {
pub fn new(text: String, path: impl AsRef<Path>) -> Self {
Self {
line: 1,
index: 1,
@@ -79,7 +80,12 @@ impl Lexer {
}
pub fn lexeme(&self) -> &str {
&self.text[self.start..self.index - 1]
if self.is_eof() {
// if we're at EOF, the index should not be cut off at the very end
&self.text[self.start..self.index]
} else {
&self.text[self.start..self.index - 1]
}
}
pub fn was_error(&self) -> bool {
@@ -164,8 +170,16 @@ impl Lexer {
return Ok(self.make_token(TokenKind::Eof));
} else if NAME_START_CHARS.contains(self.current()) {
return Ok(self.name());
} else if self.mat('0') {
return if self.mat('x') || self.mat('X') {
self.hex_number()
} else if self.mat('b') || self.mat('B') {
self.bin_number()
} else {
self.number()
};
} else if NUMBER_START_CHARS.contains(self.current()) {
return Ok(self.number());
return self.number();
} else if STRING_START_CHARS.contains(self.current()) {
return self.string();
} else if self.mat('+') {
@@ -291,11 +305,53 @@ impl Lexer {
}
}
fn number(&mut self) -> Token {
fn number(&mut self) -> Result<Token> {
let mut was_decimal = false;
while NUMBER_CHARS.contains(self.current()) {
// this allows some weird syntax, you're allowed to do e.g. `1.0.to_int()` (usually
// written as `(1.0).to_int()` but I don't see a problem with it)
if self.current() == '.' {
if was_decimal {
break;
} else {
was_decimal = true;
}
}
self.advance();
}
self.make_token(TokenKind::Number)
if NAME_CHARS.contains(self.current()) {
Err(self.error(format!("invalid digit '{}'", self.current())))
} else {
Ok(self.make_token(TokenKind::Number))
}
}
/// Lexes the digit portion of a hexadecimal literal. The caller has already
/// consumed the `0x`/`0X` leader, so at least one hex digit must follow.
fn hex_number(&mut self) -> Result<Token> {
    // A bare `0x` with no digits is an error.
    if !NUMBER_HEX_CHARS.contains(self.current()) {
        return Err(self.error("expected hex digit after '0x' leader"));
    }
    // Consume the run of hex digits.
    while NUMBER_HEX_CHARS.contains(self.current()) {
        self.advance();
    }
    // A name character immediately after the digits (e.g. `0xfg`) means the
    // literal bleeds into an identifier — reject it rather than split tokens.
    if NAME_CHARS.contains(self.current()) {
        return Err(self.error(format!("invalid hex digit '{}'", self.current())));
    }
    Ok(self.make_token(TokenKind::Number))
}
/// Lexes the digit portion of a binary literal. The caller has already
/// consumed the `0b`/`0B` leader, so at least one binary digit must follow.
fn bin_number(&mut self) -> Result<Token> {
    // A bare `0b` with no digits is an error.
    if !matches!(self.current(), '0' | '1') {
        return Err(self.error("expected binary digit after '0b' leader"));
    }
    // Consume the run of binary digits.
    while matches!(self.current(), '0' | '1') {
        self.advance();
    }
    // A name character immediately after the digits (e.g. `0b12`) means the
    // literal bleeds into an identifier — reject it rather than split tokens.
    if NAME_CHARS.contains(self.current()) {
        return Err(self.error(format!("invalid binary digit '{}'", self.current())));
    }
    Ok(self.make_token(TokenKind::Number))
}
fn string(&mut self) -> Result<Token> {
@@ -799,3 +855,52 @@ impl Parser {
Ok(())
}
}
#[cfg(test)]
/// Pulls the next token from `$lexer` and asserts that its kind and text
/// match the expected values. Panics (via `unwrap`) if the lexer errors.
macro_rules! lexer_check {
    ($lexer:expr, $kind:expr, $text:expr) => {{
        let tok = $lexer.next().unwrap();
        assert_eq!(tok.kind, $kind);
        assert_eq!(tok.text, $text);
    }};
}
#[test]
fn test_lexer_names() {
    // Names may contain dashes, underscores, and digits after the first char.
    let input = "asdf fdsa the-quick-brown-fox jumped_over-the-lazy_dogs";
    let mut lexer = Lexer::new(input.to_string(), ":testing:");
    for expected in [
        "asdf",
        "fdsa",
        "the-quick-brown-fox",
        "jumped_over-the-lazy_dogs",
    ] {
        lexer_check!(lexer, TokenKind::Name, expected);
    }
    assert!(lexer.is_eof());
}
#[test]
fn test_lexer_numbers() {
    // Covers decimal, float, hex (0x/0X), and binary (0b/0B) literals.
    // Every token's lexeme equals its whitespace-separated source word, so we
    // can drive the expectations straight from the input string.
    let input = "1 2 3 0 0.0 1.0 2.0 3.0 0x1 0xa 0xff 0xabcd 0xabcdef 0XDEADBEEF 0XDECAFDAD 0b0 0b1 0b010101 0B101010 0B00000";
    let mut lexer = Lexer::new(input.to_string(), ":testing:");
    for expected in input.split_whitespace() {
        lexer_check!(lexer, TokenKind::Number, expected);
    }
    assert!(lexer.is_eof());
}