From 1dd058ae18661a8884ed21f1651a3b7834061ba0 Mon Sep 17 00:00:00 2001
From: Alek Ratzloff
Date: Thu, 26 Sep 2024 10:03:54 -0700
Subject: [PATCH] Add binary and hex number parsing

Signed-off-by: Alek Ratzloff
---
Review notes (v2 of this patch; text in this section is ignored by `git am`):
 - lexer: a '-' directly after a decimal/hex/binary literal is no longer reported as an
   invalid digit. '-' is part of NAME_CHARS, so `1-2` failed with "invalid digit '-'";
   the literal now simply ends there and the '-' is left for the next token.
 - lexer: one-line docs on number(), hex_number() and bin_number(), plus a regression test.
 - hunk headers and the diffstat below were adjusted for the added lines; the blob ids in
   the `index` lines are still those of v1.
 NOTE(review): compiler.rs still unwraps i64::from_str_radix()/parse(), so a literal that
   does not fit in i64 (e.g. 0xffffffffffffffffff) panics instead of producing a compile
   error. Not changed here because the compiler's error reporting is outside this patch.
 NOTE(review): generic arguments (`AsRef`, `Result`) and e-mail addresses look stripped in
   this copy of the patch; restore them from the original commit before applying.

 src/compiler.rs      |   4 ++
 src/parser.rs        | 131 +++++++++++++++++++++++++++++++++++++++++--
 tests/int.npp        |   3 ++
 tests/int.npp.expect |   3 ++
 4 files changed, 136 insertions(+), 5 deletions(-)

diff --git a/src/compiler.rs b/src/compiler.rs
index cd3e96c..6c9aad8 100644
--- a/src/compiler.rs
+++ b/src/compiler.rs
@@ -889,6 +889,10 @@ impl ExprVisitor for Compiler {
             TokenKind::Number => {
                 let obj = if expr.token.text.contains('.') {
                     FloatInst::create(expr.token.text.parse().unwrap())
+                } else if expr.token.text.starts_with("0x") || expr.token.text.starts_with("0X") {
+                    IntInst::create(i64::from_str_radix(&expr.token.text[2..], 16).unwrap())
+                } else if expr.token.text.starts_with("0b") || expr.token.text.starts_with("0B") {
+                    IntInst::create(i64::from_str_radix(&expr.token.text[2..], 2).unwrap())
                 } else {
                     IntInst::create(expr.token.text.parse().unwrap())
                 };
diff --git a/src/parser.rs b/src/parser.rs
index bc78658..146f955 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -43,6 +43,7 @@ const NAME_START_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUV
 const NAME_CHARS: &str = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789-";
 const NUMBER_START_CHARS: &str = "0123456789";
 const NUMBER_CHARS: &str = "0123456789.";
+const NUMBER_HEX_CHARS: &str = "0123456789ABCDEFabcdef";
 const STRING_START_CHARS: &str = "'\"";
 const STRING_ESCAPES: &str = "nrt\\\"'";
 
@@ -62,7 +63,7 @@ pub struct Lexer {
 }
 
 impl Lexer {
-    pub fn new(text: String, path: &dyn AsRef) -> Self {
+    pub fn new(text: String, path: impl AsRef) -> Self {
         Self {
             line: 1,
             index: 1,
@@ -79,7 +80,12 @@ impl Lexer {
     }
 
     pub fn lexeme(&self) -> &str {
-        &self.text[self.start..self.index - 1]
+        if self.is_eof() {
+            // if we're at EOF, the index should not be cut off at the very end
+            &self.text[self.start..self.index]
+        } else {
+            &self.text[self.start..self.index - 1]
+        }
     }
 
     pub fn was_error(&self) -> bool {
@@ -164,8 +170,16 @@ impl Lexer {
             return Ok(self.make_token(TokenKind::Eof));
         } else if NAME_START_CHARS.contains(self.current()) {
             return Ok(self.name());
+        } else if self.mat('0') {
+            return if self.mat('x') || self.mat('X') {
+                self.hex_number()
+            } else if self.mat('b') || self.mat('B') {
+                self.bin_number()
+            } else {
+                self.number()
+            };
         } else if NUMBER_START_CHARS.contains(self.current()) {
-            return Ok(self.number());
+            return self.number();
         } else if STRING_START_CHARS.contains(self.current()) {
             return self.string();
         } else if self.mat('+') {
@@ -291,11 +305,60 @@ impl Lexer {
         }
     }
 
-    fn number(&mut self) -> Token {
+    /// Lexes a decimal number: digits with at most one `.`; a second `.` ends the token.
+    fn number(&mut self) -> Result {
+        let mut was_decimal = false;
         while NUMBER_CHARS.contains(self.current()) {
+            // this allows some weird syntax, you're allowed to do e.g. `1.0.to_int()` (usually
+            // written as `(1.0).to_int()` but I don't see a problem with it)
+            if self.current() == '.' {
+                if was_decimal {
+                    break;
+                } else {
+                    was_decimal = true;
+                }
+            }
             self.advance();
         }
-        self.make_token(TokenKind::Number)
+        // '-' is in NAME_CHARS but also starts the minus operator, so `1-2` must end the
+        // number here instead of failing with "invalid digit '-'"
+        if self.current() != '-' && NAME_CHARS.contains(self.current()) {
+            Err(self.error(format!("invalid digit '{}'", self.current())))
+        } else {
+            Ok(self.make_token(TokenKind::Number))
+        }
+    }
+
+    /// Lexes the digits of a hex literal; called after the `0x`/`0X` leader was matched.
+    fn hex_number(&mut self) -> Result {
+        if !NUMBER_HEX_CHARS.contains(self.current()) {
+            return Err(self.error("expected hex digit after '0x' leader"));
+        }
+        while NUMBER_HEX_CHARS.contains(self.current()) {
+            self.advance();
+        }
+        // as in number(): '-' ends the literal rather than being an invalid digit
+        if self.current() != '-' && NAME_CHARS.contains(self.current()) {
+            Err(self.error(format!("invalid hex digit '{}'", self.current())))
+        } else {
+            Ok(self.make_token(TokenKind::Number))
+        }
+    }
+
+    /// Lexes the digits of a binary literal; called after the `0b`/`0B` leader was matched.
+    fn bin_number(&mut self) -> Result {
+        if self.current() != '0' && self.current() != '1' {
+            return Err(self.error("expected binary digit after '0b' leader"));
+        }
+        while self.current() == '0' || self.current() == '1' {
+            self.advance();
+        }
+        // as in number(): '-' ends the literal rather than being an invalid digit
+        if self.current() != '-' && NAME_CHARS.contains(self.current()) {
+            Err(self.error(format!("invalid binary digit '{}'", self.current())))
+        } else {
+            Ok(self.make_token(TokenKind::Number))
+        }
     }
 
     fn string(&mut self) -> Result {
@@ -799,3 +862,61 @@ impl Parser {
         Ok(())
     }
 }
+
+#[cfg(test)]
+macro_rules! lexer_check {
+    ($lexer:expr, $kind:expr, $text:expr) => {{
+        let next = $lexer.next().unwrap();
+        assert_eq!(next.kind, $kind);
+        assert_eq!(next.text, $text);
+    }};
+}
+
+#[test]
+fn test_lexer_names() {
+    let input = "asdf fdsa the-quick-brown-fox jumped_over-the-lazy_dogs";
+    let mut lexer = Lexer::new(input.to_string(), ":testing:");
+
+    lexer_check!(lexer, TokenKind::Name, "asdf");
+    lexer_check!(lexer, TokenKind::Name, "fdsa");
+    lexer_check!(lexer, TokenKind::Name, "the-quick-brown-fox");
+    lexer_check!(lexer, TokenKind::Name, "jumped_over-the-lazy_dogs");
+    assert!(lexer.is_eof());
+}
+
+#[test]
+fn test_lexer_numbers() {
+    let input = "1 2 3 0 0.0 1.0 2.0 3.0 0x1 0xa 0xff 0xabcd 0xabcdef 0XDEADBEEF 0XDECAFDAD 0b0 0b1 0b010101 0B101010 0B00000";
+    let mut lexer = Lexer::new(input.to_string(), ":testing:");
+
+    lexer_check!(lexer, TokenKind::Number, "1");
+    lexer_check!(lexer, TokenKind::Number, "2");
+    lexer_check!(lexer, TokenKind::Number, "3");
+    lexer_check!(lexer, TokenKind::Number, "0");
+    lexer_check!(lexer, TokenKind::Number, "0.0");
+    lexer_check!(lexer, TokenKind::Number, "1.0");
+    lexer_check!(lexer, TokenKind::Number, "2.0");
+    lexer_check!(lexer, TokenKind::Number, "3.0");
+    lexer_check!(lexer, TokenKind::Number, "0x1");
+    lexer_check!(lexer, TokenKind::Number, "0xa");
+    lexer_check!(lexer, TokenKind::Number, "0xff");
+    lexer_check!(lexer, TokenKind::Number, "0xabcd");
+    lexer_check!(lexer, TokenKind::Number, "0xabcdef");
+    lexer_check!(lexer, TokenKind::Number, "0XDEADBEEF");
+    lexer_check!(lexer, TokenKind::Number, "0XDECAFDAD");
+    lexer_check!(lexer, TokenKind::Number, "0b0");
+    lexer_check!(lexer, TokenKind::Number, "0b1");
+    lexer_check!(lexer, TokenKind::Number, "0b010101");
+    lexer_check!(lexer, TokenKind::Number, "0B101010");
+    lexer_check!(lexer, TokenKind::Number, "0B00000");
+    assert!(lexer.is_eof());
+}
+
+#[test]
+fn test_lexer_number_before_minus() {
+    // '-' directly after a literal ends the number; it is not an invalid digit
+    for (input, expected) in [("10-20", "10"), ("0xff-1", "0xff"), ("0b10-1", "0b10")] {
+        let mut lexer = Lexer::new(input.to_string(), ":testing:");
+        lexer_check!(lexer, TokenKind::Number, expected);
+    }
+}
diff --git a/tests/int.npp b/tests/int.npp
index bd4fa3d..bdb4737 100644
--- a/tests/int.npp
+++ b/tests/int.npp
@@ -16,6 +16,8 @@ println(a + b)
 println(b + a)
 println(a + -b)
 println(-a + b)
+println(0xa + 1)
+println(0b10 + 0b10)
 
 # __sub__
 println("__sub__")
@@ -120,3 +122,4 @@ println(----1)
 println(10 - -20)
 println(-10 - 20)
 println(-10 - -20)
+println(-0xff)
diff --git a/tests/int.npp.expect b/tests/int.npp.expect
index 31e766b..4c044af 100644
--- a/tests/int.npp.expect
+++ b/tests/int.npp.expect
@@ -8,6 +8,8 @@ __add__
 30
 -10
 10
+11
+4
 __sub__
 -1
 1
@@ -90,3 +92,4 @@ __neg__
 30
 -30
 10
+-255