From 56e99a3083aa410d0a6c4fa71976add0008c2c70 Mon Sep 17 00:00:00 2001 From: Daniel Flanagan Date: Thu, 13 Feb 2025 17:13:28 -0600 Subject: [PATCH] chore: wip lexer overhaul --- Cargo.lock | 16 --- Cargo.toml | 1 - lyt/Cargo.toml | 2 - lyt/src/cli.rs | 37 ++++-- lyt/src/lexer.rs | 310 ++++++++++++++++++++++++++++++++------------- lyt/src/main.rs | 15 ++- lyt/src/parser.rs | 37 +++--- lyt/src/prelude.rs | 1 + 8 files changed, 283 insertions(+), 136 deletions(-) create mode 100644 lyt/src/prelude.rs diff --git a/Cargo.lock b/Cargo.lock index e315197..2a6a1de 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,22 +2,6 @@ # It is not intended for manual editing. version = 3 -[[package]] -name = "anyhow" -version = "1.0.95" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04" - -[[package]] -name = "lexopt" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401" - [[package]] name = "lyt" version = "0.1.0" -dependencies = [ - "anyhow", - "lexopt", -] diff --git a/Cargo.toml b/Cargo.toml index 242cfdf..f8606ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,4 +3,3 @@ resolver = "2" members = ["lyt"] [workspace.dependencies] -anyhow = "1.0.95" diff --git a/lyt/Cargo.toml b/lyt/Cargo.toml index a443ba3..5246ad8 100644 --- a/lyt/Cargo.toml +++ b/lyt/Cargo.toml @@ -4,5 +4,3 @@ version = "0.1.0" edition = "2021" [dependencies] -anyhow = { workspace = true } -lexopt = "0.3.0" diff --git a/lyt/src/cli.rs b/lyt/src/cli.rs index 2ef81e4..433e464 100644 --- a/lyt/src/cli.rs +++ b/lyt/src/cli.rs @@ -9,15 +9,30 @@ enum Subcommand { Help, } +#[derive(Debug)] +pub enum ParseArgsError { + InvalidSubcommand(String), + UnexpectedArgument(String), +} +impl std::error::Error for ParseArgsError {} +impl std::fmt::Display for ParseArgsError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ParseArgsError::InvalidSubcommand(c) => write!(f, "invalid subcommand: {}", c), + ParseArgsError::UnexpectedArgument(c) => write!(f, "unexpected argument: {}", c), + } + } +} + impl FromStr for Subcommand { - type Err = (); + type Err = ParseArgsError; fn from_str(input: &str) -> Result { // TODO: errors should show similar commands? match input { "help" => Ok(Subcommand::Help), "repl" => Ok(Subcommand::Repl), - _ => Err(()), + _ => Err(ParseArgsError::InvalidSubcommand(input.to_string())), } } } @@ -27,27 +42,25 @@ fn usage(exit_code: i32) -> ! { std::process::exit(exit_code); } -fn parse_global_args() -> Result { - use lexopt::prelude::*; - +fn parse_global_args() -> Result { let mut subcommand = None; + let mut args = std::env::args(); - let mut parser = lexopt::Parser::from_env(); - while let Some(arg) = parser.next()? { - match arg { - Short('h') | Long("help") => { + while let Some(arg) = args.next() { + match arg.as_str() { + "--help" | "-h" => { usage(0); } - Value(s) if subcommand.is_none() => { + s if subcommand.is_none() => { subcommand = Some(s.parse()?); } - _ => return Err(arg.unexpected()), + _ => return Err(ParseArgsError::UnexpectedArgument(arg.to_string())), } } return Ok(GlobalArgs { subcommand }); } -pub fn run() -> anyhow::Result<()> { +pub fn run() -> crate::Result<()> { Ok(()) } diff --git a/lyt/src/lexer.rs b/lyt/src/lexer.rs index 618b540..04db048 100644 --- a/lyt/src/lexer.rs +++ b/lyt/src/lexer.rs @@ -2,139 +2,275 @@ use std::{ iter::{self, from_fn}, num::ParseIntError, path::Path, + str::Chars, + sync::Arc, }; // TODO: tree_sitter ? #[derive(Debug, PartialEq)] -pub enum Token { +enum BareToken { Integer(i64), - Operator(char), + Plus, + Minus, + Star, + Slash, + Percent, OpenParen, CloseParen, -} - -#[derive(Debug, PartialEq, Clone, Copy)] -pub enum Source<'a> { - File(&'a Path), - Unknown, + NewLine, + EndOfFile, + // TODO: how to handle indentation? } #[derive(Debug, PartialEq)] -pub struct Position<'a> { - source: Source<'a>, // TODO: not certain how this will work in the real world, but I don't want all these positions taking up a bunch of memory unnecessarily as we lex and parse, so for now we deal with lifetimes so we can store references and have Source be cheap to clone everywhere +struct Token { + location: Option, // Not all tokens are associated with a location, such as EndOfFile + token: BareToken, +} + +#[derive(Debug, PartialEq, Clone, Default)] +pub enum Source { + #[default] + Unknown, + + File(Arc>), +} + +#[derive(Debug, PartialEq, Clone)] +struct Location { + source: Source, // TODO: not certain how this will work in the real world, but I don't want all these locations taking up a bunch of memory unnecessarily as we lex and parse, so for now we deal with lifetimes so we can store references and have Source be cheap to clone everywhere line: usize, + len: usize, col: usize, } #[derive(Debug, PartialEq)] -pub enum Error { +enum LexerError { // TODO: not a number error? ParseIntError(ParseIntError), Unexpected(char), } -pub struct Lexer { - num_tokens: usize, +#[derive(Debug, PartialEq)] +pub struct Error { + pub location: Location, + pub error: LexerError, } -impl Default for Lexer { - fn default() -> Self { - Self { num_tokens: 0 } - } +pub struct Lexer<'a> { + line: usize, + col: usize, + source: Source, + chars: Chars<'a>, + collected: Vec, + + done: bool, + sent_eof: bool, } -impl Lexer { - pub fn num_tokens(&self) -> usize { - self.num_tokens +impl<'a> Lexer<'a> { + fn new(code: &'a str, source: Source) -> Self { + let mut lexer = Lexer { + done: false, + sent_eof: false, + line: 1, + col: 0, + source: source.clone(), + chars: code.chars(), + collected: vec![], + }; + lexer.advance(); + lexer } - pub fn lex_str<'a>( - &mut self, - code: &'a str, - ) -> Result, Token)>, (Position<'a>, Error)> { - self.lex(code, None) - } - - pub fn lex<'a>( - &mut self, - code: &'a str, - source: Option>, - ) -> Result, Token)>, (Position<'a>, Error)> { - let source: Source = source.unwrap_or(Source::Unknown); - let mut result = vec![]; - let mut scanner = Self::walk(code).peekable(); - while let Some((line, col, c)) = scanner.next() { - let pos = move || Position { line, col, source }; - let err = |err: Error| Err((pos(), err)); - let token: Token = match c { - '(' => Token::OpenParen, - ')' => Token::CloseParen, - '\t' | ' ' => { - continue; - } - '0'..='9' => { - let s = iter::once(c) - .chain(from_fn(|| { - scanner - .by_ref() - .next_if(|(_, _, c)| c.is_ascii_digit()) - .map(|(_, _, c)| c) - })) - .collect::(); - let result = match s.parse() { - Ok(i) => Token::Integer(i), - Err(e) => return err(Error::ParseIntError(e)), - }; - result - } - '+' | '/' | '%' | '*' | '-' => Token::Operator(c), - c => return err(Error::Unexpected(c)), - }; - result.push((pos(), token)); - // an excuse for mutability at present - self.num_tokens += 1 + fn advance(&mut self) { + match self.chars.next() { + Some('\n') => { + self.line += 1; + self.col = 1; + self.collected.push('\n') + } + Some(c) => { + self.col += 1; + self.collected.push(c) + } + None => self.done = true, } - return Ok(result); } - fn walk(code: &str) -> impl Iterator + use<'_> { - Self::lines(code) - .map(|(line_num, line)| { - Self::cols(line).map(move |(col_num, c)| (line_num, col_num, c)) - }) - .flatten() + fn current_location(&self) -> Location { + Location { + source: self.source.clone(), + line: self.line, + len: self.collected.len(), + col: self.col, + } } - fn lines<'a>(code: &'a str) -> impl Iterator { - code.lines().enumerate().map(|(i, line)| (i + 1, line)) + fn produce(&mut self, token: BareToken) -> Token { + let t = Token { + location: Some(self.current_location()), + token, + }; + self.collected.clear(); + t } - fn cols(line: &str) -> impl Iterator + use<'_> { - line.chars().enumerate().map(|(i, c)| (i + 1, c)) + fn produce_error(&mut self, error: LexerError) -> Error { + Error { + location: self.current_location(), + error, + } + } + + fn token(&mut self) -> Result, Error> { + self.advance(); + let current = match self.collected.last() { + None => return Ok(None), + Some(c) => c, + }; + Ok(Some(match current { + c if c.is_ascii_digit() => self.number()?, + c => return Err(self.produce_error(LexerError::Unexpected(*c))), + })) + } + + fn number(&mut self) -> Result {} +} + +impl<'a> Iterator for Lexer<'a> { + type Item = Result; + + fn next(&mut self) -> Option { + if self.done && self.sent_eof { + return None; + } + if self.done && !self.sent_eof { + return Some(Ok(Token { + location: None, + token: BareToken::EndOfFile, + })); + } + match self.token() { + Ok(Some(t)) => Some(Ok(t)), + Ok(None) => None, + Err(e) => Some(Err(e)), + } } } +// pub fn lex_str(code: &str) -> Result, Error> { +// lex(code, None) +// } + +// pub fn lex(code: &str, source: Option) -> Result, Error> { +// let source: Source = source.unwrap_or_default(); +// let mut result: Vec = vec![]; +// let mut scanner = scanner(source.clone(), code).peekable(); +// loop { +// match Self::token(&mut scanner) { +// Ok(Some(t)) => { +// let end = t.token == BareToken::EndOfFile; +// result.push(t); +// if end { +// break; +// } +// } +// Ok(None) => {} +// Err(e) => return Err(e), +// } +// } +// return Ok(result); +// } + +// fn token() -> Result, Error> { +// let r = match scanner.next() { +// None => { +// return Ok(Some(Token { +// token: BareToken::EndOfFile, +// location: None, +// })) +// } +// Some(s) => s, +// }; +// match r.c { +// c if c.is_ascii_whitespace() => Ok(None), +// '\n' => t(BareToken::NewLine), +// '(' => t(BareToken::OpenParen), +// ')' => t(BareToken::CloseParen), +// '0'..='9' => Self::number(r.c, r.location, scanner), +// '+' => t(BareToken::Plus), +// '-' => t(BareToken::Minus), +// '*' => t(BareToken::Star), +// '/' => t(BareToken::Slash), +// '%' => t(BareToken::Percent), +// c => { +// return Err(Error { +// location: r.location, +// error: LexerError::Unexpected(c), +// }) +// } +// } +// } + +// fn number( +// first_digit: char, +// mut location: Location, +// scanner: &mut impl Iterator, +// ) -> Result { +// let mut scanner = scanner.peekable(); +// let s = iter::once(first_digit) +// .chain(from_fn(move || { +// scanner +// .by_ref() +// .next_if(|r| r.c.is_ascii_digit()) +// .map(|r| r.c) +// })) +// .collect::(); +// location.len += s.len(); +// let result = match s.parse() { +// Ok(i) => BareToken::Integer(i), +// Err(e) => { +// return Err(Error { +// error: LexerError::ParseIntError(e), +// location, +// }) +// } +// }; +// Ok() +// } + #[cfg(test)] mod test { use super::*; - #[test] - fn addition_operation() { - let pos = |col| Position { + fn pos(col: usize) -> Option { + Some(Location { line: 1, + len: 1, col, source: Source::Unknown, - }; - let mut lexer = Lexer::default(); + }) + } + + fn t(location: Option, token: BareToken) -> Token { + Token { location, token } + } + + #[test] + fn addition_operation() -> Result<(), Error> { + let lexer = Lexer::new("3 + 9", Source::Unknown); + let tokens: Result, Error> = lexer.collect(); assert_eq!( - lexer.lex_str("3 + 9"), + tokens, Ok(vec![ - (pos(1), Token::Integer(3)), - (pos(3), Token::Operator(ast::Operator::Add)), - (pos(5), Token::Integer(9)), - ]) + t(pos(1), BareToken::Integer(3)), + t(pos(3), BareToken::Plus), + t(pos(5), BareToken::Integer(9)), + ]), ); - assert_eq!(lexer.num_tokens(), 3); + assert_eq!(tokens?.len(), 3); + Ok(()) } } diff --git a/lyt/src/main.rs b/lyt/src/main.rs index 62ee518..a145cf2 100644 --- a/lyt/src/main.rs +++ b/lyt/src/main.rs @@ -2,7 +2,20 @@ mod ast; mod cli; mod lexer; mod parser; +mod prelude; -fn main() -> anyhow::Result<()> { +pub type StdError = dyn std::error::Error; + +#[derive(Debug)] +pub struct Error(Box); + +impl From for Error { + fn from(err: E) -> Self { + Error(Box::new(err)) + } +} +pub type Result = std::result::Result; + +fn main() -> Result<()> { Ok(cli::run()?) } diff --git a/lyt/src/parser.rs b/lyt/src/parser.rs index 47dc592..e98c3dc 100644 --- a/lyt/src/parser.rs +++ b/lyt/src/parser.rs @@ -1,3 +1,5 @@ +use std::{iter::Peekable, vec::IntoIter}; + use crate::{ast::*, lexer}; struct Parser { @@ -14,31 +16,33 @@ impl Default for Parser { #[derive(Debug, PartialEq)] pub enum Error { - Lexer((lexer::Position<'static>, lexer::Error)), - Unexpected((lexer::Position<'static>, lexer::Token)), + Lexer(lexer::Error), + Unexpected(lexer::Token), +} + +impl<'a> From for Error { + fn from(value: lexer::Error) -> Self { + Error::Lexer(value) + } } impl Parser { pub fn parse_str(&mut self, code: &'static str) -> Result { let mut lexer = lexer::Lexer::default(); - let tokens = lexer.lex_str(code).map_err(|e| Error::Lexer(e))?; - - Ok(self.parse(tokens)?) + let tokens = lexer.lex_str(code); + Ok(self.parse(tokens?)?) } - pub fn parse( + pub fn parse(&mut self, tokens: Vec) -> Result { + let iter = tokens.into_iter().peekable(); + self.expression(iter) + } + + fn expression( &mut self, - tokens: Vec<(lexer::Position, lexer::Token)>, + tokens: Peekable>, ) -> Result { - let mut iter = tokens.iter().peekable(); - while let Some((_p, token)) = iter.next() { - match iter.peek() { - Some(_) => Expression::Infix, - _ => {} - } - self.num_tokens_parsed += 1 - } - Ok(ast::Expression::Unit) + Ok(Expression::Unit) } pub fn num_tokens_parsed(&self) -> usize { @@ -49,7 +53,6 @@ impl Parser { #[cfg(test)] mod test { use super::*; - use ast::*; #[test] fn addition_operation() { diff --git a/lyt/src/prelude.rs b/lyt/src/prelude.rs new file mode 100644 index 0000000..9fb8bc3 --- /dev/null +++ b/lyt/src/prelude.rs @@ -0,0 +1 @@ +#![allow(unused_imports)]