chore: wip lexer overhaul

This commit is contained in:
Daniel Flanagan 2025-02-13 17:13:28 -06:00
parent b38642d869
commit 56e99a3083
8 changed files with 283 additions and 136 deletions

16
Cargo.lock generated
View file

@@ -2,22 +2,6 @@
# It is not intended for manual editing. # It is not intended for manual editing.
version = 3 version = 3
[[package]]
name = "anyhow"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "lexopt"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401"
[[package]] [[package]]
name = "lyt" name = "lyt"
version = "0.1.0" version = "0.1.0"
dependencies = [
"anyhow",
"lexopt",
]

View file

@@ -3,4 +3,3 @@ resolver = "2"
members = ["lyt"] members = ["lyt"]
[workspace.dependencies] [workspace.dependencies]
anyhow = "1.0.95"

View file

@@ -4,5 +4,3 @@ version = "0.1.0"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
anyhow = { workspace = true }
lexopt = "0.3.0"

View file

@@ -9,15 +9,30 @@ enum Subcommand {
Help, Help,
} }
#[derive(Debug)]
pub enum ParseArgsError {
InvalidSubcommand(String),
UnexpectedArgument(String),
}
impl std::error::Error for ParseArgsError {}
impl std::fmt::Display for ParseArgsError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
ParseArgsError::InvalidSubcommand(c) => write!(f, "invalid subcommand: {}", c),
ParseArgsError::UnexpectedArgument(c) => write!(f, "unexpected argument: {}", c),
}
}
}
impl FromStr for Subcommand { impl FromStr for Subcommand {
type Err = (); type Err = ParseArgsError;
fn from_str(input: &str) -> Result<Subcommand, Self::Err> { fn from_str(input: &str) -> Result<Subcommand, Self::Err> {
// TODO: errors should show similar commands? // TODO: errors should show similar commands?
match input { match input {
"help" => Ok(Subcommand::Help), "help" => Ok(Subcommand::Help),
"repl" => Ok(Subcommand::Repl), "repl" => Ok(Subcommand::Repl),
_ => Err(()), _ => Err(ParseArgsError::InvalidSubcommand(input.to_string())),
} }
} }
} }
@@ -27,27 +42,25 @@ fn usage(exit_code: i32) -> ! {
std::process::exit(exit_code); std::process::exit(exit_code);
} }
fn parse_global_args() -> Result<GlobalArgs, lexopt::Error> { fn parse_global_args() -> Result<GlobalArgs, ParseArgsError> {
use lexopt::prelude::*;
let mut subcommand = None; let mut subcommand = None;
let mut args = std::env::args();
let mut parser = lexopt::Parser::from_env(); while let Some(arg) = args.next() {
while let Some(arg) = parser.next()? { match arg.as_str() {
match arg { "--help" | "-h" => {
Short('h') | Long("help") => {
usage(0); usage(0);
} }
Value(s) if subcommand.is_none() => { s if subcommand.is_none() => {
subcommand = Some(s.parse()?); subcommand = Some(s.parse()?);
} }
_ => return Err(arg.unexpected()), _ => return Err(ParseArgsError::UnexpectedArgument(arg.to_string())),
} }
} }
return Ok(GlobalArgs { subcommand }); return Ok(GlobalArgs { subcommand });
} }
pub fn run() -> anyhow::Result<()> { pub fn run() -> crate::Result<()> {
Ok(()) Ok(())
} }

View file

@@ -2,139 +2,275 @@ use std::{
iter::{self, from_fn}, iter::{self, from_fn},
num::ParseIntError, num::ParseIntError,
path::Path, path::Path,
str::Chars,
sync::Arc,
}; };
// TODO: tree_sitter ? // TODO: tree_sitter ?
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum Token { enum BareToken {
Integer(i64), Integer(i64),
Operator(char), Plus,
Minus,
Star,
Slash,
Percent,
OpenParen, OpenParen,
CloseParen, CloseParen,
} NewLine,
EndOfFile,
#[derive(Debug, PartialEq, Clone, Copy)] // TODO: how to handle indentation?
pub enum Source<'a> {
File(&'a Path),
Unknown,
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub struct Position<'a> { struct Token {
source: Source<'a>, // TODO: not certain how this will work in the real world, but I don't want all these positions taking up a bunch of memory unnecessarily as we lex and parse, so for now we deal with lifetimes so we can store references and have Source be cheap to clone everywhere location: Option<Location>, // Not all tokens are associated with a location, such as EndOfFile
token: BareToken,
}
#[derive(Debug, PartialEq, Clone, Default)]
pub enum Source {
#[default]
Unknown,
File(Arc<Box<Path>>),
}
#[derive(Debug, PartialEq, Clone)]
struct Location {
source: Source, // TODO: not certain how this will work in the real world, but I don't want all these locations taking up a bunch of memory unnecessarily as we lex and parse, so for now we deal with lifetimes so we can store references and have Source be cheap to clone everywhere
line: usize, line: usize,
len: usize,
col: usize, col: usize,
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum Error { enum LexerError {
// TODO: not a number error? // TODO: not a number error?
ParseIntError(ParseIntError), ParseIntError(ParseIntError),
Unexpected(char), Unexpected(char),
} }
pub struct Lexer { #[derive(Debug, PartialEq)]
num_tokens: usize, pub struct Error {
pub location: Location,
pub error: LexerError,
} }
impl Default for Lexer { pub struct Lexer<'a> {
fn default() -> Self { line: usize,
Self { num_tokens: 0 } col: usize,
} source: Source,
chars: Chars<'a>,
collected: Vec<char>,
done: bool,
sent_eof: bool,
} }
impl Lexer { impl<'a> Lexer<'a> {
pub fn num_tokens(&self) -> usize { fn new(code: &'a str, source: Source) -> Self {
self.num_tokens let mut lexer = Lexer {
done: false,
sent_eof: false,
line: 1,
col: 0,
source: source.clone(),
chars: code.chars(),
collected: vec![],
};
lexer.advance();
lexer
} }
pub fn lex_str<'a>( fn advance(&mut self) {
&mut self, match self.chars.next() {
code: &'a str, Some('\n') => {
) -> Result<Vec<(Position<'a>, Token)>, (Position<'a>, Error)> { self.line += 1;
self.lex(code, None) self.col = 1;
} self.collected.push('\n')
}
pub fn lex<'a>( Some(c) => {
&mut self, self.col += 1;
code: &'a str, self.collected.push(c)
source: Option<Source<'a>>, }
) -> Result<Vec<(Position<'a>, Token)>, (Position<'a>, Error)> { None => self.done = true,
let source: Source = source.unwrap_or(Source::Unknown);
let mut result = vec![];
let mut scanner = Self::walk(code).peekable();
while let Some((line, col, c)) = scanner.next() {
let pos = move || Position { line, col, source };
let err = |err: Error| Err((pos(), err));
let token: Token = match c {
'(' => Token::OpenParen,
')' => Token::CloseParen,
'\t' | ' ' => {
continue;
}
'0'..='9' => {
let s = iter::once(c)
.chain(from_fn(|| {
scanner
.by_ref()
.next_if(|(_, _, c)| c.is_ascii_digit())
.map(|(_, _, c)| c)
}))
.collect::<String>();
let result = match s.parse() {
Ok(i) => Token::Integer(i),
Err(e) => return err(Error::ParseIntError(e)),
};
result
}
'+' | '/' | '%' | '*' | '-' => Token::Operator(c),
c => return err(Error::Unexpected(c)),
};
result.push((pos(), token));
// an excuse for mutability at present
self.num_tokens += 1
} }
return Ok(result);
} }
fn walk(code: &str) -> impl Iterator<Item = (usize, usize, char)> + use<'_> { fn current_location(&self) -> Location {
Self::lines(code) Location {
.map(|(line_num, line)| { source: self.source.clone(),
Self::cols(line).map(move |(col_num, c)| (line_num, col_num, c)) line: self.line,
}) len: self.collected.len(),
.flatten() col: self.col,
}
} }
fn lines<'a>(code: &'a str) -> impl Iterator<Item = (usize, &'a str)> { fn produce(&mut self, token: BareToken) -> Token {
code.lines().enumerate().map(|(i, line)| (i + 1, line)) let t = Token {
location: Some(self.current_location()),
token,
};
self.collected.clear();
t
} }
fn cols(line: &str) -> impl Iterator<Item = (usize, char)> + use<'_> { fn produce_error(&mut self, error: LexerError) -> Error {
line.chars().enumerate().map(|(i, c)| (i + 1, c)) Error {
location: self.current_location(),
error,
}
}
fn token(&mut self) -> Result<Option<Token>, Error> {
self.advance();
let current = match self.collected.last() {
None => return Ok(None),
Some(c) => c,
};
Ok(Some(match current {
c if c.is_ascii_digit() => self.number()?,
c => return Err(self.produce_error(LexerError::Unexpected(*c))),
}))
}
fn number(&mut self) -> Result<Token, Error> {}
}
impl<'a> Iterator for Lexer<'a> {
type Item = Result<Token, Error>;
fn next(&mut self) -> Option<Self::Item> {
if self.done && self.sent_eof {
return None;
}
if self.done && !self.sent_eof {
return Some(Ok(Token {
location: None,
token: BareToken::EndOfFile,
}));
}
match self.token() {
Ok(Some(t)) => Some(Ok(t)),
Ok(None) => None,
Err(e) => Some(Err(e)),
}
} }
} }
// pub fn lex_str(code: &str) -> Result<Vec<Token>, Error> {
// lex(code, None)
// }
// pub fn lex(code: &str, source: Option<Source>) -> Result<Vec<Token>, Error> {
// let source: Source = source.unwrap_or_default();
// let mut result: Vec<Token> = vec![];
// let mut scanner = scanner(source.clone(), code).peekable();
// loop {
// match Self::token(&mut scanner) {
// Ok(Some(t)) => {
// let end = t.token == BareToken::EndOfFile;
// result.push(t);
// if end {
// break;
// }
// }
// Ok(None) => {}
// Err(e) => return Err(e),
// }
// }
// return Ok(result);
// }
// fn token() -> Result<Option<Token>, Error> {
// let r = match scanner.next() {
// None => {
// return Ok(Some(Token {
// token: BareToken::EndOfFile,
// location: None,
// }))
// }
// Some(s) => s,
// };
// match r.c {
// c if c.is_ascii_whitespace() => Ok(None),
// '\n' => t(BareToken::NewLine),
// '(' => t(BareToken::OpenParen),
// ')' => t(BareToken::CloseParen),
// '0'..='9' => Self::number(r.c, r.location, scanner),
// '+' => t(BareToken::Plus),
// '-' => t(BareToken::Minus),
// '*' => t(BareToken::Star),
// '/' => t(BareToken::Slash),
// '%' => t(BareToken::Percent),
// c => {
// return Err(Error {
// location: r.location,
// error: LexerError::Unexpected(c),
// })
// }
// }
// }
// fn number(
// first_digit: char,
// mut location: Location,
// scanner: &mut impl Iterator<Item = ScannerEntry>,
// ) -> Result<Token, Error> {
// let mut scanner = scanner.peekable();
// let s = iter::once(first_digit)
// .chain(from_fn(move || {
// scanner
// .by_ref()
// .next_if(|r| r.c.is_ascii_digit())
// .map(|r| r.c)
// }))
// .collect::<String>();
// location.len += s.len();
// let result = match s.parse() {
// Ok(i) => BareToken::Integer(i),
// Err(e) => {
// return Err(Error {
// error: LexerError::ParseIntError(e),
// location,
// })
// }
// };
// Ok()
// }
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
#[test] fn pos(col: usize) -> Option<Location> {
fn addition_operation() { Some(Location {
let pos = |col| Position {
line: 1, line: 1,
len: 1,
col, col,
source: Source::Unknown, source: Source::Unknown,
}; })
let mut lexer = Lexer::default(); }
fn t(location: Option<Location>, token: BareToken) -> Token {
Token { location, token }
}
#[test]
fn addition_operation() -> Result<(), Error> {
let lexer = Lexer::new("3 + 9", Source::Unknown);
let tokens: Result<Vec<Token>, Error> = lexer.collect();
assert_eq!( assert_eq!(
lexer.lex_str("3 + 9"), tokens,
Ok(vec![ Ok(vec![
(pos(1), Token::Integer(3)), t(pos(1), BareToken::Integer(3)),
(pos(3), Token::Operator(ast::Operator::Add)), t(pos(3), BareToken::Plus),
(pos(5), Token::Integer(9)), t(pos(5), BareToken::Integer(9)),
]) ]),
); );
assert_eq!(lexer.num_tokens(), 3); assert_eq!(tokens?.len(), 3);
Ok(())
} }
} }

View file

@@ -2,7 +2,20 @@ mod ast;
mod cli; mod cli;
mod lexer; mod lexer;
mod parser; mod parser;
mod prelude;
fn main() -> anyhow::Result<()> { pub type StdError = dyn std::error::Error;
#[derive(Debug)]
pub struct Error(Box<StdError>);
impl<E: std::error::Error + 'static> From<E> for Error {
fn from(err: E) -> Self {
Error(Box::new(err))
}
}
pub type Result<T> = std::result::Result<T, Error>;
fn main() -> Result<()> {
Ok(cli::run()?) Ok(cli::run()?)
} }

View file

@@ -1,3 +1,5 @@
use std::{iter::Peekable, vec::IntoIter};
use crate::{ast::*, lexer}; use crate::{ast::*, lexer};
struct Parser { struct Parser {
@@ -14,31 +16,33 @@ impl Default for Parser {
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub enum Error { pub enum Error {
Lexer((lexer::Position<'static>, lexer::Error)), Lexer(lexer::Error),
Unexpected((lexer::Position<'static>, lexer::Token)), Unexpected(lexer::Token),
}
impl<'a> From<lexer::Error> for Error {
fn from(value: lexer::Error) -> Self {
Error::Lexer(value)
}
} }
impl Parser { impl Parser {
pub fn parse_str(&mut self, code: &'static str) -> Result<Expression, Error> { pub fn parse_str(&mut self, code: &'static str) -> Result<Expression, Error> {
let mut lexer = lexer::Lexer::default(); let mut lexer = lexer::Lexer::default();
let tokens = lexer.lex_str(code).map_err(|e| Error::Lexer(e))?; let tokens = lexer.lex_str(code);
Ok(self.parse(tokens?)?)
Ok(self.parse(tokens)?)
} }
pub fn parse( pub fn parse(&mut self, tokens: Vec<lexer::Token>) -> Result<Expression, Error> {
let iter = tokens.into_iter().peekable();
self.expression(iter)
}
fn expression(
&mut self, &mut self,
tokens: Vec<(lexer::Position, lexer::Token)>, tokens: Peekable<IntoIter<lexer::Token>>,
) -> Result<Expression, Error> { ) -> Result<Expression, Error> {
let mut iter = tokens.iter().peekable(); Ok(Expression::Unit)
while let Some((_p, token)) = iter.next() {
match iter.peek() {
Some(_) => Expression::Infix,
_ => {}
}
self.num_tokens_parsed += 1
}
Ok(ast::Expression::Unit)
} }
pub fn num_tokens_parsed(&self) -> usize { pub fn num_tokens_parsed(&self) -> usize {
@@ -49,7 +53,6 @@ impl Parser {
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;
use ast::*;
#[test] #[test]
fn addition_operation() { fn addition_operation() {

1
lyt/src/prelude.rs Normal file
View file

@@ -0,0 +1 @@
#![allow(unused_imports)]