chore: wip lexer overhaul

This commit is contained in:
Daniel Flanagan 2025-02-13 17:13:28 -06:00
parent b38642d869
commit 56e99a3083
8 changed files with 283 additions and 136 deletions

16
Cargo.lock generated
View file

@ -2,22 +2,6 @@
# It is not intended for manual editing.
version = 3
[[package]]
name = "anyhow"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "lexopt"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baff4b617f7df3d896f97fe922b64817f6cd9a756bb81d40f8883f2f66dcb401"
[[package]]
name = "lyt"
version = "0.1.0"
dependencies = [
"anyhow",
"lexopt",
]

View file

@ -3,4 +3,3 @@ resolver = "2"
members = ["lyt"]
[workspace.dependencies]
anyhow = "1.0.95"

View file

@ -4,5 +4,3 @@ version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = { workspace = true }
lexopt = "0.3.0"

View file

@ -9,15 +9,30 @@ enum Subcommand {
Help,
}
/// Errors produced while parsing command-line arguments.
#[derive(Debug)]
pub enum ParseArgsError {
    /// The first free argument did not name a known subcommand.
    InvalidSubcommand(String),
    /// An argument appeared after the subcommand was already chosen.
    UnexpectedArgument(String),
}

impl std::error::Error for ParseArgsError {}

impl std::fmt::Display for ParseArgsError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Destructure once so both arms share the "<label>: <value>" shape.
        let (label, value) = match self {
            ParseArgsError::InvalidSubcommand(s) => ("invalid subcommand", s),
            ParseArgsError::UnexpectedArgument(s) => ("unexpected argument", s),
        };
        write!(f, "{}: {}", label, value)
    }
}
impl FromStr for Subcommand {
type Err = ();
type Err = ParseArgsError;
fn from_str(input: &str) -> Result<Subcommand, Self::Err> {
// TODO: errors should show similar commands?
match input {
"help" => Ok(Subcommand::Help),
"repl" => Ok(Subcommand::Repl),
_ => Err(()),
_ => Err(ParseArgsError::InvalidSubcommand(input.to_string())),
}
}
}
@ -27,27 +42,25 @@ fn usage(exit_code: i32) -> ! {
std::process::exit(exit_code);
}
fn parse_global_args() -> Result<GlobalArgs, lexopt::Error> {
use lexopt::prelude::*;
fn parse_global_args() -> Result<GlobalArgs, ParseArgsError> {
let mut subcommand = None;
let mut args = std::env::args();
let mut parser = lexopt::Parser::from_env();
while let Some(arg) = parser.next()? {
match arg {
Short('h') | Long("help") => {
while let Some(arg) = args.next() {
match arg.as_str() {
"--help" | "-h" => {
usage(0);
}
Value(s) if subcommand.is_none() => {
s if subcommand.is_none() => {
subcommand = Some(s.parse()?);
}
_ => return Err(arg.unexpected()),
_ => return Err(ParseArgsError::UnexpectedArgument(arg.to_string())),
}
}
return Ok(GlobalArgs { subcommand });
}
pub fn run() -> anyhow::Result<()> {
pub fn run() -> crate::Result<()> {
Ok(())
}

View file

@ -2,139 +2,275 @@ use std::{
iter::{self, from_fn},
num::ParseIntError,
path::Path,
str::Chars,
sync::Arc,
};
// TODO: tree_sitter ?
/// The raw token kinds produced by the lexer, with no source location.
///
/// Fixed: removed the removed-side diff residue (`pub enum Token`,
/// `Operator(char)`, and the old `Source<'a>` lines) interleaved into this
/// enum by the diff view.
#[derive(Debug, PartialEq)]
enum BareToken {
    Integer(i64),
    Plus,
    Minus,
    Star,
    Slash,
    Percent,
    OpenParen,
    CloseParen,
    NewLine,
    EndOfFile,
    // TODO: how to handle indentation?
}
#[derive(Debug, PartialEq)]
pub struct Position<'a> {
source: Source<'a>, // TODO: not certain how this will work in the real world, but I don't want all these positions taking up a bunch of memory unnecessarily as we lex and parse, so for now we deal with lifetimes so we can store references and have Source be cheap to clone everywhere
struct Token {
location: Option<Location>, // Not all tokens are associated with a location, such as EndOfFile
token: BareToken,
}
/// Where a piece of source text came from; cloned into every `Location`,
/// so it must stay cheap to clone.
#[derive(Debug, PartialEq, Clone, Default)]
pub enum Source {
    /// Origin unknown (e.g. a REPL line or a test string).
    #[default]
    Unknown,
    /// Code read from a file on disk.
    // NOTE(review): `Arc<Box<Path>>` double-indirects — `Arc<Path>` alone is
    // already cheaply clonable; consider simplifying (changes the variant's
    // payload type, so all constructors must be updated together).
    File(Arc<Box<Path>>),
}
/// A span in the source text.
#[derive(Debug, PartialEq, Clone)]
struct Location {
    source: Source, // one clone per Location; Source must stay cheap to clone (the old lifetime-based design was dropped in this refactor)
    line: usize, // 1-based line (see Lexer::new, which starts at 1)
    len: usize,  // number of chars the span covers
    col: usize,  // column within the line
}
/// The kinds of errors the lexer can produce, without location info
/// (the location is attached by the `Error` wrapper struct).
///
/// Fixed: removed the removed-side `pub enum Error {` line the diff view
/// interleaved into this definition.
#[derive(Debug, PartialEq)]
enum LexerError {
    // TODO: not a number error?
    /// An integer literal failed to parse into `i64`.
    ParseIntError(ParseIntError),
    /// A character the lexer has no rule for.
    Unexpected(char),
}
pub struct Lexer {
num_tokens: usize,
/// A lexer error together with the location at which it occurred.
#[derive(Debug, PartialEq)]
pub struct Error {
    pub location: Location,
    pub error: LexerError,
}
impl Default for Lexer {
fn default() -> Self {
Self { num_tokens: 0 }
/// Streaming lexer over a borrowed source string; yields tokens via its
/// `Iterator` impl.
pub struct Lexer<'a> {
    line: usize, // current line, 1-based (starts at 1 in `new`)
    col: usize, // column of the last-consumed char; 0 before any char on the first line
    source: Source, // origin tag cloned into every produced Location
    chars: Chars<'a>, // remaining unconsumed input
    collected: Vec<char>, // chars consumed since the last produced token
    done: bool, // set by `advance` when the input is exhausted
    sent_eof: bool, // whether the trailing EndOfFile token has been emitted
}
impl<'a> Lexer<'a> {
/// Creates a lexer over `code`, tagging produced locations with `source`.
///
/// Fixed: `source` is an owned parameter that was not used again, so the
/// previous `source.clone()` was a redundant allocation — move it instead.
fn new(code: &'a str, source: Source) -> Self {
    let mut lexer = Lexer {
        done: false,
        sent_eof: false,
        line: 1,
        col: 0,
        source,
        chars: code.chars(),
        collected: vec![],
    };
    // Prime with the first character so `collected` holds it before the
    // first `token()` call.
    // NOTE(review): `token()` also calls `advance()` before inspecting
    // `collected` — verify the hand-off between this priming advance and
    // `token()`; as written the first char appears to be skipped over.
    lexer.advance();
    lexer
}
/// Consumes one character: tracks line/column, appends the char to
/// `collected`, and sets `done` when the input is exhausted.
fn advance(&mut self) {
    match self.chars.next() {
        Some('\n') => {
            self.line += 1;
            // BUG FIX: reset to 0, not 1. `col` is incremented *before*
            // use (see the Some(c) arm), and `new` starts it at 0, so the
            // first char of line 1 gets col 1 — with the old `col = 1`
            // every later line's first char got col 2 instead.
            self.col = 0;
            self.collected.push('\n')
        }
        Some(c) => {
            self.col += 1;
            self.collected.push(c)
        }
        None => self.done = true,
    }
}
impl Lexer {
pub fn num_tokens(&self) -> usize {
self.num_tokens
/// The location of the token currently accumulated in `collected`.
// NOTE(review): `col` is the column of the *last* consumed char while
// `len` spans all of `collected` — confirm which end of the token the
// column is meant to point at (the test expects the token's first char).
fn current_location(&self) -> Location {
    Location {
        source: self.source.clone(),
        line: self.line,
        len: self.collected.len(),
        col: self.col,
    }
}
pub fn lex_str<'a>(
&mut self,
code: &'a str,
) -> Result<Vec<(Position<'a>, Token)>, (Position<'a>, Error)> {
self.lex(code, None)
/// Wraps `token` with the current location and resets the accumulator so
/// the next token starts from an empty `collected`.
fn produce(&mut self, token: BareToken) -> Token {
    // Capture the location before clearing: it is derived from `collected`,
    // which is about to be reset.
    let location = Some(self.current_location());
    self.collected.clear();
    Token { location, token }
}
pub fn lex<'a>(
&mut self,
code: &'a str,
source: Option<Source<'a>>,
) -> Result<Vec<(Position<'a>, Token)>, (Position<'a>, Error)> {
let source: Source = source.unwrap_or(Source::Unknown);
let mut result = vec![];
let mut scanner = Self::walk(code).peekable();
while let Some((line, col, c)) = scanner.next() {
let pos = move || Position { line, col, source };
let err = |err: Error| Err((pos(), err));
let token: Token = match c {
'(' => Token::OpenParen,
')' => Token::CloseParen,
'\t' | ' ' => {
continue;
fn produce_error(&mut self, error: LexerError) -> Error {
Error {
location: self.current_location(),
error,
}
'0'..='9' => {
let s = iter::once(c)
.chain(from_fn(|| {
scanner
.by_ref()
.next_if(|(_, _, c)| c.is_ascii_digit())
.map(|(_, _, c)| c)
}
/// Lexes one token. `Ok(None)` means nothing has been collected yet.
///
/// Fixed: (a) removed the removed-side `.collect::<String>()`/parse lines
/// the diff view interleaved here; (b) copy the char out of `collected`
/// instead of holding `&char` — the borrow of `self.collected` would
/// otherwise still be live when the digit arm calls `self.number()`
/// (`&mut self`), which does not compile.
fn token(&mut self) -> Result<Option<Token>, Error> {
    self.advance();
    let current = match self.collected.last() {
        None => return Ok(None),
        Some(c) => *c, // copy: releases the borrow of self.collected
    };
    Ok(Some(match current {
        c if c.is_ascii_digit() => self.number()?,
        // TODO(wip): whitespace, newlines, operators, and parens are not
        // handled yet — everything non-digit is currently an error.
        c => return Err(self.produce_error(LexerError::Unexpected(c))),
    }))
}
'+' | '/' | '%' | '*' | '-' => Token::Operator(c),
c => return err(Error::Unexpected(c)),
};
result.push((pos(), token));
// an excuse for mutability at present
self.num_tokens += 1
}
return Ok(result);
}
fn walk(code: &str) -> impl Iterator<Item = (usize, usize, char)> + use<'_> {
Self::lines(code)
.map(|(line_num, line)| {
Self::cols(line).map(move |(col_num, c)| (line_num, col_num, c))
})
.flatten()
fn number(&mut self) -> Result<Token, Error> {}
}
fn lines<'a>(code: &'a str) -> impl Iterator<Item = (usize, &'a str)> {
code.lines().enumerate().map(|(i, line)| (i + 1, line))
impl<'a> Iterator for Lexer<'a> {
    type Item = Result<Token, Error>;

    /// Yields tokens until the input is exhausted, then exactly one
    /// `EndOfFile`, then `None`.
    fn next(&mut self) -> Option<Self::Item> {
        if self.done && self.sent_eof {
            return None;
        }
        if self.done && !self.sent_eof {
            // BUG FIX: record that EOF has been emitted. Without this flag
            // update the branch above is unreachable and the iterator
            // yields EndOfFile forever — `collect()` would never terminate.
            self.sent_eof = true;
            return Some(Ok(Token {
                location: None, // EndOfFile has no source location
                token: BareToken::EndOfFile,
            }));
        }
        match self.token() {
            Ok(Some(t)) => Some(Ok(t)),
            // NOTE(review): `Ok(None)` ends iteration without ever emitting
            // EndOfFile — confirm whether it should instead fall through to
            // the EOF branch on the next call.
            Ok(None) => None,
            Err(e) => Some(Err(e)),
        }
    }
}
fn cols(line: &str) -> impl Iterator<Item = (usize, char)> + use<'_> {
line.chars().enumerate().map(|(i, c)| (i + 1, c))
}
}
// pub fn lex_str(code: &str) -> Result<Vec<Token>, Error> {
// lex(code, None)
// }
// pub fn lex(code: &str, source: Option<Source>) -> Result<Vec<Token>, Error> {
// let source: Source = source.unwrap_or_default();
// let mut result: Vec<Token> = vec![];
// let mut scanner = scanner(source.clone(), code).peekable();
// loop {
// match Self::token(&mut scanner) {
// Ok(Some(t)) => {
// let end = t.token == BareToken::EndOfFile;
// result.push(t);
// if end {
// break;
// }
// }
// Ok(None) => {}
// Err(e) => return Err(e),
// }
// }
// return Ok(result);
// }
// fn token() -> Result<Option<Token>, Error> {
// let r = match scanner.next() {
// None => {
// return Ok(Some(Token {
// token: BareToken::EndOfFile,
// location: None,
// }))
// }
// Some(s) => s,
// };
// match r.c {
// c if c.is_ascii_whitespace() => Ok(None),
// '\n' => t(BareToken::NewLine),
// '(' => t(BareToken::OpenParen),
// ')' => t(BareToken::CloseParen),
// '0'..='9' => Self::number(r.c, r.location, scanner),
// '+' => t(BareToken::Plus),
// '-' => t(BareToken::Minus),
// '*' => t(BareToken::Star),
// '/' => t(BareToken::Slash),
// '%' => t(BareToken::Percent),
// c => {
// return Err(Error {
// location: r.location,
// error: LexerError::Unexpected(c),
// })
// }
// }
// }
// fn number(
// first_digit: char,
// mut location: Location,
// scanner: &mut impl Iterator<Item = ScannerEntry>,
// ) -> Result<Token, Error> {
// let mut scanner = scanner.peekable();
// let s = iter::once(first_digit)
// .chain(from_fn(move || {
// scanner
// .by_ref()
// .next_if(|r| r.c.is_ascii_digit())
// .map(|r| r.c)
// }))
// .collect::<String>();
// location.len += s.len();
// let result = match s.parse() {
// Ok(i) => BareToken::Integer(i),
// Err(e) => {
// return Err(Error {
// error: LexerError::ParseIntError(e),
// location,
// })
// }
// };
// Ok()
// }
#[cfg(test)]
mod test {
use super::*;
#[test]
fn addition_operation() {
let pos = |col| Position {
fn pos(col: usize) -> Option<Location> {
Some(Location {
line: 1,
len: 1,
col,
source: Source::Unknown,
};
let mut lexer = Lexer::default();
})
}
/// Test shorthand for building an expected `Token`.
fn t(location: Option<Location>, token: BareToken) -> Token {
    Token { location, token }
}
#[test]
fn addition_operation() -> Result<(), Error> {
    // Fixed: removed the removed-side assertion lines (`lexer.lex_str`,
    // `Token::Operator(ast::Operator::Add)`, `num_tokens()`) the diff view
    // interleaved into this test.
    let lexer = Lexer::new("3 + 9", Source::Unknown);
    let tokens: Result<Vec<Token>, Error> = lexer.collect();
    // NOTE(review): as the lexer stands (wip) this expectation cannot pass —
    // spaces are lexed as `Unexpected`, `+` is not handled, and a trailing
    // EndOfFile token is emitted; the expected vec will need updating once
    // those are implemented.
    assert_eq!(
        tokens,
        Ok(vec![
            t(pos(1), BareToken::Integer(3)),
            t(pos(3), BareToken::Plus),
            t(pos(5), BareToken::Integer(9)),
        ]),
    );
    assert_eq!(tokens?.len(), 3);
    Ok(())
}
}

View file

@ -2,7 +2,20 @@ mod ast;
mod cli;
mod lexer;
mod parser;
mod prelude;
fn main() -> anyhow::Result<()> {
/// Alias for the boxed-able error trait object.
pub type StdError = dyn std::error::Error;

/// Minimal anyhow-style catch-all error. The blanket `From` below converts
/// any concrete `std::error::Error` into it, which is what makes `?` work
/// in functions returning [`Result`].
#[derive(Debug)]
pub struct Error(Box<StdError>);

impl<E: std::error::Error + 'static> From<E> for Error {
    fn from(source: E) -> Self {
        Self(Box::new(source))
    }
}

/// Crate-wide result type with the error defaulted to [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
/// Binary entry point: delegates to the CLI.
// NOTE(review): `Ok(cli::run()?)` is a no-op unwrap/re-wrap since `run`
// already returns `crate::Result<()>` — could be `cli::run()` directly.
fn main() -> Result<()> {
    Ok(cli::run()?)
}

View file

@ -1,3 +1,5 @@
use std::{iter::Peekable, vec::IntoIter};
use crate::{ast::*, lexer};
struct Parser {
@ -14,31 +16,33 @@ impl Default for Parser {
#[derive(Debug, PartialEq)]
pub enum Error {
Lexer((lexer::Position<'static>, lexer::Error)),
Unexpected((lexer::Position<'static>, lexer::Token)),
Lexer(lexer::Error),
Unexpected(lexer::Token),
}
impl<'a> From<lexer::Error> for Error {
fn from(value: lexer::Error) -> Self {
Error::Lexer(value)
}
}
impl Parser {
pub fn parse_str(&mut self, code: &'static str) -> Result<Expression, Error> {
let mut lexer = lexer::Lexer::default();
let tokens = lexer.lex_str(code).map_err(|e| Error::Lexer(e))?;
Ok(self.parse(tokens)?)
let tokens = lexer.lex_str(code);
Ok(self.parse(tokens?)?)
}
pub fn parse(
pub fn parse(&mut self, tokens: Vec<lexer::Token>) -> Result<Expression, Error> {
let iter = tokens.into_iter().peekable();
self.expression(iter)
}
/// Consumes tokens to build an expression.
///
/// NOTE(review): mid-refactor reconstruction — the diff interleaved the old
/// tuple-based loop (which no longer matches the new `lexer::Token` shape)
/// with the new signature. Preserved the observable old behavior of
/// counting consumed tokens into `num_tokens_parsed` and yielding `Unit`;
/// real expression parsing is still TODO.
fn expression(
    &mut self,
    mut tokens: Peekable<IntoIter<lexer::Token>>,
) -> Result<Expression, Error> {
    while tokens.next().is_some() {
        self.num_tokens_parsed += 1;
    }
    Ok(Expression::Unit)
}
pub fn num_tokens_parsed(&self) -> usize {
@ -49,7 +53,6 @@ impl Parser {
#[cfg(test)]
mod test {
use super::*;
use ast::*;
#[test]
fn addition_operation() {

1
lyt/src/prelude.rs Normal file
View file

@ -0,0 +1 @@
#![allow(unused_imports)]