From 5868d4bd49a7b80a395f1ecabedeb0b8f4ddffce Mon Sep 17 00:00:00 2001
From: Vincent Ambo
Date: Mon, 18 Jan 2021 20:27:14 +0300
Subject: refactor(tazjin/rlox): Prepare scanner for shared use

In the book, the clox interpreter has its own scanner which uses a
pull-based model for a single pass compiler.

I can't be bothered to write another scanner, or amend this one into
pull-mode to work with the treewalk interpreter, so instead I will
just reuse it and pull from a vector of tokens. The tokens are shared
between both interpreters and the scanner is not what I'm interested
in here.

Change-Id: Ib07e89127fce2b047f9b3e1ff7e9908d798b3b2b
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2420
Reviewed-by: tazjin
Tested-by: BuildkiteCI
---
 users/tazjin/rlox/src/main.rs                 |   1 +
 users/tazjin/rlox/src/scanner.rs              | 284 ++++++++++++++++++++++++++
 users/tazjin/rlox/src/treewalk/errors.rs      |  18 ++
 users/tazjin/rlox/src/treewalk/interpreter.rs |   4 +-
 users/tazjin/rlox/src/treewalk/mod.rs         |   3 +-
 users/tazjin/rlox/src/treewalk/scanner.rs     | 283 -------------------------
 6 files changed, 308 insertions(+), 285 deletions(-)
 create mode 100644 users/tazjin/rlox/src/scanner.rs
 delete mode 100644 users/tazjin/rlox/src/treewalk/scanner.rs

diff --git a/users/tazjin/rlox/src/main.rs b/users/tazjin/rlox/src/main.rs
index c9cc96d2e6..3a956833c1 100644
--- a/users/tazjin/rlox/src/main.rs
+++ b/users/tazjin/rlox/src/main.rs
@@ -5,6 +5,7 @@ use std::io::Write;
 use std::process;
 
 mod bytecode;
+mod scanner;
 mod treewalk;
 
 /// Trait for making the different interpreters callable in the same
diff --git a/users/tazjin/rlox/src/scanner.rs b/users/tazjin/rlox/src/scanner.rs
new file mode 100644
index 0000000000..314b56d6d3
--- /dev/null
+++ b/users/tazjin/rlox/src/scanner.rs
@@ -0,0 +1,284 @@
+#[derive(Clone, Debug, PartialEq)]
+pub enum TokenKind {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One or two character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier(String),
+    String(String),
+    Number(f64),
+    True,
+    False,
+    Nil,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    Fun,
+    For,
+    If,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    Var,
+    While,
+
+    // Special things
+    Eof,
+}
+
+#[derive(Clone, Debug)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub lexeme: String,
+    pub line: usize,
+}
+
+pub enum ScannerError {
+    UnexpectedChar { line: usize, unexpected: char },
+    UnterminatedString { line: usize },
+}
+
+struct Scanner<'a> {
+    source: &'a [char],
+    tokens: Vec<Token>,
+    errors: Vec<ScannerError>,
+    start: usize,   // offset of first character in current lexeme
+    current: usize, // current offset into source
+    line: usize,    // current line in source
+}
+
+impl<'a> Scanner<'a> {
+    fn is_at_end(&self) -> bool {
+        return self.current >= self.source.len();
+    }
+
+    fn advance(&mut self) -> char {
+        self.current += 1;
+        self.source[self.current - 1]
+    }
+
+    fn add_token(&mut self, kind: TokenKind) {
+        let lexeme = &self.source[self.start..self.current];
+        self.tokens.push(Token {
+            kind,
+            lexeme: lexeme.into_iter().collect(),
+            line: self.line,
+        })
+    }
+
+    fn scan_token(&mut self) {
+        match self.advance() {
+            // simple single-character tokens
+            '(' => self.add_token(TokenKind::LeftParen),
+            ')' => self.add_token(TokenKind::RightParen),
+            '{' => self.add_token(TokenKind::LeftBrace),
+            '}' => self.add_token(TokenKind::RightBrace),
+            ',' => self.add_token(TokenKind::Comma),
+            '.' => self.add_token(TokenKind::Dot),
+            '-' => self.add_token(TokenKind::Minus),
+            '+' => self.add_token(TokenKind::Plus),
+            ';' => self.add_token(TokenKind::Semicolon),
+            '*' => self.add_token(TokenKind::Star),
+
+            // possible multi-character tokens
+            '!' => self.add_if_next('=', TokenKind::BangEqual, TokenKind::Bang),
+            '=' => self.add_if_next('=', TokenKind::EqualEqual, TokenKind::Equal),
+            '<' => self.add_if_next('=', TokenKind::LessEqual, TokenKind::Less),
+            '>' => self.add_if_next('=', TokenKind::GreaterEqual, TokenKind::Greater),
+
+            '/' => {
+                // support comments until EOL by discarding characters
+                if self.match_next('/') {
+                    while self.peek() != '\n' && !self.is_at_end() {
+                        self.advance();
+                    }
+                } else {
+                    self.add_token(TokenKind::Slash);
+                }
+            }
+
+            // ignore whitespace
+            ws if ws.is_whitespace() => {
+                if ws == '\n' {
+                    self.line += 1
+                }
+            }
+
+            '"' => self.scan_string(),
+
+            digit if digit.is_digit(10) => self.scan_number(),
+
+            chr if chr.is_alphabetic() || chr == '_' => self.scan_identifier(),
+
+            unexpected => self.errors.push(ScannerError::UnexpectedChar {
+                line: self.line,
+                unexpected,
+            }),
+        };
+    }
+
+    fn match_next(&mut self, expected: char) -> bool {
+        if self.is_at_end() || self.source[self.current] != expected {
+            false
+        } else {
+            self.current += 1;
+            true
+        }
+    }
+
+    fn add_if_next(&mut self, expected: char, then: TokenKind, or: TokenKind) {
+        if self.match_next(expected) {
+            self.add_token(then);
+        } else {
+            self.add_token(or);
+        }
+    }
+
+    fn peek(&self) -> char {
+        if self.is_at_end() {
+            return '\0';
+        } else {
+            return self.source[self.current];
+        }
+    }
+
+    fn peek_next(&self) -> char {
+        if self.current + 1 >= self.source.len() {
+            return '\0';
+        } else {
+            return self.source[self.current + 1];
+        }
+    }
+
+    fn scan_string(&mut self) {
+        while self.peek() != '"' && !self.is_at_end() {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+
+            self.advance();
+        }
+
+        if self.is_at_end() {
+            self.errors
+                .push(ScannerError::UnterminatedString { line: self.line });
+            return;
+        }
+
+        // closing '"'
+        self.advance();
+
+        // add token without surrounding quotes
+        let string: String = self.source[(self.start + 1)..(self.current - 1)]
+            .iter()
+            .collect();
+        self.add_token(TokenKind::String(string));
+    }
+
+    fn scan_number(&mut self) {
+        while self.peek().is_digit(10) {
+            self.advance();
+        }
+
+        // Look for a fractional part
+        if self.peek() == '.' && self.peek_next().is_digit(10) {
+            // consume '.'
+            self.advance();
+
+            while self.peek().is_digit(10) {
+                self.advance();
+            }
+        }
+
+        let num: f64 = self.source[self.start..self.current]
+            .iter()
+            .collect::<String>()
+            .parse()
+            .expect("float parsing should always work");
+
+        self.add_token(TokenKind::Number(num));
+    }
+
+    fn scan_identifier(&mut self) {
+        while self.peek().is_alphanumeric() || self.peek() == '_' {
+            self.advance();
+        }
+
+        let ident: String = self.source[self.start..self.current].iter().collect();
+
+        // Determine whether this is an identifier, or a keyword:
+        let token_kind = match ident.as_str() {
+            "and" => TokenKind::And,
+            "class" => TokenKind::Class,
+            "else" => TokenKind::Else,
+            "false" => TokenKind::False,
+            "for" => TokenKind::For,
+            "fun" => TokenKind::Fun,
+            "if" => TokenKind::If,
+            "nil" => TokenKind::Nil,
+            "or" => TokenKind::Or,
+            "print" => TokenKind::Print,
+            "return" => TokenKind::Return,
+            "super" => TokenKind::Super,
+            "this" => TokenKind::This,
+            "true" => TokenKind::True,
+            "var" => TokenKind::Var,
+            "while" => TokenKind::While,
+            _ => TokenKind::Identifier(ident),
+        };
+
+        self.add_token(token_kind);
+    }
+
+    fn scan_tokens(&mut self) {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token();
+        }
+
+        self.add_token(TokenKind::Eof);
+    }
+}
+
+pub fn scan<'a>(input: &'a [char]) -> Result<Vec<Token>, Vec<ScannerError>> {
+    let mut scanner = Scanner {
+        source: &input,
+        tokens: vec![],
+        errors: vec![],
+        start: 0,
+        current: 0,
+        line: 0,
+    };
+
+    scanner.scan_tokens();
+
+    if !scanner.errors.is_empty() {
+        return Err(scanner.errors);
+    }
+
+    return Ok(scanner.tokens);
+}
diff --git a/users/tazjin/rlox/src/treewalk/errors.rs b/users/tazjin/rlox/src/treewalk/errors.rs
index 54d2718eed..391663d51b 100644
--- a/users/tazjin/rlox/src/treewalk/errors.rs
+++ b/users/tazjin/rlox/src/treewalk/errors.rs
@@ -1,4 +1,6 @@
+use crate::scanner::ScannerError;
 use crate::treewalk::interpreter::Value;
+
 use std::fmt;
 
 #[derive(Debug)]
@@ -39,3 +41,19 @@ impl fmt::Display for Error {
         write!(f, "[line {}] Error: {:?}", self.line, self.kind)
     }
 }
+
+impl From<ScannerError> for Error {
+    fn from(err: ScannerError) -> Self {
+        match err {
+            ScannerError::UnexpectedChar { line, unexpected } => Error {
+                line,
+                kind: ErrorKind::UnexpectedChar(unexpected),
+            },
+
+            ScannerError::UnterminatedString { line } => Error {
+                line,
+                kind: ErrorKind::UnterminatedString,
+            },
+        }
+    }
+}
diff --git a/users/tazjin/rlox/src/treewalk/interpreter.rs b/users/tazjin/rlox/src/treewalk/interpreter.rs
index 1263e6cb81..3285775bbe 100644
--- a/users/tazjin/rlox/src/treewalk/interpreter.rs
+++ b/users/tazjin/rlox/src/treewalk/interpreter.rs
@@ -200,7 +200,9 @@ impl Lox for Interpreter {
 
     fn interpret(&mut self, code: String) -> Result<Value, Vec<Error>> {
         let chars: Vec<char> = code.chars().collect();
-        let mut program = scanner::scan(&chars).and_then(|tokens| parser::parse(tokens))?;
+        let mut program = scanner::scan(&chars)
+            .map_err(|errors| errors.into_iter().map(Into::into).collect())
+            .and_then(|tokens| parser::parse(tokens))?;
 
         let globals = self
             .env
diff --git a/users/tazjin/rlox/src/treewalk/mod.rs b/users/tazjin/rlox/src/treewalk/mod.rs
index d53bd13f8e..2d82b3320a 100644
--- a/users/tazjin/rlox/src/treewalk/mod.rs
+++ b/users/tazjin/rlox/src/treewalk/mod.rs
@@ -1,5 +1,6 @@
+use crate::scanner;
+
 mod errors;
 pub mod interpreter;
 mod parser;
 mod resolver;
-mod scanner;
diff --git a/users/tazjin/rlox/src/treewalk/scanner.rs b/users/tazjin/rlox/src/treewalk/scanner.rs
deleted file mode 100644
index af90754841..0000000000
--- a/users/tazjin/rlox/src/treewalk/scanner.rs
+++ /dev/null
@@ -1,283 +0,0 @@
-use crate::treewalk::errors::{Error, ErrorKind};
-
-#[derive(Clone, Debug, PartialEq)]
-pub enum TokenKind {
-    // Single-character tokens.
-    LeftParen,
-    RightParen,
-    LeftBrace,
-    RightBrace,
-    Comma,
-    Dot,
-    Minus,
-    Plus,
-    Semicolon,
-    Slash,
-    Star,
-
-    // One or two character tokens.
-    Bang,
-    BangEqual,
-    Equal,
-    EqualEqual,
-    Greater,
-    GreaterEqual,
-    Less,
-    LessEqual,
-
-    // Literals.
-    Identifier(String),
-    String(String),
-    Number(f64),
-    True,
-    False,
-    Nil,
-
-    // Keywords.
-    And,
-    Class,
-    Else,
-    Fun,
-    For,
-    If,
-    Or,
-    Print,
-    Return,
-    Super,
-    This,
-    Var,
-    While,
-
-    // Special things
-    Eof,
-}
-
-#[derive(Clone, Debug)]
-pub struct Token {
-    pub kind: TokenKind,
-    pub lexeme: String,
-    pub line: usize,
-}
-
-struct Scanner<'a> {
-    source: &'a [char],
-    tokens: Vec<Token>,
-    errors: Vec<Error>,
-    start: usize,   // offset of first character in current lexeme
-    current: usize, // current offset into source
-    line: usize,    // current line in source
-}
-
-impl<'a> Scanner<'a> {
-    fn is_at_end(&self) -> bool {
-        return self.current >= self.source.len();
-    }
-
-    fn advance(&mut self) -> char {
-        self.current += 1;
-        self.source[self.current - 1]
-    }
-
-    fn add_token(&mut self, kind: TokenKind) {
-        let lexeme = &self.source[self.start..self.current];
-        self.tokens.push(Token {
-            kind,
-            lexeme: lexeme.into_iter().collect(),
-            line: self.line,
-        })
-    }
-
-    fn scan_token(&mut self) {
-        match self.advance() {
-            // simple single-character tokens
-            '(' => self.add_token(TokenKind::LeftParen),
-            ')' => self.add_token(TokenKind::RightParen),
-            '{' => self.add_token(TokenKind::LeftBrace),
-            '}' => self.add_token(TokenKind::RightBrace),
-            ',' => self.add_token(TokenKind::Comma),
-            '.' => self.add_token(TokenKind::Dot),
-            '-' => self.add_token(TokenKind::Minus),
-            '+' => self.add_token(TokenKind::Plus),
-            ';' => self.add_token(TokenKind::Semicolon),
-            '*' => self.add_token(TokenKind::Star),
-
-            // possible multi-character tokens
-            '!' => self.add_if_next('=', TokenKind::BangEqual, TokenKind::Bang),
-            '=' => self.add_if_next('=', TokenKind::EqualEqual, TokenKind::Equal),
-            '<' => self.add_if_next('=', TokenKind::LessEqual, TokenKind::Less),
-            '>' => self.add_if_next('=', TokenKind::GreaterEqual, TokenKind::Greater),
-
-            '/' => {
-                // support comments until EOL by discarding characters
-                if self.match_next('/') {
-                    while self.peek() != '\n' && !self.is_at_end() {
-                        self.advance();
-                    }
-                } else {
-                    self.add_token(TokenKind::Slash);
-                }
-            }
-
-            // ignore whitespace
-            ws if ws.is_whitespace() => {
-                if ws == '\n' {
-                    self.line += 1
-                }
-            }
-
-            '"' => self.scan_string(),
-
-            digit if digit.is_digit(10) => self.scan_number(),
-
-            chr if chr.is_alphabetic() || chr == '_' => self.scan_identifier(),
-
-            unexpected => self.errors.push(Error {
-                line: self.line,
-                kind: ErrorKind::UnexpectedChar(unexpected),
-            }),
-        };
-    }
-
-    fn match_next(&mut self, expected: char) -> bool {
-        if self.is_at_end() || self.source[self.current] != expected {
-            false
-        } else {
-            self.current += 1;
-            true
-        }
-    }
-
-    fn add_if_next(&mut self, expected: char, then: TokenKind, or: TokenKind) {
-        if self.match_next(expected) {
-            self.add_token(then);
-        } else {
-            self.add_token(or);
-        }
-    }
-
-    fn peek(&self) -> char {
-        if self.is_at_end() {
-            return '\0';
-        } else {
-            return self.source[self.current];
-        }
-    }
-
-    fn peek_next(&self) -> char {
-        if self.current + 1 >= self.source.len() {
-            return '\0';
-        } else {
-            return self.source[self.current + 1];
-        }
-    }
-
-    fn scan_string(&mut self) {
-        while self.peek() != '"' && !self.is_at_end() {
-            if self.peek() == '\n' {
-                self.line += 1;
-            }
-
-            self.advance();
-        }
-
-        if self.is_at_end() {
-            self.errors.push(Error {
-                line: self.line,
-                kind: ErrorKind::UnterminatedString,
-            });
-            return;
-        }
-
-        // closing '"'
-        self.advance();
-
-        // add token without surrounding quotes
-        let string: String = self.source[(self.start + 1)..(self.current - 1)]
-            .iter()
-            .collect();
-        self.add_token(TokenKind::String(string));
-    }
-
-    fn scan_number(&mut self) {
-        while self.peek().is_digit(10) {
-            self.advance();
-        }
-
-        // Look for a fractional part
-        if self.peek() == '.' && self.peek_next().is_digit(10) {
-            // consume '.'
-            self.advance();
-
-            while self.peek().is_digit(10) {
-                self.advance();
-            }
-        }
-
-        let num: f64 = self.source[self.start..self.current]
-            .iter()
-            .collect::<String>()
-            .parse()
-            .expect("float parsing should always work");
-
-        self.add_token(TokenKind::Number(num));
-    }
-
-    fn scan_identifier(&mut self) {
-        while self.peek().is_alphanumeric() || self.peek() == '_' {
-            self.advance();
-        }
-
-        let ident: String = self.source[self.start..self.current].iter().collect();
-
-        // Determine whether this is an identifier, or a keyword:
-        let token_kind = match ident.as_str() {
-            "and" => TokenKind::And,
-            "class" => TokenKind::Class,
-            "else" => TokenKind::Else,
-            "false" => TokenKind::False,
-            "for" => TokenKind::For,
-            "fun" => TokenKind::Fun,
-            "if" => TokenKind::If,
-            "nil" => TokenKind::Nil,
-            "or" => TokenKind::Or,
-            "print" => TokenKind::Print,
-            "return" => TokenKind::Return,
-            "super" => TokenKind::Super,
-            "this" => TokenKind::This,
-            "true" => TokenKind::True,
-            "var" => TokenKind::Var,
-            "while" => TokenKind::While,
-            _ => TokenKind::Identifier(ident),
-        };
-
-        self.add_token(token_kind);
-    }
-
-    fn scan_tokens(&mut self) {
-        while !self.is_at_end() {
-            self.start = self.current;
-            self.scan_token();
-        }
-
-        self.add_token(TokenKind::Eof);
-    }
-}
-
-pub fn scan<'a>(input: &'a [char]) -> Result<Vec<Token>, Vec<Error>> {
-    let mut scanner = Scanner {
-        source: &input,
-        tokens: vec![],
-        errors: vec![],
-        start: 0,
-        current: 0,
-        line: 0,
-    };
-
-    scanner.scan_tokens();
-
-    if !scanner.errors.is_empty() {
-        return Err(scanner.errors);
-    }
-
-    return Ok(scanner.tokens);
-}
--
cgit 1.4.1
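
For readers following along: the commit message describes scanning eagerly into a Vec<Token> and letting the single-pass bytecode compiler "pull" tokens from that vector instead of from a pull-mode scanner. A minimal sketch of what such a consumer could look like is below. Only scanner::scan, Token, TokenKind and ScannerError come from the patch above; the Tokens cursor and the compile function are illustrative assumptions, not code from this change or from the depot.

// Sketch only: a pull-style cursor over the eagerly scanned token vector.
// `Tokens` and `compile` are hypothetical names, not part of this change.
use crate::scanner::{self, ScannerError, Token, TokenKind};

struct Tokens {
    tokens: Vec<Token>,
    offset: usize,
}

impl Tokens {
    // Look at the current token without consuming it.
    fn peek(&self) -> &Token {
        &self.tokens[self.offset]
    }

    // Consume and return the current token; never advances past the
    // trailing Eof token that scanner::scan always appends.
    fn advance(&mut self) -> Token {
        let token = self.tokens[self.offset].clone();
        if token.kind != TokenKind::Eof {
            self.offset += 1;
        }
        token
    }
}

fn compile(source: &str) -> Result<(), Vec<ScannerError>> {
    let chars: Vec<char> = source.chars().collect();
    let mut tokens = Tokens {
        tokens: scanner::scan(&chars)?,
        offset: 0,
    };

    // A real single-pass compiler would dispatch on token kinds here;
    // this loop only demonstrates the pull-style consumption.
    while tokens.peek().kind != TokenKind::Eof {
        tokens.advance();
    }

    Ok(())
}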