From b1d0e22b1f5fe907ba3d48931e5a38b9a75b0dcf Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Sun, 17 Jan 2021 21:13:57 +0300 Subject: chore(tazjin/rlox): Move other modules under treewalk:: It's unclear if the second part of the book can reuse anything from the first part (I'm guessing probably the scanner, but I'll move that back if it turns out to be the case). Change-Id: I9411355929e31ac6e953599e51665406b1f48d55 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2415 Reviewed-by: tazjin Tested-by: BuildkiteCI --- users/tazjin/rlox/src/treewalk/scanner.rs | 283 ++++++++++++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 users/tazjin/rlox/src/treewalk/scanner.rs (limited to 'users/tazjin/rlox/src/treewalk/scanner.rs') diff --git a/users/tazjin/rlox/src/treewalk/scanner.rs b/users/tazjin/rlox/src/treewalk/scanner.rs new file mode 100644 index 000000000000..af9075484145 --- /dev/null +++ b/users/tazjin/rlox/src/treewalk/scanner.rs @@ -0,0 +1,283 @@ +use crate::treewalk::errors::{Error, ErrorKind}; + +#[derive(Clone, Debug, PartialEq)] +pub enum TokenKind { + // Single-character tokens. + LeftParen, + RightParen, + LeftBrace, + RightBrace, + Comma, + Dot, + Minus, + Plus, + Semicolon, + Slash, + Star, + + // One or two character tokens. + Bang, + BangEqual, + Equal, + EqualEqual, + Greater, + GreaterEqual, + Less, + LessEqual, + + // Literals. + Identifier(String), + String(String), + Number(f64), + True, + False, + Nil, + + // Keywords. + And, + Class, + Else, + Fun, + For, + If, + Or, + Print, + Return, + Super, + This, + Var, + While, + + // Special things + Eof, +} + +#[derive(Clone, Debug)] +pub struct Token { + pub kind: TokenKind, + pub lexeme: String, + pub line: usize, +} + +struct Scanner<'a> { + source: &'a [char], + tokens: Vec, + errors: Vec, + start: usize, // offset of first character in current lexeme + current: usize, // current offset into source + line: usize, // current line in source +} + +impl<'a> Scanner<'a> { + fn is_at_end(&self) -> bool { + return self.current >= self.source.len(); + } + + fn advance(&mut self) -> char { + self.current += 1; + self.source[self.current - 1] + } + + fn add_token(&mut self, kind: TokenKind) { + let lexeme = &self.source[self.start..self.current]; + self.tokens.push(Token { + kind, + lexeme: lexeme.into_iter().collect(), + line: self.line, + }) + } + + fn scan_token(&mut self) { + match self.advance() { + // simple single-character tokens + '(' => self.add_token(TokenKind::LeftParen), + ')' => self.add_token(TokenKind::RightParen), + '{' => self.add_token(TokenKind::LeftBrace), + '}' => self.add_token(TokenKind::RightBrace), + ',' => self.add_token(TokenKind::Comma), + '.' => self.add_token(TokenKind::Dot), + '-' => self.add_token(TokenKind::Minus), + '+' => self.add_token(TokenKind::Plus), + ';' => self.add_token(TokenKind::Semicolon), + '*' => self.add_token(TokenKind::Star), + + // possible multi-character tokens + '!' => self.add_if_next('=', TokenKind::BangEqual, TokenKind::Bang), + '=' => self.add_if_next('=', TokenKind::EqualEqual, TokenKind::Equal), + '<' => self.add_if_next('=', TokenKind::LessEqual, TokenKind::Less), + '>' => self.add_if_next('=', TokenKind::GreaterEqual, TokenKind::Greater), + + '/' => { + // support comments until EOL by discarding characters + if self.match_next('/') { + while self.peek() != '\n' && !self.is_at_end() { + self.advance(); + } + } else { + self.add_token(TokenKind::Slash); + } + } + + // ignore whitespace + ws if ws.is_whitespace() => { + if ws == '\n' { + self.line += 1 + } + } + + '"' => self.scan_string(), + + digit if digit.is_digit(10) => self.scan_number(), + + chr if chr.is_alphabetic() || chr == '_' => self.scan_identifier(), + + unexpected => self.errors.push(Error { + line: self.line, + kind: ErrorKind::UnexpectedChar(unexpected), + }), + }; + } + + fn match_next(&mut self, expected: char) -> bool { + if self.is_at_end() || self.source[self.current] != expected { + false + } else { + self.current += 1; + true + } + } + + fn add_if_next(&mut self, expected: char, then: TokenKind, or: TokenKind) { + if self.match_next(expected) { + self.add_token(then); + } else { + self.add_token(or); + } + } + + fn peek(&self) -> char { + if self.is_at_end() { + return '\0'; + } else { + return self.source[self.current]; + } + } + + fn peek_next(&self) -> char { + if self.current + 1 >= self.source.len() { + return '\0'; + } else { + return self.source[self.current + 1]; + } + } + + fn scan_string(&mut self) { + while self.peek() != '"' && !self.is_at_end() { + if self.peek() == '\n' { + self.line += 1; + } + + self.advance(); + } + + if self.is_at_end() { + self.errors.push(Error { + line: self.line, + kind: ErrorKind::UnterminatedString, + }); + return; + } + + // closing '"' + self.advance(); + + // add token without surrounding quotes + let string: String = self.source[(self.start + 1)..(self.current - 1)] + .iter() + .collect(); + self.add_token(TokenKind::String(string)); + } + + fn scan_number(&mut self) { + while self.peek().is_digit(10) { + self.advance(); + } + + // Look for a fractional part + if self.peek() == '.' && self.peek_next().is_digit(10) { + // consume '.' + self.advance(); + + while self.peek().is_digit(10) { + self.advance(); + } + } + + let num: f64 = self.source[self.start..self.current] + .iter() + .collect::() + .parse() + .expect("float parsing should always work"); + + self.add_token(TokenKind::Number(num)); + } + + fn scan_identifier(&mut self) { + while self.peek().is_alphanumeric() || self.peek() == '_' { + self.advance(); + } + + let ident: String = self.source[self.start..self.current].iter().collect(); + + // Determine whether this is an identifier, or a keyword: + let token_kind = match ident.as_str() { + "and" => TokenKind::And, + "class" => TokenKind::Class, + "else" => TokenKind::Else, + "false" => TokenKind::False, + "for" => TokenKind::For, + "fun" => TokenKind::Fun, + "if" => TokenKind::If, + "nil" => TokenKind::Nil, + "or" => TokenKind::Or, + "print" => TokenKind::Print, + "return" => TokenKind::Return, + "super" => TokenKind::Super, + "this" => TokenKind::This, + "true" => TokenKind::True, + "var" => TokenKind::Var, + "while" => TokenKind::While, + _ => TokenKind::Identifier(ident), + }; + + self.add_token(token_kind); + } + + fn scan_tokens(&mut self) { + while !self.is_at_end() { + self.start = self.current; + self.scan_token(); + } + + self.add_token(TokenKind::Eof); + } +} + +pub fn scan<'a>(input: &'a [char]) -> Result, Vec> { + let mut scanner = Scanner { + source: &input, + tokens: vec![], + errors: vec![], + start: 0, + current: 0, + line: 0, + }; + + scanner.scan_tokens(); + + if !scanner.errors.is_empty() { + return Err(scanner.errors); + } + + return Ok(scanner.tokens); +} -- cgit 1.4.1