| author | Vincent Ambo <mail@tazj.in> | 2021-01-18T17·27+0300 |
|---|---|---|
| committer | tazjin <mail@tazj.in> | 2021-01-19T09·57+0000 |
| commit | 5868d4bd49a7b80a395f1ecabedeb0b8f4ddffce (patch) | |
| tree | 7f9b786e665f7ea103b256d9cab92bbb982d4770 /users/tazjin/rlox/src/scanner.rs | |
| parent | 2d136e03279e481021a23948fdf5556f25394cd3 (diff) | |
refactor(tazjin/rlox): Prepare scanner for shared use r/2132
In the book, the clox interpreter has its own scanner which uses a pull-based model for a single-pass compiler. I can't be bothered to write another scanner, or to amend this one into pull mode to work with the treewalk interpreter, so instead I will just reuse it and pull from a vector of tokens. The tokens are shared between both interpreters, and the scanner is not what I'm interested in here.

Change-Id: Ib07e89127fce2b047f9b3e1ff7e9908d798b3b2b
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2420
Reviewed-by: tazjin <mail@tazj.in>
Tested-by: BuildkiteCI
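For context, here is a minimal sketch of the "pull from a vector of tokens" model the message describes. This is an editorial illustration, not part of the commit: the `TokenStream` type, its fields, and its methods are hypothetical, and the `crate::scanner` path is assumed from the file location; only `scan`, `Token`, and `ScannerError` come from the scanner.rs added in the diff below.

```rust
// Hypothetical pull-style consumer of the shared token vector
// (editor's sketch, not part of this change). Assumes the scanner
// module added in this commit lives at crate::scanner.
use crate::scanner::{scan, ScannerError, Token};

struct TokenStream {
    tokens: Vec<Token>, // produced eagerly by scan()
    current: usize,     // index of the next token to hand out
}

impl TokenStream {
    /// Scan the whole source up front, then serve tokens on demand.
    fn new(source: &str) -> Result<Self, Vec<ScannerError>> {
        let chars: Vec<char> = source.chars().collect();
        Ok(TokenStream {
            tokens: scan(&chars)?,
            current: 0,
        })
    }

    /// Look at the next token without consuming it.
    fn peek(&self) -> Option<&Token> {
        self.tokens.get(self.current)
    }

    /// Consume and return the next token, mimicking a pull-based scanner.
    fn next(&mut self) -> Option<&Token> {
        let token = self.tokens.get(self.current);
        if token.is_some() {
            self.current += 1;
        }
        token
    }
}
```

Both the treewalk interpreter and a single-pass compiler could then share the same `Vec<Token>` and differ only in how they advance through it.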
Diffstat (limited to 'users/tazjin/rlox/src/scanner.rs')
-rw-r--r-- | users/tazjin/rlox/src/scanner.rs | 284 |
1 file changed, 284 insertions, 0 deletions
```diff
diff --git a/users/tazjin/rlox/src/scanner.rs b/users/tazjin/rlox/src/scanner.rs
new file mode 100644
index 000000000000..314b56d6d380
--- /dev/null
+++ b/users/tazjin/rlox/src/scanner.rs
@@ -0,0 +1,284 @@
+#[derive(Clone, Debug, PartialEq)]
+pub enum TokenKind {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One or two character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier(String),
+    String(String),
+    Number(f64),
+    True,
+    False,
+    Nil,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    Fun,
+    For,
+    If,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    Var,
+    While,
+
+    // Special things
+    Eof,
+}
+
+#[derive(Clone, Debug)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub lexeme: String,
+    pub line: usize,
+}
+
+pub enum ScannerError {
+    UnexpectedChar { line: usize, unexpected: char },
+    UnterminatedString { line: usize },
+}
+
+struct Scanner<'a> {
+    source: &'a [char],
+    tokens: Vec<Token>,
+    errors: Vec<ScannerError>,
+    start: usize,   // offset of first character in current lexeme
+    current: usize, // current offset into source
+    line: usize,    // current line in source
+}
+
+impl<'a> Scanner<'a> {
+    fn is_at_end(&self) -> bool {
+        return self.current >= self.source.len();
+    }
+
+    fn advance(&mut self) -> char {
+        self.current += 1;
+        self.source[self.current - 1]
+    }
+
+    fn add_token(&mut self, kind: TokenKind) {
+        let lexeme = &self.source[self.start..self.current];
+        self.tokens.push(Token {
+            kind,
+            lexeme: lexeme.into_iter().collect(),
+            line: self.line,
+        })
+    }
+
+    fn scan_token(&mut self) {
+        match self.advance() {
+            // simple single-character tokens
+            '(' => self.add_token(TokenKind::LeftParen),
+            ')' => self.add_token(TokenKind::RightParen),
+            '{' => self.add_token(TokenKind::LeftBrace),
+            '}' => self.add_token(TokenKind::RightBrace),
+            ',' => self.add_token(TokenKind::Comma),
+            '.' => self.add_token(TokenKind::Dot),
+            '-' => self.add_token(TokenKind::Minus),
+            '+' => self.add_token(TokenKind::Plus),
+            ';' => self.add_token(TokenKind::Semicolon),
+            '*' => self.add_token(TokenKind::Star),
+
+            // possible multi-character tokens
+            '!' => self.add_if_next('=', TokenKind::BangEqual, TokenKind::Bang),
+            '=' => self.add_if_next('=', TokenKind::EqualEqual, TokenKind::Equal),
+            '<' => self.add_if_next('=', TokenKind::LessEqual, TokenKind::Less),
+            '>' => self.add_if_next('=', TokenKind::GreaterEqual, TokenKind::Greater),
+
+            '/' => {
+                // support comments until EOL by discarding characters
+                if self.match_next('/') {
+                    while self.peek() != '\n' && !self.is_at_end() {
+                        self.advance();
+                    }
+                } else {
+                    self.add_token(TokenKind::Slash);
+                }
+            }
+
+            // ignore whitespace
+            ws if ws.is_whitespace() => {
+                if ws == '\n' {
+                    self.line += 1
+                }
+            }
+
+            '"' => self.scan_string(),
+
+            digit if digit.is_digit(10) => self.scan_number(),
+
+            chr if chr.is_alphabetic() || chr == '_' => self.scan_identifier(),
+
+            unexpected => self.errors.push(ScannerError::UnexpectedChar {
+                line: self.line,
+                unexpected,
+            }),
+        };
+    }
+
+    fn match_next(&mut self, expected: char) -> bool {
+        if self.is_at_end() || self.source[self.current] != expected {
+            false
+        } else {
+            self.current += 1;
+            true
+        }
+    }
+
+    fn add_if_next(&mut self, expected: char, then: TokenKind, or: TokenKind) {
+        if self.match_next(expected) {
+            self.add_token(then);
+        } else {
+            self.add_token(or);
+        }
+    }
+
+    fn peek(&self) -> char {
+        if self.is_at_end() {
+            return '\0';
+        } else {
+            return self.source[self.current];
+        }
+    }
+
+    fn peek_next(&self) -> char {
+        if self.current + 1 >= self.source.len() {
+            return '\0';
+        } else {
+            return self.source[self.current + 1];
+        }
+    }
+
+    fn scan_string(&mut self) {
+        while self.peek() != '"' && !self.is_at_end() {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+
+            self.advance();
+        }
+
+        if self.is_at_end() {
+            self.errors
+                .push(ScannerError::UnterminatedString { line: self.line });
+            return;
+        }
+
+        // closing '"'
+        self.advance();
+
+        // add token without surrounding quotes
+        let string: String = self.source[(self.start + 1)..(self.current - 1)]
+            .iter()
+            .collect();
+        self.add_token(TokenKind::String(string));
+    }
+
+    fn scan_number(&mut self) {
+        while self.peek().is_digit(10) {
+            self.advance();
+        }
+
+        // Look for a fractional part
+        if self.peek() == '.' && self.peek_next().is_digit(10) {
+            // consume '.'
+            self.advance();
+
+            while self.peek().is_digit(10) {
+                self.advance();
+            }
+        }
+
+        let num: f64 = self.source[self.start..self.current]
+            .iter()
+            .collect::<String>()
+            .parse()
+            .expect("float parsing should always work");
+
+        self.add_token(TokenKind::Number(num));
+    }
+
+    fn scan_identifier(&mut self) {
+        while self.peek().is_alphanumeric() || self.peek() == '_' {
+            self.advance();
+        }
+
+        let ident: String = self.source[self.start..self.current].iter().collect();
+
+        // Determine whether this is an identifier, or a keyword:
+        let token_kind = match ident.as_str() {
+            "and" => TokenKind::And,
+            "class" => TokenKind::Class,
+            "else" => TokenKind::Else,
+            "false" => TokenKind::False,
+            "for" => TokenKind::For,
+            "fun" => TokenKind::Fun,
+            "if" => TokenKind::If,
+            "nil" => TokenKind::Nil,
+            "or" => TokenKind::Or,
+            "print" => TokenKind::Print,
+            "return" => TokenKind::Return,
+            "super" => TokenKind::Super,
+            "this" => TokenKind::This,
+            "true" => TokenKind::True,
+            "var" => TokenKind::Var,
+            "while" => TokenKind::While,
+            _ => TokenKind::Identifier(ident),
+        };
+
+        self.add_token(token_kind);
+    }
+
+    fn scan_tokens(&mut self) {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token();
+        }
+
+        self.add_token(TokenKind::Eof);
+    }
+}
+
+pub fn scan<'a>(input: &'a [char]) -> Result<Vec<Token>, Vec<ScannerError>> {
+    let mut scanner = Scanner {
+        source: &input,
+        tokens: vec![],
+        errors: vec![],
+        start: 0,
+        current: 0,
+        line: 0,
+    };
+
+    scanner.scan_tokens();
+
+    if !scanner.errors.is_empty() {
+        return Err(scanner.errors);
+    }
+
+    return Ok(scanner.tokens);
+}
```
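Usage note (editorial addition, not part of the commit): `scan` takes a `&[char]` rather than a `&str`, so a caller first collects the source into a `Vec<char>`. A minimal sketch, assuming the module is reachable as `crate::scanner` per the file path; the `example` function and the sample source string are made up for illustration.

```rust
// Editor's sketch of calling scan() from the scanner.rs added above.
use crate::scanner::{scan, TokenKind};

fn example() {
    // The scanner operates on a char slice, not on &str directly.
    let source: Vec<char> = "var answer = 42;".chars().collect();

    match scan(&source) {
        Ok(tokens) => {
            // scan_tokens() always appends an Eof token at the end.
            assert_eq!(tokens.last().unwrap().kind, TokenKind::Eof);
            for token in &tokens {
                println!("{:?} (lexeme {:?}, line {})", token.kind, token.lexeme, token.line);
            }
        }
        // All scanning errors are accumulated and returned together.
        Err(errors) => eprintln!("scanning failed with {} error(s)", errors.len()),
    }
}
```

Returning the whole `Vec<Token>` (or the whole `Vec<ScannerError>`) at once is what lets both the treewalk interpreter and a later single-pass compiler consume the same scanner output at their own pace.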