author     Vincent Ambo <mail@tazj.in>  2021-01-17T18·13+0300
committer  tazjin <mail@tazj.in>        2021-01-17T21·17+0000
commit     b1d0e22b1f5fe907ba3d48931e5a38b9a75b0dcf (patch)
tree       aaed1a8bf4cd3bde2f3fd63980ce3d98928155c5  /users/tazjin/rlox/src/treewalk/scanner.rs
parent     c26915d0120e8577cd684eb9c4f2694e1727cb4a  (diff)
chore(tazjin/rlox): Move other modules under treewalk:: r/2126
It's unclear whether the second part of the book can reuse anything
from the first part (probably the scanner, if anything); I'll move it
back out if that turns out to be the case.

Change-Id: I9411355929e31ac6e953599e51665406b1f48d55
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2415
Reviewed-by: tazjin <mail@tazj.in>
Tested-by: BuildkiteCI
Diffstat (limited to 'users/tazjin/rlox/src/treewalk/scanner.rs')
-rw-r--r--  users/tazjin/rlox/src/treewalk/scanner.rs  283
1 file changed, 283 insertions, 0 deletions
diff --git a/users/tazjin/rlox/src/treewalk/scanner.rs b/users/tazjin/rlox/src/treewalk/scanner.rs
new file mode 100644
index 000000000000..af9075484145
--- /dev/null
+++ b/users/tazjin/rlox/src/treewalk/scanner.rs
@@ -0,0 +1,283 @@
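+//! Scanner (lexer) for the Lox treewalk interpreter: turns a slice of
+//! source characters into a stream of tokens.
+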
+use crate::treewalk::errors::{Error, ErrorKind};
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum TokenKind {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One or two character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier(String),
+    String(String),
+    Number(f64),
+    True,
+    False,
+    Nil,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    Fun,
+    For,
+    If,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    Var,
+    While,
+
+    // Special things
+    Eof,
+}
+
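+/// A scanned token, together with the lexeme it was parsed from and
+/// the source line it appeared on.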
+#[derive(Clone, Debug)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub lexeme: String,
+    pub line: usize,
+}
+
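+/// Internal scanner state: a cursor over the source characters plus
+/// the tokens and errors accumulated so far.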
+struct Scanner<'a> {
+    source: &'a [char],
+    tokens: Vec<Token>,
+    errors: Vec<Error>,
+    start: usize,   // offset of first character in current lexeme
+    current: usize, // current offset into source
+    line: usize,    // current line in source
+}
+
+impl<'a> Scanner<'a> {
+    fn is_at_end(&self) -> bool {
+        self.current >= self.source.len()
+    }
+
+    fn advance(&mut self) -> char {
+        self.current += 1;
+        self.source[self.current - 1]
+    }
+
+    fn add_token(&mut self, kind: TokenKind) {
+        let lexeme = &self.source[self.start..self.current];
+        self.tokens.push(Token {
+            kind,
+            lexeme: lexeme.iter().collect(),
+            line: self.line,
+        })
+    }
+
+    fn scan_token(&mut self) {
+        match self.advance() {
+            // simple single-character tokens
+            '(' => self.add_token(TokenKind::LeftParen),
+            ')' => self.add_token(TokenKind::RightParen),
+            '{' => self.add_token(TokenKind::LeftBrace),
+            '}' => self.add_token(TokenKind::RightBrace),
+            ',' => self.add_token(TokenKind::Comma),
+            '.' => self.add_token(TokenKind::Dot),
+            '-' => self.add_token(TokenKind::Minus),
+            '+' => self.add_token(TokenKind::Plus),
+            ';' => self.add_token(TokenKind::Semicolon),
+            '*' => self.add_token(TokenKind::Star),
+
+            // possible multi-character tokens
+            '!' => self.add_if_next('=', TokenKind::BangEqual, TokenKind::Bang),
+            '=' => self.add_if_next('=', TokenKind::EqualEqual, TokenKind::Equal),
+            '<' => self.add_if_next('=', TokenKind::LessEqual, TokenKind::Less),
+            '>' => self.add_if_next('=', TokenKind::GreaterEqual, TokenKind::Greater),
+
+            '/' => {
+                // support comments until EOL by discarding characters
+                if self.match_next('/') {
+                    while self.peek() != '\n' && !self.is_at_end() {
+                        self.advance();
+                    }
+                } else {
+                    self.add_token(TokenKind::Slash);
+                }
+            }
+
+            // ignore whitespace
+            ws if ws.is_whitespace() => {
+                if ws == '\n' {
+                    self.line += 1
+                }
+            }
+
+            '"' => self.scan_string(),
+
+            digit if digit.is_digit(10) => self.scan_number(),
+
+            chr if chr.is_alphabetic() || chr == '_' => self.scan_identifier(),
+
+            unexpected => self.errors.push(Error {
+                line: self.line,
+                kind: ErrorKind::UnexpectedChar(unexpected),
+            }),
+        };
+    }
+
+    fn match_next(&mut self, expected: char) -> bool {
+        if self.is_at_end() || self.source[self.current] != expected {
+            false
+        } else {
+            self.current += 1;
+            true
+        }
+    }
+
+    fn add_if_next(&mut self, expected: char, then: TokenKind, or: TokenKind) {
+        if self.match_next(expected) {
+            self.add_token(then);
+        } else {
+            self.add_token(or);
+        }
+    }
+
+    fn peek(&self) -> char {
+        if self.is_at_end() {
+            '\0'
+        } else {
+            self.source[self.current]
+        }
+    }
+
+    fn peek_next(&self) -> char {
+        if self.current + 1 >= self.source.len() {
+            '\0'
+        } else {
+            self.source[self.current + 1]
+        }
+    }
+
+    fn scan_string(&mut self) {
+        while self.peek() != '"' && !self.is_at_end() {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+
+            self.advance();
+        }
+
+        if self.is_at_end() {
+            self.errors.push(Error {
+                line: self.line,
+                kind: ErrorKind::UnterminatedString,
+            });
+            return;
+        }
+
+        // closing '"'
+        self.advance();
+
+        // add token without surrounding quotes
+        let string: String = self.source[(self.start + 1)..(self.current - 1)]
+            .iter()
+            .collect();
+        self.add_token(TokenKind::String(string));
+    }
+
+    fn scan_number(&mut self) {
+        while self.peek().is_digit(10) {
+            self.advance();
+        }
+
+        // Look for a fractional part
+        if self.peek() == '.' && self.peek_next().is_digit(10) {
+            // consume '.'
+            self.advance();
+
+            while self.peek().is_digit(10) {
+                self.advance();
+            }
+        }
+
+        let num: f64 = self.source[self.start..self.current]
+            .iter()
+            .collect::<String>()
+            .parse()
+            .expect("float parsing should always work");
+
+        self.add_token(TokenKind::Number(num));
+    }
+
+    fn scan_identifier(&mut self) {
+        while self.peek().is_alphanumeric() || self.peek() == '_' {
+            self.advance();
+        }
+
+        let ident: String = self.source[self.start..self.current].iter().collect();
+
+        // Determine whether this is an identifier, or a keyword:
+        let token_kind = match ident.as_str() {
+            "and" => TokenKind::And,
+            "class" => TokenKind::Class,
+            "else" => TokenKind::Else,
+            "false" => TokenKind::False,
+            "for" => TokenKind::For,
+            "fun" => TokenKind::Fun,
+            "if" => TokenKind::If,
+            "nil" => TokenKind::Nil,
+            "or" => TokenKind::Or,
+            "print" => TokenKind::Print,
+            "return" => TokenKind::Return,
+            "super" => TokenKind::Super,
+            "this" => TokenKind::This,
+            "true" => TokenKind::True,
+            "var" => TokenKind::Var,
+            "while" => TokenKind::While,
+            _ => TokenKind::Identifier(ident),
+        };
+
+        self.add_token(token_kind);
+    }
+
+    fn scan_tokens(&mut self) {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token();
+        }
+
+        // Reset start so the Eof token carries an empty lexeme rather
+        // than repeating the final one.
+        self.start = self.current;
+        self.add_token(TokenKind::Eof);
+    }
+}
+
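+/// Scans the given source characters into a list of tokens. All scan
+/// errors are collected; if any occurred, the full error list is
+/// returned instead of the tokens.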
+pub fn scan(input: &[char]) -> Result<Vec<Token>, Vec<Error>> {
+    let mut scanner = Scanner {
+        source: input,
+        tokens: vec![],
+        errors: vec![],
+        start: 0,
+        current: 0,
+        line: 0,
+    };
+
+    scanner.scan_tokens();
+
+    if !scanner.errors.is_empty() {
+        return Err(scanner.errors);
+    }
+
+    Ok(scanner.tokens)
+}
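
For illustration, a minimal sketch of how the scan entry point could be
exercised from elsewhere in the crate. The scan_demo function below is an
assumption for demonstration purposes, not part of this commit; it relies
only on Token deriving Debug, as shown above.

    // Hypothetical usage sketch; scan_demo is not part of this commit.
    use crate::treewalk::scanner::scan;

    fn scan_demo(source: &str) {
        // The scanner operates on a slice of chars rather than a &str.
        let chars: Vec<char> = source.chars().collect();

        match scan(&chars) {
            // Token derives Debug, so tokens can be printed directly.
            Ok(tokens) => {
                for token in &tokens {
                    println!("{:?}", token);
                }
            }
            // Report only the number of errors, since Error's full shape
            // is defined in treewalk::errors and not shown in this file.
            Err(errors) => eprintln!("{} scan error(s)", errors.len()),
        }
    }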