about summary refs log tree commit diff
diff options
context:
space:
mode:
author: Vincent Ambo <mail@tazj.in> 2020-11-23T01·00+0100
committer: tazjin <mail@tazj.in> 2020-11-23T01·15+0000
commit: 3d1b116f7fdcdd7af77a4abc7dbedef4df8fd0a4 (patch)
tree: f0f0aa71e199da39daefe3f218b4d619fe56238f
parent: 9d2b001c4cc86cc57bdb890037c80b7a1c766ecd (diff)
feat(tazjin/rlox): Implement single-character scanning r/1914
... still not that interesting, but at this point slightly divergent
from the book:

The book embraces mutability for interpreter state, initially for
tracking whether an error condition has occurred.

I avoid this by instead defining an error type and collecting the
error values, to be handled later on.

Notes: So far nothing special, but this is just the beginning of the
book. I like the style it is written in and it has pointed to some
interesting resources, such as a 1965 paper titled "The Next 700
Programming Languages".

Change-Id: I030b38438fec9eb55372bf547af225138908230a
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2144
Reviewed-by: tazjin <mail@tazj.in>
Tested-by: BuildkiteCI
-rw-r--r-- users/tazjin/rlox/src/errors.rs 14
-rw-r--r-- users/tazjin/rlox/src/main.rs 2
-rw-r--r-- users/tazjin/rlox/src/scanner.rs 123
3 files changed, 139 insertions, 0 deletions
diff --git a/users/tazjin/rlox/src/errors.rs b/users/tazjin/rlox/src/errors.rs
new file mode 100644
index 0000000000..46c739ef2f
--- /dev/null
+++ b/users/tazjin/rlox/src/errors.rs
@@ -0,0 +1,14 @@
+#[derive(Debug)]
+pub enum ErrorKind {
+    UnexpectedChar(char),
+}
+
+#[derive(Debug)]
+pub struct Error {
+    pub line: usize,
+    pub kind: ErrorKind,
+}
+
+pub fn report(loc: &str, err: &Error) {
+    eprintln!("[line {}] Error {}: {:?}", err.line, loc, err.kind);
+}
diff --git a/users/tazjin/rlox/src/main.rs b/users/tazjin/rlox/src/main.rs
index 83d220c816..b14ed97d5e 100644
--- a/users/tazjin/rlox/src/main.rs
+++ b/users/tazjin/rlox/src/main.rs
@@ -4,7 +4,9 @@ use std::io;
 use std::io::Write;
 use std::process;
 
+mod errors;
 mod interpreter;
+mod scanner;
 
 fn main() {
     let mut args = env::args();
diff --git a/users/tazjin/rlox/src/scanner.rs b/users/tazjin/rlox/src/scanner.rs
new file mode 100644
index 0000000000..c180901054
--- /dev/null
+++ b/users/tazjin/rlox/src/scanner.rs
@@ -0,0 +1,123 @@
+use crate::errors::{Error, ErrorKind};
+
+#[derive(Debug)]
+pub enum TokenKind {
+    // Single-character tokens.
+    LeftParen,
+    RightParen,
+    LeftBrace,
+    RightBrace,
+    Comma,
+    Dot,
+    Minus,
+    Plus,
+    Semicolon,
+    Slash,
+    Star,
+
+    // One or two character tokens.
+    Bang,
+    BangEqual,
+    Equal,
+    EqualEqual,
+    Greater,
+    GreaterEqual,
+    Less,
+    LessEqual,
+
+    // Literals.
+    Identifier,
+    String,
+    Number,
+
+    // Keywords.
+    And,
+    Class,
+    Else,
+    False,
+    Fun,
+    For,
+    If,
+    Nil,
+    Or,
+    Print,
+    Return,
+    Super,
+    This,
+    True,
+    Var,
+    While,
+
+    // Special things
+    Eof,
+}
+
+#[derive(Debug)]
+pub struct Token<'a> {
+    kind: TokenKind,
+    lexeme: &'a str,
+    // literal: Object, // TODO(tazjin): Uhh?
+    line: usize,
+}
+
+struct Scanner<'a> {
+    source: &'a str,
+    tokens: Vec<Token<'a>>,
+    errors: Vec<Error>,
+    start: usize,   // offset of first character in current lexeme
+    current: usize, // current offset into source
+    line: usize,    // current line in source
+}
+
+impl<'a> Scanner<'a> {
+    fn is_at_end(&self) -> bool {
+        return self.current >= self.source.len();
+    }
+
+    fn advance(&mut self) -> char {
+        self.current += 1;
+
+        // TODO(tazjin): Due to utf8-safety, this is a bit annoying.
+        // Since string iteration is not the point here I'm just
+        // leaving this as is for now.
+        self.source.chars().nth(self.current - 1).unwrap()
+    }
+
+    fn add_token(&mut self, kind: TokenKind) {
+        let lexeme = &self.source[self.start..self.current];
+        self.tokens.push(Token {
+            kind,
+            lexeme,
+            line: self.line,
+        })
+    }
+
+    fn scan_token(&mut self) {
+        match self.advance() {
+            '(' => self.add_token(TokenKind::LeftParen),
+            ')' => self.add_token(TokenKind::RightParen),
+            '{' => self.add_token(TokenKind::LeftBrace),
+            '}' => self.add_token(TokenKind::RightBrace),
+            ',' => self.add_token(TokenKind::Comma),
+            '.' => self.add_token(TokenKind::Dot),
+            '-' => self.add_token(TokenKind::Minus),
+            '+' => self.add_token(TokenKind::Plus),
+            ';' => self.add_token(TokenKind::Semicolon),
+            '*' => self.add_token(TokenKind::Star),
+
+            unexpected => self.errors.push(Error {
+                line: self.line,
+                kind: ErrorKind::UnexpectedChar(unexpected),
+            }),
+        };
+    }
+
+    fn scan_tokens(mut self) -> Vec<Token<'a>> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token();
+        }
+
+        return self.tokens;
+    }
+}