about summary refs log tree commit diff
path: root/users/tazjin/rlox
diff options
context:
space:
mode:
authorVincent Ambo <mail@tazj.in>2021-03-01T16·31+0200
committertazjin <mail@tazj.in>2021-03-01T21·09+0000
commitef7a0da8cb22601d8cdd466644413877126c70df (patch)
tree82efc70429434293452f717c36e5d5f77bbcf533 /users/tazjin/rlox
parent6f600c8300c028beb07bf224baf7dfdaa6490fd3 (diff)
feat(tazjin/rlox): Add a simple string interner r/2259
This is based on this matklad post:

https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html

It's modified slightly to provide a safer interface and slightly more
readable implementation:

* interned string IDs are wrapped in a newtype that is not publicly
  constructible

* unsafe block is reduced to only the small scope in which it is
  needed

* lookup lifetime is pinned explicitly to make the intent clearer when
  reading this code

Change-Id: Ia3dae988f33f8e5e7d8dc0c1a9216914a945b036
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2578
Tested-by: BuildkiteCI
Reviewed-by: tazjin <mail@tazj.in>
Diffstat (limited to 'users/tazjin/rlox')
-rw-r--r--users/tazjin/rlox/src/bytecode/interner/mod.rs87
-rw-r--r--users/tazjin/rlox/src/bytecode/interner/tests.rs24
-rw-r--r--users/tazjin/rlox/src/bytecode/mod.rs1
3 files changed, 112 insertions, 0 deletions
diff --git a/users/tazjin/rlox/src/bytecode/interner/mod.rs b/users/tazjin/rlox/src/bytecode/interner/mod.rs
new file mode 100644
index 0000000000..f5f695904e
--- /dev/null
+++ b/users/tazjin/rlox/src/bytecode/interner/mod.rs
@@ -0,0 +1,87 @@
+//! String-interning implementation for values that are likely to
+//! benefit from fast comparisons and deduplication (e.g. instances of
+//! variable names).
+//!
+//! This uses a trick from the typed-arena crate for guaranteeing
+//! stable addresses by never resizing the existing String buffer, and
+//! collecting full buffers in a vector.
+
+use std::collections::HashMap;
+
+#[cfg(test)]
+mod tests;
+
+#[derive(Clone, Copy, Debug, PartialEq, Hash)]
+pub struct InternedStr {
+    id: usize,
+}
+
+#[derive(Default)]
+pub struct Interner {
+    map: HashMap<&'static str, InternedStr>,
+    vec: Vec<&'static str>,
+    buf: String,
+    full: Vec<String>,
+}
+
+impl Interner {
+    pub fn with_capacity(cap: usize) -> Self {
+        Interner {
+            buf: String::with_capacity(cap),
+            ..Default::default()
+        }
+    }
+
+    pub fn intern<S: AsRef<str>>(&mut self, name: S) -> InternedStr {
+        let name = name.as_ref();
+        if let Some(&id) = self.map.get(name) {
+            return id;
+        }
+
+        let name = self.alloc(name);
+        let id = InternedStr {
+            id: self.vec.len() as usize,
+        };
+
+        self.map.insert(name, id);
+        self.vec.push(name);
+
+        debug_assert!(self.lookup(id) == name);
+        debug_assert!(self.intern(name) == id);
+
+        id
+    }
+
+    pub fn lookup<'a>(&'a self, id: InternedStr) -> &'a str {
+        self.vec[id.id]
+    }
+
+    fn alloc<'a>(&'a mut self, name: &str) -> &'static str {
+        let cap = self.buf.capacity();
+        if cap < self.buf.len() + name.len() {
+            let new_cap = (cap.max(name.len()) + 1).next_power_of_two();
+            let new_buf = String::with_capacity(new_cap);
+            let old_buf = std::mem::replace(&mut self.buf, new_buf);
+            self.full.push(old_buf);
+        }
+
+        let interned: &'a str = {
+            let start = self.buf.len();
+            self.buf.push_str(name);
+            &self.buf[start..]
+        };
+
+        unsafe {
+            // This is sound for two reasons:
+            //
+            // 1. This function (Interner::alloc) is private, which
+            //    prevents users from allocating a supposedly static
+            //    reference.
+            //
+            // 2. Interner::lookup explicitly shortens the lifetime of
+            //    references that are handed out to that of the
+            //    reference to self.
+            return &*(interned as *const str);
+        }
+    }
+}
diff --git a/users/tazjin/rlox/src/bytecode/interner/tests.rs b/users/tazjin/rlox/src/bytecode/interner/tests.rs
new file mode 100644
index 0000000000..b34bf68353
--- /dev/null
+++ b/users/tazjin/rlox/src/bytecode/interner/tests.rs
@@ -0,0 +1,24 @@
+use super::*;
+
+#[test]
+fn interns_strings() {
+    let mut interner = Interner::with_capacity(128);
+    let id = interner.intern("hello world");
+    assert_eq!("hello world", interner.lookup(id));
+}
+
+#[test]
+fn deduplicates_strings() {
+    let mut interner = Interner::with_capacity(128);
+    let id_1 = interner.intern("hello world");
+    let id_2 = interner.intern("hello world");
+    assert_eq!(id_1, id_2);
+}
+
+#[test]
+fn ids_survive_growing() {
+    let mut interner = Interner::with_capacity(16);
+    let id = interner.intern("hello");
+    interner.intern("excessively large string that will cause eallocation");
+    assert_eq!("hello", interner.lookup(id));
+}
diff --git a/users/tazjin/rlox/src/bytecode/mod.rs b/users/tazjin/rlox/src/bytecode/mod.rs
index 27fa7c0f67..b3e0966fa0 100644
--- a/users/tazjin/rlox/src/bytecode/mod.rs
+++ b/users/tazjin/rlox/src/bytecode/mod.rs
@@ -5,6 +5,7 @@
 mod chunk;
 mod compiler;
 mod errors;
+mod interner;
 mod opcode;
 mod value;
 mod vm;