diff options
author | Vincent Ambo <mail@tazj.in> | 2021-03-01T16·31+0200 |
---|---|---|
committer | tazjin <mail@tazj.in> | 2021-03-01T21·09+0000 |
commit | ef7a0da8cb22601d8cdd466644413877126c70df (patch) | |
tree | 82efc70429434293452f717c36e5d5f77bbcf533 /users | |
parent | 6f600c8300c028beb07bf224baf7dfdaa6490fd3 (diff) |
feat(tazjin/rlox): Add a simple string interner r/2259
This is based on this matklad post: https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html It's modified slightly to provide a safer interface and slightly more readable implementation: * interned string IDs are wrapped in a newtype that is not publicly constructible * unsafe block is reduced to only the small scope in which it is needed * lookup lifetime is pinned explicitly to make the intent clearer when reading this code Change-Id: Ia3dae988f33f8e5e7d8dc0c1a9216914a945b036 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2578 Tested-by: BuildkiteCI Reviewed-by: tazjin <mail@tazj.in>
Diffstat (limited to 'users')
-rw-r--r-- | users/tazjin/rlox/src/bytecode/interner/mod.rs | 87 | ||||
-rw-r--r-- | users/tazjin/rlox/src/bytecode/interner/tests.rs | 24 | ||||
-rw-r--r-- | users/tazjin/rlox/src/bytecode/mod.rs | 1 |
3 files changed, 112 insertions, 0 deletions
diff --git a/users/tazjin/rlox/src/bytecode/interner/mod.rs b/users/tazjin/rlox/src/bytecode/interner/mod.rs new file mode 100644 index 000000000000..f5f695904e85 --- /dev/null +++ b/users/tazjin/rlox/src/bytecode/interner/mod.rs @@ -0,0 +1,87 @@ +//! String-interning implementation for values that are likely to +//! benefit from fast comparisons and deduplication (e.g. instances of +//! variable names). +//! +//! This uses a trick from the typed-arena crate for guaranteeing +//! stable addresses by never resizing the existing String buffer, and +//! collecting full buffers in a vector. + +use std::collections::HashMap; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, Debug, PartialEq, Hash)] +pub struct InternedStr { + id: usize, +} + +#[derive(Default)] +pub struct Interner { + map: HashMap<&'static str, InternedStr>, + vec: Vec<&'static str>, + buf: String, + full: Vec<String>, +} + +impl Interner { + pub fn with_capacity(cap: usize) -> Self { + Interner { + buf: String::with_capacity(cap), + ..Default::default() + } + } + + pub fn intern<S: AsRef<str>>(&mut self, name: S) -> InternedStr { + let name = name.as_ref(); + if let Some(&id) = self.map.get(name) { + return id; + } + + let name = self.alloc(name); + let id = InternedStr { + id: self.vec.len() as usize, + }; + + self.map.insert(name, id); + self.vec.push(name); + + debug_assert!(self.lookup(id) == name); + debug_assert!(self.intern(name) == id); + + id + } + + pub fn lookup<'a>(&'a self, id: InternedStr) -> &'a str { + self.vec[id.id] + } + + fn alloc<'a>(&'a mut self, name: &str) -> &'static str { + let cap = self.buf.capacity(); + if cap < self.buf.len() + name.len() { + let new_cap = (cap.max(name.len()) + 1).next_power_of_two(); + let new_buf = String::with_capacity(new_cap); + let old_buf = std::mem::replace(&mut self.buf, new_buf); + self.full.push(old_buf); + } + + let interned: &'a str = { + let start = self.buf.len(); + self.buf.push_str(name); + &self.buf[start..] + }; + + unsafe { + // This is sound for two reasons: + // + // 1. This function (Interner::alloc) is private, which + // prevents users from allocating a supposedly static + // reference. + // + // 2. Interner::lookup explicitly shortens the lifetime of + // references that are handed out to that of the + // reference to self. + return &*(interned as *const str); + } + } +} diff --git a/users/tazjin/rlox/src/bytecode/interner/tests.rs b/users/tazjin/rlox/src/bytecode/interner/tests.rs new file mode 100644 index 000000000000..b34bf6835389 --- /dev/null +++ b/users/tazjin/rlox/src/bytecode/interner/tests.rs @@ -0,0 +1,24 @@ +use super::*; + +#[test] +fn interns_strings() { + let mut interner = Interner::with_capacity(128); + let id = interner.intern("hello world"); + assert_eq!("hello world", interner.lookup(id)); +} + +#[test] +fn deduplicates_strings() { + let mut interner = Interner::with_capacity(128); + let id_1 = interner.intern("hello world"); + let id_2 = interner.intern("hello world"); + assert_eq!(id_1, id_2); +} + +#[test] +fn ids_survive_growing() { + let mut interner = Interner::with_capacity(16); + let id = interner.intern("hello"); + interner.intern("excessively large string that will cause eallocation"); + assert_eq!("hello", interner.lookup(id)); +} diff --git a/users/tazjin/rlox/src/bytecode/mod.rs b/users/tazjin/rlox/src/bytecode/mod.rs index 27fa7c0f677e..b3e0966fa071 100644 --- a/users/tazjin/rlox/src/bytecode/mod.rs +++ b/users/tazjin/rlox/src/bytecode/mod.rs @@ -5,6 +5,7 @@ mod chunk; mod compiler; mod errors; +mod interner; mod opcode; mod value; mod vm; |