From ef7a0da8cb22601d8cdd466644413877126c70df Mon Sep 17 00:00:00 2001 From: Vincent Ambo Date: Mon, 1 Mar 2021 18:31:17 +0200 Subject: feat(tazjin/rlox): Add a simple string interner This is based on this matklad post: https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html It's modified slightly to provide a safer interface and slightly more readable implementation: * interned string IDs are wrapped in a newtype that is not publicly constructible * unsafe block is reduced to only the small scope in which it is needed * lookup lifetime is pinned explicitly to make the intent clearer when reading this code Change-Id: Ia3dae988f33f8e5e7d8dc0c1a9216914a945b036 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2578 Tested-by: BuildkiteCI Reviewed-by: tazjin --- users/tazjin/rlox/src/bytecode/interner/mod.rs | 87 ++++++++++++++++++++++++ users/tazjin/rlox/src/bytecode/interner/tests.rs | 24 +++++++ users/tazjin/rlox/src/bytecode/mod.rs | 1 + 3 files changed, 112 insertions(+) create mode 100644 users/tazjin/rlox/src/bytecode/interner/mod.rs create mode 100644 users/tazjin/rlox/src/bytecode/interner/tests.rs (limited to 'users/tazjin/rlox') diff --git a/users/tazjin/rlox/src/bytecode/interner/mod.rs b/users/tazjin/rlox/src/bytecode/interner/mod.rs new file mode 100644 index 0000000000..f5f695904e --- /dev/null +++ b/users/tazjin/rlox/src/bytecode/interner/mod.rs @@ -0,0 +1,87 @@ +//! String-interning implementation for values that are likely to +//! benefit from fast comparisons and deduplication (e.g. instances of +//! variable names). +//! +//! This uses a trick from the typed-arena crate for guaranteeing +//! stable addresses by never resizing the existing String buffer, and +//! collecting full buffers in a vector. + +use std::collections::HashMap; + +#[cfg(test)] +mod tests; + +#[derive(Clone, Copy, Debug, PartialEq, Hash)] +pub struct InternedStr { + id: usize, +} + +#[derive(Default)] +pub struct Interner { + map: HashMap<&'static str, InternedStr>, + vec: Vec<&'static str>, + buf: String, + full: Vec, +} + +impl Interner { + pub fn with_capacity(cap: usize) -> Self { + Interner { + buf: String::with_capacity(cap), + ..Default::default() + } + } + + pub fn intern>(&mut self, name: S) -> InternedStr { + let name = name.as_ref(); + if let Some(&id) = self.map.get(name) { + return id; + } + + let name = self.alloc(name); + let id = InternedStr { + id: self.vec.len() as usize, + }; + + self.map.insert(name, id); + self.vec.push(name); + + debug_assert!(self.lookup(id) == name); + debug_assert!(self.intern(name) == id); + + id + } + + pub fn lookup<'a>(&'a self, id: InternedStr) -> &'a str { + self.vec[id.id] + } + + fn alloc<'a>(&'a mut self, name: &str) -> &'static str { + let cap = self.buf.capacity(); + if cap < self.buf.len() + name.len() { + let new_cap = (cap.max(name.len()) + 1).next_power_of_two(); + let new_buf = String::with_capacity(new_cap); + let old_buf = std::mem::replace(&mut self.buf, new_buf); + self.full.push(old_buf); + } + + let interned: &'a str = { + let start = self.buf.len(); + self.buf.push_str(name); + &self.buf[start..] + }; + + unsafe { + // This is sound for two reasons: + // + // 1. This function (Interner::alloc) is private, which + // prevents users from allocating a supposedly static + // reference. + // + // 2. Interner::lookup explicitly shortens the lifetime of + // references that are handed out to that of the + // reference to self. + return &*(interned as *const str); + } + } +} diff --git a/users/tazjin/rlox/src/bytecode/interner/tests.rs b/users/tazjin/rlox/src/bytecode/interner/tests.rs new file mode 100644 index 0000000000..b34bf68353 --- /dev/null +++ b/users/tazjin/rlox/src/bytecode/interner/tests.rs @@ -0,0 +1,24 @@ +use super::*; + +#[test] +fn interns_strings() { + let mut interner = Interner::with_capacity(128); + let id = interner.intern("hello world"); + assert_eq!("hello world", interner.lookup(id)); +} + +#[test] +fn deduplicates_strings() { + let mut interner = Interner::with_capacity(128); + let id_1 = interner.intern("hello world"); + let id_2 = interner.intern("hello world"); + assert_eq!(id_1, id_2); +} + +#[test] +fn ids_survive_growing() { + let mut interner = Interner::with_capacity(16); + let id = interner.intern("hello"); + interner.intern("excessively large string that will cause eallocation"); + assert_eq!("hello", interner.lookup(id)); +} diff --git a/users/tazjin/rlox/src/bytecode/mod.rs b/users/tazjin/rlox/src/bytecode/mod.rs index 27fa7c0f67..b3e0966fa0 100644 --- a/users/tazjin/rlox/src/bytecode/mod.rs +++ b/users/tazjin/rlox/src/bytecode/mod.rs @@ -5,6 +5,7 @@ mod chunk; mod compiler; mod errors; +mod interner; mod opcode; mod value; mod vm; -- cgit 1.4.1