diff options
author | Aspen Smith <root@gws.fyi> | 2024-07-28T16·19-0400 |
---|---|---|
committer | aspen <root@gws.fyi> | 2024-08-08T00·02+0000 |
commit | 6366cee717d47dc002d874b0c3eab2182e6cf55f (patch) | |
tree | 954dd4c1f77e5140d4e6a11ddc2c9048839a2007 /tvix/eval/src/value/string.rs | |
parent | b8f92a6d535af09c24ac887855eb230ca25af1ed (diff) |
feat(tvix/eval): Intern (and leak) small strings, behind a mutex r/8454
This is the most naive version of string interning possible - we store a map from the string itself to the pointer behind a global mutex, and memoize the allocation of all strings below a threshold length (16 bytes, for now) into that map. This requires leaking /all/ strings, since it's not easy to know just from the pointer that a string has been interned - so interning is disabled if string leaking is also disabled. In the case where we're leaking strings (the default), even the naive version of this gets us a pretty nice perfomance boost: hello outpath time: [742.54 ms 745.89 ms 749.14 ms] change: [-2.8722% -2.0135% -1.0654%] (p = 0.00 < 0.05) Performance has improved. However, in the case where we're not leaking strings, we have to keep track of which strings have and haven't been interned, which makes this a little worse: hello outpath time: [779.30 ms 792.82 ms 808.74 ms] change: [+2.5258% +4.0884% +5.8931%] (p = 0.00 < 0.05) Performance has regressed. Hopefully we can close the gap here a bit with some clever tricks (coming next). Change-Id: If08cb48ede703c7fe3bdd8d617443f8a561ad09b Reviewed-on: https://cl.tvl.fyi/c/depot/+/12047 Tested-by: BuildkiteCI Reviewed-by: flokli <flokli@flokli.de> Autosubmit: aspen <root@gws.fyi>
Diffstat (limited to 'tvix/eval/src/value/string.rs')
-rw-r--r-- | tvix/eval/src/value/string.rs | 87 |
1 files changed, 82 insertions, 5 deletions
diff --git a/tvix/eval/src/value/string.rs b/tvix/eval/src/value/string.rs index 5c1f31db7eea..7fcdad413466 100644 --- a/tvix/eval/src/value/string.rs +++ b/tvix/eval/src/value/string.rs @@ -4,16 +4,19 @@ //! level, allowing us to shave off some memory overhead and only //! paying the cost when creating new strings. use bstr::{BStr, BString, ByteSlice, Chars}; +use lazy_static::lazy_static; use rnix::ast; -use rustc_hash::FxHashSet; -use std::alloc::{alloc, dealloc, handle_alloc_error, Layout}; +use rustc_hash::{FxHashMap, FxHashSet}; +use std::alloc::dealloc; +use std::alloc::{alloc, handle_alloc_error, Layout}; use std::borrow::{Borrow, Cow}; use std::ffi::c_void; use std::fmt::{self, Debug, Display}; use std::hash::Hash; use std::ops::Deref; use std::ptr::{self, NonNull}; -use std::slice; +use std::sync::Mutex; +use std::{mem, slice}; use serde::de::{Deserializer, Visitor}; use serde::{Deserialize, Serialize}; @@ -395,6 +398,48 @@ impl NixStringInner { } } +#[derive(Default)] +struct InternerInner { + map: FxHashMap<&'static [u8], NonNull<c_void>>, + #[cfg(feature = "no_leak")] + interned_strings: FxHashSet<NonNull<c_void>>, +} + +unsafe impl Send for InternerInner {} + +impl InternerInner { + pub fn intern(&mut self, s: &[u8]) -> NixString { + if let Some(s) = self.map.get(s) { + return NixString(*s); + } + + let string = NixString::new_inner(s, None); + self.map + .insert(unsafe { mem::transmute(string.as_bytes()) }, string.0); + #[cfg(feature = "no_leak")] + self.interned_strings.insert(string.0); + string + } +} + +#[derive(Default)] +struct Interner(Mutex<InternerInner>); + +impl Interner { + pub fn intern(&self, s: &[u8]) -> NixString { + self.0.lock().unwrap().intern(s) + } + + #[cfg(feature = "no_leak")] + pub fn is_interned_string(&self, string: &NixString) -> bool { + self.0.lock().unwrap().interned_strings.contains(&string.0) + } +} + +lazy_static! { + static ref INTERNER: Interner = Interner::default(); +} + /// Nix string values /// /// # Internals @@ -414,8 +459,20 @@ unsafe impl Send for NixString {} unsafe impl Sync for NixString {} impl Drop for NixString { + #[cfg(not(feature = "no_leak"))] + fn drop(&mut self) { + if self.context().is_some() { + // SAFETY: There's no way to construct a NixString that doesn't leave the allocation correct + // according to the rules of dealloc + unsafe { + NixStringInner::dealloc(self.0); + } + } + } + + #[cfg(feature = "no_leak")] fn drop(&mut self) { - if !cfg!(feature = "no_leak") { + if INTERNER.is_interned_string(self) { return; } @@ -458,7 +515,7 @@ impl Debug for NixString { impl PartialEq for NixString { fn eq(&self, other: &Self) -> bool { - self.as_bstr() == other.as_bstr() + self.0 == other.0 || self.as_bstr() == other.as_bstr() } } @@ -484,6 +541,9 @@ impl PartialOrd for NixString { impl Ord for NixString { fn cmp(&self, other: &Self) -> std::cmp::Ordering { + if self.0 == other.0 { + return std::cmp::Ordering::Equal; + } self.as_bstr().cmp(other.as_bstr()) } } @@ -647,6 +707,9 @@ mod arbitrary { } } +/// Set non-scientifically. TODO(aspen): think more about what this should be +const INTERN_THRESHOLD: usize = 32; + impl NixString { fn new(contents: &[u8], context: Option<Box<NixContext>>) -> Self { debug_assert!( @@ -654,6 +717,20 @@ impl NixString { "BUG: initialized with empty context" ); + if !cfg!(feature = "no_leak") /* It's only safe to intern if we leak strings, since there's + * nothing yet preventing interned strings from getting freed + * (and then used by other copies) otherwise + */ + && contents.len() <= INTERN_THRESHOLD + && context.is_none() + { + return INTERNER.intern(contents); + } + + Self::new_inner(contents, context) + } + + fn new_inner(contents: &[u8], context: Option<Box<NixContext>>) -> Self { // SAFETY: We're always fully initializing a NixString here: // // 1. NixStringInner::alloc sets up the len for us |