about summary refs log tree commit diff
path: root/tvix/eval/src/value/string.rs
diff options
context:
space:
mode:
authorAspen Smith <root@gws.fyi>2024-07-28T16·19-0400
committeraspen <root@gws.fyi>2024-08-08T00·02+0000
commit6366cee717d47dc002d874b0c3eab2182e6cf55f (patch)
tree954dd4c1f77e5140d4e6a11ddc2c9048839a2007 /tvix/eval/src/value/string.rs
parentb8f92a6d535af09c24ac887855eb230ca25af1ed (diff)
feat(tvix/eval): Intern (and leak) small strings, behind a mutex r/8454
This is the most naive version of string interning possible - we store a
map from the string itself to the pointer behind a global mutex, and
memoize the allocation of all strings below a threshold length (16
bytes, for now) into that map. This requires leaking /all/ strings,
since it's not easy to know just from the pointer that a string has been
interned - so interning is disabled if string leaking is also disabled.

In the case where we're leaking strings (the default), even the naive
version of this gets us a pretty nice perfomance boost:

hello outpath           time:   [742.54 ms 745.89 ms 749.14 ms]
                        change: [-2.8722% -2.0135% -1.0654%] (p = 0.00 < 0.05)
                        Performance has improved.

However, in the case where we're not leaking strings, we have to keep
track of which strings have and haven't been interned, which makes this
a little worse:

hello outpath           time:   [779.30 ms 792.82 ms 808.74 ms]
                        change: [+2.5258% +4.0884% +5.8931%] (p = 0.00 < 0.05)
                        Performance has regressed.

Hopefully we can close the gap here a bit with some clever
tricks (coming next).

Change-Id: If08cb48ede703c7fe3bdd8d617443f8a561ad09b
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12047
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Autosubmit: aspen <root@gws.fyi>
Diffstat (limited to 'tvix/eval/src/value/string.rs')
-rw-r--r--tvix/eval/src/value/string.rs87
1 files changed, 82 insertions, 5 deletions
diff --git a/tvix/eval/src/value/string.rs b/tvix/eval/src/value/string.rs
index 5c1f31db7eea..7fcdad413466 100644
--- a/tvix/eval/src/value/string.rs
+++ b/tvix/eval/src/value/string.rs
@@ -4,16 +4,19 @@
 //! level, allowing us to shave off some memory overhead and only
 //! paying the cost when creating new strings.
 use bstr::{BStr, BString, ByteSlice, Chars};
+use lazy_static::lazy_static;
 use rnix::ast;
-use rustc_hash::FxHashSet;
-use std::alloc::{alloc, dealloc, handle_alloc_error, Layout};
+use rustc_hash::{FxHashMap, FxHashSet};
+use std::alloc::dealloc;
+use std::alloc::{alloc, handle_alloc_error, Layout};
 use std::borrow::{Borrow, Cow};
 use std::ffi::c_void;
 use std::fmt::{self, Debug, Display};
 use std::hash::Hash;
 use std::ops::Deref;
 use std::ptr::{self, NonNull};
-use std::slice;
+use std::sync::Mutex;
+use std::{mem, slice};
 
 use serde::de::{Deserializer, Visitor};
 use serde::{Deserialize, Serialize};
@@ -395,6 +398,48 @@ impl NixStringInner {
     }
 }
 
+#[derive(Default)]
+struct InternerInner {
+    map: FxHashMap<&'static [u8], NonNull<c_void>>,
+    #[cfg(feature = "no_leak")]
+    interned_strings: FxHashSet<NonNull<c_void>>,
+}
+
+unsafe impl Send for InternerInner {}
+
+impl InternerInner {
+    pub fn intern(&mut self, s: &[u8]) -> NixString {
+        if let Some(s) = self.map.get(s) {
+            return NixString(*s);
+        }
+
+        let string = NixString::new_inner(s, None);
+        self.map
+            .insert(unsafe { mem::transmute(string.as_bytes()) }, string.0);
+        #[cfg(feature = "no_leak")]
+        self.interned_strings.insert(string.0);
+        string
+    }
+}
+
+#[derive(Default)]
+struct Interner(Mutex<InternerInner>);
+
+impl Interner {
+    pub fn intern(&self, s: &[u8]) -> NixString {
+        self.0.lock().unwrap().intern(s)
+    }
+
+    #[cfg(feature = "no_leak")]
+    pub fn is_interned_string(&self, string: &NixString) -> bool {
+        self.0.lock().unwrap().interned_strings.contains(&string.0)
+    }
+}
+
+lazy_static! {
+    static ref INTERNER: Interner = Interner::default();
+}
+
 /// Nix string values
 ///
 /// # Internals
@@ -414,8 +459,20 @@ unsafe impl Send for NixString {}
 unsafe impl Sync for NixString {}
 
 impl Drop for NixString {
+    #[cfg(not(feature = "no_leak"))]
+    fn drop(&mut self) {
+        if self.context().is_some() {
+            // SAFETY: There's no way to construct a NixString that doesn't leave the allocation correct
+            // according to the rules of dealloc
+            unsafe {
+                NixStringInner::dealloc(self.0);
+            }
+        }
+    }
+
+    #[cfg(feature = "no_leak")]
     fn drop(&mut self) {
-        if !cfg!(feature = "no_leak") {
+        if INTERNER.is_interned_string(self) {
             return;
         }
 
@@ -458,7 +515,7 @@ impl Debug for NixString {
 
 impl PartialEq for NixString {
     fn eq(&self, other: &Self) -> bool {
-        self.as_bstr() == other.as_bstr()
+        self.0 == other.0 || self.as_bstr() == other.as_bstr()
     }
 }
 
@@ -484,6 +541,9 @@ impl PartialOrd for NixString {
 
 impl Ord for NixString {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        if self.0 == other.0 {
+            return std::cmp::Ordering::Equal;
+        }
         self.as_bstr().cmp(other.as_bstr())
     }
 }
@@ -647,6 +707,9 @@ mod arbitrary {
     }
 }
 
+/// Set non-scientifically. TODO(aspen): think more about what this should be
+const INTERN_THRESHOLD: usize = 32;
+
 impl NixString {
     fn new(contents: &[u8], context: Option<Box<NixContext>>) -> Self {
         debug_assert!(
@@ -654,6 +717,20 @@ impl NixString {
             "BUG: initialized with empty context"
         );
 
+        if !cfg!(feature = "no_leak") /* It's only safe to intern if we leak strings, since there's
+                                       * nothing yet preventing interned strings from getting freed
+                                       * (and then used by other copies) otherwise
+                                       */
+            && contents.len() <= INTERN_THRESHOLD
+            && context.is_none()
+        {
+            return INTERNER.intern(contents);
+        }
+
+        Self::new_inner(contents, context)
+    }
+
+    fn new_inner(contents: &[u8], context: Option<Box<NixContext>>) -> Self {
         // SAFETY: We're always fully initializing a NixString here:
         //
         // 1. NixStringInner::alloc sets up the len for us