about summary refs log tree commit diff
path: root/tvix/eval/src/value
diff options
context:
space:
mode:
authorAspen Smith <root@gws.fyi>2023-12-05T22·25-0500
committeraspen <root@gws.fyi>2024-01-31T14·51+0000
commit201173afaca7d70aa039a1e37a91c49af3a99b0b (patch)
treed661ca257820aca975339ee7d17dd1a08df85932 /tvix/eval/src/value
parent6f9e25943f3e2f83d191cadcc76a278073626fe8 (diff)
fix(tvix): Represent strings as byte arrays r/7460
C++ nix uses C-style zero-terminated char pointers to represent strings
internally - however, up to this point, tvix has used Rust `String` and
`str` for string values. Since those are required to be valid utf-8, we
haven't been able to properly represent all the string values that Nix
supports.

To fix that, this change converts the internal representation of the
NixString struct from `Box<str>` to `BString`, from the `bstr` crate -
this is a wrapper around a `Vec<u8>` with extra functions for treating
that byte vector as a "morally string-like" value, which is basically
exactly what we need.

Since this changes a pretty fundamental assumption about a pretty core
type, there are a *lot* of changes in a lot of places to make this work,
but I've tried to keep the general philosophy and intent of most of the
code in most places intact. Most notably, there's nothing that's been
done to make the derivation stuff in //tvix/glue work with non-utf8
strings everywhere, instead opting to just convert to String/str when
passing things into that - there *might* be something to be done there,
but I don't know what the rules should be and I don't want to figure
them out in this change.

To deal with OS-native paths in a way that also works in WASM for
tvixbolt, this also adds a dependency on the "os_str_bytes" crate.

Fixes: b/189
Fixes: b/337
Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200
Reviewed-by: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: aspen <root@gws.fyi>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/eval/src/value')
-rw-r--r--tvix/eval/src/value/attrs.rs41
-rw-r--r--tvix/eval/src/value/attrs/tests.rs4
-rw-r--r--tvix/eval/src/value/json.rs7
-rw-r--r--tvix/eval/src/value/mod.rs28
-rw-r--r--tvix/eval/src/value/string.rs149
5 files changed, 142 insertions, 87 deletions
diff --git a/tvix/eval/src/value/attrs.rs b/tvix/eval/src/value/attrs.rs
index b80d8b2db8..2027653c3f 100644
--- a/tvix/eval/src/value/attrs.rs
+++ b/tvix/eval/src/value/attrs.rs
@@ -5,8 +5,10 @@
 //!
 //! Due to this, construction and management of attribute sets has
 //! some peculiarities that are encapsulated within this module.
+use std::borrow::Borrow;
 use std::iter::FromIterator;
 
+use bstr::BStr;
 use imbl::{ordmap, OrdMap};
 use lazy_static::lazy_static;
 use serde::de::{Deserializer, Error, Visitor};
@@ -67,25 +69,25 @@ impl AttrsRep {
         }
     }
 
-    fn select(&self, key: &str) -> Option<&Value> {
+    fn select(&self, key: &BStr) -> Option<&Value> {
         match self {
             AttrsRep::Empty => None,
 
-            AttrsRep::KV { name, value } => match key {
-                "name" => Some(name),
-                "value" => Some(value),
+            AttrsRep::KV { name, value } => match &**key {
+                b"name" => Some(name),
+                b"value" => Some(value),
                 _ => None,
             },
 
-            AttrsRep::Im(map) => map.get(&key.into()),
+            AttrsRep::Im(map) => map.get(key),
         }
     }
 
-    fn contains(&self, key: &str) -> bool {
+    fn contains(&self, key: &BStr) -> bool {
         match self {
             AttrsRep::Empty => false,
             AttrsRep::KV { .. } => key == "name" || key == "value",
-            AttrsRep::Im(map) => map.contains_key(&key.into()),
+            AttrsRep::Im(map) => map.contains_key(key),
         }
     }
 }
@@ -264,19 +266,30 @@ impl NixAttrs {
     }
 
     /// Select a value from an attribute set by key.
-    pub fn select(&self, key: &str) -> Option<&Value> {
-        self.0.select(key)
+    pub fn select<K>(&self, key: &K) -> Option<&Value>
+    where
+        K: Borrow<BStr> + ?Sized,
+    {
+        self.0.select(key.borrow())
     }
 
     /// Select a required value from an attribute set by key, return
     /// an `AttributeNotFound` error if it is missing.
-    pub fn select_required(&self, key: &str) -> Result<&Value, ErrorKind> {
+    pub fn select_required<K>(&self, key: &K) -> Result<&Value, ErrorKind>
+    where
+        K: Borrow<BStr> + ?Sized,
+    {
         self.select(key)
-            .ok_or_else(|| ErrorKind::AttributeNotFound { name: key.into() })
+            .ok_or_else(|| ErrorKind::AttributeNotFound {
+                name: key.borrow().to_string(),
+            })
     }
 
-    pub fn contains(&self, key: &str) -> bool {
-        self.0.contains(key)
+    pub fn contains<'a, K: 'a>(&self, key: K) -> bool
+    where
+        &'a BStr: From<K>,
+    {
+        self.0.contains(key.into())
     }
 
     /// Construct an iterator over all the key-value pairs in the attribute set.
@@ -423,7 +436,7 @@ fn attempt_optimise_kv(slice: &mut [Value]) -> Option<NixAttrs> {
 fn set_attr(attrs: &mut NixAttrs, key: NixString, value: Value) -> Result<(), ErrorKind> {
     match attrs.0.map_mut().entry(key) {
         imbl::ordmap::Entry::Occupied(entry) => Err(ErrorKind::DuplicateAttrsKey {
-            key: entry.key().as_str().to_string(),
+            key: entry.key().to_string(),
         }),
 
         imbl::ordmap::Entry::Vacant(entry) => {
diff --git a/tvix/eval/src/value/attrs/tests.rs b/tvix/eval/src/value/attrs/tests.rs
index 5c693873cd..d269044b32 100644
--- a/tvix/eval/src/value/attrs/tests.rs
+++ b/tvix/eval/src/value/attrs/tests.rs
@@ -1,3 +1,5 @@
+use bstr::B;
+
 use super::*;
 
 #[test]
@@ -99,6 +101,6 @@ fn test_map_attrs_iter() {
     let mut iter = attrs.iter().collect::<Vec<_>>().into_iter();
     let (k, v) = iter.next().unwrap();
     assert!(k == &NixString::from("key"));
-    assert!(v.to_str().unwrap().as_str() == "value");
+    assert_eq!(v.to_str().unwrap(), B("value"));
     assert!(iter.next().is_none());
 }
diff --git a/tvix/eval/src/value/json.rs b/tvix/eval/src/value/json.rs
index 8c31bd0e60..4e915a36e5 100644
--- a/tvix/eval/src/value/json.rs
+++ b/tvix/eval/src/value/json.rs
@@ -7,6 +7,7 @@ use super::{CoercionKind, Value};
 use crate::errors::{CatchableErrorKind, ErrorKind};
 use crate::generators::{self, GenCo};
 
+use bstr::ByteSlice;
 use serde_json::value::to_value;
 use serde_json::Value as Json; // name clash with *our* `Value`
 use serde_json::{Map, Number};
@@ -23,7 +24,7 @@ impl Value {
             Value::Bool(b) => Json::Bool(b),
             Value::Integer(i) => Json::Number(Number::from(i)),
             Value::Float(f) => to_value(f)?,
-            Value::String(s) => Json::String(s.as_str().into()),
+            Value::String(s) => Json::String(s.to_str()?.to_owned()),
 
             Value::Path(p) => {
                 let imported = generators::request_path_import(co, *p).await;
@@ -61,7 +62,7 @@ impl Value {
                         .await?
                     {
                         Value::Catchable(cek) => return Ok(Err(cek)),
-                        Value::String(s) => return Ok(Ok(Json::String(s.as_str().to_owned()))),
+                        Value::String(s) => return Ok(Ok(Json::String(s.to_str()?.to_owned()))),
                         _ => panic!("Value::coerce_to_string_() returned a non-string!"),
                     }
                 }
@@ -76,7 +77,7 @@ impl Value {
                 let mut out = Map::with_capacity(attrs.len());
                 for (name, value) in attrs.into_iter_sorted() {
                     out.insert(
-                        name.as_str().to_string(),
+                        name.to_str()?.to_owned(),
                         match generators::request_to_json(co, value).await {
                             Ok(v) => v,
                             Err(cek) => return Ok(Err(cek)),
diff --git a/tvix/eval/src/value/mod.rs b/tvix/eval/src/value/mod.rs
index ccca0bea5b..e181b3a2da 100644
--- a/tvix/eval/src/value/mod.rs
+++ b/tvix/eval/src/value/mod.rs
@@ -6,6 +6,7 @@ use std::num::{NonZeroI32, NonZeroUsize};
 use std::path::PathBuf;
 use std::rc::Rc;
 
+use bstr::{BString, ByteVec};
 use lexical_core::format::CXX_LITERAL;
 use serde::Deserialize;
 
@@ -313,7 +314,7 @@ impl Value {
         kind: CoercionKind,
         span: LightSpan,
     ) -> Result<Value, ErrorKind> {
-        let mut result = String::new();
+        let mut result = BString::default();
         let mut vals = vec![self];
         // Track if we are coercing the first value of a list to correctly emit
         // separating white spaces.
@@ -326,18 +327,15 @@ impl Value {
             let value = if let Some(v) = vals.pop() {
                 v.force(co, span.clone()).await?
             } else {
-                return Ok(Value::String(NixString::new_context_from(
-                    context,
-                    result.as_str(),
-                )));
+                return Ok(Value::String(NixString::new_context_from(context, result)));
             };
-            let coerced = match (value, kind) {
+            let coerced: Result<BString, _> = match (value, kind) {
                 // coercions that are always done
                 (Value::String(mut s), _) => {
                     if let Some(ctx) = s.context_mut() {
                         context = context.join(ctx);
                     }
-                    Ok(s.as_str().to_owned())
+                    Ok(s.into())
                 }
 
                 // TODO(sterni): Think about proper encoding handling here. This needs
@@ -357,7 +355,7 @@ impl Value {
                     context = context.append(NixContextElement::Plain(
                         imported.to_string_lossy().to_string(),
                     ));
-                    Ok(imported.to_string_lossy().into_owned())
+                    Ok(imported.into_os_string().into_encoded_bytes().into())
                 }
                 (
                     Value::Path(p),
@@ -365,7 +363,7 @@ impl Value {
                         import_paths: false,
                         ..
                     },
-                ) => Ok(p.to_string_lossy().into_owned()),
+                ) => Ok(p.into_os_string().into_encoded_bytes().into()),
 
                 // Attribute sets can be converted to strings if they either have an
                 // `__toString` attribute which holds a function that receives the
@@ -397,14 +395,14 @@ impl Value {
 
                 // strong coercions
                 (Value::Null, CoercionKind { strong: true, .. })
-                | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".to_owned()),
-                (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".to_owned()),
+                | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".into()),
+                (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".into()),
 
-                (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}")),
+                (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}").into()),
                 (Value::Float(f), CoercionKind { strong: true, .. }) => {
                     // contrary to normal Display, coercing a float to a string will
                     // result in unconditional 6 decimal places
-                    Ok(format!("{:.6}", f))
+                    Ok(format!("{:.6}", f).into())
                 }
 
                 // Lists are coerced by coercing their elements and interspersing spaces
@@ -448,7 +446,7 @@ impl Value {
 
             if let Some(head) = is_list_head {
                 if !head {
-                    result.push(' ');
+                    result.push(b' ');
                 } else {
                     is_list_head = Some(false);
                 }
@@ -576,7 +574,7 @@ impl Value {
                             let s2 = s2.to_str();
 
                             if let (Ok(s1), Ok(s2)) = (s1, s2) {
-                                if s1.as_str() == "derivation" && s2.as_str() == "derivation" {
+                                if s1 == "derivation" && s2 == "derivation" {
                                     // TODO(tazjin): are the outPaths really required,
                                     // or should it fall through?
                                     let out1 = a1
diff --git a/tvix/eval/src/value/string.rs b/tvix/eval/src/value/string.rs
index c528fd19f2..e2767dd22c 100644
--- a/tvix/eval/src/value/string.rs
+++ b/tvix/eval/src/value/string.rs
@@ -3,14 +3,13 @@
 //! Nix language strings never need to be modified on the language
 //! level, allowing us to shave off some memory overhead and only
 //! paying the cost when creating new strings.
+use bstr::{BStr, BString, ByteSlice, Chars};
 use rnix::ast;
+use std::borrow::{Borrow, Cow};
 use std::collections::HashSet;
-use std::ffi::OsStr;
+use std::fmt::Display;
 use std::hash::Hash;
 use std::ops::Deref;
-use std::path::Path;
-use std::str::{self, Utf8Error};
-use std::{borrow::Cow, fmt::Display, str::Chars};
 
 use serde::de::{Deserializer, Visitor};
 use serde::{Deserialize, Serialize};
@@ -147,16 +146,28 @@ impl NixContext {
 
 // FIXME: when serializing, ignore the context?
 #[derive(Clone, Debug, Serialize)]
-pub struct NixString(Box<str>, Option<NixContext>);
+pub struct NixString(BString, Option<NixContext>);
 
 impl PartialEq for NixString {
     fn eq(&self, other: &Self) -> bool {
-        self.as_str() == other.as_str()
+        self.as_bstr() == other.as_bstr()
     }
 }
 
 impl Eq for NixString {}
 
+impl PartialEq<&[u8]> for NixString {
+    fn eq(&self, other: &&[u8]) -> bool {
+        **self == **other
+    }
+}
+
+impl PartialEq<&str> for NixString {
+    fn eq(&self, other: &&str) -> bool {
+        **self == other.as_bytes()
+    }
+}
+
 impl PartialOrd for NixString {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.cmp(other))
@@ -165,39 +176,49 @@ impl PartialOrd for NixString {
 
 impl Ord for NixString {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
-        self.as_str().cmp(other.as_str())
+        self.as_bstr().cmp(other.as_bstr())
+    }
+}
+
+impl From<&BStr> for NixString {
+    fn from(value: &BStr) -> Self {
+        Self(value.to_owned(), None)
     }
 }
 
-impl TryFrom<&[u8]> for NixString {
-    type Error = Utf8Error;
+impl From<&[u8]> for NixString {
+    fn from(value: &[u8]) -> Self {
+        Self(value.into(), None)
+    }
+}
 
-    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
-        Ok(Self(Box::from(str::from_utf8(value)?), None))
+impl From<Vec<u8>> for NixString {
+    fn from(value: Vec<u8>) -> Self {
+        Self(value.into(), None)
     }
 }
 
 impl From<&str> for NixString {
     fn from(s: &str) -> Self {
-        NixString(Box::from(s), None)
+        Self(s.as_bytes().into(), None)
     }
 }
 
 impl From<String> for NixString {
     fn from(s: String) -> Self {
-        NixString(s.into_boxed_str(), None)
+        Self(s.into(), None)
     }
 }
 
 impl From<(String, Option<NixContext>)> for NixString {
-    fn from(s: (String, Option<NixContext>)) -> Self {
-        NixString(s.0.into_boxed_str(), s.1)
+    fn from((s, ctx): (String, Option<NixContext>)) -> Self {
+        NixString(s.into(), ctx)
     }
 }
 
 impl From<Box<str>> for NixString {
     fn from(s: Box<str>) -> Self {
-        Self(s, None)
+        Self(s.into_boxed_bytes().into_vec().into(), None)
     }
 }
 
@@ -207,9 +228,33 @@ impl From<ast::Ident> for NixString {
     }
 }
 
+impl<'a> From<&'a NixString> for &'a BStr {
+    fn from(s: &'a NixString) -> Self {
+        BStr::new(&*s.0)
+    }
+}
+
+impl From<NixString> for BString {
+    fn from(s: NixString) -> Self {
+        s.0
+    }
+}
+
+impl AsRef<[u8]> for NixString {
+    fn as_ref(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl Borrow<BStr> for NixString {
+    fn borrow(&self) -> &BStr {
+        self.as_bstr()
+    }
+}
+
 impl Hash for NixString {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.as_str().hash(state)
+        self.as_bstr().hash(state)
     }
 }
 
@@ -246,6 +291,14 @@ impl<'de> Deserialize<'de> for NixString {
     }
 }
 
+impl Deref for NixString {
+    type Target = BString;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
 #[cfg(feature = "arbitrary")]
 mod arbitrary {
     use super::*;
@@ -264,13 +317,13 @@ mod arbitrary {
 }
 
 impl NixString {
-    pub fn new_inherit_context_from(other: &NixString, new_contents: &str) -> Self {
-        Self(Box::from(new_contents), other.1.clone())
+    pub fn new_inherit_context_from(other: &NixString, new_contents: BString) -> Self {
+        Self(new_contents, other.1.clone())
     }
 
-    pub fn new_context_from(context: NixContext, contents: &str) -> Self {
+    pub fn new_context_from(context: NixContext, contents: BString) -> Self {
         Self(
-            Box::from(contents),
+            contents,
             if context.is_empty() {
                 None
             } else {
@@ -279,10 +332,22 @@ impl NixString {
         )
     }
 
-    pub fn as_str(&self) -> &str {
+    pub fn as_mut_bstring(&mut self) -> &mut BString {
+        &mut self.0
+    }
+
+    pub fn as_bstr(&self) -> &BStr {
+        BStr::new(self.as_bytes())
+    }
+
+    pub fn as_bytes(&self) -> &[u8] {
         &self.0
     }
 
+    pub fn into_bstring(self) -> BString {
+        self.0
+    }
+
     /// Return a displayable representation of the string as an
     /// identifier.
     ///
@@ -290,8 +355,10 @@ impl NixString {
     /// set keys, as those are only escaped in the presence of special
     /// characters.
     pub fn ident_str(&self) -> Cow<str> {
-        let escaped = nix_escape_string(self.as_str());
-
+        let escaped = match self.to_str_lossy() {
+            Cow::Borrowed(s) => nix_escape_string(s),
+            Cow::Owned(s) => nix_escape_string(&s).into_owned().into(),
+        };
         match escaped {
             // A borrowed string is unchanged and can be returned as
             // is.
@@ -310,8 +377,8 @@ impl NixString {
     }
 
     pub fn concat(&self, other: &Self) -> Self {
-        let mut s = self.as_str().to_owned();
-        s.push_str(other.as_str());
+        let mut s = self.to_vec();
+        s.extend(other.0.as_slice());
 
         let context = [&self.1, &other.1]
             .into_iter()
@@ -319,7 +386,7 @@ impl NixString {
             .fold(NixContext::new(), |acc_ctx, new_ctx| {
                 acc_ctx.join(&mut new_ctx.clone())
             });
-        Self::new_context_from(context, &s.into_boxed_str())
+        Self::new_context_from(context, s.into())
     }
 
     pub(crate) fn context_mut(&mut self) -> Option<&mut NixContext> {
@@ -436,37 +503,11 @@ fn nix_escape_string(input: &str) -> Cow<str> {
 impl Display for NixString {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         f.write_str("\"")?;
-        f.write_str(&nix_escape_string(self.as_str()))?;
+        f.write_str(&nix_escape_string(&self.to_str_lossy()))?;
         f.write_str("\"")
     }
 }
 
-impl AsRef<str> for NixString {
-    fn as_ref(&self) -> &str {
-        self.as_str()
-    }
-}
-
-impl AsRef<OsStr> for NixString {
-    fn as_ref(&self) -> &OsStr {
-        self.as_str().as_ref()
-    }
-}
-
-impl AsRef<Path> for NixString {
-    fn as_ref(&self) -> &Path {
-        self.as_str().as_ref()
-    }
-}
-
-impl Deref for NixString {
-    type Target = str;
-
-    fn deref(&self) -> &Self::Target {
-        self.as_str()
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;