about summary refs log tree commit diff
path: root/tvix/eval/src/builtins/mod.rs
diff options
context:
space:
mode:
authorAspen Smith <root@gws.fyi>2023-12-05T22·25-0500
committeraspen <root@gws.fyi>2024-01-31T14·51+0000
commit201173afaca7d70aa039a1e37a91c49af3a99b0b (patch)
treed661ca257820aca975339ee7d17dd1a08df85932 /tvix/eval/src/builtins/mod.rs
parent6f9e25943f3e2f83d191cadcc76a278073626fe8 (diff)
fix(tvix): Represent strings as byte arrays r/7460
C++ nix uses C-style zero-terminated char pointers to represent strings
internally - however, up to this point, tvix has used Rust `String` and
`str` for string values. Since those are required to be valid utf-8, we
haven't been able to properly represent all the string values that Nix
supports.

To fix that, this change converts the internal representation of the
NixString struct from `Box<str>` to `BString`, from the `bstr` crate -
this is a wrapper around a `Vec<u8>` with extra functions for treating
that byte vector as a "morally string-like" value, which is basically
exactly what we need.

Since this changes a pretty fundamental assumption about a pretty core
type, there are a *lot* of changes in a lot of places to make this work,
but I've tried to keep the general philosophy and intent of most of the
code in most places intact. Most notably, there's nothing that's been
done to make the derivation stuff in //tvix/glue work with non-utf8
strings everywhere, instead opting to just convert to String/str when
passing things into that - there *might* be something to be done there,
but I don't know what the rules should be and I don't want to figure
them out in this change.

To deal with OS-native paths in a way that also works in WASM for
tvixbolt, this also adds a dependency on the "os_str_bytes" crate.

Fixes: b/189
Fixes: b/337
Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200
Reviewed-by: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: aspen <root@gws.fyi>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/eval/src/builtins/mod.rs')
-rw-r--r--tvix/eval/src/builtins/mod.rs109
1 files changed, 62 insertions, 47 deletions
diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs
index dc3ea3fc15d4..6ccae2f9de1c 100644
--- a/tvix/eval/src/builtins/mod.rs
+++ b/tvix/eval/src/builtins/mod.rs
@@ -3,6 +3,7 @@
 //! See //tvix/eval/docs/builtins.md for a some context on the
 //! available builtins in Nix.
 
+use bstr::ByteVec;
 use builtin_macros::builtins;
 use genawaiter::rc::Gen;
 use imbl::OrdMap;
@@ -66,7 +67,7 @@ pub async fn coerce_value_to_path(
     .await
     {
         Ok(vs) => {
-            let path = PathBuf::from(vs.as_str());
+            let path = (**vs).clone().into_path_buf()?;
             if path.is_absolute() {
                 Ok(Ok(path))
             } else {
@@ -79,8 +80,12 @@ pub async fn coerce_value_to_path(
 
 #[builtins]
 mod pure_builtins {
+    use std::ffi::OsString;
+
+    use bstr::{BString, ByteSlice};
     use imbl::Vector;
     use itertools::Itertools;
+    use os_str_bytes::OsStringBytes;
 
     use crate::{value::PointerEquality, NixContext, NixContextElement};
 
@@ -187,7 +192,7 @@ mod pure_builtins {
     #[builtin("baseNameOf")]
     async fn builtin_base_name_of(co: GenCo, s: Value) -> Result<Value, ErrorKind> {
         let span = generators::request_span(&co).await;
-        let s = match s {
+        let mut s = match s {
             val @ Value::Catchable(_) => return Ok(val),
             _ => s
                 .coerce_to_string(
@@ -201,11 +206,12 @@ mod pure_builtins {
                 .await?
                 .to_contextful_str()?,
         };
-        let result: NixString = NixString::new_inherit_context_from(
-            &s,
-            s.rsplit_once('/').map(|(_, x)| x).unwrap_or(&s),
-        );
-        Ok(result.into())
+
+        let bs = s.as_mut_bstring();
+        if let Some(last_slash) = bs.rfind_char('/') {
+            *bs = bs[(last_slash + 1)..].into();
+        }
+        Ok(s.into())
     }
 
     #[builtin("bitAnd")]
@@ -240,7 +246,7 @@ mod pure_builtins {
         for item in list.into_iter() {
             let set = generators::request_force(&co, item).await.to_attrs()?;
 
-            if let Some(value) = set.select(key.as_str()) {
+            if let Some(value) = set.select(&key) {
                 output.push(value.clone());
             }
         }
@@ -256,9 +262,9 @@ mod pure_builtins {
     #[builtin("compareVersions")]
     async fn builtin_compare_versions(co: GenCo, x: Value, y: Value) -> Result<Value, ErrorKind> {
         let s1 = x.to_str()?;
-        let s1 = VersionPartsIter::new_for_cmp(s1.as_str());
+        let s1 = VersionPartsIter::new_for_cmp((&s1).into());
         let s2 = y.to_str()?;
-        let s2 = VersionPartsIter::new_for_cmp(s2.as_str());
+        let s2 = VersionPartsIter::new_for_cmp((&s2).into());
 
         match s1.cmp(s2) {
             std::cmp::Ordering::Less => Ok(Value::Integer(-1)),
@@ -323,7 +329,7 @@ mod pure_builtins {
             context = context.join(sep_context);
         }
         let list = list.to_list()?;
-        let mut res = String::new();
+        let mut res = BString::default();
         for (i, val) in list.into_iter().enumerate() {
             if i != 0 {
                 res.push_str(&separator);
@@ -339,7 +345,7 @@ mod pure_builtins {
             .await
             {
                 Ok(mut s) => {
-                    res.push_str(s.as_str());
+                    res.push_str(&s);
                     if let Some(ref mut other_context) = s.context_mut() {
                         // It is safe to consume the other context here
                         // because the `list` and `separator` are originally
@@ -353,7 +359,7 @@ mod pure_builtins {
             }
         }
         // FIXME: pass immediately the string res.
-        Ok(NixString::new_context_from(context, &res).into())
+        Ok(NixString::new_context_from(context, res).into())
     }
 
     #[builtin("deepSeq")]
@@ -383,17 +389,24 @@ mod pure_builtins {
             .await?
             .to_contextful_str()?;
         let result = str
-            .rsplit_once('/')
-            .map(|(x, _)| match x {
-                "" => "/",
-                _ => x,
+            .rfind_char('/')
+            .map(|last_slash| {
+                let x = &str[..last_slash];
+                if x.is_empty() {
+                    b"/"
+                } else {
+                    x
+                }
             })
-            .unwrap_or(".");
+            .unwrap_or(b".");
         if is_path {
-            Ok(Value::Path(Box::new(result.into())))
+            Ok(Value::Path(Box::new(PathBuf::from(
+                OsString::assert_from_raw_vec(result.to_owned()),
+            ))))
         } else {
             Ok(Value::String(NixString::new_inherit_context_from(
-                &str, result,
+                &str,
+                result.into(),
             )))
         }
     }
@@ -519,7 +532,7 @@ mod pure_builtins {
 
         let json_str = json.to_str()?;
 
-        serde_json::from_str(&json_str).map_err(|err| err.into())
+        serde_json::from_slice(&json_str).map_err(|err| err.into())
     }
 
     #[builtin("toJSON")]
@@ -537,7 +550,7 @@ mod pure_builtins {
     async fn builtin_from_toml(co: GenCo, toml: Value) -> Result<Value, ErrorKind> {
         let toml_str = toml.to_str()?;
 
-        toml::from_str(&toml_str).map_err(|err| err.into())
+        toml::from_str(toml_str.to_str()?).map_err(|err| err.into())
     }
 
     #[builtin("filterSource")]
@@ -632,7 +645,7 @@ mod pure_builtins {
         let k = key.to_str()?;
         let xs = set.to_attrs()?;
 
-        match xs.select(k.as_str()) {
+        match xs.select(&k) {
             Some(x) => Ok(x.clone()),
             None => Err(ErrorKind::AttributeNotFound {
                 name: k.to_string(),
@@ -680,7 +693,7 @@ mod pure_builtins {
         let k = key.to_str()?;
         let xs = set.to_attrs()?;
 
-        Ok(Value::Bool(xs.contains(k.as_str())))
+        Ok(Value::Bool(xs.contains(&k)))
     }
 
     #[builtin("hasContext")]
@@ -1069,8 +1082,8 @@ mod pure_builtins {
             return Ok(re);
         }
         let re = re.to_str()?;
-        let re: Regex = Regex::new(&format!("^{}$", re.as_str())).unwrap();
-        match re.captures(&s) {
+        let re: Regex = Regex::new(&format!("^{}$", re.to_str()?)).unwrap();
+        match re.captures(s.to_str()?) {
             Some(caps) => Ok(Value::List(
                 caps.iter()
                     .skip(1)
@@ -1106,7 +1119,7 @@ mod pure_builtins {
         // This replicates cppnix's (mis?)handling of codepoints
         // above U+007f following 0x2d ('-')
         let s = s.to_str()?;
-        let slice: &[u8] = s.as_str().as_ref();
+        let slice: &[u8] = s.as_ref();
         let (name, dash_and_version) = slice.split_at(
             slice
                 .windows(2)
@@ -1219,7 +1232,7 @@ mod pure_builtins {
 
         let mut string = s.to_contextful_str()?;
 
-        let mut res = String::new();
+        let mut res = BString::default();
 
         let mut i: usize = 0;
         let mut empty_string_replace = false;
@@ -1248,27 +1261,27 @@ mod pure_builtins {
                 // We already applied a from->to with an empty from
                 // transformation.
                 // Let's skip it so that we don't loop infinitely
-                if empty_string_replace && from.as_str().is_empty() {
+                if empty_string_replace && from.is_empty() {
                     continue;
                 }
 
                 // if we match the `from` string, let's replace
-                if &string[i..i + from.len()] == from.as_str() {
-                    res += &to;
+                if string[i..i + from.len()] == *from {
+                    res.push_str(&to);
                     i += from.len();
                     if let Some(to_ctx) = to.context_mut() {
                         context = context.join(to_ctx);
                     }
 
                     // remember if we applied the empty from->to
-                    empty_string_replace = from.as_str().is_empty();
+                    empty_string_replace = from.is_empty();
 
                     continue 'outer;
                 }
             }
 
             // If we don't match any `from`, we simply add a character
-            res += &string[i..i + 1];
+            res.push_str(&string[i..i + 1]);
             i += 1;
 
             // Since we didn't apply anything transformation,
@@ -1286,8 +1299,8 @@ mod pure_builtins {
             // We don't need to merge again the context, it's already in the right state.
             let mut to = elem.1.to_contextful_str()?;
 
-            if from.as_str().is_empty() {
-                res += &to;
+            if from.is_empty() {
+                res.push_str(&to);
                 if let Some(to_ctx) = to.context_mut() {
                     context = context.join(to_ctx);
                 }
@@ -1295,8 +1308,7 @@ mod pure_builtins {
             }
         }
 
-        // FIXME: consume directly the String.
-        Ok(Value::String(NixString::new_context_from(context, &res)))
+        Ok(Value::String(NixString::new_context_from(context, res)))
     }
 
     #[builtin("seq")]
@@ -1317,9 +1329,9 @@ mod pure_builtins {
         }
 
         let s = str.to_contextful_str()?;
-        let text = s.as_str();
+        let text = s.to_str()?;
         let re = regex.to_str()?;
-        let re: Regex = Regex::new(re.as_str()).unwrap();
+        let re = Regex::new(re.to_str()?).unwrap();
         let mut capture_locations = re.capture_locations();
         let num_captures = capture_locations.len();
         let mut ret = imbl::Vector::new();
@@ -1329,7 +1341,7 @@ mod pure_builtins {
             // push the unmatched characters preceding the match
             ret.push_back(Value::from(NixString::new_inherit_context_from(
                 &s,
-                &text[pos..thematch.start()],
+                (&text[pos..thematch.start()]).into(),
             )));
 
             // Push a list with one element for each capture
@@ -1380,7 +1392,7 @@ mod pure_builtins {
             return Ok(s);
         }
         let s = s.to_str()?;
-        let s = VersionPartsIter::new(s.as_str());
+        let s = VersionPartsIter::new((&s).into());
 
         let parts = s
             .map(|s| {
@@ -1412,7 +1424,7 @@ mod pure_builtins {
             return Ok(s);
         }
 
-        Ok(Value::Integer(s.to_contextful_str()?.as_str().len() as i64))
+        Ok(Value::Integer(s.to_contextful_str()?.len() as i64))
     }
 
     #[builtin("sub")]
@@ -1453,19 +1465,22 @@ mod pure_builtins {
         // Nix doesn't assert that the length argument is
         // non-negative when the starting index is GTE the
         // string's length.
-        if beg >= x.as_str().len() {
-            return Ok(Value::String(NixString::new_inherit_context_from(&x, "")));
+        if beg >= x.len() {
+            return Ok(Value::String(NixString::new_inherit_context_from(
+                &x,
+                BString::default(),
+            )));
         }
 
         let end = if len < 0 {
-            x.as_str().len()
+            x.len()
         } else {
-            cmp::min(beg + (len as usize), x.as_str().len())
+            cmp::min(beg + (len as usize), x.len())
         };
 
         Ok(Value::String(NixString::new_inherit_context_from(
             &x,
-            &x[beg..end],
+            (&x[beg..end]).into(),
         )))
     }