diff options
author | Aspen Smith <root@gws.fyi> | 2023-12-05T22·25-0500 |
---|---|---|
committer | aspen <root@gws.fyi> | 2024-01-31T14·51+0000 |
commit | 201173afaca7d70aa039a1e37a91c49af3a99b0b (patch) | |
tree | d661ca257820aca975339ee7d17dd1a08df85932 /tvix/eval/src/value/mod.rs | |
parent | 6f9e25943f3e2f83d191cadcc76a278073626fe8 (diff) |
fix(tvix): Represent strings as byte arrays r/7460
C++ nix uses C-style zero-terminated char pointers to represent strings internally - however, up to this point, tvix has used Rust `String` and `str` for string values. Since those are required to be valid utf-8, we haven't been able to properly represent all the string values that Nix supports.

To fix that, this change converts the internal representation of the NixString struct from `Box<str>` to `BString`, from the `bstr` crate - this is a wrapper around a `Vec<u8>` with extra functions for treating that byte vector as a "morally string-like" value, which is basically exactly what we need.

Since this changes a pretty fundamental assumption about a pretty core type, there are a *lot* of changes in a lot of places to make this work, but I've tried to keep the general philosophy and intent of most of the code in most places intact. Most notably, there's nothing that's been done to make the derivation stuff in //tvix/glue work with non-utf8 strings everywhere, instead opting to just convert to String/str when passing things into that - there *might* be something to be done there, but I don't know what the rules should be and I don't want to figure them out in this change.

To deal with OS-native paths in a way that also works in WASM for tvixbolt, this also adds a dependency on the "os_str_bytes" crate.

Fixes: b/189
Fixes: b/337
Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200
Reviewed-by: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
Autosubmit: aspen <root@gws.fyi>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/eval/src/value/mod.rs')
-rw-r--r-- | tvix/eval/src/value/mod.rs | 28 |
1 files changed, 13 insertions, 15 deletions
diff --git a/tvix/eval/src/value/mod.rs b/tvix/eval/src/value/mod.rs index ccca0bea5b2e..e181b3a2da4e 100644 --- a/tvix/eval/src/value/mod.rs +++ b/tvix/eval/src/value/mod.rs @@ -6,6 +6,7 @@ use std::num::{NonZeroI32, NonZeroUsize}; use std::path::PathBuf; use std::rc::Rc; +use bstr::{BString, ByteVec}; use lexical_core::format::CXX_LITERAL; use serde::Deserialize; @@ -313,7 +314,7 @@ impl Value { kind: CoercionKind, span: LightSpan, ) -> Result<Value, ErrorKind> { - let mut result = String::new(); + let mut result = BString::default(); let mut vals = vec![self]; // Track if we are coercing the first value of a list to correctly emit // separating white spaces. @@ -326,18 +327,15 @@ impl Value { let value = if let Some(v) = vals.pop() { v.force(co, span.clone()).await? } else { - return Ok(Value::String(NixString::new_context_from( - context, - result.as_str(), - ))); + return Ok(Value::String(NixString::new_context_from(context, result))); }; - let coerced = match (value, kind) { + let coerced: Result<BString, _> = match (value, kind) { // coercions that are always done (Value::String(mut s), _) => { if let Some(ctx) = s.context_mut() { context = context.join(ctx); } - Ok(s.as_str().to_owned()) + Ok(s.into()) } // TODO(sterni): Think about proper encoding handling here. This needs @@ -357,7 +355,7 @@ impl Value { context = context.append(NixContextElement::Plain( imported.to_string_lossy().to_string(), )); - Ok(imported.to_string_lossy().into_owned()) + Ok(imported.into_os_string().into_encoded_bytes().into()) } ( Value::Path(p), @@ -365,7 +363,7 @@ impl Value { import_paths: false, .. }, - ) => Ok(p.to_string_lossy().into_owned()), + ) => Ok(p.into_os_string().into_encoded_bytes().into()), // Attribute sets can be converted to strings if they either have an // `__toString` attribute which holds a function that receives the @@ -397,14 +395,14 @@ impl Value { // strong coercions (Value::Null, CoercionKind { strong: true, .. 
}) - | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".to_owned()), - (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".to_owned()), + | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".into()), + (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".into()), - (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}")), + (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}").into()), (Value::Float(f), CoercionKind { strong: true, .. }) => { // contrary to normal Display, coercing a float to a string will // result in unconditional 6 decimal places - Ok(format!("{:.6}", f)) + Ok(format!("{:.6}", f).into()) } // Lists are coerced by coercing their elements and interspersing spaces @@ -448,7 +446,7 @@ impl Value { if let Some(head) = is_list_head { if !head { - result.push(' '); + result.push(b' '); } else { is_list_head = Some(false); } @@ -576,7 +574,7 @@ impl Value { let s2 = s2.to_str(); if let (Ok(s1), Ok(s2)) = (s1, s2) { - if s1.as_str() == "derivation" && s2.as_str() == "derivation" { + if s1 == "derivation" && s2 == "derivation" { // TODO(tazjin): are the outPaths really required, // or should it fall through? let out1 = a1 |