From 201173afaca7d70aa039a1e37a91c49af3a99b0b Mon Sep 17 00:00:00 2001 From: Aspen Smith Date: Tue, 5 Dec 2023 17:25:52 -0500 Subject: fix(tvix): Represent strings as byte arrays C++ nix uses C-style zero-terminated char pointers to represent strings internally - however, up to this point, tvix has used Rust `String` and `str` for string values. Since those are required to be valid utf-8, we haven't been able to properly represent all the string values that Nix supports. To fix that, this change converts the internal representation of the NixString struct from `Box` to `BString`, from the `bstr` crate - this is a wrapper around a `Vec` with extra functions for treating that byte vector as a "morally string-like" value, which is basically exactly what we need. Since this changes a pretty fundamental assumption about a pretty core type, there are a *lot* of changes in a lot of places to make this work, but I've tried to keep the general philosophy and intent of most of the code in most places intact. Most notably, there's nothing that's been done to make the derivation stuff in //tvix/glue work with non-utf8 strings everywhere, instead opting to just convert to String/str when passing things into that - there *might* be something to be done there, but I don't know what the rules should be and I don't want to figure them out in this change. To deal with OS-native paths in a way that also works in WASM for tvixbolt, this also adds a dependency on the "os_str_bytes" crate. Fixes: b/189 Fixes: b/337 Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526 Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200 Reviewed-by: tazjin Reviewed-by: flokli Reviewed-by: sterni Autosubmit: aspen Tested-by: BuildkiteCI --- tvix/eval/src/builtins/impure.rs | 7 +- tvix/eval/src/builtins/mod.rs | 109 +++++++++++++++------------ tvix/eval/src/builtins/to_xml.rs | 9 ++- tvix/eval/src/builtins/versions.rs | 38 ++++++---- tvix/eval/src/compiler/import.rs | 5 +- tvix/eval/src/compiler/mod.rs | 2 +- tvix/eval/src/errors.rs | 27 +++++-- tvix/eval/src/value/attrs.rs | 41 ++++++---- tvix/eval/src/value/attrs/tests.rs | 4 +- tvix/eval/src/value/json.rs | 7 +- tvix/eval/src/value/mod.rs | 28 ++++--- tvix/eval/src/value/string.rs | 149 +++++++++++++++++++++++-------------- tvix/eval/src/vm/mod.rs | 25 ++++--- 13 files changed, 278 insertions(+), 173 deletions(-) (limited to 'tvix/eval/src') diff --git a/tvix/eval/src/builtins/impure.rs b/tvix/eval/src/builtins/impure.rs index 5324fd565304..28b8697644dd 100644 --- a/tvix/eval/src/builtins/impure.rs +++ b/tvix/eval/src/builtins/impure.rs @@ -17,12 +17,17 @@ use crate::{ #[builtins] mod impure_builtins { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + use super::*; use crate::builtins::coerce_value_to_path; #[builtin("getEnv")] async fn builtin_get_env(co: GenCo, var: Value) -> Result { - Ok(env::var(var.to_str()?).unwrap_or_else(|_| "".into()).into()) + Ok(env::var(OsStr::from_bytes(&var.to_str()?)) + .unwrap_or_else(|_| "".into()) + .into()) } #[builtin("pathExists")] diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs index dc3ea3fc15d4..6ccae2f9de1c 100644 --- a/tvix/eval/src/builtins/mod.rs +++ b/tvix/eval/src/builtins/mod.rs @@ -3,6 +3,7 @@ //! See //tvix/eval/docs/builtins.md for a some context on the //! available builtins in Nix. +use bstr::ByteVec; use builtin_macros::builtins; use genawaiter::rc::Gen; use imbl::OrdMap; @@ -66,7 +67,7 @@ pub async fn coerce_value_to_path( .await { Ok(vs) => { - let path = PathBuf::from(vs.as_str()); + let path = (**vs).clone().into_path_buf()?; if path.is_absolute() { Ok(Ok(path)) } else { @@ -79,8 +80,12 @@ pub async fn coerce_value_to_path( #[builtins] mod pure_builtins { + use std::ffi::OsString; + + use bstr::{BString, ByteSlice}; use imbl::Vector; use itertools::Itertools; + use os_str_bytes::OsStringBytes; use crate::{value::PointerEquality, NixContext, NixContextElement}; @@ -187,7 +192,7 @@ mod pure_builtins { #[builtin("baseNameOf")] async fn builtin_base_name_of(co: GenCo, s: Value) -> Result { let span = generators::request_span(&co).await; - let s = match s { + let mut s = match s { val @ Value::Catchable(_) => return Ok(val), _ => s .coerce_to_string( @@ -201,11 +206,12 @@ mod pure_builtins { .await? .to_contextful_str()?, }; - let result: NixString = NixString::new_inherit_context_from( - &s, - s.rsplit_once('/').map(|(_, x)| x).unwrap_or(&s), - ); - Ok(result.into()) + + let bs = s.as_mut_bstring(); + if let Some(last_slash) = bs.rfind_char('/') { + *bs = bs[(last_slash + 1)..].into(); + } + Ok(s.into()) } #[builtin("bitAnd")] @@ -240,7 +246,7 @@ mod pure_builtins { for item in list.into_iter() { let set = generators::request_force(&co, item).await.to_attrs()?; - if let Some(value) = set.select(key.as_str()) { + if let Some(value) = set.select(&key) { output.push(value.clone()); } } @@ -256,9 +262,9 @@ mod pure_builtins { #[builtin("compareVersions")] async fn builtin_compare_versions(co: GenCo, x: Value, y: Value) -> Result { let s1 = x.to_str()?; - let s1 = VersionPartsIter::new_for_cmp(s1.as_str()); + let s1 = VersionPartsIter::new_for_cmp((&s1).into()); let s2 = y.to_str()?; - let s2 = VersionPartsIter::new_for_cmp(s2.as_str()); + let s2 = VersionPartsIter::new_for_cmp((&s2).into()); match s1.cmp(s2) { std::cmp::Ordering::Less => Ok(Value::Integer(-1)), @@ -323,7 +329,7 @@ mod pure_builtins { context = context.join(sep_context); } let list = list.to_list()?; - let mut res = String::new(); + let mut res = BString::default(); for (i, val) in list.into_iter().enumerate() { if i != 0 { res.push_str(&separator); @@ -339,7 +345,7 @@ mod pure_builtins { .await { Ok(mut s) => { - res.push_str(s.as_str()); + res.push_str(&s); if let Some(ref mut other_context) = s.context_mut() { // It is safe to consume the other context here // because the `list` and `separator` are originally @@ -353,7 +359,7 @@ mod pure_builtins { } } // FIXME: pass immediately the string res. - Ok(NixString::new_context_from(context, &res).into()) + Ok(NixString::new_context_from(context, res).into()) } #[builtin("deepSeq")] @@ -383,17 +389,24 @@ mod pure_builtins { .await? .to_contextful_str()?; let result = str - .rsplit_once('/') - .map(|(x, _)| match x { - "" => "/", - _ => x, + .rfind_char('/') + .map(|last_slash| { + let x = &str[..last_slash]; + if x.is_empty() { + b"/" + } else { + x + } }) - .unwrap_or("."); + .unwrap_or(b"."); if is_path { - Ok(Value::Path(Box::new(result.into()))) + Ok(Value::Path(Box::new(PathBuf::from( + OsString::assert_from_raw_vec(result.to_owned()), + )))) } else { Ok(Value::String(NixString::new_inherit_context_from( - &str, result, + &str, + result.into(), ))) } } @@ -519,7 +532,7 @@ mod pure_builtins { let json_str = json.to_str()?; - serde_json::from_str(&json_str).map_err(|err| err.into()) + serde_json::from_slice(&json_str).map_err(|err| err.into()) } #[builtin("toJSON")] @@ -537,7 +550,7 @@ mod pure_builtins { async fn builtin_from_toml(co: GenCo, toml: Value) -> Result { let toml_str = toml.to_str()?; - toml::from_str(&toml_str).map_err(|err| err.into()) + toml::from_str(toml_str.to_str()?).map_err(|err| err.into()) } #[builtin("filterSource")] @@ -632,7 +645,7 @@ mod pure_builtins { let k = key.to_str()?; let xs = set.to_attrs()?; - match xs.select(k.as_str()) { + match xs.select(&k) { Some(x) => Ok(x.clone()), None => Err(ErrorKind::AttributeNotFound { name: k.to_string(), @@ -680,7 +693,7 @@ mod pure_builtins { let k = key.to_str()?; let xs = set.to_attrs()?; - Ok(Value::Bool(xs.contains(k.as_str()))) + Ok(Value::Bool(xs.contains(&k))) } #[builtin("hasContext")] @@ -1069,8 +1082,8 @@ mod pure_builtins { return Ok(re); } let re = re.to_str()?; - let re: Regex = Regex::new(&format!("^{}$", re.as_str())).unwrap(); - match re.captures(&s) { + let re: Regex = Regex::new(&format!("^{}$", re.to_str()?)).unwrap(); + match re.captures(s.to_str()?) { Some(caps) => Ok(Value::List( caps.iter() .skip(1) @@ -1106,7 +1119,7 @@ mod pure_builtins { // This replicates cppnix's (mis?)handling of codepoints // above U+007f following 0x2d ('-') let s = s.to_str()?; - let slice: &[u8] = s.as_str().as_ref(); + let slice: &[u8] = s.as_ref(); let (name, dash_and_version) = slice.split_at( slice .windows(2) @@ -1219,7 +1232,7 @@ mod pure_builtins { let mut string = s.to_contextful_str()?; - let mut res = String::new(); + let mut res = BString::default(); let mut i: usize = 0; let mut empty_string_replace = false; @@ -1248,27 +1261,27 @@ mod pure_builtins { // We already applied a from->to with an empty from // transformation. // Let's skip it so that we don't loop infinitely - if empty_string_replace && from.as_str().is_empty() { + if empty_string_replace && from.is_empty() { continue; } // if we match the `from` string, let's replace - if &string[i..i + from.len()] == from.as_str() { - res += &to; + if string[i..i + from.len()] == *from { + res.push_str(&to); i += from.len(); if let Some(to_ctx) = to.context_mut() { context = context.join(to_ctx); } // remember if we applied the empty from->to - empty_string_replace = from.as_str().is_empty(); + empty_string_replace = from.is_empty(); continue 'outer; } } // If we don't match any `from`, we simply add a character - res += &string[i..i + 1]; + res.push_str(&string[i..i + 1]); i += 1; // Since we didn't apply anything transformation, @@ -1286,8 +1299,8 @@ mod pure_builtins { // We don't need to merge again the context, it's already in the right state. let mut to = elem.1.to_contextful_str()?; - if from.as_str().is_empty() { - res += &to; + if from.is_empty() { + res.push_str(&to); if let Some(to_ctx) = to.context_mut() { context = context.join(to_ctx); } @@ -1295,8 +1308,7 @@ mod pure_builtins { } } - // FIXME: consume directly the String. - Ok(Value::String(NixString::new_context_from(context, &res))) + Ok(Value::String(NixString::new_context_from(context, res))) } #[builtin("seq")] @@ -1317,9 +1329,9 @@ mod pure_builtins { } let s = str.to_contextful_str()?; - let text = s.as_str(); + let text = s.to_str()?; let re = regex.to_str()?; - let re: Regex = Regex::new(re.as_str()).unwrap(); + let re = Regex::new(re.to_str()?).unwrap(); let mut capture_locations = re.capture_locations(); let num_captures = capture_locations.len(); let mut ret = imbl::Vector::new(); @@ -1329,7 +1341,7 @@ mod pure_builtins { // push the unmatched characters preceding the match ret.push_back(Value::from(NixString::new_inherit_context_from( &s, - &text[pos..thematch.start()], + (&text[pos..thematch.start()]).into(), ))); // Push a list with one element for each capture @@ -1380,7 +1392,7 @@ mod pure_builtins { return Ok(s); } let s = s.to_str()?; - let s = VersionPartsIter::new(s.as_str()); + let s = VersionPartsIter::new((&s).into()); let parts = s .map(|s| { @@ -1412,7 +1424,7 @@ mod pure_builtins { return Ok(s); } - Ok(Value::Integer(s.to_contextful_str()?.as_str().len() as i64)) + Ok(Value::Integer(s.to_contextful_str()?.len() as i64)) } #[builtin("sub")] @@ -1453,19 +1465,22 @@ mod pure_builtins { // Nix doesn't assert that the length argument is // non-negative when the starting index is GTE the // string's length. - if beg >= x.as_str().len() { - return Ok(Value::String(NixString::new_inherit_context_from(&x, ""))); + if beg >= x.len() { + return Ok(Value::String(NixString::new_inherit_context_from( + &x, + BString::default(), + ))); } let end = if len < 0 { - x.as_str().len() + x.len() } else { - cmp::min(beg + (len as usize), x.as_str().len()) + cmp::min(beg + (len as usize), x.len()) }; Ok(Value::String(NixString::new_inherit_context_from( &x, - &x[beg..end], + (&x[beg..end]).into(), ))) } diff --git a/tvix/eval/src/builtins/to_xml.rs b/tvix/eval/src/builtins/to_xml.rs index f23cb22db3d9..2f9a11e78852 100644 --- a/tvix/eval/src/builtins/to_xml.rs +++ b/tvix/eval/src/builtins/to_xml.rs @@ -2,6 +2,7 @@ //! of value information as well as internal tvix state that several //! things in nixpkgs rely on. +use bstr::ByteSlice; use std::{io::Write, rc::Rc}; use xml::writer::events::XmlEvent; use xml::writer::EmitterConfig; @@ -60,7 +61,7 @@ fn value_variant_to_xml(w: &mut EventWriter, value: &Value) -> Resu Value::Bool(b) => return write_typed_value(w, "bool", b), Value::Integer(i) => return write_typed_value(w, "int", i), Value::Float(f) => return write_typed_value(w, "float", f), - Value::String(s) => return write_typed_value(w, "string", s.as_str()), + Value::String(s) => return write_typed_value(w, "string", s.to_str()?), Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()), Value::List(list) => { @@ -77,7 +78,7 @@ fn value_variant_to_xml(w: &mut EventWriter, value: &Value) -> Resu w.write(XmlEvent::start_element("attrs"))?; for elem in attrs.iter() { - w.write(XmlEvent::start_element("attr").attr("name", elem.0.as_str()))?; + w.write(XmlEvent::start_element("attr").attr("name", &elem.0.to_str_lossy()))?; value_variant_to_xml(w, elem.1)?; w.write(XmlEvent::end_element())?; } @@ -101,7 +102,9 @@ fn value_variant_to_xml(w: &mut EventWriter, value: &Value) -> Resu w.write(attrspat)?; for arg in formals.arguments.iter() { - w.write(XmlEvent::start_element("attr").attr("name", arg.0.as_str()))?; + w.write( + XmlEvent::start_element("attr").attr("name", &arg.0.to_str_lossy()), + )?; w.write(XmlEvent::end_element())?; } diff --git a/tvix/eval/src/builtins/versions.rs b/tvix/eval/src/builtins/versions.rs index 79fb82b868fb..6de512142440 100644 --- a/tvix/eval/src/builtins/versions.rs +++ b/tvix/eval/src/builtins/versions.rs @@ -2,13 +2,15 @@ use std::cmp::Ordering; use std::iter::{once, Chain, Once}; use std::ops::RangeInclusive; +use bstr::{BStr, ByteSlice, B}; + /// Version strings can be broken up into Parts. /// One Part represents either a string of digits or characters. /// '.' and '_' represent deviders between parts and are not included in any part. #[derive(PartialEq, Eq, Clone, Debug)] pub enum VersionPart<'a> { - Word(&'a str), - Number(&'a str), + Word(&'a BStr), + Number(&'a BStr), } impl PartialOrd for VersionPart<'_> { @@ -23,15 +25,17 @@ impl Ord for VersionPart<'_> { (VersionPart::Number(s1), VersionPart::Number(s2)) => { // Note: C++ Nix uses `int`, but probably doesn't make a difference // We trust that the splitting was done correctly and parsing will work - let n1: u64 = s1.parse().unwrap(); - let n2: u64 = s2.parse().unwrap(); + let n1: u64 = s1.to_str_lossy().parse().unwrap(); + let n2: u64 = s2.to_str_lossy().parse().unwrap(); n1.cmp(&n2) } // `pre` looses unless the other part is also a `pre` - (VersionPart::Word("pre"), VersionPart::Word("pre")) => Ordering::Equal, - (VersionPart::Word("pre"), _) => Ordering::Less, - (_, VersionPart::Word("pre")) => Ordering::Greater, + (VersionPart::Word(x), VersionPart::Word(y)) if *x == B("pre") && *y == B("pre") => { + Ordering::Equal + } + (VersionPart::Word(x), _) if *x == B("pre") => Ordering::Less, + (_, VersionPart::Word(y)) if *y == B("pre") => Ordering::Greater, // Number wins against Word (VersionPart::Number(_), VersionPart::Word(_)) => Ordering::Greater, @@ -54,12 +58,12 @@ enum InternalPart { /// This can then be directly used to compare two versions pub struct VersionPartsIter<'a> { cached_part: InternalPart, - iter: std::str::CharIndices<'a>, - version: &'a str, + iter: bstr::CharIndices<'a>, + version: &'a BStr, } impl<'a> VersionPartsIter<'a> { - pub fn new(version: &'a str) -> Self { + pub fn new(version: &'a BStr) -> Self { Self { cached_part: InternalPart::Break, iter: version.char_indices(), @@ -77,8 +81,8 @@ impl<'a> VersionPartsIter<'a> { /// like `2.3 < 2.3.0pre` ensues. Luckily for us, this means that we can /// lexicographically compare two version strings, _if_ we append an extra /// component to both versions. - pub fn new_for_cmp(version: &'a str) -> Chain> { - Self::new(version).chain(once(VersionPart::Word(""))) + pub fn new_for_cmp(version: &'a BStr) -> Chain> { + Self::new(version).chain(once(VersionPart::Word("".into()))) } } @@ -101,7 +105,7 @@ impl<'a> Iterator for VersionPartsIter<'a> { } } - let (pos, char) = char.unwrap(); + let (start, end, char) = char.unwrap(); match char { // Divider encountered '.' | '-' => { @@ -119,7 +123,9 @@ impl<'a> Iterator for VersionPartsIter<'a> { _ if char.is_ascii_digit() => { let cached_part = std::mem::replace( &mut self.cached_part, - InternalPart::Number { range: pos..=pos }, + InternalPart::Number { + range: start..=(end - 1), + }, ); match cached_part { InternalPart::Number { range } => { @@ -135,7 +141,9 @@ impl<'a> Iterator for VersionPartsIter<'a> { // char encountered _ => { - let mut cached_part = InternalPart::Word { range: pos..=pos }; + let mut cached_part = InternalPart::Word { + range: start..=(end - 1), + }; std::mem::swap(&mut cached_part, &mut self.cached_part); match cached_part { InternalPart::Word { range } => { diff --git a/tvix/eval/src/compiler/import.rs b/tvix/eval/src/compiler/import.rs index 94a6602e9df6..4e8ea7195d81 100644 --- a/tvix/eval/src/compiler/import.rs +++ b/tvix/eval/src/compiler/import.rs @@ -6,6 +6,7 @@ //! instance, or observers). use super::GlobalsMap; +use bstr::ByteSlice; use genawaiter::rc::Gen; use std::rc::Weak; @@ -40,11 +41,11 @@ async fn import_impl( // TODO(tazjin): make this return a string directly instead let contents: Value = generators::request_read_to_string(&co, path.clone()).await; - let contents = contents.to_str()?.as_str().to_string(); + let contents = contents.to_str()?.to_str()?.to_owned(); let parsed = rnix::ast::Root::parse(&contents); let errors = parsed.errors(); - let file = source.add_file(path.to_string_lossy().to_string(), contents); + let file = source.add_file(path.to_string_lossy().to_string(), contents.to_owned()); if !errors.is_empty() { return Err(ErrorKind::ImportParseError { diff --git a/tvix/eval/src/compiler/mod.rs b/tvix/eval/src/compiler/mod.rs index 53a99a453d9e..b4ef7973c7e8 100644 --- a/tvix/eval/src/compiler/mod.rs +++ b/tvix/eval/src/compiler/mod.rs @@ -727,7 +727,7 @@ impl Compiler<'_> { if let (Some(attr), None) = (path_iter.next(), path_iter.next()) { // Only do this optimisation for statically known attrs. if let Some(ident) = expr_static_attr_str(&attr) { - if let Some(selected_value) = attrs.select(ident.as_str()) { + if let Some(selected_value) = attrs.select(ident.as_bytes()) { *constant = selected_value.clone(); // If this worked, we can unthunk the current thunk. diff --git a/tvix/eval/src/errors.rs b/tvix/eval/src/errors.rs index 1bd2e47e93ec..d5cb6ef34b0d 100644 --- a/tvix/eval/src/errors.rs +++ b/tvix/eval/src/errors.rs @@ -178,6 +178,9 @@ pub enum ErrorKind { formals_span: Span, }, + /// Invalid UTF-8 was encoutered somewhere + Utf8, + /// Errors while serialising to XML. Xml(Rc), @@ -245,6 +248,18 @@ impl From for ErrorKind { } } +impl From for ErrorKind { + fn from(_: bstr::Utf8Error) -> Self { + Self::Utf8 + } +} + +impl From for ErrorKind { + fn from(_value: bstr::FromUtf8Error) -> Self { + Self::Utf8 + } +} + impl From for ErrorKind { fn from(err: XmlError) -> Self { Self::Xml(Rc::new(err)) @@ -457,11 +472,11 @@ to a missing value in the attribute set(s) included via `with`."#, } ErrorKind::UnexpectedArgument { arg, .. } => { - write!( - f, - "Unexpected argument `{}` supplied to function", - arg.as_str() - ) + write!(f, "Unexpected argument `{arg}` supplied to function",) + } + + ErrorKind::Utf8 => { + write!(f, "Invalid UTF-8 in string") } ErrorKind::Xml(error) => write!(f, "failed to serialise to XML: {error}"), @@ -775,6 +790,7 @@ impl Error { | ErrorKind::NotSerialisableToJson(_) | ErrorKind::FromTomlError(_) | ErrorKind::Xml(_) + | ErrorKind::Utf8 | ErrorKind::TvixError(_) | ErrorKind::TvixBug { .. } | ErrorKind::NotImplemented(_) @@ -819,6 +835,7 @@ impl Error { ErrorKind::FromTomlError(_) => "E035", ErrorKind::NotSerialisableToJson(_) => "E036", ErrorKind::UnexpectedContext => "E037", + ErrorKind::Utf8 => "E038", // Special error code for errors from other Tvix // components. We may want to introduce a code namespacing diff --git a/tvix/eval/src/value/attrs.rs b/tvix/eval/src/value/attrs.rs index b80d8b2db84e..2027653c3f39 100644 --- a/tvix/eval/src/value/attrs.rs +++ b/tvix/eval/src/value/attrs.rs @@ -5,8 +5,10 @@ //! //! Due to this, construction and management of attribute sets has //! some peculiarities that are encapsulated within this module. +use std::borrow::Borrow; use std::iter::FromIterator; +use bstr::BStr; use imbl::{ordmap, OrdMap}; use lazy_static::lazy_static; use serde::de::{Deserializer, Error, Visitor}; @@ -67,25 +69,25 @@ impl AttrsRep { } } - fn select(&self, key: &str) -> Option<&Value> { + fn select(&self, key: &BStr) -> Option<&Value> { match self { AttrsRep::Empty => None, - AttrsRep::KV { name, value } => match key { - "name" => Some(name), - "value" => Some(value), + AttrsRep::KV { name, value } => match &**key { + b"name" => Some(name), + b"value" => Some(value), _ => None, }, - AttrsRep::Im(map) => map.get(&key.into()), + AttrsRep::Im(map) => map.get(key), } } - fn contains(&self, key: &str) -> bool { + fn contains(&self, key: &BStr) -> bool { match self { AttrsRep::Empty => false, AttrsRep::KV { .. } => key == "name" || key == "value", - AttrsRep::Im(map) => map.contains_key(&key.into()), + AttrsRep::Im(map) => map.contains_key(key), } } } @@ -264,19 +266,30 @@ impl NixAttrs { } /// Select a value from an attribute set by key. - pub fn select(&self, key: &str) -> Option<&Value> { - self.0.select(key) + pub fn select(&self, key: &K) -> Option<&Value> + where + K: Borrow + ?Sized, + { + self.0.select(key.borrow()) } /// Select a required value from an attribute set by key, return /// an `AttributeNotFound` error if it is missing. - pub fn select_required(&self, key: &str) -> Result<&Value, ErrorKind> { + pub fn select_required(&self, key: &K) -> Result<&Value, ErrorKind> + where + K: Borrow + ?Sized, + { self.select(key) - .ok_or_else(|| ErrorKind::AttributeNotFound { name: key.into() }) + .ok_or_else(|| ErrorKind::AttributeNotFound { + name: key.borrow().to_string(), + }) } - pub fn contains(&self, key: &str) -> bool { - self.0.contains(key) + pub fn contains<'a, K: 'a>(&self, key: K) -> bool + where + &'a BStr: From, + { + self.0.contains(key.into()) } /// Construct an iterator over all the key-value pairs in the attribute set. @@ -423,7 +436,7 @@ fn attempt_optimise_kv(slice: &mut [Value]) -> Option { fn set_attr(attrs: &mut NixAttrs, key: NixString, value: Value) -> Result<(), ErrorKind> { match attrs.0.map_mut().entry(key) { imbl::ordmap::Entry::Occupied(entry) => Err(ErrorKind::DuplicateAttrsKey { - key: entry.key().as_str().to_string(), + key: entry.key().to_string(), }), imbl::ordmap::Entry::Vacant(entry) => { diff --git a/tvix/eval/src/value/attrs/tests.rs b/tvix/eval/src/value/attrs/tests.rs index 5c693873cd75..d269044b32ba 100644 --- a/tvix/eval/src/value/attrs/tests.rs +++ b/tvix/eval/src/value/attrs/tests.rs @@ -1,3 +1,5 @@ +use bstr::B; + use super::*; #[test] @@ -99,6 +101,6 @@ fn test_map_attrs_iter() { let mut iter = attrs.iter().collect::>().into_iter(); let (k, v) = iter.next().unwrap(); assert!(k == &NixString::from("key")); - assert!(v.to_str().unwrap().as_str() == "value"); + assert_eq!(v.to_str().unwrap(), B("value")); assert!(iter.next().is_none()); } diff --git a/tvix/eval/src/value/json.rs b/tvix/eval/src/value/json.rs index 8c31bd0e6082..4e915a36e55a 100644 --- a/tvix/eval/src/value/json.rs +++ b/tvix/eval/src/value/json.rs @@ -7,6 +7,7 @@ use super::{CoercionKind, Value}; use crate::errors::{CatchableErrorKind, ErrorKind}; use crate::generators::{self, GenCo}; +use bstr::ByteSlice; use serde_json::value::to_value; use serde_json::Value as Json; // name clash with *our* `Value` use serde_json::{Map, Number}; @@ -23,7 +24,7 @@ impl Value { Value::Bool(b) => Json::Bool(b), Value::Integer(i) => Json::Number(Number::from(i)), Value::Float(f) => to_value(f)?, - Value::String(s) => Json::String(s.as_str().into()), + Value::String(s) => Json::String(s.to_str()?.to_owned()), Value::Path(p) => { let imported = generators::request_path_import(co, *p).await; @@ -61,7 +62,7 @@ impl Value { .await? { Value::Catchable(cek) => return Ok(Err(cek)), - Value::String(s) => return Ok(Ok(Json::String(s.as_str().to_owned()))), + Value::String(s) => return Ok(Ok(Json::String(s.to_str()?.to_owned()))), _ => panic!("Value::coerce_to_string_() returned a non-string!"), } } @@ -76,7 +77,7 @@ impl Value { let mut out = Map::with_capacity(attrs.len()); for (name, value) in attrs.into_iter_sorted() { out.insert( - name.as_str().to_string(), + name.to_str()?.to_owned(), match generators::request_to_json(co, value).await { Ok(v) => v, Err(cek) => return Ok(Err(cek)), diff --git a/tvix/eval/src/value/mod.rs b/tvix/eval/src/value/mod.rs index ccca0bea5b2e..e181b3a2da4e 100644 --- a/tvix/eval/src/value/mod.rs +++ b/tvix/eval/src/value/mod.rs @@ -6,6 +6,7 @@ use std::num::{NonZeroI32, NonZeroUsize}; use std::path::PathBuf; use std::rc::Rc; +use bstr::{BString, ByteVec}; use lexical_core::format::CXX_LITERAL; use serde::Deserialize; @@ -313,7 +314,7 @@ impl Value { kind: CoercionKind, span: LightSpan, ) -> Result { - let mut result = String::new(); + let mut result = BString::default(); let mut vals = vec![self]; // Track if we are coercing the first value of a list to correctly emit // separating white spaces. @@ -326,18 +327,15 @@ impl Value { let value = if let Some(v) = vals.pop() { v.force(co, span.clone()).await? } else { - return Ok(Value::String(NixString::new_context_from( - context, - result.as_str(), - ))); + return Ok(Value::String(NixString::new_context_from(context, result))); }; - let coerced = match (value, kind) { + let coerced: Result = match (value, kind) { // coercions that are always done (Value::String(mut s), _) => { if let Some(ctx) = s.context_mut() { context = context.join(ctx); } - Ok(s.as_str().to_owned()) + Ok(s.into()) } // TODO(sterni): Think about proper encoding handling here. This needs @@ -357,7 +355,7 @@ impl Value { context = context.append(NixContextElement::Plain( imported.to_string_lossy().to_string(), )); - Ok(imported.to_string_lossy().into_owned()) + Ok(imported.into_os_string().into_encoded_bytes().into()) } ( Value::Path(p), @@ -365,7 +363,7 @@ impl Value { import_paths: false, .. }, - ) => Ok(p.to_string_lossy().into_owned()), + ) => Ok(p.into_os_string().into_encoded_bytes().into()), // Attribute sets can be converted to strings if they either have an // `__toString` attribute which holds a function that receives the @@ -397,14 +395,14 @@ impl Value { // strong coercions (Value::Null, CoercionKind { strong: true, .. }) - | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".to_owned()), - (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".to_owned()), + | (Value::Bool(false), CoercionKind { strong: true, .. }) => Ok("".into()), + (Value::Bool(true), CoercionKind { strong: true, .. }) => Ok("1".into()), - (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}")), + (Value::Integer(i), CoercionKind { strong: true, .. }) => Ok(format!("{i}").into()), (Value::Float(f), CoercionKind { strong: true, .. }) => { // contrary to normal Display, coercing a float to a string will // result in unconditional 6 decimal places - Ok(format!("{:.6}", f)) + Ok(format!("{:.6}", f).into()) } // Lists are coerced by coercing their elements and interspersing spaces @@ -448,7 +446,7 @@ impl Value { if let Some(head) = is_list_head { if !head { - result.push(' '); + result.push(b' '); } else { is_list_head = Some(false); } @@ -576,7 +574,7 @@ impl Value { let s2 = s2.to_str(); if let (Ok(s1), Ok(s2)) = (s1, s2) { - if s1.as_str() == "derivation" && s2.as_str() == "derivation" { + if s1 == "derivation" && s2 == "derivation" { // TODO(tazjin): are the outPaths really required, // or should it fall through? let out1 = a1 diff --git a/tvix/eval/src/value/string.rs b/tvix/eval/src/value/string.rs index c528fd19f229..e2767dd22cdb 100644 --- a/tvix/eval/src/value/string.rs +++ b/tvix/eval/src/value/string.rs @@ -3,14 +3,13 @@ //! Nix language strings never need to be modified on the language //! level, allowing us to shave off some memory overhead and only //! paying the cost when creating new strings. +use bstr::{BStr, BString, ByteSlice, Chars}; use rnix::ast; +use std::borrow::{Borrow, Cow}; use std::collections::HashSet; -use std::ffi::OsStr; +use std::fmt::Display; use std::hash::Hash; use std::ops::Deref; -use std::path::Path; -use std::str::{self, Utf8Error}; -use std::{borrow::Cow, fmt::Display, str::Chars}; use serde::de::{Deserializer, Visitor}; use serde::{Deserialize, Serialize}; @@ -147,16 +146,28 @@ impl NixContext { // FIXME: when serializing, ignore the context? #[derive(Clone, Debug, Serialize)] -pub struct NixString(Box, Option); +pub struct NixString(BString, Option); impl PartialEq for NixString { fn eq(&self, other: &Self) -> bool { - self.as_str() == other.as_str() + self.as_bstr() == other.as_bstr() } } impl Eq for NixString {} +impl PartialEq<&[u8]> for NixString { + fn eq(&self, other: &&[u8]) -> bool { + **self == **other + } +} + +impl PartialEq<&str> for NixString { + fn eq(&self, other: &&str) -> bool { + **self == other.as_bytes() + } +} + impl PartialOrd for NixString { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -165,39 +176,49 @@ impl PartialOrd for NixString { impl Ord for NixString { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.as_str().cmp(other.as_str()) + self.as_bstr().cmp(other.as_bstr()) + } +} + +impl From<&BStr> for NixString { + fn from(value: &BStr) -> Self { + Self(value.to_owned(), None) } } -impl TryFrom<&[u8]> for NixString { - type Error = Utf8Error; +impl From<&[u8]> for NixString { + fn from(value: &[u8]) -> Self { + Self(value.into(), None) + } +} - fn try_from(value: &[u8]) -> Result { - Ok(Self(Box::from(str::from_utf8(value)?), None)) +impl From> for NixString { + fn from(value: Vec) -> Self { + Self(value.into(), None) } } impl From<&str> for NixString { fn from(s: &str) -> Self { - NixString(Box::from(s), None) + Self(s.as_bytes().into(), None) } } impl From for NixString { fn from(s: String) -> Self { - NixString(s.into_boxed_str(), None) + Self(s.into(), None) } } impl From<(String, Option)> for NixString { - fn from(s: (String, Option)) -> Self { - NixString(s.0.into_boxed_str(), s.1) + fn from((s, ctx): (String, Option)) -> Self { + NixString(s.into(), ctx) } } impl From> for NixString { fn from(s: Box) -> Self { - Self(s, None) + Self(s.into_boxed_bytes().into_vec().into(), None) } } @@ -207,9 +228,33 @@ impl From for NixString { } } +impl<'a> From<&'a NixString> for &'a BStr { + fn from(s: &'a NixString) -> Self { + BStr::new(&*s.0) + } +} + +impl From for BString { + fn from(s: NixString) -> Self { + s.0 + } +} + +impl AsRef<[u8]> for NixString { + fn as_ref(&self) -> &[u8] { + &self.0 + } +} + +impl Borrow for NixString { + fn borrow(&self) -> &BStr { + self.as_bstr() + } +} + impl Hash for NixString { fn hash(&self, state: &mut H) { - self.as_str().hash(state) + self.as_bstr().hash(state) } } @@ -246,6 +291,14 @@ impl<'de> Deserialize<'de> for NixString { } } +impl Deref for NixString { + type Target = BString; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + #[cfg(feature = "arbitrary")] mod arbitrary { use super::*; @@ -264,13 +317,13 @@ mod arbitrary { } impl NixString { - pub fn new_inherit_context_from(other: &NixString, new_contents: &str) -> Self { - Self(Box::from(new_contents), other.1.clone()) + pub fn new_inherit_context_from(other: &NixString, new_contents: BString) -> Self { + Self(new_contents, other.1.clone()) } - pub fn new_context_from(context: NixContext, contents: &str) -> Self { + pub fn new_context_from(context: NixContext, contents: BString) -> Self { Self( - Box::from(contents), + contents, if context.is_empty() { None } else { @@ -279,10 +332,22 @@ impl NixString { ) } - pub fn as_str(&self) -> &str { + pub fn as_mut_bstring(&mut self) -> &mut BString { + &mut self.0 + } + + pub fn as_bstr(&self) -> &BStr { + BStr::new(self.as_bytes()) + } + + pub fn as_bytes(&self) -> &[u8] { &self.0 } + pub fn into_bstring(self) -> BString { + self.0 + } + /// Return a displayable representation of the string as an /// identifier. /// @@ -290,8 +355,10 @@ impl NixString { /// set keys, as those are only escaped in the presence of special /// characters. pub fn ident_str(&self) -> Cow { - let escaped = nix_escape_string(self.as_str()); - + let escaped = match self.to_str_lossy() { + Cow::Borrowed(s) => nix_escape_string(s), + Cow::Owned(s) => nix_escape_string(&s).into_owned().into(), + }; match escaped { // A borrowed string is unchanged and can be returned as // is. @@ -310,8 +377,8 @@ impl NixString { } pub fn concat(&self, other: &Self) -> Self { - let mut s = self.as_str().to_owned(); - s.push_str(other.as_str()); + let mut s = self.to_vec(); + s.extend(other.0.as_slice()); let context = [&self.1, &other.1] .into_iter() @@ -319,7 +386,7 @@ impl NixString { .fold(NixContext::new(), |acc_ctx, new_ctx| { acc_ctx.join(&mut new_ctx.clone()) }); - Self::new_context_from(context, &s.into_boxed_str()) + Self::new_context_from(context, s.into()) } pub(crate) fn context_mut(&mut self) -> Option<&mut NixContext> { @@ -436,37 +503,11 @@ fn nix_escape_string(input: &str) -> Cow { impl Display for NixString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_str("\"")?; - f.write_str(&nix_escape_string(self.as_str()))?; + f.write_str(&nix_escape_string(&self.to_str_lossy()))?; f.write_str("\"") } } -impl AsRef for NixString { - fn as_ref(&self) -> &str { - self.as_str() - } -} - -impl AsRef for NixString { - fn as_ref(&self) -> &OsStr { - self.as_str().as_ref() - } -} - -impl AsRef for NixString { - fn as_ref(&self) -> &Path { - self.as_str().as_ref() - } -} - -impl Deref for NixString { - type Target = str; - - fn deref(&self) -> &Self::Target { - self.as_str() - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/tvix/eval/src/vm/mod.rs b/tvix/eval/src/vm/mod.rs index 5238fdc0662f..d51d44b6a110 100644 --- a/tvix/eval/src/vm/mod.rs +++ b/tvix/eval/src/vm/mod.rs @@ -12,6 +12,7 @@ pub mod generators; mod macros; +use bstr::{BString, ByteSlice, ByteVec}; use codemap::Span; use serde_json::json; use std::{cmp::Ordering, collections::HashMap, ops::DerefMut, path::PathBuf, rc::Rc}; @@ -550,14 +551,14 @@ where let key = key.to_str().with_span(&frame, self)?; let attrs = attrs.to_attrs().with_span(&frame, self)?; - match attrs.select(key.as_str()) { + match attrs.select(&key) { Some(value) => self.stack.push(value.clone()), None => { return frame.error( self, ErrorKind::AttributeNotFound { - name: key.as_str().to_string(), + name: (**key).clone().into_string_lossy() }, ); } @@ -598,7 +599,7 @@ where OpCode::OpAttrsTrySelect => { let key = self.stack_pop().to_str().with_span(&frame, self)?; let value = match self.stack_pop() { - Value::Attrs(attrs) => match attrs.select(key.as_str()) { + Value::Attrs(attrs) => match attrs.select(&key) { Some(value) => value.clone(), None => Value::AttrNotFound, }, @@ -705,7 +706,7 @@ where self(key, attrs) => { let key = key.to_str().with_span(&frame, self)?; let result = match attrs { - Value::Attrs(attrs) => attrs.contains(key.as_str()), + Value::Attrs(attrs) => attrs.contains(&key), // Nix allows use of `?` on non-set types, but // always returns false in those cases. @@ -742,7 +743,7 @@ where self.enqueue_generator("resolve_with", op_span, |co| { resolve_with( co, - ident.as_str().to_owned(), + ident.as_bstr().to_owned(), with_stack_len, closed_with_stack_len, ) @@ -966,7 +967,7 @@ where /// fragments of the stack, evaluating them to strings, and pushing /// the concatenated result string back on the stack. fn run_interpolate(&mut self, frame: &CallFrame, count: usize) -> EvalResult<()> { - let mut out = String::new(); + let mut out = BString::default(); // Interpolation propagates the context and union them. let mut context: NixContext = NixContext::new(); @@ -980,7 +981,7 @@ where return Ok(()); } let mut nix_string = val.to_contextful_str().with_span(frame, self)?; - out.push_str(nix_string.as_str()); + out.push_str(nix_string.as_bstr()); if let Some(nix_string_ctx) = nix_string.context_mut() { context = context.join(nix_string_ctx); } @@ -988,7 +989,7 @@ where // FIXME: consume immediately here the String. self.stack - .push(Value::String(NixString::new_context_from(context, &out))); + .push(Value::String(NixString::new_context_from(context, out))); Ok(()) } @@ -1160,7 +1161,7 @@ where /// for matching values in the with-stacks carried at runtime. async fn resolve_with( co: GenCo, - ident: String, + ident: BString, vm_with_len: usize, upvalue_with_len: usize, ) -> Result { @@ -1213,7 +1214,7 @@ async fn resolve_with( } } - Err(ErrorKind::UnknownDynamicVariable(ident)) + Err(ErrorKind::UnknownDynamicVariable(ident.to_string())) } // TODO(amjoseph): de-asyncify this @@ -1221,7 +1222,7 @@ async fn add_values(co: GenCo, a: Value, b: Value) -> Result { // What we try to do is solely determined by the type of the first value! let result = match (a, b) { (Value::Path(p), v) => { - let mut path = p.to_string_lossy().into_owned(); + let mut path = p.into_os_string(); match generators::request_string_coerce( &co, v, @@ -1243,7 +1244,7 @@ async fn add_values(co: GenCo, a: Value, b: Value) -> Result { .await { Ok(vs) => { - path.push_str(vs.as_str()); + path.push(vs.to_os_str()?); crate::value::canon_path(PathBuf::from(path)).into() } Err(c) => Value::Catchable(c), -- cgit 1.4.1