about summary refs log tree commit diff
diff options
context:
space:
mode:
authorFlorian Klink <flokli@flokli.de>2023-07-29T19·14+0200
committerclbot <clbot@tvl.fyi>2023-07-31T21·41+0000
commit79531c3dab1c24ff3171c0aa067004c8e6c92e3f (patch)
tree6e4198e648810bb835a8b0e0f68bf1af779829a8
parent9521df708f92a237090b1b17ec969b319c4d00fe (diff)
refactor(tvix/nix-compat): support non-unicode Derivations r/6449
Derivations can have non-unicode strings in their env values, so the
ATerm representations are not necessarily String anymore, but Vec<u8>.

Change-Id: Ic23839471eb7f68d9c3c30667c878830946b6607
Reviewed-on: https://cl.tvl.fyi/c/depot/+/8990
Tested-by: BuildkiteCI
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Autosubmit: flokli <flokli@flokli.de>
-rw-r--r--tvix/Cargo.lock18
-rw-r--r--tvix/Cargo.nix82
-rw-r--r--tvix/cli/src/derivation.rs6
-rw-r--r--tvix/cli/src/refscan.rs16
-rw-r--r--tvix/nix-compat/Cargo.toml1
-rw-r--r--tvix/nix-compat/src/derivation/escape.rs31
-rw-r--r--tvix/nix-compat/src/derivation/mod.rs31
-rw-r--r--tvix/nix-compat/src/derivation/string_escape.rs17
-rw-r--r--tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv1
-rw-r--r--tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv.json21
-rw-r--r--tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv1
-rw-r--r--tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv.json21
-rw-r--r--tvix/nix-compat/src/derivation/tests/mod.rs159
-rw-r--r--tvix/nix-compat/src/derivation/write.rs145
14 files changed, 426 insertions, 124 deletions
diff --git a/tvix/Cargo.lock b/tvix/Cargo.lock
index 52cc6d4a75..cb65d4343f 100644
--- a/tvix/Cargo.lock
+++ b/tvix/Cargo.lock
@@ -267,6 +267,17 @@ dependencies = [
 ]
 
 [[package]]
+name = "bstr"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05"
+dependencies = [
+ "memchr",
+ "regex-automata",
+ "serde",
+]
+
+[[package]]
 name = "bumpalo"
 version = "3.12.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1346,6 +1357,7 @@ name = "nix-compat"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "bstr",
  "data-encoding",
  "glob",
  "serde",
@@ -1890,6 +1902,12 @@ dependencies = [
 ]
 
 [[package]]
+name = "regex-automata"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7b6d6190b7594385f61bd3911cd1be99dfddcfc365a4160cc2ab5bff4aed294"
+
+[[package]]
 name = "regex-syntax"
 version = "0.6.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/tvix/Cargo.nix b/tvix/Cargo.nix
index 83c3825c31..b44e10de92 100644
--- a/tvix/Cargo.nix
+++ b/tvix/Cargo.nix
@@ -829,6 +829,43 @@ rec {
         ];
 
       };
+      "bstr" = rec {
+        crateName = "bstr";
+        version = "1.6.0";
+        edition = "2021";
+        sha256 = "01bvsr3x9n75klbwxym0zf939vzim0plsmy786p0zzzvrj6i9637";
+        authors = [
+          "Andrew Gallant <jamslam@gmail.com>"
+        ];
+        dependencies = [
+          {
+            name = "memchr";
+            packageId = "memchr";
+            usesDefaultFeatures = false;
+          }
+          {
+            name = "regex-automata";
+            packageId = "regex-automata";
+            optional = true;
+            usesDefaultFeatures = false;
+            features = [ "dfa-search" ];
+          }
+          {
+            name = "serde";
+            packageId = "serde";
+            optional = true;
+            usesDefaultFeatures = false;
+          }
+        ];
+        features = {
+          "alloc" = [ "serde?/alloc" ];
+          "default" = [ "std" "unicode" ];
+          "serde" = [ "dep:serde" ];
+          "std" = [ "alloc" "memchr/std" "serde?/std" ];
+          "unicode" = [ "dep:regex-automata" ];
+        };
+        resolvedDefaultFeatures = [ "alloc" "default" "serde" "std" "unicode" ];
+      };
       "bumpalo" = rec {
         crateName = "bumpalo";
         version = "3.12.1";
@@ -3856,6 +3893,11 @@ rec {
             packageId = "anyhow";
           }
           {
+            name = "bstr";
+            packageId = "bstr";
+            features = [ "alloc" "unicode" "serde" ];
+          }
+          {
             name = "data-encoding";
             packageId = "data-encoding";
           }
@@ -5337,6 +5379,46 @@ rec {
         };
         resolvedDefaultFeatures = [ "aho-corasick" "default" "memchr" "perf" "perf-cache" "perf-dfa" "perf-inline" "perf-literal" "std" "unicode" "unicode-age" "unicode-bool" "unicode-case" "unicode-gencat" "unicode-perl" "unicode-script" "unicode-segment" ];
       };
+      "regex-automata" = rec {
+        crateName = "regex-automata";
+        version = "0.3.4";
+        edition = "2021";
+        sha256 = "156jmvsbzd9arih42ninzkfgv7g93g6i2fdxc5gki53m1ccxddmp";
+        authors = [
+          "The Rust Project Developers"
+          "Andrew Gallant <jamslam@gmail.com>"
+        ];
+        features = {
+          "default" = [ "std" "syntax" "perf" "unicode" "meta" "nfa" "dfa" "hybrid" ];
+          "dfa" = [ "dfa-build" "dfa-search" "dfa-onepass" ];
+          "dfa-build" = [ "nfa-thompson" "dfa-search" ];
+          "dfa-onepass" = [ "nfa-thompson" ];
+          "hybrid" = [ "alloc" "nfa-thompson" ];
+          "internal-instrument" = [ "internal-instrument-pikevm" ];
+          "internal-instrument-pikevm" = [ "logging" "std" ];
+          "logging" = [ "dep:log" "aho-corasick?/logging" ];
+          "meta" = [ "syntax" "nfa-pikevm" ];
+          "nfa" = [ "nfa-thompson" "nfa-pikevm" "nfa-backtrack" ];
+          "nfa-backtrack" = [ "nfa-thompson" ];
+          "nfa-pikevm" = [ "nfa-thompson" ];
+          "nfa-thompson" = [ "alloc" ];
+          "perf" = [ "perf-inline" "perf-literal" ];
+          "perf-literal" = [ "perf-literal-substring" "perf-literal-multisubstring" ];
+          "perf-literal-multisubstring" = [ "std" "dep:aho-corasick" ];
+          "perf-literal-substring" = [ "aho-corasick?/perf-literal" "dep:memchr" ];
+          "std" = [ "regex-syntax?/std" "memchr?/std" "aho-corasick?/std" "alloc" ];
+          "syntax" = [ "dep:regex-syntax" "alloc" ];
+          "unicode" = [ "unicode-age" "unicode-bool" "unicode-case" "unicode-gencat" "unicode-perl" "unicode-script" "unicode-segment" "unicode-word-boundary" "regex-syntax?/unicode" ];
+          "unicode-age" = [ "regex-syntax?/unicode-age" ];
+          "unicode-bool" = [ "regex-syntax?/unicode-bool" ];
+          "unicode-case" = [ "regex-syntax?/unicode-case" ];
+          "unicode-gencat" = [ "regex-syntax?/unicode-gencat" ];
+          "unicode-perl" = [ "regex-syntax?/unicode-perl" ];
+          "unicode-script" = [ "regex-syntax?/unicode-script" ];
+          "unicode-segment" = [ "regex-syntax?/unicode-segment" ];
+        };
+        resolvedDefaultFeatures = [ "dfa-search" ];
+      };
       "regex-syntax 0.6.29" = rec {
         crateName = "regex-syntax";
         version = "0.6.29";
diff --git a/tvix/cli/src/derivation.rs b/tvix/cli/src/derivation.rs
index fa246cc74f..f8afe19461 100644
--- a/tvix/cli/src/derivation.rs
+++ b/tvix/cli/src/derivation.rs
@@ -283,7 +283,7 @@ mod derivation_builtins {
             // Most of these are also added to the builder's environment in "raw" form.
             if drv
                 .environment
-                .insert(name.as_str().to_string(), val_str)
+                .insert(name.as_str().to_string(), val_str.into())
                 .is_some()
             {
                 return Err(Error::DuplicateEnvVar(name.as_str().to_string()).into());
@@ -312,7 +312,7 @@ mod derivation_builtins {
             } else {
                 let mut refscan = state.reference_scanner();
                 drv.arguments.iter().for_each(|s| refscan.scan_str(s));
-                drv.environment.values().for_each(|s| refscan.scan_str(s));
+                drv.environment.values().for_each(|s| refscan.scan_bytes(s));
                 refscan.scan_str(&drv.builder);
                 refscan.finalise()
             }
@@ -324,7 +324,7 @@ mod derivation_builtins {
         for output in drv.outputs.keys() {
             if drv
                 .environment
-                .insert(output.to_string(), String::new())
+                .insert(output.to_string(), String::new().into())
                 .is_some()
             {
                 emit_warning_kind(&co, WarningKind::ShadowedOutput(output.to_string())).await;
diff --git a/tvix/cli/src/refscan.rs b/tvix/cli/src/refscan.rs
index b391781f92..25ada4c865 100644
--- a/tvix/cli/src/refscan.rs
+++ b/tvix/cli/src/refscan.rs
@@ -37,7 +37,21 @@ impl<P: Clone + Ord + AsRef<[u8]>> ReferenceScanner<P> {
         }
     }
 
-    /// Scan the given string for all non-overlapping matches and collect them
+    /// If the given &[u8] is also a valid UTF-8 string, scan for all non-
+    /// overlapping matches and collect them in the scanner.
+    /// TODO: ideally, wu-manber would just work with &[u8] directly.
+    pub fn scan_bytes(&mut self, haystack: &[u8]) {
+        if haystack.len() < STORE_PATH_LEN {
+            return;
+        }
+
+        match std::str::from_utf8(haystack) {
+            Ok(s) => self.scan_str(s),
+            Err(_) => {}
+        }
+    }
+
+    /// Scan the given str for all non-overlapping matches and collect them
     /// in the scanner.
     pub fn scan_str(&mut self, haystack: &str) {
         if haystack.len() < STORE_PATH_LEN {
diff --git a/tvix/nix-compat/Cargo.toml b/tvix/nix-compat/Cargo.toml
index 49ddbf4728..47bbeb2517 100644
--- a/tvix/nix-compat/Cargo.toml
+++ b/tvix/nix-compat/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 
 [dependencies]
 anyhow = "1.0.68"
+bstr = { version = "1.6.0", features = ["alloc", "unicode", "serde"] }
 data-encoding = "2.3.3"
 glob = "0.3.0"
 serde = { version = "1.0", features = ["derive"] }
diff --git a/tvix/nix-compat/src/derivation/escape.rs b/tvix/nix-compat/src/derivation/escape.rs
new file mode 100644
index 0000000000..03106c4420
--- /dev/null
+++ b/tvix/nix-compat/src/derivation/escape.rs
@@ -0,0 +1,31 @@
+use bstr::{BString, ByteSlice};
+
+pub fn escape_bstr(s: &[u8]) -> BString {
+    let mut s: Vec<u8> = s.to_owned();
+
+    s = s.replace(b"\\", b"\\\\");
+    s = s.replace(b"\n", b"\\n");
+    s = s.replace(b"\r", b"\\r");
+    s = s.replace(b"\t", b"\\t");
+    s = s.replace(b"\"", b"\\\"");
+
+    let mut out: Vec<u8> = Vec::new();
+    out.push(b'\"');
+    out.append(&mut s);
+    out.push(b'\"');
+
+    out.into()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::escape_bstr;
+    use test_case::test_case;
+
+    #[test_case(b"", b"\"\""; "empty")]
+    #[test_case(b"\"", b"\"\\\"\""; "doublequote")]
+    #[test_case(b":", b"\":\""; "colon")]
+    fn escape(input: &[u8], expected: &[u8]) {
+        assert_eq!(expected, escape_bstr(input))
+    }
+}
diff --git a/tvix/nix-compat/src/derivation/mod.rs b/tvix/nix-compat/src/derivation/mod.rs
index ab615d502d..498aa1b59d 100644
--- a/tvix/nix-compat/src/derivation/mod.rs
+++ b/tvix/nix-compat/src/derivation/mod.rs
@@ -1,13 +1,14 @@
 use crate::store_path::{
     self, build_output_path, build_regular_ca_path, build_text_path, StorePath,
 };
+use bstr::BString;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 use std::collections::{BTreeMap, BTreeSet};
 
 mod errors;
+mod escape;
 mod output;
-mod string_escape;
 mod validate;
 mod write;
 
@@ -27,7 +28,7 @@ pub struct Derivation {
     pub builder: String,
 
     #[serde(rename = "env")]
-    pub environment: BTreeMap<String, String>,
+    pub environment: BTreeMap<String, BString>,
 
     #[serde(rename = "inputDrvs")]
     pub input_derivations: BTreeMap<String, BTreeSet<String>>,
@@ -41,12 +42,12 @@ pub struct Derivation {
 }
 
 impl Derivation {
-    /// write the Derivation to the given [std::fmt::Write], in ATerm format.
+    /// write the Derivation to the given [std::io::Write], in ATerm format.
     ///
     /// The only errors returns are these when writing to the passed writer.
-    pub fn serialize(&self, writer: &mut impl std::fmt::Write) -> Result<(), std::fmt::Error> {
-        writer.write_str(write::DERIVATION_PREFIX)?;
-        writer.write_char(write::PAREN_OPEN)?;
+    pub fn serialize(&self, writer: &mut impl std::io::Write) -> Result<(), std::io::Error> {
+        write::write_str(writer, write::DERIVATION_PREFIX)?;
+        write::write_char(writer, write::PAREN_OPEN)?;
 
         write::write_outputs(writer, &self.outputs)?;
         write::write_input_derivations(writer, &self.input_derivations)?;
@@ -56,14 +57,14 @@ impl Derivation {
         write::write_arguments(writer, &self.arguments)?;
         write::write_enviroment(writer, &self.environment)?;
 
-        writer.write_char(write::PAREN_CLOSE)?;
+        write::write_char(writer, write::PAREN_CLOSE)?;
 
         Ok(())
     }
 
-    /// return the ATerm serialization as a string.
-    pub fn to_aterm_string(&self) -> String {
-        let mut buffer = String::new();
+    /// return the ATerm serialization.
+    pub fn to_aterm_bytes(&self) -> Vec<u8> {
+        let mut buffer: Vec<u8> = Vec::new();
 
         // invoke serialize and write to the buffer.
         // Note we only propagate errors writing to the writer in serialize,
@@ -93,7 +94,7 @@ impl Derivation {
             inputs
         };
 
-        build_text_path(name, self.to_aterm_string(), references)
+        build_text_path(name, self.to_aterm_bytes(), references)
             .map_err(|_e| DerivationError::InvalidOutputName(name.to_string()))
     }
 
@@ -165,7 +166,7 @@ impl Derivation {
 
             // write the ATerm of that to the hash function
             let mut hasher = Sha256::new();
-            hasher.update(replaced_derivation.to_aterm_string());
+            hasher.update(replaced_derivation.to_aterm_bytes());
 
             hasher.finalize().to_vec()
         });
@@ -218,8 +219,10 @@ impl Derivation {
             };
 
             output.path = abs_store_path.to_absolute_path();
-            self.environment
-                .insert(output_name.to_string(), abs_store_path.to_absolute_path());
+            self.environment.insert(
+                output_name.to_string(),
+                abs_store_path.to_absolute_path().into(),
+            );
         }
 
         Ok(())
diff --git a/tvix/nix-compat/src/derivation/string_escape.rs b/tvix/nix-compat/src/derivation/string_escape.rs
deleted file mode 100644
index 0e1dbe516f..0000000000
--- a/tvix/nix-compat/src/derivation/string_escape.rs
+++ /dev/null
@@ -1,17 +0,0 @@
-const STRING_ESCAPER: [(char, &str); 5] = [
-    ('\\', "\\\\"),
-    ('\n', "\\n"),
-    ('\r', "\\r"),
-    ('\t', "\\t"),
-    ('\"', "\\\""),
-];
-
-pub fn escape_string(s: &str) -> String {
-    let mut s_replaced = s.to_string();
-
-    for escape_sequence in STRING_ESCAPER {
-        s_replaced = s_replaced.replace(escape_sequence.0, escape_sequence.1);
-    }
-
-    format!("\"{}\"", s_replaced)
-}
diff --git a/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv b/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv
new file mode 100644
index 0000000000..6a7a35c58c
--- /dev/null
+++ b/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv
@@ -0,0 +1 @@
+Derive([("out","/nix/store/drr2mjp9fp9vvzsf5f9p0a80j33dxy7m-cp1252","","")],[],[],":",":",[],[("builder",":"),("chars","ÅÄÖ"),("name","cp1252"),("out","/nix/store/drr2mjp9fp9vvzsf5f9p0a80j33dxy7m-cp1252"),("system",":")])
\ No newline at end of file
diff --git a/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv.json b/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv.json
new file mode 100644
index 0000000000..9d6ba8b797
--- /dev/null
+++ b/tvix/nix-compat/src/derivation/tests/derivation_tests/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv.json
@@ -0,0 +1,21 @@
+{
+  "/nix/store/m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv": {
+    "outputs": {
+      "out": {
+        "path": "/nix/store/drr2mjp9fp9vvzsf5f9p0a80j33dxy7m-cp1252"
+      }
+    },
+    "inputSrcs": [],
+    "inputDrvs": {},
+    "system": ":",
+    "builder": ":",
+    "args": [],
+    "env": {
+      "builder": ":",
+      "chars": "ÅÄÖ",
+      "name": "cp1252",
+      "out": "/nix/store/drr2mjp9fp9vvzsf5f9p0a80j33dxy7m-cp1252",
+      "system": ":"
+    }
+  }
+}
diff --git a/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv b/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv
new file mode 100644
index 0000000000..b19fd8eb2c
--- /dev/null
+++ b/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv
@@ -0,0 +1 @@
+Derive([("out","/nix/store/x1f6jfq9qgb6i8jrmpifkn9c64fg4hcm-latin1","","")],[],[],":",":",[],[("builder",":"),("chars","ÅÄÖ"),("name","latin1"),("out","/nix/store/x1f6jfq9qgb6i8jrmpifkn9c64fg4hcm-latin1"),("system",":")])
\ No newline at end of file
diff --git a/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv.json b/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv.json
new file mode 100644
index 0000000000..ffd5c08da8
--- /dev/null
+++ b/tvix/nix-compat/src/derivation/tests/derivation_tests/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv.json
@@ -0,0 +1,21 @@
+{
+  "/nix/store/x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv": {
+    "outputs": {
+      "out": {
+        "path": "/nix/store/x1f6jfq9qgb6i8jrmpifkn9c64fg4hcm-latin1"
+      }
+    },
+    "inputSrcs": [],
+    "inputDrvs": {},
+    "system": ":",
+    "builder": ":",
+    "args": [],
+    "env": {
+      "builder": ":",
+      "chars": "ÅÄÖ",
+      "name": "latin1",
+      "out": "/nix/store/x1f6jfq9qgb6i8jrmpifkn9c64fg4hcm-latin1",
+      "system": ":"
+    }
+  }
+}
diff --git a/tvix/nix-compat/src/derivation/tests/mod.rs b/tvix/nix-compat/src/derivation/tests/mod.rs
index f4070b2496..8fbaa52e23 100644
--- a/tvix/nix-compat/src/derivation/tests/mod.rs
+++ b/tvix/nix-compat/src/derivation/tests/mod.rs
@@ -2,7 +2,8 @@ use crate::derivation::output::Output;
 use crate::derivation::Derivation;
 use crate::nixhash::NixHash;
 use crate::store_path::StorePath;
-use std::collections::BTreeSet;
+use bstr::{BStr, BString};
+use std::collections::{BTreeMap, BTreeSet};
 use std::fs::File;
 use std::io::Read;
 use std::path::Path;
@@ -12,33 +13,43 @@ use test_generator::test_resources;
 
 const RESOURCES_PATHS: &str = "src/derivation/tests/derivation_tests";
 
-fn read_file(path: &str) -> String {
+fn read_file(path: &str) -> BString {
     let path = Path::new(path);
     let mut file = File::open(path).unwrap();
-    let mut data = String::new();
+    let mut data = Vec::new();
 
-    file.read_to_string(&mut data).unwrap();
+    file.read_to_end(&mut data).unwrap();
 
-    data
+    data.into()
 }
 
 #[test_resources("src/derivation/tests/derivation_tests/*.drv")]
 fn check_serizaliation(path_to_drv_file: &str) {
+    // skip JSON files known to fail parsing
+    if path_to_drv_file.ends_with("cp1252.drv") || path_to_drv_file.ends_with("latin1.drv") {
+        return;
+    }
     let data = read_file(&format!("{}.json", path_to_drv_file));
-    let derivation: Derivation = serde_json::from_str(&data).expect("JSON was not well-formatted");
+    let derivation: Derivation =
+        serde_json::from_slice(&data).expect("JSON was not well-formatted");
 
-    let mut serialized_derivation = String::new();
+    let mut serialized_derivation = Vec::new();
     derivation.serialize(&mut serialized_derivation).unwrap();
 
     let expected = read_file(path_to_drv_file);
 
-    assert_eq!(expected, serialized_derivation);
+    assert_eq!(expected, BStr::new(&serialized_derivation));
 }
 
 #[test_resources("src/derivation/tests/derivation_tests/*.drv")]
 fn validate(path_to_drv_file: &str) {
+    // skip JSON files known to fail parsing
+    if path_to_drv_file.ends_with("cp1252.drv") || path_to_drv_file.ends_with("latin1.drv") {
+        return;
+    }
     let data = read_file(&format!("{}.json", path_to_drv_file));
-    let derivation: Derivation = serde_json::from_str(&data).expect("JSON was not well-formatted");
+    let derivation: Derivation =
+        serde_json::from_slice(&data).expect("JSON was not well-formatted");
 
     derivation
         .validate(true)
@@ -46,13 +57,18 @@ fn validate(path_to_drv_file: &str) {
 }
 
 #[test_resources("src/derivation/tests/derivation_tests/*.drv")]
-fn check_to_aterm_string(path_to_drv_file: &str) {
+fn check_to_aterm_bytes(path_to_drv_file: &str) {
+    // skip JSON files known to fail parsing
+    if path_to_drv_file.ends_with("cp1252.drv") || path_to_drv_file.ends_with("latin1.drv") {
+        return;
+    }
     let data = read_file(&format!("{}.json", path_to_drv_file));
-    let derivation: Derivation = serde_json::from_str(&data).expect("JSON was not well-formatted");
+    let derivation: Derivation =
+        serde_json::from_slice(&data).expect("JSON was not well-formatted");
 
     let expected = read_file(path_to_drv_file);
 
-    assert_eq!(expected, derivation.to_aterm_string());
+    assert_eq!(expected, BStr::new(&derivation.to_aterm_bytes()));
 }
 
 #[test_case("bar","0hm2f1psjpcwg8fijsmr4wwxrx59s092-bar.drv"; "fixed_sha256")]
@@ -64,7 +80,8 @@ fn check_to_aterm_string(path_to_drv_file: &str) {
 #[test_case("unicode", "52a9id8hx688hvlnz4d1n25ml1jdykz0-unicode.drv"; "unicode")]
 fn derivation_path(name: &str, expected_path: &str) {
     let data = read_file(&format!("{}/{}.json", RESOURCES_PATHS, expected_path));
-    let derivation: Derivation = serde_json::from_str(&data).expect("JSON was not well-formatted");
+    let derivation: Derivation =
+        serde_json::from_slice(&data).expect("JSON was not well-formatted");
 
     assert_eq!(
         derivation.calculate_derivation_path(name).unwrap(),
@@ -79,7 +96,7 @@ fn derivation_with_trimmed_output_paths(derivation: &Derivation) -> Derivation {
     let mut trimmed_outputs = derivation.outputs.clone();
 
     for (output_name, output) in &derivation.outputs {
-        trimmed_env.insert(output_name.clone(), "".to_string());
+        trimmed_env.insert(output_name.clone(), "".into());
         assert!(trimmed_outputs.contains_key(output_name));
         trimmed_outputs.insert(
             output_name.to_string(),
@@ -103,7 +120,7 @@ fn derivation_with_trimmed_output_paths(derivation: &Derivation) -> Derivation {
 fn derivation_or_fod_hash(drv_path: &str, expected_nix_hash_string: &str) {
     // read in the fixture
     let data = read_file(&format!("{}/{}.json", RESOURCES_PATHS, drv_path));
-    let drv: Derivation = serde_json::from_str(&data).expect("must deserialize");
+    let drv: Derivation = serde_json::from_slice(&data).expect("must deserialize");
 
     let actual = drv.derivation_or_fod_hash(|_| panic!("must not be called"));
 
@@ -120,7 +137,7 @@ fn derivation_or_fod_hash(drv_path: &str, expected_nix_hash_string: &str) {
 fn output_paths(name: &str, drv_path: &str) {
     // read in the fixture
     let data = read_file(&format!("{}/{}.json", RESOURCES_PATHS, drv_path));
-    let expected_derivation: Derivation = serde_json::from_str(&data).expect("must deserialize");
+    let expected_derivation: Derivation = serde_json::from_slice(&data).expect("must deserialize");
 
     let mut derivation = derivation_with_trimmed_output_paths(&expected_derivation);
 
@@ -148,7 +165,7 @@ fn output_paths(name: &str, drv_path: &str) {
                     .to_string_lossy()
             ));
 
-            let drv: Derivation = serde_json::from_str(&data).expect("must deserialize");
+            let drv: Derivation = serde_json::from_slice(&data).expect("must deserialize");
 
             // calculate derivation_or_fod_hash for each parent.
             // This may not trigger subsequent requests, as both parents are FOD.
@@ -204,16 +221,16 @@ fn output_path_construction() {
 
     // assemble bar env
     let bar_env = &mut bar_drv.environment;
-    bar_env.insert("builder".to_string(), ":".to_string());
-    bar_env.insert("name".to_string(), "bar".to_string());
-    bar_env.insert("out".to_string(), "".to_string()); // will be calculated
+    bar_env.insert("builder".to_string(), ":".into());
+    bar_env.insert("name".to_string(), "bar".into());
+    bar_env.insert("out".to_string(), "".into()); // will be calculated
     bar_env.insert(
         "outputHash".to_string(),
-        "08813cbee9903c62be4c5027726a418a300da4500b2d369d3af9286f4815ceba".to_string(),
+        "08813cbee9903c62be4c5027726a418a300da4500b2d369d3af9286f4815ceba".into(),
     );
-    bar_env.insert("outputHashAlgo".to_string(), "sha256".to_string());
-    bar_env.insert("outputHashMode".to_string(), "recursive".to_string());
-    bar_env.insert("system".to_string(), ":".to_string());
+    bar_env.insert("outputHashAlgo".to_string(), "sha256".into());
+    bar_env.insert("outputHashMode".to_string(), "recursive".into());
+    bar_env.insert("system".to_string(), ":".into());
 
     // assemble bar outputs
     bar_drv.outputs.insert(
@@ -244,7 +261,7 @@ fn output_path_construction() {
         "{}/{}.json",
         RESOURCES_PATHS, "0hm2f1psjpcwg8fijsmr4wwxrx59s092-bar.drv"
     ));
-    let bar_drv_expected: Derivation = serde_json::from_str(&bar_data).expect("must deserialize");
+    let bar_drv_expected: Derivation = serde_json::from_slice(&bar_data).expect("must deserialize");
     assert_eq!(bar_drv_expected, bar_drv);
 
     // now construct foo, which requires bar_drv
@@ -266,11 +283,11 @@ fn output_path_construction() {
 
     // assemble foo env
     let foo_env = &mut foo_drv.environment;
-    foo_env.insert("bar".to_string(), bar_output_path.to_string());
-    foo_env.insert("builder".to_string(), ":".to_string());
-    foo_env.insert("name".to_string(), "foo".to_string());
-    foo_env.insert("out".to_string(), "".to_string()); // will be calculated
-    foo_env.insert("system".to_string(), ":".to_string());
+    foo_env.insert("bar".to_string(), bar_output_path.to_owned().into());
+    foo_env.insert("builder".to_string(), ":".into());
+    foo_env.insert("name".to_string(), "foo".into());
+    foo_env.insert("out".to_string(), "".into()); // will be calculated
+    foo_env.insert("system".to_string(), ":".into());
 
     // asssemble foo outputs
     foo_drv.outputs.insert(
@@ -304,7 +321,7 @@ fn output_path_construction() {
         "{}/{}.json",
         RESOURCES_PATHS, "4wvvbi4jwn0prsdxb7vs673qa5h9gr7x-foo.drv",
     ));
-    let foo_drv_expected: Derivation = serde_json::from_str(&foo_data).expect("must deserialize");
+    let foo_drv_expected: Derivation = serde_json::from_slice(&foo_data).expect("must deserialize");
     assert_eq!(foo_drv_expected, foo_drv);
 
     assert_eq!(
@@ -314,3 +331,83 @@ fn output_path_construction() {
             .expect("must succeed")
     );
 }
+
+/// This constructs a Derivation using cp1252 encoding and ensures the
+/// calculated derivation path matches the one Nix does calculate, as
+/// well as the ATerm serialization.
+/// We can't add this as a test_case to `output_paths`, as the JSON parser
+/// refuses to parse our JSONs.
+/// It looks like more recent versions of Nix also seem to not produce these
+/// JSON files anymore, however, it still happily produces the .drv files in
+/// the store.
+#[test_case(
+    "cp1252",
+    vec![0xc5, 0xc4, 0xd6],
+    "/nix/store/drr2mjp9fp9vvzsf5f9p0a80j33dxy7m-cp1252",
+    "m1vfixn8iprlf0v9abmlrz7mjw1xj8kp-cp1252.drv";
+    "cp1252"
+)]
+#[test_case(
+    "latin1",
+    vec![0xc5, 0xc4, 0xd6],
+    "/nix/store/x1f6jfq9qgb6i8jrmpifkn9c64fg4hcm-latin1",
+    "x6p0hg79i3wg0kkv7699935f7rrj9jf3-latin1.drv";
+    "latin1"
+)]
+fn non_unicode(name: &str, chars: Vec<u8>, exp_output_path: &str, exp_derivation_path: &str) {
+    // construct the Derivation
+    let mut outputs: BTreeMap<String, Output> = BTreeMap::new();
+    outputs.insert(
+        "out".to_string(),
+        Output {
+            path: exp_output_path.to_string(),
+            ..Default::default()
+        },
+    );
+
+    let mut environment: BTreeMap<String, BString> = BTreeMap::new();
+    environment.insert("builder".to_string(), ":".into());
+    environment.insert("chars".to_string(), chars.into());
+    environment.insert("name".to_string(), name.into());
+    environment.insert("out".to_string(), exp_output_path.into());
+    environment.insert("system".to_string(), ":".into());
+    let derivation: Derivation = Derivation {
+        builder: ":".to_string(),
+        environment,
+        outputs,
+        system: ":".to_string(),
+        ..Default::default()
+    };
+
+    // check the derivation_path matches what Nix calculated.
+    let actual_drv_path = derivation.calculate_derivation_path(name).unwrap();
+    assert_eq!(exp_derivation_path.to_string(), actual_drv_path.to_string());
+
+    // Now wipe the output path info, and ensure we calculate the same output
+    // path.
+    {
+        let mut derivation = derivation_with_trimmed_output_paths(&derivation);
+        let calculated_derivation_or_fod_hash = derivation.derivation_or_fod_hash(|_| {
+            panic!("No parents expected");
+        });
+        derivation
+            .calculate_output_paths(name, &calculated_derivation_or_fod_hash)
+            .unwrap();
+
+        assert_eq!(
+            exp_output_path.to_string(),
+            derivation.outputs.get("out").unwrap().path,
+            "expected calculated output path to match"
+        );
+    }
+
+    // Construct the ATerm representation and compare with our fixture.
+    {
+        let data = read_file(&format!("{}/{}", RESOURCES_PATHS, exp_derivation_path));
+        assert_eq!(
+            data,
+            BStr::new(&derivation.to_aterm_bytes()),
+            "expected ATerm serialization to match",
+        );
+    }
+}
diff --git a/tvix/nix-compat/src/derivation/write.rs b/tvix/nix-compat/src/derivation/write.rs
index 52166294e0..cf62f85022 100644
--- a/tvix/nix-compat/src/derivation/write.rs
+++ b/tvix/nix-compat/src/derivation/write.rs
@@ -3,10 +3,12 @@
 //!
 //! [ATerm]: http://program-transformation.org/Tools/ATermFormat.html
 
+use crate::derivation::escape::escape_bstr;
 use crate::derivation::output::Output;
-use crate::derivation::string_escape::escape_string;
+use bstr::BString;
 use std::collections::BTreeSet;
-use std::{collections::BTreeMap, fmt, fmt::Write};
+use std::io::Cursor;
+use std::{collections::BTreeMap, io, io::Error, io::Write};
 
 pub const DERIVATION_PREFIX: &str = "Derive";
 pub const PAREN_OPEN: char = '(';
@@ -16,32 +18,46 @@ pub const BRACKET_CLOSE: char = ']';
 pub const COMMA: char = ',';
 pub const QUOTE: char = '"';
 
+// Writes a character to the writer.
+pub(crate) fn write_char(writer: &mut impl Write, c: char) -> io::Result<()> {
+    let mut buf = [0; 4];
+    let b = c.encode_utf8(&mut buf).as_bytes();
+    io::copy(&mut Cursor::new(b), writer)?;
+    Ok(())
+}
+
+// Writes a string to the writer (as unicode)
+pub(crate) fn write_str(writer: &mut impl Write, s: &str) -> io::Result<()> {
+    io::copy(&mut Cursor::new(s.as_bytes()), writer)?;
+    Ok(())
+}
+
 fn write_array_elements(
     writer: &mut impl Write,
     quote: bool,
     open: &str,
     closing: &str,
-    elements: Vec<&str>,
-) -> Result<(), fmt::Error> {
-    writer.write_str(open)?;
+    elements: &[BString],
+) -> Result<(), io::Error> {
+    write_str(writer, open)?;
 
     for (index, element) in elements.iter().enumerate() {
         if index > 0 {
-            writer.write_char(COMMA)?;
+            write_char(writer, COMMA)?;
         }
 
         if quote {
-            writer.write_char(QUOTE)?;
+            write_char(writer, QUOTE)?;
         }
 
-        writer.write_str(element)?;
+        io::copy(&mut Cursor::new(element), writer)?;
 
         if quote {
-            writer.write_char(QUOTE)?;
+            write_char(writer, QUOTE)?;
         }
     }
 
-    writer.write_str(closing)?;
+    write_str(writer, closing)?;
 
     Ok(())
 }
@@ -49,41 +65,44 @@ fn write_array_elements(
 pub fn write_outputs(
     writer: &mut impl Write,
     outputs: &BTreeMap<String, Output>,
-) -> Result<(), fmt::Error> {
-    writer.write_char(BRACKET_OPEN)?;
+) -> Result<(), io::Error> {
+    write_char(writer, BRACKET_OPEN)?;
     for (ii, (output_name, output)) in outputs.iter().enumerate() {
         if ii > 0 {
-            writer.write_char(COMMA)?;
+            write_char(writer, COMMA)?;
         }
 
-        let mut elements: Vec<&str> = vec![output_name, &output.path];
+        let mut elements: Vec<BString> = vec![
+            output_name.as_bytes().to_vec().into(),
+            output.path.as_bytes().to_vec().into(),
+        ];
 
         let (e2, e3) = match &output.hash_with_mode {
             Some(hash) => match hash {
                 crate::nixhash::NixHashWithMode::Flat(h) => (
-                    h.algo.to_string(),
-                    data_encoding::HEXLOWER.encode(&h.digest),
+                    h.algo.to_string().as_bytes().to_vec(),
+                    data_encoding::HEXLOWER.encode(&h.digest).as_bytes().into(),
                 ),
                 crate::nixhash::NixHashWithMode::Recursive(h) => (
-                    format!("r:{}", h.algo),
-                    data_encoding::HEXLOWER.encode(&h.digest),
+                    format!("r:{}", h.algo).as_bytes().to_vec(),
+                    data_encoding::HEXLOWER.encode(&h.digest).as_bytes().into(),
                 ),
             },
-            None => ("".to_string(), "".to_string()),
+            None => (vec![], vec![]),
         };
 
-        elements.push(&e2);
-        elements.push(&e3);
+        elements.push(e2.into());
+        elements.push(e3.into());
 
         write_array_elements(
             writer,
             true,
             &PAREN_OPEN.to_string(),
             &PAREN_CLOSE.to_string(),
-            elements,
+            &elements,
         )?
     }
-    writer.write_char(BRACKET_CLOSE)?;
+    write_char(writer, BRACKET_CLOSE)?;
 
     Ok(())
 }
@@ -91,33 +110,37 @@ pub fn write_outputs(
 pub fn write_input_derivations(
     writer: &mut impl Write,
     input_derivations: &BTreeMap<String, BTreeSet<String>>,
-) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
-    writer.write_char(BRACKET_OPEN)?;
+) -> Result<(), io::Error> {
+    write_char(writer, COMMA)?;
+    write_char(writer, BRACKET_OPEN)?;
 
-    for (ii, (input_derivation_path, input_derivation)) in input_derivations.iter().enumerate() {
+    for (ii, (input_derivation_path, input_derivation)) in input_derivations.into_iter().enumerate()
+    {
         if ii > 0 {
-            writer.write_char(COMMA)?;
+            write_char(writer, COMMA)?;
         }
 
-        writer.write_char(PAREN_OPEN)?;
-        writer.write_char(QUOTE)?;
-        writer.write_str(input_derivation_path.as_str())?;
-        writer.write_char(QUOTE)?;
-        writer.write_char(COMMA)?;
+        write_char(writer, PAREN_OPEN)?;
+        write_char(writer, QUOTE)?;
+        write_str(writer, input_derivation_path.as_str())?;
+        write_char(writer, QUOTE)?;
+        write_char(writer, COMMA)?;
 
         write_array_elements(
             writer,
             true,
             &BRACKET_OPEN.to_string(),
             &BRACKET_CLOSE.to_string(),
-            input_derivation.iter().map(|s| &**s).collect(),
+            &input_derivation
+                .iter()
+                .map(|s| s.as_bytes().to_vec().into())
+                .collect::<Vec<BString>>(),
         )?;
 
-        writer.write_char(PAREN_CLOSE)?;
+        write_char(writer, PAREN_CLOSE)?;
     }
 
-    writer.write_char(BRACKET_CLOSE)?;
+    write_char(writer, BRACKET_CLOSE)?;
 
     Ok(())
 }
@@ -125,39 +148,45 @@ pub fn write_input_derivations(
 pub fn write_input_sources(
     writer: &mut impl Write,
     input_sources: &BTreeSet<String>,
-) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
+) -> Result<(), io::Error> {
+    write_char(writer, COMMA)?;
 
     write_array_elements(
         writer,
         true,
         &BRACKET_OPEN.to_string(),
         &BRACKET_CLOSE.to_string(),
-        input_sources.iter().map(|s| &**s).collect(),
+        &input_sources
+            .iter()
+            .map(|s| s.as_bytes().to_vec().into())
+            .collect::<Vec<BString>>(),
     )?;
 
     Ok(())
 }
 
-pub fn write_system(writer: &mut impl Write, platform: &str) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
-    writer.write_str(escape_string(platform).as_str())?;
+pub fn write_system(writer: &mut impl Write, platform: &str) -> Result<(), Error> {
+    write_char(writer, COMMA)?;
+    io::copy(&mut Cursor::new(escape_bstr(platform.as_bytes())), writer)?;
     Ok(())
 }
 
-pub fn write_builder(writer: &mut impl Write, builder: &str) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
-    writer.write_str(escape_string(builder).as_str())?;
+pub fn write_builder(writer: &mut impl Write, builder: &str) -> Result<(), Error> {
+    write_char(writer, COMMA)?;
+    io::copy(&mut Cursor::new(escape_bstr(builder.as_bytes())), writer)?;
     Ok(())
 }
-pub fn write_arguments(writer: &mut impl Write, arguments: &[String]) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
+pub fn write_arguments(writer: &mut impl Write, arguments: &[String]) -> Result<(), io::Error> {
+    write_char(writer, COMMA)?;
     write_array_elements(
         writer,
         true,
         &BRACKET_OPEN.to_string(),
         &BRACKET_CLOSE.to_string(),
-        arguments.iter().map(|s| &**s).collect(),
+        &arguments
+            .iter()
+            .map(|s| s.as_bytes().to_vec().into())
+            .collect::<Vec<BString>>(),
     )?;
 
     Ok(())
@@ -165,14 +194,14 @@ pub fn write_arguments(writer: &mut impl Write, arguments: &[String]) -> Result<
 
 pub fn write_enviroment(
     writer: &mut impl Write,
-    environment: &BTreeMap<String, String>,
-) -> Result<(), fmt::Error> {
-    writer.write_char(COMMA)?;
-    writer.write_char(BRACKET_OPEN)?;
-
-    for (ii, (key, environment)) in environment.iter().enumerate() {
-        if ii > 0 {
-            writer.write_char(COMMA)?;
+    environment: &BTreeMap<String, BString>,
+) -> Result<(), io::Error> {
+    write_char(writer, COMMA)?;
+    write_char(writer, BRACKET_OPEN)?;
+
+    for (i, (k, v)) in environment.into_iter().enumerate() {
+        if i > 0 {
+            write_char(writer, COMMA)?;
         }
 
         write_array_elements(
@@ -180,11 +209,11 @@ pub fn write_enviroment(
             false,
             &PAREN_OPEN.to_string(),
             &PAREN_CLOSE.to_string(),
-            vec![&escape_string(key), &escape_string(environment)],
+            &[escape_bstr(k.as_bytes()), escape_bstr(v)],
         )?;
     }
 
-    writer.write_char(BRACKET_CLOSE)?;
+    write_char(writer, BRACKET_CLOSE)?;
 
     Ok(())
 }