about summary refs log tree commit diff
path: root/tvix/eval/src/builtins
diff options
context:
space:
mode:
author: Connor Brewster <cbrewster@hey.com> 2024-03-22T23·52-0500
committer: Connor Brewster <cbrewster@hey.com> 2024-04-09T17·31+0000
commit: 63116d8c21afdc50725ae93d13839fe1915b06b7 (patch)
tree: 4997838251dac809c2917b35e5d32224030ba595 /tvix/eval/src/builtins
parent: 17849c5c0033fa1909f0403b5d5e6a5e018b7fee (diff)
fix(tvix): Avoid buffering file into memory in builtins.hashFile r/7882
Right now `builtins.hashFile` always reads the entire file into memory
before hashing, which is not ideal for large files. This replaces
`read_to_string` with `open_file` which allows calculating the hash of
the file without buffering it entirely into memory. Other callers can
continue to buffer into memory if they choose, but they still use the
`open_file` VM request and then call `read_to_string` or `read_to_end`
on the returned reader (a `std::io::Read` implementation).

Fixes b/380

Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix/eval/src/builtins')
-rw-r--r-- tvix/eval/src/builtins/hash.rs 20
-rw-r--r-- tvix/eval/src/builtins/impure.rs 15
-rw-r--r-- tvix/eval/src/builtins/mod.rs 5
3 files changed, 24 insertions, 16 deletions
diff --git a/tvix/eval/src/builtins/hash.rs b/tvix/eval/src/builtins/hash.rs
index 6d07fc9b2dc8..d0145f1e7d75 100644
--- a/tvix/eval/src/builtins/hash.rs
+++ b/tvix/eval/src/builtins/hash.rs
@@ -6,18 +6,22 @@ use sha2::{digest::Output, Digest, Sha256, Sha512};
 
 use crate::ErrorKind;
 
-fn hash<D: Digest>(b: &[u8]) -> Output<D> {
+/// Reads through all data from the passed reader, and returns the resulting [Digest].
+/// The exact hash function used is left generic over all [Digest].
+fn hash<D: Digest + std::io::Write>(mut r: impl std::io::Read) -> Result<Output<D>, ErrorKind> {
     let mut hasher = D::new();
-    hasher.update(b);
-    hasher.finalize()
+    std::io::copy(&mut r, &mut hasher)?;
+    Ok(hasher.finalize())
 }
 
-pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl AsRef<[u8]>) -> Result<String, ErrorKind> {
+/// For a given algo "string" and reader for data, calculate the digest
+/// and return it as a hexlower encoded [String].
+pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl std::io::Read) -> Result<String, ErrorKind> {
     match algo.as_ref() {
-        b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s.as_ref()).as_bstr())),
-        b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s.as_ref()).as_bstr())),
-        b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s.as_ref()).as_bstr())),
-        b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s.as_ref()).as_bstr())),
+        b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s)?.as_bstr())),
+        b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s)?.as_bstr())),
+        b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s)?.as_bstr())),
+        b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s)?.as_bstr())),
         _ => Err(ErrorKind::UnknownHashType(
             algo.as_ref().as_bstr().to_string(),
         )),
diff --git a/tvix/eval/src/builtins/impure.rs b/tvix/eval/src/builtins/impure.rs
index aad55c7331e8..18403fe5d89b 100644
--- a/tvix/eval/src/builtins/impure.rs
+++ b/tvix/eval/src/builtins/impure.rs
@@ -31,14 +31,13 @@ mod impure_builtins {
     }
 
     #[builtin("hashFile")]
-    #[allow(non_snake_case)]
-    async fn builtin_hashFile(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
+    async fn builtin_hash_file(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
         let path = match coerce_value_to_path(&co, path).await? {
             Err(cek) => return Ok(Value::from(cek)),
             Ok(p) => p,
         };
-        let s = generators::request_read_to_string(&co, path).await;
-        hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
+        let r = generators::request_open_file(&co, path).await;
+        Ok(hash_nix_string(algo.to_str()?, r).map(Value::from)?)
     }
 
     #[builtin("pathExists")]
@@ -79,7 +78,13 @@ mod impure_builtins {
     async fn builtin_read_file(co: GenCo, path: Value) -> Result<Value, ErrorKind> {
         match coerce_value_to_path(&co, path).await? {
             Err(cek) => Ok(Value::from(cek)),
-            Ok(path) => Ok(generators::request_read_to_string(&co, path).await),
+            Ok(path) => {
+                let mut buf = Vec::new();
+                generators::request_open_file(&co, path)
+                    .await
+                    .read_to_end(&mut buf)?;
+                Ok(Value::from(buf))
+            }
         }
     }
 }
diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs
index 8973a25927ed..cb55894b6c7e 100644
--- a/tvix/eval/src/builtins/mod.rs
+++ b/tvix/eval/src/builtins/mod.rs
@@ -773,9 +773,8 @@ mod pure_builtins {
     }
 
     #[builtin("hashString")]
-    #[allow(non_snake_case)]
-    async fn builtin_hashString(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
-        hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
+    async fn builtin_hash_string(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
+        hash_nix_string(algo.to_str()?, std::io::Cursor::new(s.to_str()?)).map(Value::from)
     }
 
     #[builtin("head")]