From 63116d8c21afdc50725ae93d13839fe1915b06b7 Mon Sep 17 00:00:00 2001 From: Connor Brewster Date: Fri, 22 Mar 2024 18:52:21 -0500 Subject: fix(tvix): Avoid buffering file into memory in builtins.hashFile Right now `builtins.hashFile` always reads the entire file into memory before hashing, which is not ideal for large files. This replaces `read_to_string` with `open_file` which allows calculating the hash of the file without buffering it entirely into memory. Other callers can continue to buffer into memory if they choose, but they still use the `open_file` VM request and then call `read_to_string` or `read_to_end` on the `std::io::Reader`. Fixes b/380 Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236 Reviewed-by: flokli Tested-by: BuildkiteCI --- tvix/eval/src/builtins/hash.rs | 20 ++++++++++++-------- tvix/eval/src/builtins/impure.rs | 15 ++++++++++----- tvix/eval/src/builtins/mod.rs | 5 ++--- 3 files changed, 24 insertions(+), 16 deletions(-) (limited to 'tvix/eval/src/builtins') diff --git a/tvix/eval/src/builtins/hash.rs b/tvix/eval/src/builtins/hash.rs index 6d07fc9b2dc8..d0145f1e7d75 100644 --- a/tvix/eval/src/builtins/hash.rs +++ b/tvix/eval/src/builtins/hash.rs @@ -6,18 +6,22 @@ use sha2::{digest::Output, Digest, Sha256, Sha512}; use crate::ErrorKind; -fn hash<D: Digest>(b: &[u8]) -> Output<D> { +/// Reads through all data from the passed reader, and returns the resulting [Digest]. +/// The exact hash function used is left generic over all [Digest]. +fn hash<D: Digest + std::io::Write>(mut r: impl std::io::Read) -> Result<Output<D>, ErrorKind> { let mut hasher = D::new(); - hasher.update(b); - hasher.finalize() + std::io::copy(&mut r, &mut hasher)?; + Ok(hasher.finalize()) } -pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl AsRef<[u8]>) -> Result<String, ErrorKind> { +/// For a given algo "string" and reader for data, calculate the digest +/// and return it as a hexlower encoded [String]. 
+pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl std::io::Read) -> Result<String, ErrorKind> { match algo.as_ref() { - b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s.as_ref()).as_bstr())), - b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s.as_ref()).as_bstr())), - b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s.as_ref()).as_bstr())), - b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s.as_ref()).as_bstr())), + b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s)?.as_bstr())), + b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s)?.as_bstr())), + b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s)?.as_bstr())), + b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s)?.as_bstr())), _ => Err(ErrorKind::UnknownHashType( algo.as_ref().as_bstr().to_string(), )), diff --git a/tvix/eval/src/builtins/impure.rs b/tvix/eval/src/builtins/impure.rs index aad55c7331e8..18403fe5d89b 100644 --- a/tvix/eval/src/builtins/impure.rs +++ b/tvix/eval/src/builtins/impure.rs @@ -31,14 +31,13 @@ mod impure_builtins { } #[builtin("hashFile")] - #[allow(non_snake_case)] - async fn builtin_hashFile(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> { + async fn builtin_hash_file(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> { let path = match coerce_value_to_path(&co, path).await? { Err(cek) => return Ok(Value::from(cek)), Ok(p) => p, }; - let s = generators::request_read_to_string(&co, path).await; - hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from) + let r = generators::request_open_file(&co, path).await; + Ok(hash_nix_string(algo.to_str()?, r).map(Value::from)?) } #[builtin("pathExists")] @@ -79,7 +78,13 @@ mod impure_builtins { async fn builtin_read_file(co: GenCo, path: Value) -> Result<Value, ErrorKind> { match coerce_value_to_path(&co, path).await? 
{ Err(cek) => Ok(Value::from(cek)), - Ok(path) => Ok(generators::request_read_to_string(&co, path).await), + Ok(path) => { + let mut buf = Vec::new(); + generators::request_open_file(&co, path) + .await + .read_to_end(&mut buf)?; + Ok(Value::from(buf)) + } } } } diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs index 8973a25927ed..cb55894b6c7e 100644 --- a/tvix/eval/src/builtins/mod.rs +++ b/tvix/eval/src/builtins/mod.rs @@ -773,9 +773,8 @@ mod pure_builtins { } #[builtin("hashString")] - #[allow(non_snake_case)] - async fn builtin_hashString(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> { - hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from) + async fn builtin_hash_string(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> { + hash_nix_string(algo.to_str()?, std::io::Cursor::new(s.to_str()?)).map(Value::from) } #[builtin("head")] -- cgit 1.4.1