about summary refs log tree commit diff
path: root/tvix
diff options
context:
space:
mode:
author Connor Brewster <cbrewster@hey.com> 2024-03-22T23·52-0500
committer Connor Brewster <cbrewster@hey.com> 2024-04-09T17·31+0000
commit 63116d8c21afdc50725ae93d13839fe1915b06b7 (patch)
tree 4997838251dac809c2917b35e5d32224030ba595 /tvix
parent 17849c5c0033fa1909f0403b5d5e6a5e018b7fee (diff)
fix(tvix): Avoid buffering file into memory in builtins.hashFile r/7882
Right now `builtins.hashFile` always reads the entire file into memory
before hashing, which is not ideal for large files. This replaces
`read_to_string` with `open_file` which allows calculating the hash of
the file without buffering it entirely into memory. Other callers can
continue to buffer into memory if they choose, but they still use the
`open_file` VM request and then call `read_to_string` or `read_to_end`
on the returned `std::io::Read` implementation.

Fixes b/380

Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
Diffstat (limited to 'tvix')
-rw-r--r-- tvix/eval/src/builtins/hash.rs 20
-rw-r--r-- tvix/eval/src/builtins/impure.rs 15
-rw-r--r-- tvix/eval/src/builtins/mod.rs 5
-rw-r--r-- tvix/eval/src/compiler/import.rs 9
-rw-r--r-- tvix/eval/src/io.rs 16
-rw-r--r-- tvix/eval/src/vm/generators.rs 27
-rw-r--r-- tvix/glue/src/builtins/import.rs 6
-rw-r--r-- tvix/glue/src/tvix_io.rs 11
-rw-r--r-- tvix/glue/src/tvix_store_io.rs 45
9 files changed, 80 insertions, 74 deletions
diff --git a/tvix/eval/src/builtins/hash.rs b/tvix/eval/src/builtins/hash.rs
index 6d07fc9b2dc8..d0145f1e7d75 100644
--- a/tvix/eval/src/builtins/hash.rs
+++ b/tvix/eval/src/builtins/hash.rs
@@ -6,18 +6,22 @@ use sha2::{digest::Output, Digest, Sha256, Sha512};
 
 use crate::ErrorKind;
 
-fn hash<D: Digest>(b: &[u8]) -> Output<D> {
+/// Reads through all data from the passed reader, and returns the resulting [Digest].
+/// The exact hash function used is left generic over all [Digest].
+fn hash<D: Digest + std::io::Write>(mut r: impl std::io::Read) -> Result<Output<D>, ErrorKind> {
     let mut hasher = D::new();
-    hasher.update(b);
-    hasher.finalize()
+    std::io::copy(&mut r, &mut hasher)?;
+    Ok(hasher.finalize())
 }
 
-pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl AsRef<[u8]>) -> Result<String, ErrorKind> {
+/// For a given algo "string" and reader for data, calculate the digest
+/// and return it as a hexlower encoded [String].
+pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl std::io::Read) -> Result<String, ErrorKind> {
     match algo.as_ref() {
-        b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s.as_ref()).as_bstr())),
-        b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s.as_ref()).as_bstr())),
-        b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s.as_ref()).as_bstr())),
-        b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s.as_ref()).as_bstr())),
+        b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s)?.as_bstr())),
+        b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s)?.as_bstr())),
+        b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s)?.as_bstr())),
+        b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s)?.as_bstr())),
         _ => Err(ErrorKind::UnknownHashType(
             algo.as_ref().as_bstr().to_string(),
         )),
diff --git a/tvix/eval/src/builtins/impure.rs b/tvix/eval/src/builtins/impure.rs
index aad55c7331e8..18403fe5d89b 100644
--- a/tvix/eval/src/builtins/impure.rs
+++ b/tvix/eval/src/builtins/impure.rs
@@ -31,14 +31,13 @@ mod impure_builtins {
     }
 
     #[builtin("hashFile")]
-    #[allow(non_snake_case)]
-    async fn builtin_hashFile(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
+    async fn builtin_hash_file(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
         let path = match coerce_value_to_path(&co, path).await? {
             Err(cek) => return Ok(Value::from(cek)),
             Ok(p) => p,
         };
-        let s = generators::request_read_to_string(&co, path).await;
-        hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
+        let r = generators::request_open_file(&co, path).await;
+        Ok(hash_nix_string(algo.to_str()?, r).map(Value::from)?)
     }
 
     #[builtin("pathExists")]
@@ -79,7 +78,13 @@ mod impure_builtins {
     async fn builtin_read_file(co: GenCo, path: Value) -> Result<Value, ErrorKind> {
         match coerce_value_to_path(&co, path).await? {
             Err(cek) => Ok(Value::from(cek)),
-            Ok(path) => Ok(generators::request_read_to_string(&co, path).await),
+            Ok(path) => {
+                let mut buf = Vec::new();
+                generators::request_open_file(&co, path)
+                    .await
+                    .read_to_end(&mut buf)?;
+                Ok(Value::from(buf))
+            }
         }
     }
 }
diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs
index 8973a25927ed..cb55894b6c7e 100644
--- a/tvix/eval/src/builtins/mod.rs
+++ b/tvix/eval/src/builtins/mod.rs
@@ -773,9 +773,8 @@ mod pure_builtins {
     }
 
     #[builtin("hashString")]
-    #[allow(non_snake_case)]
-    async fn builtin_hashString(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
-        hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
+    async fn builtin_hash_string(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
+        hash_nix_string(algo.to_str()?, std::io::Cursor::new(s.to_str()?)).map(Value::from)
     }
 
     #[builtin("head")]
diff --git a/tvix/eval/src/compiler/import.rs b/tvix/eval/src/compiler/import.rs
index c56909e958fb..9036eec81731 100644
--- a/tvix/eval/src/compiler/import.rs
+++ b/tvix/eval/src/compiler/import.rs
@@ -6,7 +6,6 @@
 //! instance, or observers).
 
 use super::GlobalsMap;
-use bstr::ByteSlice;
 use genawaiter::rc::Gen;
 use std::rc::Weak;
 
@@ -39,9 +38,11 @@ async fn import_impl(
         return Ok(cached);
     }
 
-    // TODO(tazjin): make this return a string directly instead
-    let contents: Value = generators::request_read_to_string(&co, path.clone()).await;
-    let contents = contents.to_str()?.to_str()?.to_owned();
+    let mut reader = generators::request_open_file(&co, path.clone()).await;
+    // We read to a String instead of a Vec<u8> because rnix only supports
+    // string source files.
+    let mut contents = String::new();
+    reader.read_to_string(&mut contents)?;
 
     let parsed = rnix::ast::Root::parse(&contents);
     let errors = parsed.errors();
diff --git a/tvix/eval/src/io.rs b/tvix/eval/src/io.rs
index 1c38bc68de90..f775077af818 100644
--- a/tvix/eval/src/io.rs
+++ b/tvix/eval/src/io.rs
@@ -16,6 +16,7 @@
 //! how store paths are opened and so on.
 
 use std::{
+    fs::File,
     io,
     path::{Path, PathBuf},
 };
@@ -48,13 +49,8 @@ pub trait EvalIO {
     /// * `builtins.pathExists :: path -> bool`
     fn path_exists(&self, path: &Path) -> io::Result<bool>;
 
-    /// Read the file at the specified path to a `Vec<u8>`.
-    ///
-    /// This is used for the following language evaluation cases:
-    ///
-    /// * `builtins.readFile :: path -> string`
-    /// * `builtins.import :: path -> any`
-    fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>>;
+    /// Open the file at the specified path to a `io::Read`.
+    fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>>;
 
     /// Read the directory at the specified path and return the names
     /// of its entries associated with their [`FileType`].
@@ -99,8 +95,8 @@ impl EvalIO for StdIO {
         path.try_exists()
     }
 
-    fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
-        std::fs::read(path)
+    fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
+        Ok(Box::new(File::open(path)?))
     }
 
     fn read_dir(&self, path: &Path) -> io::Result<Vec<(bytes::Bytes, FileType)>> {
@@ -145,7 +141,7 @@ impl EvalIO for DummyIO {
         ))
     }
 
-    fn read_to_end(&self, _: &Path) -> io::Result<Vec<u8>> {
+    fn open(&self, _: &Path) -> io::Result<Box<dyn io::Read>> {
         Err(io::Error::new(
             io::ErrorKind::Unsupported,
             "I/O methods are not implemented in DummyIO",
diff --git a/tvix/eval/src/vm/generators.rs b/tvix/eval/src/vm/generators.rs
index 4dacdef0dd08..d5b5f1de4979 100644
--- a/tvix/eval/src/vm/generators.rs
+++ b/tvix/eval/src/vm/generators.rs
@@ -102,8 +102,8 @@ pub enum VMRequest {
     /// Request that the VM imports the given path through its I/O interface.
     PathImport(PathBuf),
 
-    /// Request that the VM reads the given path to a string.
-    ReadToString(PathBuf),
+    /// Request that the VM opens the specified file and provides a reader.
+    OpenFile(PathBuf),
 
     /// Request that the VM checks whether the given path exists.
     PathExists(PathBuf),
@@ -170,8 +170,8 @@ impl Display for VMRequest {
                 write!(f, "import_cache_put({})", p.to_string_lossy())
             }
             VMRequest::PathImport(p) => write!(f, "path_import({})", p.to_string_lossy()),
-            VMRequest::ReadToString(p) => {
-                write!(f, "read_to_string({})", p.to_string_lossy())
+            VMRequest::OpenFile(p) => {
+                write!(f, "open_file({})", p.to_string_lossy())
             }
             VMRequest::PathExists(p) => write!(f, "path_exists({})", p.to_string_lossy()),
             VMRequest::ReadDir(p) => write!(f, "read_dir({})", p.to_string_lossy()),
@@ -199,6 +199,9 @@ pub enum VMResponse {
 
     /// VM response with a span to use at the current point.
     Span(LightSpan),
+
+    /// [std::io::Reader] produced by the VM in response to some IO operation.
+    Reader(Box<dyn std::io::Read>),
 }
 
 impl Display for VMResponse {
@@ -209,6 +212,7 @@ impl Display for VMResponse {
             VMResponse::Path(p) => write!(f, "path({})", p.to_string_lossy()),
             VMResponse::Directory(d) => write!(f, "dir(len = {})", d.len()),
             VMResponse::Span(_) => write!(f, "span"),
+            VMResponse::Reader(_) => write!(f, "reader"),
         }
     }
 }
@@ -425,18 +429,18 @@ where
                             message = VMResponse::Path(imported);
                         }
 
-                        VMRequest::ReadToString(path) => {
-                            let content = self
+                        VMRequest::OpenFile(path) => {
+                            let reader = self
                                 .io_handle
                                 .as_ref()
-                                .read_to_end(&path)
+                                .open(&path)
                                 .map_err(|e| ErrorKind::IO {
                                     path: Some(path),
                                     error: e.into(),
                                 })
                                 .with_span(&span, self)?;
 
-                            message = VMResponse::Value(content.into())
+                            message = VMResponse::Reader(reader)
                         }
 
                         VMRequest::PathExists(path) => {
@@ -730,9 +734,10 @@ pub(crate) async fn request_path_import(co: &GenCo, path: PathBuf) -> PathBuf {
     }
 }
 
-pub(crate) async fn request_read_to_string(co: &GenCo, path: PathBuf) -> Value {
-    match co.yield_(VMRequest::ReadToString(path)).await {
-        VMResponse::Value(value) => value,
+/// Request that the VM open a [std::io::Read] for the specified file.
+pub async fn request_open_file(co: &GenCo, path: PathBuf) -> Box<dyn std::io::Read> {
+    match co.yield_(VMRequest::OpenFile(path)).await {
+        VMResponse::Reader(value) => value,
         msg => panic!(
             "Tvix bug: VM responded with incorrect generator message: {}",
             msg
diff --git a/tvix/glue/src/builtins/import.rs b/tvix/glue/src/builtins/import.rs
index 2f02cd6ebbcd..3a2dea953fd3 100644
--- a/tvix/glue/src/builtins/import.rs
+++ b/tvix/glue/src/builtins/import.rs
@@ -177,9 +177,9 @@ mod import_builtins {
             })
             .transpose()?;
 
-        // FUTUREWORK(performance): this reads the file instead of using a stat-like
-        // system call to the file, this degrades very badly on large files.
-        if !recursive_ingestion && state.read_to_end(path.as_ref()).is_err() {
+        // FUTUREWORK(performance): this opens the file instead of using a stat-like
+        // system call to the file.
+        if !recursive_ingestion && state.open(path.as_ref()).is_err() {
             Err(ImportError::FlatImportOfNonFile(
                 path.to_string_lossy().to_string(),
             ))?;
diff --git a/tvix/glue/src/tvix_io.rs b/tvix/glue/src/tvix_io.rs
index 95146df7287e..0e5f23b99093 100644
--- a/tvix/glue/src/tvix_io.rs
+++ b/tvix/glue/src/tvix_io.rs
@@ -8,7 +8,7 @@
 //! otherwise fundamental features like nixpkgs bootstrapping and hash
 //! calculation will not work.
 
-use std::io;
+use std::io::{self, Cursor};
 use std::path::{Path, PathBuf};
 use tvix_eval::{EvalIO, FileType};
 
@@ -44,7 +44,7 @@ where
         self.actual.as_ref().path_exists(path)
     }
 
-    fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
+    fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
         // Bundled version of corepkgs/fetchurl.nix. The counterpart
         // of this happens in [crate::configure_nix_path], where the `nix_path`
         // of the evaluation has `nix=/__corepkgs__` added to it.
@@ -52,13 +52,12 @@ where
         // This workaround is similar to what cppnix does for passing
         // the path through.
         //
-        // TODO: this comparison is bad and allocates, we should use
-        // the sane path library.
+        // TODO: this comparison is bad we should use the sane path library.
         if path.starts_with("/__corepkgs__/fetchurl.nix") {
-            return Ok(include_bytes!("fetchurl.nix").to_vec());
+            return Ok(Box::new(Cursor::new(include_bytes!("fetchurl.nix"))));
         }
 
-        self.actual.as_ref().read_to_end(path)
+        self.actual.as_ref().open(path)
     }
 
     fn read_dir(&self, path: &Path) -> io::Result<Vec<(bytes::Bytes, FileType)>> {
diff --git a/tvix/glue/src/tvix_store_io.rs b/tvix/glue/src/tvix_store_io.rs
index 7b675bfc7d88..10a59027852f 100644
--- a/tvix/glue/src/tvix_store_io.rs
+++ b/tvix/glue/src/tvix_store_io.rs
@@ -17,7 +17,7 @@ use std::{
     path::{Path, PathBuf},
     sync::Arc,
 };
-use tokio::io::AsyncReadExt;
+use tokio_util::io::SyncIoBridge;
 use tracing::{error, instrument, warn, Level};
 use tvix_build::buildservice::BuildService;
 use tvix_eval::{ErrorKind, EvalIO, FileType, StdIO};
@@ -478,7 +478,7 @@ impl EvalIO for TvixStoreIO {
     }
 
     #[instrument(skip(self), err)]
-    fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
+    fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
         if let Ok((store_path, sub_path)) =
             StorePath::from_absolute_path_full(&path.to_string_lossy())
         {
@@ -509,27 +509,24 @@ impl EvalIO for TvixStoreIO {
                             })?;
 
                         self.tokio_handle.block_on(async {
-                            let mut reader = {
-                                let resp = self.blob_service.as_ref().open_read(&digest).await?;
-                                match resp {
-                                    Some(blob_reader) => blob_reader,
-                                    None => {
-                                        error!(
-                                            blob.digest = %digest,
-                                            "blob not found",
-                                        );
-                                        Err(io::Error::new(
-                                            io::ErrorKind::NotFound,
-                                            format!("blob {} not found", &digest),
-                                        ))?
-                                    }
+                            let resp = self.blob_service.as_ref().open_read(&digest).await?;
+                            match resp {
+                                Some(blob_reader) => {
+                                    // The VM Response needs a sync [std::io::Reader].
+                                    Ok(Box::new(SyncIoBridge::new(blob_reader))
+                                        as Box<dyn io::Read>)
                                 }
-                            };
-
-                            let mut buf = Vec::new();
-
-                            reader.read_to_end(&mut buf).await?;
-                            Ok(buf)
+                                None => {
+                                    error!(
+                                        blob.digest = %digest,
+                                        "blob not found",
+                                    );
+                                    Err(io::Error::new(
+                                        io::ErrorKind::NotFound,
+                                        format!("blob {} not found", &digest),
+                                    ))
+                                }
+                            }
                         })
                     }
                     Node::Symlink(_symlink_node) => Err(io::Error::new(
@@ -540,11 +537,11 @@ impl EvalIO for TvixStoreIO {
             } else {
                 // As tvix-store doesn't manage /nix/store on the filesystem,
                 // we still need to also ask self.std_io here.
-                self.std_io.read_to_end(path)
+                self.std_io.open(path)
             }
         } else {
             // The store path is no store path, so do regular StdIO.
-            self.std_io.read_to_end(path)
+            self.std_io.open(path)
         }
     }