From 63116d8c21afdc50725ae93d13839fe1915b06b7 Mon Sep 17 00:00:00 2001 From: Connor Brewster Date: Fri, 22 Mar 2024 18:52:21 -0500 Subject: fix(tvix): Avoid buffering file into memory in builtins.hashFile Right now `builtins.hashFile` always reads the entire file into memory before hashing, which is not ideal for large files. This replaces `read_to_string` with `open_file` which allows calculating the hash of the file without buffering it entirely into memory. Other callers can continue to buffer into memory if they choose, but they still use the `open_file` VM request and then call `read_to_string` or `read_to_end` on the `std::io::Reader`. Fixes b/380 Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236 Reviewed-by: flokli Tested-by: BuildkiteCI --- tvix/glue/src/builtins/import.rs | 6 +++--- tvix/glue/src/tvix_io.rs | 11 +++++----- tvix/glue/src/tvix_store_io.rs | 45 +++++++++++++++++++--------------------- 3 files changed, 29 insertions(+), 33 deletions(-) (limited to 'tvix/glue/src') diff --git a/tvix/glue/src/builtins/import.rs b/tvix/glue/src/builtins/import.rs index 2f02cd6ebbcd..3a2dea953fd3 100644 --- a/tvix/glue/src/builtins/import.rs +++ b/tvix/glue/src/builtins/import.rs @@ -177,9 +177,9 @@ mod import_builtins { }) .transpose()?; - // FUTUREWORK(performance): this reads the file instead of using a stat-like - // system call to the file, this degrades very badly on large files. - if !recursive_ingestion && state.read_to_end(path.as_ref()).is_err() { + // FUTUREWORK(performance): this opens the file instead of using a stat-like + // system call to the file. + if !recursive_ingestion && state.open(path.as_ref()).is_err() { Err(ImportError::FlatImportOfNonFile( path.to_string_lossy().to_string(), ))?; diff --git a/tvix/glue/src/tvix_io.rs b/tvix/glue/src/tvix_io.rs index 95146df7287e..0e5f23b99093 100644 --- a/tvix/glue/src/tvix_io.rs +++ b/tvix/glue/src/tvix_io.rs @@ -8,7 +8,7 @@ //! 
otherwise fundamental features like nixpkgs bootstrapping and hash //! calculation will not work. -use std::io; +use std::io::{self, Cursor}; use std::path::{Path, PathBuf}; use tvix_eval::{EvalIO, FileType}; @@ -44,7 +44,7 @@ where self.actual.as_ref().path_exists(path) } - fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> { + fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> { // Bundled version of corepkgs/fetchurl.nix. The counterpart // of this happens in [crate::configure_nix_path], where the `nix_path` // of the evaluation has `nix=/__corepkgs__` added to it. @@ -52,13 +52,12 @@ where // This workaround is similar to what cppnix does for passing // the path through. // - // TODO: this comparison is bad and allocates, we should use - // the sane path library. + // TODO: this comparison is bad we should use the sane path library. if path.starts_with("/__corepkgs__/fetchurl.nix") { - return Ok(include_bytes!("fetchurl.nix").to_vec()); + return Ok(Box::new(Cursor::new(include_bytes!("fetchurl.nix")))); } - self.actual.as_ref().read_to_end(path) + self.actual.as_ref().open(path) } fn read_dir(&self, path: &Path) -> io::Result<Vec<(bytes::Bytes, FileType)>> { diff --git a/tvix/glue/src/tvix_store_io.rs b/tvix/glue/src/tvix_store_io.rs index 7b675bfc7d88..10a59027852f 100644 --- a/tvix/glue/src/tvix_store_io.rs +++ b/tvix/glue/src/tvix_store_io.rs @@ -17,7 +17,7 @@ use std::{ path::{Path, PathBuf}, sync::Arc, }; -use tokio::io::AsyncReadExt; +use tokio_util::io::SyncIoBridge; use tracing::{error, instrument, warn, Level}; use tvix_build::buildservice::BuildService; use tvix_eval::{ErrorKind, EvalIO, FileType, StdIO}; @@ -478,7 +478,7 @@ impl EvalIO for TvixStoreIO { } #[instrument(skip(self), err)] - fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> { + fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> { if let Ok((store_path, sub_path)) = StorePath::from_absolute_path_full(&path.to_string_lossy()) { @@ -509,27 +509,24 @@ impl EvalIO for TvixStoreIO { })?; self.tokio_handle.block_on(async { - let mut reader 
= { - let resp = self.blob_service.as_ref().open_read(&digest).await?; - match resp { - Some(blob_reader) => blob_reader, - None => { - error!( - blob.digest = %digest, - "blob not found", - ); - Err(io::Error::new( - io::ErrorKind::NotFound, - format!("blob {} not found", &digest), - ))? - } + let resp = self.blob_service.as_ref().open_read(&digest).await?; + match resp { + Some(blob_reader) => { + // The VM Response needs a sync [std::io::Reader]. + Ok(Box::new(SyncIoBridge::new(blob_reader)) + as Box<dyn io::Read>) } - }; - - let mut buf = Vec::new(); - - reader.read_to_end(&mut buf).await?; - Ok(buf) + None => { + error!( + blob.digest = %digest, + "blob not found", + ); + Err(io::Error::new( + io::ErrorKind::NotFound, + format!("blob {} not found", &digest), + )) + } + } }) } Node::Symlink(_symlink_node) => Err(io::Error::new( @@ -540,11 +537,11 @@ impl EvalIO for TvixStoreIO { } else { // As tvix-store doesn't manage /nix/store on the filesystem, // we still need to also ask self.std_io here. - self.std_io.read_to_end(path) + self.std_io.open(path) } } else { // The store path is no store path, so do regular StdIO. - self.std_io.read_to_end(path) + self.std_io.open(path) } } -- cgit 1.4.1