From 63116d8c21afdc50725ae93d13839fe1915b06b7 Mon Sep 17 00:00:00 2001 From: Connor Brewster Date: Fri, 22 Mar 2024 18:52:21 -0500 Subject: fix(tvix): Avoid buffering file into memory in builtins.hashFile Right now `builtins.hashFile` always reads the entire file into memory before hashing, which is not ideal for large files. This replaces `read_to_string` with `open_file` which allows calculating the hash of the file without buffering it entirely into memory. Other callers can continue to buffer into memory if they choose, but they still use the `open_file` VM request and then call `read_to_string` or `read_to_end` on the `std::io::Reader`. Fixes b/380 Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236 Reviewed-by: flokli Tested-by: BuildkiteCI --- tvix/eval/src/builtins/hash.rs | 20 +++++++++++------- tvix/eval/src/builtins/impure.rs | 15 +++++++++----- tvix/eval/src/builtins/mod.rs | 5 ++--- tvix/eval/src/compiler/import.rs | 9 ++++---- tvix/eval/src/io.rs | 16 ++++++-------- tvix/eval/src/vm/generators.rs | 27 ++++++++++++++---------- tvix/glue/src/builtins/import.rs | 6 +++--- tvix/glue/src/tvix_io.rs | 11 +++++----- tvix/glue/src/tvix_store_io.rs | 45 +++++++++++++++++++--------------------- 9 files changed, 80 insertions(+), 74 deletions(-) diff --git a/tvix/eval/src/builtins/hash.rs b/tvix/eval/src/builtins/hash.rs index 6d07fc9b2dc8..d0145f1e7d75 100644 --- a/tvix/eval/src/builtins/hash.rs +++ b/tvix/eval/src/builtins/hash.rs @@ -6,18 +6,22 @@ use sha2::{digest::Output, Digest, Sha256, Sha512}; use crate::ErrorKind; -fn hash(b: &[u8]) -> Output { +/// Reads through all data from the passed reader, and returns the resulting [Digest]. +/// The exact hash function used is left generic over all [Digest]. 
+fn hash(mut r: impl std::io::Read) -> Result, ErrorKind> { let mut hasher = D::new(); - hasher.update(b); - hasher.finalize() + std::io::copy(&mut r, &mut hasher)?; + Ok(hasher.finalize()) } -pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl AsRef<[u8]>) -> Result { +/// For a given algo "string" and reader for data, calculate the digest +/// and return it as a hexlower encoded [String]. +pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl std::io::Read) -> Result { match algo.as_ref() { - b"md5" => Ok(HEXLOWER.encode(hash::(s.as_ref()).as_bstr())), - b"sha1" => Ok(HEXLOWER.encode(hash::(s.as_ref()).as_bstr())), - b"sha256" => Ok(HEXLOWER.encode(hash::(s.as_ref()).as_bstr())), - b"sha512" => Ok(HEXLOWER.encode(hash::(s.as_ref()).as_bstr())), + b"md5" => Ok(HEXLOWER.encode(hash::(s)?.as_bstr())), + b"sha1" => Ok(HEXLOWER.encode(hash::(s)?.as_bstr())), + b"sha256" => Ok(HEXLOWER.encode(hash::(s)?.as_bstr())), + b"sha512" => Ok(HEXLOWER.encode(hash::(s)?.as_bstr())), _ => Err(ErrorKind::UnknownHashType( algo.as_ref().as_bstr().to_string(), )), diff --git a/tvix/eval/src/builtins/impure.rs b/tvix/eval/src/builtins/impure.rs index aad55c7331e8..18403fe5d89b 100644 --- a/tvix/eval/src/builtins/impure.rs +++ b/tvix/eval/src/builtins/impure.rs @@ -31,14 +31,13 @@ mod impure_builtins { } #[builtin("hashFile")] - #[allow(non_snake_case)] - async fn builtin_hashFile(co: GenCo, algo: Value, path: Value) -> Result { + async fn builtin_hash_file(co: GenCo, algo: Value, path: Value) -> Result { let path = match coerce_value_to_path(&co, path).await? { Err(cek) => return Ok(Value::from(cek)), Ok(p) => p, }; - let s = generators::request_read_to_string(&co, path).await; - hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from) + let r = generators::request_open_file(&co, path).await; + Ok(hash_nix_string(algo.to_str()?, r).map(Value::from)?) 
} #[builtin("pathExists")] @@ -79,7 +78,13 @@ mod impure_builtins { async fn builtin_read_file(co: GenCo, path: Value) -> Result { match coerce_value_to_path(&co, path).await? { Err(cek) => Ok(Value::from(cek)), - Ok(path) => Ok(generators::request_read_to_string(&co, path).await), + Ok(path) => { + let mut buf = Vec::new(); + generators::request_open_file(&co, path) + .await + .read_to_end(&mut buf)?; + Ok(Value::from(buf)) + } } } } diff --git a/tvix/eval/src/builtins/mod.rs b/tvix/eval/src/builtins/mod.rs index 8973a25927ed..cb55894b6c7e 100644 --- a/tvix/eval/src/builtins/mod.rs +++ b/tvix/eval/src/builtins/mod.rs @@ -773,9 +773,8 @@ mod pure_builtins { } #[builtin("hashString")] - #[allow(non_snake_case)] - async fn builtin_hashString(co: GenCo, algo: Value, s: Value) -> Result { - hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from) + async fn builtin_hash_string(co: GenCo, algo: Value, s: Value) -> Result { + hash_nix_string(algo.to_str()?, std::io::Cursor::new(s.to_str()?)).map(Value::from) } #[builtin("head")] diff --git a/tvix/eval/src/compiler/import.rs b/tvix/eval/src/compiler/import.rs index c56909e958fb..9036eec81731 100644 --- a/tvix/eval/src/compiler/import.rs +++ b/tvix/eval/src/compiler/import.rs @@ -6,7 +6,6 @@ //! instance, or observers). use super::GlobalsMap; -use bstr::ByteSlice; use genawaiter::rc::Gen; use std::rc::Weak; @@ -39,9 +38,11 @@ async fn import_impl( return Ok(cached); } - // TODO(tazjin): make this return a string directly instead - let contents: Value = generators::request_read_to_string(&co, path.clone()).await; - let contents = contents.to_str()?.to_str()?.to_owned(); + let mut reader = generators::request_open_file(&co, path.clone()).await; + // We read to a String instead of a Vec because rnix only supports + // string source files. 
+ let mut contents = String::new(); + reader.read_to_string(&mut contents)?; let parsed = rnix::ast::Root::parse(&contents); let errors = parsed.errors(); diff --git a/tvix/eval/src/io.rs b/tvix/eval/src/io.rs index 1c38bc68de90..f775077af818 100644 --- a/tvix/eval/src/io.rs +++ b/tvix/eval/src/io.rs @@ -16,6 +16,7 @@ //! how store paths are opened and so on. use std::{ + fs::File, io, path::{Path, PathBuf}, }; @@ -48,13 +49,8 @@ pub trait EvalIO { /// * `builtins.pathExists :: path -> bool` fn path_exists(&self, path: &Path) -> io::Result; - /// Read the file at the specified path to a `Vec`. - /// - /// This is used for the following language evaluation cases: - /// - /// * `builtins.readFile :: path -> string` - /// * `builtins.import :: path -> any` - fn read_to_end(&self, path: &Path) -> io::Result>; + /// Open the file at the specified path to a `io::Read`. + fn open(&self, path: &Path) -> io::Result>; /// Read the directory at the specified path and return the names /// of its entries associated with their [`FileType`]. @@ -99,8 +95,8 @@ impl EvalIO for StdIO { path.try_exists() } - fn read_to_end(&self, path: &Path) -> io::Result> { - std::fs::read(path) + fn open(&self, path: &Path) -> io::Result> { + Ok(Box::new(File::open(path)?)) } fn read_dir(&self, path: &Path) -> io::Result> { @@ -145,7 +141,7 @@ impl EvalIO for DummyIO { )) } - fn read_to_end(&self, _: &Path) -> io::Result> { + fn open(&self, _: &Path) -> io::Result> { Err(io::Error::new( io::ErrorKind::Unsupported, "I/O methods are not implemented in DummyIO", diff --git a/tvix/eval/src/vm/generators.rs b/tvix/eval/src/vm/generators.rs index 4dacdef0dd08..d5b5f1de4979 100644 --- a/tvix/eval/src/vm/generators.rs +++ b/tvix/eval/src/vm/generators.rs @@ -102,8 +102,8 @@ pub enum VMRequest { /// Request that the VM imports the given path through its I/O interface. PathImport(PathBuf), - /// Request that the VM reads the given path to a string. 
- ReadToString(PathBuf), + /// Request that the VM opens the specified file and provides a reader. + OpenFile(PathBuf), /// Request that the VM checks whether the given path exists. PathExists(PathBuf), @@ -170,8 +170,8 @@ impl Display for VMRequest { write!(f, "import_cache_put({})", p.to_string_lossy()) } VMRequest::PathImport(p) => write!(f, "path_import({})", p.to_string_lossy()), - VMRequest::ReadToString(p) => { - write!(f, "read_to_string({})", p.to_string_lossy()) + VMRequest::OpenFile(p) => { + write!(f, "open_file({})", p.to_string_lossy()) } VMRequest::PathExists(p) => write!(f, "path_exists({})", p.to_string_lossy()), VMRequest::ReadDir(p) => write!(f, "read_dir({})", p.to_string_lossy()), @@ -199,6 +199,9 @@ pub enum VMResponse { /// VM response with a span to use at the current point. Span(LightSpan), + + /// [std::io::Read] implementation produced by the VM in response to some IO operation. + Reader(Box), } impl Display for VMResponse { @@ -209,6 +212,7 @@ impl Display for VMResponse { VMResponse::Path(p) => write!(f, "path({})", p.to_string_lossy()), VMResponse::Directory(d) => write!(f, "dir(len = {})", d.len()), VMResponse::Span(_) => write!(f, "span"), + VMResponse::Reader(_) => write!(f, "reader"), } } } @@ -425,18 +429,18 @@ where message = VMResponse::Path(imported); } - VMRequest::ReadToString(path) => { - let content = self + VMRequest::OpenFile(path) => { + let reader = self .io_handle .as_ref() - .read_to_end(&path) + .open(&path) .map_err(|e| ErrorKind::IO { path: Some(path), error: e.into(), }) .with_span(&span, self)?; - message = VMResponse::Value(content.into()) + message = VMResponse::Reader(reader) } VMRequest::PathExists(path) => { @@ -730,9 +734,10 @@ pub(crate) async fn request_path_import(co: &GenCo, path: PathBuf) -> PathBuf { } } -pub(crate) async fn request_read_to_string(co: &GenCo, path: PathBuf) -> Value { - match co.yield_(VMRequest::ReadToString(path)).await { - VMResponse::Value(value) => value, +/// Request that the VM open a 
[std::io::Read] for the specified file. +pub async fn request_open_file(co: &GenCo, path: PathBuf) -> Box { + match co.yield_(VMRequest::OpenFile(path)).await { + VMResponse::Reader(value) => value, msg => panic!( "Tvix bug: VM responded with incorrect generator message: {}", msg diff --git a/tvix/glue/src/builtins/import.rs b/tvix/glue/src/builtins/import.rs index 2f02cd6ebbcd..3a2dea953fd3 100644 --- a/tvix/glue/src/builtins/import.rs +++ b/tvix/glue/src/builtins/import.rs @@ -177,9 +177,9 @@ mod import_builtins { }) .transpose()?; - // FUTUREWORK(performance): this reads the file instead of using a stat-like - // system call to the file, this degrades very badly on large files. - if !recursive_ingestion && state.read_to_end(path.as_ref()).is_err() { + // FUTUREWORK(performance): this opens the file instead of using a stat-like + // system call to the file. + if !recursive_ingestion && state.open(path.as_ref()).is_err() { Err(ImportError::FlatImportOfNonFile( path.to_string_lossy().to_string(), ))?; diff --git a/tvix/glue/src/tvix_io.rs b/tvix/glue/src/tvix_io.rs index 95146df7287e..0e5f23b99093 100644 --- a/tvix/glue/src/tvix_io.rs +++ b/tvix/glue/src/tvix_io.rs @@ -8,7 +8,7 @@ //! otherwise fundamental features like nixpkgs bootstrapping and hash //! calculation will not work. -use std::io; +use std::io::{self, Cursor}; use std::path::{Path, PathBuf}; use tvix_eval::{EvalIO, FileType}; @@ -44,7 +44,7 @@ where self.actual.as_ref().path_exists(path) } - fn read_to_end(&self, path: &Path) -> io::Result> { + fn open(&self, path: &Path) -> io::Result> { // Bundled version of corepkgs/fetchurl.nix. The counterpart // of this happens in [crate::configure_nix_path], where the `nix_path` // of the evaluation has `nix=/__corepkgs__` added to it. @@ -52,13 +52,12 @@ where // This workaround is similar to what cppnix does for passing // the path through. // - // TODO: this comparison is bad and allocates, we should use - // the sane path library. 
+ // TODO: this comparison is bad; we should use the sane path library. if path.starts_with("/__corepkgs__/fetchurl.nix") { - return Ok(include_bytes!("fetchurl.nix").to_vec()); + return Ok(Box::new(Cursor::new(include_bytes!("fetchurl.nix")))); } - self.actual.as_ref().read_to_end(path) + self.actual.as_ref().open(path) } fn read_dir(&self, path: &Path) -> io::Result> { diff --git a/tvix/glue/src/tvix_store_io.rs b/tvix/glue/src/tvix_store_io.rs index 7b675bfc7d88..10a59027852f 100644 --- a/tvix/glue/src/tvix_store_io.rs +++ b/tvix/glue/src/tvix_store_io.rs @@ -17,7 +17,7 @@ use std::{ path::{Path, PathBuf}, sync::Arc, }; -use tokio::io::AsyncReadExt; +use tokio_util::io::SyncIoBridge; use tracing::{error, instrument, warn, Level}; use tvix_build::buildservice::BuildService; use tvix_eval::{ErrorKind, EvalIO, FileType, StdIO}; @@ -478,7 +478,7 @@ impl EvalIO for TvixStoreIO { } #[instrument(skip(self), err)] - fn read_to_end(&self, path: &Path) -> io::Result> { + fn open(&self, path: &Path) -> io::Result> { if let Ok((store_path, sub_path)) = StorePath::from_absolute_path_full(&path.to_string_lossy()) { @@ -509,27 +509,24 @@ impl EvalIO for TvixStoreIO { })?; self.tokio_handle.block_on(async { - let mut reader = { - let resp = self.blob_service.as_ref().open_read(&digest).await?; - match resp { - Some(blob_reader) => blob_reader, - None => { - error!( - blob.digest = %digest, - "blob not found", - ); - Err(io::Error::new( - io::ErrorKind::NotFound, - format!("blob {} not found", &digest), - ))? - } + let resp = self.blob_service.as_ref().open_read(&digest).await?; + match resp { + Some(blob_reader) => { + // The VM Response needs a sync [std::io::Read]. 
+ Ok(Box::new(SyncIoBridge::new(blob_reader)) + as Box) } - }; - - let mut buf = Vec::new(); - - reader.read_to_end(&mut buf).await?; - Ok(buf) + None => { + error!( + blob.digest = %digest, + "blob not found", + ); + Err(io::Error::new( + io::ErrorKind::NotFound, + format!("blob {} not found", &digest), + )) + } + } }) } Node::Symlink(_symlink_node) => Err(io::Error::new( @@ -540,11 +537,11 @@ impl EvalIO for TvixStoreIO { } else { // As tvix-store doesn't manage /nix/store on the filesystem, // we still need to also ask self.std_io here. - self.std_io.read_to_end(path) + self.std_io.open(path) } } else { // The store path is no store path, so do regular StdIO. - self.std_io.read_to_end(path) + self.std_io.open(path) } } -- cgit 1.4.1