diff options
author | Florian Klink <flokli@flokli.de> | 2024-10-15T13·11+0300 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2024-10-15T23·26+0000 |
commit | ca1e628c8573fe4f054af510918b81a378d7eb9a (patch) | |
tree | 74ffa5ada0387ddaff4b5cfb88dfbfcea485cacb /tvix | |
parent | baebe29bab5d02607b7811dbc2542f708aee2665 (diff) |
refactor(tvix/glue/builtins/import): refactor r/8811
This removes all the intermediate helper functions and reorganizes the import code to only do the calculations where/when needed, and hopefully makes things easier to understand as well. Change-Id: I7e4c89c742bf8569b45e303523f7f801da7127ea Reviewed-on: https://cl.tvl.fyi/c/depot/+/12627 Autosubmit: flokli <flokli@flokli.de> Tested-by: BuildkiteCI Reviewed-by: Jörg Thalheim <joerg@thalheim.io> Reviewed-by: edef <edef@edef.eu>
Diffstat (limited to 'tvix')
-rw-r--r-- | tvix/docs/src/TODO.md | 14 | ||||
-rw-r--r-- | tvix/glue/src/builtins/errors.rs | 4 | ||||
-rw-r--r-- | tvix/glue/src/builtins/import.rs | 276 | ||||
-rw-r--r-- | tvix/glue/src/tvix_store_io.rs | 56 | ||||
-rw-r--r-- | tvix/store/src/bin/tvix-store.rs | 4 | ||||
-rw-r--r-- | tvix/store/src/import.rs | 81 |
6 files changed, 204 insertions, 231 deletions
diff --git a/tvix/docs/src/TODO.md b/tvix/docs/src/TODO.md index 3d2e083ef9f9..e3f867e11ac2 100644 --- a/tvix/docs/src/TODO.md +++ b/tvix/docs/src/TODO.md @@ -77,20 +77,6 @@ correctness: "some amount of outgoing bytes" in memory. This is somewhat blocked until the {Chunk/Blob}Service split is done, as then prefetching would only be a matter of adding it into the one `BlobReader`. - - The import builtins (`builtins.path` and `builtins.filterSource`) use(d) some - helper functions in TvixStoreIO that deals with constructing the `PathInfo` - structs and inserting them, but some of the abstractions where probably done - at the wrong layer: - - Some ways of importing calculate the NAR representation twice - - The code isn't very usable from other consumers that also create structs - `PathInfo`. - - `node_to_path_info` is ony called by `register_in_path_info_service` (due - to this marked as private for now). - Instead of fighting these abstractions, maybe it's best to explicitly add the - code back to the two builtins, remove redundant calls and calculations. A - later phase could then see how/if some of this can be reasonably factored out in - a way it's also usable for other places having a node and wanting to persist - it (if at all). ### Error cleanup - Currently, all services use tvix_castore::Error, which only has two kinds diff --git a/tvix/glue/src/builtins/errors.rs b/tvix/glue/src/builtins/errors.rs index d41c7d8e4998..ec85942bb1ee 100644 --- a/tvix/glue/src/builtins/errors.rs +++ b/tvix/glue/src/builtins/errors.rs @@ -64,10 +64,10 @@ pub enum FetcherError { #[derive(Debug, Error)] pub enum ImportError { #[error("non-file '{0}' cannot be imported in 'flat' mode")] - FlatImportOfNonFile(String), + FlatImportOfNonFile(PathBuf), #[error("hash mismatch at ingestion of '{0}', expected: '{1}', got: '{2}'")] - HashMismatch(String, NixHash, NixHash), + HashMismatch(PathBuf, NixHash, NixHash), #[error("path '{}' is not absolute or invalid", .0.display())] PathNotAbsoluteOrInvalid(PathBuf), diff --git a/tvix/glue/src/builtins/import.rs b/tvix/glue/src/builtins/import.rs index d2e2e6c5bd7d..db60f44de183 100644 --- a/tvix/glue/src/builtins/import.rs +++ b/tvix/glue/src/builtins/import.rs @@ -112,7 +112,7 @@ mod import_builtins { use crate::tvix_store_io::TvixStoreIO; use bstr::ByteSlice; use nix_compat::nixhash::{CAHash, NixHash}; - use nix_compat::store_path::StorePathRef; + use nix_compat::store_path::{build_ca_path, StorePathRef}; use sha2::Digest; use std::rc::Rc; use tokio::io::AsyncWriteExt; @@ -120,6 +120,7 @@ mod import_builtins { use tvix_eval::generators::Gen; use tvix_eval::{generators::GenCo, ErrorKind, Value}; use tvix_eval::{FileType, NixContextElement, NixString}; + use tvix_store::path_info::PathInfo; #[builtin("path")] async fn builtin_path( @@ -147,124 +148,128 @@ mod import_builtins { .expect("Failed to derive the default name out of the path") .to_string() }; + let filter = args.select("filter"); + + // Construct a sha256 hasher, which is needed for flat ingestion. let recursive_ingestion = args .select("recursive") .map(|r| r.as_bool()) .transpose()? .unwrap_or(true); // Yes, yes, Nix, by default, puts `recursive = true;`. + let expected_sha256 = args .select("sha256") .map(|h| { h.to_str().and_then(|expected| { - let expected = expected.into_bstring().to_string(); - // TODO: ensure that we fail if this is not a valid str. - nix_compat::nixhash::from_str(&expected, None).map_err(|_err| { - // TODO: a better error would be nice, we use - // DerivationError::InvalidOutputHash usually for derivation construction. - // This is not a derivation construction, should we move it outside and - // generalize? - ErrorKind::TypeError { - expected: "sha256", - actual: "not a sha256", + match nix_compat::nixhash::from_str( + expected.into_bstring().to_str()?, + Some("sha256"), + ) { + Ok(NixHash::Sha256(digest)) => Ok(digest), + Ok(_) => unreachable!(), + Err(_e) => { + // TODO: a better error would be nice, we use + // DerivationError::InvalidOutputHash usually for derivation construction. + // This is not a derivation construction, should we move it outside and + // generalize? + Err(ErrorKind::TypeError { + expected: "sha256", + actual: "not a sha256", + }) } - }) + } }) }) .transpose()?; - // Check if the path points to a regular file. - // If it does, the filter function is never executed. - // TODO: follow symlinks and check their type instead - let (root_node, ca_hash) = match state.file_type(path.as_ref())? { + // As a first step, we ingest the contents, and get back a root node, + // and optionally the sha256 a flat file. + let (root_node, ca) = match state.file_type(path.as_ref())? { + // Check if the path points to a regular file. + // If it does, the filter function is never executed, and we copy to the blobservice directly. + // If recursive is false, we need to calculate the sha256 digest of the raw contents, + // as that affects the output path calculation. FileType::Regular => { let mut file = state.open(path.as_ref())?; - // This is a single file, copy it to the blobservice directly. - let mut hash = sha2::Sha256::new(); + + let mut flat_sha256 = (!recursive_ingestion).then(sha2::Sha256::new); let mut blob_size = 0; + let mut blob_writer = state .tokio_handle .block_on(async { state.blob_service.open_write().await }); - let mut buf = [0u8; 4096]; + // read piece by piece and write to blob_writer. + // This is a bit manual due to EvalIO being sync, while everything else async. + { + let mut buf = [0u8; 4096]; - loop { - // read bytes into buffer, break out if EOF - let len = file.read(&mut buf)?; - if len == 0 { - break; - } - blob_size += len as u64; + loop { + // read bytes into buffer, break out if EOF + let len = file.read(&mut buf)?; + if len == 0 { + break; + } + blob_size += len as u64; - let data = &buf[0..len]; + let data = &buf[0..len]; - // add to blobwriter - state - .tokio_handle - .block_on(async { blob_writer.write_all(data).await })?; + // add to blobwriter + state + .tokio_handle + .block_on(async { blob_writer.write_all(data).await })?; - // update the sha256 hash function. We can skip that if we're not using it. - if !recursive_ingestion { - hash.update(data); + // update blob_sha256 if needed. + if let Some(h) = flat_sha256.as_mut() { + h.update(data) + } } } - // close the blob writer, get back the b3 digest. - let blob_digest = state - .tokio_handle - .block_on(async { blob_writer.close().await })?; - - let root_node = Node::File { - digest: blob_digest, - size: blob_size, - executable: false, - }; - - let ca_hash = if recursive_ingestion { - let (_nar_size, nar_sha256) = state - .tokio_handle - .block_on(async { - state - .nar_calculation_service - .as_ref() - .calculate_nar(&root_node) - .await - }) - .map_err(|e| tvix_eval::ErrorKind::TvixError(Rc::new(e)))?; - CAHash::Nar(NixHash::Sha256(nar_sha256)) - } else { - CAHash::Flat(NixHash::Sha256(hash.finalize().into())) - }; - - (root_node, ca_hash) + // close the blob writer, construct the root node and the blob_sha256 (later used for output path calculation) + ( + Node::File { + digest: state + .tokio_handle + .block_on(async { blob_writer.close().await })?, + size: blob_size, + executable: false, + }, + { + // If non-recursive ingestion is requested… + if let Some(flat_sha256) = flat_sha256 { + let actual_sha256 = flat_sha256.finalize().into(); + + // compare the recorded flat hash with an upfront one if provided. + if let Some(expected_sha256) = expected_sha256 { + if actual_sha256 != expected_sha256 { + return Err(ImportError::HashMismatch( + path, + NixHash::Sha256(expected_sha256), + NixHash::Sha256(actual_sha256), + ) + .into()); + } + } + + Some(CAHash::Flat(NixHash::Sha256(actual_sha256))) + } else { + None + } + }, + ) } - FileType::Directory => { - if !recursive_ingestion { - return Err(ImportError::FlatImportOfNonFile( - path.to_string_lossy().to_string(), - ))?; - } - - // do the filtered ingest - let root_node = filtered_ingest(state.clone(), co, path.as_ref(), filter).await?; - - // calculate the NAR sha256 - let (_nar_size, nar_sha256) = state - .tokio_handle - .block_on(async { - state - .nar_calculation_service - .as_ref() - .calculate_nar(&root_node) - .await - }) - .map_err(|e| tvix_eval::ErrorKind::TvixError(Rc::new(e)))?; - - let ca_hash = CAHash::Nar(NixHash::Sha256(nar_sha256)); - - (root_node, ca_hash) + FileType::Directory if !recursive_ingestion => { + return Err(ImportError::FlatImportOfNonFile(path))? } + + // do the filtered ingest + FileType::Directory => ( + filtered_ingest(state.clone(), co, path.as_ref(), filter).await?, + None, + ), FileType::Symlink => { // FUTUREWORK: Nix follows a symlink if it's at the root, // except if it's not resolve-able (NixOS/nix#7761).i @@ -287,26 +292,63 @@ mod import_builtins { } }; - if let Some(expected_sha256) = expected_sha256 { - if *ca_hash.hash() != expected_sha256 { - Err(ImportError::HashMismatch( - path.to_string_lossy().to_string(), - expected_sha256, - ca_hash.hash().into_owned(), - ))?; + // Calculate the NAR sha256. + let (nar_size, nar_sha256) = state + .tokio_handle + .block_on(async { + state + .nar_calculation_service + .as_ref() + .calculate_nar(&root_node) + .await + }) + .map_err(|e| tvix_eval::ErrorKind::TvixError(Rc::new(e)))?; + + // Calculate the CA hash for the recursive cases, this is only already + // `Some(_)` for flat ingestion. + let ca = match ca { + None => { + // If an upfront-expected NAR hash was specified, compare. + if let Some(expected_nar_sha256) = expected_sha256 { + if expected_nar_sha256 != nar_sha256 { + return Err(ImportError::HashMismatch( + path, + NixHash::Sha256(expected_nar_sha256), + NixHash::Sha256(nar_sha256), + ) + .into()); + } + } + CAHash::Nar(NixHash::Sha256(nar_sha256)) } - } + Some(ca) => ca, + }; + + let store_path = build_ca_path(&name, &ca, Vec::<&str>::new(), false) + .map_err(|e| tvix_eval::ErrorKind::TvixError(Rc::new(e)))?; let path_info = state .tokio_handle .block_on(async { state - .register_in_path_info_service(name.as_ref(), path.as_ref(), ca_hash, root_node) + .path_info_service + .as_ref() + .put(PathInfo { + store_path, + node: root_node, + // There's no reference scanning on path contents ingested like this. + references: vec![], + nar_size, + nar_sha256, + signatures: vec![], + deriver: None, + ca: Some(ca), + }) .await }) .map_err(|e| tvix_eval::ErrorKind::IO { path: Some(path.to_path_buf()), - error: Rc::new(e), + error: Rc::new(e.into()), })?; // We need to attach context to the final output path. @@ -332,24 +374,46 @@ mod import_builtins { let path_info = state .tokio_handle .block_on(async { - let (_, nar_sha256) = state + // Ask the PathInfoService for the NAR size and sha256 + // We always need it no matter what is the actual hash mode + // because the [PathInfo] needs to contain nar_{sha256,size}. + let (nar_size, nar_sha256) = state .nar_calculation_service .as_ref() .calculate_nar(&root_node) .await?; + let ca = CAHash::Nar(NixHash::Sha256(nar_sha256)); + + // Calculate the output path. This might still fail, as some names are illegal. + let output_path = + nix_compat::store_path::build_ca_path(name, &ca, Vec::<&str>::new(), false) + .map_err(|_| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid name: {}", name), + ) + })?; + state - .register_in_path_info_service( - name, - &p, - CAHash::Nar(NixHash::Sha256(nar_sha256)), - root_node, - ) + .path_info_service + .as_ref() + .put(PathInfo { + store_path: output_path, + node: root_node, + // There's no reference scanning on path contents ingested like this. + references: vec![], + nar_size, + nar_sha256, + signatures: vec![], + deriver: None, + ca: Some(ca), + }) .await }) - .map_err(|err| ErrorKind::IO { + .map_err(|e| ErrorKind::IO { path: Some(p.to_path_buf()), - error: err.into(), + error: Rc::new(e.into()), })?; // We need to attach context to the final output path. diff --git a/tvix/glue/src/tvix_store_io.rs b/tvix/glue/src/tvix_store_io.rs index b5877566d524..839d2ae85845 100644 --- a/tvix/glue/src/tvix_store_io.rs +++ b/tvix/glue/src/tvix_store_io.rs @@ -384,56 +384,6 @@ impl TvixStoreIO { .await .map_err(|e| std::io::Error::new(io::ErrorKind::Other, e)) } - - async fn node_to_path_info<'a>( - &self, - name: &'a str, - path: &Path, - ca: CAHash, - root_node: Node, - ) -> io::Result<PathInfo> { - // Ask the PathInfoService for the NAR size and sha256 - // We always need it no matter what is the actual hash mode - // because the [PathInfo] needs to contain nar_{sha256,size}. - let (nar_size, nar_sha256) = self - .nar_calculation_service - .as_ref() - .calculate_nar(&root_node) - .await?; - - // Calculate the output path. This might still fail, as some names are illegal. - let output_path = - nix_compat::store_path::build_ca_path(name, &ca, Vec::<&str>::new(), false).map_err( - |_| { - std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("invalid name: {}", name), - ) - }, - )?; - - tvix_store::import::log_node(name.as_bytes(), &root_node, path); - - // construct a PathInfo - Ok(tvix_store::import::derive_nar_ca_path_info( - nar_size, - nar_sha256, - Some(ca), - output_path, - root_node, - )) - } - - pub(crate) async fn register_in_path_info_service<'a>( - &self, - name: &'a str, - path: &Path, - ca: CAHash, - root_node: Node, - ) -> io::Result<PathInfo> { - let path_info = self.node_to_path_info(name, path, ca, root_node).await?; - Ok(self.path_info_service.as_ref().put(path_info).await?) - } } impl EvalIO for TvixStoreIO { @@ -589,7 +539,7 @@ impl EvalIO for TvixStoreIO { #[instrument(skip(self), ret(level = Level::TRACE), err)] fn import_path(&self, path: &Path) -> io::Result<PathBuf> { - let output_path = self.tokio_handle.block_on(async { + let path_info = self.tokio_handle.block_on({ tvix_store::import::import_path_as_nar_ca( path, tvix_store::import::path_to_name(path)?, @@ -598,10 +548,10 @@ impl EvalIO for TvixStoreIO { &self.path_info_service, &self.nar_calculation_service, ) - .await })?; - Ok(output_path.to_absolute_path().into()) + // From the returned PathInfo, extract the store path and return it. + Ok(path_info.store_path.to_absolute_path().into()) } #[instrument(skip(self), ret(level = Level::TRACE))] diff --git a/tvix/store/src/bin/tvix-store.rs b/tvix/store/src/bin/tvix-store.rs index f39b73df2e0b..6cc7c39ab74e 100644 --- a/tvix/store/src/bin/tvix-store.rs +++ b/tvix/store/src/bin/tvix-store.rs @@ -262,9 +262,9 @@ async fn run_cli(cli: Cli) -> Result<(), Box<dyn std::error::Error + Send + Sync nar_calculation_service, ) .await; - if let Ok(output_path) = resp { + if let Ok(path_info) = resp { // If the import was successful, print the path to stdout. - println!("{}", output_path.to_absolute_path()); + println!("{}", path_info.store_path.to_absolute_path()); } } } diff --git a/tvix/store/src/import.rs b/tvix/store/src/import.rs index 1e9fd060b492..8101b6635a89 100644 --- a/tvix/store/src/import.rs +++ b/tvix/store/src/import.rs @@ -7,7 +7,7 @@ use tvix_castore::{ use nix_compat::{ nixhash::{CAHash, NixHash}, - store_path::{self, StorePath, StorePathRef}, + store_path::{self, StorePath}, }; use crate::{ @@ -70,38 +70,10 @@ pub fn path_to_name(path: &Path) -> std::io::Result<&str> { }) } -/// Takes the NAR size, SHA-256 of the NAR representation, the root node and optionally -/// a CA hash information. -/// -/// Constructs a [PathInfo] for a NAR-style object. -/// -/// The user can then further fill the fields (like deriver, signatures), and/or -/// verify to have the expected hashes. -#[inline] -pub fn derive_nar_ca_path_info( - nar_size: u64, - nar_sha256: [u8; 32], - ca: Option<CAHash>, - store_path: StorePath<String>, - root_node: Node, -) -> PathInfo { - // assemble the [crate::proto::PathInfo] object. - PathInfo { - store_path, - node: root_node, - // There's no reference scanning on path contents ingested like this. - references: vec![], - nar_size, - nar_sha256, - signatures: vec![], - deriver: None, - ca, - } -} - /// Ingest the contents at the given path `path` into castore, and registers the /// resulting root node in the passed PathInfoService, using the "NAR sha256 /// digest" and the passed name for output path calculation. +/// Inserts the PathInfo into the PathInfoService and returns it back to the caller. #[instrument(skip_all, fields(store_name=name, path=?path), err)] pub async fn import_path_as_nar_ca<BS, DS, PS, NS, P>( path: P, @@ -110,7 +82,7 @@ pub async fn import_path_as_nar_ca<BS, DS, PS, NS, P>( directory_service: DS, path_info_service: PS, nar_calculation_service: NS, -) -> Result<StorePathRef, std::io::Error> +) -> Result<PathInfo, std::io::Error> where P: AsRef<Path> + std::fmt::Debug, BS: BlobService + Clone, @@ -118,6 +90,7 @@ where PS: AsRef<dyn PathInfoService>, NS: NarCalculationService, { + // Ingest the contents at the given path `path` into castore. let root_node = ingest_path::<_, _, _, &[u8]>(blob_service, directory_service, path.as_ref(), None) .await @@ -129,29 +102,29 @@ where // Calculate the output path. This might still fail, as some names are illegal. // FUTUREWORK: express the `name` at the type level to be valid and move the conversion // at the caller level. - let output_path = store_path::build_nar_based_store_path(&nar_sha256, name).map_err(|_| { - std::io::Error::new( - std::io::ErrorKind::InvalidData, - format!("invalid name: {}", name), - ) - })?; - - log_node(name.as_ref(), &root_node, path.as_ref()); - - let path_info = derive_nar_ca_path_info( - nar_size, - nar_sha256, - Some(CAHash::Nar(NixHash::Sha256(nar_sha256))), - output_path.to_owned(), - root_node, - ); - - // This new [`PathInfo`] that we get back from there might contain additional signatures or - // information set by the service itself. In this function, we silently swallow it because - // callers don't really need it. - let _path_info = path_info_service.as_ref().put(path_info).await?; - - Ok(output_path) + let output_path: StorePath<String> = store_path::build_nar_based_store_path(&nar_sha256, name) + .map_err(|_| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid name: {}", name), + ) + })?; + + // Insert a PathInfo. On success, return it back to the caller. + Ok(path_info_service + .as_ref() + .put(PathInfo { + store_path: output_path.to_owned(), + node: root_node, + // There's no reference scanning on imported paths + references: vec![], + nar_size, + nar_sha256, + signatures: vec![], + deriver: None, + ca: Some(CAHash::Nar(NixHash::Sha256(nar_sha256))), + }) + .await?) } #[cfg(test)] |