From ee23220564987771c8e7909ded6fb9853f1d1b0d Mon Sep 17 00:00:00 2001 From: Florian Klink Date: Thu, 16 Mar 2023 00:01:30 +0100 Subject: refactor(tvix/store/directorysvc): use [u8; 32] instead of Vec Also, simplify the trait interface, only allowing lookups of Directory objects by their digest. Change-Id: I6eec28a8cb0557bed9b69df8b8ff99a5e0f8fe35 Reviewed-on: https://cl.tvl.fyi/c/depot/+/8313 Tested-by: BuildkiteCI Autosubmit: flokli Reviewed-by: tazjin --- .../src/proto/grpc_directoryservice_wrapper.rs | 68 +++++++++++++--------- tvix/store/src/proto/mod.rs | 4 +- tvix/store/src/proto/tests/directory.rs | 4 +- .../store/src/proto/tests/grpc_directoryservice.rs | 20 ++++--- 4 files changed, 57 insertions(+), 39 deletions(-) (limited to 'tvix/store/src/proto') diff --git a/tvix/store/src/proto/grpc_directoryservice_wrapper.rs b/tvix/store/src/proto/grpc_directoryservice_wrapper.rs index 049b1571c23f..e73dfdab1866 100644 --- a/tvix/store/src/proto/grpc_directoryservice_wrapper.rs +++ b/tvix/store/src/proto/grpc_directoryservice_wrapper.rs @@ -34,31 +34,33 @@ impl let req_inner = request.into_inner(); - let client = self.directory_service.clone(); + let directory_service = self.directory_service.clone(); // kick off an async thread task::spawn(async move { // Keep the list of directory digests to traverse. // As per rpc_directory.proto, we traverse in BFS order. - let mut deq: VecDeque> = VecDeque::new(); + let mut deq: VecDeque<[u8; 32]> = VecDeque::new(); // look at the digest in the request and put it in the top of the queue. match &req_inner.by_what { None => return Err(Status::invalid_argument("by_what needs to be specified")), Some(proto::get_directory_request::ByWhat::Digest(digest)) => { - if digest.len() != 32 { - return Err(Status::invalid_argument("invalid digest length")); - } - deq.push_back(digest.clone()); + deq.push_back( + digest + .as_slice() + .try_into() + .map_err(|_e| Status::invalid_argument("invalid digest length"))?, + ); } } // keep a list of all the Directory messages already sent, so we can omit sending the same. - let mut sent_directory_dgsts: HashSet> = HashSet::new(); + let mut sent_directory_dgsts: HashSet<[u8; 32]> = HashSet::new(); // look up the directory at the top of the queue - while let Some(ref digest) = deq.pop_front() { - let digest_b64: String = BASE64.encode(digest); + while let Some(digest) = deq.pop_front() { + let digest_b64: String = BASE64.encode(&digest); // add digest we're currently processing to a span, but pay attention to // https://docs.rs/tracing/0.1.37/tracing/span/struct.Span.html#in-asynchronous-code @@ -69,9 +71,7 @@ impl let _enter = span.enter(); // invoke client.get, and map to a Result - match client.get(&proto::get_directory_request::ByWhat::Digest( - digest.to_vec(), - )) { + match directory_service.get(&digest) { // The directory was not found, abort Ok(None) => { if !sent_directory_dgsts.is_empty() { @@ -94,10 +94,19 @@ impl // Same applies to when it already is in the queue. if req_inner.recursive { for child_directory_node in &directory.directories { - if !sent_directory_dgsts.contains(&child_directory_node.digest) - && !deq.contains(&child_directory_node.digest) + let child_directory_node_digest: [u8; 32] = + child_directory_node.digest.clone().try_into().map_err( + |_e| { + Status::internal( + "invalid child directory digest len", + ) + }, + )?; + + if !sent_directory_dgsts.contains(&child_directory_node_digest) + && !deq.contains(&child_directory_node_digest) { - deq.push_back(child_directory_node.digest.clone()); + deq.push_back(child_directory_node_digest); } } } @@ -106,7 +115,7 @@ impl // Strictly speaking, it wasn't sent yet, but tx.send happens right after, // and the only way we can still fail is by the remote side to hang up, // in which case we stop anyways. - sent_directory_dgsts.insert(digest.to_vec()); + sent_directory_dgsts.insert(digest); Ok(directory) } @@ -143,8 +152,8 @@ impl // This keeps track of the seen directory keys, and their size. // This is used to validate the size field of a reference to a previously sent directory. // We don't need to keep the contents around, they're stored in the DB. - let mut seen_directories_sizes: HashMap, u32> = HashMap::new(); - let mut last_directory_dgst: Option> = None; + let mut seen_directories_sizes: HashMap<[u8; 32], u32> = HashMap::new(); + let mut last_directory_dgst: Option<[u8; 32]> = None; // Consume directories, and insert them into the store. // Reject directory messages that refer to Directories not sent in the same stream. @@ -162,12 +171,18 @@ impl // to ensure it has been seen already in this stream, and that the size // matches what we recorded. for child_directory in &directory.directories { - match seen_directories_sizes.get(&child_directory.digest) { + let child_directory_digest: [u8; 32] = child_directory + .digest + .clone() + .try_into() + .map_err(|_e| Status::internal("invalid child directory digest len"))?; + + match seen_directories_sizes.get(&child_directory_digest) { None => { return Err(Status::invalid_argument(format!( "child directory '{}' ({}) in directory '{}' not seen yet", child_directory.name, - BASE64.encode(&child_directory.digest), + BASE64.encode(&child_directory_digest), BASE64.encode(&directory.digest()), ))); } @@ -176,7 +191,7 @@ impl return Err(Status::invalid_argument(format!( "child directory '{}' ({}) in directory '{}' referred with wrong size, expected {}, actual {}", child_directory.name, - BASE64.encode(&child_directory.digest), + BASE64.encode(&child_directory_digest), BASE64.encode(&directory.digest()), seen_child_directory_size, child_directory.size, @@ -197,15 +212,12 @@ impl // does a reachability check. let dgst = directory.digest(); - seen_directories_sizes.insert(dgst.clone(), directory.size()); - last_directory_dgst = Some(dgst.clone()); + seen_directories_sizes.insert(dgst, directory.size()); + last_directory_dgst = Some(dgst); // check if the directory already exists in the database. We can skip // inserting if it's already there, as that'd be a no-op. - match self - .directory_service - .get(&proto::get_directory_request::ByWhat::Digest(dgst.to_vec())) - { + match self.directory_service.get(&dgst) { Err(e) => { warn!("error checking if directory already exists: {}", e); return Err(e.into()); @@ -224,7 +236,7 @@ impl match last_directory_dgst { None => Err(Status::invalid_argument("no directories received")), Some(last_directory_dgst) => Ok(Response::new(proto::PutDirectoryResponse { - root_digest: last_directory_dgst, + root_digest: last_directory_dgst.to_vec(), })), } } diff --git a/tvix/store/src/proto/mod.rs b/tvix/store/src/proto/mod.rs index fceb6b5902df..f5945d650ad8 100644 --- a/tvix/store/src/proto/mod.rs +++ b/tvix/store/src/proto/mod.rs @@ -236,10 +236,10 @@ impl Directory { /// Calculates the digest of a Directory, which is the blake3 hash of a /// Directory protobuf message, serialized in protobuf canonical form. - pub fn digest(&self) -> Vec { + pub fn digest(&self) -> [u8; 32] { let mut hasher = blake3::Hasher::new(); - hasher.update(&self.encode_to_vec()).finalize().as_bytes()[..].to_vec() + *hasher.update(&self.encode_to_vec()).finalize().as_bytes() } /// validate checks the directory for invalid data, such as: diff --git a/tvix/store/src/proto/tests/directory.rs b/tvix/store/src/proto/tests/directory.rs index 890cb2164ad5..03572629e606 100644 --- a/tvix/store/src/proto/tests/directory.rs +++ b/tvix/store/src/proto/tests/directory.rs @@ -2,7 +2,7 @@ use crate::proto::{Directory, DirectoryNode, FileNode, SymlinkNode, ValidateDire use lazy_static::lazy_static; lazy_static! { - static ref DUMMY_DIGEST: Vec = vec![ + static ref DUMMY_DIGEST: [u8; 32] = [ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, @@ -66,7 +66,7 @@ fn digest() { assert_eq!( d.digest(), - vec![ + [ 0xaf, 0x13, 0x49, 0xb9, 0xf5, 0xf9, 0xa1, 0xa6, 0xa0, 0x40, 0x4d, 0xea, 0x36, 0xdc, 0xc9, 0x49, 0x9b, 0xcb, 0x25, 0xc9, 0xad, 0xc1, 0x12, 0xb7, 0xcc, 0x9a, 0x93, 0xca, 0xe4, 0x1f, 0x32, 0x62 diff --git a/tvix/store/src/proto/tests/grpc_directoryservice.rs b/tvix/store/src/proto/tests/grpc_directoryservice.rs index 464c25f33d75..37428fbfa339 100644 --- a/tvix/store/src/proto/tests/grpc_directoryservice.rs +++ b/tvix/store/src/proto/tests/grpc_directoryservice.rs @@ -44,7 +44,7 @@ async fn not_found() { let resp = service .get(tonic::Request::new(GetDirectoryRequest { - by_what: Some(ByWhat::Digest(DIRECTORY_A.digest())), + by_what: Some(ByWhat::Digest(DIRECTORY_A.digest().to_vec())), ..Default::default() })) .await; @@ -114,14 +114,17 @@ async fn put_get_multiple() { .await .expect("must succeed"); - assert_eq!(DIRECTORY_B.digest(), put_resp.into_inner().root_digest); + assert_eq!( + DIRECTORY_B.digest().to_vec(), + put_resp.into_inner().root_digest + ); // now, request b, first in non-recursive mode. let items = get_directories( &service, GetDirectoryRequest { recursive: false, - by_what: Some(ByWhat::Digest(DIRECTORY_B.digest())), + by_what: Some(ByWhat::Digest(DIRECTORY_B.digest().to_vec())), }, ) .await @@ -135,7 +138,7 @@ async fn put_get_multiple() { &service, GetDirectoryRequest { recursive: true, - by_what: Some(ByWhat::Digest(DIRECTORY_B.digest())), + by_what: Some(ByWhat::Digest(DIRECTORY_B.digest().to_vec())), }, ) .await @@ -161,14 +164,17 @@ async fn put_get_dedup() { .await .expect("must succeed"); - assert_eq!(DIRECTORY_C.digest(), put_resp.into_inner().root_digest); + assert_eq!( + DIRECTORY_C.digest().to_vec(), + put_resp.into_inner().root_digest + ); // Ask for "C" recursively. We expect to only get "A" once, as there's no point sending it twice. let items = get_directories( &service, GetDirectoryRequest { recursive: true, - by_what: Some(ByWhat::Digest(DIRECTORY_C.digest())), + by_what: Some(ByWhat::Digest(DIRECTORY_C.digest().to_vec())), }, ) .await @@ -211,7 +217,7 @@ async fn put_reject_wrong_size() { let broken_parent_directory = Directory { directories: vec![DirectoryNode { name: "foo".to_string(), - digest: DIRECTORY_A.digest(), + digest: DIRECTORY_A.digest().to_vec(), size: 42, }], ..Default::default() -- cgit 1.4.1