diff options
Diffstat (limited to 'tvix/castore')
63 files changed, 12016 insertions, 0 deletions
diff --git a/tvix/castore/Cargo.toml b/tvix/castore/Cargo.toml new file mode 100644 index 0000000000..40d4f24d9f --- /dev/null +++ b/tvix/castore/Cargo.toml @@ -0,0 +1,124 @@ +[package] +name = "tvix-castore" +version = "0.1.0" +edition = "2021" + +[dependencies] +async-compression = { version = "0.4.9", features = ["tokio", "zstd"]} +async-stream = "0.3.5" +async-tempfile = "0.4.0" +blake3 = { version = "1.3.1", features = ["rayon", "std", "traits-preview"] } +bstr = "1.6.0" +bytes = "1.4.0" +data-encoding = "2.3.3" +digest = "0.10.7" +fastcdc = { version = "3.1.0", features = ["tokio"] } +futures = "0.3.30" +indicatif = "0.17.8" +lazy_static = "1.4.0" +object_store = { version = "0.9.1", features = ["http"] } +parking_lot = "0.12.1" +pin-project-lite = "0.2.13" +prost = "0.12.1" +sled = { version = "0.34.7" } +thiserror = "1.0.38" +tokio-stream = { version = "0.1.14", features = ["fs", "net"] } +tokio-util = { version = "0.7.9", features = ["io", "io-util", "codec"] } +tokio-tar = "0.3.1" +tokio = { version = "1.32.0", features = ["fs", "macros", "net", "rt", "rt-multi-thread", "signal"] } +tonic = "0.11.0" +tower = "0.4.13" +tracing = "0.1.37" +tracing-indicatif = "0.3.6" +url = "2.4.0" +walkdir = "2.4.0" +zstd = "0.13.0" +serde = { version = "1.0.197", features = [ "derive" ] } +serde_with = "3.7.0" +serde_qs = "0.12.0" +petgraph = "0.6.4" + +[dependencies.bigtable_rs] +optional = true +# https://github.com/liufuyang/bigtable_rs/pull/72 +git = "https://github.com/flokli/bigtable_rs" +rev = "0af404741dfc40eb9fa99cf4d4140a09c5c20df7" + +[dependencies.fuse-backend-rs] +optional = true +version = "0.11.0" + +[dependencies.libc] +optional = true +version = "0.2.144" + +[dependencies.tonic-reflection] +optional = true +version = "0.11.0" + +[dependencies.vhost] +optional = true +version = "0.6" + +[dependencies.vhost-user-backend] +optional = true +version = "0.8" + +[dependencies.virtio-queue] +optional = true +version = "0.7" + +[dependencies.vm-memory] +optional = 
true +version = "0.10" + +[dependencies.vmm-sys-util] +optional = true +version = "0.11" + +[dependencies.virtio-bindings] +optional = true +version = "0.2.1" + +[build-dependencies] +prost-build = "0.12.1" +tonic-build = "0.11.0" + +[dev-dependencies] +async-process = "2.1.0" +rstest = "0.19.0" +tempfile = "3.3.0" +tokio-retry = "0.3.0" +hex-literal = "0.4.1" +rstest_reuse = "0.6.0" +xattr = "1.3.1" + +[features] +default = ["cloud"] +cloud = [ + "dep:bigtable_rs", + "object_store/aws", + "object_store/azure", + "object_store/gcp", +] +fs = ["dep:libc", "dep:fuse-backend-rs"] +virtiofs = [ + "fs", + "dep:vhost", + "dep:vhost-user-backend", + "dep:virtio-queue", + "dep:vm-memory", + "dep:vmm-sys-util", + "dep:virtio-bindings", + "fuse-backend-rs?/vhost-user-fs", # impl FsCacheReqHandler for SlaveFsCacheReq + "fuse-backend-rs?/virtiofs", +] +fuse = ["fs"] +tonic-reflection = ["dep:tonic-reflection"] +# Whether to run the integration tests. +# Requires the following packages in $PATH: +# cbtemulator, google-cloud-bigtable-tool +integration = [] + +[lints] +workspace = true diff --git a/tvix/castore/build.rs b/tvix/castore/build.rs new file mode 100644 index 0000000000..089c093e71 --- /dev/null +++ b/tvix/castore/build.rs @@ -0,0 +1,39 @@ +use std::io::Result; + +fn main() -> Result<()> { + #[allow(unused_mut)] + let mut builder = tonic_build::configure(); + + #[cfg(feature = "tonic-reflection")] + { + let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap()); + let descriptor_path = out_dir.join("tvix.castore.v1.bin"); + + builder = builder.file_descriptor_set_path(descriptor_path); + }; + + // https://github.com/hyperium/tonic/issues/908 + let mut config = prost_build::Config::new(); + config.bytes(["."]); + config.type_attribute(".", "#[derive(Eq, Hash)]"); + + builder + .build_server(true) + .build_client(true) + .emit_rerun_if_changed(false) + .compile_with_config( + config, + &[ + "tvix/castore/protos/castore.proto", + 
"tvix/castore/protos/rpc_blobstore.proto", + "tvix/castore/protos/rpc_directory.proto", + ], + // If we are in running `cargo build` manually, using `../..` works fine, + // but in case we run inside a nix build, we need to instead point PROTO_ROOT + // to a sparseTree containing that structure. + &[match std::env::var_os("PROTO_ROOT") { + Some(proto_root) => proto_root.to_str().unwrap().to_owned(), + None => "../..".to_string(), + }], + ) +} diff --git a/tvix/castore/default.nix b/tvix/castore/default.nix new file mode 100644 index 0000000000..03a12b6c20 --- /dev/null +++ b/tvix/castore/default.nix @@ -0,0 +1,28 @@ +{ depot, pkgs, lib, ... }: + +(depot.tvix.crates.workspaceMembers.tvix-castore.build.override { + runTests = true; + testPreRun = '' + export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt; + ''; +}).overrideAttrs (old: rec { + meta.ci.targets = [ "integration-tests" ] ++ lib.filter (x: lib.hasPrefix "with-features" x || x == "no-features") (lib.attrNames passthru); + passthru = (depot.tvix.utils.mkFeaturePowerset { + inherit (old) crateName; + features = ([ "cloud" "fuse" "tonic-reflection" ] + # virtiofs feature currently fails to build on Darwin + ++ lib.optional pkgs.stdenv.isLinux "virtiofs"); + override.testPreRun = '' + export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt + ''; + }) // { + integration-tests = depot.tvix.crates.workspaceMembers.${old.crateName}.build.override (old: { + runTests = true; + testPreRun = '' + export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt; + export PATH="$PATH:${pkgs.lib.makeBinPath [ pkgs.cbtemulator pkgs.google-cloud-bigtable-tool ]}" + ''; + features = old.features ++ [ "integration" ]; + }); + }; +}) diff --git a/tvix/castore/docs/blobstore-chunking.md b/tvix/castore/docs/blobstore-chunking.md new file mode 100644 index 0000000000..49bbe69275 --- /dev/null +++ b/tvix/castore/docs/blobstore-chunking.md @@ -0,0 +1,147 @@ +# BlobStore: Chunking & Verified Streaming + 
+`tvix-castore`'s BlobStore is a content-addressed storage system, using [blake3] +as hash function. + +Returned data is fetched by using the digest as lookup key, and can be verified +to be correct by feeding the received data through the hash function and +ensuring it matches the digest initially used for the lookup. + +This means, data can be downloaded by any untrusted third-party as well, as the +received data is validated to match the digest it was originally requested with. + +However, for larger blobs of data, having to download the entire blob at once is +wasteful, if we only care about a part of the blob. Think about mounting a +seekable data structure, like loop-mounting an .iso file, or doing partial reads +in a large Parquet file, a column-oriented data format. + +> We want to have the possibility to *seek* into a larger file. + +This however shouldn't compromise on data integrity properties - we should not +need to trust a peer we're downloading from to be "honest" about the partial +data we're reading. We should be able to verify smaller reads. + +Especially when substituting from an untrusted third-party, we want to be able +to detect quickly if that third-party is sending us wrong data, and terminate +the connection early. + +## Chunking +In content-addressed systems, this problem has historically been solved by +breaking larger blobs into smaller chunks, which can be fetched individually, +and making a hash of *this listing* the blob digest/identifier. + + - BitTorrent for example breaks files up into smaller chunks, and maintains + a list of sha1 digests for each of these chunks. Magnet links contain a + digest over this listing as an identifier. (See [here for + more details][bittorrent-v2]). + With the identifier, a client can fetch the entire list, and then recursively + "unpack the graph" of nodes, until it ends up with a list of individual small + chunks, which can be fetched individually. 
+ - Similarly, IPFS with its IPLD model builds up a Merkle DAG, and uses the + digest of the root node as an identifier. + +These approaches solve the problem of being able to fetch smaller chunks in a +trusted fashion. They can also do some deduplication, in case the same +leaf nodes appear in multiple places. + +However, they also have a big disadvantage. The chunking parameters, and the +"topology" of the graph structure itself "bleed" into the root hash of the +entire data structure itself. + +Depending on the chunking parameters used, there are different representations for +the same data, causing less data sharing/reuse in the overall system, in terms of how +many chunks need to be downloaded vs. are already available locally, as well as +how compact data is stored on-disk. + +This can be worked around by agreeing on only a single way of chunking, but it's +not pretty and misses a lot of deduplication potential. + +### Chunking in Tvix' Blobstore +tvix-castore's BlobStore uses a hybrid approach to eliminate some of the +disadvantages, while still being content-addressed internally, with the +highlighted benefits. + +It uses [blake3] as hash function, and the blake3 digest of **the raw data +itself** as an identifier (rather than some application-specific Merkle DAG that +also embeds some chunking information). + +BLAKE3 is a tree hash where all left nodes are fully populated, contrary to +conventional serial hash functions. To be able to validate the hash of a node, +one only needs the hash of the (2) children [^1], if any. + +This means one only needs the root digest to validate a construction, and these +constructions can be sent [separately][bao-spec]. + +This relieves us from the need of having to encode more granular chunking into +our data model / identifier upfront, and can make this mostly a transport/ +storage concern. + +For some more description of the (remote) protocol, check +`./blobstore-protocol.md`. + +#### Logical vs. 
physical chunking + +Due to the properties of the BLAKE3 hash function, we have logical blocks of +1KiB, but this doesn't necessarily imply we need to restrict ourselves to these +chunk sizes w.r.t. what "physical chunks" are sent over the wire between peers, +or are stored on-disk. + +The only thing we need to be able to read and verify an arbitrary byte range is +having the covering range of aligned 1K blocks, and a construction from the root +digest to the 1K block. + +Note the intermediate hash tree can be further trimmed, [omitting][bao-tree] +lower parts of the tree while still providing verified streaming - at the cost +of having to fetch larger covering ranges of aligned blocks. + +Let's pick an example. We identify each KiB by a number here for illustrative +purposes. + +Assuming we omit the last two layers of the hash tree, we end up with logical +4KiB leaf chunks (`bao_shift` of `2`). + +For a blob of 14 KiB total size, we could fetch logical blocks `[0..=3]`, +`[4..=7]`, `[8..=11]` and `[12..=13]` in an authenticated fashion: + +`[ 0 1 2 3 ] [ 4 5 6 7 ] [ 8 9 10 11 ] [ 12 13 ]` + +Assuming the server now informs us about the following physical chunking: + +``` +[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ] +``` + +If our application now wants to arbitrarily read from 0 until 4 (inclusive): + +``` +[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ] + |-------------| + +``` + +…we need to fetch physical chunks `[ 0 1 ]`, `[ 2 3 4 5 ]`, `[ 6 ]` and `[ 7 8 ]`. + + +`[ 0 1 ]` and `[ 2 3 4 5 ]` are obvious, they contain the data we're +interested in. 
+ +We however also need to fetch the physical chunks `[ 6 ]` and `[ 7 8 ]`, so we +can assemble `[ 4 5 6 7 ]` to verify both logical chunks: + +``` +[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ] +^ ^ ^ ^ +|----4KiB----|------4KiB-----| +``` + +Each physical chunk fetched can be validated to have the blake3 digest that was +communicated upfront, and can be stored in a client-side cache/storage, so +subsequent / other requests for the same data will be fast(er). + +--- + +[^1]: and the surrounding context, aka position inside the whole blob, which is available while verifying the tree +[bittorrent-v2]: https://blog.libtorrent.org/2020/09/bittorrent-v2/ +[blake3]: https://github.com/BLAKE3-team/BLAKE3 +[bao-spec]: https://github.com/oconnor663/bao/blob/master/docs/spec.md +[bao-tree]: https://github.com/n0-computer/bao-tree diff --git a/tvix/castore/docs/blobstore-protocol.md b/tvix/castore/docs/blobstore-protocol.md new file mode 100644 index 0000000000..048cafc3d8 --- /dev/null +++ b/tvix/castore/docs/blobstore-protocol.md @@ -0,0 +1,104 @@ +# BlobStore: Protocol / Composition + +This document describes the protocol that BlobStore uses to substitute blobs +from other ("remote") BlobStores. + +How to come up with the blake3 digest of the blob to fetch is left to another +layer in the stack. + +To put this into the context of Tvix as a Nix alternative, a blob represents an +individual file inside a StorePath. +In the Tvix Data Model, this is accomplished by having a `FileNode` (either the +`root_node` in a `PathInfo` message, or an individual file inside a `Directory` +message) encode a BLAKE3 digest. + +However, the whole infrastructure can be applied for other usecases requiring +exchange/storage or access into data of which the blake3 digest is known. + +## Protocol and Interfaces +As an RPC protocol, BlobStore currently uses gRPC. 
+ +On the Rust side of things, every blob service implements the +[`BlobService`](../src/blobservice/mod.rs) async trait, which isn't +gRPC-specific. + +This `BlobService` trait provides functionality to check for existence of Blobs, +read from blobs, and write new blobs. +It also provides a method to ask for more granular chunks if they are available. + +In addition to some in-memory, on-disk and (soon) object-storage-based +implementations, we also have a `BlobService` implementation that talks to a +gRPC server, as well as a gRPC server wrapper component, which provides a gRPC +service for anything implementing the `BlobService` trait. + +This makes it very easy to talk to a remote `BlobService`, which does not even +need to be written in the same language, as long as it speaks the same gRPC +protocol. + +It also puts very few requirements on someone implementing a new +`BlobService`, or on what its internal storage or chunking algorithm looks like. + +The gRPC protocol is documented in `../protos/rpc_blobstore.proto`. +Contrary to the `BlobService` trait, it does not have any options for seeking/ +ranging, as it's more desirable to provide this through chunking (see also +`./blobstore-chunking.md`). + +## Composition +Different `BlobStore`s are supposed to be "composed"/"layered" to express +caching, multiple local and remote sources. + +The fronting interface can be the same, it'd just be multiple "tiers" that can +respond to requests, depending on where the data resides. [^1] + +This makes it very simple for consumers, as they don't need to be aware of the +entire substituter config. + +The flexibility of this doesn't need to be exposed to the user in the default +case; in most cases we should be fine with some form of on-disk storage and a +bunch of substituters with different priorities. 
+ +### gRPC Clients +Clients are encouraged to always read blobs in a chunked fashion (asking for a +list of chunks for a blob via `BlobService.Stat()`, then fetching chunks via +`BlobService.Read()` as needed), instead of directly reading the entire blob via +`BlobService.Read()`. + +In a composition setting, this provides opportunity for caching, and avoids +downloading some chunks if they're already present locally (for example, because +they were already downloaded by reading from a similar blob earlier). + +It also removes the need for seeking to be a part of the gRPC protocol +altogether, as chunks are supposed to be "reasonably small" [^2]. + +There's some further optimization potential: a `BlobService.Stat()` request +could tell the server it's happy with very small blobs just being inlined in +an additional field in the response, which would allow clients to +populate their local chunk store in a single roundtrip. + +## Verified Streaming +As already described in `./blobstore-chunking.md`, the physical chunk +information sent in a `BlobService.Stat()` response is still sufficient to fetch +in an authenticated fashion. + +The exact protocol and formats are still a bit in flux, but here are some notes: + + - `BlobService.Stat()` request gets a `send_bao` field (bool), signalling a + [BAO][bao-spec] should be sent. Could also be `bao_shift` integer, signalling + how detailed (down to the leaf chunks) it should go. + The exact format (and request fields) still need to be defined, edef has some + ideas around omitting some intermediate hash nodes over the wire and + recomputing them, reducing size by another ~50% over [bao-tree]. + - `BlobService.Stat()` response gets some bao-related fields (`bao_shift` + field, signalling the actual format/shift level the server replies with, the + actual bao, and maybe some format specifier). 
+ It would be nice to also be compatible with the baos used by [iroh], so we + can provide an implementation using it too. + +--- + +[^1]: We might want to have some backchannel, so it becomes possible to provide + feedback to the user that something is downloaded. +[^2]: Something between 512K-4M, TBD. +[bao-spec]: https://github.com/oconnor663/bao/blob/master/docs/spec.md +[bao-tree]: https://github.com/n0-computer/bao-tree +[iroh]: https://github.com/n0-computer/iroh diff --git a/tvix/castore/docs/data-model.md b/tvix/castore/docs/data-model.md new file mode 100644 index 0000000000..2df6761aae --- /dev/null +++ b/tvix/castore/docs/data-model.md @@ -0,0 +1,50 @@ +# Data model + +This provides some more notes on the fields used in castore.proto. + +See `//tvix/store/docs/api.md` for the full context. + +## Directory message +`Directory` messages use the blake3 hash of their canonical protobuf +serialization as their identifier. + +A `Directory` message contains three lists, `directories`, `files` and +`symlinks`, holding `DirectoryNode`, `FileNode` and `SymlinkNode` messages +respectively. They describe all the direct child elements that are contained in +a directory. + +All three message types have a `name` field, specifying the (base)name of the +element (which MUST not contain slashes or null bytes, and MUST not be '.' or '..'). +For reproducibility reasons, the lists MUST be sorted by that name, and names +MUST be unique across all three lists. + +In addition to the `name` field, the various *Node messages have the following +fields: + +## DirectoryNode +A `DirectoryNode` message represents a child directory. + +It has a `digest` field, which points to the identifier of another `Directory` +message, making a `Directory` a merkle tree (or strictly speaking, a graph, as +two elements pointing to a child directory with the same contents would point +to the same `Directory` message). 
+ +There's also a `size` field, containing the (total) number of all child +elements in the referenced `Directory`, which helps for inode calculation. + +## FileNode +A `FileNode` message represents a child (regular) file. + +Its `digest` field contains the blake3 hash of the file contents. It can be +looked up in the `BlobService`. + +The `size` field contains the size of the blob the `digest` field refers to. + +The `executable` field specifies whether the file should be marked as +executable or not. + +## SymlinkNode +A `SymlinkNode` message represents a child symlink. + +In addition to the `name` field, the only additional field is the `target`, +which is a string containing the target of the symlink. diff --git a/tvix/castore/docs/why-not-git-trees.md b/tvix/castore/docs/why-not-git-trees.md new file mode 100644 index 0000000000..fd46252cf5 --- /dev/null +++ b/tvix/castore/docs/why-not-git-trees.md @@ -0,0 +1,57 @@ +## Why not git tree objects? + +We've been experimenting with (some variations of) the git tree and object +format, and ultimately decided against using it as an internal format, and +instead adapted the one documented in the other documents here. + +While the tvix-store API protocol shares some similarities with the format used +in git for trees and objects, the git one has shown some significant +disadvantages: + +### The binary encoding itself + +#### trees +The git tree object format is a very binary, error-prone and +"made-to-be-read-and-written-from-C" format. + +Tree objects are a combination of null-terminated strings, and fields of known +length. References to other tree objects use the literal sha1 hash of another +tree object in this encoding. +Extensions of the format/changes are very hard to do right, because parsers are +not aware they might be parsing something different. 
+ +The tvix-store protocol uses a canonical protobuf serialization, and uses +the [blake3][blake3] hash of that serialization to point to other `Directory` +messages. +It's both compact and with a wide range of libraries for encoders and decoders +in many programming languages. +The choice of protobuf makes it easy to add new fields, and make old clients +aware of some unknown fields being detected [^adding-fields]. + +#### blob +On disk, git blob objects start with a "blob" prefix, then the size of the +payload, and then the data itself. The hash of a blob is the literal sha1sum +over all of this - which makes it something very git specific to request for. + +tvix-store simply uses the [blake3][blake3] hash of the literal contents +when referring to a file/blob, which makes it very easy to ask other data +sources for the same data, as no git-specific payload is included in the hash. +This also plays very well together with things like [iroh][iroh-discussion], +which plans to provide a way to substitute (large)blobs by their blake3 hash +over the IPFS network. + +In addition to that, [blake3][blake3] makes it possible to do +[verified streaming][bao], as already described in other parts of the +documentation. + +The git tree object format uses sha1 both for references to other trees and +hashes of blobs, which isn't really a hash function to fundamentally base +everything on in 2023. +The [migration to sha256][git-sha256] also has been dead for some years now, +and it's unclear how a "blake3" version of this would even look like. + +[bao]: https://github.com/oconnor663/bao +[blake3]: https://github.com/BLAKE3-team/BLAKE3 +[git-sha256]: https://git-scm.com/docs/hash-function-transition/ +[iroh-discussion]: https://github.com/n0-computer/iroh/discussions/707#discussioncomment-5070197 +[^adding-fields]: Obviously, adding new fields will change hashes, but it's something that's easy to detect. 
\ No newline at end of file diff --git a/tvix/castore/protos/LICENSE b/tvix/castore/protos/LICENSE new file mode 100644 index 0000000000..2034ada6fd --- /dev/null +++ b/tvix/castore/protos/LICENSE @@ -0,0 +1,21 @@ +Copyright © The Tvix Authors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +“Software”), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/tvix/castore/protos/castore.proto b/tvix/castore/protos/castore.proto new file mode 100644 index 0000000000..1ef4044045 --- /dev/null +++ b/tvix/castore/protos/castore.proto @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: edef <edef@unfathomable.blue> +// SPDX-License-Identifier: OSL-3.0 OR MIT OR Apache-2.0 + +syntax = "proto3"; + +package tvix.castore.v1; + +option go_package = "code.tvl.fyi/tvix/castore-go;castorev1"; + +// A Directory can contain Directory, File or Symlink nodes. +// Each of these nodes have a name attribute, which is the basename in that +// directory and node type specific attributes. 
+// The name attribute: +// - MUST not contain slashes or null bytes +// - MUST not be '.' or '..' +// - MUST be unique across all three lists +// Elements in each list need to be lexicographically ordered by the name +// attribute. +message Directory { + repeated DirectoryNode directories = 1; + repeated FileNode files = 2; + repeated SymlinkNode symlinks = 3; +} + +// A DirectoryNode represents a directory in a Directory. +message DirectoryNode { + // The (base)name of the directory + bytes name = 1; + // The blake3 hash of a Directory message, serialized in protobuf canonical form. + bytes digest = 2; + // Number of child elements in the Directory referred to by `digest`. + // Calculated by summing up the numbers of `directories`, `files` and + // `symlinks`, and for each directory, its size field. Used for inode number + // calculation. + // This field is precisely as verifiable as any other Merkle tree edge. + // Resolve `digest`, and you can compute it incrementally. Resolve the entire + // tree, and you can fully compute it from scratch. + // A credulous implementation won't reject an excessive size, but this is + // harmless: you'll have some ordinals without nodes. Undersizing is obvious + // and easy to reject: you won't have an ordinal for some nodes. + uint64 size = 3; +} + +// A FileNode represents a regular or executable file in a Directory. +message FileNode { + // The (base)name of the file + bytes name = 1; + // The blake3 digest of the file contents + bytes digest = 2; + // The file content size + uint64 size = 3; + // Whether the file is executable + bool executable = 4; +} + +// A SymlinkNode represents a symbolic link in a Directory. +message SymlinkNode { + // The (base)name of the symlink + bytes name = 1; + // The target of the symlink. + bytes target = 2; +} + +// A Node is either a DirectoryNode, FileNode or SymlinkNode. 
+message Node { + oneof node { + DirectoryNode directory = 1; + FileNode file = 2; + SymlinkNode symlink = 3; + } +} diff --git a/tvix/castore/protos/default.nix b/tvix/castore/protos/default.nix new file mode 100644 index 0000000000..feef55690f --- /dev/null +++ b/tvix/castore/protos/default.nix @@ -0,0 +1,54 @@ +{ depot, pkgs, ... }: +let + protos = depot.nix.sparseTree { + name = "castore-protos"; + root = depot.path.origSrc; + paths = [ + ./castore.proto + ./rpc_blobstore.proto + ./rpc_directory.proto + ../../../buf.yaml + ../../../buf.gen.yaml + ]; + }; +in +depot.nix.readTree.drvTargets { + inherit protos; + + # Lints and ensures formatting of the proto files. + check = pkgs.stdenv.mkDerivation { + name = "proto-check"; + src = protos; + + nativeBuildInputs = [ + pkgs.buf + ]; + + buildPhase = '' + export HOME=$TMPDIR + buf lint + buf format -d --exit-code + touch $out + ''; + }; + + # Produces the golang bindings. + go-bindings = pkgs.stdenv.mkDerivation { + name = "go-bindings"; + src = protos; + + nativeBuildInputs = [ + pkgs.buf + pkgs.protoc-gen-go + pkgs.protoc-gen-go-grpc + ]; + + buildPhase = '' + export HOME=$TMPDIR + buf generate + + mkdir -p $out + cp tvix/castore/protos/*.pb.go $out/ + ''; + }; +} diff --git a/tvix/castore/protos/rpc_blobstore.proto b/tvix/castore/protos/rpc_blobstore.proto new file mode 100644 index 0000000000..eebe39ace7 --- /dev/null +++ b/tvix/castore/protos/rpc_blobstore.proto @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright © 2022 The Tvix Authors +syntax = "proto3"; + +package tvix.castore.v1; + +option go_package = "code.tvl.fyi/tvix/castore-go;castorev1"; + +// BlobService allows reading (or uploading) content-addressed blobs of data. +// BLAKE3 is used as a hashing function for the data. Uploading a blob will +// return the BLAKE3 digest of it, and that's the identifier used to Read/Stat +// them too. 
+service BlobService { + // Stat can be used to check for the existence of a blob, as well as + // gathering more data about it, like more granular chunking information + // or baos. + // Server implementations are not required to provide more granular chunking + // information, especially if the digest specified in `StatBlobRequest` is + // already a chunk of a blob. + rpc Stat(StatBlobRequest) returns (StatBlobResponse); + + // Read allows reading (all) data of a blob/chunk by the BLAKE3 digest of + // its contents. + // If the backend communicated more granular chunks in the `Stat` request, + // this can also be used to read chunks. + // This request returns a stream of BlobChunk, which is just a container for + // a stream of bytes. + // The server may decide on whatever chunking it may seem fit as a size for + // the individual BlobChunk sent in the response stream, this is mostly to + // keep individual messages at a manageable size. + rpc Read(ReadBlobRequest) returns (stream BlobChunk); + + // Put uploads a Blob, by reading a stream of bytes. + // + // The way the data is chunked up in individual BlobChunk messages sent in + // the stream has no effect on how the server ends up chunking blobs up, if + // it does at all. + rpc Put(stream BlobChunk) returns (PutBlobResponse); +} + +message StatBlobRequest { + // The blake3 digest of the blob requested + bytes digest = 1; + + // Whether the server should reply with a list of more granular chunks. + bool send_chunks = 2; + + // Whether the server should reply with a bao. + bool send_bao = 3; +} + +message StatBlobResponse { + // If `send_chunks` was set to true, this MAY contain a list of more + // granular chunks, which then may be read individually via the `Read` + // method. + repeated ChunkMeta chunks = 2; + + message ChunkMeta { + // Digest of that specific chunk + bytes digest = 1; + + // Length of that chunk, in bytes. 
+ uint64 size = 2; + } + + // If `send_bao` was set to true, this MAY contain a outboard bao. + // The exact format and message types here will still be fleshed out. + bytes bao = 3; +} + +message ReadBlobRequest { + // The blake3 digest of the blob or chunk requested + bytes digest = 1; +} + +// This represents some bytes of a blob. +// Blobs are sent in smaller chunks to keep message sizes manageable. +message BlobChunk { + bytes data = 1; +} + +message PutBlobResponse { + // The blake3 digest of the data that was sent. + bytes digest = 1; +} diff --git a/tvix/castore/protos/rpc_directory.proto b/tvix/castore/protos/rpc_directory.proto new file mode 100644 index 0000000000..f4f41c433a --- /dev/null +++ b/tvix/castore/protos/rpc_directory.proto @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright © 2022 The Tvix Authors +syntax = "proto3"; + +package tvix.castore.v1; + +import "tvix/castore/protos/castore.proto"; + +option go_package = "code.tvl.fyi/tvix/castore-go;castorev1"; + +service DirectoryService { + // Get retrieves a stream of Directory messages, by using the lookup + // parameters in GetDirectoryRequest. + // Keep in mind multiple DirectoryNodes in different parts of the graph might + // have the same digest if they have the same underlying contents, + // so sending subsequent ones can be omitted. + // + // It is okay for certain implementations to only allow retrieval of + // Directory digests that are at the "root", aka the last element that's + // sent in a Put. This makes sense for implementations bundling closures of + // directories together in batches. + rpc Get(GetDirectoryRequest) returns (stream Directory); + + // Put uploads a graph of Directory messages. + // Individual Directory messages need to be send in an order walking up + // from the leaves to the root - a Directory message can only refer to + // Directory messages previously sent in the same stream. 
+ // Keep in mind multiple DirectoryNodes in different parts of the graph might + // have the same digest if they have the same underlying contents, + // so sending subsequent ones can be omitted. + // We might add a separate method, allowing to send partial graphs at a later + // time, if requiring to send the full graph turns out to be a problem. + rpc Put(stream Directory) returns (PutDirectoryResponse); +} + +message GetDirectoryRequest { + oneof by_what { + // The blake3 hash of the (root) Directory message, serialized in + // protobuf canonical form. + // Keep in mind this can be a subtree of another root. + bytes digest = 1; + } + + // If set to true, recursively resolve all child Directory messages. + // Directory messages SHOULD be streamed in a recursive breadth-first walk, + // but other orders are also fine, as long as Directory messages are only + // sent after they are referred to from previously sent Directory messages. + bool recursive = 2; +} + +message PutDirectoryResponse { + bytes root_digest = 1; +} diff --git a/tvix/castore/src/blobservice/chunked_reader.rs b/tvix/castore/src/blobservice/chunked_reader.rs new file mode 100644 index 0000000000..6e8355874b --- /dev/null +++ b/tvix/castore/src/blobservice/chunked_reader.rs @@ -0,0 +1,496 @@ +use futures::{ready, TryStreamExt}; +use pin_project_lite::pin_project; +use tokio::io::{AsyncRead, AsyncSeekExt}; +use tokio_stream::StreamExt; +use tokio_util::io::{ReaderStream, StreamReader}; +use tracing::{instrument, trace, warn}; + +use crate::B3Digest; +use std::{cmp::Ordering, pin::Pin}; + +use super::{BlobReader, BlobService}; + +pin_project! { + /// ChunkedReader provides a chunk-aware [BlobReader], so allows reading and + /// seeking into a blob. + /// It internally holds a [ChunkedBlob], which is storing chunk information + /// able to emit a reader seeked to a specific position whenever we need to seek. 
+ pub struct ChunkedReader<BS> { + chunked_blob: ChunkedBlob<BS>, + + #[pin] + r: Box<dyn AsyncRead + Unpin + Send>, + + pos: u64, + } +} + +impl<BS> ChunkedReader<BS> +where + BS: AsRef<dyn BlobService> + Clone + 'static + Send, +{ + /// Construct a new [ChunkedReader], by retrieving a list of chunks (their + /// blake3 digests and chunk sizes) + pub fn from_chunks(chunks_it: impl Iterator<Item = (B3Digest, u64)>, blob_service: BS) -> Self { + let chunked_blob = ChunkedBlob::from_iter(chunks_it, blob_service); + let r = chunked_blob.reader_skipped_offset(0); + + Self { + chunked_blob, + r, + pos: 0, + } + } +} + +/// ChunkedReader implements BlobReader. +impl<BS> BlobReader for ChunkedReader<BS> where BS: Send + Clone + 'static + AsRef<dyn BlobService> {} + +impl<BS> tokio::io::AsyncRead for ChunkedReader<BS> +where + BS: AsRef<dyn BlobService> + Clone + 'static, +{ + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll<std::io::Result<()>> { + // The amount of data read can be determined by the increase + // in the length of the slice returned by `ReadBuf::filled`. + let filled_before = buf.filled().len(); + + let this = self.project(); + + ready!(this.r.poll_read(cx, buf))?; + let bytes_read = buf.filled().len() - filled_before; + *this.pos += bytes_read as u64; + + Ok(()).into() + } +} + +impl<BS> tokio::io::AsyncSeek for ChunkedReader<BS> +where + BS: AsRef<dyn BlobService> + Clone + Send + 'static, +{ + #[instrument(skip(self), err(Debug))] + fn start_seek(self: Pin<&mut Self>, position: std::io::SeekFrom) -> std::io::Result<()> { + let total_len = self.chunked_blob.blob_length(); + let mut this = self.project(); + + let absolute_offset: u64 = match position { + std::io::SeekFrom::Start(from_start) => from_start, + std::io::SeekFrom::End(from_end) => { + // note from_end is i64, not u64, so this is usually negative. 
+ total_len.checked_add_signed(from_end).ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "over/underflow while seeking", + ) + })? + } + std::io::SeekFrom::Current(from_current) => { + // note from_current is i64, not u64, so this can be positive or negative. + (*this.pos) + .checked_add_signed(from_current) + .ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "over/underflow while seeking", + ) + })? + } + }; + + // check if the position actually did change. + if absolute_offset != *this.pos { + // ensure the new position still is inside the file. + if absolute_offset > total_len { + Err(std::io::Error::new( + std::io::ErrorKind::InvalidInput, + "seeked beyond EOF", + ))? + } + + // Update the position and the internal reader. + *this.pos = absolute_offset; + + // FUTUREWORK: if we can seek forward, avoid re-assembling. + // At least if it's still in the same chunk? + *this.r = this.chunked_blob.reader_skipped_offset(absolute_offset); + } + + Ok(()) + } + + fn poll_complete( + self: Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<std::io::Result<u64>> { + std::task::Poll::Ready(Ok(self.pos)) + } +} + +/// Holds a list of blake3 digests for individual chunks (and their sizes). +/// Is able to construct a Reader that is seeked to a certain offset, which +/// is useful to construct a BlobReader (that implements AsyncSeek). +/// - the current chunk index, and a Cursor<Vec<u8>> holding the data of that chunk. +struct ChunkedBlob<BS> { + blob_service: BS, + chunks: Vec<(u64, u64, B3Digest)>, +} + +impl<BS> ChunkedBlob<BS> +where + BS: AsRef<dyn BlobService> + Clone + 'static + Send, +{ + /// Constructs [Self] from a list of blake3 digests of chunks and their + /// sizes, and a reference to a blob service. + /// Initializing it with an empty list is disallowed. 
+ fn from_iter(chunks_it: impl Iterator<Item = (B3Digest, u64)>, blob_service: BS) -> Self { + let mut chunks = Vec::new(); + let mut offset: u64 = 0; + + for (chunk_digest, chunk_size) in chunks_it { + chunks.push((offset, chunk_size, chunk_digest)); + offset += chunk_size; + } + + assert!( + !chunks.is_empty(), + "Chunks must be provided, don't use this for blobs without chunks" + ); + + Self { + blob_service, + chunks, + } + } + + /// Returns the length of the blob. + fn blob_length(&self) -> u64 { + self.chunks + .last() + .map(|(chunk_offset, chunk_size, _)| chunk_offset + chunk_size) + .unwrap_or(0) + } + + /// For a given position pos, return the chunk containing the data. + /// In case this would range outside the blob, None is returned. + #[instrument(level = "trace", skip(self), ret)] + fn get_chunk_idx_for_position(&self, pos: u64) -> Option<usize> { + // FUTUREWORK: benchmark when to use linear search, binary_search and BTreeSet + self.chunks + .binary_search_by(|(chunk_start_pos, chunk_size, _)| { + if chunk_start_pos + chunk_size <= pos { + Ordering::Less + } else if *chunk_start_pos > pos { + Ordering::Greater + } else { + Ordering::Equal + } + }) + .ok() + } + + /// Returns a stream of bytes of the data in that blob. + /// It internally assembles a stream reading from each chunk (skipping over + /// chunks containing irrelevant data). + /// From the first relevant chunk, the irrelevant bytes are skipped too. + /// The returned boxed thing does not implement AsyncSeek on its own, but + /// ChunkedReader does. 
+ #[instrument(level = "trace", skip(self))] + fn reader_skipped_offset(&self, offset: u64) -> Box<dyn tokio::io::AsyncRead + Send + Unpin> { + if offset == self.blob_length() { + return Box::new(std::io::Cursor::new(vec![])); + } + // construct a stream of all chunks starting with the given offset + let start_chunk_idx = self + .get_chunk_idx_for_position(offset) + .expect("outside of blob"); + // It's ok to panic here, we can only reach this by seeking, and seeking should already reject out-of-file seeking. + + let skip_first_chunk_bytes = (offset - self.chunks[start_chunk_idx].0) as usize; + + let blob_service = self.blob_service.clone(); + let chunks: Vec<_> = self.chunks[start_chunk_idx..].to_vec(); + let readers_stream = tokio_stream::iter(chunks.into_iter().enumerate()).map( + move |(nth_chunk, (_chunk_start_offset, chunk_size, chunk_digest))| { + let chunk_digest = chunk_digest.to_owned(); + let blob_service = blob_service.clone(); + async move { + trace!(chunk_size=%chunk_size, chunk_digest=%chunk_digest, "open_read on chunk in stream"); + let mut blob_reader = blob_service + .as_ref() + .open_read(&chunk_digest.to_owned()) + .await? 
+ .ok_or_else(|| { + warn!(chunk.digest = %chunk_digest, "chunk not found"); + std::io::Error::new(std::io::ErrorKind::NotFound, "chunk not found") + })?; + + // iff this is the first chunk in the stream, skip by skip_first_chunk_bytes + if nth_chunk == 0 && skip_first_chunk_bytes > 0 { + blob_reader + .seek(std::io::SeekFrom::Start(skip_first_chunk_bytes as u64)) + .await?; + } + Ok::<_, std::io::Error>(blob_reader) + } + }, + ); + + // convert the stream of readers to a stream of streams of byte chunks + let bytes_streams = readers_stream.then(|elem| async { elem.await.map(ReaderStream::new) }); + + // flatten into one stream of byte chunks + let bytes_stream = bytes_streams.try_flatten(); + + // convert into AsyncRead + Box::new(StreamReader::new(Box::pin(bytes_stream))) + } +} + +#[cfg(test)] +mod test { + use std::{io::SeekFrom, sync::Arc}; + + use crate::{ + blobservice::{chunked_reader::ChunkedReader, BlobService, MemoryBlobService}, + B3Digest, + }; + use hex_literal::hex; + use lazy_static::lazy_static; + use tokio::io::{AsyncReadExt, AsyncSeekExt}; + + const CHUNK_1: [u8; 2] = hex!("0001"); + const CHUNK_2: [u8; 4] = hex!("02030405"); + const CHUNK_3: [u8; 1] = hex!("06"); + const CHUNK_4: [u8; 2] = hex!("0708"); + const CHUNK_5: [u8; 7] = hex!("090a0b0c0d0e0f"); + + lazy_static! 
{ + // `[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ]` + pub static ref CHUNK_1_DIGEST: B3Digest = blake3::hash(&CHUNK_1).as_bytes().into(); + pub static ref CHUNK_2_DIGEST: B3Digest = blake3::hash(&CHUNK_2).as_bytes().into(); + pub static ref CHUNK_3_DIGEST: B3Digest = blake3::hash(&CHUNK_3).as_bytes().into(); + pub static ref CHUNK_4_DIGEST: B3Digest = blake3::hash(&CHUNK_4).as_bytes().into(); + pub static ref CHUNK_5_DIGEST: B3Digest = blake3::hash(&CHUNK_5).as_bytes().into(); + pub static ref BLOB_1_LIST: [(B3Digest, u64); 5] = [ + (CHUNK_1_DIGEST.clone(), 2), + (CHUNK_2_DIGEST.clone(), 4), + (CHUNK_3_DIGEST.clone(), 1), + (CHUNK_4_DIGEST.clone(), 2), + (CHUNK_5_DIGEST.clone(), 7), + ]; + } + + use super::ChunkedBlob; + + /// ensure the start offsets are properly calculated. + #[test] + fn from_iter() { + let cb = ChunkedBlob::from_iter( + BLOB_1_LIST.clone().into_iter(), + Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>, + ); + + assert_eq!( + cb.chunks, + Vec::from_iter([ + (0, 2, CHUNK_1_DIGEST.clone()), + (2, 4, CHUNK_2_DIGEST.clone()), + (6, 1, CHUNK_3_DIGEST.clone()), + (7, 2, CHUNK_4_DIGEST.clone()), + (9, 7, CHUNK_5_DIGEST.clone()), + ]) + ); + } + + /// ensure ChunkedBlob can't be used with an empty list of chunks + #[test] + #[should_panic] + fn from_iter_empty() { + ChunkedBlob::from_iter( + [].into_iter(), + Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>, + ); + } + + /// ensure the right chunk is selected + #[test] + fn chunk_idx_for_position() { + let cb = ChunkedBlob::from_iter( + BLOB_1_LIST.clone().into_iter(), + Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>, + ); + + assert_eq!(Some(0), cb.get_chunk_idx_for_position(0), "start of blob"); + + assert_eq!( + Some(0), + cb.get_chunk_idx_for_position(1), + "middle of first chunk" + ); + assert_eq!( + Some(1), + cb.get_chunk_idx_for_position(2), + "beginning of second chunk" + ); + + assert_eq!( + Some(4), + 
cb.get_chunk_idx_for_position(15), + "right before the end of the blob" + ); + assert_eq!( + None, + cb.get_chunk_idx_for_position(16), + "right outside the blob" + ); + assert_eq!( + None, + cb.get_chunk_idx_for_position(100), + "way outside the blob" + ); + } + + /// returns a blobservice with all chunks in BLOB_1 present. + async fn gen_blobservice_blob1() -> Arc<dyn BlobService> { + let blob_service = Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>; + + // seed blob service with all chunks + for blob_contents in [ + CHUNK_1.to_vec(), + CHUNK_2.to_vec(), + CHUNK_3.to_vec(), + CHUNK_4.to_vec(), + CHUNK_5.to_vec(), + ] { + let mut bw = blob_service.open_write().await; + tokio::io::copy(&mut std::io::Cursor::new(blob_contents), &mut bw) + .await + .expect("writing blob"); + bw.close().await.expect("close blobwriter"); + } + + blob_service + } + + #[tokio::test] + async fn test_read() { + let blob_service = gen_blobservice_blob1().await; + let mut chunked_reader = + ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service); + + // read all data + let mut buf = Vec::new(); + tokio::io::copy(&mut chunked_reader, &mut buf) + .await + .expect("copy"); + + assert_eq!( + hex!("000102030405060708090a0b0c0d0e0f").to_vec(), + buf, + "read data must match" + ); + } + + #[tokio::test] + async fn test_seek() { + let blob_service = gen_blobservice_blob1().await; + let mut chunked_reader = + ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service); + + // seek to the end + // expect to read 0 bytes + { + chunked_reader + .seek(SeekFrom::End(0)) + .await + .expect("seek to end"); + + let mut buf = Vec::new(); + chunked_reader + .read_to_end(&mut buf) + .await + .expect("read to end"); + + assert_eq!(hex!("").to_vec(), buf); + } + + // seek one bytes before the end + { + chunked_reader.seek(SeekFrom::End(-1)).await.expect("seek"); + + let mut buf = Vec::new(); + chunked_reader + .read_to_end(&mut buf) + .await + .expect("read to 
end"); + + assert_eq!(hex!("0f").to_vec(), buf); + } + + // seek back three bytes, but using relative positioning + // read two bytes + { + chunked_reader + .seek(SeekFrom::Current(-3)) + .await + .expect("seek"); + + let mut buf = [0b0; 2]; + chunked_reader + .read_exact(&mut buf) + .await + .expect("read exact"); + + assert_eq!(hex!("0d0e"), buf); + } + } + + // seeds a blob service with only the first two chunks, reads a bit in the + // front (which succeeds), but then tries to seek past and read more (which + // should fail). + #[tokio::test] + async fn test_read_missing_chunks() { + let blob_service = Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>; + + for blob_contents in [CHUNK_1.to_vec(), CHUNK_2.to_vec()] { + let mut bw = blob_service.open_write().await; + tokio::io::copy(&mut std::io::Cursor::new(blob_contents), &mut bw) + .await + .expect("writing blob"); + + bw.close().await.expect("close blobwriter"); + } + + let mut chunked_reader = + ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service); + + // read a bit from the front (5 bytes out of 6 available) + let mut buf = [0b0; 5]; + chunked_reader + .read_exact(&mut buf) + .await + .expect("read exact"); + + assert_eq!(hex!("0001020304"), buf); + + // seek 2 bytes forward, into an area where we don't have chunks + chunked_reader + .seek(SeekFrom::Current(2)) + .await + .expect("seek"); + + let mut buf = Vec::new(); + chunked_reader + .read_to_end(&mut buf) + .await + .expect_err("must fail"); + + // FUTUREWORK: check semantics on errorkinds. Should this be InvalidData + // or NotFound? 
+ } +} diff --git a/tvix/castore/src/blobservice/combinator.rs b/tvix/castore/src/blobservice/combinator.rs new file mode 100644 index 0000000000..067eff96f4 --- /dev/null +++ b/tvix/castore/src/blobservice/combinator.rs @@ -0,0 +1,132 @@ +use futures::{StreamExt, TryStreamExt}; +use tokio_util::io::{ReaderStream, StreamReader}; +use tonic::async_trait; +use tracing::{instrument, warn}; + +use crate::B3Digest; + +use super::{naive_seeker::NaiveSeeker, BlobReader, BlobService, BlobWriter}; + +/// Combinator for a BlobService, using a "local" and "remote" blobservice. +/// Requests are tried in (and returned from) the local store first, only if +/// things are not present there, the remote BlobService is queried. +/// In case the local blobservice doesn't have the blob, we ask the remote +/// blobservice for chunks, and try to read each of these chunks from the local +/// blobservice again, before falling back to the remote one. +/// The remote BlobService is never written to. +pub struct CombinedBlobService<BL, BR> { + local: BL, + remote: BR, +} + +impl<BL, BR> Clone for CombinedBlobService<BL, BR> +where + BL: Clone, + BR: Clone, +{ + fn clone(&self) -> Self { + Self { + local: self.local.clone(), + remote: self.remote.clone(), + } + } +} + +#[async_trait] +impl<BL, BR> BlobService for CombinedBlobService<BL, BR> +where + BL: AsRef<dyn BlobService> + Clone + Send + Sync + 'static, + BR: AsRef<dyn BlobService> + Clone + Send + Sync + 'static, +{ + #[instrument(skip(self, digest), fields(blob.digest=%digest))] + async fn has(&self, digest: &B3Digest) -> std::io::Result<bool> { + Ok(self.local.as_ref().has(digest).await? || self.remote.as_ref().has(digest).await?) + } + + #[instrument(skip(self, digest), fields(blob.digest=%digest), err)] + async fn open_read(&self, digest: &B3Digest) -> std::io::Result<Option<Box<dyn BlobReader>>> { + if self.local.as_ref().has(digest).await? { + // local store has the blob, so we can assume it also has all chunks. 
+ self.local.as_ref().open_read(digest).await + } else { + // Local store doesn't have the blob. + // Ask the remote one for the list of chunks, + // and create a chunked reader that uses self.open_read() for + // individual chunks. There's a chance we already have some chunks + // locally, meaning we don't need to fetch them all from the remote + // BlobService. + match self.remote.as_ref().chunks(digest).await? { + // blob doesn't exist on the remote side either, nothing we can do. + None => Ok(None), + Some(remote_chunks) => { + // if there's no more granular chunks, or the remote + // blobservice doesn't support chunks, read the blob from + // the remote blobservice directly. + if remote_chunks.is_empty() { + return self.remote.as_ref().open_read(digest).await; + } + // otherwise, a chunked reader, which will always try the + // local backend first. + + // map Vec<ChunkMeta> to Vec<(B3Digest, u64)> + let chunks: Vec<(B3Digest, u64)> = remote_chunks + .into_iter() + .map(|chunk_meta| { + ( + B3Digest::try_from(chunk_meta.digest) + .expect("invalid chunk digest"), + chunk_meta.size, + ) + }) + .collect(); + + Ok(Some(make_chunked_reader(self.clone(), chunks))) + } + } + } + } + + #[instrument(skip_all)] + async fn open_write(&self) -> Box<dyn BlobWriter> { + // direct writes to the local one. + self.local.as_ref().open_write().await + } +} + +fn make_chunked_reader<BS>( + // This must consume, as we can't retain references to blob_service, + // as it'd add a lifetime to BlobReader in general, which will get + // problematic in TvixStoreFs, which is using async move closures and cloning. + blob_service: BS, + // A list of b3 digests for individual chunks, and their sizes. 
+ chunks: Vec<(B3Digest, u64)>, +) -> Box<dyn BlobReader> +where + BS: BlobService + Clone + 'static, +{ + // TODO: offset, verified streaming + + // construct readers for each chunk + let blob_service = blob_service.clone(); + let readers_stream = tokio_stream::iter(chunks).map(move |(digest, _)| { + let d = digest.to_owned(); + let blob_service = blob_service.clone(); + async move { + blob_service.open_read(&d.to_owned()).await?.ok_or_else(|| { + warn!(chunk.digest = %digest, "chunk not found"); + std::io::Error::new(std::io::ErrorKind::NotFound, "chunk not found") + }) + } + }); + + // convert the stream of readers to a stream of streams of byte chunks + let bytes_streams = readers_stream.then(|elem| async { elem.await.map(ReaderStream::new) }); + + // flatten into one stream of byte chunks + let bytes_stream = bytes_streams.try_flatten(); + + // convert into AsyncRead + let blob_reader = StreamReader::new(bytes_stream); + + Box::new(NaiveSeeker::new(Box::pin(blob_reader))) +} diff --git a/tvix/castore/src/blobservice/from_addr.rs b/tvix/castore/src/blobservice/from_addr.rs new file mode 100644 index 0000000000..8898bbfb95 --- /dev/null +++ b/tvix/castore/src/blobservice/from_addr.rs @@ -0,0 +1,111 @@ +use url::Url; + +use crate::{proto::blob_service_client::BlobServiceClient, Error}; + +use super::{BlobService, GRPCBlobService, MemoryBlobService, ObjectStoreBlobService}; + +/// Constructs a new instance of a [BlobService] from an URI. +/// +/// The following schemes are supported by the following services: +/// - `memory://` ([MemoryBlobService]) +/// - `grpc+*://` ([GRPCBlobService]) +/// - `objectstore+*://` ([ObjectStoreBlobService]) +/// +/// See their `from_url` methods for more details about their syntax. 
+pub async fn from_addr(uri: &str) -> Result<Box<dyn BlobService>, crate::Error> { + let url = Url::parse(uri) + .map_err(|e| crate::Error::StorageError(format!("unable to parse url: {}", e)))?; + + let blob_service: Box<dyn BlobService> = match url.scheme() { + "memory" => { + // memory doesn't support host or path in the URL. + if url.has_host() || !url.path().is_empty() { + return Err(Error::StorageError("invalid url".to_string())); + } + Box::<MemoryBlobService>::default() + } + scheme if scheme.starts_with("grpc+") => { + // schemes starting with grpc+ go to the GRPCBlobService. + // That's normally grpc+unix for unix sockets, and grpc+http(s) for the HTTP counterparts. + // - In the case of unix sockets, there must be a path, but no host. + // - In the case of non-unix sockets, there must be a host, but no path. + // Constructing the channel is handled by crate::tonic::channel_from_url. + let client = BlobServiceClient::new(crate::tonic::channel_from_url(&url).await?); + Box::new(GRPCBlobService::from_client(client)) + } + scheme if scheme.starts_with("objectstore+") => { + // We need to convert the URL to string, strip the prefix there, and then + // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do. + let trimmed_url = { + let s = url.to_string(); + Url::parse(s.strip_prefix("objectstore+").unwrap()).unwrap() + }; + Box::new( + ObjectStoreBlobService::parse_url(&trimmed_url) + .map_err(|e| Error::StorageError(e.to_string()))?, + ) + } + scheme => { + return Err(crate::Error::StorageError(format!( + "unknown scheme: {}", + scheme + ))) + } + }; + + Ok(blob_service) +} + +#[cfg(test)] +mod tests { + use super::from_addr; + use rstest::rstest; + + #[rstest] + /// This uses an unsupported scheme. + #[case::unsupported_scheme("http://foo.example/test", false)] + /// This correctly sets the scheme, and doesn't set a path. 
+ #[case::memory_valid("memory://", true)] + /// This sets a memory url host to `foo`, which is invalid. + #[case::memory_invalid_host("memory://foo", false)] + /// This sets a memory url path to "/", which is invalid. + #[case::memory_invalid_root_path("memory:///", false)] + /// This sets a memory url path to "/foo", which is invalid. + #[case::memory_invalid_root_path_foo("memory:///foo", false)] + /// Correct scheme to connect to a unix socket. + #[case::grpc_valid_unix_socket("grpc+unix:///path/to/somewhere", true)] + /// Correct scheme for unix socket, but setting a host too, which is invalid. + #[case::grpc_invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)] + /// Correct scheme to connect to localhost, with port 12345 + #[case::grpc_valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)] + /// Correct scheme to connect to localhost over http, without specifying a port. + #[case::grpc_valid_http_host_without_port("grpc+http://localhost", true)] + /// Correct scheme to connect to localhost over https, without specifying a port. + #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)] + /// Correct scheme to connect to localhost over http, but with additional path, which is invalid. 
+ #[case::grpc_invalid_has_path("grpc+http://localhost/some-path", false)] + /// An example for object store (InMemory) + #[case::objectstore_valid_memory("objectstore+memory:///", true)] + /// An example for object store (LocalFileSystem) + #[case::objectstore_valid_file("objectstore+file:///foo/bar", true)] + // An example for object store (HTTP / WebDAV) + #[case::objectstore_valid_http_url("objectstore+https://localhost:8080/some-path", true)] + /// An example for object store (S3) + #[cfg_attr( + feature = "cloud", + case::objectstore_valid_s3_url("objectstore+s3://bucket/path", true) + )] + /// An example for object store (GCS) + #[cfg_attr( + feature = "cloud", + case::objectstore_valid_gcs_url("objectstore+gs://bucket/path", true) + )] + #[tokio::test] + async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] exp_succeed: bool) { + if exp_succeed { + from_addr(uri_str).await.expect("should succeed"); + } else { + assert!(from_addr(uri_str).await.is_err(), "should fail"); + } + } +} diff --git a/tvix/castore/src/blobservice/grpc.rs b/tvix/castore/src/blobservice/grpc.rs new file mode 100644 index 0000000000..5663cd3838 --- /dev/null +++ b/tvix/castore/src/blobservice/grpc.rs @@ -0,0 +1,349 @@ +use super::{BlobReader, BlobService, BlobWriter, ChunkedReader}; +use crate::{ + proto::{self, stat_blob_response::ChunkMeta}, + B3Digest, +}; +use futures::sink::SinkExt; +use std::{ + io::{self, Cursor}, + pin::pin, + sync::Arc, + task::Poll, +}; +use tokio::io::AsyncWriteExt; +use tokio::task::JoinHandle; +use tokio_stream::{wrappers::ReceiverStream, StreamExt}; +use tokio_util::{ + io::{CopyToBytes, SinkWriter}, + sync::PollSender, +}; +use tonic::{async_trait, transport::Channel, Code, Status}; +use tracing::instrument; + +/// Connects to a (remote) tvix-store BlobService over gRPC. +#[derive(Clone)] +pub struct GRPCBlobService { + /// The internal reference to a gRPC client. + /// Cloning it is cheap, and it internally handles concurrent requests. 
+ grpc_client: proto::blob_service_client::BlobServiceClient<Channel>, +} + +impl GRPCBlobService { + /// construct a [GRPCBlobService] from a [proto::blob_service_client::BlobServiceClient]. + /// panics if called outside the context of a tokio runtime. + pub fn from_client( + grpc_client: proto::blob_service_client::BlobServiceClient<Channel>, + ) -> Self { + Self { grpc_client } + } +} + +#[async_trait] +impl BlobService for GRPCBlobService { + #[instrument(skip(self, digest), fields(blob.digest=%digest))] + async fn has(&self, digest: &B3Digest) -> io::Result<bool> { + let mut grpc_client = self.grpc_client.clone(); + let resp = grpc_client + .stat(proto::StatBlobRequest { + digest: digest.clone().into(), + ..Default::default() + }) + .await; + + match resp { + Ok(_blob_meta) => Ok(true), + Err(e) if e.code() == Code::NotFound => Ok(false), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)), + } + } + + #[instrument(skip(self, digest), fields(blob.digest=%digest), err)] + async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> { + // First try to get a list of chunks. In case there's only one chunk returned, + // buffer its data into a Vec, otherwise use a ChunkedReader. + // We previously used NaiveSeeker here, but userland likes to seek backwards too often, + // and without store composition this will get very noisy. + // FUTUREWORK: use CombinedBlobService and store composition. + match self.chunks(digest).await { + Ok(None) => Ok(None), + Ok(Some(chunks)) => { + if chunks.is_empty() || chunks.len() == 1 { + // No more granular chunking info, treat this as an individual chunk. + // Get a stream of [proto::BlobChunk], or return an error if the blob + // doesn't exist. 
+ return match self + .grpc_client + .clone() + .read(proto::ReadBlobRequest { + digest: digest.clone().into(), + }) + .await + { + Ok(stream) => { + let data_stream = stream.into_inner().map(|e| { + e.map(|c| c.data) + .map_err(|s| std::io::Error::new(io::ErrorKind::InvalidData, s)) + }); + + // Use StreamReader::new to convert to an AsyncRead. + let mut data_reader = tokio_util::io::StreamReader::new(data_stream); + + let mut buf = Vec::new(); + // TODO: only do this up to a certain limit. + tokio::io::copy(&mut data_reader, &mut buf).await?; + + Ok(Some(Box::new(Cursor::new(buf)))) + } + Err(e) if e.code() == Code::NotFound => Ok(None), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)), + }; + } + + // The chunked case. Let ChunkedReader do individual reads. + // TODO: we should store the chunking data in some local cache, + // so `ChunkedReader` doesn't call `self.chunks` *again* for every chunk. + // Think about how store composition will fix this. + let chunked_reader = ChunkedReader::from_chunks( + chunks.into_iter().map(|chunk| { + ( + chunk.digest.try_into().expect("invalid b3 digest"), + chunk.size, + ) + }), + Arc::new(self.clone()) as Arc<dyn BlobService>, + ); + Ok(Some(Box::new(chunked_reader))) + } + Err(e) => Err(e)?, + } + } + + /// Returns a BlobWriter, that'll internally wrap each write in a + /// [proto::BlobChunk], which is sent to the gRPC server. + #[instrument(skip_all)] + async fn open_write(&self) -> Box<dyn BlobWriter> { + // set up an mpsc channel passing around Bytes. + let (tx, rx) = tokio::sync::mpsc::channel::<bytes::Bytes>(10); + + // bytes arriving on the RX side are wrapped inside a + // [proto::BlobChunk], and a [ReceiverStream] is constructed. + let blobchunk_stream = ReceiverStream::new(rx).map(|x| proto::BlobChunk { data: x }); + + // spawn the gRPC put request, which will read from blobchunk_stream. 
+ let task = tokio::spawn({ + let mut grpc_client = self.grpc_client.clone(); + async move { Ok::<_, Status>(grpc_client.put(blobchunk_stream).await?.into_inner()) } + }); + + // The tx part of the channel is converted to a sink of byte chunks. + let sink = PollSender::new(tx) + .sink_map_err(|e| std::io::Error::new(std::io::ErrorKind::BrokenPipe, e)); + + // … which is turned into an [tokio::io::AsyncWrite]. + let writer = SinkWriter::new(CopyToBytes::new(sink)); + + Box::new(GRPCBlobWriter { + task_and_writer: Some((task, writer)), + digest: None, + }) + } + + #[instrument(skip(self, digest), fields(blob.digest=%digest), err)] + async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> { + let resp = self + .grpc_client + .clone() + .stat(proto::StatBlobRequest { + digest: digest.clone().into(), + send_chunks: true, + ..Default::default() + }) + .await; + + match resp { + Err(e) if e.code() == Code::NotFound => Ok(None), + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)), + Ok(resp) => { + let resp = resp.into_inner(); + + resp.validate() + .map_err(|e| std::io::Error::new(io::ErrorKind::InvalidData, e))?; + + Ok(Some(resp.chunks)) + } + } + } +} + +pub struct GRPCBlobWriter<W: tokio::io::AsyncWrite> { + /// The task containing the put request, and the inner writer, if we're still writing. + task_and_writer: Option<(JoinHandle<Result<proto::PutBlobResponse, Status>>, W)>, + + /// The digest that has been returned, if we successfully closed. + digest: Option<B3Digest>, +} + +#[async_trait] +impl<W: tokio::io::AsyncWrite + Send + Sync + Unpin + 'static> BlobWriter for GRPCBlobWriter<W> { + async fn close(&mut self) -> io::Result<B3Digest> { + if self.task_and_writer.is_none() { + // if we're already closed, return the b3 digest, which must exist. + // If it doesn't, we already closed and failed once, and didn't handle the error. 
+ match &self.digest { + Some(digest) => Ok(digest.clone()), + None => Err(io::Error::new(io::ErrorKind::BrokenPipe, "already closed")), + } + } else { + let (task, mut writer) = self.task_and_writer.take().unwrap(); + + // invoke shutdown, so the inner writer closes its internal tx side of + // the channel. + writer.shutdown().await?; + + // block on the RPC call to return. + // This ensures all chunks are sent out, and have been received by the + // backend. + + match task.await? { + Ok(resp) => { + // return the digest from the response, and store it in self.digest for subsequent closes. + let digest_len = resp.digest.len(); + let digest: B3Digest = resp.digest.try_into().map_err(|_| { + io::Error::new( + io::ErrorKind::Other, + format!("invalid root digest length {} in response", digest_len), + ) + })?; + self.digest = Some(digest.clone()); + Ok(digest) + } + Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())), + } + } + } +} + +impl<W: tokio::io::AsyncWrite + Unpin> tokio::io::AsyncWrite for GRPCBlobWriter<W> { + fn poll_write( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll<Result<usize, io::Error>> { + match &mut self.task_and_writer { + None => Poll::Ready(Err(io::Error::new( + io::ErrorKind::NotConnected, + "already closed", + ))), + Some((_, ref mut writer)) => { + let pinned_writer = pin!(writer); + pinned_writer.poll_write(cx, buf) + } + } + } + + fn poll_flush( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + match &mut self.task_and_writer { + None => Poll::Ready(Err(io::Error::new( + io::ErrorKind::NotConnected, + "already closed", + ))), + Some((_, ref mut writer)) => { + let pinned_writer = pin!(writer); + pinned_writer.poll_flush(cx) + } + } + } + + fn poll_shutdown( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + // 
TODO(raitobezarius): this might not be a graceful shutdown of the + // channel inside the gRPC connection. + Poll::Ready(Ok(())) + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tempfile::TempDir; + use tokio::net::UnixListener; + use tokio_retry::strategy::ExponentialBackoff; + use tokio_retry::Retry; + use tokio_stream::wrappers::UnixListenerStream; + + use crate::blobservice::MemoryBlobService; + use crate::fixtures; + use crate::proto::blob_service_client::BlobServiceClient; + use crate::proto::GRPCBlobServiceWrapper; + + use super::BlobService; + use super::GRPCBlobService; + + /// This ensures connecting via gRPC works as expected. + #[tokio::test] + async fn test_valid_unix_path_ping_pong() { + let tmpdir = TempDir::new().unwrap(); + let socket_path = tmpdir.path().join("daemon"); + + let path_clone = socket_path.clone(); + + // Spin up a server + tokio::spawn(async { + let uds = UnixListener::bind(path_clone).unwrap(); + let uds_stream = UnixListenerStream::new(uds); + + // spin up a new server + let mut server = tonic::transport::Server::builder(); + let router = + server.add_service(crate::proto::blob_service_server::BlobServiceServer::new( + GRPCBlobServiceWrapper::new( + Box::<MemoryBlobService>::default() as Box<dyn BlobService> + ), + )); + router.serve_with_incoming(uds_stream).await + }); + + // wait for the socket to be created + Retry::spawn( + ExponentialBackoff::from_millis(20).max_delay(Duration::from_secs(10)), + || async { + if socket_path.exists() { + Ok(()) + } else { + Err(()) + } + }, + ) + .await + .expect("failed to wait for socket"); + + // prepare a client + let grpc_client = { + let url = url::Url::parse(&format!( + "grpc+unix://{}?wait-connect=1", + socket_path.display() + )) + .expect("must parse"); + let client = BlobServiceClient::new( + crate::tonic::channel_from_url(&url) + .await + .expect("must succeed"), + ); + + GRPCBlobService::from_client(client) + }; + + let has = grpc_client + 
.has(&fixtures::BLOB_A_DIGEST) + .await + .expect("must not be err"); + + assert!(!has); + } +} diff --git a/tvix/castore/src/blobservice/memory.rs b/tvix/castore/src/blobservice/memory.rs new file mode 100644 index 0000000000..873d06b461 --- /dev/null +++ b/tvix/castore/src/blobservice/memory.rs @@ -0,0 +1,127 @@ +use parking_lot::RwLock; +use std::io::{self, Cursor, Write}; +use std::task::Poll; +use std::{collections::HashMap, sync::Arc}; +use tonic::async_trait; +use tracing::instrument; + +use super::{BlobReader, BlobService, BlobWriter}; +use crate::B3Digest; + +#[derive(Clone, Default)] +pub struct MemoryBlobService { + db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>, +} + +#[async_trait] +impl BlobService for MemoryBlobService { + #[instrument(skip_all, ret, err, fields(blob.digest=%digest))] + async fn has(&self, digest: &B3Digest) -> io::Result<bool> { + let db = self.db.read(); + Ok(db.contains_key(digest)) + } + + #[instrument(skip_all, err, fields(blob.digest=%digest))] + async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> { + let db = self.db.read(); + + match db.get(digest).map(|x| Cursor::new(x.clone())) { + Some(result) => Ok(Some(Box::new(result))), + None => Ok(None), + } + } + + #[instrument(skip_all)] + async fn open_write(&self) -> Box<dyn BlobWriter> { + Box::new(MemoryBlobWriter::new(self.db.clone())) + } +} + +pub struct MemoryBlobWriter { + db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>, + + /// Contains the buffer Vec and hasher, or None if already closed + writers: Option<(Vec<u8>, blake3::Hasher)>, + + /// The digest that has been returned, if we successfully closed. 
+ digest: Option<B3Digest>, +} + +impl MemoryBlobWriter { + fn new(db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>) -> Self { + Self { + db, + writers: Some((Vec::new(), blake3::Hasher::new())), + digest: None, + } + } +} +impl tokio::io::AsyncWrite for MemoryBlobWriter { + fn poll_write( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + b: &[u8], + ) -> std::task::Poll<Result<usize, io::Error>> { + Poll::Ready(match &mut self.writers { + None => Err(io::Error::new( + io::ErrorKind::NotConnected, + "already closed", + )), + Some((ref mut buf, ref mut hasher)) => { + let bytes_written = buf.write(b)?; + hasher.write(&b[..bytes_written]) + } + }) + } + + fn poll_flush( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + Poll::Ready(match self.writers { + None => Err(io::Error::new( + io::ErrorKind::NotConnected, + "already closed", + )), + Some(_) => Ok(()), + }) + } + + fn poll_shutdown( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + // shutdown is "instantaneous", we only write to memory. + Poll::Ready(Ok(())) + } +} + +#[async_trait] +impl BlobWriter for MemoryBlobWriter { + async fn close(&mut self) -> io::Result<B3Digest> { + if self.writers.is_none() { + match &self.digest { + Some(digest) => Ok(digest.clone()), + None => Err(io::Error::new(io::ErrorKind::BrokenPipe, "already closed")), + } + } else { + let (buf, hasher) = self.writers.take().unwrap(); + + let digest: B3Digest = hasher.finalize().as_bytes().into(); + + // Only insert if the blob doesn't already exist. + let mut db = self.db.upgradable_read(); + if !db.contains_key(&digest) { + // open the database for writing. + db.with_upgraded(|db| { + // and put buf in there. This will move buf out. 
+ db.insert(digest.clone(), buf); + }); + } + + self.digest = Some(digest.clone()); + + Ok(digest) + } + } +} diff --git a/tvix/castore/src/blobservice/mod.rs b/tvix/castore/src/blobservice/mod.rs new file mode 100644 index 0000000000..50acd40bf7 --- /dev/null +++ b/tvix/castore/src/blobservice/mod.rs @@ -0,0 +1,103 @@ +use std::io; +use tonic::async_trait; + +use crate::proto::stat_blob_response::ChunkMeta; +use crate::B3Digest; + +mod chunked_reader; +mod combinator; +mod from_addr; +mod grpc; +mod memory; +mod naive_seeker; +mod object_store; + +#[cfg(test)] +pub mod tests; + +pub use self::chunked_reader::ChunkedReader; +pub use self::combinator::CombinedBlobService; +pub use self::from_addr::from_addr; +pub use self::grpc::GRPCBlobService; +pub use self::memory::MemoryBlobService; +pub use self::object_store::ObjectStoreBlobService; + +/// The base trait all BlobService services need to implement. +/// It provides functions to check whether a given blob exists, +/// a way to read (and seek) a blob, and a method to create a blobwriter handle, +/// which will implement a writer interface, and also provides a close funtion, +/// to finalize a blob and get its digest. +#[async_trait] +pub trait BlobService: Send + Sync { + /// Check if the service has the blob, by its content hash. + /// On implementations returning chunks, this must also work for chunks. + async fn has(&self, digest: &B3Digest) -> io::Result<bool>; + + /// Request a blob from the store, by its content hash. + /// On implementations returning chunks, this must also work for chunks. + async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>>; + + /// Insert a new blob into the store. Returns a [BlobWriter], which + /// implements [tokio::io::AsyncWrite] and a [BlobWriter::close] to finalize + /// the blob and get its digest. + async fn open_write(&self) -> Box<dyn BlobWriter>; + + /// Return a list of chunks for a given blob. 
+ /// There's a distinction between returning Ok(None) and Ok(Some(vec![])). + /// The former return value is sent in case the blob is not present at all, + /// while the second one is sent in case there's no more granular chunks (or + /// the backend does not support chunking). + /// A default implementation checking for existence and then returning it + /// does not have more granular chunks available is provided. + async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> { + if !self.has(digest).await? { + return Ok(None); + } + // default implementation, signalling the backend does not have more + // granular chunks available. + Ok(Some(vec![])) + } +} + +#[async_trait] +impl<A> BlobService for A +where + A: AsRef<dyn BlobService> + Send + Sync, +{ + async fn has(&self, digest: &B3Digest) -> io::Result<bool> { + self.as_ref().has(digest).await + } + + async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> { + self.as_ref().open_read(digest).await + } + + async fn open_write(&self) -> Box<dyn BlobWriter> { + self.as_ref().open_write().await + } + + async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> { + self.as_ref().chunks(digest).await + } +} + +/// A [tokio::io::AsyncWrite] that the user needs to close() afterwards for persist. +/// On success, it returns the digest of the written blob. +#[async_trait] +pub trait BlobWriter: tokio::io::AsyncWrite + Send + Unpin { + /// Signal there's no more data to be written, and return the digest of the + /// contents written. + /// + /// Closing a already-closed BlobWriter is a no-op. + async fn close(&mut self) -> io::Result<B3Digest>; +} + +/// BlobReader is a [tokio::io::AsyncRead] that also allows seeking. +pub trait BlobReader: tokio::io::AsyncRead + tokio::io::AsyncSeek + Send + Unpin + 'static {} + +/// A [`io::Cursor<Vec<u8>>`] can be used as a BlobReader. 
pin_project! {
    /// This implements [tokio::io::AsyncSeek] for a [tokio::io::AsyncRead] by
    /// simply skipping over some bytes, keeping track of the position.
    /// It fails whenever you try to seek backwards.
    ///
    /// ## Pinning concerns:
    ///
    /// [NaiveSeeker] is itself pinned by callers, and we do not need to concern
    /// ourselves regarding that.
    ///
    /// Though, its fields as per
    /// <https://doc.rust-lang.org/std/pin/#pinning-is-not-structural-for-field>
    /// can be pinned or unpinned.
    ///
    /// So we need to go over each field and choose our policy carefully.
    ///
    /// The obvious cases are the bookkeeping integers we keep in the structure,
    /// those are private and not shared to anyone, we never build a
    /// `Pin<&mut X>` out of them at any point, therefore, we can safely never
    /// mark them as pinned. Of course, it is expected that no developer here
    /// attempts to `pin!(self.pos)` to pin them because it makes no sense. If
    /// they have to become pinned, they should be marked `#[pin]` and we need
    /// to discuss it.
    ///
    /// So the bookkeeping integers are in the right state with respect to their
    /// pinning status. The projection should offer direct access.
    ///
    /// On the `r` field, i.e. a `BufReader<R>`, given that
    /// <https://docs.rs/tokio/latest/tokio/io/struct.BufReader.html#impl-Unpin-for-BufReader%3CR%3E>
    /// is available, even a `Pin<&mut BufReader<R>>` can be safely moved.
    ///
    /// The only care we should have regards the internal reader itself, i.e.
    /// the `R` instance, see that Tokio decided to `#[pin]` it too:
    /// <https://docs.rs/tokio/latest/src/tokio/io/util/buf_reader.rs.html#29>
    ///
    /// In general, there's no `Unpin` instance for `R: tokio::io::AsyncRead`
    /// (see <https://docs.rs/tokio/latest/tokio/io/trait.AsyncRead.html>).
    ///
    /// Therefore, we could keep it unpinned and pin it at every call site
    /// whenever we need to call `poll_*`, which can be confusing to the
    /// non-expert developer. We also have a fair number of situations where
    /// the [BufReader] instance is naked, i.e. in its `&mut BufReader<R>`
    /// form. This is annoying because it could expose the naked `R`
    /// internal instance somehow and would produce a risk of making it move
    /// unexpectedly.
    ///
    /// We choose the path of least resistance: as we have no reason to have
    /// access to the raw `BufReader<R>` instance, we just `#[pin]` it too,
    /// enjoy its safe `poll_*` APIs, and push the unpinning concerns to the
    /// internal implementations themselves, which studied the question longer
    /// than us.
    pub struct NaiveSeeker<R: tokio::io::AsyncRead> {
        // The wrapped reader; structurally pinned so its poll_* methods can be
        // called through the projection.
        #[pin]
        r: tokio::io::BufReader<R>,
        // Number of bytes consumed from `r` so far (advanced by poll_read and
        // consume), i.e. the current position from the start of the stream.
        pos: u64,
        // Number of bytes that still need to be read and discarded to finish
        // a seek; set by start_seek, counted down by poll_complete.
        bytes_to_skip: u64,
    }
}
+const DISCARD_BUF_SIZE: usize = 4096; + +impl<R: tokio::io::AsyncRead> NaiveSeeker<R> { + pub fn new(r: R) -> Self { + NaiveSeeker { + r: tokio::io::BufReader::new(r), + pos: 0, + bytes_to_skip: 0, + } + } +} + +impl<R: tokio::io::AsyncRead> tokio::io::AsyncRead for NaiveSeeker<R> { + #[instrument(level = "trace", skip_all)] + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> Poll<std::io::Result<()>> { + // The amount of data read can be determined by the increase + // in the length of the slice returned by `ReadBuf::filled`. + let filled_before = buf.filled().len(); + + let this = self.project(); + ready!(this.r.poll_read(cx, buf))?; + + let bytes_read = buf.filled().len() - filled_before; + *this.pos += bytes_read as u64; + + trace!(bytes_read = bytes_read, new_pos = this.pos, "poll_read"); + + Ok(()).into() + } +} + +impl<R: tokio::io::AsyncRead> tokio::io::AsyncBufRead for NaiveSeeker<R> { + fn poll_fill_buf( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll<io::Result<&[u8]>> { + self.project().r.poll_fill_buf(cx) + } + + #[instrument(level = "trace", skip(self))] + fn consume(self: std::pin::Pin<&mut Self>, amt: usize) { + let this = self.project(); + this.r.consume(amt); + *this.pos += amt as u64; + + trace!(new_pos = this.pos, "consume"); + } +} + +impl<R: tokio::io::AsyncRead> tokio::io::AsyncSeek for NaiveSeeker<R> { + #[instrument(level="trace", skip(self), fields(inner_pos=%self.pos), err(Debug))] + fn start_seek( + self: std::pin::Pin<&mut Self>, + position: std::io::SeekFrom, + ) -> std::io::Result<()> { + let absolute_offset: u64 = match position { + io::SeekFrom::Start(start_offset) => { + if start_offset < self.pos { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + format!("can't seek backwards ({} -> {})", self.pos, start_offset), + )); + } else { + start_offset + } + } + // we don't know the total size, can't support this. 
+ io::SeekFrom::End(_end_offset) => { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "can't seek from end", + )); + } + io::SeekFrom::Current(relative_offset) => { + if relative_offset < 0 { + return Err(io::Error::new( + io::ErrorKind::Unsupported, + "can't seek backwards relative to current position", + )); + } else { + self.pos + relative_offset as u64 + } + } + }; + + // we already know absolute_offset is >= self.pos + debug_assert!( + absolute_offset >= self.pos, + "absolute_offset {} must be >= self.pos {}", + absolute_offset, + self.pos + ); + + // calculate bytes to skip + let this = self.project(); + *this.bytes_to_skip = absolute_offset - *this.pos; + + debug!(bytes_to_skip = *this.bytes_to_skip, "seek"); + + Ok(()) + } + + #[instrument(skip_all)] + fn poll_complete( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll<std::io::Result<u64>> { + if self.bytes_to_skip == 0 { + // return the new position (from the start of the stream) + return Poll::Ready(Ok(self.pos)); + } + + // discard some bytes, until pos is where we want it to be. + // We create a buffer that we'll discard later on. + let mut discard_buf = [0; DISCARD_BUF_SIZE]; + + // Loop until we've reached the desired seek position. This is done by issuing repeated + // `poll_read` calls. + // If the data is not available yet, we will yield back to the executor + // and wait to be polled again. + loop { + if self.bytes_to_skip == 0 { + return Poll::Ready(Ok(self.pos)); + } + + // calculate the length we want to skip at most, which is either a max + // buffer size, or the number of remaining bytes to read, whatever is + // smaller. 
+ let bytes_to_skip_now = std::cmp::min(self.bytes_to_skip as usize, discard_buf.len()); + let mut discard_buf = tokio::io::ReadBuf::new(&mut discard_buf[..bytes_to_skip_now]); + + ready!(self.as_mut().poll_read(cx, &mut discard_buf))?; + let bytes_skipped = discard_buf.filled().len(); + + if bytes_skipped == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "got EOF while trying to skip bytes", + ))); + } + // decrement bytes to skip. The poll_read call already updated self.pos. + *self.as_mut().project().bytes_to_skip -= bytes_skipped as u64; + } + } +} + +impl<R: tokio::io::AsyncRead + Send + Unpin + 'static> BlobReader for NaiveSeeker<R> {} + +#[cfg(test)] +mod tests { + use super::{NaiveSeeker, DISCARD_BUF_SIZE}; + use std::io::{Cursor, SeekFrom}; + use tokio::io::{AsyncReadExt, AsyncSeekExt}; + + /// This seek requires multiple `poll_read` as we use a multiples of + /// DISCARD_BUF_SIZE when doing the seek. + /// This ensures we don't hang indefinitely. 
+ #[tokio::test] + async fn seek() { + let buf = vec![0u8; DISCARD_BUF_SIZE * 4]; + let reader = Cursor::new(&buf); + let mut seeker = NaiveSeeker::new(reader); + seeker.seek(SeekFrom::Start(4000)).await.unwrap(); + } + + #[tokio::test] + async fn seek_read() { + let mut buf = vec![0u8; DISCARD_BUF_SIZE * 2]; + buf.extend_from_slice(&[1u8; DISCARD_BUF_SIZE * 2]); + buf.extend_from_slice(&[2u8; DISCARD_BUF_SIZE * 2]); + + let reader = Cursor::new(&buf); + let mut seeker = NaiveSeeker::new(reader); + + let mut read_buf = vec![0u8; DISCARD_BUF_SIZE]; + seeker.read_exact(&mut read_buf).await.expect("must read"); + assert_eq!(read_buf.as_slice(), &[0u8; DISCARD_BUF_SIZE]); + + seeker + .seek(SeekFrom::Current(DISCARD_BUF_SIZE as i64)) + .await + .expect("must seek"); + seeker.read_exact(&mut read_buf).await.expect("must read"); + assert_eq!(read_buf.as_slice(), &[1u8; DISCARD_BUF_SIZE]); + + seeker + .seek(SeekFrom::Start(2 * 2 * DISCARD_BUF_SIZE as u64)) + .await + .expect("must seek"); + seeker.read_exact(&mut read_buf).await.expect("must read"); + assert_eq!(read_buf.as_slice(), &[2u8; DISCARD_BUF_SIZE]); + } +} diff --git a/tvix/castore/src/blobservice/object_store.rs b/tvix/castore/src/blobservice/object_store.rs new file mode 100644 index 0000000000..d2d0a288a5 --- /dev/null +++ b/tvix/castore/src/blobservice/object_store.rs @@ -0,0 +1,546 @@ +use std::{ + io::{self, Cursor}, + pin::pin, + sync::Arc, + task::Poll, +}; + +use data_encoding::HEXLOWER; +use fastcdc::v2020::AsyncStreamCDC; +use futures::Future; +use object_store::{path::Path, ObjectStore}; +use pin_project_lite::pin_project; +use prost::Message; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_stream::StreamExt; +use tonic::async_trait; +use tracing::{debug, instrument, trace, Level}; +use url::Url; + +use crate::{ + proto::{stat_blob_response::ChunkMeta, StatBlobResponse}, + B3Digest, B3HashingReader, +}; + +use super::{BlobReader, BlobService, BlobWriter, ChunkedReader}; + 
+#[derive(Clone)] +pub struct ObjectStoreBlobService { + object_store: Arc<dyn ObjectStore>, + base_path: Path, + + /// Average chunk size for FastCDC, in bytes. + /// min value is half, max value double of that number. + avg_chunk_size: u32, +} + +/// Uses any object storage supported by the [object_store] crate to provide a +/// tvix-castore [BlobService]. +/// +/// # Data format +/// Data is organized in "blobs" and "chunks". +/// Blobs don't hold the actual data, but instead contain a list of more +/// granular chunks that assemble to the contents requested. +/// This allows clients to seek, and not download chunks they already have +/// locally, as it's referred to from other files. +/// Check `rpc_blobstore` and more general BlobStore docs on that. +/// +/// ## Blobs +/// Stored at `${base_path}/blobs/b3/$digest_key`. They contains the serialized +/// StatBlobResponse for the blob with the digest. +/// +/// ## Chunks +/// Chunks are stored at `${base_path}/chunks/b3/$digest_key`. They contain +/// the literal contents of the chunk, but are zstd-compressed. +/// +/// ## Digest key sharding +/// The blake3 digest encoded in lower hex, and sharded after the second +/// character. +/// The blob for "Hello World" is stored at +/// `${base_path}/blobs/b3/41/41f8394111eb713a22165c46c90ab8f0fd9399c92028fd6d288944b23ff5bf76`. +/// +/// This reduces the number of files in the same directory, which would be a +/// problem at least when using [object_store::local::LocalFileSystem]. +/// +/// # Future changes +/// There's no guarantees about this being a final format yet. +/// Once object_store gets support for additional metadata / content-types, +/// we can eliminate some requests (small blobs only consisting of a single +/// chunk can be stored as-is, without the blob index file). +/// It also allows signalling any compression of chunks in the content-type. 
+/// Migration *should* be possible by simply adding the right content-types to +/// all keys stored so far, but no promises ;-) +impl ObjectStoreBlobService { + /// Constructs a new [ObjectStoreBlobService] from a [Url] supported by + /// [object_store]. + /// Any path suffix becomes the base path of the object store. + /// additional options, the same as in [object_store::parse_url_opts] can + /// be passed. + pub fn parse_url_opts<I, K, V>(url: &Url, options: I) -> Result<Self, object_store::Error> + where + I: IntoIterator<Item = (K, V)>, + K: AsRef<str>, + V: Into<String>, + { + let (object_store, path) = object_store::parse_url_opts(url, options)?; + + Ok(Self { + object_store: Arc::new(object_store), + base_path: path, + avg_chunk_size: 256 * 1024, + }) + } + + /// Like [Self::parse_url_opts], except without the options. + pub fn parse_url(url: &Url) -> Result<Self, object_store::Error> { + Self::parse_url_opts(url, Vec::<(String, String)>::new()) + } +} + +#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))] +fn derive_blob_path(base_path: &Path, digest: &B3Digest) -> Path { + base_path + .child("blobs") + .child("b3") + .child(HEXLOWER.encode(&digest.as_slice()[..2])) + .child(HEXLOWER.encode(digest.as_slice())) +} + +#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,chunk.digest=%digest),ret(Display))] +fn derive_chunk_path(base_path: &Path, digest: &B3Digest) -> Path { + base_path + .child("chunks") + .child("b3") + .child(HEXLOWER.encode(&digest.as_slice()[..2])) + .child(HEXLOWER.encode(digest.as_slice())) +} + +#[async_trait] +impl BlobService for ObjectStoreBlobService { + #[instrument(skip_all, ret, err, fields(blob.digest=%digest))] + async fn has(&self, digest: &B3Digest) -> io::Result<bool> { + // TODO: clarify if this should work for chunks or not, and explicitly + // document in the proto docs. 
+ let p = derive_blob_path(&self.base_path, digest); + + match self.object_store.head(&p).await { + Ok(_) => Ok(true), + Err(object_store::Error::NotFound { .. }) => { + let p = derive_chunk_path(&self.base_path, digest); + match self.object_store.head(&p).await { + Ok(_) => Ok(true), + Err(object_store::Error::NotFound { .. }) => Ok(false), + Err(e) => Err(e)?, + } + } + Err(e) => Err(e)?, + } + } + + #[instrument(skip_all, err, fields(blob.digest=%digest))] + async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> { + // handle reading the empty blob. + if digest.as_slice() == blake3::hash(b"").as_bytes() { + return Ok(Some(Box::new(Cursor::new(b"")) as Box<dyn BlobReader>)); + } + match self + .object_store + .get(&derive_chunk_path(&self.base_path, digest)) + .await + { + Ok(res) => { + // handle reading blobs that are small enough to fit inside a single chunk: + // fetch the entire chunk into memory, decompress, ensure the b3 digest matches, + // and return a io::Cursor over that data. + // FUTUREWORK: use zstd::bulk to prevent decompression bombs + + let chunk_raw_bytes = res.bytes().await?; + let chunk_contents = zstd::stream::decode_all(Cursor::new(chunk_raw_bytes))?; + + if *digest != blake3::hash(&chunk_contents).as_bytes().into() { + Err(io::Error::other("chunk contents invalid"))?; + } + + Ok(Some(Box::new(Cursor::new(chunk_contents)))) + } + Err(object_store::Error::NotFound { .. }) => { + // NOTE: For public-facing things, we would want to stop here. + // Clients should fetch granularly, so they can make use of + // chunks they have locally. + // However, if this is used directly, without any caches, do the + // assembly here. + // This is subject to change, once we have store composition. + // TODO: make this configurable, and/or clarify behaviour for + // the gRPC server surface (explicitly document behaviour in the + // proto docs) + if let Some(chunks) = self.chunks(digest).await? 
{ + let chunked_reader = ChunkedReader::from_chunks( + chunks.into_iter().map(|chunk| { + ( + chunk.digest.try_into().expect("invalid b3 digest"), + chunk.size, + ) + }), + Arc::new(self.clone()) as Arc<dyn BlobService>, + ); + + Ok(Some(Box::new(chunked_reader))) + } else { + // This is neither a chunk nor a blob, return None. + Ok(None) + } + } + Err(e) => Err(e.into()), + } + } + + #[instrument(skip_all)] + async fn open_write(&self) -> Box<dyn BlobWriter> { + // ObjectStoreBlobWriter implements AsyncWrite, but all the chunking + // needs an AsyncRead, so we create a pipe here. + // In its `AsyncWrite` implementation, `ObjectStoreBlobWriter` delegates + // writes to w. It periodically polls the future that's reading from the + // other side. + let (w, r) = tokio::io::duplex(self.avg_chunk_size as usize * 10); + + Box::new(ObjectStoreBlobWriter { + writer: Some(w), + fut: Some(Box::pin(chunk_and_upload( + r, + self.object_store.clone(), + self.base_path.clone(), + self.avg_chunk_size / 2, + self.avg_chunk_size, + self.avg_chunk_size * 2, + ))), + fut_output: None, + }) + } + + #[instrument(skip_all, err, fields(blob.digest=%digest))] + async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> { + match self + .object_store + .get(&derive_blob_path(&self.base_path, digest)) + .await + { + Ok(get_result) => { + // fetch the data at the blob path + let blob_data = get_result.bytes().await?; + // parse into StatBlobResponse + let stat_blob_response: StatBlobResponse = StatBlobResponse::decode(blob_data)?; + + debug!( + chunk.count = stat_blob_response.chunks.len(), + blob.size = stat_blob_response + .chunks + .iter() + .map(|x| x.size) + .sum::<u64>(), + "found more granular chunks" + ); + + Ok(Some(stat_blob_response.chunks)) + } + Err(object_store::Error::NotFound { .. }) => { + // If there's only a chunk, we must return the empty vec here, rather than None. 
+ match self + .object_store + .head(&derive_chunk_path(&self.base_path, digest)) + .await + { + Ok(_) => { + // present, but no more chunks available + debug!("found a single chunk"); + Ok(Some(vec![])) + } + Err(object_store::Error::NotFound { .. }) => { + // Neither blob nor single chunk found + debug!("not found"); + Ok(None) + } + // error checking for chunk + Err(e) => Err(e.into()), + } + } + // error checking for blob + Err(err) => Err(err.into()), + } + } +} + +/// Reads blob contents from a AsyncRead, chunks and uploads them. +/// On success, returns a [StatBlobResponse] pointing to the individual chunks. +#[instrument(skip_all, fields(base_path=%base_path, min_chunk_size, avg_chunk_size, max_chunk_size), err)] +async fn chunk_and_upload<R: AsyncRead + Unpin>( + r: R, + object_store: Arc<dyn ObjectStore>, + base_path: Path, + min_chunk_size: u32, + avg_chunk_size: u32, + max_chunk_size: u32, +) -> io::Result<B3Digest> { + // wrap reader with something calculating the blake3 hash of all data read. + let mut b3_r = B3HashingReader::from(r); + // set up a fastcdc chunker + let mut chunker = + AsyncStreamCDC::new(&mut b3_r, min_chunk_size, avg_chunk_size, max_chunk_size); + + /// This really should just belong into the closure at + /// `chunker.as_stream().then(|_| { … })``, but if we try to, rustc spits + /// higher-ranked lifetime errors at us. + async fn fastcdc_chunk_uploader( + resp: Result<fastcdc::v2020::ChunkData, fastcdc::v2020::Error>, + base_path: Path, + object_store: Arc<dyn ObjectStore>, + ) -> std::io::Result<ChunkMeta> { + let chunk_data = resp?; + let chunk_digest: B3Digest = blake3::hash(&chunk_data.data).as_bytes().into(); + let chunk_path = derive_chunk_path(&base_path, &chunk_digest); + + upload_chunk(object_store, chunk_digest, chunk_path, chunk_data.data).await + } + + // Use the fastcdc chunker to produce a stream of chunks, and upload these + // that don't exist to the backend. 
+ let chunks = chunker + .as_stream() + .then(|resp| fastcdc_chunk_uploader(resp, base_path.clone(), object_store.clone())) + .collect::<io::Result<Vec<ChunkMeta>>>() + .await?; + + let stat_blob_response = StatBlobResponse { + chunks, + bao: "".into(), // still todo + }; + + // check for Blob, if it doesn't exist, persist. + let blob_digest: B3Digest = b3_r.digest().into(); + let blob_path = derive_blob_path(&base_path, &blob_digest); + + match object_store.head(&blob_path).await { + // blob already exists, nothing to do + Ok(_) => { + trace!( + blob.digest = %blob_digest, + blob.path = %blob_path, + "blob already exists on backend" + ); + } + // chunk does not yet exist, upload first + Err(object_store::Error::NotFound { .. }) => { + debug!( + blob.digest = %blob_digest, + blob.path = %blob_path, + "uploading blob" + ); + object_store + .put(&blob_path, stat_blob_response.encode_to_vec().into()) + .await?; + } + Err(err) => { + // other error + Err(err)? + } + } + + Ok(blob_digest) +} + +/// upload chunk if it doesn't exist yet. +#[instrument(skip_all, fields(chunk.digest = %chunk_digest, chunk.size = chunk_data.len(), chunk.path = %chunk_path), err)] +async fn upload_chunk( + object_store: Arc<dyn ObjectStore>, + chunk_digest: B3Digest, + chunk_path: Path, + chunk_data: Vec<u8>, +) -> std::io::Result<ChunkMeta> { + let chunk_size = chunk_data.len(); + match object_store.head(&chunk_path).await { + // chunk already exists, nothing to do + Ok(_) => { + debug!("chunk already exists"); + } + + // chunk does not yet exist, compress and upload. + Err(object_store::Error::NotFound { .. 
}) => { + let chunk_data_compressed = + zstd::encode_all(Cursor::new(chunk_data), zstd::DEFAULT_COMPRESSION_LEVEL)?; + + debug!(chunk.compressed_size=%chunk_data_compressed.len(), "uploading chunk"); + + object_store + .as_ref() + .put(&chunk_path, chunk_data_compressed.into()) + .await?; + } + // other error + Err(err) => Err(err)?, + } + + Ok(ChunkMeta { + digest: chunk_digest.into(), + size: chunk_size as u64, + }) +} + +pin_project! { + /// Takes care of blob uploads. + /// All writes are relayed to self.writer, and we continuously poll the + /// future (which will internally read from the other side of the pipe and + /// upload chunks). + /// Our BlobWriter::close() needs to drop self.writer, so the other side + /// will read EOF and can finalize the blob. + /// The future should then resolve and return the blob digest. + pub struct ObjectStoreBlobWriter<W, Fut> + where + W: AsyncWrite, + Fut: Future, + { + #[pin] + writer: Option<W>, + + #[pin] + fut: Option<Fut>, + + fut_output: Option<io::Result<B3Digest>> + } +} + +impl<W, Fut> tokio::io::AsyncWrite for ObjectStoreBlobWriter<W, Fut> +where + W: AsyncWrite + Send + Unpin, + Fut: Future, +{ + fn poll_write( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &[u8], + ) -> std::task::Poll<Result<usize, io::Error>> { + let this = self.project(); + // poll the future. + let fut = this.fut.as_pin_mut().expect("not future"); + let fut_p = fut.poll(cx); + // if it's ready, the only way this could have happened is that the + // upload failed, because we're only closing `self.writer` after all + // writes happened. 
+ if fut_p.is_ready() { + return Poll::Ready(Err(io::Error::other("upload failed"))); + } + + // write to the underlying writer + this.writer + .as_pin_mut() + .expect("writer must be some") + .poll_write(cx, buf) + } + + fn poll_flush( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + let this = self.project(); + // poll the future. + let fut = this.fut.as_pin_mut().expect("not future"); + let fut_p = fut.poll(cx); + // if it's ready, the only way this could have happened is that the + // upload failed, because we're only closing `self.writer` after all + // writes happened. + if fut_p.is_ready() { + return Poll::Ready(Err(io::Error::other("upload failed"))); + } + + // Call poll_flush on the writer + this.writer + .as_pin_mut() + .expect("writer must be some") + .poll_flush(cx) + } + + fn poll_shutdown( + self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll<Result<(), io::Error>> { + // There's nothing to do on shutdown. We might have written some chunks + // that are nowhere else referenced, but cleaning them up here would be racy. + std::task::Poll::Ready(Ok(())) + } +} + +#[async_trait] +impl<W, Fut> BlobWriter for ObjectStoreBlobWriter<W, Fut> +where + W: AsyncWrite + Send + Unpin, + Fut: Future<Output = io::Result<B3Digest>> + Send + Unpin, +{ + async fn close(&mut self) -> io::Result<B3Digest> { + match self.writer.take() { + Some(mut writer) => { + // shut down the writer, so the other side will read EOF. + writer.shutdown().await?; + + // take out the future. + let fut = self.fut.take().expect("fut must be some"); + // await it. + let resp = pin!(fut).await; + + match resp.as_ref() { + // In the case of an Ok value, we store it in self.fut_output, + // so future calls to close can return that. 
+ Ok(b3_digest) => { + self.fut_output = Some(Ok(b3_digest.clone())); + } + Err(e) => { + // for the error type, we need to cheat a bit, as + // they're not clone-able. + // Simply store a sloppy clone, with the same ErrorKind and message there. + self.fut_output = Some(Err(std::io::Error::new(e.kind(), e.to_string()))) + } + } + resp + } + None => { + // called a second time, return self.fut_output. + match self.fut_output.as_ref().unwrap() { + Ok(ref b3_digest) => Ok(b3_digest.clone()), + Err(e) => Err(std::io::Error::new(e.kind(), e.to_string())), + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::chunk_and_upload; + use crate::{ + blobservice::{BlobService, ObjectStoreBlobService}, + fixtures::{BLOB_A, BLOB_A_DIGEST}, + }; + use std::{io::Cursor, sync::Arc}; + use url::Url; + + /// Tests chunk_and_upload directly, bypassing the BlobWriter at open_write(). + #[tokio::test] + async fn test_chunk_and_upload() { + let blobsvc = Arc::new( + ObjectStoreBlobService::parse_url(&Url::parse("memory:///").unwrap()).unwrap(), + ); + + let blob_digest = chunk_and_upload( + &mut Cursor::new(BLOB_A.to_vec()), + blobsvc.object_store.clone(), + object_store::path::Path::from("/"), + 1024 / 2, + 1024, + 1024 * 2, + ) + .await + .expect("chunk_and_upload succeeds"); + + assert_eq!(BLOB_A_DIGEST.clone(), blob_digest); + + // Now we should have the blob + assert!(blobsvc.has(&BLOB_A_DIGEST).await.unwrap()); + } +} diff --git a/tvix/castore/src/blobservice/tests/mod.rs b/tvix/castore/src/blobservice/tests/mod.rs new file mode 100644 index 0000000000..0280faebb1 --- /dev/null +++ b/tvix/castore/src/blobservice/tests/mod.rs @@ -0,0 +1,253 @@ +//! This contains test scenarios that a given [BlobService] needs to pass. +//! We use [rstest] and [rstest_reuse] to provide all services we want to test +//! against, and then apply this template to all test functions. 
+ +use rstest::*; +use rstest_reuse::{self, *}; +use std::io; +use tokio::io::AsyncReadExt; +use tokio::io::AsyncSeekExt; + +use super::BlobService; +use crate::blobservice; +use crate::fixtures::BLOB_A; +use crate::fixtures::BLOB_A_DIGEST; +use crate::fixtures::BLOB_B; +use crate::fixtures::BLOB_B_DIGEST; + +mod utils; +use self::utils::make_grpc_blob_service_client; + +/// This produces a template, which will be applied to all individual test functions. +/// See https://github.com/la10736/rstest/issues/130#issuecomment-968864832 +#[template] +#[rstest] +#[case::grpc(make_grpc_blob_service_client().await)] +#[case::memory(blobservice::from_addr("memory://").await.unwrap())] +#[case::objectstore_memory(blobservice::from_addr("objectstore+memory://").await.unwrap())] +pub fn blob_services(#[case] blob_service: impl BlobService) {} + +/// Using [BlobService::has] on a non-existing blob should return false. +#[apply(blob_services)] +#[tokio::test] +async fn has_nonexistent_false(blob_service: impl BlobService) { + assert!(!blob_service + .has(&BLOB_A_DIGEST) + .await + .expect("must not fail")); +} + +/// Using [BlobService::chunks] on a non-existing blob should return Ok(None) +#[apply(blob_services)] +#[tokio::test] +async fn chunks_nonexistent_false(blob_service: impl BlobService) { + assert!(blob_service + .chunks(&BLOB_A_DIGEST) + .await + .expect("must be ok") + .is_none()); +} + +// TODO: do tests with `chunks` + +/// Trying to read a non-existing blob should return a None instead of a reader. +#[apply(blob_services)] +#[tokio::test] +async fn not_found_read(blob_service: impl BlobService) { + assert!(blob_service + .open_read(&BLOB_A_DIGEST) + .await + .expect("must not fail") + .is_none()) +} + +/// Put a blob in the store, check has, get it back. 
+#[apply(blob_services)] +// #[case::small(&fixtures::BLOB_A, &fixtures::BLOB_A_DIGEST)] +// #[case::big(&fixtures::BLOB_B, &fixtures::BLOB_B_DIGEST)] +#[tokio::test] +async fn put_has_get(blob_service: impl BlobService) { + // TODO: figure out how to instantiate this with BLOB_A and BLOB_B, as two separate cases + for (blob_contents, blob_digest) in &[ + (&*BLOB_A, BLOB_A_DIGEST.clone()), + (&*BLOB_B, BLOB_B_DIGEST.clone()), + ] { + let mut w = blob_service.open_write().await; + + let l = tokio::io::copy(&mut io::Cursor::new(blob_contents), &mut w) + .await + .expect("copy must succeed"); + assert_eq!( + blob_contents.len(), + l as usize, + "written bytes must match blob length" + ); + + let digest = w.close().await.expect("close must succeed"); + + assert_eq!(*blob_digest, digest, "returned digest must be correct"); + + assert!( + blob_service.has(blob_digest).await.expect("must not fail"), + "blob service should now have the blob" + ); + + let mut r = blob_service + .open_read(blob_digest) + .await + .expect("open_read must succeed") + .expect("must be some"); + + let mut buf: Vec<u8> = Vec::new(); + let mut pinned_reader = std::pin::pin!(r); + let l = tokio::io::copy(&mut pinned_reader, &mut buf) + .await + .expect("copy must succeed"); + + assert_eq!( + blob_contents.len(), + l as usize, + "read bytes must match blob length" + ); + + assert_eq!(&blob_contents[..], &buf, "read blob contents must match"); + } +} + +/// Put a blob in the store, and seek inside it a bit. 
+#[apply(blob_services)] +#[tokio::test] +async fn put_seek(blob_service: impl BlobService) { + let mut w = blob_service.open_write().await; + + tokio::io::copy(&mut io::Cursor::new(&BLOB_B.to_vec()), &mut w) + .await + .expect("copy must succeed"); + w.close().await.expect("close must succeed"); + + // open a blob for reading + let mut r = blob_service + .open_read(&BLOB_B_DIGEST) + .await + .expect("open_read must succeed") + .expect("must be some"); + + let mut pos: u64 = 0; + + // read the first 10 bytes, they must match the data in the fixture. + { + let mut buf = [0; 10]; + r.read_exact(&mut buf).await.expect("must succeed"); + + assert_eq!( + &BLOB_B[pos as usize..pos as usize + buf.len()], + buf, + "expected first 10 bytes to match" + ); + + pos += buf.len() as u64; + } + // seek by 0 bytes, using SeekFrom::Start. + let p = r + .seek(io::SeekFrom::Start(pos)) + .await + .expect("must not fail"); + assert_eq!(pos, p); + + // read the next 10 bytes, they must match the data in the fixture. + { + let mut buf = [0; 10]; + r.read_exact(&mut buf).await.expect("must succeed"); + + assert_eq!( + &BLOB_B[pos as usize..pos as usize + buf.len()], + buf, + "expected data to match" + ); + + pos += buf.len() as u64; + } + + // seek by 5 bytes, using SeekFrom::Start. + let p = r + .seek(io::SeekFrom::Start(pos + 5)) + .await + .expect("must not fail"); + pos += 5; + assert_eq!(pos, p); + + // read the next 10 bytes, they must match the data in the fixture. + { + let mut buf = [0; 10]; + r.read_exact(&mut buf).await.expect("must succeed"); + + assert_eq!( + &BLOB_B[pos as usize..pos as usize + buf.len()], + buf, + "expected data to match" + ); + + pos += buf.len() as u64; + } + + // seek by 12345 bytes, using SeekFrom::Current. + let p = r + .seek(io::SeekFrom::Current(12345)) + .await + .expect("must not fail"); + pos += 12345; + assert_eq!(pos, p); + + // read the next 10 bytes, they must match the data in the fixture. 
+ { + let mut buf = [0; 10]; + r.read_exact(&mut buf).await.expect("must succeed"); + + assert_eq!( + &BLOB_B[pos as usize..pos as usize + buf.len()], + buf, + "expected data to match" + ); + + #[allow(unused_assignments)] + { + pos += buf.len() as u64; + } + } + + // seeking to the end is okay… + let p = r + .seek(io::SeekFrom::Start(BLOB_B.len() as u64)) + .await + .expect("must not fail"); + pos = BLOB_B.len() as u64; + assert_eq!(pos, p); + + { + // but it returns no more data. + let mut buf: Vec<u8> = Vec::new(); + r.read_to_end(&mut buf).await.expect("must not fail"); + assert!(buf.is_empty(), "expected no more data to be read"); + } + + // seeking past the end… + // should either be ok, but then return 0 bytes. + // this matches the behaviour of a Cursor<Vec<u8>>. + if let Ok(_pos) = r.seek(io::SeekFrom::Start(BLOB_B.len() as u64 + 1)).await { + let mut buf: Vec<u8> = Vec::new(); + r.read_to_end(&mut buf).await.expect("must not fail"); + assert!(buf.is_empty(), "expected no more data to be read"); + } + // or not be okay. + + // TODO: this is only broken for the gRPC version + // We expect seeking backwards or relative to the end to fail. 
+ // r.seek(io::SeekFrom::Current(-1)) + // .expect_err("SeekFrom::Current(-1) expected to fail"); + + // r.seek(io::SeekFrom::Start(pos - 1)) + // .expect_err("SeekFrom::Start(pos-1) expected to fail"); + + // r.seek(io::SeekFrom::End(0)) + // .expect_err("SeekFrom::End(_) expected to fail"); +} diff --git a/tvix/castore/src/blobservice/tests/utils.rs b/tvix/castore/src/blobservice/tests/utils.rs new file mode 100644 index 0000000000..706c4b5e43 --- /dev/null +++ b/tvix/castore/src/blobservice/tests/utils.rs @@ -0,0 +1,41 @@ +use crate::blobservice::{BlobService, MemoryBlobService}; +use crate::proto::blob_service_client::BlobServiceClient; +use crate::proto::GRPCBlobServiceWrapper; +use crate::{blobservice::GRPCBlobService, proto::blob_service_server::BlobServiceServer}; +use tonic::transport::{Endpoint, Server, Uri}; + +/// Constructs and returns a gRPC BlobService. +/// The server part is a [MemoryBlobService], exposed via the +/// [GRPCBlobServiceWrapper], and connected through a DuplexStream. +pub async fn make_grpc_blob_service_client() -> Box<dyn BlobService> { + let (left, right) = tokio::io::duplex(64); + + // spin up a server, which will only connect once, to the left side. + tokio::spawn(async { + let blob_service = Box::<MemoryBlobService>::default() as Box<dyn BlobService>; + + // spin up a new BlobService server + let mut server = Server::builder(); + let router = server.add_service(BlobServiceServer::new(GRPCBlobServiceWrapper::new( + blob_service, + ))); + + router + .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(left))) + .await + }); + + // Create a client, connecting to the right side. The URI is unused. 
+ let mut maybe_right = Some(right); + + Box::new(GRPCBlobService::from_client(BlobServiceClient::new( + Endpoint::try_from("http://[::]:50051") + .unwrap() + .connect_with_connector(tower::service_fn(move |_: Uri| { + let right = maybe_right.take().unwrap(); + async move { Ok::<_, std::io::Error>(right) } + })) + .await + .unwrap(), + ))) +} diff --git a/tvix/castore/src/digests.rs b/tvix/castore/src/digests.rs new file mode 100644 index 0000000000..2311c95c4d --- /dev/null +++ b/tvix/castore/src/digests.rs @@ -0,0 +1,86 @@ +use bytes::Bytes; +use data_encoding::BASE64; +use thiserror::Error; + +#[derive(PartialEq, Eq, Hash)] +pub struct B3Digest(Bytes); + +// TODO: allow converting these errors to crate::Error +#[derive(Error, Debug)] +pub enum Error { + #[error("invalid digest length: {0}")] + InvalidDigestLen(usize), +} + +pub const B3_LEN: usize = 32; + +impl B3Digest { + pub fn as_slice(&self) -> &[u8] { + &self.0[..] + } +} + +impl From<B3Digest> for bytes::Bytes { + fn from(val: B3Digest) -> Self { + val.0 + } +} + +impl From<digest::Output<blake3::Hasher>> for B3Digest { + fn from(value: digest::Output<blake3::Hasher>) -> Self { + let v = Into::<[u8; B3_LEN]>::into(value); + Self(Bytes::copy_from_slice(&v)) + } +} + +impl TryFrom<Vec<u8>> for B3Digest { + type Error = Error; + + // constructs a [B3Digest] from a [Vec<u8>]. + // Returns an error if the digest has the wrong length. + fn try_from(value: Vec<u8>) -> Result<Self, Self::Error> { + if value.len() != B3_LEN { + Err(Error::InvalidDigestLen(value.len())) + } else { + Ok(Self(value.into())) + } + } +} + +impl TryFrom<bytes::Bytes> for B3Digest { + type Error = Error; + + // constructs a [B3Digest] from a [bytes::Bytes]. + // Returns an error if the digest has the wrong length. 
+ fn try_from(value: bytes::Bytes) -> Result<Self, Self::Error> { + if value.len() != B3_LEN { + Err(Error::InvalidDigestLen(value.len())) + } else { + Ok(Self(value)) + } + } +} + +impl From<&[u8; B3_LEN]> for B3Digest { + fn from(value: &[u8; B3_LEN]) -> Self { + Self(value.to_vec().into()) + } +} + +impl Clone for B3Digest { + fn clone(&self) -> Self { + Self(self.0.to_owned()) + } +} + +impl std::fmt::Display for B3Digest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "b3:{}", BASE64.encode(&self.0)) + } +} + +impl std::fmt::Debug for B3Digest { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "b3:{}", BASE64.encode(&self.0)) + } +} diff --git a/tvix/castore/src/directoryservice/bigtable.rs b/tvix/castore/src/directoryservice/bigtable.rs new file mode 100644 index 0000000000..1194c6ddc9 --- /dev/null +++ b/tvix/castore/src/directoryservice/bigtable.rs @@ -0,0 +1,357 @@ +use bigtable_rs::{bigtable, google::bigtable::v2 as bigtable_v2}; +use bytes::Bytes; +use data_encoding::HEXLOWER; +use futures::stream::BoxStream; +use prost::Message; +use serde::{Deserialize, Serialize}; +use serde_with::{serde_as, DurationSeconds}; +use tonic::async_trait; +use tracing::{instrument, trace, warn}; + +use super::{utils::traverse_directory, DirectoryPutter, DirectoryService, SimplePutter}; +use crate::{proto, B3Digest, Error}; + +/// There should not be more than 10 MiB in a single cell. +/// https://cloud.google.com/bigtable/docs/schema-design#cells +const CELL_SIZE_LIMIT: u64 = 10 * 1024 * 1024; + +/// Provides a [DirectoryService] implementation using +/// [Bigtable](https://cloud.google.com/bigtable/docs/) +/// as an underlying K/V store. +/// +/// # Data format +/// We use Bigtable as a plain K/V store. +/// The row key is the digest of the directory, in hexlower. +/// Inside the row, we currently have a single column/cell, again using the +/// hexlower directory digest. 
+/// Its value is the Directory message, serialized in canonical protobuf. +/// We currently only populate this column. +/// +/// In the future, we might want to introduce "bucketing", essentially storing +/// all directories inserted via `put_multiple_start` in a batched form. +/// This will prevent looking up intermediate Directories, which are not +/// directly at the root, so rely on store composition. +#[derive(Clone)] +pub struct BigtableDirectoryService { + client: bigtable::BigTable, + params: BigtableParameters, + + #[cfg(test)] + #[allow(dead_code)] + /// Holds the temporary directory containing the unix socket, and the + /// spawned emulator process. + emulator: std::sync::Arc<(tempfile::TempDir, async_process::Child)>, +} + +/// Represents configuration of [BigtableDirectoryService]. +/// This currently conflates both connect parameters and data model/client +/// behaviour parameters. +#[serde_as] +#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)] +pub struct BigtableParameters { + project_id: String, + instance_name: String, + #[serde(default)] + is_read_only: bool, + #[serde(default = "default_channel_size")] + channel_size: usize, + + #[serde_as(as = "Option<DurationSeconds<String>>")] + #[serde(default = "default_timeout")] + timeout: Option<std::time::Duration>, + table_name: String, + family_name: String, + + #[serde(default = "default_app_profile_id")] + app_profile_id: String, +} + +fn default_app_profile_id() -> String { + "default".to_owned() +} + +fn default_channel_size() -> usize { + 4 +} + +fn default_timeout() -> Option<std::time::Duration> { + Some(std::time::Duration::from_secs(4)) +} + +impl BigtableDirectoryService { + #[cfg(not(test))] + pub async fn connect(params: BigtableParameters) -> Result<Self, bigtable::Error> { + let connection = bigtable::BigTableConnection::new( + ¶ms.project_id, + ¶ms.instance_name, + params.is_read_only, + params.channel_size, + params.timeout, + ) + .await?; + + Ok(Self { + client: 
connection.client(), + params, + }) + } + + #[cfg(test)] + pub async fn connect(params: BigtableParameters) -> Result<Self, bigtable::Error> { + use std::time::Duration; + + use async_process::{Command, Stdio}; + use tempfile::TempDir; + use tokio_retry::{strategy::ExponentialBackoff, Retry}; + + let tmpdir = TempDir::new().unwrap(); + + let socket_path = tmpdir.path().join("cbtemulator.sock"); + + let emulator_process = Command::new("cbtemulator") + .arg("-address") + .arg(socket_path.clone()) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .kill_on_drop(true) + .spawn() + .expect("failed to spawn emulator"); + + Retry::spawn( + ExponentialBackoff::from_millis(20) + .max_delay(Duration::from_secs(1)) + .take(3), + || async { + if socket_path.exists() { + Ok(()) + } else { + Err(()) + } + }, + ) + .await + .expect("failed to wait for socket"); + + // populate the emulator + for cmd in &[ + vec!["createtable", ¶ms.table_name], + vec!["createfamily", ¶ms.table_name, ¶ms.family_name], + ] { + Command::new("cbt") + .args({ + let mut args = vec![ + "-instance", + ¶ms.instance_name, + "-project", + ¶ms.project_id, + ]; + args.extend_from_slice(cmd); + args + }) + .env( + "BIGTABLE_EMULATOR_HOST", + format!("unix://{}", socket_path.to_string_lossy()), + ) + .output() + .await + .expect("failed to run cbt setup command"); + } + + let connection = bigtable_rs::bigtable::BigTableConnection::new_with_emulator( + &format!("unix://{}", socket_path.to_string_lossy()), + ¶ms.project_id, + ¶ms.instance_name, + params.is_read_only, + params.timeout, + )?; + + Ok(Self { + client: connection.client(), + params, + emulator: (tmpdir, emulator_process).into(), + }) + } +} + +/// Derives the row/column key for a given blake3 digest. +/// We use hexlower encoding, also because it can't be misinterpreted as RE2. 
+fn derive_directory_key(digest: &B3Digest) -> String { + HEXLOWER.encode(digest.as_slice()) +} + +#[async_trait] +impl DirectoryService for BigtableDirectoryService { + #[instrument(skip(self, digest), err, fields(directory.digest = %digest))] + async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> { + let mut client = self.client.clone(); + let directory_key = derive_directory_key(digest); + + let request = bigtable_v2::ReadRowsRequest { + app_profile_id: self.params.app_profile_id.to_string(), + table_name: client.get_full_table_name(&self.params.table_name), + rows_limit: 1, + rows: Some(bigtable_v2::RowSet { + row_keys: vec![directory_key.clone().into()], + row_ranges: vec![], + }), + // Filter selected family name, and column qualifier matching our digest. + // This is to ensure we don't fail once we start bucketing. + filter: Some(bigtable_v2::RowFilter { + filter: Some(bigtable_v2::row_filter::Filter::Chain( + bigtable_v2::row_filter::Chain { + filters: vec![ + bigtable_v2::RowFilter { + filter: Some( + bigtable_v2::row_filter::Filter::FamilyNameRegexFilter( + self.params.family_name.to_string(), + ), + ), + }, + bigtable_v2::RowFilter { + filter: Some( + bigtable_v2::row_filter::Filter::ColumnQualifierRegexFilter( + directory_key.clone().into(), + ), + ), + }, + ], + }, + )), + }), + ..Default::default() + }; + + let mut response = client + .read_rows(request) + .await + .map_err(|e| Error::StorageError(format!("unable to read rows: {}", e)))?; + + if response.len() != 1 { + if response.len() > 1 { + // This shouldn't happen, we limit number of rows to 1 + return Err(Error::StorageError( + "got more than one row from bigtable".into(), + )); + } + // else, this is simply a "not found". + return Ok(None); + } + + let (row_key, mut row_cells) = response.pop().unwrap(); + if row_key != directory_key.as_bytes() { + // This shouldn't happen, we requested this row key. 
+ return Err(Error::StorageError( + "got wrong row key from bigtable".into(), + )); + } + + let row_cell = row_cells + .pop() + .ok_or_else(|| Error::StorageError("found no cells".into()))?; + + // Ensure there's only one cell (so no more left after the pop()) + // This shouldn't happen, We filter out other cells in our query. + if !row_cells.is_empty() { + return Err(Error::StorageError( + "more than one cell returned from bigtable".into(), + )); + } + + // We also require the qualifier to be correct in the filter above, + // so this shouldn't happen. + if directory_key.as_bytes() != row_cell.qualifier { + return Err(Error::StorageError("unexpected cell qualifier".into())); + } + + // For the data in that cell, ensure the digest matches what's requested, before parsing. + let got_digest = B3Digest::from(blake3::hash(&row_cell.value).as_bytes()); + if got_digest != *digest { + return Err(Error::StorageError(format!( + "invalid digest: {}", + got_digest + ))); + } + + // Try to parse the value into a Directory message. + let directory = proto::Directory::decode(Bytes::from(row_cell.value)) + .map_err(|e| Error::StorageError(format!("unable to decode directory proto: {}", e)))?; + + // validate the Directory. 
+ directory + .validate() + .map_err(|e| Error::StorageError(format!("invalid Directory message: {}", e)))?; + + Ok(Some(directory)) + } + + #[instrument(skip(self, directory), err, fields(directory.digest = %directory.digest()))] + async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> { + let directory_digest = directory.digest(); + let mut client = self.client.clone(); + let directory_key = derive_directory_key(&directory_digest); + + // Ensure the directory we're trying to upload passes validation + directory + .validate() + .map_err(|e| Error::InvalidRequest(format!("directory is invalid: {}", e)))?; + + let data = directory.encode_to_vec(); + if data.len() as u64 > CELL_SIZE_LIMIT { + return Err(Error::StorageError( + "Directory exceeds cell limit on Bigtable".into(), + )); + } + + let resp = client + .check_and_mutate_row(bigtable_v2::CheckAndMutateRowRequest { + table_name: client.get_full_table_name(&self.params.table_name), + app_profile_id: self.params.app_profile_id.to_string(), + row_key: directory_key.clone().into(), + predicate_filter: Some(bigtable_v2::RowFilter { + filter: Some(bigtable_v2::row_filter::Filter::ColumnQualifierRegexFilter( + directory_key.clone().into(), + )), + }), + // If the column was already found, do nothing. + true_mutations: vec![], + // Else, do the insert. 
+ false_mutations: vec![ + // https://cloud.google.com/bigtable/docs/writes + bigtable_v2::Mutation { + mutation: Some(bigtable_v2::mutation::Mutation::SetCell( + bigtable_v2::mutation::SetCell { + family_name: self.params.family_name.to_string(), + column_qualifier: directory_key.clone().into(), + timestamp_micros: -1, // use server time to fill timestamp + value: data, + }, + )), + }, + ], + }) + .await + .map_err(|e| Error::StorageError(format!("unable to mutate rows: {}", e)))?; + + if resp.predicate_matched { + trace!("already existed") + } + + Ok(directory_digest) + } + + #[instrument(skip_all, fields(directory.digest = %root_directory_digest))] + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>> { + traverse_directory(self.clone(), root_directory_digest) + } + + #[instrument(skip_all)] + fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)> + where + Self: Clone, + { + Box::new(SimplePutter::new(self.clone())) + } +} diff --git a/tvix/castore/src/directoryservice/closure_validator.rs b/tvix/castore/src/directoryservice/closure_validator.rs new file mode 100644 index 0000000000..b9746a5a05 --- /dev/null +++ b/tvix/castore/src/directoryservice/closure_validator.rs @@ -0,0 +1,309 @@ +use std::collections::{HashMap, HashSet}; + +use bstr::ByteSlice; + +use petgraph::{ + graph::{DiGraph, NodeIndex}, + visit::{Bfs, Walker}, +}; +use tracing::instrument; + +use crate::{ + proto::{self, Directory}, + B3Digest, Error, +}; + +type DirectoryGraph = DiGraph<Directory, ()>; + +/// This can be used to validate a Directory closure (DAG of connected +/// Directories), and their insertion order. +/// +/// Directories need to be inserted (via `add`), in an order from the leaves to +/// the root (DFS Post-Order). 
+/// During insertion, We validate as much as we can at that time: +/// +/// - individual validation of Directory messages +/// - validation of insertion order (no upload of not-yet-known Directories) +/// - validation of size fields of referred Directories +/// +/// Internally it keeps all received Directories in a directed graph, +/// with node weights being the Directories and edges pointing to child +/// directories. +/// +/// Once all Directories have been inserted, a finalize function can be +/// called to get a (deduplicated and) validated list of directories, in +/// insertion order. +/// During finalize, a check for graph connectivity is performed too, to ensure +/// there's no disconnected components, and only one root. +#[derive(Default)] +pub struct ClosureValidator { + // A directed graph, using Directory as node weight, without edge weights. + // Edges point from parents to children. + graph: DirectoryGraph, + + // A lookup table from directory digest to node index. + digest_to_node_ix: HashMap<B3Digest, NodeIndex>, + + /// Keeps track of the last-inserted directory graph node index. + /// On a correct insert, this will be the root node, from which the DFS post + /// order traversal will start from. + last_directory_ix: Option<NodeIndex>, +} + +impl ClosureValidator { + /// Insert a new Directory into the closure. + /// Perform individual Directory validation, validation of insertion order + /// and size fields. + #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest(), directory.size=%directory.size()), err)] + pub fn add(&mut self, directory: proto::Directory) -> Result<(), Error> { + let digest = directory.digest(); + + // If we already saw this node previously, it's already validated and in the graph. 
+ if self.digest_to_node_ix.contains_key(&digest) { + return Ok(()); + } + + // Do some general validation + directory + .validate() + .map_err(|e| Error::InvalidRequest(e.to_string()))?; + + // Ensure the directory only refers to directories which we already accepted. + // We lookup their node indices and add them to a HashSet. + let mut child_ixs = HashSet::new(); + for dir in &directory.directories { + let child_digest = B3Digest::try_from(dir.digest.to_owned()).unwrap(); // validated + + // Ensure the digest has already been seen + let child_ix = *self.digest_to_node_ix.get(&child_digest).ok_or_else(|| { + Error::InvalidRequest(format!( + "'{}' refers to unseen child dir: {}", + dir.name.as_bstr(), + &child_digest + )) + })?; + + // Ensure the size specified in the child node matches the directory size itself. + let recorded_child_size = self + .graph + .node_weight(child_ix) + .expect("node not found") + .size(); + + // Ensure the size specified in the child node matches our records. + if dir.size != recorded_child_size { + return Err(Error::InvalidRequest(format!( + "'{}' has wrong size, specified {}, recorded {}", + dir.name.as_bstr(), + dir.size, + recorded_child_size + ))); + } + + child_ixs.insert(child_ix); + } + + // Insert node into the graph, and add edges to all children. + let node_ix = self.graph.add_node(directory); + for child_ix in child_ixs { + self.graph.add_edge(node_ix, child_ix, ()); + } + + // Record the mapping from digest to node_ix in our lookup table. + self.digest_to_node_ix.insert(digest, node_ix); + + // Update last_directory_ix. + self.last_directory_ix = Some(node_ix); + + Ok(()) + } + + /// Ensure that all inserted Directories are connected, then return a + /// (deduplicated) and validated list of directories, in from-leaves-to-root + /// order. + /// In case no elements have been inserted, returns an empty list. 
+ #[instrument(level = "trace", skip_all, err)] + pub(crate) fn finalize(self) -> Result<Vec<Directory>, Error> { + let (graph, _) = match self.finalize_raw()? { + None => return Ok(vec![]), + Some(v) => v, + }; + // Dissolve the graph, returning the nodes as a Vec. + // As the graph was populated in a valid DFS PostOrder, we can return + // nodes in that same order. + let (nodes, _edges) = graph.into_nodes_edges(); + Ok(nodes.into_iter().map(|x| x.weight).collect()) + } + + /// Ensure that all inserted Directories are connected, then return a + /// (deduplicated) and validated list of directories, in from-root-to-leaves + /// order. + /// In case no elements have been inserted, returns an empty list. + #[instrument(level = "trace", skip_all, err)] + pub(crate) fn finalize_root_to_leaves(self) -> Result<Vec<Directory>, Error> { + let (graph, root) = match self.finalize_raw()? { + None => return Ok(vec![]), + Some(v) => v, + }; + + // do a BFS traversal of the graph, starting with the root node to get + // all nodes reachable from there. + let traversal = Bfs::new(&graph, root); + + let order = traversal.iter(&graph).collect::<Vec<_>>(); + + let (nodes, _edges) = graph.into_nodes_edges(); + + // Convert to option, so that we can take individual nodes out without messing up the + // indices + let mut nodes = nodes.into_iter().map(Some).collect::<Vec<_>>(); + + Ok(order + .iter() + .map(|i| nodes[i.index()].take().unwrap().weight) + .collect()) + } + + /// Internal implementation of closure validation + #[instrument(level = "trace", skip_all, err)] + fn finalize_raw(self) -> Result<Option<(DirectoryGraph, NodeIndex)>, Error> { + // If no nodes were inserted, an empty list is returned. + let last_directory_ix = if let Some(x) = self.last_directory_ix { + x + } else { + return Ok(None); + }; + + // do a BFS traversal of the graph, starting with the root node to get + // (the count of) all nodes reachable from there. 
+ let mut traversal = Bfs::new(&self.graph, last_directory_ix); + + let mut visited_directory_count = 0; + #[cfg(debug_assertions)] + let mut visited_directory_ixs = HashSet::new(); + #[cfg_attr(not(debug_assertions), allow(unused))] + while let Some(directory_ix) = traversal.next(&self.graph) { + #[cfg(debug_assertions)] + visited_directory_ixs.insert(directory_ix); + + visited_directory_count += 1; + } + + // If the number of nodes collected equals the total number of nodes in + // the graph, we know all nodes are connected. + if visited_directory_count != self.graph.node_count() { + // more or less exhaustive error reporting. + #[cfg(debug_assertions)] + { + let all_directory_ixs: HashSet<_> = self.graph.node_indices().collect(); + + let unvisited_directories: HashSet<_> = all_directory_ixs + .difference(&visited_directory_ixs) + .map(|ix| self.graph.node_weight(*ix).expect("node not found")) + .collect(); + + return Err(Error::InvalidRequest(format!( + "found {} disconnected directories: {:?}", + self.graph.node_count() - visited_directory_ixs.len(), + unvisited_directories + ))); + } + #[cfg(not(debug_assertions))] + { + return Err(Error::InvalidRequest(format!( + "found {} disconnected directories", + self.graph.node_count() - visited_directory_count + ))); + } + } + + Ok(Some((self.graph, last_directory_ix))) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + fixtures::{DIRECTORY_A, DIRECTORY_B, DIRECTORY_C}, + proto::{self, Directory}, + }; + use lazy_static::lazy_static; + use rstest::rstest; + + lazy_static! { + pub static ref BROKEN_DIRECTORY : Directory = Directory { + symlinks: vec![proto::SymlinkNode { + name: "".into(), // invalid name! + target: "doesntmatter".into(), + }], + ..Default::default() + }; + + pub static ref BROKEN_PARENT_DIRECTORY: Directory = Directory { + directories: vec![proto::DirectoryNode { + name: "foo".into(), + digest: DIRECTORY_A.digest().into(), + size: DIRECTORY_A.size() + 42, // wrong! 
+ }], + ..Default::default() + }; + } + + use super::ClosureValidator; + + #[rstest] + /// Uploading an empty directory should succeed. + #[case::empty_directory(&[&*DIRECTORY_A], false, Some(vec![&*DIRECTORY_A]))] + /// Uploading A, then B (referring to A) should succeed. + #[case::simple_closure(&[&*DIRECTORY_A, &*DIRECTORY_B], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_B]))] + /// Uploading A, then A, then C (referring to A twice) should succeed. + /// We pretend to be a dumb client not deduping directories. + #[case::same_child(&[&*DIRECTORY_A, &*DIRECTORY_A, &*DIRECTORY_C], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_C]))] + /// Uploading A, then C (referring to A twice) should succeed. + #[case::same_child_dedup(&[&*DIRECTORY_A, &*DIRECTORY_C], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_C]))] + /// Uploading A, then C (referring to A twice), then B (itself referring to A) should fail during close, + /// as B itself would be left unconnected. + #[case::unconnected_node(&[&*DIRECTORY_A, &*DIRECTORY_C, &*DIRECTORY_B], false, None)] + /// Uploading B (referring to A) should fail immediately, because A was never uploaded. + #[case::dangling_pointer(&[&*DIRECTORY_B], true, None)] + /// Uploading a directory failing validation should fail immediately. + #[case::failing_validation(&[&*BROKEN_DIRECTORY], true, None)] + /// Uploading a directory which refers to another Directory with a wrong size should fail. + #[case::wrong_size_in_parent(&[&*DIRECTORY_A, &*BROKEN_PARENT_DIRECTORY], true, None)] + fn test_uploads( + #[case] directories_to_upload: &[&Directory], + #[case] exp_fail_upload_last: bool, + #[case] exp_finalize: Option<Vec<&Directory>>, // Some(_) if finalize successful, None if not. 
+ ) { + let mut dcv = ClosureValidator::default(); + let len_directories_to_upload = directories_to_upload.len(); + + for (i, d) in directories_to_upload.iter().enumerate() { + let resp = dcv.add((*d).clone()); + if i == len_directories_to_upload - 1 && exp_fail_upload_last { + assert!(resp.is_err(), "expect last put to fail"); + + // We don't really care anymore what finalize() would return, as + // the add() failed. + return; + } else { + assert!(resp.is_ok(), "expect put to succeed"); + } + } + + // everything was uploaded successfully. Test finalize(). + let resp = dcv.finalize(); + + match exp_finalize { + Some(directories) => { + assert_eq!( + Vec::from_iter(directories.iter().map(|e| (*e).to_owned())), + resp.expect("drain should succeed") + ); + } + None => { + resp.expect_err("drain should fail"); + } + } + } +} diff --git a/tvix/castore/src/directoryservice/from_addr.rs b/tvix/castore/src/directoryservice/from_addr.rs new file mode 100644 index 0000000000..ee675ca68a --- /dev/null +++ b/tvix/castore/src/directoryservice/from_addr.rs @@ -0,0 +1,189 @@ +use url::Url; + +use crate::{proto::directory_service_client::DirectoryServiceClient, Error}; + +use super::{ + DirectoryService, GRPCDirectoryService, MemoryDirectoryService, ObjectStoreDirectoryService, + SledDirectoryService, +}; + +/// Constructs a new instance of a [DirectoryService] from an URI. +/// +/// The following URIs are supported: +/// - `memory:` +/// Uses a in-memory implementation. +/// - `sled:` +/// Uses a in-memory sled implementation. +/// - `sled:///absolute/path/to/somewhere` +/// Uses sled, using a path on the disk for persistency. Can be only opened +/// from one process at the same time. +/// - `grpc+unix:///absolute/path/to/somewhere` +/// Connects to a local tvix-store gRPC service via Unix socket. +/// - `grpc+http://host:port`, `grpc+https://host:port` +/// Connects to a (remote) tvix-store gRPC service. 
+pub async fn from_addr(uri: &str) -> Result<Box<dyn DirectoryService>, crate::Error> { + #[allow(unused_mut)] + let mut url = Url::parse(uri) + .map_err(|e| crate::Error::StorageError(format!("unable to parse url: {}", e)))?; + + let directory_service: Box<dyn DirectoryService> = match url.scheme() { + "memory" => { + // memory doesn't support host or path in the URL. + if url.has_host() || !url.path().is_empty() { + return Err(Error::StorageError("invalid url".to_string())); + } + Box::<MemoryDirectoryService>::default() + } + "sled" => { + // sled doesn't support host, and a path can be provided (otherwise + // it'll live in memory only). + if url.has_host() { + return Err(Error::StorageError("no host allowed".to_string())); + } + + if url.path() == "/" { + return Err(Error::StorageError( + "cowardly refusing to open / with sled".to_string(), + )); + } + + // TODO: expose compression and other parameters as URL parameters? + + Box::new(if url.path().is_empty() { + SledDirectoryService::new_temporary() + .map_err(|e| Error::StorageError(e.to_string()))? + } else { + SledDirectoryService::new(url.path()) + .map_err(|e| Error::StorageError(e.to_string()))? + }) + } + scheme if scheme.starts_with("grpc+") => { + // schemes starting with grpc+ go to the GRPCPathInfoService. + // That's normally grpc+unix for unix sockets, and grpc+http(s) for the HTTP counterparts. + // - In the case of unix sockets, there must be a path, but may not be a host. + // - In the case of non-unix sockets, there must be a host, but no path. + // Constructing the channel is handled by tvix_castore::channel::from_url. + let client = DirectoryServiceClient::new(crate::tonic::channel_from_url(&url).await?); + Box::new(GRPCDirectoryService::from_client(client)) + } + scheme if scheme.starts_with("objectstore+") => { + // We need to convert the URL to string, strip the prefix there, and then + // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do. 
+ let trimmed_url = { + let s = url.to_string(); + Url::parse(s.strip_prefix("objectstore+").unwrap()).unwrap() + }; + Box::new( + ObjectStoreDirectoryService::parse_url(&trimmed_url) + .map_err(|e| Error::StorageError(e.to_string()))?, + ) + } + #[cfg(feature = "cloud")] + "bigtable" => { + use super::bigtable::BigtableParameters; + use super::BigtableDirectoryService; + + // parse the instance name from the hostname. + let instance_name = url + .host_str() + .ok_or_else(|| Error::StorageError("instance name missing".into()))? + .to_string(); + + // … but add it to the query string now, so we just need to parse that. + url.query_pairs_mut() + .append_pair("instance_name", &instance_name); + + let params: BigtableParameters = serde_qs::from_str(url.query().unwrap_or_default()) + .map_err(|e| Error::InvalidRequest(format!("failed to parse parameters: {}", e)))?; + + Box::new( + BigtableDirectoryService::connect(params) + .await + .map_err(|e| Error::StorageError(e.to_string()))?, + ) + } + _ => { + return Err(crate::Error::StorageError(format!( + "unknown scheme: {}", + url.scheme() + ))) + } + }; + Ok(directory_service) +} + +#[cfg(test)] +mod tests { + use super::from_addr; + use lazy_static::lazy_static; + use rstest::rstest; + use tempfile::TempDir; + + lazy_static! { + static ref TMPDIR_SLED_1: TempDir = TempDir::new().unwrap(); + static ref TMPDIR_SLED_2: TempDir = TempDir::new().unwrap(); + } + + #[rstest] + /// This uses an unsupported scheme. + #[case::unsupported_scheme("http://foo.example/test", false)] + /// This configures sled in temporary mode. + #[case::sled_valid_temporary("sled://", true)] + /// This configures sled with /, which should fail. + #[case::sled_invalid_root("sled:///", false)] + /// This configures sled with a host, not path, which should fail. + #[case::sled_invalid_host("sled://foo.example", false)] + /// This configures sled with a valid path path, which should succeed. 
+ #[case::sled_valid_path(&format!("sled://{}", &TMPDIR_SLED_1.path().to_str().unwrap()), true)] + /// This configures sled with a host, and a valid path path, which should fail. + #[case::sled_invalid_host_with_valid_path(&format!("sled://foo.example{}", &TMPDIR_SLED_2.path().to_str().unwrap()), false)] + /// This correctly sets the scheme, and doesn't set a path. + #[case::memory_valid("memory://", true)] + /// This sets a memory url host to `foo` + #[case::memory_invalid_host("memory://foo", false)] + /// This sets a memory url path to "/", which is invalid. + #[case::memory_invalid_root_path("memory:///", false)] + /// This sets a memory url path to "/foo", which is invalid. + #[case::memory_invalid_root_path_foo("memory:///foo", false)] + /// Correct scheme to connect to a unix socket. + #[case::grpc_valid_unix_socket("grpc+unix:///path/to/somewhere", true)] + /// Correct scheme for unix socket, but setting a host too, which is invalid. + #[case::grpc_invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)] + /// Correct scheme to connect to localhost, with port 12345 + #[case::grpc_valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)] + /// Correct scheme to connect to localhost over http, without specifying a port. + #[case::grpc_valid_http_host_without_port("grpc+http://localhost", true)] + /// Correct scheme to connect to localhost over http, without specifying a port. + #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)] + /// Correct scheme to connect to localhost over http, but with additional path, which is invalid. 
+ #[case::grpc_invalid_host_and_path("grpc+http://localhost/some-path", false)] + /// A valid example for Bigtable + #[cfg_attr( + all(feature = "cloud", feature = "integration"), + case::bigtable_valid_url( + "bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1", + true + ) + )] + /// A valid example for Bigtable, specifying a custom channel size and timeout + #[cfg_attr( + all(feature = "cloud", feature = "integration"), + case::bigtable_valid_url( + "bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1&channel_size=10&timeout=10", + true + ) + )] + /// A invalid Bigtable example (missing fields) + #[cfg_attr( + all(feature = "cloud", feature = "integration"), + case::bigtable_invalid_url("bigtable://instance-1", false) + )] + #[tokio::test] + async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] exp_succeed: bool) { + if exp_succeed { + from_addr(uri_str).await.expect("should succeed"); + } else { + assert!(from_addr(uri_str).await.is_err(), "should fail"); + } + } +} diff --git a/tvix/castore/src/directoryservice/grpc.rs b/tvix/castore/src/directoryservice/grpc.rs new file mode 100644 index 0000000000..fe935629bf --- /dev/null +++ b/tvix/castore/src/directoryservice/grpc.rs @@ -0,0 +1,345 @@ +use std::collections::HashSet; + +use super::{DirectoryPutter, DirectoryService}; +use crate::proto::{self, get_directory_request::ByWhat}; +use crate::{B3Digest, Error}; +use async_stream::try_stream; +use futures::stream::BoxStream; +use tokio::spawn; +use tokio::sync::mpsc::UnboundedSender; +use tokio::task::JoinHandle; +use tokio_stream::wrappers::UnboundedReceiverStream; +use tonic::async_trait; +use tonic::Code; +use tonic::{transport::Channel, Status}; +use tracing::{instrument, warn}; + +/// Connects to a (remote) tvix-store DirectoryService over gRPC. +#[derive(Clone)] +pub struct GRPCDirectoryService { + /// The internal reference to a gRPC client. 
+ /// Cloning it is cheap, and it internally handles concurrent requests. + grpc_client: proto::directory_service_client::DirectoryServiceClient<Channel>, +} + +impl GRPCDirectoryService { + /// construct a [GRPCDirectoryService] from a [proto::directory_service_client::DirectoryServiceClient]. + /// panics if called outside the context of a tokio runtime. + pub fn from_client( + grpc_client: proto::directory_service_client::DirectoryServiceClient<Channel>, + ) -> Self { + Self { grpc_client } + } +} + +#[async_trait] +impl DirectoryService for GRPCDirectoryService { + #[instrument(level = "trace", skip_all, fields(directory.digest = %digest))] + async fn get( + &self, + digest: &B3Digest, + ) -> Result<Option<crate::proto::Directory>, crate::Error> { + // Get a new handle to the gRPC client, and copy the digest. + let mut grpc_client = self.grpc_client.clone(); + let digest_cpy = digest.clone(); + let message = async move { + let mut s = grpc_client + .get(proto::GetDirectoryRequest { + recursive: false, + by_what: Some(ByWhat::Digest(digest_cpy.into())), + }) + .await? + .into_inner(); + + // Retrieve the first message only, then close the stream (we set recursive to false) + s.message().await + }; + + let digest = digest.clone(); + match message.await { + Ok(Some(directory)) => { + // Validate the retrieved Directory indeed has the + // digest we expect it to have, to detect corruptions. + let actual_digest = directory.digest(); + if actual_digest != digest { + Err(crate::Error::StorageError(format!( + "requested directory with digest {}, but got {}", + digest, actual_digest + ))) + } else if let Err(e) = directory.validate() { + // Validate the Directory itself is valid. 
+ warn!("directory failed validation: {}", e.to_string()); + Err(crate::Error::StorageError(format!( + "directory {} failed validation: {}", + digest, e, + ))) + } else { + Ok(Some(directory)) + } + } + Ok(None) => Ok(None), + Err(e) if e.code() == Code::NotFound => Ok(None), + Err(e) => Err(crate::Error::StorageError(e.to_string())), + } + } + + #[instrument(level = "trace", skip_all, fields(directory.digest = %directory.digest()))] + async fn put(&self, directory: crate::proto::Directory) -> Result<B3Digest, crate::Error> { + let resp = self + .grpc_client + .clone() + .put(tokio_stream::once(directory)) + .await; + + match resp { + Ok(put_directory_resp) => Ok(put_directory_resp + .into_inner() + .root_digest + .try_into() + .map_err(|_| { + Error::StorageError("invalid root digest length in response".to_string()) + })?), + Err(e) => Err(crate::Error::StorageError(e.to_string())), + } + } + + #[instrument(level = "trace", skip_all, fields(directory.digest = %root_directory_digest))] + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>> { + let mut grpc_client = self.grpc_client.clone(); + let root_directory_digest = root_directory_digest.clone(); + + let stream = try_stream! { + let mut stream = grpc_client + .get(proto::GetDirectoryRequest { + recursive: true, + by_what: Some(ByWhat::Digest(root_directory_digest.clone().into())), + }) + .await + .map_err(|e| crate::Error::StorageError(e.to_string()))? + .into_inner(); + + // The Directory digests we received so far + let mut received_directory_digests: HashSet<B3Digest> = HashSet::new(); + // The Directory digests we're still expecting to get sent. + let mut expected_directory_digests: HashSet<B3Digest> = HashSet::from([root_directory_digest]); + + loop { + match stream.message().await { + Ok(Some(directory)) => { + // validate the directory itself. 
+ if let Err(e) = directory.validate() { + Err(crate::Error::StorageError(format!( + "directory {} failed validation: {}", + directory.digest(), + e, + )))?; + } + // validate we actually expected that directory, and move it from expected to received. + let directory_digest = directory.digest(); + let was_expected = expected_directory_digests.remove(&directory_digest); + if !was_expected { + // FUTUREWORK: dumb clients might send the same stuff twice. + // as a fallback, we might want to tolerate receiving + // it if it's in received_directory_digests (as that + // means it once was in expected_directory_digests) + Err(crate::Error::StorageError(format!( + "received unexpected directory {}", + directory_digest + )))?; + } + received_directory_digests.insert(directory_digest); + + // register all children in expected_directory_digests. + for child_directory in &directory.directories { + // We ran validate() above, so we know these digests must be correct. + let child_directory_digest = + child_directory.digest.clone().try_into().unwrap(); + + expected_directory_digests + .insert(child_directory_digest); + } + + yield directory; + }, + Ok(None) => { + // If we were still expecting something, that's an error. + if !expected_directory_digests.is_empty() { + Err(crate::Error::StorageError(format!( + "still expected {} directories, but got premature end of stream", + expected_directory_digests.len(), + )))? + } else { + return + } + }, + Err(e) => { + Err(crate::Error::StorageError(e.to_string()))?; + }, + } + } + }; + + Box::pin(stream) + } + + #[instrument(skip_all)] + fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)> + where + Self: Clone, + { + let mut grpc_client = self.grpc_client.clone(); + + let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); + + let task: JoinHandle<Result<proto::PutDirectoryResponse, Status>> = spawn(async move { + let s = grpc_client + .put(UnboundedReceiverStream::new(rx)) + .await? 
+ .into_inner(); + + Ok(s) + }); + + Box::new(GRPCPutter { + rq: Some((task, tx)), + }) + } +} + +/// Allows uploading multiple Directory messages in the same gRPC stream. +pub struct GRPCPutter { + /// Data about the current request - a handle to the task, and the tx part + /// of the channel. + /// The tx part of the pipe is used to send [proto::Directory] to the ongoing request. + /// The task will yield a [proto::PutDirectoryResponse] once the stream is closed. + #[allow(clippy::type_complexity)] // lol + rq: Option<( + JoinHandle<Result<proto::PutDirectoryResponse, Status>>, + UnboundedSender<proto::Directory>, + )>, +} + +#[async_trait] +impl DirectoryPutter for GRPCPutter { + #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)] + async fn put(&mut self, directory: proto::Directory) -> Result<(), crate::Error> { + match self.rq { + // If we're not already closed, send the directory to directory_sender. + Some((_, ref directory_sender)) => { + if directory_sender.send(directory).is_err() { + // If the channel has been prematurely closed, invoke close (so we can peek at the error code) + // That error code is much more helpful, because it + // contains the error message from the server. + self.close().await?; + } + Ok(()) + } + // If self.close() was already called, we can't put again. + None => Err(Error::StorageError( + "DirectoryPutter already closed".to_string(), + )), + } + } + + /// Closes the stream for sending, and returns the value. + #[instrument(level = "trace", skip_all, ret, err)] + async fn close(&mut self) -> Result<B3Digest, crate::Error> { + // get self.rq, and replace it with None. + // This ensures we can only close it once. + match std::mem::take(&mut self.rq) { + None => Err(Error::StorageError("already closed".to_string())), + Some((task, directory_sender)) => { + // close directory_sender, so blocking on task will finish. + drop(directory_sender); + + let root_digest = task + .await? 
+ .map_err(|e| Error::StorageError(e.to_string()))? + .root_digest; + + root_digest.try_into().map_err(|_| { + Error::StorageError("invalid root digest length in response".to_string()) + }) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + use tempfile::TempDir; + use tokio::net::UnixListener; + use tokio_retry::{strategy::ExponentialBackoff, Retry}; + use tokio_stream::wrappers::UnixListenerStream; + + use crate::{ + directoryservice::{DirectoryService, GRPCDirectoryService, MemoryDirectoryService}, + fixtures, + proto::{directory_service_client::DirectoryServiceClient, GRPCDirectoryServiceWrapper}, + }; + + /// This ensures connecting via gRPC works as expected. + #[tokio::test] + async fn test_valid_unix_path_ping_pong() { + let tmpdir = TempDir::new().unwrap(); + let socket_path = tmpdir.path().join("daemon"); + + let path_clone = socket_path.clone(); + + // Spin up a server + tokio::spawn(async { + let uds = UnixListener::bind(path_clone).unwrap(); + let uds_stream = UnixListenerStream::new(uds); + + // spin up a new server + let mut server = tonic::transport::Server::builder(); + let router = server.add_service( + crate::proto::directory_service_server::DirectoryServiceServer::new( + GRPCDirectoryServiceWrapper::new( + Box::<MemoryDirectoryService>::default() as Box<dyn DirectoryService> + ), + ), + ); + router.serve_with_incoming(uds_stream).await + }); + + // wait for the socket to be created + Retry::spawn( + ExponentialBackoff::from_millis(20).max_delay(Duration::from_secs(10)), + || async { + if socket_path.exists() { + Ok(()) + } else { + Err(()) + } + }, + ) + .await + .expect("failed to wait for socket"); + + // prepare a client + let grpc_client = { + let url = url::Url::parse(&format!( + "grpc+unix://{}?wait-connect=1", + socket_path.display() + )) + .expect("must parse"); + let client = DirectoryServiceClient::new( + crate::tonic::channel_from_url(&url) + .await + .expect("must succeed"), + ); + 
GRPCDirectoryService::from_client(client) + }; + + assert!(grpc_client + .get(&fixtures::DIRECTORY_A.digest()) + .await + .expect("must not fail") + .is_none()) + } +} diff --git a/tvix/castore/src/directoryservice/memory.rs b/tvix/castore/src/directoryservice/memory.rs new file mode 100644 index 0000000000..3b2795c396 --- /dev/null +++ b/tvix/castore/src/directoryservice/memory.rs @@ -0,0 +1,87 @@ +use crate::{proto, B3Digest, Error}; +use futures::stream::BoxStream; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::RwLock; +use tonic::async_trait; +use tracing::{instrument, warn}; + +use super::utils::traverse_directory; +use super::{DirectoryPutter, DirectoryService, SimplePutter}; + +#[derive(Clone, Default)] +pub struct MemoryDirectoryService { + db: Arc<RwLock<HashMap<B3Digest, proto::Directory>>>, +} + +#[async_trait] +impl DirectoryService for MemoryDirectoryService { + #[instrument(skip(self, digest), fields(directory.digest = %digest))] + async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> { + let db = self.db.read().await; + + match db.get(digest) { + // The directory was not found, return + None => Ok(None), + + // The directory was found, try to parse the data as Directory message + Some(directory) => { + // Validate the retrieved Directory indeed has the + // digest we expect it to have, to detect corruptions. + let actual_digest = directory.digest(); + if actual_digest != *digest { + return Err(Error::StorageError(format!( + "requested directory with digest {}, but got {}", + digest, actual_digest + ))); + } + + // Validate the Directory itself is valid. 
+ if let Err(e) = directory.validate() { + warn!("directory failed validation: {}", e.to_string()); + return Err(Error::StorageError(format!( + "directory {} failed validation: {}", + actual_digest, e, + ))); + } + + Ok(Some(directory.clone())) + } + } + } + + #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))] + async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> { + let digest = directory.digest(); + + // validate the directory itself. + if let Err(e) = directory.validate() { + return Err(Error::InvalidRequest(format!( + "directory {} failed validation: {}", + digest, e, + ))); + } + + // store it + let mut db = self.db.write().await; + db.insert(digest.clone(), directory); + + Ok(digest) + } + + #[instrument(skip_all, fields(directory.digest = %root_directory_digest))] + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>> { + traverse_directory(self.clone(), root_directory_digest) + } + + #[instrument(skip_all)] + fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)> + where + Self: Clone, + { + Box::new(SimplePutter::new(self.clone())) + } +} diff --git a/tvix/castore/src/directoryservice/mod.rs b/tvix/castore/src/directoryservice/mod.rs new file mode 100644 index 0000000000..3f180ef162 --- /dev/null +++ b/tvix/castore/src/directoryservice/mod.rs @@ -0,0 +1,124 @@ +use crate::{proto, B3Digest, Error}; +use futures::stream::BoxStream; +use tonic::async_trait; + +mod closure_validator; +mod from_addr; +mod grpc; +mod memory; +mod object_store; +mod simple_putter; +mod sled; +#[cfg(test)] +pub mod tests; +mod traverse; +mod utils; + +pub use self::closure_validator::ClosureValidator; +pub use self::from_addr::from_addr; +pub use self::grpc::GRPCDirectoryService; +pub use self::memory::MemoryDirectoryService; +pub use self::object_store::ObjectStoreDirectoryService; +pub use self::simple_putter::SimplePutter; +pub use 
self::sled::SledDirectoryService; +pub use self::traverse::descend_to; +pub use self::utils::traverse_directory; + +#[cfg(feature = "cloud")] +mod bigtable; + +#[cfg(feature = "cloud")] +pub use self::bigtable::BigtableDirectoryService; + +/// The base trait all Directory services need to implement. +/// This is a simple get and put of [crate::proto::Directory], returning their +/// digest. +#[async_trait] +pub trait DirectoryService: Send + Sync { + /// Looks up a single Directory message by its digest. + /// The returned Directory message *must* be valid. + /// In case the directory is not found, Ok(None) is returned. + /// + /// It is okay for certain implementations to only allow retrieval of + /// Directory digests that are at the "root", aka the last element that's + /// sent to a DirectoryPutter. This makes sense for implementations bundling + /// closures of directories together in batches. + async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error>; + /// Uploads a single Directory message, and returns the calculated + /// digest, or an error. An error *must* also be returned if the message is + /// not valid. + async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error>; + + /// Looks up a closure of [proto::Directory]. + /// Ideally this would be a `impl Stream<Item = Result<proto::Directory, Error>>`, + /// and we'd be able to add a default implementation for it here, but + /// we can't have that yet. + /// + /// This returns a pinned, boxed stream. The pinning allows for it to be polled easily, + /// and the box allows different underlying stream implementations to be returned since + /// Rust doesn't support this as a generic in traits yet. This is the same thing that + /// [async_trait] generates, but for streams instead of futures. + /// + /// The individually returned Directory messages *must* be valid. 
+ /// Directories are sent in an order from the root to the leaves, so that + /// the receiving side can validate each message to be a connected to the root + /// that has initially been requested. + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>>; + + /// Allows persisting a closure of [proto::Directory], which is a graph of + /// connected Directory messages. + fn put_multiple_start(&self) -> Box<dyn DirectoryPutter>; +} + +#[async_trait] +impl<A> DirectoryService for A +where + A: AsRef<dyn DirectoryService> + Send + Sync, +{ + async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> { + self.as_ref().get(digest).await + } + + async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> { + self.as_ref().put(directory).await + } + + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>> { + self.as_ref().get_recursive(root_directory_digest) + } + + fn put_multiple_start(&self) -> Box<dyn DirectoryPutter> { + self.as_ref().put_multiple_start() + } +} + +/// Provides a handle to put a closure of connected [proto::Directory] elements. +/// +/// The consumer can periodically call [DirectoryPutter::put], starting from the +/// leaves. Once the root is reached, [DirectoryPutter::close] can be called to +/// retrieve the root digest (or an error). +/// +/// DirectoryPutters might be created without a single [DirectoryPutter::put], +/// and then dropped without calling [DirectoryPutter::close], +/// for example when ingesting a path that ends up not pointing to a directory, +/// but a single file or symlink. +#[async_trait] +pub trait DirectoryPutter: Send { + /// Put a individual [proto::Directory] into the store. + /// Error semantics and behaviour is up to the specific implementation of + /// this trait. 
+ /// Due to bursting, the returned error might refer to an object previously + /// sent via `put`. + async fn put(&mut self, directory: proto::Directory) -> Result<(), Error>; + + /// Close the stream, and wait for any errors. + /// If there's been any invalid Directory message uploaded, and error *must* + /// be returned. + async fn close(&mut self) -> Result<B3Digest, Error>; +} diff --git a/tvix/castore/src/directoryservice/object_store.rs b/tvix/castore/src/directoryservice/object_store.rs new file mode 100644 index 0000000000..64ce335edb --- /dev/null +++ b/tvix/castore/src/directoryservice/object_store.rs @@ -0,0 +1,261 @@ +use std::collections::HashSet; +use std::sync::Arc; + +use data_encoding::HEXLOWER; +use futures::future::Either; +use futures::stream::BoxStream; +use futures::SinkExt; +use futures::StreamExt; +use futures::TryFutureExt; +use futures::TryStreamExt; +use object_store::{path::Path, ObjectStore}; +use prost::Message; +use tokio::io::AsyncWriteExt; +use tokio_util::codec::LengthDelimitedCodec; +use tonic::async_trait; +use tracing::{instrument, trace, warn, Level}; +use url::Url; + +use super::{ClosureValidator, DirectoryPutter, DirectoryService}; +use crate::{proto, B3Digest, Error}; + +/// Stores directory closures in an object store. +/// Notably, this makes use of the option to disallow accessing child directories except when +/// fetching them recursively via the top-level directory, since all batched writes +/// (using `put_multiple_start`) are stored in a single object. +/// Directories are stored in a length-delimited format with a 1MiB limit. The length field is a +/// u32 and the directories are stored in root-to-leaves topological order, the same way they will +/// be returned to the client in get_recursive. 
/// Maximum length, in bytes, of a single length-delimited frame (one encoded
/// Directory) accepted by the length-delimited codec.
///
/// NOTE(review): the type-level documentation of the object store directory
/// service speaks of a 1 MiB limit, while this constant evaluates to 1000 MiB
/// (1_048_576_000 bytes). The value is left untouched here; confirm which of
/// the two is intended.
const MAX_FRAME_LENGTH: usize = 1000 * 1024 * 1024; // 1000 MiB
+ #[instrument(skip(self, digest), fields(directory.digest = %digest))] + async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> { + self.get_recursive(digest).take(1).next().await.transpose() + } + + #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))] + async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> { + if !directory.directories.is_empty() { + return Err(Error::InvalidRequest( + "only put_multiple_start is supported by the ObjectStoreDirectoryService for directories with children".into(), + )); + } + + let mut handle = self.put_multiple_start(); + handle.put(directory).await?; + handle.close().await + } + + #[instrument(skip_all, fields(directory.digest = %root_directory_digest))] + fn get_recursive( + &self, + root_directory_digest: &B3Digest, + ) -> BoxStream<'static, Result<proto::Directory, Error>> { + // The Directory digests we're expecting to receive. + let mut expected_directory_digests: HashSet<B3Digest> = + HashSet::from([root_directory_digest.clone()]); + + let dir_path = derive_dirs_path(&self.base_path, root_directory_digest); + let object_store = self.object_store.clone(); + + Box::pin( + (async move { + let stream = match object_store.get(&dir_path).await { + Ok(v) => v.into_stream(), + Err(object_store::Error::NotFound { .. }) => { + return Ok(Either::Left(futures::stream::empty())) + } + Err(e) => return Err(std::io::Error::from(e).into()), + }; + + // get a reader of the response body. 
+ let r = tokio_util::io::StreamReader::new(stream); + let decompressed_stream = async_compression::tokio::bufread::ZstdDecoder::new(r); + + // the subdirectories are stored in a length delimited format + let delimited_stream = LengthDelimitedCodec::builder() + .max_frame_length(MAX_FRAME_LENGTH) + .length_field_type::<u32>() + .new_read(decompressed_stream); + + let dirs_stream = delimited_stream.map_err(Error::from).and_then(move |buf| { + futures::future::ready((|| { + let mut hasher = blake3::Hasher::new(); + let digest: B3Digest = hasher.update(&buf).finalize().as_bytes().into(); + + // Ensure to only decode the directory objects whose digests we trust + let was_expected = expected_directory_digests.remove(&digest); + if !was_expected { + return Err(crate::Error::StorageError(format!( + "received unexpected directory {}", + digest + ))); + } + + let directory = proto::Directory::decode(&*buf).map_err(|e| { + warn!("unable to parse directory {}: {}", digest, e); + Error::StorageError(e.to_string()) + })?; + + for directory in &directory.directories { + // Allow the children to appear next + expected_directory_digests.insert( + B3Digest::try_from(directory.digest.clone()) + .map_err(|e| Error::StorageError(e.to_string()))?, + ); + } + + Ok(directory) + })()) + }); + + Ok(Either::Right(dirs_stream)) + }) + .try_flatten_stream(), + ) + } + + #[instrument(skip_all)] + fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)> + where + Self: Clone, + { + Box::new(ObjectStoreDirectoryPutter::new( + self.object_store.clone(), + self.base_path.clone(), + )) + } +} + +struct ObjectStoreDirectoryPutter { + object_store: Arc<dyn ObjectStore>, + base_path: Path, + + directory_validator: Option<ClosureValidator>, +} + +impl ObjectStoreDirectoryPutter { + fn new(object_store: Arc<dyn ObjectStore>, base_path: Path) -> Self { + Self { + object_store, + base_path, + directory_validator: Some(Default::default()), + } + } +} + +#[async_trait] +impl DirectoryPutter for 
ObjectStoreDirectoryPutter { + #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)] + async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> { + match self.directory_validator { + None => return Err(Error::StorageError("already closed".to_string())), + Some(ref mut validator) => { + validator.add(directory)?; + } + } + + Ok(()) + } + + #[instrument(level = "trace", skip_all, ret, err)] + async fn close(&mut self) -> Result<B3Digest, Error> { + let validator = match self.directory_validator.take() { + None => return Err(Error::InvalidRequest("already closed".to_string())), + Some(validator) => validator, + }; + + // retrieve the validated directories. + // It is important that they are in topological order (root first), + // since that's how we want to retrieve them from the object store in the end. + let directories = validator.finalize_root_to_leaves()?; + + // Get the root digest + let root_digest = directories + .first() + .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))? + .digest(); + + let dir_path = derive_dirs_path(&self.base_path, &root_digest); + + match self.object_store.head(&dir_path).await { + // directory tree already exists, nothing to do + Ok(_) => { + trace!("directory tree already exists"); + } + + // directory tree does not yet exist, compress and upload. + Err(object_store::Error::NotFound { .. 
}) => { + trace!("uploading directory tree"); + + let object_store_writer = + object_store::buffered::BufWriter::new(self.object_store.clone(), dir_path); + let compressed_writer = + async_compression::tokio::write::ZstdEncoder::new(object_store_writer); + let mut directories_sink = LengthDelimitedCodec::builder() + .max_frame_length(MAX_FRAME_LENGTH) + .length_field_type::<u32>() + .new_write(compressed_writer); + + for directory in directories { + directories_sink + .send(directory.encode_to_vec().into()) + .await?; + } + + let mut compressed_writer = directories_sink.into_inner(); + compressed_writer.shutdown().await?; + } + // other error + Err(err) => Err(std::io::Error::from(err))?, + } + + Ok(root_digest) + } +} diff --git a/tvix/castore/src/directoryservice/simple_putter.rs b/tvix/castore/src/directoryservice/simple_putter.rs new file mode 100644 index 0000000000..25617ebcac --- /dev/null +++ b/tvix/castore/src/directoryservice/simple_putter.rs @@ -0,0 +1,75 @@ +use super::ClosureValidator; +use super::DirectoryPutter; +use super::DirectoryService; +use crate::proto; +use crate::B3Digest; +use crate::Error; +use tonic::async_trait; +use tracing::instrument; +use tracing::warn; + +/// This is an implementation of DirectoryPutter that simply +/// inserts individual Directory messages one by one, on close, after +/// they successfully validated. 
pub struct SimplePutter<DS: DirectoryService> {
    directory_service: DS,

    // `None` once `close` has been called.
    directory_validator: Option<ClosureValidator>,
}

impl<DS: DirectoryService> SimplePutter<DS> {
    /// Creates a putter that will insert into `directory_service` on close.
    pub fn new(directory_service: DS) -> Self {
        Self {
            directory_service,
            directory_validator: Some(Default::default()),
        }
    }
}

#[async_trait]
impl<DS: DirectoryService + 'static> DirectoryPutter for SimplePutter<DS> {
    /// Buffers `directory` in the validator; nothing is sent to the backing
    /// service until `close`.
    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> {
        match self.directory_validator {
            None => return Err(Error::StorageError("already closed".to_string())),
            Some(ref mut validator) => {
                validator.add(directory)?;
            }
        }

        Ok(())
    }

    /// Finalizes the closure, puts every directory individually and returns
    /// the digest of the root (the last directory in insertion order).
    #[instrument(level = "trace", skip_all, ret, err)]
    async fn close(&mut self) -> Result<B3Digest, Error> {
        match self.directory_validator.take() {
            None => Err(Error::InvalidRequest("already closed".to_string())),
            Some(validator) => {
                // retrieve the validated directories.
                let directories = validator.finalize()?;

                // Get the root digest, which is at the end (cf. insertion order)
                let root_digest = directories
                    .last()
                    .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))?
                    .digest();

                // call an individual put for each directory and await the insertion.
                for directory in directories {
                    let exp_digest = directory.digest();
                    let actual_digest = self.directory_service.put(directory).await?;

                    // ensure the digest the backend told us matches our expectations.
                    if exp_digest != actual_digest {
                        warn!(directory.digest_expected=%exp_digest, directory.digest_actual=%actual_digest, "unexpected digest");
                        return Err(Error::StorageError(
                            "got unexpected digest from backend during put".into(),
                        ));
                    }
                }

                Ok(root_digest)
            }
        }
    }
}

// diff --git a/tvix/castore/src/directoryservice/sled.rs b/tvix/castore/src/directoryservice/sled.rs new file mode 100644 index 0000000000..9490a49c00 --- /dev/null +++ b/tvix/castore/src/directoryservice/sled.rs @@ -0,0 +1,189 @@
use crate::proto::Directory;
use crate::{proto, B3Digest, Error};
use futures::stream::BoxStream;
use prost::Message;
use std::ops::Deref;
use std::path::Path;
use tonic::async_trait;
use tracing::{instrument, warn};

use super::utils::traverse_directory;
use super::{ClosureValidator, DirectoryPutter, DirectoryService};

/// A [DirectoryService] storing encoded directories in a sled database,
/// keyed by their digest.
#[derive(Clone)]
pub struct SledDirectoryService {
    db: sled::Db,
}

impl SledDirectoryService {
    /// Opens (or creates) a sled database at path `p`.
    pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, sled::Error> {
        let config = sled::Config::default()
            .use_compression(false) // is a required parameter
            .path(p);
        let db = config.open()?;

        Ok(Self { db })
    }

    /// Opens a database configured with sled's `temporary(true)`.
    pub fn new_temporary() -> Result<Self, sled::Error> {
        let config = sled::Config::default().temporary(true);
        let db = config.open()?;

        Ok(Self { db })
    }
}

#[async_trait]
impl DirectoryService for SledDirectoryService {
    /// Looks up `digest`, decodes the stored bytes and verifies both that the
    /// decoded directory hashes to `digest` and that it passes `validate()`.
    #[instrument(skip(self, digest), fields(directory.digest = %digest))]
    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
        // sled's API is synchronous, so the lookup runs on the blocking pool.
        let resp = tokio::task::spawn_blocking({
            let db = self.db.clone();
            let digest = digest.clone();
            move || db.get(digest.as_slice())
        })
        .await?
        .map_err(|e| {
            warn!("failed to retrieve directory: {}", e);
            Error::StorageError(format!("failed to retrieve directory: {}", e))
        })?;

        match resp {
            // The directory was not found, return
            None => Ok(None),

            // The directory was found, try to parse the data as Directory message
            Some(data) => match Directory::decode(&*data) {
                Ok(directory) => {
                    // Validate the retrieved Directory indeed has the
                    // digest we expect it to have, to detect corruptions.
                    let actual_digest = directory.digest();
                    if actual_digest != *digest {
                        return Err(Error::StorageError(format!(
                            "requested directory with digest {}, but got {}",
                            digest, actual_digest
                        )));
                    }

                    // Validate the Directory itself is valid.
                    if let Err(e) = directory.validate() {
                        warn!("directory failed validation: {}", e.to_string());
                        return Err(Error::StorageError(format!(
                            "directory {} failed validation: {}",
                            actual_digest, e,
                        )));
                    }

                    Ok(Some(directory))
                }
                Err(e) => {
                    warn!("unable to parse directory {}: {}", digest, e);
                    Err(Error::StorageError(e.to_string()))
                }
            },
        }
    }

    /// Validates `directory` and inserts its encoding under its digest.
    #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))]
    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
        tokio::task::spawn_blocking({
            let db = self.db.clone();
            move || {
                let digest = directory.digest();

                // validate the directory itself.
                if let Err(e) = directory.validate() {
                    return Err(Error::InvalidRequest(format!(
                        "directory {} failed validation: {}",
                        digest, e,
                    )));
                }
                // store it
                db.insert(digest.as_slice(), directory.encode_to_vec())
                    .map_err(|e| Error::StorageError(e.to_string()))?;

                Ok(digest)
            }
        })
        .await?
    }

    /// Delegates to [traverse_directory], which fetches each directory via `get`.
    #[instrument(skip_all, fields(directory.digest = %root_directory_digest))]
    fn get_recursive(
        &self,
        root_directory_digest: &B3Digest,
    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
        traverse_directory(self.clone(), root_directory_digest)
    }

    #[instrument(skip_all)]
    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
    where
        Self: Clone,
    {
        Box::new(SledDirectoryPutter {
            // `sled::Db` derefs to its default `sled::Tree`.
            tree: self.db.deref().clone(),
            directory_validator: Some(Default::default()),
        })
    }
}

/// Buffers Directory messages to be uploaded and inserts them in a batch
/// transaction on close.
pub struct SledDirectoryPutter {
    tree: sled::Tree,

    /// The directories (inside the directory validator) that we insert later,
    /// or None, if they were already inserted.
    directory_validator: Option<ClosureValidator>,
}

#[async_trait]
impl DirectoryPutter for SledDirectoryPutter {
    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> {
        match self.directory_validator {
            None => return Err(Error::StorageError("already closed".to_string())),
            Some(ref mut validator) => {
                validator.add(directory)?;
            }
        }

        Ok(())
    }

    #[instrument(level = "trace", skip_all, ret, err)]
    async fn close(&mut self) -> Result<B3Digest, Error> {
        match self.directory_validator.take() {
            None => Err(Error::InvalidRequest("already closed".to_string())),
            Some(validator) => {
                // Insert all directories as a batch.
                tokio::task::spawn_blocking({
                    let tree = self.tree.clone();
                    move || {
                        // retrieve the validated directories.
                        let directories = validator.finalize()?;

                        // Get the root digest, which is at the end (cf. insertion order)
                        let root_digest = directories
                            .last()
                            .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))?
                            .digest();

                        let mut batch = sled::Batch::default();
                        for directory in directories {
                            batch.insert(directory.digest().as_slice(), directory.encode_to_vec());
                        }

                        tree.apply_batch(batch).map_err(|e| {
                            Error::StorageError(format!("unable to apply batch: {}", e))
                        })?;

                        Ok(root_digest)
                    }
                })
                .await?
            }
        }
    }
}

// diff --git a/tvix/castore/src/directoryservice/tests/mod.rs b/tvix/castore/src/directoryservice/tests/mod.rs new file mode 100644 index 0000000000..cc3c5b788a --- /dev/null +++ b/tvix/castore/src/directoryservice/tests/mod.rs @@ -0,0 +1,227 @@
//! This contains test scenarios that a given [DirectoryService] needs to pass.
//! We use [rstest] and [rstest_reuse] to provide all services we want to test
//! against, and then apply this template to all test functions.

use futures::StreamExt;
use rstest::*;
use rstest_reuse::{self, *};

use super::DirectoryService;
use crate::directoryservice;
use crate::{
    fixtures::{DIRECTORY_A, DIRECTORY_B, DIRECTORY_C},
    proto::{self, Directory},
};

mod utils;
use self::utils::make_grpc_directory_service_client;

// TODO: add tests doing individual puts of a closure, then doing a get_recursive
// (and figure out semantics if necessary)

/// This produces a template, which will be applied to all individual test functions.
/// See https://github.com/la10736/rstest/issues/130#issuecomment-968864832
#[template]
#[rstest]
#[case::grpc(make_grpc_directory_service_client().await)]
#[case::memory(directoryservice::from_addr("memory://").await.unwrap())]
#[case::sled(directoryservice::from_addr("sled://").await.unwrap())]
#[case::objectstore(directoryservice::from_addr("objectstore+memory://").await.unwrap())]
#[cfg_attr(all(feature = "cloud", feature = "integration"), case::bigtable(directoryservice::from_addr("bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1").await.unwrap()))]
pub fn directory_services(#[case] directory_service: impl DirectoryService) {}

/// Ensures asking for a directory that doesn't exist returns `Ok(None)`.
#[apply(directory_services)]
#[tokio::test]
async fn test_non_exist(directory_service: impl DirectoryService) {
    let resp = directory_service.get(&DIRECTORY_A.digest()).await;
    assert!(resp.unwrap().is_none())
}

/// Putting a single directory into the store, and then getting it out both via
/// `.get[_recursive]` should work.
#[apply(directory_services)]
#[tokio::test]
async fn put_get(directory_service: impl DirectoryService) {
    // Insert a Directory.
    let digest = directory_service.put(DIRECTORY_A.clone()).await.unwrap();
    assert_eq!(DIRECTORY_A.digest(), digest, "returned digest must match");

    // single get
    assert_eq!(
        Some(DIRECTORY_A.clone()),
        directory_service.get(&DIRECTORY_A.digest()).await.unwrap()
    );

    // recursive get
    assert_eq!(
        vec![Ok(DIRECTORY_A.clone())],
        directory_service
            .get_recursive(&DIRECTORY_A.digest())
            .collect::<Vec<_>>()
            .await
    );
}

/// Putting a directory closure should work, and it should be possible to get
/// back the root node both via .get[_recursive]. We don't check `.get` for the
/// leaf node is possible, as it's Ok for stores to not support that.
#[apply(directory_services)]
#[tokio::test]
async fn put_get_multiple_success(directory_service: impl DirectoryService) {
    // Insert a Directory closure.
    // Leaves go first: DIRECTORY_C refers to DIRECTORY_A.
    let mut handle = directory_service.put_multiple_start();
    handle.put(DIRECTORY_A.clone()).await.unwrap();
    handle.put(DIRECTORY_C.clone()).await.unwrap();
    let root_digest = handle.close().await.unwrap();
    assert_eq!(
        DIRECTORY_C.digest(),
        root_digest,
        "root digest should match"
    );

    // Get the root node.
    assert_eq!(
        Some(DIRECTORY_C.clone()),
        directory_service.get(&DIRECTORY_C.digest()).await.unwrap()
    );

    // Get the closure. Ensure it's sent from the root to the leaves.
    assert_eq!(
        vec![Ok(DIRECTORY_C.clone()), Ok(DIRECTORY_A.clone())],
        directory_service
            .get_recursive(&DIRECTORY_C.digest())
            .collect::<Vec<_>>()
            .await
    )
}

/// Puts a directory closure, but simulates a dumb client not deduplicating
/// its list. Ensure we still only get back a deduplicated list.
#[apply(directory_services)]
#[tokio::test]
async fn put_get_multiple_dedup(directory_service: impl DirectoryService) {
    // Insert a Directory closure.
    let mut handle = directory_service.put_multiple_start();
    handle.put(DIRECTORY_A.clone()).await.unwrap();
    handle.put(DIRECTORY_A.clone()).await.unwrap();
    handle.put(DIRECTORY_C.clone()).await.unwrap();
    let root_digest = handle.close().await.unwrap();
    assert_eq!(
        DIRECTORY_C.digest(),
        root_digest,
        "root digest should match"
    );

    // Ensure the returned closure only contains `DIRECTORY_A` once.
    assert_eq!(
        vec![Ok(DIRECTORY_C.clone()), Ok(DIRECTORY_A.clone())],
        directory_service
            .get_recursive(&DIRECTORY_C.digest())
            .collect::<Vec<_>>()
            .await
    )
}

/// Uploading A, then C (referring to A twice), then B (itself referring to A) should fail during close,
/// as B itself would be left unconnected.
#[apply(directory_services)]
#[tokio::test]
async fn upload_reject_unconnected(directory_service: impl DirectoryService) {
    let mut handle = directory_service.put_multiple_start();

    handle.put(DIRECTORY_A.clone()).await.unwrap();
    handle.put(DIRECTORY_C.clone()).await.unwrap();
    handle.put(DIRECTORY_B.clone()).await.unwrap();

    assert!(
        handle.close().await.is_err(),
        "closing handle should fail, as B would be left unconnected"
    );
}

/// Uploading a directory that refers to another directory not yet uploaded
/// should fail.
#[apply(directory_services)]
#[tokio::test]
async fn upload_reject_dangling_pointer(directory_service: impl DirectoryService) {
    let mut handle = directory_service.put_multiple_start();

    // We insert DIRECTORY_A on its own, to ensure the check runs for the
    // individual put_multiple session, not across the global DirectoryService
    // contents.
    directory_service.put(DIRECTORY_A.clone()).await.unwrap();

    // DIRECTORY_B refers to DIRECTORY_A, which is not uploaded with this handle.
    if handle.put(DIRECTORY_B.clone()).await.is_ok() {
        assert!(
            handle.close().await.is_err(),
            "when succeeding put, close must fail"
        )
    }
}

/// Try uploading a Directory failing its internal validation, ensure it gets
/// rejected.
#[apply(directory_services)]
#[tokio::test]
async fn upload_reject_failing_validation(directory_service: impl DirectoryService) {
    let broken_directory = Directory {
        symlinks: vec![proto::SymlinkNode {
            name: "".into(), // wrong!
            target: "doesntmatter".into(),
        }],
        ..Default::default()
    };
    assert!(broken_directory.validate().is_err());

    // Try to upload via single upload.
    assert!(
        directory_service
            .put(broken_directory.clone())
            .await
            .is_err(),
        "single upload must fail"
    );

    // Try to upload via put_multiple. We're a bit more permissive here, the
    // intermediate .put() might succeed, due to client-side bursting (in the
    // case of gRPC), but then the close MUST fail.
    let mut handle = directory_service.put_multiple_start();
    if handle.put(broken_directory).await.is_ok() {
        assert!(
            handle.close().await.is_err(),
            "when succeeding put, close must fail"
        )
    }
}

/// Try uploading a Directory that refers to a previously-uploaded directory.
/// Both pass their isolated validation, but the size field in the parent is wrong.
/// This should be rejected.
#[apply(directory_services)]
#[tokio::test]
async fn upload_reject_wrong_size(directory_service: impl DirectoryService) {
    let wrong_parent_directory = Directory {
        directories: vec![proto::DirectoryNode {
            name: "foo".into(),
            digest: DIRECTORY_A.digest().into(),
            size: DIRECTORY_A.size() + 42, // wrong!
        }],
        ..Default::default()
    };

    // Make sure isolated validation itself is ok
    assert!(wrong_parent_directory.validate().is_ok());

    // Now upload both. Ensure it either fails during the second put, or during
    // the close.
    let mut handle = directory_service.put_multiple_start();
    handle.put(DIRECTORY_A.clone()).await.unwrap();
    if handle.put(wrong_parent_directory).await.is_ok() {
        assert!(
            handle.close().await.is_err(),
            "when second put succeeds, close must fail"
        )
    }
}

// diff --git a/tvix/castore/src/directoryservice/tests/utils.rs b/tvix/castore/src/directoryservice/tests/utils.rs new file mode 100644 index 0000000000..0f706695ee --- /dev/null +++ b/tvix/castore/src/directoryservice/tests/utils.rs @@ -0,0 +1,46 @@
use crate::directoryservice::{DirectoryService, GRPCDirectoryService};
use crate::proto::directory_service_client::DirectoryServiceClient;
use crate::proto::GRPCDirectoryServiceWrapper;
use crate::{
    directoryservice::MemoryDirectoryService,
    proto::directory_service_server::DirectoryServiceServer,
};

use tonic::transport::{Endpoint, Server, Uri};

/// Constructs and returns a gRPC DirectoryService.
/// The server part is a [MemoryDirectoryService], exposed via the
/// [GRPCDirectoryServiceWrapper], and connected through a DuplexStream.
pub async fn make_grpc_directory_service_client() -> Box<dyn DirectoryService> {
    let (left, right) = tokio::io::duplex(64);

    // spin up a server, which will only connect once, to the left side.
    tokio::spawn(async {
        let directory_service =
            Box::<MemoryDirectoryService>::default() as Box<dyn DirectoryService>;

        let mut server = Server::builder();
        let router = server.add_service(DirectoryServiceServer::new(
            GRPCDirectoryServiceWrapper::new(directory_service),
        ));

        router
            .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(left))) 
            .await
    });

    // Create a client, connecting to the right side. The URI is unused.
    // `right` is wrapped in an Option so the connector closure can hand it out
    // by value; a second connection attempt would hit the `unwrap` below.
    let mut maybe_right = Some(right);
    Box::new(GRPCDirectoryService::from_client(
        DirectoryServiceClient::new(
            Endpoint::try_from("http://[::]:50051")
                .unwrap()
                .connect_with_connector(tower::service_fn(move |_: Uri| {
                    let right = maybe_right.take().unwrap();
                    async move { Ok::<_, std::io::Error>(right) }
                }))
                .await
                .unwrap(),
        ),
    ))
}

// diff --git a/tvix/castore/src/directoryservice/traverse.rs b/tvix/castore/src/directoryservice/traverse.rs new file mode 100644 index 0000000000..17a51ae2bb --- /dev/null +++ b/tvix/castore/src/directoryservice/traverse.rs @@ -0,0 +1,186 @@
use super::DirectoryService;
use crate::{
    proto::{node::Node, NamedNode},
    B3Digest, Error, Path,
};
use tracing::{instrument, warn};

/// This descends from a (root) node to the given (sub)path, returning the Node
/// at that path, or none, if there's nothing at that path.
///
/// Returns an error if a directory digest is malformed or a referenced
/// directory is missing from `directory_service`.
#[instrument(skip(directory_service, path), fields(%path))]
pub async fn descend_to<DS>(
    directory_service: DS,
    root_node: Node,
    path: impl AsRef<Path> + std::fmt::Display,
) -> Result<Option<Node>, Error>
where
    DS: AsRef<dyn DirectoryService>,
{
    let mut parent_node = root_node;
    for component in path.as_ref().components() {
        match parent_node {
            Node::File(_) | Node::Symlink(_) => {
                // There's still some path left, but the parent node is no directory.
                // This means the path doesn't exist, as we can't reach it.
                return Ok(None);
            }
            Node::Directory(directory_node) => {
                let digest: B3Digest = directory_node
                    .digest
                    .try_into()
                    .map_err(|_e| Error::StorageError("invalid digest length".to_string()))?;

                // fetch the linked node from the directory_service.
                let directory =
                    directory_service
                        .as_ref()
                        .get(&digest)
                        .await?
                        .ok_or_else(|| {
                            // If we didn't get the directory node that's linked, that's a store inconsistency, bail out!
                            warn!("directory {} does not exist", digest);

                            Error::StorageError(format!("directory {} does not exist", digest))
                        })?;

                // look for the component in the [Directory].
                // FUTUREWORK: as the nodes() iterator returns in a sorted fashion, we
                // could stop as soon as e.name is larger than the search string.
                if let Some(child_node) = directory.nodes().find(|n| n.get_name() == component) {
                    // child node found, update parent_node to that and continue.
                    parent_node = child_node;
                } else {
                    // child node not found means there's no such element inside the directory.
                    return Ok(None);
                }
            }
        }
    }

    // We traversed the entire path, so this must be the node.
    Ok(Some(parent_node))
}

#[cfg(test)]
mod tests {
    use crate::{
        directoryservice,
        fixtures::{DIRECTORY_COMPLICATED, DIRECTORY_WITH_KEEP},
        PathBuf,
    };

    use super::descend_to;

    #[tokio::test]
    async fn test_descend_to() {
        let directory_service = directoryservice::from_addr("memory://").await.unwrap();

        let mut handle = directory_service.put_multiple_start();
        handle
            .put(DIRECTORY_WITH_KEEP.clone())
            .await
            .expect("must succeed");
        handle
            .put(DIRECTORY_COMPLICATED.clone())
            .await
            .expect("must succeed");

        handle.close().await.expect("must upload");

        // construct the node for DIRECTORY_COMPLICATED
        let node_directory_complicated =
            crate::proto::node::Node::Directory(crate::proto::DirectoryNode {
                name: "doesntmatter".into(),
                digest: DIRECTORY_COMPLICATED.digest().into(),
                size: DIRECTORY_COMPLICATED.size(),
            });

        // construct the node for DIRECTORY_WITH_KEEP, as referenced from
        // DIRECTORY_COMPLICATED
        let node_directory_with_keep = crate::proto::node::Node::Directory(
            DIRECTORY_COMPLICATED.directories.first().unwrap().clone(),
        );

        // construct the node for the .keep file
        let node_file_keep =
            crate::proto::node::Node::File(DIRECTORY_WITH_KEEP.files.first().unwrap().clone());

        // traversal to an empty subpath should return the root node.
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(Some(node_directory_complicated.clone()), resp);
        }

        // traversal to `keep` should return the node for DIRECTORY_WITH_KEEP
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "keep".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(Some(node_directory_with_keep), resp);
        }

        // traversal to `keep/.keep` should return the node for the .keep file
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "keep/.keep".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(Some(node_file_keep.clone()), resp);
        }

        // traversal to `void` should return None (doesn't exist)
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "void".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(None, resp);
        }

        // traversal to `v/oid` should return None (doesn't exist)
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "v/oid".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(None, resp);
        }

        // traversal to `keep/.keep/foo` should return None (the path can't be
        // reached, as keep/.keep already is a file)
        {
            let resp = descend_to(
                &directory_service,
                node_directory_complicated.clone(),
                "keep/.keep/foo".parse::<PathBuf>().unwrap(),
            )
            .await
            .expect("must succeed");

            assert_eq!(None, resp);
        }
    }
}

// diff --git a/tvix/castore/src/directoryservice/utils.rs b/tvix/castore/src/directoryservice/utils.rs new file mode 100644 index 0000000000..a0ba395ecd --- /dev/null +++ b/tvix/castore/src/directoryservice/utils.rs @@ -0,0 +1,77 @@
use super::DirectoryService;
use crate::proto;
use crate::B3Digest;
+use crate::Error; +use async_stream::try_stream; +use futures::stream::BoxStream; +use std::collections::{HashSet, VecDeque}; +use tracing::instrument; +use tracing::warn; + +/// Traverses a [proto::Directory] from the root to the children. +/// +/// This is mostly BFS, but directories are only returned once. +#[instrument(skip(directory_service))] +pub fn traverse_directory<'a, DS: DirectoryService + 'static>( + directory_service: DS, + root_directory_digest: &B3Digest, +) -> BoxStream<'a, Result<proto::Directory, Error>> { + // The list of all directories that still need to be traversed. The next + // element is picked from the front, new elements are enqueued at the + // back. + let mut worklist_directory_digests: VecDeque<B3Digest> = + VecDeque::from([root_directory_digest.clone()]); + // The list of directory digests already sent to the consumer. + // We omit sending the same directories multiple times. + let mut sent_directory_digests: HashSet<B3Digest> = HashSet::new(); + + Box::pin(try_stream! { + while let Some(current_directory_digest) = worklist_directory_digests.pop_front() { + let current_directory = directory_service.get(¤t_directory_digest).await.map_err(|e| { + warn!("failed to look up directory"); + Error::StorageError(format!( + "unable to look up directory {}: {}", + current_directory_digest, e + )) + })?.ok_or_else(|| { + // if it's not there, we have an inconsistent store! + warn!("directory {} does not exist", current_directory_digest); + Error::StorageError(format!( + "directory {} does not exist", + current_directory_digest + )) + + })?; + + // validate, we don't want to send invalid directories. + current_directory.validate().map_err(|e| { + warn!("directory failed validation: {}", e.to_string()); + Error::StorageError(format!( + "invalid directory: {}", + current_directory_digest + )) + })?; + + // We're about to send this directory, so let's avoid sending it again if a + // descendant has it. 
+ sent_directory_digests.insert(current_directory_digest); + + // enqueue all child directory digests to the work queue, as + // long as they're not part of the worklist or already sent. + // This panics if the digest looks invalid, it's supposed to be checked first. + for child_directory_node in ¤t_directory.directories { + // TODO: propagate error + let child_digest: B3Digest = child_directory_node.digest.clone().try_into().unwrap(); + + if worklist_directory_digests.contains(&child_digest) + || sent_directory_digests.contains(&child_digest) + { + continue; + } + worklist_directory_digests.push_back(child_digest); + } + + yield current_directory; + } + }) +} diff --git a/tvix/castore/src/errors.rs b/tvix/castore/src/errors.rs new file mode 100644 index 0000000000..8343d0774a --- /dev/null +++ b/tvix/castore/src/errors.rs @@ -0,0 +1,54 @@ +use thiserror::Error; +use tokio::task::JoinError; +use tonic::Status; + +/// Errors related to communication with the store. +#[derive(Debug, Error, PartialEq)] +pub enum Error { + #[error("invalid request: {0}")] + InvalidRequest(String), + + #[error("internal storage error: {0}")] + StorageError(String), +} + +impl From<JoinError> for Error { + fn from(value: JoinError) -> Self { + Error::StorageError(value.to_string()) + } +} + +impl From<Error> for Status { + fn from(value: Error) -> Self { + match value { + Error::InvalidRequest(msg) => Status::invalid_argument(msg), + Error::StorageError(msg) => Status::data_loss(format!("storage error: {}", msg)), + } + } +} + +impl From<crate::tonic::Error> for Error { + fn from(value: crate::tonic::Error) -> Self { + Self::StorageError(value.to_string()) + } +} + +impl From<std::io::Error> for Error { + fn from(value: std::io::Error) -> Self { + if value.kind() == std::io::ErrorKind::InvalidInput { + Error::InvalidRequest(value.to_string()) + } else { + Error::StorageError(value.to_string()) + } + } +} + +// TODO: this should probably go somewhere else? 
/// Converts back into an I/O error: `InvalidRequest` maps to `InvalidInput`,
/// `StorageError` maps to `Other`.
impl From<Error> for std::io::Error {
    fn from(value: Error) -> Self {
        match value {
            Error::InvalidRequest(msg) => Self::new(std::io::ErrorKind::InvalidInput, msg),
            Error::StorageError(msg) => Self::new(std::io::ErrorKind::Other, msg),
        }
    }
}

// diff --git a/tvix/castore/src/fixtures.rs b/tvix/castore/src/fixtures.rs new file mode 100644 index 0000000000..a206d9b7dd --- /dev/null +++ b/tvix/castore/src/fixtures.rs @@ -0,0 +1,88 @@
use crate::{
    proto::{self, Directory, DirectoryNode, FileNode, SymlinkNode},
    B3Digest,
};
use lazy_static::lazy_static;

pub const HELLOWORLD_BLOB_CONTENTS: &[u8] = b"Hello World!";
pub const EMPTY_BLOB_CONTENTS: &[u8] = b"";

lazy_static! {
    // All-zero 32-byte digest.
    pub static ref DUMMY_DIGEST: B3Digest = {
        let u = [0u8; 32];
        (&u).into()
    };
    // Same as DUMMY_DIGEST, except the first byte is 0x10.
    pub static ref DUMMY_DIGEST_2: B3Digest = {
        let mut u = [0u8; 32];
        u[0] = 0x10;
        (&u).into()
    };
    pub static ref DUMMY_DATA_1: bytes::Bytes = vec![0x01, 0x02, 0x03].into();
    pub static ref DUMMY_DATA_2: bytes::Bytes = vec![0x04, 0x05].into();

    pub static ref HELLOWORLD_BLOB_DIGEST: B3Digest =
        blake3::hash(HELLOWORLD_BLOB_CONTENTS).as_bytes().into();
    pub static ref EMPTY_BLOB_DIGEST: B3Digest =
        blake3::hash(EMPTY_BLOB_CONTENTS).as_bytes().into();

    // 2 bytes
    pub static ref BLOB_A: bytes::Bytes = vec![0x00, 0x01].into();
    pub static ref BLOB_A_DIGEST: B3Digest = blake3::hash(&BLOB_A).as_bytes().into();

    // Roughly 1 MB: the 255 bytes 0x00..=0xfe repeated 4096 times (1_044_480 bytes)
    pub static ref BLOB_B: bytes::Bytes = (0..255).collect::<Vec<u8>>().repeat(4 * 1024).into();
    pub static ref BLOB_B_DIGEST: B3Digest = blake3::hash(&BLOB_B).as_bytes().into();

    // Directories
    // A directory holding a single empty `.keep` file.
    pub static ref DIRECTORY_WITH_KEEP: proto::Directory = proto::Directory {
        directories: vec![],
        files: vec![FileNode {
            name: b".keep".to_vec().into(),
            digest: EMPTY_BLOB_DIGEST.clone().into(),
            size: 0,
            executable: false,
        }],
        symlinks: vec![],
    };
    // One entry of each kind: a `keep` subdirectory (DIRECTORY_WITH_KEEP),
    // an empty `.keep` file and an `aa` symlink.
    pub static ref DIRECTORY_COMPLICATED: proto::Directory = proto::Directory {
        directories: vec![DirectoryNode {
            name: b"keep".to_vec().into(),
            digest: DIRECTORY_WITH_KEEP.digest().into(),
            size: DIRECTORY_WITH_KEEP.size(),
        }],
        files: vec![FileNode {
            name: b".keep".to_vec().into(),
            digest: EMPTY_BLOB_DIGEST.clone().into(),
            size: 0,
            executable: false,
        }],
        symlinks: vec![SymlinkNode {
            name: b"aa".to_vec().into(),
            target: b"/nix/store/somewhereelse".to_vec().into(),
        }],
    };
    // The empty directory.
    pub static ref DIRECTORY_A: Directory = Directory::default();
    // Refers to DIRECTORY_A once, as `a`.
    pub static ref DIRECTORY_B: Directory = Directory {
        directories: vec![DirectoryNode {
            name: b"a".to_vec().into(),
            digest: DIRECTORY_A.digest().into(),
            size: DIRECTORY_A.size(),
        }],
        ..Default::default()
    };
    // Refers to DIRECTORY_A twice, as `a` and `a'`.
    pub static ref DIRECTORY_C: Directory = Directory {
        directories: vec![
            DirectoryNode {
                name: b"a".to_vec().into(),
                digest: DIRECTORY_A.digest().into(),
                size: DIRECTORY_A.size(),
            },
            DirectoryNode {
                name: b"a'".to_vec().into(),
                digest: DIRECTORY_A.digest().into(),
                size: DIRECTORY_A.size(),
            }
        ],
        ..Default::default()
    };
}

// diff --git a/tvix/castore/src/fs/file_attr.rs b/tvix/castore/src/fs/file_attr.rs new file mode 100644 index 0000000000..2e0e70e3cd --- /dev/null +++ b/tvix/castore/src/fs/file_attr.rs @@ -0,0 +1,29 @@
#![allow(clippy::unnecessary_cast)] // libc::S_IFDIR is u32 on Linux and u16 on MacOS

use fuse_backend_rs::abi::fuse_abi::Attr;

/// The [Attr] describing the root
pub const ROOT_FILE_ATTR: Attr = Attr {
    ino: fuse_backend_rs::api::filesystem::ROOT_ID,
    size: 0,
    blksize: 1024,
    blocks: 0,
    // directory, with mode bits 0o555
    mode: libc::S_IFDIR as u32 | 0o555,
    atime: 0,
    mtime: 0,
    ctime: 0,
    atimensec: 0,
    mtimensec: 0,
    ctimensec: 0,
    nlink: 0,
    uid: 0,
    gid: 0,
    rdev: 0,
    flags: 0,
    #[cfg(target_os = "macos")]
    crtime: 0,
    #[cfg(target_os = "macos")]
    crtimensec: 0,
    #[cfg(target_os = "macos")]
    padding: 0,
};

// diff --git a/tvix/castore/src/fs/fuse/mod.rs b/tvix/castore/src/fs/fuse/mod.rs new file mode 100644 index 0000000000..94b73d422a ---
/dev/null +++ b/tvix/castore/src/fs/fuse/mod.rs @@ -0,0 +1,123 @@ +use std::{io, path::Path, sync::Arc, thread}; + +use fuse_backend_rs::{api::filesystem::FileSystem, transport::FuseSession}; +use tracing::{error, instrument}; + +#[cfg(test)] +mod tests; + +struct FuseServer<FS> +where + FS: FileSystem + Sync + Send, +{ + server: Arc<fuse_backend_rs::api::server::Server<Arc<FS>>>, + channel: fuse_backend_rs::transport::FuseChannel, +} + +#[cfg(target_os = "macos")] +const BADFD: libc::c_int = libc::EBADF; +#[cfg(target_os = "linux")] +const BADFD: libc::c_int = libc::EBADFD; + +impl<FS> FuseServer<FS> +where + FS: FileSystem + Sync + Send, +{ + fn start(&mut self) -> io::Result<()> { + while let Some((reader, writer)) = self + .channel + .get_request() + .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))? + { + if let Err(e) = self + .server + .handle_message(reader, writer.into(), None, None) + { + match e { + // This indicates the session has been shut down. + fuse_backend_rs::Error::EncodeMessage(e) if e.raw_os_error() == Some(BADFD) => { + break; + } + error => { + error!(?error, "failed to handle fuse request"); + continue; + } + } + } + } + Ok(()) + } +} + +pub struct FuseDaemon { + session: FuseSession, + threads: Vec<thread::JoinHandle<()>>, +} + +impl FuseDaemon { + #[instrument(skip(fs, mountpoint), fields(mountpoint=?mountpoint), err)] + pub fn new<FS, P>( + fs: FS, + mountpoint: P, + threads: usize, + allow_other: bool, + ) -> Result<Self, io::Error> + where + FS: FileSystem + Sync + Send + 'static, + P: AsRef<Path> + std::fmt::Debug, + { + let server = Arc::new(fuse_backend_rs::api::server::Server::new(Arc::new(fs))); + + let mut session = FuseSession::new(mountpoint.as_ref(), "tvix-store", "", true) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + + #[cfg(target_os = "linux")] + session.set_allow_other(allow_other); + session + .mount() + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + let mut 
join_handles = Vec::with_capacity(threads); + for _ in 0..threads { + let mut server = FuseServer { + server: server.clone(), + channel: session + .new_channel() + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?, + }; + let join_handle = thread::Builder::new() + .name("fuse_server".to_string()) + .spawn(move || { + let _ = server.start(); + })?; + join_handles.push(join_handle); + } + + Ok(FuseDaemon { + session, + threads: join_handles, + }) + } + + #[instrument(skip_all, err)] + pub fn unmount(&mut self) -> Result<(), io::Error> { + self.session + .umount() + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + + for thread in self.threads.drain(..) { + thread.join().map_err(|_| { + io::Error::new(io::ErrorKind::Other, "failed to join fuse server thread") + })?; + } + + Ok(()) + } +} + +impl Drop for FuseDaemon { + fn drop(&mut self) { + if let Err(error) = self.unmount() { + error!(?error, "failed to unmount fuse filesystem") + } + } +} diff --git a/tvix/castore/src/fs/fuse/tests.rs b/tvix/castore/src/fs/fuse/tests.rs new file mode 100644 index 0000000000..bb321f5888 --- /dev/null +++ b/tvix/castore/src/fs/fuse/tests.rs @@ -0,0 +1,1245 @@ +use bstr::ByteSlice; +use bytes::Bytes; +use std::{ + collections::BTreeMap, + ffi::{OsStr, OsString}, + io::{self, Cursor}, + os::unix::{ffi::OsStrExt, fs::MetadataExt}, + path::Path, + sync::Arc, +}; +use tempfile::TempDir; +use tokio_stream::{wrappers::ReadDirStream, StreamExt}; + +use super::FuseDaemon; +use crate::fs::{TvixStoreFs, XATTR_NAME_BLOB_DIGEST, XATTR_NAME_DIRECTORY_DIGEST}; +use crate::proto as castorepb; +use crate::proto::node::Node; +use crate::{ + blobservice::{BlobService, MemoryBlobService}, + directoryservice::{DirectoryService, MemoryDirectoryService}, + fixtures, +}; + +const BLOB_A_NAME: &str = "00000000000000000000000000000000-test"; +const BLOB_B_NAME: &str = "55555555555555555555555555555555-test"; +const HELLOWORLD_BLOB_NAME: &str = 
"66666666666666666666666666666666-test"; +const SYMLINK_NAME: &str = "11111111111111111111111111111111-test"; +const SYMLINK_NAME2: &str = "44444444444444444444444444444444-test"; +const DIRECTORY_WITH_KEEP_NAME: &str = "22222222222222222222222222222222-test"; +const DIRECTORY_COMPLICATED_NAME: &str = "33333333333333333333333333333333-test"; + +fn gen_svcs() -> (Arc<dyn BlobService>, Arc<dyn DirectoryService>) { + ( + Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>, + Arc::new(MemoryDirectoryService::default()) as Arc<dyn DirectoryService>, + ) +} + +fn do_mount<P: AsRef<Path>, BS, DS>( + blob_service: BS, + directory_service: DS, + root_nodes: BTreeMap<bytes::Bytes, Node>, + mountpoint: P, + list_root: bool, + show_xattr: bool, +) -> io::Result<FuseDaemon> +where + BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static, + DS: AsRef<dyn DirectoryService> + Send + Sync + Clone + 'static, +{ + let fs = TvixStoreFs::new( + blob_service, + directory_service, + Arc::new(root_nodes), + list_root, + show_xattr, + ); + FuseDaemon::new(Arc::new(fs), mountpoint.as_ref(), 4, false) +} + +async fn populate_blob_a( + blob_service: &Arc<dyn BlobService>, + root_nodes: &mut BTreeMap<Bytes, Node>, +) { + let mut bw = blob_service.open_write().await; + tokio::io::copy(&mut Cursor::new(fixtures::BLOB_A.to_vec()), &mut bw) + .await + .expect("must succeed uploading"); + bw.close().await.expect("must succeed closing"); + + root_nodes.insert( + BLOB_A_NAME.into(), + Node::File(castorepb::FileNode { + name: BLOB_A_NAME.into(), + digest: fixtures::BLOB_A_DIGEST.clone().into(), + size: fixtures::BLOB_A.len() as u64, + executable: false, + }), + ); +} + +async fn populate_blob_b( + blob_service: &Arc<dyn BlobService>, + root_nodes: &mut BTreeMap<Bytes, Node>, +) { + let mut bw = blob_service.open_write().await; + tokio::io::copy(&mut Cursor::new(fixtures::BLOB_B.to_vec()), &mut bw) + .await + .expect("must succeed uploading"); + bw.close().await.expect("must succeed 
closing"); + + root_nodes.insert( + BLOB_B_NAME.into(), + Node::File(castorepb::FileNode { + name: BLOB_B_NAME.into(), + digest: fixtures::BLOB_B_DIGEST.clone().into(), + size: fixtures::BLOB_B.len() as u64, + executable: false, + }), + ); +} + +/// adds a blob containing helloworld and marks it as executable +async fn populate_blob_helloworld( + blob_service: &Arc<dyn BlobService>, + root_nodes: &mut BTreeMap<Bytes, Node>, +) { + let mut bw = blob_service.open_write().await; + tokio::io::copy( + &mut Cursor::new(fixtures::HELLOWORLD_BLOB_CONTENTS.to_vec()), + &mut bw, + ) + .await + .expect("must succeed uploading"); + bw.close().await.expect("must succeed closing"); + + root_nodes.insert( + HELLOWORLD_BLOB_NAME.into(), + Node::File(castorepb::FileNode { + name: HELLOWORLD_BLOB_NAME.into(), + digest: fixtures::HELLOWORLD_BLOB_DIGEST.clone().into(), + size: fixtures::HELLOWORLD_BLOB_CONTENTS.len() as u64, + executable: true, + }), + ); +} + +async fn populate_symlink(root_nodes: &mut BTreeMap<Bytes, Node>) { + root_nodes.insert( + SYMLINK_NAME.into(), + Node::Symlink(castorepb::SymlinkNode { + name: SYMLINK_NAME.into(), + target: BLOB_A_NAME.into(), + }), + ); +} + +/// This writes a symlink pointing to /nix/store/somewhereelse, +/// which is the same symlink target as "aa" inside DIRECTORY_COMPLICATED. 
+async fn populate_symlink2(root_nodes: &mut BTreeMap<Bytes, Node>) { + root_nodes.insert( + SYMLINK_NAME2.into(), + Node::Symlink(castorepb::SymlinkNode { + name: SYMLINK_NAME2.into(), + target: "/nix/store/somewhereelse".into(), + }), + ); +} + +async fn populate_directory_with_keep( + blob_service: &Arc<dyn BlobService>, + directory_service: &Arc<dyn DirectoryService>, + root_nodes: &mut BTreeMap<Bytes, Node>, +) { + // upload empty blob + let mut bw = blob_service.open_write().await; + assert_eq!( + fixtures::EMPTY_BLOB_DIGEST.as_slice(), + bw.close().await.expect("must succeed closing").as_slice(), + ); + + // upload directory + directory_service + .put(fixtures::DIRECTORY_WITH_KEEP.clone()) + .await + .expect("must succeed uploading"); + + root_nodes.insert( + DIRECTORY_WITH_KEEP_NAME.into(), + castorepb::node::Node::Directory(castorepb::DirectoryNode { + name: DIRECTORY_WITH_KEEP_NAME.into(), + digest: fixtures::DIRECTORY_WITH_KEEP.digest().into(), + size: fixtures::DIRECTORY_WITH_KEEP.size(), + }), + ); +} + +/// Create a root node for DIRECTORY_WITH_KEEP, but don't upload the Directory +/// itself. +async fn populate_directorynode_without_directory(root_nodes: &mut BTreeMap<Bytes, Node>) { + root_nodes.insert( + DIRECTORY_WITH_KEEP_NAME.into(), + castorepb::node::Node::Directory(castorepb::DirectoryNode { + name: DIRECTORY_WITH_KEEP_NAME.into(), + digest: fixtures::DIRECTORY_WITH_KEEP.digest().into(), + size: fixtures::DIRECTORY_WITH_KEEP.size(), + }), + ); +} + +/// Insert a root FileNode for BLOB_A, but don't upload the blob it points to. 
+async fn populate_filenode_without_blob(root_nodes: &mut BTreeMap<Bytes, Node>) { + root_nodes.insert( + BLOB_A_NAME.into(), + Node::File(castorepb::FileNode { + name: BLOB_A_NAME.into(), + digest: fixtures::BLOB_A_DIGEST.clone().into(), + size: fixtures::BLOB_A.len() as u64, + executable: false, + }), + ); +} + +async fn populate_directory_complicated( + blob_service: &Arc<dyn BlobService>, + directory_service: &Arc<dyn DirectoryService>, + root_nodes: &mut BTreeMap<Bytes, Node>, +) { + // upload empty blob + let mut bw = blob_service.open_write().await; + assert_eq!( + fixtures::EMPTY_BLOB_DIGEST.as_slice(), + bw.close().await.expect("must succeed closing").as_slice(), + ); + + // upload inner directory + directory_service + .put(fixtures::DIRECTORY_WITH_KEEP.clone()) + .await + .expect("must succeed uploading"); + + // upload parent directory + directory_service + .put(fixtures::DIRECTORY_COMPLICATED.clone()) + .await + .expect("must succeed uploading"); + + root_nodes.insert( + DIRECTORY_COMPLICATED_NAME.into(), + Node::Directory(castorepb::DirectoryNode { + name: DIRECTORY_COMPLICATED_NAME.into(), + digest: fixtures::DIRECTORY_COMPLICATED.digest().into(), + size: fixtures::DIRECTORY_COMPLICATED.size(), + }), + ); +} + +/// Ensure mounting itself doesn't fail +#[tokio::test] +async fn mount() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + BTreeMap::default(), + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + fuse_daemon.unmount().expect("unmount"); +} +/// Ensure listing the root isn't allowed +#[tokio::test] +async fn root() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if 
!std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + BTreeMap::default(), + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + { + // read_dir fails (as opendir fails). + let err = tokio::fs::read_dir(tmpdir).await.expect_err("must fail"); + assert_eq!(std::io::ErrorKind::PermissionDenied, err.kind()); + } + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure listing the root is allowed if configured explicitly +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn root_with_listing() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + true, /* allow listing */ + false, + ) + .expect("must succeed"); + + { + // read_dir succeeds, and the first entry it yields is the BLOB_A file. 
+ let mut it = ReadDirStream::new(tokio::fs::read_dir(tmpdir).await.expect("must succeed")); + + let e = it + .next() + .await + .expect("must be some") + .expect("must succeed"); + + let metadata = e.metadata().await.expect("must succeed"); + assert!(metadata.is_file()); + assert!(metadata.permissions().readonly()); + assert_eq!(fixtures::BLOB_A.len() as u64, metadata.len()); + } + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure we can stat a file at the root +#[tokio::test] +async fn stat_file_at_root() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(BLOB_A_NAME); + + // peek at the file metadata + let metadata = tokio::fs::metadata(p).await.expect("must succeed"); + + assert!(metadata.is_file()); + assert!(metadata.permissions().readonly()); + assert_eq!(fixtures::BLOB_A.len() as u64, metadata.len()); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure we can read a file at the root +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn read_file_at_root() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + 
false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(BLOB_A_NAME); + + // read the file contents + let data = tokio::fs::read(p).await.expect("must succeed"); + + // ensure size and contents match + assert_eq!(fixtures::BLOB_A.len(), data.len()); + assert_eq!(fixtures::BLOB_A.to_vec(), data); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure we can read a large file at the root +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn read_large_file_at_root() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_b(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(BLOB_B_NAME); + { + // peek at the file metadata + let metadata = tokio::fs::metadata(&p).await.expect("must succeed"); + + assert!(metadata.is_file()); + assert!(metadata.permissions().readonly()); + assert_eq!(fixtures::BLOB_B.len() as u64, metadata.len()); + } + + // read the file contents + let data = tokio::fs::read(p).await.expect("must succeed"); + + // ensure size and contents match + assert_eq!(fixtures::BLOB_B.len(), data.len()); + assert_eq!(fixtures::BLOB_B.to_vec(), data); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Read the target of a symlink +#[tokio::test] +async fn symlink_readlink() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = 
BTreeMap::default(); + + populate_symlink(&mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(SYMLINK_NAME); + + let target = tokio::fs::read_link(&p).await.expect("must succeed"); + assert_eq!(BLOB_A_NAME, target.to_str().unwrap()); + + // peek at the file metadata, which follows symlinks. + // this must fail, as we didn't populate the target. + let e = tokio::fs::metadata(&p).await.expect_err("must fail"); + assert_eq!(std::io::ErrorKind::NotFound, e.kind()); + + // peeking at the file metadata without following symlinks will succeed. + let metadata = tokio::fs::symlink_metadata(&p).await.expect("must succeed"); + assert!(metadata.is_symlink()); + + // reading from the symlink (which follows) will fail, because the target doesn't exist. + let e = tokio::fs::read(p).await.expect_err("must fail"); + assert_eq!(std::io::ErrorKind::NotFound, e.kind()); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Read and stat a regular file through a symlink pointing to it. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn read_stat_through_symlink() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + populate_symlink(&mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p_symlink = tmpdir.path().join(SYMLINK_NAME); + let p_blob = tmpdir.path().join(BLOB_A_NAME); + + // peek at the file metadata, which follows symlinks. 
+ // this must now return the same metadata as when statting at the target directly. + let metadata_symlink = tokio::fs::metadata(&p_symlink).await.expect("must succeed"); + let metadata_blob = tokio::fs::metadata(&p_blob).await.expect("must succeed"); + assert_eq!(metadata_blob.file_type(), metadata_symlink.file_type()); + assert_eq!(metadata_blob.len(), metadata_symlink.len()); + + // reading from the symlink (which follows) will return the same data as if + // we were reading from the file directly. + assert_eq!( + tokio::fs::read(p_blob).await.expect("must succeed"), + tokio::fs::read(p_symlink).await.expect("must succeed"), + ); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Read a directory in the root, and validate some attributes. +#[tokio::test] +async fn read_stat_directory() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME); + + // peek at the metadata of the directory + let metadata = tokio::fs::metadata(p).await.expect("must succeed"); + assert!(metadata.is_dir()); + assert!(metadata.permissions().readonly()); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Read a directory and file in the root, and ensure the xattrs expose blob or +/// directory digests. 
+#[tokio::test] +async fn xattr() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await; + populate_blob_a(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + true, /* support xattr */ + ) + .expect("must succeed"); + + // peek at the directory + { + let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME); + + let xattr_names: Vec<OsString> = xattr::list(&p).expect("must succeed").collect(); + // There should be 1 key, XATTR_NAME_DIRECTORY_DIGEST. + assert_eq!(1, xattr_names.len(), "there should be 1 xattr name"); + assert_eq!( + XATTR_NAME_DIRECTORY_DIGEST, + xattr_names.first().unwrap().as_encoded_bytes() + ); + + // The key should equal to the string-formatted b3 digest. + let val = xattr::get(&p, OsStr::from_bytes(XATTR_NAME_DIRECTORY_DIGEST)) + .expect("must succeed") + .expect("must be some"); + assert_eq!( + fixtures::DIRECTORY_WITH_KEEP + .digest() + .to_string() + .as_bytes() + .as_bstr(), + val.as_bstr() + ); + + // Reading another xattr key is gonna return None. + let val = xattr::get(&p, OsStr::from_bytes(b"user.cheesecake")).expect("must succeed"); + assert_eq!(None, val); + } + // peek at the file + { + let p = tmpdir.path().join(BLOB_A_NAME); + + let xattr_names: Vec<OsString> = xattr::list(&p).expect("must succeed").collect(); + // There should be 1 key, XATTR_NAME_BLOB_DIGEST. + assert_eq!(1, xattr_names.len(), "there should be 1 xattr name"); + assert_eq!( + XATTR_NAME_BLOB_DIGEST, + xattr_names.first().unwrap().as_encoded_bytes() + ); + + // The key should equal to the string-formatted b3 digest. 
+ let val = xattr::get(&p, OsStr::from_bytes(XATTR_NAME_BLOB_DIGEST)) + .expect("must succeed") + .expect("must be some"); + assert_eq!( + fixtures::BLOB_A_DIGEST.to_string().as_bytes().as_bstr(), + val.as_bstr() + ); + + // Reading another xattr key is gonna return None. + let val = xattr::get(&p, OsStr::from_bytes(b"user.cheesecake")).expect("must succeed"); + assert_eq!(None, val); + } + + fuse_daemon.unmount().expect("unmount"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +/// Read a blob inside a directory. This ensures we successfully populate directory data. +async fn read_blob_inside_dir() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME).join(".keep"); + + // peek at metadata. + let metadata = tokio::fs::metadata(&p).await.expect("must succeed"); + assert!(metadata.is_file()); + assert!(metadata.permissions().readonly()); + + // read from it + let data = tokio::fs::read(&p).await.expect("must succeed"); + assert_eq!(fixtures::EMPTY_BLOB_CONTENTS.to_vec(), data); + + fuse_daemon.unmount().expect("unmount"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +/// Read a blob inside a directory inside a directory. This ensures we properly +/// populate directories as we traverse down the structure. 
+async fn read_blob_deep_inside_dir() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir + .path() + .join(DIRECTORY_COMPLICATED_NAME) + .join("keep") + .join(".keep"); + + // peek at metadata. + let metadata = tokio::fs::metadata(&p).await.expect("must succeed"); + assert!(metadata.is_file()); + assert!(metadata.permissions().readonly()); + + // read from it + let data = tokio::fs::read(&p).await.expect("must succeed"); + assert_eq!(fixtures::EMPTY_BLOB_CONTENTS.to_vec(), data); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure readdir works. +#[tokio::test] +async fn readdir() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME); + + { + // read_dir should succeed. 
Collect all elements + let elements: Vec<_> = + ReadDirStream::new(tokio::fs::read_dir(p).await.expect("must succeed")) + .map(|e| e.expect("must not be err")) + .collect() + .await; + + assert_eq!(3, elements.len(), "number of elements should be 3"); // rust skips . and .. + + // We explicitly look at specific positions here, because we always emit + // them ordered. + + // ".keep", 0 byte file. + let e = &elements[0]; + assert_eq!(".keep", e.file_name()); + assert!(e.file_type().await.expect("must succeed").is_file()); + assert_eq!(0, e.metadata().await.expect("must succeed").len()); + + // "aa", symlink. + let e = &elements[1]; + assert_eq!("aa", e.file_name()); + assert!(e.file_type().await.expect("must succeed").is_symlink()); + + // "keep", directory + let e = &elements[2]; + assert_eq!("keep", e.file_name()); + assert!(e.file_type().await.expect("must succeed").is_dir()); + } + + fuse_daemon.unmount().expect("unmount"); +} + +#[tokio::test] +/// Do a readdir deeper inside a directory, without doing readdir or stat in the parent directory. +async fn readdir_deep() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("keep"); + + { + // read_dir should succeed. 
Collect all elements + let elements: Vec<_> = + ReadDirStream::new(tokio::fs::read_dir(p).await.expect("must succeed")) + .map(|e| e.expect("must not be err")) + .collect() + .await; + + assert_eq!(1, elements.len(), "number of elements should be 1"); // rust skips . and .. + + // ".keep", 0 byte file. + let e = &elements[0]; + assert_eq!(".keep", e.file_name()); + assert!(e.file_type().await.expect("must succeed").is_file()); + assert_eq!(0, e.metadata().await.expect("must succeed").len()); + } + + fuse_daemon.unmount().expect("unmount"); +} + +/// Check attributes match how they show up in /nix/store normally. +#[tokio::test] +async fn check_attributes() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await; + populate_symlink(&mut root_nodes).await; + populate_blob_helloworld(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p_file = tmpdir.path().join(BLOB_A_NAME); + let p_directory = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME); + let p_symlink = tmpdir.path().join(SYMLINK_NAME); + let p_executable_file = tmpdir.path().join(HELLOWORLD_BLOB_NAME); + + // peek at metadata. We use symlink_metadata to ensure we don't traverse a symlink by accident. 
+ let metadata_file = tokio::fs::symlink_metadata(&p_file) + .await + .expect("must succeed"); + let metadata_executable_file = tokio::fs::symlink_metadata(&p_executable_file) + .await + .expect("must succeed"); + let metadata_directory = tokio::fs::symlink_metadata(&p_directory) + .await + .expect("must succeed"); + let metadata_symlink = tokio::fs::symlink_metadata(&p_symlink) + .await + .expect("must succeed"); + + // modes should match. We & with 0o777 to remove any higher bits. + assert_eq!(0o444, metadata_file.mode() & 0o777); + assert_eq!(0o555, metadata_executable_file.mode() & 0o777); + assert_eq!(0o555, metadata_directory.mode() & 0o777); + assert_eq!(0o444, metadata_symlink.mode() & 0o777); + + // files should have the correct filesize + assert_eq!(fixtures::BLOB_A.len() as u64, metadata_file.len()); + // directories should have their "size" as filesize + assert_eq!( + { fixtures::DIRECTORY_WITH_KEEP.size() }, + metadata_directory.size() + ); + + for metadata in &[&metadata_file, &metadata_directory, &metadata_symlink] { + // uid and gid should be 0. + assert_eq!(0, metadata.uid()); + assert_eq!(0, metadata.gid()); + + // all times should be set to the unix epoch. + assert_eq!(0, metadata.atime()); + assert_eq!(0, metadata.mtime()); + assert_eq!(0, metadata.ctime()); + // crtime seems MacOS only + } + + fuse_daemon.unmount().expect("unmount"); +} + +#[tokio::test] +/// Ensure we allocate the same inodes for the same directory contents. +/// $DIRECTORY_COMPLICATED_NAME/keep contains the same data as $DIRECTORY_WITH_KEEP. 
+async fn compare_inodes_directories() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await; + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p_dir_with_keep = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME); + let p_sibling_dir = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("keep"); + + // peek at metadata. + assert_eq!( + tokio::fs::metadata(p_dir_with_keep) + .await + .expect("must succeed") + .ino(), + tokio::fs::metadata(p_sibling_dir) + .await + .expect("must succeed") + .ino() + ); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure we allocate the same inodes for the same directory contents. 
+/// $DIRECTORY_COMPLICATED_NAME/keep/,keep contains the same data as $DIRECTORY_COMPLICATED_NAME/.keep +#[tokio::test] +async fn compare_inodes_files() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p_keep1 = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join(".keep"); + let p_keep2 = tmpdir + .path() + .join(DIRECTORY_COMPLICATED_NAME) + .join("keep") + .join(".keep"); + + // peek at metadata. + assert_eq!( + tokio::fs::metadata(p_keep1) + .await + .expect("must succeed") + .ino(), + tokio::fs::metadata(p_keep2) + .await + .expect("must succeed") + .ino() + ); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Ensure we allocate the same inode for symlinks pointing to the same targets. +/// $DIRECTORY_COMPLICATED_NAME/aa points to the same target as SYMLINK_NAME2. 
+#[tokio::test] +async fn compare_inodes_symlinks() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await; + populate_symlink2(&mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p1 = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("aa"); + let p2 = tmpdir.path().join(SYMLINK_NAME2); + + // peek at metadata. + assert_eq!( + tokio::fs::symlink_metadata(p1) + .await + .expect("must succeed") + .ino(), + tokio::fs::symlink_metadata(p2) + .await + .expect("must succeed") + .ino() + ); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Check we match paths exactly. 
+#[tokio::test] +async fn read_wrong_paths_in_root() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_blob_a(&blob_service, &mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + // wrong name + assert!( + tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-tes")) + .await + .is_err() + ); + + // invalid hash + assert!( + tokio::fs::metadata(tmpdir.path().join("0000000000000000000000000000000-test")) + .await + .is_err() + ); + + // right name, must exist + assert!( + tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-test")) + .await + .is_ok() + ); + + // now wrong name with right hash still may not exist + assert!( + tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-tes")) + .await + .is_err() + ); + + fuse_daemon.unmount().expect("unmount"); +} + +/// Make sure writes are not allowed +#[tokio::test] +async fn disallow_writes() { + // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let root_nodes = BTreeMap::default(); + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(BLOB_A_NAME); + let e = tokio::fs::File::create(p).await.expect_err("must fail"); + + assert_eq!(Some(libc::EROFS), e.raw_os_error()); + + fuse_daemon.unmount().expect("unmount"); +} + 
+#[tokio::test] +/// Ensure we get an IO error if the directory service does not have the Directory object. +async fn missing_directory() { + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_directorynode_without_directory(&mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME); + + { + // `stat` on the path should succeed, because it doesn't trigger the directory request. + tokio::fs::metadata(&p).await.expect("must succeed"); + + // However, calling either `readdir` or `stat` on a child should fail with an IO error. + // It fails when trying to pull the first entry, because we don't implement opendir separately + ReadDirStream::new(tokio::fs::read_dir(&p).await.unwrap()) + .next() + .await + .expect("must be some") + .expect_err("must be err"); + + // rust currently sets e.kind() to Uncategorized, which isn't very + // helpful, so we don't look at the error more closely than that.. 
+ tokio::fs::metadata(p.join(".keep")) + .await + .expect_err("must fail"); + } + + fuse_daemon.unmount().expect("unmount"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +/// Ensure we get an IO error if the blob service does not have the blob +async fn missing_blob() { + if !std::path::Path::new("/dev/fuse").exists() { + eprintln!("skipping test"); + return; + } + let tmpdir = TempDir::new().unwrap(); + + let (blob_service, directory_service) = gen_svcs(); + let mut root_nodes = BTreeMap::default(); + + populate_filenode_without_blob(&mut root_nodes).await; + + let mut fuse_daemon = do_mount( + blob_service, + directory_service, + root_nodes, + tmpdir.path(), + false, + false, + ) + .expect("must succeed"); + + let p = tmpdir.path().join(BLOB_A_NAME); + + { + // `stat` on the blob should succeed, because it doesn't trigger a request to the blob service. + tokio::fs::metadata(&p).await.expect("must succeed"); + + // However, calling read on the blob should fail. + // rust currently sets e.kind() to Uncategorized, which isn't very + // helpful, so we don't look at the error more closely than that.. + tokio::fs::read(p).await.expect_err("must fail"); + } + + fuse_daemon.unmount().expect("unmount"); +} diff --git a/tvix/castore/src/fs/inode_tracker.rs b/tvix/castore/src/fs/inode_tracker.rs new file mode 100644 index 0000000000..4a8283b6b1 --- /dev/null +++ b/tvix/castore/src/fs/inode_tracker.rs @@ -0,0 +1,207 @@ +use std::{collections::HashMap, sync::Arc}; + +use super::inodes::{DirectoryInodeData, InodeData}; +use crate::B3Digest; + +/// InodeTracker keeps track of inodes, stores data being these inodes and deals +/// with inode allocation. 
+pub struct InodeTracker { + data: HashMap<u64, Arc<InodeData>>, + + // lookup table for blobs by their B3Digest + blob_digest_to_inode: HashMap<B3Digest, u64>, + + // lookup table for symlinks by their target + symlink_target_to_inode: HashMap<bytes::Bytes, u64>, + + // lookup table for directories by their B3Digest. + // Note the corresponding directory may not be present in data yet. + directory_digest_to_inode: HashMap<B3Digest, u64>, + + // the next inode to allocate + next_inode: u64, +} + +impl Default for InodeTracker { + fn default() -> Self { + Self { + data: Default::default(), + + blob_digest_to_inode: Default::default(), + symlink_target_to_inode: Default::default(), + directory_digest_to_inode: Default::default(), + + next_inode: 2, + } + } +} + +impl InodeTracker { + // Retrieves data for a given inode, if it exists. + pub fn get(&self, ino: u64) -> Option<Arc<InodeData>> { + self.data.get(&ino).cloned() + } + + // Replaces data for a given inode. + // Panics if the inode doesn't already exist. + pub fn replace(&mut self, ino: u64, data: Arc<InodeData>) { + if self.data.insert(ino, data).is_none() { + panic!("replace called on unknown inode"); + } + } + + // Stores data and returns the inode for it. + // In case an inode has already been allocated for the same data, that inode + // is returned, otherwise a new one is allocated. + // In case data is a [InodeData::Directory], inodes for all items are looked + // up + pub fn put(&mut self, data: InodeData) -> u64 { + match data { + InodeData::Regular(ref digest, _, _) => { + match self.blob_digest_to_inode.get(digest) { + Some(found_ino) => { + // We already have it, return the inode. + *found_ino + } + None => self.insert_and_increment(data), + } + } + InodeData::Symlink(ref target) => { + match self.symlink_target_to_inode.get(target) { + Some(found_ino) => { + // We already have it, return the inode. 
+ *found_ino + } + None => self.insert_and_increment(data), + } + } + InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _size)) => { + // check the lookup table if the B3Digest is known. + match self.directory_digest_to_inode.get(digest) { + Some(found_ino) => { + // We already have it, return the inode. + *found_ino + } + None => { + // insert and return the inode + self.insert_and_increment(data) + } + } + } + // Inserting [DirectoryInodeData::Populated] doesn't normally happen, + // only via [replace]. + InodeData::Directory(DirectoryInodeData::Populated(..)) => { + unreachable!("should never be called with DirectoryInodeData::Populated") + } + } + } + + // Inserts the data and returns the inode it was stored at, while + // incrementing next_inode. + fn insert_and_increment(&mut self, data: InodeData) -> u64 { + let ino = self.next_inode; + // insert into lookup tables + match data { + InodeData::Regular(ref digest, _, _) => { + self.blob_digest_to_inode.insert(digest.clone(), ino); + } + InodeData::Symlink(ref target) => { + self.symlink_target_to_inode.insert(target.clone(), ino); + } + InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _size)) => { + self.directory_digest_to_inode.insert(digest.clone(), ino); + } + // This is currently not used outside test fixtures. + // Usually a [DirectoryInodeData::Sparse] is inserted and later + // "upgraded" with more data. + // However, as a future optimization, a lookup for a PathInfo could trigger a + // [DirectoryService::get_recursive()] request that "forks into + // background" and prepopulates all Directories in a closure. + InodeData::Directory(DirectoryInodeData::Populated(ref digest, _)) => { + self.directory_digest_to_inode.insert(digest.clone(), ino); + } + } + // Insert data + self.data.insert(ino, Arc::new(data)); + + // increment inode counter and return old inode. 
+ self.next_inode += 1; + ino + } +} + +#[cfg(test)] +mod tests { + use crate::fixtures; + + use super::InodeData; + use super::InodeTracker; + + /// Getting something non-existent should be none + #[test] + fn get_nonexistent() { + let inode_tracker = InodeTracker::default(); + assert!(inode_tracker.get(1).is_none()); + } + + /// Put of a regular file should allocate a uid, which should be the same when inserting again. + #[test] + fn put_regular() { + let mut inode_tracker = InodeTracker::default(); + let f = InodeData::Regular( + fixtures::BLOB_A_DIGEST.clone(), + fixtures::BLOB_A.len() as u64, + false, + ); + + // put it in + let ino = inode_tracker.put(f.clone()); + + // a get should return the right data + let data = inode_tracker.get(ino).expect("must be some"); + match *data { + InodeData::Regular(ref digest, _, _) => { + assert_eq!(&fixtures::BLOB_A_DIGEST.clone(), digest); + } + InodeData::Symlink(_) | InodeData::Directory(..) => panic!("wrong type"), + } + + // another put should return the same ino + assert_eq!(ino, inode_tracker.put(f)); + + // inserting another file should return a different ino + assert_ne!( + ino, + inode_tracker.put(InodeData::Regular( + fixtures::BLOB_B_DIGEST.clone(), + fixtures::BLOB_B.len() as u64, + false, + )) + ); + } + + // Put of a symlink should allocate a uid, which should be the same when inserting again + #[test] + fn put_symlink() { + let mut inode_tracker = InodeTracker::default(); + let f = InodeData::Symlink("target".into()); + + // put it in + let ino = inode_tracker.put(f.clone()); + + // a get should return the right data + let data = inode_tracker.get(ino).expect("must be some"); + match *data { + InodeData::Symlink(ref target) => { + assert_eq!(b"target".to_vec(), *target); + } + InodeData::Regular(..) | InodeData::Directory(..) 
=> panic!("wrong type"), + } + + // another put should return the same ino + assert_eq!(ino, inode_tracker.put(f)); + + // inserting another file should return a different ino + assert_ne!(ino, inode_tracker.put(InodeData::Symlink("target2".into()))); + } +} diff --git a/tvix/castore/src/fs/inodes.rs b/tvix/castore/src/fs/inodes.rs new file mode 100644 index 0000000000..bdd4595434 --- /dev/null +++ b/tvix/castore/src/fs/inodes.rs @@ -0,0 +1,96 @@ +//! This module contains all the data structures used to track information +//! about inodes, which present tvix-castore nodes in a filesystem. +use std::time::Duration; + +use bytes::Bytes; + +use crate::proto as castorepb; +use crate::B3Digest; + +#[derive(Clone, Debug)] +pub enum InodeData { + Regular(B3Digest, u64, bool), // digest, size, executable + Symlink(bytes::Bytes), // target + Directory(DirectoryInodeData), // either [DirectoryInodeData:Sparse] or [DirectoryInodeData:Populated] +} + +/// This encodes the two different states of [InodeData::Directory]. +/// Either the data still is sparse (we only saw a [castorepb::DirectoryNode], +/// but didn't fetch the [castorepb::Directory] struct yet, or we processed a +/// lookup and did fetch the data. +#[derive(Clone, Debug)] +pub enum DirectoryInodeData { + Sparse(B3Digest, u64), // digest, size + Populated(B3Digest, Vec<(u64, castorepb::node::Node)>), // [(child_inode, node)] +} + +impl InodeData { + /// Constructs a new InodeData by consuming a [Node]. + /// It splits off the orginal name, so it can be used later. 
+ pub fn from_node(node: castorepb::node::Node) -> (Self, Bytes) { + match node { + castorepb::node::Node::Directory(n) => ( + Self::Directory(DirectoryInodeData::Sparse( + n.digest.try_into().unwrap(), + n.size, + )), + n.name, + ), + castorepb::node::Node::File(n) => ( + Self::Regular(n.digest.try_into().unwrap(), n.size, n.executable), + n.name, + ), + castorepb::node::Node::Symlink(n) => (Self::Symlink(n.target), n.name), + } + } + + pub fn as_fuse_file_attr(&self, inode: u64) -> fuse_backend_rs::abi::fuse_abi::Attr { + fuse_backend_rs::abi::fuse_abi::Attr { + ino: inode, + // FUTUREWORK: play with this numbers, as it affects read sizes for client applications. + blocks: 1024, + size: match self { + InodeData::Regular(_, size, _) => *size, + InodeData::Symlink(target) => target.len() as u64, + InodeData::Directory(DirectoryInodeData::Sparse(_, size)) => *size, + InodeData::Directory(DirectoryInodeData::Populated(_, ref children)) => { + children.len() as u64 + } + }, + mode: self.as_fuse_type() | self.mode(), + ..Default::default() + } + } + + fn mode(&self) -> u32 { + match self { + InodeData::Regular(_, _, false) | InodeData::Symlink(_) => 0o444, + InodeData::Regular(_, _, true) | InodeData::Directory(_) => 0o555, + } + } + + pub fn as_fuse_entry(&self, inode: u64) -> fuse_backend_rs::api::filesystem::Entry { + fuse_backend_rs::api::filesystem::Entry { + inode, + attr: self.as_fuse_file_attr(inode).into(), + attr_timeout: Duration::MAX, + entry_timeout: Duration::MAX, + ..Default::default() + } + } + + /// Returns the u32 fuse type + pub fn as_fuse_type(&self) -> u32 { + #[allow(clippy::let_and_return)] + let ty = match self { + InodeData::Regular(_, _, _) => libc::S_IFREG, + InodeData::Symlink(_) => libc::S_IFLNK, + InodeData::Directory(_) => libc::S_IFDIR, + }; + // libc::S_IFDIR is u32 on Linux and u16 on MacOS + #[cfg(target_os = "macos")] + let ty = ty as u32; + + ty + } +} diff --git a/tvix/castore/src/fs/mod.rs b/tvix/castore/src/fs/mod.rs new file 
mode 100644 index 0000000000..176199f64a --- /dev/null +++ b/tvix/castore/src/fs/mod.rs @@ -0,0 +1,874 @@ +mod file_attr; +mod inode_tracker; +mod inodes; +mod root_nodes; + +#[cfg(feature = "fuse")] +pub mod fuse; + +#[cfg(feature = "virtiofs")] +pub mod virtiofs; + +pub use self::root_nodes::RootNodes; +use self::{ + file_attr::ROOT_FILE_ATTR, + inode_tracker::InodeTracker, + inodes::{DirectoryInodeData, InodeData}, +}; +use crate::proto as castorepb; +use crate::{ + blobservice::{BlobReader, BlobService}, + directoryservice::DirectoryService, + proto::{node::Node, NamedNode}, + B3Digest, +}; +use bstr::ByteVec; +use bytes::Bytes; +use fuse_backend_rs::abi::fuse_abi::{stat64, OpenOptions}; +use fuse_backend_rs::api::filesystem::{ + Context, FileSystem, FsOptions, GetxattrReply, ListxattrReply, ROOT_ID, +}; +use futures::StreamExt; +use parking_lot::RwLock; +use std::sync::Mutex; +use std::{ + collections::HashMap, + io, + sync::atomic::AtomicU64, + sync::{atomic::Ordering, Arc}, + time::Duration, +}; +use std::{ffi::CStr, io::Cursor}; +use tokio::{ + io::{AsyncReadExt, AsyncSeekExt}, + sync::mpsc, +}; +use tracing::{debug, error, instrument, warn, Span}; + +/// This implements a read-only FUSE filesystem for a tvix-store +/// with the passed [BlobService], [DirectoryService] and [RootNodes]. +/// +/// Linux uses inodes in filesystems. When implementing FUSE, most calls are +/// *for* a given inode. +/// +/// This means, we need to have a stable mapping of inode numbers to the +/// corresponding store nodes. +/// +/// We internally delegate all inode allocation and state keeping to the +/// inode tracker. +/// We store a mapping from currently "explored" names in the root to their +/// inode. +/// +/// There's some places where inodes are allocated / data inserted into +/// the inode tracker, if not allocated before already: +/// - Processing a `lookup` request, either in the mount root, or somewhere +/// deeper. 
+/// - Processing a `readdir` request +/// +/// Things pointing to the same contents get the same inodes, irrespective of +/// their own location. +/// This means: +/// - Symlinks with the same target will get the same inode. +/// - Regular/executable files with the same contents will get the same inode +/// - Directories with the same contents will get the same inode. +/// +/// Due to the above being valid across the whole store, and considering the +/// merkle structure is a DAG, not a tree, this also means we can't do "bucketed +/// allocation", aka reserve Directory.size inodes for each directory node we +/// explore. +/// Tests for this live in the tvix-store crate. +pub struct TvixStoreFs<BS, DS, RN> { + blob_service: BS, + directory_service: DS, + root_nodes_provider: RN, + + /// Whether to (try) listing elements in the root. + list_root: bool, + + /// Whether to expose blob and directory digests as extended attributes. + show_xattr: bool, + + /// This maps a given basename in the root to the inode we allocated for the node. + root_nodes: RwLock<HashMap<Bytes, u64>>, + + /// This keeps track of inodes and data alongside them. + inode_tracker: RwLock<InodeTracker>, + + // FUTUREWORK: have a generic container type for dir/file handles and handle + // allocation. + /// Maps from the handle returned from an opendir to + /// This holds all opendir handles (for the root inode) + /// They point to the rx part of the channel producing the listing. 
+ #[allow(clippy::type_complexity)] + dir_handles: RwLock< + HashMap< + u64, + ( + Span, + Arc<Mutex<mpsc::Receiver<(usize, Result<Node, crate::Error>)>>>, + ), + >, + >, + + next_dir_handle: AtomicU64, + + /// This holds all open file handles + #[allow(clippy::type_complexity)] + file_handles: RwLock<HashMap<u64, (Span, Arc<Mutex<Box<dyn BlobReader>>>)>>, + + next_file_handle: AtomicU64, + + tokio_handle: tokio::runtime::Handle, +} + +impl<BS, DS, RN> TvixStoreFs<BS, DS, RN> +where + BS: AsRef<dyn BlobService> + Clone + Send, + DS: AsRef<dyn DirectoryService> + Clone + Send + 'static, + RN: RootNodes + Clone + 'static, +{ + pub fn new( + blob_service: BS, + directory_service: DS, + root_nodes_provider: RN, + list_root: bool, + show_xattr: bool, + ) -> Self { + Self { + blob_service, + directory_service, + root_nodes_provider, + + list_root, + show_xattr, + + root_nodes: RwLock::new(HashMap::default()), + inode_tracker: RwLock::new(Default::default()), + + dir_handles: RwLock::new(Default::default()), + next_dir_handle: AtomicU64::new(1), + + file_handles: RwLock::new(Default::default()), + next_file_handle: AtomicU64::new(1), + tokio_handle: tokio::runtime::Handle::current(), + } + } + + /// Retrieves the inode for a given root node basename, if present. + /// This obtains a read lock on self.root_nodes. + fn get_inode_for_root_name(&self, name: &[u8]) -> Option<u64> { + self.root_nodes.read().get(name).cloned() + } + + /// For a given inode, look up the given directory behind it (from + /// self.inode_tracker), and return its children. + /// The inode_tracker MUST know about this inode already, and it MUST point + /// to a [InodeData::Directory]. + /// It is ok if it's a [DirectoryInodeData::Sparse] - in that case, a lookup + /// in self.directory_service is performed, and self.inode_tracker is updated with the + /// [DirectoryInodeData::Populated]. 
+ #[instrument(skip(self), err)] + fn get_directory_children(&self, ino: u64) -> io::Result<(B3Digest, Vec<(u64, Node)>)> { + let data = self.inode_tracker.read().get(ino).unwrap(); + match *data { + // if it's populated already, return children. + InodeData::Directory(DirectoryInodeData::Populated( + ref parent_digest, + ref children, + )) => Ok((parent_digest.clone(), children.clone())), + // if it's sparse, fetch data using directory_service, populate child nodes + // and update it in [self.inode_tracker]. + InodeData::Directory(DirectoryInodeData::Sparse(ref parent_digest, _)) => { + let directory = self + .tokio_handle + .block_on({ + let directory_service = self.directory_service.clone(); + let parent_digest = parent_digest.to_owned(); + async move { directory_service.as_ref().get(&parent_digest).await } + })? + .ok_or_else(|| { + warn!(directory.digest=%parent_digest, "directory not found"); + // If the Directory can't be found, this is a hole, bail out. + io::Error::from_raw_os_error(libc::EIO) + })?; + + // Turn the retrieved directory into a InodeData::Directory(DirectoryInodeData::Populated(..)), + // allocating inodes for the children on the way. + // FUTUREWORK: there's a bunch of cloning going on here, which we can probably avoid. + let children = { + let mut inode_tracker = self.inode_tracker.write(); + + let children: Vec<(u64, castorepb::node::Node)> = directory + .nodes() + .map(|child_node| { + let (inode_data, _) = InodeData::from_node(child_node.clone()); + + let child_ino = inode_tracker.put(inode_data); + (child_ino, child_node) + }) + .collect(); + + // replace. + inode_tracker.replace( + ino, + Arc::new(InodeData::Directory(DirectoryInodeData::Populated( + parent_digest.clone(), + children.clone(), + ))), + ); + + children + }; + + Ok((parent_digest.clone(), children)) + } + // if the parent inode was not a directory, this doesn't make sense + InodeData::Regular(..) 
| InodeData::Symlink(_) => { + Err(io::Error::from_raw_os_error(libc::ENOTDIR)) + } + } + } + + /// This will turn a lookup request for a name in the root to a ino and + /// [InodeData]. + /// It will peek in [self.root_nodes], and then either look it up from + /// [self.inode_tracker], + /// or otherwise fetch from [self.root_nodes], and then insert into + /// [self.inode_tracker]. + /// In the case the name can't be found, a libc::ENOENT is returned. + fn name_in_root_to_ino_and_data( + &self, + name: &std::ffi::CStr, + ) -> io::Result<(u64, Arc<InodeData>)> { + // Look up the inode for that root node. + // If there's one, [self.inode_tracker] MUST also contain the data, + // which we can then return. + if let Some(inode) = self.get_inode_for_root_name(name.to_bytes()) { + return Ok(( + inode, + self.inode_tracker + .read() + .get(inode) + .expect("must exist") + .to_owned(), + )); + } + + // We don't have it yet, look it up in [self.root_nodes]. + match self.tokio_handle.block_on({ + let root_nodes_provider = self.root_nodes_provider.clone(); + async move { root_nodes_provider.get_by_basename(name.to_bytes()).await } + }) { + // if there was an error looking up the root node, propagate up an IO error. + Err(_e) => Err(io::Error::from_raw_os_error(libc::EIO)), + // the root node doesn't exist, so the file doesn't exist. + Ok(None) => Err(io::Error::from_raw_os_error(libc::ENOENT)), + // The root node does exist + Ok(Some(root_node)) => { + // The name must match what's passed in the lookup, otherwise this is also a ENOENT. + if root_node.get_name() != name.to_bytes() { + debug!(root_node.name=?root_node.get_name(), found_node.name=%name.to_string_lossy(), "node name mismatch"); + return Err(io::Error::from_raw_os_error(libc::ENOENT)); + } + + // Let's check if someone else beat us to updating the inode tracker and + // root_nodes map. This avoids locking inode_tracker for writing. 
+ if let Some(ino) = self.root_nodes.read().get(name.to_bytes()) { + return Ok(( + *ino, + self.inode_tracker.read().get(*ino).expect("must exist"), + )); + } + + // Only in case it doesn't, lock [self.root_nodes] and + // [self.inode_tracker] for writing. + let mut root_nodes = self.root_nodes.write(); + let mut inode_tracker = self.inode_tracker.write(); + + // insert the (sparse) inode data and register in + // self.root_nodes. + let (inode_data, name) = InodeData::from_node(root_node); + let ino = inode_tracker.put(inode_data.clone()); + root_nodes.insert(name, ino); + + Ok((ino, Arc::new(inode_data))) + } + } + } +} + +/// Buffer size of the channel providing nodes in the mount root +const ROOT_NODES_BUFFER_SIZE: usize = 16; + +const XATTR_NAME_DIRECTORY_DIGEST: &[u8] = b"user.tvix.castore.directory.digest"; +const XATTR_NAME_BLOB_DIGEST: &[u8] = b"user.tvix.castore.blob.digest"; + +impl<BS, DS, RN> FileSystem for TvixStoreFs<BS, DS, RN> +where + BS: AsRef<dyn BlobService> + Clone + Send + 'static, + DS: AsRef<dyn DirectoryService> + Send + Clone + 'static, + RN: RootNodes + Clone + 'static, +{ + type Handle = u64; + type Inode = u64; + + fn init(&self, _capable: FsOptions) -> io::Result<FsOptions> { + Ok(FsOptions::empty()) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode))] + fn getattr( + &self, + _ctx: &Context, + inode: Self::Inode, + _handle: Option<Self::Handle>, + ) -> io::Result<(stat64, Duration)> { + if inode == ROOT_ID { + return Ok((ROOT_FILE_ATTR.into(), Duration::MAX)); + } + + match self.inode_tracker.read().get(inode) { + None => Err(io::Error::from_raw_os_error(libc::ENOENT)), + Some(inode_data) => { + debug!(inode_data = ?inode_data, "found node"); + Ok((inode_data.as_fuse_file_attr(inode).into(), Duration::MAX)) + } + } + } + + #[tracing::instrument(skip_all, fields(rq.parent_inode = parent, rq.name = ?name))] + fn lookup( + &self, + _ctx: &Context, + parent: Self::Inode, + name: &std::ffi::CStr, + ) -> 
io::Result<fuse_backend_rs::api::filesystem::Entry> { + debug!("lookup"); + + // This goes from a parent inode to a node. + // - If the parent is [ROOT_ID], we need to check + // [self.root_nodes] (fetching from a [RootNode] provider if needed) + // - Otherwise, lookup the parent in [self.inode_tracker] (which must be + // a [InodeData::Directory]), and find the child with that name. + if parent == ROOT_ID { + let (ino, inode_data) = self.name_in_root_to_ino_and_data(name)?; + + debug!(inode_data=?&inode_data, ino=ino, "Some"); + return Ok(inode_data.as_fuse_entry(ino)); + } + // This is the "lookup for "a" inside inode 42. + // We already know that inode 42 must be a directory. + let (parent_digest, children) = self.get_directory_children(parent)?; + + Span::current().record("directory.digest", parent_digest.to_string()); + // Search for that name in the list of children and return the FileAttrs. + + // in the children, find the one with the desired name. + if let Some((child_ino, _)) = children.iter().find(|e| e.1.get_name() == name.to_bytes()) { + // lookup the child [InodeData] in [self.inode_tracker]. + // We know the inodes for children have already been allocated. + let child_inode_data = self.inode_tracker.read().get(*child_ino).unwrap(); + + // Reply with the file attributes for the child. + // For child directories, we still have all data we need to reply. + Ok(child_inode_data.as_fuse_entry(*child_ino)) + } else { + // Child not found, return ENOENT. + Err(io::Error::from_raw_os_error(libc::ENOENT)) + } + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode))] + fn opendir( + &self, + _ctx: &Context, + inode: Self::Inode, + _flags: u32, + ) -> io::Result<(Option<Self::Handle>, OpenOptions)> { + // In case opendir on the root is called, we provide the handle, as re-entering that listing is expensive. + // For all other directory inodes we just let readdir take care of it. 
+ if inode == ROOT_ID { + if !self.list_root { + return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo + } + + let root_nodes_provider = self.root_nodes_provider.clone(); + let (tx, rx) = mpsc::channel(ROOT_NODES_BUFFER_SIZE); + + // This task will run in the background immediately and will exit + // after the stream ends or if we no longer want any more entries. + self.tokio_handle.spawn(async move { + let mut stream = root_nodes_provider.list().enumerate(); + while let Some(node) = stream.next().await { + if tx.send(node).await.is_err() { + // If we get a send error, it means the sync code + // doesn't want any more entries. + break; + } + } + }); + + // Put the rx part into [self.dir_handles]. + // TODO: this will overflow after 2**64 operations, + // which is fine for now. + // See https://cl.tvl.fyi/c/depot/+/8834/comment/a6684ce0_d72469d1 + // for the discussion on alternatives. + let dh = self.next_dir_handle.fetch_add(1, Ordering::SeqCst); + + self.dir_handles + .write() + .insert(dh, (Span::current(), Arc::new(Mutex::new(rx)))); + + return Ok(( + Some(dh), + fuse_backend_rs::api::filesystem::OpenOptions::empty(), // TODO: non-seekable + )); + } + + Ok((None, OpenOptions::empty())) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle, rq.offset = offset), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))] + fn readdir( + &self, + _ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + _size: u32, + offset: u64, + add_entry: &mut dyn FnMut(fuse_backend_rs::api::filesystem::DirEntry) -> io::Result<usize>, + ) -> io::Result<()> { + debug!("readdir"); + + if inode == ROOT_ID { + if !self.list_root { + return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo + } + + // get the handle from [self.dir_handles] + let (_span, rx) = match self.dir_handles.read().get(&handle) { + Some(rx) => rx.clone(), + None => { + warn!("dir handle {} unknown", 
handle); + return Err(io::Error::from_raw_os_error(libc::EIO)); + } + }; + + let mut rx = rx + .lock() + .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?; + + while let Some((i, n)) = rx.blocking_recv() { + let root_node = n.map_err(|e| { + warn!("failed to retrieve root node: {}", e); + io::Error::from_raw_os_error(libc::EIO) + })?; + + let (inode_data, name) = InodeData::from_node(root_node); + + // obtain the inode, or allocate a new one. + let ino = self.get_inode_for_root_name(&name).unwrap_or_else(|| { + // insert the (sparse) inode data and register in + // self.root_nodes. + let ino = self.inode_tracker.write().put(inode_data.clone()); + self.root_nodes.write().insert(name.clone(), ino); + ino + }); + + let written = add_entry(fuse_backend_rs::api::filesystem::DirEntry { + ino, + offset: offset + (i as u64) + 1, + type_: inode_data.as_fuse_type(), + name: &name, + })?; + // If the buffer is full, add_entry will return `Ok(0)`. + if written == 0 { + break; + } + } + return Ok(()); + } + + // Non root-node case: lookup the children, or return an error if it's not a directory. + let (parent_digest, children) = self.get_directory_children(inode)?; + Span::current().record("directory.digest", parent_digest.to_string()); + + for (i, (ino, child_node)) in children.into_iter().skip(offset as usize).enumerate() { + let (inode_data, name) = InodeData::from_node(child_node); + + // the second parameter will become the "offset" parameter on the next call. + let written = add_entry(fuse_backend_rs::api::filesystem::DirEntry { + ino, + offset: offset + (i as u64) + 1, + type_: inode_data.as_fuse_type(), + name: &name, + })?; + // If the buffer is full, add_entry will return `Ok(0)`. 
+ if written == 0 { + break; + } + } + + Ok(()) + } + + /// Like `readdir`, but `add_entry` additionally receives the `Entry` produced by `as_fuse_entry` for every directory entry. + #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))] + fn readdirplus( + &self, + _ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + _size: u32, + offset: u64, + add_entry: &mut dyn FnMut( + fuse_backend_rs::api::filesystem::DirEntry, + fuse_backend_rs::api::filesystem::Entry, + ) -> io::Result<usize>, + ) -> io::Result<()> { + debug!("readdirplus"); + + if inode == ROOT_ID { + if !self.list_root { + return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo + } + + // get the handle from [self.dir_handles] + let (_span, rx) = match self.dir_handles.read().get(&handle) { + Some(rx) => rx.clone(), + None => { + warn!("dir handle {} unknown", handle); + return Err(io::Error::from_raw_os_error(libc::EIO)); + } + }; + + let mut rx = rx + .lock() + .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?; + + while let Some((i, n)) = rx.blocking_recv() { + let root_node = n.map_err(|e| { + warn!("failed to retrieve root node: {}", e); + // Failing to fetch a root node is an I/O problem of the backing stream, not a permission problem; report EIO, like `readdir` does. + io::Error::from_raw_os_error(libc::EIO) + })?; + + let (inode_data, name) = InodeData::from_node(root_node); + + // obtain the inode, or allocate a new one. + let ino = self.get_inode_for_root_name(&name).unwrap_or_else(|| { + // insert the (sparse) inode data and register in + // self.root_nodes. + let ino = self.inode_tracker.write().put(inode_data.clone()); + self.root_nodes.write().insert(name.clone(), ino); + ino + }); + + let written = add_entry( + fuse_backend_rs::api::filesystem::DirEntry { + ino, + offset: offset + (i as u64) + 1, + type_: inode_data.as_fuse_type(), + name: &name, + }, + inode_data.as_fuse_entry(ino), + )?; + // If the buffer is full, add_entry will return `Ok(0)`. 
+ if written == 0 { + break; + } + } + return Ok(()); + } + + // Non root-node case: lookup the children, or return an error if it's not a directory. + let (parent_digest, children) = self.get_directory_children(inode)?; + Span::current().record("directory.digest", parent_digest.to_string()); + + for (i, (ino, child_node)) in children.into_iter().skip(offset as usize).enumerate() { + let (inode_data, name) = InodeData::from_node(child_node); + + // the second parameter will become the "offset" parameter on the next call. + let written = add_entry( + fuse_backend_rs::api::filesystem::DirEntry { + ino, + offset: offset + (i as u64) + 1, + type_: inode_data.as_fuse_type(), + name: &name, + }, + inode_data.as_fuse_entry(ino), + )?; + // If the buffer is full, add_entry will return `Ok(0)`. + if written == 0 { + break; + } + } + + Ok(()) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))] + fn releasedir( + &self, + _ctx: &Context, + inode: Self::Inode, + _flags: u32, + handle: Self::Handle, + ) -> io::Result<()> { + if inode == ROOT_ID { + // drop the rx part of the channel. + match self.dir_handles.write().remove(&handle) { + // drop it, which will close it. + Some(rx) => drop(rx), + None => { + warn!("dir handle not found"); + } + } + } + + Ok(()) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode))] + fn open( + &self, + _ctx: &Context, + inode: Self::Inode, + _flags: u32, + _fuse_flags: u32, + ) -> io::Result<( + Option<Self::Handle>, + fuse_backend_rs::api::filesystem::OpenOptions, + )> { + if inode == ROOT_ID { + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } + + // lookup the inode + match *self.inode_tracker.read().get(inode).unwrap() { + // read is invalid on non-files. + InodeData::Directory(..) 
| InodeData::Symlink(_) => { + warn!("is directory"); + Err(io::Error::from_raw_os_error(libc::EISDIR)) + } + InodeData::Regular(ref blob_digest, _blob_size, _) => { + Span::current().record("blob.digest", blob_digest.to_string()); + + match self.tokio_handle.block_on({ + let blob_service = self.blob_service.clone(); + let blob_digest = blob_digest.clone(); + async move { blob_service.as_ref().open_read(&blob_digest).await } + }) { + Ok(None) => { + warn!("blob not found"); + Err(io::Error::from_raw_os_error(libc::EIO)) + } + Err(e) => { + warn!(e=?e, "error opening blob"); + Err(io::Error::from_raw_os_error(libc::EIO)) + } + Ok(Some(blob_reader)) => { + // get a new file handle + // TODO: this will overflow after 2**64 operations, + // which is fine for now. + // See https://cl.tvl.fyi/c/depot/+/8834/comment/a6684ce0_d72469d1 + // for the discussion on alternatives. + let fh = self.next_file_handle.fetch_add(1, Ordering::SeqCst); + + self.file_handles + .write() + .insert(fh, (Span::current(), Arc::new(Mutex::new(blob_reader)))); + + Ok(( + Some(fh), + fuse_backend_rs::api::filesystem::OpenOptions::empty(), + )) + } + } + } + } + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.file_handles.read().get(&handle).and_then(|x| x.0.id()))] + fn release( + &self, + _ctx: &Context, + inode: Self::Inode, + _flags: u32, + handle: Self::Handle, + _flush: bool, + _flock_release: bool, + _lock_owner: Option<u64>, + ) -> io::Result<()> { + match self.file_handles.write().remove(&handle) { + // drop the blob reader, which will close it. + Some(blob_reader) => drop(blob_reader), + None => { + // These might already be dropped if a read error occured. 
+ warn!("file handle not found"); + } + } + + Ok(()) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle, rq.offset = offset, rq.size = size), parent = self.file_handles.read().get(&handle).and_then(|x| x.0.id()))] + fn read( + &self, + _ctx: &Context, + inode: Self::Inode, + handle: Self::Handle, + w: &mut dyn fuse_backend_rs::api::filesystem::ZeroCopyWriter, + size: u32, + offset: u64, + _lock_owner: Option<u64>, + _flags: u32, + ) -> io::Result<usize> { + debug!("read"); + + // We need to take out the blob reader from self.file_handles, so we can + // interact with it in the separate task. + // On success, we pass it back out of the task, so we can put it back in self.file_handles. + let (_span, blob_reader) = self + .file_handles + .read() + .get(&handle) + .ok_or_else(|| { + warn!("file handle {} unknown", handle); + io::Error::from_raw_os_error(libc::EIO) + }) + .cloned()?; + + let mut blob_reader = blob_reader + .lock() + .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?; + + let buf = self.tokio_handle.block_on(async move { + // seek to the offset specified, which is relative to the start of the file. + let pos = blob_reader + .seek(io::SeekFrom::Start(offset)) + .await + .map_err(|e| { + warn!("failed to seek to offset {}: {}", offset, e); + io::Error::from_raw_os_error(libc::EIO) + })?; + + debug_assert_eq!(offset, pos); + + // As written in the fuse docs, read should send exactly the number + // of bytes requested except on EOF or error. + + let mut buf: Vec<u8> = Vec::with_capacity(size as usize); + + // copy things from the internal buffer into buf to fill it till up until size + tokio::io::copy(&mut blob_reader.as_mut().take(size as u64), &mut buf).await?; + + Ok::<_, std::io::Error>(buf) + })?; + + // We cannot use w.write() here, we're required to call write multiple + // times until we wrote the entirety of the buffer (which is `size`, except on EOF). 
+ let buf_len = buf.len(); + let bytes_written = io::copy(&mut Cursor::new(buf), w)?; + if bytes_written != buf_len as u64 { + error!(bytes_written=%bytes_written, "unable to write all of buf to kernel"); + return Err(io::Error::from_raw_os_error(libc::EIO)); + } + + Ok(bytes_written as usize) + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode))] + fn readlink(&self, _ctx: &Context, inode: Self::Inode) -> io::Result<Vec<u8>> { + if inode == ROOT_ID { + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } + + // lookup the inode + match *self.inode_tracker.read().get(inode).unwrap() { + InodeData::Directory(..) | InodeData::Regular(..) => { + Err(io::Error::from_raw_os_error(libc::EINVAL)) + } + InodeData::Symlink(ref target) => Ok(target.to_vec()), + } + } + + #[tracing::instrument(skip_all, fields(rq.inode = inode, name=?name))] + fn getxattr( + &self, + _ctx: &Context, + inode: Self::Inode, + name: &CStr, + size: u32, + ) -> io::Result<GetxattrReply> { + if !self.show_xattr { + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } + + // Peek at the inode requested, and construct the response. + let digest_str = match *self + .inode_tracker + .read() + .get(inode) + .ok_or_else(|| io::Error::from_raw_os_error(libc::ENODATA))? 
+ { + InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _)) + | InodeData::Directory(DirectoryInodeData::Populated(ref digest, _)) + if name.to_bytes() == XATTR_NAME_DIRECTORY_DIGEST => + { + digest.to_string() + } + InodeData::Regular(ref digest, _, _) if name.to_bytes() == XATTR_NAME_BLOB_DIGEST => { + digest.to_string() + } + _ => { + return Err(io::Error::from_raw_os_error(libc::ENODATA)); + } + }; + + if size == 0 { + Ok(GetxattrReply::Count(digest_str.len() as u32)) + } else if size < digest_str.len() as u32 { + Err(io::Error::from_raw_os_error(libc::ERANGE)) + } else { + Ok(GetxattrReply::Value(digest_str.into_bytes())) + } + } + + /// Lists the xattr names available for `inode`: the directory digest key for directories, the blob digest key for regular files, and nothing for other (or unknown) inodes. Fails with ENOSYS if `show_xattr` is disabled. + #[tracing::instrument(skip_all, fields(rq.inode = inode))] + fn listxattr( + &self, + _ctx: &Context, + inode: Self::Inode, + size: u32, + ) -> io::Result<ListxattrReply> { + if !self.show_xattr { + return Err(io::Error::from_raw_os_error(libc::ENOSYS)); + } + + // Determine the list of xattr keys present (each key \0-terminated), depending on the type of the inode. + let xattrs_names = { + let mut out = Vec::new(); + if let Some(inode_data) = self.inode_tracker.read().get(inode) { + match *inode_data { + InodeData::Directory(_) => { + out.extend_from_slice(XATTR_NAME_DIRECTORY_DIGEST); + out.push_byte(b'\x00'); + } + InodeData::Regular(..) 
=> { + out.extend_from_slice(XATTR_NAME_BLOB_DIGEST); + out.push_byte(b'\x00'); + } + _ => {} + } + } + out + }; + + if size == 0 { + Ok(ListxattrReply::Count(xattrs_names.len() as u32)) + } else if size < xattrs_names.len() as u32 { + Err(io::Error::from_raw_os_error(libc::ERANGE)) + } else { + Ok(ListxattrReply::Names(xattrs_names.to_vec())) + } + } +} diff --git a/tvix/castore/src/fs/root_nodes.rs b/tvix/castore/src/fs/root_nodes.rs new file mode 100644 index 0000000000..6609e049a1 --- /dev/null +++ b/tvix/castore/src/fs/root_nodes.rs @@ -0,0 +1,37 @@ +use std::collections::BTreeMap; + +use crate::{proto::node::Node, Error}; +use bytes::Bytes; +use futures::stream::BoxStream; +use tonic::async_trait; + +/// Provides an interface for looking up root nodes in tvix-castore by given +/// a lookup key (usually the basename), and optionally allow a listing. +#[async_trait] +pub trait RootNodes: Send + Sync { + /// Looks up a root CA node based on the basename of the node in the root + /// directory of the filesystem. + async fn get_by_basename(&self, name: &[u8]) -> Result<Option<Node>, Error>; + + /// Lists all root CA nodes in the filesystem. An error can be returned + /// in case listing is not allowed + fn list(&self) -> BoxStream<Result<Node, Error>>; +} + +#[async_trait] +/// Implements RootNodes for something deref'ing to a BTreeMap of Nodes, where +/// the key is the node name. 
+impl<T> RootNodes for T +where + T: AsRef<BTreeMap<Bytes, Node>> + Send + Sync, +{ + async fn get_by_basename(&self, name: &[u8]) -> Result<Option<Node>, Error> { + Ok(self.as_ref().get(name).cloned()) + } + + fn list(&self) -> BoxStream<Result<Node, Error>> { + Box::pin(tokio_stream::iter( + self.as_ref().iter().map(|(_, v)| Ok(v.clone())), + )) + } +} diff --git a/tvix/castore/src/fs/virtiofs.rs b/tvix/castore/src/fs/virtiofs.rs new file mode 100644 index 0000000000..d63e2f2bdd --- /dev/null +++ b/tvix/castore/src/fs/virtiofs.rs @@ -0,0 +1,238 @@ +use std::{ + convert, error, fmt, io, + ops::Deref, + path::Path, + sync::{Arc, MutexGuard, RwLock}, +}; + +use fuse_backend_rs::{ + api::{filesystem::FileSystem, server::Server}, + transport::{FsCacheReqHandler, Reader, VirtioFsWriter}, +}; +use tracing::error; +use vhost::vhost_user::{ + Listener, SlaveFsCacheReq, VhostUserProtocolFeatures, VhostUserVirtioFeatures, +}; +use vhost_user_backend::{VhostUserBackendMut, VhostUserDaemon, VringMutex, VringState, VringT}; +use virtio_bindings::bindings::virtio_ring::{ + VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC, +}; +use virtio_queue::QueueT; +use vm_memory::{GuestAddressSpace, GuestMemoryAtomic, GuestMemoryMmap}; +use vmm_sys_util::epoll::EventSet; + +const VIRTIO_F_VERSION_1: u32 = 32; +const NUM_QUEUES: usize = 2; +const QUEUE_SIZE: usize = 1024; + +#[derive(Debug)] +enum Error { + /// Failed to handle non-input event. + HandleEventNotEpollIn, + /// Failed to handle unknown event. + HandleEventUnknownEvent, + /// Invalid descriptor chain. + InvalidDescriptorChain, + /// Failed to handle filesystem requests. + #[allow(dead_code)] + HandleRequests(fuse_backend_rs::Error), + /// Failed to construct new vhost user daemon. + NewDaemon, + /// Failed to start the vhost user daemon. + StartDaemon, + /// Failed to wait for the vhost user daemon. 
+ WaitDaemon, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "vhost_user_fs_error: {self:?}") + } +} + +impl error::Error for Error {} + +impl convert::From<Error> for io::Error { + fn from(e: Error) -> Self { + io::Error::new(io::ErrorKind::Other, e) + } +} + +struct VhostUserFsBackend<FS> +where + FS: FileSystem + Send + Sync, +{ + server: Arc<Server<Arc<FS>>>, + event_idx: bool, + guest_mem: GuestMemoryAtomic<GuestMemoryMmap>, + cache_req: Option<SlaveFsCacheReq>, +} + +impl<FS> VhostUserFsBackend<FS> +where + FS: FileSystem + Send + Sync, +{ + fn process_queue(&mut self, vring: &mut MutexGuard<VringState>) -> std::io::Result<bool> { + let mut used_descs = false; + + while let Some(desc_chain) = vring + .get_queue_mut() + .pop_descriptor_chain(self.guest_mem.memory()) + { + let memory = desc_chain.memory(); + let reader = Reader::from_descriptor_chain(memory, desc_chain.clone()) + .map_err(|_| Error::InvalidDescriptorChain)?; + let writer = VirtioFsWriter::new(memory, desc_chain.clone()) + .map_err(|_| Error::InvalidDescriptorChain)?; + + self.server + .handle_message( + reader, + writer.into(), + self.cache_req + .as_mut() + .map(|req| req as &mut dyn FsCacheReqHandler), + None, + ) + .map_err(Error::HandleRequests)?; + + // TODO: Is len 0 correct? + if let Err(error) = vring + .get_queue_mut() + .add_used(memory, desc_chain.head_index(), 0) + { + error!(?error, "failed to add desc back to ring"); + } + + // TODO: What happens if we error out before here? 
+ used_descs = true; + } + + let needs_notification = if self.event_idx { + match vring + .get_queue_mut() + .needs_notification(self.guest_mem.memory().deref()) + { + Ok(needs_notification) => needs_notification, + Err(error) => { + error!(?error, "failed to check if queue needs notification"); + true + } + } + } else { + true + }; + + if needs_notification { + if let Err(error) = vring.signal_used_queue() { + error!(?error, "failed to signal used queue"); + } + } + + Ok(used_descs) + } +} + +impl<FS> VhostUserBackendMut<VringMutex> for VhostUserFsBackend<FS> +where + FS: FileSystem + Send + Sync, +{ + fn num_queues(&self) -> usize { + NUM_QUEUES + } + + fn max_queue_size(&self) -> usize { + QUEUE_SIZE + } + + fn features(&self) -> u64 { + 1 << VIRTIO_F_VERSION_1 + | 1 << VIRTIO_RING_F_INDIRECT_DESC + | 1 << VIRTIO_RING_F_EVENT_IDX + | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits() + } + + fn protocol_features(&self) -> VhostUserProtocolFeatures { + VhostUserProtocolFeatures::MQ | VhostUserProtocolFeatures::SLAVE_REQ + } + + fn set_event_idx(&mut self, enabled: bool) { + self.event_idx = enabled; + } + + fn update_memory(&mut self, _mem: GuestMemoryAtomic<GuestMemoryMmap>) -> std::io::Result<()> { + // This is what most the vhost user implementations do... 
+ Ok(()) + } + + fn set_slave_req_fd(&mut self, cache_req: SlaveFsCacheReq) { + self.cache_req = Some(cache_req); + } + + fn handle_event( + &mut self, + device_event: u16, + evset: vmm_sys_util::epoll::EventSet, + vrings: &[VringMutex], + _thread_id: usize, + ) -> std::io::Result<bool> { + if evset != EventSet::IN { + return Err(Error::HandleEventNotEpollIn.into()); + } + + let mut queue = match device_event { + // High priority queue + 0 => vrings[0].get_mut(), + // Regular priority queue + 1 => vrings[1].get_mut(), + _ => { + return Err(Error::HandleEventUnknownEvent.into()); + } + }; + + if self.event_idx { + loop { + queue + .get_queue_mut() + .enable_notification(self.guest_mem.memory().deref()) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + if !self.process_queue(&mut queue)? { + break; + } + } + } else { + self.process_queue(&mut queue)?; + } + + Ok(false) + } +} + +/// Runs a vhost-user virtiofs daemon serving `fs` on the socket at `socket`, +/// and waits for the daemon to finish. +/// +/// Failures to create the listener, or to construct, start or wait for the +/// daemon are returned as [io::Error]. +pub fn start_virtiofs_daemon<FS, P>(fs: FS, socket: P) -> io::Result<()> +where + FS: FileSystem + Send + Sync + 'static, + P: AsRef<Path>, +{ + let guest_mem = GuestMemoryAtomic::new(GuestMemoryMmap::new()); + + let server = Arc::new(fuse_backend_rs::api::server::Server::new(Arc::new(fs))); + + let backend = Arc::new(RwLock::new(VhostUserFsBackend { + server, + guest_mem: guest_mem.clone(), + event_idx: false, + cache_req: None, + })); + + // Creating the listener is fallible I/O on a caller-supplied path; + // propagate the error instead of panicking. + let listener = Listener::new(socket, true) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + + let mut fs_daemon = + VhostUserDaemon::new(String::from("vhost-user-fs-tvix-store"), backend, guest_mem) + .map_err(|_| Error::NewDaemon)?; + + fs_daemon.start(listener).map_err(|_| Error::StartDaemon)?; + + fs_daemon.wait().map_err(|_| Error::WaitDaemon)?; + + Ok(()) +} diff --git a/tvix/castore/src/hashing_reader.rs b/tvix/castore/src/hashing_reader.rs new file mode 100644 index 0000000000..7d78cae587 --- /dev/null +++ b/tvix/castore/src/hashing_reader.rs @@ -0,0 +1,89 @@ +use pin_project_lite::pin_project; +use tokio::io::AsyncRead; + +pin_project! 
{ + /// Wraps an existing AsyncRead, and allows querying for the digest of all + /// data read "through" it. + /// The hash function is configurable by type parameter. + pub struct HashingReader<R, H> + where + R: AsyncRead, + H: digest::Digest, + { + #[pin] + inner: R, + hasher: H, + } +} + +pub type B3HashingReader<R> = HashingReader<R, blake3::Hasher>; + +impl<R, H> HashingReader<R, H> +where + R: AsyncRead, + H: digest::Digest, +{ + pub fn from(r: R) -> Self { + Self { + inner: r, + hasher: H::new(), + } + } + + /// Return the digest. + pub fn digest(self) -> digest::Output<H> { + self.hasher.finalize() + } +} + +impl<R, H> tokio::io::AsyncRead for HashingReader<R, H> +where + R: AsyncRead, + H: digest::Digest, +{ + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut tokio::io::ReadBuf<'_>, + ) -> std::task::Poll<std::io::Result<()>> { + let buf_filled_len_before = buf.filled().len(); + + let this = self.project(); + let ret = this.inner.poll_read(cx, buf); + + // write everything new filled into the hasher. 
+ this.hasher.update(&buf.filled()[buf_filled_len_before..]); + + ret + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use rstest::rstest; + + use crate::fixtures::BLOB_A; + use crate::fixtures::BLOB_A_DIGEST; + use crate::fixtures::BLOB_B; + use crate::fixtures::BLOB_B_DIGEST; + use crate::fixtures::EMPTY_BLOB_DIGEST; + use crate::{B3Digest, B3HashingReader}; + + #[rstest] + #[case::blob_a(&BLOB_A, &BLOB_A_DIGEST)] + #[case::blob_b(&BLOB_B, &BLOB_B_DIGEST)] + #[case::empty_blob(&[], &EMPTY_BLOB_DIGEST)] + #[tokio::test] + async fn test_b3_hashing_reader(#[case] data: &[u8], #[case] b3_digest: &B3Digest) { + let r = Cursor::new(data); + let mut hr = B3HashingReader::from(r); + + tokio::io::copy(&mut hr, &mut tokio::io::sink()) + .await + .expect("read must succeed"); + + assert_eq!(*b3_digest, hr.digest().into()); + } +} diff --git a/tvix/castore/src/import/archive.rs b/tvix/castore/src/import/archive.rs new file mode 100644 index 0000000000..cd5b1290e0 --- /dev/null +++ b/tvix/castore/src/import/archive.rs @@ -0,0 +1,373 @@ +//! 
 Imports from an archive (tarballs) + +use std::collections::HashMap; + +use petgraph::graph::{DiGraph, NodeIndex}; +use petgraph::visit::{DfsPostOrder, EdgeRef}; +use petgraph::Direction; +use tokio::io::AsyncRead; +use tokio_stream::StreamExt; +use tokio_tar::Archive; +use tracing::{instrument, warn, Level}; + +use crate::blobservice::BlobService; +use crate::directoryservice::DirectoryService; +use crate::import::{ingest_entries, IngestionEntry, IngestionError}; +use crate::proto::node::Node; + +use super::blobs::{self, ConcurrentBlobUploader}; + +type TarPathBuf = std::path::PathBuf; + +/// Errors that can occur while reading entries from a tar archive during ingestion. +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("unable to construct stream of entries: {0}")] + Entries(std::io::Error), + + #[error("unable to read next entry: {0}")] + NextEntry(std::io::Error), + + #[error("unable to read path for entry: {0}")] + PathRead(std::io::Error), + + #[error("unable to convert path {0} for entry: {1}")] + PathConvert(TarPathBuf, std::io::Error), + + #[error("unable to read size field for {0}: {1}")] + Size(TarPathBuf, std::io::Error), + + #[error("unable to read mode field for {0}: {1}")] + Mode(TarPathBuf, std::io::Error), + + #[error("unable to read link name field for {0}: {1}")] + LinkName(TarPathBuf, std::io::Error), + + #[error("unsupported tar entry {0} type: {1:?}")] + EntryType(TarPathBuf, tokio_tar::EntryType), + + #[error("symlink missing target {0}")] + MissingSymlinkTarget(TarPathBuf), + + #[error("unexpected number of top level directory entries")] + UnexpectedNumberOfTopLevelEntries, + + #[error(transparent)] + BlobUploadError(#[from] blobs::Error), +} + +/// Ingests elements from the given tar [`Archive`] into the passed [`BlobService`] and +/// [`DirectoryService`]. 
+#[instrument(skip_all, ret(level = Level::TRACE), err)] +pub async fn ingest_archive<BS, DS, R>( + blob_service: BS, + directory_service: DS, + mut archive: Archive<R>, +) -> Result<Node, IngestionError<Error>> +where + BS: BlobService + Clone + 'static, + DS: DirectoryService, + R: AsyncRead + Unpin, +{ + // Since tarballs can have entries in any arbitrary order, we need to + // buffer all of the directory metadata so we can reorder directory + // contents and entries to meet the requirements of the castore. + + // In the first phase, collect up all the regular files and symlinks. + let mut nodes = IngestionEntryGraph::new(); + + let mut blob_uploader = ConcurrentBlobUploader::new(blob_service); + + let mut entries_iter = archive.entries().map_err(Error::Entries)?; + while let Some(mut entry) = entries_iter.try_next().await.map_err(Error::NextEntry)? { + let tar_path: TarPathBuf = entry.path().map_err(Error::PathRead)?.into(); + + // construct a castore PathBuf, which we use in the produced IngestionEntry. + let path = crate::path::PathBuf::from_host_path(tar_path.as_path(), true) + .map_err(|e| Error::PathConvert(tar_path.clone(), e))?; + + let header = entry.header(); + let entry = match header.entry_type() { + tokio_tar::EntryType::Regular + | tokio_tar::EntryType::GNUSparse + | tokio_tar::EntryType::Continuous => { + let size = header + .size() + .map_err(|e| Error::Size(tar_path.clone(), e))?; + + let digest = blob_uploader + .upload(&path, size, &mut entry) + .await + .map_err(Error::BlobUploadError)?; + + // 64 is 0o100, i.e. the owner-execute bit of the unix mode. + let executable = entry + .header() + .mode() + .map_err(|e| Error::Mode(tar_path, e))? + & 64 + != 0; + + IngestionEntry::Regular { + path, + size, + executable, + digest, + } + } + tokio_tar::EntryType::Symlink => IngestionEntry::Symlink { + target: entry + .link_name() + .map_err(|e| Error::LinkName(tar_path.clone(), e))? + .ok_or_else(|| Error::MissingSymlinkTarget(tar_path.clone()))? 
+ .into_owned() + .into_os_string() + .into_encoded_bytes(), + path, + }, + // Push a bogus directory marker so we can make sure this directory gets + // created. We don't know the digest and size until after reading the full + // tarball. + tokio_tar::EntryType::Directory => IngestionEntry::Dir { path }, + + tokio_tar::EntryType::XGlobalHeader | tokio_tar::EntryType::XHeader => continue, + + entry_type => return Err(Error::EntryType(tar_path, entry_type).into()), + }; + + nodes.add(entry)?; + } + + blob_uploader.join().await.map_err(Error::BlobUploadError)?; + + let root_node = ingest_entries( + directory_service, + futures::stream::iter(nodes.finalize()?.into_iter().map(Ok)), + ) + .await?; + + Ok(root_node) +} + +/// Keep track of the directory structure of a file tree being ingested. This is used +/// for ingestion sources which do not provide any ordering or uniqueness guarantees +/// like tarballs. +/// +/// If we ingest multiple entries with the same path, the newer entry will replace the +/// older entry unless both entries are directories, disconnecting the old node's +/// children from the graph. +/// +/// Once all nodes are ingested a call to [IngestionEntryGraph::finalize] will return +/// a list of entries computed by performing a DFS post order traversal of the graph +/// from the top-level directory entry. +/// +/// This expects the directory structure to contain a single top-level directory entry. +/// An error is returned if this is not the case and ingestion will fail. +struct IngestionEntryGraph { + graph: DiGraph<IngestionEntry, ()>, + path_to_index: HashMap<crate::path::PathBuf, NodeIndex>, + root_node: Option<NodeIndex>, +} + +impl Default for IngestionEntryGraph { + fn default() -> Self { + Self::new() + } +} + +impl IngestionEntryGraph { + /// Creates a new ingestion entry graph. 
+ pub fn new() -> Self { + IngestionEntryGraph { + graph: DiGraph::new(), + path_to_index: HashMap::new(), + root_node: None, + } + } + + /// Adds a new entry to the graph. Parent directories are automatically inserted. + /// If a node already exists in the graph at the same path as the new entry, it is + /// replaced and disconnected from its children, unless both the old and the new + /// node are directories. + /// + /// Returns [Error::UnexpectedNumberOfTopLevelEntries] if the entry introduces a + /// second top-level path. + pub fn add(&mut self, entry: IngestionEntry) -> Result<NodeIndex, Error> { + let path = entry.path().to_owned(); + + let index = match self.path_to_index.get(entry.path()) { + Some(&index) => { + // If either the old entry or new entry are not directories, we'll replace the old + // entry. + if !entry.is_dir() || !self.get_node(index).is_dir() { + self.replace_node(index, entry); + } + + index + } + None => self.graph.add_node(entry), + }; + + // for archives, a path with 1 component is the root node + if path.components().count() == 1 { + // We expect archives to contain a single root node, if there is another root node + // entry with a different path name, this is unsupported. + if let Some(root_node) = self.root_node { + if self.get_node(root_node).path() != path.as_ref() { + return Err(Error::UnexpectedNumberOfTopLevelEntries); + } + } + + self.root_node = Some(index) + } else if let Some(parent_path) = path.parent() { + // Recursively add the parent node until it hits the root node. + let parent_index = self.add(IngestionEntry::Dir { + path: parent_path.to_owned(), + })?; + + // Insert an edge from the parent directory to the child entry. + self.graph.add_edge(parent_index, index, ()); + } + + self.path_to_index.insert(path, index); + + Ok(index) + } + + /// Traverses the graph in DFS post order and collects the entries into a [Vec<IngestionEntry>]. + /// + /// Unreachable parts of the graph are not included in the result. + pub fn finalize(self) -> Result<Vec<IngestionEntry>, Error> { + // There must be a root node. 
+ let Some(root_node_index) = self.root_node else { + return Err(Error::UnexpectedNumberOfTopLevelEntries); + }; + + // The root node must be a directory. + if !self.get_node(root_node_index).is_dir() { + return Err(Error::UnexpectedNumberOfTopLevelEntries); + } + + let mut traversal = DfsPostOrder::new(&self.graph, root_node_index); + let mut nodes = Vec::with_capacity(self.graph.node_count()); + while let Some(node_index) = traversal.next(&self.graph) { + nodes.push(self.get_node(node_index).clone()); + } + + Ok(nodes) + } + + /// Replaces the node with the specified entry. The node's children are disconnected. + /// + /// This should never be called if both the old and new nodes are directories. + fn replace_node(&mut self, index: NodeIndex, new_entry: IngestionEntry) { + let entry = self + .graph + .node_weight_mut(index) + .expect("Tvix bug: missing node entry"); + + debug_assert!(!(entry.is_dir() && new_entry.is_dir())); + + // Replace the node itself. + warn!( + "saw duplicate entry in archive at path {:?}. old: {:?} new: {:?}", + entry.path(), + &entry, + &new_entry + ); + *entry = new_entry; + + // Remove any outgoing edges to disconnect the old node's children. + let edges = self + .graph + .edges_directed(index, Direction::Outgoing) + .map(|edge| edge.id()) + .collect::<Vec<_>>(); + for edge in edges { + self.graph.remove_edge(edge); + } + } + + fn get_node(&self, index: NodeIndex) -> &IngestionEntry { + self.graph + .node_weight(index) + .expect("Tvix bug: missing node entry") + } +} + +#[cfg(test)] +mod test { + use crate::import::IngestionEntry; + use crate::B3Digest; + + use super::{Error, IngestionEntryGraph}; + + use lazy_static::lazy_static; + use rstest::rstest; + + lazy_static! 
{ + pub static ref EMPTY_DIGEST: B3Digest = blake3::hash(&[]).as_bytes().into(); + pub static ref DIR_A: IngestionEntry = IngestionEntry::Dir { + path: "a".parse().unwrap() + }; + pub static ref DIR_B: IngestionEntry = IngestionEntry::Dir { + path: "b".parse().unwrap() + }; + pub static ref DIR_A_B: IngestionEntry = IngestionEntry::Dir { + path: "a/b".parse().unwrap() + }; + pub static ref FILE_A: IngestionEntry = IngestionEntry::Regular { + path: "a".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_DIGEST.clone(), + }; + pub static ref FILE_A_B: IngestionEntry = IngestionEntry::Regular { + path: "a/b".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_DIGEST.clone(), + }; + pub static ref FILE_A_B_C: IngestionEntry = IngestionEntry::Regular { + path: "a/b/c".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_DIGEST.clone(), + }; + } + + #[rstest] + #[case::implicit_directories(&[&*FILE_A_B_C], &[&*FILE_A_B_C, &*DIR_A_B, &*DIR_A])] + #[case::explicit_directories(&[&*DIR_A, &*DIR_A_B, &*FILE_A_B_C], &[&*FILE_A_B_C, &*DIR_A_B, &*DIR_A])] + #[case::inaccesible_tree(&[&*DIR_A, &*DIR_A_B, &*FILE_A_B], &[&*FILE_A_B, &*DIR_A])] + fn node_ingestion_success( + #[case] in_entries: &[&IngestionEntry], + #[case] exp_entries: &[&IngestionEntry], + ) { + let mut nodes = IngestionEntryGraph::new(); + + for entry in in_entries { + nodes.add((*entry).clone()).expect("failed to add entry"); + } + + let entries = nodes.finalize().expect("invalid entries"); + + let exp_entries: Vec<IngestionEntry> = + exp_entries.iter().map(|entry| (*entry).clone()).collect(); + + assert_eq!(entries, exp_entries); + } + + #[rstest] + #[case::no_top_level_entries(&[], Error::UnexpectedNumberOfTopLevelEntries)] + #[case::multiple_top_level_dirs(&[&*DIR_A, &*DIR_B], Error::UnexpectedNumberOfTopLevelEntries)] + #[case::top_level_file_entry(&[&*FILE_A], Error::UnexpectedNumberOfTopLevelEntries)] + fn node_ingestion_error(#[case] in_entries: &[&IngestionEntry], 
/// Errors produced while uploading blobs through this module.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// Copying the blob contents for the given path failed with an I/O error.
    #[error("unable to read blob contents for {0}: {1}")]
    BlobRead(PathBuf, std::io::Error),

    // FUTUREWORK: proper error for blob finalize
    /// Closing the blob writer for the given path failed.
    #[error("unable to finalize blob {0}: {1}")]
    BlobFinalize(PathBuf, std::io::Error),

    /// The number of bytes copied (`got`) differs from the size announced by
    /// the caller (`wanted`).
    #[error("unexpected size for {path} wanted: {wanted} got: {got}")]
    UnexpectedSize {
        path: PathBuf,
        wanted: u64,
        got: u64,
    },

    /// A background upload task could not be joined.
    #[error("blob upload join error: {0}")]
    JoinError(#[from] JoinError),
}
/// The concurrent blob uploader provides a mechanism for concurrently uploading small blobs.
/// This is useful when ingesting from sources like tarballs and archives, in which each blob
/// entry must be read sequentially. Ingesting many small blobs sequentially becomes slow due to
/// round trip time with the blob service. The concurrent blob uploader will buffer small
/// blobs in memory and upload them to the blob service in the background.
///
/// Once all blobs have been uploaded, make sure to call [ConcurrentBlobUploader::join] to wait
/// for all background jobs to complete and check for any errors.
pub struct ConcurrentBlobUploader<BS> {
    /// Blob service handle; cloned into every background upload task.
    blob_service: BS,
    /// Background upload tasks spawned by [ConcurrentBlobUploader::upload].
    upload_tasks: JoinSet<Result<(), Error>>,
    /// Weighted semaphore holding one permit per buffered byte, created with
    /// `MAX_BUFFER_SIZE` permits.
    upload_semaphore: Arc<Semaphore>,
}

impl<BS> ConcurrentBlobUploader<BS>
where
    BS: BlobService + Clone + 'static,
{
    /// Creates a new concurrent blob uploader which uploads blobs to the provided
    /// blob service.
    pub fn new(blob_service: BS) -> Self {
        Self {
            blob_service,
            upload_tasks: JoinSet::new(),
            upload_semaphore: Arc::new(Semaphore::new(MAX_BUFFER_SIZE)),
        }
    }

    /// Uploads a blob to the blob service. If the blob is small enough it will be read to a buffer
    /// and uploaded in the background.
    /// This will read the entirety of the provided reader unless an error occurs, even if blobs
    /// are uploaded in the background.
    ///
    /// Returns the BLAKE3 digest of the blob contents. For blobs below
    /// `CONCURRENT_BLOB_UPLOAD_THRESHOLD` the digest is computed locally while buffering, and
    /// is returned before the background upload has finished; upload failures for those blobs
    /// only surface from [ConcurrentBlobUploader::join].
    ///
    /// # Errors
    /// - [Error::BlobRead] if reading from `r` fails.
    /// - [Error::UnexpectedSize] if the number of bytes read differs from `expected_size`.
    /// - Anything returned by the foreground upload for blobs at or above the threshold.
    pub async fn upload<R>(
        &mut self,
        path: &Path,
        expected_size: u64,
        mut r: R,
    ) -> Result<B3Digest, Error>
    where
        R: AsyncRead + Unpin,
    {
        if expected_size < CONCURRENT_BLOB_UPLOAD_THRESHOLD as u64 {
            let mut buffer = Vec::with_capacity(expected_size as usize);
            let mut hasher = blake3::Hasher::new();
            // Hash the bytes as they pass through, so the digest is known without a
            // second pass over the buffer.
            let mut reader = InspectReader::new(&mut r, |bytes| {
                hasher.write_all(bytes).unwrap();
            });

            // Reserve `expected_size` bytes of the in-memory buffer budget before reading;
            // this waits while too many bytes are already buffered.
            let permit = self
                .upload_semaphore
                .clone()
                // This cast is lossless: the branch condition above ensures expected_size
                // is less than CONCURRENT_BLOB_UPLOAD_THRESHOLD, which is a u32.
                .acquire_many_owned(expected_size as u32)
                .await
                // NOTE(review): acquiring only fails on a closed semaphore; nothing in this
                // block closes it.
                .unwrap();
            let size = tokio::io::copy(&mut reader, &mut buffer)
                .await
                .map_err(|e| Error::BlobRead(path.into(), e))?;
            let digest: B3Digest = hasher.finalize().as_bytes().into();

            if size != expected_size {
                return Err(Error::UnexpectedSize {
                    path: path.into(),
                    wanted: expected_size,
                    got: size,
                });
            }

            self.upload_tasks.spawn({
                let blob_service = self.blob_service.clone();
                let expected_digest = digest.clone();
                let path = path.to_owned();
                let r = Cursor::new(buffer);
                async move {
                    let digest = upload_blob(&blob_service, &path, expected_size, r).await?;

                    assert_eq!(digest, expected_digest, "Tvix bug: blob digest mismatch");

                    // Make sure we hold the permit until we finish writing the blob
                    // to the [BlobService].
                    drop(permit);
                    Ok(())
                }
            });

            return Ok(digest);
        }

        // Large blobs are streamed to the blob service directly, without buffering.
        upload_blob(&self.blob_service, path, expected_size, r).await
    }

    /// Waits for all background upload jobs to complete, returning any upload errors.
    ///
    /// Returns the first failure encountered: either a task that could not be joined
    /// ([Error::JoinError]) or the error returned by the upload task itself.
    pub async fn join(mut self) -> Result<(), Error> {
        while let Some(result) = self.upload_tasks.join_next().await {
            // Outer `?`: the task could not be joined; inner `?`: the upload itself failed.
            result??;
        }
        Ok(())
    }
}
+ +use futures::stream::BoxStream; +use futures::StreamExt; +use std::fs::FileType; +use std::os::unix::ffi::OsStringExt; +use std::os::unix::fs::MetadataExt; +use std::os::unix::fs::PermissionsExt; +use tracing::instrument; +use tracing::Span; +use tracing_indicatif::span_ext::IndicatifSpanExt; +use walkdir::DirEntry; +use walkdir::WalkDir; + +use crate::blobservice::BlobService; +use crate::directoryservice::DirectoryService; +use crate::proto::node::Node; +use crate::B3Digest; + +use super::ingest_entries; +use super::IngestionEntry; +use super::IngestionError; + +/// Ingests the contents at a given path into the tvix store, interacting with a [BlobService] and +/// [DirectoryService]. It returns the root node or an error. +/// +/// It does not follow symlinks at the root, they will be ingested as actual symlinks. +/// +/// This function will walk the filesystem using `walkdir` and will consume +/// `O(#number of entries)` space. +#[instrument(skip(blob_service, directory_service), fields(path), err)] +pub async fn ingest_path<BS, DS, P>( + blob_service: BS, + directory_service: DS, + path: P, +) -> Result<Node, IngestionError<Error>> +where + P: AsRef<std::path::Path> + std::fmt::Debug, + BS: BlobService + Clone, + DS: DirectoryService, +{ + Span::current().pb_start(); + let iter = WalkDir::new(path.as_ref()) + .follow_links(false) + .follow_root_links(false) + .contents_first(true) + .into_iter(); + + let entries = dir_entries_to_ingestion_stream(blob_service, iter, path.as_ref()); + ingest_entries( + directory_service, + entries.inspect(|e| { + if let Ok(e) = e { + let s = Span::current(); + s.pb_inc(1); + s.pb_set_message(&format!("Ingesting {}", e.path())); + } + }), + ) + .await +} + +/// Converts an iterator of [walkdir::DirEntry]s into a stream of ingestion entries. +/// This can then be fed into [ingest_entries] to ingest all the entries into the castore. +/// +/// The produced stream is buffered, so uploads can happen concurrently. 
+/// +/// The root is the [Path] in the filesystem that is being ingested into the castore. +pub fn dir_entries_to_ingestion_stream<'a, BS, I>( + blob_service: BS, + iter: I, + root: &'a std::path::Path, +) -> BoxStream<'a, Result<IngestionEntry, Error>> +where + BS: BlobService + Clone + 'a, + I: Iterator<Item = Result<DirEntry, walkdir::Error>> + Send + 'a, +{ + let prefix = root.parent().unwrap_or_else(|| std::path::Path::new("")); + + Box::pin( + futures::stream::iter(iter) + .map(move |x| { + let blob_service = blob_service.clone(); + async move { + match x { + Ok(dir_entry) => { + dir_entry_to_ingestion_entry(blob_service, &dir_entry, prefix).await + } + Err(e) => Err(Error::Stat( + prefix.to_path_buf(), + e.into_io_error().expect("walkdir err must be some"), + )), + } + } + }) + .buffered(50), + ) +} + +/// Converts a [walkdir::DirEntry] into an [IngestionEntry], uploading blobs to the +/// provided [BlobService]. +/// +/// The prefix path is stripped from the path of each entry. This is usually the parent path +/// of the path being ingested so that the last element of the stream only has one component. +pub async fn dir_entry_to_ingestion_entry<BS>( + blob_service: BS, + entry: &DirEntry, + prefix: &std::path::Path, +) -> Result<IngestionEntry, Error> +where + BS: BlobService, +{ + let file_type = entry.file_type(); + + let fs_path = entry + .path() + .strip_prefix(prefix) + .expect("Tvix bug: failed to strip root path prefix"); + + // convert to castore PathBuf + let path = crate::path::PathBuf::from_host_path(fs_path, false) + .unwrap_or_else(|e| panic!("Tvix bug: walkdir direntry cannot be parsed: {}", e)); + + if file_type.is_dir() { + Ok(IngestionEntry::Dir { path }) + } else if file_type.is_symlink() { + let target = std::fs::read_link(entry.path()) + .map_err(|e| Error::Stat(entry.path().to_path_buf(), e))? 
+ .into_os_string() + .into_vec(); + + Ok(IngestionEntry::Symlink { path, target }) + } else if file_type.is_file() { + let metadata = entry + .metadata() + .map_err(|e| Error::Stat(entry.path().to_path_buf(), e.into()))?; + + let digest = upload_blob(blob_service, entry.path().to_path_buf()).await?; + + Ok(IngestionEntry::Regular { + path, + size: metadata.size(), + // If it's executable by the user, it'll become executable. + // This matches nix's dump() function behaviour. + executable: metadata.permissions().mode() & 64 != 0, + digest, + }) + } else { + return Err(Error::FileType(fs_path.to_path_buf(), file_type)); + } +} + +/// Uploads the file at the provided [Path] the the [BlobService]. +#[instrument(skip(blob_service), fields(path), err)] +async fn upload_blob<BS>( + blob_service: BS, + path: impl AsRef<std::path::Path>, +) -> Result<B3Digest, Error> +where + BS: BlobService, +{ + let mut file = match tokio::fs::File::open(path.as_ref()).await { + Ok(file) => file, + Err(e) => return Err(Error::BlobRead(path.as_ref().to_path_buf(), e)), + }; + + let mut writer = blob_service.open_write().await; + + if let Err(e) = tokio::io::copy(&mut file, &mut writer).await { + return Err(Error::BlobRead(path.as_ref().to_path_buf(), e)); + }; + + let digest = writer + .close() + .await + .map_err(|e| Error::BlobFinalize(path.as_ref().to_path_buf(), e))?; + + Ok(digest) +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("unsupported file type at {0}: {1:?}")] + FileType(std::path::PathBuf, FileType), + + #[error("unable to stat {0}: {1}")] + Stat(std::path::PathBuf, std::io::Error), + + #[error("unable to open {0}: {1}")] + Open(std::path::PathBuf, std::io::Error), + + #[error("unable to read {0}: {1}")] + BlobRead(std::path::PathBuf, std::io::Error), + + // TODO: proper error for blob finalize + #[error("unable to finalize blob {0}: {1}")] + BlobFinalize(std::path::PathBuf, std::io::Error), +} diff --git a/tvix/castore/src/import/mod.rs 
b/tvix/castore/src/import/mod.rs new file mode 100644 index 0000000000..c57c5bcada --- /dev/null +++ b/tvix/castore/src/import/mod.rs @@ -0,0 +1,345 @@ +//! The main library function here is [ingest_entries], receiving a stream of +//! [IngestionEntry]. +//! +//! Specific implementations, such as ingesting from the filesystem, live in +//! child modules. + +use crate::directoryservice::DirectoryPutter; +use crate::directoryservice::DirectoryService; +use crate::path::{Path, PathBuf}; +use crate::proto::node::Node; +use crate::proto::Directory; +use crate::proto::DirectoryNode; +use crate::proto::FileNode; +use crate::proto::SymlinkNode; +use crate::B3Digest; +use futures::{Stream, StreamExt}; +use tracing::Span; +use tracing_indicatif::span_ext::IndicatifSpanExt; + +use tracing::Level; + +use std::collections::HashMap; +use tracing::instrument; + +mod error; +pub use error::IngestionError; + +pub mod archive; +pub mod blobs; +pub mod fs; + +/// Ingests [IngestionEntry] from the given stream into a the passed [DirectoryService]. +/// On success, returns the root [Node]. +/// +/// The stream must have the following invariants: +/// - All children entries must come before their parents. +/// - The last entry must be the root node which must have a single path component. +/// - Every entry should have a unique path, and only consist of normal components. +/// This means, no windows path prefixes, absolute paths, `.` or `..`. +/// - All referenced directories must have an associated directory entry in the stream. +/// This means if there is a file entry for `foo/bar`, there must also be a `foo` directory +/// entry. +/// +/// Internally we maintain a [HashMap] of [PathBuf] to partially populated [Directory] at that +/// path. Once we receive an [IngestionEntry] for the directory itself, we remove it from the +/// map and upload it to the [DirectoryService] through a lazily created [DirectoryPutter]. +/// +/// On success, returns the root node. 
+#[instrument(skip_all, fields(indicatif.pb_show=1), ret(level = Level::TRACE), err)] +pub async fn ingest_entries<DS, S, E>( + directory_service: DS, + mut entries: S, +) -> Result<Node, IngestionError<E>> +where + DS: DirectoryService, + S: Stream<Item = Result<IngestionEntry, E>> + Send + std::marker::Unpin, + E: std::error::Error, +{ + // For a given path, this holds the [Directory] structs as they are populated. + let mut directories: HashMap<PathBuf, Directory> = HashMap::default(); + let mut maybe_directory_putter: Option<Box<dyn DirectoryPutter>> = None; + + Span::current().pb_start(); + + let root_node = loop { + let mut entry = entries + .next() + .await + // The last entry of the stream must have 1 path component, after which + // we break the loop manually. + .expect("Tvix bug: unexpected end of stream")?; + + let name = entry + .path() + .file_name() + // If this is the root node, it will have an empty name. + .unwrap_or_default() + .to_owned() + .into(); + + let node = match &mut entry { + IngestionEntry::Dir { .. } => { + // If the entry is a directory, we traversed all its children (and + // populated it in `directories`). + // If we don't have it in directories, it's a directory without + // children. + let directory = directories + .remove(entry.path()) + // In that case, it contained no children + .unwrap_or_default(); + + let directory_size = directory.size(); + let directory_digest = directory.digest(); + + // Use the directory_putter to upload the directory. + // If we don't have one yet (as that's the first one to upload), + // initialize the putter. + maybe_directory_putter + .get_or_insert_with(|| directory_service.put_multiple_start()) + .put(directory) + .await + .map_err(|e| { + IngestionError::UploadDirectoryError(entry.path().to_owned(), e) + })?; + + Node::Directory(DirectoryNode { + name, + digest: directory_digest.into(), + size: directory_size, + }) + } + IngestionEntry::Symlink { ref target, .. 
} => Node::Symlink(SymlinkNode { + name, + target: target.to_owned().into(), + }), + IngestionEntry::Regular { + size, + executable, + digest, + .. + } => Node::File(FileNode { + name, + digest: digest.to_owned().into(), + size: *size, + executable: *executable, + }), + }; + + let parent = entry + .path() + .parent() + .expect("Tvix bug: got entry with root node"); + + if parent == crate::Path::ROOT { + break node; + } else { + // record node in parent directory, creating a new [Directory] if not there yet. + directories.entry(parent.to_owned()).or_default().add(node); + } + }; + + assert!( + entries.count().await == 0, + "Tvix bug: left over elements in the stream" + ); + + assert!( + directories.is_empty(), + "Tvix bug: left over directories after processing ingestion stream" + ); + + // if there were directories uploaded, make sure we flush the putter, so + // they're all persisted to the backend. + if let Some(mut directory_putter) = maybe_directory_putter { + #[cfg_attr(not(debug_assertions), allow(unused))] + let root_directory_digest = directory_putter + .close() + .await + .map_err(|e| IngestionError::FinalizeDirectoryUpload(e))?; + + #[cfg(debug_assertions)] + { + if let Node::Directory(directory_node) = &root_node { + debug_assert_eq!( + root_directory_digest, + directory_node + .digest + .to_vec() + .try_into() + .expect("invalid digest len") + ) + } else { + unreachable!("Tvix bug: directory putter initialized but no root directory node"); + } + } + }; + + Ok(root_node) +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub enum IngestionEntry { + Regular { + path: PathBuf, + size: u64, + executable: bool, + digest: B3Digest, + }, + Symlink { + path: PathBuf, + target: Vec<u8>, + }, + Dir { + path: PathBuf, + }, +} + +impl IngestionEntry { + fn path(&self) -> &Path { + match self { + IngestionEntry::Regular { path, .. } => path, + IngestionEntry::Symlink { path, .. 
} => path, + IngestionEntry::Dir { path } => path, + } + } + + fn is_dir(&self) -> bool { + matches!(self, IngestionEntry::Dir { .. }) + } +} + +#[cfg(test)] +mod test { + use rstest::rstest; + + use crate::fixtures::{DIRECTORY_COMPLICATED, DIRECTORY_WITH_KEEP, EMPTY_BLOB_DIGEST}; + use crate::proto::node::Node; + use crate::proto::{Directory, DirectoryNode, FileNode, SymlinkNode}; + use crate::{directoryservice::MemoryDirectoryService, fixtures::DUMMY_DIGEST}; + + use super::ingest_entries; + use super::IngestionEntry; + + #[rstest] + #[case::single_file(vec![IngestionEntry::Regular { + path: "foo".parse().unwrap(), + size: 42, + executable: true, + digest: DUMMY_DIGEST.clone(), + }], + Node::File(FileNode { name: "foo".into(), digest: DUMMY_DIGEST.clone().into(), size: 42, executable: true } + ))] + #[case::single_symlink(vec![IngestionEntry::Symlink { + path: "foo".parse().unwrap(), + target: b"blub".into(), + }], + Node::Symlink(SymlinkNode { name: "foo".into(), target: "blub".into()}) + )] + #[case::single_dir(vec![IngestionEntry::Dir { + path: "foo".parse().unwrap(), + }], + Node::Directory(DirectoryNode { name: "foo".into(), digest: Directory::default().digest().into(), size: Directory::default().size()}) + )] + #[case::dir_with_keep(vec![ + IngestionEntry::Regular { + path: "foo/.keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + IngestionEntry::Dir { + path: "foo".parse().unwrap(), + }, + ], + Node::Directory(DirectoryNode { name: "foo".into(), digest: DIRECTORY_WITH_KEEP.digest().into(), size: DIRECTORY_WITH_KEEP.size() }) + )] + /// This is intentionally a bit unsorted, though it still satisfies all + /// requirements we have on the order of elements in the stream. 
+ #[case::directory_complicated(vec![ + IngestionEntry::Regular { + path: "blub/.keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + IngestionEntry::Regular { + path: "blub/keep/.keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + IngestionEntry::Dir { + path: "blub/keep".parse().unwrap(), + }, + IngestionEntry::Symlink { + path: "blub/aa".parse().unwrap(), + target: b"/nix/store/somewhereelse".into(), + }, + IngestionEntry::Dir { + path: "blub".parse().unwrap(), + }, + ], + Node::Directory(DirectoryNode { name: "blub".into(), digest: DIRECTORY_COMPLICATED.digest().into(), size:DIRECTORY_COMPLICATED.size() }) + )] + #[tokio::test] + async fn test_ingestion(#[case] entries: Vec<IngestionEntry>, #[case] exp_root_node: Node) { + let directory_service = MemoryDirectoryService::default(); + + let root_node = ingest_entries( + directory_service.clone(), + futures::stream::iter(entries.into_iter().map(Ok::<_, std::io::Error>)), + ) + .await + .expect("must succeed"); + + assert_eq!(exp_root_node, root_node, "root node should match"); + } + + #[rstest] + #[should_panic] + #[case::empty_entries(vec![])] + #[should_panic] + #[case::missing_intermediate_dir(vec![ + IngestionEntry::Regular { + path: "blub/.keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + ])] + #[should_panic] + #[case::leaf_after_parent(vec![ + IngestionEntry::Dir { + path: "blub".parse().unwrap(), + }, + IngestionEntry::Regular { + path: "blub/.keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + ])] + #[should_panic] + #[case::root_in_entry(vec![ + IngestionEntry::Regular { + path: ".keep".parse().unwrap(), + size: 0, + executable: false, + digest: EMPTY_BLOB_DIGEST.clone(), + }, + IngestionEntry::Dir { + path: "".parse().unwrap(), + }, + ])] + #[tokio::test] + async fn test_ingestion_fail(#[case] entries: 
Vec<IngestionEntry>) { + let directory_service = MemoryDirectoryService::default(); + + let _ = ingest_entries( + directory_service.clone(), + futures::stream::iter(entries.into_iter().map(Ok::<_, std::io::Error>)), + ) + .await; + } +} diff --git a/tvix/castore/src/lib.rs b/tvix/castore/src/lib.rs new file mode 100644 index 0000000000..bdc533a8c5 --- /dev/null +++ b/tvix/castore/src/lib.rs @@ -0,0 +1,30 @@ +mod digests; +mod errors; +mod hashing_reader; + +pub mod blobservice; +pub mod directoryservice; +pub mod fixtures; + +#[cfg(feature = "fs")] +pub mod fs; + +mod path; +pub use path::{Path, PathBuf}; + +pub mod import; +pub mod proto; +pub mod tonic; + +pub use digests::{B3Digest, B3_LEN}; +pub use errors::Error; +pub use hashing_reader::{B3HashingReader, HashingReader}; + +#[cfg(test)] +mod tests; + +// That's what the rstest_reuse README asks us do, and fails about being unable +// to find rstest_reuse in crate root. +#[cfg(test)] +#[allow(clippy::single_component_path_imports)] +use rstest_reuse; diff --git a/tvix/castore/src/path.rs b/tvix/castore/src/path.rs new file mode 100644 index 0000000000..fcc2bd01fb --- /dev/null +++ b/tvix/castore/src/path.rs @@ -0,0 +1,446 @@ +//! Contains data structures to deal with Paths in the tvix-castore model. + +use std::{ + borrow::Borrow, + fmt::{self, Debug, Display}, + mem, + ops::Deref, + str::FromStr, +}; + +use bstr::ByteSlice; + +use crate::proto::validate_node_name; + +/// Represents a Path in the castore model. +/// These are always relative, and platform-independent, which distinguishes +/// them from the ones provided in the standard library. +#[derive(Eq, Hash, PartialEq)] +#[repr(transparent)] // SAFETY: Representation has to match [u8] +pub struct Path { + // As node names in the castore model cannot contain slashes, + // we use them as component separators here. + inner: [u8], +} + +#[allow(dead_code)] +impl Path { + // SAFETY: The empty path is valid. 
+ pub const ROOT: &'static Path = unsafe { Path::from_bytes_unchecked(&[]) }; + + /// Convert a byte slice to a path, without checking validity. + const unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Path { + // SAFETY: &[u8] and &Path have the same representation. + unsafe { mem::transmute(bytes) } + } + + fn from_bytes(bytes: &[u8]) -> Option<&Path> { + if !bytes.is_empty() { + // Ensure all components are valid castore node names. + for component in bytes.split_str(b"/") { + validate_node_name(component).ok()?; + } + } + + // SAFETY: We have verified that the path contains no empty components. + Some(unsafe { Path::from_bytes_unchecked(bytes) }) + } + + pub fn into_boxed_bytes(self: Box<Path>) -> Box<[u8]> { + // SAFETY: Box<Path> and Box<[u8]> have the same representation. + unsafe { mem::transmute(self) } + } + + /// Returns the path without its final component, if there is one. + /// + /// Note that the parent of a bare file name is [Path::ROOT]. + /// [Path::ROOT] is the only path without a parent. + pub fn parent(&self) -> Option<&Path> { + // The root does not have a parent. + if self.inner.is_empty() { + return None; + } + + Some( + if let Some((parent, _file_name)) = self.inner.rsplit_once_str(b"/") { + // SAFETY: The parent of a valid Path is a valid Path. + unsafe { Path::from_bytes_unchecked(parent) } + } else { + // The parent of a bare file name is the root. + Path::ROOT + }, + ) + } + + /// Creates a PathBuf with `name` adjoined to self. + pub fn try_join(&self, name: &[u8]) -> Result<PathBuf, std::io::Error> { + let mut v = PathBuf::with_capacity(self.inner.len() + name.len() + 1); + v.inner.extend_from_slice(&self.inner); + v.try_push(name)?; + + Ok(v) + } + + /// Produces an iterator over the components of the path, which are + /// individual byte slices. + /// In case the path is empty, an empty iterator is returned. 
+ pub fn components(&self) -> impl Iterator<Item = &[u8]> { + let mut iter = self.inner.split_str(&b"/"); + + // We don't want to return an empty element, consume it if it's the only one. + if self.inner.is_empty() { + let _ = iter.next(); + } + + iter + } + + /// Returns the final component of the Path, if there is one. + pub fn file_name(&self) -> Option<&[u8]> { + self.components().last() + } + + pub fn as_bytes(&self) -> &[u8] { + &self.inner + } +} + +impl Debug for Path { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Debug::fmt(self.inner.as_bstr(), f) + } +} + +impl Display for Path { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Display::fmt(self.inner.as_bstr(), f) + } +} + +impl AsRef<Path> for Path { + fn as_ref(&self) -> &Path { + self + } +} + +/// Represents a owned PathBuf in the castore model. +/// These are always relative, and platform-independent, which distinguishes +/// them from the ones provided in the standard library. +#[derive(Clone, Default, Eq, Hash, PartialEq)] +pub struct PathBuf { + inner: Vec<u8>, +} + +impl Deref for PathBuf { + type Target = Path; + + fn deref(&self) -> &Self::Target { + // SAFETY: PathBuf always contains a valid Path. + unsafe { Path::from_bytes_unchecked(&self.inner) } + } +} + +impl AsRef<Path> for PathBuf { + fn as_ref(&self) -> &Path { + self + } +} + +impl ToOwned for Path { + type Owned = PathBuf; + + fn to_owned(&self) -> Self::Owned { + PathBuf { + inner: self.inner.to_owned(), + } + } +} + +impl Borrow<Path> for PathBuf { + fn borrow(&self) -> &Path { + self + } +} + +impl From<Box<Path>> for PathBuf { + fn from(value: Box<Path>) -> Self { + // SAFETY: Box<Path> is always a valid path. 
+ unsafe { PathBuf::from_bytes_unchecked(value.into_boxed_bytes().into_vec()) } + } +} + +impl From<&Path> for PathBuf { + fn from(value: &Path) -> Self { + value.to_owned() + } +} + +impl FromStr for PathBuf { + type Err = std::io::Error; + + fn from_str(s: &str) -> Result<PathBuf, Self::Err> { + Ok(Path::from_bytes(s.as_bytes()) + .ok_or(std::io::ErrorKind::InvalidData)? + .to_owned()) + } +} + +impl Debug for PathBuf { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Debug::fmt(&**self, f) + } +} + +impl Display for PathBuf { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Display::fmt(&**self, f) + } +} + +impl PathBuf { + pub fn new() -> PathBuf { + Self::default() + } + + pub fn with_capacity(capacity: usize) -> PathBuf { + // SAFETY: The empty path is a valid path. + Self { + inner: Vec::with_capacity(capacity), + } + } + + /// Adjoins `name` to self. + pub fn try_push(&mut self, name: &[u8]) -> Result<(), std::io::Error> { + validate_node_name(name).map_err(|_| std::io::ErrorKind::InvalidData)?; + + if !self.inner.is_empty() { + self.inner.push(b'/'); + } + + self.inner.extend_from_slice(name); + + Ok(()) + } + + /// Convert a byte vector to a PathBuf, without checking validity. + unsafe fn from_bytes_unchecked(bytes: Vec<u8>) -> PathBuf { + PathBuf { inner: bytes } + } + + /// Convert from a [&std::path::Path] to [Self]. + /// + /// - Self uses `/` as path separator. + /// - Absolute paths are always rejected, are are these with custom prefixes. + /// - Repeated separators are deduplicated. + /// - Occurrences of `.` are normalized away. + /// - A trailing slash is normalized away. + /// + /// A `canonicalize_dotdot` boolean controls whether `..` will get + /// canonicalized if possible, or should return an error. + /// + /// For more exotic paths, this conversion might produce different results + /// on different platforms, due to different underlying byte + /// representations, which is why it's restricted to unix for now. 
+ #[cfg(unix)] + pub fn from_host_path( + host_path: &std::path::Path, + canonicalize_dotdot: bool, + ) -> Result<Self, std::io::Error> { + let mut p = PathBuf::with_capacity(host_path.as_os_str().len()); + + for component in host_path.components() { + match component { + std::path::Component::Prefix(_) | std::path::Component::RootDir => { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "found disallowed prefix or rootdir", + )) + } + std::path::Component::CurDir => continue, // ignore + std::path::Component::ParentDir => { + if canonicalize_dotdot { + // Try popping the last element from the path being constructed. + // FUTUREWORK: pop method? + p = p + .parent() + .ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "found .. going too far up", + ) + })? + .to_owned(); + } else { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "found disallowed ..", + )); + } + } + std::path::Component::Normal(s) => { + // append the new component to the path being constructed. + p.try_push(s.as_encoded_bytes()).map_err(|_| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + "encountered invalid node in sub_path component", + ) + })? + } + } + } + + Ok(p) + } + + pub fn into_boxed_path(self) -> Box<Path> { + // SAFETY: Box<[u8]> and Box<Path> have the same representation, + // and PathBuf always contains a valid Path. + unsafe { mem::transmute(self.inner.into_boxed_slice()) } + } + + pub fn into_bytes(self) -> Vec<u8> { + self.inner + } +} + +#[cfg(test)] +mod test { + use super::{Path, PathBuf}; + use bstr::ByteSlice; + use rstest::rstest; + + // TODO: add some manual tests including invalid UTF-8 (hard to express + // with rstest) + + #[rstest] + #[case::empty("", 0)] + #[case("a", 1)] + #[case("a/b", 2)] + #[case("a/b/c", 3)] + // add two slightly more cursed variants. 
+ // Technically nothing prevents us from representing this with castore, + // but maybe we want to disallow constructing paths like this as it's a + // bad idea. + #[case::cursed("C:\\a/b", 2)] + #[case::cursed("\\\\tvix-store", 1)] + pub fn from_str(#[case] s: &str, #[case] num_components: usize) { + let p: PathBuf = s.parse().expect("must parse"); + + assert_eq!(s.as_bytes(), p.as_bytes(), "inner bytes mismatch"); + assert_eq!( + num_components, + p.components().count(), + "number of components mismatch" + ); + } + + #[rstest] + #[case::absolute("/a/b")] + #[case::two_forward_slashes_start("//a/b")] + #[case::two_forward_slashes_middle("a/b//c/d")] + #[case::trailing_slash("a/b/")] + #[case::dot(".")] + #[case::dotdot("..")] + #[case::dot_start("./a")] + #[case::dotdot_start("../a")] + #[case::dot_middle("a/./b")] + #[case::dotdot_middle("a/../b")] + #[case::dot_end("a/b/.")] + #[case::dotdot_end("a/b/..")] + #[case::null("fo\0o")] + pub fn from_str_fail(#[case] s: &str) { + s.parse::<PathBuf>().expect_err("must fail"); + } + + #[rstest] + #[case("foo", "")] + #[case("foo/bar", "foo")] + #[case("foo2/bar2", "foo2")] + #[case("foo/bar/baz", "foo/bar")] + pub fn parent(#[case] p: PathBuf, #[case] exp_parent: PathBuf) { + assert_eq!(Some(&*exp_parent), p.parent()); + } + + #[rstest] + pub fn no_parent() { + assert!(Path::ROOT.parent().is_none()); + } + + #[rstest] + #[case("a", "b", "a/b")] + #[case("a", "b", "a/b")] + pub fn join_push(#[case] mut p: PathBuf, #[case] name: &str, #[case] exp_p: PathBuf) { + assert_eq!(exp_p, p.try_join(name.as_bytes()).expect("join failed")); + p.try_push(name.as_bytes()).expect("push failed"); + assert_eq!(exp_p, p); + } + + #[rstest] + #[case("a", "/")] + #[case("a", "")] + #[case("a", "b/c")] + #[case("", "/")] + #[case("", "")] + #[case("", "b/c")] + #[case("", ".")] + #[case("", "..")] + pub fn join_push_fail(#[case] mut p: PathBuf, #[case] name: &str) { + p.try_join(name.as_bytes()) + .expect_err("join succeeded 
unexpectedly"); + p.try_push(name.as_bytes()) + .expect_err("push succeeded unexpectedly"); + } + + #[rstest] + #[case::empty("", vec![])] + #[case("a", vec!["a"])] + #[case("a/b", vec!["a", "b"])] + #[case("a/b/c", vec!["a","b", "c"])] + pub fn components(#[case] p: PathBuf, #[case] exp_components: Vec<&str>) { + assert_eq!( + exp_components, + p.components() + .map(|x| x.to_str().unwrap()) + .collect::<Vec<_>>() + ); + } + + #[rstest] + #[case::empty("", "", false)] + #[case::path("a", "a", false)] + #[case::path2("a/b", "a/b", false)] + #[case::double_slash_middle("a//b", "a/b", false)] + #[case::dot(".", "", false)] + #[case::dot_start("./a/b", "a/b", false)] + #[case::dot_middle("a/./b", "a/b", false)] + #[case::dot_end("a/b/.", "a/b", false)] + #[case::trailing_slash("a/b/", "a/b", false)] + #[case::dotdot_canonicalize("a/..", "", true)] + #[case::dotdot_canonicalize2("a/../b", "b", true)] + #[cfg_attr(unix, case::faux_prefix("\\\\nix-store", "\\\\nix-store", false))] + #[cfg_attr(unix, case::faux_letter("C:\\foo.txt", "C:\\foo.txt", false))] + pub fn from_host_path( + #[case] host_path: std::path::PathBuf, + #[case] exp_path: PathBuf, + #[case] canonicalize_dotdot: bool, + ) { + let p = PathBuf::from_host_path(&host_path, canonicalize_dotdot).expect("must succeed"); + + assert_eq!(exp_path, p); + } + + #[rstest] + #[case::absolute("/", false)] + #[case::dotdot_root("..", false)] + #[case::dotdot_root_canonicalize("..", true)] + #[case::dotdot_root_no_canonicalize("a/..", false)] + #[case::invalid_name("foo/bar\0", false)] + // #[cfg_attr(windows, case::prefix("\\\\nix-store", false))] + // #[cfg_attr(windows, case::letter("C:\\foo.txt", false))] + pub fn from_host_path_fail( + #[case] host_path: std::path::PathBuf, + #[case] canonicalize_dotdot: bool, + ) { + PathBuf::from_host_path(&host_path, canonicalize_dotdot).expect_err("must fail"); + } +} diff --git a/tvix/castore/src/proto/grpc_blobservice_wrapper.rs 
b/tvix/castore/src/proto/grpc_blobservice_wrapper.rs new file mode 100644 index 0000000000..41bd0698ec --- /dev/null +++ b/tvix/castore/src/proto/grpc_blobservice_wrapper.rs @@ -0,0 +1,175 @@ +use crate::blobservice::BlobService; +use core::pin::pin; +use data_encoding::BASE64; +use futures::{stream::BoxStream, TryFutureExt}; +use std::{ + collections::VecDeque, + ops::{Deref, DerefMut}, +}; +use tokio_stream::StreamExt; +use tokio_util::io::ReaderStream; +use tonic::{async_trait, Request, Response, Status, Streaming}; +use tracing::{instrument, warn}; + +pub struct GRPCBlobServiceWrapper<T> { + blob_service: T, +} + +impl<T> GRPCBlobServiceWrapper<T> { + pub fn new(blob_service: T) -> Self { + Self { blob_service } + } +} + +// This is necessary because bytes::BytesMut comes up with +// a default 64 bytes capacity that cannot be changed +// easily if you assume a bytes::BufMut trait implementation +// Therefore, we override the Default implementation here +// TODO(raitobezarius?): upstream me properly +struct BytesMutWithDefaultCapacity<const N: usize> { + inner: bytes::BytesMut, +} + +impl<const N: usize> Deref for BytesMutWithDefaultCapacity<N> { + type Target = bytes::BytesMut; + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl<const N: usize> DerefMut for BytesMutWithDefaultCapacity<N> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + +impl<const N: usize> Default for BytesMutWithDefaultCapacity<N> { + fn default() -> Self { + BytesMutWithDefaultCapacity { + inner: bytes::BytesMut::with_capacity(N), + } + } +} + +impl<const N: usize> bytes::Buf for BytesMutWithDefaultCapacity<N> { + fn remaining(&self) -> usize { + self.inner.remaining() + } + + fn chunk(&self) -> &[u8] { + self.inner.chunk() + } + + fn advance(&mut self, cnt: usize) { + self.inner.advance(cnt); + } +} + +unsafe impl<const N: usize> bytes::BufMut for BytesMutWithDefaultCapacity<N> { + fn remaining_mut(&self) -> usize { + self.inner.remaining_mut() + } 
+ + unsafe fn advance_mut(&mut self, cnt: usize) { + self.inner.advance_mut(cnt); + } + + fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice { + self.inner.chunk_mut() + } +} + +#[async_trait] +impl<T> super::blob_service_server::BlobService for GRPCBlobServiceWrapper<T> +where + T: Deref<Target = dyn BlobService> + Send + Sync + 'static, +{ + // https://github.com/tokio-rs/tokio/issues/2723#issuecomment-1534723933 + type ReadStream = BoxStream<'static, Result<super::BlobChunk, Status>>; + + #[instrument(skip_all, fields(blob.digest=format!("b3:{}", BASE64.encode(&request.get_ref().digest))))] + async fn stat( + &self, + request: Request<super::StatBlobRequest>, + ) -> Result<Response<super::StatBlobResponse>, Status> { + let rq = request.into_inner(); + let req_digest = rq + .digest + .try_into() + .map_err(|_e| Status::invalid_argument("invalid digest length"))?; + + match self.blob_service.chunks(&req_digest).await { + Ok(None) => Err(Status::not_found(format!("blob {} not found", &req_digest))), + Ok(Some(chunk_metas)) => Ok(Response::new(super::StatBlobResponse { + chunks: chunk_metas, + ..Default::default() + })), + Err(e) => { + warn!(err=%e, "failed to request chunks"); + Err(e.into()) + } + } + } + + #[instrument(skip_all, fields(blob.digest=format!("b3:{}", BASE64.encode(&request.get_ref().digest))))] + async fn read( + &self, + request: Request<super::ReadBlobRequest>, + ) -> Result<Response<Self::ReadStream>, Status> { + let rq = request.into_inner(); + + let req_digest = rq + .digest + .try_into() + .map_err(|_e| Status::invalid_argument("invalid digest length"))?; + + match self.blob_service.open_read(&req_digest).await { + Ok(Some(r)) => { + let chunks_stream = + ReaderStream::new(r).map(|chunk| Ok(super::BlobChunk { data: chunk? 
})); + Ok(Response::new(Box::pin(chunks_stream))) + } + Ok(None) => Err(Status::not_found(format!("blob {} not found", &req_digest))), + Err(e) => { + warn!(err=%e, "failed to call open_read"); + Err(e.into()) + } + } + } + + #[instrument(skip_all)] + async fn put( + &self, + request: Request<Streaming<super::BlobChunk>>, + ) -> Result<Response<super::PutBlobResponse>, Status> { + let req_inner = request.into_inner(); + + let data_stream = req_inner.map(|x| { + x.map(|x| VecDeque::from(x.data.to_vec())) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e)) + }); + + let mut data_reader = tokio_util::io::StreamReader::new(data_stream); + + let mut blob_writer = pin!(self.blob_service.open_write().await); + + tokio::io::copy(&mut data_reader, &mut blob_writer) + .await + .map_err(|e| { + warn!("error copying: {}", e); + Status::internal("error copying") + })?; + + let digest = blob_writer + .close() + .map_err(|e| { + warn!("error closing stream: {}", e); + Status::internal("error closing stream") + }) + .await?; + + Ok(Response::new(super::PutBlobResponse { + digest: digest.into(), + })) + } +} diff --git a/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs b/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs new file mode 100644 index 0000000000..5c1428690c --- /dev/null +++ b/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs @@ -0,0 +1,103 @@ +use crate::directoryservice::ClosureValidator; +use crate::proto; +use crate::{directoryservice::DirectoryService, B3Digest}; +use futures::stream::BoxStream; +use futures::TryStreamExt; +use std::ops::Deref; +use tokio_stream::once; +use tonic::{async_trait, Request, Response, Status, Streaming}; +use tracing::{instrument, warn}; + +pub struct GRPCDirectoryServiceWrapper<T> { + directory_service: T, +} + +impl<T> GRPCDirectoryServiceWrapper<T> { + pub fn new(directory_service: T) -> Self { + Self { directory_service } + } +} + +#[async_trait] +impl<T> 
proto::directory_service_server::DirectoryService for GRPCDirectoryServiceWrapper<T> +where + T: Deref<Target = dyn DirectoryService> + Send + Sync + 'static, +{ + type GetStream = BoxStream<'static, tonic::Result<proto::Directory, Status>>; + + #[instrument(skip_all)] + async fn get<'a>( + &'a self, + request: Request<proto::GetDirectoryRequest>, + ) -> Result<Response<Self::GetStream>, Status> { + let req_inner = request.into_inner(); + + let by_what = &req_inner + .by_what + .ok_or_else(|| Status::invalid_argument("invalid by_what"))?; + + match by_what { + proto::get_directory_request::ByWhat::Digest(ref digest) => { + let digest: B3Digest = digest + .clone() + .try_into() + .map_err(|_e| Status::invalid_argument("invalid digest length"))?; + + Ok(tonic::Response::new({ + if !req_inner.recursive { + let directory = self + .directory_service + .get(&digest) + .await + .map_err(|e| { + warn!(err = %e, directory.digest=%digest, "failed to get directory"); + tonic::Status::new(tonic::Code::Internal, e.to_string()) + })? + .ok_or_else(|| { + Status::not_found(format!("directory {} not found", digest)) + })?; + + Box::pin(once(Ok(directory))) + } else { + // If recursive was requested, traverse via get_recursive. + Box::pin( + self.directory_service.get_recursive(&digest).map_err(|e| { + tonic::Status::new(tonic::Code::Internal, e.to_string()) + }), + ) + } + })) + } + } + } + + #[instrument(skip_all)] + async fn put( + &self, + request: Request<Streaming<proto::Directory>>, + ) -> Result<Response<proto::PutDirectoryResponse>, Status> { + let mut req_inner = request.into_inner(); + + // We put all Directory messages we receive into ClosureValidator first. + let mut validator = ClosureValidator::default(); + while let Some(directory) = req_inner.message().await? { + validator.add(directory)?; + } + + // drain, which validates connectivity too. 
+ let directories = validator.finalize()?; + + let mut directory_putter = self.directory_service.put_multiple_start(); + for directory in directories { + directory_putter.put(directory).await?; + } + + // Properly close the directory putter. Peek at last_directory_digest + // and return it, or propagate errors. + let last_directory_dgst = directory_putter.close().await?; + + Ok(Response::new(proto::PutDirectoryResponse { + root_digest: last_directory_dgst.into(), + })) + } +} diff --git a/tvix/castore/src/proto/mod.rs b/tvix/castore/src/proto/mod.rs new file mode 100644 index 0000000000..5374e3ae5a --- /dev/null +++ b/tvix/castore/src/proto/mod.rs @@ -0,0 +1,471 @@ +#![allow(non_snake_case)] +// https://github.com/hyperium/tonic/issues/1056 +use bstr::ByteSlice; +use std::{collections::HashSet, iter::Peekable, str}; + +use prost::Message; + +mod grpc_blobservice_wrapper; +mod grpc_directoryservice_wrapper; + +pub use grpc_blobservice_wrapper::GRPCBlobServiceWrapper; +pub use grpc_directoryservice_wrapper::GRPCDirectoryServiceWrapper; + +use crate::{B3Digest, B3_LEN}; + +tonic::include_proto!("tvix.castore.v1"); + +#[cfg(feature = "tonic-reflection")] +/// Compiled file descriptors for implementing [gRPC +/// reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) with e.g. +/// [`tonic_reflection`](https://docs.rs/tonic-reflection). +pub const FILE_DESCRIPTOR_SET: &[u8] = tonic::include_file_descriptor_set!("tvix.castore.v1"); + +#[cfg(test)] +mod tests; + +/// Errors that can occur during the validation of [Directory] messages. 
+#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub enum ValidateDirectoryError { + /// Elements are not in sorted order + #[error("{:?} is not sorted", .0.as_bstr())] + WrongSorting(Vec<u8>), + /// Multiple elements with the same name encountered + #[error("{:?} is a duplicate name", .0.as_bstr())] + DuplicateName(Vec<u8>), + /// Invalid node + #[error("invalid node with name {:?}: {:?}", .0.as_bstr(), .1.to_string())] + InvalidNode(Vec<u8>, ValidateNodeError), + #[error("Total size exceeds u32::MAX")] + SizeOverflow, +} + +/// Errors that occur during Node validation +#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub enum ValidateNodeError { + #[error("No node set")] + NoNodeSet, + /// Invalid digest length encountered + #[error("Invalid Digest length: {0}")] + InvalidDigestLen(usize), + /// Invalid name encountered + #[error("Invalid name: {}", .0.as_bstr())] + InvalidName(Vec<u8>), + /// Invalid symlink target + #[error("Invalid symlink target: {}", .0.as_bstr())] + InvalidSymlinkTarget(Vec<u8>), +} + +/// Errors that occur during StatBlobResponse validation +#[derive(Debug, PartialEq, Eq, thiserror::Error)] +pub enum ValidateStatBlobResponseError { + /// Invalid digest length encountered + #[error("Invalid digest length {0} for chunk #{1}")] + InvalidDigestLen(usize, usize), +} + +/// Checks a Node name for validity as an intermediate node. +/// We disallow slashes, null bytes, '.', '..' and the empty string. +pub(crate) fn validate_node_name(name: &[u8]) -> Result<(), ValidateNodeError> { + if name.is_empty() + || name == b".." + || name == b"." + || name.contains(&0x00) + || name.contains(&b'/') + { + Err(ValidateNodeError::InvalidName(name.to_owned())) + } else { + Ok(()) + } +} + +/// NamedNode is implemented for [FileNode], [DirectoryNode] and [SymlinkNode] +/// and [node::Node], so we can ask all of them for the name easily. 
+pub trait NamedNode { + fn get_name(&self) -> &[u8]; +} + +impl NamedNode for &FileNode { + fn get_name(&self) -> &[u8] { + &self.name + } +} + +impl NamedNode for &DirectoryNode { + fn get_name(&self) -> &[u8] { + &self.name + } +} + +impl NamedNode for &SymlinkNode { + fn get_name(&self) -> &[u8] { + &self.name + } +} + +impl NamedNode for node::Node { + fn get_name(&self) -> &[u8] { + match self { + node::Node::File(node_file) => &node_file.name, + node::Node::Directory(node_directory) => &node_directory.name, + node::Node::Symlink(node_symlink) => &node_symlink.name, + } + } +} + +impl Node { + /// Ensures the node has a valid enum kind (is Some), and passes its + // per-enum validation. + pub fn validate(&self) -> Result<(), ValidateNodeError> { + if let Some(node) = self.node.as_ref() { + node.validate() + } else { + Err(ValidateNodeError::NoNodeSet) + } + } +} + +impl node::Node { + /// Returns the node with a new name. + pub fn rename(self, name: bytes::Bytes) -> Self { + match self { + node::Node::Directory(n) => node::Node::Directory(DirectoryNode { name, ..n }), + node::Node::File(n) => node::Node::File(FileNode { name, ..n }), + node::Node::Symlink(n) => node::Node::Symlink(SymlinkNode { name, ..n }), + } + } + + /// Ensures the node has a valid name, and checks the type-specific fields too. + pub fn validate(&self) -> Result<(), ValidateNodeError> { + match self { + // for a directory root node, ensure the digest has the appropriate size. + node::Node::Directory(directory_node) => { + if directory_node.digest.len() != B3_LEN { + Err(ValidateNodeError::InvalidDigestLen( + directory_node.digest.len(), + ))?; + } + validate_node_name(&directory_node.name) + } + // for a file root node, ensure the digest has the appropriate size. 
+ node::Node::File(file_node) => { + if file_node.digest.len() != B3_LEN { + Err(ValidateNodeError::InvalidDigestLen(file_node.digest.len()))?; + } + validate_node_name(&file_node.name) + } + // ensure the symlink target is not empty and doesn't contain null bytes. + node::Node::Symlink(symlink_node) => { + if symlink_node.target.is_empty() || symlink_node.target.contains(&b'\0') { + Err(ValidateNodeError::InvalidSymlinkTarget( + symlink_node.target.to_vec(), + ))?; + } + validate_node_name(&symlink_node.name) + } + } + } +} + +impl PartialOrd for node::Node { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for node::Node { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.get_name().cmp(other.get_name()) + } +} + +impl PartialOrd for FileNode { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for FileNode { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.get_name().cmp(other.get_name()) + } +} + +impl PartialOrd for SymlinkNode { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for SymlinkNode { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.get_name().cmp(other.get_name()) + } +} + +impl PartialOrd for DirectoryNode { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for DirectoryNode { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.get_name().cmp(other.get_name()) + } +} + +/// Accepts a name, and a mutable reference to the previous name. +/// If the passed name is larger than the previous one, the reference is updated. +/// If it's not, an error is returned. 
+fn update_if_lt_prev<'n>( + prev_name: &mut &'n [u8], + name: &'n [u8], +) -> Result<(), ValidateDirectoryError> { + if *name < **prev_name { + return Err(ValidateDirectoryError::WrongSorting(name.to_vec())); + } + *prev_name = name; + Ok(()) +} + +/// Inserts the given name into a HashSet if it's not already in there. +/// If it is, an error is returned. +fn insert_once<'n>( + seen_names: &mut HashSet<&'n [u8]>, + name: &'n [u8], +) -> Result<(), ValidateDirectoryError> { + if seen_names.get(name).is_some() { + return Err(ValidateDirectoryError::DuplicateName(name.to_vec())); + } + seen_names.insert(name); + Ok(()) +} + +fn checked_sum(iter: impl IntoIterator<Item = u64>) -> Option<u64> { + iter.into_iter().try_fold(0u64, |acc, i| acc.checked_add(i)) +} + +impl Directory { + /// The size of a directory is the number of all regular and symlink elements, + /// the number of directory elements, and their size fields. + pub fn size(&self) -> u64 { + if cfg!(debug_assertions) { + self.size_checked() + .expect("Directory::size exceeds u64::MAX") + } else { + self.size_checked().unwrap_or(u64::MAX) + } + } + + fn size_checked(&self) -> Option<u64> { + checked_sum([ + self.files.len().try_into().ok()?, + self.symlinks.len().try_into().ok()?, + self.directories.len().try_into().ok()?, + checked_sum(self.directories.iter().map(|e| e.size))?, + ]) + } + + /// Calculates the digest of a Directory, which is the blake3 hash of a + /// Directory protobuf message, serialized in protobuf canonical form. 
+ pub fn digest(&self) -> B3Digest { + let mut hasher = blake3::Hasher::new(); + + hasher + .update(&self.encode_to_vec()) + .finalize() + .as_bytes() + .into() + } + + /// validate checks the directory for invalid data, such as: + /// - violations of name restrictions + /// - invalid digest lengths + /// - not properly sorted lists + /// - duplicate names in the three lists + pub fn validate(&self) -> Result<(), ValidateDirectoryError> { + let mut seen_names: HashSet<&[u8]> = HashSet::new(); + + let mut last_directory_name: &[u8] = b""; + let mut last_file_name: &[u8] = b""; + let mut last_symlink_name: &[u8] = b""; + + // check directories + for directory_node in &self.directories { + node::Node::Directory(directory_node.clone()) + .validate() + .map_err(|e| { + ValidateDirectoryError::InvalidNode(directory_node.name.to_vec(), e) + })?; + + update_if_lt_prev(&mut last_directory_name, &directory_node.name)?; + insert_once(&mut seen_names, &directory_node.name)?; + } + + // check files + for file_node in &self.files { + node::Node::File(file_node.clone()) + .validate() + .map_err(|e| ValidateDirectoryError::InvalidNode(file_node.name.to_vec(), e))?; + + update_if_lt_prev(&mut last_file_name, &file_node.name)?; + insert_once(&mut seen_names, &file_node.name)?; + } + + // check symlinks + for symlink_node in &self.symlinks { + node::Node::Symlink(symlink_node.clone()) + .validate() + .map_err(|e| ValidateDirectoryError::InvalidNode(symlink_node.name.to_vec(), e))?; + + update_if_lt_prev(&mut last_symlink_name, &symlink_node.name)?; + insert_once(&mut seen_names, &symlink_node.name)?; + } + + self.size_checked() + .ok_or(ValidateDirectoryError::SizeOverflow)?; + + Ok(()) + } + + /// Allows iterating over all three nodes ([DirectoryNode], [FileNode], + /// [SymlinkNode]) in an ordered fashion, as long as the individual lists + /// are sorted (which can be checked by the [Directory::validate]). 
+ pub fn nodes(&self) -> DirectoryNodesIterator { + return DirectoryNodesIterator { + i_directories: self.directories.iter().peekable(), + i_files: self.files.iter().peekable(), + i_symlinks: self.symlinks.iter().peekable(), + }; + } + + /// Adds the specified [node::Node] to the [Directory], preserving sorted entries. + /// This assumes the [Directory] to be sorted prior to adding the node. + /// + /// Inserting an element that already exists with the same name in the directory is not + /// supported. + pub fn add(&mut self, node: node::Node) { + debug_assert!( + !self.files.iter().any(|x| x.get_name() == node.get_name()), + "name already exists in files" + ); + debug_assert!( + !self + .directories + .iter() + .any(|x| x.get_name() == node.get_name()), + "name already exists in directories" + ); + debug_assert!( + !self + .symlinks + .iter() + .any(|x| x.get_name() == node.get_name()), + "name already exists in symlinks" + ); + + match node { + node::Node::File(node) => { + let pos = self + .files + .binary_search(&node) + .expect_err("Tvix bug: dir entry with name already exists"); + self.files.insert(pos, node); + } + node::Node::Directory(node) => { + let pos = self + .directories + .binary_search(&node) + .expect_err("Tvix bug: dir entry with name already exists"); + self.directories.insert(pos, node); + } + node::Node::Symlink(node) => { + let pos = self + .symlinks + .binary_search(&node) + .expect_err("Tvix bug: dir entry with name already exists"); + self.symlinks.insert(pos, node); + } + } + } +} + +impl StatBlobResponse { + /// Validates a StatBlobResponse. All chunks must have valid blake3 digests. + /// It is allowed to send an empty list, if no more granular chunking is + /// available. 
pub fn validate(&self) -> Result<(), ValidateStatBlobResponseError> {
        for (i, chunk) in self.chunks.iter().enumerate() {
            // Compare against the digest length constant used for node
            // digests in this module, rather than blake3's key length.
            if chunk.digest.len() != B3_LEN {
                return Err(ValidateStatBlobResponseError::InvalidDigestLen(
                    chunk.digest.len(),
                    i,
                ));
            }
        }
        Ok(())
    }
}

/// Struct to hold the state of an iterator over all nodes of a Directory.
///
/// Internally, this keeps peekable Iterators over all three lists of a
/// Directory message.
pub struct DirectoryNodesIterator<'a> {
    i_directories: Peekable<std::slice::Iter<'a, DirectoryNode>>,
    i_files: Peekable<std::slice::Iter<'a, FileNode>>,
    i_symlinks: Peekable<std::slice::Iter<'a, SymlinkNode>>,
}

/// looks at two elements implementing NamedNode, and returns true if "left
/// is smaller / comes first".
///
/// Some(_) is preferred over None.
fn left_name_lt_right<A: NamedNode, B: NamedNode>(left: Option<&A>, right: Option<&B>) -> bool {
    match (left, right) {
        // if left is None, right always wins
        (None, _) => false,
        // left is Some, right is None - left wins.
        (Some(_), None) => true,
        // both are Some - compare the name.
        (Some(left_inner), Some(right_inner)) => left_inner.get_name() < right_inner.get_name(),
    }
}

impl Iterator for DirectoryNodesIterator<'_> {
    type Item = node::Node;

    // next returns the next node in the Directory.
    // we peek at all three internal iterators, and pick the one with the
    // smallest name, to ensure lexicographical ordering.
    // The individual lists are already known to be sorted.
+ fn next(&mut self) -> Option<Self::Item> { + if left_name_lt_right(self.i_directories.peek(), self.i_files.peek()) { + // i_directories is still in the game, compare with symlinks + if left_name_lt_right(self.i_directories.peek(), self.i_symlinks.peek()) { + self.i_directories + .next() + .cloned() + .map(node::Node::Directory) + } else { + self.i_symlinks.next().cloned().map(node::Node::Symlink) + } + } else { + // i_files is still in the game, compare with symlinks + if left_name_lt_right(self.i_files.peek(), self.i_symlinks.peek()) { + self.i_files.next().cloned().map(node::Node::File) + } else { + self.i_symlinks.next().cloned().map(node::Node::Symlink) + } + } + } +} diff --git a/tvix/castore/src/proto/tests/directory.rs b/tvix/castore/src/proto/tests/directory.rs new file mode 100644 index 0000000000..81b73a048d --- /dev/null +++ b/tvix/castore/src/proto/tests/directory.rs @@ -0,0 +1,452 @@ +use crate::proto::{ + node, Directory, DirectoryNode, FileNode, SymlinkNode, ValidateDirectoryError, + ValidateNodeError, +}; + +use hex_literal::hex; + +const DUMMY_DIGEST: [u8; 32] = [0; 32]; + +#[test] +fn size() { + { + let d = Directory::default(); + assert_eq!(d.size(), 0); + } + { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 0, + }], + ..Default::default() + }; + assert_eq!(d.size(), 1); + } + { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 4, + }], + ..Default::default() + }; + assert_eq!(d.size(), 5); + } + { + let d = Directory { + files: vec![FileNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + executable: false, + }], + ..Default::default() + }; + assert_eq!(d.size(), 1); + } + { + let d = Directory { + symlinks: vec![SymlinkNode { + name: "foo".into(), + target: "bar".into(), + }], + ..Default::default() + }; + assert_eq!(d.size(), 1); + } +} + +#[test] 
+#[cfg_attr(not(debug_assertions), ignore)] +#[should_panic = "Directory::size exceeds u64::MAX"] +fn size_unchecked_panic() { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX, + }], + ..Default::default() + }; + + d.size(); +} + +#[test] +#[cfg_attr(debug_assertions, ignore)] +fn size_unchecked_saturate() { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX, + }], + ..Default::default() + }; + + assert_eq!(d.size(), u64::MAX); +} + +#[test] +fn size_checked() { + // We don't test the overflow cases that rely purely on immediate + // child count, since that would take an absurd amount of memory. + { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX - 1, + }], + ..Default::default() + }; + assert_eq!(d.size_checked(), Some(u64::MAX)); + } + { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX, + }], + ..Default::default() + }; + assert_eq!(d.size_checked(), None); + } + { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX / 2, + }, + DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX / 2, + }, + ], + ..Default::default() + }; + assert_eq!(d.size_checked(), None); + } +} + +#[test] +fn digest() { + let d = Directory::default(); + + assert_eq!( + d.digest(), + (&hex!("af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262")).into() + ) +} + +#[test] +fn validate_empty() { + let d = Directory::default(); + assert_eq!(d.validate(), Ok(())); +} + +#[test] +fn validate_invalid_names() { + { + let d = Directory { + directories: vec![DirectoryNode { + name: "".into(), + digest: 
DUMMY_DIGEST.to_vec().into(), + size: 42, + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => { + assert_eq!(n, b"") + } + _ => panic!("unexpected error"), + }; + } + + { + let d = Directory { + directories: vec![DirectoryNode { + name: ".".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => { + assert_eq!(n, b".") + } + _ => panic!("unexpected error"), + }; + } + + { + let d = Directory { + files: vec![FileNode { + name: "..".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + executable: false, + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => { + assert_eq!(n, b"..") + } + _ => panic!("unexpected error"), + }; + } + + { + let d = Directory { + symlinks: vec![SymlinkNode { + name: "\x00".into(), + target: "foo".into(), + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => { + assert_eq!(n, b"\x00") + } + _ => panic!("unexpected error"), + }; + } + + { + let d = Directory { + symlinks: vec![SymlinkNode { + name: "foo/bar".into(), + target: "foo".into(), + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => { + assert_eq!(n, b"foo/bar") + } + _ => panic!("unexpected error"), + }; + } +} + +#[test] +fn validate_invalid_digest() { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: vec![0x00, 0x42].into(), // invalid length + size: 42, + }], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + 
ValidateDirectoryError::InvalidNode(_, ValidateNodeError::InvalidDigestLen(n)) => { + assert_eq!(n, 2) + } + _ => panic!("unexpected error"), + } +} + +#[test] +fn validate_sorting() { + // "b" comes before "a", bad. + { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "b".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + ], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::WrongSorting(s) => { + assert_eq!(s, b"a"); + } + _ => panic!("unexpected error"), + } + } + + // "a" exists twice, bad. + { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + ], + ..Default::default() + }; + match d.validate().expect_err("must fail") { + ValidateDirectoryError::DuplicateName(s) => { + assert_eq!(s, b"a"); + } + _ => panic!("unexpected error"), + } + } + + // "a" comes before "b", all good. + { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + DirectoryNode { + name: "b".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + ], + ..Default::default() + }; + + d.validate().expect("validate shouldn't error"); + } + + // [b, c] and [a] are both properly sorted. 
+ { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "b".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + DirectoryNode { + name: "c".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 42, + }, + ], + symlinks: vec![SymlinkNode { + name: "a".into(), + target: "foo".into(), + }], + ..Default::default() + }; + + d.validate().expect("validate shouldn't error"); + } +} + +#[test] +fn validate_overflow() { + let d = Directory { + directories: vec![DirectoryNode { + name: "foo".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: u64::MAX, + }], + ..Default::default() + }; + + match d.validate().expect_err("must fail") { + ValidateDirectoryError::SizeOverflow => {} + _ => panic!("unexpected error"), + } +} + +#[test] +fn add_nodes_to_directory() { + let mut d = Directory { + ..Default::default() + }; + + d.add(node::Node::Directory(DirectoryNode { + name: "b".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + })); + d.add(node::Node::Directory(DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + })); + d.add(node::Node::Directory(DirectoryNode { + name: "z".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + })); + + d.add(node::Node::File(FileNode { + name: "f".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + executable: true, + })); + d.add(node::Node::File(FileNode { + name: "c".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + executable: true, + })); + d.add(node::Node::File(FileNode { + name: "g".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + executable: true, + })); + + d.add(node::Node::Symlink(SymlinkNode { + name: "t".into(), + target: "a".into(), + })); + d.add(node::Node::Symlink(SymlinkNode { + name: "o".into(), + target: "a".into(), + })); + d.add(node::Node::Symlink(SymlinkNode { + name: "e".into(), + target: "a".into(), + })); + + d.validate().expect("directory should be valid"); +} + +#[test] 
+#[cfg_attr(not(debug_assertions), ignore)] +#[should_panic = "name already exists in directories"] +fn add_duplicate_node_to_directory_panics() { + let mut d = Directory { + ..Default::default() + }; + + d.add(node::Node::Directory(DirectoryNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + })); + d.add(node::Node::File(FileNode { + name: "a".into(), + digest: DUMMY_DIGEST.to_vec().into(), + size: 1, + executable: true, + })); +} diff --git a/tvix/castore/src/proto/tests/directory_nodes_iterator.rs b/tvix/castore/src/proto/tests/directory_nodes_iterator.rs new file mode 100644 index 0000000000..68f147a332 --- /dev/null +++ b/tvix/castore/src/proto/tests/directory_nodes_iterator.rs @@ -0,0 +1,78 @@ +use crate::proto::Directory; +use crate::proto::DirectoryNode; +use crate::proto::FileNode; +use crate::proto::NamedNode; +use crate::proto::SymlinkNode; + +#[test] +fn iterator() { + let d = Directory { + directories: vec![ + DirectoryNode { + name: "c".into(), + ..DirectoryNode::default() + }, + DirectoryNode { + name: "d".into(), + ..DirectoryNode::default() + }, + DirectoryNode { + name: "h".into(), + ..DirectoryNode::default() + }, + DirectoryNode { + name: "l".into(), + ..DirectoryNode::default() + }, + ], + files: vec![ + FileNode { + name: "b".into(), + ..FileNode::default() + }, + FileNode { + name: "e".into(), + ..FileNode::default() + }, + FileNode { + name: "g".into(), + ..FileNode::default() + }, + FileNode { + name: "j".into(), + ..FileNode::default() + }, + ], + symlinks: vec![ + SymlinkNode { + name: "a".into(), + ..SymlinkNode::default() + }, + SymlinkNode { + name: "f".into(), + ..SymlinkNode::default() + }, + SymlinkNode { + name: "i".into(), + ..SymlinkNode::default() + }, + SymlinkNode { + name: "k".into(), + ..SymlinkNode::default() + }, + ], + }; + + // We collect the node names as strings here to make the comparison + // less messy.
+ let mut node_names: Vec<String> = vec![]; + + for node in d.nodes() { + node_names.push(String::from_utf8(node.get_name().to_vec()).unwrap()); + } + + assert_eq!( + vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"], + node_names + ); +} diff --git a/tvix/castore/src/proto/tests/mod.rs b/tvix/castore/src/proto/tests/mod.rs new file mode 100644 index 0000000000..8d903bacb6 --- /dev/null +++ b/tvix/castore/src/proto/tests/mod.rs @@ -0,0 +1,2 @@ +mod directory; +mod directory_nodes_iterator; diff --git a/tvix/castore/src/tests/import.rs b/tvix/castore/src/tests/import.rs new file mode 100644 index 0000000000..8b3bd5ce0f --- /dev/null +++ b/tvix/castore/src/tests/import.rs @@ -0,0 +1,129 @@ +use crate::blobservice::{self, BlobService}; +use crate::directoryservice; +use crate::fixtures::*; +use crate::import::fs::ingest_path; +use crate::proto; + +use std::sync::Arc; +use tempfile::TempDir; + +#[cfg(target_family = "unix")] +use std::os::unix::ffi::OsStrExt; + +#[cfg(target_family = "unix")] +#[tokio::test] +async fn symlink() { + let blob_service = blobservice::from_addr("memory://").await.unwrap(); + let directory_service = directoryservice::from_addr("memory://").await.unwrap(); + + let tmpdir = TempDir::new().unwrap(); + + std::fs::create_dir_all(&tmpdir).unwrap(); + std::os::unix::fs::symlink( + "/nix/store/somewhereelse", + tmpdir.path().join("doesntmatter"), + ) + .unwrap(); + + let root_node = ingest_path( + Arc::from(blob_service), + directory_service, + tmpdir.path().join("doesntmatter"), + ) + .await + .expect("must succeed"); + + assert_eq!( + proto::node::Node::Symlink(proto::SymlinkNode { + name: "doesntmatter".into(), + target: "/nix/store/somewhereelse".into(), + }), + root_node, + ) +} + +#[tokio::test] +async fn single_file() { + let blob_service = + Arc::from(blobservice::from_addr("memory://").await.unwrap()) as Arc<dyn BlobService>; + let directory_service = directoryservice::from_addr("memory://").await.unwrap(); + + let tmpdir = 
TempDir::new().unwrap(); + + std::fs::write(tmpdir.path().join("root"), HELLOWORLD_BLOB_CONTENTS).unwrap(); + + let root_node = ingest_path( + blob_service.clone(), + directory_service, + tmpdir.path().join("root"), + ) + .await + .expect("must succeed"); + + assert_eq!( + proto::node::Node::File(proto::FileNode { + name: "root".into(), + digest: HELLOWORLD_BLOB_DIGEST.clone().into(), + size: HELLOWORLD_BLOB_CONTENTS.len() as u64, + executable: false, + }), + root_node, + ); + + // ensure the blob has been uploaded + assert!(blob_service.has(&HELLOWORLD_BLOB_DIGEST).await.unwrap()); +} + +#[cfg(target_family = "unix")] +#[tokio::test] +async fn complicated() { + let blob_service = + Arc::from(blobservice::from_addr("memory://").await.unwrap()) as Arc<dyn BlobService>; + let directory_service = directoryservice::from_addr("memory://").await.unwrap(); + + let tmpdir = TempDir::new().unwrap(); + + // File `.keep` + std::fs::write(tmpdir.path().join(".keep"), vec![]).unwrap(); + // Symlink `aa` + std::os::unix::fs::symlink("/nix/store/somewhereelse", tmpdir.path().join("aa")).unwrap(); + // Directory `keep` + std::fs::create_dir(tmpdir.path().join("keep")).unwrap(); + // File `keep/.keep` + std::fs::write(tmpdir.path().join("keep").join(".keep"), vec![]).unwrap(); + + let root_node = ingest_path(blob_service.clone(), &directory_service, tmpdir.path()) + .await + .expect("must succeed"); + + // ensure root_node matched expectations + assert_eq!( + proto::node::Node::Directory(proto::DirectoryNode { + name: tmpdir + .path() + .file_name() + .unwrap() + .as_bytes() + .to_owned() + .into(), + digest: DIRECTORY_COMPLICATED.digest().into(), + size: DIRECTORY_COMPLICATED.size(), + }), + root_node, + ); + + // ensure DIRECTORY_WITH_KEEP and DIRECTORY_COMPLICATED have been uploaded + assert!(directory_service + .get(&DIRECTORY_WITH_KEEP.digest()) + .await + .unwrap() + .is_some()); + assert!(directory_service + .get(&DIRECTORY_COMPLICATED.digest()) + .await + .unwrap() +
.is_some()); + + // ensure EMPTY_BLOB_CONTENTS has been uploaded + assert!(blob_service.has(&EMPTY_BLOB_DIGEST).await.unwrap()); +} diff --git a/tvix/castore/src/tests/mod.rs b/tvix/castore/src/tests/mod.rs new file mode 100644 index 0000000000..d016f3e0aa --- /dev/null +++ b/tvix/castore/src/tests/mod.rs @@ -0,0 +1 @@ +mod import; diff --git a/tvix/castore/src/tonic.rs b/tvix/castore/src/tonic.rs new file mode 100644 index 0000000000..4b65d6b028 --- /dev/null +++ b/tvix/castore/src/tonic.rs @@ -0,0 +1,122 @@ +use tokio::net::UnixStream; +use tonic::transport::{Channel, Endpoint}; + +/// Returns true if the URL carries a `wait-connect=1` query parameter. +fn url_wants_wait_connect(url: &url::Url) -> bool { + url.query_pairs() + .filter(|(k, v)| k == "wait-connect" && v == "1") + .count() + > 0 +} + +/// Turn a [url::Url] to a [Channel] if it can be parsed successfully. +/// It supports the following schemes (and URLs): +/// - `grpc+http://[::1]:8000`, connecting over unencrypted HTTP/2 (h2c) +/// - `grpc+https://[::1]:8000`, connecting over encrypted HTTP/2 +/// - `grpc+unix:/path/to/socket`, connecting to a unix domain socket +/// +/// All URLs support adding `wait-connect=1` as a URL parameter, in which case +/// the connection is established eagerly (awaited before returning, so connect +/// errors are reported here). Without it, the connection is established lazily. +pub async fn channel_from_url(url: &url::Url) -> Result<Channel, self::Error> { + match url.scheme() { + "grpc+unix" => { + if url.host_str().is_some() { + return Err(Error::HostSetForUnixSocket()); + } + + let connector = tower::service_fn({ + let url = url.clone(); + move |_: tonic::transport::Uri| UnixStream::connect(url.path().to_string().clone()) + }); + + // the URL given to the endpoint doesn't matter, the connector above + // ignores it and always connects to the socket path. + let endpoint = Endpoint::from_static("http://[::]:50051"); + if url_wants_wait_connect(url) { + Ok(endpoint.connect_with_connector(connector).await?) + } else { + Ok(endpoint.connect_with_connector_lazy(connector)) + } + } + _ => { + // ensure path is empty, not supported with gRPC.
+ if !url.path().is_empty() { + return Err(Error::PathMayNotBeSet()); + } + + // Stringify the URL and remove the grpc+ prefix. + // We can't use `url.set_scheme(rest)`, as it disallows + // setting something http(s) that previously wasn't. + let unprefixed_url_str = match url.to_string().strip_prefix("grpc+") { + None => return Err(Error::MissingGRPCPrefix()), + Some(url_str) => url_str.to_owned(), + }; + + // Use the regular tonic transport::Endpoint logic, but unprefixed_url_str, + // as tonic doesn't know about grpc+http[s]. + let endpoint = Endpoint::try_from(unprefixed_url_str)?; + // Connect eagerly if wait-connect=1 was requested, lazily otherwise. + if url_wants_wait_connect(url) { + Ok(endpoint.connect().await?) + } else { + Ok(endpoint.connect_lazy()) + } + } + } +} + +/// Errors occurring when trying to connect to a backend. +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("grpc+ prefix is missing from URL")] + MissingGRPCPrefix(), + + #[error("host may not be set for unix domain sockets")] + HostSetForUnixSocket(), + + #[error("path may not be set")] + PathMayNotBeSet(), + + #[error("transport error: {0}")] + TransportError(tonic::transport::Error), +} + +impl From<tonic::transport::Error> for Error { + fn from(value: tonic::transport::Error) -> Self { + Self::TransportError(value) + } +} + +#[cfg(test)] +mod tests { + use super::channel_from_url; + use rstest::rstest; + use url::Url; + + #[rstest] + /// Correct scheme to connect to a unix socket. + #[case::valid_unix_socket("grpc+unix:///path/to/somewhere", true)] + /// Connecting with wait-connect set to 0 succeeds, as that's the default. + #[case::valid_unix_socket_wait_connect_0("grpc+unix:///path/to/somewhere?wait-connect=0", true)] + /// Connecting with wait-connect set to 1 fails, as the path doesn't exist. + #[case::valid_unix_socket_wait_connect_1( + "grpc+unix:///path/to/somewhere?wait-connect=1", + false + )] + /// Correct scheme for unix socket, but setting a host too, which is invalid.
+ #[case::invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)] + /// Correct scheme to connect to localhost, with port 12345 + #[case::valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)] + /// Correct scheme to connect to localhost over http, without specifying a port. + #[case::valid_http_host_without_port("grpc+http://localhost", true)] + /// Correct scheme to connect to localhost over https, without specifying a port. + #[case::valid_https_host_without_port("grpc+https://localhost", true)] + /// Correct scheme to connect to localhost over http, but with additional path, which is invalid. + #[case::invalid_host_and_path("grpc+http://localhost/some-path", false)] + /// Connecting with wait-connect set to 0 succeeds, as that's the default. + #[case::valid_host_wait_connect_0("grpc+http://localhost?wait-connect=0", true)] + /// Connecting with wait-connect set to 1 fails, as the host doesn't exist. + #[case::valid_host_wait_connect_1_fails("grpc+http://nonexist.invalid?wait-connect=1", false)] + #[tokio::test] + async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] is_ok: bool) { + let url = Url::parse(uri_str).expect("must parse"); + assert_eq!(channel_from_url(&url).await.is_ok(), is_ok) + } +} |