about summary refs log tree commit diff
path: root/tvix/castore
diff options
context:
space:
mode:
Diffstat (limited to 'tvix/castore')
-rw-r--r--tvix/castore/Cargo.toml124
-rw-r--r--tvix/castore/build.rs39
-rw-r--r--tvix/castore/default.nix28
-rw-r--r--tvix/castore/docs/blobstore-chunking.md147
-rw-r--r--tvix/castore/docs/blobstore-protocol.md104
-rw-r--r--tvix/castore/docs/data-model.md50
-rw-r--r--tvix/castore/docs/why-not-git-trees.md57
-rw-r--r--tvix/castore/protos/LICENSE21
-rw-r--r--tvix/castore/protos/castore.proto71
-rw-r--r--tvix/castore/protos/default.nix54
-rw-r--r--tvix/castore/protos/rpc_blobstore.proto85
-rw-r--r--tvix/castore/protos/rpc_directory.proto53
-rw-r--r--tvix/castore/src/blobservice/chunked_reader.rs496
-rw-r--r--tvix/castore/src/blobservice/combinator.rs132
-rw-r--r--tvix/castore/src/blobservice/from_addr.rs111
-rw-r--r--tvix/castore/src/blobservice/grpc.rs349
-rw-r--r--tvix/castore/src/blobservice/memory.rs127
-rw-r--r--tvix/castore/src/blobservice/mod.rs103
-rw-r--r--tvix/castore/src/blobservice/naive_seeker.rs265
-rw-r--r--tvix/castore/src/blobservice/object_store.rs546
-rw-r--r--tvix/castore/src/blobservice/tests/mod.rs253
-rw-r--r--tvix/castore/src/blobservice/tests/utils.rs41
-rw-r--r--tvix/castore/src/digests.rs86
-rw-r--r--tvix/castore/src/directoryservice/bigtable.rs357
-rw-r--r--tvix/castore/src/directoryservice/closure_validator.rs309
-rw-r--r--tvix/castore/src/directoryservice/from_addr.rs189
-rw-r--r--tvix/castore/src/directoryservice/grpc.rs345
-rw-r--r--tvix/castore/src/directoryservice/memory.rs87
-rw-r--r--tvix/castore/src/directoryservice/mod.rs124
-rw-r--r--tvix/castore/src/directoryservice/object_store.rs261
-rw-r--r--tvix/castore/src/directoryservice/simple_putter.rs75
-rw-r--r--tvix/castore/src/directoryservice/sled.rs189
-rw-r--r--tvix/castore/src/directoryservice/tests/mod.rs227
-rw-r--r--tvix/castore/src/directoryservice/tests/utils.rs46
-rw-r--r--tvix/castore/src/directoryservice/traverse.rs186
-rw-r--r--tvix/castore/src/directoryservice/utils.rs77
-rw-r--r--tvix/castore/src/errors.rs54
-rw-r--r--tvix/castore/src/fixtures.rs88
-rw-r--r--tvix/castore/src/fs/file_attr.rs29
-rw-r--r--tvix/castore/src/fs/fuse/mod.rs123
-rw-r--r--tvix/castore/src/fs/fuse/tests.rs1245
-rw-r--r--tvix/castore/src/fs/inode_tracker.rs207
-rw-r--r--tvix/castore/src/fs/inodes.rs96
-rw-r--r--tvix/castore/src/fs/mod.rs874
-rw-r--r--tvix/castore/src/fs/root_nodes.rs37
-rw-r--r--tvix/castore/src/fs/virtiofs.rs238
-rw-r--r--tvix/castore/src/hashing_reader.rs89
-rw-r--r--tvix/castore/src/import/archive.rs373
-rw-r--r--tvix/castore/src/import/blobs.rs177
-rw-r--r--tvix/castore/src/import/error.rs20
-rw-r--r--tvix/castore/src/import/fs.rs198
-rw-r--r--tvix/castore/src/import/mod.rs345
-rw-r--r--tvix/castore/src/lib.rs30
-rw-r--r--tvix/castore/src/path.rs446
-rw-r--r--tvix/castore/src/proto/grpc_blobservice_wrapper.rs175
-rw-r--r--tvix/castore/src/proto/grpc_directoryservice_wrapper.rs103
-rw-r--r--tvix/castore/src/proto/mod.rs471
-rw-r--r--tvix/castore/src/proto/tests/directory.rs452
-rw-r--r--tvix/castore/src/proto/tests/directory_nodes_iterator.rs78
-rw-r--r--tvix/castore/src/proto/tests/mod.rs2
-rw-r--r--tvix/castore/src/tests/import.rs129
-rw-r--r--tvix/castore/src/tests/mod.rs1
-rw-r--r--tvix/castore/src/tonic.rs122
63 files changed, 12016 insertions, 0 deletions
diff --git a/tvix/castore/Cargo.toml b/tvix/castore/Cargo.toml
new file mode 100644
index 0000000000..40d4f24d9f
--- /dev/null
+++ b/tvix/castore/Cargo.toml
@@ -0,0 +1,124 @@
+[package]
+name = "tvix-castore"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+async-compression = { version = "0.4.9", features = ["tokio", "zstd"]}
+async-stream = "0.3.5"
+async-tempfile = "0.4.0"
+blake3 = { version = "1.3.1", features = ["rayon", "std", "traits-preview"] }
+bstr = "1.6.0"
+bytes = "1.4.0"
+data-encoding = "2.3.3"
+digest = "0.10.7"
+fastcdc = { version = "3.1.0", features = ["tokio"] }
+futures = "0.3.30"
+indicatif = "0.17.8"
+lazy_static = "1.4.0"
+object_store = { version = "0.9.1", features = ["http"] }
+parking_lot = "0.12.1"
+pin-project-lite = "0.2.13"
+prost = "0.12.1"
+sled = { version = "0.34.7" }
+thiserror = "1.0.38"
+tokio-stream = { version = "0.1.14", features = ["fs", "net"] }
+tokio-util = { version = "0.7.9", features = ["io", "io-util", "codec"] }
+tokio-tar = "0.3.1"
+tokio = { version = "1.32.0", features = ["fs", "macros", "net", "rt", "rt-multi-thread", "signal"] }
+tonic = "0.11.0"
+tower = "0.4.13"
+tracing = "0.1.37"
+tracing-indicatif = "0.3.6"
+url = "2.4.0"
+walkdir = "2.4.0"
+zstd = "0.13.0"
+serde = { version = "1.0.197", features = [ "derive" ] }
+serde_with = "3.7.0"
+serde_qs = "0.12.0"
+petgraph = "0.6.4"
+
+[dependencies.bigtable_rs]
+optional = true
+# https://github.com/liufuyang/bigtable_rs/pull/72
+git = "https://github.com/flokli/bigtable_rs"
+rev = "0af404741dfc40eb9fa99cf4d4140a09c5c20df7"
+
+[dependencies.fuse-backend-rs]
+optional = true
+version = "0.11.0"
+
+[dependencies.libc]
+optional = true
+version = "0.2.144"
+
+[dependencies.tonic-reflection]
+optional = true
+version = "0.11.0"
+
+[dependencies.vhost]
+optional = true
+version = "0.6"
+
+[dependencies.vhost-user-backend]
+optional = true
+version = "0.8"
+
+[dependencies.virtio-queue]
+optional = true
+version = "0.7"
+
+[dependencies.vm-memory]
+optional = true
+version = "0.10"
+
+[dependencies.vmm-sys-util]
+optional = true
+version = "0.11"
+
+[dependencies.virtio-bindings]
+optional = true
+version = "0.2.1"
+
+[build-dependencies]
+prost-build = "0.12.1"
+tonic-build = "0.11.0"
+
+[dev-dependencies]
+async-process = "2.1.0"
+rstest = "0.19.0"
+tempfile = "3.3.0"
+tokio-retry = "0.3.0"
+hex-literal = "0.4.1"
+rstest_reuse = "0.6.0"
+xattr = "1.3.1"
+
+[features]
+default = ["cloud"]
+cloud = [
+  "dep:bigtable_rs",
+  "object_store/aws",
+  "object_store/azure",
+  "object_store/gcp",
+]
+fs = ["dep:libc", "dep:fuse-backend-rs"]
+virtiofs = [
+  "fs",
+  "dep:vhost",
+  "dep:vhost-user-backend",
+  "dep:virtio-queue",
+  "dep:vm-memory",
+  "dep:vmm-sys-util",
+  "dep:virtio-bindings",
+  "fuse-backend-rs?/vhost-user-fs", # impl FsCacheReqHandler for SlaveFsCacheReq
+  "fuse-backend-rs?/virtiofs",
+]
+fuse = ["fs"]
+tonic-reflection = ["dep:tonic-reflection"]
+# Whether to run the integration tests.
+# Requires the following packages in $PATH:
+# cbtemulator, google-cloud-bigtable-tool
+integration = []
+
+[lints]
+workspace = true
diff --git a/tvix/castore/build.rs b/tvix/castore/build.rs
new file mode 100644
index 0000000000..089c093e71
--- /dev/null
+++ b/tvix/castore/build.rs
@@ -0,0 +1,39 @@
+use std::io::Result;
+
+fn main() -> Result<()> {
+    #[allow(unused_mut)]
+    let mut builder = tonic_build::configure();
+
+    #[cfg(feature = "tonic-reflection")]
+    {
+        let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR").unwrap());
+        let descriptor_path = out_dir.join("tvix.castore.v1.bin");
+
+        builder = builder.file_descriptor_set_path(descriptor_path);
+    };
+
+    // https://github.com/hyperium/tonic/issues/908
+    let mut config = prost_build::Config::new();
+    config.bytes(["."]);
+    config.type_attribute(".", "#[derive(Eq, Hash)]");
+
+    builder
+        .build_server(true)
+        .build_client(true)
+        .emit_rerun_if_changed(false)
+        .compile_with_config(
+            config,
+            &[
+                "tvix/castore/protos/castore.proto",
+                "tvix/castore/protos/rpc_blobstore.proto",
+                "tvix/castore/protos/rpc_directory.proto",
+            ],
+            // If we are in running `cargo build` manually, using `../..` works fine,
+            // but in case we run inside a nix build, we need to instead point PROTO_ROOT
+            // to a sparseTree containing that structure.
+            &[match std::env::var_os("PROTO_ROOT") {
+                Some(proto_root) => proto_root.to_str().unwrap().to_owned(),
+                None => "../..".to_string(),
+            }],
+        )
+}
diff --git a/tvix/castore/default.nix b/tvix/castore/default.nix
new file mode 100644
index 0000000000..03a12b6c20
--- /dev/null
+++ b/tvix/castore/default.nix
@@ -0,0 +1,28 @@
+{ depot, pkgs, lib, ... }:
+
+(depot.tvix.crates.workspaceMembers.tvix-castore.build.override {
+  runTests = true;
+  testPreRun = ''
+    export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt;
+  '';
+}).overrideAttrs (old: rec {
+  meta.ci.targets = [ "integration-tests" ] ++ lib.filter (x: lib.hasPrefix "with-features" x || x == "no-features") (lib.attrNames passthru);
+  passthru = (depot.tvix.utils.mkFeaturePowerset {
+    inherit (old) crateName;
+    features = ([ "cloud" "fuse" "tonic-reflection" ]
+      # virtiofs feature currently fails to build on Darwin
+      ++ lib.optional pkgs.stdenv.isLinux "virtiofs");
+    override.testPreRun = ''
+      export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt
+    '';
+  }) // {
+    integration-tests = depot.tvix.crates.workspaceMembers.${old.crateName}.build.override (old: {
+      runTests = true;
+      testPreRun = ''
+        export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt;
+        export PATH="$PATH:${pkgs.lib.makeBinPath [ pkgs.cbtemulator pkgs.google-cloud-bigtable-tool ]}"
+      '';
+      features = old.features ++ [ "integration" ];
+    });
+  };
+})
diff --git a/tvix/castore/docs/blobstore-chunking.md b/tvix/castore/docs/blobstore-chunking.md
new file mode 100644
index 0000000000..49bbe69275
--- /dev/null
+++ b/tvix/castore/docs/blobstore-chunking.md
@@ -0,0 +1,147 @@
+# BlobStore: Chunking & Verified Streaming
+
+`tvix-castore`'s BlobStore is a content-addressed storage system, using [blake3]
+as hash function.
+
+Returned data is fetched by using the digest as lookup key, and can be verified
+to be correct by feeding the received data through the hash function and
+ensuring it matches the digest initially used for the lookup.
+
+This means, data can be downloaded by any untrusted third-party as well, as the
+received data is validated to match the digest it was originally requested with.
+
+However, for larger blobs of data, having to download the entire blob at once is
+wasteful, if we only care about a part of the blob. Think about mounting a
+seekable data structure, like loop-mounting an .iso file, or doing partial reads
+in a large Parquet file, a column-oriented data format.
+
+> We want to have the possibility to *seek* into a larger file.
+
+This however shouldn't compromise on data integrity properties - we should not
+need to trust a peer we're downloading from to be "honest" about the partial
+data we're reading. We should be able to verify smaller reads.
+
+Especially when substituting from an untrusted third-party, we want to be able
+to detect quickly if that third-party is sending us wrong data, and terminate
+the connection early.
+
+## Chunking
+In content-addressed systems, this problem has historically been solved by
+breaking larger blobs into smaller chunks, which can be fetched individually,
+and making a hash of *this listing* the blob digest/identifier.
+
+ - BitTorrent for example breaks files up into smaller chunks, and maintains
+   a list of sha1 digests for each of these chunks. Magnet links contain a
+   digest over this listing as an identifier. (See [bittorrent-v2][here for
+   more details]).
+   With the identifier, a client can fetch the entire list, and then recursively
+   "unpack the graph" of nodes, until it ends up with a list of individual small
+   chunks, which can be fetched individually.
+ - Similarly, IPFS with its IPLD model builds up a Merkle DAG, and uses the
+   digest of the root node as an identitier.
+
+These approaches solve the problem of being able to fetch smaller chunks in a
+trusted fashion. They can also do some deduplication, in case there's the same
+leaf nodes same leaf nodes in multiple places.
+
+However, they also have a big disadvantage. The chunking parameters, and the
+"topology" of the graph structure itself "bleed" into the root hash of the
+entire data structure itself.
+
+Depending on the chunking parameters used, there's different representations for
+the same data, causing less data sharing/reuse in the overall system, in terms of how
+many chunks need to be downloaded vs. are already available locally, as well as
+how compact data is stored on-disk.
+
+This can be workarounded by agreeing on only a single way of chunking, but it's
+not pretty and misses a lot of deduplication potential.
+
+### Chunking in Tvix' Blobstore
+tvix-castore's BlobStore uses a hybrid approach to eliminate some of the
+disadvantages, while still being content-addressed internally, with the
+highlighted benefits.
+
+It uses [blake3] as hash function, and the blake3 digest of **the raw data
+itself** as an identifier (rather than some application-specific Merkle DAG that
+also embeds some chunking information).
+
+BLAKE3 is a tree hash where all left nodes fully populated, contrary to
+conventional serial hash functions. To be able to validate the hash of a node,
+one only needs the hash of the (2) children [^1], if any.
+
+This means one only needs to the root digest to validate a constructions, and these
+constructions can be sent [separately][bao-spec].
+
+This relieves us from the need of having to encode more granular chunking into
+our data model / identifier upfront, but can make this a mostly a transport/
+storage concern.
+
+For the some more description on the (remote) protocol, check
+`./blobstore-protocol.md`.
+
+#### Logical vs. physical chunking
+
+Due to the properties of the BLAKE3 hash function, we have logical blocks of
+1KiB, but this doesn't necessarily imply we need to restrict ourselves to these
+chunk sizes w.r.t. what "physical chunks" are sent over the wire between peers,
+or are stored on-disk.
+
+The only thing we need to be able to read and verify an arbitrary byte range is
+having the covering range of aligned 1K blocks, and a construction from the root
+digest to the 1K block.
+
+Note the intermediate hash tree can be further trimmed, [omitting][bao-tree]
+lower parts of the tree while still providing verified streaming - at the cost
+of having to fetch larger covering ranges of aligned blocks.
+
+Let's pick an example. We identify each KiB by a number here for illustrational
+purposes.
+
+Assuming we omit the last two layers of the hash tree, we end up with logical
+4KiB leaf chunks (`bao_shift` of `2`).
+
+For a blob of 14 KiB total size, we could fetch logical blocks `[0..=3]`,
+`[4..=7]`, `[8..=11]` and `[12..=13]` in an authenticated fashion:
+
+`[ 0 1 2 3 ] [ 4 5 6 7 ] [ 8 9 10 11 ] [ 12 13 ]`
+
+Assuming the server now informs us about the following physical chunking:
+
+```
+[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ]`
+```
+
+If our application now wants to arbitrarily read from 0 until 4 (inclusive):
+
+```
+[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ]
+ |-------------|
+
+```
+
+…we need to fetch physical chunks `[ 0 1 ]`, `[ 2 3 4 5 ]` and `[ 6 ] [ 7 8 ]`.
+
+
+`[ 0 1 ]` and `[ 2 3 4 5 ]` are obvious, they contain the data we're
+interested in.
+
+We however also need to fetch the physical chunks `[ 6 ]` and `[ 7 8 ]`, so we
+can assemble `[ 4 5 6 7 ]` to verify both logical chunks:
+
+```
+[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ]
+^       ^           ^     ^
+|----4KiB----|------4KiB-----|
+```
+
+Each physical chunk fetched can be validated to have the blake3 digest that was
+communicated upfront, and can be stored in a client-side cache/storage, so
+subsequent / other requests for the same data will be fast(er).
+
+---
+
+[^1]: and the surrounding context, aka position inside the whole blob, which is available while verifying the tree
+[bittorrent-v2]: https://blog.libtorrent.org/2020/09/bittorrent-v2/
+[blake3]: https://github.com/BLAKE3-team/BLAKE3
+[bao-spec]: https://github.com/oconnor663/bao/blob/master/docs/spec.md
+[bao-tree]: https://github.com/n0-computer/bao-tree
diff --git a/tvix/castore/docs/blobstore-protocol.md b/tvix/castore/docs/blobstore-protocol.md
new file mode 100644
index 0000000000..048cafc3d8
--- /dev/null
+++ b/tvix/castore/docs/blobstore-protocol.md
@@ -0,0 +1,104 @@
+# BlobStore: Protocol / Composition
+
+This documents describes the protocol that BlobStore uses to substitute blobs
+other ("remote") BlobStores.
+
+How to come up with the blake3 digest of the blob to fetch is left to another
+layer in the stack.
+
+To put this into the context of Tvix as a Nix alternative, a blob represents an
+individual file inside a StorePath.
+In the Tvix Data Model, this is accomplished by having a `FileNode` (either the
+`root_node` in a `PathInfo` message, or a individual file inside a `Directory`
+message) encode a BLAKE3 digest.
+
+However, the whole infrastructure can be applied for other usecases requiring
+exchange/storage or access into data of which the blake3 digest is known.
+
+## Protocol and Interfaces
+As an RPC protocol, BlobStore currently uses gRPC.
+
+On the Rust side of things, every blob service implements the
+[`BlobService`](../src/blobservice/mod.rs) async trait, which isn't
+gRPC-specific.
+
+This `BlobService` trait provides functionality to check for existence of Blobs,
+read from blobs, and write new blobs.
+It also provides a method to ask for more granular chunks if they are available.
+
+In addition to some in-memory, on-disk and (soon) object-storage-based
+implementations, we also have a `BlobService` implementation that talks to a
+gRPC server, as well as a gRPC server wrapper component, which provides a gRPC
+service for anything implementing the `BlobService` trait.
+
+This makes it very easy to talk to a remote `BlobService`, which does not even
+need to be written in the same language, as long it speaks the same gRPC
+protocol.
+
+It also puts very little requirements on someone implementing a new
+`BlobService`, and how its internal storage or chunking algorithm looks like.
+
+The gRPC protocol is documented in `../protos/rpc_blobstore.proto`.
+Contrary to the `BlobService` trait, it does not have any options for seeking/
+ranging, as it's more desirable to provide this through chunking (see also
+`./blobstore-chunking.md`).
+
+## Composition
+Different `BlobStore` are supposed to be "composed"/"layered" to express
+caching, multiple local and remote sources.
+
+The fronting interface can be the same, it'd just be multiple "tiers" that can
+respond to requests, depending on where the data resides. [^1]
+
+This makes it very simple for consumers, as they don't need to be aware of the
+entire substitutor config.
+
+The flexibility of this doesn't need to be exposed to the user in the default
+case; in most cases we should be fine with some form of on-disk storage and a
+bunch of substituters with different priorities.
+
+### gRPC Clients
+Clients are encouraged to always read blobs in a chunked fashion (asking for a
+list of chunks for a blob via `BlobService.Stat()`, then fetching chunks via
+`BlobService.Read()` as needed), instead of directly reading the entire blob via
+`BlobService.Read()`.
+
+In a composition setting, this provides opportunity for caching, and avoids
+downloading some chunks if they're already present locally (for example, because
+they were already downloaded by reading from a similar blob earlier).
+
+It also removes the need for seeking to be a part of the gRPC protocol
+alltogether, as chunks are supposed to be "reasonably small" [^2].
+
+There's some further optimization potential, a `BlobService.Stat()` request
+could tell the server it's happy with very small blobs just being inlined in
+an additional additional field in the response, which would allow clients to
+populate their local chunk store in a single roundtrip.
+
+## Verified Streaming
+As already described in `./docs/blobstore-chunking.md`, the physical chunk
+information sent in a `BlobService.Stat()` response is still sufficient to fetch
+in an authenticated fashion.
+
+The exact protocol and formats are still a bit in flux, but here's some notes:
+
+ - `BlobService.Stat()` request gets a `send_bao` field (bool), signalling a
+   [BAO][bao-spec] should be sent. Could also be `bao_shift` integer, signalling
+   how detailed (down to the leaf chunks) it should go.
+   The exact format (and request fields) still need to be defined, edef has some
+   ideas around omitting some intermediate hash nodes over the wire and
+   recomputing them, reducing size by another ~50% over [bao-tree].
+ - `BlobService.Stat()` response gets some bao-related fields (`bao_shift`
+   field, signalling the actual format/shift level the server replies with, the
+   actual bao, and maybe some format specifier).
+   It would be nice to also be compatible with the baos used by [iroh], so we
+   can provide an implementation using it too.
+
+---
+
+[^1]: We might want to have some backchannel, so it becomes possible to provide
+      feedback to the user that something is downloaded.
+[^2]: Something between 512K-4M, TBD.
+[bao-spec]: https://github.com/oconnor663/bao/blob/master/docs/spec.md
+[bao-tree]: https://github.com/n0-computer/bao-tree
+[iroh]: https://github.com/n0-computer/iroh
diff --git a/tvix/castore/docs/data-model.md b/tvix/castore/docs/data-model.md
new file mode 100644
index 0000000000..2df6761aae
--- /dev/null
+++ b/tvix/castore/docs/data-model.md
@@ -0,0 +1,50 @@
+# Data model
+
+This provides some more notes on the fields used in castore.proto.
+
+See `//tvix/store/docs/api.md` for the full context.
+
+## Directory message
+`Directory` messages use the blake3 hash of their canonical protobuf
+serialization as its identifier.
+
+A `Directory` message contains three lists, `directories`, `files` and
+`symlinks`, holding `DirectoryNode`, `FileNode` and `SymlinkNode` messages
+respectively. They describe all the direct child elements that are contained in
+a directory.
+
+All three message types have a `name` field, specifying the (base)name of the
+element (which MUST not contain slashes or null bytes, and MUST not be '.' or '..').
+For reproducibility reasons, the lists MUST be sorted by that name and also
+MUST be unique across all three lists.
+
+In addition to the `name` field, the various *Node messages have the following
+fields:
+
+## DirectoryNode
+A `DirectoryNode` message represents a child directory.
+
+It has a `digest` field, which points to the identifier of another `Directory`
+message, making a `Directory` a merkle tree (or strictly speaking, a graph, as
+two elements pointing to a child directory with the same contents would point
+to the same `Directory` message.
+
+There's also a `size` field, containing the (total) number of all child
+elements in the referenced `Directory`, which helps for inode calculation.
+
+## FileNode
+A `FileNode` message represents a child (regular) file.
+
+Its `digest` field contains the blake3 hash of the file contents. It can be
+looked up in the `BlobService`.
+
+The `size` field contains the size of the blob the `digest` field refers to.
+
+The `executable` field specifies whether the file should be marked as
+executable or not.
+
+## SymlinkNode
+A `SymlinkNode` message represents a child symlink.
+
+In addition to the `name` field, the only additional field is the `target`,
+which is a string containing the target of the symlink.
diff --git a/tvix/castore/docs/why-not-git-trees.md b/tvix/castore/docs/why-not-git-trees.md
new file mode 100644
index 0000000000..fd46252cf5
--- /dev/null
+++ b/tvix/castore/docs/why-not-git-trees.md
@@ -0,0 +1,57 @@
+## Why not git tree objects?
+
+We've been experimenting with (some variations of) the git tree and object
+format, and ultimately decided against using it as an internal format, and
+instead adapted the one documented in the other documents here.
+
+While the tvix-store API protocol shares some similarities with the format used
+in git for trees and objects, the git one has shown some significant
+disadvantages:
+
+### The binary encoding itself
+
+#### trees
+The git tree object format is a very binary, error-prone and
+"made-to-be-read-and-written-from-C" format.
+
+Tree objects are a combination of null-terminated strings, and fields of known
+length. References to other tree objects use the literal sha1 hash of another
+tree object in this encoding.
+Extensions of the format/changes are very hard to do right, because parsers are
+not aware they might be parsing something different.
+
+The tvix-store protocol uses a canonical protobuf serialization, and uses
+the [blake3][blake3] hash of that serialization to point to other `Directory`
+messages.
+It's both compact and with a wide range of libraries for encoders and decoders
+in many programming languages.
+The choice of protobuf makes it easy to add new fields, and make old clients
+aware of some unknown fields being detected [^adding-fields].
+
+#### blob
+On disk, git blob objects start with a "blob" prefix, then the size of the
+payload, and then the data itself. The hash of a blob is the literal sha1sum
+over all of this - which makes it something very git specific to request for.
+
+tvix-store simply uses the [blake3][blake3] hash of the literal contents
+when referring to a file/blob, which makes it very easy to ask other data
+sources for the same data, as no git-specific payload is included in the hash.
+This also plays very well together with things like [iroh][iroh-discussion],
+which plans to provide a way to substitute (large)blobs by their blake3 hash
+over the IPFS network.
+
+In addition to that, [blake3][blake3] makes it possible to do
+[verified streaming][bao], as already described in other parts of the
+documentation.
+
+The git tree object format uses sha1 both for references to other trees and
+hashes of blobs, which isn't really a hash function to fundamentally base
+everything on in 2023.
+The [migration to sha256][git-sha256] also has been dead for some years now,
+and it's unclear how a "blake3" version of this would even look like.
+
+[bao]: https://github.com/oconnor663/bao
+[blake3]: https://github.com/BLAKE3-team/BLAKE3
+[git-sha256]: https://git-scm.com/docs/hash-function-transition/
+[iroh-discussion]: https://github.com/n0-computer/iroh/discussions/707#discussioncomment-5070197
+[^adding-fields]: Obviously, adding new fields will change hashes, but it's something that's easy to detect.
\ No newline at end of file
diff --git a/tvix/castore/protos/LICENSE b/tvix/castore/protos/LICENSE
new file mode 100644
index 0000000000..2034ada6fd
--- /dev/null
+++ b/tvix/castore/protos/LICENSE
@@ -0,0 +1,21 @@
+Copyright © The Tvix Authors
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+“Software”), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
diff --git a/tvix/castore/protos/castore.proto b/tvix/castore/protos/castore.proto
new file mode 100644
index 0000000000..1ef4044045
--- /dev/null
+++ b/tvix/castore/protos/castore.proto
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: edef <edef@unfathomable.blue>
+// SPDX-License-Identifier: OSL-3.0 OR MIT OR Apache-2.0
+
+syntax = "proto3";
+
+package tvix.castore.v1;
+
+option go_package = "code.tvl.fyi/tvix/castore-go;castorev1";
+
+// A Directory can contain Directory, File or Symlink nodes.
+// Each of these nodes have a name attribute, which is the basename in that
+// directory and node type specific attributes.
+// The name attribute:
+//  - MUST not contain slashes or null bytes
+//  - MUST not be '.' or '..'
+//  - MUST be unique across all three lists
+// Elements in each list need to be lexicographically ordered by the name
+// attribute.
+message Directory {
+  repeated DirectoryNode directories = 1;
+  repeated FileNode files = 2;
+  repeated SymlinkNode symlinks = 3;
+}
+
+// A DirectoryNode represents a directory in a Directory.
+message DirectoryNode {
+  // The (base)name of the directory
+  bytes name = 1;
+  // The blake3 hash of a Directory message, serialized in protobuf canonical form.
+  bytes digest = 2;
+  // Number of child elements in the Directory referred to by `digest`.
+  // Calculated by summing up the numbers of `directories`, `files` and
+  // `symlinks`, and for each directory, its size field. Used for inode number
+  // calculation.
+  // This field is precisely as verifiable as any other Merkle tree edge.
+  // Resolve `digest`, and you can compute it incrementally. Resolve the entire
+  // tree, and you can fully compute it from scratch.
+  // A credulous implementation won't reject an excessive size, but this is
+  // harmless: you'll have some ordinals without nodes. Undersizing is obvious
+  // and easy to reject: you won't have an ordinal for some nodes.
+  uint64 size = 3;
+}
+
+// A FileNode represents a regular or executable file in a Directory.
+message FileNode {
+  // The (base)name of the file
+  bytes name = 1;
+  // The blake3 digest of the file contents
+  bytes digest = 2;
+  // The file content size
+  uint64 size = 3;
+  // Whether the file is executable
+  bool executable = 4;
+}
+
+// A SymlinkNode represents a symbolic link in a Directory.
+message SymlinkNode {
+  // The (base)name of the symlink
+  bytes name = 1;
+  // The target of the symlink.
+  bytes target = 2;
+}
+
+// A Node is either a DirectoryNode, FileNode or SymlinkNode.
+message Node {
+  oneof node {
+    DirectoryNode directory = 1;
+    FileNode file = 2;
+    SymlinkNode symlink = 3;
+  }
+}
diff --git a/tvix/castore/protos/default.nix b/tvix/castore/protos/default.nix
new file mode 100644
index 0000000000..feef55690f
--- /dev/null
+++ b/tvix/castore/protos/default.nix
@@ -0,0 +1,54 @@
+{ depot, pkgs, ... }:
+let
+  protos = depot.nix.sparseTree {
+    name = "castore-protos";
+    root = depot.path.origSrc;
+    paths = [
+      ./castore.proto
+      ./rpc_blobstore.proto
+      ./rpc_directory.proto
+      ../../../buf.yaml
+      ../../../buf.gen.yaml
+    ];
+  };
+in
+depot.nix.readTree.drvTargets {
+  inherit protos;
+
+  # Lints and ensures formatting of the proto files.
+  check = pkgs.stdenv.mkDerivation {
+    name = "proto-check";
+    src = protos;
+
+    nativeBuildInputs = [
+      pkgs.buf
+    ];
+
+    buildPhase = ''
+      export HOME=$TMPDIR
+      buf lint
+      buf format -d --exit-code
+      touch $out
+    '';
+  };
+
+  # Produces the golang bindings.
+  go-bindings = pkgs.stdenv.mkDerivation {
+    name = "go-bindings";
+    src = protos;
+
+    nativeBuildInputs = [
+      pkgs.buf
+      pkgs.protoc-gen-go
+      pkgs.protoc-gen-go-grpc
+    ];
+
+    buildPhase = ''
+      export HOME=$TMPDIR
+      buf generate
+
+      mkdir -p $out
+      cp tvix/castore/protos/*.pb.go $out/
+    '';
+  };
+}
diff --git a/tvix/castore/protos/rpc_blobstore.proto b/tvix/castore/protos/rpc_blobstore.proto
new file mode 100644
index 0000000000..eebe39ace7
--- /dev/null
+++ b/tvix/castore/protos/rpc_blobstore.proto
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: MIT
+// Copyright © 2022 The Tvix Authors
+syntax = "proto3";
+
+package tvix.castore.v1;
+
+option go_package = "code.tvl.fyi/tvix/castore-go;castorev1";
+
+// BlobService allows reading (or uploading) content-addressed blobs of data.
+// BLAKE3 is used as a hashing function for the data. Uploading a blob will
+// return the BLAKE3 digest of it, and that's the identifier used to Read/Stat
+// them too.
+service BlobService {
+  // Stat can be used to check for the existence of a blob, as well as
+  // gathering more data about it, like more granular chunking information
+  // or baos.
+  // Server implementations are not required to provide more granular chunking
+  // information, especially if the digest specified in `StatBlobRequest` is
+  // already a chunk of a blob.
+  rpc Stat(StatBlobRequest) returns (StatBlobResponse);
+
+  // Read allows reading (all) data of a blob/chunk by the BLAKE3 digest of
+  // its contents.
+  // If the backend communicated more granular chunks in the `Stat` request,
+  // this can also be used to read chunks.
+  // This request returns a stream of BlobChunk, which is just a container for
+  // a stream of bytes.
+  // The server may decide on whatever chunking it may seem fit as a size for
+  // the individual BlobChunk sent in the response stream, this is mostly to
+  // keep individual messages at a manageable size.
+  rpc Read(ReadBlobRequest) returns (stream BlobChunk);
+
+  // Put uploads a Blob, by reading a stream of bytes.
+  //
+  // The way the data is chunked up in individual BlobChunk messages sent in
+  // the stream has no effect on how the server ends up chunking blobs up, if
+  // it does at all.
+  rpc Put(stream BlobChunk) returns (PutBlobResponse);
+}
+
+message StatBlobRequest {
+  // The blake3 digest of the blob requested
+  bytes digest = 1;
+
+  // Whether the server should reply with a list of more granular chunks.
+  bool send_chunks = 2;
+
+  // Whether the server should reply with a bao.
+  bool send_bao = 3;
+}
+
+message StatBlobResponse {
+  // If `send_chunks` was set to true, this MAY contain a list of more
+  // granular chunks, which then may be read individually via the `Read`
+  // method.
+  repeated ChunkMeta chunks = 2;
+
+  message ChunkMeta {
+    // Digest of that specific chunk
+    bytes digest = 1;
+
+    // Length of that chunk, in bytes.
+    uint64 size = 2;
+  }
+
+  // If `send_bao` was set to true, this MAY contain a outboard bao.
+  // The exact format and message types here will still be fleshed out.
+  bytes bao = 3;
+}
+
+message ReadBlobRequest {
+  // The blake3 digest of the blob or chunk requested
+  bytes digest = 1;
+}
+
+// This represents some bytes of a blob.
+// Blobs are sent in smaller chunks to keep message sizes manageable.
+message BlobChunk {
+  bytes data = 1;
+}
+
+message PutBlobResponse {
+  // The blake3 digest of the data that was sent.
+  bytes digest = 1;
+}
diff --git a/tvix/castore/protos/rpc_directory.proto b/tvix/castore/protos/rpc_directory.proto
new file mode 100644
index 0000000000..f4f41c433a
--- /dev/null
+++ b/tvix/castore/protos/rpc_directory.proto
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: MIT
+// Copyright © 2022 The Tvix Authors
+syntax = "proto3";
+
+package tvix.castore.v1;
+
+import "tvix/castore/protos/castore.proto";
+
+option go_package = "code.tvl.fyi/tvix/castore-go;castorev1";
+
+service DirectoryService {
+  // Get retrieves a stream of Directory messages, by using the lookup
+  // parameters in GetDirectoryRequest.
+  // Keep in mind multiple DirectoryNodes in different parts of the graph might
+  // have the same digest if they have the same underlying contents,
+  // so sending subsequent ones can be omitted.
+  //
+  // It is okay for certain implementations to only allow retrieval of
+  // Directory digests that are at the "root", aka the last element that's
+  // sent in a Put. This makes sense for implementations bundling closures of
+  // directories together in batches.
+  rpc Get(GetDirectoryRequest) returns (stream Directory);
+
+  // Put uploads a graph of Directory messages.
+  // Individual Directory messages need to be send in an order walking up
+  // from the leaves to the root - a Directory message can only refer to
+  // Directory messages previously sent in the same stream.
+  // Keep in mind multiple DirectoryNodes in different parts of the graph might
+  // have the same digest if they have the same underlying contents,
+  // so sending subsequent ones can be omitted.
+  // We might add a separate method, allowing to send partial graphs at a later
+  // time, if requiring to send the full graph turns out to be a problem.
+  rpc Put(stream Directory) returns (PutDirectoryResponse);
+}
+
+message GetDirectoryRequest {
+  oneof by_what {
+    // The blake3 hash of the (root) Directory message, serialized in
+    // protobuf canonical form.
+    // Keep in mind this can be a subtree of another root.
+    bytes digest = 1;
+  }
+
+  // If set to true, recursively resolve all child Directory messages.
+  // Directory messages SHOULD be streamed in a recursive breadth-first walk,
+  // but other orders are also fine, as long as Directory messages are only
+  // sent after they are referred to from previously sent Directory messages.
+  bool recursive = 2;
+}
+
+message PutDirectoryResponse {
+  bytes root_digest = 1;
+}
diff --git a/tvix/castore/src/blobservice/chunked_reader.rs b/tvix/castore/src/blobservice/chunked_reader.rs
new file mode 100644
index 0000000000..6e8355874b
--- /dev/null
+++ b/tvix/castore/src/blobservice/chunked_reader.rs
@@ -0,0 +1,496 @@
+use futures::{ready, TryStreamExt};
+use pin_project_lite::pin_project;
+use tokio::io::{AsyncRead, AsyncSeekExt};
+use tokio_stream::StreamExt;
+use tokio_util::io::{ReaderStream, StreamReader};
+use tracing::{instrument, trace, warn};
+
+use crate::B3Digest;
+use std::{cmp::Ordering, pin::Pin};
+
+use super::{BlobReader, BlobService};
+
+pin_project! {
+    /// ChunkedReader provides a chunk-aware [BlobReader], so allows reading and
+    /// seeking into a blob.
+    /// It internally holds a [ChunkedBlob], which is storing chunk information
+    /// able to emit a reader seeked to a specific position whenever we need to seek.
+    pub struct ChunkedReader<BS> {
+        chunked_blob: ChunkedBlob<BS>,
+
+        #[pin]
+        r: Box<dyn AsyncRead + Unpin + Send>,
+
+        pos: u64,
+    }
+}
+
+impl<BS> ChunkedReader<BS>
+where
+    BS: AsRef<dyn BlobService> + Clone + 'static + Send,
+{
+    /// Construct a new [ChunkedReader], by retrieving a list of chunks (their
+    /// blake3 digests and chunk sizes)
+    pub fn from_chunks(chunks_it: impl Iterator<Item = (B3Digest, u64)>, blob_service: BS) -> Self {
+        let chunked_blob = ChunkedBlob::from_iter(chunks_it, blob_service);
+        let r = chunked_blob.reader_skipped_offset(0);
+
+        Self {
+            chunked_blob,
+            r,
+            pos: 0,
+        }
+    }
+}
+
+/// ChunkedReader implements BlobReader.
+impl<BS> BlobReader for ChunkedReader<BS> where BS: Send + Clone + 'static + AsRef<dyn BlobService> {}
+
+impl<BS> tokio::io::AsyncRead for ChunkedReader<BS>
+where
+    BS: AsRef<dyn BlobService> + Clone + 'static,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        // The amount of data read can be determined by the increase
+        // in the length of the slice returned by `ReadBuf::filled`.
+        let filled_before = buf.filled().len();
+
+        let this = self.project();
+
+        ready!(this.r.poll_read(cx, buf))?;
+        let bytes_read = buf.filled().len() - filled_before;
+        *this.pos += bytes_read as u64;
+
+        Ok(()).into()
+    }
+}
+
+impl<BS> tokio::io::AsyncSeek for ChunkedReader<BS>
+where
+    BS: AsRef<dyn BlobService> + Clone + Send + 'static,
+{
+    #[instrument(skip(self), err(Debug))]
+    fn start_seek(self: Pin<&mut Self>, position: std::io::SeekFrom) -> std::io::Result<()> {
+        let total_len = self.chunked_blob.blob_length();
+        let mut this = self.project();
+
+        let absolute_offset: u64 = match position {
+            std::io::SeekFrom::Start(from_start) => from_start,
+            std::io::SeekFrom::End(from_end) => {
+                // note from_end is i64, not u64, so this is usually negative.
+                total_len.checked_add_signed(from_end).ok_or_else(|| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::InvalidInput,
+                        "over/underflow while seeking",
+                    )
+                })?
+            }
+            std::io::SeekFrom::Current(from_current) => {
+                // note from_end is i64, not u64, so this can be positive or negative.
+                (*this.pos)
+                    .checked_add_signed(from_current)
+                    .ok_or_else(|| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::InvalidInput,
+                            "over/underflow while seeking",
+                        )
+                    })?
+            }
+        };
+
+        // check if the position actually did change.
+        if absolute_offset != *this.pos {
+            // ensure the new position still is inside the file.
+            if absolute_offset > total_len {
+                Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    "seeked beyond EOF",
+                ))?
+            }
+
+            // Update the position and the internal reader.
+            *this.pos = absolute_offset;
+
+            // FUTUREWORK: if we can seek forward, avoid re-assembling.
+            // At least if it's still in the same chunk?
+            *this.r = this.chunked_blob.reader_skipped_offset(absolute_offset);
+        }
+
+        Ok(())
+    }
+
+    fn poll_complete(
+        self: Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<std::io::Result<u64>> {
+        std::task::Poll::Ready(Ok(self.pos))
+    }
+}
+
+/// Holds a list of blake3 digest for individual chunks (and their sizes).
+/// Is able to construct a Reader that seeked to a certain offset, which
+/// is useful to construct a BlobReader (that implements AsyncSeek).
+/// - the current chunk index, and a Custor<Vec<u8>> holding the data of that chunk.
+struct ChunkedBlob<BS> {
+    blob_service: BS,
+    chunks: Vec<(u64, u64, B3Digest)>,
+}
+
+impl<BS> ChunkedBlob<BS>
+where
+    BS: AsRef<dyn BlobService> + Clone + 'static + Send,
+{
+    /// Constructs [Self] from a list of blake3 digests of chunks and their
+    /// sizes, and a reference to a blob service.
+    /// Initializing it with an empty list is disallowed.
+    fn from_iter(chunks_it: impl Iterator<Item = (B3Digest, u64)>, blob_service: BS) -> Self {
+        let mut chunks = Vec::new();
+        let mut offset: u64 = 0;
+
+        for (chunk_digest, chunk_size) in chunks_it {
+            chunks.push((offset, chunk_size, chunk_digest));
+            offset += chunk_size;
+        }
+
+        assert!(
+            !chunks.is_empty(),
+            "Chunks must be provided, don't use this for blobs without chunks"
+        );
+
+        Self {
+            blob_service,
+            chunks,
+        }
+    }
+
+    /// Returns the length of the blob.
+    fn blob_length(&self) -> u64 {
+        self.chunks
+            .last()
+            .map(|(chunk_offset, chunk_size, _)| chunk_offset + chunk_size)
+            .unwrap_or(0)
+    }
+
+    /// For a given position pos, return the chunk containing the data.
+    /// In case this would range outside the blob, None is returned.
+    #[instrument(level = "trace", skip(self), ret)]
+    fn get_chunk_idx_for_position(&self, pos: u64) -> Option<usize> {
+        // FUTUREWORK: benchmark when to use linear search, binary_search and BTreeSet
+        self.chunks
+            .binary_search_by(|(chunk_start_pos, chunk_size, _)| {
+                if chunk_start_pos + chunk_size <= pos {
+                    Ordering::Less
+                } else if *chunk_start_pos > pos {
+                    Ordering::Greater
+                } else {
+                    Ordering::Equal
+                }
+            })
+            .ok()
+    }
+
+    /// Returns a stream of bytes of the data in that blob.
+    /// It internally assembles a stream reading from each chunk (skipping over
+    /// chunks containing irrelevant data).
+    /// From the first relevant chunk, the irrelevant bytes are skipped too.
+    /// The returned boxed thing does not implement AsyncSeek on its own, but
+    /// ChunkedReader does.
+    #[instrument(level = "trace", skip(self))]
+    fn reader_skipped_offset(&self, offset: u64) -> Box<dyn tokio::io::AsyncRead + Send + Unpin> {
+        if offset == self.blob_length() {
+            return Box::new(std::io::Cursor::new(vec![]));
+        }
+        // construct a stream of all chunks starting with the given offset
+        let start_chunk_idx = self
+            .get_chunk_idx_for_position(offset)
+            .expect("outside of blob");
+        // It's ok to panic here, we can only reach this by seeking, and seeking should already reject out-of-file seeking.
+
+        let skip_first_chunk_bytes = (offset - self.chunks[start_chunk_idx].0) as usize;
+
+        let blob_service = self.blob_service.clone();
+        let chunks: Vec<_> = self.chunks[start_chunk_idx..].to_vec();
+        let readers_stream = tokio_stream::iter(chunks.into_iter().enumerate()).map(
+            move |(nth_chunk, (_chunk_start_offset, chunk_size, chunk_digest))| {
+                let chunk_digest = chunk_digest.to_owned();
+                let blob_service = blob_service.clone();
+                async move {
+                    trace!(chunk_size=%chunk_size, chunk_digest=%chunk_digest, "open_read on chunk in stream");
+                    let mut blob_reader = blob_service
+                        .as_ref()
+                        .open_read(&chunk_digest.to_owned())
+                        .await?
+                        .ok_or_else(|| {
+                            warn!(chunk.digest = %chunk_digest, "chunk not found");
+                            std::io::Error::new(std::io::ErrorKind::NotFound, "chunk not found")
+                        })?;
+
+                    // iff this is the first chunk in the stream, skip by skip_first_chunk_bytes
+                    if nth_chunk == 0 && skip_first_chunk_bytes > 0 {
+                        blob_reader
+                            .seek(std::io::SeekFrom::Start(skip_first_chunk_bytes as u64))
+                            .await?;
+                    }
+                    Ok::<_, std::io::Error>(blob_reader)
+                }
+            },
+        );
+
+        // convert the stream of readers to a stream of streams of byte chunks
+        let bytes_streams = readers_stream.then(|elem| async { elem.await.map(ReaderStream::new) });
+
+        // flatten into one stream of byte chunks
+        let bytes_stream = bytes_streams.try_flatten();
+
+        // convert into AsyncRead
+        Box::new(StreamReader::new(Box::pin(bytes_stream)))
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::{io::SeekFrom, sync::Arc};
+
+    use crate::{
+        blobservice::{chunked_reader::ChunkedReader, BlobService, MemoryBlobService},
+        B3Digest,
+    };
+    use hex_literal::hex;
+    use lazy_static::lazy_static;
+    use tokio::io::{AsyncReadExt, AsyncSeekExt};
+
+    const CHUNK_1: [u8; 2] = hex!("0001");
+    const CHUNK_2: [u8; 4] = hex!("02030405");
+    const CHUNK_3: [u8; 1] = hex!("06");
+    const CHUNK_4: [u8; 2] = hex!("0708");
+    const CHUNK_5: [u8; 7] = hex!("090a0b0c0d0e0f");
+
+    lazy_static! {
+        // `[ 0 1 ] [ 2 3 4 5 ] [ 6 ] [ 7 8 ] [ 9 10 11 12 13 14 15 ]`
+        pub static ref CHUNK_1_DIGEST: B3Digest = blake3::hash(&CHUNK_1).as_bytes().into();
+        pub static ref CHUNK_2_DIGEST: B3Digest = blake3::hash(&CHUNK_2).as_bytes().into();
+        pub static ref CHUNK_3_DIGEST: B3Digest = blake3::hash(&CHUNK_3).as_bytes().into();
+        pub static ref CHUNK_4_DIGEST: B3Digest = blake3::hash(&CHUNK_4).as_bytes().into();
+        pub static ref CHUNK_5_DIGEST: B3Digest = blake3::hash(&CHUNK_5).as_bytes().into();
+        pub static ref BLOB_1_LIST: [(B3Digest, u64); 5] = [
+            (CHUNK_1_DIGEST.clone(), 2),
+            (CHUNK_2_DIGEST.clone(), 4),
+            (CHUNK_3_DIGEST.clone(), 1),
+            (CHUNK_4_DIGEST.clone(), 2),
+            (CHUNK_5_DIGEST.clone(), 7),
+        ];
+    }
+
+    use super::ChunkedBlob;
+
+    /// ensure the start offsets are properly calculated.
+    #[test]
+    fn from_iter() {
+        let cb = ChunkedBlob::from_iter(
+            BLOB_1_LIST.clone().into_iter(),
+            Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>,
+        );
+
+        assert_eq!(
+            cb.chunks,
+            Vec::from_iter([
+                (0, 2, CHUNK_1_DIGEST.clone()),
+                (2, 4, CHUNK_2_DIGEST.clone()),
+                (6, 1, CHUNK_3_DIGEST.clone()),
+                (7, 2, CHUNK_4_DIGEST.clone()),
+                (9, 7, CHUNK_5_DIGEST.clone()),
+            ])
+        );
+    }
+
+    /// ensure ChunkedBlob can't be used with an empty list of chunks
+    #[test]
+    #[should_panic]
+    fn from_iter_empty() {
+        ChunkedBlob::from_iter(
+            [].into_iter(),
+            Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>,
+        );
+    }
+
+    /// ensure the right chunk is selected
+    #[test]
+    fn chunk_idx_for_position() {
+        let cb = ChunkedBlob::from_iter(
+            BLOB_1_LIST.clone().into_iter(),
+            Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>,
+        );
+
+        assert_eq!(Some(0), cb.get_chunk_idx_for_position(0), "start of blob");
+
+        assert_eq!(
+            Some(0),
+            cb.get_chunk_idx_for_position(1),
+            "middle of first chunk"
+        );
+        assert_eq!(
+            Some(1),
+            cb.get_chunk_idx_for_position(2),
+            "beginning of second chunk"
+        );
+
+        assert_eq!(
+            Some(4),
+            cb.get_chunk_idx_for_position(15),
+            "right before the end of the blob"
+        );
+        assert_eq!(
+            None,
+            cb.get_chunk_idx_for_position(16),
+            "right outside the blob"
+        );
+        assert_eq!(
+            None,
+            cb.get_chunk_idx_for_position(100),
+            "way outside the blob"
+        );
+    }
+
+    /// returns a blobservice with all chunks in BLOB_1 present.
+    async fn gen_blobservice_blob1() -> Arc<dyn BlobService> {
+        let blob_service = Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>;
+
+        // seed blob service with all chunks
+        for blob_contents in [
+            CHUNK_1.to_vec(),
+            CHUNK_2.to_vec(),
+            CHUNK_3.to_vec(),
+            CHUNK_4.to_vec(),
+            CHUNK_5.to_vec(),
+        ] {
+            let mut bw = blob_service.open_write().await;
+            tokio::io::copy(&mut std::io::Cursor::new(blob_contents), &mut bw)
+                .await
+                .expect("writing blob");
+            bw.close().await.expect("close blobwriter");
+        }
+
+        blob_service
+    }
+
+    #[tokio::test]
+    async fn test_read() {
+        let blob_service = gen_blobservice_blob1().await;
+        let mut chunked_reader =
+            ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service);
+
+        // read all data
+        let mut buf = Vec::new();
+        tokio::io::copy(&mut chunked_reader, &mut buf)
+            .await
+            .expect("copy");
+
+        assert_eq!(
+            hex!("000102030405060708090a0b0c0d0e0f").to_vec(),
+            buf,
+            "read data must match"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_seek() {
+        let blob_service = gen_blobservice_blob1().await;
+        let mut chunked_reader =
+            ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service);
+
+        // seek to the end
+        // expect to read 0 bytes
+        {
+            chunked_reader
+                .seek(SeekFrom::End(0))
+                .await
+                .expect("seek to end");
+
+            let mut buf = Vec::new();
+            chunked_reader
+                .read_to_end(&mut buf)
+                .await
+                .expect("read to end");
+
+            assert_eq!(hex!("").to_vec(), buf);
+        }
+
+        // seek one bytes before the end
+        {
+            chunked_reader.seek(SeekFrom::End(-1)).await.expect("seek");
+
+            let mut buf = Vec::new();
+            chunked_reader
+                .read_to_end(&mut buf)
+                .await
+                .expect("read to end");
+
+            assert_eq!(hex!("0f").to_vec(), buf);
+        }
+
+        // seek back three bytes, but using relative positioning
+        // read two bytes
+        {
+            chunked_reader
+                .seek(SeekFrom::Current(-3))
+                .await
+                .expect("seek");
+
+            let mut buf = [0b0; 2];
+            chunked_reader
+                .read_exact(&mut buf)
+                .await
+                .expect("read exact");
+
+            assert_eq!(hex!("0d0e"), buf);
+        }
+    }
+
+    // seeds a blob service with only the first two chunks, reads a bit in the
+    // front (which succeeds), but then tries to seek past and read more (which
+    // should fail).
+    #[tokio::test]
+    async fn test_read_missing_chunks() {
+        let blob_service = Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>;
+
+        for blob_contents in [CHUNK_1.to_vec(), CHUNK_2.to_vec()] {
+            let mut bw = blob_service.open_write().await;
+            tokio::io::copy(&mut std::io::Cursor::new(blob_contents), &mut bw)
+                .await
+                .expect("writing blob");
+
+            bw.close().await.expect("close blobwriter");
+        }
+
+        let mut chunked_reader =
+            ChunkedReader::from_chunks(BLOB_1_LIST.clone().into_iter(), blob_service);
+
+        // read a bit from the front (5 bytes out of 6 available)
+        let mut buf = [0b0; 5];
+        chunked_reader
+            .read_exact(&mut buf)
+            .await
+            .expect("read exact");
+
+        assert_eq!(hex!("0001020304"), buf);
+
+        // seek 2 bytes forward, into an area where we don't have chunks
+        chunked_reader
+            .seek(SeekFrom::Current(2))
+            .await
+            .expect("seek");
+
+        let mut buf = Vec::new();
+        chunked_reader
+            .read_to_end(&mut buf)
+            .await
+            .expect_err("must fail");
+
+        // FUTUREWORK: check semantics on errorkinds. Should this be InvalidData
+        // or NotFound?
+    }
+}
diff --git a/tvix/castore/src/blobservice/combinator.rs b/tvix/castore/src/blobservice/combinator.rs
new file mode 100644
index 0000000000..067eff96f4
--- /dev/null
+++ b/tvix/castore/src/blobservice/combinator.rs
@@ -0,0 +1,132 @@
+use futures::{StreamExt, TryStreamExt};
+use tokio_util::io::{ReaderStream, StreamReader};
+use tonic::async_trait;
+use tracing::{instrument, warn};
+
+use crate::B3Digest;
+
+use super::{naive_seeker::NaiveSeeker, BlobReader, BlobService, BlobWriter};
+
+/// Combinator for a BlobService, using a "local" and "remote" blobservice.
+/// Requests are tried in (and returned from) the local store first, only if
+/// things are not present there, the remote BlobService is queried.
+/// In case the local blobservice doesn't have the blob, we ask the remote
+/// blobservice for chunks, and try to read each of these chunks from the local
+/// blobservice again, before falling back to the remote one.
+/// The remote BlobService is never written to.
+pub struct CombinedBlobService<BL, BR> {
+    local: BL,
+    remote: BR,
+}
+
+impl<BL, BR> Clone for CombinedBlobService<BL, BR>
+where
+    BL: Clone,
+    BR: Clone,
+{
+    fn clone(&self) -> Self {
+        Self {
+            local: self.local.clone(),
+            remote: self.remote.clone(),
+        }
+    }
+}
+
+#[async_trait]
+impl<BL, BR> BlobService for CombinedBlobService<BL, BR>
+where
+    BL: AsRef<dyn BlobService> + Clone + Send + Sync + 'static,
+    BR: AsRef<dyn BlobService> + Clone + Send + Sync + 'static,
+{
+    #[instrument(skip(self, digest), fields(blob.digest=%digest))]
+    async fn has(&self, digest: &B3Digest) -> std::io::Result<bool> {
+        Ok(self.local.as_ref().has(digest).await? || self.remote.as_ref().has(digest).await?)
+    }
+
+    #[instrument(skip(self, digest), fields(blob.digest=%digest), err)]
+    async fn open_read(&self, digest: &B3Digest) -> std::io::Result<Option<Box<dyn BlobReader>>> {
+        if self.local.as_ref().has(digest).await? {
+            // local store has the blob, so we can assume it also has all chunks.
+            self.local.as_ref().open_read(digest).await
+        } else {
+            // Local store doesn't have the blob.
+            // Ask the remote one for the list of chunks,
+            // and create a chunked reader that uses self.open_read() for
+            // individual chunks. There's a chance we already have some chunks
+            // locally, meaning we don't need to fetch them all from the remote
+            // BlobService.
+            match self.remote.as_ref().chunks(digest).await? {
+                // blob doesn't exist on the remote side either, nothing we can do.
+                None => Ok(None),
+                Some(remote_chunks) => {
+                    // if there's no more granular chunks, or the remote
+                    // blobservice doesn't support chunks, read the blob from
+                    // the remote blobservice directly.
+                    if remote_chunks.is_empty() {
+                        return self.remote.as_ref().open_read(digest).await;
+                    }
+                    // otherwise, a chunked reader, which will always try the
+                    // local backend first.
+
+                    // map Vec<ChunkMeta> to Vec<(B3Digest, u64)>
+                    let chunks: Vec<(B3Digest, u64)> = remote_chunks
+                        .into_iter()
+                        .map(|chunk_meta| {
+                            (
+                                B3Digest::try_from(chunk_meta.digest)
+                                    .expect("invalid chunk digest"),
+                                chunk_meta.size,
+                            )
+                        })
+                        .collect();
+
+                    Ok(Some(make_chunked_reader(self.clone(), chunks)))
+                }
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn open_write(&self) -> Box<dyn BlobWriter> {
+        // direct writes to the local one.
+        self.local.as_ref().open_write().await
+    }
+}
+
+fn make_chunked_reader<BS>(
+    // This must consume, as we can't retain references to blob_service,
+    // as it'd add a lifetime to BlobReader in general, which will get
+    // problematic in TvixStoreFs, which is using async move closures and cloning.
+    blob_service: BS,
+    // A list of b3 digests for individual chunks, and their sizes.
+    chunks: Vec<(B3Digest, u64)>,
+) -> Box<dyn BlobReader>
+where
+    BS: BlobService + Clone + 'static,
+{
+    // TODO: offset, verified streaming
+
+    // construct readers for each chunk
+    let blob_service = blob_service.clone();
+    let readers_stream = tokio_stream::iter(chunks).map(move |(digest, _)| {
+        let d = digest.to_owned();
+        let blob_service = blob_service.clone();
+        async move {
+            blob_service.open_read(&d.to_owned()).await?.ok_or_else(|| {
+                warn!(chunk.digest = %digest, "chunk not found");
+                std::io::Error::new(std::io::ErrorKind::NotFound, "chunk not found")
+            })
+        }
+    });
+
+    // convert the stream of readers to a stream of streams of byte chunks
+    let bytes_streams = readers_stream.then(|elem| async { elem.await.map(ReaderStream::new) });
+
+    // flatten into one stream of byte chunks
+    let bytes_stream = bytes_streams.try_flatten();
+
+    // convert into AsyncRead
+    let blob_reader = StreamReader::new(bytes_stream);
+
+    Box::new(NaiveSeeker::new(Box::pin(blob_reader)))
+}
diff --git a/tvix/castore/src/blobservice/from_addr.rs b/tvix/castore/src/blobservice/from_addr.rs
new file mode 100644
index 0000000000..8898bbfb95
--- /dev/null
+++ b/tvix/castore/src/blobservice/from_addr.rs
@@ -0,0 +1,111 @@
+use url::Url;
+
+use crate::{proto::blob_service_client::BlobServiceClient, Error};
+
+use super::{BlobService, GRPCBlobService, MemoryBlobService, ObjectStoreBlobService};
+
+/// Constructs a new instance of a [BlobService] from an URI.
+///
+/// The following schemes are supported by the following services:
+/// - `memory://` ([MemoryBlobService])
+/// - `grpc+*://` ([GRPCBlobService])
+/// - `objectstore+*://` ([ObjectStoreBlobService])
+///
+/// See their `from_url` methods for more details about their syntax.
+pub async fn from_addr(uri: &str) -> Result<Box<dyn BlobService>, crate::Error> {
+    let url = Url::parse(uri)
+        .map_err(|e| crate::Error::StorageError(format!("unable to parse url: {}", e)))?;
+
+    let blob_service: Box<dyn BlobService> = match url.scheme() {
+        "memory" => {
+            // memory doesn't support host or path in the URL.
+            if url.has_host() || !url.path().is_empty() {
+                return Err(Error::StorageError("invalid url".to_string()));
+            }
+            Box::<MemoryBlobService>::default()
+        }
+        scheme if scheme.starts_with("grpc+") => {
+            // schemes starting with grpc+ go to the GRPCPathInfoService.
+            //   That's normally grpc+unix for unix sockets, and grpc+http(s) for the HTTP counterparts.
+            // - In the case of unix sockets, there must be a path, but may not be a host.
+            // - In the case of non-unix sockets, there must be a host, but no path.
+            // Constructing the channel is handled by tvix_castore::channel::from_url.
+            let client = BlobServiceClient::new(crate::tonic::channel_from_url(&url).await?);
+            Box::new(GRPCBlobService::from_client(client))
+        }
+        scheme if scheme.starts_with("objectstore+") => {
+            // We need to convert the URL to string, strip the prefix there, and then
+            // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do.
+            let trimmed_url = {
+                let s = url.to_string();
+                Url::parse(s.strip_prefix("objectstore+").unwrap()).unwrap()
+            };
+            Box::new(
+                ObjectStoreBlobService::parse_url(&trimmed_url)
+                    .map_err(|e| Error::StorageError(e.to_string()))?,
+            )
+        }
+        scheme => {
+            return Err(crate::Error::StorageError(format!(
+                "unknown scheme: {}",
+                scheme
+            )))
+        }
+    };
+
+    Ok(blob_service)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::from_addr;
+    use rstest::rstest;
+
+    #[rstest]
+    /// This uses an unsupported scheme.
+    #[case::unsupported_scheme("http://foo.example/test", false)]
+    /// This correctly sets the scheme, and doesn't set a path.
+    #[case::memory_valid("memory://", true)]
+    /// This sets a memory url host to `foo`
+    #[case::memory_invalid_host("memory://foo", false)]
+    /// This sets a memory url path to "/", which is invalid.
+    #[case::memory_invalid_root_path("memory:///", false)]
+    /// This sets a memory url path to "/foo", which is invalid.
+    #[case::memory_invalid_root_path_foo("memory:///foo", false)]
+    /// Correct scheme to connect to a unix socket.
+    #[case::grpc_valid_unix_socket("grpc+unix:///path/to/somewhere", true)]
+    /// Correct scheme for unix socket, but setting a host too, which is invalid.
+    #[case::grpc_invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)]
+    /// Correct scheme to connect to localhost, with port 12345
+    #[case::grpc_valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::grpc_valid_http_host_without_port("grpc+http://localhost", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)]
+    /// Correct scheme to connect to localhost over http, but with additional path, which is invalid.
+    #[case::grpc_invalid_has_path("grpc+http://localhost/some-path", false)]
+    /// An example for object store (InMemory)
+    #[case::objectstore_valid_memory("objectstore+memory:///", true)]
+    /// An example for object store (LocalFileSystem)
+    #[case::objectstore_valid_file("objectstore+file:///foo/bar", true)]
+    // An example for object store (HTTP / WebDAV)
+    #[case::objectstore_valid_http_url("objectstore+https://localhost:8080/some-path", true)]
+    /// An example for object store (S3)
+    #[cfg_attr(
+        feature = "cloud",
+        case::objectstore_valid_s3_url("objectstore+s3://bucket/path", true)
+    )]
+    /// An example for object store (GCS)
+    #[cfg_attr(
+        feature = "cloud",
+        case::objectstore_valid_gcs_url("objectstore+gs://bucket/path", true)
+    )]
+    #[tokio::test]
+    async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] exp_succeed: bool) {
+        if exp_succeed {
+            from_addr(uri_str).await.expect("should succeed");
+        } else {
+            assert!(from_addr(uri_str).await.is_err(), "should fail");
+        }
+    }
+}
diff --git a/tvix/castore/src/blobservice/grpc.rs b/tvix/castore/src/blobservice/grpc.rs
new file mode 100644
index 0000000000..5663cd3838
--- /dev/null
+++ b/tvix/castore/src/blobservice/grpc.rs
@@ -0,0 +1,349 @@
+use super::{BlobReader, BlobService, BlobWriter, ChunkedReader};
+use crate::{
+    proto::{self, stat_blob_response::ChunkMeta},
+    B3Digest,
+};
+use futures::sink::SinkExt;
+use std::{
+    io::{self, Cursor},
+    pin::pin,
+    sync::Arc,
+    task::Poll,
+};
+use tokio::io::AsyncWriteExt;
+use tokio::task::JoinHandle;
+use tokio_stream::{wrappers::ReceiverStream, StreamExt};
+use tokio_util::{
+    io::{CopyToBytes, SinkWriter},
+    sync::PollSender,
+};
+use tonic::{async_trait, transport::Channel, Code, Status};
+use tracing::instrument;
+
+/// Connects to a (remote) tvix-store BlobService over gRPC.
+#[derive(Clone)]
+pub struct GRPCBlobService {
+    /// The internal reference to a gRPC client.
+    /// Cloning it is cheap, and it internally handles concurrent requests.
+    grpc_client: proto::blob_service_client::BlobServiceClient<Channel>,
+}
+
+impl GRPCBlobService {
+    /// construct a [GRPCBlobService] from a [proto::blob_service_client::BlobServiceClient].
+    /// panics if called outside the context of a tokio runtime.
+    pub fn from_client(
+        grpc_client: proto::blob_service_client::BlobServiceClient<Channel>,
+    ) -> Self {
+        Self { grpc_client }
+    }
+}
+
+#[async_trait]
+impl BlobService for GRPCBlobService {
+    #[instrument(skip(self, digest), fields(blob.digest=%digest))]
+    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
+        let mut grpc_client = self.grpc_client.clone();
+        let resp = grpc_client
+            .stat(proto::StatBlobRequest {
+                digest: digest.clone().into(),
+                ..Default::default()
+            })
+            .await;
+
+        match resp {
+            Ok(_blob_meta) => Ok(true),
+            Err(e) if e.code() == Code::NotFound => Ok(false),
+            Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)),
+        }
+    }
+
+    #[instrument(skip(self, digest), fields(blob.digest=%digest), err)]
+    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
+        // First try to get a list of chunks. In case there's only one chunk returned,
+        // buffer its data into a Vec, otherwise use a ChunkedReader.
+        // We previously used NaiveSeeker here, but userland likes to seek backwards too often,
+        // and without store composition this will get very noisy.
+        // FUTUREWORK: use CombinedBlobService and store composition.
+        match self.chunks(digest).await {
+            Ok(None) => Ok(None),
+            Ok(Some(chunks)) => {
+                if chunks.is_empty() || chunks.len() == 1 {
+                    // No more granular chunking info, treat this as an individual chunk.
+                    // Get a stream of [proto::BlobChunk], or return an error if the blob
+                    // doesn't exist.
+                    return match self
+                        .grpc_client
+                        .clone()
+                        .read(proto::ReadBlobRequest {
+                            digest: digest.clone().into(),
+                        })
+                        .await
+                    {
+                        Ok(stream) => {
+                            let data_stream = stream.into_inner().map(|e| {
+                                e.map(|c| c.data)
+                                    .map_err(|s| std::io::Error::new(io::ErrorKind::InvalidData, s))
+                            });
+
+                            // Use StreamReader::new to convert to an AsyncRead.
+                            let mut data_reader = tokio_util::io::StreamReader::new(data_stream);
+
+                            let mut buf = Vec::new();
+                            // TODO: only do this up to a certain limit.
+                            tokio::io::copy(&mut data_reader, &mut buf).await?;
+
+                            Ok(Some(Box::new(Cursor::new(buf))))
+                        }
+                        Err(e) if e.code() == Code::NotFound => Ok(None),
+                        Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)),
+                    };
+                }
+
+                // The chunked case. Let ChunkedReader do individual reads.
+                // TODO: we should store the chunking data in some local cache,
+                // so `ChunkedReader` doesn't call `self.chunks` *again* for every chunk.
+                // Think about how store composition will fix this.
+                let chunked_reader = ChunkedReader::from_chunks(
+                    chunks.into_iter().map(|chunk| {
+                        (
+                            chunk.digest.try_into().expect("invalid b3 digest"),
+                            chunk.size,
+                        )
+                    }),
+                    Arc::new(self.clone()) as Arc<dyn BlobService>,
+                );
+                Ok(Some(Box::new(chunked_reader)))
+            }
+            Err(e) => Err(e)?,
+        }
+    }
+
+    /// Returns a BlobWriter, that'll internally wrap each write in a
+    /// [proto::BlobChunk], which is send to the gRPC server.
+    #[instrument(skip_all)]
+    async fn open_write(&self) -> Box<dyn BlobWriter> {
+        // set up an mpsc channel passing around Bytes.
+        let (tx, rx) = tokio::sync::mpsc::channel::<bytes::Bytes>(10);
+
+        // bytes arriving on the RX side are wrapped inside a
+        // [proto::BlobChunk], and a [ReceiverStream] is constructed.
+        let blobchunk_stream = ReceiverStream::new(rx).map(|x| proto::BlobChunk { data: x });
+
+        // spawn the gRPC put request, which will read from blobchunk_stream.
+        let task = tokio::spawn({
+            let mut grpc_client = self.grpc_client.clone();
+            async move { Ok::<_, Status>(grpc_client.put(blobchunk_stream).await?.into_inner()) }
+        });
+
+        // The tx part of the channel is converted to a sink of byte chunks.
+        let sink = PollSender::new(tx)
+            .sink_map_err(|e| std::io::Error::new(std::io::ErrorKind::BrokenPipe, e));
+
+        // … which is turned into an [tokio::io::AsyncWrite].
+        let writer = SinkWriter::new(CopyToBytes::new(sink));
+
+        Box::new(GRPCBlobWriter {
+            task_and_writer: Some((task, writer)),
+            digest: None,
+        })
+    }
+
+    #[instrument(skip(self, digest), fields(blob.digest=%digest), err)]
+    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
+        let resp = self
+            .grpc_client
+            .clone()
+            .stat(proto::StatBlobRequest {
+                digest: digest.clone().into(),
+                send_chunks: true,
+                ..Default::default()
+            })
+            .await;
+
+        match resp {
+            Err(e) if e.code() == Code::NotFound => Ok(None),
+            Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)),
+            Ok(resp) => {
+                let resp = resp.into_inner();
+
+                resp.validate()
+                    .map_err(|e| std::io::Error::new(io::ErrorKind::InvalidData, e))?;
+
+                Ok(Some(resp.chunks))
+            }
+        }
+    }
+}
+
+pub struct GRPCBlobWriter<W: tokio::io::AsyncWrite> {
+    /// The task containing the put request, and the inner writer, if we're still writing.
+    task_and_writer: Option<(JoinHandle<Result<proto::PutBlobResponse, Status>>, W)>,
+
+    /// The digest that has been returned, if we successfully closed.
+    digest: Option<B3Digest>,
+}
+
+#[async_trait]
+impl<W: tokio::io::AsyncWrite + Send + Sync + Unpin + 'static> BlobWriter for GRPCBlobWriter<W> {
+    async fn close(&mut self) -> io::Result<B3Digest> {
+        if self.task_and_writer.is_none() {
+            // if we're already closed, return the b3 digest, which must exist.
+            // If it doesn't, we already closed and failed once, and didn't handle the error.
+            match &self.digest {
+                Some(digest) => Ok(digest.clone()),
+                None => Err(io::Error::new(io::ErrorKind::BrokenPipe, "already closed")),
+            }
+        } else {
+            let (task, mut writer) = self.task_and_writer.take().unwrap();
+
+            // invoke shutdown, so the inner writer closes its internal tx side of
+            // the channel.
+            writer.shutdown().await?;
+
+            // block on the RPC call to return.
+            // This ensures all chunks are sent out, and have been received by the
+            // backend.
+
+            match task.await? {
+                Ok(resp) => {
+                    // return the digest from the response, and store it in self.digest for subsequent closes.
+                    let digest_len = resp.digest.len();
+                    let digest: B3Digest = resp.digest.try_into().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::Other,
+                            format!("invalid root digest length {} in response", digest_len),
+                        )
+                    })?;
+                    self.digest = Some(digest.clone());
+                    Ok(digest)
+                }
+                Err(e) => Err(io::Error::new(io::ErrorKind::Other, e.to_string())),
+            }
+        }
+    }
+}
+
+impl<W: tokio::io::AsyncWrite + Unpin> tokio::io::AsyncWrite for GRPCBlobWriter<W> {
+    fn poll_write(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> std::task::Poll<Result<usize, io::Error>> {
+        match &mut self.task_and_writer {
+            None => Poll::Ready(Err(io::Error::new(
+                io::ErrorKind::NotConnected,
+                "already closed",
+            ))),
+            Some((_, ref mut writer)) => {
+                let pinned_writer = pin!(writer);
+                pinned_writer.poll_write(cx, buf)
+            }
+        }
+    }
+
+    fn poll_flush(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        match &mut self.task_and_writer {
+            None => Poll::Ready(Err(io::Error::new(
+                io::ErrorKind::NotConnected,
+                "already closed",
+            ))),
+            Some((_, ref mut writer)) => {
+                let pinned_writer = pin!(writer);
+                pinned_writer.poll_flush(cx)
+            }
+        }
+    }
+
+    fn poll_shutdown(
+        self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        // TODO(raitobezarius): this might not be a graceful shutdown of the
+        // channel inside the gRPC connection.
+        Poll::Ready(Ok(()))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+
+    use tempfile::TempDir;
+    use tokio::net::UnixListener;
+    use tokio_retry::strategy::ExponentialBackoff;
+    use tokio_retry::Retry;
+    use tokio_stream::wrappers::UnixListenerStream;
+
+    use crate::blobservice::MemoryBlobService;
+    use crate::fixtures;
+    use crate::proto::blob_service_client::BlobServiceClient;
+    use crate::proto::GRPCBlobServiceWrapper;
+
+    use super::BlobService;
+    use super::GRPCBlobService;
+
+    /// This ensures connecting via gRPC works as expected.
+    #[tokio::test]
+    async fn test_valid_unix_path_ping_pong() {
+        let tmpdir = TempDir::new().unwrap();
+        let socket_path = tmpdir.path().join("daemon");
+
+        let path_clone = socket_path.clone();
+
+        // Spin up a server
+        tokio::spawn(async {
+            let uds = UnixListener::bind(path_clone).unwrap();
+            let uds_stream = UnixListenerStream::new(uds);
+
+            // spin up a new server
+            let mut server = tonic::transport::Server::builder();
+            let router =
+                server.add_service(crate::proto::blob_service_server::BlobServiceServer::new(
+                    GRPCBlobServiceWrapper::new(
+                        Box::<MemoryBlobService>::default() as Box<dyn BlobService>
+                    ),
+                ));
+            router.serve_with_incoming(uds_stream).await
+        });
+
+        // wait for the socket to be created
+        Retry::spawn(
+            ExponentialBackoff::from_millis(20).max_delay(Duration::from_secs(10)),
+            || async {
+                if socket_path.exists() {
+                    Ok(())
+                } else {
+                    Err(())
+                }
+            },
+        )
+        .await
+        .expect("failed to wait for socket");
+
+        // prepare a client
+        let grpc_client = {
+            let url = url::Url::parse(&format!(
+                "grpc+unix://{}?wait-connect=1",
+                socket_path.display()
+            ))
+            .expect("must parse");
+            let client = BlobServiceClient::new(
+                crate::tonic::channel_from_url(&url)
+                    .await
+                    .expect("must succeed"),
+            );
+
+            GRPCBlobService::from_client(client)
+        };
+
+        let has = grpc_client
+            .has(&fixtures::BLOB_A_DIGEST)
+            .await
+            .expect("must not be err");
+
+        assert!(!has);
+    }
+}
diff --git a/tvix/castore/src/blobservice/memory.rs b/tvix/castore/src/blobservice/memory.rs
new file mode 100644
index 0000000000..873d06b461
--- /dev/null
+++ b/tvix/castore/src/blobservice/memory.rs
@@ -0,0 +1,127 @@
+use parking_lot::RwLock;
+use std::io::{self, Cursor, Write};
+use std::task::Poll;
+use std::{collections::HashMap, sync::Arc};
+use tonic::async_trait;
+use tracing::instrument;
+
+use super::{BlobReader, BlobService, BlobWriter};
+use crate::B3Digest;
+
+#[derive(Clone, Default)]
+pub struct MemoryBlobService {
+    db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>,
+}
+
+#[async_trait]
+impl BlobService for MemoryBlobService {
+    #[instrument(skip_all, ret, err, fields(blob.digest=%digest))]
+    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
+        let db = self.db.read();
+        Ok(db.contains_key(digest))
+    }
+
+    #[instrument(skip_all, err, fields(blob.digest=%digest))]
+    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
+        let db = self.db.read();
+
+        match db.get(digest).map(|x| Cursor::new(x.clone())) {
+            Some(result) => Ok(Some(Box::new(result))),
+            None => Ok(None),
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn open_write(&self) -> Box<dyn BlobWriter> {
+        Box::new(MemoryBlobWriter::new(self.db.clone()))
+    }
+}
+
+pub struct MemoryBlobWriter {
+    db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>,
+
+    /// Contains the buffer Vec and hasher, or None if already closed
+    writers: Option<(Vec<u8>, blake3::Hasher)>,
+
+    /// The digest that has been returned, if we successfully closed.
+    digest: Option<B3Digest>,
+}
+
+impl MemoryBlobWriter {
+    fn new(db: Arc<RwLock<HashMap<B3Digest, Vec<u8>>>>) -> Self {
+        Self {
+            db,
+            writers: Some((Vec::new(), blake3::Hasher::new())),
+            digest: None,
+        }
+    }
+}
+impl tokio::io::AsyncWrite for MemoryBlobWriter {
+    fn poll_write(
+        mut self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+        b: &[u8],
+    ) -> std::task::Poll<Result<usize, io::Error>> {
+        Poll::Ready(match &mut self.writers {
+            None => Err(io::Error::new(
+                io::ErrorKind::NotConnected,
+                "already closed",
+            )),
+            Some((ref mut buf, ref mut hasher)) => {
+                let bytes_written = buf.write(b)?;
+                hasher.write(&b[..bytes_written])
+            }
+        })
+    }
+
+    fn poll_flush(
+        self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        Poll::Ready(match self.writers {
+            None => Err(io::Error::new(
+                io::ErrorKind::NotConnected,
+                "already closed",
+            )),
+            Some(_) => Ok(()),
+        })
+    }
+
+    fn poll_shutdown(
+        self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        // shutdown is "instantaneous", we only write to memory.
+        Poll::Ready(Ok(()))
+    }
+}
+
+#[async_trait]
+impl BlobWriter for MemoryBlobWriter {
+    async fn close(&mut self) -> io::Result<B3Digest> {
+        if self.writers.is_none() {
+            match &self.digest {
+                Some(digest) => Ok(digest.clone()),
+                None => Err(io::Error::new(io::ErrorKind::BrokenPipe, "already closed")),
+            }
+        } else {
+            let (buf, hasher) = self.writers.take().unwrap();
+
+            let digest: B3Digest = hasher.finalize().as_bytes().into();
+
+            // Only insert if the blob doesn't already exist.
+            let mut db = self.db.upgradable_read();
+            if !db.contains_key(&digest) {
+                // open the database for writing.
+                db.with_upgraded(|db| {
+                    // and put buf in there. This will move buf out.
+                    db.insert(digest.clone(), buf);
+                });
+            }
+
+            self.digest = Some(digest.clone());
+
+            Ok(digest)
+        }
+    }
+}
diff --git a/tvix/castore/src/blobservice/mod.rs b/tvix/castore/src/blobservice/mod.rs
new file mode 100644
index 0000000000..50acd40bf7
--- /dev/null
+++ b/tvix/castore/src/blobservice/mod.rs
@@ -0,0 +1,103 @@
+use std::io;
+use tonic::async_trait;
+
+use crate::proto::stat_blob_response::ChunkMeta;
+use crate::B3Digest;
+
+mod chunked_reader;
+mod combinator;
+mod from_addr;
+mod grpc;
+mod memory;
+mod naive_seeker;
+mod object_store;
+
+#[cfg(test)]
+pub mod tests;
+
+pub use self::chunked_reader::ChunkedReader;
+pub use self::combinator::CombinedBlobService;
+pub use self::from_addr::from_addr;
+pub use self::grpc::GRPCBlobService;
+pub use self::memory::MemoryBlobService;
+pub use self::object_store::ObjectStoreBlobService;
+
+/// The base trait all BlobService services need to implement.
+/// It provides functions to check whether a given blob exists,
+/// a way to read (and seek) a blob, and a method to create a blobwriter handle,
+/// which will implement a writer interface, and also provides a close funtion,
+/// to finalize a blob and get its digest.
+#[async_trait]
+pub trait BlobService: Send + Sync {
+    /// Check if the service has the blob, by its content hash.
+    /// On implementations returning chunks, this must also work for chunks.
+    async fn has(&self, digest: &B3Digest) -> io::Result<bool>;
+
+    /// Request a blob from the store, by its content hash.
+    /// On implementations returning chunks, this must also work for chunks.
+    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>>;
+
+    /// Insert a new blob into the store. Returns a [BlobWriter], which
+    /// implements [tokio::io::AsyncWrite] and a [BlobWriter::close] to finalize
+    /// the blob and get its digest.
+    async fn open_write(&self) -> Box<dyn BlobWriter>;
+
+    /// Return a list of chunks for a given blob.
+    /// There's a distinction between returning Ok(None) and Ok(Some(vec![])).
+    /// The former return value is sent in case the blob is not present at all,
+    /// while the second one is sent in case there's no more granular chunks (or
+    /// the backend does not support chunking).
+    /// A default implementation checking for existence and then returning it
+    /// does not have more granular chunks available is provided.
+    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
+        if !self.has(digest).await? {
+            return Ok(None);
+        }
+        // default implementation, signalling the backend does not have more
+        // granular chunks available.
+        Ok(Some(vec![]))
+    }
+}
+
+#[async_trait]
+impl<A> BlobService for A
+where
+    A: AsRef<dyn BlobService> + Send + Sync,
+{
+    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
+        self.as_ref().has(digest).await
+    }
+
+    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
+        self.as_ref().open_read(digest).await
+    }
+
+    async fn open_write(&self) -> Box<dyn BlobWriter> {
+        self.as_ref().open_write().await
+    }
+
+    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
+        self.as_ref().chunks(digest).await
+    }
+}
+
+/// A [tokio::io::AsyncWrite] that the user needs to close() afterwards for persist.
+/// On success, it returns the digest of the written blob.
+#[async_trait]
+pub trait BlobWriter: tokio::io::AsyncWrite + Send + Unpin {
+    /// Signal there's no more data to be written, and return the digest of the
+    /// contents written.
+    ///
+    /// Closing a already-closed BlobWriter is a no-op.
+    async fn close(&mut self) -> io::Result<B3Digest>;
+}
+
+/// BlobReader is a [tokio::io::AsyncRead] that also allows seeking.
+pub trait BlobReader: tokio::io::AsyncRead + tokio::io::AsyncSeek + Send + Unpin + 'static {}
+
+/// A [`io::Cursor<Vec<u8>>`] can be used as a BlobReader.
+impl BlobReader for io::Cursor<&'static [u8]> {}
+impl BlobReader for io::Cursor<&'static [u8; 0]> {}
+impl BlobReader for io::Cursor<Vec<u8>> {}
+impl BlobReader for io::Cursor<bytes::Bytes> {}
+impl BlobReader for tokio::fs::File {}
diff --git a/tvix/castore/src/blobservice/naive_seeker.rs b/tvix/castore/src/blobservice/naive_seeker.rs
new file mode 100644
index 0000000000..f5a5307150
--- /dev/null
+++ b/tvix/castore/src/blobservice/naive_seeker.rs
@@ -0,0 +1,265 @@
+use super::BlobReader;
+use futures::ready;
+use pin_project_lite::pin_project;
+use std::io;
+use std::task::Poll;
+use tokio::io::AsyncRead;
+use tracing::{debug, instrument, trace, warn};
+
+pin_project! {
+    /// This implements [tokio::io::AsyncSeek] for and [tokio::io::AsyncRead] by
+    /// simply skipping over some bytes, keeping track of the position.
+    /// It fails whenever you try to seek backwards.
+    ///
+    /// ## Pinning concerns:
+    ///
+    /// [NaiveSeeker] is itself pinned by callers, and we do not need to concern
+    /// ourselves regarding that.
+    ///
+    /// Though, its fields as per
+    /// <https://doc.rust-lang.org/std/pin/#pinning-is-not-structural-for-field>
+    /// can be pinned or unpinned.
+    ///
+    /// So we need to go over each field and choose our policy carefully.
+    ///
+    /// The obvious cases are the bookkeeping integers we keep in the structure,
+    /// those are private and not shared to anyone, we never build a
+    /// `Pin<&mut X>` out of them at any point, therefore, we can safely never
+    /// mark them as pinned. Of course, it is expected that no developer here
+    /// attempt to `pin!(self.pos)` to pin them because it makes no sense. If
+    /// they have to become pinned, they should be marked `#[pin]` and we need
+    /// to discuss it.
+    ///
+    /// So the bookkeeping integers are in the right state with respect to their
+    /// pinning status. The projection should offer direct access.
+    ///
+    /// On the `r` field, i.e. a `BufReader<R>`, given that
+    /// <https://docs.rs/tokio/latest/tokio/io/struct.BufReader.html#impl-Unpin-for-BufReader%3CR%3E>
+    /// is available, even a `Pin<&mut BufReader<R>>` can be safely moved.
+    ///
+    /// The only care we should have regards the internal reader itself, i.e.
+    /// the `R` instance, see that Tokio decided to `#[pin]` it too:
+    /// <https://docs.rs/tokio/latest/src/tokio/io/util/buf_reader.rs.html#29>
+    ///
+    /// In general, there's no `Unpin` instance for `R: tokio::io::AsyncRead`
+    /// (see <https://docs.rs/tokio/latest/tokio/io/trait.AsyncRead.html>).
+    ///
+    /// Therefore, we could keep it unpinned and pin it in every call site
+    /// whenever we need to call `poll_*` which can be confusing to the non-
+    /// expert developer and we have a fair share amount of situations where the
+    /// [BufReader] instance is naked, i.e. in its `&mut BufReader<R>`
+    /// form, this is annoying because it could lead to expose the naked `R`
+    /// internal instance somehow and would produce a risk of making it move
+    /// unexpectedly.
+    ///
+    /// We choose the path of the least resistance as we have no reason to have
+    /// access to the raw `BufReader<R>` instance, we just `#[pin]` it too and
+    /// enjoy its `poll_*` safe APIs and push the unpinning concerns to the
+    /// internal implementations themselves, which studied the question longer
+    /// than us.
+    pub struct NaiveSeeker<R: tokio::io::AsyncRead> {
+        #[pin]
+        r: tokio::io::BufReader<R>,
+        pos: u64,
+        bytes_to_skip: u64,
+    }
+}
+
+/// The buffer size used to discard data.
+const DISCARD_BUF_SIZE: usize = 4096;
+
+impl<R: tokio::io::AsyncRead> NaiveSeeker<R> {
+    pub fn new(r: R) -> Self {
+        NaiveSeeker {
+            r: tokio::io::BufReader::new(r),
+            pos: 0,
+            bytes_to_skip: 0,
+        }
+    }
+}
+
+impl<R: tokio::io::AsyncRead> tokio::io::AsyncRead for NaiveSeeker<R> {
+    #[instrument(level = "trace", skip_all)]
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> Poll<std::io::Result<()>> {
+        // The amount of data read can be determined by the increase
+        // in the length of the slice returned by `ReadBuf::filled`.
+        let filled_before = buf.filled().len();
+
+        let this = self.project();
+        ready!(this.r.poll_read(cx, buf))?;
+
+        let bytes_read = buf.filled().len() - filled_before;
+        *this.pos += bytes_read as u64;
+
+        trace!(bytes_read = bytes_read, new_pos = this.pos, "poll_read");
+
+        Ok(()).into()
+    }
+}
+
+impl<R: tokio::io::AsyncRead> tokio::io::AsyncBufRead for NaiveSeeker<R> {
+    fn poll_fill_buf(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<io::Result<&[u8]>> {
+        self.project().r.poll_fill_buf(cx)
+    }
+
+    #[instrument(level = "trace", skip(self))]
+    fn consume(self: std::pin::Pin<&mut Self>, amt: usize) {
+        let this = self.project();
+        this.r.consume(amt);
+        *this.pos += amt as u64;
+
+        trace!(new_pos = this.pos, "consume");
+    }
+}
+
+impl<R: tokio::io::AsyncRead> tokio::io::AsyncSeek for NaiveSeeker<R> {
+    #[instrument(level="trace", skip(self), fields(inner_pos=%self.pos), err(Debug))]
+    fn start_seek(
+        self: std::pin::Pin<&mut Self>,
+        position: std::io::SeekFrom,
+    ) -> std::io::Result<()> {
+        let absolute_offset: u64 = match position {
+            io::SeekFrom::Start(start_offset) => {
+                if start_offset < self.pos {
+                    return Err(io::Error::new(
+                        io::ErrorKind::Unsupported,
+                        format!("can't seek backwards ({} -> {})", self.pos, start_offset),
+                    ));
+                } else {
+                    start_offset
+                }
+            }
+            // we don't know the total size, can't support this.
+            io::SeekFrom::End(_end_offset) => {
+                return Err(io::Error::new(
+                    io::ErrorKind::Unsupported,
+                    "can't seek from end",
+                ));
+            }
+            io::SeekFrom::Current(relative_offset) => {
+                if relative_offset < 0 {
+                    return Err(io::Error::new(
+                        io::ErrorKind::Unsupported,
+                        "can't seek backwards relative to current position",
+                    ));
+                } else {
+                    self.pos + relative_offset as u64
+                }
+            }
+        };
+
+        // we already know absolute_offset is >= self.pos
+        debug_assert!(
+            absolute_offset >= self.pos,
+            "absolute_offset {} must be >= self.pos {}",
+            absolute_offset,
+            self.pos
+        );
+
+        // calculate bytes to skip
+        let this = self.project();
+        *this.bytes_to_skip = absolute_offset - *this.pos;
+
+        debug!(bytes_to_skip = *this.bytes_to_skip, "seek");
+
+        Ok(())
+    }
+
+    #[instrument(skip_all)]
+    fn poll_complete(
+        mut self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> Poll<std::io::Result<u64>> {
+        if self.bytes_to_skip == 0 {
+            // return the new position (from the start of the stream)
+            return Poll::Ready(Ok(self.pos));
+        }
+
+        // discard some bytes, until pos is where we want it to be.
+        // We create a buffer that we'll discard later on.
+        let mut discard_buf = [0; DISCARD_BUF_SIZE];
+
+        // Loop until we've reached the desired seek position. This is done by issuing repeated
+        // `poll_read` calls.
+        // If the data is not available yet, we will yield back to the executor
+        // and wait to be polled again.
+        loop {
+            if self.bytes_to_skip == 0 {
+                return Poll::Ready(Ok(self.pos));
+            }
+
+            // calculate the length we want to skip at most, which is either a max
+            // buffer size, or the number of remaining bytes to read, whatever is
+            // smaller.
+            let bytes_to_skip_now = std::cmp::min(self.bytes_to_skip as usize, discard_buf.len());
+            let mut discard_buf = tokio::io::ReadBuf::new(&mut discard_buf[..bytes_to_skip_now]);
+
+            ready!(self.as_mut().poll_read(cx, &mut discard_buf))?;
+            let bytes_skipped = discard_buf.filled().len();
+
+            if bytes_skipped == 0 {
+                return Poll::Ready(Err(io::Error::new(
+                    io::ErrorKind::UnexpectedEof,
+                    "got EOF while trying to skip bytes",
+                )));
+            }
+            // decrement bytes to skip. The poll_read call already updated self.pos.
+            *self.as_mut().project().bytes_to_skip -= bytes_skipped as u64;
+        }
+    }
+}
+
+impl<R: tokio::io::AsyncRead + Send + Unpin + 'static> BlobReader for NaiveSeeker<R> {}
+
+#[cfg(test)]
+mod tests {
+    use super::{NaiveSeeker, DISCARD_BUF_SIZE};
+    use std::io::{Cursor, SeekFrom};
+    use tokio::io::{AsyncReadExt, AsyncSeekExt};
+
+    /// This seek requires multiple `poll_read` as we use a multiples of
+    /// DISCARD_BUF_SIZE when doing the seek.
+    /// This ensures we don't hang indefinitely.
+    #[tokio::test]
+    async fn seek() {
+        let buf = vec![0u8; DISCARD_BUF_SIZE * 4];
+        let reader = Cursor::new(&buf);
+        let mut seeker = NaiveSeeker::new(reader);
+        seeker.seek(SeekFrom::Start(4000)).await.unwrap();
+    }
+
+    #[tokio::test]
+    async fn seek_read() {
+        let mut buf = vec![0u8; DISCARD_BUF_SIZE * 2];
+        buf.extend_from_slice(&[1u8; DISCARD_BUF_SIZE * 2]);
+        buf.extend_from_slice(&[2u8; DISCARD_BUF_SIZE * 2]);
+
+        let reader = Cursor::new(&buf);
+        let mut seeker = NaiveSeeker::new(reader);
+
+        let mut read_buf = vec![0u8; DISCARD_BUF_SIZE];
+        seeker.read_exact(&mut read_buf).await.expect("must read");
+        assert_eq!(read_buf.as_slice(), &[0u8; DISCARD_BUF_SIZE]);
+
+        seeker
+            .seek(SeekFrom::Current(DISCARD_BUF_SIZE as i64))
+            .await
+            .expect("must seek");
+        seeker.read_exact(&mut read_buf).await.expect("must read");
+        assert_eq!(read_buf.as_slice(), &[1u8; DISCARD_BUF_SIZE]);
+
+        seeker
+            .seek(SeekFrom::Start(2 * 2 * DISCARD_BUF_SIZE as u64))
+            .await
+            .expect("must seek");
+        seeker.read_exact(&mut read_buf).await.expect("must read");
+        assert_eq!(read_buf.as_slice(), &[2u8; DISCARD_BUF_SIZE]);
+    }
+}
diff --git a/tvix/castore/src/blobservice/object_store.rs b/tvix/castore/src/blobservice/object_store.rs
new file mode 100644
index 0000000000..d2d0a288a5
--- /dev/null
+++ b/tvix/castore/src/blobservice/object_store.rs
@@ -0,0 +1,546 @@
+use std::{
+    io::{self, Cursor},
+    pin::pin,
+    sync::Arc,
+    task::Poll,
+};
+
+use data_encoding::HEXLOWER;
+use fastcdc::v2020::AsyncStreamCDC;
+use futures::Future;
+use object_store::{path::Path, ObjectStore};
+use pin_project_lite::pin_project;
+use prost::Message;
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
+use tokio_stream::StreamExt;
+use tonic::async_trait;
+use tracing::{debug, instrument, trace, Level};
+use url::Url;
+
+use crate::{
+    proto::{stat_blob_response::ChunkMeta, StatBlobResponse},
+    B3Digest, B3HashingReader,
+};
+
+use super::{BlobReader, BlobService, BlobWriter, ChunkedReader};
+
+#[derive(Clone)]
+pub struct ObjectStoreBlobService {
+    object_store: Arc<dyn ObjectStore>,
+    base_path: Path,
+
+    /// Average chunk size for FastCDC, in bytes.
+    /// min value is half, max value double of that number.
+    avg_chunk_size: u32,
+}
+
+/// Uses any object storage supported by the [object_store] crate to provide a
+/// tvix-castore [BlobService].
+///
+/// # Data format
+/// Data is organized in "blobs" and "chunks".
+/// Blobs don't hold the actual data, but instead contain a list of more
+/// granular chunks that assemble to the contents requested.
+/// This allows clients to seek, and not download chunks they already have
+/// locally, as it's referred to from other files.
+/// Check `rpc_blobstore` and more general BlobStore docs on that.
+///
+/// ## Blobs
+/// Stored at `${base_path}/blobs/b3/$digest_key`. They contains the serialized
+/// StatBlobResponse for the blob with the digest.
+///
+/// ## Chunks
+/// Chunks are stored at `${base_path}/chunks/b3/$digest_key`. They contain
+/// the literal contents of the chunk, but are zstd-compressed.
+///
+/// ## Digest key sharding
+/// The blake3 digest encoded in lower hex, and sharded after the second
+/// character.
+/// The blob for "Hello World" is stored at
+/// `${base_path}/blobs/b3/41/41f8394111eb713a22165c46c90ab8f0fd9399c92028fd6d288944b23ff5bf76`.
+///
+/// This reduces the number of files in the same directory, which would be a
+/// problem at least when using [object_store::local::LocalFileSystem].
+///
+/// # Future changes
+/// There's no guarantees about this being a final format yet.
+/// Once object_store gets support for additional metadata / content-types,
+/// we can eliminate some requests (small blobs only consisting of a single
+/// chunk can be stored as-is, without the blob index file).
+/// It also allows signalling any compression of chunks in the content-type.
+/// Migration *should* be possible by simply adding the right content-types to
+/// all keys stored so far, but no promises ;-)
+impl ObjectStoreBlobService {
+    /// Constructs a new [ObjectStoreBlobService] from a [Url] supported by
+    /// [object_store].
+    /// Any path suffix becomes the base path of the object store.
+    /// additional options, the same as in [object_store::parse_url_opts] can
+    /// be passed.
+    pub fn parse_url_opts<I, K, V>(url: &Url, options: I) -> Result<Self, object_store::Error>
+    where
+        I: IntoIterator<Item = (K, V)>,
+        K: AsRef<str>,
+        V: Into<String>,
+    {
+        let (object_store, path) = object_store::parse_url_opts(url, options)?;
+
+        Ok(Self {
+            object_store: Arc::new(object_store),
+            base_path: path,
+            avg_chunk_size: 256 * 1024,
+        })
+    }
+
+    /// Like [Self::parse_url_opts], except without the options.
+    pub fn parse_url(url: &Url) -> Result<Self, object_store::Error> {
+        Self::parse_url_opts(url, Vec::<(String, String)>::new())
+    }
+}
+
+#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))]
+fn derive_blob_path(base_path: &Path, digest: &B3Digest) -> Path {
+    base_path
+        .child("blobs")
+        .child("b3")
+        .child(HEXLOWER.encode(&digest.as_slice()[..2]))
+        .child(HEXLOWER.encode(digest.as_slice()))
+}
+
+#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,chunk.digest=%digest),ret(Display))]
+fn derive_chunk_path(base_path: &Path, digest: &B3Digest) -> Path {
+    base_path
+        .child("chunks")
+        .child("b3")
+        .child(HEXLOWER.encode(&digest.as_slice()[..2]))
+        .child(HEXLOWER.encode(digest.as_slice()))
+}
+
+#[async_trait]
+impl BlobService for ObjectStoreBlobService {
+    #[instrument(skip_all, ret, err, fields(blob.digest=%digest))]
+    async fn has(&self, digest: &B3Digest) -> io::Result<bool> {
+        // TODO: clarify if this should work for chunks or not, and explicitly
+        // document in the proto docs.
+        let p = derive_blob_path(&self.base_path, digest);
+
+        match self.object_store.head(&p).await {
+            Ok(_) => Ok(true),
+            Err(object_store::Error::NotFound { .. }) => {
+                let p = derive_chunk_path(&self.base_path, digest);
+                match self.object_store.head(&p).await {
+                    Ok(_) => Ok(true),
+                    Err(object_store::Error::NotFound { .. }) => Ok(false),
+                    Err(e) => Err(e)?,
+                }
+            }
+            Err(e) => Err(e)?,
+        }
+    }
+
+    #[instrument(skip_all, err, fields(blob.digest=%digest))]
+    async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
+        // handle reading the empty blob.
+        if digest.as_slice() == blake3::hash(b"").as_bytes() {
+            return Ok(Some(Box::new(Cursor::new(b"")) as Box<dyn BlobReader>));
+        }
+        match self
+            .object_store
+            .get(&derive_chunk_path(&self.base_path, digest))
+            .await
+        {
+            Ok(res) => {
+                // handle reading blobs that are small enough to fit inside a single chunk:
+                // fetch the entire chunk into memory, decompress, ensure the b3 digest matches,
+                // and return a io::Cursor over that data.
+                // FUTUREWORK: use zstd::bulk to prevent decompression bombs
+
+                let chunk_raw_bytes = res.bytes().await?;
+                let chunk_contents = zstd::stream::decode_all(Cursor::new(chunk_raw_bytes))?;
+
+                if *digest != blake3::hash(&chunk_contents).as_bytes().into() {
+                    Err(io::Error::other("chunk contents invalid"))?;
+                }
+
+                Ok(Some(Box::new(Cursor::new(chunk_contents))))
+            }
+            Err(object_store::Error::NotFound { .. }) => {
+                // NOTE: For public-facing things, we would want to stop here.
+                // Clients should fetch granularly, so they can make use of
+                // chunks they have locally.
+                // However, if this is used directly, without any caches, do the
+                // assembly here.
+                // This is subject to change, once we have store composition.
+                // TODO: make this configurable, and/or clarify behaviour for
+                // the gRPC server surface (explicitly document behaviour in the
+                // proto docs)
+                if let Some(chunks) = self.chunks(digest).await? {
+                    let chunked_reader = ChunkedReader::from_chunks(
+                        chunks.into_iter().map(|chunk| {
+                            (
+                                chunk.digest.try_into().expect("invalid b3 digest"),
+                                chunk.size,
+                            )
+                        }),
+                        Arc::new(self.clone()) as Arc<dyn BlobService>,
+                    );
+
+                    Ok(Some(Box::new(chunked_reader)))
+                } else {
+                    // This is neither a chunk nor a blob, return None.
+                    Ok(None)
+                }
+            }
+            Err(e) => Err(e.into()),
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn open_write(&self) -> Box<dyn BlobWriter> {
+        // ObjectStoreBlobWriter implements AsyncWrite, but all the chunking
+        // needs an AsyncRead, so we create a pipe here.
+        // In its `AsyncWrite` implementation, `ObjectStoreBlobWriter` delegates
+        // writes to w. It periodically polls the future that's reading from the
+        // other side.
+        let (w, r) = tokio::io::duplex(self.avg_chunk_size as usize * 10);
+
+        Box::new(ObjectStoreBlobWriter {
+            writer: Some(w),
+            fut: Some(Box::pin(chunk_and_upload(
+                r,
+                self.object_store.clone(),
+                self.base_path.clone(),
+                self.avg_chunk_size / 2,
+                self.avg_chunk_size,
+                self.avg_chunk_size * 2,
+            ))),
+            fut_output: None,
+        })
+    }
+
+    #[instrument(skip_all, err, fields(blob.digest=%digest))]
+    async fn chunks(&self, digest: &B3Digest) -> io::Result<Option<Vec<ChunkMeta>>> {
+        match self
+            .object_store
+            .get(&derive_blob_path(&self.base_path, digest))
+            .await
+        {
+            Ok(get_result) => {
+                // fetch the data at the blob path
+                let blob_data = get_result.bytes().await?;
+                // parse into StatBlobResponse
+                let stat_blob_response: StatBlobResponse = StatBlobResponse::decode(blob_data)?;
+
+                debug!(
+                    chunk.count = stat_blob_response.chunks.len(),
+                    blob.size = stat_blob_response
+                        .chunks
+                        .iter()
+                        .map(|x| x.size)
+                        .sum::<u64>(),
+                    "found more granular chunks"
+                );
+
+                Ok(Some(stat_blob_response.chunks))
+            }
+            Err(object_store::Error::NotFound { .. }) => {
+                // If there's only a chunk, we must return the empty vec here, rather than None.
+                match self
+                    .object_store
+                    .head(&derive_chunk_path(&self.base_path, digest))
+                    .await
+                {
+                    Ok(_) => {
+                        // present, but no more chunks available
+                        debug!("found a single chunk");
+                        Ok(Some(vec![]))
+                    }
+                    Err(object_store::Error::NotFound { .. }) => {
+                        // Neither blob nor single chunk found
+                        debug!("not found");
+                        Ok(None)
+                    }
+                    // error checking for chunk
+                    Err(e) => Err(e.into()),
+                }
+            }
+            // error checking for blob
+            Err(err) => Err(err.into()),
+        }
+    }
+}
+
+/// Reads blob contents from a AsyncRead, chunks and uploads them.
+/// On success, returns a [StatBlobResponse] pointing to the individual chunks.
+#[instrument(skip_all, fields(base_path=%base_path, min_chunk_size, avg_chunk_size, max_chunk_size), err)]
+async fn chunk_and_upload<R: AsyncRead + Unpin>(
+    r: R,
+    object_store: Arc<dyn ObjectStore>,
+    base_path: Path,
+    min_chunk_size: u32,
+    avg_chunk_size: u32,
+    max_chunk_size: u32,
+) -> io::Result<B3Digest> {
+    // wrap reader with something calculating the blake3 hash of all data read.
+    let mut b3_r = B3HashingReader::from(r);
+    // set up a fastcdc chunker
+    let mut chunker =
+        AsyncStreamCDC::new(&mut b3_r, min_chunk_size, avg_chunk_size, max_chunk_size);
+
+    /// This really should just belong into the closure at
+    /// `chunker.as_stream().then(|_| { … })``, but if we try to, rustc spits
+    /// higher-ranked lifetime errors at us.
+    async fn fastcdc_chunk_uploader(
+        resp: Result<fastcdc::v2020::ChunkData, fastcdc::v2020::Error>,
+        base_path: Path,
+        object_store: Arc<dyn ObjectStore>,
+    ) -> std::io::Result<ChunkMeta> {
+        let chunk_data = resp?;
+        let chunk_digest: B3Digest = blake3::hash(&chunk_data.data).as_bytes().into();
+        let chunk_path = derive_chunk_path(&base_path, &chunk_digest);
+
+        upload_chunk(object_store, chunk_digest, chunk_path, chunk_data.data).await
+    }
+
+    // Use the fastcdc chunker to produce a stream of chunks, and upload these
+    // that don't exist to the backend.
+    let chunks = chunker
+        .as_stream()
+        .then(|resp| fastcdc_chunk_uploader(resp, base_path.clone(), object_store.clone()))
+        .collect::<io::Result<Vec<ChunkMeta>>>()
+        .await?;
+
+    let stat_blob_response = StatBlobResponse {
+        chunks,
+        bao: "".into(), // still todo
+    };
+
+    // check for Blob, if it doesn't exist, persist.
+    let blob_digest: B3Digest = b3_r.digest().into();
+    let blob_path = derive_blob_path(&base_path, &blob_digest);
+
+    match object_store.head(&blob_path).await {
+        // blob already exists, nothing to do
+        Ok(_) => {
+            trace!(
+                blob.digest = %blob_digest,
+                blob.path = %blob_path,
+                "blob already exists on backend"
+            );
+        }
+        // chunk does not yet exist, upload first
+        Err(object_store::Error::NotFound { .. }) => {
+            debug!(
+                blob.digest = %blob_digest,
+                blob.path = %blob_path,
+                "uploading blob"
+            );
+            object_store
+                .put(&blob_path, stat_blob_response.encode_to_vec().into())
+                .await?;
+        }
+        Err(err) => {
+            // other error
+            Err(err)?
+        }
+    }
+
+    Ok(blob_digest)
+}
+
+/// upload chunk if it doesn't exist yet.
+#[instrument(skip_all, fields(chunk.digest = %chunk_digest, chunk.size = chunk_data.len(), chunk.path = %chunk_path), err)]
+async fn upload_chunk(
+    object_store: Arc<dyn ObjectStore>,
+    chunk_digest: B3Digest,
+    chunk_path: Path,
+    chunk_data: Vec<u8>,
+) -> std::io::Result<ChunkMeta> {
+    let chunk_size = chunk_data.len();
+    match object_store.head(&chunk_path).await {
+        // chunk already exists, nothing to do
+        Ok(_) => {
+            debug!("chunk already exists");
+        }
+
+        // chunk does not yet exist, compress and upload.
+        Err(object_store::Error::NotFound { .. }) => {
+            let chunk_data_compressed =
+                zstd::encode_all(Cursor::new(chunk_data), zstd::DEFAULT_COMPRESSION_LEVEL)?;
+
+            debug!(chunk.compressed_size=%chunk_data_compressed.len(), "uploading chunk");
+
+            object_store
+                .as_ref()
+                .put(&chunk_path, chunk_data_compressed.into())
+                .await?;
+        }
+        // other error
+        Err(err) => Err(err)?,
+    }
+
+    Ok(ChunkMeta {
+        digest: chunk_digest.into(),
+        size: chunk_size as u64,
+    })
+}
+
+pin_project! {
+    /// Takes care of blob uploads.
+    /// All writes are relayed to self.writer, and we continuously poll the
+    /// future (which will internally read from the other side of the pipe and
+    /// upload chunks).
+    /// Our BlobWriter::close() needs to drop self.writer, so the other side
+    /// will read EOF and can finalize the blob.
+    /// The future should then resolve and return the blob digest.
+    pub struct ObjectStoreBlobWriter<W, Fut>
+    where
+        W: AsyncWrite,
+        Fut: Future,
+    {
+        #[pin]
+        writer: Option<W>,
+
+        #[pin]
+        fut: Option<Fut>,
+
+        fut_output: Option<io::Result<B3Digest>>
+    }
+}
+
+impl<W, Fut> tokio::io::AsyncWrite for ObjectStoreBlobWriter<W, Fut>
+where
+    W: AsyncWrite + Send + Unpin,
+    Fut: Future,
+{
+    fn poll_write(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &[u8],
+    ) -> std::task::Poll<Result<usize, io::Error>> {
+        let this = self.project();
+        // poll the future.
+        let fut = this.fut.as_pin_mut().expect("not future");
+        let fut_p = fut.poll(cx);
+        // if it's ready, the only way this could have happened is that the
+        // upload failed, because we're only closing `self.writer` after all
+        // writes happened.
+        if fut_p.is_ready() {
+            return Poll::Ready(Err(io::Error::other("upload failed")));
+        }
+
+        // write to the underlying writer
+        this.writer
+            .as_pin_mut()
+            .expect("writer must be some")
+            .poll_write(cx, buf)
+    }
+
+    fn poll_flush(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        let this = self.project();
+        // poll the future.
+        let fut = this.fut.as_pin_mut().expect("not future");
+        let fut_p = fut.poll(cx);
+        // if it's ready, the only way this could have happened is that the
+        // upload failed, because we're only closing `self.writer` after all
+        // writes happened.
+        if fut_p.is_ready() {
+            return Poll::Ready(Err(io::Error::other("upload failed")));
+        }
+
+        // Call poll_flush on the writer
+        this.writer
+            .as_pin_mut()
+            .expect("writer must be some")
+            .poll_flush(cx)
+    }
+
+    fn poll_shutdown(
+        self: std::pin::Pin<&mut Self>,
+        _cx: &mut std::task::Context<'_>,
+    ) -> std::task::Poll<Result<(), io::Error>> {
+        // There's nothing to do on shutdown. We might have written some chunks
+        // that are nowhere else referenced, but cleaning them up here would be racy.
+        std::task::Poll::Ready(Ok(()))
+    }
+}
+
+#[async_trait]
+impl<W, Fut> BlobWriter for ObjectStoreBlobWriter<W, Fut>
+where
+    W: AsyncWrite + Send + Unpin,
+    Fut: Future<Output = io::Result<B3Digest>> + Send + Unpin,
+{
+    async fn close(&mut self) -> io::Result<B3Digest> {
+        match self.writer.take() {
+            Some(mut writer) => {
+                // shut down the writer, so the other side will read EOF.
+                writer.shutdown().await?;
+
+                // take out the future.
+                let fut = self.fut.take().expect("fut must be some");
+                // await it.
+                let resp = pin!(fut).await;
+
+                match resp.as_ref() {
+                    // In the case of an Ok value, we store it in self.fut_output,
+                    // so future calls to close can return that.
+                    Ok(b3_digest) => {
+                        self.fut_output = Some(Ok(b3_digest.clone()));
+                    }
+                    Err(e) => {
+                        // for the error type, we need to cheat a bit, as
+                        // they're not clone-able.
+                        // Simply store a sloppy clone, with the same ErrorKind and message there.
+                        self.fut_output = Some(Err(std::io::Error::new(e.kind(), e.to_string())))
+                    }
+                }
+                resp
+            }
+            None => {
+                // called a second time, return self.fut_output.
+                match self.fut_output.as_ref().unwrap() {
+                    Ok(ref b3_digest) => Ok(b3_digest.clone()),
+                    Err(e) => Err(std::io::Error::new(e.kind(), e.to_string())),
+                }
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::chunk_and_upload;
+    use crate::{
+        blobservice::{BlobService, ObjectStoreBlobService},
+        fixtures::{BLOB_A, BLOB_A_DIGEST},
+    };
+    use std::{io::Cursor, sync::Arc};
+    use url::Url;
+
+    /// Tests chunk_and_upload directly, bypassing the BlobWriter at open_write().
+    #[tokio::test]
+    async fn test_chunk_and_upload() {
+        let blobsvc = Arc::new(
+            ObjectStoreBlobService::parse_url(&Url::parse("memory:///").unwrap()).unwrap(),
+        );
+
+        let blob_digest = chunk_and_upload(
+            &mut Cursor::new(BLOB_A.to_vec()),
+            blobsvc.object_store.clone(),
+            object_store::path::Path::from("/"),
+            1024 / 2,
+            1024,
+            1024 * 2,
+        )
+        .await
+        .expect("chunk_and_upload succeeds");
+
+        assert_eq!(BLOB_A_DIGEST.clone(), blob_digest);
+
+        // Now we should have the blob
+        assert!(blobsvc.has(&BLOB_A_DIGEST).await.unwrap());
+    }
+}
diff --git a/tvix/castore/src/blobservice/tests/mod.rs b/tvix/castore/src/blobservice/tests/mod.rs
new file mode 100644
index 0000000000..0280faebb1
--- /dev/null
+++ b/tvix/castore/src/blobservice/tests/mod.rs
@@ -0,0 +1,253 @@
+//! This contains test scenarios that a given [BlobService] needs to pass.
+//! We use [rstest] and [rstest_reuse] to provide all services we want to test
+//! against, and then apply this template to all test functions.
+
+use rstest::*;
+use rstest_reuse::{self, *};
+use std::io;
+use tokio::io::AsyncReadExt;
+use tokio::io::AsyncSeekExt;
+
+use super::BlobService;
+use crate::blobservice;
+use crate::fixtures::BLOB_A;
+use crate::fixtures::BLOB_A_DIGEST;
+use crate::fixtures::BLOB_B;
+use crate::fixtures::BLOB_B_DIGEST;
+
+mod utils;
+use self::utils::make_grpc_blob_service_client;
+
+/// This produces a template, which will be applied to all individual test functions.
+/// See https://github.com/la10736/rstest/issues/130#issuecomment-968864832
+#[template]
+#[rstest]
+#[case::grpc(make_grpc_blob_service_client().await)]
+#[case::memory(blobservice::from_addr("memory://").await.unwrap())]
+#[case::objectstore_memory(blobservice::from_addr("objectstore+memory://").await.unwrap())]
+pub fn blob_services(#[case] blob_service: impl BlobService) {}
+
+/// Using [BlobService::has] on a non-existing blob should return false.
+#[apply(blob_services)]
+#[tokio::test]
+async fn has_nonexistent_false(blob_service: impl BlobService) {
+    assert!(!blob_service
+        .has(&BLOB_A_DIGEST)
+        .await
+        .expect("must not fail"));
+}
+
+/// Using [BlobService::chunks] on a non-existing blob should return Ok(None)
+#[apply(blob_services)]
+#[tokio::test]
+async fn chunks_nonexistent_false(blob_service: impl BlobService) {
+    assert!(blob_service
+        .chunks(&BLOB_A_DIGEST)
+        .await
+        .expect("must be ok")
+        .is_none());
+}
+
+// TODO: do tests with `chunks`
+
+/// Trying to read a non-existing blob should return a None instead of a reader.
+#[apply(blob_services)]
+#[tokio::test]
+async fn not_found_read(blob_service: impl BlobService) {
+    assert!(blob_service
+        .open_read(&BLOB_A_DIGEST)
+        .await
+        .expect("must not fail")
+        .is_none())
+}
+
+/// Put a blob in the store, check has, get it back.
+#[apply(blob_services)]
+// #[case::small(&fixtures::BLOB_A, &fixtures::BLOB_A_DIGEST)]
+// #[case::big(&fixtures::BLOB_B, &fixtures::BLOB_B_DIGEST)]
+#[tokio::test]
+async fn put_has_get(blob_service: impl BlobService) {
+    // TODO: figure out how to instantiate this with BLOB_A and BLOB_B, as two separate cases
+    for (blob_contents, blob_digest) in &[
+        (&*BLOB_A, BLOB_A_DIGEST.clone()),
+        (&*BLOB_B, BLOB_B_DIGEST.clone()),
+    ] {
+        let mut w = blob_service.open_write().await;
+
+        let l = tokio::io::copy(&mut io::Cursor::new(blob_contents), &mut w)
+            .await
+            .expect("copy must succeed");
+        assert_eq!(
+            blob_contents.len(),
+            l as usize,
+            "written bytes must match blob length"
+        );
+
+        let digest = w.close().await.expect("close must succeed");
+
+        assert_eq!(*blob_digest, digest, "returned digest must be correct");
+
+        assert!(
+            blob_service.has(blob_digest).await.expect("must not fail"),
+            "blob service should now have the blob"
+        );
+
+        let mut r = blob_service
+            .open_read(blob_digest)
+            .await
+            .expect("open_read must succeed")
+            .expect("must be some");
+
+        let mut buf: Vec<u8> = Vec::new();
+        let mut pinned_reader = std::pin::pin!(r);
+        let l = tokio::io::copy(&mut pinned_reader, &mut buf)
+            .await
+            .expect("copy must succeed");
+
+        assert_eq!(
+            blob_contents.len(),
+            l as usize,
+            "read bytes must match blob length"
+        );
+
+        assert_eq!(&blob_contents[..], &buf, "read blob contents must match");
+    }
+}
+
+/// Put a blob in the store, and seek inside it a bit.
+#[apply(blob_services)]
+#[tokio::test]
+async fn put_seek(blob_service: impl BlobService) {
+    let mut w = blob_service.open_write().await;
+
+    tokio::io::copy(&mut io::Cursor::new(&BLOB_B.to_vec()), &mut w)
+        .await
+        .expect("copy must succeed");
+    w.close().await.expect("close must succeed");
+
+    // open a blob for reading
+    let mut r = blob_service
+        .open_read(&BLOB_B_DIGEST)
+        .await
+        .expect("open_read must succeed")
+        .expect("must be some");
+
+    let mut pos: u64 = 0;
+
+    // read the first 10 bytes, they must match the data in the fixture.
+    {
+        let mut buf = [0; 10];
+        r.read_exact(&mut buf).await.expect("must succeed");
+
+        assert_eq!(
+            &BLOB_B[pos as usize..pos as usize + buf.len()],
+            buf,
+            "expected first 10 bytes to match"
+        );
+
+        pos += buf.len() as u64;
+    }
+    // seek by 0 bytes, using SeekFrom::Start.
+    let p = r
+        .seek(io::SeekFrom::Start(pos))
+        .await
+        .expect("must not fail");
+    assert_eq!(pos, p);
+
+    // read the next 10 bytes, they must match the data in the fixture.
+    {
+        let mut buf = [0; 10];
+        r.read_exact(&mut buf).await.expect("must succeed");
+
+        assert_eq!(
+            &BLOB_B[pos as usize..pos as usize + buf.len()],
+            buf,
+            "expected data to match"
+        );
+
+        pos += buf.len() as u64;
+    }
+
+    // seek by 5 bytes, using SeekFrom::Start.
+    let p = r
+        .seek(io::SeekFrom::Start(pos + 5))
+        .await
+        .expect("must not fail");
+    pos += 5;
+    assert_eq!(pos, p);
+
+    // read the next 10 bytes, they must match the data in the fixture.
+    {
+        let mut buf = [0; 10];
+        r.read_exact(&mut buf).await.expect("must succeed");
+
+        assert_eq!(
+            &BLOB_B[pos as usize..pos as usize + buf.len()],
+            buf,
+            "expected data to match"
+        );
+
+        pos += buf.len() as u64;
+    }
+
+    // seek by 12345 bytes, using SeekFrom::
+    let p = r
+        .seek(io::SeekFrom::Current(12345))
+        .await
+        .expect("must not fail");
+    pos += 12345;
+    assert_eq!(pos, p);
+
+    // read the next 10 bytes, they must match the data in the fixture.
+    {
+        let mut buf = [0; 10];
+        r.read_exact(&mut buf).await.expect("must succeed");
+
+        assert_eq!(
+            &BLOB_B[pos as usize..pos as usize + buf.len()],
+            buf,
+            "expected data to match"
+        );
+
+        #[allow(unused_assignments)]
+        {
+            pos += buf.len() as u64;
+        }
+    }
+
+    // seeking to the end is okay…
+    let p = r
+        .seek(io::SeekFrom::Start(BLOB_B.len() as u64))
+        .await
+        .expect("must not fail");
+    pos = BLOB_B.len() as u64;
+    assert_eq!(pos, p);
+
+    {
+        // but it returns no more data.
+        let mut buf: Vec<u8> = Vec::new();
+        r.read_to_end(&mut buf).await.expect("must not fail");
+        assert!(buf.is_empty(), "expected no more data to be read");
+    }
+
+    // seeking past the end…
+    // should either be ok, but then return 0 bytes.
+    // this matches the behaviour or a Cursor<Vec<u8>>.
+    if let Ok(_pos) = r.seek(io::SeekFrom::Start(BLOB_B.len() as u64 + 1)).await {
+        let mut buf: Vec<u8> = Vec::new();
+        r.read_to_end(&mut buf).await.expect("must not fail");
+        assert!(buf.is_empty(), "expected no more data to be read");
+    }
+    // or not be okay.
+
+    // TODO: this is only broken for the gRPC version
+    // We expect seeking backwards or relative to the end to fail.
+    // r.seek(io::SeekFrom::Current(-1))
+    //     .expect_err("SeekFrom::Current(-1) expected to fail");
+
+    // r.seek(io::SeekFrom::Start(pos - 1))
+    //     .expect_err("SeekFrom::Start(pos-1) expected to fail");
+
+    // r.seek(io::SeekFrom::End(0))
+    //     .expect_err("SeekFrom::End(_) expected to fail");
+}
diff --git a/tvix/castore/src/blobservice/tests/utils.rs b/tvix/castore/src/blobservice/tests/utils.rs
new file mode 100644
index 0000000000..706c4b5e43
--- /dev/null
+++ b/tvix/castore/src/blobservice/tests/utils.rs
@@ -0,0 +1,41 @@
+use crate::blobservice::{BlobService, MemoryBlobService};
+use crate::proto::blob_service_client::BlobServiceClient;
+use crate::proto::GRPCBlobServiceWrapper;
+use crate::{blobservice::GRPCBlobService, proto::blob_service_server::BlobServiceServer};
+use tonic::transport::{Endpoint, Server, Uri};
+
+/// Constructs and returns a gRPC BlobService.
+/// The server part is a [MemoryBlobService], exposed via the
+/// [GRPCBlobServiceWrapper], and connected through a DuplexStream
+pub async fn make_grpc_blob_service_client() -> Box<dyn BlobService> {
+    let (left, right) = tokio::io::duplex(64);
+
+    // spin up a server, which will only connect once, to the left side.
+    tokio::spawn(async {
+        let blob_service = Box::<MemoryBlobService>::default() as Box<dyn BlobService>;
+
+        // spin up a new DirectoryService
+        let mut server = Server::builder();
+        let router = server.add_service(BlobServiceServer::new(GRPCBlobServiceWrapper::new(
+            blob_service,
+        )));
+
+        router
+            .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(left)))
+            .await
+    });
+
+    // Create a client, connecting to the right side. The URI is unused.
+    let mut maybe_right = Some(right);
+
+    Box::new(GRPCBlobService::from_client(BlobServiceClient::new(
+        Endpoint::try_from("http://[::]:50051")
+            .unwrap()
+            .connect_with_connector(tower::service_fn(move |_: Uri| {
+                let right = maybe_right.take().unwrap();
+                async move { Ok::<_, std::io::Error>(right) }
+            }))
+            .await
+            .unwrap(),
+    )))
+}
diff --git a/tvix/castore/src/digests.rs b/tvix/castore/src/digests.rs
new file mode 100644
index 0000000000..2311c95c4d
--- /dev/null
+++ b/tvix/castore/src/digests.rs
@@ -0,0 +1,86 @@
+use bytes::Bytes;
+use data_encoding::BASE64;
+use thiserror::Error;
+
+#[derive(PartialEq, Eq, Hash)]
+pub struct B3Digest(Bytes);
+
+// TODO: allow converting these errors to crate::Error
+#[derive(Error, Debug)]
+pub enum Error {
+    #[error("invalid digest length: {0}")]
+    InvalidDigestLen(usize),
+}
+
+pub const B3_LEN: usize = 32;
+
+impl B3Digest {
+    pub fn as_slice(&self) -> &[u8] {
+        &self.0[..]
+    }
+}
+
+impl From<B3Digest> for bytes::Bytes {
+    fn from(val: B3Digest) -> Self {
+        val.0
+    }
+}
+
+impl From<digest::Output<blake3::Hasher>> for B3Digest {
+    fn from(value: digest::Output<blake3::Hasher>) -> Self {
+        let v = Into::<[u8; B3_LEN]>::into(value);
+        Self(Bytes::copy_from_slice(&v))
+    }
+}
+
+impl TryFrom<Vec<u8>> for B3Digest {
+    type Error = Error;
+
+    // constructs a [B3Digest] from a [Vec<u8>].
+    // Returns an error if the digest has the wrong length.
+    fn try_from(value: Vec<u8>) -> Result<Self, Self::Error> {
+        if value.len() != B3_LEN {
+            Err(Error::InvalidDigestLen(value.len()))
+        } else {
+            Ok(Self(value.into()))
+        }
+    }
+}
+
+impl TryFrom<bytes::Bytes> for B3Digest {
+    type Error = Error;
+
+    // constructs a [B3Digest] from a [bytes::Bytes].
+    // Returns an error if the digest has the wrong length.
+    fn try_from(value: bytes::Bytes) -> Result<Self, Self::Error> {
+        if value.len() != B3_LEN {
+            Err(Error::InvalidDigestLen(value.len()))
+        } else {
+            Ok(Self(value))
+        }
+    }
+}
+
+impl From<&[u8; B3_LEN]> for B3Digest {
+    fn from(value: &[u8; B3_LEN]) -> Self {
+        Self(value.to_vec().into())
+    }
+}
+
+impl Clone for B3Digest {
+    fn clone(&self) -> Self {
+        Self(self.0.to_owned())
+    }
+}
+
+impl std::fmt::Display for B3Digest {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "b3:{}", BASE64.encode(&self.0))
+    }
+}
+
+impl std::fmt::Debug for B3Digest {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "b3:{}", BASE64.encode(&self.0))
+    }
+}
diff --git a/tvix/castore/src/directoryservice/bigtable.rs b/tvix/castore/src/directoryservice/bigtable.rs
new file mode 100644
index 0000000000..1194c6ddc9
--- /dev/null
+++ b/tvix/castore/src/directoryservice/bigtable.rs
@@ -0,0 +1,357 @@
+use bigtable_rs::{bigtable, google::bigtable::v2 as bigtable_v2};
+use bytes::Bytes;
+use data_encoding::HEXLOWER;
+use futures::stream::BoxStream;
+use prost::Message;
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DurationSeconds};
+use tonic::async_trait;
+use tracing::{instrument, trace, warn};
+
+use super::{utils::traverse_directory, DirectoryPutter, DirectoryService, SimplePutter};
+use crate::{proto, B3Digest, Error};
+
+/// There should not be more than 10 MiB in a single cell.
+/// https://cloud.google.com/bigtable/docs/schema-design#cells
+const CELL_SIZE_LIMIT: u64 = 10 * 1024 * 1024;
+
+/// Provides a [DirectoryService] implementation using
+/// [Bigtable](https://cloud.google.com/bigtable/docs/)
+/// as an underlying K/V store.
+///
+/// # Data format
+/// We use Bigtable as a plain K/V store.
+/// The row key is the digest of the directory, in hexlower.
+/// Inside the row, we currently have a single column/cell, again using the
+/// hexlower directory digest.
+/// Its value is the Directory message, serialized in canonical protobuf.
+/// We currently only populate this column.
+///
+/// In the future, we might want to introduce "bucketing", essentially storing
+/// all directories inserted via `put_multiple_start` in a batched form.
+/// This will prevent looking up intermediate Directories, which are not
+/// directly at the root, so rely on store composition.
+#[derive(Clone)]
+pub struct BigtableDirectoryService {
+    client: bigtable::BigTable,
+    params: BigtableParameters,
+
+    #[cfg(test)]
+    #[allow(dead_code)]
+    /// Holds the temporary directory containing the unix socket, and the
+    /// spawned emulator process.
+    emulator: std::sync::Arc<(tempfile::TempDir, async_process::Child)>,
+}
+
+/// Represents configuration of [BigtableDirectoryService].
+/// This currently conflates both connect parameters and data model/client
+/// behaviour parameters.
+#[serde_as]
+#[derive(Clone, Debug, PartialEq, Deserialize, Serialize)]
+pub struct BigtableParameters {
+    project_id: String,
+    instance_name: String,
+    #[serde(default)]
+    is_read_only: bool,
+    #[serde(default = "default_channel_size")]
+    channel_size: usize,
+
+    #[serde_as(as = "Option<DurationSeconds<String>>")]
+    #[serde(default = "default_timeout")]
+    timeout: Option<std::time::Duration>,
+    table_name: String,
+    family_name: String,
+
+    #[serde(default = "default_app_profile_id")]
+    app_profile_id: String,
+}
+
+fn default_app_profile_id() -> String {
+    "default".to_owned()
+}
+
+fn default_channel_size() -> usize {
+    4
+}
+
+fn default_timeout() -> Option<std::time::Duration> {
+    Some(std::time::Duration::from_secs(4))
+}
+
+impl BigtableDirectoryService {
+    #[cfg(not(test))]
+    pub async fn connect(params: BigtableParameters) -> Result<Self, bigtable::Error> {
+        let connection = bigtable::BigTableConnection::new(
+            &params.project_id,
+            &params.instance_name,
+            params.is_read_only,
+            params.channel_size,
+            params.timeout,
+        )
+        .await?;
+
+        Ok(Self {
+            client: connection.client(),
+            params,
+        })
+    }
+
+    #[cfg(test)]
+    pub async fn connect(params: BigtableParameters) -> Result<Self, bigtable::Error> {
+        use std::time::Duration;
+
+        use async_process::{Command, Stdio};
+        use tempfile::TempDir;
+        use tokio_retry::{strategy::ExponentialBackoff, Retry};
+
+        let tmpdir = TempDir::new().unwrap();
+
+        let socket_path = tmpdir.path().join("cbtemulator.sock");
+
+        let emulator_process = Command::new("cbtemulator")
+            .arg("-address")
+            .arg(socket_path.clone())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .kill_on_drop(true)
+            .spawn()
+            .expect("failed to spawn emulator");
+
+        Retry::spawn(
+            ExponentialBackoff::from_millis(20)
+                .max_delay(Duration::from_secs(1))
+                .take(3),
+            || async {
+                if socket_path.exists() {
+                    Ok(())
+                } else {
+                    Err(())
+                }
+            },
+        )
+        .await
+        .expect("failed to wait for socket");
+
+        // populate the emulator
+        for cmd in &[
+            vec!["createtable", &params.table_name],
+            vec!["createfamily", &params.table_name, &params.family_name],
+        ] {
+            Command::new("cbt")
+                .args({
+                    let mut args = vec![
+                        "-instance",
+                        &params.instance_name,
+                        "-project",
+                        &params.project_id,
+                    ];
+                    args.extend_from_slice(cmd);
+                    args
+                })
+                .env(
+                    "BIGTABLE_EMULATOR_HOST",
+                    format!("unix://{}", socket_path.to_string_lossy()),
+                )
+                .output()
+                .await
+                .expect("failed to run cbt setup command");
+        }
+
+        let connection = bigtable_rs::bigtable::BigTableConnection::new_with_emulator(
+            &format!("unix://{}", socket_path.to_string_lossy()),
+            &params.project_id,
+            &params.instance_name,
+            params.is_read_only,
+            params.timeout,
+        )?;
+
+        Ok(Self {
+            client: connection.client(),
+            params,
+            emulator: (tmpdir, emulator_process).into(),
+        })
+    }
+}
+
+/// Derives the row/column key for a given blake3 digest.
+/// We use hexlower encoding, also because it can't be misinterpreted as RE2.
+fn derive_directory_key(digest: &B3Digest) -> String {
+    HEXLOWER.encode(digest.as_slice())
+}
+
+#[async_trait]
+impl DirectoryService for BigtableDirectoryService {
+    #[instrument(skip(self, digest), err, fields(directory.digest = %digest))]
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
+        let mut client = self.client.clone();
+        let directory_key = derive_directory_key(digest);
+
+        let request = bigtable_v2::ReadRowsRequest {
+            app_profile_id: self.params.app_profile_id.to_string(),
+            table_name: client.get_full_table_name(&self.params.table_name),
+            rows_limit: 1,
+            rows: Some(bigtable_v2::RowSet {
+                row_keys: vec![directory_key.clone().into()],
+                row_ranges: vec![],
+            }),
+            // Filter selected family name, and column qualifier matching our digest.
+            // This is to ensure we don't fail once we start bucketing.
+            filter: Some(bigtable_v2::RowFilter {
+                filter: Some(bigtable_v2::row_filter::Filter::Chain(
+                    bigtable_v2::row_filter::Chain {
+                        filters: vec![
+                            bigtable_v2::RowFilter {
+                                filter: Some(
+                                    bigtable_v2::row_filter::Filter::FamilyNameRegexFilter(
+                                        self.params.family_name.to_string(),
+                                    ),
+                                ),
+                            },
+                            bigtable_v2::RowFilter {
+                                filter: Some(
+                                    bigtable_v2::row_filter::Filter::ColumnQualifierRegexFilter(
+                                        directory_key.clone().into(),
+                                    ),
+                                ),
+                            },
+                        ],
+                    },
+                )),
+            }),
+            ..Default::default()
+        };
+
+        let mut response = client
+            .read_rows(request)
+            .await
+            .map_err(|e| Error::StorageError(format!("unable to read rows: {}", e)))?;
+
+        if response.len() != 1 {
+            if response.len() > 1 {
+                // This shouldn't happen, we limit number of rows to 1
+                return Err(Error::StorageError(
+                    "got more than one row from bigtable".into(),
+                ));
+            }
+            // else, this is simply a "not found".
+            return Ok(None);
+        }
+
+        let (row_key, mut row_cells) = response.pop().unwrap();
+        if row_key != directory_key.as_bytes() {
+            // This shouldn't happen, we requested this row key.
+            return Err(Error::StorageError(
+                "got wrong row key from bigtable".into(),
+            ));
+        }
+
+        let row_cell = row_cells
+            .pop()
+            .ok_or_else(|| Error::StorageError("found no cells".into()))?;
+
+        // Ensure there's only one cell (so no more left after the pop())
+        // This shouldn't happen, We filter out other cells in our query.
+        if !row_cells.is_empty() {
+            return Err(Error::StorageError(
+                "more than one cell returned from bigtable".into(),
+            ));
+        }
+
+        // We also require the qualifier to be correct in the filter above,
+        // so this shouldn't happen.
+        if directory_key.as_bytes() != row_cell.qualifier {
+            return Err(Error::StorageError("unexpected cell qualifier".into()));
+        }
+
+        // For the data in that cell, ensure the digest matches what's requested, before parsing.
+        let got_digest = B3Digest::from(blake3::hash(&row_cell.value).as_bytes());
+        if got_digest != *digest {
+            return Err(Error::StorageError(format!(
+                "invalid digest: {}",
+                got_digest
+            )));
+        }
+
+        // Try to parse the value into a Directory message.
+        let directory = proto::Directory::decode(Bytes::from(row_cell.value))
+            .map_err(|e| Error::StorageError(format!("unable to decode directory proto: {}", e)))?;
+
+        // validate the Directory.
+        directory
+            .validate()
+            .map_err(|e| Error::StorageError(format!("invalid Directory message: {}", e)))?;
+
+        Ok(Some(directory))
+    }
+
+    #[instrument(skip(self, directory), err, fields(directory.digest = %directory.digest()))]
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
+        let directory_digest = directory.digest();
+        let mut client = self.client.clone();
+        let directory_key = derive_directory_key(&directory_digest);
+
+        // Ensure the directory we're trying to upload passes validation
+        directory
+            .validate()
+            .map_err(|e| Error::InvalidRequest(format!("directory is invalid: {}", e)))?;
+
+        let data = directory.encode_to_vec();
+        if data.len() as u64 > CELL_SIZE_LIMIT {
+            return Err(Error::StorageError(
+                "Directory exceeds cell limit on Bigtable".into(),
+            ));
+        }
+
+        let resp = client
+            .check_and_mutate_row(bigtable_v2::CheckAndMutateRowRequest {
+                table_name: client.get_full_table_name(&self.params.table_name),
+                app_profile_id: self.params.app_profile_id.to_string(),
+                row_key: directory_key.clone().into(),
+                predicate_filter: Some(bigtable_v2::RowFilter {
+                    filter: Some(bigtable_v2::row_filter::Filter::ColumnQualifierRegexFilter(
+                        directory_key.clone().into(),
+                    )),
+                }),
+                // If the column was already found, do nothing.
+                true_mutations: vec![],
+                // Else, do the insert.
+                false_mutations: vec![
+                    // https://cloud.google.com/bigtable/docs/writes
+                    bigtable_v2::Mutation {
+                        mutation: Some(bigtable_v2::mutation::Mutation::SetCell(
+                            bigtable_v2::mutation::SetCell {
+                                family_name: self.params.family_name.to_string(),
+                                column_qualifier: directory_key.clone().into(),
+                                timestamp_micros: -1, // use server time to fill timestamp
+                                value: data,
+                            },
+                        )),
+                    },
+                ],
+            })
+            .await
+            .map_err(|e| Error::StorageError(format!("unable to mutate rows: {}", e)))?;
+
+        if resp.predicate_matched {
+            trace!("already existed")
+        }
+
+        Ok(directory_digest)
+    }
+
+    #[instrument(skip_all, fields(directory.digest = %root_directory_digest))]
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        traverse_directory(self.clone(), root_directory_digest)
+    }
+
+    #[instrument(skip_all)]
+    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
+    where
+        Self: Clone,
+    {
+        Box::new(SimplePutter::new(self.clone()))
+    }
+}
diff --git a/tvix/castore/src/directoryservice/closure_validator.rs b/tvix/castore/src/directoryservice/closure_validator.rs
new file mode 100644
index 0000000000..b9746a5a05
--- /dev/null
+++ b/tvix/castore/src/directoryservice/closure_validator.rs
@@ -0,0 +1,309 @@
+use std::collections::{HashMap, HashSet};
+
+use bstr::ByteSlice;
+
+use petgraph::{
+    graph::{DiGraph, NodeIndex},
+    visit::{Bfs, Walker},
+};
+use tracing::instrument;
+
+use crate::{
+    proto::{self, Directory},
+    B3Digest, Error,
+};
+
+type DirectoryGraph = DiGraph<Directory, ()>;
+
+/// This can be used to validate a Directory closure (DAG of connected
+/// Directories), and their insertion order.
+///
+/// Directories need to be inserted (via `add`), in an order from the leaves to
+/// the root (DFS Post-Order).
+/// During insertion, We validate as much as we can at that time:
+///
+///  - individual validation of Directory messages
+///  - validation of insertion order (no upload of not-yet-known Directories)
+///  - validation of size fields of referred Directories
+///
+/// Internally it keeps all received Directories in a directed graph,
+/// with node weights being the Directories and edges pointing to child
+/// directories.
+///
+/// Once all Directories have been inserted, a finalize function can be
+/// called to get a (deduplicated and) validated list of directories, in
+/// insertion order.
+/// During finalize, a check for graph connectivity is performed too, to ensure
+/// there's no disconnected components, and only one root.
+#[derive(Default)]
+pub struct ClosureValidator {
+    // A directed graph, using Directory as node weight, without edge weights.
+    // Edges point from parents to children.
+    graph: DirectoryGraph,
+
+    // A lookup table from directory digest to node index.
+    digest_to_node_ix: HashMap<B3Digest, NodeIndex>,
+
+    /// Keeps track of the last-inserted directory graph node index.
+    /// On a correct insert, this will be the root node, from which the DFS post
+    /// order traversal will start from.
+    last_directory_ix: Option<NodeIndex>,
+}
+
+impl ClosureValidator {
+    /// Insert a new Directory into the closure.
+    /// Perform individual Directory validation, validation of insertion order
+    /// and size fields.
+    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest(), directory.size=%directory.size()), err)]
+    pub fn add(&mut self, directory: proto::Directory) -> Result<(), Error> {
+        let digest = directory.digest();
+
+        // If we already saw this node previously, it's already validated and in the graph.
+        if self.digest_to_node_ix.contains_key(&digest) {
+            return Ok(());
+        }
+
+        // Do some general validation
+        directory
+            .validate()
+            .map_err(|e| Error::InvalidRequest(e.to_string()))?;
+
+        // Ensure the directory only refers to directories which we already accepted.
+        // We lookup their node indices and add them to a HashSet.
+        let mut child_ixs = HashSet::new();
+        for dir in &directory.directories {
+            let child_digest = B3Digest::try_from(dir.digest.to_owned()).unwrap(); // validated
+
+            // Ensure the digest has already been seen
+            let child_ix = *self.digest_to_node_ix.get(&child_digest).ok_or_else(|| {
+                Error::InvalidRequest(format!(
+                    "'{}' refers to unseen child dir: {}",
+                    dir.name.as_bstr(),
+                    &child_digest
+                ))
+            })?;
+
+            // Ensure the size specified in the child node matches the directory size itself.
+            let recorded_child_size = self
+                .graph
+                .node_weight(child_ix)
+                .expect("node not found")
+                .size();
+
+            // Ensure the size specified in the child node matches our records.
+            if dir.size != recorded_child_size {
+                return Err(Error::InvalidRequest(format!(
+                    "'{}' has wrong size, specified {}, recorded {}",
+                    dir.name.as_bstr(),
+                    dir.size,
+                    recorded_child_size
+                )));
+            }
+
+            child_ixs.insert(child_ix);
+        }
+
+        // Insert node into the graph, and add edges to all children.
+        let node_ix = self.graph.add_node(directory);
+        for child_ix in child_ixs {
+            self.graph.add_edge(node_ix, child_ix, ());
+        }
+
+        // Record the mapping from digest to node_ix in our lookup table.
+        self.digest_to_node_ix.insert(digest, node_ix);
+
+        // Update last_directory_ix.
+        self.last_directory_ix = Some(node_ix);
+
+        Ok(())
+    }
+
+    /// Ensure that all inserted Directories are connected, then return a
+    /// (deduplicated) and validated list of directories, in from-leaves-to-root
+    /// order.
+    /// In case no elements have been inserted, returns an empty list.
+    #[instrument(level = "trace", skip_all, err)]
+    pub(crate) fn finalize(self) -> Result<Vec<Directory>, Error> {
+        let (graph, _) = match self.finalize_raw()? {
+            None => return Ok(vec![]),
+            Some(v) => v,
+        };
+        // Dissolve the graph, returning the nodes as a Vec.
+        // As the graph was populated in a valid DFS PostOrder, we can return
+        // nodes in that same order.
+        let (nodes, _edges) = graph.into_nodes_edges();
+        Ok(nodes.into_iter().map(|x| x.weight).collect())
+    }
+
+    /// Ensure that all inserted Directories are connected, then return a
+    /// (deduplicated) and validated list of directories, in from-root-to-leaves
+    /// order.
+    /// In case no elements have been inserted, returns an empty list.
+    #[instrument(level = "trace", skip_all, err)]
+    pub(crate) fn finalize_root_to_leaves(self) -> Result<Vec<Directory>, Error> {
+        let (graph, root) = match self.finalize_raw()? {
+            None => return Ok(vec![]),
+            Some(v) => v,
+        };
+
+        // do a BFS traversal of the graph, starting with the root node to get
+        // all nodes reachable from there.
+        let traversal = Bfs::new(&graph, root);
+
+        let order = traversal.iter(&graph).collect::<Vec<_>>();
+
+        let (nodes, _edges) = graph.into_nodes_edges();
+
+        // Convert to option, so that we can take individual nodes out without messing up the
+        // indices
+        let mut nodes = nodes.into_iter().map(Some).collect::<Vec<_>>();
+
+        Ok(order
+            .iter()
+            .map(|i| nodes[i.index()].take().unwrap().weight)
+            .collect())
+    }
+
+    /// Internal implementation of closure validation
+    #[instrument(level = "trace", skip_all, err)]
+    fn finalize_raw(self) -> Result<Option<(DirectoryGraph, NodeIndex)>, Error> {
+        // If no nodes were inserted, an empty list is returned.
+        let last_directory_ix = if let Some(x) = self.last_directory_ix {
+            x
+        } else {
+            return Ok(None);
+        };
+
+        // do a BFS traversal of the graph, starting with the root node to get
+        // (the count of) all nodes reachable from there.
+        let mut traversal = Bfs::new(&self.graph, last_directory_ix);
+
+        let mut visited_directory_count = 0;
+        #[cfg(debug_assertions)]
+        let mut visited_directory_ixs = HashSet::new();
+        #[cfg_attr(not(debug_assertions), allow(unused))]
+        while let Some(directory_ix) = traversal.next(&self.graph) {
+            #[cfg(debug_assertions)]
+            visited_directory_ixs.insert(directory_ix);
+
+            visited_directory_count += 1;
+        }
+
+        // If the number of nodes collected equals the total number of nodes in
+        // the graph, we know all nodes are connected.
+        if visited_directory_count != self.graph.node_count() {
+            // more or less exhaustive error reporting.
+            #[cfg(debug_assertions)]
+            {
+                let all_directory_ixs: HashSet<_> = self.graph.node_indices().collect();
+
+                let unvisited_directories: HashSet<_> = all_directory_ixs
+                    .difference(&visited_directory_ixs)
+                    .map(|ix| self.graph.node_weight(*ix).expect("node not found"))
+                    .collect();
+
+                return Err(Error::InvalidRequest(format!(
+                    "found {} disconnected directories: {:?}",
+                    self.graph.node_count() - visited_directory_ixs.len(),
+                    unvisited_directories
+                )));
+            }
+            #[cfg(not(debug_assertions))]
+            {
+                return Err(Error::InvalidRequest(format!(
+                    "found {} disconnected directories",
+                    self.graph.node_count() - visited_directory_count
+                )));
+            }
+        }
+
+        Ok(Some((self.graph, last_directory_ix)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        fixtures::{DIRECTORY_A, DIRECTORY_B, DIRECTORY_C},
+        proto::{self, Directory},
+    };
+    use lazy_static::lazy_static;
+    use rstest::rstest;
+
+    lazy_static! {
+        pub static ref BROKEN_DIRECTORY : Directory = Directory {
+            symlinks: vec![proto::SymlinkNode {
+                name: "".into(), // invalid name!
+                target: "doesntmatter".into(),
+            }],
+            ..Default::default()
+        };
+
+        pub static ref BROKEN_PARENT_DIRECTORY: Directory = Directory {
+            directories: vec![proto::DirectoryNode {
+                name: "foo".into(),
+                digest: DIRECTORY_A.digest().into(),
+                size: DIRECTORY_A.size() + 42, // wrong!
+            }],
+            ..Default::default()
+        };
+    }
+
+    use super::ClosureValidator;
+
+    #[rstest]
+    /// Uploading an empty directory should succeed.
+    #[case::empty_directory(&[&*DIRECTORY_A], false, Some(vec![&*DIRECTORY_A]))]
+    /// Uploading A, then B (referring to A) should succeed.
+    #[case::simple_closure(&[&*DIRECTORY_A, &*DIRECTORY_B], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_B]))]
+    /// Uploading A, then A, then C (referring to A twice) should succeed.
+    /// We pretend to be a dumb client not deduping directories.
+    #[case::same_child(&[&*DIRECTORY_A, &*DIRECTORY_A, &*DIRECTORY_C], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_C]))]
+    /// Uploading A, then C (referring to A twice) should succeed.
+    #[case::same_child_dedup(&[&*DIRECTORY_A, &*DIRECTORY_C], false, Some(vec![&*DIRECTORY_A, &*DIRECTORY_C]))]
+    /// Uploading A, then C (referring to A twice), then B (itself referring to A) should fail during close,
+    /// as B itself would be left unconnected.
+    #[case::unconnected_node(&[&*DIRECTORY_A, &*DIRECTORY_C, &*DIRECTORY_B], false, None)]
+    /// Uploading B (referring to A) should fail immediately, because A was never uploaded.
+    #[case::dangling_pointer(&[&*DIRECTORY_B], true, None)]
+    /// Uploading a directory failing validation should fail immediately.
+    #[case::failing_validation(&[&*BROKEN_DIRECTORY], true, None)]
+    /// Uploading a directory which refers to another Directory with a wrong size should fail.
+    #[case::wrong_size_in_parent(&[&*DIRECTORY_A, &*BROKEN_PARENT_DIRECTORY], true, None)]
+    fn test_uploads(
+        #[case] directories_to_upload: &[&Directory],
+        #[case] exp_fail_upload_last: bool,
+        #[case] exp_finalize: Option<Vec<&Directory>>, // Some(_) if finalize successful, None if not.
+    ) {
+        let mut dcv = ClosureValidator::default();
+        let len_directories_to_upload = directories_to_upload.len();
+
+        for (i, d) in directories_to_upload.iter().enumerate() {
+            let resp = dcv.add((*d).clone());
+            if i == len_directories_to_upload - 1 && exp_fail_upload_last {
+                assert!(resp.is_err(), "expect last put to fail");
+
+                // We don't really care anymore what finalize() would return, as
+                // the add() failed.
+                return;
+            } else {
+                assert!(resp.is_ok(), "expect put to succeed");
+            }
+        }
+
+        // everything was uploaded successfully. Test finalize().
+        let resp = dcv.finalize();
+
+        match exp_finalize {
+            Some(directories) => {
+                assert_eq!(
+                    Vec::from_iter(directories.iter().map(|e| (*e).to_owned())),
+                    resp.expect("drain should succeed")
+                );
+            }
+            None => {
+                resp.expect_err("drain should fail");
+            }
+        }
+    }
+}
diff --git a/tvix/castore/src/directoryservice/from_addr.rs b/tvix/castore/src/directoryservice/from_addr.rs
new file mode 100644
index 0000000000..ee675ca68a
--- /dev/null
+++ b/tvix/castore/src/directoryservice/from_addr.rs
@@ -0,0 +1,189 @@
+use url::Url;
+
+use crate::{proto::directory_service_client::DirectoryServiceClient, Error};
+
+use super::{
+    DirectoryService, GRPCDirectoryService, MemoryDirectoryService, ObjectStoreDirectoryService,
+    SledDirectoryService,
+};
+
+/// Constructs a new instance of a [DirectoryService] from an URI.
+///
+/// The following URIs are supported:
+/// - `memory:`
+///   Uses a in-memory implementation.
+/// - `sled:`
+///   Uses a in-memory sled implementation.
+/// - `sled:///absolute/path/to/somewhere`
+///   Uses sled, using a path on the disk for persistency. Can be only opened
+///   from one process at the same time.
+/// - `grpc+unix:///absolute/path/to/somewhere`
+///   Connects to a local tvix-store gRPC service via Unix socket.
+/// - `grpc+http://host:port`, `grpc+https://host:port`
+///    Connects to a (remote) tvix-store gRPC service.
+pub async fn from_addr(uri: &str) -> Result<Box<dyn DirectoryService>, crate::Error> {
+    #[allow(unused_mut)]
+    let mut url = Url::parse(uri)
+        .map_err(|e| crate::Error::StorageError(format!("unable to parse url: {}", e)))?;
+
+    let directory_service: Box<dyn DirectoryService> = match url.scheme() {
+        "memory" => {
+            // memory doesn't support host or path in the URL.
+            if url.has_host() || !url.path().is_empty() {
+                return Err(Error::StorageError("invalid url".to_string()));
+            }
+            Box::<MemoryDirectoryService>::default()
+        }
+        "sled" => {
+            // sled doesn't support host, and a path can be provided (otherwise
+            // it'll live in memory only).
+            if url.has_host() {
+                return Err(Error::StorageError("no host allowed".to_string()));
+            }
+
+            if url.path() == "/" {
+                return Err(Error::StorageError(
+                    "cowardly refusing to open / with sled".to_string(),
+                ));
+            }
+
+            // TODO: expose compression and other parameters as URL parameters?
+
+            Box::new(if url.path().is_empty() {
+                SledDirectoryService::new_temporary()
+                    .map_err(|e| Error::StorageError(e.to_string()))?
+            } else {
+                SledDirectoryService::new(url.path())
+                    .map_err(|e| Error::StorageError(e.to_string()))?
+            })
+        }
+        scheme if scheme.starts_with("grpc+") => {
+            // schemes starting with grpc+ go to the GRPCPathInfoService.
+            //   That's normally grpc+unix for unix sockets, and grpc+http(s) for the HTTP counterparts.
+            // - In the case of unix sockets, there must be a path, but may not be a host.
+            // - In the case of non-unix sockets, there must be a host, but no path.
+            // Constructing the channel is handled by tvix_castore::channel::from_url.
+            let client = DirectoryServiceClient::new(crate::tonic::channel_from_url(&url).await?);
+            Box::new(GRPCDirectoryService::from_client(client))
+        }
+        scheme if scheme.starts_with("objectstore+") => {
+            // We need to convert the URL to string, strip the prefix there, and then
+            // parse it back as url, as Url::set_scheme() rejects some of the transitions we want to do.
+            let trimmed_url = {
+                let s = url.to_string();
+                Url::parse(s.strip_prefix("objectstore+").unwrap()).unwrap()
+            };
+            Box::new(
+                ObjectStoreDirectoryService::parse_url(&trimmed_url)
+                    .map_err(|e| Error::StorageError(e.to_string()))?,
+            )
+        }
+        #[cfg(feature = "cloud")]
+        "bigtable" => {
+            use super::bigtable::BigtableParameters;
+            use super::BigtableDirectoryService;
+
+            // parse the instance name from the hostname.
+            let instance_name = url
+                .host_str()
+                .ok_or_else(|| Error::StorageError("instance name missing".into()))?
+                .to_string();
+
+            // … but add it to the query string now, so we just need to parse that.
+            url.query_pairs_mut()
+                .append_pair("instance_name", &instance_name);
+
+            let params: BigtableParameters = serde_qs::from_str(url.query().unwrap_or_default())
+                .map_err(|e| Error::InvalidRequest(format!("failed to parse parameters: {}", e)))?;
+
+            Box::new(
+                BigtableDirectoryService::connect(params)
+                    .await
+                    .map_err(|e| Error::StorageError(e.to_string()))?,
+            )
+        }
+        _ => {
+            return Err(crate::Error::StorageError(format!(
+                "unknown scheme: {}",
+                url.scheme()
+            )))
+        }
+    };
+    Ok(directory_service)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::from_addr;
+    use lazy_static::lazy_static;
+    use rstest::rstest;
+    use tempfile::TempDir;
+
+    lazy_static! {
+        static ref TMPDIR_SLED_1: TempDir = TempDir::new().unwrap();
+        static ref TMPDIR_SLED_2: TempDir = TempDir::new().unwrap();
+    }
+
+    #[rstest]
+    /// This uses an unsupported scheme.
+    #[case::unsupported_scheme("http://foo.example/test", false)]
+    /// This configures sled in temporary mode.
+    #[case::sled_valid_temporary("sled://", true)]
+    /// This configures sled with /, which should fail.
+    #[case::sled_invalid_root("sled:///", false)]
+    /// This configures sled with a host, not path, which should fail.
+    #[case::sled_invalid_host("sled://foo.example", false)]
+    /// This configures sled with a valid path path, which should succeed.
+    #[case::sled_valid_path(&format!("sled://{}", &TMPDIR_SLED_1.path().to_str().unwrap()), true)]
+    /// This configures sled with a host, and a valid path path, which should fail.
+    #[case::sled_invalid_host_with_valid_path(&format!("sled://foo.example{}", &TMPDIR_SLED_2.path().to_str().unwrap()), false)]
+    /// This correctly sets the scheme, and doesn't set a path.
+    #[case::memory_valid("memory://", true)]
+    /// This sets a memory url host to `foo`
+    #[case::memory_invalid_host("memory://foo", false)]
+    /// This sets a memory url path to "/", which is invalid.
+    #[case::memory_invalid_root_path("memory:///", false)]
+    /// This sets a memory url path to "/foo", which is invalid.
+    #[case::memory_invalid_root_path_foo("memory:///foo", false)]
+    /// Correct scheme to connect to a unix socket.
+    #[case::grpc_valid_unix_socket("grpc+unix:///path/to/somewhere", true)]
+    /// Correct scheme for unix socket, but setting a host too, which is invalid.
+    #[case::grpc_invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)]
+    /// Correct scheme to connect to localhost, with port 12345
+    #[case::grpc_valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::grpc_valid_http_host_without_port("grpc+http://localhost", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)]
+    /// Correct scheme to connect to localhost over http, but with additional path, which is invalid.
+    #[case::grpc_invalid_host_and_path("grpc+http://localhost/some-path", false)]
+    /// A valid example for Bigtable
+    #[cfg_attr(
+        all(feature = "cloud", feature = "integration"),
+        case::bigtable_valid_url(
+            "bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1",
+            true
+        )
+    )]
+    /// A valid example for Bigtable, specifying a custom channel size and timeout
+    #[cfg_attr(
+        all(feature = "cloud", feature = "integration"),
+        case::bigtable_valid_url(
+            "bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1&channel_size=10&timeout=10",
+            true
+        )
+    )]
+    /// A invalid Bigtable example (missing fields)
+    #[cfg_attr(
+        all(feature = "cloud", feature = "integration"),
+        case::bigtable_invalid_url("bigtable://instance-1", false)
+    )]
+    #[tokio::test]
+    async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] exp_succeed: bool) {
+        if exp_succeed {
+            from_addr(uri_str).await.expect("should succeed");
+        } else {
+            assert!(from_addr(uri_str).await.is_err(), "should fail");
+        }
+    }
+}
diff --git a/tvix/castore/src/directoryservice/grpc.rs b/tvix/castore/src/directoryservice/grpc.rs
new file mode 100644
index 0000000000..fe935629bf
--- /dev/null
+++ b/tvix/castore/src/directoryservice/grpc.rs
@@ -0,0 +1,345 @@
+use std::collections::HashSet;
+
+use super::{DirectoryPutter, DirectoryService};
+use crate::proto::{self, get_directory_request::ByWhat};
+use crate::{B3Digest, Error};
+use async_stream::try_stream;
+use futures::stream::BoxStream;
+use tokio::spawn;
+use tokio::sync::mpsc::UnboundedSender;
+use tokio::task::JoinHandle;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tonic::async_trait;
+use tonic::Code;
+use tonic::{transport::Channel, Status};
+use tracing::{instrument, warn};
+
+/// Connects to a (remote) tvix-store DirectoryService over gRPC.
+#[derive(Clone)]
+pub struct GRPCDirectoryService {
+    /// The internal reference to a gRPC client.
+    /// Cloning it is cheap, and it internally handles concurrent requests.
+    grpc_client: proto::directory_service_client::DirectoryServiceClient<Channel>,
+}
+
+impl GRPCDirectoryService {
+    /// construct a [GRPCDirectoryService] from a [proto::directory_service_client::DirectoryServiceClient].
+    /// panics if called outside the context of a tokio runtime.
+    pub fn from_client(
+        grpc_client: proto::directory_service_client::DirectoryServiceClient<Channel>,
+    ) -> Self {
+        Self { grpc_client }
+    }
+}
+
+#[async_trait]
+impl DirectoryService for GRPCDirectoryService {
+    #[instrument(level = "trace", skip_all, fields(directory.digest = %digest))]
+    async fn get(
+        &self,
+        digest: &B3Digest,
+    ) -> Result<Option<crate::proto::Directory>, crate::Error> {
+        // Get a new handle to the gRPC client, and copy the digest.
+        let mut grpc_client = self.grpc_client.clone();
+        let digest_cpy = digest.clone();
+        let message = async move {
+            let mut s = grpc_client
+                .get(proto::GetDirectoryRequest {
+                    recursive: false,
+                    by_what: Some(ByWhat::Digest(digest_cpy.into())),
+                })
+                .await?
+                .into_inner();
+
+            // Retrieve the first message only, then close the stream (we set recursive to false)
+            s.message().await
+        };
+
+        let digest = digest.clone();
+        match message.await {
+            Ok(Some(directory)) => {
+                // Validate the retrieved Directory indeed has the
+                // digest we expect it to have, to detect corruptions.
+                let actual_digest = directory.digest();
+                if actual_digest != digest {
+                    Err(crate::Error::StorageError(format!(
+                        "requested directory with digest {}, but got {}",
+                        digest, actual_digest
+                    )))
+                } else if let Err(e) = directory.validate() {
+                    // Validate the Directory itself is valid.
+                    warn!("directory failed validation: {}", e.to_string());
+                    Err(crate::Error::StorageError(format!(
+                        "directory {} failed validation: {}",
+                        digest, e,
+                    )))
+                } else {
+                    Ok(Some(directory))
+                }
+            }
+            Ok(None) => Ok(None),
+            Err(e) if e.code() == Code::NotFound => Ok(None),
+            Err(e) => Err(crate::Error::StorageError(e.to_string())),
+        }
+    }
+
+    #[instrument(level = "trace", skip_all, fields(directory.digest = %directory.digest()))]
+    async fn put(&self, directory: crate::proto::Directory) -> Result<B3Digest, crate::Error> {
+        let resp = self
+            .grpc_client
+            .clone()
+            .put(tokio_stream::once(directory))
+            .await;
+
+        match resp {
+            Ok(put_directory_resp) => Ok(put_directory_resp
+                .into_inner()
+                .root_digest
+                .try_into()
+                .map_err(|_| {
+                    Error::StorageError("invalid root digest length in response".to_string())
+                })?),
+            Err(e) => Err(crate::Error::StorageError(e.to_string())),
+        }
+    }
+
+    #[instrument(level = "trace", skip_all, fields(directory.digest = %root_directory_digest))]
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        let mut grpc_client = self.grpc_client.clone();
+        let root_directory_digest = root_directory_digest.clone();
+
+        let stream = try_stream! {
+            let mut stream = grpc_client
+                .get(proto::GetDirectoryRequest {
+                    recursive: true,
+                    by_what: Some(ByWhat::Digest(root_directory_digest.clone().into())),
+                })
+                .await
+                .map_err(|e| crate::Error::StorageError(e.to_string()))?
+                .into_inner();
+
+            // The Directory digests we received so far
+            let mut received_directory_digests: HashSet<B3Digest> = HashSet::new();
+            // The Directory digests we're still expecting to get sent.
+            let mut expected_directory_digests: HashSet<B3Digest> = HashSet::from([root_directory_digest]);
+
+            loop {
+                match stream.message().await {
+                    Ok(Some(directory)) => {
+                        // validate the directory itself.
+                        if let Err(e) = directory.validate() {
+                            Err(crate::Error::StorageError(format!(
+                                "directory {} failed validation: {}",
+                                directory.digest(),
+                                e,
+                            )))?;
+                        }
+                        // validate we actually expected that directory, and move it from expected to received.
+                        let directory_digest = directory.digest();
+                        let was_expected = expected_directory_digests.remove(&directory_digest);
+                        if !was_expected {
+                            // FUTUREWORK: dumb clients might send the same stuff twice.
+                            // as a fallback, we might want to tolerate receiving
+                            // it if it's in received_directory_digests (as that
+                            // means it once was in expected_directory_digests)
+                            Err(crate::Error::StorageError(format!(
+                                "received unexpected directory {}",
+                                directory_digest
+                            )))?;
+                        }
+                        received_directory_digests.insert(directory_digest);
+
+                        // register all children in expected_directory_digests.
+                        for child_directory in &directory.directories {
+                            // We ran validate() above, so we know these digests must be correct.
+                            let child_directory_digest =
+                                child_directory.digest.clone().try_into().unwrap();
+
+                            expected_directory_digests
+                                .insert(child_directory_digest);
+                        }
+
+                        yield directory;
+                    },
+                    Ok(None) => {
+                        // If we were still expecting something, that's an error.
+                        if !expected_directory_digests.is_empty() {
+                            Err(crate::Error::StorageError(format!(
+                                "still expected {} directories, but got premature end of stream",
+                                expected_directory_digests.len(),
+                            )))?
+                        } else {
+                            return
+                        }
+                    },
+                    Err(e) => {
+                        Err(crate::Error::StorageError(e.to_string()))?;
+                    },
+                }
+            }
+        };
+
+        Box::pin(stream)
+    }
+
+    #[instrument(skip_all)]
+    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
+    where
+        Self: Clone,
+    {
+        let mut grpc_client = self.grpc_client.clone();
+
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
+
+        let task: JoinHandle<Result<proto::PutDirectoryResponse, Status>> = spawn(async move {
+            let s = grpc_client
+                .put(UnboundedReceiverStream::new(rx))
+                .await?
+                .into_inner();
+
+            Ok(s)
+        });
+
+        Box::new(GRPCPutter {
+            rq: Some((task, tx)),
+        })
+    }
+}
+
+/// Allows uploading multiple Directory messages in the same gRPC stream.
+pub struct GRPCPutter {
+    /// Data about the current request - a handle to the task, and the tx part
+    /// of the channel.
+    /// The tx part of the pipe is used to send [proto::Directory] to the ongoing request.
+    /// The task will yield a [proto::PutDirectoryResponse] once the stream is closed.
+    #[allow(clippy::type_complexity)] // lol
+    rq: Option<(
+        JoinHandle<Result<proto::PutDirectoryResponse, Status>>,
+        UnboundedSender<proto::Directory>,
+    )>,
+}
+
+#[async_trait]
+impl DirectoryPutter for GRPCPutter {
+    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
+    async fn put(&mut self, directory: proto::Directory) -> Result<(), crate::Error> {
+        match self.rq {
+            // If we're not already closed, send the directory to directory_sender.
+            Some((_, ref directory_sender)) => {
+                if directory_sender.send(directory).is_err() {
+                    // If the channel has been prematurely closed, invoke close (so we can peek at the error code)
+                    // That error code is much more helpful, because it
+                    // contains the error message from the server.
+                    self.close().await?;
+                }
+                Ok(())
+            }
+            // If self.close() was already called, we can't put again.
+            None => Err(Error::StorageError(
+                "DirectoryPutter already closed".to_string(),
+            )),
+        }
+    }
+
+    /// Closes the stream for sending, and returns the value.
+    #[instrument(level = "trace", skip_all, ret, err)]
+    async fn close(&mut self) -> Result<B3Digest, crate::Error> {
+        // get self.rq, and replace it with None.
+        // This ensures we can only close it once.
+        match std::mem::take(&mut self.rq) {
+            None => Err(Error::StorageError("already closed".to_string())),
+            Some((task, directory_sender)) => {
+                // close directory_sender, so blocking on task will finish.
+                drop(directory_sender);
+
+                let root_digest = task
+                    .await?
+                    .map_err(|e| Error::StorageError(e.to_string()))?
+                    .root_digest;
+
+                root_digest.try_into().map_err(|_| {
+                    Error::StorageError("invalid root digest length in response".to_string())
+                })
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::time::Duration;
+    use tempfile::TempDir;
+    use tokio::net::UnixListener;
+    use tokio_retry::{strategy::ExponentialBackoff, Retry};
+    use tokio_stream::wrappers::UnixListenerStream;
+
+    use crate::{
+        directoryservice::{DirectoryService, GRPCDirectoryService, MemoryDirectoryService},
+        fixtures,
+        proto::{directory_service_client::DirectoryServiceClient, GRPCDirectoryServiceWrapper},
+    };
+
+    /// This ensures connecting via gRPC works as expected.
+    #[tokio::test]
+    async fn test_valid_unix_path_ping_pong() {
+        let tmpdir = TempDir::new().unwrap();
+        let socket_path = tmpdir.path().join("daemon");
+
+        let path_clone = socket_path.clone();
+
+        // Spin up a server
+        tokio::spawn(async {
+            let uds = UnixListener::bind(path_clone).unwrap();
+            let uds_stream = UnixListenerStream::new(uds);
+
+            // spin up a new server
+            let mut server = tonic::transport::Server::builder();
+            let router = server.add_service(
+                crate::proto::directory_service_server::DirectoryServiceServer::new(
+                    GRPCDirectoryServiceWrapper::new(
+                        Box::<MemoryDirectoryService>::default() as Box<dyn DirectoryService>
+                    ),
+                ),
+            );
+            router.serve_with_incoming(uds_stream).await
+        });
+
+        // wait for the socket to be created
+        Retry::spawn(
+            ExponentialBackoff::from_millis(20).max_delay(Duration::from_secs(10)),
+            || async {
+                if socket_path.exists() {
+                    Ok(())
+                } else {
+                    Err(())
+                }
+            },
+        )
+        .await
+        .expect("failed to wait for socket");
+
+        // prepare a client
+        let grpc_client = {
+            let url = url::Url::parse(&format!(
+                "grpc+unix://{}?wait-connect=1",
+                socket_path.display()
+            ))
+            .expect("must parse");
+            let client = DirectoryServiceClient::new(
+                crate::tonic::channel_from_url(&url)
+                    .await
+                    .expect("must succeed"),
+            );
+            GRPCDirectoryService::from_client(client)
+        };
+
+        assert!(grpc_client
+            .get(&fixtures::DIRECTORY_A.digest())
+            .await
+            .expect("must not fail")
+            .is_none())
+    }
+}
diff --git a/tvix/castore/src/directoryservice/memory.rs b/tvix/castore/src/directoryservice/memory.rs
new file mode 100644
index 0000000000..3b2795c396
--- /dev/null
+++ b/tvix/castore/src/directoryservice/memory.rs
@@ -0,0 +1,87 @@
+use crate::{proto, B3Digest, Error};
+use futures::stream::BoxStream;
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+use tonic::async_trait;
+use tracing::{instrument, warn};
+
+use super::utils::traverse_directory;
+use super::{DirectoryPutter, DirectoryService, SimplePutter};
+
+#[derive(Clone, Default)]
+pub struct MemoryDirectoryService {
+    db: Arc<RwLock<HashMap<B3Digest, proto::Directory>>>,
+}
+
+#[async_trait]
+impl DirectoryService for MemoryDirectoryService {
+    #[instrument(skip(self, digest), fields(directory.digest = %digest))]
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
+        let db = self.db.read().await;
+
+        match db.get(digest) {
+            // The directory was not found, return
+            None => Ok(None),
+
+            // The directory was found, try to parse the data as Directory message
+            Some(directory) => {
+                // Validate the retrieved Directory indeed has the
+                // digest we expect it to have, to detect corruptions.
+                let actual_digest = directory.digest();
+                if actual_digest != *digest {
+                    return Err(Error::StorageError(format!(
+                        "requested directory with digest {}, but got {}",
+                        digest, actual_digest
+                    )));
+                }
+
+                // Validate the Directory itself is valid.
+                if let Err(e) = directory.validate() {
+                    warn!("directory failed validation: {}", e.to_string());
+                    return Err(Error::StorageError(format!(
+                        "directory {} failed validation: {}",
+                        actual_digest, e,
+                    )));
+                }
+
+                Ok(Some(directory.clone()))
+            }
+        }
+    }
+
+    #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))]
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
+        let digest = directory.digest();
+
+        // validate the directory itself.
+        if let Err(e) = directory.validate() {
+            return Err(Error::InvalidRequest(format!(
+                "directory {} failed validation: {}",
+                digest, e,
+            )));
+        }
+
+        // store it
+        let mut db = self.db.write().await;
+        db.insert(digest.clone(), directory);
+
+        Ok(digest)
+    }
+
+    #[instrument(skip_all, fields(directory.digest = %root_directory_digest))]
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        traverse_directory(self.clone(), root_directory_digest)
+    }
+
+    #[instrument(skip_all)]
+    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
+    where
+        Self: Clone,
+    {
+        Box::new(SimplePutter::new(self.clone()))
+    }
+}
diff --git a/tvix/castore/src/directoryservice/mod.rs b/tvix/castore/src/directoryservice/mod.rs
new file mode 100644
index 0000000000..3f180ef162
--- /dev/null
+++ b/tvix/castore/src/directoryservice/mod.rs
@@ -0,0 +1,124 @@
+use crate::{proto, B3Digest, Error};
+use futures::stream::BoxStream;
+use tonic::async_trait;
+
+mod closure_validator;
+mod from_addr;
+mod grpc;
+mod memory;
+mod object_store;
+mod simple_putter;
+mod sled;
+#[cfg(test)]
+pub mod tests;
+mod traverse;
+mod utils;
+
+pub use self::closure_validator::ClosureValidator;
+pub use self::from_addr::from_addr;
+pub use self::grpc::GRPCDirectoryService;
+pub use self::memory::MemoryDirectoryService;
+pub use self::object_store::ObjectStoreDirectoryService;
+pub use self::simple_putter::SimplePutter;
+pub use self::sled::SledDirectoryService;
+pub use self::traverse::descend_to;
+pub use self::utils::traverse_directory;
+
+#[cfg(feature = "cloud")]
+mod bigtable;
+
+#[cfg(feature = "cloud")]
+pub use self::bigtable::BigtableDirectoryService;
+
+/// The base trait all Directory services need to implement.
+/// This is a simple get and put of [crate::proto::Directory], returning their
+/// digest.
+#[async_trait]
+pub trait DirectoryService: Send + Sync {
+    /// Looks up a single Directory message by its digest.
+    /// The returned Directory message *must* be valid.
+    /// In case the directory is not found, Ok(None) is returned.
+    ///
+    /// It is okay for certain implementations to only allow retrieval of
+    /// Directory digests that are at the "root", aka the last element that's
+    /// sent to a DirectoryPutter. This makes sense for implementations bundling
+    /// closures of directories together in batches.
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error>;
+    /// Uploads a single Directory message, and returns the calculated
+    /// digest, or an error. An error *must* also be returned if the message is
+    /// not valid.
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error>;
+
+    /// Looks up a closure of [proto::Directory].
+    /// Ideally this would be a `impl Stream<Item = Result<proto::Directory, Error>>`,
+    /// and we'd be able to add a default implementation for it here, but
+    /// we can't have that yet.
+    ///
+    /// This returns a pinned, boxed stream. The pinning allows for it to be polled easily,
+    /// and the box allows different underlying stream implementations to be returned since
+    /// Rust doesn't support this as a generic in traits yet. This is the same thing that
+    /// [async_trait] generates, but for streams instead of futures.
+    ///
+    /// The individually returned Directory messages *must* be valid.
+    /// Directories are sent in an order from the root to the leaves, so that
+    /// the receiving side can validate each message to be a connected to the root
+    /// that has initially been requested.
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>>;
+
+    /// Allows persisting a closure of [proto::Directory], which is a graph of
+    /// connected Directory messages.
+    fn put_multiple_start(&self) -> Box<dyn DirectoryPutter>;
+}
+
+#[async_trait]
+impl<A> DirectoryService for A
+where
+    A: AsRef<dyn DirectoryService> + Send + Sync,
+{
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
+        self.as_ref().get(digest).await
+    }
+
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
+        self.as_ref().put(directory).await
+    }
+
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        self.as_ref().get_recursive(root_directory_digest)
+    }
+
+    fn put_multiple_start(&self) -> Box<dyn DirectoryPutter> {
+        self.as_ref().put_multiple_start()
+    }
+}
+
+/// Provides a handle to put a closure of connected [proto::Directory] elements.
+///
+/// The consumer can periodically call [DirectoryPutter::put], starting from the
+/// leaves. Once the root is reached, [DirectoryPutter::close] can be called to
+/// retrieve the root digest (or an error).
+///
+/// DirectoryPutters might be created without a single [DirectoryPutter::put],
+/// and then dropped without calling [DirectoryPutter::close],
+/// for example when ingesting a path that ends up not pointing to a directory,
+/// but a single file or symlink.
+#[async_trait]
+pub trait DirectoryPutter: Send {
+    /// Put a individual [proto::Directory] into the store.
+    /// Error semantics and behaviour is up to the specific implementation of
+    /// this trait.
+    /// Due to bursting, the returned error might refer to an object previously
+    /// sent via `put`.
+    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error>;
+
+    /// Close the stream, and wait for any errors.
+    /// If there's been any invalid Directory message uploaded, and error *must*
+    /// be returned.
+    async fn close(&mut self) -> Result<B3Digest, Error>;
+}
diff --git a/tvix/castore/src/directoryservice/object_store.rs b/tvix/castore/src/directoryservice/object_store.rs
new file mode 100644
index 0000000000..64ce335edb
--- /dev/null
+++ b/tvix/castore/src/directoryservice/object_store.rs
@@ -0,0 +1,261 @@
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use data_encoding::HEXLOWER;
+use futures::future::Either;
+use futures::stream::BoxStream;
+use futures::SinkExt;
+use futures::StreamExt;
+use futures::TryFutureExt;
+use futures::TryStreamExt;
+use object_store::{path::Path, ObjectStore};
+use prost::Message;
+use tokio::io::AsyncWriteExt;
+use tokio_util::codec::LengthDelimitedCodec;
+use tonic::async_trait;
+use tracing::{instrument, trace, warn, Level};
+use url::Url;
+
+use super::{ClosureValidator, DirectoryPutter, DirectoryService};
+use crate::{proto, B3Digest, Error};
+
+/// Stores directory closures in an object store.
+/// Notably, this makes use of the option to disallow accessing child directories except when
+/// fetching them recursively via the top-level directory, since all batched writes
+/// (using `put_multiple_start`) are stored in a single object.
+/// Directories are stored in a length-delimited format with a 1MiB limit. The length field is a
+/// u32 and the directories are stored in root-to-leaves topological order, the same way they will
+/// be returned to the client in get_recursive.
+#[derive(Clone)]
+pub struct ObjectStoreDirectoryService {
+    object_store: Arc<dyn ObjectStore>,
+    base_path: Path,
+}
+
+#[instrument(level=Level::TRACE, skip_all,fields(base_path=%base_path,blob.digest=%digest),ret(Display))]
+fn derive_dirs_path(base_path: &Path, digest: &B3Digest) -> Path {
+    base_path
+        .child("dirs")
+        .child("b3")
+        .child(HEXLOWER.encode(&digest.as_slice()[..2]))
+        .child(HEXLOWER.encode(digest.as_slice()))
+}
+
+#[allow(clippy::identity_op)]
+const MAX_FRAME_LENGTH: usize = 1 * 1024 * 1024 * 1000; // 1 MiB
+                                                        //
+impl ObjectStoreDirectoryService {
+    /// Constructs a new [ObjectStoreBlobService] from a [Url] supported by
+    /// [object_store].
+    /// Any path suffix becomes the base path of the object store.
+    /// additional options, the same as in [object_store::parse_url_opts] can
+    /// be passed.
+    pub fn parse_url_opts<I, K, V>(url: &Url, options: I) -> Result<Self, object_store::Error>
+    where
+        I: IntoIterator<Item = (K, V)>,
+        K: AsRef<str>,
+        V: Into<String>,
+    {
+        let (object_store, path) = object_store::parse_url_opts(url, options)?;
+
+        Ok(Self {
+            object_store: Arc::new(object_store),
+            base_path: path,
+        })
+    }
+
+    /// Like [Self::parse_url_opts], except without the options.
+    pub fn parse_url(url: &Url) -> Result<Self, object_store::Error> {
+        Self::parse_url_opts(url, Vec::<(String, String)>::new())
+    }
+}
+
+#[async_trait]
+impl DirectoryService for ObjectStoreDirectoryService {
+    /// This is the same steps as for get_recursive anyways, so we just call get_recursive and
+    /// return the first element of the stream and drop the request.
+    #[instrument(skip(self, digest), fields(directory.digest = %digest))]
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
+        self.get_recursive(digest).take(1).next().await.transpose()
+    }
+
+    #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))]
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
+        if !directory.directories.is_empty() {
+            return Err(Error::InvalidRequest(
+                    "only put_multiple_start is supported by the ObjectStoreDirectoryService for directories with children".into(),
+            ));
+        }
+
+        let mut handle = self.put_multiple_start();
+        handle.put(directory).await?;
+        handle.close().await
+    }
+
+    #[instrument(skip_all, fields(directory.digest = %root_directory_digest))]
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        // The Directory digests we're expecting to receive.
+        let mut expected_directory_digests: HashSet<B3Digest> =
+            HashSet::from([root_directory_digest.clone()]);
+
+        let dir_path = derive_dirs_path(&self.base_path, root_directory_digest);
+        let object_store = self.object_store.clone();
+
+        Box::pin(
+            (async move {
+                let stream = match object_store.get(&dir_path).await {
+                    Ok(v) => v.into_stream(),
+                    Err(object_store::Error::NotFound { .. }) => {
+                        return Ok(Either::Left(futures::stream::empty()))
+                    }
+                    Err(e) => return Err(std::io::Error::from(e).into()),
+                };
+
+                // get a reader of the response body.
+                let r = tokio_util::io::StreamReader::new(stream);
+                let decompressed_stream = async_compression::tokio::bufread::ZstdDecoder::new(r);
+
+                // the subdirectories are stored in a length delimited format
+                let delimited_stream = LengthDelimitedCodec::builder()
+                    .max_frame_length(MAX_FRAME_LENGTH)
+                    .length_field_type::<u32>()
+                    .new_read(decompressed_stream);
+
+                let dirs_stream = delimited_stream.map_err(Error::from).and_then(move |buf| {
+                    futures::future::ready((|| {
+                        let mut hasher = blake3::Hasher::new();
+                        let digest: B3Digest = hasher.update(&buf).finalize().as_bytes().into();
+
+                        // Ensure to only decode the directory objects whose digests we trust
+                        let was_expected = expected_directory_digests.remove(&digest);
+                        if !was_expected {
+                            return Err(crate::Error::StorageError(format!(
+                                "received unexpected directory {}",
+                                digest
+                            )));
+                        }
+
+                        let directory = proto::Directory::decode(&*buf).map_err(|e| {
+                            warn!("unable to parse directory {}: {}", digest, e);
+                            Error::StorageError(e.to_string())
+                        })?;
+
+                        for directory in &directory.directories {
+                            // Allow the children to appear next
+                            expected_directory_digests.insert(
+                                B3Digest::try_from(directory.digest.clone())
+                                    .map_err(|e| Error::StorageError(e.to_string()))?,
+                            );
+                        }
+
+                        Ok(directory)
+                    })())
+                });
+
+                Ok(Either::Right(dirs_stream))
+            })
+            .try_flatten_stream(),
+        )
+    }
+
+    #[instrument(skip_all)]
+    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
+    where
+        Self: Clone,
+    {
+        Box::new(ObjectStoreDirectoryPutter::new(
+            self.object_store.clone(),
+            self.base_path.clone(),
+        ))
+    }
+}
+
+struct ObjectStoreDirectoryPutter {
+    object_store: Arc<dyn ObjectStore>,
+    base_path: Path,
+
+    directory_validator: Option<ClosureValidator>,
+}
+
+impl ObjectStoreDirectoryPutter {
+    fn new(object_store: Arc<dyn ObjectStore>, base_path: Path) -> Self {
+        Self {
+            object_store,
+            base_path,
+            directory_validator: Some(Default::default()),
+        }
+    }
+}
+
+#[async_trait]
+impl DirectoryPutter for ObjectStoreDirectoryPutter {
+    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
+    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> {
+        match self.directory_validator {
+            None => return Err(Error::StorageError("already closed".to_string())),
+            Some(ref mut validator) => {
+                validator.add(directory)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[instrument(level = "trace", skip_all, ret, err)]
+    async fn close(&mut self) -> Result<B3Digest, Error> {
+        let validator = match self.directory_validator.take() {
+            None => return Err(Error::InvalidRequest("already closed".to_string())),
+            Some(validator) => validator,
+        };
+
+        // retrieve the validated directories.
+        // It is important that they are in topological order (root first),
+        // since that's how we want to retrieve them from the object store in the end.
+        let directories = validator.finalize_root_to_leaves()?;
+
+        // Get the root digest
+        let root_digest = directories
+            .first()
+            .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))?
+            .digest();
+
+        let dir_path = derive_dirs_path(&self.base_path, &root_digest);
+
+        match self.object_store.head(&dir_path).await {
+            // directory tree already exists, nothing to do
+            Ok(_) => {
+                trace!("directory tree already exists");
+            }
+
+            // directory tree does not yet exist, compress and upload.
+            Err(object_store::Error::NotFound { .. }) => {
+                trace!("uploading directory tree");
+
+                let object_store_writer =
+                    object_store::buffered::BufWriter::new(self.object_store.clone(), dir_path);
+                let compressed_writer =
+                    async_compression::tokio::write::ZstdEncoder::new(object_store_writer);
+                let mut directories_sink = LengthDelimitedCodec::builder()
+                    .max_frame_length(MAX_FRAME_LENGTH)
+                    .length_field_type::<u32>()
+                    .new_write(compressed_writer);
+
+                for directory in directories {
+                    directories_sink
+                        .send(directory.encode_to_vec().into())
+                        .await?;
+                }
+
+                let mut compressed_writer = directories_sink.into_inner();
+                compressed_writer.shutdown().await?;
+            }
+            // other error
+            Err(err) => Err(std::io::Error::from(err))?,
+        }
+
+        Ok(root_digest)
+    }
+}
diff --git a/tvix/castore/src/directoryservice/simple_putter.rs b/tvix/castore/src/directoryservice/simple_putter.rs
new file mode 100644
index 0000000000..25617ebcac
--- /dev/null
+++ b/tvix/castore/src/directoryservice/simple_putter.rs
@@ -0,0 +1,75 @@
+use super::ClosureValidator;
+use super::DirectoryPutter;
+use super::DirectoryService;
+use crate::proto;
+use crate::B3Digest;
+use crate::Error;
+use tonic::async_trait;
+use tracing::instrument;
+use tracing::warn;
+
+/// This is an implementation of DirectoryPutter that simply
+/// inserts individual Directory messages one by one, on close, after
+/// they successfully validated.
+pub struct SimplePutter<DS: DirectoryService> {
+    directory_service: DS,
+
+    directory_validator: Option<ClosureValidator>,
+}
+
+impl<DS: DirectoryService> SimplePutter<DS> {
+    pub fn new(directory_service: DS) -> Self {
+        Self {
+            directory_service,
+            directory_validator: Some(Default::default()),
+        }
+    }
+}
+
+#[async_trait]
+impl<DS: DirectoryService + 'static> DirectoryPutter for SimplePutter<DS> {
+    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
+    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> {
+        match self.directory_validator {
+            None => return Err(Error::StorageError("already closed".to_string())),
+            Some(ref mut validator) => {
+                validator.add(directory)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[instrument(level = "trace", skip_all, ret, err)]
+    async fn close(&mut self) -> Result<B3Digest, Error> {
+        match self.directory_validator.take() {
+            None => Err(Error::InvalidRequest("already closed".to_string())),
+            Some(validator) => {
+                // retrieve the validated directories.
+                let directories = validator.finalize()?;
+
+                // Get the root digest, which is at the end (cf. insertion order)
+                let root_digest = directories
+                    .last()
+                    .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))?
+                    .digest();
+
+                // call an individual put for each directory and await the insertion.
+                for directory in directories {
+                    let exp_digest = directory.digest();
+                    let actual_digest = self.directory_service.put(directory).await?;
+
+                    // ensure the digest the backend told us matches our expectations.
+                    if exp_digest != actual_digest {
+                        warn!(directory.digest_expected=%exp_digest, directory.digest_actual=%actual_digest, "unexpected digest");
+                        return Err(Error::StorageError(
+                            "got unexpected digest from backend during put".into(),
+                        ));
+                    }
+                }
+
+                Ok(root_digest)
+            }
+        }
+    }
+}
diff --git a/tvix/castore/src/directoryservice/sled.rs b/tvix/castore/src/directoryservice/sled.rs
new file mode 100644
index 0000000000..9490a49c00
--- /dev/null
+++ b/tvix/castore/src/directoryservice/sled.rs
@@ -0,0 +1,189 @@
+use crate::proto::Directory;
+use crate::{proto, B3Digest, Error};
+use futures::stream::BoxStream;
+use prost::Message;
+use std::ops::Deref;
+use std::path::Path;
+use tonic::async_trait;
+use tracing::{instrument, warn};
+
+use super::utils::traverse_directory;
+use super::{ClosureValidator, DirectoryPutter, DirectoryService};
+
+#[derive(Clone)]
+pub struct SledDirectoryService {
+    db: sled::Db,
+}
+
+impl SledDirectoryService {
+    pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, sled::Error> {
+        let config = sled::Config::default()
+            .use_compression(false) // is a required parameter
+            .path(p);
+        let db = config.open()?;
+
+        Ok(Self { db })
+    }
+
+    pub fn new_temporary() -> Result<Self, sled::Error> {
+        let config = sled::Config::default().temporary(true);
+        let db = config.open()?;
+
+        Ok(Self { db })
+    }
+}
+
+#[async_trait]
+impl DirectoryService for SledDirectoryService {
+    #[instrument(skip(self, digest), fields(directory.digest = %digest))]
+    async fn get(&self, digest: &B3Digest) -> Result<Option<proto::Directory>, Error> {
+        let resp = tokio::task::spawn_blocking({
+            let db = self.db.clone();
+            let digest = digest.clone();
+            move || db.get(digest.as_slice())
+        })
+        .await?
+        .map_err(|e| {
+            warn!("failed to retrieve directory: {}", e);
+            Error::StorageError(format!("failed to retrieve directory: {}", e))
+        })?;
+
+        match resp {
+            // The directory was not found, return
+            None => Ok(None),
+
+            // The directory was found, try to parse the data as Directory message
+            Some(data) => match Directory::decode(&*data) {
+                Ok(directory) => {
+                    // Validate the retrieved Directory indeed has the
+                    // digest we expect it to have, to detect corruptions.
+                    let actual_digest = directory.digest();
+                    if actual_digest != *digest {
+                        return Err(Error::StorageError(format!(
+                            "requested directory with digest {}, but got {}",
+                            digest, actual_digest
+                        )));
+                    }
+
+                    // Validate the Directory itself is valid.
+                    if let Err(e) = directory.validate() {
+                        warn!("directory failed validation: {}", e.to_string());
+                        return Err(Error::StorageError(format!(
+                            "directory {} failed validation: {}",
+                            actual_digest, e,
+                        )));
+                    }
+
+                    Ok(Some(directory))
+                }
+                Err(e) => {
+                    warn!("unable to parse directory {}: {}", digest, e);
+                    Err(Error::StorageError(e.to_string()))
+                }
+            },
+        }
+    }
+
+    #[instrument(skip(self, directory), fields(directory.digest = %directory.digest()))]
+    async fn put(&self, directory: proto::Directory) -> Result<B3Digest, Error> {
+        tokio::task::spawn_blocking({
+            let db = self.db.clone();
+            move || {
+                let digest = directory.digest();
+
+                // validate the directory itself.
+                if let Err(e) = directory.validate() {
+                    return Err(Error::InvalidRequest(format!(
+                        "directory {} failed validation: {}",
+                        digest, e,
+                    )));
+                }
+                // store it
+                db.insert(digest.as_slice(), directory.encode_to_vec())
+                    .map_err(|e| Error::StorageError(e.to_string()))?;
+
+                Ok(digest)
+            }
+        })
+        .await?
+    }
+
+    #[instrument(skip_all, fields(directory.digest = %root_directory_digest))]
+    fn get_recursive(
+        &self,
+        root_directory_digest: &B3Digest,
+    ) -> BoxStream<'static, Result<proto::Directory, Error>> {
+        traverse_directory(self.clone(), root_directory_digest)
+    }
+
+    #[instrument(skip_all)]
+    fn put_multiple_start(&self) -> Box<(dyn DirectoryPutter + 'static)>
+    where
+        Self: Clone,
+    {
+        Box::new(SledDirectoryPutter {
+            tree: self.db.deref().clone(),
+            directory_validator: Some(Default::default()),
+        })
+    }
+}
+
+/// Buffers Directory messages to be uploaded and inserts them in a batch
+/// transaction on close.
+pub struct SledDirectoryPutter {
+    tree: sled::Tree,
+
+    /// The directories (inside the directory validator) that we insert later,
+    /// or None, if they were already inserted.
+    directory_validator: Option<ClosureValidator>,
+}
+
+#[async_trait]
+impl DirectoryPutter for SledDirectoryPutter {
+    #[instrument(level = "trace", skip_all, fields(directory.digest=%directory.digest()), err)]
+    async fn put(&mut self, directory: proto::Directory) -> Result<(), Error> {
+        match self.directory_validator {
+            None => return Err(Error::StorageError("already closed".to_string())),
+            Some(ref mut validator) => {
+                validator.add(directory)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[instrument(level = "trace", skip_all, ret, err)]
+    async fn close(&mut self) -> Result<B3Digest, Error> {
+        match self.directory_validator.take() {
+            None => Err(Error::InvalidRequest("already closed".to_string())),
+            Some(validator) => {
+                // Insert all directories as a batch.
+                tokio::task::spawn_blocking({
+                    let tree = self.tree.clone();
+                    move || {
+                        // retrieve the validated directories.
+                        let directories = validator.finalize()?;
+
+                        // Get the root digest, which is at the end (cf. insertion order)
+                        let root_digest = directories
+                            .last()
+                            .ok_or_else(|| Error::InvalidRequest("got no directories".to_string()))?
+                            .digest();
+
+                        let mut batch = sled::Batch::default();
+                        for directory in directories {
+                            batch.insert(directory.digest().as_slice(), directory.encode_to_vec());
+                        }
+
+                        tree.apply_batch(batch).map_err(|e| {
+                            Error::StorageError(format!("unable to apply batch: {}", e))
+                        })?;
+
+                        Ok(root_digest)
+                    }
+                })
+                .await?
+            }
+        }
+    }
+}
diff --git a/tvix/castore/src/directoryservice/tests/mod.rs b/tvix/castore/src/directoryservice/tests/mod.rs
new file mode 100644
index 0000000000..cc3c5b788a
--- /dev/null
+++ b/tvix/castore/src/directoryservice/tests/mod.rs
@@ -0,0 +1,227 @@
+//! This contains test scenarios that a given [DirectoryService] needs to pass.
+//! We use [rstest] and [rstest_reuse] to provide all services we want to test
+//! against, and then apply this template to all test functions.
+
+use futures::StreamExt;
+use rstest::*;
+use rstest_reuse::{self, *};
+
+use super::DirectoryService;
+use crate::directoryservice;
+use crate::{
+    fixtures::{DIRECTORY_A, DIRECTORY_B, DIRECTORY_C},
+    proto::{self, Directory},
+};
+
+mod utils;
+use self::utils::make_grpc_directory_service_client;
+
+// TODO: add tests doing individual puts of a closure, then doing a get_recursive
+// (and figure out semantics if necessary)
+
+/// This produces a template, which will be applied to all individual test functions.
+/// See https://github.com/la10736/rstest/issues/130#issuecomment-968864832
+#[template]
+#[rstest]
+#[case::grpc(make_grpc_directory_service_client().await)]
+#[case::memory(directoryservice::from_addr("memory://").await.unwrap())]
+#[case::sled(directoryservice::from_addr("sled://").await.unwrap())]
+#[case::objectstore(directoryservice::from_addr("objectstore+memory://").await.unwrap())]
+#[cfg_attr(all(feature = "cloud", feature = "integration"), case::bigtable(directoryservice::from_addr("bigtable://instance-1?project_id=project-1&table_name=table-1&family_name=cf1").await.unwrap()))]
+pub fn directory_services(#[case] directory_service: impl DirectoryService) {}
+
+/// Ensures asking for a directory that doesn't exist returns a Ok(None).
+#[apply(directory_services)]
+#[tokio::test]
+async fn test_non_exist(directory_service: impl DirectoryService) {
+    let resp = directory_service.get(&DIRECTORY_A.digest()).await;
+    assert!(resp.unwrap().is_none())
+}
+
+/// Putting a single directory into the store, and then getting it out both via
+/// `.get[_recursive]` should work.
+#[apply(directory_services)]
+#[tokio::test]
+async fn put_get(directory_service: impl DirectoryService) {
+    // Insert a Directory.
+    let digest = directory_service.put(DIRECTORY_A.clone()).await.unwrap();
+    assert_eq!(DIRECTORY_A.digest(), digest, "returned digest must match");
+
+    // single get
+    assert_eq!(
+        Some(DIRECTORY_A.clone()),
+        directory_service.get(&DIRECTORY_A.digest()).await.unwrap()
+    );
+
+    // recursive get
+    assert_eq!(
+        vec![Ok(DIRECTORY_A.clone())],
+        directory_service
+            .get_recursive(&DIRECTORY_A.digest())
+            .collect::<Vec<_>>()
+            .await
+    );
+}
+
+/// Putting a directory closure should work, and it should be possible to get
+/// back the root node both via .get[_recursive]. We don't check `.get` for the
+/// leaf node is possible, as it's Ok for stores to not support that.
+#[apply(directory_services)]
+#[tokio::test]
+async fn put_get_multiple_success(directory_service: impl DirectoryService) {
+    // Insert a Directory closure.
+    let mut handle = directory_service.put_multiple_start();
+    handle.put(DIRECTORY_A.clone()).await.unwrap();
+    handle.put(DIRECTORY_C.clone()).await.unwrap();
+    let root_digest = handle.close().await.unwrap();
+    assert_eq!(
+        DIRECTORY_C.digest(),
+        root_digest,
+        "root digest should match"
+    );
+
+    // Get the root node.
+    assert_eq!(
+        Some(DIRECTORY_C.clone()),
+        directory_service.get(&DIRECTORY_C.digest()).await.unwrap()
+    );
+
+    // Get the closure. Ensure it's sent from the root to the leaves.
+    assert_eq!(
+        vec![Ok(DIRECTORY_C.clone()), Ok(DIRECTORY_A.clone())],
+        directory_service
+            .get_recursive(&DIRECTORY_C.digest())
+            .collect::<Vec<_>>()
+            .await
+    )
+}
+
+/// Puts a directory closure, but simulates a dumb client not deduplicating
+/// its list. Ensure we still only get back a deduplicated list.
+#[apply(directory_services)]
+#[tokio::test]
+async fn put_get_multiple_dedup(directory_service: impl DirectoryService) {
+    // Insert a Directory closure.
+    let mut handle = directory_service.put_multiple_start();
+    handle.put(DIRECTORY_A.clone()).await.unwrap();
+    handle.put(DIRECTORY_A.clone()).await.unwrap();
+    handle.put(DIRECTORY_C.clone()).await.unwrap();
+    let root_digest = handle.close().await.unwrap();
+    assert_eq!(
+        DIRECTORY_C.digest(),
+        root_digest,
+        "root digest should match"
+    );
+
+    // Ensure the returned closure only contains `DIRECTORY_A` once.
+    assert_eq!(
+        vec![Ok(DIRECTORY_C.clone()), Ok(DIRECTORY_A.clone())],
+        directory_service
+            .get_recursive(&DIRECTORY_C.digest())
+            .collect::<Vec<_>>()
+            .await
+    )
+}
+
+/// Uploading A, then C (referring to A twice), then B (itself referring to A) should fail during close,
+/// as B itself would be left unconnected.
+#[apply(directory_services)]
+#[tokio::test]
+async fn upload_reject_unconnected(directory_service: impl DirectoryService) {
+    let mut handle = directory_service.put_multiple_start();
+
+    handle.put(DIRECTORY_A.clone()).await.unwrap();
+    handle.put(DIRECTORY_C.clone()).await.unwrap();
+    handle.put(DIRECTORY_B.clone()).await.unwrap();
+
+    assert!(
+        handle.close().await.is_err(),
+        "closing handle should fail, as B would be left unconnected"
+    );
+}
+
+/// Uploading a directory that refers to another directory not yet uploaded
+/// should fail.
+#[apply(directory_services)]
+#[tokio::test]
+async fn upload_reject_dangling_pointer(directory_service: impl DirectoryService) {
+    let mut handle = directory_service.put_multiple_start();
+
+    // We insert DIRECTORY_A on its own, to ensure the check runs for the
+    // individual put_multiple session, not across the global DirectoryService
+    // contents.
+    directory_service.put(DIRECTORY_A.clone()).await.unwrap();
+
+    // DIRECTORY_B refers to DIRECTORY_A, which is not uploaded with this handle.
+    if handle.put(DIRECTORY_B.clone()).await.is_ok() {
+        assert!(
+            handle.close().await.is_err(),
+            "when succeeding put, close must fail"
+        )
+    }
+}
+
+/// Try uploading a Directory failing its internal validation, ensure it gets
+/// rejected.
+#[apply(directory_services)]
+#[tokio::test]
+async fn upload_reject_failing_validation(directory_service: impl DirectoryService) {
+    let broken_directory = Directory {
+        symlinks: vec![proto::SymlinkNode {
+            name: "".into(), // wrong!
+            target: "doesntmatter".into(),
+        }],
+        ..Default::default()
+    };
+    assert!(broken_directory.validate().is_err());
+
+    // Try to upload via single upload.
+    assert!(
+        directory_service
+            .put(broken_directory.clone())
+            .await
+            .is_err(),
+        "single upload must fail"
+    );
+
+    // Try to upload via put_multiple. We're a bit more permissive here, the
+    // intermediate .put() might succeed, due to client-side bursting (in the
+    // case of gRPC), but then the close MUST fail.
+    let mut handle = directory_service.put_multiple_start();
+    if handle.put(broken_directory).await.is_ok() {
+        assert!(
+            handle.close().await.is_err(),
+            "when succeeding put, close must fail"
+        )
+    }
+}
+
+/// Try uploading a Directory that refers to a previously-uploaded directory.
+/// Both pass their isolated validation, but the size field in the parent is wrong.
+/// This should be rejected.
+#[apply(directory_services)]
+#[tokio::test]
+async fn upload_reject_wrong_size(directory_service: impl DirectoryService) {
+    let wrong_parent_directory = Directory {
+        directories: vec![proto::DirectoryNode {
+            name: "foo".into(),
+            digest: DIRECTORY_A.digest().into(),
+            size: DIRECTORY_A.size() + 42, // wrong!
+        }],
+        ..Default::default()
+    };
+
+    // Make sure isolated validation itself is ok
+    assert!(wrong_parent_directory.validate().is_ok());
+
+    // Now upload both. Ensure it either fails during the second put, or during
+    // the close.
+    let mut handle = directory_service.put_multiple_start();
+    handle.put(DIRECTORY_A.clone()).await.unwrap();
+    if handle.put(wrong_parent_directory).await.is_ok() {
+        assert!(
+            handle.close().await.is_err(),
+            "when second put succeeds, close must fail"
+        )
+    }
+}
diff --git a/tvix/castore/src/directoryservice/tests/utils.rs b/tvix/castore/src/directoryservice/tests/utils.rs
new file mode 100644
index 0000000000..0f706695ee
--- /dev/null
+++ b/tvix/castore/src/directoryservice/tests/utils.rs
@@ -0,0 +1,46 @@
+use crate::directoryservice::{DirectoryService, GRPCDirectoryService};
+use crate::proto::directory_service_client::DirectoryServiceClient;
+use crate::proto::GRPCDirectoryServiceWrapper;
+use crate::{
+    directoryservice::MemoryDirectoryService,
+    proto::directory_service_server::DirectoryServiceServer,
+};
+
+use tonic::transport::{Endpoint, Server, Uri};
+
+/// Constructs and returns a gRPC DirectoryService.
+/// The server part is a [MemoryDirectoryService], exposed via the
+/// [GRPCDirectoryServiceWrapper], and connected through a DuplexStream.
+pub async fn make_grpc_directory_service_client() -> Box<dyn DirectoryService> {
+    let (left, right) = tokio::io::duplex(64);
+
+    // spin up a server, which will only connect once, to the left side.
+    tokio::spawn(async {
+        let directory_service =
+            Box::<MemoryDirectoryService>::default() as Box<dyn DirectoryService>;
+
+        let mut server = Server::builder();
+        let router = server.add_service(DirectoryServiceServer::new(
+            GRPCDirectoryServiceWrapper::new(directory_service),
+        ));
+
+        router
+            .serve_with_incoming(tokio_stream::once(Ok::<_, std::io::Error>(left)))
+            .await
+    });
+
+    // Create a client, connecting to the right side. The URI is unused.
+    let mut maybe_right = Some(right);
+    Box::new(GRPCDirectoryService::from_client(
+        DirectoryServiceClient::new(
+            Endpoint::try_from("http://[::]:50051")
+                .unwrap()
+                .connect_with_connector(tower::service_fn(move |_: Uri| {
+                    let right = maybe_right.take().unwrap();
+                    async move { Ok::<_, std::io::Error>(right) }
+                }))
+                .await
+                .unwrap(),
+        ),
+    ))
+}
diff --git a/tvix/castore/src/directoryservice/traverse.rs b/tvix/castore/src/directoryservice/traverse.rs
new file mode 100644
index 0000000000..17a51ae2bb
--- /dev/null
+++ b/tvix/castore/src/directoryservice/traverse.rs
@@ -0,0 +1,186 @@
+use super::DirectoryService;
+use crate::{
+    proto::{node::Node, NamedNode},
+    B3Digest, Error, Path,
+};
+use tracing::{instrument, warn};
+
+/// This descends from a (root) node to the given (sub)path, returning the Node
+/// at that path, or none, if there's nothing at that path.
+#[instrument(skip(directory_service, path), fields(%path))]
+pub async fn descend_to<DS>(
+    directory_service: DS,
+    root_node: Node,
+    path: impl AsRef<Path> + std::fmt::Display,
+) -> Result<Option<Node>, Error>
+where
+    DS: AsRef<dyn DirectoryService>,
+{
+    let mut parent_node = root_node;
+    for component in path.as_ref().components() {
+        match parent_node {
+            Node::File(_) | Node::Symlink(_) => {
+                // There's still some path left, but the parent node is no directory.
+                // This means the path doesn't exist, as we can't reach it.
+                return Ok(None);
+            }
+            Node::Directory(directory_node) => {
+                let digest: B3Digest = directory_node
+                    .digest
+                    .try_into()
+                    .map_err(|_e| Error::StorageError("invalid digest length".to_string()))?;
+
+                // fetch the linked node from the directory_service.
+                let directory =
+                    directory_service
+                        .as_ref()
+                        .get(&digest)
+                        .await?
+                        .ok_or_else(|| {
+                            // If we didn't get the directory node that's linked, that's a store inconsistency, bail out!
+                            warn!("directory {} does not exist", digest);
+
+                            Error::StorageError(format!("directory {} does not exist", digest))
+                        })?;
+
+                // look for the component in the [Directory].
+                // FUTUREWORK: as the nodes() iterator returns in a sorted fashion, we
+                // could stop as soon as e.name is larger than the search string.
+                if let Some(child_node) = directory.nodes().find(|n| n.get_name() == component) {
+                    // child node found, update prev_node to that and continue.
+                    parent_node = child_node;
+                } else {
+                    // child node not found means there's no such element inside the directory.
+                    return Ok(None);
+                }
+            }
+        }
+    }
+
+    // We traversed the entire path, so this must be the node.
+    Ok(Some(parent_node))
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        directoryservice,
+        fixtures::{DIRECTORY_COMPLICATED, DIRECTORY_WITH_KEEP},
+        PathBuf,
+    };
+
+    use super::descend_to;
+
+    #[tokio::test]
+    async fn test_descend_to() {
+        let directory_service = directoryservice::from_addr("memory://").await.unwrap();
+
+        let mut handle = directory_service.put_multiple_start();
+        handle
+            .put(DIRECTORY_WITH_KEEP.clone())
+            .await
+            .expect("must succeed");
+        handle
+            .put(DIRECTORY_COMPLICATED.clone())
+            .await
+            .expect("must succeed");
+
+        handle.close().await.expect("must upload");
+
+        // construct the node for DIRECTORY_COMPLICATED
+        let node_directory_complicated =
+            crate::proto::node::Node::Directory(crate::proto::DirectoryNode {
+                name: "doesntmatter".into(),
+                digest: DIRECTORY_COMPLICATED.digest().into(),
+                size: DIRECTORY_COMPLICATED.size(),
+            });
+
+        // construct the node for DIRECTORY_COMPLICATED
+        let node_directory_with_keep = crate::proto::node::Node::Directory(
+            DIRECTORY_COMPLICATED.directories.first().unwrap().clone(),
+        );
+
+        // construct the node for the .keep file
+        let node_file_keep =
+            crate::proto::node::Node::File(DIRECTORY_WITH_KEEP.files.first().unwrap().clone());
+
+        // traversal to an empty subpath should return the root node.
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(Some(node_directory_complicated.clone()), resp);
+        }
+
+        // traversal to `keep` should return the node for DIRECTORY_WITH_KEEP
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "keep".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(Some(node_directory_with_keep), resp);
+        }
+
+        // traversal to `keep/.keep` should return the node for the .keep file
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "keep/.keep".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(Some(node_file_keep.clone()), resp);
+        }
+
+        // traversal to `void` should return None (doesn't exist)
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "void".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(None, resp);
+        }
+
+        // traversal to `v/oid` should return None (doesn't exist)
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "v/oid".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(None, resp);
+        }
+
+        // traversal to `keep/.keep/404` should return None (the path can't be
+        // reached, as keep/.keep already is a file)
+        {
+            let resp = descend_to(
+                &directory_service,
+                node_directory_complicated.clone(),
+                "keep/.keep/foo".parse::<PathBuf>().unwrap(),
+            )
+            .await
+            .expect("must succeed");
+
+            assert_eq!(None, resp);
+        }
+    }
+}
diff --git a/tvix/castore/src/directoryservice/utils.rs b/tvix/castore/src/directoryservice/utils.rs
new file mode 100644
index 0000000000..a0ba395ecd
--- /dev/null
+++ b/tvix/castore/src/directoryservice/utils.rs
@@ -0,0 +1,77 @@
+use super::DirectoryService;
+use crate::proto;
+use crate::B3Digest;
+use crate::Error;
+use async_stream::try_stream;
+use futures::stream::BoxStream;
+use std::collections::{HashSet, VecDeque};
+use tracing::instrument;
+use tracing::warn;
+
+/// Traverses a [proto::Directory] from the root to the children.
+///
+/// This is mostly BFS, but directories are only returned once.
+#[instrument(skip(directory_service))]
+pub fn traverse_directory<'a, DS: DirectoryService + 'static>(
+    directory_service: DS,
+    root_directory_digest: &B3Digest,
+) -> BoxStream<'a, Result<proto::Directory, Error>> {
+    // The list of all directories that still need to be traversed. The next
+    // element is picked from the front, new elements are enqueued at the
+    // back.
+    let mut worklist_directory_digests: VecDeque<B3Digest> =
+        VecDeque::from([root_directory_digest.clone()]);
+    // The list of directory digests already sent to the consumer.
+    // We omit sending the same directories multiple times.
+    let mut sent_directory_digests: HashSet<B3Digest> = HashSet::new();
+
+    Box::pin(try_stream! {
+        while let Some(current_directory_digest) = worklist_directory_digests.pop_front() {
+            let current_directory = directory_service.get(&current_directory_digest).await.map_err(|e| {
+                warn!("failed to look up directory");
+                Error::StorageError(format!(
+                    "unable to look up directory {}: {}",
+                    current_directory_digest, e
+                ))
+            })?.ok_or_else(|| {
+                // if it's not there, we have an inconsistent store!
+                warn!("directory {} does not exist", current_directory_digest);
+                Error::StorageError(format!(
+                    "directory {} does not exist",
+                    current_directory_digest
+                ))
+
+            })?;
+
+            // validate, we don't want to send invalid directories.
+            current_directory.validate().map_err(|e| {
+               warn!("directory failed validation: {}", e.to_string());
+               Error::StorageError(format!(
+                   "invalid directory: {}",
+                   current_directory_digest
+               ))
+            })?;
+
+            // We're about to send this directory, so let's avoid sending it again if a
+            // descendant has it.
+            sent_directory_digests.insert(current_directory_digest);
+
+            // enqueue all child directory digests to the work queue, as
+            // long as they're not part of the worklist or already sent.
+            // This panics if the digest looks invalid, it's supposed to be checked first.
+            for child_directory_node in &current_directory.directories {
+                // TODO: propagate error
+                let child_digest: B3Digest = child_directory_node.digest.clone().try_into().unwrap();
+
+                if worklist_directory_digests.contains(&child_digest)
+                    || sent_directory_digests.contains(&child_digest)
+                {
+                    continue;
+                }
+                worklist_directory_digests.push_back(child_digest);
+            }
+
+            yield current_directory;
+        }
+    })
+}
diff --git a/tvix/castore/src/errors.rs b/tvix/castore/src/errors.rs
new file mode 100644
index 0000000000..8343d0774a
--- /dev/null
+++ b/tvix/castore/src/errors.rs
@@ -0,0 +1,54 @@
+use thiserror::Error;
+use tokio::task::JoinError;
+use tonic::Status;
+
+/// Errors related to communication with the store.
+#[derive(Debug, Error, PartialEq)]
+pub enum Error {
+    #[error("invalid request: {0}")]
+    InvalidRequest(String),
+
+    #[error("internal storage error: {0}")]
+    StorageError(String),
+}
+
+impl From<JoinError> for Error {
+    fn from(value: JoinError) -> Self {
+        Error::StorageError(value.to_string())
+    }
+}
+
+impl From<Error> for Status {
+    fn from(value: Error) -> Self {
+        match value {
+            Error::InvalidRequest(msg) => Status::invalid_argument(msg),
+            Error::StorageError(msg) => Status::data_loss(format!("storage error: {}", msg)),
+        }
+    }
+}
+
+impl From<crate::tonic::Error> for Error {
+    fn from(value: crate::tonic::Error) -> Self {
+        Self::StorageError(value.to_string())
+    }
+}
+
+impl From<std::io::Error> for Error {
+    fn from(value: std::io::Error) -> Self {
+        if value.kind() == std::io::ErrorKind::InvalidInput {
+            Error::InvalidRequest(value.to_string())
+        } else {
+            Error::StorageError(value.to_string())
+        }
+    }
+}
+
+// TODO: this should probably go somewhere else?
+impl From<Error> for std::io::Error {
+    fn from(value: Error) -> Self {
+        match value {
+            Error::InvalidRequest(msg) => Self::new(std::io::ErrorKind::InvalidInput, msg),
+            Error::StorageError(msg) => Self::new(std::io::ErrorKind::Other, msg),
+        }
+    }
+}
diff --git a/tvix/castore/src/fixtures.rs b/tvix/castore/src/fixtures.rs
new file mode 100644
index 0000000000..a206d9b7dd
--- /dev/null
+++ b/tvix/castore/src/fixtures.rs
@@ -0,0 +1,88 @@
+use crate::{
+    proto::{self, Directory, DirectoryNode, FileNode, SymlinkNode},
+    B3Digest,
+};
+use lazy_static::lazy_static;
+
+pub const HELLOWORLD_BLOB_CONTENTS: &[u8] = b"Hello World!";
+pub const EMPTY_BLOB_CONTENTS: &[u8] = b"";
+
+lazy_static! {
+    pub static ref DUMMY_DIGEST: B3Digest = {
+        let u = [0u8; 32];
+        (&u).into()
+    };
+    pub static ref DUMMY_DIGEST_2: B3Digest = {
+        let mut u = [0u8; 32];
+        u[0] = 0x10;
+        (&u).into()
+    };
+    pub static ref DUMMY_DATA_1: bytes::Bytes = vec![0x01, 0x02, 0x03].into();
+    pub static ref DUMMY_DATA_2: bytes::Bytes = vec![0x04, 0x05].into();
+
+    pub static ref HELLOWORLD_BLOB_DIGEST: B3Digest =
+        blake3::hash(HELLOWORLD_BLOB_CONTENTS).as_bytes().into();
+    pub static ref EMPTY_BLOB_DIGEST: B3Digest =
+        blake3::hash(EMPTY_BLOB_CONTENTS).as_bytes().into();
+
+    // 2 bytes
+    pub static ref BLOB_A: bytes::Bytes = vec![0x00, 0x01].into();
+    pub static ref BLOB_A_DIGEST: B3Digest = blake3::hash(&BLOB_A).as_bytes().into();
+
+    // 1MB
+    pub static ref BLOB_B: bytes::Bytes = (0..255).collect::<Vec<u8>>().repeat(4 * 1024).into();
+    pub static ref BLOB_B_DIGEST: B3Digest = blake3::hash(&BLOB_B).as_bytes().into();
+
+    // Directories
+    pub static ref DIRECTORY_WITH_KEEP: proto::Directory = proto::Directory {
+        directories: vec![],
+        files: vec![FileNode {
+            name: b".keep".to_vec().into(),
+            digest: EMPTY_BLOB_DIGEST.clone().into(),
+            size: 0,
+            executable: false,
+        }],
+        symlinks: vec![],
+    };
+    pub static ref DIRECTORY_COMPLICATED: proto::Directory = proto::Directory {
+        directories: vec![DirectoryNode {
+            name: b"keep".to_vec().into(),
+            digest: DIRECTORY_WITH_KEEP.digest().into(),
+            size: DIRECTORY_WITH_KEEP.size(),
+        }],
+        files: vec![FileNode {
+            name: b".keep".to_vec().into(),
+            digest: EMPTY_BLOB_DIGEST.clone().into(),
+            size: 0,
+            executable: false,
+        }],
+        symlinks: vec![SymlinkNode {
+            name: b"aa".to_vec().into(),
+            target: b"/nix/store/somewhereelse".to_vec().into(),
+        }],
+    };
+    pub static ref DIRECTORY_A: Directory = Directory::default();
+    pub static ref DIRECTORY_B: Directory = Directory {
+        directories: vec![DirectoryNode {
+            name: b"a".to_vec().into(),
+            digest: DIRECTORY_A.digest().into(),
+            size: DIRECTORY_A.size(),
+        }],
+        ..Default::default()
+    };
+    pub static ref DIRECTORY_C: Directory = Directory {
+        directories: vec![
+            DirectoryNode {
+                name: b"a".to_vec().into(),
+                digest: DIRECTORY_A.digest().into(),
+                size: DIRECTORY_A.size(),
+            },
+            DirectoryNode {
+                name: b"a'".to_vec().into(),
+                digest: DIRECTORY_A.digest().into(),
+                size: DIRECTORY_A.size(),
+            }
+        ],
+        ..Default::default()
+    };
+}
diff --git a/tvix/castore/src/fs/file_attr.rs b/tvix/castore/src/fs/file_attr.rs
new file mode 100644
index 0000000000..2e0e70e3cd
--- /dev/null
+++ b/tvix/castore/src/fs/file_attr.rs
@@ -0,0 +1,29 @@
+#![allow(clippy::unnecessary_cast)] // libc::S_IFDIR is u32 on Linux and u16 on MacOS
+
+use fuse_backend_rs::abi::fuse_abi::Attr;
+
+/// The [Attr] describing the root
+pub const ROOT_FILE_ATTR: Attr = Attr {
+    ino: fuse_backend_rs::api::filesystem::ROOT_ID,
+    size: 0,
+    blksize: 1024,
+    blocks: 0,
+    mode: libc::S_IFDIR as u32 | 0o555,
+    atime: 0,
+    mtime: 0,
+    ctime: 0,
+    atimensec: 0,
+    mtimensec: 0,
+    ctimensec: 0,
+    nlink: 0,
+    uid: 0,
+    gid: 0,
+    rdev: 0,
+    flags: 0,
+    #[cfg(target_os = "macos")]
+    crtime: 0,
+    #[cfg(target_os = "macos")]
+    crtimensec: 0,
+    #[cfg(target_os = "macos")]
+    padding: 0,
+};
diff --git a/tvix/castore/src/fs/fuse/mod.rs b/tvix/castore/src/fs/fuse/mod.rs
new file mode 100644
index 0000000000..94b73d422a
--- /dev/null
+++ b/tvix/castore/src/fs/fuse/mod.rs
@@ -0,0 +1,123 @@
+use std::{io, path::Path, sync::Arc, thread};
+
+use fuse_backend_rs::{api::filesystem::FileSystem, transport::FuseSession};
+use tracing::{error, instrument};
+
+#[cfg(test)]
+mod tests;
+
+struct FuseServer<FS>
+where
+    FS: FileSystem + Sync + Send,
+{
+    server: Arc<fuse_backend_rs::api::server::Server<Arc<FS>>>,
+    channel: fuse_backend_rs::transport::FuseChannel,
+}
+
+#[cfg(target_os = "macos")]
+const BADFD: libc::c_int = libc::EBADF;
+#[cfg(target_os = "linux")]
+const BADFD: libc::c_int = libc::EBADFD;
+
+impl<FS> FuseServer<FS>
+where
+    FS: FileSystem + Sync + Send,
+{
+    fn start(&mut self) -> io::Result<()> {
+        while let Some((reader, writer)) = self
+            .channel
+            .get_request()
+            .map_err(|_| io::Error::from_raw_os_error(libc::EINVAL))?
+        {
+            if let Err(e) = self
+                .server
+                .handle_message(reader, writer.into(), None, None)
+            {
+                match e {
+                    // This indicates the session has been shut down.
+                    fuse_backend_rs::Error::EncodeMessage(e) if e.raw_os_error() == Some(BADFD) => {
+                        break;
+                    }
+                    error => {
+                        error!(?error, "failed to handle fuse request");
+                        continue;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+pub struct FuseDaemon {
+    session: FuseSession,
+    threads: Vec<thread::JoinHandle<()>>,
+}
+
+impl FuseDaemon {
+    #[instrument(skip(fs, mountpoint), fields(mountpoint=?mountpoint), err)]
+    pub fn new<FS, P>(
+        fs: FS,
+        mountpoint: P,
+        threads: usize,
+        allow_other: bool,
+    ) -> Result<Self, io::Error>
+    where
+        FS: FileSystem + Sync + Send + 'static,
+        P: AsRef<Path> + std::fmt::Debug,
+    {
+        let server = Arc::new(fuse_backend_rs::api::server::Server::new(Arc::new(fs)));
+
+        let mut session = FuseSession::new(mountpoint.as_ref(), "tvix-store", "", true)
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        #[cfg(target_os = "linux")]
+        session.set_allow_other(allow_other);
+        session
+            .mount()
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+        let mut join_handles = Vec::with_capacity(threads);
+        for _ in 0..threads {
+            let mut server = FuseServer {
+                server: server.clone(),
+                channel: session
+                    .new_channel()
+                    .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?,
+            };
+            let join_handle = thread::Builder::new()
+                .name("fuse_server".to_string())
+                .spawn(move || {
+                    let _ = server.start();
+                })?;
+            join_handles.push(join_handle);
+        }
+
+        Ok(FuseDaemon {
+            session,
+            threads: join_handles,
+        })
+    }
+
+    #[instrument(skip_all, err)]
+    pub fn unmount(&mut self) -> Result<(), io::Error> {
+        self.session
+            .umount()
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+        for thread in self.threads.drain(..) {
+            thread.join().map_err(|_| {
+                io::Error::new(io::ErrorKind::Other, "failed to join fuse server thread")
+            })?;
+        }
+
+        Ok(())
+    }
+}
+
+impl Drop for FuseDaemon {
+    fn drop(&mut self) {
+        if let Err(error) = self.unmount() {
+            error!(?error, "failed to unmont fuse filesystem")
+        }
+    }
+}
diff --git a/tvix/castore/src/fs/fuse/tests.rs b/tvix/castore/src/fs/fuse/tests.rs
new file mode 100644
index 0000000000..bb321f5888
--- /dev/null
+++ b/tvix/castore/src/fs/fuse/tests.rs
@@ -0,0 +1,1245 @@
+use bstr::ByteSlice;
+use bytes::Bytes;
+use std::{
+    collections::BTreeMap,
+    ffi::{OsStr, OsString},
+    io::{self, Cursor},
+    os::unix::{ffi::OsStrExt, fs::MetadataExt},
+    path::Path,
+    sync::Arc,
+};
+use tempfile::TempDir;
+use tokio_stream::{wrappers::ReadDirStream, StreamExt};
+
+use super::FuseDaemon;
+use crate::fs::{TvixStoreFs, XATTR_NAME_BLOB_DIGEST, XATTR_NAME_DIRECTORY_DIGEST};
+use crate::proto as castorepb;
+use crate::proto::node::Node;
+use crate::{
+    blobservice::{BlobService, MemoryBlobService},
+    directoryservice::{DirectoryService, MemoryDirectoryService},
+    fixtures,
+};
+
+const BLOB_A_NAME: &str = "00000000000000000000000000000000-test";
+const BLOB_B_NAME: &str = "55555555555555555555555555555555-test";
+const HELLOWORLD_BLOB_NAME: &str = "66666666666666666666666666666666-test";
+const SYMLINK_NAME: &str = "11111111111111111111111111111111-test";
+const SYMLINK_NAME2: &str = "44444444444444444444444444444444-test";
+const DIRECTORY_WITH_KEEP_NAME: &str = "22222222222222222222222222222222-test";
+const DIRECTORY_COMPLICATED_NAME: &str = "33333333333333333333333333333333-test";
+
+fn gen_svcs() -> (Arc<dyn BlobService>, Arc<dyn DirectoryService>) {
+    (
+        Arc::new(MemoryBlobService::default()) as Arc<dyn BlobService>,
+        Arc::new(MemoryDirectoryService::default()) as Arc<dyn DirectoryService>,
+    )
+}
+
+fn do_mount<P: AsRef<Path>, BS, DS>(
+    blob_service: BS,
+    directory_service: DS,
+    root_nodes: BTreeMap<bytes::Bytes, Node>,
+    mountpoint: P,
+    list_root: bool,
+    show_xattr: bool,
+) -> io::Result<FuseDaemon>
+where
+    BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static,
+    DS: AsRef<dyn DirectoryService> + Send + Sync + Clone + 'static,
+{
+    let fs = TvixStoreFs::new(
+        blob_service,
+        directory_service,
+        Arc::new(root_nodes),
+        list_root,
+        show_xattr,
+    );
+    FuseDaemon::new(Arc::new(fs), mountpoint.as_ref(), 4, false)
+}
+
+async fn populate_blob_a(
+    blob_service: &Arc<dyn BlobService>,
+    root_nodes: &mut BTreeMap<Bytes, Node>,
+) {
+    let mut bw = blob_service.open_write().await;
+    tokio::io::copy(&mut Cursor::new(fixtures::BLOB_A.to_vec()), &mut bw)
+        .await
+        .expect("must succeed uploading");
+    bw.close().await.expect("must succeed closing");
+
+    root_nodes.insert(
+        BLOB_A_NAME.into(),
+        Node::File(castorepb::FileNode {
+            name: BLOB_A_NAME.into(),
+            digest: fixtures::BLOB_A_DIGEST.clone().into(),
+            size: fixtures::BLOB_A.len() as u64,
+            executable: false,
+        }),
+    );
+}
+
+async fn populate_blob_b(
+    blob_service: &Arc<dyn BlobService>,
+    root_nodes: &mut BTreeMap<Bytes, Node>,
+) {
+    let mut bw = blob_service.open_write().await;
+    tokio::io::copy(&mut Cursor::new(fixtures::BLOB_B.to_vec()), &mut bw)
+        .await
+        .expect("must succeed uploading");
+    bw.close().await.expect("must succeed closing");
+
+    root_nodes.insert(
+        BLOB_B_NAME.into(),
+        Node::File(castorepb::FileNode {
+            name: BLOB_B_NAME.into(),
+            digest: fixtures::BLOB_B_DIGEST.clone().into(),
+            size: fixtures::BLOB_B.len() as u64,
+            executable: false,
+        }),
+    );
+}
+
+/// adds a blob containing helloworld and marks it as executable
+async fn populate_blob_helloworld(
+    blob_service: &Arc<dyn BlobService>,
+    root_nodes: &mut BTreeMap<Bytes, Node>,
+) {
+    let mut bw = blob_service.open_write().await;
+    tokio::io::copy(
+        &mut Cursor::new(fixtures::HELLOWORLD_BLOB_CONTENTS.to_vec()),
+        &mut bw,
+    )
+    .await
+    .expect("must succeed uploading");
+    bw.close().await.expect("must succeed closing");
+
+    root_nodes.insert(
+        HELLOWORLD_BLOB_NAME.into(),
+        Node::File(castorepb::FileNode {
+            name: HELLOWORLD_BLOB_NAME.into(),
+            digest: fixtures::HELLOWORLD_BLOB_DIGEST.clone().into(),
+            size: fixtures::HELLOWORLD_BLOB_CONTENTS.len() as u64,
+            executable: true,
+        }),
+    );
+}
+
+async fn populate_symlink(root_nodes: &mut BTreeMap<Bytes, Node>) {
+    root_nodes.insert(
+        SYMLINK_NAME.into(),
+        Node::Symlink(castorepb::SymlinkNode {
+            name: SYMLINK_NAME.into(),
+            target: BLOB_A_NAME.into(),
+        }),
+    );
+}
+
+/// This writes a symlink pointing to /nix/store/somewhereelse,
+/// which is the same symlink target as "aa" inside DIRECTORY_COMPLICATED.
+async fn populate_symlink2(root_nodes: &mut BTreeMap<Bytes, Node>) {
+    root_nodes.insert(
+        SYMLINK_NAME2.into(),
+        Node::Symlink(castorepb::SymlinkNode {
+            name: SYMLINK_NAME2.into(),
+            target: "/nix/store/somewhereelse".into(),
+        }),
+    );
+}
+
+async fn populate_directory_with_keep(
+    blob_service: &Arc<dyn BlobService>,
+    directory_service: &Arc<dyn DirectoryService>,
+    root_nodes: &mut BTreeMap<Bytes, Node>,
+) {
+    // upload empty blob
+    let mut bw = blob_service.open_write().await;
+    assert_eq!(
+        fixtures::EMPTY_BLOB_DIGEST.as_slice(),
+        bw.close().await.expect("must succeed closing").as_slice(),
+    );
+
+    // upload directory
+    directory_service
+        .put(fixtures::DIRECTORY_WITH_KEEP.clone())
+        .await
+        .expect("must succeed uploading");
+
+    root_nodes.insert(
+        DIRECTORY_WITH_KEEP_NAME.into(),
+        castorepb::node::Node::Directory(castorepb::DirectoryNode {
+            name: DIRECTORY_WITH_KEEP_NAME.into(),
+            digest: fixtures::DIRECTORY_WITH_KEEP.digest().into(),
+            size: fixtures::DIRECTORY_WITH_KEEP.size(),
+        }),
+    );
+}
+
+/// Create a root node for DIRECTORY_WITH_KEEP, but don't upload the Directory
+/// itself.
+async fn populate_directorynode_without_directory(root_nodes: &mut BTreeMap<Bytes, Node>) {
+    root_nodes.insert(
+        DIRECTORY_WITH_KEEP_NAME.into(),
+        castorepb::node::Node::Directory(castorepb::DirectoryNode {
+            name: DIRECTORY_WITH_KEEP_NAME.into(),
+            digest: fixtures::DIRECTORY_WITH_KEEP.digest().into(),
+            size: fixtures::DIRECTORY_WITH_KEEP.size(),
+        }),
+    );
+}
+
+/// Insert BLOB_A, but don't provide the blob .keep is pointing to.
+async fn populate_filenode_without_blob(root_nodes: &mut BTreeMap<Bytes, Node>) {
+    root_nodes.insert(
+        BLOB_A_NAME.into(),
+        Node::File(castorepb::FileNode {
+            name: BLOB_A_NAME.into(),
+            digest: fixtures::BLOB_A_DIGEST.clone().into(),
+            size: fixtures::BLOB_A.len() as u64,
+            executable: false,
+        }),
+    );
+}
+
+async fn populate_directory_complicated(
+    blob_service: &Arc<dyn BlobService>,
+    directory_service: &Arc<dyn DirectoryService>,
+    root_nodes: &mut BTreeMap<Bytes, Node>,
+) {
+    // upload empty blob
+    let mut bw = blob_service.open_write().await;
+    assert_eq!(
+        fixtures::EMPTY_BLOB_DIGEST.as_slice(),
+        bw.close().await.expect("must succeed closing").as_slice(),
+    );
+
+    // upload inner directory
+    directory_service
+        .put(fixtures::DIRECTORY_WITH_KEEP.clone())
+        .await
+        .expect("must succeed uploading");
+
+    // upload parent directory
+    directory_service
+        .put(fixtures::DIRECTORY_COMPLICATED.clone())
+        .await
+        .expect("must succeed uploading");
+
+    root_nodes.insert(
+        DIRECTORY_COMPLICATED_NAME.into(),
+        Node::Directory(castorepb::DirectoryNode {
+            name: DIRECTORY_COMPLICATED_NAME.into(),
+            digest: fixtures::DIRECTORY_COMPLICATED.digest().into(),
+            size: fixtures::DIRECTORY_COMPLICATED.size(),
+        }),
+    );
+}
+
+/// Ensure mounting itself doesn't fail
+#[tokio::test]
+async fn mount() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        BTreeMap::default(),
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    fuse_daemon.unmount().expect("unmount");
+}
+/// Ensure listing the root isn't allowed
+#[tokio::test]
+async fn root() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        BTreeMap::default(),
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    {
+        // read_dir fails (as opendir fails).
+        let err = tokio::fs::read_dir(tmpdir).await.expect_err("must fail");
+        assert_eq!(std::io::ErrorKind::PermissionDenied, err.kind());
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure listing the root is allowed if configured explicitly
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn root_with_listing() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        true, /* allow listing */
+        false,
+    )
+    .expect("must succeed");
+
+    {
+        // read_dir succeeds, but getting the first element will fail.
+        let mut it = ReadDirStream::new(tokio::fs::read_dir(tmpdir).await.expect("must succeed"));
+
+        let e = it
+            .next()
+            .await
+            .expect("must be some")
+            .expect("must succeed");
+
+        let metadata = e.metadata().await.expect("must succeed");
+        assert!(metadata.is_file());
+        assert!(metadata.permissions().readonly());
+        assert_eq!(fixtures::BLOB_A.len() as u64, metadata.len());
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure we can stat a file at the root
+#[tokio::test]
+async fn stat_file_at_root() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(BLOB_A_NAME);
+
+    // peek at the file metadata
+    let metadata = tokio::fs::metadata(p).await.expect("must succeed");
+
+    assert!(metadata.is_file());
+    assert!(metadata.permissions().readonly());
+    assert_eq!(fixtures::BLOB_A.len() as u64, metadata.len());
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure we can read a file at the root
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn read_file_at_root() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(BLOB_A_NAME);
+
+    // read the file contents
+    let data = tokio::fs::read(p).await.expect("must succeed");
+
+    // ensure size and contents match
+    assert_eq!(fixtures::BLOB_A.len(), data.len());
+    assert_eq!(fixtures::BLOB_A.to_vec(), data);
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure we can read a large file at the root
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn read_large_file_at_root() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_b(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(BLOB_B_NAME);
+    {
+        // peek at the file metadata
+        let metadata = tokio::fs::metadata(&p).await.expect("must succeed");
+
+        assert!(metadata.is_file());
+        assert!(metadata.permissions().readonly());
+        assert_eq!(fixtures::BLOB_B.len() as u64, metadata.len());
+    }
+
+    // read the file contents
+    let data = tokio::fs::read(p).await.expect("must succeed");
+
+    // ensure size and contents match
+    assert_eq!(fixtures::BLOB_B.len(), data.len());
+    assert_eq!(fixtures::BLOB_B.to_vec(), data);
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Read the target of a symlink
+#[tokio::test]
+async fn symlink_readlink() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_symlink(&mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(SYMLINK_NAME);
+
+    let target = tokio::fs::read_link(&p).await.expect("must succeed");
+    assert_eq!(BLOB_A_NAME, target.to_str().unwrap());
+
+    // peek at the file metadata, which follows symlinks.
+    // this must fail, as we didn't populate the target.
+    let e = tokio::fs::metadata(&p).await.expect_err("must fail");
+    assert_eq!(std::io::ErrorKind::NotFound, e.kind());
+
+    // peeking at the file metadata without following symlinks will succeed.
+    let metadata = tokio::fs::symlink_metadata(&p).await.expect("must succeed");
+    assert!(metadata.is_symlink());
+
+    // reading from the symlink (which follows) will fail, because the target doesn't exist.
+    let e = tokio::fs::read(p).await.expect_err("must fail");
+    assert_eq!(std::io::ErrorKind::NotFound, e.kind());
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Read and stat a regular file through a symlink pointing to it.
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+async fn read_stat_through_symlink() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+    populate_symlink(&mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p_symlink = tmpdir.path().join(SYMLINK_NAME);
+    let p_blob = tmpdir.path().join(SYMLINK_NAME);
+
+    // peek at the file metadata, which follows symlinks.
+    // this must now return the same metadata as when statting at the target directly.
+    let metadata_symlink = tokio::fs::metadata(&p_symlink).await.expect("must succeed");
+    let metadata_blob = tokio::fs::metadata(&p_blob).await.expect("must succeed");
+    assert_eq!(metadata_blob.file_type(), metadata_symlink.file_type());
+    assert_eq!(metadata_blob.len(), metadata_symlink.len());
+
+    // reading from the symlink (which follows) will return the same data as if
+    // we were reading from the file directly.
+    assert_eq!(
+        tokio::fs::read(p_blob).await.expect("must succeed"),
+        tokio::fs::read(p_symlink).await.expect("must succeed"),
+    );
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Read a directory in the root, and validate some attributes.
+#[tokio::test]
+async fn read_stat_directory() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME);
+
+    // peek at the metadata of the directory
+    let metadata = tokio::fs::metadata(p).await.expect("must succeed");
+    assert!(metadata.is_dir());
+    assert!(metadata.permissions().readonly());
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Read a directory and file in the root, and ensure the xattrs expose blob or
+/// directory digests.
+#[tokio::test]
+async fn xattr() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await;
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        true, /* support xattr */
+    )
+    .expect("must succeed");
+
+    // peek at the directory
+    {
+        let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME);
+
+        let xattr_names: Vec<OsString> = xattr::list(&p).expect("must succeed").collect();
+        // There should be 1 key, XATTR_NAME_DIRECTORY_DIGEST.
+        assert_eq!(1, xattr_names.len(), "there should be 1 xattr name");
+        assert_eq!(
+            XATTR_NAME_DIRECTORY_DIGEST,
+            xattr_names.first().unwrap().as_encoded_bytes()
+        );
+
+        // The key should equal to the string-formatted b3 digest.
+        let val = xattr::get(&p, OsStr::from_bytes(XATTR_NAME_DIRECTORY_DIGEST))
+            .expect("must succeed")
+            .expect("must be some");
+        assert_eq!(
+            fixtures::DIRECTORY_WITH_KEEP
+                .digest()
+                .to_string()
+                .as_bytes()
+                .as_bstr(),
+            val.as_bstr()
+        );
+
+        // Reading another xattr key is gonna return None.
+        let val = xattr::get(&p, OsStr::from_bytes(b"user.cheesecake")).expect("must succeed");
+        assert_eq!(None, val);
+    }
+    // peek at the file
+    {
+        let p = tmpdir.path().join(BLOB_A_NAME);
+
+        let xattr_names: Vec<OsString> = xattr::list(&p).expect("must succeed").collect();
+        // There should be 1 key, XATTR_NAME_BLOB_DIGEST.
+        assert_eq!(1, xattr_names.len(), "there should be 1 xattr name");
+        assert_eq!(
+            XATTR_NAME_BLOB_DIGEST,
+            xattr_names.first().unwrap().as_encoded_bytes()
+        );
+
+        // The key should equal to the string-formatted b3 digest.
+        let val = xattr::get(&p, OsStr::from_bytes(XATTR_NAME_BLOB_DIGEST))
+            .expect("must succeed")
+            .expect("must be some");
+        assert_eq!(
+            fixtures::BLOB_A_DIGEST.to_string().as_bytes().as_bstr(),
+            val.as_bstr()
+        );
+
+        // Reading another xattr key is gonna return None.
+        let val = xattr::get(&p, OsStr::from_bytes(b"user.cheesecake")).expect("must succeed");
+        assert_eq!(None, val);
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+/// Read a blob inside a directory. This ensures we successfully populate directory data.
+async fn read_blob_inside_dir() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME).join(".keep");
+
+    // peek at metadata.
+    let metadata = tokio::fs::metadata(&p).await.expect("must succeed");
+    assert!(metadata.is_file());
+    assert!(metadata.permissions().readonly());
+
+    // read from it
+    let data = tokio::fs::read(&p).await.expect("must succeed");
+    assert_eq!(fixtures::EMPTY_BLOB_CONTENTS.to_vec(), data);
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+/// Read a blob inside a directory inside a directory. This ensures we properly
+/// populate directories as we traverse down the structure.
+async fn read_blob_deep_inside_dir() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir
+        .path()
+        .join(DIRECTORY_COMPLICATED_NAME)
+        .join("keep")
+        .join(".keep");
+
+    // peek at metadata.
+    let metadata = tokio::fs::metadata(&p).await.expect("must succeed");
+    assert!(metadata.is_file());
+    assert!(metadata.permissions().readonly());
+
+    // read from it
+    let data = tokio::fs::read(&p).await.expect("must succeed");
+    assert_eq!(fixtures::EMPTY_BLOB_CONTENTS.to_vec(), data);
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure readdir works.
+#[tokio::test]
+async fn readdir() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME);
+
+    {
+        // read_dir should succeed. Collect all elements
+        let elements: Vec<_> =
+            ReadDirStream::new(tokio::fs::read_dir(p).await.expect("must succeed"))
+                .map(|e| e.expect("must not be err"))
+                .collect()
+                .await;
+
+        assert_eq!(3, elements.len(), "number of elements should be 3"); // rust skips . and ..
+
+        // We explicitly look at specific positions here, because we always emit
+        // them ordered.
+
+        // ".keep", 0 byte file.
+        let e = &elements[0];
+        assert_eq!(".keep", e.file_name());
+        assert!(e.file_type().await.expect("must succeed").is_file());
+        assert_eq!(0, e.metadata().await.expect("must succeed").len());
+
+        // "aa", symlink.
+        let e = &elements[1];
+        assert_eq!("aa", e.file_name());
+        assert!(e.file_type().await.expect("must succeed").is_symlink());
+
+        // "keep", directory
+        let e = &elements[2];
+        assert_eq!("keep", e.file_name());
+        assert!(e.file_type().await.expect("must succeed").is_dir());
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test]
+/// Do a readdir deeper inside a directory, without doing readdir or stat in the parent directory.
+async fn readdir_deep() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("keep");
+
+    {
+        // read_dir should succeed. Collect all elements
+        let elements: Vec<_> =
+            ReadDirStream::new(tokio::fs::read_dir(p).await.expect("must succeed"))
+                .map(|e| e.expect("must not be err"))
+                .collect()
+                .await;
+
+        assert_eq!(1, elements.len(), "number of elements should be 1"); // rust skips . and ..
+
+        // ".keep", 0 byte file.
+        let e = &elements[0];
+        assert_eq!(".keep", e.file_name());
+        assert!(e.file_type().await.expect("must succeed").is_file());
+        assert_eq!(0, e.metadata().await.expect("must succeed").len());
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Check attributes match how they show up in /nix/store normally.
+#[tokio::test]
+async fn check_attributes() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+    populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await;
+    populate_symlink(&mut root_nodes).await;
+    populate_blob_helloworld(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p_file = tmpdir.path().join(BLOB_A_NAME);
+    let p_directory = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME);
+    let p_symlink = tmpdir.path().join(SYMLINK_NAME);
+    let p_executable_file = tmpdir.path().join(HELLOWORLD_BLOB_NAME);
+
+    // peek at metadata. We use symlink_metadata to ensure we don't traverse a symlink by accident.
+    let metadata_file = tokio::fs::symlink_metadata(&p_file)
+        .await
+        .expect("must succeed");
+    let metadata_executable_file = tokio::fs::symlink_metadata(&p_executable_file)
+        .await
+        .expect("must succeed");
+    let metadata_directory = tokio::fs::symlink_metadata(&p_directory)
+        .await
+        .expect("must succeed");
+    let metadata_symlink = tokio::fs::symlink_metadata(&p_symlink)
+        .await
+        .expect("must succeed");
+
+    // modes should match. We & with 0o777 to remove any higher bits.
+    assert_eq!(0o444, metadata_file.mode() & 0o777);
+    assert_eq!(0o555, metadata_executable_file.mode() & 0o777);
+    assert_eq!(0o555, metadata_directory.mode() & 0o777);
+    assert_eq!(0o444, metadata_symlink.mode() & 0o777);
+
+    // files should have the correct filesize
+    assert_eq!(fixtures::BLOB_A.len() as u64, metadata_file.len());
+    // directories should have their "size" as filesize
+    assert_eq!(
+        { fixtures::DIRECTORY_WITH_KEEP.size() },
+        metadata_directory.size()
+    );
+
+    for metadata in &[&metadata_file, &metadata_directory, &metadata_symlink] {
+        // uid and gid should be 0.
+        assert_eq!(0, metadata.uid());
+        assert_eq!(0, metadata.gid());
+
+        // all times should be set to the unix epoch.
+        assert_eq!(0, metadata.atime());
+        assert_eq!(0, metadata.mtime());
+        assert_eq!(0, metadata.ctime());
+        // crtime seems MacOS only
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test]
+/// Ensure we allocate the same inodes for the same directory contents.
+/// $DIRECTORY_COMPLICATED_NAME/keep contains the same data as $DIRECTORY_WITH_KEEP.
+async fn compare_inodes_directories() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_with_keep(&blob_service, &directory_service, &mut root_nodes).await;
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p_dir_with_keep = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME);
+    let p_sibling_dir = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("keep");
+
+    // peek at metadata.
+    assert_eq!(
+        tokio::fs::metadata(p_dir_with_keep)
+            .await
+            .expect("must succeed")
+            .ino(),
+        tokio::fs::metadata(p_sibling_dir)
+            .await
+            .expect("must succeed")
+            .ino()
+    );
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure we allocate the same inodes for the same directory contents.
+/// $DIRECTORY_COMPLICATED_NAME/keep/,keep contains the same data as $DIRECTORY_COMPLICATED_NAME/.keep
+#[tokio::test]
+async fn compare_inodes_files() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p_keep1 = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join(".keep");
+    let p_keep2 = tmpdir
+        .path()
+        .join(DIRECTORY_COMPLICATED_NAME)
+        .join("keep")
+        .join(".keep");
+
+    // peek at metadata.
+    assert_eq!(
+        tokio::fs::metadata(p_keep1)
+            .await
+            .expect("must succeed")
+            .ino(),
+        tokio::fs::metadata(p_keep2)
+            .await
+            .expect("must succeed")
+            .ino()
+    );
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Ensure we allocate the same inode for symlinks pointing to the same targets.
+/// $DIRECTORY_COMPLICATED_NAME/aa points to the same target as SYMLINK_NAME2.
+#[tokio::test]
+async fn compare_inodes_symlinks() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directory_complicated(&blob_service, &directory_service, &mut root_nodes).await;
+    populate_symlink2(&mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p1 = tmpdir.path().join(DIRECTORY_COMPLICATED_NAME).join("aa");
+    let p2 = tmpdir.path().join(SYMLINK_NAME2);
+
+    // peek at metadata.
+    assert_eq!(
+        tokio::fs::symlink_metadata(p1)
+            .await
+            .expect("must succeed")
+            .ino(),
+        tokio::fs::symlink_metadata(p2)
+            .await
+            .expect("must succeed")
+            .ino()
+    );
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Check we match paths exactly.
+#[tokio::test]
+async fn read_wrong_paths_in_root() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_blob_a(&blob_service, &mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    // wrong name
+    assert!(
+        tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-tes"))
+            .await
+            .is_err()
+    );
+
+    // invalid hash
+    assert!(
+        tokio::fs::metadata(tmpdir.path().join("0000000000000000000000000000000-test"))
+            .await
+            .is_err()
+    );
+
+    // right name, must exist
+    assert!(
+        tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-test"))
+            .await
+            .is_ok()
+    );
+
+    // now wrong name with right hash still may not exist
+    assert!(
+        tokio::fs::metadata(tmpdir.path().join("00000000000000000000000000000000-tes"))
+            .await
+            .is_err()
+    );
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+/// Make sure writes are not allowed
+#[tokio::test]
+async fn disallow_writes() {
+    // https://plume.benboeckel.net/~/JustAnotherBlog/skipping-tests-in-rust
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let root_nodes = BTreeMap::default();
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(BLOB_A_NAME);
+    let e = tokio::fs::File::create(p).await.expect_err("must fail");
+
+    assert_eq!(Some(libc::EROFS), e.raw_os_error());
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test]
+/// Ensure we get an IO error if the directory service does not have the Directory object.
+async fn missing_directory() {
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_directorynode_without_directory(&mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(DIRECTORY_WITH_KEEP_NAME);
+
+    {
+        // `stat` on the path should succeed, because it doesn't trigger the directory request.
+        tokio::fs::metadata(&p).await.expect("must succeed");
+
+        // However, calling either `readdir` or `stat` on a child should fail with an IO error.
+        // It fails when trying to pull the first entry, because we don't implement opendir separately
+        ReadDirStream::new(tokio::fs::read_dir(&p).await.unwrap())
+            .next()
+            .await
+            .expect("must be some")
+            .expect_err("must be err");
+
+        // rust currently sets e.kind() to Uncategorized, which isn't very
+        // helpful, so we don't look at the error more closely than that..
+        tokio::fs::metadata(p.join(".keep"))
+            .await
+            .expect_err("must fail");
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
+/// Ensure we get an IO error if the blob service does not have the blob
+async fn missing_blob() {
+    if !std::path::Path::new("/dev/fuse").exists() {
+        eprintln!("skipping test");
+        return;
+    }
+    let tmpdir = TempDir::new().unwrap();
+
+    let (blob_service, directory_service) = gen_svcs();
+    let mut root_nodes = BTreeMap::default();
+
+    populate_filenode_without_blob(&mut root_nodes).await;
+
+    let mut fuse_daemon = do_mount(
+        blob_service,
+        directory_service,
+        root_nodes,
+        tmpdir.path(),
+        false,
+        false,
+    )
+    .expect("must succeed");
+
+    let p = tmpdir.path().join(BLOB_A_NAME);
+
+    {
+        // `stat` on the blob should succeed, because it doesn't trigger a request to the blob service.
+        tokio::fs::metadata(&p).await.expect("must succeed");
+
+        // However, calling read on the blob should fail.
+        // rust currently sets e.kind() to Uncategorized, which isn't very
+        // helpful, so we don't look at the error more closely than that..
+        tokio::fs::read(p).await.expect_err("must fail");
+    }
+
+    fuse_daemon.unmount().expect("unmount");
+}
diff --git a/tvix/castore/src/fs/inode_tracker.rs b/tvix/castore/src/fs/inode_tracker.rs
new file mode 100644
index 0000000000..4a8283b6b1
--- /dev/null
+++ b/tvix/castore/src/fs/inode_tracker.rs
@@ -0,0 +1,207 @@
+use std::{collections::HashMap, sync::Arc};
+
+use super::inodes::{DirectoryInodeData, InodeData};
+use crate::B3Digest;
+
+/// InodeTracker keeps track of inodes, stores data being these inodes and deals
+/// with inode allocation.
+pub struct InodeTracker {
+    data: HashMap<u64, Arc<InodeData>>,
+
+    // lookup table for blobs by their B3Digest
+    blob_digest_to_inode: HashMap<B3Digest, u64>,
+
+    // lookup table for symlinks by their target
+    symlink_target_to_inode: HashMap<bytes::Bytes, u64>,
+
+    // lookup table for directories by their B3Digest.
+    // Note the corresponding directory may not be present in data yet.
+    directory_digest_to_inode: HashMap<B3Digest, u64>,
+
+    // the next inode to allocate
+    next_inode: u64,
+}
+
+impl Default for InodeTracker {
+    fn default() -> Self {
+        Self {
+            data: Default::default(),
+
+            blob_digest_to_inode: Default::default(),
+            symlink_target_to_inode: Default::default(),
+            directory_digest_to_inode: Default::default(),
+
+            next_inode: 2,
+        }
+    }
+}
+
+impl InodeTracker {
+    // Retrieves data for a given inode, if it exists.
+    pub fn get(&self, ino: u64) -> Option<Arc<InodeData>> {
+        self.data.get(&ino).cloned()
+    }
+
+    // Replaces data for a given inode.
+    // Panics if the inode doesn't already exist.
+    pub fn replace(&mut self, ino: u64, data: Arc<InodeData>) {
+        if self.data.insert(ino, data).is_none() {
+            panic!("replace called on unknown inode");
+        }
+    }
+
+    // Stores data and returns the inode for it.
+    // In case an inode has already been allocated for the same data, that inode
+    // is returned, otherwise a new one is allocated.
+    // In case data is a [InodeData::Directory], inodes for all items are looked
+    // up
+    pub fn put(&mut self, data: InodeData) -> u64 {
+        match data {
+            InodeData::Regular(ref digest, _, _) => {
+                match self.blob_digest_to_inode.get(digest) {
+                    Some(found_ino) => {
+                        // We already have it, return the inode.
+                        *found_ino
+                    }
+                    None => self.insert_and_increment(data),
+                }
+            }
+            InodeData::Symlink(ref target) => {
+                match self.symlink_target_to_inode.get(target) {
+                    Some(found_ino) => {
+                        // We already have it, return the inode.
+                        *found_ino
+                    }
+                    None => self.insert_and_increment(data),
+                }
+            }
+            InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _size)) => {
+                // check the lookup table if the B3Digest is known.
+                match self.directory_digest_to_inode.get(digest) {
+                    Some(found_ino) => {
+                        // We already have it, return the inode.
+                        *found_ino
+                    }
+                    None => {
+                        // insert and return the inode
+                        self.insert_and_increment(data)
+                    }
+                }
+            }
+            // Inserting [DirectoryInodeData::Populated] doesn't normally happen,
+            // only via [replace].
+            InodeData::Directory(DirectoryInodeData::Populated(..)) => {
+                unreachable!("should never be called with DirectoryInodeData::Populated")
+            }
+        }
+    }
+
+    // Inserts the data and returns the inode it was stored at, while
+    // incrementing next_inode.
+    fn insert_and_increment(&mut self, data: InodeData) -> u64 {
+        let ino = self.next_inode;
+        // insert into lookup tables
+        match data {
+            InodeData::Regular(ref digest, _, _) => {
+                self.blob_digest_to_inode.insert(digest.clone(), ino);
+            }
+            InodeData::Symlink(ref target) => {
+                self.symlink_target_to_inode.insert(target.clone(), ino);
+            }
+            InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _size)) => {
+                self.directory_digest_to_inode.insert(digest.clone(), ino);
+            }
+            // This is currently not used outside test fixtures.
+            // Usually a [DirectoryInodeData::Sparse] is inserted and later
+            // "upgraded" with more data.
+            // However, as a future optimization, a lookup for a PathInfo could trigger a
+            // [DirectoryService::get_recursive()] request that "forks into
+            // background" and prepopulates all Directories in a closure.
+            InodeData::Directory(DirectoryInodeData::Populated(ref digest, _)) => {
+                self.directory_digest_to_inode.insert(digest.clone(), ino);
+            }
+        }
+        // Insert data
+        self.data.insert(ino, Arc::new(data));
+
+        // increment inode counter and return old inode.
+        self.next_inode += 1;
+        ino
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::fixtures;
+
+    use super::InodeData;
+    use super::InodeTracker;
+
+    /// Getting something non-existent should be none
+    #[test]
+    fn get_nonexistent() {
+        let inode_tracker = InodeTracker::default();
+        assert!(inode_tracker.get(1).is_none());
+    }
+
+    /// Put of a regular file should allocate a uid, which should be the same when inserting again.
+    #[test]
+    fn put_regular() {
+        let mut inode_tracker = InodeTracker::default();
+        let f = InodeData::Regular(
+            fixtures::BLOB_A_DIGEST.clone(),
+            fixtures::BLOB_A.len() as u64,
+            false,
+        );
+
+        // put it in
+        let ino = inode_tracker.put(f.clone());
+
+        // a get should return the right data
+        let data = inode_tracker.get(ino).expect("must be some");
+        match *data {
+            InodeData::Regular(ref digest, _, _) => {
+                assert_eq!(&fixtures::BLOB_A_DIGEST.clone(), digest);
+            }
+            InodeData::Symlink(_) | InodeData::Directory(..) => panic!("wrong type"),
+        }
+
+        // another put should return the same ino
+        assert_eq!(ino, inode_tracker.put(f));
+
+        // inserting another file should return a different ino
+        assert_ne!(
+            ino,
+            inode_tracker.put(InodeData::Regular(
+                fixtures::BLOB_B_DIGEST.clone(),
+                fixtures::BLOB_B.len() as u64,
+                false,
+            ))
+        );
+    }
+
+    // Put of a symlink should allocate a uid, which should be the same when inserting again
+    #[test]
+    fn put_symlink() {
+        let mut inode_tracker = InodeTracker::default();
+        let f = InodeData::Symlink("target".into());
+
+        // put it in
+        let ino = inode_tracker.put(f.clone());
+
+        // a get should return the right data
+        let data = inode_tracker.get(ino).expect("must be some");
+        match *data {
+            InodeData::Symlink(ref target) => {
+                assert_eq!(b"target".to_vec(), *target);
+            }
+            InodeData::Regular(..) | InodeData::Directory(..) => panic!("wrong type"),
+        }
+
+        // another put should return the same ino
+        assert_eq!(ino, inode_tracker.put(f));
+
+        // inserting another file should return a different ino
+        assert_ne!(ino, inode_tracker.put(InodeData::Symlink("target2".into())));
+    }
+}
diff --git a/tvix/castore/src/fs/inodes.rs b/tvix/castore/src/fs/inodes.rs
new file mode 100644
index 0000000000..bdd4595434
--- /dev/null
+++ b/tvix/castore/src/fs/inodes.rs
@@ -0,0 +1,96 @@
+//! This module contains all the data structures used to track information
+//! about inodes, which present tvix-castore nodes in a filesystem.
+use std::time::Duration;
+
+use bytes::Bytes;
+
+use crate::proto as castorepb;
+use crate::B3Digest;
+
+#[derive(Clone, Debug)]
+pub enum InodeData {
+    Regular(B3Digest, u64, bool),  // digest, size, executable
+    Symlink(bytes::Bytes),         // target
+    Directory(DirectoryInodeData), // either [DirectoryInodeData:Sparse] or [DirectoryInodeData:Populated]
+}
+
+/// This encodes the two different states of [InodeData::Directory].
+/// Either the data still is sparse (we only saw a [castorepb::DirectoryNode],
+/// but didn't fetch the [castorepb::Directory] struct yet, or we processed a
+/// lookup and did fetch the data.
+#[derive(Clone, Debug)]
+pub enum DirectoryInodeData {
+    Sparse(B3Digest, u64),                                  // digest, size
+    Populated(B3Digest, Vec<(u64, castorepb::node::Node)>), // [(child_inode, node)]
+}
+
+impl InodeData {
+    /// Constructs a new InodeData by consuming a [Node].
+    /// It splits off the orginal name, so it can be used later.
+    pub fn from_node(node: castorepb::node::Node) -> (Self, Bytes) {
+        match node {
+            castorepb::node::Node::Directory(n) => (
+                Self::Directory(DirectoryInodeData::Sparse(
+                    n.digest.try_into().unwrap(),
+                    n.size,
+                )),
+                n.name,
+            ),
+            castorepb::node::Node::File(n) => (
+                Self::Regular(n.digest.try_into().unwrap(), n.size, n.executable),
+                n.name,
+            ),
+            castorepb::node::Node::Symlink(n) => (Self::Symlink(n.target), n.name),
+        }
+    }
+
+    pub fn as_fuse_file_attr(&self, inode: u64) -> fuse_backend_rs::abi::fuse_abi::Attr {
+        fuse_backend_rs::abi::fuse_abi::Attr {
+            ino: inode,
+            // FUTUREWORK: play with this numbers, as it affects read sizes for client applications.
+            blocks: 1024,
+            size: match self {
+                InodeData::Regular(_, size, _) => *size,
+                InodeData::Symlink(target) => target.len() as u64,
+                InodeData::Directory(DirectoryInodeData::Sparse(_, size)) => *size,
+                InodeData::Directory(DirectoryInodeData::Populated(_, ref children)) => {
+                    children.len() as u64
+                }
+            },
+            mode: self.as_fuse_type() | self.mode(),
+            ..Default::default()
+        }
+    }
+
+    fn mode(&self) -> u32 {
+        match self {
+            InodeData::Regular(_, _, false) | InodeData::Symlink(_) => 0o444,
+            InodeData::Regular(_, _, true) | InodeData::Directory(_) => 0o555,
+        }
+    }
+
+    pub fn as_fuse_entry(&self, inode: u64) -> fuse_backend_rs::api::filesystem::Entry {
+        fuse_backend_rs::api::filesystem::Entry {
+            inode,
+            attr: self.as_fuse_file_attr(inode).into(),
+            attr_timeout: Duration::MAX,
+            entry_timeout: Duration::MAX,
+            ..Default::default()
+        }
+    }
+
+    /// Returns the u32 fuse type
+    pub fn as_fuse_type(&self) -> u32 {
+        #[allow(clippy::let_and_return)]
+        let ty = match self {
+            InodeData::Regular(_, _, _) => libc::S_IFREG,
+            InodeData::Symlink(_) => libc::S_IFLNK,
+            InodeData::Directory(_) => libc::S_IFDIR,
+        };
+        // libc::S_IFDIR is u32 on Linux and u16 on MacOS
+        #[cfg(target_os = "macos")]
+        let ty = ty as u32;
+
+        ty
+    }
+}
diff --git a/tvix/castore/src/fs/mod.rs b/tvix/castore/src/fs/mod.rs
new file mode 100644
index 0000000000..176199f64a
--- /dev/null
+++ b/tvix/castore/src/fs/mod.rs
@@ -0,0 +1,874 @@
+mod file_attr;
+mod inode_tracker;
+mod inodes;
+mod root_nodes;
+
+#[cfg(feature = "fuse")]
+pub mod fuse;
+
+#[cfg(feature = "virtiofs")]
+pub mod virtiofs;
+
+pub use self::root_nodes::RootNodes;
+use self::{
+    file_attr::ROOT_FILE_ATTR,
+    inode_tracker::InodeTracker,
+    inodes::{DirectoryInodeData, InodeData},
+};
+use crate::proto as castorepb;
+use crate::{
+    blobservice::{BlobReader, BlobService},
+    directoryservice::DirectoryService,
+    proto::{node::Node, NamedNode},
+    B3Digest,
+};
+use bstr::ByteVec;
+use bytes::Bytes;
+use fuse_backend_rs::abi::fuse_abi::{stat64, OpenOptions};
+use fuse_backend_rs::api::filesystem::{
+    Context, FileSystem, FsOptions, GetxattrReply, ListxattrReply, ROOT_ID,
+};
+use futures::StreamExt;
+use parking_lot::RwLock;
+use std::sync::Mutex;
+use std::{
+    collections::HashMap,
+    io,
+    sync::atomic::AtomicU64,
+    sync::{atomic::Ordering, Arc},
+    time::Duration,
+};
+use std::{ffi::CStr, io::Cursor};
+use tokio::{
+    io::{AsyncReadExt, AsyncSeekExt},
+    sync::mpsc,
+};
+use tracing::{debug, error, instrument, warn, Span};
+
+/// This implements a read-only FUSE filesystem for a tvix-store
+/// with the passed [BlobService], [DirectoryService] and [RootNodes].
+///
+/// Linux uses inodes in filesystems. When implementing FUSE, most calls are
+/// *for* a given inode.
+///
+/// This means, we need to have a stable mapping of inode numbers to the
+/// corresponding store nodes.
+///
+/// We internally delegate all inode allocation and state keeping to the
+/// inode tracker.
+/// We store a mapping from currently "explored" names in the root to their
+/// inode.
+///
+/// There's some places where inodes are allocated / data inserted into
+/// the inode tracker, if not allocated before already:
+///  - Processing a `lookup` request, either in the mount root, or somewhere
+///    deeper.
+///  - Processing a `readdir` request
+///
+///  Things pointing to the same contents get the same inodes, irrespective of
+///  their own location.
+///  This means:
+///  - Symlinks with the same target will get the same inode.
+///  - Regular/executable files with the same contents will get the same inode
+///  - Directories with the same contents will get the same inode.
+///
+/// Due to the above being valid across the whole store, and considering the
+/// merkle structure is a DAG, not a tree, this also means we can't do "bucketed
+/// allocation", aka reserve Directory.size inodes for each directory node we
+/// explore.
+/// Tests for this live in the tvix-store crate.
+pub struct TvixStoreFs<BS, DS, RN> {
+    blob_service: BS,
+    directory_service: DS,
+    root_nodes_provider: RN,
+
+    /// Whether to (try) listing elements in the root.
+    list_root: bool,
+
+    /// Whether to expose blob and directory digests as extended attributes.
+    show_xattr: bool,
+
+    /// This maps a given basename in the root to the inode we allocated for the node.
+    root_nodes: RwLock<HashMap<Bytes, u64>>,
+
+    /// This keeps track of inodes and data alongside them.
+    inode_tracker: RwLock<InodeTracker>,
+
+    // FUTUREWORK: have a generic container type for dir/file handles and handle
+    // allocation.
+    /// Maps from the handle returned from an opendir to
+    /// This holds all opendir handles (for the root inode)
+    /// They point to the rx part of the channel producing the listing.
+    #[allow(clippy::type_complexity)]
+    dir_handles: RwLock<
+        HashMap<
+            u64,
+            (
+                Span,
+                Arc<Mutex<mpsc::Receiver<(usize, Result<Node, crate::Error>)>>>,
+            ),
+        >,
+    >,
+
+    next_dir_handle: AtomicU64,
+
+    /// This holds all open file handles
+    #[allow(clippy::type_complexity)]
+    file_handles: RwLock<HashMap<u64, (Span, Arc<Mutex<Box<dyn BlobReader>>>)>>,
+
+    next_file_handle: AtomicU64,
+
+    tokio_handle: tokio::runtime::Handle,
+}
+
+impl<BS, DS, RN> TvixStoreFs<BS, DS, RN>
+where
+    BS: AsRef<dyn BlobService> + Clone + Send,
+    DS: AsRef<dyn DirectoryService> + Clone + Send + 'static,
+    RN: RootNodes + Clone + 'static,
+{
+    pub fn new(
+        blob_service: BS,
+        directory_service: DS,
+        root_nodes_provider: RN,
+        list_root: bool,
+        show_xattr: bool,
+    ) -> Self {
+        Self {
+            blob_service,
+            directory_service,
+            root_nodes_provider,
+
+            list_root,
+            show_xattr,
+
+            root_nodes: RwLock::new(HashMap::default()),
+            inode_tracker: RwLock::new(Default::default()),
+
+            dir_handles: RwLock::new(Default::default()),
+            next_dir_handle: AtomicU64::new(1),
+
+            file_handles: RwLock::new(Default::default()),
+            next_file_handle: AtomicU64::new(1),
+            tokio_handle: tokio::runtime::Handle::current(),
+        }
+    }
+
+    /// Retrieves the inode for a given root node basename, if present.
+    /// This obtains a read lock on self.root_nodes.
+    fn get_inode_for_root_name(&self, name: &[u8]) -> Option<u64> {
+        self.root_nodes.read().get(name).cloned()
+    }
+
+    /// For a given inode, look up the given directory behind it (from
+    /// self.inode_tracker), and return its children.
+    /// The inode_tracker MUST know about this inode already, and it MUST point
+    /// to a [InodeData::Directory].
+    /// It is ok if it's a [DirectoryInodeData::Sparse] - in that case, a lookup
+    /// in self.directory_service is performed, and self.inode_tracker is updated with the
+    /// [DirectoryInodeData::Populated].
+    #[instrument(skip(self), err)]
+    fn get_directory_children(&self, ino: u64) -> io::Result<(B3Digest, Vec<(u64, Node)>)> {
+        let data = self.inode_tracker.read().get(ino).unwrap();
+        match *data {
+            // if it's populated already, return children.
+            InodeData::Directory(DirectoryInodeData::Populated(
+                ref parent_digest,
+                ref children,
+            )) => Ok((parent_digest.clone(), children.clone())),
+            // if it's sparse, fetch data using directory_service, populate child nodes
+            // and update it in [self.inode_tracker].
+            InodeData::Directory(DirectoryInodeData::Sparse(ref parent_digest, _)) => {
+                let directory = self
+                    .tokio_handle
+                    .block_on({
+                        let directory_service = self.directory_service.clone();
+                        let parent_digest = parent_digest.to_owned();
+                        async move { directory_service.as_ref().get(&parent_digest).await }
+                    })?
+                    .ok_or_else(|| {
+                        warn!(directory.digest=%parent_digest, "directory not found");
+                        // If the Directory can't be found, this is a hole, bail out.
+                        io::Error::from_raw_os_error(libc::EIO)
+                    })?;
+
+                // Turn the retrieved directory into a InodeData::Directory(DirectoryInodeData::Populated(..)),
+                // allocating inodes for the children on the way.
+                // FUTUREWORK: there's a bunch of cloning going on here, which we can probably avoid.
+                let children = {
+                    let mut inode_tracker = self.inode_tracker.write();
+
+                    let children: Vec<(u64, castorepb::node::Node)> = directory
+                        .nodes()
+                        .map(|child_node| {
+                            let (inode_data, _) = InodeData::from_node(child_node.clone());
+
+                            let child_ino = inode_tracker.put(inode_data);
+                            (child_ino, child_node)
+                        })
+                        .collect();
+
+                    // replace.
+                    inode_tracker.replace(
+                        ino,
+                        Arc::new(InodeData::Directory(DirectoryInodeData::Populated(
+                            parent_digest.clone(),
+                            children.clone(),
+                        ))),
+                    );
+
+                    children
+                };
+
+                Ok((parent_digest.clone(), children))
+            }
+            // if the parent inode was not a directory, this doesn't make sense
+            InodeData::Regular(..) | InodeData::Symlink(_) => {
+                Err(io::Error::from_raw_os_error(libc::ENOTDIR))
+            }
+        }
+    }
+
+    /// This will turn a lookup request for a name in the root to a ino and
+    /// [InodeData].
+    /// It will peek in [self.root_nodes], and then either look it up from
+    /// [self.inode_tracker],
+    /// or otherwise fetch from [self.root_nodes], and then insert into
+    /// [self.inode_tracker].
+    /// In the case the name can't be found, a libc::ENOENT is returned.
+    fn name_in_root_to_ino_and_data(
+        &self,
+        name: &std::ffi::CStr,
+    ) -> io::Result<(u64, Arc<InodeData>)> {
+        // Look up the inode for that root node.
+        // If there's one, [self.inode_tracker] MUST also contain the data,
+        // which we can then return.
+        if let Some(inode) = self.get_inode_for_root_name(name.to_bytes()) {
+            return Ok((
+                inode,
+                self.inode_tracker
+                    .read()
+                    .get(inode)
+                    .expect("must exist")
+                    .to_owned(),
+            ));
+        }
+
+        // We don't have it yet, look it up in [self.root_nodes].
+        match self.tokio_handle.block_on({
+            let root_nodes_provider = self.root_nodes_provider.clone();
+            async move { root_nodes_provider.get_by_basename(name.to_bytes()).await }
+        }) {
+            // if there was an error looking up the root node, propagate up an IO error.
+            Err(_e) => Err(io::Error::from_raw_os_error(libc::EIO)),
+            // the root node doesn't exist, so the file doesn't exist.
+            Ok(None) => Err(io::Error::from_raw_os_error(libc::ENOENT)),
+            // The root node does exist
+            Ok(Some(root_node)) => {
+                // The name must match what's passed in the lookup, otherwise this is also a ENOENT.
+                if root_node.get_name() != name.to_bytes() {
+                    debug!(root_node.name=?root_node.get_name(), found_node.name=%name.to_string_lossy(), "node name mismatch");
+                    return Err(io::Error::from_raw_os_error(libc::ENOENT));
+                }
+
+                // Let's check if someone else beat us to updating the inode tracker and
+                // root_nodes map. This avoids locking inode_tracker for writing.
+                if let Some(ino) = self.root_nodes.read().get(name.to_bytes()) {
+                    return Ok((
+                        *ino,
+                        self.inode_tracker.read().get(*ino).expect("must exist"),
+                    ));
+                }
+
+                // Only in case it doesn't, lock [self.root_nodes] and
+                // [self.inode_tracker] for writing.
+                let mut root_nodes = self.root_nodes.write();
+                let mut inode_tracker = self.inode_tracker.write();
+
+                // insert the (sparse) inode data and register in
+                // self.root_nodes.
+                let (inode_data, name) = InodeData::from_node(root_node);
+                let ino = inode_tracker.put(inode_data.clone());
+                root_nodes.insert(name, ino);
+
+                Ok((ino, Arc::new(inode_data)))
+            }
+        }
+    }
+}
+
+/// Buffer size of the channel providing nodes in the mount root
+const ROOT_NODES_BUFFER_SIZE: usize = 16;
+
+const XATTR_NAME_DIRECTORY_DIGEST: &[u8] = b"user.tvix.castore.directory.digest";
+const XATTR_NAME_BLOB_DIGEST: &[u8] = b"user.tvix.castore.blob.digest";
+
+impl<BS, DS, RN> FileSystem for TvixStoreFs<BS, DS, RN>
+where
+    BS: AsRef<dyn BlobService> + Clone + Send + 'static,
+    DS: AsRef<dyn DirectoryService> + Send + Clone + 'static,
+    RN: RootNodes + Clone + 'static,
+{
+    type Handle = u64;
+    type Inode = u64;
+
+    fn init(&self, _capable: FsOptions) -> io::Result<FsOptions> {
+        Ok(FsOptions::empty())
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode))]
+    fn getattr(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        _handle: Option<Self::Handle>,
+    ) -> io::Result<(stat64, Duration)> {
+        if inode == ROOT_ID {
+            return Ok((ROOT_FILE_ATTR.into(), Duration::MAX));
+        }
+
+        match self.inode_tracker.read().get(inode) {
+            None => Err(io::Error::from_raw_os_error(libc::ENOENT)),
+            Some(inode_data) => {
+                debug!(inode_data = ?inode_data, "found node");
+                Ok((inode_data.as_fuse_file_attr(inode).into(), Duration::MAX))
+            }
+        }
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.parent_inode = parent, rq.name = ?name))]
+    fn lookup(
+        &self,
+        _ctx: &Context,
+        parent: Self::Inode,
+        name: &std::ffi::CStr,
+    ) -> io::Result<fuse_backend_rs::api::filesystem::Entry> {
+        debug!("lookup");
+
+        // This goes from a parent inode to a node.
+        // - If the parent is [ROOT_ID], we need to check
+        //   [self.root_nodes] (fetching from a [RootNode] provider if needed)
+        // - Otherwise, lookup the parent in [self.inode_tracker] (which must be
+        //   a [InodeData::Directory]), and find the child with that name.
+        if parent == ROOT_ID {
+            let (ino, inode_data) = self.name_in_root_to_ino_and_data(name)?;
+
+            debug!(inode_data=?&inode_data, ino=ino, "Some");
+            return Ok(inode_data.as_fuse_entry(ino));
+        }
+        // This is the "lookup for "a" inside inode 42.
+        // We already know that inode 42 must be a directory.
+        let (parent_digest, children) = self.get_directory_children(parent)?;
+
+        Span::current().record("directory.digest", parent_digest.to_string());
+        // Search for that name in the list of children and return the FileAttrs.
+
+        // in the children, find the one with the desired name.
+        if let Some((child_ino, _)) = children.iter().find(|e| e.1.get_name() == name.to_bytes()) {
+            // lookup the child [InodeData] in [self.inode_tracker].
+            // We know the inodes for children have already been allocated.
+            let child_inode_data = self.inode_tracker.read().get(*child_ino).unwrap();
+
+            // Reply with the file attributes for the child.
+            // For child directories, we still have all data we need to reply.
+            Ok(child_inode_data.as_fuse_entry(*child_ino))
+        } else {
+            // Child not found, return ENOENT.
+            Err(io::Error::from_raw_os_error(libc::ENOENT))
+        }
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode))]
+    fn opendir(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        _flags: u32,
+    ) -> io::Result<(Option<Self::Handle>, OpenOptions)> {
+        // In case opendir on the root is called, we provide the handle, as re-entering that listing is expensive.
+        // For all other directory inodes we just let readdir take care of it.
+        if inode == ROOT_ID {
+            if !self.list_root {
+                return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo
+            }
+
+            let root_nodes_provider = self.root_nodes_provider.clone();
+            let (tx, rx) = mpsc::channel(ROOT_NODES_BUFFER_SIZE);
+
+            // This task will run in the background immediately and will exit
+            // after the stream ends or if we no longer want any more entries.
+            self.tokio_handle.spawn(async move {
+                let mut stream = root_nodes_provider.list().enumerate();
+                while let Some(node) = stream.next().await {
+                    if tx.send(node).await.is_err() {
+                        // If we get a send error, it means the sync code
+                        // doesn't want any more entries.
+                        break;
+                    }
+                }
+            });
+
+            // Put the rx part into [self.dir_handles].
+            // TODO: this will overflow after 2**64 operations,
+            // which is fine for now.
+            // See https://cl.tvl.fyi/c/depot/+/8834/comment/a6684ce0_d72469d1
+            // for the discussion on alternatives.
+            let dh = self.next_dir_handle.fetch_add(1, Ordering::SeqCst);
+
+            self.dir_handles
+                .write()
+                .insert(dh, (Span::current(), Arc::new(Mutex::new(rx))));
+
+            return Ok((
+                Some(dh),
+                fuse_backend_rs::api::filesystem::OpenOptions::empty(), // TODO: non-seekable
+            ));
+        }
+
+        Ok((None, OpenOptions::empty()))
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle, rq.offset = offset), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))]
+    fn readdir(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        _size: u32,
+        offset: u64,
+        add_entry: &mut dyn FnMut(fuse_backend_rs::api::filesystem::DirEntry) -> io::Result<usize>,
+    ) -> io::Result<()> {
+        debug!("readdir");
+
+        if inode == ROOT_ID {
+            if !self.list_root {
+                return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo
+            }
+
+            // get the handle from [self.dir_handles]
+            let (_span, rx) = match self.dir_handles.read().get(&handle) {
+                Some(rx) => rx.clone(),
+                None => {
+                    warn!("dir handle {} unknown", handle);
+                    return Err(io::Error::from_raw_os_error(libc::EIO));
+                }
+            };
+
+            let mut rx = rx
+                .lock()
+                .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?;
+
+            while let Some((i, n)) = rx.blocking_recv() {
+                let root_node = n.map_err(|e| {
+                    warn!("failed to retrieve root node: {}", e);
+                    io::Error::from_raw_os_error(libc::EIO)
+                })?;
+
+                let (inode_data, name) = InodeData::from_node(root_node);
+
+                // obtain the inode, or allocate a new one.
+                let ino = self.get_inode_for_root_name(&name).unwrap_or_else(|| {
+                    // insert the (sparse) inode data and register in
+                    // self.root_nodes.
+                    let ino = self.inode_tracker.write().put(inode_data.clone());
+                    self.root_nodes.write().insert(name.clone(), ino);
+                    ino
+                });
+
+                let written = add_entry(fuse_backend_rs::api::filesystem::DirEntry {
+                    ino,
+                    offset: offset + (i as u64) + 1,
+                    type_: inode_data.as_fuse_type(),
+                    name: &name,
+                })?;
+                // If the buffer is full, add_entry will return `Ok(0)`.
+                if written == 0 {
+                    break;
+                }
+            }
+            return Ok(());
+        }
+
+        // Non root-node case: lookup the children, or return an error if it's not a directory.
+        let (parent_digest, children) = self.get_directory_children(inode)?;
+        Span::current().record("directory.digest", parent_digest.to_string());
+
+        for (i, (ino, child_node)) in children.into_iter().skip(offset as usize).enumerate() {
+            let (inode_data, name) = InodeData::from_node(child_node);
+
+            // the second parameter will become the "offset" parameter on the next call.
+            let written = add_entry(fuse_backend_rs::api::filesystem::DirEntry {
+                ino,
+                offset: offset + (i as u64) + 1,
+                type_: inode_data.as_fuse_type(),
+                name: &name,
+            })?;
+            // If the buffer is full, add_entry will return `Ok(0)`.
+            if written == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))]
+    fn readdirplus(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        _size: u32,
+        offset: u64,
+        add_entry: &mut dyn FnMut(
+            fuse_backend_rs::api::filesystem::DirEntry,
+            fuse_backend_rs::api::filesystem::Entry,
+        ) -> io::Result<usize>,
+    ) -> io::Result<()> {
+        debug!("readdirplus");
+
+        if inode == ROOT_ID {
+            if !self.list_root {
+                return Err(io::Error::from_raw_os_error(libc::EPERM)); // same error code as ipfs/kubo
+            }
+
+            // get the handle from [self.dir_handles]
+            let (_span, rx) = match self.dir_handles.read().get(&handle) {
+                Some(rx) => rx.clone(),
+                None => {
+                    warn!("dir handle {} unknown", handle);
+                    return Err(io::Error::from_raw_os_error(libc::EIO));
+                }
+            };
+
+            let mut rx = rx
+                .lock()
+                .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?;
+
+            while let Some((i, n)) = rx.blocking_recv() {
+                let root_node = n.map_err(|e| {
+                    warn!("failed to retrieve root node: {}", e);
+                    io::Error::from_raw_os_error(libc::EPERM)
+                })?;
+
+                let (inode_data, name) = InodeData::from_node(root_node);
+
+                // obtain the inode, or allocate a new one.
+                let ino = self.get_inode_for_root_name(&name).unwrap_or_else(|| {
+                    // insert the (sparse) inode data and register in
+                    // self.root_nodes.
+                    let ino = self.inode_tracker.write().put(inode_data.clone());
+                    self.root_nodes.write().insert(name.clone(), ino);
+                    ino
+                });
+
+                let written = add_entry(
+                    fuse_backend_rs::api::filesystem::DirEntry {
+                        ino,
+                        offset: offset + (i as u64) + 1,
+                        type_: inode_data.as_fuse_type(),
+                        name: &name,
+                    },
+                    inode_data.as_fuse_entry(ino),
+                )?;
+                // If the buffer is full, add_entry will return `Ok(0)`.
+                if written == 0 {
+                    break;
+                }
+            }
+            return Ok(());
+        }
+
+        // Non root-node case: lookup the children, or return an error if it's not a directory.
+        let (parent_digest, children) = self.get_directory_children(inode)?;
+        Span::current().record("directory.digest", parent_digest.to_string());
+
+        for (i, (ino, child_node)) in children.into_iter().skip(offset as usize).enumerate() {
+            let (inode_data, name) = InodeData::from_node(child_node);
+
+            // the second parameter will become the "offset" parameter on the next call.
+            let written = add_entry(
+                fuse_backend_rs::api::filesystem::DirEntry {
+                    ino,
+                    offset: offset + (i as u64) + 1,
+                    type_: inode_data.as_fuse_type(),
+                    name: &name,
+                },
+                inode_data.as_fuse_entry(ino),
+            )?;
+            // If the buffer is full, add_entry will return `Ok(0)`.
+            if written == 0 {
+                break;
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.dir_handles.read().get(&handle).and_then(|x| x.0.id()))]
+    fn releasedir(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        _flags: u32,
+        handle: Self::Handle,
+    ) -> io::Result<()> {
+        if inode == ROOT_ID {
+            // drop the rx part of the channel.
+            match self.dir_handles.write().remove(&handle) {
+                // drop it, which will close it.
+                Some(rx) => drop(rx),
+                None => {
+                    warn!("dir handle not found");
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode))]
+    fn open(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        _flags: u32,
+        _fuse_flags: u32,
+    ) -> io::Result<(
+        Option<Self::Handle>,
+        fuse_backend_rs::api::filesystem::OpenOptions,
+    )> {
+        if inode == ROOT_ID {
+            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
+        }
+
+        // lookup the inode
+        match *self.inode_tracker.read().get(inode).unwrap() {
+            // read is invalid on non-files.
+            InodeData::Directory(..) | InodeData::Symlink(_) => {
+                warn!("is directory");
+                Err(io::Error::from_raw_os_error(libc::EISDIR))
+            }
+            InodeData::Regular(ref blob_digest, _blob_size, _) => {
+                Span::current().record("blob.digest", blob_digest.to_string());
+
+                match self.tokio_handle.block_on({
+                    let blob_service = self.blob_service.clone();
+                    let blob_digest = blob_digest.clone();
+                    async move { blob_service.as_ref().open_read(&blob_digest).await }
+                }) {
+                    Ok(None) => {
+                        warn!("blob not found");
+                        Err(io::Error::from_raw_os_error(libc::EIO))
+                    }
+                    Err(e) => {
+                        warn!(e=?e, "error opening blob");
+                        Err(io::Error::from_raw_os_error(libc::EIO))
+                    }
+                    Ok(Some(blob_reader)) => {
+                        // get a new file handle
+                        // TODO: this will overflow after 2**64 operations,
+                        // which is fine for now.
+                        // See https://cl.tvl.fyi/c/depot/+/8834/comment/a6684ce0_d72469d1
+                        // for the discussion on alternatives.
+                        let fh = self.next_file_handle.fetch_add(1, Ordering::SeqCst);
+
+                        self.file_handles
+                            .write()
+                            .insert(fh, (Span::current(), Arc::new(Mutex::new(blob_reader))));
+
+                        Ok((
+                            Some(fh),
+                            fuse_backend_rs::api::filesystem::OpenOptions::empty(),
+                        ))
+                    }
+                }
+            }
+        }
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle), parent = self.file_handles.read().get(&handle).and_then(|x| x.0.id()))]
+    fn release(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        _flags: u32,
+        handle: Self::Handle,
+        _flush: bool,
+        _flock_release: bool,
+        _lock_owner: Option<u64>,
+    ) -> io::Result<()> {
+        match self.file_handles.write().remove(&handle) {
+            // drop the blob reader, which will close it.
+            Some(blob_reader) => drop(blob_reader),
+            None => {
+                // These might already be dropped if a read error occured.
+                warn!("file handle not found");
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, rq.handle = handle, rq.offset = offset, rq.size = size), parent = self.file_handles.read().get(&handle).and_then(|x| x.0.id()))]
+    fn read(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        handle: Self::Handle,
+        w: &mut dyn fuse_backend_rs::api::filesystem::ZeroCopyWriter,
+        size: u32,
+        offset: u64,
+        _lock_owner: Option<u64>,
+        _flags: u32,
+    ) -> io::Result<usize> {
+        debug!("read");
+
+        // We need to take out the blob reader from self.file_handles, so we can
+        // interact with it in the separate task.
+        // On success, we pass it back out of the task, so we can put it back in self.file_handles.
+        let (_span, blob_reader) = self
+            .file_handles
+            .read()
+            .get(&handle)
+            .ok_or_else(|| {
+                warn!("file handle {} unknown", handle);
+                io::Error::from_raw_os_error(libc::EIO)
+            })
+            .cloned()?;
+
+        let mut blob_reader = blob_reader
+            .lock()
+            .map_err(|_| crate::Error::StorageError("mutex poisoned".into()))?;
+
+        let buf = self.tokio_handle.block_on(async move {
+            // seek to the offset specified, which is relative to the start of the file.
+            let pos = blob_reader
+                .seek(io::SeekFrom::Start(offset))
+                .await
+                .map_err(|e| {
+                    warn!("failed to seek to offset {}: {}", offset, e);
+                    io::Error::from_raw_os_error(libc::EIO)
+                })?;
+
+            debug_assert_eq!(offset, pos);
+
+            // As written in the fuse docs, read should send exactly the number
+            // of bytes requested except on EOF or error.
+
+            let mut buf: Vec<u8> = Vec::with_capacity(size as usize);
+
+            // copy things from the internal buffer into buf to fill it till up until size
+            tokio::io::copy(&mut blob_reader.as_mut().take(size as u64), &mut buf).await?;
+
+            Ok::<_, std::io::Error>(buf)
+        })?;
+
+        // We cannot use w.write() here, we're required to call write multiple
+        // times until we wrote the entirety of the buffer (which is `size`, except on EOF).
+        let buf_len = buf.len();
+        let bytes_written = io::copy(&mut Cursor::new(buf), w)?;
+        if bytes_written != buf_len as u64 {
+            error!(bytes_written=%bytes_written, "unable to write all of buf to kernel");
+            return Err(io::Error::from_raw_os_error(libc::EIO));
+        }
+
+        Ok(bytes_written as usize)
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode))]
+    fn readlink(&self, _ctx: &Context, inode: Self::Inode) -> io::Result<Vec<u8>> {
+        if inode == ROOT_ID {
+            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
+        }
+
+        // lookup the inode
+        match *self.inode_tracker.read().get(inode).unwrap() {
+            InodeData::Directory(..) | InodeData::Regular(..) => {
+                Err(io::Error::from_raw_os_error(libc::EINVAL))
+            }
+            InodeData::Symlink(ref target) => Ok(target.to_vec()),
+        }
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode, name=?name))]
+    fn getxattr(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        name: &CStr,
+        size: u32,
+    ) -> io::Result<GetxattrReply> {
+        if !self.show_xattr {
+            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
+        }
+
+        // Peek at the inode requested, and construct the response.
+        let digest_str = match *self
+            .inode_tracker
+            .read()
+            .get(inode)
+            .ok_or_else(|| io::Error::from_raw_os_error(libc::ENODATA))?
+        {
+            InodeData::Directory(DirectoryInodeData::Sparse(ref digest, _))
+            | InodeData::Directory(DirectoryInodeData::Populated(ref digest, _))
+                if name.to_bytes() == XATTR_NAME_DIRECTORY_DIGEST =>
+            {
+                digest.to_string()
+            }
+            InodeData::Regular(ref digest, _, _) if name.to_bytes() == XATTR_NAME_BLOB_DIGEST => {
+                digest.to_string()
+            }
+            _ => {
+                return Err(io::Error::from_raw_os_error(libc::ENODATA));
+            }
+        };
+
+        if size == 0 {
+            Ok(GetxattrReply::Count(digest_str.len() as u32))
+        } else if size < digest_str.len() as u32 {
+            Err(io::Error::from_raw_os_error(libc::ERANGE))
+        } else {
+            Ok(GetxattrReply::Value(digest_str.into_bytes()))
+        }
+    }
+
+    #[tracing::instrument(skip_all, fields(rq.inode = inode))]
+    fn listxattr(
+        &self,
+        _ctx: &Context,
+        inode: Self::Inode,
+        size: u32,
+    ) -> io::Result<ListxattrReply> {
+        if !self.show_xattr {
+            return Err(io::Error::from_raw_os_error(libc::ENOSYS));
+        }
+
+        // determine the (\0-terminated list) to of xattr keys present, depending on the type of the inode.
+        let xattrs_names = {
+            let mut out = Vec::new();
+            if let Some(inode_data) = self.inode_tracker.read().get(inode) {
+                match *inode_data {
+                    InodeData::Directory(_) => {
+                        out.extend_from_slice(XATTR_NAME_DIRECTORY_DIGEST);
+                        out.push_byte(b'\x00');
+                    }
+                    InodeData::Regular(..) => {
+                        out.extend_from_slice(XATTR_NAME_BLOB_DIGEST);
+                        out.push_byte(b'\x00');
+                    }
+                    _ => {}
+                }
+            }
+            out
+        };
+
+        if size == 0 {
+            Ok(ListxattrReply::Count(xattrs_names.len() as u32))
+        } else if size < xattrs_names.len() as u32 {
+            Err(io::Error::from_raw_os_error(libc::ERANGE))
+        } else {
+            Ok(ListxattrReply::Names(xattrs_names.to_vec()))
+        }
+    }
+}
diff --git a/tvix/castore/src/fs/root_nodes.rs b/tvix/castore/src/fs/root_nodes.rs
new file mode 100644
index 0000000000..6609e049a1
--- /dev/null
+++ b/tvix/castore/src/fs/root_nodes.rs
@@ -0,0 +1,37 @@
+use std::collections::BTreeMap;
+
+use crate::{proto::node::Node, Error};
+use bytes::Bytes;
+use futures::stream::BoxStream;
+use tonic::async_trait;
+
+/// Provides an interface for looking up root nodes  in tvix-castore by given
+/// a lookup key (usually the basename), and optionally allow a listing.
+#[async_trait]
+pub trait RootNodes: Send + Sync {
+    /// Looks up a root CA node based on the basename of the node in the root
+    /// directory of the filesystem.
+    async fn get_by_basename(&self, name: &[u8]) -> Result<Option<Node>, Error>;
+
+    /// Lists all root CA nodes in the filesystem. An error can be returned
+    /// in case listing is not allowed
+    fn list(&self) -> BoxStream<Result<Node, Error>>;
+}
+
+#[async_trait]
+/// Implements RootNodes for something deref'ing to a BTreeMap of Nodes, where
+/// the key is the node name.
+impl<T> RootNodes for T
+where
+    T: AsRef<BTreeMap<Bytes, Node>> + Send + Sync,
+{
+    async fn get_by_basename(&self, name: &[u8]) -> Result<Option<Node>, Error> {
+        Ok(self.as_ref().get(name).cloned())
+    }
+
+    fn list(&self) -> BoxStream<Result<Node, Error>> {
+        Box::pin(tokio_stream::iter(
+            self.as_ref().iter().map(|(_, v)| Ok(v.clone())),
+        ))
+    }
+}
diff --git a/tvix/castore/src/fs/virtiofs.rs b/tvix/castore/src/fs/virtiofs.rs
new file mode 100644
index 0000000000..d63e2f2bdd
--- /dev/null
+++ b/tvix/castore/src/fs/virtiofs.rs
@@ -0,0 +1,238 @@
+use std::{
+    convert, error, fmt, io,
+    ops::Deref,
+    path::Path,
+    sync::{Arc, MutexGuard, RwLock},
+};
+
+use fuse_backend_rs::{
+    api::{filesystem::FileSystem, server::Server},
+    transport::{FsCacheReqHandler, Reader, VirtioFsWriter},
+};
+use tracing::error;
+use vhost::vhost_user::{
+    Listener, SlaveFsCacheReq, VhostUserProtocolFeatures, VhostUserVirtioFeatures,
+};
+use vhost_user_backend::{VhostUserBackendMut, VhostUserDaemon, VringMutex, VringState, VringT};
+use virtio_bindings::bindings::virtio_ring::{
+    VIRTIO_RING_F_EVENT_IDX, VIRTIO_RING_F_INDIRECT_DESC,
+};
+use virtio_queue::QueueT;
+use vm_memory::{GuestAddressSpace, GuestMemoryAtomic, GuestMemoryMmap};
+use vmm_sys_util::epoll::EventSet;
+
+const VIRTIO_F_VERSION_1: u32 = 32;
+const NUM_QUEUES: usize = 2;
+const QUEUE_SIZE: usize = 1024;
+
+#[derive(Debug)]
+enum Error {
+    /// Failed to handle non-input event.
+    HandleEventNotEpollIn,
+    /// Failed to handle unknown event.
+    HandleEventUnknownEvent,
+    /// Invalid descriptor chain.
+    InvalidDescriptorChain,
+    /// Failed to handle filesystem requests.
+    #[allow(dead_code)]
+    HandleRequests(fuse_backend_rs::Error),
+    /// Failed to construct new vhost user daemon.
+    NewDaemon,
+    /// Failed to start the vhost user daemon.
+    StartDaemon,
+    /// Failed to wait for the vhost user daemon.
+    WaitDaemon,
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "vhost_user_fs_error: {self:?}")
+    }
+}
+
+impl error::Error for Error {}
+
+impl convert::From<Error> for io::Error {
+    fn from(e: Error) -> Self {
+        io::Error::new(io::ErrorKind::Other, e)
+    }
+}
+
+struct VhostUserFsBackend<FS>
+where
+    FS: FileSystem + Send + Sync,
+{
+    server: Arc<Server<Arc<FS>>>,
+    event_idx: bool,
+    guest_mem: GuestMemoryAtomic<GuestMemoryMmap>,
+    cache_req: Option<SlaveFsCacheReq>,
+}
+
+impl<FS> VhostUserFsBackend<FS>
+where
+    FS: FileSystem + Send + Sync,
+{
+    fn process_queue(&mut self, vring: &mut MutexGuard<VringState>) -> std::io::Result<bool> {
+        let mut used_descs = false;
+
+        while let Some(desc_chain) = vring
+            .get_queue_mut()
+            .pop_descriptor_chain(self.guest_mem.memory())
+        {
+            let memory = desc_chain.memory();
+            let reader = Reader::from_descriptor_chain(memory, desc_chain.clone())
+                .map_err(|_| Error::InvalidDescriptorChain)?;
+            let writer = VirtioFsWriter::new(memory, desc_chain.clone())
+                .map_err(|_| Error::InvalidDescriptorChain)?;
+
+            self.server
+                .handle_message(
+                    reader,
+                    writer.into(),
+                    self.cache_req
+                        .as_mut()
+                        .map(|req| req as &mut dyn FsCacheReqHandler),
+                    None,
+                )
+                .map_err(Error::HandleRequests)?;
+
+            // TODO: Is len 0 correct?
+            if let Err(error) = vring
+                .get_queue_mut()
+                .add_used(memory, desc_chain.head_index(), 0)
+            {
+                error!(?error, "failed to add desc back to ring");
+            }
+
+            // TODO: What happens if we error out before here?
+            used_descs = true;
+        }
+
+        let needs_notification = if self.event_idx {
+            match vring
+                .get_queue_mut()
+                .needs_notification(self.guest_mem.memory().deref())
+            {
+                Ok(needs_notification) => needs_notification,
+                Err(error) => {
+                    error!(?error, "failed to check if queue needs notification");
+                    true
+                }
+            }
+        } else {
+            true
+        };
+
+        if needs_notification {
+            if let Err(error) = vring.signal_used_queue() {
+                error!(?error, "failed to signal used queue");
+            }
+        }
+
+        Ok(used_descs)
+    }
+}
+
+impl<FS> VhostUserBackendMut<VringMutex> for VhostUserFsBackend<FS>
+where
+    FS: FileSystem + Send + Sync,
+{
+    fn num_queues(&self) -> usize {
+        NUM_QUEUES
+    }
+
+    fn max_queue_size(&self) -> usize {
+        QUEUE_SIZE
+    }
+
+    fn features(&self) -> u64 {
+        1 << VIRTIO_F_VERSION_1
+            | 1 << VIRTIO_RING_F_INDIRECT_DESC
+            | 1 << VIRTIO_RING_F_EVENT_IDX
+            | VhostUserVirtioFeatures::PROTOCOL_FEATURES.bits()
+    }
+
+    fn protocol_features(&self) -> VhostUserProtocolFeatures {
+        VhostUserProtocolFeatures::MQ | VhostUserProtocolFeatures::SLAVE_REQ
+    }
+
+    fn set_event_idx(&mut self, enabled: bool) {
+        self.event_idx = enabled;
+    }
+
+    fn update_memory(&mut self, _mem: GuestMemoryAtomic<GuestMemoryMmap>) -> std::io::Result<()> {
+        // This is what most the vhost user implementations do...
+        Ok(())
+    }
+
+    fn set_slave_req_fd(&mut self, cache_req: SlaveFsCacheReq) {
+        self.cache_req = Some(cache_req);
+    }
+
+    fn handle_event(
+        &mut self,
+        device_event: u16,
+        evset: vmm_sys_util::epoll::EventSet,
+        vrings: &[VringMutex],
+        _thread_id: usize,
+    ) -> std::io::Result<bool> {
+        if evset != EventSet::IN {
+            return Err(Error::HandleEventNotEpollIn.into());
+        }
+
+        let mut queue = match device_event {
+            // High priority queue
+            0 => vrings[0].get_mut(),
+            // Regurlar priority queue
+            1 => vrings[1].get_mut(),
+            _ => {
+                return Err(Error::HandleEventUnknownEvent.into());
+            }
+        };
+
+        if self.event_idx {
+            loop {
+                queue
+                    .get_queue_mut()
+                    .enable_notification(self.guest_mem.memory().deref())
+                    .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                if !self.process_queue(&mut queue)? {
+                    break;
+                }
+            }
+        } else {
+            self.process_queue(&mut queue)?;
+        }
+
+        Ok(false)
+    }
+}
+
+pub fn start_virtiofs_daemon<FS, P>(fs: FS, socket: P) -> io::Result<()>
+where
+    FS: FileSystem + Send + Sync + 'static,
+    P: AsRef<Path>,
+{
+    let guest_mem = GuestMemoryAtomic::new(GuestMemoryMmap::new());
+
+    let server = Arc::new(fuse_backend_rs::api::server::Server::new(Arc::new(fs)));
+
+    let backend = Arc::new(RwLock::new(VhostUserFsBackend {
+        server,
+        guest_mem: guest_mem.clone(),
+        event_idx: false,
+        cache_req: None,
+    }));
+
+    let listener = Listener::new(socket, true).unwrap();
+
+    let mut fs_daemon =
+        VhostUserDaemon::new(String::from("vhost-user-fs-tvix-store"), backend, guest_mem)
+            .map_err(|_| Error::NewDaemon)?;
+
+    fs_daemon.start(listener).map_err(|_| Error::StartDaemon)?;
+
+    fs_daemon.wait().map_err(|_| Error::WaitDaemon)?;
+
+    Ok(())
+}
diff --git a/tvix/castore/src/hashing_reader.rs b/tvix/castore/src/hashing_reader.rs
new file mode 100644
index 0000000000..7d78cae587
--- /dev/null
+++ b/tvix/castore/src/hashing_reader.rs
@@ -0,0 +1,89 @@
+use pin_project_lite::pin_project;
+use tokio::io::AsyncRead;
+
+pin_project! {
+    /// Wraps an existing AsyncRead, and allows querying for the digest of all
+    /// data read "through" it.
+    /// The hash function is configurable by type parameter.
+    pub struct HashingReader<R, H>
+    where
+        R: AsyncRead,
+        H: digest::Digest,
+    {
+        #[pin]
+        inner: R,
+        hasher: H,
+    }
+}
+
+pub type B3HashingReader<R> = HashingReader<R, blake3::Hasher>;
+
+impl<R, H> HashingReader<R, H>
+where
+    R: AsyncRead,
+    H: digest::Digest,
+{
+    pub fn from(r: R) -> Self {
+        Self {
+            inner: r,
+            hasher: H::new(),
+        }
+    }
+
+    /// Return the digest.
+    pub fn digest(self) -> digest::Output<H> {
+        self.hasher.finalize()
+    }
+}
+
+impl<R, H> tokio::io::AsyncRead for HashingReader<R, H>
+where
+    R: AsyncRead,
+    H: digest::Digest,
+{
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut tokio::io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let buf_filled_len_before = buf.filled().len();
+
+        let this = self.project();
+        let ret = this.inner.poll_read(cx, buf);
+
+        // write everything new filled into the hasher.
+        this.hasher.update(&buf.filled()[buf_filled_len_before..]);
+
+        ret
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::Cursor;
+
+    use rstest::rstest;
+
+    use crate::fixtures::BLOB_A;
+    use crate::fixtures::BLOB_A_DIGEST;
+    use crate::fixtures::BLOB_B;
+    use crate::fixtures::BLOB_B_DIGEST;
+    use crate::fixtures::EMPTY_BLOB_DIGEST;
+    use crate::{B3Digest, B3HashingReader};
+
+    #[rstest]
+    #[case::blob_a(&BLOB_A, &BLOB_A_DIGEST)]
+    #[case::blob_b(&BLOB_B, &BLOB_B_DIGEST)]
+    #[case::empty_blob(&[], &EMPTY_BLOB_DIGEST)]
+    #[tokio::test]
+    async fn test_b3_hashing_reader(#[case] data: &[u8], #[case] b3_digest: &B3Digest) {
+        let r = Cursor::new(data);
+        let mut hr = B3HashingReader::from(r);
+
+        tokio::io::copy(&mut hr, &mut tokio::io::sink())
+            .await
+            .expect("read must succeed");
+
+        assert_eq!(*b3_digest, hr.digest().into());
+    }
+}
diff --git a/tvix/castore/src/import/archive.rs b/tvix/castore/src/import/archive.rs
new file mode 100644
index 0000000000..cd5b1290e0
--- /dev/null
+++ b/tvix/castore/src/import/archive.rs
@@ -0,0 +1,373 @@
+//! Imports from an archive (tarballs)
+
+use std::collections::HashMap;
+
+use petgraph::graph::{DiGraph, NodeIndex};
+use petgraph::visit::{DfsPostOrder, EdgeRef};
+use petgraph::Direction;
+use tokio::io::AsyncRead;
+use tokio_stream::StreamExt;
+use tokio_tar::Archive;
+use tracing::{instrument, warn, Level};
+
+use crate::blobservice::BlobService;
+use crate::directoryservice::DirectoryService;
+use crate::import::{ingest_entries, IngestionEntry, IngestionError};
+use crate::proto::node::Node;
+
+use super::blobs::{self, ConcurrentBlobUploader};
+
+type TarPathBuf = std::path::PathBuf;
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("unable to construct stream of entries: {0}")]
+    Entries(std::io::Error),
+
+    #[error("unable to read next entry: {0}")]
+    NextEntry(std::io::Error),
+
+    #[error("unable to read path for entry: {0}")]
+    PathRead(std::io::Error),
+
+    #[error("unable to convert path {0} for entry: {1}")]
+    PathConvert(TarPathBuf, std::io::Error),
+
+    #[error("unable to read size field for {0}: {1}")]
+    Size(TarPathBuf, std::io::Error),
+
+    #[error("unable to read mode field for {0}: {1}")]
+    Mode(TarPathBuf, std::io::Error),
+
+    #[error("unable to read link name field for {0}: {1}")]
+    LinkName(TarPathBuf, std::io::Error),
+
+    #[error("unsupported tar entry {0} type: {1:?}")]
+    EntryType(TarPathBuf, tokio_tar::EntryType),
+
+    #[error("symlink missing target {0}")]
+    MissingSymlinkTarget(TarPathBuf),
+
+    #[error("unexpected number of top level directory entries")]
+    UnexpectedNumberOfTopLevelEntries,
+
+    #[error(transparent)]
+    BlobUploadError(#[from] blobs::Error),
+}
+
+/// Ingests elements from the given tar [`Archive`] into a the passed [`BlobService`] and
+/// [`DirectoryService`].
+#[instrument(skip_all, ret(level = Level::TRACE), err)]
+pub async fn ingest_archive<BS, DS, R>(
+    blob_service: BS,
+    directory_service: DS,
+    mut archive: Archive<R>,
+) -> Result<Node, IngestionError<Error>>
+where
+    BS: BlobService + Clone + 'static,
+    DS: DirectoryService,
+    R: AsyncRead + Unpin,
+{
+    // Since tarballs can have entries in any arbitrary order, we need to
+    // buffer all of the directory metadata so we can reorder directory
+    // contents and entries to meet the requires of the castore.
+
+    // In the first phase, collect up all the regular files and symlinks.
+    let mut nodes = IngestionEntryGraph::new();
+
+    let mut blob_uploader = ConcurrentBlobUploader::new(blob_service);
+
+    let mut entries_iter = archive.entries().map_err(Error::Entries)?;
+    while let Some(mut entry) = entries_iter.try_next().await.map_err(Error::NextEntry)? {
+        let tar_path: TarPathBuf = entry.path().map_err(Error::PathRead)?.into();
+
+        // construct a castore PathBuf, which we use in the produced IngestionEntry.
+        let path = crate::path::PathBuf::from_host_path(tar_path.as_path(), true)
+            .map_err(|e| Error::PathConvert(tar_path.clone(), e))?;
+
+        let header = entry.header();
+        let entry = match header.entry_type() {
+            tokio_tar::EntryType::Regular
+            | tokio_tar::EntryType::GNUSparse
+            | tokio_tar::EntryType::Continuous => {
+                let size = header
+                    .size()
+                    .map_err(|e| Error::Size(tar_path.clone(), e))?;
+
+                let digest = blob_uploader
+                    .upload(&path, size, &mut entry)
+                    .await
+                    .map_err(Error::BlobUploadError)?;
+
+                let executable = entry
+                    .header()
+                    .mode()
+                    .map_err(|e| Error::Mode(tar_path, e))?
+                    & 64
+                    != 0;
+
+                IngestionEntry::Regular {
+                    path,
+                    size,
+                    executable,
+                    digest,
+                }
+            }
+            tokio_tar::EntryType::Symlink => IngestionEntry::Symlink {
+                target: entry
+                    .link_name()
+                    .map_err(|e| Error::LinkName(tar_path.clone(), e))?
+                    .ok_or_else(|| Error::MissingSymlinkTarget(tar_path.clone()))?
+                    .into_owned()
+                    .into_os_string()
+                    .into_encoded_bytes(),
+                path,
+            },
+            // Push a bogus directory marker so we can make sure this directoy gets
+            // created. We don't know the digest and size until after reading the full
+            // tarball.
+            tokio_tar::EntryType::Directory => IngestionEntry::Dir { path },
+
+            tokio_tar::EntryType::XGlobalHeader | tokio_tar::EntryType::XHeader => continue,
+
+            entry_type => return Err(Error::EntryType(tar_path, entry_type).into()),
+        };
+
+        nodes.add(entry)?;
+    }
+
+    blob_uploader.join().await.map_err(Error::BlobUploadError)?;
+
+    let root_node = ingest_entries(
+        directory_service,
+        futures::stream::iter(nodes.finalize()?.into_iter().map(Ok)),
+    )
+    .await?;
+
+    Ok(root_node)
+}
+
+/// Keep track of the directory structure of a file tree being ingested. This is used
+/// for ingestion sources which do not provide any ordering or uniqueness guarantees
+/// like tarballs.
+///
+/// If we ingest multiple entries with the same paths and both entries are not directories,
+/// the newer entry will replace the latter entry, disconnecting the old node's children
+/// from the graph.
+///
+/// Once all nodes are ingested a call to [IngestionEntryGraph::finalize] will return
+/// a list of entries compute by performaing a DFS post order traversal of the graph
+/// from the top-level directory entry.
+///
+/// This expects the directory structure to contain a single top-level directory entry.
+/// An error is returned if this is not the case and ingestion will fail.
+struct IngestionEntryGraph {
+    graph: DiGraph<IngestionEntry, ()>,
+    path_to_index: HashMap<crate::path::PathBuf, NodeIndex>,
+    root_node: Option<NodeIndex>,
+}
+
+impl Default for IngestionEntryGraph {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl IngestionEntryGraph {
+    /// Creates a new ingestion entry graph.
+    pub fn new() -> Self {
+        IngestionEntryGraph {
+            graph: DiGraph::new(),
+            path_to_index: HashMap::new(),
+            root_node: None,
+        }
+    }
+
+    /// Adds a new entry to the graph. Parent directories are automatically inserted.
+    /// If a node exists in the graph with the same name as the new entry and both the old
+    /// and new nodes are not directories, the node is replaced and is disconnected from its
+    /// children.
+    pub fn add(&mut self, entry: IngestionEntry) -> Result<NodeIndex, Error> {
+        let path = entry.path().to_owned();
+
+        let index = match self.path_to_index.get(entry.path()) {
+            Some(&index) => {
+                // If either the old entry or new entry are not directories, we'll replace the old
+                // entry.
+                if !entry.is_dir() || !self.get_node(index).is_dir() {
+                    self.replace_node(index, entry);
+                }
+
+                index
+            }
+            None => self.graph.add_node(entry),
+        };
+
+        // for archives, a path with 1 component is the root node
+        if path.components().count() == 1 {
+            // We expect archives to contain a single root node, if there is another root node
+            // entry with a different path name, this is unsupported.
+            if let Some(root_node) = self.root_node {
+                if self.get_node(root_node).path() != path.as_ref() {
+                    return Err(Error::UnexpectedNumberOfTopLevelEntries);
+                }
+            }
+
+            self.root_node = Some(index)
+        } else if let Some(parent_path) = path.parent() {
+            // Recursively add the parent node until it hits the root node.
+            let parent_index = self.add(IngestionEntry::Dir {
+                path: parent_path.to_owned(),
+            })?;
+
+            // Insert an edge from the parent directory to the child entry.
+            self.graph.add_edge(parent_index, index, ());
+        }
+
+        self.path_to_index.insert(path, index);
+
+        Ok(index)
+    }
+
+    /// Traverses the graph in DFS post order and collects the entries into a [Vec<IngestionEntry>].
+    ///
+    /// Unreachable parts of the graph are not included in the result.
+    pub fn finalize(self) -> Result<Vec<IngestionEntry>, Error> {
+        // There must be a root node.
+        let Some(root_node_index) = self.root_node else {
+            return Err(Error::UnexpectedNumberOfTopLevelEntries);
+        };
+
+        // The root node must be a directory.
+        if !self.get_node(root_node_index).is_dir() {
+            return Err(Error::UnexpectedNumberOfTopLevelEntries);
+        }
+
+        let mut traversal = DfsPostOrder::new(&self.graph, root_node_index);
+        let mut nodes = Vec::with_capacity(self.graph.node_count());
+        while let Some(node_index) = traversal.next(&self.graph) {
+            nodes.push(self.get_node(node_index).clone());
+        }
+
+        Ok(nodes)
+    }
+
+    /// Replaces the node with the specified entry. The node's children are disconnected.
+    ///
+    /// This should never be called if both the old and new nodes are directories.
+    fn replace_node(&mut self, index: NodeIndex, new_entry: IngestionEntry) {
+        let entry = self
+            .graph
+            .node_weight_mut(index)
+            .expect("Tvix bug: missing node entry");
+
+        debug_assert!(!(entry.is_dir() && new_entry.is_dir()));
+
+        // Replace the node itself.
+        warn!(
+            "saw duplicate entry in archive at path {:?}. old: {:?} new: {:?}",
+            entry.path(),
+            &entry,
+            &new_entry
+        );
+        *entry = new_entry;
+
+        // Remove any outgoing edges to disconnect the old node's children.
+        let edges = self
+            .graph
+            .edges_directed(index, Direction::Outgoing)
+            .map(|edge| edge.id())
+            .collect::<Vec<_>>();
+        for edge in edges {
+            self.graph.remove_edge(edge);
+        }
+    }
+
+    fn get_node(&self, index: NodeIndex) -> &IngestionEntry {
+        self.graph
+            .node_weight(index)
+            .expect("Tvix bug: missing node entry")
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::import::IngestionEntry;
+    use crate::B3Digest;
+
+    use super::{Error, IngestionEntryGraph};
+
+    use lazy_static::lazy_static;
+    use rstest::rstest;
+
+    lazy_static! {
+        pub static ref EMPTY_DIGEST: B3Digest = blake3::hash(&[]).as_bytes().into();
+        pub static ref DIR_A: IngestionEntry = IngestionEntry::Dir {
+            path: "a".parse().unwrap()
+        };
+        pub static ref DIR_B: IngestionEntry = IngestionEntry::Dir {
+            path: "b".parse().unwrap()
+        };
+        pub static ref DIR_A_B: IngestionEntry = IngestionEntry::Dir {
+            path: "a/b".parse().unwrap()
+        };
+        pub static ref FILE_A: IngestionEntry = IngestionEntry::Regular {
+            path: "a".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_DIGEST.clone(),
+        };
+        pub static ref FILE_A_B: IngestionEntry = IngestionEntry::Regular {
+            path: "a/b".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_DIGEST.clone(),
+        };
+        pub static ref FILE_A_B_C: IngestionEntry = IngestionEntry::Regular {
+            path: "a/b/c".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_DIGEST.clone(),
+        };
+    }
+
+    #[rstest]
+    #[case::implicit_directories(&[&*FILE_A_B_C], &[&*FILE_A_B_C, &*DIR_A_B, &*DIR_A])]
+    #[case::explicit_directories(&[&*DIR_A, &*DIR_A_B, &*FILE_A_B_C], &[&*FILE_A_B_C, &*DIR_A_B, &*DIR_A])]
+    #[case::inaccesible_tree(&[&*DIR_A, &*DIR_A_B, &*FILE_A_B], &[&*FILE_A_B, &*DIR_A])]
+    fn node_ingestion_success(
+        #[case] in_entries: &[&IngestionEntry],
+        #[case] exp_entries: &[&IngestionEntry],
+    ) {
+        let mut nodes = IngestionEntryGraph::new();
+
+        for entry in in_entries {
+            nodes.add((*entry).clone()).expect("failed to add entry");
+        }
+
+        let entries = nodes.finalize().expect("invalid entries");
+
+        let exp_entries: Vec<IngestionEntry> =
+            exp_entries.iter().map(|entry| (*entry).clone()).collect();
+
+        assert_eq!(entries, exp_entries);
+    }
+
+    #[rstest]
+    #[case::no_top_level_entries(&[], Error::UnexpectedNumberOfTopLevelEntries)]
+    #[case::multiple_top_level_dirs(&[&*DIR_A, &*DIR_B], Error::UnexpectedNumberOfTopLevelEntries)]
+    #[case::top_level_file_entry(&[&*FILE_A], Error::UnexpectedNumberOfTopLevelEntries)]
+    fn node_ingestion_error(#[case] in_entries: &[&IngestionEntry], #[case] exp_error: Error) {
+        let mut nodes = IngestionEntryGraph::new();
+
+        let result = (|| {
+            for entry in in_entries {
+                nodes.add((*entry).clone())?;
+            }
+            nodes.finalize()
+        })();
+
+        let error = result.expect_err("expected error");
+        assert_eq!(error.to_string(), exp_error.to_string());
+    }
+}
diff --git a/tvix/castore/src/import/blobs.rs b/tvix/castore/src/import/blobs.rs
new file mode 100644
index 0000000000..8135d871d6
--- /dev/null
+++ b/tvix/castore/src/import/blobs.rs
@@ -0,0 +1,177 @@
+use std::{
+    io::{Cursor, Write},
+    sync::Arc,
+};
+
+use tokio::{
+    io::AsyncRead,
+    sync::Semaphore,
+    task::{JoinError, JoinSet},
+};
+use tokio_util::io::InspectReader;
+
+use crate::{blobservice::BlobService, B3Digest, Path, PathBuf};
+
+/// Files smaller than this threshold, in bytes, are uploaded to the [BlobService] in the
+/// background.
+///
+/// This is a u32 since we acquire a weighted semaphore using the size of the blob.
+/// [Semaphore::acquire_many_owned] takes a u32, so we need to ensure the size of
+/// the blob can be represented using a u32 and will not cause an overflow.
+const CONCURRENT_BLOB_UPLOAD_THRESHOLD: u32 = 1024 * 1024;
+
+/// The maximum amount of bytes allowed to be buffered in memory to perform async blob uploads.
+const MAX_BUFFER_SIZE: usize = 128 * 1024 * 1024;
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("unable to read blob contents for {0}: {1}")]
+    BlobRead(PathBuf, std::io::Error),
+
+    // FUTUREWORK: proper error for blob finalize
+    #[error("unable to finalize blob {0}: {1}")]
+    BlobFinalize(PathBuf, std::io::Error),
+
+    #[error("unexpected size for {path} wanted: {wanted} got: {got}")]
+    UnexpectedSize {
+        path: PathBuf,
+        wanted: u64,
+        got: u64,
+    },
+
+    #[error("blob upload join error: {0}")]
+    JoinError(#[from] JoinError),
+}
+
+/// The concurrent blob uploader provides a mechanism for concurrently uploading small blobs.
+/// This is useful when ingesting from sources like tarballs and archives which each blob entry
+/// must be read sequentially. Ingesting many small blobs sequentially becomes slow due to
+/// round trip time with the blob service. The concurrent blob uploader will buffer small
+/// blobs in memory and upload them to the blob service in the background.
+///
+/// Once all blobs have been uploaded, make sure to call [ConcurrentBlobUploader::join] to wait
+/// for all background jobs to complete and check for any errors.
+pub struct ConcurrentBlobUploader<BS> {
+    blob_service: BS,
+    upload_tasks: JoinSet<Result<(), Error>>,
+    upload_semaphore: Arc<Semaphore>,
+}
+
+impl<BS> ConcurrentBlobUploader<BS>
+where
+    BS: BlobService + Clone + 'static,
+{
+    /// Creates a new concurrent blob uploader which uploads blobs to the provided
+    /// blob service.
+    pub fn new(blob_service: BS) -> Self {
+        Self {
+            blob_service,
+            upload_tasks: JoinSet::new(),
+            upload_semaphore: Arc::new(Semaphore::new(MAX_BUFFER_SIZE)),
+        }
+    }
+
+    /// Uploads a blob to the blob service. If the blob is small enough it will be read to a buffer
+    /// and uploaded in the background.
+    /// This will read the entirety of the provided reader unless an error occurs, even if blobs
+    /// are uploaded in the background..
+    pub async fn upload<R>(
+        &mut self,
+        path: &Path,
+        expected_size: u64,
+        mut r: R,
+    ) -> Result<B3Digest, Error>
+    where
+        R: AsyncRead + Unpin,
+    {
+        if expected_size < CONCURRENT_BLOB_UPLOAD_THRESHOLD as u64 {
+            let mut buffer = Vec::with_capacity(expected_size as usize);
+            let mut hasher = blake3::Hasher::new();
+            let mut reader = InspectReader::new(&mut r, |bytes| {
+                hasher.write_all(bytes).unwrap();
+            });
+
+            let permit = self
+                .upload_semaphore
+                .clone()
+                // This cast is safe because ensure the header_size is less than
+                // CONCURRENT_BLOB_UPLOAD_THRESHOLD which is a u32.
+                .acquire_many_owned(expected_size as u32)
+                .await
+                .unwrap();
+            let size = tokio::io::copy(&mut reader, &mut buffer)
+                .await
+                .map_err(|e| Error::BlobRead(path.into(), e))?;
+            let digest: B3Digest = hasher.finalize().as_bytes().into();
+
+            if size != expected_size {
+                return Err(Error::UnexpectedSize {
+                    path: path.into(),
+                    wanted: expected_size,
+                    got: size,
+                });
+            }
+
+            self.upload_tasks.spawn({
+                let blob_service = self.blob_service.clone();
+                let expected_digest = digest.clone();
+                let path = path.to_owned();
+                let r = Cursor::new(buffer);
+                async move {
+                    let digest = upload_blob(&blob_service, &path, expected_size, r).await?;
+
+                    assert_eq!(digest, expected_digest, "Tvix bug: blob digest mismatch");
+
+                    // Make sure we hold the permit until we finish writing the blob
+                    // to the [BlobService].
+                    drop(permit);
+                    Ok(())
+                }
+            });
+
+            return Ok(digest);
+        }
+
+        upload_blob(&self.blob_service, path, expected_size, r).await
+    }
+
+    /// Waits for all background upload jobs to complete, returning any upload errors.
+    pub async fn join(mut self) -> Result<(), Error> {
+        while let Some(result) = self.upload_tasks.join_next().await {
+            result??;
+        }
+        Ok(())
+    }
+}
+
+async fn upload_blob<BS, R>(
+    blob_service: &BS,
+    path: &Path,
+    expected_size: u64,
+    mut r: R,
+) -> Result<B3Digest, Error>
+where
+    BS: BlobService,
+    R: AsyncRead + Unpin,
+{
+    let mut writer = blob_service.open_write().await;
+
+    let size = tokio::io::copy(&mut r, &mut writer)
+        .await
+        .map_err(|e| Error::BlobRead(path.into(), e))?;
+
+    let digest = writer
+        .close()
+        .await
+        .map_err(|e| Error::BlobFinalize(path.into(), e))?;
+
+    if size != expected_size {
+        return Err(Error::UnexpectedSize {
+            path: path.into(),
+            wanted: expected_size,
+            got: size,
+        });
+    }
+
+    Ok(digest)
+}
diff --git a/tvix/castore/src/import/error.rs b/tvix/castore/src/import/error.rs
new file mode 100644
index 0000000000..e3fba617e0
--- /dev/null
+++ b/tvix/castore/src/import/error.rs
@@ -0,0 +1,20 @@
+use super::PathBuf;
+
+use crate::Error as CastoreError;
+
+/// Represents all error types that emitted by ingest_entries.
+/// It can represent errors uploading individual Directories and finalizing
+/// the upload.
+/// It also contains a generic error kind that'll carry ingestion-method
+/// specific errors.
+#[derive(Debug, thiserror::Error)]
+pub enum IngestionError<E: std::fmt::Display> {
+    #[error("error from producer: {0}")]
+    Producer(#[from] E),
+
+    #[error("failed to upload directory at {0}: {1}")]
+    UploadDirectoryError(PathBuf, CastoreError),
+
+    #[error("failed to finalize directory upload: {0}")]
+    FinalizeDirectoryUpload(CastoreError),
+}
diff --git a/tvix/castore/src/import/fs.rs b/tvix/castore/src/import/fs.rs
new file mode 100644
index 0000000000..265d772355
--- /dev/null
+++ b/tvix/castore/src/import/fs.rs
@@ -0,0 +1,198 @@
+//! Import from a real filesystem.
+
+use futures::stream::BoxStream;
+use futures::StreamExt;
+use std::fs::FileType;
+use std::os::unix::ffi::OsStringExt;
+use std::os::unix::fs::MetadataExt;
+use std::os::unix::fs::PermissionsExt;
+use tracing::instrument;
+use tracing::Span;
+use tracing_indicatif::span_ext::IndicatifSpanExt;
+use walkdir::DirEntry;
+use walkdir::WalkDir;
+
+use crate::blobservice::BlobService;
+use crate::directoryservice::DirectoryService;
+use crate::proto::node::Node;
+use crate::B3Digest;
+
+use super::ingest_entries;
+use super::IngestionEntry;
+use super::IngestionError;
+
+/// Ingests the contents at a given path into the tvix store, interacting with a [BlobService] and
+/// [DirectoryService]. It returns the root node or an error.
+///
+/// It does not follow symlinks at the root, they will be ingested as actual symlinks.
+///
+/// This function will walk the filesystem using `walkdir` and will consume
+/// `O(#number of entries)` space.
+#[instrument(skip(blob_service, directory_service), fields(path), err)]
+pub async fn ingest_path<BS, DS, P>(
+    blob_service: BS,
+    directory_service: DS,
+    path: P,
+) -> Result<Node, IngestionError<Error>>
+where
+    P: AsRef<std::path::Path> + std::fmt::Debug,
+    BS: BlobService + Clone,
+    DS: DirectoryService,
+{
+    Span::current().pb_start();
+    let iter = WalkDir::new(path.as_ref())
+        .follow_links(false)
+        .follow_root_links(false)
+        .contents_first(true)
+        .into_iter();
+
+    let entries = dir_entries_to_ingestion_stream(blob_service, iter, path.as_ref());
+    ingest_entries(
+        directory_service,
+        entries.inspect(|e| {
+            if let Ok(e) = e {
+                let s = Span::current();
+                s.pb_inc(1);
+                s.pb_set_message(&format!("Ingesting {}", e.path()));
+            }
+        }),
+    )
+    .await
+}
+
+/// Converts an iterator of [walkdir::DirEntry]s into a stream of ingestion entries.
+/// This can then be fed into [ingest_entries] to ingest all the entries into the castore.
+///
+/// The produced stream is buffered, so uploads can happen concurrently.
+///
+/// The root is the [Path] in the filesystem that is being ingested into the castore.
+pub fn dir_entries_to_ingestion_stream<'a, BS, I>(
+    blob_service: BS,
+    iter: I,
+    root: &'a std::path::Path,
+) -> BoxStream<'a, Result<IngestionEntry, Error>>
+where
+    BS: BlobService + Clone + 'a,
+    I: Iterator<Item = Result<DirEntry, walkdir::Error>> + Send + 'a,
+{
+    let prefix = root.parent().unwrap_or_else(|| std::path::Path::new(""));
+
+    Box::pin(
+        futures::stream::iter(iter)
+            .map(move |x| {
+                let blob_service = blob_service.clone();
+                async move {
+                    match x {
+                        Ok(dir_entry) => {
+                            dir_entry_to_ingestion_entry(blob_service, &dir_entry, prefix).await
+                        }
+                        Err(e) => Err(Error::Stat(
+                            prefix.to_path_buf(),
+                            e.into_io_error().expect("walkdir err must be some"),
+                        )),
+                    }
+                }
+            })
+            .buffered(50),
+    )
+}
+
+/// Converts a [walkdir::DirEntry] into an [IngestionEntry], uploading blobs to the
+/// provided [BlobService].
+///
+/// The prefix path is stripped from the path of each entry. This is usually the parent path
+/// of the path being ingested so that the last element of the stream only has one component.
+pub async fn dir_entry_to_ingestion_entry<BS>(
+    blob_service: BS,
+    entry: &DirEntry,
+    prefix: &std::path::Path,
+) -> Result<IngestionEntry, Error>
+where
+    BS: BlobService,
+{
+    let file_type = entry.file_type();
+
+    let fs_path = entry
+        .path()
+        .strip_prefix(prefix)
+        .expect("Tvix bug: failed to strip root path prefix");
+
+    // convert to castore PathBuf
+    let path = crate::path::PathBuf::from_host_path(fs_path, false)
+        .unwrap_or_else(|e| panic!("Tvix bug: walkdir direntry cannot be parsed: {}", e));
+
+    if file_type.is_dir() {
+        Ok(IngestionEntry::Dir { path })
+    } else if file_type.is_symlink() {
+        let target = std::fs::read_link(entry.path())
+            .map_err(|e| Error::Stat(entry.path().to_path_buf(), e))?
+            .into_os_string()
+            .into_vec();
+
+        Ok(IngestionEntry::Symlink { path, target })
+    } else if file_type.is_file() {
+        let metadata = entry
+            .metadata()
+            .map_err(|e| Error::Stat(entry.path().to_path_buf(), e.into()))?;
+
+        let digest = upload_blob(blob_service, entry.path().to_path_buf()).await?;
+
+        Ok(IngestionEntry::Regular {
+            path,
+            size: metadata.size(),
+            // If it's executable by the user, it'll become executable.
+            // This matches nix's dump() function behaviour.
+            executable: metadata.permissions().mode() & 64 != 0,
+            digest,
+        })
+    } else {
+        return Err(Error::FileType(fs_path.to_path_buf(), file_type));
+    }
+}
+
+/// Uploads the file at the provided [Path] the the [BlobService].
+#[instrument(skip(blob_service), fields(path), err)]
+async fn upload_blob<BS>(
+    blob_service: BS,
+    path: impl AsRef<std::path::Path>,
+) -> Result<B3Digest, Error>
+where
+    BS: BlobService,
+{
+    let mut file = match tokio::fs::File::open(path.as_ref()).await {
+        Ok(file) => file,
+        Err(e) => return Err(Error::BlobRead(path.as_ref().to_path_buf(), e)),
+    };
+
+    let mut writer = blob_service.open_write().await;
+
+    if let Err(e) = tokio::io::copy(&mut file, &mut writer).await {
+        return Err(Error::BlobRead(path.as_ref().to_path_buf(), e));
+    };
+
+    let digest = writer
+        .close()
+        .await
+        .map_err(|e| Error::BlobFinalize(path.as_ref().to_path_buf(), e))?;
+
+    Ok(digest)
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("unsupported file type at {0}: {1:?}")]
+    FileType(std::path::PathBuf, FileType),
+
+    #[error("unable to stat {0}: {1}")]
+    Stat(std::path::PathBuf, std::io::Error),
+
+    #[error("unable to open {0}: {1}")]
+    Open(std::path::PathBuf, std::io::Error),
+
+    #[error("unable to read {0}: {1}")]
+    BlobRead(std::path::PathBuf, std::io::Error),
+
+    // TODO: proper error for blob finalize
+    #[error("unable to finalize blob {0}: {1}")]
+    BlobFinalize(std::path::PathBuf, std::io::Error),
+}
diff --git a/tvix/castore/src/import/mod.rs b/tvix/castore/src/import/mod.rs
new file mode 100644
index 0000000000..c57c5bcada
--- /dev/null
+++ b/tvix/castore/src/import/mod.rs
@@ -0,0 +1,345 @@
+//! The main library function here is [ingest_entries], receiving a stream of
+//! [IngestionEntry].
+//!
+//! Specific implementations, such as ingesting from the filesystem, live in
+//! child modules.
+
+use crate::directoryservice::DirectoryPutter;
+use crate::directoryservice::DirectoryService;
+use crate::path::{Path, PathBuf};
+use crate::proto::node::Node;
+use crate::proto::Directory;
+use crate::proto::DirectoryNode;
+use crate::proto::FileNode;
+use crate::proto::SymlinkNode;
+use crate::B3Digest;
+use futures::{Stream, StreamExt};
+use tracing::Span;
+use tracing_indicatif::span_ext::IndicatifSpanExt;
+
+use tracing::Level;
+
+use std::collections::HashMap;
+use tracing::instrument;
+
+mod error;
+pub use error::IngestionError;
+
+pub mod archive;
+pub mod blobs;
+pub mod fs;
+
+/// Ingests [IngestionEntry] from the given stream into a the passed [DirectoryService].
+/// On success, returns the root [Node].
+///
+/// The stream must have the following invariants:
+/// - All children entries must come before their parents.
+/// - The last entry must be the root node which must have a single path component.
+/// - Every entry should have a unique path, and only consist of normal components.
+///   This means, no windows path prefixes, absolute paths, `.` or `..`.
+/// - All referenced directories must have an associated directory entry in the stream.
+///   This means if there is a file entry for `foo/bar`, there must also be a `foo` directory
+///   entry.
+///
+/// Internally we maintain a [HashMap] of [PathBuf] to partially populated [Directory] at that
+/// path. Once we receive an [IngestionEntry] for the directory itself, we remove it from the
+/// map and upload it to the [DirectoryService] through a lazily created [DirectoryPutter].
+///
+/// On success, returns the root node.
+#[instrument(skip_all, fields(indicatif.pb_show=1), ret(level = Level::TRACE), err)]
+pub async fn ingest_entries<DS, S, E>(
+    directory_service: DS,
+    mut entries: S,
+) -> Result<Node, IngestionError<E>>
+where
+    DS: DirectoryService,
+    S: Stream<Item = Result<IngestionEntry, E>> + Send + std::marker::Unpin,
+    E: std::error::Error,
+{
+    // For a given path, this holds the [Directory] structs as they are populated.
+    let mut directories: HashMap<PathBuf, Directory> = HashMap::default();
+    let mut maybe_directory_putter: Option<Box<dyn DirectoryPutter>> = None;
+
+    Span::current().pb_start();
+
+    let root_node = loop {
+        let mut entry = entries
+            .next()
+            .await
+            // The last entry of the stream must have 1 path component, after which
+            // we break the loop manually.
+            .expect("Tvix bug: unexpected end of stream")?;
+
+        let name = entry
+            .path()
+            .file_name()
+            // If this is the root node, it will have an empty name.
+            .unwrap_or_default()
+            .to_owned()
+            .into();
+
+        let node = match &mut entry {
+            IngestionEntry::Dir { .. } => {
+                // If the entry is a directory, we traversed all its children (and
+                // populated it in `directories`).
+                // If we don't have it in directories, it's a directory without
+                // children.
+                let directory = directories
+                    .remove(entry.path())
+                    // In that case, it contained no children
+                    .unwrap_or_default();
+
+                let directory_size = directory.size();
+                let directory_digest = directory.digest();
+
+                // Use the directory_putter to upload the directory.
+                // If we don't have one yet (as that's the first one to upload),
+                // initialize the putter.
+                maybe_directory_putter
+                    .get_or_insert_with(|| directory_service.put_multiple_start())
+                    .put(directory)
+                    .await
+                    .map_err(|e| {
+                        IngestionError::UploadDirectoryError(entry.path().to_owned(), e)
+                    })?;
+
+                Node::Directory(DirectoryNode {
+                    name,
+                    digest: directory_digest.into(),
+                    size: directory_size,
+                })
+            }
+            IngestionEntry::Symlink { ref target, .. } => Node::Symlink(SymlinkNode {
+                name,
+                target: target.to_owned().into(),
+            }),
+            IngestionEntry::Regular {
+                size,
+                executable,
+                digest,
+                ..
+            } => Node::File(FileNode {
+                name,
+                digest: digest.to_owned().into(),
+                size: *size,
+                executable: *executable,
+            }),
+        };
+
+        let parent = entry
+            .path()
+            .parent()
+            .expect("Tvix bug: got entry with root node");
+
+        if parent == crate::Path::ROOT {
+            break node;
+        } else {
+            // record node in parent directory, creating a new [Directory] if not there yet.
+            directories.entry(parent.to_owned()).or_default().add(node);
+        }
+    };
+
+    assert!(
+        entries.count().await == 0,
+        "Tvix bug: left over elements in the stream"
+    );
+
+    assert!(
+        directories.is_empty(),
+        "Tvix bug: left over directories after processing ingestion stream"
+    );
+
+    // if there were directories uploaded, make sure we flush the putter, so
+    // they're all persisted to the backend.
+    if let Some(mut directory_putter) = maybe_directory_putter {
+        #[cfg_attr(not(debug_assertions), allow(unused))]
+        let root_directory_digest = directory_putter
+            .close()
+            .await
+            .map_err(|e| IngestionError::FinalizeDirectoryUpload(e))?;
+
+        #[cfg(debug_assertions)]
+        {
+            if let Node::Directory(directory_node) = &root_node {
+                debug_assert_eq!(
+                    root_directory_digest,
+                    directory_node
+                        .digest
+                        .to_vec()
+                        .try_into()
+                        .expect("invalid digest len")
+                )
+            } else {
+                unreachable!("Tvix bug: directory putter initialized but no root directory node");
+            }
+        }
+    };
+
+    Ok(root_node)
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub enum IngestionEntry {
+    Regular {
+        path: PathBuf,
+        size: u64,
+        executable: bool,
+        digest: B3Digest,
+    },
+    Symlink {
+        path: PathBuf,
+        target: Vec<u8>,
+    },
+    Dir {
+        path: PathBuf,
+    },
+}
+
+impl IngestionEntry {
+    fn path(&self) -> &Path {
+        match self {
+            IngestionEntry::Regular { path, .. } => path,
+            IngestionEntry::Symlink { path, .. } => path,
+            IngestionEntry::Dir { path } => path,
+        }
+    }
+
+    fn is_dir(&self) -> bool {
+        matches!(self, IngestionEntry::Dir { .. })
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use rstest::rstest;
+
+    use crate::fixtures::{DIRECTORY_COMPLICATED, DIRECTORY_WITH_KEEP, EMPTY_BLOB_DIGEST};
+    use crate::proto::node::Node;
+    use crate::proto::{Directory, DirectoryNode, FileNode, SymlinkNode};
+    use crate::{directoryservice::MemoryDirectoryService, fixtures::DUMMY_DIGEST};
+
+    use super::ingest_entries;
+    use super::IngestionEntry;
+
+    #[rstest]
+    #[case::single_file(vec![IngestionEntry::Regular {
+        path: "foo".parse().unwrap(),
+        size: 42,
+        executable: true,
+        digest: DUMMY_DIGEST.clone(),
+    }],
+        Node::File(FileNode { name: "foo".into(), digest: DUMMY_DIGEST.clone().into(), size: 42, executable: true }
+    ))]
+    #[case::single_symlink(vec![IngestionEntry::Symlink {
+        path: "foo".parse().unwrap(),
+        target: b"blub".into(),
+    }],
+        Node::Symlink(SymlinkNode { name: "foo".into(), target: "blub".into()})
+    )]
+    #[case::single_dir(vec![IngestionEntry::Dir {
+        path: "foo".parse().unwrap(),
+    }],
+        Node::Directory(DirectoryNode { name: "foo".into(), digest: Directory::default().digest().into(), size: Directory::default().size()})
+    )]
+    #[case::dir_with_keep(vec![
+        IngestionEntry::Regular {
+            path: "foo/.keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+        IngestionEntry::Dir {
+            path: "foo".parse().unwrap(),
+        },
+    ],
+        Node::Directory(DirectoryNode { name: "foo".into(), digest: DIRECTORY_WITH_KEEP.digest().into(), size: DIRECTORY_WITH_KEEP.size() })
+    )]
+    /// This is intentionally a bit unsorted, though it still satisfies all
+    /// requirements we have on the order of elements in the stream.
+    #[case::directory_complicated(vec![
+        IngestionEntry::Regular {
+            path: "blub/.keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+        IngestionEntry::Regular {
+            path: "blub/keep/.keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+        IngestionEntry::Dir {
+            path: "blub/keep".parse().unwrap(),
+        },
+        IngestionEntry::Symlink {
+            path: "blub/aa".parse().unwrap(),
+            target: b"/nix/store/somewhereelse".into(),
+        },
+        IngestionEntry::Dir {
+            path: "blub".parse().unwrap(),
+        },
+    ],
+        Node::Directory(DirectoryNode { name: "blub".into(), digest: DIRECTORY_COMPLICATED.digest().into(), size:DIRECTORY_COMPLICATED.size() })
+    )]
+    #[tokio::test]
+    async fn test_ingestion(#[case] entries: Vec<IngestionEntry>, #[case] exp_root_node: Node) {
+        let directory_service = MemoryDirectoryService::default();
+
+        let root_node = ingest_entries(
+            directory_service.clone(),
+            futures::stream::iter(entries.into_iter().map(Ok::<_, std::io::Error>)),
+        )
+        .await
+        .expect("must succeed");
+
+        assert_eq!(exp_root_node, root_node, "root node should match");
+    }
+
+    #[rstest]
+    #[should_panic]
+    #[case::empty_entries(vec![])]
+    #[should_panic]
+    #[case::missing_intermediate_dir(vec![
+        IngestionEntry::Regular {
+            path: "blub/.keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+    ])]
+    #[should_panic]
+    #[case::leaf_after_parent(vec![
+        IngestionEntry::Dir {
+            path: "blub".parse().unwrap(),
+        },
+        IngestionEntry::Regular {
+            path: "blub/.keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+    ])]
+    #[should_panic]
+    #[case::root_in_entry(vec![
+        IngestionEntry::Regular {
+            path: ".keep".parse().unwrap(),
+            size: 0,
+            executable: false,
+            digest: EMPTY_BLOB_DIGEST.clone(),
+        },
+        IngestionEntry::Dir {
+            path: "".parse().unwrap(),
+        },
+    ])]
+    #[tokio::test]
+    async fn test_ingestion_fail(#[case] entries: Vec<IngestionEntry>) {
+        let directory_service = MemoryDirectoryService::default();
+
+        let _ = ingest_entries(
+            directory_service.clone(),
+            futures::stream::iter(entries.into_iter().map(Ok::<_, std::io::Error>)),
+        )
+        .await;
+    }
+}
diff --git a/tvix/castore/src/lib.rs b/tvix/castore/src/lib.rs
new file mode 100644
index 0000000000..bdc533a8c5
--- /dev/null
+++ b/tvix/castore/src/lib.rs
@@ -0,0 +1,30 @@
+mod digests;
+mod errors;
+mod hashing_reader;
+
+pub mod blobservice;
+pub mod directoryservice;
+pub mod fixtures;
+
+#[cfg(feature = "fs")]
+pub mod fs;
+
+mod path;
+pub use path::{Path, PathBuf};
+
+pub mod import;
+pub mod proto;
+pub mod tonic;
+
+pub use digests::{B3Digest, B3_LEN};
+pub use errors::Error;
+pub use hashing_reader::{B3HashingReader, HashingReader};
+
+#[cfg(test)]
+mod tests;
+
+// That's what the rstest_reuse README asks us do, and fails about being unable
+// to find rstest_reuse in crate root.
+#[cfg(test)]
+#[allow(clippy::single_component_path_imports)]
+use rstest_reuse;
diff --git a/tvix/castore/src/path.rs b/tvix/castore/src/path.rs
new file mode 100644
index 0000000000..fcc2bd01fb
--- /dev/null
+++ b/tvix/castore/src/path.rs
@@ -0,0 +1,446 @@
+//! Contains data structures to deal with Paths in the tvix-castore model.
+
+use std::{
+    borrow::Borrow,
+    fmt::{self, Debug, Display},
+    mem,
+    ops::Deref,
+    str::FromStr,
+};
+
+use bstr::ByteSlice;
+
+use crate::proto::validate_node_name;
+
+/// Represents a Path in the castore model.
+/// These are always relative, and platform-independent, which distinguishes
+/// them from the ones provided in the standard library.
+#[derive(Eq, Hash, PartialEq)]
+#[repr(transparent)] // SAFETY: Representation has to match [u8]
+pub struct Path {
+    // As node names in the castore model cannot contain slashes,
+    // we use them as component separators here.
+    inner: [u8],
+}
+
+#[allow(dead_code)]
+impl Path {
+    // SAFETY: The empty path is valid.
+    pub const ROOT: &'static Path = unsafe { Path::from_bytes_unchecked(&[]) };
+
+    /// Convert a byte slice to a path, without checking validity.
+    const unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Path {
+        // SAFETY: &[u8] and &Path have the same representation.
+        unsafe { mem::transmute(bytes) }
+    }
+
+    fn from_bytes(bytes: &[u8]) -> Option<&Path> {
+        if !bytes.is_empty() {
+            // Ensure all components are valid castore node names.
+            for component in bytes.split_str(b"/") {
+                validate_node_name(component).ok()?;
+            }
+        }
+
+        // SAFETY: We have verified that the path contains no empty components.
+        Some(unsafe { Path::from_bytes_unchecked(bytes) })
+    }
+
+    pub fn into_boxed_bytes(self: Box<Path>) -> Box<[u8]> {
+        // SAFETY: Box<Path> and Box<[u8]> have the same representation.
+        unsafe { mem::transmute(self) }
+    }
+
+    /// Returns the path without its final component, if there is one.
+    ///
+    /// Note that the parent of a bare file name is [Path::ROOT].
+    /// [Path::ROOT] is the only path without a parent.
+    pub fn parent(&self) -> Option<&Path> {
+        // The root does not have a parent.
+        if self.inner.is_empty() {
+            return None;
+        }
+
+        Some(
+            if let Some((parent, _file_name)) = self.inner.rsplit_once_str(b"/") {
+                // SAFETY: The parent of a valid Path is a valid Path.
+                unsafe { Path::from_bytes_unchecked(parent) }
+            } else {
+                // The parent of a bare file name is the root.
+                Path::ROOT
+            },
+        )
+    }
+
+    /// Creates a PathBuf with `name` adjoined to self.
+    pub fn try_join(&self, name: &[u8]) -> Result<PathBuf, std::io::Error> {
+        let mut v = PathBuf::with_capacity(self.inner.len() + name.len() + 1);
+        v.inner.extend_from_slice(&self.inner);
+        v.try_push(name)?;
+
+        Ok(v)
+    }
+
+    /// Produces an iterator over the components of the path, which are
+    /// individual byte slices.
+    /// In case the path is empty, an empty iterator is returned.
+    pub fn components(&self) -> impl Iterator<Item = &[u8]> {
+        let mut iter = self.inner.split_str(&b"/");
+
+        // We don't want to return an empty element, consume it if it's the only one.
+        if self.inner.is_empty() {
+            let _ = iter.next();
+        }
+
+        iter
+    }
+
+    /// Returns the final component of the Path, if there is one.
+    pub fn file_name(&self) -> Option<&[u8]> {
+        self.components().last()
+    }
+
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.inner
+    }
+}
+
+impl Debug for Path {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        Debug::fmt(self.inner.as_bstr(), f)
+    }
+}
+
+impl Display for Path {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        Display::fmt(self.inner.as_bstr(), f)
+    }
+}
+
+impl AsRef<Path> for Path {
+    fn as_ref(&self) -> &Path {
+        self
+    }
+}
+
+/// Represents a owned PathBuf in the castore model.
+/// These are always relative, and platform-independent, which distinguishes
+/// them from the ones provided in the standard library.
+#[derive(Clone, Default, Eq, Hash, PartialEq)]
+pub struct PathBuf {
+    inner: Vec<u8>,
+}
+
+impl Deref for PathBuf {
+    type Target = Path;
+
+    fn deref(&self) -> &Self::Target {
+        // SAFETY: PathBuf always contains a valid Path.
+        unsafe { Path::from_bytes_unchecked(&self.inner) }
+    }
+}
+
+impl AsRef<Path> for PathBuf {
+    fn as_ref(&self) -> &Path {
+        self
+    }
+}
+
+impl ToOwned for Path {
+    type Owned = PathBuf;
+
+    fn to_owned(&self) -> Self::Owned {
+        PathBuf {
+            inner: self.inner.to_owned(),
+        }
+    }
+}
+
+impl Borrow<Path> for PathBuf {
+    fn borrow(&self) -> &Path {
+        self
+    }
+}
+
+impl From<Box<Path>> for PathBuf {
+    fn from(value: Box<Path>) -> Self {
+        // SAFETY: Box<Path> is always a valid path.
+        unsafe { PathBuf::from_bytes_unchecked(value.into_boxed_bytes().into_vec()) }
+    }
+}
+
+impl From<&Path> for PathBuf {
+    fn from(value: &Path) -> Self {
+        value.to_owned()
+    }
+}
+
+impl FromStr for PathBuf {
+    type Err = std::io::Error;
+
+    fn from_str(s: &str) -> Result<PathBuf, Self::Err> {
+        Ok(Path::from_bytes(s.as_bytes())
+            .ok_or(std::io::ErrorKind::InvalidData)?
+            .to_owned())
+    }
+}
+
+impl Debug for PathBuf {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        Debug::fmt(&**self, f)
+    }
+}
+
+impl Display for PathBuf {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        Display::fmt(&**self, f)
+    }
+}
+
+impl PathBuf {
+    pub fn new() -> PathBuf {
+        Self::default()
+    }
+
+    pub fn with_capacity(capacity: usize) -> PathBuf {
+        // SAFETY: The empty path is a valid path.
+        Self {
+            inner: Vec::with_capacity(capacity),
+        }
+    }
+
+    /// Adjoins `name` to self.
+    pub fn try_push(&mut self, name: &[u8]) -> Result<(), std::io::Error> {
+        validate_node_name(name).map_err(|_| std::io::ErrorKind::InvalidData)?;
+
+        if !self.inner.is_empty() {
+            self.inner.push(b'/');
+        }
+
+        self.inner.extend_from_slice(name);
+
+        Ok(())
+    }
+
+    /// Convert a byte vector to a PathBuf, without checking validity.
+    unsafe fn from_bytes_unchecked(bytes: Vec<u8>) -> PathBuf {
+        PathBuf { inner: bytes }
+    }
+
+    /// Convert from a [&std::path::Path] to [Self].
+    ///
+    /// - Self uses `/` as path separator.
+    /// - Absolute paths are always rejected, are are these with custom prefixes.
+    /// - Repeated separators are deduplicated.
+    /// - Occurrences of `.` are normalized away.
+    /// - A trailing slash is normalized away.
+    ///
+    /// A `canonicalize_dotdot` boolean controls whether `..` will get
+    /// canonicalized if possible, or should return an error.
+    ///
+    /// For more exotic paths, this conversion might produce different results
+    /// on different platforms, due to different underlying byte
+    /// representations, which is why it's restricted to unix for now.
+    #[cfg(unix)]
+    pub fn from_host_path(
+        host_path: &std::path::Path,
+        canonicalize_dotdot: bool,
+    ) -> Result<Self, std::io::Error> {
+        let mut p = PathBuf::with_capacity(host_path.as_os_str().len());
+
+        for component in host_path.components() {
+            match component {
+                std::path::Component::Prefix(_) | std::path::Component::RootDir => {
+                    return Err(std::io::Error::new(
+                        std::io::ErrorKind::InvalidData,
+                        "found disallowed prefix or rootdir",
+                    ))
+                }
+                std::path::Component::CurDir => continue, // ignore
+                std::path::Component::ParentDir => {
+                    if canonicalize_dotdot {
+                        // Try popping the last element from the path being constructed.
+                        // FUTUREWORK: pop method?
+                        p = p
+                            .parent()
+                            .ok_or_else(|| {
+                                std::io::Error::new(
+                                    std::io::ErrorKind::InvalidData,
+                                    "found .. going too far up",
+                                )
+                            })?
+                            .to_owned();
+                    } else {
+                        return Err(std::io::Error::new(
+                            std::io::ErrorKind::InvalidData,
+                            "found disallowed ..",
+                        ));
+                    }
+                }
+                std::path::Component::Normal(s) => {
+                    // append the new component to the path being constructed.
+                    p.try_push(s.as_encoded_bytes()).map_err(|_| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::InvalidData,
+                            "encountered invalid node in sub_path component",
+                        )
+                    })?
+                }
+            }
+        }
+
+        Ok(p)
+    }
+
+    pub fn into_boxed_path(self) -> Box<Path> {
+        // SAFETY: Box<[u8]> and Box<Path> have the same representation,
+        // and PathBuf always contains a valid Path.
+        unsafe { mem::transmute(self.inner.into_boxed_slice()) }
+    }
+
+    pub fn into_bytes(self) -> Vec<u8> {
+        self.inner
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::{Path, PathBuf};
+    use bstr::ByteSlice;
+    use rstest::rstest;
+
+    // TODO: add some manual tests including invalid UTF-8 (hard to express
+    // with rstest)
+
+    #[rstest]
+    #[case::empty("", 0)]
+    #[case("a", 1)]
+    #[case("a/b", 2)]
+    #[case("a/b/c", 3)]
+    // add two slightly more cursed variants.
+    // Technically nothing prevents us from representing this with castore,
+    // but maybe we want to disallow constructing paths like this as it's a
+    // bad idea.
+    #[case::cursed("C:\\a/b", 2)]
+    #[case::cursed("\\\\tvix-store", 1)]
+    pub fn from_str(#[case] s: &str, #[case] num_components: usize) {
+        let p: PathBuf = s.parse().expect("must parse");
+
+        assert_eq!(s.as_bytes(), p.as_bytes(), "inner bytes mismatch");
+        assert_eq!(
+            num_components,
+            p.components().count(),
+            "number of components mismatch"
+        );
+    }
+
+    #[rstest]
+    #[case::absolute("/a/b")]
+    #[case::two_forward_slashes_start("//a/b")]
+    #[case::two_forward_slashes_middle("a/b//c/d")]
+    #[case::trailing_slash("a/b/")]
+    #[case::dot(".")]
+    #[case::dotdot("..")]
+    #[case::dot_start("./a")]
+    #[case::dotdot_start("../a")]
+    #[case::dot_middle("a/./b")]
+    #[case::dotdot_middle("a/../b")]
+    #[case::dot_end("a/b/.")]
+    #[case::dotdot_end("a/b/..")]
+    #[case::null("fo\0o")]
+    pub fn from_str_fail(#[case] s: &str) {
+        s.parse::<PathBuf>().expect_err("must fail");
+    }
+
+    #[rstest]
+    #[case("foo", "")]
+    #[case("foo/bar", "foo")]
+    #[case("foo2/bar2", "foo2")]
+    #[case("foo/bar/baz", "foo/bar")]
+    pub fn parent(#[case] p: PathBuf, #[case] exp_parent: PathBuf) {
+        assert_eq!(Some(&*exp_parent), p.parent());
+    }
+
+    #[rstest]
+    pub fn no_parent() {
+        assert!(Path::ROOT.parent().is_none());
+    }
+
+    #[rstest]
+    #[case("a", "b", "a/b")]
+    #[case("a", "b", "a/b")]
+    pub fn join_push(#[case] mut p: PathBuf, #[case] name: &str, #[case] exp_p: PathBuf) {
+        assert_eq!(exp_p, p.try_join(name.as_bytes()).expect("join failed"));
+        p.try_push(name.as_bytes()).expect("push failed");
+        assert_eq!(exp_p, p);
+    }
+
+    #[rstest]
+    #[case("a", "/")]
+    #[case("a", "")]
+    #[case("a", "b/c")]
+    #[case("", "/")]
+    #[case("", "")]
+    #[case("", "b/c")]
+    #[case("", ".")]
+    #[case("", "..")]
+    pub fn join_push_fail(#[case] mut p: PathBuf, #[case] name: &str) {
+        p.try_join(name.as_bytes())
+            .expect_err("join succeeded unexpectedly");
+        p.try_push(name.as_bytes())
+            .expect_err("push succeeded unexpectedly");
+    }
+
+    #[rstest]
+    #[case::empty("", vec![])]
+    #[case("a", vec!["a"])]
+    #[case("a/b", vec!["a", "b"])]
+    #[case("a/b/c", vec!["a","b", "c"])]
+    pub fn components(#[case] p: PathBuf, #[case] exp_components: Vec<&str>) {
+        assert_eq!(
+            exp_components,
+            p.components()
+                .map(|x| x.to_str().unwrap())
+                .collect::<Vec<_>>()
+        );
+    }
+
+    #[rstest]
+    #[case::empty("", "", false)]
+    #[case::path("a", "a", false)]
+    #[case::path2("a/b", "a/b", false)]
+    #[case::double_slash_middle("a//b", "a/b", false)]
+    #[case::dot(".", "", false)]
+    #[case::dot_start("./a/b", "a/b", false)]
+    #[case::dot_middle("a/./b", "a/b", false)]
+    #[case::dot_end("a/b/.", "a/b", false)]
+    #[case::trailing_slash("a/b/", "a/b", false)]
+    #[case::dotdot_canonicalize("a/..", "", true)]
+    #[case::dotdot_canonicalize2("a/../b", "b", true)]
+    #[cfg_attr(unix, case::faux_prefix("\\\\nix-store", "\\\\nix-store", false))]
+    #[cfg_attr(unix, case::faux_letter("C:\\foo.txt", "C:\\foo.txt", false))]
+    pub fn from_host_path(
+        #[case] host_path: std::path::PathBuf,
+        #[case] exp_path: PathBuf,
+        #[case] canonicalize_dotdot: bool,
+    ) {
+        let p = PathBuf::from_host_path(&host_path, canonicalize_dotdot).expect("must succeed");
+
+        assert_eq!(exp_path, p);
+    }
+
+    #[rstest]
+    #[case::absolute("/", false)]
+    #[case::dotdot_root("..", false)]
+    #[case::dotdot_root_canonicalize("..", true)]
+    #[case::dotdot_root_no_canonicalize("a/..", false)]
+    #[case::invalid_name("foo/bar\0", false)]
+    // #[cfg_attr(windows, case::prefix("\\\\nix-store", false))]
+    // #[cfg_attr(windows, case::letter("C:\\foo.txt", false))]
+    pub fn from_host_path_fail(
+        #[case] host_path: std::path::PathBuf,
+        #[case] canonicalize_dotdot: bool,
+    ) {
+        PathBuf::from_host_path(&host_path, canonicalize_dotdot).expect_err("must fail");
+    }
+}
diff --git a/tvix/castore/src/proto/grpc_blobservice_wrapper.rs b/tvix/castore/src/proto/grpc_blobservice_wrapper.rs
new file mode 100644
index 0000000000..41bd0698ec
--- /dev/null
+++ b/tvix/castore/src/proto/grpc_blobservice_wrapper.rs
@@ -0,0 +1,175 @@
+use crate::blobservice::BlobService;
+use core::pin::pin;
+use data_encoding::BASE64;
+use futures::{stream::BoxStream, TryFutureExt};
+use std::{
+    collections::VecDeque,
+    ops::{Deref, DerefMut},
+};
+use tokio_stream::StreamExt;
+use tokio_util::io::ReaderStream;
+use tonic::{async_trait, Request, Response, Status, Streaming};
+use tracing::{instrument, warn};
+
+pub struct GRPCBlobServiceWrapper<T> {
+    blob_service: T,
+}
+
+impl<T> GRPCBlobServiceWrapper<T> {
+    pub fn new(blob_service: T) -> Self {
+        Self { blob_service }
+    }
+}
+
+// This is necessary because bytes::BytesMut comes up with
+// a default 64 bytes capacity that cannot be changed
+// easily if you assume a bytes::BufMut trait implementation
+// Therefore, we override the Default implementation here
+// TODO(raitobezarius?): upstream me properly
+struct BytesMutWithDefaultCapacity<const N: usize> {
+    inner: bytes::BytesMut,
+}
+
+impl<const N: usize> Deref for BytesMutWithDefaultCapacity<N> {
+    type Target = bytes::BytesMut;
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
+impl<const N: usize> DerefMut for BytesMutWithDefaultCapacity<N> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.inner
+    }
+}
+
+impl<const N: usize> Default for BytesMutWithDefaultCapacity<N> {
+    fn default() -> Self {
+        BytesMutWithDefaultCapacity {
+            inner: bytes::BytesMut::with_capacity(N),
+        }
+    }
+}
+
+impl<const N: usize> bytes::Buf for BytesMutWithDefaultCapacity<N> {
+    fn remaining(&self) -> usize {
+        self.inner.remaining()
+    }
+
+    fn chunk(&self) -> &[u8] {
+        self.inner.chunk()
+    }
+
+    fn advance(&mut self, cnt: usize) {
+        self.inner.advance(cnt);
+    }
+}
+
+unsafe impl<const N: usize> bytes::BufMut for BytesMutWithDefaultCapacity<N> {
+    fn remaining_mut(&self) -> usize {
+        self.inner.remaining_mut()
+    }
+
+    unsafe fn advance_mut(&mut self, cnt: usize) {
+        self.inner.advance_mut(cnt);
+    }
+
+    fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice {
+        self.inner.chunk_mut()
+    }
+}
+
+#[async_trait]
+impl<T> super::blob_service_server::BlobService for GRPCBlobServiceWrapper<T>
+where
+    T: Deref<Target = dyn BlobService> + Send + Sync + 'static,
+{
+    // https://github.com/tokio-rs/tokio/issues/2723#issuecomment-1534723933
+    type ReadStream = BoxStream<'static, Result<super::BlobChunk, Status>>;
+
+    #[instrument(skip_all, fields(blob.digest=format!("b3:{}", BASE64.encode(&request.get_ref().digest))))]
+    async fn stat(
+        &self,
+        request: Request<super::StatBlobRequest>,
+    ) -> Result<Response<super::StatBlobResponse>, Status> {
+        let rq = request.into_inner();
+        let req_digest = rq
+            .digest
+            .try_into()
+            .map_err(|_e| Status::invalid_argument("invalid digest length"))?;
+
+        match self.blob_service.chunks(&req_digest).await {
+            Ok(None) => Err(Status::not_found(format!("blob {} not found", &req_digest))),
+            Ok(Some(chunk_metas)) => Ok(Response::new(super::StatBlobResponse {
+                chunks: chunk_metas,
+                ..Default::default()
+            })),
+            Err(e) => {
+                warn!(err=%e, "failed to request chunks");
+                Err(e.into())
+            }
+        }
+    }
+
+    #[instrument(skip_all, fields(blob.digest=format!("b3:{}", BASE64.encode(&request.get_ref().digest))))]
+    async fn read(
+        &self,
+        request: Request<super::ReadBlobRequest>,
+    ) -> Result<Response<Self::ReadStream>, Status> {
+        let rq = request.into_inner();
+
+        let req_digest = rq
+            .digest
+            .try_into()
+            .map_err(|_e| Status::invalid_argument("invalid digest length"))?;
+
+        match self.blob_service.open_read(&req_digest).await {
+            Ok(Some(r)) => {
+                let chunks_stream =
+                    ReaderStream::new(r).map(|chunk| Ok(super::BlobChunk { data: chunk? }));
+                Ok(Response::new(Box::pin(chunks_stream)))
+            }
+            Ok(None) => Err(Status::not_found(format!("blob {} not found", &req_digest))),
+            Err(e) => {
+                warn!(err=%e, "failed to call open_read");
+                Err(e.into())
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn put(
+        &self,
+        request: Request<Streaming<super::BlobChunk>>,
+    ) -> Result<Response<super::PutBlobResponse>, Status> {
+        let req_inner = request.into_inner();
+
+        let data_stream = req_inner.map(|x| {
+            x.map(|x| VecDeque::from(x.data.to_vec()))
+                .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidInput, e))
+        });
+
+        let mut data_reader = tokio_util::io::StreamReader::new(data_stream);
+
+        let mut blob_writer = pin!(self.blob_service.open_write().await);
+
+        tokio::io::copy(&mut data_reader, &mut blob_writer)
+            .await
+            .map_err(|e| {
+                warn!("error copying: {}", e);
+                Status::internal("error copying")
+            })?;
+
+        let digest = blob_writer
+            .close()
+            .map_err(|e| {
+                warn!("error closing stream: {}", e);
+                Status::internal("error closing stream")
+            })
+            .await?;
+
+        Ok(Response::new(super::PutBlobResponse {
+            digest: digest.into(),
+        }))
+    }
+}
diff --git a/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs b/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs
new file mode 100644
index 0000000000..5c1428690c
--- /dev/null
+++ b/tvix/castore/src/proto/grpc_directoryservice_wrapper.rs
@@ -0,0 +1,103 @@
+use crate::directoryservice::ClosureValidator;
+use crate::proto;
+use crate::{directoryservice::DirectoryService, B3Digest};
+use futures::stream::BoxStream;
+use futures::TryStreamExt;
+use std::ops::Deref;
+use tokio_stream::once;
+use tonic::{async_trait, Request, Response, Status, Streaming};
+use tracing::{instrument, warn};
+
+pub struct GRPCDirectoryServiceWrapper<T> {
+    directory_service: T,
+}
+
+impl<T> GRPCDirectoryServiceWrapper<T> {
+    pub fn new(directory_service: T) -> Self {
+        Self { directory_service }
+    }
+}
+
+#[async_trait]
+impl<T> proto::directory_service_server::DirectoryService for GRPCDirectoryServiceWrapper<T>
+where
+    T: Deref<Target = dyn DirectoryService> + Send + Sync + 'static,
+{
+    type GetStream = BoxStream<'static, tonic::Result<proto::Directory, Status>>;
+
+    #[instrument(skip_all)]
+    async fn get<'a>(
+        &'a self,
+        request: Request<proto::GetDirectoryRequest>,
+    ) -> Result<Response<Self::GetStream>, Status> {
+        let req_inner = request.into_inner();
+
+        let by_what = &req_inner
+            .by_what
+            .ok_or_else(|| Status::invalid_argument("invalid by_what"))?;
+
+        match by_what {
+            proto::get_directory_request::ByWhat::Digest(ref digest) => {
+                let digest: B3Digest = digest
+                    .clone()
+                    .try_into()
+                    .map_err(|_e| Status::invalid_argument("invalid digest length"))?;
+
+                Ok(tonic::Response::new({
+                    if !req_inner.recursive {
+                        let directory = self
+                            .directory_service
+                            .get(&digest)
+                            .await
+                            .map_err(|e| {
+                                warn!(err = %e, directory.digest=%digest, "failed to get directory");
+                                tonic::Status::new(tonic::Code::Internal, e.to_string())
+                            })?
+                            .ok_or_else(|| {
+                                Status::not_found(format!("directory {} not found", digest))
+                            })?;
+
+                        Box::pin(once(Ok(directory)))
+                    } else {
+                        // If recursive was requested, traverse via get_recursive.
+                        Box::pin(
+                            self.directory_service.get_recursive(&digest).map_err(|e| {
+                                tonic::Status::new(tonic::Code::Internal, e.to_string())
+                            }),
+                        )
+                    }
+                }))
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn put(
+        &self,
+        request: Request<Streaming<proto::Directory>>,
+    ) -> Result<Response<proto::PutDirectoryResponse>, Status> {
+        let mut req_inner = request.into_inner();
+
+        // We put all Directory messages we receive into ClosureValidator first.
+        let mut validator = ClosureValidator::default();
+        while let Some(directory) = req_inner.message().await? {
+            validator.add(directory)?;
+        }
+
+        // drain, which validates connectivity too.
+        let directories = validator.finalize()?;
+
+        let mut directory_putter = self.directory_service.put_multiple_start();
+        for directory in directories {
+            directory_putter.put(directory).await?;
+        }
+
+        // Properly close the directory putter. Peek at last_directory_digest
+        // and return it, or propagate errors.
+        let last_directory_dgst = directory_putter.close().await?;
+
+        Ok(Response::new(proto::PutDirectoryResponse {
+            root_digest: last_directory_dgst.into(),
+        }))
+    }
+}
diff --git a/tvix/castore/src/proto/mod.rs b/tvix/castore/src/proto/mod.rs
new file mode 100644
index 0000000000..5374e3ae5a
--- /dev/null
+++ b/tvix/castore/src/proto/mod.rs
@@ -0,0 +1,471 @@
+#![allow(non_snake_case)]
+// https://github.com/hyperium/tonic/issues/1056
+use bstr::ByteSlice;
+use std::{collections::HashSet, iter::Peekable, str};
+
+use prost::Message;
+
+mod grpc_blobservice_wrapper;
+mod grpc_directoryservice_wrapper;
+
+pub use grpc_blobservice_wrapper::GRPCBlobServiceWrapper;
+pub use grpc_directoryservice_wrapper::GRPCDirectoryServiceWrapper;
+
+use crate::{B3Digest, B3_LEN};
+
+tonic::include_proto!("tvix.castore.v1");
+
+#[cfg(feature = "tonic-reflection")]
+/// Compiled file descriptors for implementing [gRPC
+/// reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) with e.g.
+/// [`tonic_reflection`](https://docs.rs/tonic-reflection).
+pub const FILE_DESCRIPTOR_SET: &[u8] = tonic::include_file_descriptor_set!("tvix.castore.v1");
+
+#[cfg(test)]
+mod tests;
+
+/// Errors that can occur during the validation of [Directory] messages.
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
+pub enum ValidateDirectoryError {
+    /// Elements are not in sorted order
+    #[error("{:?} is not sorted", .0.as_bstr())]
+    WrongSorting(Vec<u8>),
+    /// Multiple elements with the same name encountered
+    #[error("{:?} is a duplicate name", .0.as_bstr())]
+    DuplicateName(Vec<u8>),
+    /// Invalid node
+    #[error("invalid node with name {:?}: {:?}", .0.as_bstr(), .1.to_string())]
+    InvalidNode(Vec<u8>, ValidateNodeError),
+    #[error("Total size exceeds u32::MAX")]
+    SizeOverflow,
+}
+
+/// Errors that occur during Node validation
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
+pub enum ValidateNodeError {
+    #[error("No node set")]
+    NoNodeSet,
+    /// Invalid digest length encountered
+    #[error("Invalid Digest length: {0}")]
+    InvalidDigestLen(usize),
+    /// Invalid name encountered
+    #[error("Invalid name: {}", .0.as_bstr())]
+    InvalidName(Vec<u8>),
+    /// Invalid symlink target
+    #[error("Invalid symlink target: {}", .0.as_bstr())]
+    InvalidSymlinkTarget(Vec<u8>),
+}
+
+/// Errors that occur during StatBlobResponse validation
+#[derive(Debug, PartialEq, Eq, thiserror::Error)]
+pub enum ValidateStatBlobResponseError {
+    /// Invalid digest length encountered
+    #[error("Invalid digest length {0} for chunk #{1}")]
+    InvalidDigestLen(usize, usize),
+}
+
+/// Checks a Node name for validity as an intermediate node.
+/// We disallow slashes, null bytes, '.', '..' and the empty string.
+pub(crate) fn validate_node_name(name: &[u8]) -> Result<(), ValidateNodeError> {
+    if name.is_empty()
+        || name == b".."
+        || name == b"."
+        || name.contains(&0x00)
+        || name.contains(&b'/')
+    {
+        Err(ValidateNodeError::InvalidName(name.to_owned()))
+    } else {
+        Ok(())
+    }
+}
+
+/// NamedNode is implemented for [FileNode], [DirectoryNode] and [SymlinkNode]
+/// and [node::Node], so we can ask all of them for the name easily.
+pub trait NamedNode {
+    fn get_name(&self) -> &[u8];
+}
+
+impl NamedNode for &FileNode {
+    fn get_name(&self) -> &[u8] {
+        &self.name
+    }
+}
+
+impl NamedNode for &DirectoryNode {
+    fn get_name(&self) -> &[u8] {
+        &self.name
+    }
+}
+
+impl NamedNode for &SymlinkNode {
+    fn get_name(&self) -> &[u8] {
+        &self.name
+    }
+}
+
+impl NamedNode for node::Node {
+    fn get_name(&self) -> &[u8] {
+        match self {
+            node::Node::File(node_file) => &node_file.name,
+            node::Node::Directory(node_directory) => &node_directory.name,
+            node::Node::Symlink(node_symlink) => &node_symlink.name,
+        }
+    }
+}
+
+impl Node {
+    /// Ensures the node has a valid enum kind (is Some), and passes its
+    // per-enum validation.
+    pub fn validate(&self) -> Result<(), ValidateNodeError> {
+        if let Some(node) = self.node.as_ref() {
+            node.validate()
+        } else {
+            Err(ValidateNodeError::NoNodeSet)
+        }
+    }
+}
+
+impl node::Node {
+    /// Returns the node with a new name.
+    pub fn rename(self, name: bytes::Bytes) -> Self {
+        match self {
+            node::Node::Directory(n) => node::Node::Directory(DirectoryNode { name, ..n }),
+            node::Node::File(n) => node::Node::File(FileNode { name, ..n }),
+            node::Node::Symlink(n) => node::Node::Symlink(SymlinkNode { name, ..n }),
+        }
+    }
+
+    /// Ensures the node has a valid name, and checks the type-specific fields too.
+    pub fn validate(&self) -> Result<(), ValidateNodeError> {
+        match self {
+            // for a directory root node, ensure the digest has the appropriate size.
+            node::Node::Directory(directory_node) => {
+                if directory_node.digest.len() != B3_LEN {
+                    Err(ValidateNodeError::InvalidDigestLen(
+                        directory_node.digest.len(),
+                    ))?;
+                }
+                validate_node_name(&directory_node.name)
+            }
+            // for a file root node, ensure the digest has the appropriate size.
+            node::Node::File(file_node) => {
+                if file_node.digest.len() != B3_LEN {
+                    Err(ValidateNodeError::InvalidDigestLen(file_node.digest.len()))?;
+                }
+                validate_node_name(&file_node.name)
+            }
+            // ensure the symlink target is not empty and doesn't contain null bytes.
+            node::Node::Symlink(symlink_node) => {
+                if symlink_node.target.is_empty() || symlink_node.target.contains(&b'\0') {
+                    Err(ValidateNodeError::InvalidSymlinkTarget(
+                        symlink_node.target.to_vec(),
+                    ))?;
+                }
+                validate_node_name(&symlink_node.name)
+            }
+        }
+    }
+}
+
+impl PartialOrd for node::Node {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for node::Node {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.get_name().cmp(other.get_name())
+    }
+}
+
+impl PartialOrd for FileNode {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for FileNode {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.get_name().cmp(other.get_name())
+    }
+}
+
+impl PartialOrd for SymlinkNode {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for SymlinkNode {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.get_name().cmp(other.get_name())
+    }
+}
+
+impl PartialOrd for DirectoryNode {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for DirectoryNode {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.get_name().cmp(other.get_name())
+    }
+}
+
+/// Accepts a name, and a mutable reference to the previous name.
+/// If the passed name is larger than the previous one, the reference is updated.
+/// If it's not, an error is returned.
+fn update_if_lt_prev<'n>(
+    prev_name: &mut &'n [u8],
+    name: &'n [u8],
+) -> Result<(), ValidateDirectoryError> {
+    if *name < **prev_name {
+        return Err(ValidateDirectoryError::WrongSorting(name.to_vec()));
+    }
+    *prev_name = name;
+    Ok(())
+}
+
+/// Inserts the given name into a HashSet if it's not already in there.
+/// If it is, an error is returned.
+fn insert_once<'n>(
+    seen_names: &mut HashSet<&'n [u8]>,
+    name: &'n [u8],
+) -> Result<(), ValidateDirectoryError> {
+    if seen_names.get(name).is_some() {
+        return Err(ValidateDirectoryError::DuplicateName(name.to_vec()));
+    }
+    seen_names.insert(name);
+    Ok(())
+}
+
+fn checked_sum(iter: impl IntoIterator<Item = u64>) -> Option<u64> {
+    iter.into_iter().try_fold(0u64, |acc, i| acc.checked_add(i))
+}
+
+impl Directory {
+    /// The size of a directory is the number of all regular and symlink elements,
+    /// the number of directory elements, and their size fields.
+    pub fn size(&self) -> u64 {
+        if cfg!(debug_assertions) {
+            self.size_checked()
+                .expect("Directory::size exceeds u64::MAX")
+        } else {
+            self.size_checked().unwrap_or(u64::MAX)
+        }
+    }
+
+    fn size_checked(&self) -> Option<u64> {
+        checked_sum([
+            self.files.len().try_into().ok()?,
+            self.symlinks.len().try_into().ok()?,
+            self.directories.len().try_into().ok()?,
+            checked_sum(self.directories.iter().map(|e| e.size))?,
+        ])
+    }
+
+    /// Calculates the digest of a Directory, which is the blake3 hash of a
+    /// Directory protobuf message, serialized in protobuf canonical form.
+    pub fn digest(&self) -> B3Digest {
+        let mut hasher = blake3::Hasher::new();
+
+        hasher
+            .update(&self.encode_to_vec())
+            .finalize()
+            .as_bytes()
+            .into()
+    }
+
+    /// validate checks the directory for invalid data, such as:
+    /// - violations of name restrictions
+    /// - invalid digest lengths
+    /// - not properly sorted lists
+    /// - duplicate names in the three lists
+    pub fn validate(&self) -> Result<(), ValidateDirectoryError> {
+        let mut seen_names: HashSet<&[u8]> = HashSet::new();
+
+        let mut last_directory_name: &[u8] = b"";
+        let mut last_file_name: &[u8] = b"";
+        let mut last_symlink_name: &[u8] = b"";
+
+        // check directories
+        for directory_node in &self.directories {
+            node::Node::Directory(directory_node.clone())
+                .validate()
+                .map_err(|e| {
+                    ValidateDirectoryError::InvalidNode(directory_node.name.to_vec(), e)
+                })?;
+
+            update_if_lt_prev(&mut last_directory_name, &directory_node.name)?;
+            insert_once(&mut seen_names, &directory_node.name)?;
+        }
+
+        // check files
+        for file_node in &self.files {
+            node::Node::File(file_node.clone())
+                .validate()
+                .map_err(|e| ValidateDirectoryError::InvalidNode(file_node.name.to_vec(), e))?;
+
+            update_if_lt_prev(&mut last_file_name, &file_node.name)?;
+            insert_once(&mut seen_names, &file_node.name)?;
+        }
+
+        // check symlinks
+        for symlink_node in &self.symlinks {
+            node::Node::Symlink(symlink_node.clone())
+                .validate()
+                .map_err(|e| ValidateDirectoryError::InvalidNode(symlink_node.name.to_vec(), e))?;
+
+            update_if_lt_prev(&mut last_symlink_name, &symlink_node.name)?;
+            insert_once(&mut seen_names, &symlink_node.name)?;
+        }
+
+        self.size_checked()
+            .ok_or(ValidateDirectoryError::SizeOverflow)?;
+
+        Ok(())
+    }
+
+    /// Allows iterating over all three nodes ([DirectoryNode], [FileNode],
+    /// [SymlinkNode]) in an ordered fashion, as long as the individual lists
+    /// are sorted (which can be checked by the [Directory::validate]).
+    pub fn nodes(&self) -> DirectoryNodesIterator {
+        return DirectoryNodesIterator {
+            i_directories: self.directories.iter().peekable(),
+            i_files: self.files.iter().peekable(),
+            i_symlinks: self.symlinks.iter().peekable(),
+        };
+    }
+
+    /// Adds the specified [node::Node] to the [Directory], preserving sorted entries.
+    /// This assumes the [Directory] to be sorted prior to adding the node.
+    ///
+    /// Inserting an element that already exists with the same name in the directory is not
+    /// supported.
+    pub fn add(&mut self, node: node::Node) {
+        debug_assert!(
+            !self.files.iter().any(|x| x.get_name() == node.get_name()),
+            "name already exists in files"
+        );
+        debug_assert!(
+            !self
+                .directories
+                .iter()
+                .any(|x| x.get_name() == node.get_name()),
+            "name already exists in directories"
+        );
+        debug_assert!(
+            !self
+                .symlinks
+                .iter()
+                .any(|x| x.get_name() == node.get_name()),
+            "name already exists in symlinks"
+        );
+
+        match node {
+            node::Node::File(node) => {
+                let pos = self
+                    .files
+                    .binary_search(&node)
+                    .expect_err("Tvix bug: dir entry with name already exists");
+                self.files.insert(pos, node);
+            }
+            node::Node::Directory(node) => {
+                let pos = self
+                    .directories
+                    .binary_search(&node)
+                    .expect_err("Tvix bug: dir entry with name already exists");
+                self.directories.insert(pos, node);
+            }
+            node::Node::Symlink(node) => {
+                let pos = self
+                    .symlinks
+                    .binary_search(&node)
+                    .expect_err("Tvix bug: dir entry with name already exists");
+                self.symlinks.insert(pos, node);
+            }
+        }
+    }
+}
+
+impl StatBlobResponse {
+    /// Validates a StatBlobResponse. All chunks must have valid blake3 digests.
+    /// It is allowed to send an empty list, if no more granular chunking is
+    /// available.
+    pub fn validate(&self) -> Result<(), ValidateStatBlobResponseError> {
+        for (i, chunk) in self.chunks.iter().enumerate() {
+            if chunk.digest.len() != blake3::KEY_LEN {
+                return Err(ValidateStatBlobResponseError::InvalidDigestLen(
+                    chunk.digest.len(),
+                    i,
+                ));
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Struct to hold the state of an iterator over all nodes of a Directory.
+///
+/// Internally, this keeps peekable Iterators over all three lists of a
+/// Directory message.
+pub struct DirectoryNodesIterator<'a> {
+    // directory: &Directory,
+    i_directories: Peekable<std::slice::Iter<'a, DirectoryNode>>,
+    i_files: Peekable<std::slice::Iter<'a, FileNode>>,
+    i_symlinks: Peekable<std::slice::Iter<'a, SymlinkNode>>,
+}
+
+/// looks at two elements implementing NamedNode, and returns true if "left
+/// is smaller / comes first".
+///
+/// Some(_) is preferred over None.
+fn left_name_lt_right<A: NamedNode, B: NamedNode>(left: Option<&A>, right: Option<&B>) -> bool {
+    match left {
+        // if left is None, right always wins
+        None => false,
+        Some(left_inner) => {
+            // left is Some.
+            match right {
+                // left is Some, right is None - left wins.
+                None => true,
+                Some(right_inner) => {
+                    // both are Some - compare the name.
+                    return left_inner.get_name() < right_inner.get_name();
+                }
+            }
+        }
+    }
+}
+
+impl Iterator for DirectoryNodesIterator<'_> {
+    type Item = node::Node;
+
+    // next returns the next node in the Directory.
+    // we peek at all three internal iterators, and pick the one with the
+    // smallest name, to ensure lexicographical ordering.
+    // The individual lists are already known to be sorted.
+    fn next(&mut self) -> Option<Self::Item> {
+        if left_name_lt_right(self.i_directories.peek(), self.i_files.peek()) {
+            // i_directories is still in the game, compare with symlinks
+            if left_name_lt_right(self.i_directories.peek(), self.i_symlinks.peek()) {
+                self.i_directories
+                    .next()
+                    .cloned()
+                    .map(node::Node::Directory)
+            } else {
+                self.i_symlinks.next().cloned().map(node::Node::Symlink)
+            }
+        } else {
+            // i_files is still in the game, compare with symlinks
+            if left_name_lt_right(self.i_files.peek(), self.i_symlinks.peek()) {
+                self.i_files.next().cloned().map(node::Node::File)
+            } else {
+                self.i_symlinks.next().cloned().map(node::Node::Symlink)
+            }
+        }
+    }
+}
diff --git a/tvix/castore/src/proto/tests/directory.rs b/tvix/castore/src/proto/tests/directory.rs
new file mode 100644
index 0000000000..81b73a048d
--- /dev/null
+++ b/tvix/castore/src/proto/tests/directory.rs
@@ -0,0 +1,452 @@
+use crate::proto::{
+    node, Directory, DirectoryNode, FileNode, SymlinkNode, ValidateDirectoryError,
+    ValidateNodeError,
+};
+
+use hex_literal::hex;
+
+const DUMMY_DIGEST: [u8; 32] = [0; 32];
+
+#[test]
+fn size() {
+    {
+        let d = Directory::default();
+        assert_eq!(d.size(), 0);
+    }
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: "foo".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 0,
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size(), 1);
+    }
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: "foo".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 4,
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size(), 5);
+    }
+    {
+        let d = Directory {
+            files: vec![FileNode {
+                name: "foo".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 42,
+                executable: false,
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size(), 1);
+    }
+    {
+        let d = Directory {
+            symlinks: vec![SymlinkNode {
+                name: "foo".into(),
+                target: "bar".into(),
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size(), 1);
+    }
+}
+
+#[test]
+#[cfg_attr(not(debug_assertions), ignore)]
+#[should_panic = "Directory::size exceeds u64::MAX"]
+fn size_unchecked_panic() {
+    let d = Directory {
+        directories: vec![DirectoryNode {
+            name: "foo".into(),
+            digest: DUMMY_DIGEST.to_vec().into(),
+            size: u64::MAX,
+        }],
+        ..Default::default()
+    };
+
+    d.size();
+}
+
+#[test]
+#[cfg_attr(debug_assertions, ignore)]
+fn size_unchecked_saturate() {
+    let d = Directory {
+        directories: vec![DirectoryNode {
+            name: "foo".into(),
+            digest: DUMMY_DIGEST.to_vec().into(),
+            size: u64::MAX,
+        }],
+        ..Default::default()
+    };
+
+    assert_eq!(d.size(), u64::MAX);
+}
+
+#[test]
+fn size_checked() {
+    // We don't test the overflow cases that rely purely on immediate
+    // child count, since that would take an absurd amount of memory.
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: "foo".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: u64::MAX - 1,
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size_checked(), Some(u64::MAX));
+    }
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: "foo".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: u64::MAX,
+            }],
+            ..Default::default()
+        };
+        assert_eq!(d.size_checked(), None);
+    }
+    {
+        let d = Directory {
+            directories: vec![
+                DirectoryNode {
+                    name: "foo".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: u64::MAX / 2,
+                },
+                DirectoryNode {
+                    name: "foo".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: u64::MAX / 2,
+                },
+            ],
+            ..Default::default()
+        };
+        assert_eq!(d.size_checked(), None);
+    }
+}
+
+#[test]
+fn digest() {
+    let d = Directory::default();
+
+    assert_eq!(
+        d.digest(),
+        (&hex!("af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262")).into()
+    )
+}
+
+#[test]
+fn validate_empty() {
+    let d = Directory::default();
+    assert_eq!(d.validate(), Ok(()));
+}
+
+#[test]
+fn validate_invalid_names() {
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: "".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 42,
+            }],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => {
+                assert_eq!(n, b"")
+            }
+            _ => panic!("unexpected error"),
+        };
+    }
+
+    {
+        let d = Directory {
+            directories: vec![DirectoryNode {
+                name: ".".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 42,
+            }],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => {
+                assert_eq!(n, b".")
+            }
+            _ => panic!("unexpected error"),
+        };
+    }
+
+    {
+        let d = Directory {
+            files: vec![FileNode {
+                name: "..".into(),
+                digest: DUMMY_DIGEST.to_vec().into(),
+                size: 42,
+                executable: false,
+            }],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => {
+                assert_eq!(n, b"..")
+            }
+            _ => panic!("unexpected error"),
+        };
+    }
+
+    {
+        let d = Directory {
+            symlinks: vec![SymlinkNode {
+                name: "\x00".into(),
+                target: "foo".into(),
+            }],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => {
+                assert_eq!(n, b"\x00")
+            }
+            _ => panic!("unexpected error"),
+        };
+    }
+
+    {
+        let d = Directory {
+            symlinks: vec![SymlinkNode {
+                name: "foo/bar".into(),
+                target: "foo".into(),
+            }],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::InvalidNode(n, ValidateNodeError::InvalidName(_)) => {
+                assert_eq!(n, b"foo/bar")
+            }
+            _ => panic!("unexpected error"),
+        };
+    }
+}
+
+#[test]
+fn validate_invalid_digest() {
+    let d = Directory {
+        directories: vec![DirectoryNode {
+            name: "foo".into(),
+            digest: vec![0x00, 0x42].into(), // invalid length
+            size: 42,
+        }],
+        ..Default::default()
+    };
+    match d.validate().expect_err("must fail") {
+        ValidateDirectoryError::InvalidNode(_, ValidateNodeError::InvalidDigestLen(n)) => {
+            assert_eq!(n, 2)
+        }
+        _ => panic!("unexpected error"),
+    }
+}
+
+#[test]
+fn validate_sorting() {
+    // "b" comes before "a", bad.
+    {
+        let d = Directory {
+            directories: vec![
+                DirectoryNode {
+                    name: "b".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+                DirectoryNode {
+                    name: "a".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+            ],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::WrongSorting(s) => {
+                assert_eq!(s, b"a");
+            }
+            _ => panic!("unexpected error"),
+        }
+    }
+
+    // "a" exists twice, bad.
+    {
+        let d = Directory {
+            directories: vec![
+                DirectoryNode {
+                    name: "a".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+                DirectoryNode {
+                    name: "a".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+            ],
+            ..Default::default()
+        };
+        match d.validate().expect_err("must fail") {
+            ValidateDirectoryError::DuplicateName(s) => {
+                assert_eq!(s, b"a");
+            }
+            _ => panic!("unexpected error"),
+        }
+    }
+
+    // "a" comes before "b", all good.
+    {
+        let d = Directory {
+            directories: vec![
+                DirectoryNode {
+                    name: "a".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+                DirectoryNode {
+                    name: "b".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+            ],
+            ..Default::default()
+        };
+
+        d.validate().expect("validate shouldn't error");
+    }
+
+    // [b, c] and [a] are both properly sorted.
+    {
+        let d = Directory {
+            directories: vec![
+                DirectoryNode {
+                    name: "b".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+                DirectoryNode {
+                    name: "c".into(),
+                    digest: DUMMY_DIGEST.to_vec().into(),
+                    size: 42,
+                },
+            ],
+            symlinks: vec![SymlinkNode {
+                name: "a".into(),
+                target: "foo".into(),
+            }],
+            ..Default::default()
+        };
+
+        d.validate().expect("validate shouldn't error");
+    }
+}
+
+#[test]
+fn validate_overflow() {
+    let d = Directory {
+        directories: vec![DirectoryNode {
+            name: "foo".into(),
+            digest: DUMMY_DIGEST.to_vec().into(),
+            size: u64::MAX,
+        }],
+        ..Default::default()
+    };
+
+    match d.validate().expect_err("must fail") {
+        ValidateDirectoryError::SizeOverflow => {}
+        _ => panic!("unexpected error"),
+    }
+}
+
+#[test]
+fn add_nodes_to_directory() {
+    let mut d = Directory {
+        ..Default::default()
+    };
+
+    d.add(node::Node::Directory(DirectoryNode {
+        name: "b".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+    }));
+    d.add(node::Node::Directory(DirectoryNode {
+        name: "a".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+    }));
+    d.add(node::Node::Directory(DirectoryNode {
+        name: "z".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+    }));
+
+    d.add(node::Node::File(FileNode {
+        name: "f".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+        executable: true,
+    }));
+    d.add(node::Node::File(FileNode {
+        name: "c".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+        executable: true,
+    }));
+    d.add(node::Node::File(FileNode {
+        name: "g".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+        executable: true,
+    }));
+
+    d.add(node::Node::Symlink(SymlinkNode {
+        name: "t".into(),
+        target: "a".into(),
+    }));
+    d.add(node::Node::Symlink(SymlinkNode {
+        name: "o".into(),
+        target: "a".into(),
+    }));
+    d.add(node::Node::Symlink(SymlinkNode {
+        name: "e".into(),
+        target: "a".into(),
+    }));
+
+    d.validate().expect("directory should be valid");
+}
+
+#[test]
+#[cfg_attr(not(debug_assertions), ignore)]
+#[should_panic = "name already exists in directories"]
+fn add_duplicate_node_to_directory_panics() {
+    let mut d = Directory {
+        ..Default::default()
+    };
+
+    d.add(node::Node::Directory(DirectoryNode {
+        name: "a".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+    }));
+    d.add(node::Node::File(FileNode {
+        name: "a".into(),
+        digest: DUMMY_DIGEST.to_vec().into(),
+        size: 1,
+        executable: true,
+    }));
+}
diff --git a/tvix/castore/src/proto/tests/directory_nodes_iterator.rs b/tvix/castore/src/proto/tests/directory_nodes_iterator.rs
new file mode 100644
index 0000000000..68f147a332
--- /dev/null
+++ b/tvix/castore/src/proto/tests/directory_nodes_iterator.rs
@@ -0,0 +1,78 @@
+use crate::proto::Directory;
+use crate::proto::DirectoryNode;
+use crate::proto::FileNode;
+use crate::proto::NamedNode;
+use crate::proto::SymlinkNode;
+
+#[test]
+fn iterator() {
+    let d = Directory {
+        directories: vec![
+            DirectoryNode {
+                name: "c".into(),
+                ..DirectoryNode::default()
+            },
+            DirectoryNode {
+                name: "d".into(),
+                ..DirectoryNode::default()
+            },
+            DirectoryNode {
+                name: "h".into(),
+                ..DirectoryNode::default()
+            },
+            DirectoryNode {
+                name: "l".into(),
+                ..DirectoryNode::default()
+            },
+        ],
+        files: vec![
+            FileNode {
+                name: "b".into(),
+                ..FileNode::default()
+            },
+            FileNode {
+                name: "e".into(),
+                ..FileNode::default()
+            },
+            FileNode {
+                name: "g".into(),
+                ..FileNode::default()
+            },
+            FileNode {
+                name: "j".into(),
+                ..FileNode::default()
+            },
+        ],
+        symlinks: vec![
+            SymlinkNode {
+                name: "a".into(),
+                ..SymlinkNode::default()
+            },
+            SymlinkNode {
+                name: "f".into(),
+                ..SymlinkNode::default()
+            },
+            SymlinkNode {
+                name: "i".into(),
+                ..SymlinkNode::default()
+            },
+            SymlinkNode {
+                name: "k".into(),
+                ..SymlinkNode::default()
+            },
+        ],
+    };
+
+    // We keep this strings here and convert to string to make the comparison
+    // less messy.
+    let mut node_names: Vec<String> = vec![];
+
+    for node in d.nodes() {
+        node_names.push(String::from_utf8(node.get_name().to_vec()).unwrap());
+    }
+
+    assert_eq!(
+        vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l"],
+        node_names
+    );
+}
diff --git a/tvix/castore/src/proto/tests/mod.rs b/tvix/castore/src/proto/tests/mod.rs
new file mode 100644
index 0000000000..8d903bacb6
--- /dev/null
+++ b/tvix/castore/src/proto/tests/mod.rs
@@ -0,0 +1,2 @@
+mod directory;
+mod directory_nodes_iterator;
diff --git a/tvix/castore/src/tests/import.rs b/tvix/castore/src/tests/import.rs
new file mode 100644
index 0000000000..8b3bd5ce0f
--- /dev/null
+++ b/tvix/castore/src/tests/import.rs
@@ -0,0 +1,129 @@
+use crate::blobservice::{self, BlobService};
+use crate::directoryservice;
+use crate::fixtures::*;
+use crate::import::fs::ingest_path;
+use crate::proto;
+
+use std::sync::Arc;
+use tempfile::TempDir;
+
+#[cfg(target_family = "unix")]
+use std::os::unix::ffi::OsStrExt;
+
+#[cfg(target_family = "unix")]
+#[tokio::test]
+async fn symlink() {
+    let blob_service = blobservice::from_addr("memory://").await.unwrap();
+    let directory_service = directoryservice::from_addr("memory://").await.unwrap();
+
+    let tmpdir = TempDir::new().unwrap();
+
+    std::fs::create_dir_all(&tmpdir).unwrap();
+    std::os::unix::fs::symlink(
+        "/nix/store/somewhereelse",
+        tmpdir.path().join("doesntmatter"),
+    )
+    .unwrap();
+
+    let root_node = ingest_path(
+        Arc::from(blob_service),
+        directory_service,
+        tmpdir.path().join("doesntmatter"),
+    )
+    .await
+    .expect("must succeed");
+
+    assert_eq!(
+        proto::node::Node::Symlink(proto::SymlinkNode {
+            name: "doesntmatter".into(),
+            target: "/nix/store/somewhereelse".into(),
+        }),
+        root_node,
+    )
+}
+
+#[tokio::test]
+async fn single_file() {
+    let blob_service =
+        Arc::from(blobservice::from_addr("memory://").await.unwrap()) as Arc<dyn BlobService>;
+    let directory_service = directoryservice::from_addr("memory://").await.unwrap();
+
+    let tmpdir = TempDir::new().unwrap();
+
+    std::fs::write(tmpdir.path().join("root"), HELLOWORLD_BLOB_CONTENTS).unwrap();
+
+    let root_node = ingest_path(
+        blob_service.clone(),
+        directory_service,
+        tmpdir.path().join("root"),
+    )
+    .await
+    .expect("must succeed");
+
+    assert_eq!(
+        proto::node::Node::File(proto::FileNode {
+            name: "root".into(),
+            digest: HELLOWORLD_BLOB_DIGEST.clone().into(),
+            size: HELLOWORLD_BLOB_CONTENTS.len() as u64,
+            executable: false,
+        }),
+        root_node,
+    );
+
+    // ensure the blob has been uploaded
+    assert!(blob_service.has(&HELLOWORLD_BLOB_DIGEST).await.unwrap());
+}
+
+#[cfg(target_family = "unix")]
+#[tokio::test]
+async fn complicated() {
+    let blob_service =
+        Arc::from(blobservice::from_addr("memory://").await.unwrap()) as Arc<dyn BlobService>;
+    let directory_service = directoryservice::from_addr("memory://").await.unwrap();
+
+    let tmpdir = TempDir::new().unwrap();
+
+    // File ``.keep`
+    std::fs::write(tmpdir.path().join(".keep"), vec![]).unwrap();
+    // Symlink `aa`
+    std::os::unix::fs::symlink("/nix/store/somewhereelse", tmpdir.path().join("aa")).unwrap();
+    // Directory `keep`
+    std::fs::create_dir(tmpdir.path().join("keep")).unwrap();
+    // File ``keep/.keep`
+    std::fs::write(tmpdir.path().join("keep").join(".keep"), vec![]).unwrap();
+
+    let root_node = ingest_path(blob_service.clone(), &directory_service, tmpdir.path())
+        .await
+        .expect("must succeed");
+
+    // ensure root_node matched expectations
+    assert_eq!(
+        proto::node::Node::Directory(proto::DirectoryNode {
+            name: tmpdir
+                .path()
+                .file_name()
+                .unwrap()
+                .as_bytes()
+                .to_owned()
+                .into(),
+            digest: DIRECTORY_COMPLICATED.digest().into(),
+            size: DIRECTORY_COMPLICATED.size(),
+        }),
+        root_node,
+    );
+
+    // ensure DIRECTORY_WITH_KEEP and DIRECTORY_COMPLICATED have been uploaded
+    assert!(directory_service
+        .get(&DIRECTORY_WITH_KEEP.digest())
+        .await
+        .unwrap()
+        .is_some());
+    assert!(directory_service
+        .get(&DIRECTORY_COMPLICATED.digest())
+        .await
+        .unwrap()
+        .is_some());
+
+    // ensure EMPTY_BLOB_CONTENTS has been uploaded
+    assert!(blob_service.has(&EMPTY_BLOB_DIGEST).await.unwrap());
+}
diff --git a/tvix/castore/src/tests/mod.rs b/tvix/castore/src/tests/mod.rs
new file mode 100644
index 0000000000..d016f3e0aa
--- /dev/null
+++ b/tvix/castore/src/tests/mod.rs
@@ -0,0 +1 @@
+mod import;
diff --git a/tvix/castore/src/tonic.rs b/tvix/castore/src/tonic.rs
new file mode 100644
index 0000000000..4b65d6b028
--- /dev/null
+++ b/tvix/castore/src/tonic.rs
@@ -0,0 +1,122 @@
+use tokio::net::UnixStream;
+use tonic::transport::{Channel, Endpoint};
+
+fn url_wants_wait_connect(url: &url::Url) -> bool {
+    url.query_pairs()
+        .filter(|(k, v)| k == "wait-connect" && v == "1")
+        .count()
+        > 0
+}
+
+/// Turn a [url::Url] to a [Channel] if it can be parsed successfully.
+/// It supports the following schemes (and URLs):
+///  - `grpc+http://[::1]:8000`, connecting over unencrypted HTTP/2 (h2c)
+///  - `grpc+https://[::1]:8000`, connecting over encrypted HTTP/2
+///  - `grpc+unix:/path/to/socket`, connecting to a unix domain socket
+///
+/// All URLs support adding `wait-connect=1` as a URL parameter, in which case
+/// the connection is established lazily.
+pub async fn channel_from_url(url: &url::Url) -> Result<Channel, self::Error> {
+    match url.scheme() {
+        "grpc+unix" => {
+            if url.host_str().is_some() {
+                return Err(Error::HostSetForUnixSocket());
+            }
+
+            let connector = tower::service_fn({
+                let url = url.clone();
+                move |_: tonic::transport::Uri| UnixStream::connect(url.path().to_string().clone())
+            });
+
+            // the URL doesn't matter
+            let endpoint = Endpoint::from_static("http://[::]:50051");
+            if url_wants_wait_connect(url) {
+                Ok(endpoint.connect_with_connector(connector).await?)
+            } else {
+                Ok(endpoint.connect_with_connector_lazy(connector))
+            }
+        }
+        _ => {
+            // ensure path is empty, not supported with gRPC.
+            if !url.path().is_empty() {
+                return Err(Error::PathMayNotBeSet());
+            }
+
+            // Stringify the URL and remove the grpc+ prefix.
+            // We can't use `url.set_scheme(rest)`, as it disallows
+            // setting something http(s) that previously wasn't.
+            let unprefixed_url_str = match url.to_string().strip_prefix("grpc+") {
+                None => return Err(Error::MissingGRPCPrefix()),
+                Some(url_str) => url_str.to_owned(),
+            };
+
+            // Use the regular tonic transport::Endpoint logic, but unprefixed_url_str,
+            // as tonic doesn't know about grpc+http[s].
+            let endpoint = Endpoint::try_from(unprefixed_url_str)?;
+            if url_wants_wait_connect(url) {
+                Ok(endpoint.connect().await?)
+            } else {
+                Ok(endpoint.connect_lazy())
+            }
+        }
+    }
+}
+
+/// Errors occuring when trying to connect to a backend
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("grpc+ prefix is missing from URL")]
+    MissingGRPCPrefix(),
+
+    #[error("host may not be set for unix domain sockets")]
+    HostSetForUnixSocket(),
+
+    #[error("path may not be set")]
+    PathMayNotBeSet(),
+
+    #[error("transport error: {0}")]
+    TransportError(tonic::transport::Error),
+}
+
+impl From<tonic::transport::Error> for Error {
+    fn from(value: tonic::transport::Error) -> Self {
+        Self::TransportError(value)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::channel_from_url;
+    use rstest::rstest;
+    use url::Url;
+
+    #[rstest]
+    /// Correct scheme to connect to a unix socket.
+    #[case::valid_unix_socket("grpc+unix:///path/to/somewhere", true)]
+    /// Connecting with wait-connect set to 0 succeeds, as that's the default.
+    #[case::valid_unix_socket_wait_connect_0("grpc+unix:///path/to/somewhere?wait-connect=0", true)]
+    /// Connecting with wait-connect set to 1 fails, as the path doesn't exist.
+    #[case::valid_unix_socket_wait_connect_1(
+        "grpc+unix:///path/to/somewhere?wait-connect=1",
+        false
+    )]
+    /// Correct scheme for unix socket, but setting a host too, which is invalid.
+    #[case::invalid_unix_socket_and_host("grpc+unix://host.example/path/to/somewhere", false)]
+    /// Correct scheme to connect to localhost, with port 12345
+    #[case::valid_ipv6_localhost_port_12345("grpc+http://[::1]:12345", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::valid_http_host_without_port("grpc+http://localhost", true)]
+    /// Correct scheme to connect to localhost over http, without specifying a port.
+    #[case::valid_https_host_without_port("grpc+https://localhost", true)]
+    /// Correct scheme to connect to localhost over http, but with additional path, which is invalid.
+    #[case::invalid_host_and_path("grpc+http://localhost/some-path", false)]
+    /// Connecting with wait-connect set to 0 succeeds, as that's the default.
+    #[case::valid_host_wait_connect_0("grpc+http://localhost?wait-connect=0", true)]
+    /// Connecting with wait-connect set to 1 fails, as the host doesn't exist.
+    #[case::valid_host_wait_connect_1_fails("grpc+http://nonexist.invalid?wait-connect=1", false)]
+    #[tokio::test]
+    async fn test_from_addr_tokio(#[case] uri_str: &str, #[case] is_ok: bool) {
+        let url = Url::parse(uri_str).expect("must parse");
+        assert_eq!(channel_from_url(&url).await.is_ok(), is_ok)
+    }
+}