7 files changed, 745 insertions, 2 deletions
diff --git a/tvix/build/src/buildservice/from_addr.rs b/tvix/build/src/buildservice/from_addr.rs
index cc5403edefff..a7afba1138af 100644
--- a/tvix/build/src/buildservice/from_addr.rs
+++ b/tvix/build/src/buildservice/from_addr.rs
@@ -2,18 +2,22 @@ use super::{grpc::GRPCBuildService, BuildService, DummyBuildService};
 use tvix_castore::{blobservice::BlobService, directoryservice::DirectoryService};
 use url::Url;
 
+#[cfg(target_os = "linux")]
+use super::oci::OCIBuildService;
+
 /// Constructs a new instance of a [BuildService] from an URI.
 ///
 /// The following schemes are supported by the following services:
 /// - `dummy://` ([DummyBuildService])
+/// - `oci://` ([OCIBuildService])
 /// - `grpc+*://` ([GRPCBuildService])
 ///
 /// As some of these [BuildService] need to talk to a [BlobService] and
 /// [DirectoryService], these also need to be passed in.
 pub async fn from_addr<BS, DS>(
     uri: &str,
-    _blob_service: BS,
-    _directory_service: DS,
+    blob_service: BS,
+    directory_service: DS,
 ) -> std::io::Result<Box<dyn BuildService>>
 where
     BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static,
@@ -25,6 +29,21 @@ where
     Ok(match url.scheme() {
         // dummy doesn't care about parameters.
         "dummy" => Box::<DummyBuildService>::default(),
+        #[cfg(target_os = "linux")]
+        "oci" => {
+            // oci wants a path in which it creates bundles.
+            if url.path().is_empty() {
+                Err(std::io::Error::other("oci needs a bundle dir as path"))?
+            }
+
+            // TODO: make sandbox shell and rootless_uid_gid
+
+            Box::new(OCIBuildService::new(
+                url.path().into(),
+                blob_service,
+                directory_service,
+            ))
+        }
         scheme => {
             if scheme.starts_with("grpc+") {
                 let client = crate::proto::build_service_client::BuildServiceClient::new(
@@ -50,12 +69,18 @@ mod tests {
     use std::sync::Arc;
 
     use super::from_addr;
+    use lazy_static::lazy_static;
     use rstest::rstest;
+    use tempfile::TempDir;
     use tvix_castore::{
         blobservice::{BlobService, MemoryBlobService},
         directoryservice::{DirectoryService, MemoryDirectoryService},
     };
 
+    lazy_static! {
+        static ref TMPDIR_OCI_1: TempDir = TempDir::new().unwrap();
+    }
+
     #[rstest]
     /// This uses an unsupported scheme.
     #[case::unsupported_scheme("http://foo.example/test", false)]
@@ -73,6 +98,10 @@ mod tests {
     #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)]
     /// Correct scheme to connect to localhost over http, but with additional path, which is invalid.
     #[case::grpc_invalid_host_and_path("grpc+http://localhost/some-path", false)]
+    /// This configures OCI, but doesn't specify the bundle path
+    #[case::oci_missing_bundle_dir("oci://", false)]
+    /// This configures OCI, specifying the bundle path
+    #[case::oci_bundle_path(&format!("oci://{}", TMPDIR_OCI_1.path().to_str().unwrap()), true)]
     #[tokio::test]
     async fn test_from_addr(#[case] uri_str: &str, #[case] exp_succeed: bool) {
         let blob_service: Arc<dyn BlobService> = Arc::from(MemoryBlobService::default());
diff --git a/tvix/build/src/buildservice/mod.rs b/tvix/build/src/buildservice/mod.rs
index a61d782919b9..cdc3cb2afcc3 100644
--- a/tvix/build/src/buildservice/mod.rs
+++ b/tvix/build/src/buildservice/mod.rs
@@ -6,6 +6,9 @@ mod dummy;
 mod from_addr;
 mod grpc;
 
+#[cfg(target_os = "linux")]
+mod oci;
+
 pub use dummy::DummyBuildService;
 pub use from_addr::from_addr;
 
diff --git a/tvix/build/src/buildservice/oci.rs b/tvix/build/src/buildservice/oci.rs
new file mode 100644
index 000000000000..26e6f5027f49
--- /dev/null
+++ b/tvix/build/src/buildservice/oci.rs
@@ -0,0 +1,245 @@
+use anyhow::Context;
+use bstr::BStr;
+use oci_spec::runtime::{LinuxIdMapping, LinuxIdMappingBuilder};
+use tokio::process::{Child, Command};
+use tonic::async_trait;
+use tracing::{debug, instrument, warn, Span};
+use tvix_castore::{
+    blobservice::BlobService, directoryservice::DirectoryService, fs::fuse::FuseDaemon,
+    import::fs::ingest_path, Node, PathComponent,
+};
+use uuid::Uuid;
+
+use crate::{
+    oci::{get_host_output_paths, make_bundle, make_spec},
+    proto::{Build, BuildRequest},
+};
+use std::{collections::BTreeMap, ffi::OsStr, path::PathBuf, process::Stdio};
+
+use super::BuildService;
+
+const SANDBOX_SHELL: &str = env!("TVIX_BUILD_SANDBOX_SHELL");
+const MAX_CONCURRENT_BUILDS: usize = 2; // TODO: make configurable
+
+pub struct OCIBuildService<BS, DS> {
+    /// Root path in which all bundles are created in
+    bundle_root: PathBuf,
+
+    /// uid mappings to set up for the workloads
+    uid_mappings: Vec<LinuxIdMapping>,
+    /// uid mappings to set up for the workloads
+    gid_mappings: Vec<LinuxIdMapping>,
+
+    /// Handle to a [BlobService], used by filesystems spawned during builds.
+    blob_service: BS,
+    /// Handle to a [DirectoryService], used by filesystems spawned during builds.
+    directory_service: DS,
+
+    // semaphore to track number of concurrently running builds.
+    // this is necessary, as otherwise we very quickly run out of open file handles.
+    concurrent_builds: tokio::sync::Semaphore,
+}
+
+impl<BS, DS> OCIBuildService<BS, DS> {
+    pub fn new(bundle_root: PathBuf, blob_service: BS, directory_service: DS) -> Self {
+        // We map root inside the container to the uid/gid this is running at,
+        // and allocate one for uid 1000 into the container from the range we
+        // got in /etc/sub{u,g}id.
+        // TODO: actually read uid, and /etc/subuid. Maybe only when we try to build?
+        // FUTUREWORK: use different uids?
+        Self {
+            bundle_root,
+            blob_service,
+            directory_service,
+            uid_mappings: vec![
+                LinuxIdMappingBuilder::default()
+                    .host_id(1000_u32)
+                    .container_id(0_u32)
+                    .size(1_u32)
+                    .build()
+                    .unwrap(),
+                LinuxIdMappingBuilder::default()
+                    .host_id(100000_u32)
+                    .container_id(1000_u32)
+                    .size(1_u32)
+                    .build()
+                    .unwrap(),
+            ],
+            gid_mappings: vec![
+                LinuxIdMappingBuilder::default()
+                    .host_id(100_u32)
+                    .container_id(0_u32)
+                    .size(1_u32)
+                    .build()
+                    .unwrap(),
+                LinuxIdMappingBuilder::default()
+                    .host_id(100000_u32)
+                    .container_id(100_u32)
+                    .size(1_u32)
+                    .build()
+                    .unwrap(),
+            ],
+            concurrent_builds: tokio::sync::Semaphore::new(MAX_CONCURRENT_BUILDS),
+        }
+    }
+}
+
+#[async_trait]
+impl<BS, DS> BuildService for OCIBuildService<BS, DS>
+where
+    BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static,
+    DS: AsRef<dyn DirectoryService> + Send + Sync + Clone + 'static,
+{
+    #[instrument(skip_all, err)]
+    async fn do_build(&self, request: BuildRequest) -> std::io::Result<Build> {
+        let _permit = self.concurrent_builds.acquire().await.unwrap();
+
+        let bundle_name = Uuid::new_v4();
+        let bundle_path = self.bundle_root.join(bundle_name.to_string());
+
+        let span = Span::current();
+        span.record("bundle_name", bundle_name.to_string());
+
+        let mut runtime_spec = make_spec(&request, true, SANDBOX_SHELL)
+            .context("failed to create spec")
+            .map_err(std::io::Error::other)?;
+
+        let mut linux = runtime_spec.linux().clone().unwrap();
+
+        // edit the spec, we need to setup uid/gid mappings.
+        linux.set_uid_mappings(Some(self.uid_mappings.clone()));
+        linux.set_gid_mappings(Some(self.gid_mappings.clone()));
+
+        runtime_spec.set_linux(Some(linux));
+
+        make_bundle(&request, &runtime_spec, &bundle_path)
+            .context("failed to produce bundle")
+            .map_err(std::io::Error::other)?;
+
+        // pre-calculate the locations we want to later ingest, in the order of
+        // the original outputs.
+        // If we can't find calculate that path, don't start the build in first place.
+        let host_output_paths = get_host_output_paths(&request, &bundle_path)
+            .context("failed to calculate host output paths")
+            .map_err(std::io::Error::other)?;
+
+        // NOTE: impl Drop for FuseDaemon unmounts, so if the call is cancelled, umount.
+        let _fuse_daemon = tokio::task::spawn_blocking({
+            let blob_service = self.blob_service.clone();
+            let directory_service = self.directory_service.clone();
+            // assemble a BTreeMap of Nodes to pass into TvixStoreFs.
+            let root_nodes: BTreeMap<PathComponent, Node> =
+                BTreeMap::from_iter(request.inputs.iter().map(|input| {
+                    // We know from validation this is Some.
+                    input.clone().into_name_and_node().unwrap()
+                }));
+
+            debug!(inputs=?root_nodes.keys(), "got inputs");
+
+            let dest = bundle_path.join("inputs");
+
+            move || {
+                let fs = tvix_castore::fs::TvixStoreFs::new(
+                    blob_service,
+                    directory_service,
+                    Box::new(root_nodes),
+                    true,
+                    false,
+                );
+                // mount the filesystem and wait for it to be unmounted.
+                // FUTUREWORK: make fuse daemon threads configurable?
+                FuseDaemon::new(fs, dest, 4, true).context("failed to start fuse daemon")
+            }
+        })
+        .await?
+        .context("mounting")
+        .map_err(std::io::Error::other)?;
+
+        debug!(bundle.path=?bundle_path, bundle.name=%bundle_name, "about to spawn bundle");
+
+        // start the bundle as another process.
+        let child = spawn_bundle(bundle_path, &bundle_name.to_string())?;
+
+        // wait for the process to exit
+        // FUTUREWORK: change the trait to allow reporting progress / logs…
+        let child_output = child
+            .wait_with_output()
+            .await
+            .context("failed to run process")
+            .map_err(std::io::Error::other)?;
+
+        // Check the exit code
+        if !child_output.status.success() {
+            let stdout = BStr::new(&child_output.stdout);
+            let stderr = BStr::new(&child_output.stderr);
+
+            warn!(stdout=%stdout, stderr=%stderr, exit_code=%child_output.status, "build failed");
+
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "nonzero exit code".to_string(),
+            ));
+        }
+
+        // Ingest build outputs into the castore.
+        // We use try_join_all here. No need to spawn new tasks, as this is
+        // mostly IO bound.
+        let outputs = futures::future::try_join_all(host_output_paths.into_iter().enumerate().map(
+            |(i, p)| {
+                let output_path = request.outputs[i].clone();
+                async move {
+                    debug!(host.path=?p, output.path=?output_path, "ingesting path");
+
+                    let output_node = ingest_path::<_, _, _, &[u8]>(
+                        self.blob_service.clone(),
+                        &self.directory_service,
+                        p,
+                        None,
+                    )
+                    .await
+                    .map_err(|e| {
+                        std::io::Error::new(
+                            std::io::ErrorKind::InvalidData,
+                            format!("Unable to ingest output: {}", e),
+                        )
+                    })?;
+
+                    Ok::<_, std::io::Error>(tvix_castore::proto::Node::from_name_and_node(
+                        "".into(),
+                        output_node,
+                    ))
+                }
+            },
+        ))
+        .await?;
+
+        Ok(Build {
+            build_request: Some(request.clone()),
+            outputs,
+            outputs_needles: vec![], // TODO refscanning
+        })
+    }
+}
+
+/// Spawns runc with the bundle at bundle_path.
+/// On success, returns the child.
+#[instrument(err)]
+fn spawn_bundle(
+    bundle_path: impl AsRef<OsStr> + std::fmt::Debug,
+    bundle_name: &str,
+) -> std::io::Result<Child> {
+    let mut command = Command::new("runc");
+
+    command
+        .args(&[
+            "run".into(),
+            "--bundle".into(),
+            bundle_path.as_ref().to_os_string(),
+            bundle_name.into(),
+        ])
+        .stderr(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stdin(Stdio::null());
+
+    command.spawn()
+}
diff --git a/tvix/build/src/lib.rs b/tvix/build/src/lib.rs
index b173657e431c..ccd7657101d8 100644
--- a/tvix/build/src/lib.rs
+++ b/tvix/build/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod buildservice;
+mod oci;
 pub mod proto;
diff --git a/tvix/build/src/oci/bundle.rs b/tvix/build/src/oci/bundle.rs
new file mode 100644
index 000000000000..c3c2e83e89e5
--- /dev/null
+++ b/tvix/build/src/oci/bundle.rs
@@ -0,0 +1,133 @@
+//! Module to create an OCI runtime bundle for a given [BuildRequest].
+use std::{
+    fs,
+    path::{Path, PathBuf},
+};
+
+use super::scratch_name;
+use crate::proto::BuildRequest;
+use anyhow::{bail, Context};
+use tracing::{debug, instrument};
+
+/// Produce an OCI bundle in a given path.
+/// Check [make_spec] for a description about the paths produced.
+#[instrument(err)]
+pub(crate) fn make_bundle<'a>(
+    request: &BuildRequest,
+    runtime_spec: &oci_spec::runtime::Spec,
+    path: &Path,
+) -> anyhow::Result<()> {
+    fs::create_dir_all(path).context("failed to create bundle path")?;
+
+    let spec_json = serde_json::to_string(runtime_spec).context("failed to render spec to json")?;
+    fs::write(path.join("config.json"), spec_json).context("failed to write config.json")?;
+
+    fs::create_dir_all(path.join("inputs")).context("failed to create inputs dir")?;
+
+    let root_path = path.join("root");
+
+    fs::create_dir_all(&root_path).context("failed to create root path dir")?;
+    fs::create_dir_all(root_path.join("etc")).context("failed to create root/etc dir")?;
+
+    // TODO: populate /etc/{group,passwd}. It's a mess?
+
+    let scratch_root = path.join("scratch");
+    fs::create_dir_all(&scratch_root).context("failed to create scratch/ dir")?;
+
+    // for each scratch path, calculate its name inside scratch, and ensure the
+    // directory exists.
+    for p in request.scratch_paths.iter() {
+        let scratch_path = scratch_root.join(scratch_name(p));
+        debug!(scratch_path=?scratch_path, path=?p, "about to create scratch dir");
+        fs::create_dir_all(scratch_path).context("Unable to create scratch dir")?;
+    }
+
+    Ok(())
+}
+
+/// Determine the path of all outputs specified in a [BuildRequest]
+/// as seen from the host, for post-build ingestion.
+/// This lookup needs to take scratch paths into consideration, as the build
+/// root is not writable on its own.
+/// If a path can't be determined, an error is returned.
+pub(crate) fn get_host_output_paths(
+    request: &BuildRequest,
+    bundle_path: &Path,
+) -> anyhow::Result<Vec<PathBuf>> {
+    let scratch_root = bundle_path.join("scratch");
+
+    let mut host_output_paths: Vec<PathBuf> = Vec::with_capacity(request.outputs.len());
+
+    for output_path in request.outputs.iter() {
+        // calculate the location of the path.
+        if let Some((mp, relpath)) =
+            find_path_in_scratchs(output_path, request.scratch_paths.as_slice())
+        {
+            host_output_paths.push(scratch_root.join(scratch_name(mp)).join(relpath));
+        } else {
+            bail!("unable to find path {}", output_path);
+        }
+    }
+
+    Ok(host_output_paths)
+}
+
+/// For a given list of mountpoints (sorted) and a search_path, find the
+/// specific mountpoint parenting that search_path and return it, as well as the
+/// relative path from there to the search_path.
+/// mountpoints must be sorted, so we can iterate over the list from the back
+/// and match on the prefix.
+fn find_path_in_scratchs<'a, 'b>(
+    search_path: &'a str,
+    mountpoints: &'b [String],
+) -> Option<(&'b str, &'a str)> {
+    mountpoints.iter().rev().find_map(|mp| {
+        Some((
+            mp.as_str(),
+            search_path.strip_prefix(mp)?.strip_prefix('/')?,
+        ))
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::{Path, PathBuf};
+
+    use rstest::rstest;
+
+    use crate::{oci::scratch_name, proto::BuildRequest};
+
+    use super::{find_path_in_scratchs, get_host_output_paths};
+
+    #[rstest]
+    #[case::simple("nix/store/aaaa", &["nix/store".into()], Some(("nix/store", "aaaa")))]
+    #[case::prefix_no_sep("nix/store/aaaa", &["nix/sto".into()], None)]
+    #[case::not_found("nix/store/aaaa", &["build".into()], None)]
+    fn test_test_find_path_in_scratchs(
+        #[case] search_path: &str,
+        #[case] mountpoints: &[String],
+        #[case] expected: Option<(&str, &str)>,
+    ) {
+        assert_eq!(find_path_in_scratchs(search_path, mountpoints), expected);
+    }
+
+    #[test]
+    fn test_get_host_output_paths_simple() {
+        let request = BuildRequest {
+            outputs: vec!["nix/store/fhaj6gmwns62s6ypkcldbaj2ybvkhx3p-foo".into()],
+            scratch_paths: vec!["build".into(), "nix/store".into()],
+            ..Default::default()
+        };
+
+        let paths =
+            get_host_output_paths(&request, Path::new("bundle-root")).expect("must succeed");
+
+        let mut expected_path = PathBuf::new();
+        expected_path.push("bundle-root");
+        expected_path.push("scratch");
+        expected_path.push(scratch_name("nix/store"));
+        expected_path.push("fhaj6gmwns62s6ypkcldbaj2ybvkhx3p-foo");
+
+        assert_eq!(vec![expected_path], paths)
+    }
+}
diff --git a/tvix/build/src/oci/mod.rs b/tvix/build/src/oci/mod.rs
new file mode 100644
index 000000000000..26dab3059a58
--- /dev/null
+++ b/tvix/build/src/oci/mod.rs
@@ -0,0 +1,13 @@
+mod bundle;
+mod spec;
+
+pub(crate) use bundle::get_host_output_paths;
+pub(crate) use bundle::make_bundle;
+pub(crate) use spec::make_spec;
+
+/// For a given scratch path, return the scratch_name that's allocated.
+// We currently use use lower hex encoding of the b3 digest of the scratch
+// path, so we don't need to globally allocate and pass down some uuids.
+pub(crate) fn scratch_name(scratch_path: &str) -> String {
+    data_encoding::BASE32.encode(blake3::hash(scratch_path.as_bytes()).as_bytes())
+}
diff --git a/tvix/build/src/oci/spec.rs b/tvix/build/src/oci/spec.rs
new file mode 100644
index 000000000000..d804aa1171c1
--- /dev/null
+++ b/tvix/build/src/oci/spec.rs
@@ -0,0 +1,319 @@
+//! Module to create a OCI runtime spec for a given [BuildRequest].
+use crate::proto::BuildRequest;
+use oci_spec::{
+    runtime::{Capability, LinuxNamespace, LinuxNamespaceBuilder, LinuxNamespaceType},
+    OciSpecError,
+};
+use std::{collections::HashSet, path::Path};
+use tvix_castore::proto as castorepb;
+
+use super::scratch_name;
+
+/// For a given [BuildRequest], return an OCI runtime spec.
+///
+/// While there's no IO occuring in this function, the generated spec contains
+/// path references relative to the "bundle location".
+/// Due to overlayfs requiring its layers to be absolute paths, we also need a
+/// [bundle_dir] parameter, pointing to the location of the bundle dir itself.
+///
+/// The paths used in the spec are the following (relative to a "bundle root"):
+///
+/// - `inputs`, a directory where the castore nodes specified the build request
+///   inputs are supposed to be populated.
+/// - `outputs`, a directory where all writes to the store_dir during the build
+///   are directed to.
+/// - `root`, a minimal skeleton of files that'll be present at /.
+/// - `scratch`, a directory containing other directories which will be
+///   bind-mounted read-write into the container and used as scratch space
+///   during the build.
+///   No assumptions should be made about what's inside this directory.
+///
+/// Generating these paths, and populating contents, like a skeleton root
+/// is up to another function, this function doesn't do filesystem IO.
+pub(crate) fn make_spec(
+    request: &BuildRequest,
+    rootless: bool,
+    sandbox_shell: &str,
+) -> Result<oci_spec::runtime::Spec, oci_spec::OciSpecError> {
+    // TODO: add BuildRequest validations. BuildRequest must contain strings as inputs
+
+    let allow_network = request
+        .constraints
+        .as_ref()
+        .is_some_and(|c| c.network_access);
+
+    // Assemble ro_host_mounts. Start with constraints.available_ro_paths.
+    let mut ro_host_mounts = request
+        .constraints
+        .as_ref()
+        .map(|constraints| {
+            constraints
+                .available_ro_paths
+                .iter()
+                .map(|e| (e.as_str(), e.as_str()))
+                .collect::<Vec<_>>()
+        })
+        .unwrap_or_default();
+
+    // If provide_bin_sh is set, mount sandbox_shell to /bin/sh
+    if request
+        .constraints
+        .as_ref()
+        .is_some_and(|c| c.provide_bin_sh)
+    {
+        ro_host_mounts.push((sandbox_shell, "/bin/sh"))
+    }
+
+    oci_spec::runtime::SpecBuilder::default()
+        .process(configure_process(
+            &request.command_args,
+            &request.working_dir,
+            request
+                .environment_vars
+                .iter()
+                .map(|e| {
+                    (
+                        e.key.as_str(),
+                        // TODO: decide what to do with non-bytes env values
+                        String::from_utf8(e.value.to_vec()).expect("invalid string in env"),
+                    )
+                })
+                .collect::<Vec<_>>(),
+            rootless,
+        )?)
+        .linux(configure_linux(allow_network, rootless)?)
+        .root(
+            oci_spec::runtime::RootBuilder::default()
+                .path("root")
+                .readonly(true)
+                .build()?,
+        )
+        .hostname("localhost")
+        .mounts(configure_mounts(
+            rootless,
+            allow_network,
+            request.scratch_paths.iter().map(|e| e.as_str()),
+            request.inputs.iter(),
+            &request.inputs_dir, // TODO: validate
+            ro_host_mounts,
+        )?)
+        .build()
+}
+
+/// Return the Process part of the OCI Runtime spec.
+/// This configures the command, it's working dir, env and terminal setup.
+/// It also takes care of setting rlimits and capabilities.
+/// Capabilities are a bit more complicated in case rootless building is requested.
+fn configure_process<'a>(
+    command_args: &[String],
+    cwd: &String,
+    env: impl IntoIterator<Item = (&'a str, String)>,
+    rootless: bool,
+) -> Result<oci_spec::runtime::Process, oci_spec::OciSpecError> {
+    let spec_builder = oci_spec::runtime::ProcessBuilder::default()
+        .args(command_args)
+        .env(
+            env.into_iter()
+                .map(|(k, v)| format!("{}={}", k, v))
+                .collect::<Vec<_>>(),
+        )
+        .terminal(true)
+        .user(
+            oci_spec::runtime::UserBuilder::default()
+                .uid(1000u32)
+                .gid(100u32)
+                .build()?,
+        )
+        .cwd(Path::new("/").join(cwd)) // relative to the bundle root, but at least runc wants it to also be absolute.
+        .capabilities({
+            let caps: HashSet<Capability> = if !rootless {
+                HashSet::from([Capability::AuditWrite, Capability::Kill])
+            } else {
+                HashSet::from([
+                    Capability::AuditWrite,
+                    Capability::Chown,
+                    Capability::DacOverride,
+                    Capability::Fowner,
+                    Capability::Fsetid,
+                    Capability::Kill,
+                    Capability::Mknod,
+                    Capability::NetBindService,
+                    Capability::NetRaw,
+                    Capability::Setfcap,
+                    Capability::Setgid,
+                    Capability::Setpcap,
+                    Capability::Setuid,
+                    Capability::SysChroot,
+                ])
+            };
+
+            oci_spec::runtime::LinuxCapabilitiesBuilder::default()
+                .bounding(caps.clone())
+                .effective(caps.clone())
+                .inheritable(caps.clone())
+                .permitted(caps.clone())
+                .ambient(caps)
+                .build()?
+        })
+        .rlimits([oci_spec::runtime::LinuxRlimitBuilder::default()
+            .typ(oci_spec::runtime::LinuxRlimitType::RlimitNofile)
+            .hard(1024_u64)
+            .soft(1024_u64)
+            .build()?])
+        .no_new_privileges(true);
+
+    spec_builder.build()
+}
+
+/// Return the Linux part of the OCI Runtime spec.
+/// This configures various namespaces, masked and read-only paths.
+fn configure_linux(
+    allow_network: bool,
+    rootless: bool,
+) -> Result<oci_spec::runtime::Linux, OciSpecError> {
+    let mut linux = oci_spec::runtime::Linux::default();
+
+    // explicitly set namespaces, depending on allow_network.
+    linux.set_namespaces(Some({
+        let mut namespace_types = vec![
+            LinuxNamespaceType::Pid,
+            LinuxNamespaceType::Ipc,
+            LinuxNamespaceType::Uts,
+            LinuxNamespaceType::Mount,
+            LinuxNamespaceType::Cgroup,
+        ];
+        if !allow_network {
+            namespace_types.push(LinuxNamespaceType::Network)
+        }
+        if rootless {
+            namespace_types.push(LinuxNamespaceType::User)
+        }
+
+        namespace_types
+            .into_iter()
+            .map(|e| LinuxNamespaceBuilder::default().typ(e).build())
+            .collect::<Result<Vec<LinuxNamespace>, _>>()?
+    }));
+
+    linux.set_masked_paths(Some(
+        [
+            "/proc/kcore",
+            "/proc/latency_stats",
+            "/proc/timer_list",
+            "/proc/timer_stats",
+            "/proc/sched_debug",
+            "/sys/firmware",
+        ]
+        .into_iter()
+        .map(|e| e.to_string())
+        .collect::<Vec<_>>(),
+    ));
+
+    linux.set_readonly_paths(Some(
+        [
+            "/proc/asound",
+            "/proc/bus",
+            "/proc/fs",
+            "/proc/irq",
+            "/proc/sys",
+            "/proc/sysrq-trigger",
+        ]
+        .into_iter()
+        .map(|e| e.to_string())
+        .collect::<Vec<_>>(),
+    ));
+
+    Ok(linux)
+}
+
+/// Return the Mounts part of the OCI Runtime spec.
+/// It first sets up the standard mounts, then scratch paths, bind mounts for
+/// all inputs, and finally read-only paths from the hosts.
+fn configure_mounts<'a>(
+    rootless: bool,
+    allow_network: bool,
+    scratch_paths: impl IntoIterator<Item = &'a str>,
+    inputs: impl Iterator<Item = &'a castorepb::Node>,
+    inputs_dir: &str,
+    ro_host_mounts: impl IntoIterator<Item = (&'a str, &'a str)>,
+) -> Result<Vec<oci_spec::runtime::Mount>, oci_spec::OciSpecError> {
+    let mut mounts: Vec<_> = if rootless {
+        oci_spec::runtime::get_rootless_mounts()
+    } else {
+        oci_spec::runtime::get_default_mounts()
+    };
+
+    mounts.push(configure_mount(
+        "tmpfs",
+        "/tmp",
+        "tmpfs",
+        &["nosuid", "noatime", "mode=700"],
+    )?);
+
+    // For each scratch path, create a bind mount entry.
+    let scratch_root = Path::new("scratch"); // relative path
+    for scratch_path in scratch_paths.into_iter() {
+        let src = scratch_root.join(scratch_name(scratch_path));
+        mounts.push(configure_mount(
+            src.to_str().unwrap(),
+            Path::new("/").join(scratch_path).to_str().unwrap(),
+            "none",
+            &["rbind", "rw"],
+        )?);
+    }
+
+    // For each input, create a bind mount from inputs/$name into $inputs_dir/$name.
+    for input in inputs {
+        let (input_name, _input) = input
+            .clone()
+            .into_name_and_node()
+            .expect("invalid input name");
+
+        let input_name = std::str::from_utf8(input_name.as_ref()).expect("invalid input name");
+        mounts.push(configure_mount(
+            Path::new("inputs").join(input_name).to_str().unwrap(),
+            Path::new("/")
+                .join(inputs_dir)
+                .join(input_name)
+                .to_str()
+                .unwrap(),
+            "none",
+            &[
+                "rbind", "ro",
+                // "nosuid" is required, otherwise mounting will just fail with
+                // a generic permission error.
+                // See https://github.com/wllenyj/containerd/commit/42a386c8164bef16d59590c61ab00806f854d8fd
+                "nosuid", "nodev",
+            ],
+        )?);
+    }
+
+    // Process ro_host_mounts
+    for (src, dst) in ro_host_mounts.into_iter() {
+        mounts.push(configure_mount(src, dst, "none", &["rbind", "ro"])?);
+    }
+
+    // In case network is enabled, also mount in /etc/{resolv.conf,services,hosts}
+    if allow_network {
+        for p in ["/etc/resolv.conf", "/etc/services", "/etc/hosts"] {
+            mounts.push(configure_mount(p, p, "none", &["rbind", "ro"])?);
+        }
+    }
+
+    Ok(mounts)
+}
+
+/// Helper function to produce a mount.
+fn configure_mount(
+    source: &str,
+    destination: &str,
+    typ: &str,
+    options: &[&str],
+) -> Result<oci_spec::runtime::Mount, oci_spec::OciSpecError> {
+    oci_spec::runtime::MountBuilder::default()
+        .destination(destination.to_string())
+        .typ(typ.to_string())
+        .source(source.to_string())
+        .options(options.iter().map(|e| e.to_string()).collect::<Vec<_>>())
+        .build()
+}