diff options
author | Florian Klink <flokli@flokli.de> | 2024-09-27T12·13+0200 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2024-10-01T13·41+0000 |
commit | 2414c872821ab7ad8c9ff8dca5a91433ffb307f3 (patch) | |
tree | 22479c65cd145aea584853156bafce9ddd72e825 /tvix/build | |
parent | cf91917a9d8a4ca34a8451a0e0f31f0c07f755dc (diff) |
feat(tvix/build), add OciBuildService, the old way r/8744
This is just patchset 10 of CL10855, before the color_eyre changes, rebased to the tvix_castore api. Change-Id: If4b42412ff8568058908cda971ad7d6f2d9f9b7b --- This provides a build service invoking runc. It can be used by using the `oci://$path_to_some_tempdir` builder URL for now. For now, it can be tested as such: ``` BUILD_SERVICE_ADDR=oci://$PWD/bundles target/debug/tvix let pkgs = (import <nixpkgs> {}); in builtins.readDir pkgs.perl ``` readDir is to actually trigger IO into the store path (which triggers the builds). For now it fails due to missing reference scanning (see followup CLs). Change-Id: I09b40e410114ce69966a41a0e3c33281b859e443 Reviewed-on: https://cl.tvl.fyi/c/depot/+/12526 Autosubmit: yuka <yuka@yuka.dev> Tested-by: BuildkiteCI Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/build')
-rw-r--r-- | tvix/build/Cargo.toml | 18 | ||||
-rw-r--r-- | tvix/build/build.rs | 4 | ||||
-rw-r--r-- | tvix/build/src/buildservice/from_addr.rs | 33 | ||||
-rw-r--r-- | tvix/build/src/buildservice/mod.rs | 3 | ||||
-rw-r--r-- | tvix/build/src/buildservice/oci.rs | 245 | ||||
-rw-r--r-- | tvix/build/src/lib.rs | 1 | ||||
-rw-r--r-- | tvix/build/src/oci/bundle.rs | 133 | ||||
-rw-r--r-- | tvix/build/src/oci/mod.rs | 13 | ||||
-rw-r--r-- | tvix/build/src/oci/spec.rs | 319 |
9 files changed, 763 insertions, 6 deletions
diff --git a/tvix/build/Cargo.toml b/tvix/build/Cargo.toml index b9073b7ff61a..842c4e7afad8 100644 --- a/tvix/build/Cargo.toml +++ b/tvix/build/Cargo.toml @@ -9,16 +9,26 @@ clap = { workspace = true, features = ["derive", "env"] } itertools = { workspace = true } prost = { workspace = true } thiserror = { workspace = true } -tokio = { workspace = true } +tokio = { workspace = true, features = ["process"] } tokio-listener = { workspace = true, features = ["tonic012"] } tonic = { workspace = true, features = ["tls", "tls-roots"] } -tvix-castore = { path = "../castore" } -tvix-tracing = { path = "../tracing" } +# TODO: put the fuse dep behind a feature flag? +tvix-castore = { path = "../castore", features = ["fuse"]} tracing = { workspace = true } url = { workspace = true } mimalloc = { workspace = true } tonic-reflection = { workspace = true, optional = true } +anyhow = "1.0.79" +blake3 = "1.5.0" +bstr = "1.6.0" +data-encoding = "2.5.0" +futures = "0.3.30" +oci-spec = "0.6.4" +serde_json = "1.0.111" +tvix-tracing = { path = "../tracing" } +uuid = { version = "1.7.0", features = ["v4"] } + [build-dependencies] prost-build = { workspace = true } tonic-build = { workspace = true } @@ -29,6 +39,8 @@ tonic-reflection = ["dep:tonic-reflection", "tvix-castore/tonic-reflection"] [dev-dependencies] rstest = { workspace = true } +lazy_static = "1.4.0" +tempfile = "3.3.0" [lints] workspace = true diff --git a/tvix/build/build.rs b/tvix/build/build.rs index fe230cbeca8a..909d6bab0875 100644 --- a/tvix/build/build.rs +++ b/tvix/build/build.rs @@ -30,5 +30,7 @@ fn main() -> Result<()> { Some(proto_root) => proto_root.to_str().unwrap().to_owned(), None => "../..".to_string(), }], - ) + )?; + + Ok(()) } diff --git a/tvix/build/src/buildservice/from_addr.rs b/tvix/build/src/buildservice/from_addr.rs index cc5403edefff..a7afba1138af 100644 --- a/tvix/build/src/buildservice/from_addr.rs +++ b/tvix/build/src/buildservice/from_addr.rs @@ -2,18 +2,22 @@ use super::{grpc::GRPCBuildService, 
BuildService, DummyBuildService}; use tvix_castore::{blobservice::BlobService, directoryservice::DirectoryService}; use url::Url; +#[cfg(target_os = "linux")] +use super::oci::OCIBuildService; + /// Constructs a new instance of a [BuildService] from an URI. /// /// The following schemes are supported by the following services: /// - `dummy://` ([DummyBuildService]) +/// - `oci://` ([OCIBuildService]) /// - `grpc+*://` ([GRPCBuildService]) /// /// As some of these [BuildService] need to talk to a [BlobService] and /// [DirectoryService], these also need to be passed in. pub async fn from_addr<BS, DS>( uri: &str, - _blob_service: BS, - _directory_service: DS, + blob_service: BS, + directory_service: DS, ) -> std::io::Result<Box<dyn BuildService>> where BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static, @@ -25,6 +29,21 @@ where Ok(match url.scheme() { // dummy doesn't care about parameters. "dummy" => Box::<DummyBuildService>::default(), + #[cfg(target_os = "linux")] + "oci" => { + // oci wants a path in which it creates bundles. + if url.path().is_empty() { + Err(std::io::Error::other("oci needs a bundle dir as path"))? + } + + // TODO: make sandbox shell and rootless_uid_gid + + Box::new(OCIBuildService::new( + url.path().into(), + blob_service, + directory_service, + )) + } scheme => { if scheme.starts_with("grpc+") { let client = crate::proto::build_service_client::BuildServiceClient::new( @@ -50,12 +69,18 @@ mod tests { use std::sync::Arc; use super::from_addr; + use lazy_static::lazy_static; use rstest::rstest; + use tempfile::TempDir; use tvix_castore::{ blobservice::{BlobService, MemoryBlobService}, directoryservice::{DirectoryService, MemoryDirectoryService}, }; + lazy_static! { + static ref TMPDIR_OCI_1: TempDir = TempDir::new().unwrap(); + } + #[rstest] /// This uses an unsupported scheme. 
#[case::unsupported_scheme("http://foo.example/test", false)] @@ -73,6 +98,10 @@ mod tests { #[case::grpc_valid_https_host_without_port("grpc+https://localhost", true)] /// Correct scheme to connect to localhost over http, but with additional path, which is invalid. #[case::grpc_invalid_host_and_path("grpc+http://localhost/some-path", false)] + /// This configures OCI, but doesn't specify the bundle path + #[case::oci_missing_bundle_dir("oci://", false)] + /// This configures OCI, specifying the bundle path + #[case::oci_bundle_path(&format!("oci://{}", TMPDIR_OCI_1.path().to_str().unwrap()), true)] #[tokio::test] async fn test_from_addr(#[case] uri_str: &str, #[case] exp_succeed: bool) { let blob_service: Arc<dyn BlobService> = Arc::from(MemoryBlobService::default()); diff --git a/tvix/build/src/buildservice/mod.rs b/tvix/build/src/buildservice/mod.rs index a61d782919b9..cdc3cb2afcc3 100644 --- a/tvix/build/src/buildservice/mod.rs +++ b/tvix/build/src/buildservice/mod.rs @@ -6,6 +6,9 @@ mod dummy; mod from_addr; mod grpc; +#[cfg(target_os = "linux")] +mod oci; + pub use dummy::DummyBuildService; pub use from_addr::from_addr; diff --git a/tvix/build/src/buildservice/oci.rs b/tvix/build/src/buildservice/oci.rs new file mode 100644 index 000000000000..26e6f5027f49 --- /dev/null +++ b/tvix/build/src/buildservice/oci.rs @@ -0,0 +1,245 @@ +use anyhow::Context; +use bstr::BStr; +use oci_spec::runtime::{LinuxIdMapping, LinuxIdMappingBuilder}; +use tokio::process::{Child, Command}; +use tonic::async_trait; +use tracing::{debug, instrument, warn, Span}; +use tvix_castore::{ + blobservice::BlobService, directoryservice::DirectoryService, fs::fuse::FuseDaemon, + import::fs::ingest_path, Node, PathComponent, +}; +use uuid::Uuid; + +use crate::{ + oci::{get_host_output_paths, make_bundle, make_spec}, + proto::{Build, BuildRequest}, +}; +use std::{collections::BTreeMap, ffi::OsStr, path::PathBuf, process::Stdio}; + +use super::BuildService; + +const SANDBOX_SHELL: &str = 
env!("TVIX_BUILD_SANDBOX_SHELL"); +const MAX_CONCURRENT_BUILDS: usize = 2; // TODO: make configurable + +pub struct OCIBuildService<BS, DS> { + /// Root path in which all bundles are created in + bundle_root: PathBuf, + + /// uid mappings to set up for the workloads + uid_mappings: Vec<LinuxIdMapping>, + /// uid mappings to set up for the workloads + gid_mappings: Vec<LinuxIdMapping>, + + /// Handle to a [BlobService], used by filesystems spawned during builds. + blob_service: BS, + /// Handle to a [DirectoryService], used by filesystems spawned during builds. + directory_service: DS, + + // semaphore to track number of concurrently running builds. + // this is necessary, as otherwise we very quickly run out of open file handles. + concurrent_builds: tokio::sync::Semaphore, +} + +impl<BS, DS> OCIBuildService<BS, DS> { + pub fn new(bundle_root: PathBuf, blob_service: BS, directory_service: DS) -> Self { + // We map root inside the container to the uid/gid this is running at, + // and allocate one for uid 1000 into the container from the range we + // got in /etc/sub{u,g}id. + // TODO: actually read uid, and /etc/subuid. Maybe only when we try to build? + // FUTUREWORK: use different uids? 
+ Self { + bundle_root, + blob_service, + directory_service, + uid_mappings: vec![ + LinuxIdMappingBuilder::default() + .host_id(1000_u32) + .container_id(0_u32) + .size(1_u32) + .build() + .unwrap(), + LinuxIdMappingBuilder::default() + .host_id(100000_u32) + .container_id(1000_u32) + .size(1_u32) + .build() + .unwrap(), + ], + gid_mappings: vec![ + LinuxIdMappingBuilder::default() + .host_id(100_u32) + .container_id(0_u32) + .size(1_u32) + .build() + .unwrap(), + LinuxIdMappingBuilder::default() + .host_id(100000_u32) + .container_id(100_u32) + .size(1_u32) + .build() + .unwrap(), + ], + concurrent_builds: tokio::sync::Semaphore::new(MAX_CONCURRENT_BUILDS), + } + } +} + +#[async_trait] +impl<BS, DS> BuildService for OCIBuildService<BS, DS> +where + BS: AsRef<dyn BlobService> + Send + Sync + Clone + 'static, + DS: AsRef<dyn DirectoryService> + Send + Sync + Clone + 'static, +{ + #[instrument(skip_all, err)] + async fn do_build(&self, request: BuildRequest) -> std::io::Result<Build> { + let _permit = self.concurrent_builds.acquire().await.unwrap(); + + let bundle_name = Uuid::new_v4(); + let bundle_path = self.bundle_root.join(bundle_name.to_string()); + + let span = Span::current(); + span.record("bundle_name", bundle_name.to_string()); + + let mut runtime_spec = make_spec(&request, true, SANDBOX_SHELL) + .context("failed to create spec") + .map_err(std::io::Error::other)?; + + let mut linux = runtime_spec.linux().clone().unwrap(); + + // edit the spec, we need to setup uid/gid mappings. + linux.set_uid_mappings(Some(self.uid_mappings.clone())); + linux.set_gid_mappings(Some(self.gid_mappings.clone())); + + runtime_spec.set_linux(Some(linux)); + + make_bundle(&request, &runtime_spec, &bundle_path) + .context("failed to produce bundle") + .map_err(std::io::Error::other)?; + + // pre-calculate the locations we want to later ingest, in the order of + // the original outputs. + // If we can't find calculate that path, don't start the build in first place. 
+ let host_output_paths = get_host_output_paths(&request, &bundle_path) + .context("failed to calculate host output paths") + .map_err(std::io::Error::other)?; + + // NOTE: impl Drop for FuseDaemon unmounts, so if the call is cancelled, umount. + let _fuse_daemon = tokio::task::spawn_blocking({ + let blob_service = self.blob_service.clone(); + let directory_service = self.directory_service.clone(); + // assemble a BTreeMap of Nodes to pass into TvixStoreFs. + let root_nodes: BTreeMap<PathComponent, Node> = + BTreeMap::from_iter(request.inputs.iter().map(|input| { + // We know from validation this is Some. + input.clone().into_name_and_node().unwrap() + })); + + debug!(inputs=?root_nodes.keys(), "got inputs"); + + let dest = bundle_path.join("inputs"); + + move || { + let fs = tvix_castore::fs::TvixStoreFs::new( + blob_service, + directory_service, + Box::new(root_nodes), + true, + false, + ); + // mount the filesystem and wait for it to be unmounted. + // FUTUREWORK: make fuse daemon threads configurable? + FuseDaemon::new(fs, dest, 4, true).context("failed to start fuse daemon") + } + }) + .await? + .context("mounting") + .map_err(std::io::Error::other)?; + + debug!(bundle.path=?bundle_path, bundle.name=%bundle_name, "about to spawn bundle"); + + // start the bundle as another process. 
+ let child = spawn_bundle(bundle_path, &bundle_name.to_string())?; + + // wait for the process to exit + // FUTUREWORK: change the trait to allow reporting progress / logs… + let child_output = child + .wait_with_output() + .await + .context("failed to run process") + .map_err(std::io::Error::other)?; + + // Check the exit code + if !child_output.status.success() { + let stdout = BStr::new(&child_output.stdout); + let stderr = BStr::new(&child_output.stderr); + + warn!(stdout=%stdout, stderr=%stderr, exit_code=%child_output.status, "build failed"); + + return Err(std::io::Error::new( + std::io::ErrorKind::Other, + "nonzero exit code".to_string(), + )); + } + + // Ingest build outputs into the castore. + // We use try_join_all here. No need to spawn new tasks, as this is + // mostly IO bound. + let outputs = futures::future::try_join_all(host_output_paths.into_iter().enumerate().map( + |(i, p)| { + let output_path = request.outputs[i].clone(); + async move { + debug!(host.path=?p, output.path=?output_path, "ingesting path"); + + let output_node = ingest_path::<_, _, _, &[u8]>( + self.blob_service.clone(), + &self.directory_service, + p, + None, + ) + .await + .map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("Unable to ingest output: {}", e), + ) + })?; + + Ok::<_, std::io::Error>(tvix_castore::proto::Node::from_name_and_node( + "".into(), + output_node, + )) + } + }, + )) + .await?; + + Ok(Build { + build_request: Some(request.clone()), + outputs, + outputs_needles: vec![], // TODO refscanning + }) + } +} + +/// Spawns runc with the bundle at bundle_path. +/// On success, returns the child. 
+#[instrument(err)]
+fn spawn_bundle(
+    bundle_path: impl AsRef<OsStr> + std::fmt::Debug,
+    bundle_name: &str,
+) -> std::io::Result<Child> {
+    let mut command = Command::new("runc");
+
+    command
+        .args([
+            "run".into(),
+            "--bundle".into(),
+            bundle_path.as_ref().to_os_string(),
+            bundle_name.into(),
+        ])
+        .stderr(Stdio::piped())
+        .stdout(Stdio::piped())
+        .stdin(Stdio::null());
+
+    command.spawn()
+}
diff --git a/tvix/build/src/lib.rs b/tvix/build/src/lib.rs
index b173657e431c..ccd7657101d8 100644
--- a/tvix/build/src/lib.rs
+++ b/tvix/build/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod buildservice;
+mod oci;
 pub mod proto;
diff --git a/tvix/build/src/oci/bundle.rs b/tvix/build/src/oci/bundle.rs
new file mode 100644
index 000000000000..c3c2e83e89e5
--- /dev/null
+++ b/tvix/build/src/oci/bundle.rs
@@ -0,0 +1,133 @@
+//! Module to create an OCI runtime bundle for a given [BuildRequest].
+use std::{
+    fs,
+    path::{Path, PathBuf},
+};
+
+use super::scratch_name;
+use crate::proto::BuildRequest;
+use anyhow::{bail, Context};
+use tracing::{debug, instrument};
+
+/// Produce an OCI bundle in a given path.
+/// Check [super::make_spec] for a description about the paths produced.
+#[instrument(err)]
+pub(crate) fn make_bundle(
+    request: &BuildRequest,
+    runtime_spec: &oci_spec::runtime::Spec,
+    path: &Path,
+) -> anyhow::Result<()> {
+    fs::create_dir_all(path).context("failed to create bundle path")?;
+
+    let spec_json = serde_json::to_string(runtime_spec).context("failed to render spec to json")?;
+    fs::write(path.join("config.json"), spec_json).context("failed to write config.json")?;
+
+    fs::create_dir_all(path.join("inputs")).context("failed to create inputs dir")?;
+
+    let root_path = path.join("root");
+
+    fs::create_dir_all(&root_path).context("failed to create root path dir")?;
+    fs::create_dir_all(root_path.join("etc")).context("failed to create root/etc dir")?;
+
+    // TODO: populate /etc/{group,passwd}. It's a mess?
+
+    let scratch_root = path.join("scratch");
+    fs::create_dir_all(&scratch_root).context("failed to create scratch/ dir")?;
+
+    // for each scratch path, calculate its name inside scratch, and ensure the
+    // directory exists.
+    for p in request.scratch_paths.iter() {
+        let scratch_path = scratch_root.join(scratch_name(p));
+        debug!(scratch_path=?scratch_path, path=?p, "about to create scratch dir");
+        fs::create_dir_all(scratch_path).context("Unable to create scratch dir")?;
+    }
+
+    Ok(())
+}
+
+/// Determine the path of all outputs specified in a [BuildRequest]
+/// as seen from the host, for post-build ingestion.
+/// This lookup needs to take scratch paths into consideration, as the build
+/// root is not writable on its own.
+/// If a path can't be determined, an error is returned.
+pub(crate) fn get_host_output_paths(
+    request: &BuildRequest,
+    bundle_path: &Path,
+) -> anyhow::Result<Vec<PathBuf>> {
+    let scratch_root = bundle_path.join("scratch");
+
+    let mut host_output_paths: Vec<PathBuf> = Vec::with_capacity(request.outputs.len());
+
+    for output_path in request.outputs.iter() {
+        // calculate the location of the path.
+        if let Some((mp, relpath)) =
+            find_path_in_scratchs(output_path, request.scratch_paths.as_slice())
+        {
+            host_output_paths.push(scratch_root.join(scratch_name(mp)).join(relpath));
+        } else {
+            bail!("unable to find path {}", output_path);
+        }
+    }
+
+    Ok(host_output_paths)
+}
+
+/// For a given list of mountpoints (sorted) and a search_path, find the
+/// specific mountpoint parenting that search_path and return it, as well as the
+/// relative path from there to the search_path.
+/// mountpoints must be sorted, so we can iterate over the list from the back
+/// and match on the prefix.
+fn find_path_in_scratchs<'a, 'b>(
+    search_path: &'a str,
+    mountpoints: &'b [String],
+) -> Option<(&'b str, &'a str)> {
+    mountpoints.iter().rev().find_map(|mp| {
+        Some((
+            mp.as_str(),
+            search_path.strip_prefix(mp)?.strip_prefix('/')?,
+        ))
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use std::path::{Path, PathBuf};
+
+    use rstest::rstest;
+
+    use crate::{oci::scratch_name, proto::BuildRequest};
+
+    use super::{find_path_in_scratchs, get_host_output_paths};
+
+    #[rstest]
+    #[case::simple("nix/store/aaaa", &["nix/store".into()], Some(("nix/store", "aaaa")))]
+    #[case::prefix_no_sep("nix/store/aaaa", &["nix/sto".into()], None)]
+    #[case::not_found("nix/store/aaaa", &["build".into()], None)]
+    fn test_find_path_in_scratchs(
+        #[case] search_path: &str,
+        #[case] mountpoints: &[String],
+        #[case] expected: Option<(&str, &str)>,
+    ) {
+        assert_eq!(find_path_in_scratchs(search_path, mountpoints), expected);
+    }
+
+    #[test]
+    fn test_get_host_output_paths_simple() {
+        let request = BuildRequest {
+            outputs: vec!["nix/store/fhaj6gmwns62s6ypkcldbaj2ybvkhx3p-foo".into()],
+            scratch_paths: vec!["build".into(), "nix/store".into()],
+            ..Default::default()
+        };
+
+        let paths =
+            get_host_output_paths(&request, Path::new("bundle-root")).expect("must succeed");
+
+        let mut expected_path = PathBuf::new();
+        expected_path.push("bundle-root");
+        expected_path.push("scratch");
+        expected_path.push(scratch_name("nix/store"));
+        expected_path.push("fhaj6gmwns62s6ypkcldbaj2ybvkhx3p-foo");
+
+        assert_eq!(vec![expected_path], paths)
+    }
+}
diff --git a/tvix/build/src/oci/mod.rs b/tvix/build/src/oci/mod.rs
new file mode 100644
index 000000000000..26dab3059a58
--- /dev/null
+++ b/tvix/build/src/oci/mod.rs
@@ -0,0 +1,13 @@
+mod bundle;
+mod spec;
+
+pub(crate) use bundle::get_host_output_paths;
+pub(crate) use bundle::make_bundle;
+pub(crate) use spec::make_spec;
+
+/// For a given scratch path, return the scratch_name that's allocated.
+// We currently use the base32 encoding of the b3 digest of the scratch
+// path, so we don't need to globally allocate and pass down some uuids.
+pub(crate) fn scratch_name(scratch_path: &str) -> String {
+    data_encoding::BASE32.encode(blake3::hash(scratch_path.as_bytes()).as_bytes())
+}
diff --git a/tvix/build/src/oci/spec.rs b/tvix/build/src/oci/spec.rs
new file mode 100644
index 000000000000..d804aa1171c1
--- /dev/null
+++ b/tvix/build/src/oci/spec.rs
@@ -0,0 +1,319 @@
+//! Module to create an OCI runtime spec for a given [BuildRequest].
+use crate::proto::BuildRequest;
+use oci_spec::{
+    runtime::{Capability, LinuxNamespace, LinuxNamespaceBuilder, LinuxNamespaceType},
+    OciSpecError,
+};
+use std::{collections::HashSet, path::Path};
+use tvix_castore::proto as castorepb;
+
+use super::scratch_name;
+
+/// For a given [BuildRequest], return an OCI runtime spec.
+///
+/// While there's no IO occurring in this function, the generated spec contains
+/// path references relative to the "bundle location".
+/// `rootless` selects the rootless mounts, capabilities and a user namespace;
+/// `sandbox_shell` is the host path mounted to /bin/sh if provide_bin_sh is set.
+///
+/// The paths used in the spec are the following (relative to a "bundle root"):
+///
+/// - `inputs`, a directory where the castore nodes specified the build request
+///   inputs are supposed to be populated.
+/// - `outputs`, a directory meant to receive all writes to the store_dir
+///   during the build (NOTE: the mounts generated here don't reference it yet).
+/// - `root`, a minimal skeleton of files that'll be present at /.
+/// - `scratch`, a directory containing other directories which will be
+///   bind-mounted read-write into the container and used as scratch space
+///   during the build.
+///   No assumptions should be made about what's inside this directory.
+///
+/// Generating these paths, and populating contents, like a skeleton root
+/// is up to another function, this function doesn't do filesystem IO.
+pub(crate) fn make_spec(
+    request: &BuildRequest,
+    rootless: bool,
+    sandbox_shell: &str,
+) -> Result<oci_spec::runtime::Spec, oci_spec::OciSpecError> {
+    // TODO: add BuildRequest validations. BuildRequest must contain strings as inputs
+
+    let allow_network = request
+        .constraints
+        .as_ref()
+        .is_some_and(|c| c.network_access);
+
+    // Assemble ro_host_mounts. Start with constraints.available_ro_paths.
+    let mut ro_host_mounts = request
+        .constraints
+        .as_ref()
+        .map(|constraints| {
+            constraints
+                .available_ro_paths
+                .iter()
+                .map(|e| (e.as_str(), e.as_str()))
+                .collect::<Vec<_>>()
+        })
+        .unwrap_or_default();
+
+    // If provide_bin_sh is set, mount sandbox_shell to /bin/sh
+    if request
+        .constraints
+        .as_ref()
+        .is_some_and(|c| c.provide_bin_sh)
+    {
+        ro_host_mounts.push((sandbox_shell, "/bin/sh"))
+    }
+
+    oci_spec::runtime::SpecBuilder::default()
+        .process(configure_process(
+            &request.command_args,
+            &request.working_dir,
+            request
+                .environment_vars
+                .iter()
+                .map(|e| {
+                    // TODO: decide what to do with non-bytes env values.
+                    // For now, reject them with an error instead of panicking.
+                    String::from_utf8(e.value.to_vec())
+                        .map(|v| (e.key.as_str(), v))
+                        .map_err(|err| OciSpecError::Other(format!("invalid env value: {err}")))
+                })
+                .collect::<Result<Vec<_>, _>>()?,
+            rootless,
+        )?)
+        .linux(configure_linux(allow_network, rootless)?)
+        .root(
+            oci_spec::runtime::RootBuilder::default()
+                .path("root")
+                .readonly(true)
+                .build()?,
+        )
+        .hostname("localhost")
+        .mounts(configure_mounts(
+            rootless,
+            allow_network,
+            request.scratch_paths.iter().map(|e| e.as_str()),
+            request.inputs.iter(),
+            &request.inputs_dir, // TODO: validate
+            ro_host_mounts,
+        )?)
+        .build()
+}
+
+/// Return the Process part of the OCI Runtime spec.
+/// This configures the command, its working dir, env and terminal setup.
+/// It also takes care of setting rlimits and capabilities.
+/// Capabilities are a bit more complicated in case rootless building is requested.
+fn configure_process<'a>(
+    command_args: &[String],
+    cwd: &str,
+    env: impl IntoIterator<Item = (&'a str, String)>,
+    rootless: bool,
+) -> Result<oci_spec::runtime::Process, oci_spec::OciSpecError> {
+    let spec_builder = oci_spec::runtime::ProcessBuilder::default()
+        .args(command_args)
+        .env(
+            env.into_iter()
+                .map(|(k, v)| format!("{}={}", k, v))
+                .collect::<Vec<_>>(),
+        )
+        .terminal(true)
+        .user(
+            oci_spec::runtime::UserBuilder::default()
+                .uid(1000u32)
+                .gid(100u32)
+                .build()?,
+        )
+        .cwd(Path::new("/").join(cwd)) // relative to the bundle root, but at least runc wants it to also be absolute.
+        .capabilities({
+            let caps: HashSet<Capability> = if !rootless {
+                HashSet::from([Capability::AuditWrite, Capability::Kill])
+            } else {
+                HashSet::from([
+                    Capability::AuditWrite,
+                    Capability::Chown,
+                    Capability::DacOverride,
+                    Capability::Fowner,
+                    Capability::Fsetid,
+                    Capability::Kill,
+                    Capability::Mknod,
+                    Capability::NetBindService,
+                    Capability::NetRaw,
+                    Capability::Setfcap,
+                    Capability::Setgid,
+                    Capability::Setpcap,
+                    Capability::Setuid,
+                    Capability::SysChroot,
+                ])
+            };
+
+            oci_spec::runtime::LinuxCapabilitiesBuilder::default()
+                .bounding(caps.clone())
+                .effective(caps.clone())
+                .inheritable(caps.clone())
+                .permitted(caps.clone())
+                .ambient(caps)
+                .build()?
+        })
+        .rlimits([oci_spec::runtime::LinuxRlimitBuilder::default()
+            .typ(oci_spec::runtime::LinuxRlimitType::RlimitNofile)
+            .hard(1024_u64)
+            .soft(1024_u64)
+            .build()?])
+        .no_new_privileges(true);
+
+    spec_builder.build()
+}
+
+/// Return the Linux part of the OCI Runtime spec.
+/// This configures various namespaces, masked and read-only paths.
+fn configure_linux(
+    allow_network: bool,
+    rootless: bool,
+) -> Result<oci_spec::runtime::Linux, OciSpecError> {
+    let mut linux = oci_spec::runtime::Linux::default();
+
+    // explicitly set namespaces, depending on allow_network and rootless.
+    linux.set_namespaces(Some({
+        let mut namespace_types = vec![
+            LinuxNamespaceType::Pid,
+            LinuxNamespaceType::Ipc,
+            LinuxNamespaceType::Uts,
+            LinuxNamespaceType::Mount,
+            LinuxNamespaceType::Cgroup,
+        ];
+        if !allow_network {
+            namespace_types.push(LinuxNamespaceType::Network)
+        }
+        if rootless {
+            namespace_types.push(LinuxNamespaceType::User)
+        }
+
+        namespace_types
+            .into_iter()
+            .map(|e| LinuxNamespaceBuilder::default().typ(e).build())
+            .collect::<Result<Vec<LinuxNamespace>, _>>()?
+    }));
+
+    linux.set_masked_paths(Some(
+        [
+            "/proc/kcore",
+            "/proc/latency_stats",
+            "/proc/timer_list",
+            "/proc/timer_stats",
+            "/proc/sched_debug",
+            "/sys/firmware",
+        ]
+        .into_iter()
+        .map(|e| e.to_string())
+        .collect::<Vec<_>>(),
+    ));
+
+    linux.set_readonly_paths(Some(
+        [
+            "/proc/asound",
+            "/proc/bus",
+            "/proc/fs",
+            "/proc/irq",
+            "/proc/sys",
+            "/proc/sysrq-trigger",
+        ]
+        .into_iter()
+        .map(|e| e.to_string())
+        .collect::<Vec<_>>(),
+    ));
+
+    Ok(linux)
+}
+
+/// Return the Mounts part of the OCI Runtime spec.
+/// It first sets up the standard mounts, then scratch paths, bind mounts for
+/// all inputs, and finally read-only paths from the hosts.
+fn configure_mounts<'a>(
+    rootless: bool,
+    allow_network: bool,
+    scratch_paths: impl IntoIterator<Item = &'a str>,
+    inputs: impl Iterator<Item = &'a castorepb::Node>,
+    inputs_dir: &str,
+    ro_host_mounts: impl IntoIterator<Item = (&'a str, &'a str)>,
+) -> Result<Vec<oci_spec::runtime::Mount>, oci_spec::OciSpecError> {
+    let mut mounts: Vec<_> = if rootless {
+        oci_spec::runtime::get_rootless_mounts()
+    } else {
+        oci_spec::runtime::get_default_mounts()
+    };
+
+    mounts.push(configure_mount(
+        "tmpfs",
+        "/tmp",
+        "tmpfs",
+        &["nosuid", "noatime", "mode=700"],
+    )?);
+
+    // For each scratch path, create a bind mount entry.
+    let scratch_root = Path::new("scratch"); // relative path
+    for scratch_path in scratch_paths.into_iter() {
+        let src = scratch_root.join(scratch_name(scratch_path));
+        mounts.push(configure_mount(
+            src.to_str().unwrap(),
+            Path::new("/").join(scratch_path).to_str().unwrap(),
+            "none",
+            &["rbind", "rw"],
+        )?);
+    }
+
+    // For each input, create a bind mount from inputs/$name into $inputs_dir/$name.
+    for input in inputs {
+        let (input_name, _input) = input
+            .clone()
+            .into_name_and_node()
+            .expect("invalid input name");
+
+        let input_name = std::str::from_utf8(input_name.as_ref()).expect("invalid input name");
+        mounts.push(configure_mount(
+            Path::new("inputs").join(input_name).to_str().unwrap(),
+            Path::new("/")
+                .join(inputs_dir)
+                .join(input_name)
+                .to_str()
+                .unwrap(),
+            "none",
+            &[
+                "rbind", "ro",
+                // "nosuid" is required, otherwise mounting will just fail with
+                // a generic permission error.
+                // See https://github.com/wllenyj/containerd/commit/42a386c8164bef16d59590c61ab00806f854d8fd
+                "nosuid", "nodev",
+            ],
+        )?);
+    }
+
+    // Process ro_host_mounts
+    for (src, dst) in ro_host_mounts.into_iter() {
+        mounts.push(configure_mount(src, dst, "none", &["rbind", "ro"])?);
+    }
+
+    // In case network is enabled, also mount in /etc/{resolv.conf,services,hosts}
+    if allow_network {
+        for p in ["/etc/resolv.conf", "/etc/services", "/etc/hosts"] {
+            mounts.push(configure_mount(p, p, "none", &["rbind", "ro"])?);
+        }
+    }
+
+    Ok(mounts)
+}
+
+/// Helper function to produce a mount.
+fn configure_mount(
+    source: &str,
+    destination: &str,
+    typ: &str,
+    options: &[&str],
+) -> Result<oci_spec::runtime::Mount, oci_spec::OciSpecError> {
+    oci_spec::runtime::MountBuilder::default()
+        .destination(destination.to_string())
+        .typ(typ.to_string())
+        .source(source.to_string())
+        .options(options.iter().map(|e| e.to_string()).collect::<Vec<_>>())
+        .build()
+}