about summary refs log tree commit diff
path: root/tvix/castore/src/proto/mod.rs
#![allow(non_snake_case)]
// https://github.com/hyperium/tonic/issues/1056
use bstr::ByteSlice;
use std::{collections::HashSet, iter::Peekable, str};

use prost::Message;

mod grpc_blobservice_wrapper;
mod grpc_directoryservice_wrapper;

pub use grpc_blobservice_wrapper::GRPCBlobServiceWrapper;
pub use grpc_directoryservice_wrapper::GRPCDirectoryServiceWrapper;

use crate::{B3Digest, B3_LEN};

tonic::include_proto!("tvix.castore.v1");

#[cfg(feature = "tonic-reflection")]
/// Compiled file descriptors for implementing [gRPC
/// reflection](https://github.com/grpc/grpc/blob/master/doc/server-reflection.md) with e.g.
/// [`tonic_reflection`](https://docs.rs/tonic-reflection).
pub const FILE_DESCRIPTOR_SET: &[u8] = tonic::include_file_descriptor_set!("tvix.castore.v1");

#[cfg(test)]
mod tests;

/// Errors that can occur during the validation of [Directory] messages.
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ValidateDirectoryError {
    /// Elements are not in sorted order
    #[error("{:?} is not sorted", .0.as_bstr())]
    WrongSorting(Vec<u8>),
    /// Multiple elements with the same name encountered
    #[error("{:?} is a duplicate name", .0.as_bstr())]
    DuplicateName(Vec<u8>),
    /// Invalid node
    #[error("invalid node with name {:?}: {:?}", .0.as_bstr(), .1.to_string())]
    InvalidNode(Vec<u8>, ValidateNodeError),
    #[error("Total size exceeds u32::MAX")]
    SizeOverflow,
}

/// Errors that occur during Node validation
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ValidateNodeError {
    #[error("No node set")]
    NoNodeSet,
    /// Invalid digest length encountered
    #[error("Invalid Digest length: {0}")]
    InvalidDigestLen(usize),
    /// Invalid name encountered
    #[error("Invalid name: {}", .0.as_bstr())]
    InvalidName(Vec<u8>),
    /// Invalid symlink target
    #[error("Invalid symlink target: {}", .0.as_bstr())]
    InvalidSymlinkTarget(Vec<u8>),
}

/// Errors that occur during StatBlobResponse validation
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ValidateStatBlobResponseError {
    /// Invalid digest length encountered
    #[error("Invalid digest length {0} for chunk #{1}")]
    InvalidDigestLen(usize, usize),
}

/// Checks a Node name for validity as an intermediate node.
/// We disallow slashes, null bytes, '.', '..' and the empty string.
pub(crate) fn validate_node_name(name: &[u8]) -> Result<(), ValidateNodeError> {
    if name.is_empty()
        || name == b".."
        || name == b"."
        || name.contains(&0x00)
        || name.contains(&b'/')
    {
        Err(ValidateNodeError::InvalidName(name.to_owned()))
    } else {
        Ok(())
    }
}

/// NamedNode is implemented for [FileNode], [DirectoryNode] and [SymlinkNode]
/// and [node::Node], so we can ask all of them for the name easily.
pub trait NamedNode {
    fn get_name(&self) -> &[u8];
}

impl NamedNode for &FileNode {
    fn get_name(&self) -> &[u8] {
        &self.name
    }
}

impl NamedNode for &DirectoryNode {
    fn get_name(&self) -> &[u8] {
        &self.name
    }
}

impl NamedNode for &SymlinkNode {
    fn get_name(&self) -> &[u8] {
        &self.name
    }
}

impl NamedNode for node::Node {
    fn get_name(&self) -> &[u8] {
        match self {
            node::Node::File(node_file) => &node_file.name,
            node::Node::Directory(node_directory) => &node_directory.name,
            node::Node::Symlink(node_symlink) => &node_symlink.name,
        }
    }
}

impl Node {
    /// Ensures the node has a valid enum kind (is Some), and passes its
    // per-enum validation.
    pub fn validate(&self) -> Result<(), ValidateNodeError> {
        if let Some(node) = self.node.as_ref() {
            node.validate()
        } else {
            Err(ValidateNodeError::NoNodeSet)
        }
    }
}

impl node::Node {
    /// Returns the node with a new name.
    pub fn rename(self, name: bytes::Bytes) -> Self {
        match self {
            node::Node::Directory(n) => node::Node::Directory(DirectoryNode { name, ..n }),
            node::Node::File(n) => node::Node::File(FileNode { name, ..n }),
            node::Node::Symlink(n) => node::Node::Symlink(SymlinkNode { name, ..n }),
        }
    }

    /// Ensures the node has a valid name, and checks the type-specific fields too.
    pub fn validate(&self) -> Result<(), ValidateNodeError> {
        match self {
            // for a directory root node, ensure the digest has the appropriate size.
            node::Node::Directory(directory_node) => {
                if directory_node.digest.len() != B3_LEN {
                    Err(ValidateNodeError::InvalidDigestLen(
                        directory_node.digest.len(),
                    ))?;
                }
                validate_node_name(&directory_node.name)
            }
            // for a file root node, ensure the digest has the appropriate size.
            node::Node::File(file_node) => {
                if file_node.digest.len() != B3_LEN {
                    Err(ValidateNodeError::InvalidDigestLen(file_node.digest.len()))?;
                }
                validate_node_name(&file_node.name)
            }
            // ensure the symlink target is not empty and doesn't contain null bytes.
            node::Node::Symlink(symlink_node) => {
                if symlink_node.target.is_empty() || symlink_node.target.contains(&b'\0') {
                    Err(ValidateNodeError::InvalidSymlinkTarget(
                        symlink_node.target.to_vec(),
                    ))?;
                }
                validate_node_name(&symlink_node.name)
            }
        }
    }
}

impl PartialOrd for node::Node {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for node::Node {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.get_name().cmp(other.get_name())
    }
}

impl PartialOrd for FileNode {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for FileNode {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.get_name().cmp(other.get_name())
    }
}

impl PartialOrd for SymlinkNode {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for SymlinkNode {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.get_name().cmp(other.get_name())
    }
}

impl PartialOrd for DirectoryNode {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for DirectoryNode {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.get_name().cmp(other.get_name())
    }
}

/// Accepts a name, and a mutable reference to the previous name.
/// If the passed name is larger than the previous one, the reference is updated.
/// If it's not, an error is returned.
fn update_if_lt_prev<'n>(
    prev_name: &mut &'n [u8],
    name: &'n [u8],
) -> Result<(), ValidateDirectoryError> {
    if *name < **prev_name {
        return Err(ValidateDirectoryError::WrongSorting(name.to_vec()));
    }
    *prev_name = name;
    Ok(())
}

/// Inserts the given name into a HashSet if it's not already in there.
/// If it is, an error is returned.
fn insert_once<'n>(
    seen_names: &mut HashSet<&'n [u8]>,
    name: &'n [u8],
) -> Result<(), ValidateDirectoryError> {
    if seen_names.get(name).is_some() {
        return Err(ValidateDirectoryError::DuplicateName(name.to_vec()));
    }
    seen_names.insert(name);
    Ok(())
}

fn checked_sum(iter: impl IntoIterator<Item = u64>) -> Option<u64> {
    iter.into_iter().try_fold(0u64, |acc, i| acc.checked_add(i))
}

impl Directory {
    /// The size of a directory is the number of all regular and symlink elements,
    /// the number of directory elements, and their size fields.
    pub fn size(&self) -> u64 {
        if cfg!(debug_assertions) {
            self.size_checked()
                .expect("Directory::size exceeds u64::MAX")
        } else {
            self.size_checked().unwrap_or(u64::MAX)
        }
    }

    fn size_checked(&self) -> Option<u64> {
        checked_sum([
            self.files.len().try_into().ok()?,
            self.symlinks.len().try_into().ok()?,
            self.directories.len().try_into().ok()?,
            checked_sum(self.directories.iter().map(|e| e.size))?,
        ])
    }

    /// Calculates the digest of a Directory, which is the blake3 hash of a
    /// Directory protobuf message, serialized in protobuf canonical form.
    pub fn digest(&self) -> B3Digest {
        let mut hasher = blake3::Hasher::new();

        hasher
            .update(&self.encode_to_vec())
            .finalize()
            .as_bytes()
            .into()
    }

    /// validate checks the directory for invalid data, such as:
    /// - violations of name restrictions
    /// - invalid digest lengths
    /// - not properly sorted lists
    /// - duplicate names in the three lists
    pub fn validate(&self) -> Result<(), ValidateDirectoryError> {
        let mut seen_names: HashSet<&[u8]> = HashSet::new();

        let mut last_directory_name: &[u8] = b"";
        let mut last_file_name: &[u8] = b"";
        let mut last_symlink_name: &[u8] = b"";

        // check directories
        for directory_node in &self.directories {
            node::Node::Directory(directory_node.clone())
                .validate()
                .map_err(|e| {
                    ValidateDirectoryError::InvalidNode(directory_node.name.to_vec(), e)
                })?;

            update_if_lt_prev(&mut last_directory_name, &directory_node.name)?;
            insert_once(&mut seen_names, &directory_node.name)?;
        }

        // check files
        for file_node in &self.files {
            node::Node::File(file_node.clone())
                .validate()
                .map_err(|e| ValidateDirectoryError::InvalidNode(file_node.name.to_vec(), e))?;

            update_if_lt_prev(&mut last_file_name, &file_node.name)?;
            insert_once(&mut seen_names, &file_node.name)?;
        }

        // check symlinks
        for symlink_node in &self.symlinks {
            node::Node::Symlink(symlink_node.clone())
                .validate()
                .map_err(|e| ValidateDirectoryError::InvalidNode(symlink_node.name.to_vec(), e))?;

            update_if_lt_prev(&mut last_symlink_name, &symlink_node.name)?;
            insert_once(&mut seen_names, &symlink_node.name)?;
        }

        self.size_checked()
            .ok_or(ValidateDirectoryError::SizeOverflow)?;

        Ok(())
    }

    /// Allows iterating over all three nodes ([DirectoryNode], [FileNode],
    /// [SymlinkNode]) in an ordered fashion, as long as the individual lists
    /// are sorted (which can be checked by the [Directory::validate]).
    pub fn nodes(&self) -> DirectoryNodesIterator {
        return DirectoryNodesIterator {
            i_directories: self.directories.iter().peekable(),
            i_files: self.files.iter().peekable(),
            i_symlinks: self.symlinks.iter().peekable(),
        };
    }

    /// Adds the specified [node::Node] to the [Directory], preserving sorted entries.
    /// This assumes the [Directory] to be sorted prior to adding the node.
    ///
    /// Inserting an element that already exists with the same name in the directory is not
    /// supported.
    pub fn add(&mut self, node: node::Node) {
        debug_assert!(
            !self.files.iter().any(|x| x.get_name() == node.get_name()),
            "name already exists in files"
        );
        debug_assert!(
            !self
                .directories
                .iter()
                .any(|x| x.get_name() == node.get_name()),
            "name already exists in directories"
        );
        debug_assert!(
            !self
                .symlinks
                .iter()
                .any(|x| x.get_name() == node.get_name()),
            "name already exists in symlinks"
        );

        match node {
            node::Node::File(node) => {
                let pos = self
                    .files
                    .binary_search(&node)
                    .expect_err("Tvix bug: dir entry with name already exists");
                self.files.insert(pos, node);
            }
            node::Node::Directory(node) => {
                let pos = self
                    .directories
                    .binary_search(&node)
                    .expect_err("Tvix bug: dir entry with name already exists");
                self.directories.insert(pos, node);
            }
            node::Node::Symlink(node) => {
                let pos = self
                    .symlinks
                    .binary_search(&node)
                    .expect_err("Tvix bug: dir entry with name already exists");
                self.symlinks.insert(pos, node);
            }
        }
    }
}

impl StatBlobResponse {
    /// Validates a StatBlobResponse. All chunks must have valid blake3 digests.
    /// It is allowed to send an empty list, if no more granular chunking is
    /// available.
    pub fn validate(&self) -> Result<(), ValidateStatBlobResponseError> {
        for (i, chunk) in self.chunks.iter().enumerate() {
            if chunk.digest.len() != blake3::KEY_LEN {
                return Err(ValidateStatBlobResponseError::InvalidDigestLen(
                    chunk.digest.len(),
                    i,
                ));
            }
        }
        Ok(())
    }
}

/// Struct to hold the state of an iterator over all nodes of a Directory.
///
/// Internally, this keeps peekable Iterators over all three lists of a
/// Directory message.
pub struct DirectoryNodesIterator<'a> {
    // directory: &Directory,
    i_directories: Peekable<std::slice::Iter<'a, DirectoryNode>>,
    i_files: Peekable<std::slice::Iter<'a, FileNode>>,
    i_symlinks: Peekable<std::slice::Iter<'a, SymlinkNode>>,
}

/// looks at two elements implementing NamedNode, and returns true if "left
/// is smaller / comes first".
///
/// Some(_) is preferred over None.
fn left_name_lt_right<A: NamedNode, B: NamedNode>(left: Option<&A>, right: Option<&B>) -> bool {
    match left {
        // if left is None, right always wins
        None => false,
        Some(left_inner) => {
            // left is Some.
            match right {
                // left is Some, right is None - left wins.
                None => true,
                Some(right_inner) => {
                    // both are Some - compare the name.
                    return left_inner.get_name() < right_inner.get_name();
                }
            }
        }
    }
}

impl Iterator for DirectoryNodesIterator<'_> {
    type Item = node::Node;

    // next returns the next node in the Directory.
    // we peek at all three internal iterators, and pick the one with the
    // smallest name, to ensure lexicographical ordering.
    // The individual lists are already known to be sorted.
    fn next(&mut self) -> Option<Self::Item> {
        if left_name_lt_right(self.i_directories.peek(), self.i_files.peek()) {
            // i_directories is still in the game, compare with symlinks
            if left_name_lt_right(self.i_directories.peek(), self.i_symlinks.peek()) {
                self.i_directories
                    .next()
                    .cloned()
                    .map(node::Node::Directory)
            } else {
                self.i_symlinks.next().cloned().map(node::Node::Symlink)
            }
        } else {
            // i_files is still in the game, compare with symlinks
            if left_name_lt_right(self.i_files.peek(), self.i_symlinks.peek()) {
                self.i_files.next().cloned().map(node::Node::File)
            } else {
                self.i_symlinks.next().cloned().map(node::Node::Symlink)
            }
        }
    }
}