about summary refs log tree commit diff
path: root/tvix/tools/crunch-v2/src/bin/extract.rs
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2024-01-17T16·04+0000
committeredef <edef@edef.eu>2024-01-27T18·23+0000
commit4f22203a3aecd070881ae9b4eabc47532d948f01 (patch)
tree39e493d8d4fcf6362dd4e25be8bcb64fe0b0e25e /tvix/tools/crunch-v2/src/bin/extract.rs
parente0a1c03b2471271fc96690b3dc5dd5423f93fa42 (diff)
feat(tvix/tools/crunch-v2): init r/7452
This is a tool for ingesting subsets of cache.nixos.org into its own flattened castore format.
Currently, produced chunks are not preserved, and this purely serves as a way of measuring
compression/deduplication ratios for various chunking and compression parameters.

Change-Id: I3983af02a66f7837d76874ee0fc8b2fab62ac17e
Reviewed-on: https://cl.tvl.fyi/c/depot/+/10486
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/tools/crunch-v2/src/bin/extract.rs')
-rw-r--r--tvix/tools/crunch-v2/src/bin/extract.rs139
1 files changed, 139 insertions, 0 deletions
diff --git a/tvix/tools/crunch-v2/src/bin/extract.rs b/tvix/tools/crunch-v2/src/bin/extract.rs
new file mode 100644
index 000000000000..8da8df707a0e
--- /dev/null
+++ b/tvix/tools/crunch-v2/src/bin/extract.rs
@@ -0,0 +1,139 @@
+//! This tool lossily converts a Sled database produced by crunch-v2 into a Parquet file for analysis.
+//! The resulting `crunch.parquet` has columns file_hash`, `nar_hash`, and `chunk`.
+//! The first two are SHA-256 hashes of the compressed file and the NAR it decompresses to.
+//! `chunk` is a struct array corresponding to [crunch_v2::proto::Chunk] messages.
+//! They are concatenated without any additional structure, so nothing but the chunk list is preserved.
+
+use anyhow::Result;
+use indicatif::{ProgressBar, ProgressStyle};
+use std::fs::File;
+
+use crunch_v2::{
+    proto::{self, path::Node},
+    FILES,
+};
+use prost::Message;
+
+use polars::{
+    chunked_array::builder::AnonymousOwnedListBuilder,
+    prelude::{
+        df, BinaryChunkedBuilder, ChunkedBuilder, DataFrame, DataType, Field, ListBuilderTrait,
+        NamedFrom, ParquetWriter, PrimitiveChunkedBuilder, Series, UInt32Type,
+    },
+    series::IntoSeries,
+};
+
+fn main() -> Result<()> {
+    let w = ParquetWriter::new(File::create("crunch.parquet")?);
+
+    let progress = ProgressBar::new(FILES.len() as u64).with_style(ProgressStyle::with_template(
+        "{elapsed_precise}/{duration_precise} {wide_bar} {pos}/{len}",
+    )?);
+
+    let mut frame = FrameBuilder::new();
+    for entry in &*FILES {
+        let (file_hash, pb) = entry?;
+        frame.push(
+            file_hash[..].try_into().unwrap(),
+            proto::Path::decode(&pb[..])?,
+        );
+        progress.inc(1);
+    }
+
+    w.finish(&mut frame.finish())?;
+
+    Ok(())
+}
+
+struct FrameBuilder {
+    file_hash: BinaryChunkedBuilder,
+    nar_hash: BinaryChunkedBuilder,
+    chunk: AnonymousOwnedListBuilder,
+}
+
+impl FrameBuilder {
+    fn new() -> Self {
+        Self {
+            file_hash: BinaryChunkedBuilder::new("file_hash", 0, 0),
+            nar_hash: BinaryChunkedBuilder::new("nar_hash", 0, 0),
+            chunk: AnonymousOwnedListBuilder::new(
+                "chunk",
+                0,
+                Some(DataType::Struct(vec![
+                    Field::new("hash", DataType::Binary),
+                    Field::new("size", DataType::UInt32),
+                    Field::new("size_compressed", DataType::UInt32),
+                ])),
+            ),
+        }
+    }
+
+    fn push(&mut self, file_hash: [u8; 32], pb: proto::Path) {
+        self.file_hash.append_value(&file_hash[..]);
+        self.nar_hash.append_value(pb.nar_hash);
+        self.chunk
+            .append_series(&ChunkFrameBuilder::new(pb.node.unwrap()))
+            .unwrap();
+    }
+
+    fn finish(mut self) -> DataFrame {
+        df! {
+            "file_hash" => self.file_hash.finish().into_series(),
+            "nar_hash" => self.nar_hash.finish().into_series(),
+            "chunk" => self.chunk.finish().into_series()
+        }
+        .unwrap()
+    }
+}
+
+struct ChunkFrameBuilder {
+    hash: BinaryChunkedBuilder,
+    size: PrimitiveChunkedBuilder<UInt32Type>,
+    size_compressed: PrimitiveChunkedBuilder<UInt32Type>,
+}
+
+impl ChunkFrameBuilder {
+    fn new(node: proto::path::Node) -> Series {
+        let mut this = Self {
+            hash: BinaryChunkedBuilder::new("hash", 0, 0),
+            size: PrimitiveChunkedBuilder::new("size", 0),
+            size_compressed: PrimitiveChunkedBuilder::new("size_compressed", 0),
+        };
+
+        this.push(node);
+        this.finish()
+    }
+
+    fn push(&mut self, node: Node) {
+        match node {
+            Node::Directory(node) => {
+                for node in node.files {
+                    self.push(Node::File(node));
+                }
+
+                for node in node.directories {
+                    self.push(Node::Directory(node));
+                }
+            }
+            Node::File(node) => {
+                for chunk in node.chunks {
+                    self.hash.append_value(&chunk.hash);
+                    self.size.append_value(chunk.size);
+                    self.size_compressed.append_value(chunk.size_compressed);
+                }
+            }
+            Node::Symlink(_) => {}
+        }
+    }
+
+    fn finish(self) -> Series {
+        df! {
+            "hash" => self.hash.finish().into_series(),
+            "size" => self.size.finish().into_series(),
+            "size_compressed" => self.size_compressed.finish().into_series()
+        }
+        .unwrap()
+        .into_struct("chunk")
+        .into_series()
+    }
+}