about summary refs log tree commit diff
diff options
context:
space:
mode:
author edef <edef@edef.eu> 2024-10-17T13·26+0000
committer edef <edef@edef.eu> 2024-10-19T17·01+0000
commit 06d2536eec88bfcfd2388e3ca153ba99815b7e97 (patch)
tree c0291acbe891415b2e025ea956c20c3f6832d37c
parent 313899c291f0295506c275418e570b39b4a5f079 (diff)
feat(users/edef/weave): ingest roots in Parquet format r/8845
Parsing of store-paths.xz is now handled by //users/edef/fetchroots.

Change-Id: I78be5aada0c0a321ed79d80c9b615e5f997ac3e0
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12670
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
-rw-r--r-- users/edef/weave/src/main.rs 28
1 files changed, 15 insertions, 13 deletions
diff --git a/users/edef/weave/src/main.rs b/users/edef/weave/src/main.rs
index 243add9047a5..c86725003275 100644
--- a/users/edef/weave/src/main.rs
+++ b/users/edef/weave/src/main.rs
@@ -1,4 +1,4 @@
-//! Weave resolves a list of roots from `nixpkgs.roots` against `narinfo.parquet`,
+//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
 //! and then uses the reference graph from the accompanying `narinfo-references.parquet`
 //! produced by `swizzle` to collect the closure of the roots.
 //!
@@ -7,11 +7,10 @@
 
 use anyhow::Result;
 use hashbrown::{hash_table, HashTable};
-use nix_compat::nixbase32;
 use rayon::prelude::*;
 use std::{
     collections::{BTreeMap, HashSet},
-    fs::{self, File},
+    fs::File,
     ops::Index,
     sync::atomic::{AtomicU32, Ordering},
 };
@@ -19,22 +18,24 @@ use std::{
 use polars::{
     datatypes::StaticArray,
     export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
+    lazy::dsl::col,
     prelude::*,
 };
 
-use weave::{hash64, DONE, INDEX_NULL};
+use weave::{as_fixed_binary, hash64, DONE, INDEX_NULL};
 
 fn main() -> Result<()> {
     eprint!("… parse roots\r");
-    let roots: PathSet32 = {
-        let mut roots = Vec::new();
-        fs::read("nixpkgs.roots")?
-            .par_chunks_exact(32 + 1)
-            .map(|e| nixbase32::decode_fixed::<20>(&e[0..32]).unwrap())
-            .collect_into_vec(&mut roots);
-
-        roots.iter().collect()
-    };
+    let roots: PathSet32 = as_fixed_binary::<20>(
+        LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
+            .explode([col("store_path_hash")])
+            .select([col("store_path_hash")])
+            .collect()?
+            .column("store_path_hash")?
+            .binary()?,
+    )
+    .flatten()
+    .collect();
     eprintln!("{DONE}");
 
     {
@@ -182,6 +183,7 @@ impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
             this.insert(item);
         }
 
+        this.table.shrink_to_fit(|(x, _)| hash64(x));
         this
     }
 }