From 06d2536eec88bfcfd2388e3ca153ba99815b7e97 Mon Sep 17 00:00:00 2001 From: edef Date: Thu, 17 Oct 2024 13:26:01 +0000 Subject: feat(users/edef/weave): ingest roots in Parquet format Parsing of store-paths.xz is now handled by //users/edef/fetchroots. Change-Id: I78be5aada0c0a321ed79d80c9b615e5f997ac3e0 Reviewed-on: https://cl.tvl.fyi/c/depot/+/12670 Tested-by: BuildkiteCI Reviewed-by: flokli --- users/edef/weave/src/main.rs | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'users/edef/weave/src/main.rs') diff --git a/users/edef/weave/src/main.rs b/users/edef/weave/src/main.rs index 243add9047a5..c86725003275 100644 --- a/users/edef/weave/src/main.rs +++ b/users/edef/weave/src/main.rs @@ -1,4 +1,4 @@ -//! Weave resolves a list of roots from `nixpkgs.roots` against `narinfo.parquet`, +//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`, //! and then uses the reference graph from the accompanying `narinfo-references.parquet` //! produced by `swizzle` to collect the closure of the roots. //! @@ -7,11 +7,10 @@ use anyhow::Result; use hashbrown::{hash_table, HashTable}; -use nix_compat::nixbase32; use rayon::prelude::*; use std::{ collections::{BTreeMap, HashSet}, - fs::{self, File}, + fs::File, ops::Index, sync::atomic::{AtomicU32, Ordering}, }; @@ -19,22 +18,24 @@ use std::{ use polars::{ datatypes::StaticArray, export::arrow::{array::UInt32Array, offset::OffsetsBuffer}, + lazy::dsl::col, prelude::*, }; -use weave::{hash64, DONE, INDEX_NULL}; +use weave::{as_fixed_binary, hash64, DONE, INDEX_NULL}; fn main() -> Result<()> { eprint!("… parse roots\r"); - let roots: PathSet32 = { - let mut roots = Vec::new(); - fs::read("nixpkgs.roots")? - .par_chunks_exact(32 + 1) - .map(|e| nixbase32::decode_fixed::<20>(&e[0..32]).unwrap()) - .collect_into_vec(&mut roots); - - roots.iter().collect() - }; + let roots: PathSet32 = as_fixed_binary::<20>( + LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())? + .explode([col("store_path_hash")]) + .select([col("store_path_hash")]) + .collect()? + .column("store_path_hash")? + .binary()?, + ) + .flatten() + .collect(); eprintln!("{DONE}"); { @@ -182,6 +183,7 @@ impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 { this.insert(item); } + this.table.shrink_to_fit(|(x, _)| hash64(x)); this } } -- cgit 1.4.1