diff options
author | edef <edef@edef.eu> | 2024-10-17T13·26+0000 |
---|---|---|
committer | edef <edef@edef.eu> | 2024-10-19T17·01+0000 |
commit | 06d2536eec88bfcfd2388e3ca153ba99815b7e97 (patch) | |
tree | c0291acbe891415b2e025ea956c20c3f6832d37c | |
parent | 313899c291f0295506c275418e570b39b4a5f079 (diff) |
feat(users/edef/weave): ingest roots in Parquet format r/8845
Parsing of store-paths.xz is now handled by //users/edef/fetchroots. Change-Id: I78be5aada0c0a321ed79d80c9b615e5f997ac3e0 Reviewed-on: https://cl.tvl.fyi/c/depot/+/12670 Tested-by: BuildkiteCI Reviewed-by: flokli <flokli@flokli.de>
-rw-r--r-- | users/edef/weave/src/main.rs | 28 |
1 files changed, 15 insertions, 13 deletions
diff --git a/users/edef/weave/src/main.rs b/users/edef/weave/src/main.rs index 243add9047a5..c86725003275 100644 --- a/users/edef/weave/src/main.rs +++ b/users/edef/weave/src/main.rs @@ -1,4 +1,4 @@ -//! Weave resolves a list of roots from `nixpkgs.roots` against `narinfo.parquet`, +//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`, //! and then uses the reference graph from the accompanying `narinfo-references.parquet` //! produced by `swizzle` to collect the closure of the roots. //! @@ -7,11 +7,10 @@ use anyhow::Result; use hashbrown::{hash_table, HashTable}; -use nix_compat::nixbase32; use rayon::prelude::*; use std::{ collections::{BTreeMap, HashSet}, - fs::{self, File}, + fs::File, ops::Index, sync::atomic::{AtomicU32, Ordering}, }; @@ -19,22 +18,24 @@ use std::{ use polars::{ datatypes::StaticArray, export::arrow::{array::UInt32Array, offset::OffsetsBuffer}, + lazy::dsl::col, prelude::*, }; -use weave::{hash64, DONE, INDEX_NULL}; +use weave::{as_fixed_binary, hash64, DONE, INDEX_NULL}; fn main() -> Result<()> { eprint!("… parse roots\r"); - let roots: PathSet32 = { - let mut roots = Vec::new(); - fs::read("nixpkgs.roots")? - .par_chunks_exact(32 + 1) - .map(|e| nixbase32::decode_fixed::<20>(&e[0..32]).unwrap()) - .collect_into_vec(&mut roots); - - roots.iter().collect() - }; + let roots: PathSet32 = as_fixed_binary::<20>( + LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())? + .explode([col("store_path_hash")]) + .select([col("store_path_hash")]) + .collect()? + .column("store_path_hash")? + .binary()?, + ) + .flatten() + .collect(); eprintln!("{DONE}"); { @@ -182,6 +183,7 @@ impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 { this.insert(item); } + this.table.shrink_to_fit(|(x, _)| hash64(x)); this } } |