author     edef <edef@edef.eu>    2024-03-02T21:52+0000
committer  edef <edef@edef.eu>    2024-03-03T16:45+0000
commit     6bdaebcb55eef5663f93dbbc8de6a48b459a10c0
tree       8388aaf4fa4cec1f4e3ee56b9e36174443d728e3 /tvix/tools/weave
parent     ca97e5f4858b11ba198077734913a697c4fe9ce7
fix(tvix/tools/weave): handle sliced arrays correctly r/7644
The start/end offsets are not necessarily coterminous with the underlying
values array, so even if the stride is fixed, we still need to slice the
chunks down to match the start/end offsets.

This bug shouldn't affect the correctness of any existing code, since
we're always working with unsliced arrays read directly from Parquet.
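
For illustration, a minimal std-only Rust sketch (names and data are
hypothetical, not part of the weave code) of why the first and last offsets
matter once an array has been sliced: the values buffer is unchanged, but the
offsets no longer start at 0 or end at values.len(), so the fixed-size
chunking has to run over values[first..last] only.

    fn main() {
        const N: usize = 4;
        // Dense 4-byte strings: "aaaa", "bbbb", "cccc", "dddd".
        let values: &[u8] = b"aaaabbbbccccdddd";
        // Offsets of a sliced view that keeps only the middle two strings.
        let offsets: &[usize] = &[4, 8, 12];

        // Fixed-stride check, analogous in spirit to assert_fixed_dense.
        assert!(offsets.windows(2).all(|w| w[1] - w[0] == N));

        // Cutting the buffer to [first_offset, last_offset) is what makes
        // the exact-chunks split line up with the logical elements.
        let range = offsets[0]..*offsets.last().unwrap();
        let chunks: Vec<&[u8]> = values[range].chunks_exact(N).collect();
        assert_eq!(chunks, [&b"bbbb"[..], &b"cccc"[..]]);
    }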

Change-Id: I2f7ddc4e66d4d3b2317a44bd436a35bff36bac79
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11081
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'tvix/tools/weave')
-rw-r--r--  tvix/tools/weave/src/lib.rs  |  20
1 file changed, 12 insertions, 8 deletions
diff --git a/tvix/tools/weave/src/lib.rs b/tvix/tools/weave/src/lib.rs
index 12a86f9fb002..bc2221bf5c9b 100644
--- a/tvix/tools/weave/src/lib.rs
+++ b/tvix/tools/weave/src/lib.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 use rayon::prelude::*;
-use std::{fs::File, slice};
+use std::{fs::File, ops::Range, slice};
 
 use polars::{
     datatypes::BinaryChunked,
@@ -49,8 +49,8 @@ pub fn as_fixed_binary<const N: usize>(
     chunked: &BinaryChunked,
 ) -> impl Iterator<Item = &[[u8; N]]> + DoubleEndedIterator {
     chunked.downcast_iter().map(|array| {
-        assert_fixed_dense::<N>(array);
-        exact_chunks(array.values()).unwrap()
+        let range = assert_fixed_dense::<N>(array);
+        exact_chunks(&array.values()[range]).unwrap()
     })
 }
 
@@ -61,20 +61,22 @@ fn into_fixed_binary_rechunk<const N: usize>(chunked: &BinaryChunked) -> FixedBy
     let mut iter = chunked.downcast_iter();
     let array = iter.next().unwrap();
 
-    assert_fixed_dense::<N>(array);
-    Bytes(array.values().clone()).map(|buf| exact_chunks(buf).unwrap())
+    let range = assert_fixed_dense::<N>(array);
+    Bytes(array.values().clone().sliced(range.start, range.len()))
+        .map(|buf| exact_chunks(buf).unwrap())
 }
 
 /// Ensures that the supplied Arrow array consists of densely packed bytestrings of length `N`.
 /// In other words, ensure that it is free of nulls, and that the offsets have a fixed stride of `N`.
-fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) {
+#[must_use = "only the range returned is guaranteed to be conformant"]
+fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) -> Range<usize> {
     let null_count = array.validity().map_or(0, |bits| bits.unset_bits());
     if null_count > 0 {
         panic!("null values present");
     }
 
-    let length_check = array
-        .offsets()
+    let offsets = array.offsets();
+    let length_check = offsets
         .as_slice()
         .par_windows(2)
         .all(|w| (w[1] - w[0]) == N as i64);
@@ -82,6 +84,8 @@ fn assert_fixed_dense<const N: usize>(array: &BinaryArray<i64>) {
     if !length_check {
         panic!("lengths are inconsistent");
     }
+
+    (*offsets.first() as usize)..(*offsets.last() as usize)
 }
 
 fn exact_chunks<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
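
The body of exact_chunks is outside this diff; for context, a hedged sketch of
one way a helper with that signature could be written (the name below is
illustrative, and this is not necessarily what weave does): reinterpret the
byte slice as fixed-size chunks only when its length divides evenly by K.

    fn exact_chunks_sketch<const K: usize>(buf: &[u8]) -> Option<&[[u8; K]]> {
        if K == 0 || buf.len() % K != 0 {
            return None;
        }
        // SAFETY: [u8; K] is layout-compatible with K consecutive u8 values,
        // has alignment 1, and the length check above covers the whole buffer.
        Some(unsafe { std::slice::from_raw_parts(buf.as_ptr().cast(), buf.len() / K) })
    }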