about summary refs log tree commit diff
path: root/third_party/overlays
diff options
context:
space:
mode:
authoredef <edef@edef.eu>2023-11-06T20·04+0000
committeredef <edef@edef.eu>2023-11-06T21·46+0000
commit14849829fd4a2906d5cd2f723226b02801897983 (patch)
tree9e1056487bae7e5cb5a74fe5c218d81f0b96039b /third_party/overlays
parentedea6dadddbc8d8484d5790dd3aed787630b8258 (diff)
feat(third_party/overlays): support LargeListArray in Clickhouse r/6970
Link: https://github.com/ClickHouse/ClickHouse/pull/56118
Change-Id: I41339ce662b8a169746237eb1d0aad34453bc0a8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9986
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Diffstat (limited to 'third_party/overlays')
-rw-r--r--third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch107
-rw-r--r--third_party/overlays/tvl.nix7
2 files changed, 114 insertions, 0 deletions
diff --git a/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch b/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch
new file mode 100644
index 0000000000..59231dbbc0
--- /dev/null
+++ b/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch
@@ -0,0 +1,107 @@
+From 26e65e4addc990cc09b59b587792ac4a454e5cdd Mon Sep 17 00:00:00 2001
+From: edef <edef@edef.eu>
+Date: Mon, 30 Oct 2023 08:08:10 +0000
+Subject: [PATCH] [backport] Support reading arrow::LargeListArray
+
+---
+ .../Formats/Impl/ArrowColumnToCHColumn.cpp    | 35 ++++++++++++++-----
+ 1 file changed, 26 insertions(+), 9 deletions(-)
+
+diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+index 54a6c8493ea..94cf59fd357 100644
+--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
++++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+@@ -336,7 +336,22 @@ static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray>
+     return nullmap_column;
+ }
+ 
+-static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
++template<typename T>
++struct ArrowOffsetArray;
++
++template<>
++struct ArrowOffsetArray<arrow::ListArray>
++{
++    using type = arrow::Int32Array;
++};
++
++template<>
++struct ArrowOffsetArray<arrow::LargeListArray>
++{
++    using type = arrow::Int64Array;
++};
++
++template<typename ArrowListArray> static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
+ {
+     auto offsets_column = ColumnUInt64::create();
+     ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(*offsets_column).getData();
+@@ -346,9 +361,9 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr
+ 
+     for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
+     {
+-        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
++        ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i)));
+         auto arrow_offsets_array = list_chunk.offsets();
+-        auto & arrow_offsets = dynamic_cast<arrow::Int32Array &>(*arrow_offsets_array);
++        auto & arrow_offsets = dynamic_cast<typename ArrowOffsetArray<ArrowListArray>::type &>(*arrow_offsets_array);
+ 
+         /*
+          * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks.
+@@ -498,13 +513,13 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray>
+     }
+ }
+ 
+-static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
++template<typename ArrowListArray> static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column)
+ {
+     arrow::ArrayVector array_vector;
+     array_vector.reserve(arrow_column->num_chunks());
+     for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
+     {
+-        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
++        ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i)));
+ 
+         /*
+          * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks.
+@@ -636,12 +651,12 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
+                 if (map_type_hint)
+                     nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType();
+             }
+-            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
++            auto arrow_nested_column = getNestedArrowColumn<arrow::ListArray>(arrow_column);
+             auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
+             if (skipped)
+                 return {};
+ 
+-            auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
++            auto offsets_column = readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column);
+ 
+             const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get());
+             const auto * tuple_type = assert_cast<const DataTypeTuple *>(nested_column.type.get());
+@@ -650,7 +665,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
+             return {std::move(map_column), std::move(map_type), column_name};
+         }
+         case arrow::Type::LIST:
++        case arrow::Type::LARGE_LIST:
+         {
++            bool is_large = arrow_column->type()->id() == arrow::Type::LARGE_LIST;
+             DataTypePtr nested_type_hint;
+             if (type_hint)
+             {
+@@ -658,11 +675,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn(
+                 if (array_type_hint)
+                     nested_type_hint = array_type_hint->getNestedType();
+             }
+-            auto arrow_nested_column = getNestedArrowColumn(arrow_column);
++            auto arrow_nested_column = is_large ? getNestedArrowColumn<arrow::LargeListArray>(arrow_column) : getNestedArrowColumn<arrow::ListArray>(arrow_column);
+             auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint);
+             if (skipped)
+                 return {};
+-            auto offsets_column = readOffsetsFromArrowListColumn(arrow_column);
++            auto offsets_column = is_large ? readOffsetsFromArrowListColumn<arrow::LargeListArray>(arrow_column) : readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column);
+             auto array_column = ColumnArray::create(nested_column.column, offsets_column);
+             auto array_type = std::make_shared<DataTypeArray>(nested_column.type);
+             return {std::move(array_column), std::move(array_type), column_name};
+-- 
+2.42.0
+
diff --git a/third_party/overlays/tvl.nix b/third_party/overlays/tvl.nix
index de4788a29b..600161b661 100644
--- a/third_party/overlays/tvl.nix
+++ b/third_party/overlays/tvl.nix
@@ -147,4 +147,11 @@ depot.nix.readTree.drvTargets {
       license = licenses.asl20;
     };
   };
+
+  clickhouse = super.clickhouse.overrideAttrs (old: {
+    patches = old.patches or [ ] ++ [
+      # https://github.com/ClickHouse/ClickHouse/pull/56118
+      ./patches/clickhouse-support-reading-arrow-LargeListArray.patch
+    ];
+  });
 }