From 14849829fd4a2906d5cd2f723226b02801897983 Mon Sep 17 00:00:00 2001 From: edef Date: Mon, 6 Nov 2023 20:04:17 +0000 Subject: feat(third_party/overlays): support LargeListArray in Clickhouse Link: https://github.com/ClickHouse/ClickHouse/pull/56118 Change-Id: I41339ce662b8a169746237eb1d0aad34453bc0a8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/9986 Tested-by: BuildkiteCI Reviewed-by: flokli --- ...ouse-support-reading-arrow-LargeListArray.patch | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch (limited to 'third_party/overlays/patches') diff --git a/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch b/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch new file mode 100644 index 000000000000..59231dbbc011 --- /dev/null +++ b/third_party/overlays/patches/clickhouse-support-reading-arrow-LargeListArray.patch @@ -0,0 +1,107 @@ +From 26e65e4addc990cc09b59b587792ac4a454e5cdd Mon Sep 17 00:00:00 2001 +From: edef +Date: Mon, 30 Oct 2023 08:08:10 +0000 +Subject: [PATCH] [backport] Support reading arrow::LargeListArray + +--- + .../Formats/Impl/ArrowColumnToCHColumn.cpp | 35 ++++++++++++++----- + 1 file changed, 26 insertions(+), 9 deletions(-) + +diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +index 54a6c8493ea..94cf59fd357 100644 +--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp ++++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +@@ -336,7 +336,22 @@ static ColumnPtr readByteMapFromArrowColumn(std::shared_ptr<arrow::ChunkedArray> + return nullmap_column; + } + +-static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column) ++template<typename T> ++struct ArrowOffsetArray; ++ ++template<> ++struct ArrowOffsetArray<arrow::ListArray> ++{ ++ using type = arrow::Int32Array; ++}; ++ ++template<> ++struct ArrowOffsetArray<arrow::LargeListArray> ++{ ++ using type = 
arrow::Int64Array; ++}; ++ ++template<typename ArrowListArray> static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column) + { + auto offsets_column = ColumnUInt64::create(); + ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(*offsets_column).getData(); +@@ -346,9 +361,9 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr + + for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) + { +- arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i))); ++ ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i))); + auto arrow_offsets_array = list_chunk.offsets(); +- auto & arrow_offsets = dynamic_cast<arrow::Int32Array &>(*arrow_offsets_array); ++ auto & arrow_offsets = dynamic_cast<typename ArrowOffsetArray<ArrowListArray>::type &>(*arrow_offsets_array); + + /* + * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks. +@@ -498,13 +513,13 @@ static ColumnPtr readColumnWithIndexesData(std::shared_ptr<arrow::ChunkedArray> + } + } + +-static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column) ++template<typename ArrowListArray> static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr<arrow::ChunkedArray> & arrow_column) + { + arrow::ArrayVector array_vector; + array_vector.reserve(arrow_column->num_chunks()); + for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i) + { +- arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i))); ++ ArrowListArray & list_chunk = dynamic_cast<ArrowListArray &>(*(arrow_column->chunk(chunk_i))); + + /* + * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks. 
+@@ -636,12 +651,12 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( + if (map_type_hint) + nested_type_hint = assert_cast<const DataTypeArray *>(map_type_hint->getNestedType().get())->getNestedType(); + } +- auto arrow_nested_column = getNestedArrowColumn(arrow_column); ++ auto arrow_nested_column = getNestedArrowColumn<arrow::ListArray>(arrow_column); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); + if (skipped) + return {}; + +- auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); ++ auto offsets_column = readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column); + + const auto * tuple_column = assert_cast<const ColumnTuple *>(nested_column.column.get()); + const auto * tuple_type = assert_cast<const DataTypeTuple *>(nested_column.type.get()); +@@ -650,7 +665,9 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( + return {std::move(map_column), std::move(map_type), column_name}; + } + case arrow::Type::LIST: ++ case arrow::Type::LARGE_LIST: + { ++ bool is_large = arrow_column->type()->id() == arrow::Type::LARGE_LIST; + DataTypePtr nested_type_hint; + if (type_hint) + { +@@ -658,11 +675,11 @@ static ColumnWithTypeAndName readColumnFromArrowColumn( + if (array_type_hint) + nested_type_hint = array_type_hint->getNestedType(); + } +- auto arrow_nested_column = getNestedArrowColumn(arrow_column); ++ auto arrow_nested_column = is_large ? getNestedArrowColumn<arrow::LargeListArray>(arrow_column) : getNestedArrowColumn<arrow::ListArray>(arrow_column); + auto nested_column = readColumnFromArrowColumn(arrow_nested_column, column_name, format_name, false, dictionary_infos, allow_null_type, skip_columns_with_unsupported_types, skipped, nested_type_hint); + if (skipped) + return {}; +- auto offsets_column = readOffsetsFromArrowListColumn(arrow_column); ++ auto offsets_column = is_large ? 
readOffsetsFromArrowListColumn<arrow::LargeListArray>(arrow_column) : readOffsetsFromArrowListColumn<arrow::ListArray>(arrow_column); + auto array_column = ColumnArray::create(nested_column.column, offsets_column); + auto array_type = std::make_shared<DataTypeArray>(nested_column.type); + return {std::move(array_column), std::move(array_type), column_name}; +-- +2.42.0 + -- cgit 1.4.1