From ff5b8ab62618d6c4ae7a8cbf60eceec25a53ce3e Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Wed, 31 Dec 2025 21:28:00 +0200 Subject: [PATCH 1/2] perf: improve calculating length performance for `GenericByteArray` in row conversion --- arrow-row/src/lib.rs | 51 ++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 3ffa71e98c30..2b9969e29390 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow_array::cast::*; -use arrow_array::types::ArrowDictionaryKeyType; +use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType}; use arrow_array::*; use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -1498,31 +1498,15 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { array => tracker.push_fixed(fixed::encoded_len(array)), DataType::Null => {}, DataType::Boolean => tracker.push_fixed(bool::ENCODED_LEN), - DataType::Binary => tracker.push_variable( - as_generic_binary_array::(array) - .iter() - .map(|slice| variable::encoded_len(slice)) - ), - DataType::LargeBinary => tracker.push_variable( - as_generic_binary_array::(array) - .iter() - .map(|slice| variable::encoded_len(slice)) - ), + DataType::Binary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::(array)), + DataType::LargeBinary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::(array)), DataType::BinaryView => tracker.push_variable( array.as_binary_view() .iter() .map(|slice| variable::encoded_len(slice)) ), - DataType::Utf8 => tracker.push_variable( - array.as_string::() - .iter() - .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes()))) - ), - DataType::LargeUtf8 => tracker.push_variable( - array.as_string::() - .iter() - .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes()))) - ), + DataType::Utf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::()), + DataType::LargeUtf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::()), DataType::Utf8View => tracker.push_variable( array.as_string_view() .iter() @@ -1617,6 +1601,31 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { tracker } +/// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteArray`] +fn push_generic_byte_array_lengths( + tracker: &mut LengthTracker, + array: &GenericByteArray, +) { + if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) { + tracker.push_variable( + array + .offsets() + .lengths() + .zip(nulls.iter()) + .map(|(length, is_valid)| if is_valid { Some(length) } else { None }) + .map(variable::padded_length), + ) + } else { + tracker.push_variable( + array + .offsets() + .lengths() + .map(Some) + .map(variable::padded_length), + ) + } +} + /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses fn encode_column( data: &mut [u8], From c678b9975b3fcc9665799426a6f6b3bd7731eaac Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:23:28 +0200 Subject: [PATCH 2/2] update based on cr comments --- arrow-row/src/lib.rs | 3 +-- arrow-row/src/variable.rs | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index f96ee5bf00d7..361b5544149a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -1659,8 +1659,7 @@ fn push_generic_byte_array_lengths( array .offsets() .lengths() - .map(Some) - .map(variable::padded_length), + .map(variable::non_null_padded_length), ) } } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 73e19b197f92..1c1df00528a3 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -55,11 +55,20 @@ pub fn encoded_len(a: Option<&[u8]>) -> usize { #[inline] pub fn padded_length(a: Option) -> usize { match a { - Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1), + Some(a) => non_null_padded_length(a), + None => 1, + } +} + +/// Returns the padded length of the encoded length of the given length +#[inline] +pub(crate) fn non_null_padded_length(len: usize) -> usize { + if len <= BLOCK_SIZE { + 1 + ceil(len, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1) + } else { // Each miniblock ends with a 1 byte continuation, therefore add // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size - Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), - None => 1, + MINI_BLOCK_COUNT + ceil(len, BLOCK_SIZE) * (BLOCK_SIZE + 1) } }