diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 28c65c5994bf..361b5544149a 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -164,7 +164,7 @@ use std::hash::{Hash, Hasher}; use std::sync::Arc; use arrow_array::cast::*; -use arrow_array::types::{ArrowDictionaryKeyType, ByteViewType}; +use arrow_array::types::{ArrowDictionaryKeyType, ByteArrayType, ByteViewType}; use arrow_array::*; use arrow_buffer::{ArrowNativeType, Buffer, OffsetBuffer, ScalarBuffer}; use arrow_data::{ArrayData, ArrayDataBuilder}; @@ -1545,27 +1545,11 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { array => tracker.push_fixed(fixed::encoded_len(array)), DataType::Null => {}, DataType::Boolean => tracker.push_fixed(bool::ENCODED_LEN), - DataType::Binary => tracker.push_variable( - as_generic_binary_array::(array) - .iter() - .map(|slice| variable::encoded_len(slice)) - ), - DataType::LargeBinary => tracker.push_variable( - as_generic_binary_array::(array) - .iter() - .map(|slice| variable::encoded_len(slice)) - ), + DataType::Binary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::(array)), + DataType::LargeBinary => push_generic_byte_array_lengths(&mut tracker, as_generic_binary_array::(array)), DataType::BinaryView => push_byte_view_array_lengths(&mut tracker, array.as_binary_view()), - DataType::Utf8 => tracker.push_variable( - array.as_string::() - .iter() - .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes()))) - ), - DataType::LargeUtf8 => tracker.push_variable( - array.as_string::() - .iter() - .map(|slice| variable::encoded_len(slice.map(|x| x.as_bytes()))) - ), + DataType::Utf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::()), + DataType::LargeUtf8 => push_generic_byte_array_lengths(&mut tracker, array.as_string::()), DataType::Utf8View => push_byte_view_array_lengths(&mut tracker, array.as_string_view()), DataType::FixedSizeBinary(len) => { let len = len.to_usize().unwrap(); @@ -1656,6 +1640,30 @@ fn row_lengths(cols: &[ArrayRef], encoders: &[Encoder]) -> LengthTracker { tracker } +/// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteArray`] +fn push_generic_byte_array_lengths( + tracker: &mut LengthTracker, + array: &GenericByteArray, +) { + if let Some(nulls) = array.nulls().filter(|n| n.null_count() > 0) { + tracker.push_variable( + array + .offsets() + .lengths() + .zip(nulls.iter()) + .map(|(length, is_valid)| if is_valid { Some(length) } else { None }) + .map(variable::padded_length), + ) + } else { + tracker.push_variable( + array + .offsets() + .lengths() + .map(variable::non_null_padded_length), + ) + } +} + /// Add to [`LengthTracker`] the encoded length of each item in the [`GenericByteViewArray`] fn push_byte_view_array_lengths( tracker: &mut LengthTracker, diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 73e19b197f92..1c1df00528a3 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -55,11 +55,20 @@ pub fn encoded_len(a: Option<&[u8]>) -> usize { #[inline] pub fn padded_length(a: Option) -> usize { match a { - Some(a) if a <= BLOCK_SIZE => 1 + ceil(a, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1), + Some(a) => non_null_padded_length(a), + None => 1, + } +} + +/// Returns the padded length of the encoded length of the given length +#[inline] +pub(crate) fn non_null_padded_length(len: usize) -> usize { + if len <= BLOCK_SIZE { + 1 + ceil(len, MINI_BLOCK_SIZE) * (MINI_BLOCK_SIZE + 1) + } else { // Each miniblock ends with a 1 byte continuation, therefore add // `(MINI_BLOCK_COUNT - 1)` additional bytes over non-miniblock size - Some(a) => MINI_BLOCK_COUNT + ceil(a, BLOCK_SIZE) * (BLOCK_SIZE + 1), - None => 1, + MINI_BLOCK_COUNT + ceil(len, BLOCK_SIZE) * (BLOCK_SIZE + 1) } }