From 797456060d3fe5b997b9b1a81f29954ab188c05b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 21 Aug 2023 13:31:15 +0100 Subject: [PATCH 1/3] Cleanup length and bit_length kernels --- arrow-string/src/length.rs | 245 +++++++++++++------------------------ 1 file changed, 82 insertions(+), 163 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 25d6414ec8e6..8cd82ba3a32c 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -19,135 +19,30 @@ use arrow_array::*; use arrow_array::{cast::AsArray, types::*}; -use arrow_buffer::Buffer; +use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; -macro_rules! unary_offsets { - ($array: expr, $data_type: expr, $op: expr) => {{ - let slice = $array.value_offsets(); - - let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0])); - - // JUSTIFICATION - // Benefit - // ~60% speedup - // Soundness - // `values` come from a slice iterator with a known size. - let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) }; - - let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced()); - - let data = unsafe { - ArrayData::new_unchecked( - $data_type, - $array.len(), - None, - null_bit_buffer, - 0, - vec![buffer], - vec![], - ) - }; - make_array(data) - }}; +fn length_impl( + offsets: &OffsetBuffer, + nulls: Option<&NullBuffer>, +) -> ArrayRef { + let v: Vec<_> = offsets + .windows(2) + .map(|w| w[1].sub_wrapping(w[0])) + .collect(); + Arc::new(PrimitiveArray::

::new(v.into(), nulls.cloned())) } -macro_rules! kernel_dict { - ($array: ident, $kernel: expr, $kt: ident, $($t: ident: $gt: ident), *) => { - match $kt.as_ref() { - $(&DataType::$t => { - let dict = $array - .as_any() - .downcast_ref::>() - .unwrap_or_else(|| { - panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}", - stringify!($gt), $array.data_type()) - }); - let values = $kernel(dict.values())?; - let result = DictionaryArray::try_new(dict.keys().clone(), values)?; - Ok(Arc::new(result)) - }, - )* - t => panic!("Unsupported dictionary key type: {}", t) - } - } -} - -fn length_list(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn length_list_fixed_size(array: &dyn Array, length: i32) -> ArrayRef { - let array = array.as_fixed_size_list(); - let length_list = array.len(); - let buffer = Buffer::from_vec(vec![length; length_list]); - let data = Int32Array::new(buffer.into(), array.nulls().cloned()); - Arc::new(data) -} - -fn length_binary(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn length_string(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x) -} - -fn bit_length_binary(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) -} - -fn bit_length_string(array: &dyn Array) -> ArrayRef -where - O: OffsetSizeTrait, - T: ArrowPrimitiveType, - T::Native: OffsetSizeTrait, -{ - let array = array - .as_any() - .downcast_ref::>() - .unwrap(); - let bits_in_bytes = O::from_usize(8).unwrap(); - unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes) +fn bit_length_impl( + offsets: &OffsetBuffer, + nulls: Option<&NullBuffer>, +) -> ArrayRef { + let bits = P::Native::usize_as(8); + let c = |w: &[P::Native]| w[1].sub_wrapping(w[0]).mul_wrapping(bits); + let v: Vec<_> = offsets.windows(2).map(c).collect(); + Arc::new(PrimitiveArray::

::new(v.into(), nulls.cloned())) } /// Returns an array of Int32/Int64 denoting the length of each value in the array. @@ -158,29 +53,39 @@ where /// or DictionaryArray with above Arrays as values /// * length of null is null. pub fn length(array: &dyn Array) -> Result { + if let Some(d) = array.as_any_dictionary_opt() { + let lengths = length(d.values().as_ref())?; + return Ok(d.with_values(lengths)); + } + match array.data_type() { - DataType::Dictionary(kt, _) => { - kernel_dict!( - array, - |a| { length(a) }, - kt, - Int8: Int8Type, - Int16: Int16Type, - Int32: Int32Type, - Int64: Int64Type, - UInt8: UInt8Type, - UInt16: UInt16Type, - UInt32: UInt32Type, - UInt64: UInt64Type - ) + DataType::List(_) => { + let list = array.as_list::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeList(_) => { + let list = array.as_list::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::Utf8 => { + let list = array.as_string::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeUtf8 => { + let list = array.as_string::(); + Ok(length_impl::(list.offsets(), list.nulls())) } - DataType::List(_) => Ok(length_list::(array)), - DataType::LargeList(_) => Ok(length_list::(array)), - DataType::Utf8 => Ok(length_string::(array)), - DataType::LargeUtf8 => Ok(length_string::(array)), - DataType::Binary => Ok(length_binary::(array)), - DataType::LargeBinary => Ok(length_binary::(array)), - DataType::FixedSizeList(_, len) => Ok(length_list_fixed_size(array, *len)), + DataType::Binary => { + let list = array.as_binary::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeBinary => { + let list = array.as_binary::(); + Ok(length_impl::(list.offsets(), list.nulls())) + } + DataType::FixedSizeBinary(len) | DataType::FixedSizeList(_, len) => Ok(Arc::new( + Int32Array::new(vec![*len; array.len()].into(), array.nulls().cloned()), + )), other => Err(ArrowError::ComputeError(format!( "length not supported for {other:?}" ))), @@ -194,28 +99,42 @@ pub fn length(array: &dyn Array) -> Result { /// * bit_length of null is null. /// * bit_length is in number of bits pub fn bit_length(array: &dyn Array) -> Result { + if let Some(d) = array.as_any_dictionary_opt() { + let lengths = bit_length(d.values().as_ref())?; + return Ok(d.with_values(lengths)); + } + match array.data_type() { - DataType::Dictionary(kt, _) => { - kernel_dict!( - array, - |a| { bit_length(a) }, - kt, - Int8: Int8Type, - Int16: Int16Type, - Int32: Int32Type, - Int64: Int64Type, - UInt8: UInt8Type, - UInt16: UInt16Type, - UInt32: UInt32Type, - UInt64: UInt64Type - ) + DataType::List(_) => { + let list = array.as_list::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) } - DataType::Utf8 => Ok(bit_length_string::(array)), - DataType::LargeUtf8 => Ok(bit_length_string::(array)), - DataType::Binary => Ok(bit_length_binary::(array)), - DataType::LargeBinary => Ok(bit_length_binary::(array)), + DataType::LargeList(_) => { + let list = array.as_list::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::Utf8 => { + let list = array.as_string::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeUtf8 => { + let list = array.as_string::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::Binary => { + let list = array.as_binary::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::LargeBinary => { + let list = array.as_binary::(); + Ok(bit_length_impl::(list.offsets(), list.nulls())) + } + DataType::FixedSizeBinary(len) => Ok(Arc::new(Int32Array::new( + vec![*len * 8; array.len()].into(), + array.nulls().cloned(), + ))), other => Err(ArrowError::ComputeError(format!( - "bit_length not supported for {other:?}" + "length not supported for {other:?}" ))), } } From 81e4ffbb7a18440a975ac633cf0aee1c26faf587 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 21 Aug 2023 14:14:20 +0100 Subject: [PATCH 2/3] Clippy --- arrow-string/src/length.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 8cd82ba3a32c..42abbc557836 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -19,8 +19,7 @@ use arrow_array::*; use arrow_array::{cast::AsArray, types::*}; -use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, OffsetBuffer}; -use arrow_data::ArrayData; +use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer}; use arrow_schema::{ArrowError, DataType}; use std::sync::Arc; @@ -143,7 +142,8 @@ pub fn bit_length(array: &dyn Array) -> Result { mod tests { use super::*; use arrow_array::cast::AsArray; - use arrow_buffer::NullBuffer; + use arrow_buffer::{Buffer, NullBuffer}; + use arrow_data::ArrayData; use arrow_schema::Field; fn double_vec(v: Vec) -> Vec { From 55425b566151eeaf0ed45b07ca6bf72958ac9c79 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 25 Aug 2023 10:33:54 +0100 Subject: [PATCH 3/3] Review feedback --- arrow-string/src/length.rs | 41 +++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/arrow-string/src/length.rs b/arrow-string/src/length.rs index 36d2baeac98e..ab5fbb0c6425 100644 --- a/arrow-string/src/length.rs +++ b/arrow-string/src/length.rs @@ -16,7 +16,6 @@ // under the License. //! Defines kernel for length of string arrays and binary arrays -#![allow(clippy::redundant_closure_call)] use arrow_array::*; use arrow_array::{cast::AsArray, types::*}; @@ -134,7 +133,7 @@ pub fn bit_length(array: &dyn Array) -> Result { array.nulls().cloned(), ))), other => Err(ArrowError::ComputeError(format!( - "length not supported for {other:?}" + "bit_length not supported for {other:?}" ))), } } @@ -147,18 +146,11 @@ mod tests { use arrow_data::ArrayData; use arrow_schema::Field; - fn double_vec(v: Vec) -> Vec { - [&v[..], &v[..]].concat() - } - fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![3, 2, 1, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } + let values = ["one", "on", "o", ""]; + let values = values.into_iter().cycle().take(4096).collect(); + let expected = [3, 2, 1, 0].into_iter().cycle().take(4096).collect(); vec![ (vec!["hello", " ", "world"], 3, vec![5, 1, 5]), @@ -192,7 +184,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // running forever fn length_test_string() { length_cases_string() .into_iter() @@ -208,7 +199,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // running forever fn length_test_large_string() { length_cases_string() .into_iter() @@ -379,12 +369,9 @@ mod tests { fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec)> { // a large array - let mut values = vec!["one", "on", "o", ""]; - let mut expected = vec![24, 16, 8, 0]; - for _ in 0..10 { - values = double_vec(values); - expected = double_vec(expected); - } + let values = ["one", "on", "o", ""]; + let values = values.into_iter().cycle().take(4096).collect(); + let expected = [24, 16, 8, 0].into_iter().cycle().take(4096).collect(); vec![ (vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]), @@ -395,7 +382,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI fn bit_length_test_string() { bit_length_cases() .into_iter() @@ -411,7 +397,6 @@ mod tests { } #[test] - #[cfg_attr(miri, ignore)] // error: this test uses too much memory to run on CI fn bit_length_test_large_string() { bit_length_cases() .into_iter() @@ -650,11 +635,21 @@ mod tests { let list_array = FixedSizeListArray::from(list_data); let lengths = length(&list_array).unwrap(); - let lengths = lengths.as_any().downcast_ref::().unwrap(); + let lengths = lengths.as_primitive::(); assert_eq!(lengths.len(), 3); assert_eq!(lengths.value(0), 3); assert!(lengths.is_null(1)); assert_eq!(lengths.value(2), 3); } + + #[test] + fn test_fixed_size_binary() { + let array = FixedSizeBinaryArray::new(4, [0; 16].into(), None); + let result = length(&array).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![4; 4])); + + let result = bit_length(&array).unwrap(); + assert_eq!(result.as_ref(), &Int32Array::from(vec![32; 4])); + } }