From 1a38b687a1310e8260edd54ec0fa0094d3860651 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 20 Jun 2025 09:32:47 -0400 Subject: [PATCH] Introduce MAX_INLINE_VIEW_LEN constant for string/byte views --- arrow-array/src/array/byte_view_array.rs | 36 ++++++++++--------- .../src/builder/generic_bytes_view_builder.rs | 14 ++++---- arrow-data/src/byte_view.rs | 18 +++++++--- arrow-select/src/coalesce/byte_view.rs | 10 +++--- 4 files changed, 44 insertions(+), 34 deletions(-) diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs index 713e275d186c..44df00aeb3cb 100644 --- a/arrow-array/src/array/byte_view_array.rs +++ b/arrow-array/src/array/byte_view_array.rs @@ -22,7 +22,7 @@ use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar}; use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer}; -use arrow_data::{ArrayData, ArrayDataBuilder, ByteView}; +use arrow_data::{ArrayData, ArrayDataBuilder, ByteView, MAX_INLINE_VIEW_LEN}; use arrow_schema::{ArrowError, DataType}; use core::str; use num::ToPrimitive; @@ -77,8 +77,9 @@ use super::ByteArrayType; /// 0 31 63 95 127 /// ``` /// -/// * Strings with length <= 12 are stored directly in the view. See -/// [`Self::inline_value`] to access the inlined prefix from a short view. +/// * Strings with length <= 12 ([`MAX_INLINE_VIEW_LEN`]) are stored directly in +/// the view. See [`Self::inline_value`] to access the inlined prefix from a +/// short view. /// /// * Strings with length > 12: The first four bytes are stored inline in the /// view and the entire string is stored in one of the buffers. See [`ByteView`] @@ -128,6 +129,7 @@ use super::ByteArrayType; /// assert_eq!(value, "this string is also longer than 12 bytes"); /// ``` /// +/// [`MAX_INLINE_VIEW_LEN`]: arrow_data::MAX_INLINE_VIEW_LEN /// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html /// /// Unlike [`GenericByteArray`], there are no constraints on the offsets other @@ -316,7 +318,7 @@ impl GenericByteViewArray { pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native { let v = self.views.get_unchecked(idx); let len = *v as u32; - let b = if len <= 12 { + let b = if len <= MAX_INLINE_VIEW_LEN { Self::inline_value(v, len as usize) } else { let view = ByteView::from(*v); @@ -331,10 +333,10 @@ impl GenericByteViewArray { /// /// # Safety /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout. - /// - The `len` must be the length of the inlined value. It should never be larger than 12. + /// - The `len` must be the length of the inlined value. It should never be larger than [`MAX_INLINE_VIEW_LEN`]. #[inline(always)] pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] { - debug_assert!(len <= 12); + debug_assert!(len <= MAX_INLINE_VIEW_LEN as usize); std::slice::from_raw_parts((view as *const u128 as *const u8).wrapping_add(4), len) } @@ -347,7 +349,7 @@ impl GenericByteViewArray { pub fn bytes_iter(&self) -> impl Iterator { self.views.iter().map(move |v| { let len = *v as u32; - if len <= 12 { + if len <= MAX_INLINE_VIEW_LEN { unsafe { Self::inline_value(v, len as usize) } } else { let view = ByteView::from(*v); @@ -371,7 +373,7 @@ impl GenericByteViewArray { return &[] as &[u8]; } - if prefix_len <= 4 || len <= 12 { + if prefix_len <= 4 || len as u32 <= MAX_INLINE_VIEW_LEN { unsafe { StringViewArray::inline_value(v, prefix_len) } } else { let view = ByteView::from(*v); @@ -401,7 +403,7 @@ impl GenericByteViewArray { return &[] as &[u8]; } - if len <= 12 { + if len as u32 <= MAX_INLINE_VIEW_LEN { unsafe { &StringViewArray::inline_value(v, len)[len - suffix_len..] } } else { let view = ByteView::from(*v); @@ -495,9 +497,9 @@ impl GenericByteViewArray { self.views() .iter() .map(|v| { - let len = (*v as u32) as usize; - if len > 12 { - len + let len = *v as u32; + if len > MAX_INLINE_VIEW_LEN { + len as usize } else { 0 } @@ -511,11 +513,11 @@ impl GenericByteViewArray { /// It takes a bit of patience to understand why we don't just compare two &[u8] directly. /// /// ByteView types give us the following two advantages, and we need to be careful not to lose them: - /// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view. + /// (1) For string/byte smaller than [`MAX_INLINE_VIEW_LEN`] bytes, the entire data is inlined in the view. /// Meaning that reading one array element requires only one memory access /// (two memory access required for StringArray, one for offset buffer, the other for value buffer). /// - /// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray, + /// (2) For string/byte larger than [`MAX_INLINE_VIEW_LEN`] bytes, we can still be faster than (for certain operations) StringArray/ByteArray, /// thanks to the inlined 4 bytes. /// Consider equality check: /// If the first four bytes of the two strings are different, we can return false immediately (with just one memory access). @@ -525,8 +527,8 @@ impl GenericByteViewArray { /// e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string. /// /// # Order check flow - /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view. - /// (2) if any of the string is larger than 12 bytes, we need to compare the full string. + /// (1) if both string are smaller than [`MAX_INLINE_VIEW_LEN`] bytes, we can directly compare the data inlined to the view. + /// (2) if any of the string is larger than [`MAX_INLINE_VIEW_LEN`] bytes, we need to compare the full string. /// (2.1) if the inlined 4 bytes are different, we can return the result immediately. /// (2.2) o.w., we need to compare the full string. /// @@ -544,7 +546,7 @@ impl GenericByteViewArray { let r_view = right.views().get_unchecked(right_idx); let r_len = *r_view as u32; - if l_len <= 12 && r_len <= 12 { + if l_len <= MAX_INLINE_VIEW_LEN && r_len <= MAX_INLINE_VIEW_LEN { let l_data = unsafe { GenericByteViewArray::::inline_value(l_view, l_len as usize) }; let r_data = unsafe { GenericByteViewArray::::inline_value(r_view, r_len as usize) }; return l_data.cmp(r_data); diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index ae7355433f81..5e7e942d8ba4 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -20,7 +20,7 @@ use std::marker::PhantomData; use std::sync::Arc; use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; -use arrow_data::ByteView; +use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; use arrow_schema::ArrowError; use hashbrown::hash_table::Entry; use hashbrown::HashTable; @@ -68,8 +68,8 @@ impl BlockSizeGrowthStrategy { /// /// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable /// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`] -/// writes values larger than 12 bytes to the current in-progress block, with values smaller -/// than 12 bytes inlined into the views. If a value is appended that will not fit in the +/// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller +/// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the /// in-progress block, it will be closed, and a new block of sufficient size allocated /// /// # Append Views @@ -114,7 +114,7 @@ impl GenericByteViewBuilder { /// Set a fixed buffer size for variable length strings /// /// The block size is the size of the buffer used to store values greater - /// than 12 bytes. The builder allocates new buffers when the current + /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current /// buffer is full. /// /// By default the builder balances buffer size and buffer count by @@ -221,7 +221,7 @@ impl GenericByteViewBuilder { } else { self.views_buffer.extend(array.views().iter().map(|v| { let mut byte_view = ByteView::from(*v); - if byte_view.length > 12 { + if byte_view.length > MAX_INLINE_VIEW_LEN { // Small views (<=12 bytes) are inlined, so only need to update large views byte_view.buffer_index += starting_buffer; }; @@ -289,7 +289,7 @@ impl GenericByteViewBuilder { pub fn get_value(&self, index: usize) -> &[u8] { let view = self.views_buffer.as_slice().get(index).unwrap(); let len = *view as u32; - if len <= 12 { + if len <= MAX_INLINE_VIEW_LEN { // # Safety // The view is valid from the builder unsafe { GenericByteViewArray::::inline_value(view, len as usize) } @@ -315,7 +315,7 @@ impl GenericByteViewBuilder { pub fn append_value(&mut self, value: impl AsRef) { let v: &[u8] = value.as_ref().as_ref(); let length: u32 = v.len().try_into().unwrap(); - if length <= 12 { + if length <= MAX_INLINE_VIEW_LEN { let mut view_buffer = [0; 16]; view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); view_buffer[4..4 + v.len()].copy_from_slice(v); diff --git a/arrow-data/src/byte_view.rs b/arrow-data/src/byte_view.rs index 3b3ec6246066..270f4f9948ac 100644 --- a/arrow-data/src/byte_view.rs +++ b/arrow-data/src/byte_view.rs @@ -18,6 +18,14 @@ use arrow_buffer::Buffer; use arrow_schema::ArrowError; +/// The maximum number of bytes that can be stored inline in a byte view. +/// +/// See [`ByteView`] and [`GenericByteViewArray`] for more information on the +/// layout of the views. +/// +/// [`GenericByteViewArray`]: https://docs.rs/arrow/latest/arrow/array/struct.GenericByteViewArray.html +pub const MAX_INLINE_VIEW_LEN: u32 = 12; + /// Helper to access views of [`GenericByteViewArray`] (`StringViewArray` and /// `BinaryViewArray`) where the length is greater than 12 bytes. /// @@ -76,15 +84,15 @@ impl ByteView { /// See example on [`ByteView`] docs /// /// Notes: - /// * the length should always be greater than 12 (Data less than 12 - /// bytes is stored as an inline view) + /// * the length should always be greater than [`MAX_INLINE_VIEW_LEN`] + /// (Data less than 12 bytes is stored as an inline view) /// * buffer and offset are set to `0` /// /// # Panics /// If the prefix is not exactly 4 bytes #[inline] pub fn new(length: u32, prefix: &[u8]) -> Self { - debug_assert!(length > 12); + debug_assert!(length > MAX_INLINE_VIEW_LEN); Self { length, prefix: u32::from_le_bytes(prefix.try_into().unwrap()), @@ -159,8 +167,8 @@ where { for (idx, v) in views.iter().enumerate() { let len = *v as u32; - if len <= 12 { - if len < 12 && (v >> (32 + len * 8)) != 0 { + if len <= MAX_INLINE_VIEW_LEN { + if len < MAX_INLINE_VIEW_LEN && (v >> (32 + len * 8)) != 0 { return Err(ArrowError::InvalidArgumentError(format!( "View at index {idx} contained non-zero padding for string of length {len}", ))); diff --git a/arrow-select/src/coalesce/byte_view.rs b/arrow-select/src/coalesce/byte_view.rs index 9f87d14a8e4f..00b2210cb8d9 100644 --- a/arrow-select/src/coalesce/byte_view.rs +++ b/arrow-select/src/coalesce/byte_view.rs @@ -20,7 +20,7 @@ use arrow_array::cast::AsArray; use arrow_array::types::ByteViewType; use arrow_array::{Array, ArrayRef, GenericByteViewArray}; use arrow_buffer::{Buffer, NullBufferBuilder}; -use arrow_data::ByteView; +use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; use arrow_schema::ArrowError; use std::marker::PhantomData; use std::sync::Arc; @@ -125,7 +125,7 @@ impl InProgressByteViewArray { // If there are buffers, we need to update the buffer index let updated_views = views.iter().map(|v| { let mut byte_view = ByteView::from(*v); - if byte_view.length > 12 { + if byte_view.length > MAX_INLINE_VIEW_LEN { // Small views (<=12 bytes) are inlined, so only need to update large views byte_view.buffer_index += starting_buffer; }; @@ -182,7 +182,7 @@ impl InProgressByteViewArray { if remaining_capacity < str_len as usize { break; } - if str_len > 12 { + if str_len > MAX_INLINE_VIEW_LEN { remaining_capacity -= str_len as usize; } num_view_to_current += 1; @@ -233,7 +233,7 @@ impl InProgressByteViewArray { .iter() .filter_map(|v| { let b = ByteView::from(*v); - if b.length > 12 { + if b.length > MAX_INLINE_VIEW_LEN { Some(b.length as usize) } else { None @@ -251,7 +251,7 @@ impl InProgressByteViewArray { // Copy the views, updating the buffer index and copying the data as needed let new_views = views.iter().map(|v| { let mut b: ByteView = ByteView::from(*v); - if b.length > 12 { + if b.length > MAX_INLINE_VIEW_LEN { let buffer_index = b.buffer_index as usize; let buffer_offset = b.offset as usize; let str_len = b.length as usize;