From 19db4eeccce7cc114a90b98a1bbfda36c8e7cd86 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Fri, 27 Jun 2025 09:34:37 -0700 Subject: [PATCH 1/3] support both fallible and infallible access to variant metadata --- parquet-variant/src/variant/metadata.rs | 150 +++++++++++++++++++----- parquet-variant/src/variant/object.rs | 11 +- 2 files changed, 128 insertions(+), 33 deletions(-) diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index 16b4df6f3f12..a331e3f5b11f 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -17,8 +17,8 @@ use crate::decoder::OffsetSizeBytes; use crate::utils::{ - first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice, - validate_fallible_iterator, + first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset, + string_from_slice, validate_fallible_iterator, }; use arrow_schema::ArrowError; @@ -78,35 +78,75 @@ impl VariantMetadataHeader { /// /// See the [Variant Spec] file for more information /// +/// # Validation +/// +/// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the +/// underlying bytes are a valid encoding of variant metadata (see below). +/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_, and always +/// contain _valid_ data. +/// +/// Instances produced by [`Self::new`] are _unvalidated_ and may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing may panic if the underlying +/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are +/// strongly recommended. [`Self::validate`] can also be used to _validate_ an _unvalidated_ +/// instance, if desired. +/// +/// A _validated_ instance guarantees that: +/// +/// - header byte is valid +/// - dictionary size is in bounds +/// - offset array content is in-bounds +/// - first offset is zero +/// - last offset is in-bounds +/// - all other offsets are in-bounds (*) +/// - all offsets are monotonically increasing (*) +/// - all values are valid utf-8 (*) +/// +/// NOTE: [`Self::new`] only skips expensive validation checks (marked by `(*)` in the list above); +/// it panics any of the other checks fails. +/// /// [`Variant`]: crate::Variant /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding #[derive(Clone, Copy, Debug, PartialEq)] pub struct VariantMetadata<'m> { bytes: &'m [u8], header: VariantMetadataHeader, - dict_size: usize, + dictionary_size: usize, dictionary_key_start_byte: usize, + validated: bool, } impl<'m> VariantMetadata<'m> { - /// View the raw bytes (needed by very low-level decoders) - #[inline] - pub const fn as_bytes(&self) -> &'m [u8] { - self.bytes + /// Attempts to interpret `bytes` as a variant metadata instance, with full [validation] of all + /// dictionary entries. + /// + /// [validation]: Self#Validation + pub fn try_new(bytes: &'m [u8]) -> Result { + Self::try_new_impl(bytes)?.validate() } - /// Attempts to interpret `bytes` as a variant metadata instance. + /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary + /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as + /// indexing and iteration could also panic due to invalid bytes. /// - /// # Validation + /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes + /// were already validated previously by other means, or if the caller expects a small number of + /// accesses to a large dictionary (preferring to use a small number of fallible accesses as + /// needed, instead of paying expensive full validation up front). /// - /// This constructor verifies that `bytes` points to a valid variant metadata instance. In - /// particular, all offsets are in-bounds and point to valid utf8 strings. - pub fn try_new(bytes: &'m [u8]) -> Result { + /// [validate]: Self#Validation + pub fn new(bytes: &'m [u8]) -> Self { + Self::try_new_impl(bytes).unwrap() + } + + // The actual constructor, which performs only basic sanity checking. + fn try_new_impl(bytes: &'m [u8]) -> Result { let header_byte = first_byte_from_slice(bytes)?; let header = VariantMetadataHeader::try_new(header_byte)?; // First element after header is dictionary size - let dict_size = header + let dictionary_size = header .offset_size .unpack_usize(bytes, NUM_HEADER_BYTES, 0)?; @@ -115,7 +155,7 @@ impl<'m> VariantMetadata<'m> { // Value header, dict_size (offset_size bytes), and dict_size+1 offsets // = NUM_HEADER_BYTES + offset_size + (dict_size + 1) * offset_size // = (dict_size + 2) * offset_size + NUM_HEADER_BYTES - let dictionary_key_start_byte = dict_size + let dictionary_key_start_byte = dictionary_size .checked_add(2) .and_then(|n| n.checked_mul(header.offset_size as usize)) .and_then(|n| n.checked_add(NUM_HEADER_BYTES)) @@ -124,16 +164,45 @@ impl<'m> VariantMetadata<'m> { let new_self = Self { bytes, header, - dict_size, + dictionary_size, dictionary_key_start_byte, + validated: false, }; - // Iterate over all string keys in this dictionary in order to validate the offset array and - // prove that the string bytes are all in bounds. Otherwise, `iter` might panic on `unwrap`. - validate_fallible_iterator(new_self.iter_checked())?; + // Validate just the first and last offset, ignoring the other offsets and all value bytes. + let first_offset = new_self.get_offset(0)?; + if first_offset != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "First offset is not zero: {first_offset}" + ))); + } + // Bounds check the last offset by requesting the empty slice it points to. + let last_offset = new_self.get_offset(dictionary_size)?; + let _ = + slice_from_slice_at_offset(bytes, dictionary_key_start_byte, last_offset..last_offset)?; Ok(new_self) } + /// True if this instance is fully [validated] for panic-free infallible accesses. + /// + /// [validated]: Self#Validation + pub fn is_validated(&self) -> bool { + self.validated + } + + /// Performs a full [validation] of this metadata dictionary. + /// + /// [validation]: Self#Validation + pub fn validate(mut self) -> Result { + if !self.validated { + // Iterate over all string keys in this dictionary in order to prove that the offset + // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. + validate_fallible_iterator(self.iter_try())?; + self.validated = true; + } + Ok(self) + } + /// Whether the dictionary keys are sorted and unique pub fn is_sorted(&self) -> bool { self.header.is_sorted @@ -141,7 +210,7 @@ impl<'m> VariantMetadata<'m> { /// Get the dictionary size pub fn dictionary_size(&self) -> usize { - self.dict_size + self.dictionary_size } /// The variant protocol version @@ -161,22 +230,41 @@ impl<'m> VariantMetadata<'m> { .unpack_usize(bytes, NUM_HEADER_BYTES, i + 1) } - /// Gets a dictionary entry by index + /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the + /// underlying bytes are [invalid]. + /// + /// [invalid]: Self#Validation pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> { let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?; string_from_slice(self.bytes, self.dictionary_key_start_byte, byte_range) } - /// Get all dictionary entries as an Iterator of strings + /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the + /// iterator encounters [invalid] data. + /// + /// [invalid]: Self#Validation + pub fn iter_try(&self) -> impl Iterator> + '_ { + (0..self.dictionary_size).map(move |i| self.get(i)) + } + + /// Iterates oer all dictionary entries. When working with [unvalidated] input, prefer + /// [`Self::iter_try`] to avoid panics due to invalid data. + /// + /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator + '_ { - // NOTE: It is safe to unwrap because the constructor already made a successful traversal. - self.iter_checked().map(Result::unwrap) + self.iter_try().map(Result::unwrap) } +} + +/// Retrieves the ith dictionary entry, panicking if the index is out of bounds. Accessing +/// [unvalidated] input could also panic if the underlying bytes are invalid. +/// +/// [unvalidated]: Self#Validation +impl std::ops::Index for VariantMetadata<'_> { + type Output = str; - // Fallible iteration over the fields of this dictionary. The constructor traverses the iterator - // to prove it has no errors, so that all other use sites can blindly `unwrap` the result. - fn iter_checked(&self) -> impl Iterator> + '_ { - (0..self.dict_size).map(move |i| self.get(i)) + fn index(&self, i: usize) -> &str { + self.get(i).unwrap() } } @@ -204,8 +292,8 @@ mod tests { let md = VariantMetadata::try_new(bytes).expect("should parse"); assert_eq!(md.dictionary_size(), 2); // Fields - assert_eq!(md.get(0).unwrap(), "cat"); - assert_eq!(md.get(1).unwrap(), "dog"); + assert_eq!(&md[0], "cat"); + assert_eq!(&md[1], "dog"); // Offsets assert_eq!(md.get_offset(0).unwrap(), 0x00); @@ -238,8 +326,8 @@ mod tests { let working_md = VariantMetadata::try_new(bytes).expect("should parse"); assert_eq!(working_md.dictionary_size(), 2); - assert_eq!(working_md.get(0).unwrap(), "a"); - assert_eq!(working_md.get(1).unwrap(), "b"); + assert_eq!(&working_md[0], "a"); + assert_eq!(&working_md[1], "b"); let truncated = &bytes[..bytes.len() - 1]; diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 9530f111f143..128e3902c1a8 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -63,6 +63,11 @@ pub struct VariantObject<'m, 'v> { } impl<'m, 'v> VariantObject<'m, 'v> { + pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { + // TODO! + Self::try_new(metadata, value).unwrap() + } + /// Attempts to interpret `value` as a variant object value. /// /// # Validation @@ -147,8 +152,10 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// Get a field's value by index in `0..self.len()` /// /// # Panics - /// If the variant object is corrupted (e.g., invalid offsets or field IDs). - /// This should never happen since the constructor validates all data upfront. + /// + /// If the index is out of bounds. Also if variant object is corrupted (e.g., invalid offsets or + /// field IDs). The latter can only happen when working with an unvalidated object produced by + /// [`Self::new`]. pub fn field(&self, i: usize) -> Option> { Some( self.try_field(i) From b132d5cfee5ee069798a071b7333df9033fba49a Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 30 Jun 2025 15:56:34 -0700 Subject: [PATCH 2/3] Tweak object and array validation as well --- parquet-variant/src/decoder.rs | 78 ++++--- parquet-variant/src/variant.rs | 126 +++++++++++- parquet-variant/src/variant/list.rs | 226 +++++++++++++++------ parquet-variant/src/variant/metadata.rs | 112 ++++++----- parquet-variant/src/variant/object.rs | 257 ++++++++++++++++-------- 5 files changed, 571 insertions(+), 228 deletions(-) diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs index e73911aa2953..6bdffe37668d 100644 --- a/parquet-variant/src/decoder.rs +++ b/parquet-variant/src/decoder.rs @@ -14,7 +14,9 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -use crate::utils::{array_from_slice, slice_from_slice_at_offset, string_from_slice}; +use crate::utils::{ + array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice, +}; use crate::ShortString; use arrow_schema::ArrowError; @@ -132,23 +134,41 @@ impl OffsetSizeBytes { /// Return one unsigned little-endian value from `bytes`. /// - /// * `bytes` – the Variant-metadata buffer. + /// * `bytes` – the byte buffer to index + /// * `index` – 0-based index into the buffer + /// + /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). + /// Three-byte values are zero-extended to 32 bits before the final + /// fallible cast to `usize`. + pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> Result { + self.unpack_usize_at_offset(bytes, 0, index) + } + + /// Return one unsigned little-endian value from `bytes`. + /// + /// * `bytes` – the byte buffer to index /// * `byte_offset` – number of bytes to skip **before** reading the first - /// value (usually `1` to move past the header byte). - /// * `offset_index` – 0-based index **after** the skip + /// value (e.g. `1` to move past a header byte). + /// * `offset_index` – 0-based index **after** the skipped bytes /// (`0` is the first value, `1` the next, …). /// /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). /// Three-byte values are zero-extended to 32 bits before the final /// fallible cast to `usize`. - pub(crate) fn unpack_usize( + pub(crate) fn unpack_usize_at_offset( &self, bytes: &[u8], byte_offset: usize, // how many bytes to skip offset_index: usize, // which offset in an array of offsets ) -> Result { use OffsetSizeBytes::*; - let offset = byte_offset + (*self as usize) * offset_index; + + // Index into the byte array: + // byte_offset + (*self as usize) * offset_index + let offset = offset_index + .checked_mul(*self as usize) + .and_then(|n| n.checked_add(byte_offset)) + .ok_or_else(|| overflow_error("unpacking offset array value"))?; let result = match self { One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), @@ -159,14 +179,14 @@ impl OffsetSizeBytes { let mut buf = [0u8; 4]; buf[..3].copy_from_slice(&b3_chunks); u32::from_le_bytes(buf) - .try_into() - .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))? } - Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) - .try_into() - .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, + Four => u32::from_le_bytes(array_from_slice(bytes, offset)?), }; - Ok(result) + + // Convert the u32 we extracted to usize (should always succeed on 32- and 64-bit arch) + result + .try_into() + .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string())) } } @@ -453,48 +473,44 @@ mod tests { // One-byte offsets let buf_one = [0x01u8, 0xAB, 0xCD]; assert_eq!( - OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(), + OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(), 0x01 ); assert_eq!( - OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 2).unwrap(), + OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(), 0xCD ); // Two-byte offsets (little-endian 0x1234, 0x5678) let buf_two = [0x34, 0x12, 0x78, 0x56]; assert_eq!( - OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 0).unwrap(), + OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(), 0x1234 ); assert_eq!( - OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 1).unwrap(), + OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(), 0x5678 ); // Three-byte offsets (0x030201 and 0x0000FF) let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00]; assert_eq!( - OffsetSizeBytes::Three - .unpack_usize(&buf_three, 0, 0) - .unwrap(), + OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(), 0x030201 ); assert_eq!( - OffsetSizeBytes::Three - .unpack_usize(&buf_three, 0, 1) - .unwrap(), + OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(), 0x0000FF ); // Four-byte offsets (0x12345678, 0x90ABCDEF) let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90]; assert_eq!( - OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 0).unwrap(), + OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(), 0x1234_5678 ); assert_eq!( - OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 1).unwrap(), + OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(), 0x90AB_CDEF ); } @@ -502,8 +518,8 @@ mod tests { #[test] fn unpack_usize_out_of_bounds() { let tiny = [0x00u8]; // deliberately too short - assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0, 0).is_err()); - assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0, 0).is_err()); + assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err()); + assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err()); } #[test] @@ -519,20 +535,20 @@ mod tests { let width = OffsetSizeBytes::Two; // dictionary_size starts immediately after the header byte - let dict_size = width.unpack_usize(&buf, 1, 0).unwrap(); + let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap(); assert_eq!(dict_size, 2); // offset array immediately follows the dictionary size - let first = width.unpack_usize(&buf, 1, 1).unwrap(); + let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap(); assert_eq!(first, 0); - let second = width.unpack_usize(&buf, 1, 2).unwrap(); + let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap(); assert_eq!(second, 5); - let third = width.unpack_usize(&buf, 1, 3).unwrap(); + let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap(); assert_eq!(third, 9); - let err = width.unpack_usize(&buf, 1, 4); + let err = width.unpack_usize_at_offset(&buf, 1, 4); assert!(err.is_err()) } } diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index 28583f165897..ead4cc41deb5 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -162,7 +162,7 @@ impl Deref for ShortString<'_> { /// // parse the header metadata /// assert_eq!( /// Variant::from("HI"), -/// Variant::try_new(&metadata, &value).unwrap() +/// Variant::new(&metadata, &value) /// ); /// ``` /// @@ -177,6 +177,38 @@ impl Deref for ShortString<'_> { /// _ => println!("Other variant"), /// } /// ``` +/// +/// # Validation +/// +/// Every instance of variant is either _valid_ or _invalid_. depending on whether the +/// underlying bytes are a valid encoding of a variant value (see below). +/// +/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`] +/// are fully _validated_. They always contain _valid_ data, and infallible accesses such as +/// iteration and indexing are panic-free. The validation cost is linear in `O(m + v)` where `m` and +/// `v` are the number of bytes in the metadata and value buffers. +/// +/// Instances produced by [`Self::new_with_metadata`] are _unvalidated_ and so they may contain +/// either _valid_ or _invalid_ data. Infallible accesses to variant objects and arrays, such as +/// iteration and indexing will panic if the underlying bytes are _invalid_, and fallible +/// alternatives are provided as panic-free alternatives. [`Self::validate`] can also be used to +/// _validate_ an _unvalidated_ instance, if desired. +/// +/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) accesses to a large variant value. +/// +/// A _validated_ variant value guarantees that the associated [metadata] and all nested [object] +/// and [array] values are _valid_. Primitive variant subtypes are always _valid_ by construction. +/// +/// # Safety +/// +/// Even an _invalid_ variant value is still _safe_ to use in the Rust sense. Accessing it with +/// infallible methods may cause panics but will never lead to undefined behavior. +/// +/// [metadata]: VariantMetadata#Validation +/// [object]: VariantObject#Validation +/// [array]: VariantList#Validation #[derive(Clone, Debug, PartialEq)] pub enum Variant<'m, 'v> { /// Primitive type: Null @@ -224,7 +256,9 @@ pub enum Variant<'m, 'v> { } impl<'m, 'v> Variant<'m, 'v> { - /// Create a new `Variant` from metadata and value. + /// Attempts to interpret a metadata and value buffer pair as a new `Variant`. + /// + /// The instance is fully [validated]. /// /// # Example /// ``` @@ -237,12 +271,38 @@ impl<'m, 'v> Variant<'m, 'v> { /// Variant::try_new(&metadata, &value).unwrap() /// ); /// ``` + /// + /// [validated]: Self#Validation pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result { let metadata = VariantMetadata::try_new(metadata)?; Self::try_new_with_metadata(metadata, value) } - /// Create a new variant with existing metadata + /// Attempts to interpret a metadata and value buffer pair as a new `Variant`. + /// + /// The instance is [unvalidated]. + /// + /// # Example + /// ``` + /// use parquet_variant::{Variant, VariantMetadata}; + /// let metadata = [0x01, 0x00, 0x00]; + /// let value = [0x09, 0x48, 0x49]; + /// // parse the header metadata + /// assert_eq!( + /// Variant::from("HI"), + /// Variant::new(&metadata, &value) + /// ); + /// ``` + /// + /// [unvalidated]: Self#Validation + pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self { + let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata"); + Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data") + } + + /// Create a new variant with existing metadata. + /// + /// The instance is fully [validated]. /// /// # Example /// ``` @@ -250,15 +310,32 @@ impl<'m, 'v> Variant<'m, 'v> { /// let metadata = [0x01, 0x00, 0x00]; /// let value = [0x09, 0x48, 0x49]; /// // parse the header metadata first - /// let metadata = VariantMetadata::try_new(&metadata).unwrap(); + /// let metadata = VariantMetadata::new(&metadata); /// assert_eq!( /// Variant::from("HI"), /// Variant::try_new_with_metadata(metadata, &value).unwrap() /// ); /// ``` + /// + /// [validated]: Self#Validation pub fn try_new_with_metadata( metadata: VariantMetadata<'m>, value: &'v [u8], + ) -> Result { + Self::try_new_with_metadata_impl(metadata, value)?.validate() + } + + /// Similar to [`Self::try_new_with_metadata`], but [unvalidated]. + /// + /// [unvalidated]: Self#Validation + pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { + Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant") + } + + // The actual constructor, which only performs shallow (constant-time) validation. + fn try_new_with_metadata_impl( + metadata: VariantMetadata<'m>, + value: &'v [u8], ) -> Result { let value_metadata = first_byte_from_slice(value)?; let value_data = slice_from_slice(value, 1..)?; @@ -304,12 +381,45 @@ impl<'m, 'v> Variant<'m, 'v> { VariantBasicType::ShortString => { Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) } - VariantBasicType::Object => Variant::Object(VariantObject::try_new(metadata, value)?), - VariantBasicType::Array => Variant::List(VariantList::try_new(metadata, value)?), + VariantBasicType::Object => { + Variant::Object(VariantObject::try_new_impl(metadata, value)?) + } + VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), }; Ok(new_self) } + /// True if this variant instance has already been [validated]. + /// + /// [validated]: Self#Validation + pub fn is_validated(&self) -> bool { + match self { + Variant::List(list) => list.is_validated(), + Variant::Object(obj) => obj.is_validated(), + _ => true, + } + } + + /// Recursively validates this variant value, ensuring that infallible access will not panic due + /// to invalid bytes. + /// + /// Variant leaf values are always valid by construction, but [objects] and [arrays] can be + /// constructed in unvalidated (and potentially invalid) state. + /// + /// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` + /// where `m` and `v` are the sizes of metadata and value buffers, respectively. + /// + /// [objects]: VariantObject#Validation + /// [arrays]: VariantArray#Validation + pub fn validate(self) -> Result { + use Variant::*; + match self { + List(list) => list.validate().map(List), + Object(obj) => obj.validate().map(Object), + _ => Ok(self), + } + } + /// Converts this variant to `()` if it is null. /// /// Returns `Some(())` for null variants, @@ -827,7 +937,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// # builder.finish() /// # }; /// // object that is {"name": "John"} - /// let variant = Variant::try_new(&metadata, &value).unwrap(); + /// let variant = Variant::new(&metadata, &value); /// // use the `as_object` method to access the object /// let obj = variant.as_object().expect("variant should be an object"); /// assert_eq!(obj.get("name"), Some(Variant::from("John"))); @@ -857,7 +967,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// # builder.finish() /// # }; /// // list that is ["John", "Doe"] - /// let variant = Variant::try_new(&metadata, &value).unwrap(); + /// let variant = Variant::new(&metadata, &value); /// // use the `as_list` method to access the list /// let list = variant.as_list().expect("variant should be a list"); /// assert_eq!(list.len(), 2); diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs index 320cdbbee90a..7f0c7c1701af 100644 --- a/parquet-variant/src/variant/list.rs +++ b/parquet-variant/src/variant/list.rs @@ -16,7 +16,8 @@ // under the License. use crate::decoder::OffsetSizeBytes; use crate::utils::{ - first_byte_from_slice, overflow_error, slice_from_slice_at_offset, validate_fallible_iterator, + first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset, + validate_fallible_iterator, }; use crate::variant::{Variant, VariantMetadata}; @@ -28,39 +29,103 @@ const NUM_HEADER_BYTES: usize = 1; /// A parsed version of the variant array value header byte. #[derive(Clone, Debug, PartialEq)] pub(crate) struct VariantListHeader { + num_elements_size: OffsetSizeBytes, offset_size: OffsetSizeBytes, - is_large: bool, } impl VariantListHeader { + // Hide the ugly casting + const fn num_elements_size(&self) -> usize { + self.num_elements_size as _ + } + const fn offset_size(&self) -> usize { + self.offset_size as _ + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn first_offset_byte(&self) -> usize { + NUM_HEADER_BYTES + self.num_elements_size() + } + pub(crate) fn try_new(header_byte: u8) -> Result { // The 6 first bits to the left are the value_header and the 2 bits // to the right are the basic type, so we shift to get only the value_header let value_header = header_byte >> 2; let is_large = (value_header & 0x04) != 0; // 3rd bit from the right let field_offset_size_minus_one = value_header & 0x03; // Last two bits + + // The size of the num_elements entry in the array value_data is 4 bytes if + // is_large is true, otherwise 1 byte. + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; let offset_size = OffsetSizeBytes::try_new(field_offset_size_minus_one)?; Ok(Self { + num_elements_size, offset_size, - is_large, }) } } /// [`Variant`] Array. /// +/// See the [Variant spec] for details. +/// /// NOTE: The "list" naming differs from the variant spec -- which calls it "array" -- in order to be /// consistent with Parquet and Arrow type naming. Otherwise, the name would conflict with the /// `VariantArray : Array` we must eventually define for variant-typed arrow arrays. +/// +/// # Validation +/// +/// Every instance of variant list is either _valid_ or _invalid_. depending on whether the +/// underlying bytes are a valid encoding of a variant array (see below). +/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always +/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The +/// validation cost is linear in the number of underlying bytes. +/// +/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying +/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are +/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// _unvalidated_ instance, if desired. +/// +/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) accesses to a large list. +/// +/// A _validated_ variant list instance guarantees that: +/// +/// - header byte is valid +/// - num_elements is in bounds +/// - offset array content is in-bounds +/// - first offset is zero +/// - last offset is in-bounds +/// - all other offsets are in-bounds (*) +/// - all offsets are monotonically increasing (*) +/// - all values are (recursively) valid variant objects (*) +/// - the associated variant metadata is [valid] (*) +/// +/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)` +/// in the list above); it panics any of the other checks fails. +/// +/// # Safety +/// +/// Even an _invalid_ variant list instance is still _safe_ to use in the Rust sense. Accessing +/// it with infallible methods may cause panics but will never lead to undefined behavior. +/// +/// [valid]: VariantMetadata#Validation +/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-array-basic_type3 #[derive(Clone, Debug, PartialEq)] pub struct VariantList<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], header: VariantListHeader, num_elements: usize, - first_offset_byte: usize, first_value_byte: usize, + validated: bool, } impl<'m, 'v> VariantList<'m, 'v> { @@ -69,46 +134,88 @@ impl<'m, 'v> VariantList<'m, 'v> { /// # Validation /// /// This constructor verifies that `value` points to a valid variant array value. In particular, - /// that all offsets are in-bounds and point to valid objects. - // TODO: How to make the validation non-recursive while still making iterators safely infallible?? - // See https://github.com/apache/arrow-rs/issues/7711 + /// that all offsets are in-bounds and point to valid (recursively validated) objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { + Self::try_new_impl(metadata, value)?.validate() + } + + pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { + Self::try_new_impl(metadata, value).expect("Invalid variant list value") + } + + /// Attempts to interpet `metadata` and `value` as a variant array, performing only basic + /// (constant-cost) [validation]. + /// + /// [validation]: Self#Validation + pub(crate) fn try_new_impl( + metadata: VariantMetadata<'m>, + value: &'v [u8], + ) -> Result { let header_byte = first_byte_from_slice(value)?; let header = VariantListHeader::try_new(header_byte)?; - // The size of the num_elements entry in the array value_data is 4 bytes if - // is_large is true, otherwise 1 byte. - let num_elements_size = match header.is_large { - true => OffsetSizeBytes::Four, - false => OffsetSizeBytes::One, - }; - // Skip the header byte to read the num_elements; the offset array immediately follows - let num_elements = num_elements_size.unpack_usize(value, NUM_HEADER_BYTES, 0)?; - let first_offset_byte = NUM_HEADER_BYTES + num_elements_size as usize; + let num_elements = + header + .num_elements_size + .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?; // (num_elements + 1) * offset_size + first_offset_byte let first_value_byte = num_elements .checked_add(1) - .and_then(|n| n.checked_mul(header.offset_size as usize)) - .and_then(|n| n.checked_add(first_offset_byte)) + .and_then(|n| n.checked_mul(header.offset_size())) + .and_then(|n| n.checked_add(header.first_offset_byte())) .ok_or_else(|| overflow_error("offset of variant list values"))?; - let new_self = Self { + let mut new_self = Self { metadata, value, header, num_elements, - first_offset_byte, first_value_byte, + validated: false, }; - // Iterate over all values of this array in order to validate the field_offset array and - // prove that the field values are all in bounds. Otherwise, `iter` might panic on `unwrap`. - validate_fallible_iterator(new_self.iter_checked())?; + // Validate just the first and last offset, ignoring the other offsets and all value bytes. + let first_offset = new_self.get_offset(0)?; + if first_offset != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "First offset is not zero: {first_offset}" + ))); + } + + // Use the last offset to upper-bound the value buffer + let last_offset = new_self + .get_offset(num_elements)? + .checked_add(first_value_byte) + .ok_or_else(|| overflow_error("variant array size"))?; + new_self.value = slice_from_slice(value, ..last_offset)?; Ok(new_self) } + /// True if this instance is fully [validated] for panic-free infallible accesses. + /// + /// [validated]: Self#Validation + pub fn is_validated(&self) -> bool { + self.validated + } + + /// Performs a full [validation] of this variant array and returns the result. + /// + /// [validation]: Self#Validation + pub fn validate(mut self) -> Result { + if !self.validated { + // Validate the metadata dictionary, if not already validated. + self.metadata = self.metadata.validate()?; + + // Iterate over all string keys in this dictionary in order to prove that the offset + // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. + validate_fallible_iterator(self.iter_try())?; + self.validated = true; + } + Ok(self) + } + /// Return the length of this array pub fn len(&self) -> usize { self.num_elements @@ -119,54 +226,51 @@ impl<'m, 'v> VariantList<'m, 'v> { self.len() == 0 } - /// Returns element by index in `0..self.len()`, if any + /// Returns element by index in `0..self.len()`, if any. May panic if this list is [invalid]. + /// + /// [invalid]: Self#Validation pub fn get(&self, index: usize) -> Option> { - if index >= self.num_elements { - return None; - } - - match self.try_get(index) { - Ok(variant) => Some(variant), - Err(err) => panic!("validation error: {err}"), - } + (index < self.num_elements).then(|| { + self.try_get_impl(index) + .and_then(Variant::validate) + .expect("Invalid variant array element") + }) } /// Fallible version of `get`. Returns element by index, capturing validation errors - fn try_get(&self, index: usize) -> Result, ArrowError> { - if index >= self.num_elements { - return Err(ArrowError::InvalidArgumentError(format!( - "Index {} out of bounds for list of length {}", - index, self.num_elements, - ))); - } - - // Skip header and num_elements bytes to read the offsets - let unpack = |i| { - self.header - .offset_size - .unpack_usize(self.value, self.first_offset_byte, i) - }; + pub fn try_get(&self, index: usize) -> Result, ArrowError> { + self.try_get_impl(index)?.validate() + } - // Read the value bytes from the offsets - let variant_value_bytes = slice_from_slice_at_offset( - self.value, - self.first_value_byte, - unpack(index)?..unpack(index + 1)?, - )?; - let variant = Variant::try_new_with_metadata(self.metadata, variant_value_bytes)?; - Ok(variant) + /// Fallible iteration over the elements of this list. + pub fn iter_try(&self) -> impl Iterator, ArrowError>> + '_ { + (0..self.len()).map(move |i| self.try_get_impl(i)) } - /// Iterates over the values of this list + /// Iterates over the values of this list. When working with [unvalidated] input, consider + /// [`Self::iter_try`] to avoid panics due to invalid data. + /// + /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator> + '_ { - // NOTE: It is safe to unwrap because the constructor already made a successful traversal. - self.iter_checked().map(Result::unwrap) + self.iter_try() + .map(|result| result.expect("Invalid variant list entry")) + } + + // Attempts to retrieve the ith offset from the offset array region of the byte buffer. + fn get_offset(&self, index: usize) -> Result { + let byte_range = self.header.first_offset_byte()..self.first_value_byte; + let offset_bytes = slice_from_slice(self.value, byte_range)?; + self.header.offset_size.unpack_usize(offset_bytes, index) } - // Fallible iteration over the fields of this dictionary. The constructor traverses the iterator - // to prove it has no errors, so that all other use sites can blindly `unwrap` the result. - fn iter_checked(&self) -> impl Iterator, ArrowError>> + '_ { - (0..self.len()).map(move |i| self.try_get(i)) + // Fallible version of `get`, performing only basic (constant-time) validation. + fn try_get_impl(&self, index: usize) -> Result, ArrowError> { + // Fetch the value bytes between the two offsets for this index, from the value array region + // of the byte buffer + let byte_range = self.get_offset(index)?..self.get_offset(index + 1)?; + let value_bytes = + slice_from_slice_at_offset(self.value, self.first_value_byte, byte_range)?; + Variant::try_new_with_metadata(self.metadata, value_bytes) } } diff --git a/parquet-variant/src/variant/metadata.rs b/parquet-variant/src/variant/metadata.rs index a331e3f5b11f..6a449ec73655 100644 --- a/parquet-variant/src/variant/metadata.rs +++ b/parquet-variant/src/variant/metadata.rs @@ -17,8 +17,8 @@ use crate::decoder::OffsetSizeBytes; use crate::utils::{ - first_byte_from_slice, overflow_error, slice_from_slice, slice_from_slice_at_offset, - string_from_slice, validate_fallible_iterator, + first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice, + validate_fallible_iterator, }; use arrow_schema::ArrowError; @@ -40,6 +40,16 @@ const CORRECT_VERSION_VALUE: u8 = 1; const NUM_HEADER_BYTES: usize = 1; impl VariantMetadataHeader { + // Hide the cast + const fn offset_size(&self) -> usize { + self.offset_size as usize + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn first_offset_byte(&self) -> usize { + NUM_HEADER_BYTES + self.offset_size() + } + /// Tries to construct the variant metadata header, which has the form /// /// ```text @@ -83,16 +93,21 @@ impl VariantMetadataHeader { /// Every instance of variant metadata is either _valid_ or _invalid_. depending on whether the /// underlying bytes are a valid encoding of variant metadata (see below). /// -/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_, and always -/// contain _valid_ data. +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully _validated_. They always +/// contain _valid_ data, and infallible accesses such as iteration and indexing are panic-free. The +/// validation cost is linear in the number of underlying bytes. /// -/// Instances produced by [`Self::new`] are _unvalidated_ and may contain either _valid_ or -/// _invalid_ data. Infallible accesses such as iteration and indexing may panic if the underlying +/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying /// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are -/// strongly recommended. [`Self::validate`] can also be used to _validate_ an _unvalidated_ -/// instance, if desired. +/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// _unvalidated_ instance, if desired. /// -/// A _validated_ instance guarantees that: +/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) accesses to a large dictionary. +/// +/// A _validated_ variant [metadata instance guarantees that: /// /// - header byte is valid /// - dictionary size is in bounds @@ -103,8 +118,13 @@ impl VariantMetadataHeader { /// - all offsets are monotonically increasing (*) /// - all values are valid utf-8 (*) /// -/// NOTE: [`Self::new`] only skips expensive validation checks (marked by `(*)` in the list above); -/// it panics any of the other checks fails. +/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)` +/// in the list above); it panics any of the other checks fails. +/// +/// # Safety +/// +/// Even an _invalid_ variant metadata instance is still _safe_ to use in the Rust sense. Accessing +/// it with infallible methods may cause panics but will never lead to undefined behavior. /// /// [`Variant`]: crate::Variant /// [Variant Spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#metadata-encoding @@ -113,7 +133,7 @@ pub struct VariantMetadata<'m> { bytes: &'m [u8], header: VariantMetadataHeader, dictionary_size: usize, - dictionary_key_start_byte: usize, + first_value_byte: usize, validated: bool, } @@ -128,7 +148,7 @@ impl<'m> VariantMetadata<'m> { /// Interprets `bytes` as a variant metadata instance, without attempting to [validate] dictionary /// entries. Panics if basic sanity checking fails, and subsequent infallible accesses such as - /// indexing and iteration could also panic due to invalid bytes. + /// indexing and iteration could also panic if the underlying bytes are invalid. /// /// This constructor can be a useful lightweight alternative to [`Self::try_new`] if the bytes /// were already validated previously by other means, or if the caller expects a small number of @@ -137,35 +157,35 @@ impl<'m> VariantMetadata<'m> { /// /// [validate]: Self#Validation pub fn new(bytes: &'m [u8]) -> Self { - Self::try_new_impl(bytes).unwrap() + Self::try_new_impl(bytes).expect("Invalid variant metadata") } - // The actual constructor, which performs only basic sanity checking. - fn try_new_impl(bytes: &'m [u8]) -> Result { + // The actual constructor, which performs only basic (constant-const) validation. + pub(crate) fn try_new_impl(bytes: &'m [u8]) -> Result { let header_byte = first_byte_from_slice(bytes)?; let header = VariantMetadataHeader::try_new(header_byte)?; - // First element after header is dictionary size - let dictionary_size = header - .offset_size - .unpack_usize(bytes, NUM_HEADER_BYTES, 0)?; + // First element after header is dictionary size; the offset array immediately follows. + let dictionary_size = + header + .offset_size + .unpack_usize_at_offset(bytes, NUM_HEADER_BYTES, 0)?; // Calculate the starting offset of the dictionary string bytes. // - // Value header, dict_size (offset_size bytes), and dict_size+1 offsets - // = NUM_HEADER_BYTES + offset_size + (dict_size + 1) * offset_size - // = (dict_size + 2) * offset_size + NUM_HEADER_BYTES - let dictionary_key_start_byte = dictionary_size - .checked_add(2) - .and_then(|n| n.checked_mul(header.offset_size as usize)) - .and_then(|n| n.checked_add(NUM_HEADER_BYTES)) + // There are dict_size + 1 offsets, and the value bytes immediately follow + // = (dict_size + 1) * offset_size + header.first_offset_byte() + let first_value_byte = dictionary_size + .checked_add(1) + .and_then(|n| n.checked_mul(header.offset_size())) + .and_then(|n| n.checked_add(header.first_offset_byte())) .ok_or_else(|| overflow_error("offset of variant metadata dictionary"))?; - let new_self = Self { + let mut new_self = Self { bytes, header, dictionary_size, - dictionary_key_start_byte, + first_value_byte, validated: false, }; @@ -176,10 +196,13 @@ impl<'m> VariantMetadata<'m> { "First offset is not zero: {first_offset}" ))); } - // Bounds check the last offset by requesting the empty slice it points to. - let last_offset = new_self.get_offset(dictionary_size)?; - let _ = - slice_from_slice_at_offset(bytes, dictionary_key_start_byte, last_offset..last_offset)?; + + // Use the last offset to upper-bound the byte slice + let last_offset = new_self + .get_offset(dictionary_size)? + .checked_add(first_value_byte) + .ok_or_else(|| overflow_error("variant metadata size"))?; + new_self.bytes = slice_from_slice(bytes, ..last_offset)?; Ok(new_self) } @@ -190,7 +213,7 @@ impl<'m> VariantMetadata<'m> { self.validated } - /// Performs a full [validation] of this metadata dictionary. + /// Performs a full [validation] of this metadata dictionary and returns the result. /// /// [validation]: Self#Validation pub fn validate(mut self) -> Result { @@ -209,12 +232,12 @@ impl<'m> VariantMetadata<'m> { } /// Get the dictionary size - pub fn dictionary_size(&self) -> usize { + pub const fn dictionary_size(&self) -> usize { self.dictionary_size } /// The variant protocol version - pub fn version(&self) -> u8 { + pub const fn version(&self) -> u8 { self.header.version } @@ -223,11 +246,9 @@ impl<'m> VariantMetadata<'m> { /// This offset is an index into the dictionary, at the boundary between string `i-1` and string /// `i`. See [`Self::get`] to retrieve a specific dictionary entry. fn get_offset(&self, i: usize) -> Result { - // Skip the header byte and the dictionary_size entry (by offset_index + 1) - let bytes = slice_from_slice(self.bytes, ..self.dictionary_key_start_byte)?; - self.header - .offset_size - .unpack_usize(bytes, NUM_HEADER_BYTES, i + 1) + let offset_byte_range = self.header.first_offset_byte()..self.first_value_byte; + let bytes = slice_from_slice(self.bytes, offset_byte_range)?; + self.header.offset_size.unpack_usize(bytes, i) } /// Attempts to retrieve a dictionary entry by index, failing if out of bounds or if the @@ -236,7 +257,7 @@ impl<'m> VariantMetadata<'m> { /// [invalid]: Self#Validation pub fn get(&self, i: usize) -> Result<&'m str, ArrowError> { let byte_range = self.get_offset(i)?..self.get_offset(i + 1)?; - string_from_slice(self.bytes, self.dictionary_key_start_byte, byte_range) + string_from_slice(self.bytes, self.first_value_byte, byte_range) } /// Returns an iterator that attempts to visit all dictionary entries, producing `Err` if the @@ -247,12 +268,13 @@ impl<'m> VariantMetadata<'m> { (0..self.dictionary_size).map(move |i| self.get(i)) } - /// Iterates oer all dictionary entries. When working with [unvalidated] input, prefer + /// Iterates over all dictionary entries. When working with [unvalidated] input, consider /// [`Self::iter_try`] to avoid panics due to invalid data. /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator + '_ { - self.iter_try().map(Result::unwrap) + self.iter_try() + .map(|result| result.expect("Invalid metadata dictionary entry")) } } @@ -264,7 +286,7 @@ impl std::ops::Index for VariantMetadata<'_> { type Output = str; fn index(&self, i: usize) -> &str { - self.get(i).unwrap() + self.get(i).expect("Invalid metadata dictionary entry") } } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index 128e3902c1a8..e9f3c263e197 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -29,116 +29,196 @@ const NUM_HEADER_BYTES: usize = 1; /// Header structure for [`VariantObject`] #[derive(Clone, Debug, PartialEq)] pub(crate) struct VariantObjectHeader { - field_offset_size: OffsetSizeBytes, + num_elements_size: OffsetSizeBytes, field_id_size: OffsetSizeBytes, - is_large: bool, + field_offset_size: OffsetSizeBytes, } impl VariantObjectHeader { + // Hide the ugly casting + const fn num_elements_size(&self) -> usize { + self.num_elements_size as _ + } + const fn field_id_size(&self) -> usize { + self.field_id_size as _ + } + const fn field_offset_size(&self) -> usize { + self.field_offset_size as _ + } + + // Avoid materializing this offset, since it's cheaply and safely computable + const fn field_ids_start_byte(&self) -> usize { + NUM_HEADER_BYTES + self.num_elements_size() + } + pub(crate) fn try_new(header_byte: u8) -> Result { // Parse the header byte to get object parameters let value_header = header_byte >> 2; let field_offset_size_minus_one = value_header & 0x03; // Last 2 bits let field_id_size_minus_one = (value_header >> 2) & 0x03; // Next 2 bits let is_large = (value_header & 0x10) != 0; // 5th bit - + let num_elements_size = match is_large { + true => OffsetSizeBytes::Four, + false => OffsetSizeBytes::One, + }; Ok(Self { - field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, + num_elements_size, field_id_size: OffsetSizeBytes::try_new(field_id_size_minus_one)?, - is_large, + field_offset_size: OffsetSizeBytes::try_new(field_offset_size_minus_one)?, }) } } /// A [`Variant`] Object (struct with named fields). +/// +/// See the [Variant spec] file for more information. +/// +/// # Validation +/// +/// Every instance of variant object is either _valid_ or _invalid_. depending on whether the +/// underlying bytes are a valid encoding of a variant object subtype (see below). +/// +/// Instances produced by [`Self::try_new`] or [`Self::validate`] are fully (and recursively) +/// _validated_. They always contain _valid_ data, and infallible accesses such as iteration and +/// indexing are panic-free. The validation cost is linear in the number of underlying bytes. +/// +/// Instances produced by [`Self::new`] are _unvalidated_ and so they may contain either _valid_ or +/// _invalid_ data. Infallible accesses such as iteration and indexing will panic if the underlying +/// bytes are _invalid_, and fallible alternatives such as [`Self::iter_try`] and [`Self::get`] are +/// provided as panic-free alternatives. [`Self::validate`] can also be used to _validate_ an +/// _unvalidated_ instance, if desired. +/// +/// _Unvalidated_ instances can be constructed in constant time. They can be useful if the caller +/// knows the underlying bytes were already validated previously, or if the caller intends to +/// perform a small number of (fallible) field accesses against a large object. +/// +/// A _validated_ instance guarantees that: +/// +/// - header byte is valid +/// - num_elements is in bounds +/// - field id array is in bounds +/// - field offset array is in bounds +/// - field value array is in bounds +/// - all field ids are valid metadata dictionary entries (*) +/// - field ids are lexically ordered according by their corresponding string values (*) +/// - all field offsets are in bounds (*) +/// - all field values are (recursively) _valid_ variant values (*) +/// - the associated variant metadata is [valid] (*) +/// +/// NOTE: [`Self::new`] only skips expensive (non-constant cost) validation checks (marked by `(*)` +/// in the list above); it panics any of the other checks fails. +/// +/// # Safety +/// +/// Even an _invalid_ variant object instance is still _safe_ to use in the Rust sense. Accessing it +/// with infallible methods may cause panics but will never lead to undefined behavior. +/// +/// [valid]: VariantMetadata#Validation +/// [Variant spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#value-data-for-object-basic_type2 #[derive(Clone, Debug, PartialEq)] pub struct VariantObject<'m, 'v> { pub metadata: VariantMetadata<'m>, pub value: &'v [u8], header: VariantObjectHeader, num_elements: usize, - field_ids_start_byte: usize, - field_offsets_start_byte: usize, - values_start_byte: usize, + first_field_offset_byte: usize, + first_value_byte: usize, + validated: bool, } impl<'m, 'v> VariantObject<'m, 'v> { pub fn new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { - // TODO! - Self::try_new(metadata, value).unwrap() + Self::try_new_impl(metadata, value).expect("Invalid variant object") } - /// Attempts to interpret `value` as a variant object value. + /// Attempts to interpet `metadata` and `value` as a variant object. /// /// # Validation /// /// This constructor verifies that `value` points to a valid variant object value. In /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point /// to valid objects. - // TODO: How to make the validation non-recursive while still making iterators safely infallible?? - // See https://github.com/apache/arrow-rs/issues/7711 pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { + let mut new_self = Self::try_new_impl(metadata, value)?.validate()?; + new_self.validated = true; + Ok(new_self) + } + + /// Attempts to interpet `metadata` and `value` as a variant object, performing only basic + /// (constant-cost) [validation]. + /// + /// [validation]: Self#Validation + pub(crate) fn try_new_impl( + metadata: VariantMetadata<'m>, + value: &'v [u8], + ) -> Result { let header_byte = first_byte_from_slice(value)?; let header = VariantObjectHeader::try_new(header_byte)?; // Determine num_elements size based on is_large flag and fetch the value - let num_elements_size = if header.is_large { - OffsetSizeBytes::Four - } else { - OffsetSizeBytes::One - }; - let num_elements = num_elements_size.unpack_usize(value, NUM_HEADER_BYTES, 0)?; - - // Calculate byte offsets for different sections with overflow protection - let field_ids_start_byte = NUM_HEADER_BYTES - .checked_add(num_elements_size as usize) - .ok_or_else(|| overflow_error("offset of variant object field ids"))?; - - let field_offsets_start_byte = num_elements - .checked_mul(header.field_id_size as usize) - .and_then(|n| n.checked_add(field_ids_start_byte)) + let num_elements = + header + .num_elements_size + .unpack_usize_at_offset(value, NUM_HEADER_BYTES, 0)?; + + // Calculate byte offsets for field offsets and values with overflow protection, and verify + // they're in bounds + let first_field_offset_byte = num_elements + .checked_mul(header.field_id_size()) + .and_then(|n| n.checked_add(header.field_ids_start_byte())) .ok_or_else(|| overflow_error("offset of variant object field offsets"))?; - let values_start_byte = num_elements + let first_value_byte = num_elements .checked_add(1) - .and_then(|n| n.checked_mul(header.field_offset_size as usize)) - .and_then(|n| n.checked_add(field_offsets_start_byte)) + .and_then(|n| n.checked_mul(header.field_offset_size())) + .and_then(|n| n.checked_add(first_field_offset_byte)) .ok_or_else(|| overflow_error("offset of variant object field values"))?; - // Spec says: "The last field_offset points to the byte after the end of the last value" - // - // Use the last offset as a bounds check. The iterator check below doesn't use it -- offsets - // are not monotonic -- so we have to check separately here. - let end_offset = header - .field_offset_size - .unpack_usize(value, field_offsets_start_byte, num_elements)? - .checked_add(values_start_byte) - .ok_or_else(|| overflow_error("end of variant object field values"))?; - if end_offset > value.len() { - return Err(ArrowError::InvalidArgumentError(format!( - "Last field offset value {} is outside the value slice of length {}", - end_offset, - value.len() - ))); - } - - let new_self = Self { + let mut new_self = Self { metadata, value, header, num_elements, - field_ids_start_byte, - field_offsets_start_byte, - values_start_byte, + first_field_offset_byte, + first_value_byte, + validated: false, }; - // Iterate over all fields of this object in order to validate the field_id and field_offset - // arrays, and also to prove the field values are all in bounds. Otherwise, `iter` might - // panic on `unwrap`. - validate_fallible_iterator(new_self.iter_checked())?; + // Spec says: "The last field_offset points to the byte after the end of the last value" + // + // Use it to upper-bound the value bytes, which also verifies that the field id and field + // offset arrays are in bounds. + let last_offset = new_self + .get_offset(num_elements)? + .checked_add(first_value_byte) + .ok_or_else(|| overflow_error("variant object size"))?; + new_self.value = slice_from_slice(value, ..last_offset)?; Ok(new_self) } + /// True if this instance is fully [validated] for panic-free infallible accesses. + /// + /// [validated]: Self#Validation + pub fn is_validated(&self) -> bool { + self.validated + } + + /// Performs a full [validation] of this variant object. + /// + /// [validation]: Self#Validation + pub fn validate(mut self) -> Result { + if !self.validated { + // Validate the metadata dictionary, if not already validated. + self.metadata = self.metadata.validate()?; + + // Iterate over all string keys in this dictionary in order to prove that the offset + // array is valid, all offsets are in bounds, and all string bytes are valid utf-8. + validate_fallible_iterator(self.iter_try_impl())?; + self.validated = true; + } + Ok(self) + } + /// Returns the number of key-value pairs in this object pub fn len(&self) -> usize { self.num_elements @@ -157,57 +237,68 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// field IDs). The latter can only happen when working with an unvalidated object produced by /// [`Self::new`]. pub fn field(&self, i: usize) -> Option> { - Some( - self.try_field(i) - .expect("validation error after construction"), - ) + (i < self.len()).then(|| self.try_field_impl(i).expect("Invalid object field value")) } /// Fallible version of `field`. Returns field value by index, capturing validation errors - fn try_field(&self, i: usize) -> Result, ArrowError> { - let start_offset = self.header.field_offset_size.unpack_usize( - self.value, - self.field_offsets_start_byte, - i, - )?; - let value_start = self - .values_start_byte - .checked_add(start_offset) - .ok_or_else(|| overflow_error("offset of variant object field"))?; - let value_bytes = slice_from_slice(self.value, value_start..)?; + pub fn try_field(&self, i: usize) -> Result, ArrowError> { + self.try_field_impl(i)?.validate() + } + + // Attempts to retrieve the ith field value from the value region of the byte buffer; it + // performs only basic (constant-cost) validation. + fn try_field_impl(&self, i: usize) -> Result, ArrowError> { + let value_bytes = slice_from_slice(self.value, self.first_value_byte..)?; + let value_bytes = slice_from_slice(value_bytes, self.get_offset(i)?..)?; Variant::try_new_with_metadata(self.metadata, value_bytes) } + // Attempts to retrieve the ith offset from the field offset region of the byte buffer. + fn get_offset(&self, i: usize) -> Result { + let byte_range = self.first_field_offset_byte..self.first_value_byte; + let field_offsets = slice_from_slice(self.value, byte_range)?; + self.header.field_offset_size.unpack_usize(field_offsets, i) + } + /// Get a field's name by index in `0..self.len()` /// /// # Panics /// If the variant object is corrupted (e.g., invalid offsets or field IDs). /// This should never happen since the constructor validates all data upfront. pub fn field_name(&self, i: usize) -> Option<&'m str> { - Some( + (i < self.len()).then(|| { self.try_field_name(i) - .expect("validation error after construction"), - ) + .expect("Invalid variant object field name") + }) } /// Fallible version of `field_name`. Returns field name by index, capturing validation errors fn try_field_name(&self, i: usize) -> Result<&'m str, ArrowError> { - let field_id = - self.header - .field_id_size - .unpack_usize(self.value, self.field_ids_start_byte, i)?; + let byte_range = self.header.field_ids_start_byte()..self.first_field_offset_byte; + let field_id_bytes = slice_from_slice(self.value, byte_range)?; + let field_id = self.header.field_id_size.unpack_usize(field_id_bytes, i)?; self.metadata.get(field_id) } /// Returns an iterator of (name, value) pairs over the fields of this object. pub fn iter(&self) -> impl Iterator)> + '_ { - // NOTE: It is safe to unwrap because the constructor already made a successful traversal. - self.iter_checked().map(Result::unwrap) + self.iter_try_impl() + .map(|result| result.expect("Invalid variant object field value")) + } + + /// Fallible iteration over the fields of this object. + pub fn iter_try( + &self, + ) -> impl Iterator), ArrowError>> + '_ { + self.iter_try_impl().map(|result| { + let (name, value) = result?; + Ok((name, value.validate()?)) + }) } - // Fallible iteration over the fields of this object. The constructor traverses the iterator to - // prove it has no errors, so that all other use sites can blindly `unwrap` the result. - fn iter_checked( + // Fallible iteration over the fields of this object that performs only shallow (constant-cost) + // validation of field values. + fn iter_try_impl( &self, ) -> impl Iterator), ArrowError>> + '_ { (0..self.num_elements).map(move |i| Ok((self.try_field_name(i)?, self.try_field(i)?))) From 780a7f41bdd358c419a8cd9affb778cdcd1b69c6 Mon Sep 17 00:00:00 2001 From: Ryan Johnson Date: Mon, 30 Jun 2025 16:38:43 -0700 Subject: [PATCH 3/3] address self-review comments --- parquet-variant/src/variant.rs | 16 ++++++++-------- parquet-variant/src/variant/list.rs | 10 ++++++++-- parquet-variant/src/variant/object.rs | 7 +++---- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/parquet-variant/src/variant.rs b/parquet-variant/src/variant.rs index ead4cc41deb5..19e731832668 100644 --- a/parquet-variant/src/variant.rs +++ b/parquet-variant/src/variant.rs @@ -185,14 +185,14 @@ impl Deref for ShortString<'_> { /// /// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`] /// are fully _validated_. They always contain _valid_ data, and infallible accesses such as -/// iteration and indexing are panic-free. The validation cost is linear in `O(m + v)` where `m` and -/// `v` are the number of bytes in the metadata and value buffers. +/// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and +/// `v` are the number of bytes in the metadata and value buffers, respectively. /// -/// Instances produced by [`Self::new_with_metadata`] are _unvalidated_ and so they may contain -/// either _valid_ or _invalid_ data. Infallible accesses to variant objects and arrays, such as -/// iteration and indexing will panic if the underlying bytes are _invalid_, and fallible -/// alternatives are provided as panic-free alternatives. [`Self::validate`] can also be used to -/// _validate_ an _unvalidated_ instance, if desired. +/// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so +/// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and +/// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and +/// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be +/// used to _validate_ an _unvalidated_ instance, if desired. /// /// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller /// knows the underlying bytes were already validated previously, or if the caller intends to @@ -410,7 +410,7 @@ impl<'m, 'v> Variant<'m, 'v> { /// where `m` and `v` are the sizes of metadata and value buffers, respectively. /// /// [objects]: VariantObject#Validation - /// [arrays]: VariantArray#Validation + /// [arrays]: VariantList#Validation pub fn validate(self) -> Result { use Variant::*; match self { diff --git a/parquet-variant/src/variant/list.rs b/parquet-variant/src/variant/list.rs index 7f0c7c1701af..00935016e133 100644 --- a/parquet-variant/src/variant/list.rs +++ b/parquet-variant/src/variant/list.rs @@ -205,7 +205,8 @@ impl<'m, 'v> VariantList<'m, 'v> { /// [validation]: Self#Validation pub fn validate(mut self) -> Result { if !self.validated { - // Validate the metadata dictionary, if not already validated. + // Validate the metadata dictionary first, if not already validated, because we pass it + // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.validate()?; // Iterate over all string keys in this dictionary in order to prove that the offset @@ -244,6 +245,11 @@ impl<'m, 'v> VariantList<'m, 'v> { /// Fallible iteration over the elements of this list. pub fn iter_try(&self) -> impl Iterator, ArrowError>> + '_ { + self.iter_try_impl().map(|result| result?.validate()) + } + + // Fallible iteration that only performs basic (constant-time) validation. + fn iter_try_impl(&self) -> impl Iterator, ArrowError>> + '_ { (0..self.len()).map(move |i| self.try_get_impl(i)) } @@ -252,7 +258,7 @@ impl<'m, 'v> VariantList<'m, 'v> { /// /// [unvalidated]: Self#Validation pub fn iter(&self) -> impl Iterator> + '_ { - self.iter_try() + self.iter_try_impl() .map(|result| result.expect("Invalid variant list entry")) } diff --git a/parquet-variant/src/variant/object.rs b/parquet-variant/src/variant/object.rs index e9f3c263e197..dacd352069df 100644 --- a/parquet-variant/src/variant/object.rs +++ b/parquet-variant/src/variant/object.rs @@ -139,9 +139,7 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// particular, that all field ids exist in `metadata`, and all offsets are in-bounds and point /// to valid objects. pub fn try_new(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Result { - let mut new_self = Self::try_new_impl(metadata, value)?.validate()?; - new_self.validated = true; - Ok(new_self) + Self::try_new_impl(metadata, value)?.validate() } /// Attempts to interpet `metadata` and `value` as a variant object, performing only basic @@ -208,7 +206,8 @@ impl<'m, 'v> VariantObject<'m, 'v> { /// [validation]: Self#Validation pub fn validate(mut self) -> Result { if !self.validated { - // Validate the metadata dictionary, if not already validated. + // Validate the metadata dictionary first, if not already validated, because we pass it + // by value to all the children (who would otherwise re-validate it repeatedly). self.metadata = self.metadata.validate()?; // Iterate over all string keys in this dictionary in order to prove that the offset