-
Notifications
You must be signed in to change notification settings - Fork 1.2k
[VARIANT] Support both fallible and infallible access to variants #7807
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,7 +14,9 @@ | |
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
| use crate::utils::{array_from_slice, slice_from_slice_at_offset, string_from_slice}; | ||
| use crate::utils::{ | ||
| array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice, | ||
| }; | ||
| use crate::ShortString; | ||
|
|
||
| use arrow_schema::ArrowError; | ||
|
|
@@ -132,23 +134,41 @@ impl OffsetSizeBytes { | |
|
|
||
| /// Return one unsigned little-endian value from `bytes`. | ||
| /// | ||
| /// * `bytes` – the Variant-metadata buffer. | ||
| /// * `bytes` – the byte buffer to index | ||
| /// * `index` – 0-based index into the buffer | ||
| /// | ||
| /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). | ||
| /// Three-byte values are zero-extended to 32 bits before the final | ||
| /// fallible cast to `usize`. | ||
| pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> Result<usize, ArrowError> { | ||
| self.unpack_usize_at_offset(bytes, 0, index) | ||
| } | ||
|
|
||
| /// Return one unsigned little-endian value from `bytes`. | ||
| /// | ||
| /// * `bytes` – the byte buffer to index | ||
| /// * `byte_offset` – number of bytes to skip **before** reading the first | ||
| /// value (usually `1` to move past the header byte). | ||
| /// * `offset_index` – 0-based index **after** the skip | ||
| /// value (e.g. `1` to move past a header byte). | ||
| /// * `offset_index` – 0-based index **after** the skipped bytes | ||
| /// (`0` is the first value, `1` the next, …). | ||
| /// | ||
| /// Each value is `self as usize` bytes wide (1, 2, 3 or 4). | ||
| /// Three-byte values are zero-extended to 32 bits before the final | ||
| /// fallible cast to `usize`. | ||
| pub(crate) fn unpack_usize( | ||
| pub(crate) fn unpack_usize_at_offset( | ||
| &self, | ||
| bytes: &[u8], | ||
| byte_offset: usize, // how many bytes to skip | ||
| offset_index: usize, // which offset in an array of offsets | ||
| ) -> Result<usize, ArrowError> { | ||
| use OffsetSizeBytes::*; | ||
| let offset = byte_offset + (*self as usize) * offset_index; | ||
|
|
||
| // Index into the byte array: | ||
| // byte_offset + (*self as usize) * offset_index | ||
| let offset = offset_index | ||
| .checked_mul(*self as usize) | ||
| .and_then(|n| n.checked_add(byte_offset)) | ||
|
Comment on lines
+168
to
+170
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was unchecked arithmetic that risked overflow panic on 32-bit arch. |
||
| .ok_or_else(|| overflow_error("unpacking offset array value"))?; | ||
| let result = match self { | ||
| One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(), | ||
| Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(), | ||
|
|
@@ -159,14 +179,14 @@ impl OffsetSizeBytes { | |
| let mut buf = [0u8; 4]; | ||
| buf[..3].copy_from_slice(&b3_chunks); | ||
| u32::from_le_bytes(buf) | ||
| .try_into() | ||
| .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))? | ||
| } | ||
| Four => u32::from_le_bytes(array_from_slice(bytes, offset)?) | ||
| .try_into() | ||
| .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?, | ||
| Four => u32::from_le_bytes(array_from_slice(bytes, offset)?), | ||
| }; | ||
| Ok(result) | ||
|
|
||
| // Convert the u32 we extracted to usize (should always succeed on 32- and 64-bit arch) | ||
| result | ||
| .try_into() | ||
| .map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string())) | ||
|
Comment on lines
+187
to
+189
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Factored out the u32 -> usize conversion to reduce redundancy. |
||
| } | ||
| } | ||
|
|
||
|
|
@@ -453,57 +473,53 @@ mod tests { | |
| // One-byte offsets | ||
| let buf_one = [0x01u8, 0xAB, 0xCD]; | ||
| assert_eq!( | ||
| OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(), | ||
| OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think these tests are much more readable now |
||
| 0x01 | ||
| ); | ||
| assert_eq!( | ||
| OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 2).unwrap(), | ||
| OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(), | ||
| 0xCD | ||
| ); | ||
|
|
||
| // Two-byte offsets (little-endian 0x1234, 0x5678) | ||
| let buf_two = [0x34, 0x12, 0x78, 0x56]; | ||
| assert_eq!( | ||
| OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 0).unwrap(), | ||
| OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(), | ||
| 0x1234 | ||
| ); | ||
| assert_eq!( | ||
| OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 1).unwrap(), | ||
| OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(), | ||
| 0x5678 | ||
| ); | ||
|
|
||
| // Three-byte offsets (0x030201 and 0x0000FF) | ||
| let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00]; | ||
| assert_eq!( | ||
| OffsetSizeBytes::Three | ||
| .unpack_usize(&buf_three, 0, 0) | ||
| .unwrap(), | ||
| OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(), | ||
| 0x030201 | ||
| ); | ||
| assert_eq!( | ||
| OffsetSizeBytes::Three | ||
| .unpack_usize(&buf_three, 0, 1) | ||
| .unwrap(), | ||
| OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(), | ||
| 0x0000FF | ||
| ); | ||
|
|
||
| // Four-byte offsets (0x12345678, 0x90ABCDEF) | ||
| let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90]; | ||
| assert_eq!( | ||
| OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 0).unwrap(), | ||
| OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(), | ||
| 0x1234_5678 | ||
| ); | ||
| assert_eq!( | ||
| OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 1).unwrap(), | ||
| OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(), | ||
| 0x90AB_CDEF | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn unpack_usize_out_of_bounds() { | ||
| let tiny = [0x00u8]; // deliberately too short | ||
| assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0, 0).is_err()); | ||
| assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0, 0).is_err()); | ||
| assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err()); | ||
| assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err()); | ||
| } | ||
|
|
||
| #[test] | ||
|
|
@@ -519,20 +535,20 @@ mod tests { | |
| let width = OffsetSizeBytes::Two; | ||
|
|
||
| // dictionary_size starts immediately after the header byte | ||
| let dict_size = width.unpack_usize(&buf, 1, 0).unwrap(); | ||
| let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap(); | ||
| assert_eq!(dict_size, 2); | ||
|
|
||
| // offset array immediately follows the dictionary size | ||
| let first = width.unpack_usize(&buf, 1, 1).unwrap(); | ||
| let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap(); | ||
| assert_eq!(first, 0); | ||
|
|
||
| let second = width.unpack_usize(&buf, 1, 2).unwrap(); | ||
| let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap(); | ||
| assert_eq!(second, 5); | ||
|
|
||
| let third = width.unpack_usize(&buf, 1, 3).unwrap(); | ||
| let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap(); | ||
| assert_eq!(third, 9); | ||
|
|
||
| let err = width.unpack_usize(&buf, 1, 4); | ||
| let err = width.unpack_usize_at_offset(&buf, 1, 4); | ||
| assert!(err.is_err()) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -162,7 +162,7 @@ impl Deref for ShortString<'_> { | |
| /// // parse the header metadata | ||
| /// assert_eq!( | ||
| /// Variant::from("HI"), | ||
| /// Variant::try_new(&metadata, &value).unwrap() | ||
| /// Variant::new(&metadata, &value) | ||
| /// ); | ||
| /// ``` | ||
| /// | ||
|
|
@@ -177,6 +177,38 @@ impl Deref for ShortString<'_> { | |
| /// _ => println!("Other variant"), | ||
| /// } | ||
| /// ``` | ||
| /// | ||
| /// # Validation | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👨🍳 👌 -- very nice |
||
| /// | ||
| /// Every instance of variant is either _valid_ or _invalid_. depending on whether the | ||
| /// underlying bytes are a valid encoding of a variant value (see below). | ||
| /// | ||
| /// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`] | ||
| /// are fully _validated_. They always contain _valid_ data, and infallible accesses such as | ||
| /// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and | ||
| /// `v` are the number of bytes in the metadata and value buffers, respectively. | ||
| /// | ||
| /// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so | ||
| /// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and | ||
| /// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and | ||
| /// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be | ||
| /// used to _validate_ an _unvalidated_ instance, if desired. | ||
| /// | ||
| /// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller | ||
| /// knows the underlying bytes were already validated previously, or if the caller intends to | ||
| /// perform a small number of (fallible) accesses to a large variant value. | ||
| /// | ||
| /// A _validated_ variant value guarantees that the associated [metadata] and all nested [object] | ||
| /// and [array] values are _valid_. Primitive variant subtypes are always _valid_ by construction. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// Even an _invalid_ variant value is still _safe_ to use in the Rust sense. Accessing it with | ||
| /// infallible methods may cause panics but will never lead to undefined behavior. | ||
| /// | ||
| /// [metadata]: VariantMetadata#Validation | ||
| /// [object]: VariantObject#Validation | ||
| /// [array]: VariantList#Validation | ||
| #[derive(Clone, Debug, PartialEq)] | ||
| pub enum Variant<'m, 'v> { | ||
| /// Primitive type: Null | ||
|
|
@@ -224,7 +256,9 @@ pub enum Variant<'m, 'v> { | |
| } | ||
|
|
||
| impl<'m, 'v> Variant<'m, 'v> { | ||
| /// Create a new `Variant` from metadata and value. | ||
| /// Attempts to interpret a metadata and value buffer pair as a new `Variant`. | ||
| /// | ||
| /// The instance is fully [validated]. | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
|
|
@@ -237,28 +271,71 @@ impl<'m, 'v> Variant<'m, 'v> { | |
| /// Variant::try_new(&metadata, &value).unwrap() | ||
| /// ); | ||
| /// ``` | ||
| /// | ||
| /// [validated]: Self#Validation | ||
| pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, ArrowError> { | ||
| let metadata = VariantMetadata::try_new(metadata)?; | ||
| Self::try_new_with_metadata(metadata, value) | ||
| } | ||
|
|
||
| /// Create a new variant with existing metadata | ||
| /// Attempts to interpret a metadata and value buffer pair as a new `Variant`. | ||
| /// | ||
| /// The instance is [unvalidated]. | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
| /// use parquet_variant::{Variant, VariantMetadata}; | ||
| /// let metadata = [0x01, 0x00, 0x00]; | ||
| /// let value = [0x09, 0x48, 0x49]; | ||
| /// // parse the header metadata | ||
| /// assert_eq!( | ||
| /// Variant::from("HI"), | ||
| /// Variant::new(&metadata, &value) | ||
| /// ); | ||
| /// ``` | ||
| /// | ||
| /// [unvalidated]: Self#Validation | ||
| pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self { | ||
| let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata"); | ||
| Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data") | ||
| } | ||
|
|
||
| /// Create a new variant with existing metadata. | ||
| /// | ||
| /// The instance is fully [validated]. | ||
| /// | ||
| /// # Example | ||
| /// ``` | ||
| /// # use parquet_variant::{Variant, VariantMetadata}; | ||
| /// let metadata = [0x01, 0x00, 0x00]; | ||
| /// let value = [0x09, 0x48, 0x49]; | ||
| /// // parse the header metadata first | ||
| /// let metadata = VariantMetadata::try_new(&metadata).unwrap(); | ||
| /// let metadata = VariantMetadata::new(&metadata); | ||
| /// assert_eq!( | ||
| /// Variant::from("HI"), | ||
| /// Variant::try_new_with_metadata(metadata, &value).unwrap() | ||
| /// ); | ||
| /// ``` | ||
| /// | ||
| /// [validated]: Self#Validation | ||
| pub fn try_new_with_metadata( | ||
| metadata: VariantMetadata<'m>, | ||
| value: &'v [u8], | ||
| ) -> Result<Self, ArrowError> { | ||
| Self::try_new_with_metadata_impl(metadata, value)?.validate() | ||
| } | ||
|
|
||
| /// Similar to [`Self::try_new_with_metadata`], but [unvalidated]. | ||
| /// | ||
| /// [unvalidated]: Self#Validation | ||
| pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self { | ||
| Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant") | ||
| } | ||
|
|
||
| // The actual constructor, which only performs shallow (constant-time) validation. | ||
| fn try_new_with_metadata_impl( | ||
| metadata: VariantMetadata<'m>, | ||
| value: &'v [u8], | ||
| ) -> Result<Self, ArrowError> { | ||
| let value_metadata = first_byte_from_slice(value)?; | ||
| let value_data = slice_from_slice(value, 1..)?; | ||
|
|
@@ -304,12 +381,45 @@ impl<'m, 'v> Variant<'m, 'v> { | |
| VariantBasicType::ShortString => { | ||
| Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?) | ||
| } | ||
| VariantBasicType::Object => Variant::Object(VariantObject::try_new(metadata, value)?), | ||
| VariantBasicType::Array => Variant::List(VariantList::try_new(metadata, value)?), | ||
| VariantBasicType::Object => { | ||
| Variant::Object(VariantObject::try_new_impl(metadata, value)?) | ||
| } | ||
| VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?), | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By directly invoking
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It makes sense to me and I don't really have any better solution |
||
| }; | ||
| Ok(new_self) | ||
| } | ||
|
|
||
| /// True if this variant instance has already been [validated]. | ||
| /// | ||
| /// [validated]: Self#Validation | ||
| pub fn is_validated(&self) -> bool { | ||
| match self { | ||
| Variant::List(list) => list.is_validated(), | ||
| Variant::Object(obj) => obj.is_validated(), | ||
| _ => true, | ||
| } | ||
| } | ||
|
|
||
| /// Recursively validates this variant value, ensuring that infallible access will not panic due | ||
| /// to invalid bytes. | ||
| /// | ||
| /// Variant leaf values are always valid by construction, but [objects] and [arrays] can be | ||
| /// constructed in unvalidated (and potentially invalid) state. | ||
| /// | ||
| /// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)` | ||
| /// where `m` and `v` are the sizes of metadata and value buffers, respectively. | ||
| /// | ||
| /// [objects]: VariantObject#Validation | ||
| /// [arrays]: VariantList#Validation | ||
| pub fn validate(self) -> Result<Self, ArrowError> { | ||
| use Variant::*; | ||
| match self { | ||
| List(list) => list.validate().map(List), | ||
| Object(obj) => obj.validate().map(Object), | ||
| _ => Ok(self), | ||
| } | ||
| } | ||
|
|
||
| /// Converts this variant to `()` if it is null. | ||
| /// | ||
| /// Returns `Some(())` for null variants, | ||
|
|
@@ -827,7 +937,7 @@ impl<'m, 'v> Variant<'m, 'v> { | |
| /// # builder.finish() | ||
| /// # }; | ||
| /// // object that is {"name": "John"} | ||
| /// let variant = Variant::try_new(&metadata, &value).unwrap(); | ||
| /// let variant = Variant::new(&metadata, &value); | ||
| /// // use the `as_object` method to access the object | ||
| /// let obj = variant.as_object().expect("variant should be an object"); | ||
| /// assert_eq!(obj.get("name"), Some(Variant::from("John"))); | ||
|
|
@@ -857,7 +967,7 @@ impl<'m, 'v> Variant<'m, 'v> { | |
| /// # builder.finish() | ||
| /// # }; | ||
| /// // list that is ["John", "Doe"] | ||
| /// let variant = Variant::try_new(&metadata, &value).unwrap(); | ||
| /// let variant = Variant::new(&metadata, &value); | ||
| /// // use the `as_list` method to access the list | ||
| /// let list = variant.as_list().expect("variant should be a list"); | ||
| /// assert_eq!(list.len(), 2); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It turns out that a bunch of slice accesses were not correctly bounded; once I fixed that, most
unpack_usizecalls were passing offset 0. So I renamed the original method asunpack_usize_at_offset, and created a new 2-argunpack_usizethat doesn't take an offset.