Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 47 additions & 31 deletions parquet-variant/src/decoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::utils::{array_from_slice, slice_from_slice_at_offset, string_from_slice};
use crate::utils::{
array_from_slice, overflow_error, slice_from_slice_at_offset, string_from_slice,
};
use crate::ShortString;

use arrow_schema::ArrowError;
Expand Down Expand Up @@ -132,23 +134,41 @@ impl OffsetSizeBytes {

/// Return one unsigned little-endian value from `bytes`.
///
/// * `bytes` – the Variant-metadata buffer.
/// * `bytes` – the byte buffer to index
/// * `index` – 0-based index into the buffer
///
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
/// Three-byte values are zero-extended to 32 bits before the final
/// fallible cast to `usize`.
pub(crate) fn unpack_usize(&self, bytes: &[u8], index: usize) -> Result<usize, ArrowError> {
self.unpack_usize_at_offset(bytes, 0, index)
Comment on lines +143 to +144
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It turns out that a bunch of slice accesses were not correctly bounded; once I fixed that, most unpack_usize calls were passing offset 0. So I renamed the original method as unpack_usize_at_offset, and created a new 2-arg unpack_usize that doesn't take an offset.

}

/// Return one unsigned little-endian value from `bytes`.
///
/// * `bytes` – the byte buffer to index
/// * `byte_offset` – number of bytes to skip **before** reading the first
/// value (usually `1` to move past the header byte).
/// * `offset_index` – 0-based index **after** the skip
/// value (e.g. `1` to move past a header byte).
/// * `offset_index` – 0-based index **after** the skipped bytes
/// (`0` is the first value, `1` the next, …).
///
/// Each value is `self as usize` bytes wide (1, 2, 3 or 4).
/// Three-byte values are zero-extended to 32 bits before the final
/// fallible cast to `usize`.
pub(crate) fn unpack_usize(
pub(crate) fn unpack_usize_at_offset(
&self,
bytes: &[u8],
byte_offset: usize, // how many bytes to skip
offset_index: usize, // which offset in an array of offsets
) -> Result<usize, ArrowError> {
use OffsetSizeBytes::*;
let offset = byte_offset + (*self as usize) * offset_index;

// Index into the byte array:
// byte_offset + (*self as usize) * offset_index
let offset = offset_index
.checked_mul(*self as usize)
.and_then(|n| n.checked_add(byte_offset))
Comment on lines +168 to +170
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was unchecked arithmetic that risked overflow panic on 32-bit arch.

.ok_or_else(|| overflow_error("unpacking offset array value"))?;
let result = match self {
One => u8::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Two => u16::from_le_bytes(array_from_slice(bytes, offset)?).into(),
Expand All @@ -159,14 +179,14 @@ impl OffsetSizeBytes {
let mut buf = [0u8; 4];
buf[..3].copy_from_slice(&b3_chunks);
u32::from_le_bytes(buf)
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?
}
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?)
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))?,
Four => u32::from_le_bytes(array_from_slice(bytes, offset)?),
};
Ok(result)

// Convert the u32 we extracted to usize (should always succeed on 32- and 64-bit arch)
result
.try_into()
.map_err(|e: TryFromIntError| ArrowError::InvalidArgumentError(e.to_string()))
Comment on lines +187 to +189
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Factored out the u32 -> usize conversion to reduce redundancy.

}
}

Expand Down Expand Up @@ -453,57 +473,53 @@ mod tests {
// One-byte offsets
let buf_one = [0x01u8, 0xAB, 0xCD];
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 0).unwrap(),
OffsetSizeBytes::One.unpack_usize(&buf_one, 0).unwrap(),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these tests are much more readable now

0x01
);
assert_eq!(
OffsetSizeBytes::One.unpack_usize(&buf_one, 0, 2).unwrap(),
OffsetSizeBytes::One.unpack_usize(&buf_one, 2).unwrap(),
0xCD
);

// Two-byte offsets (little-endian 0x1234, 0x5678)
let buf_two = [0x34, 0x12, 0x78, 0x56];
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 0).unwrap(),
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0).unwrap(),
0x1234
);
assert_eq!(
OffsetSizeBytes::Two.unpack_usize(&buf_two, 0, 1).unwrap(),
OffsetSizeBytes::Two.unpack_usize(&buf_two, 1).unwrap(),
0x5678
);

// Three-byte offsets (0x030201 and 0x0000FF)
let buf_three = [0x01, 0x02, 0x03, 0xFF, 0x00, 0x00];
assert_eq!(
OffsetSizeBytes::Three
.unpack_usize(&buf_three, 0, 0)
.unwrap(),
OffsetSizeBytes::Three.unpack_usize(&buf_three, 0).unwrap(),
0x030201
);
assert_eq!(
OffsetSizeBytes::Three
.unpack_usize(&buf_three, 0, 1)
.unwrap(),
OffsetSizeBytes::Three.unpack_usize(&buf_three, 1).unwrap(),
0x0000FF
);

// Four-byte offsets (0x12345678, 0x90ABCDEF)
let buf_four = [0x78, 0x56, 0x34, 0x12, 0xEF, 0xCD, 0xAB, 0x90];
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 0).unwrap(),
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0).unwrap(),
0x1234_5678
);
assert_eq!(
OffsetSizeBytes::Four.unpack_usize(&buf_four, 0, 1).unwrap(),
OffsetSizeBytes::Four.unpack_usize(&buf_four, 1).unwrap(),
0x90AB_CDEF
);
}

#[test]
fn unpack_usize_out_of_bounds() {
let tiny = [0x00u8]; // deliberately too short
assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0, 0).is_err());
assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0, 0).is_err());
assert!(OffsetSizeBytes::Two.unpack_usize(&tiny, 0).is_err());
assert!(OffsetSizeBytes::Three.unpack_usize(&tiny, 0).is_err());
}

#[test]
Expand All @@ -519,20 +535,20 @@ mod tests {
let width = OffsetSizeBytes::Two;

// dictionary_size starts immediately after the header byte
let dict_size = width.unpack_usize(&buf, 1, 0).unwrap();
let dict_size = width.unpack_usize_at_offset(&buf, 1, 0).unwrap();
assert_eq!(dict_size, 2);

// offset array immediately follows the dictionary size
let first = width.unpack_usize(&buf, 1, 1).unwrap();
let first = width.unpack_usize_at_offset(&buf, 1, 1).unwrap();
assert_eq!(first, 0);

let second = width.unpack_usize(&buf, 1, 2).unwrap();
let second = width.unpack_usize_at_offset(&buf, 1, 2).unwrap();
assert_eq!(second, 5);

let third = width.unpack_usize(&buf, 1, 3).unwrap();
let third = width.unpack_usize_at_offset(&buf, 1, 3).unwrap();
assert_eq!(third, 9);

let err = width.unpack_usize(&buf, 1, 4);
let err = width.unpack_usize_at_offset(&buf, 1, 4);
assert!(err.is_err())
}
}
126 changes: 118 additions & 8 deletions parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ impl Deref for ShortString<'_> {
/// // parse the header metadata
/// assert_eq!(
/// Variant::from("HI"),
/// Variant::try_new(&metadata, &value).unwrap()
/// Variant::new(&metadata, &value)
/// );
/// ```
///
Expand All @@ -177,6 +177,38 @@ impl Deref for ShortString<'_> {
/// _ => println!("Other variant"),
/// }
/// ```
///
/// # Validation
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👨‍🍳 👌 -- very nice

///
/// Every instance of variant is either _valid_ or _invalid_. depending on whether the
/// underlying bytes are a valid encoding of a variant value (see below).
///
/// Instances produced by [`Self::try_new`], [`Self::try_new_with_metadata`], or [`Self::validate`]
/// are fully _validated_. They always contain _valid_ data, and infallible accesses such as
/// iteration and indexing are panic-free. The validation cost is `O(m + v)` where `m` and
/// `v` are the number of bytes in the metadata and value buffers, respectively.
///
/// Instances produced by [`Self::new`] and [`Self::new_with_metadata`] are _unvalidated_ and so
/// they may contain either _valid_ or _invalid_ data. Infallible accesses to variant objects and
/// arrays, such as iteration and indexing will panic if the underlying bytes are _invalid_, and
/// fallible alternatives are provided as panic-free alternatives. [`Self::validate`] can also be
/// used to _validate_ an _unvalidated_ instance, if desired.
///
/// _Unvalidated_ instances can be constructed in constant time. This can be useful if the caller
/// knows the underlying bytes were already validated previously, or if the caller intends to
/// perform a small number of (fallible) accesses to a large variant value.
///
/// A _validated_ variant value guarantees that the associated [metadata] and all nested [object]
/// and [array] values are _valid_. Primitive variant subtypes are always _valid_ by construction.
///
/// # Safety
///
/// Even an _invalid_ variant value is still _safe_ to use in the Rust sense. Accessing it with
/// infallible methods may cause panics but will never lead to undefined behavior.
///
/// [metadata]: VariantMetadata#Validation
/// [object]: VariantObject#Validation
/// [array]: VariantList#Validation
#[derive(Clone, Debug, PartialEq)]
pub enum Variant<'m, 'v> {
/// Primitive type: Null
Expand Down Expand Up @@ -224,7 +256,9 @@ pub enum Variant<'m, 'v> {
}

impl<'m, 'v> Variant<'m, 'v> {
/// Create a new `Variant` from metadata and value.
/// Attempts to interpret a metadata and value buffer pair as a new `Variant`.
///
/// The instance is fully [validated].
///
/// # Example
/// ```
Expand All @@ -237,28 +271,71 @@ impl<'m, 'v> Variant<'m, 'v> {
/// Variant::try_new(&metadata, &value).unwrap()
/// );
/// ```
///
/// [validated]: Self#Validation
pub fn try_new(metadata: &'m [u8], value: &'v [u8]) -> Result<Self, ArrowError> {
let metadata = VariantMetadata::try_new(metadata)?;
Self::try_new_with_metadata(metadata, value)
}

/// Create a new variant with existing metadata
/// Attempts to interpret a metadata and value buffer pair as a new `Variant`.
///
/// The instance is [unvalidated].
///
/// # Example
/// ```
/// use parquet_variant::{Variant, VariantMetadata};
/// let metadata = [0x01, 0x00, 0x00];
/// let value = [0x09, 0x48, 0x49];
/// // parse the header metadata
/// assert_eq!(
/// Variant::from("HI"),
/// Variant::new(&metadata, &value)
/// );
/// ```
///
/// [unvalidated]: Self#Validation
pub fn new(metadata: &'m [u8], value: &'v [u8]) -> Self {
let metadata = VariantMetadata::try_new_impl(metadata).expect("Invalid variant metadata");
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant data")
}

/// Create a new variant with existing metadata.
///
/// The instance is fully [validated].
///
/// # Example
/// ```
/// # use parquet_variant::{Variant, VariantMetadata};
/// let metadata = [0x01, 0x00, 0x00];
/// let value = [0x09, 0x48, 0x49];
/// // parse the header metadata first
/// let metadata = VariantMetadata::try_new(&metadata).unwrap();
/// let metadata = VariantMetadata::new(&metadata);
/// assert_eq!(
/// Variant::from("HI"),
/// Variant::try_new_with_metadata(metadata, &value).unwrap()
/// );
/// ```
///
/// [validated]: Self#Validation
pub fn try_new_with_metadata(
metadata: VariantMetadata<'m>,
value: &'v [u8],
) -> Result<Self, ArrowError> {
Self::try_new_with_metadata_impl(metadata, value)?.validate()
}

/// Similar to [`Self::try_new_with_metadata`], but [unvalidated].
///
/// [unvalidated]: Self#Validation
pub fn new_with_metadata(metadata: VariantMetadata<'m>, value: &'v [u8]) -> Self {
Self::try_new_with_metadata_impl(metadata, value).expect("Invalid variant")
}

// The actual constructor, which only performs shallow (constant-time) validation.
fn try_new_with_metadata_impl(
metadata: VariantMetadata<'m>,
value: &'v [u8],
) -> Result<Self, ArrowError> {
let value_metadata = first_byte_from_slice(value)?;
let value_data = slice_from_slice(value, 1..)?;
Expand Down Expand Up @@ -304,12 +381,45 @@ impl<'m, 'v> Variant<'m, 'v> {
VariantBasicType::ShortString => {
Variant::ShortString(decoder::decode_short_string(value_metadata, value_data)?)
}
VariantBasicType::Object => Variant::Object(VariantObject::try_new(metadata, value)?),
VariantBasicType::Array => Variant::List(VariantList::try_new(metadata, value)?),
VariantBasicType::Object => {
Variant::Object(VariantObject::try_new_impl(metadata, value)?)
}
VariantBasicType::Array => Variant::List(VariantList::try_new_impl(metadata, value)?),
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By directly invoking try_new_impl methods, we avoid recursive validation of array elements and object fields. Unfortunately this requires making them pub(crate), when I would have preferred to keep them private.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes sense to me and I don't really have any better solution

};
Ok(new_self)
}

/// True if this variant instance has already been [validated].
///
/// [validated]: Self#Validation
pub fn is_validated(&self) -> bool {
match self {
Variant::List(list) => list.is_validated(),
Variant::Object(obj) => obj.is_validated(),
_ => true,
}
}

/// Recursively validates this variant value, ensuring that infallible access will not panic due
/// to invalid bytes.
///
/// Variant leaf values are always valid by construction, but [objects] and [arrays] can be
/// constructed in unvalidated (and potentially invalid) state.
///
/// If [`Self::is_validated`] is true, validation is a no-op. Otherwise, the cost is `O(m + v)`
/// where `m` and `v` are the sizes of metadata and value buffers, respectively.
///
/// [objects]: VariantObject#Validation
/// [arrays]: VariantList#Validation
pub fn validate(self) -> Result<Self, ArrowError> {
use Variant::*;
match self {
List(list) => list.validate().map(List),
Object(obj) => obj.validate().map(Object),
_ => Ok(self),
}
}

/// Converts this variant to `()` if it is null.
///
/// Returns `Some(())` for null variants,
Expand Down Expand Up @@ -827,7 +937,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// # builder.finish()
/// # };
/// // object that is {"name": "John"}
/// let variant = Variant::try_new(&metadata, &value).unwrap();
/// let variant = Variant::new(&metadata, &value);
/// // use the `as_object` method to access the object
/// let obj = variant.as_object().expect("variant should be an object");
/// assert_eq!(obj.get("name"), Some(Variant::from("John")));
Expand Down Expand Up @@ -857,7 +967,7 @@ impl<'m, 'v> Variant<'m, 'v> {
/// # builder.finish()
/// # };
/// // list that is ["John", "Doe"]
/// let variant = Variant::try_new(&metadata, &value).unwrap();
/// let variant = Variant::new(&metadata, &value);
/// // use the `as_list` method to access the list
/// let list = variant.as_list().expect("variant should be a list");
/// assert_eq!(list.len(), 2);
Expand Down
Loading
Loading