Skip to content

Unable to read Dictionary(u8, FixedSizeBinary(_)) using datafusion. #7545

@albertlockett

Description

@albertlockett

Describe the bug
I'm not sure whether this is a bug in parquet or in datafusion. If this is a datafusion bug, I'll close this issue and open one in that repo instead.

If I write a column of type Dictionary(u8, FixedSizeBinary(_)), and try to read it using datafusion, I get the error:

thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")

To Reproduce

use std::sync::Arc;

use arrow::{
    datatypes::{DataType, Field, Schema},
    util::pretty::print_batches,
};
use arrow_array::{FixedSizeBinaryArray, RecordBatch, UInt8Array, UInt8DictionaryArray};
use datafusion::{
    prelude::{ParquetReadOptions, SessionContext},
    sql::TableReference,
};
use object_store::{local::LocalFileSystem, path::Path};
use parquet::{
    arrow::{arrow_reader::ParquetRecordBatchReaderBuilder, async_writer::ParquetObjectWriter, AsyncArrowWriter},
    file::properties::WriterProperties,
};

/// Minimal reproduction: round-trips a `Dictionary(UInt8, FixedSizeBinary(8))`
/// column through Parquet.
///
/// Steps:
/// 1. Build a one-column, three-row batch whose dictionary keys `[0, 0, 1]`
///    index two 8-byte values.
/// 2. Write the batch to `/tmp/test.parquet` with default writer properties.
/// 3. Read the file back with the synchronous parquet reader (this works).
/// 4. Read the file back through a DataFusion SQL query (this does not work).
///
/// Every fallible step is unwrapped on purpose so that the failure surfaces
/// as a panic with a backtrace.
#[tokio::main]
async fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new(
        "a",
        DataType::Dictionary(
            Box::new(DataType::UInt8),
            Box::new(DataType::FixedSizeBinary(8)),
        ),
        true,
    )]));

    let keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
    // Two distinct 8-byte dictionary values: bytes 0..8 and bytes 24..32.
    // A `Range<u8>` is already an iterator, so it can be collected directly.
    let values = FixedSizeBinaryArray::try_from_iter(
        vec![
            (0u8..8u8).collect::<Vec<u8>>(),
            (24u8..32u8).collect::<Vec<u8>>(),
        ]
        .into_iter(),
    )
    .unwrap();
    let arr = UInt8DictionaryArray::new(keys, Arc::new(values));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();

    // write batch to parquet
    let object_store = Arc::new(LocalFileSystem::new_with_prefix("/tmp").unwrap());
    // The store handle is not needed afterwards, so it is moved into the writer.
    let parquet_object_writer = ParquetObjectWriter::new(object_store, Path::from("test.parquet"));
    let mut parquet_writer = AsyncArrowWriter::try_new(
        parquet_object_writer,
        // `schema()` already hands back an owned `SchemaRef`.
        batch.schema(),
        Some(WriterProperties::default()),
    )
    .unwrap();
    parquet_writer.write(&batch).await.unwrap();
    parquet_writer.close().await.unwrap();

    // read directly using parquet (this works)
    let file = std::fs::File::open("/tmp/test.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
    let mut reader = builder.build().unwrap();
    let read_batch = reader.next().unwrap().unwrap();
    print_batches(&[read_batch]).unwrap();

    // read using datafusion (this does not work)
    let ctx = SessionContext::new();
    ctx.register_parquet(
        TableReference::bare("tab"),
        "/tmp/test.parquet",
        ParquetReadOptions::default(),
    )
    .await
    .unwrap();
    let df = ctx.sql("select * from tab").await.unwrap();
    let batches = df.collect().await.unwrap();
    print_batches(&batches).unwrap();
}

Expected behavior
I think I should be able to read the column in this table.

Additional context
Full stack trace:

thread 'main' panicked at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:48:
called `Result::unwrap()` on an `Err` value: InvalidArgumentError("Expected 1 buffers in array of type FixedSizeBinary(8), got 2")
stack backtrace:
   0: rust_begin_unwind
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/std/src/panicking.rs:695:5
   1: core::panicking::panic_fmt
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/panicking.rs:75:14
   2: core::result::unwrap_failed
             at /rustc/05f9846f893b09a1be1fc8560e33fc3c815cfecb/library/core/src/result.rs:1704:5
   3: core::result::Result<T,E>::unwrap
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1109:23
   4: parquet::arrow::buffer::offset_buffer::OffsetBuffer<I>::into_array
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/offset_buffer.rs:133:21
   5: parquet::arrow::buffer::dictionary_buffer::DictionaryBuffer<K,V>::into_array
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/buffer/dictionary_buffer.rs:187:39
   6: <parquet::arrow::array_reader::byte_array_dictionary::ByteArrayDictionaryReader<K,V> as parquet::arrow::array_reader::ArrayReader>::consume_batch
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/byte_array_dictionary.rs:170:21
   7: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:111:27
   8: core::iter::adapters::map::map_try_fold::{{closure}}
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:95:28
   9: core::iter::traits::iterator::Iterator::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2370:21
  10: <core::iter::adapters::map::Map<I,F> as core::iter::traits::iterator::Iterator>::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/map.rs:121:9
  11: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::try_fold
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:191:9
  12: core::iter::traits::iterator::Iterator::try_for_each
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:2431:9
  13: <core::iter::adapters::GenericShunt<I,R> as core::iter::traits::iterator::Iterator>::next
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:174:14
  14: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter_nested::SpecFromIterNested<T,I>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter_nested.rs:25:32
  15: <alloc::vec::Vec<T> as alloc::vec::spec_from_iter::SpecFromIter<T,I>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/spec_from_iter.rs:34:9
  16: <alloc::vec::Vec<T> as core::iter::traits::collect::FromIterator<T>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/alloc/src/vec/mod.rs:3424:9
  17: core::iter::traits::iterator::Iterator::collect
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
  18: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter::{{closure}}
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:51
  19: core::iter::adapters::try_process
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/adapters/mod.rs:160:17
  20: <core::result::Result<V,E> as core::iter::traits::collect::FromIterator<core::result::Result<A,E>>>::from_iter
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/result.rs:1985:9
  21: core::iter::traits::iterator::Iterator::collect
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/iter/traits/iterator.rs:1971:9
  22: <parquet::arrow::array_reader::struct_array::StructArrayReader as parquet::arrow::array_reader::ArrayReader>::consume_batch
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/array_reader/struct_array.rs:108:30
  23: <parquet::arrow::arrow_reader::ParquetRecordBatchReader as core::iter::traits::iterator::Iterator>::next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/arrow_reader/mod.rs:855:15
  24: <parquet::arrow::async_reader::ParquetRecordBatchStream<T> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/parquet-55.1.0/src/arrow/async_reader/mod.rs:811:62
  25: <S as futures_core::stream::TryStream>::try_poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
  26: <futures_util::stream::try_stream::into_stream::IntoStream<St> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/into_stream.rs:38:9
  27: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
  28: <futures_util::stream::try_stream::MapErr<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/lib.rs:97:13
  29: <futures_util::stream::stream::map::Map<St,F> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/map.rs:58:26
  30: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
  31: futures_util::stream::stream::StreamExt::poll_next_unpin
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/stream/mod.rs:1638:9
  32: datafusion_datasource::file_stream::FileStream::poll_inner
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:220:34
  33: <datafusion_datasource::file_stream::FileStream as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-datasource-47.0.0/src/file_stream.rs:333:22
  34: <core::pin::Pin<P> as futures_core::stream::Stream>::poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:130:9
  35: <S as futures_core::stream::TryStream>::try_poll_next
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-core-0.3.31/src/stream.rs:206:9
  36: <futures_util::stream::try_stream::try_collect::TryCollect<St,C> as core::future::future::Future>::poll
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/futures-util-0.3.31/src/stream/try_stream/try_collect.rs:46:26
  37: datafusion_physical_plan::common::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/common.rs:45:36
  38: datafusion_physical_plan::execution_plan::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-physical-plan-47.0.0/src/execution_plan.rs:868:36
  39: datafusion::dataframe::DataFrame::collect::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/datafusion-47.0.0/src/dataframe/mod.rs:1351:33
  40: parquet_bug_repro::main::{{closure}}
             at ./src/bin/parquet_bug_repro.rs:72:32
  41: <core::pin::Pin<P> as core::future::future::Future>::poll
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/future/future.rs:124:9
  42: tokio::runtime::park::CachedParkThread::block_on::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:60
  43: tokio::task::coop::with_budget
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:167:5
  44: tokio::task::coop::budget
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/task/coop/mod.rs:133:5
  45: tokio::runtime::park::CachedParkThread::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/park.rs:284:31
  46: tokio::runtime::context::blocking::BlockingRegionGuard::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/blocking.rs:66:9
  47: tokio::runtime::scheduler::multi_thread::MultiThread::block_on::{{closure}}
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:87:13
  48: tokio::runtime::context::runtime::enter_runtime
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/context/runtime.rs:65:16
  49: tokio::runtime::scheduler::multi_thread::MultiThread::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/scheduler/multi_thread/mod.rs:86:9
  50: tokio::runtime::runtime::Runtime::block_on_inner
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:358:45
  51: tokio::runtime::runtime::Runtime::block_on
             at /Users/a.lockett/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/tokio-1.45.0/src/runtime/runtime.rs:328:13
  52: parquet_bug_repro::main
             at ./src/bin/parquet_bug_repro.rs:73:5
  53: core::ops::function::FnOnce::call_once
             at /Users/a.lockett/.rustup/toolchains/stable-aarch64-apple-darwin/lib/rustlib/src/rust/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace.

Versions:

arrow = { version = "55", features = ["prettyprint", "chrono-tz"] }
arrow-array = "55"
datafusion = "47"
parquet = { version = "55", features = ["arrow", "async", "object_store"]}
object_store = "0.12"
tokio = { version = "1", features = ["full"] }

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug, parquet (Changes to the parquet crate)

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions