in parquet/src/arrow/schema/mod.rs [1906:2119]
#[test]
fn test_arrow_schema_roundtrip() -> Result<()> {
let meta = |a: &[(&str, &str)]| -> HashMap<String, String> {
a.iter()
.map(|(a, b)| (a.to_string(), b.to_string()))
.collect()
};
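// Schema exercising a wide range of Arrow types; several fields embed explicit
// Parquet field ids in their metadata via PARQUET_FIELD_ID_META_KEY.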
let schema = Schema::new_with_metadata(
vec![
Field::new("c1", DataType::Utf8, false)
.with_metadata(meta(&[("Key", "Foo"), (PARQUET_FIELD_ID_META_KEY, "2")])),
Field::new("c2", DataType::Binary, false),
Field::new("c3", DataType::FixedSizeBinary(3), false),
Field::new("c4", DataType::Boolean, false),
Field::new("c5", DataType::Date32, false),
Field::new("c6", DataType::Date64, false),
Field::new("c7", DataType::Time32(TimeUnit::Second), false),
Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false),
Field::new(
"c16",
DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())),
false,
),
Field::new(
"c17",
DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())),
false,
),
Field::new(
"c18",
DataType::Timestamp(TimeUnit::Nanosecond, None),
false,
),
Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
Field::new_list(
"c21",
Field::new_list_field(DataType::Boolean, true)
.with_metadata(meta(&[("Key", "Bar"), (PARQUET_FIELD_ID_META_KEY, "5")])),
false,
)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "4")])),
Field::new(
"c22",
DataType::FixedSizeList(
Arc::new(Field::new_list_field(DataType::Boolean, true)),
5,
),
false,
),
Field::new_list(
"c23",
Field::new_large_list(
"inner",
Field::new_list_field(
DataType::Struct(
vec![
Field::new("a", DataType::Int16, true),
Field::new("b", DataType::Float64, false),
Field::new("c", DataType::Float32, false),
Field::new("d", DataType::Float16, false),
]
.into(),
),
false,
),
true,
),
false,
),
Field::new(
"c24",
DataType::Struct(Fields::from(vec![
Field::new("a", DataType::Utf8, false),
Field::new("b", DataType::UInt16, false),
])),
false,
),
Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true),
Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true),
// Duration types not supported
// Field::new("c27", DataType::Duration(TimeUnit::Second), false),
// Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false),
// Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false),
// Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false),
#[allow(deprecated)]
Field::new_dict(
"c31",
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
true,
123,
true,
)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "6")])),
Field::new("c32", DataType::LargeBinary, true),
Field::new("c33", DataType::LargeUtf8, true),
Field::new_large_list(
"c34",
Field::new_list(
"inner",
Field::new_list_field(
DataType::Struct(
vec![
Field::new("a", DataType::Int16, true),
Field::new("b", DataType::Float64, true),
]
.into(),
),
true,
),
true,
),
true,
),
Field::new("c35", DataType::Null, true),
Field::new("c36", DataType::Decimal128(2, 1), false),
Field::new("c37", DataType::Decimal256(50, 20), false),
Field::new("c38", DataType::Decimal128(18, 12), true),
Field::new_map(
"c39",
"key_value",
Field::new("key", DataType::Utf8, false),
Field::new_list("value", Field::new("element", DataType::Utf8, true), true),
false, // fails to roundtrip keys_sorted
true,
),
Field::new_map(
"c40",
"my_entries",
Field::new("my_key", DataType::Utf8, false)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "8")])),
Field::new_list(
"my_value",
Field::new_list_field(DataType::Utf8, true)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "10")])),
true,
)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "9")])),
false, // fails to roundtrip keys_sorted
true,
)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "7")])),
Field::new_map(
"c41",
"my_entries",
Field::new("my_key", DataType::Utf8, false),
Field::new_list(
"my_value",
Field::new_list_field(DataType::Utf8, true)
.with_metadata(meta(&[(PARQUET_FIELD_ID_META_KEY, "11")])),
true,
),
false, // fails to roundtrip keys_sorted
false,
),
],
meta(&[("Key", "Value")]),
);
// Write to an empty parquet file so that the schema is serialized
let file = tempfile::tempfile().unwrap();
let writer =
ArrowWriter::try_new(file.try_clone().unwrap(), Arc::new(schema.clone()), None)?;
writer.close()?;
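// Closing the writer flushes the footer, which carries the encoded Arrow schema
// as key/value metadata.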
// read file back
let arrow_reader = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
// Check arrow schema
let read_schema = arrow_reader.schema();
assert_eq!(&schema, read_schema.as_ref());
// Walk schema finding field IDs
let mut stack = Vec::with_capacity(10);
let mut out = Vec::with_capacity(10);
let root = arrow_reader.parquet_schema().root_schema_ptr();
stack.push((root.name().to_string(), root));
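// Depth-first walk: each group pushes its children with a dotted path, and any
// node carrying a field id is recorded as "path -> id".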
while let Some((p, t)) = stack.pop() {
if t.is_group() {
for f in t.get_fields() {
stack.push((format!("{p}.{}", f.name()), f.clone()));
}
}
let info = t.get_basic_info();
if info.has_id() {
out.push(format!("{p} -> {}", info.id()));
}
}
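// Traversal order depends on the stack, so sort for a deterministic comparison.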
out.sort_unstable();
let out: Vec<_> = out.iter().map(|x| x.as_str()).collect();
assert_eq!(
&out,
&[
"arrow_schema.c1 -> 2",
"arrow_schema.c21 -> 4",
"arrow_schema.c21.list.item -> 5",
"arrow_schema.c31 -> 6",
"arrow_schema.c40 -> 7",
"arrow_schema.c40.my_entries.my_key -> 8",
"arrow_schema.c40.my_entries.my_value -> 9",
"arrow_schema.c40.my_entries.my_value.list.item -> 10",
"arrow_schema.c41.my_entries.my_value.list.item -> 11",
]
);
Ok(())
}