in parquet/src/arrow/schema/mod.rs [687:888]
fn test_parquet_lists() {
    // Parquet schema exercising the LIST encodings from
    // parquet-format/LogicalTypes.md, one top-level column per case.
    let message_type = "
message test_schema {
REQUIRED GROUP my_list (LIST) {
REPEATED GROUP list {
OPTIONAL BINARY element (UTF8);
}
}
OPTIONAL GROUP my_list (LIST) {
REPEATED GROUP list {
REQUIRED BINARY element (UTF8);
}
}
OPTIONAL GROUP array_of_arrays (LIST) {
REPEATED GROUP list {
REQUIRED GROUP element (LIST) {
REPEATED GROUP list {
REQUIRED INT32 element;
}
}
}
}
OPTIONAL GROUP my_list (LIST) {
REPEATED GROUP element {
REQUIRED BINARY str (UTF8);
}
}
OPTIONAL GROUP my_list (LIST) {
REPEATED INT32 element;
}
OPTIONAL GROUP my_list (LIST) {
REPEATED GROUP element {
REQUIRED BINARY str (UTF8);
REQUIRED INT32 num;
}
}
OPTIONAL GROUP my_list (LIST) {
REPEATED GROUP array {
REQUIRED BINARY str (UTF8);
}
}
OPTIONAL GROUP my_list (LIST) {
REPEATED GROUP my_list_tuple {
REQUIRED BINARY str (UTF8);
}
}
REPEATED INT32 name;
}
";

    // Expected Arrow fields, listed in the same order as the columns above.
    let arrow_fields = vec![
        // [0] Required list with optional elements:
        //     non-nullable list of nullable Utf8 "element".
        Field::new_list(
            "my_list",
            Field::new("element", DataType::Utf8, true),
            false,
        ),
        // [1] Optional list with required elements:
        //     nullable list of non-null Utf8 "element".
        Field::new_list(
            "my_list",
            Field::new("element", DataType::Utf8, false),
            true,
        ),
        // [2] Nested lists (List<List<Integer>>): the outer element is itself
        //     a required LIST group whose items are required int32 values.
        Field::new_list(
            "array_of_arrays",
            Field::new_list(
                "element",
                Field::new("element", DataType::Int32, false),
                false,
            ),
            true,
        ),
        // [3] Repeated group called `element` holding a single required
        //     string `str`: the list item is expected to be that Utf8 field.
        Field::new_list(
            "my_list",
            Field::new("str", DataType::Utf8, false),
            true,
        ),
        // [4] Repeated primitive placed directly inside the LIST group:
        //     nullable list of non-null Int32 "element".
        Field::new_list(
            "my_list",
            Field::new("element", DataType::Int32, false),
            true,
        ),
        // [5] Repeated group with two fields (List<Tuple<String, Integer>>):
        //     the list item is a non-null struct named "element".
        Field::new_list(
            "my_list",
            Field::new_struct(
                "element",
                vec![
                    Field::new("str", DataType::Utf8, false),
                    Field::new("num", DataType::Int32, false),
                ],
                false,
            ),
            true,
        ),
        // [6] Special case: the repeated group is named `array`, so the
        //     single-field group is kept as a struct item (List<OneTuple<String>>).
        Field::new_list(
            "my_list",
            Field::new_struct(
                "array",
                vec![Field::new("str", DataType::Utf8, false)],
                false,
            ),
            true,
        ),
        // [7] Special case: the repeated group's name ends in `_tuple`, so the
        //     single-field group is likewise kept as a struct item.
        Field::new_list(
            "my_list",
            Field::new_struct(
                "my_list_tuple",
                vec![Field::new("str", DataType::Utf8, false)],
                false,
            ),
            true,
        ),
        // [8] One-level encoding (`repeated value_type name`): only allows
        //     required lists with required cells; the item reuses the column name.
        Field::new_list(
            "name",
            Field::new("name", DataType::Int32, false),
            false,
        ),
    ];

    // Parse the message type and convert it to an Arrow schema.
    let root_type = parse_message_type(message_type).unwrap();
    let descriptor = SchemaDescriptor::new(Arc::new(root_type));
    let arrow_schema = parquet_to_arrow_schema(&descriptor, None).unwrap();
    let converted_fields = arrow_schema.fields();

    // Same number of columns, then a pairwise comparison; the failing index
    // is reported in the assertion message.
    assert_eq!(arrow_fields.len(), converted_fields.len());
    for (i, (expected, actual)) in arrow_fields.iter().zip(converted_fields.iter()).enumerate() {
        assert_eq!(expected, actual.as_ref(), "{i}");
    }
}