in parquet/src/arrow/schema/complex.rs [388:522]
/// Converts a parquet list group into a `ParquetField`.
///
/// `list_type` must be a group with exactly one `repeated` child and must itself be
/// `required` or `optional`; any other shape is reported as an error. If
/// `context.data_type` carries an arrow type hint it must be a `List`, `LargeList` or
/// `FixedSizeList`, otherwise an error is returned.
///
/// Three encodings of the repeated child are recognised:
///
/// * a primitive: the child itself is the element and is handed to `visit_primitive`
/// * a group that is itself the element (multiple fields, or a legacy one-tuple); it is
///   handed to `visit_struct`
/// * a group wrapping a single element field: the element is dispatched one repetition
///   level and one definition level deeper, and the result is wrapped in an arrow list
///   type chosen from the hint (`List` when there is no hint)
///
/// Returns `Ok(None)` when the delegated visitor produces no field.
fn visit_list(
    &mut self,
    list_type: &TypePtr,
    context: VisitorContext,
) -> Result<Option<ParquetField>> {
    if list_type.is_primitive() {
        return Err(arrow_err!(
            "{:?} is a list type and can't be processed as primitive.",
            list_type
        ));
    }

    let children = list_type.get_fields();
    let repeated_field = match children.len() {
        1 => &children[0],
        found => {
            return Err(arrow_err!(
                "list type must have a single child, found {}",
                found
            ))
        }
    };
    if get_repetition(repeated_field) != Repetition::REPEATED {
        return Err(arrow_err!("List child must be repeated"));
    }

    // An optional list spends one definition level on its own nullability
    let (def_level, nullable) = match list_type.get_basic_info().repetition() {
        Repetition::REPEATED => return Err(arrow_err!("List type cannot be repeated")),
        Repetition::OPTIONAL => (context.def_level + 1, true),
        Repetition::REQUIRED => (context.def_level, false),
    };

    // Element field of the arrow list type supplied as a hint, if any
    let arrow_field = match context.data_type.as_ref() {
        None => None,
        Some(
            DataType::List(f) | DataType::LargeList(f) | DataType::FixedSizeList(f, _),
        ) => Some(f.as_ref()),
        Some(d) => {
            return Err(arrow_err!(
                "incompatible arrow schema, expected list got {}",
                d
            ))
        }
    };

    // Arrow type hint for the element; consumed by whichever branch below is taken
    let element_hint = arrow_field.map(|f| f.data_type().clone());

    // The field returned by `visit_primitive` / `visit_struct` carries its own inferred
    // nullability; replace it with the nullability of the list
    let with_list_nullability = |visited: Option<ParquetField>| {
        visited.map(|mut field| {
            field.nullable = nullable;
            field
        })
    };

    if repeated_field.is_primitive() {
        // If the repeated field is not a group, then its type is the element type and
        // elements are required.
        //
        // required/optional group my_list (LIST) {
        //   repeated int32 element;
        // }
        //
        let element_context = VisitorContext {
            rep_level: context.rep_level,
            def_level,
            data_type: element_hint,
        };
        let visited = self.visit_primitive(repeated_field, element_context)?;
        return Ok(with_list_nullability(visited));
    }

    // Decide whether the repeated group is itself the element (a struct or one-tuple).
    //
    // If the repeated field is a group with multiple fields, then its type is the element
    // type and elements are required.
    //
    // If the repeated field is a group with one field and is named either `array` or uses
    // the LIST-annotated group's name with `_tuple` appended, then the repeated type is the
    // element type and elements are required. This second rule only applies if the
    // repeated field is not annotated, and the single child field is not `repeated`.
    let items = repeated_field.get_fields();
    let repeated_group_is_element = items.len() != 1
        || (!(repeated_field.is_list() || repeated_field.has_single_repeated_child())
            && (repeated_field.name() == "array"
                || repeated_field.name() == format!("{}_tuple", list_type.name())));

    if repeated_group_is_element {
        let element_context = VisitorContext {
            rep_level: context.rep_level,
            def_level,
            data_type: element_hint,
        };
        let visited = self.visit_struct(repeated_field, element_context)?;
        return Ok(with_list_nullability(visited));
    }

    // Regular list handling logic: the single field inside the repeated group is the
    // element, one repetition level and one definition level deeper
    let item_type = &items[0];
    let inner_rep_level = context.rep_level + 1;
    let inner_def_level = def_level + 1;
    let item_context = VisitorContext {
        rep_level: inner_rep_level,
        def_level: inner_def_level,
        data_type: element_hint,
    };

    let item = match self.dispatch(item_type, item_context)? {
        Some(item) => item,
        None => return Ok(None),
    };
    let item_field = Arc::new(convert_field(item_type, &item, arrow_field));

    // Use arrow type as hint for index size
    let arrow_type = match context.data_type {
        Some(DataType::FixedSizeList(_, len)) => DataType::FixedSizeList(item_field, len),
        Some(DataType::LargeList(_)) => DataType::LargeList(item_field),
        _ => DataType::List(item_field),
    };

    Ok(Some(ParquetField {
        rep_level: inner_rep_level,
        def_level: inner_def_level,
        nullable,
        arrow_type,
        field_type: ParquetFieldType::Group {
            children: vec![item],
        },
    }))
}