in parquet/pqarrow/schema.go [1033:1153]
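// applyOriginalStorageMetadata reconciles a Parquet-inferred field (and,
// recursively, its children) with the corresponding field from the original
// Arrow schema, restoring type information that Parquet cannot represent
// natively: extension types, time zones, large string/binary variants,
// dictionary encoding, and decimal256. It reports whether the inferred
// field was modified.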
func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
	nchildren := len(inferred.Children)
	switch origin.Type.ID() {
	case arrow.EXTENSION:
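		// Reconcile against the extension type's storage type first; if that
		// changed the inferred type and it now matches the storage type, swap
		// in the extension type itself.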
		extType := origin.Type.(arrow.ExtensionType)
		modified, err = applyOriginalStorageMetadata(arrow.Field{
			Type:     extType.StorageType(),
			Metadata: origin.Metadata,
		}, inferred)
		if err != nil {
			return
		}

		if modified && !arrow.TypeEqual(extType, inferred.Field.Type) {
			if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
				return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
					arrow.ErrInvalid, inferred.Field.Type, extType)
			}

			inferred.Field.Type = extType
		}
	case arrow.SPARSE_UNION, arrow.DENSE_UNION:
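		// Union types cannot currently be round-tripped through Parquet.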
		err = xerrors.New("unimplemented type")
	case arrow.STRUCT:
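		// The original and inferred structs must have the same number of
		// children; recurse into each child and rebuild the struct type via
		// the nested factory if anything changed.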
		typ := origin.Type.(*arrow.StructType)
		if nchildren != typ.NumFields() {
			return
		}

		factory := getNestedFactory(typ, inferred.Field.Type)
		if factory == nil {
			return
		}

		modified = typ.ID() != inferred.Field.Type.ID()
		for idx := range inferred.Children {
			childMod, err := applyOriginalMetadata(typ.Field(idx), &inferred.Children[idx])
			if err != nil {
				return false, err
			}
			modified = modified || childMod
		}

		if modified {
			modifiedChildren := make([]arrow.Field, len(inferred.Children))
			for idx, child := range inferred.Children {
				modifiedChildren[idx] = *child.Field
			}
			inferred.Field.Type = factory(modifiedChildren)
		}
	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.LARGE_LIST, arrow.MAP: // arrow.ListLike
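		// List-like types have exactly one child: recurse on the element
		// field and rebuild the outer type if either it or the element was
		// modified.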
		if nchildren != 1 {
			return
		}

		factory := getNestedFactory(origin.Type, inferred.Field.Type)
		if factory == nil {
			return
		}

		modified = origin.Type.ID() != inferred.Field.Type.ID()
		childModified, err := applyOriginalMetadata(arrow.Field{Type: origin.Type.(arrow.ListLikeType).Elem()}, &inferred.Children[0])
		if err != nil {
			return modified, err
		}
		modified = modified || childModified
		if modified {
			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
		}
	case arrow.TIMESTAMP:
		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
			return
		}

		tsOtype := origin.Type.(*arrow.TimestampType)
		tsInfType := inferred.Field.Type.(*arrow.TimestampType)
		// if the unit is the same and the data is tz-aware, then set the original time zone
		// since parquet has no native storage of timezones
		if tsOtype.Unit == tsInfType.Unit && tsInfType.TimeZone == "UTC" && tsOtype.TimeZone != "" {
			inferred.Field.Type = origin.Type
		}
		modified = true
	case arrow.LARGE_STRING, arrow.LARGE_BINARY:
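		// Parquet has no separate large string/binary representation, so the
		// reader infers the regular variants; restore the original large type.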
		inferred.Field.Type = origin.Type
		modified = true
	case arrow.DICTIONARY:
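		// Only rebuild dictionary encoding when the inferred field is not
		// already dictionary-encoded and its value type supports direct
		// dictionary reads.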
		if origin.Type.ID() != arrow.DICTIONARY || (inferred.Field.Type.ID() == arrow.DICTIONARY || !isDictionaryReadSupported(inferred.Field.Type)) {
			return
		}

		// direct dictionary reads are only supported for a few primitive types
		// so no need to recurse on value types
		dictOriginType := origin.Type.(*arrow.DictionaryType)
		inferred.Field.Type = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
			ValueType: inferred.Field.Type, Ordered: dictOriginType.Ordered}
		modified = true
	case arrow.DECIMAL256:
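		// Restore decimal256 when the original schema used it but the reader
		// inferred decimal128.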
		if inferred.Field.Type.ID() == arrow.DECIMAL128 {
			inferred.Field.Type = origin.Type
			modified = true
		}
	}
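	// Merge the original field's key/value metadata into the inferred field;
	// on duplicate keys the inferred field's values win.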
	if origin.HasMetadata() {
		meta := origin.Metadata
		if inferred.Field.HasMetadata() {
			final := make(map[string]string)
			for idx, k := range meta.Keys() {
				final[k] = meta.Values()[idx]
			}
			for idx, k := range inferred.Field.Metadata.Keys() {
				final[k] = inferred.Field.Metadata.Values()[idx]
			}
			inferred.Field.Metadata = arrow.MetadataFrom(final)
		} else {
			inferred.Field.Metadata = meta
		}
		modified = true
	}

	return
}
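// Minimal usage sketch (hypothetical driver; the names origSchema and
// manifest are illustrative, not part of this file's API): after the original
// Arrow schema has been deserialized from the Parquet key/value metadata,
// each of its top-level fields is applied to the matching inferred field:
//
//	for idx, origField := range origSchema.Fields() {
//		if _, err := applyOriginalMetadata(origField, &manifest.Fields[idx]); err != nil {
//			return nil, err
//		}
//	}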