// applyOriginalStorageMetadata
//
// from parquet/pqarrow/schema.go [1033:1153]

// applyOriginalStorageMetadata reconciles the Parquet-inferred field with the
// original Arrow field it was written from, restoring type information that
// Parquet cannot represent natively (extension types, unions via error, large
// string/binary, dictionaries, timezone-aware timestamps, decimal256) and
// merging the original field metadata back in. It reports whether inferred was
// changed; on a non-nil error the modified result is not meaningful.
func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (modified bool, err error) {
	nchildren := len(inferred.Children)

	switch origin.Type.ID() {
	case arrow.EXTENSION:
		extType := origin.Type.(arrow.ExtensionType)
		// Recurse on the underlying storage type first, then re-wrap the
		// result with the extension type when the storage matches.
		if modified, err = applyOriginalStorageMetadata(arrow.Field{
			Type:     extType.StorageType(),
			Metadata: origin.Metadata,
		}, inferred); err != nil {
			return
		}

		if modified && !arrow.TypeEqual(extType, inferred.Field.Type) {
			if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
				return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
					arrow.ErrInvalid, inferred.Field.Type, extType)
			}

			inferred.Field.Type = extType
		}

	case arrow.SPARSE_UNION, arrow.DENSE_UNION:
		// Unions have no Parquet representation to reconcile against.
		err = xerrors.New("unimplemented type")

	case arrow.STRUCT:
		structType := origin.Type.(*arrow.StructType)
		// Bail out (unmodified) when the shapes don't line up or there is no
		// factory to rebuild this nested type.
		if nchildren != structType.NumFields() {
			return
		}

		factory := getNestedFactory(structType, inferred.Field.Type)
		if factory == nil {
			return
		}

		modified = structType.ID() != inferred.Field.Type.ID()
		for i := range inferred.Children {
			childModified, childErr := applyOriginalMetadata(structType.Field(i), &inferred.Children[i])
			if childErr != nil {
				return false, childErr
			}
			if childModified {
				modified = true
			}
		}
		if modified {
			// Rebuild the struct type from the (possibly updated) children.
			newFields := make([]arrow.Field, nchildren)
			for i, child := range inferred.Children {
				newFields[i] = *child.Field
			}
			inferred.Field.Type = factory(newFields)
		}

	case arrow.FIXED_SIZE_LIST, arrow.LIST, arrow.LARGE_LIST, arrow.MAP: // arrow.ListLike
		if nchildren != 1 {
			return
		}
		factory := getNestedFactory(origin.Type, inferred.Field.Type)
		if factory == nil {
			return
		}

		modified = origin.Type.ID() != inferred.Field.Type.ID()
		elemField := arrow.Field{Type: origin.Type.(arrow.ListLikeType).Elem()}
		childModified, childErr := applyOriginalMetadata(elemField, &inferred.Children[0])
		if childErr != nil {
			return modified, childErr
		}
		if childModified {
			modified = true
		}
		if modified {
			inferred.Field.Type = factory([]arrow.Field{*inferred.Children[0].Field})
		}

	case arrow.TIMESTAMP:
		if inferred.Field.Type.ID() != arrow.TIMESTAMP {
			return
		}

		originTS := origin.Type.(*arrow.TimestampType)
		inferredTS := inferred.Field.Type.(*arrow.TimestampType)

		// Parquet has no native storage of timezones; when the units agree and
		// the data was tz-aware (read back as UTC), restore the original zone.
		if originTS.Unit == inferredTS.Unit && inferredTS.TimeZone == "UTC" && originTS.TimeZone != "" {
			inferred.Field.Type = origin.Type
		}
		modified = true

	case arrow.LARGE_STRING, arrow.LARGE_BINARY:
		// Parquet reads back as (small) string/binary; restore the large variant.
		inferred.Field.Type = origin.Type
		modified = true

	case arrow.DICTIONARY:
		if origin.Type.ID() != arrow.DICTIONARY || (inferred.Field.Type.ID() == arrow.DICTIONARY || !isDictionaryReadSupported(inferred.Field.Type)) {
			return
		}

		// direct dictionary reads are only supported for a few primitive types
		// so no need to recurse on value types
		originDict := origin.Type.(*arrow.DictionaryType)
		inferred.Field.Type = &arrow.DictionaryType{IndexType: arrow.PrimitiveTypes.Int32,
			ValueType: inferred.Field.Type, Ordered: originDict.Ordered}
		modified = true

	case arrow.DECIMAL256:
		// Small decimals round-trip through decimal128; widen back to 256.
		if inferred.Field.Type.ID() == arrow.DECIMAL128 {
			inferred.Field.Type = origin.Type
			modified = true
		}
	}

	if origin.HasMetadata() {
		if inferred.Field.HasMetadata() {
			// Merge the two metadata sets; inferred keys win on collision.
			merged := make(map[string]string)
			for i, key := range origin.Metadata.Keys() {
				merged[key] = origin.Metadata.Values()[i]
			}
			for i, key := range inferred.Field.Metadata.Keys() {
				merged[key] = inferred.Field.Metadata.Values()[i]
			}
			inferred.Field.Metadata = arrow.MetadataFrom(merged)
		} else {
			inferred.Field.Metadata = origin.Metadata
		}
		modified = true
	}

	return
}