fn test_arrow_schema_roundtrip()

in parquet/src/arrow/schema/mod.rs [1906:2119]


    /// Writes a wide-ranging Arrow schema to an empty Parquet file, reads it back, and
    /// checks two things: the Arrow schema read back equals the one written, and the
    /// Parquet schema carries exactly the field IDs that were attached through
    /// `PARQUET_FIELD_ID_META_KEY` metadata.
    fn test_arrow_schema_roundtrip() -> Result<()> {
        // Builds an owned metadata map from borrowed key/value pairs.
        let kv_meta = |pairs: &[(&str, &str)]| -> HashMap<String, String> {
            pairs
                .iter()
                .map(|(k, v)| (k.to_string(), v.to_string()))
                .collect()
        };
        // Shorthand for metadata holding nothing but a Parquet field ID.
        let id_meta = |id: &str| kv_meta(&[(PARQUET_FIELD_ID_META_KEY, id)]);

        let written_schema = Schema::new_with_metadata(
            vec![
                // --- binary, boolean and temporal leaves ---
                Field::new("c1", DataType::Utf8, false)
                    .with_metadata(kv_meta(&[("Key", "Foo"), (PARQUET_FIELD_ID_META_KEY, "2")])),
                Field::new("c2", DataType::Binary, false),
                Field::new("c3", DataType::FixedSizeBinary(3), false),
                Field::new("c4", DataType::Boolean, false),
                Field::new("c5", DataType::Date32, false),
                Field::new("c6", DataType::Date64, false),
                Field::new("c7", DataType::Time32(TimeUnit::Second), false),
                Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
                Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
                Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
                Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false),
                Field::new(
                    "c16",
                    DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".into())),
                    false,
                ),
                Field::new(
                    "c17",
                    DataType::Timestamp(TimeUnit::Microsecond, Some("Africa/Johannesburg".into())),
                    false,
                ),
                Field::new("c18", DataType::Timestamp(TimeUnit::Nanosecond, None), false),
                Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
                Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
                // --- lists and structs ---
                Field::new_list(
                    "c21",
                    Field::new_list_field(DataType::Boolean, true)
                        .with_metadata(kv_meta(&[("Key", "Bar"), (PARQUET_FIELD_ID_META_KEY, "5")])),
                    false,
                )
                .with_metadata(id_meta("4")),
                Field::new(
                    "c22",
                    DataType::FixedSizeList(
                        Arc::new(Field::new_list_field(DataType::Boolean, true)),
                        5,
                    ),
                    false,
                ),
                Field::new_list(
                    "c23",
                    Field::new_large_list(
                        "inner",
                        Field::new_list_field(
                            DataType::Struct(
                                vec![
                                    Field::new("a", DataType::Int16, true),
                                    Field::new("b", DataType::Float64, false),
                                    Field::new("c", DataType::Float32, false),
                                    Field::new("d", DataType::Float16, false),
                                ]
                                .into(),
                            ),
                            false,
                        ),
                        true,
                    ),
                    false,
                ),
                Field::new(
                    "c24",
                    DataType::Struct(Fields::from(vec![
                        Field::new("a", DataType::Utf8, false),
                        Field::new("b", DataType::UInt16, false),
                    ])),
                    false,
                ),
                Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true),
                Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true),
                // Duration types are not supported, so c27..c30 (Duration in Second,
                // Millisecond, Microsecond and Nanosecond units) are left out.
                // --- dictionary, large variants, null and decimals ---
                #[allow(deprecated)]
                Field::new_dict(
                    "c31",
                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
                    true,
                    123,
                    true,
                )
                .with_metadata(id_meta("6")),
                Field::new("c32", DataType::LargeBinary, true),
                Field::new("c33", DataType::LargeUtf8, true),
                Field::new_large_list(
                    "c34",
                    Field::new_list(
                        "inner",
                        Field::new_list_field(
                            DataType::Struct(
                                vec![
                                    Field::new("a", DataType::Int16, true),
                                    Field::new("b", DataType::Float64, true),
                                ]
                                .into(),
                            ),
                            true,
                        ),
                        true,
                    ),
                    true,
                ),
                Field::new("c35", DataType::Null, true),
                Field::new("c36", DataType::Decimal128(2, 1), false),
                Field::new("c37", DataType::Decimal256(50, 20), false),
                Field::new("c38", DataType::Decimal128(18, 12), true),
                // --- maps ---
                Field::new_map(
                    "c39",
                    "key_value",
                    Field::new("key", DataType::Utf8, false),
                    Field::new_list("value", Field::new("element", DataType::Utf8, true), true),
                    false, // keys_sorted fails to roundtrip, so it stays false
                    true,
                ),
                Field::new_map(
                    "c40",
                    "my_entries",
                    Field::new("my_key", DataType::Utf8, false).with_metadata(id_meta("8")),
                    Field::new_list(
                        "my_value",
                        Field::new_list_field(DataType::Utf8, true).with_metadata(id_meta("10")),
                        true,
                    )
                    .with_metadata(id_meta("9")),
                    false, // keys_sorted fails to roundtrip, so it stays false
                    true,
                )
                .with_metadata(id_meta("7")),
                Field::new_map(
                    "c41",
                    "my_entries",
                    Field::new("my_key", DataType::Utf8, false),
                    Field::new_list(
                        "my_value",
                        Field::new_list_field(DataType::Utf8, true).with_metadata(id_meta("11")),
                        true,
                    ),
                    false, // keys_sorted fails to roundtrip, so it stays false
                    false,
                ),
            ],
            kv_meta(&[("Key", "Value")]),
        );

        // Create and immediately close a writer: no batches are written, but closing
        // the file is enough for the schema to be serialized.
        let tmp = tempfile::tempfile().unwrap();
        ArrowWriter::try_new(
            tmp.try_clone().unwrap(),
            Arc::new(written_schema.clone()),
            None,
        )?
        .close()?;

        // Re-open the same file for reading.
        let reader_builder = ParquetRecordBatchReaderBuilder::try_new(tmp).unwrap();

        // The Arrow schema read back must equal the one that was written.
        assert_eq!(&written_schema, reader_builder.schema().as_ref());

        // Walk the Parquet schema with an explicit stack, recording
        // "<dotted path> -> <id>" for every node that has a field ID.
        let root = reader_builder.parquet_schema().root_schema_ptr();
        let mut pending = vec![(root.name().to_string(), root)];
        let mut id_lines = Vec::with_capacity(10);

        while let Some((path, node)) = pending.pop() {
            if node.is_group() {
                pending.extend(
                    node.get_fields()
                        .iter()
                        .map(|child| (format!("{path}.{}", child.name()), child.clone())),
                );
            }

            let basic = node.get_basic_info();
            if basic.has_id() {
                id_lines.push(format!("{path} -> {}", basic.id()));
            }
        }

        // Sort so the comparison does not depend on traversal order.
        id_lines.sort_unstable();
        let id_lines: Vec<&str> = id_lines.iter().map(String::as_str).collect();

        let expected_ids = [
            "arrow_schema.c1 -> 2",
            "arrow_schema.c21 -> 4",
            "arrow_schema.c21.list.item -> 5",
            "arrow_schema.c31 -> 6",
            "arrow_schema.c40 -> 7",
            "arrow_schema.c40.my_entries.my_key -> 8",
            "arrow_schema.c40.my_entries.my_value -> 9",
            "arrow_schema.c40.my_entries.my_value.list.item -> 10",
            "arrow_schema.c41.my_entries.my_value.list.item -> 11",
        ];
        assert_eq!(&id_lines, &expected_ids);

        Ok(())
    }