in crates/iceberg/src/spec/snapshot_summary.rs [719:828]
fn test_snapshot_summary_collector_build() {
let schema = Arc::new(
Schema::builder()
.with_fields(vec![
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
])
.build()
.unwrap(),
);
let partition_spec = Arc::new(
PartitionSpec::builder(schema.clone())
.add_unbound_fields(vec![UnboundPartitionField::builder()
.source_id(2)
.name("year".to_string())
.transform(Transform::Identity)
.build()])
.unwrap()
.with_spec_id(1)
.build()
.unwrap(),
);
let mut collector = SnapshotSummaryCollector::default();
collector.set_partition_summary_limit(10);
let file1 = DataFile {
content: DataContentType::Data,
file_path: "s3://testbucket/path/to/file1.parquet".to_string(),
file_format: DataFileFormat::Parquet,
partition: Struct::from_iter(vec![]),
record_count: 10,
file_size_in_bytes: 100,
column_sizes: HashMap::from([(1, 46), (2, 48), (3, 48)]),
value_counts: HashMap::from([(1, 10), (2, 10), (3, 10)]),
null_value_counts: HashMap::from([(1, 0), (2, 0), (3, 0)]),
nan_value_counts: HashMap::new(),
lower_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::string("a")),
(3, Datum::string("x")),
]),
upper_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::string("a")),
(3, Datum::string("x")),
]),
key_metadata: None,
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
referenced_data_file: None,
content_offset: None,
content_size_in_bytes: None,
};
let file2 = DataFile {
content: DataContentType::Data,
file_path: "s3://testbucket/path/to/file2.parquet".to_string(),
file_format: DataFileFormat::Parquet,
partition: Struct::from_iter(vec![Some(Literal::string("2025"))]),
record_count: 20,
file_size_in_bytes: 200,
column_sizes: HashMap::from([(1, 46), (2, 48), (3, 48)]),
value_counts: HashMap::from([(1, 20), (2, 20), (3, 20)]),
null_value_counts: HashMap::from([(1, 0), (2, 0), (3, 0)]),
nan_value_counts: HashMap::new(),
lower_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::string("a")),
(3, Datum::string("x")),
]),
upper_bounds: HashMap::from([
(1, Datum::long(1)),
(2, Datum::string("a")),
(3, Datum::string("x")),
]),
key_metadata: None,
split_offsets: vec![4],
equality_ids: vec![],
sort_order_id: Some(0),
partition_spec_id: 0,
first_row_id: None,
referenced_data_file: None,
content_offset: None,
content_size_in_bytes: None,
};
collector.add_file(&file1, schema.clone(), partition_spec.clone());
collector.add_file(&file2, schema.clone(), partition_spec.clone());
collector.remove_file(&file1, schema.clone(), partition_spec.clone());
let props = collector.build();
assert_eq!(props.get(ADDED_FILE_SIZE).unwrap(), "300");
assert_eq!(props.get(REMOVED_FILE_SIZE).unwrap(), "100");
let partition_key = format!("{}{}", CHANGED_PARTITION_PREFIX, "year=\"2025\"");
assert!(props.contains_key(&partition_key));
let partition_summary = props.get(&partition_key).unwrap();
assert!(partition_summary.contains(&format!("{}=200", ADDED_FILE_SIZE)));
assert!(partition_summary.contains(&format!("{}=1", ADDED_DATA_FILES)));
assert!(partition_summary.contains(&format!("{}=20", ADDED_RECORDS)));
}