c_glib/parquet-glib/metadata.cpp (423 lines of code) (raw):
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <arrow-glib/arrow-glib.hpp>
#include <parquet-glib/metadata.hpp>
#include <parquet-glib/statistics.hpp>
G_BEGIN_DECLS
/**
* SECTION: metadata
* @title: Metadata related classes
* @include: parquet-glib/parquet-glib.h
*
* #GParquetColumnChunkMetadata is a class for column chunk level metadata.
*
* #GParquetRowGroupMetadata is a class for row group level metadata.
*
* #GParquetFileMetadata is a class for file level metadata.
*/
// Instance-private state of GParquetColumnChunkMetadata.
struct GParquetColumnChunkMetadataPrivate
{
// Wrapped parquet::ColumnChunkMetaData, stored from the construct-only
// "metadata" property.
// NOTE(review): nothing in this file deletes this pointer, although
// gparquet_row_group_metadata_get_column_chunk() passes in a pointer
// released from a std::unique_ptr -- confirm who is expected to free it.
parquet::ColumnChunkMetaData *metadata;
// Row group metadata wrapper this column chunk was obtained from. A
// reference is taken when the "owner" property is set and dropped in
// dispose.
GParquetRowGroupMetadata *owner;
};
// Property IDs used by the metadata classes in this file. PROP_METADATA is
// handled by all three set_property implementations; PROP_OWNER only by the
// column chunk and row group ones.
enum {
PROP_METADATA = 1,
PROP_OWNER,
};
// Defines the GParquetColumnChunkMetadata type (derived from G_TYPE_OBJECT)
// with GParquetColumnChunkMetadataPrivate as its private instance data.
G_DEFINE_TYPE_WITH_PRIVATE(GParquetColumnChunkMetadata,
gparquet_column_chunk_metadata,
G_TYPE_OBJECT)
// Yields the GParquetColumnChunkMetadataPrivate * of @object.
#define GPARQUET_COLUMN_CHUNK_METADATA_GET_PRIVATE(object) \
static_cast<GParquetColumnChunkMetadataPrivate *>( \
gparquet_column_chunk_metadata_get_instance_private( \
GPARQUET_COLUMN_CHUNK_METADATA(object)))
static void
gparquet_column_chunk_metadata_dispose(GObject *object)
{
  // Release the reference held on the owning row group metadata. The slot
  // is cleared afterwards so a repeated call cannot unreference it twice.
  auto *self = GPARQUET_COLUMN_CHUNK_METADATA_GET_PRIVATE(object);
  auto *&owner_slot = self->owner;
  if (owner_slot != nullptr) {
    g_object_unref(owner_slot);
    owner_slot = nullptr;
  }

  // Chain up to the parent class.
  auto *parent_class = G_OBJECT_CLASS(gparquet_column_chunk_metadata_parent_class);
  parent_class->dispose(object);
}
static void
gparquet_column_chunk_metadata_set_property(GObject *object,
                                            guint prop_id,
                                            const GValue *value,
                                            GParamSpec *pspec)
{
  // Stores the construct-only "metadata" and "owner" properties into the
  // private instance data; any other property ID is reported as invalid.
  auto *self = GPARQUET_COLUMN_CHUNK_METADATA_GET_PRIVATE(object);

  if (prop_id == PROP_METADATA) {
    // The raw parquet pointer travels through the GValue as a gpointer.
    auto *raw = g_value_get_pointer(value);
    self->metadata = static_cast<parquet::ColumnChunkMetaData *>(raw);
  } else if (prop_id == PROP_OWNER) {
    // g_value_dup_object() takes a new reference; dispose releases it.
    self->owner = GPARQUET_ROW_GROUP_METADATA(g_value_dup_object(value));
  } else {
    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
  }
}
// Instance initializer. Intentionally empty: both private fields are filled
// in by the construct-only "metadata" and "owner" properties.
static void
gparquet_column_chunk_metadata_init(GParquetColumnChunkMetadata *object)
{
}
static void
gparquet_column_chunk_metadata_class_init(GParquetColumnChunkMetadataClass *klass)
{
  // Hook up the GObject virtual functions implemented in this file.
  auto *object_class = G_OBJECT_CLASS(klass);
  object_class->dispose = gparquet_column_chunk_metadata_dispose;
  object_class->set_property = gparquet_column_chunk_metadata_set_property;

  // Both properties are write-only and can only be given at construction.
  const auto construct_only_flags =
    static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY);

  auto *metadata_spec = g_param_spec_pointer("metadata",
                                             "Metadata",
                                             "The raw parquet::ColumnChunkMetaData *",
                                             construct_only_flags);
  g_object_class_install_property(object_class, PROP_METADATA, metadata_spec);

  auto *owner_spec = g_param_spec_object("owner",
                                         "Owner",
                                         "The row group metadata that owns this metadata",
                                         GPARQUET_TYPE_ROW_GROUP_METADATA,
                                         construct_only_flags);
  g_object_class_install_property(object_class, PROP_OWNER, owner_spec);
}
/**
* gparquet_column_chunk_metadata_equal:
* @metadata: A #GParquetColumnChunkMetadata.
* @other_metadata: A #GParquetColumnChunkMetadata.
*
* Returns: %TRUE if both of them have the same data, %FALSE
* otherwise.
*
* Since: 8.0.0
*/
gboolean
gparquet_column_chunk_metadata_equal(GParquetColumnChunkMetadata *metadata,
                                     GParquetColumnChunkMetadata *other_metadata)
{
  // Compare the two wrapped parquet::ColumnChunkMetaData objects.
  auto *lhs = gparquet_column_chunk_metadata_get_raw(metadata);
  auto *rhs = gparquet_column_chunk_metadata_get_raw(other_metadata);
  return lhs->Equals(*rhs);
}
/**
* gparquet_column_chunk_metadata_get_total_size:
* @metadata: A #GParquetColumnChunkMetadata.
*
* Returns: Total byte size of all the uncompressed data in this
* column chunk.
*
* Since: 8.0.0
*/
gint64
gparquet_column_chunk_metadata_get_total_size(GParquetColumnChunkMetadata *metadata)
{
  // Forward to the wrapped object's uncompressed size.
  return gparquet_column_chunk_metadata_get_raw(metadata)->total_uncompressed_size();
}
/**
* gparquet_column_chunk_metadata_get_total_compressed_size:
* @metadata: A #GParquetColumnChunkMetadata.
*
* Returns: Total byte size of all the compressed (and potentially
* encrypted) data in this column chunk.
*
* Since: 8.0.0
*/
gint64
gparquet_column_chunk_metadata_get_total_compressed_size(
  GParquetColumnChunkMetadata *metadata)
{
  // Forward to the wrapped object's compressed size.
  return gparquet_column_chunk_metadata_get_raw(metadata)->total_compressed_size();
}
/**
* gparquet_column_chunk_metadata_get_file_offset:
* @metadata: A #GParquetColumnChunkMetadata.
*
* Returns: Byte offset from beginning of file to first page (data or
* dictionary) in this column chunk.
*
* Since: 8.0.0
*/
gint64
gparquet_column_chunk_metadata_get_file_offset(GParquetColumnChunkMetadata *metadata)
{
  // Forward to the wrapped object's file offset.
  return gparquet_column_chunk_metadata_get_raw(metadata)->file_offset();
}
/**
* gparquet_column_chunk_metadata_can_decompress:
* @metadata: A #GParquetColumnChunkMetadata.
*
* Returns: %TRUE if all of the column chunk can be decompressed,
* %FALSE otherwise.
*
* Since: 8.0.0
*/
gboolean
gparquet_column_chunk_metadata_can_decompress(GParquetColumnChunkMetadata *metadata)
{
  // Forward the query to the wrapped parquet::ColumnChunkMetaData.
  return gparquet_column_chunk_metadata_get_raw(metadata)->can_decompress();
}
/**
* gparquet_column_chunk_metadata_get_statistics:
* @metadata: A #GParquetColumnChunkMetadata.
*
* Returns: (transfer full) (nullable): The statistics of this column chunk if
* it's set, %NULL otherwise.
*
* Since: 8.0.0
*/
GParquetStatistics *
gparquet_column_chunk_metadata_get_statistics(GParquetColumnChunkMetadata *metadata)
{
  auto parquet_statistics =
    gparquet_column_chunk_metadata_get_raw(metadata)->statistics();

  // No statistics recorded for this column chunk.
  if (!parquet_statistics) {
    return nullptr;
  }

  // Wrap the statistics handle in a new GParquetStatistics.
  return gparquet_statistics_new_raw(&parquet_statistics);
}
// Instance-private state of GParquetRowGroupMetadata.
struct GParquetRowGroupMetadataPrivate
{
// Wrapped parquet::RowGroupMetaData, stored from the construct-only
// "metadata" property.
// NOTE(review): nothing in this file deletes this pointer, although
// gparquet_file_metadata_get_row_group() passes in a pointer released from
// a std::unique_ptr -- confirm who is expected to free it.
parquet::RowGroupMetaData *metadata;
// File metadata wrapper this row group was obtained from. A reference is
// taken when the "owner" property is set and dropped by
// gparquet_row_group_metadata_dispose().
GParquetFileMetadata *owner;
};
// Defines the GParquetRowGroupMetadata type (derived from G_TYPE_OBJECT)
// with GParquetRowGroupMetadataPrivate as its private instance data.
G_DEFINE_TYPE_WITH_PRIVATE(GParquetRowGroupMetadata,
gparquet_row_group_metadata,
G_TYPE_OBJECT)
// Yields the GParquetRowGroupMetadataPrivate * of @object.
#define GPARQUET_ROW_GROUP_METADATA_GET_PRIVATE(object) \
static_cast<GParquetRowGroupMetadataPrivate *>( \
gparquet_row_group_metadata_get_instance_private( \
GPARQUET_ROW_GROUP_METADATA(object)))
static void
gparquet_row_group_metadata_dispose(GObject *object)
{
  // Release the reference held on the owning file metadata. The slot is
  // cleared afterwards so a repeated call cannot unreference it twice.
  auto *self = GPARQUET_ROW_GROUP_METADATA_GET_PRIVATE(object);
  auto *&owner_slot = self->owner;
  if (owner_slot != nullptr) {
    g_object_unref(owner_slot);
    owner_slot = nullptr;
  }

  // Chain up to the parent class.
  auto *parent_class = G_OBJECT_CLASS(gparquet_row_group_metadata_parent_class);
  parent_class->dispose(object);
}
static void
gparquet_row_group_metadata_set_property(GObject *object,
                                         guint prop_id,
                                         const GValue *value,
                                         GParamSpec *pspec)
{
  // Stores the construct-only "metadata" and "owner" properties into the
  // private instance data; any other property ID is reported as invalid.
  auto *self = GPARQUET_ROW_GROUP_METADATA_GET_PRIVATE(object);

  if (prop_id == PROP_METADATA) {
    // The raw parquet pointer travels through the GValue as a gpointer.
    auto *raw = g_value_get_pointer(value);
    self->metadata = static_cast<parquet::RowGroupMetaData *>(raw);
  } else if (prop_id == PROP_OWNER) {
    // g_value_dup_object() takes a new reference on the owner.
    self->owner = GPARQUET_FILE_METADATA(g_value_dup_object(value));
  } else {
    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
  }
}
// Instance initializer. Intentionally empty: both private fields are filled
// in by the construct-only "metadata" and "owner" properties.
static void
gparquet_row_group_metadata_init(GParquetRowGroupMetadata *object)
{
}
static void
gparquet_row_group_metadata_class_init(GParquetRowGroupMetadataClass *klass)
{
  // Class initializer: installs the virtual functions and the two
  // construct-only properties ("metadata" and "owner").
  auto gobject_class = G_OBJECT_CLASS(klass);

  // gparquet_row_group_metadata_dispose() chains up to the parent's
  // dispose(), so it belongs in the dispose slot. Installing it as finalize
  // would leave the parent's finalize() unchained and skip the dispose
  // phase for this class.
  gobject_class->dispose = gparquet_row_group_metadata_dispose;
  gobject_class->set_property = gparquet_row_group_metadata_set_property;

  GParamSpec *spec;
  // Raw parquet::RowGroupMetaData pointer; write-only, construct-only.
  spec = g_param_spec_pointer(
    "metadata",
    "Metadata",
    "The raw parquet::RowGroupMetaData *",
    static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY));
  g_object_class_install_property(gobject_class, PROP_METADATA, spec);

  // Owning GParquetFileMetadata; write-only, construct-only.
  spec = g_param_spec_object(
    "owner",
    "Owner",
    "The file group metadata that owns this metadata",
    GPARQUET_TYPE_FILE_METADATA,
    static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY));
  g_object_class_install_property(gobject_class, PROP_OWNER, spec);
}
/**
* gparquet_row_group_metadata_equal:
* @metadata: A #GParquetRowGroupMetadata.
* @other_metadata: A #GParquetRowGroupMetadata.
*
* Returns: %TRUE if both of them have the same data, %FALSE
* otherwise.
*
* Since: 8.0.0
*/
gboolean
gparquet_row_group_metadata_equal(GParquetRowGroupMetadata *metadata,
                                  GParquetRowGroupMetadata *other_metadata)
{
  // Compare the two wrapped parquet::RowGroupMetaData objects.
  auto *lhs = gparquet_row_group_metadata_get_raw(metadata);
  auto *rhs = gparquet_row_group_metadata_get_raw(other_metadata);
  return lhs->Equals(*rhs);
}
/**
* gparquet_row_group_metadata_get_n_columns:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: The number of columns in this row group. The order must
* match the parent's column ordering.
*
* Since: 8.0.0
*/
gint
gparquet_row_group_metadata_get_n_columns(GParquetRowGroupMetadata *metadata)
{
  // Forward to the wrapped parquet::RowGroupMetaData.
  return gparquet_row_group_metadata_get_raw(metadata)->num_columns();
}
/**
* gparquet_row_group_metadata_get_column_chunk:
* @metadata: A #GParquetRowGroupMetadata.
* @index: An index of the column chunk to retrieve.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (transfer full) (nullable): A #GParquetColumnChunkMetadata
* at @index on success, %NULL on error.
*
* Since: 8.0.0
*/
GParquetColumnChunkMetadata *
gparquet_row_group_metadata_get_column_chunk(GParquetRowGroupMetadata *metadata,
                                             gint index,
                                             GError **error)
{
  auto *row_group = gparquet_row_group_metadata_get_raw(metadata);
  std::unique_ptr<parquet::ColumnChunkMetaData> column_chunk;

  // Fetch the column chunk inside the catch macros so that an exception
  // raised by ColumnChunk() comes back as an arrow::Status.
  auto fetch = [&] {
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    column_chunk = row_group->ColumnChunk(index);
    return arrow::Status::OK();
    END_PARQUET_CATCH_EXCEPTIONS
  };
  auto status = fetch();

  // On failure garrow::check() has been handed @error; nothing to wrap.
  if (!garrow::check(error, status, "[parquet][row-group-metadata][get-column-chunk]")) {
    return nullptr;
  }

  // Hand the raw pointer to the wrapper, with this row group as its owner.
  return gparquet_column_chunk_metadata_new_raw(column_chunk.release(), metadata);
}
/**
* gparquet_row_group_metadata_get_n_rows:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: The number of rows in this row group.
*
* Since: 8.0.0
*/
gint64
gparquet_row_group_metadata_get_n_rows(GParquetRowGroupMetadata *metadata)
{
  // Forward to the wrapped parquet::RowGroupMetaData.
  return gparquet_row_group_metadata_get_raw(metadata)->num_rows();
}
/**
* gparquet_row_group_metadata_get_total_size:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: Total byte size of all the uncompressed column data in
* this row group.
*
* Since: 8.0.0
*/
gint64
gparquet_row_group_metadata_get_total_size(GParquetRowGroupMetadata *metadata)
{
  // Forward to the wrapped object's total byte size.
  return gparquet_row_group_metadata_get_raw(metadata)->total_byte_size();
}
/**
* gparquet_row_group_metadata_get_total_compressed_size:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: Total byte size of all the compressed (and potentially
* encrypted) column data in this row group.
*
* Since: 8.0.0
*/
gint64
gparquet_row_group_metadata_get_total_compressed_size(GParquetRowGroupMetadata *metadata)
{
  // Forward to the wrapped object's compressed size.
  return gparquet_row_group_metadata_get_raw(metadata)->total_compressed_size();
}
/**
* gparquet_row_group_metadata_get_file_offset:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: Byte offset from beginning of file to first page (data or
* dictionary) in this row group.
*
* The `file_offset` field that this method exposes is
* optional. This method will return 0 if that field is not set to a
* meaningful value.
*
* Since: 8.0.0
*/
gint64
gparquet_row_group_metadata_get_file_offset(GParquetRowGroupMetadata *metadata)
{
  // Forward to the wrapped object's file offset.
  return gparquet_row_group_metadata_get_raw(metadata)->file_offset();
}
/**
* gparquet_row_group_metadata_can_decompress:
* @metadata: A #GParquetRowGroupMetadata.
*
* Returns: %TRUE if all of the row group's column chunks can be
* decompressed, %FALSE otherwise.
*
* Since: 8.0.0
*/
gboolean
gparquet_row_group_metadata_can_decompress(GParquetRowGroupMetadata *metadata)
{
  // Forward the query to the wrapped parquet::RowGroupMetaData.
  return gparquet_row_group_metadata_get_raw(metadata)->can_decompress();
}
// Instance-private state of GParquetFileMetadata.
struct GParquetFileMetadataPrivate
{
// Shared handle to the wrapped parquet::FileMetaData. It is constructed
// with placement new in gparquet_file_metadata_init() and destroyed
// explicitly in gparquet_file_metadata_finalize().
std::shared_ptr<parquet::FileMetaData> metadata;
};
// Defines the GParquetFileMetadata type (derived from G_TYPE_OBJECT) with
// GParquetFileMetadataPrivate as its private instance data.
G_DEFINE_TYPE_WITH_PRIVATE(GParquetFileMetadata, gparquet_file_metadata, G_TYPE_OBJECT)
// Yields the GParquetFileMetadataPrivate * of @object.
#define GPARQUET_FILE_METADATA_GET_PRIVATE(object) \
static_cast<GParquetFileMetadataPrivate *>( \
gparquet_file_metadata_get_instance_private(GPARQUET_FILE_METADATA(object)))
static void
gparquet_file_metadata_finalize(GObject *object)
{
  using MetadataPtr = std::shared_ptr<parquet::FileMetaData>;

  // The shared_ptr was created with placement new in init(), so it has to
  // be destroyed by an explicit destructor call here.
  auto *self = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
  self->metadata.~MetadataPtr();

  // Chain up to the parent class.
  auto *parent_class = G_OBJECT_CLASS(gparquet_file_metadata_parent_class);
  parent_class->finalize(object);
}
static void
gparquet_file_metadata_set_property(GObject *object,
                                    guint prop_id,
                                    const GValue *value,
                                    GParamSpec *pspec)
{
  auto *self = GPARQUET_FILE_METADATA_GET_PRIVATE(object);

  // "metadata" is the only property this class accepts.
  if (prop_id != PROP_METADATA) {
    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
    return;
  }

  // The GValue carries a pointer to a shared_ptr; copy that shared_ptr so
  // this instance keeps its own handle to the parquet::FileMetaData.
  auto *source =
    static_cast<std::shared_ptr<parquet::FileMetaData> *>(g_value_get_pointer(value));
  self->metadata = *source;
}
static void
gparquet_file_metadata_init(GParquetFileMetadata *object)
{
  using MetadataPtr = std::shared_ptr<parquet::FileMetaData>;

  // Construct an empty shared_ptr in place inside the private data;
  // finalize() destroys it explicitly.
  auto *self = GPARQUET_FILE_METADATA_GET_PRIVATE(object);
  new (&self->metadata) MetadataPtr;
}
static void
gparquet_file_metadata_class_init(GParquetFileMetadataClass *klass)
{
  // Hook up the GObject virtual functions implemented in this file.
  auto *object_class = G_OBJECT_CLASS(klass);
  object_class->finalize = gparquet_file_metadata_finalize;
  object_class->set_property = gparquet_file_metadata_set_property;

  // "metadata" is write-only and can only be given at construction.
  const auto construct_only_flags =
    static_cast<GParamFlags>(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY);
  auto *metadata_spec = g_param_spec_pointer("metadata",
                                             "Metadata",
                                             "The raw std::shared_ptr<parquet::FileMetaData>",
                                             construct_only_flags);
  g_object_class_install_property(object_class, PROP_METADATA, metadata_spec);
}
/**
* gparquet_file_metadata_equal:
* @metadata: A #GParquetFileMetadata.
* @other_metadata: A #GParquetFileMetadata.
*
* Returns: %TRUE if both of them have the same data, %FALSE
* otherwise.
*
* Since: 8.0.0
*/
gboolean
gparquet_file_metadata_equal(GParquetFileMetadata *metadata,
                             GParquetFileMetadata *other_metadata)
{
  // Compare the two wrapped parquet::FileMetaData objects.
  auto lhs = gparquet_file_metadata_get_raw(metadata);
  auto rhs = gparquet_file_metadata_get_raw(other_metadata);
  return lhs->Equals(*rhs);
}
/**
* gparquet_file_metadata_get_n_columns:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The number of top-level columns in the schema.
*
* Parquet thrift definition requires that nested schema elements are
* flattened. This method returns the number of columns in the un-flattened
* version.
*
* Since: 8.0.0
*/
gint
gparquet_file_metadata_get_n_columns(GParquetFileMetadata *metadata)
{
  // Forward to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->num_columns();
}
/**
* gparquet_file_metadata_get_n_schema_elements:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The number of flattened schema elements.
*
* Parquet thrift definition requires that nested schema elements are
* flattened. This method returns the total number of elements in the
* flattened list.
*
* Since: 8.0.0
*/
gint
gparquet_file_metadata_get_n_schema_elements(GParquetFileMetadata *metadata)
{
  // Forward to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->num_schema_elements();
}
/**
* gparquet_file_metadata_get_n_rows:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The total number of rows.
*
* Since: 8.0.0
*/
gint64
gparquet_file_metadata_get_n_rows(GParquetFileMetadata *metadata)
{
  // Forward to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->num_rows();
}
/**
* gparquet_file_metadata_get_n_row_groups:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The number of row groups in the file.
*
* Since: 8.0.0
*/
gint
gparquet_file_metadata_get_n_row_groups(GParquetFileMetadata *metadata)
{
  // Forward to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->num_row_groups();
}
/**
* gparquet_file_metadata_get_row_group:
* @metadata: A #GParquetFileMetadata.
* @index: An index of the row group to retrieve.
* @error: (nullable): Return location for a #GError or %NULL.
*
* Returns: (transfer full) (nullable): A #GParquetRowGroupMetadata
* at @index on success, %NULL on error.
*
* Since: 8.0.0
*/
GParquetRowGroupMetadata *
gparquet_file_metadata_get_row_group(GParquetFileMetadata *metadata,
                                     gint index,
                                     GError **error)
{
  auto file_metadata = gparquet_file_metadata_get_raw(metadata);
  std::unique_ptr<parquet::RowGroupMetaData> row_group;

  // Fetch the row group inside the catch macros so that an exception raised
  // by RowGroup() comes back as an arrow::Status.
  auto fetch = [&] {
    BEGIN_PARQUET_CATCH_EXCEPTIONS
    row_group = file_metadata->RowGroup(index);
    return arrow::Status::OK();
    END_PARQUET_CATCH_EXCEPTIONS
  };
  auto status = fetch();

  // On failure garrow::check() has been handed @error; nothing to wrap.
  if (!garrow::check(error, status, "[parquet][file-metadata][get-row-group]")) {
    return nullptr;
  }

  // Hand the raw pointer to the wrapper, with this file metadata as owner.
  return gparquet_row_group_metadata_new_raw(row_group.release(), metadata);
}
/**
* gparquet_file_metadata_get_created_by:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The application's user-agent string of the writer.
*
* Since: 8.0.0
*/
const gchar *
gparquet_file_metadata_get_created_by(GParquetFileMetadata *metadata)
{
  // Return the C string of the created_by() value of the wrapped
  // parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->created_by().c_str();
}
/**
* gparquet_file_metadata_get_size:
* @metadata: A #GParquetFileMetadata.
*
* Returns: The size of the original thrift encoded metadata footer.
*
* Since: 8.0.0
*/
guint32
gparquet_file_metadata_get_size(GParquetFileMetadata *metadata)
{
  // Forward to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->size();
}
/**
* gparquet_file_metadata_can_decompress:
* @metadata: A #GParquetFileMetadata.
*
* Returns: %TRUE if all of the row groups can be decompressed, %FALSE
* otherwise.
*
* This will return %FALSE if any page in any row group is
* compressed with a compression format that is not compiled into the
* current Parquet library.
*
* Since: 8.0.0
*/
gboolean
gparquet_file_metadata_can_decompress(GParquetFileMetadata *metadata)
{
  // Forward the query to the wrapped parquet::FileMetaData.
  return gparquet_file_metadata_get_raw(metadata)->can_decompress();
}
G_END_DECLS
GParquetColumnChunkMetadata *
gparquet_column_chunk_metadata_new_raw(parquet::ColumnChunkMetaData *parquet_metadata,
                                       GParquetRowGroupMetadata *owner)
{
  // Build the wrapper by passing the raw pointer and the owning row group
  // through the construct-only "metadata" and "owner" properties.
  auto *instance = g_object_new(GPARQUET_TYPE_COLUMN_CHUNK_METADATA,
                                "metadata",
                                parquet_metadata,
                                "owner",
                                owner,
                                NULL);
  return GPARQUET_COLUMN_CHUNK_METADATA(instance);
}
parquet::ColumnChunkMetaData *
gparquet_column_chunk_metadata_get_raw(GParquetColumnChunkMetadata *metadata)
{
  // Expose the wrapped pointer stored in the private instance data.
  return GPARQUET_COLUMN_CHUNK_METADATA_GET_PRIVATE(metadata)->metadata;
}
GParquetRowGroupMetadata *
gparquet_row_group_metadata_new_raw(parquet::RowGroupMetaData *parquet_metadata,
                                    GParquetFileMetadata *owner)
{
  // Build the wrapper by passing the raw pointer and the owning file
  // metadata through the construct-only "metadata" and "owner" properties.
  auto *instance = g_object_new(GPARQUET_TYPE_ROW_GROUP_METADATA,
                                "metadata",
                                parquet_metadata,
                                "owner",
                                owner,
                                NULL);
  return GPARQUET_ROW_GROUP_METADATA(instance);
}
parquet::RowGroupMetaData *
gparquet_row_group_metadata_get_raw(GParquetRowGroupMetadata *metadata)
{
  // Expose the wrapped pointer stored in the private instance data.
  return GPARQUET_ROW_GROUP_METADATA_GET_PRIVATE(metadata)->metadata;
}
GParquetFileMetadata *
gparquet_file_metadata_new_raw(std::shared_ptr<parquet::FileMetaData> *parquet_metadata)
{
  // Build the wrapper; set_property copies the shared_ptr that
  // @parquet_metadata points to.
  auto *instance =
    g_object_new(GPARQUET_TYPE_FILE_METADATA, "metadata", parquet_metadata, NULL);
  return GPARQUET_FILE_METADATA(instance);
}
std::shared_ptr<parquet::FileMetaData>
gparquet_file_metadata_get_raw(GParquetFileMetadata *metadata)
{
  // Return a copy of the shared handle stored in the private instance data.
  return GPARQUET_FILE_METADATA_GET_PRIVATE(metadata)->metadata;
}