in petastorm/unischema.py [0:0]
def from_arrow_schema(cls, parquet_dataset, omit_unsupported_fields=False):
    """
    Infer a unischema object from the arrow schema of a parquet dataset. This is useful for datasets of
    only scalars which need no special encoding/decoding.

    The arrow schema is taken from the metadata of the first piece of the dataset. The resulting schema
    lists the partition columns first, followed by the columns found in the arrow schema.

    We do not set codec field in the generated fields since all parquet fields are out-of-the-box supported
    by pyarrow and we do not need perform any custom decoding.

    Columns holding a list of lists or a list of structs are always skipped with a warning, regardless of
    the value of ``omit_unsupported_fields``.

    :param parquet_dataset: A parquet dataset object exposing ``pieces``, ``fs`` and ``partitions``
      attributes (presumably a :class:`pyarrow.parquet.ParquetDataset`). ``partitions`` may be ``None``,
      which is treated as "no partition columns".
    :param omit_unsupported_fields: :class:`Boolean`. If ``True``, a column whose arrow type can not be
      converted is skipped and a warning is issued. If ``False``, the ``ValueError`` raised by the type
      conversion is propagated to the caller.
    :return: A :class:`Unischema` object named ``'inferred_schema'``.
    :raises RuntimeError: If a partition column type is neither string nor int64.
    :raises ValueError: If a column has an unsupported type and ``omit_unsupported_fields`` is ``False``.
    """
    meta = compat_get_metadata(parquet_dataset.pieces[0], parquet_dataset.fs.open)
    arrow_schema = meta.schema.to_arrow_schema()
    unischema_fields = []

    # ``partitions`` can be None (rather than an empty collection) for datasets that carry no partition
    # information; iterating over None would raise a TypeError, so treat it as "no partitions".
    partitions = parquet_dataset.partitions
    if partitions is None:
        partitions = []

    for partition in partitions:
        partition_type = partition.dictionary.type
        # Partition values are accepted as binary under Python 2 and as string under Python 3.
        if (pa.types.is_binary(partition_type) and six.PY2) or \
                (pa.types.is_string(partition_type) and six.PY3):
            numpy_dtype = np.str_
        elif pa.types.is_int64(partition_type):
            numpy_dtype = np.int64
        else:
            raise RuntimeError(('Expected partition type to be one of currently supported types: string or int64. '
                                'Got {}').format(partition_type))

        # Partition columns are emitted as scalars (empty shape), without a codec.
        unischema_fields.append(UnischemaField(partition.name, numpy_dtype, (), None, False))

    for column_name in arrow_schema.names:
        arrow_field = compat_schema_field(arrow_schema, column_name)
        field_type = arrow_field.type
        field_shape = ()
        if isinstance(field_type, ListType):
            # Nested lists and lists of structs are not supported (see ARROW-1644): skip the column.
            if isinstance(field_type.value_type, (ListType, pyStructType)):
                warnings.warn('[ARROW-1644] Ignoring unsupported structure %r for field %r'
                              % (field_type, column_name))
                continue
            # A flat list becomes a one-dimensional field of unknown length.
            field_shape = (None,)
        try:
            np_type = _numpy_and_codec_from_arrow_type(field_type)
        except ValueError:
            if not omit_unsupported_fields:
                raise
            warnings.warn('Column %r has an unsupported field %r. Ignoring...'
                          % (column_name, field_type))
            continue
        unischema_fields.append(UnischemaField(column_name, np_type, field_shape, None, arrow_field.nullable))
    return Unischema('inferred_schema', unischema_fields)