in petastorm/etl/rowgroup_indexers.py [0:0]
def build_index(self, decoded_rows, piece_index):
    """Record ``piece_index`` under every value found in the indexed column.

    The column is read from each row as ``row[self._column_name]``. ``None``
    values are skipped. A ``numpy.ndarray`` value contributes one index entry
    per element (for any number of dimensions); every other value is used as
    an index key directly.

    :param decoded_rows: Iterable of rows that can be subscripted with the
        column name held in ``self._column_name``.
    :param piece_index: Value added to the entry of every key seen in
        ``decoded_rows``.
    :return: ``self._index_data``, which is updated in place.
        NOTE(review): presumably a ``defaultdict(set)``-like mapping, since
        entries are accessed without prior insertion -- confirm against the
        class constructor.
    :raises ValueError: If ``decoded_rows`` yields no rows.
    """
    field_column = [row[self._column_name] for row in decoded_rows]
    if not field_column:
        raise ValueError("Cannot build index for empty rows, column '{}'"
                         .format(self._column_name))

    for field_val in field_column:
        if field_val is None:
            # Missing values are not indexed.
            continue

        if isinstance(field_val, np.ndarray):
            # Index each array element. ``.flat`` visits the elements of an
            # array of any rank; iterating the array itself would yield
            # unhashable sub-arrays for multi-dimensional input and raise
            # for 0-d input.
            for val in field_val.flat:
                self._index_data[val].add(piece_index)
        else:
            # Scalar (non-array) value: use it as the key directly.
            self._index_data[field_val].add(piece_index)

    return self._index_data