in src/datasets/arrow_writer.py [0:0]
def write_examples_on_file(self):
    """Flush the buffered examples to the file as one batch.

    The ``(example, key)`` tuples held in ``self.current_examples`` are converted
    into a single column-oriented dict, which is passed to ``self.write_batch``.
    The buffer is reset to an empty list afterwards. Does nothing when the
    buffer is empty.
    """
    pending = self.current_examples
    if not pending:
        return

    # Decide the column order from the first buffered example: schema columns
    # come first (in schema order), followed by columns the schema does not know.
    first_example = pending[0][0]
    if self.schema:
        known = set(self.schema.names)
        example_keys = first_example.keys()  # a keys view keeps insertion order, unlike a set
        cols = [name for name in self.schema.names if name in example_keys]
        cols += [name for name in example_keys if name not in known]
    else:
        cols = list(first_example)

    batch_examples = {}
    for col in cols:
        # Every buffered entry is an (example, key) tuple, hence the `[0]`.
        values = [entry[0][col] for entry in pending]
        arrow_types = (pa.Array, pa.ChunkedArray)

        if all(isinstance(value, arrow_types) for value in values):
            # Examples can be Arrow arrays of 1 element, e.g. in `.map()` when the
            # same Arrow data is re-written: flatten chunked arrays into their
            # chunks and concatenate everything into one array.
            chunks = []
            for value in values:
                if isinstance(value, pa.ChunkedArray):
                    chunks.extend(value.chunks)
                else:
                    chunks.append(value)
            batch_examples[col] = pa.concat_arrays(chunks)
        else:
            # Mixed column: unwrap Arrow values to their first Python element
            # and keep plain Python values as they are.
            batch_examples[col] = [
                value.to_pylist()[0] if isinstance(value, arrow_types) else value
                for value in values
            ]

    self.write_batch(batch_examples=batch_examples)
    self.current_examples = []