in src/datasets/packaged_modules/text/text.py [0:0]
def _generate_tables(self, files):
    """Yield ``(key, table)`` pairs with the text read from ``files``.

    ``files`` is an iterable of iterables; it is flattened and every item is
    passed to ``open`` with ``self.config.encoding`` and
    ``self.config.encoding_errors``. How a file is turned into rows depends
    on ``self.config.sample_by``:

    * ``"line"``: one row per line. The file is consumed in reads of
      ``self.config.chunksize`` characters, each extended to the end of the
      line, and one table is yielded per read. Line breaks are stripped
      unless ``self.config.keep_linebreaks`` is true.
    * ``"paragraph"``: one row per non-empty span delimited by two
      consecutive newline characters. One table is yielded per read, plus a
      final table for any text left after the last delimiter.
    * ``"document"``: a single row holding the whole file.

    Keys are ``(file_idx, batch_idx)`` for the first two modes and the bare
    ``file_idx`` for ``"document"``. Any other ``sample_by`` value yields
    nothing for the file. The single column is named after
    ``self.config.features`` when set, otherwise ``"text"``, and every table
    goes through ``self._cast_table`` before being yielded.
    """
    if self.config.features is None:
        column_names = ["text"]
    else:
        column_names = list(self.config.features)

    for file_idx, path in enumerate(itertools.chain.from_iterable(files)):
        # Text mode applies universal-newline translation by default, so
        # "\r\n" and "\r" reach us as "\n".
        with open(path, encoding=self.config.encoding, errors=self.config.encoding_errors) as stream:
            mode = self.config.sample_by

            if mode == "line":
                batch_ids = itertools.count()
                # Keep reading until read() returns the empty string (end of file).
                for chunk in iter(lambda: stream.read(self.config.chunksize), ""):
                    # Read on to the next line break so no line is split across batches.
                    chunk += stream.readline()
                    # StringIO.readlines splits on "\n" only and keeps the terminator.
                    lines = StringIO(chunk).readlines()
                    if self.config.keep_linebreaks:
                        rows = lines
                    else:
                        rows = [line.rstrip("\n") for line in lines]
                    table = pa.Table.from_arrays([pa.array(rows)], names=column_names)
                    yield (file_idx, next(batch_ids)), self._cast_table(table)

            elif mode == "paragraph":
                batch_ids = itertools.count()
                # Text after the last delimiter seen so far; it may still be
                # continued by the next read, so it is carried forward.
                pending = ""
                for chunk in iter(lambda: stream.read(self.config.chunksize), ""):
                    pending += chunk
                    pending += stream.readline()  # finish the current line
                    *paragraphs, pending = pending.split("\n\n")
                    rows = [paragraph for paragraph in paragraphs if paragraph]
                    table = pa.Table.from_arrays([pa.array(rows)], names=column_names)
                    yield (file_idx, next(batch_ids)), self._cast_table(table)
                # Whatever remains at end of file is the last paragraph.
                if pending:
                    table = pa.Table.from_arrays([pa.array([pending])], names=column_names)
                    yield (file_idx, next(batch_ids)), self._cast_table(table)

            elif mode == "document":
                content = stream.read()
                table = pa.Table.from_arrays([pa.array([content])], names=column_names)
                yield file_idx, self._cast_table(table)