in misc/reference_datasets/monolingual/ar/download_arabicweb24.py [0:0]
def read_file(self, filepath: str):
    """Read one Arrow IPC stream file and yield the documents built from its rows.

    The whole stream is loaded into memory as a table, converted to a pandas
    DataFrame, and every row is handed (as a dict) to
    ``self.get_document_from_dict``. Rows for which that call returns a falsy
    value are skipped.

    Args:
        filepath: Path of the file; opened in binary mode via ``self.data_folder``.

    Yields:
        The truthy values returned by ``self.get_document_from_dict``, in row order.
    """
    import pyarrow as pa

    # Without metadata only the text and id columns are needed; None means "all".
    # dict.fromkeys de-duplicates (text_key may equal id_key) while keeping order.
    wanted = None if self.read_metadata else list(dict.fromkeys([self.text_key, self.id_key]))
    documents = []
    with self.data_folder.open(filepath, "rb") as f:
        reader = pa.ipc.open_stream(f)
        with self.track_time("table"):
            table = reader.read_all()
            if wanted is not None:
                # NOTE: `categories=` on read_pandas()/to_pandas() only converts the
                # listed columns to pandas.Categorical -- it does not drop the other
                # columns. Project explicitly instead, ignoring names absent from
                # the file so a missing id column does not raise.
                available = set(table.schema.names)
                table = table.select([name for name in wanted if name in available])
            df = table.to_pandas()

            # `li` is the index among *kept* documents: skipped rows do not consume one.
            li = 0
            for _, row in df.iterrows():
                document = self.get_document_from_dict(row.to_dict(), filepath, li)
                if not document:
                    continue
                documents.append(document)
                li += 1
    # Everything is materialized, so the file can be closed before handing
    # control back to the consumer.
    yield from documents