in misc/reference_datasets/multilingual/copy_raw_data.py [0:0]
def adapter(self, data: dict, path: str, id_in_file: int | str):
"""
The default data adapter to adapt input data into the datatrove Document format
Args:
data: a dictionary with the "raw" representation of the data
path: file path or source for this sample
id_in_file: its id in this particular file or source
Returns: a dictionary with text, id, media and metadata fields
"""
return {
"text": data.pop(self.text_key, data.pop("content", "")),
"id": data.pop(self.id_key, data.pop("data-id", f"{path}/{id_in_file}")),
"media": data.pop("media", []),
"metadata": data.pop("metadata", {}) | data,
# remaining data goes into metadata
}