in misc/reference_datasets/multilingual/copy_raw_data.py [0:0]
def read_file(self, filepath: str):
    """Lazily read one JSON-lines file and yield the documents built from it.

    The file is opened through ``self.data_folder`` using ``self.compression``.
    Each line is parsed with ``orjson.loads`` and handed, together with the
    file path and the zero-based line index, to
    ``self.get_document_from_dict``.

    Reading is best-effort and never raises for a bad file:

    * a line that raises ``EOFError`` or ``JSONDecodeError`` is logged and
      skipped, and reading continues with the next line;
    * a falsy result from ``get_document_from_dict`` is skipped silently;
    * any other exception raised while iterating (``UnicodeDecodeError``,
      decompression failures, anything unexpected) is logged as a warning
      and ends the reading of this file.

    Args:
        filepath: Path of the file to read, as understood by
            ``self.data_folder.open``.

    Yields:
        Every truthy value returned by ``self.get_document_from_dict``.
    """
    # Imported locally, as in the original block, so the dependencies are only
    # required when a file is actually read.
    import orjson
    from loguru import logger
    from orjson import JSONDecodeError

    with self.data_folder.open(filepath, "r", compression=self.compression) as f:
        try:
            for li, line in enumerate(f):
                with self.track_time():
                    try:
                        document = self.get_document_from_dict(orjson.loads(line), filepath, li)
                        if not document:
                            continue
                    except (EOFError, JSONDecodeError) as e:
                        # A single malformed line must not abort the whole file.
                        logger.warning(f"Error when reading `{filepath}`: {e}")
                        continue
                # NOTE(review): the yield is placed after the timed block so the
                # consumer's work is presumably not counted by track_time -- confirm.
                yield document
        except UnicodeDecodeError as e:
            logger.warning(f"File `{filepath}` may be corrupted: raised UnicodeDecodeError ({e})")
        except Exception as e:
            # Deliberate catch-all: one broken file should be reported and
            # skipped rather than crash the caller.
            if "Error -3 while decompressing data" in str(e):
                logger.warning(f"CORRUPTED `{filepath}`: {e}")
            else:
                # Name the file and the exception type so an unexpected failure
                # can be traced back to its source.
                logger.warning(f"Unknown error when reading `{filepath}`: {type(e).__name__}: {e}")