def read_file()

in misc/reference_datasets/multilingual/copy_raw_data.py [0:0]


    def read_file(self, filepath: str):
        """Lazily yield the documents parsed from a JSON-lines file.

        The file is opened through ``self.data_folder`` using
        ``self.compression``. Every line is decoded with ``orjson`` and passed
        to ``self.get_document_from_dict`` together with ``filepath`` and the
        zero-based line index; falsy results are skipped.

        Errors raised while iterating over the file are handled best-effort:

        * ``EOFError`` / ``JSONDecodeError`` on a single line: a warning is
          logged and reading continues with the next line.
        * ``UnicodeDecodeError`` or any other ``Exception``: a warning is
          logged and reading of this file stops (documents already yielded
          are unaffected).

        Errors raised by ``self.data_folder.open`` itself are not caught here.

        Args:
            filepath: Path of the file to read, as understood by
                ``self.data_folder.open``.

        Yields:
            Each truthy value returned by ``self.get_document_from_dict``.
        """
        import orjson
        from loguru import logger
        from orjson import JSONDecodeError

        with self.data_folder.open(filepath, "r", compression=self.compression) as f:
            try:
                for li, line in enumerate(f):
                    with self.track_time():
                        try:
                            document = self.get_document_from_dict(orjson.loads(line), filepath, li)
                        except (EOFError, JSONDecodeError) as e:
                            # A single bad line must not abort the whole file.
                            logger.warning(f"Error when reading `{filepath}`: {e}")
                            continue
                        if not document:
                            continue
                    # Yield outside the timing context so that time spent in
                    # the consumer is not included in the tracked block.
                    yield document
            except UnicodeDecodeError as e:
                logger.warning(f"File `{filepath}` may be corrupted: raised UnicodeDecodeError ({e})")
            except Exception as e:
                # Deliberate catch-all: a damaged file is reported and
                # abandoned instead of crashing the caller.
                if "Error -3 while decompressing data" in str(e):
                    # Message text seen when the compressed stream is damaged.
                    logger.warning(f"CORRUPTED `{filepath}`: {e}")
                else:
                    logger.warning(f"Unknown error when reading `{filepath}`: {type(e).__name__}: {e}")