misc/reference_datasets/monolingual/zh/download_mapcc.py [46:89]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class JsonlPartReader(JsonlReader):
    def __init__(
            self,
            data_folder,
            adapter=None,
            text_key: str = "text",
            id_key: str = "id",
            default_metadata: dict | None = None,
            recursive: bool = True,
            glob_pattern: str | None = None,
    ) -> None:
        """
            Builds the reader by delegating to the JsonlReader constructor.

            This override adds no logic of its own: every argument is passed
            through unchanged to the parent ``__init__``.

            NOTE(review): ``read_files_shard`` reads ``self.limit``,
            ``self.doc_progress`` and ``self.file_progress``, none of which are
            accepted or set here -- presumably they come from parent defaults;
            confirm against JsonlReader.
        Args:
            data_folder: forwarded positionally as the parent's first argument
            adapter: forwarded as ``adapter`` (None by default)
            text_key: forwarded as ``text_key``; presumably the record field
                holding the document text -- confirm against JsonlReader
            id_key: forwarded as ``id_key``; presumably the record field
                holding the document id -- confirm against JsonlReader
            default_metadata: forwarded as ``default_metadata``; None by default
            recursive: forwarded as ``recursive``
            glob_pattern: forwarded as ``glob_pattern``; None by default

        """
        super().__init__(
            data_folder,
            adapter=adapter,
            text_key=text_key,
            id_key=id_key,
            default_metadata=default_metadata,
            recursive=recursive,
            glob_pattern=glob_pattern,
        )

    def read_files_shard(self, shard: list[str]):
        """
            Reads a list of files and yield Documents
        Args:
            shard: a list of file paths

        Returns: generator of Document

        """
        from tqdm import tqdm
        li = 0
        skipped = 0
        with (
            tqdm(
                total=self.limit if self.limit != -1 else None,
                desc="Document progress",
                unit="doc",
                disable=not self.doc_progress,
            ) as doc_pbar,
            tqdm(total=len(shard), desc="File progress", unit="file", disable=not self.file_progress) as file_pbar,
        ):
            for i, filepath in enumerate(shard):
                self.stat_update("input_files")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



misc/reference_datasets/multilingual/part jsons.py [56:98]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class JsonlPartReader(JsonlReader):
    def __init__(
            self,
            data_folder,
            adapter=None,
            text_key: str = "text",
            id_key: str = "id",
            default_metadata: dict | None = None,
            recursive: bool = True,
            glob_pattern: str | None = None,
    ) -> None:
        """
            Builds the reader by delegating to the JsonlReader constructor.

            This override adds no logic of its own: every argument is passed
            through unchanged to the parent ``__init__``.

            NOTE(review): ``read_files_shard`` reads ``self.limit``,
            ``self.doc_progress`` and ``self.file_progress``, none of which are
            accepted or set here -- presumably they come from parent defaults;
            confirm against JsonlReader.
        Args:
            data_folder: forwarded positionally as the parent's first argument
            adapter: forwarded as ``adapter`` (None by default)
            text_key: forwarded as ``text_key``; presumably the record field
                holding the document text -- confirm against JsonlReader
            id_key: forwarded as ``id_key``; presumably the record field
                holding the document id -- confirm against JsonlReader
            default_metadata: forwarded as ``default_metadata``; None by default
            recursive: forwarded as ``recursive``
            glob_pattern: forwarded as ``glob_pattern``; None by default

        """
        super().__init__(
            data_folder,
            adapter=adapter,
            text_key=text_key,
            id_key=id_key,
            default_metadata=default_metadata,
            recursive=recursive,
            glob_pattern=glob_pattern,
        )

    def read_files_shard(self, shard: list[str]):
        """
            Reads a list of files and yield Documents
        Args:
            shard: a list of file paths

        Returns: generator of Document

        """
        li = 0
        skipped = 0
        with (
            tqdm(
                total=self.limit if self.limit != -1 else None,
                desc="Document progress",
                unit="doc",
                disable=not self.doc_progress,
            ) as doc_pbar,
            tqdm(total=len(shard), desc="File progress", unit="file", disable=not self.file_progress) as file_pbar,
        ):
            for i, filepath in enumerate(shard):
                self.stat_update("input_files")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



