misc/reference_datasets/monolingual/zh/download_mapcc.py [10:44]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class ConcatenatedFileStream:
    """Read-only binary stream that presents several files as one.

    The paths in ``filepaths`` are opened one at a time, in order, with
    ``fsspec.open(path, mode="rb")``. When the current file is exhausted it
    is closed and the next one is opened, so at most one handle is held at
    any moment. The first file is opened eagerly by the constructor.

    The object can be used as a context manager; leaving the ``with`` block
    closes whichever file is currently open.
    """

    def __init__(self, filepaths):
        """Store the ordered paths and open the first one (if any).

        Args:
            filepaths: Indexable, sized collection of paths/URLs that
                ``fsspec.open`` understands.
        """
        self.filepaths = filepaths
        self.file_index = 0  # index of the NEXT path to open
        self.current_file = None  # open handle, or None when exhausted
        self._open_next_file()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _open_next_file(self):
        """Close the current file and advance to the next path.

        Leaves ``current_file`` as ``None`` when every path has been used.
        """
        if self.current_file is not None:
            self.current_file.close()
            # Drop the reference right away so a failure while opening the
            # next file does not leave a closed handle behind.
            self.current_file = None
        if self.file_index < len(self.filepaths):
            path = self.filepaths[self.file_index]
            print(f"opening {path}")
            self.current_file = fsspec.open(path, mode="rb").open()
            self.file_index += 1

    def read(self, size=-1):
        """Read up to ``size`` bytes, crossing file boundaries as needed.

        Args:
            size: Maximum number of bytes to return. A negative value or
                ``None`` reads everything that is left in all files.

        Returns:
            The bytes read; shorter than ``size`` (possibly empty) only when
            all files are exhausted.
        """
        remaining = -1 if size is None else size
        # Collect pieces and join once: repeated ``bytes +=`` re-copies the
        # accumulated data on every chunk.
        chunks = []
        while remaining != 0 and self.current_file is not None:
            chunk = self.current_file.read(remaining)
            if not chunk:  # end of current file
                self._open_next_file()
                continue
            chunks.append(chunk)
            if remaining > 0:
                # Clamp at zero so an over-long chunk cannot flip the
                # budget negative, which would mean "read everything".
                remaining = max(remaining - len(chunk), 0)
        return b"".join(chunks)

    def close(self):
        """Close the file that is currently open, if any."""
        if self.current_file is not None:
            self.current_file.close()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



misc/reference_datasets/multilingual/part jsons.py [9:43]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class ConcatenatedFileStream:
    """Read-only binary stream that presents several files as one.

    The paths in ``filepaths`` are opened one at a time, in order, with
    ``fsspec.open(path, mode="rb")``. When the current file is exhausted it
    is closed and the next one is opened, so at most one handle is held at
    any moment. The first file is opened eagerly by the constructor.

    The object can be used as a context manager; leaving the ``with`` block
    closes whichever file is currently open.
    """

    def __init__(self, filepaths):
        """Store the ordered paths and open the first one (if any).

        Args:
            filepaths: Indexable, sized collection of paths/URLs that
                ``fsspec.open`` understands.
        """
        self.filepaths = filepaths
        self.file_index = 0  # index of the NEXT path to open
        self.current_file = None  # open handle, or None when exhausted
        self._open_next_file()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _open_next_file(self):
        """Close the current file and advance to the next path.

        Leaves ``current_file`` as ``None`` when every path has been used.
        """
        if self.current_file is not None:
            self.current_file.close()
            # Drop the reference right away so a failure while opening the
            # next file does not leave a closed handle behind.
            self.current_file = None
        if self.file_index < len(self.filepaths):
            path = self.filepaths[self.file_index]
            print(f"opening {path}")
            self.current_file = fsspec.open(path, mode="rb").open()
            self.file_index += 1

    def read(self, size=-1):
        """Read up to ``size`` bytes, crossing file boundaries as needed.

        Args:
            size: Maximum number of bytes to return. A negative value or
                ``None`` reads everything that is left in all files.

        Returns:
            The bytes read; shorter than ``size`` (possibly empty) only when
            all files are exhausted.
        """
        remaining = -1 if size is None else size
        # Collect pieces and join once: repeated ``bytes +=`` re-copies the
        # accumulated data on every chunk.
        chunks = []
        while remaining != 0 and self.current_file is not None:
            chunk = self.current_file.read(remaining)
            if not chunk:  # end of current file
                self._open_next_file()
                continue
            chunks.append(chunk)
            if remaining > 0:
                # Clamp at zero so an over-long chunk cannot flip the
                # budget negative, which would mean "read everything".
                remaining = max(remaining - len(chunk), 0)
        return b"".join(chunks)

    def close(self):
        """Close the file that is currently open, if any."""
        if self.current_file is not None:
            self.current_file.close()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



