def __getitem__(self, item)

in src/nanotron/data/tokenized_bytes.py [0:0]


    def __getitem__(self, item):
        """Return the sample at ``item`` as ``{"input_ids": ...}``.

        Indices past the end wrap around, so ``item`` is first reduced
        modulo ``len(self)`` — the dataset is treated as cyclic rather
        than raising ``IndexError``. Sequential reads reuse the already
        open stream; forward jumps are served (when ``skip_in_stream``
        allows it) by consuming and discarding the intermediate samples,
        which avoids the cost of opening a new stream.
        """
        target = item % len(self)
        prev = self._last_item_requested

        # A forward jump can be satisfied inside the current stream when
        # one is open and in-stream skipping is enabled.
        skippable = bool(self._stream) and self.skip_in_stream and target > prev
        if skippable:
            # Consume and throw away every sample strictly between
            # `prev` and `target`, tracking progress as we go.
            for idx in range(prev + 1, target):
                self._last_item_requested = idx
                self._get_next_from_stream()
        elif not self._stream or target != prev + 1:
            # Not the natural "next" item of an open stream: start a
            # fresh stream positioned directly at the target index.
            self._stream = self._get_new_stream(target)

        self._last_item_requested = target
        return {"input_ids": self._get_next_from_stream()}