def __init__()

in pyspark_huggingface/huggingface_source.py


    def __init__(self, options):
        super().__init__(options)
        import ast

        from datasets import load_dataset_builder
        from huggingface_hub import get_token

        # The "path" option is the dataset repository name on the Hugging Face Hub.
        if "path" not in options or not options["path"]:
            raise ValueError("You must specify a dataset name.")

        # Pop the options this reader handles itself; whatever remains is
        # forwarded to load_dataset_builder below.
        kwargs = dict(self.options)
        self.dataset_name = kwargs.pop("path")
        self.config_name = kwargs.pop("config", None)
        self.split = kwargs.pop("split", self.DEFAULT_SPLIT)
        self.revision = kwargs.pop("revision", None)
        self.streaming = kwargs.pop("streaming", "true").lower() == "true"
        self.token = kwargs.pop("token", None) or get_token()
        self.endpoint = kwargs.pop("endpoint", None)
        # Spark passes every option value as a string: coerce "true"/"false"
        # to booleans, then try ast.literal_eval for numbers, lists, etc.,
        # leaving values that don't parse as Python literals untouched.
        for arg in kwargs:
            if kwargs[arg].lower() == "true":
                kwargs[arg] = True
            elif kwargs[arg].lower() == "false":
                kwargs[arg] = False
            else:
                try:
                    kwargs[arg] = ast.literal_eval(kwargs[arg])
                except (ValueError, SyntaxError):
                    pass

        # Raise the right error if the dataset doesn't exist
        api = self._get_api()
        api.repo_info(self.dataset_name, repo_type="dataset", revision=self.revision)

        self.builder = load_dataset_builder(
            self.dataset_name,
            self.config_name,
            token=self.token,
            revision=self.revision,
            **kwargs,
        )
        streaming_dataset = self.builder.as_streaming_dataset()
        if self.split not in streaming_dataset:
            raise ValueError(f"Split {self.split} is invalid. Valid options are {list(streaming_dataset)}")

        self.streaming_dataset = streaming_dataset[self.split]
        # If the features (schema) are not declared in the dataset metadata,
        # infer them by inspecting the beginning of the stream.
        if not self.streaming_dataset.features:
            self.streaming_dataset = self.streaming_dataset._resolve_features()
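
For context, the options dict here is populated by Spark's DataFrame reader. Below is a minimal usage sketch, assuming the package is installed and the data source is registered with Spark under the "huggingface" format name; the dataset name and option values are illustrative:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("hf-demo").getOrCreate()

    # Each .option() value arrives in `options` as a string; "path" is set by
    # .load(). "config" and "split" are popped explicitly in __init__, and any
    # other option falls through to load_dataset_builder via **kwargs.
    df = (
        spark.read.format("huggingface")
        .option("config", "plain_text")
        .option("split", "train")
        .load("stanfordnlp/imdb")
    )
    df.show(5)

Note how the string coercion loop matters here: an extra option such as .option("trust_remote_code", "true") would be converted to the boolean True before being passed on to the builder.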