in pyspark_huggingface/huggingface_source.py [0:0]
def __init__(self, options):
    super().__init__(options)
    import ast

    from datasets import load_dataset_builder
    from huggingface_hub import get_token

    if not options.get("path"):
        raise Exception("You must specify a dataset name.")

    kwargs = dict(self.options)
    self.dataset_name = kwargs.pop("path")
    self.config_name = kwargs.pop("config", None)
    self.split = kwargs.pop("split", self.DEFAULT_SPLIT)
    self.revision = kwargs.pop("revision", None)
    self.streaming = kwargs.pop("streaming", "true").lower() == "true"
    self.token = kwargs.pop("token", None) or get_token()
    self.endpoint = kwargs.pop("endpoint", None)
    # Spark passes every reader option as a string; coerce the remaining
    # values to booleans or Python literals before forwarding them to the builder.
    for arg in kwargs:
        if kwargs[arg].lower() == "true":
            kwargs[arg] = True
        elif kwargs[arg].lower() == "false":
            kwargs[arg] = False
        else:
            try:
                kwargs[arg] = ast.literal_eval(kwargs[arg])
            except (ValueError, SyntaxError):
                # Not a Python literal (e.g. a bare string): keep it as-is.
                pass
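    # Illustrative coercion with assumed example option values:
    #   {"trust_remote_code": "true", "data_files": "['train.csv']"}
    #   becomes {"trust_remote_code": True, "data_files": ["train.csv"]}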
    # Raise the right error if the dataset doesn't exist.
    api = self._get_api()
    api.repo_info(self.dataset_name, repo_type="dataset", revision=self.revision)

    self.builder = load_dataset_builder(
        self.dataset_name,
        self.config_name,
        token=self.token,
        revision=self.revision,
        **kwargs,
    )
    streaming_dataset = self.builder.as_streaming_dataset()
    if self.split not in streaming_dataset:
        raise Exception(
            f"Split {self.split} is invalid. Valid options are {list(streaming_dataset)}"
        )
    self.streaming_dataset = streaming_dataset[self.split]
    # Infer the features by inspecting the data when the builder doesn't declare them.
    if not self.streaming_dataset.features:
        self.streaming_dataset = self.streaming_dataset._resolve_features()
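
A minimal usage sketch showing how reader options reach this constructor. It assumes that importing pyspark_huggingface registers the "huggingface" data source; the dataset name, config, and option values are illustrative only:

import pyspark_huggingface  # assumed to register the "huggingface" format on import
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("demo").getOrCreate()

# Every .option() value arrives in __init__ as a string inside `options`;
# .load("imdb") supplies the required "path" option (the dataset name).
df = (
    spark.read.format("huggingface")
    .option("config", "plain_text")  # popped into self.config_name
    .option("split", "test")         # must name a split of the streaming dataset
    .option("streaming", "true")     # matched by the "true"/"false" check above
    .load("imdb")
)
df.show(5)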