in src/datasets/features/pdf.py [0:0]
def decode_example(self, value: dict, token_per_repo_id=None) -> "pdfplumber.pdf.PDF":
    """Decode example pdf file into pdf data.

    Args:
        value (`dict`):
            A dictionary with keys:

            - `path`: String with absolute or relative pdf file path.
            - `bytes`: The bytes of the pdf file.

            When `bytes` is not None it takes precedence and `path` is
            not used.
        token_per_repo_id (`dict`, *optional*):
            To access and decode pdf files from private repositories on
            the Hub, you can pass a dictionary
            repo_id (`str`) -> token (`bool` or `str`).

    Returns:
        `pdfplumber.pdf.PDF`: an open PDF object. It is returned without
        being closed; closing it is left to the caller.

    Raises:
        RuntimeError: If decoding is disabled for this feature.
        ImportError: If `pdfplumber` is not available.
        ValueError: If both `path` and `bytes` are None.
    """
    if not self.decode:
        raise RuntimeError("Decoding is disabled for this feature. Please use Pdf(decode=True) instead.")

    if not config.PDFPLUMBER_AVAILABLE:
        raise ImportError("To support decoding pdfs, please install 'pdfplumber'.")
    import pdfplumber

    if token_per_repo_id is None:
        token_per_repo_id = {}

    path, bytes_ = value["path"], value["bytes"]

    if bytes_ is not None:
        # Do not wrap this in a `with` block: leaving the block would call
        # close() on the PDF before it is handed back to the caller.
        return pdfplumber.open(BytesIO(bytes_))

    if path is None:
        raise ValueError(f"A pdf should have one of 'path' or 'bytes' but both are None in {value}.")

    if is_local_path(path):
        return pdfplumber.open(path)

    # Keep only the last "::"-separated segment of the path
    # (presumably the innermost URL of a chained path -- confirm).
    source_url = path.split("::")[-1]
    if source_url.startswith(config.HF_ENDPOINT):
        pattern = config.HUB_DATASETS_URL
    else:
        pattern = config.HUB_DATASETS_HFFS_URL

    try:
        repo_id = string_to_dict(source_url, pattern)["repo_id"]
    except ValueError:
        # The URL does not match the pattern: proceed without a token.
        token = None
    else:
        token = token_per_repo_id.get(repo_id)

    download_config = DownloadConfig(token=token)
    # NOTE(review): the handle is deliberately left open because the PDF
    # object is built on top of it; presumably pdfplumber keeps reading
    # from it after open() -- confirm.
    f = xopen(path, "rb", download_config=download_config)
    return pdfplumber.open(f)