taskcluster/translations_taskgraph/util/dataset_helpers.py
from pathlib import Path
from urllib.parse import urlparse
import hashlib


# We keep this relatively short because these datasets end up in task labels,
# which end up in task cache routes, which need to be <= 256 characters.
DATASET_NAME_MAX_LENGTH = 50


# Important! Keep in sync with `Dataset._escape` in pipeline/common/datasets.py.
def sanitize_dataset_name(dataset: str) -> str:
    # URLs can be too large when used as Taskcluster labels. Create a nice identifier for them.
    # See https://github.com/mozilla/translations/issues/527
    if dataset.startswith("https://") or dataset.startswith("http://"):
        url = urlparse(dataset)

        hostname = url.hostname
        if hostname == "storage.googleapis.com":
            hostname = "gcp"

        # Get the name of the file from the path, without the extension.
        file = Path(url.path).stem
        file = file.replace(".[LANG]", "").replace("[LANG]", "")

        # Compute a hash to avoid any name collisions.
        md5 = hashlib.md5()
        md5.update(dataset.encode("utf-8"))
        hash = md5.hexdigest()[:6]

        dataset = f"{hostname}_{file}_{hash}"
    # Even non-URL dataset names can be too long, for example:
    # mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading_treatment_or_punishment_united_nations-1-ell-eng
    # Truncate and hash any name that exceeds DATASET_NAME_MAX_LENGTH.
    elif len(dataset) > DATASET_NAME_MAX_LENGTH:
        md5 = hashlib.md5()
        md5.update(dataset.encode("utf-8"))
        hash = md5.hexdigest()[:6]
        truncated = dataset[:DATASET_NAME_MAX_LENGTH]
        dataset = f"{truncated}_{hash}"
    return (
        dataset.replace("://", "_")
        .replace("/", "_")
        .replace(".", "_")
        .replace(":", "_")
        .replace("[", "_")
        .replace("]", "_")
    )
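

# A minimal usage sketch (hedged): the inputs below are hypothetical examples,
# and each "<hash>" stands for the first six hex digits of the md5 of the full
# input string, so the exact suffixes will differ.
if __name__ == "__main__":
    # URL datasets become "<hostname>_<file>_<hash>", with the GCS hostname
    # shortened to "gcp" and the extension and "[LANG]" placeholder dropped:
    print(sanitize_dataset_name(
        "https://storage.googleapis.com/example-bucket/data/en.[LANG].txt.zst"
    ))  # -> "gcp_en_txt_<hash>"

    # Names longer than DATASET_NAME_MAX_LENGTH are truncated and hash-suffixed:
    print(sanitize_dataset_name(
        "mtdata_ELRC-convention_against_torture_other_cruel_inhuman_or_degrading"
        "_treatment_or_punishment_united_nations-1-ell-eng"
    ))  # -> the first 50 characters plus "_<hash>"

    # Short, non-URL names only get their special characters escaped:
    print(sanitize_dataset_name("opus_NLLB/v1"))  # -> "opus_NLLB_v1"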