in jobs/socorro_import_crash_data.py [0:0]
def fetch_schema():
    """
    Fetch the crash data schema from a GCS location, falling back to GitHub.

    Returns:
        dict: the telemetry_socorro_crash JSON schema parsed into a Python dict.

    The primary source is the GCS bucket; if that read fails for any reason
    (missing object, auth, cluster config), the schema is fetched from the
    socorro repository on GitHub instead.
    """
    bucket = "moz-fx-socorro-prod-prod-telemetry"
    key = "telemetry_socorro_crash.json"
    fallback_url = f"https://raw.githubusercontent.com/mozilla-services/socorro/master/socorro/schemas/{key}"
    try:
        # Fixed: previous message logged "s3://" but the read is from GCS ("gs://").
        log.info("Fetching latest crash data schema from gs://%s/%s", bucket, key)
        # Use spark to pull the schema file instead of boto/gcs clients, since the
        # dataproc hadoop configs only work with spark.
        # Note: only do this on small json files, since collect() brings the
        # entire file onto the driver.
        json_obj = (
            spark.read.json(f"gs://{bucket}/{key}", multiLine=True).toJSON().collect()
        )
        resp = json.loads(json_obj[0])
    except Exception as e:
        # Broad catch is deliberate: any failure reading from GCS should fall
        # back to the canonical schema in the socorro GitHub repository.
        log.warning(
            "Could not fetch schema from gs://%s/%s: %s\nFetching crash data schema from %s",
            bucket,
            key,
            e,
            fallback_url,
        )
        resp = json.loads(urllib.request.urlopen(fallback_url).read())
    return resp