def fetch_schema()

in jobs/socorro_import_crash_data.py [0:0]


def fetch_schema():
    """
    Fetch the crash data schema from a GCS location, falling back to GitHub.

    The primary copy lives in a GCS bucket; if reading it fails for any
    reason, the schema is fetched from the socorro repository on GitHub.

    Returns:
        dict: the crash-data JSON schema parsed into a Python dictionary.

    Raises:
        Propagates any exception from the GitHub fallback fetch (e.g.
        urllib.error.URLError) if both sources are unavailable.
    """

    bucket = "moz-fx-socorro-prod-prod-telemetry"
    key = "telemetry_socorro_crash.json"
    fallback_url = f"https://raw.githubusercontent.com/mozilla-services/socorro/master/socorro/schemas/{key}"

    try:
        # The schema lives in GCS (gs://), not S3 — log the real location.
        log.info(f"Fetching latest crash data schema from gs://{bucket}/{key}")

        # Use spark to pull schema file instead of boto since the dataproc hadoop configs only work with spark.
        # Note: only do this on small json files, since collect will bring the file onto the driver
        json_obj = (
            spark.read.json(f"gs://{bucket}/{key}", multiLine=True).toJSON().collect()
        )
        resp = json.loads(json_obj[0])
    except Exception as e:
        # Deliberately broad: any failure reading from GCS (auth, network,
        # Spark, parse) should fall back to the public GitHub copy.
        log.warning(
            f"Could not fetch schema from gs://{bucket}/{key}: {e}\n"
            f"Fetching crash data schema from {fallback_url}"
        )
        resp = json.loads(urllib.request.urlopen(fallback_url).read())

    return resp