def validate_dataset()

in data_annotation_platform/app/utils/datasets.py [0:0]


def validate_dataset(filename):
    """Validate a dataset file and describe the first problem found.

    The file is parsed as JSON, validated against the schema returned by
    ``load_schema()``, and then checked for internal consistency: the number
    of series must equal ``n_dim``, every time axis and series must contain
    ``n_obs`` entries, and JSON ``null`` values are rejected.

    Args:
        filename: Path of the dataset file to validate.

    Returns:
        ``None`` when every check passes, otherwise a human-readable string
        describing the first check that failed.
    """
    if not os.path.exists(filename):
        return "File not found."

    with open(filename, "rb") as fp:
        try:
            data = json.load(fp)
        except json.JSONDecodeError as err:
            return "JSON decoding error: %s" % err.msg
        except UnicodeDecodeError as err:
            # The file is read as bytes, so json.load decodes it itself; a
            # file with an invalid byte sequence fails here, before any JSON
            # parsing, and must be reported like any other decoding problem.
            return "JSON decoding error: %s" % err.reason

    try:
        schema = load_schema()
    except FileNotFoundError:
        return "Schema file not found."

    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as err:
        return "JSONSchema validation error: %s" % err.message

    if len(data["series"]) != data["n_dim"]:
        return "Number of dimensions and number of series don't match"

    if "time" in data:
        time_axis = data["time"]
        has_raw = "raw" in time_axis
        has_format = "format" in time_axis

        # 'raw' and 'format' only make sense as a pair.
        if has_raw and not has_format:
            return "'raw' must be accompanied by format"
        if has_format and not has_raw:
            return "Format must be accompanied by 'raw'"

        if "index" in time_axis:
            index = time_axis["index"]
            # Only inspect the first element when there is one; an empty
            # index is left to the length check below instead of raising
            # IndexError.
            if index and index[0] != 0:
                return "Index should start at zero."
            if len(index) != data["n_obs"]:
                return "Number of indices must match number of observations"

        if has_raw:
            raw_time = time_axis["raw"]
            if len(raw_time) != data["n_obs"]:
                return "Number of time points doesn't match number of observations"
            if None in raw_time:
                return "Null is not supported in time axis. Use 'NaN' instead."

    has_missing = False
    for var in data["series"]:
        values = var["raw"]
        if len(values) != data["n_obs"]:
            return "Number of observations doesn't match for %s" % var["label"]
        if None in values:
            return "Null is not supported in series. Use 'NaN' instead."
        # Missing observations are encoded as NaN; remember whether any
        # series contains one.
        has_missing = has_missing or any(map(math.isnan, values))

    # this doesn't happen in any dataset yet, so let's not implement it until
    # we need it.
    if data["n_dim"] > 1 and has_missing:
        return "Missing values are not yet supported for multidimensional data"

    return None