def get_latest_valid_per_client()

in mozetl/hardware_report/summarize_json.py [0:0]


def get_latest_valid_per_client(entry, time_start, time_end):
    """Get the most recently submitted ping for a client within the given timeframe.

    The submission dates are scanned (most recent first) to find the index of the
    first ping submitted within [time_start, time_end]. That index is then used to
    look up the data from the other columns (the sizes of these arrays must match,
    otherwise the longitudinal dataset is broken).
    Once we have the data, we make sure it's valid and return it.

    Args:
        entry: The record containing all the data for a single client, indexed
            by column name.
        time_start: The beginning of the reference timeframe (inclusive). Only
            its date part is used.
        time_end: The end of the reference timeframe (inclusive). Only its date
            part is used.

    Returns:
        An object containing the valid hardware data for the client or a string
        describing why the data is discarded. Either REASON_INACTIVE, if the client
        didn't submit a ping within the desired timeframe, or REASON_BROKEN_DATA if
        it sent broken data (i.e. one of the basic sections is missing entirely).

    Raises:
        ValueError: if a basic section has no item at the selected index (the
        column is shorter than "submission_date") or if that item is null. This
        means the longitudinal dataset is corrupted. Also raised if a submission
        date matches neither of the two supported timestamp formats.

    """
    submission_dates = entry["submission_date"]

    # Without submission dates there is nothing to scan. Handle this like any
    # other missing basic section instead of failing inside enumerate().
    if submission_dates is None:
        return REASON_BROKEN_DATA

    # Only the date part of the boundaries is compared; compute it once.
    start_date = time_start.date()
    end_date = time_end.date()

    latest_entry = None
    for index, pkt_date in enumerate(submission_dates):
        # Timestamps come either with or without fractional seconds.
        try:
            sub_date = dt.datetime.strptime(pkt_date, "%Y-%m-%dT%H:%M:%S.%fZ").date()
        except ValueError:
            sub_date = dt.datetime.strptime(pkt_date, "%Y-%m-%dT%H:%M:%SZ").date()

        # The data is in descending order, the most recent ping comes first.
        # The first item falling within the timeframe is the one we want.
        if start_date <= sub_date <= end_date:
            latest_entry = index
            break

        # Ok, we went too far, we're not really interested in the data
        # outside of [time_start, time_end]. Since records are ordered,
        # everything that follows is older and can be skipped.
        if sub_date < start_date:
            break

    # This client wasn't active in the reference timeframe, just map it to no
    # data.
    if latest_entry is None:
        return REASON_INACTIVE

    # Some clients might be missing entire sections. If that's
    # a basic section, skip them, we don't want partial data.
    # Don't enforce the presence of "active_plugins", as it's not included
    # by the pipeline if no plugin is reported by Firefox (see bug 1333806).
    desired_sections = [
        "build",
        "system_os",
        "submission_date",
        "system",
        "system_gfx",
        "system_cpu",
    ]

    for field in desired_sections:
        column = entry[field]
        if column is None:
            return REASON_BROKEN_DATA

        # All arrays in the longitudinal dataset should have the same length, for a
        # single client. If that's not the case, if our index is not there,
        # throw the documented ValueError rather than leaking an IndexError.
        try:
            value = column[latest_entry]
        except IndexError:
            raise ValueError("Missing " + field + " index: " + str(latest_entry))

        if value is None:
            raise ValueError("Null " + field + " index: " + str(latest_entry))

    return get_valid_client_record(entry, latest_entry)