in bigquery_etl/shredder/config.py [0:0]
def _get_client_id_field(table, deletion_request_view=False, study_name=None):
    """Pick the column that identifies a client in ``table``.

    The decision is driven by ``table.dataset_id`` and, for the ``analysis``
    dataset only, by the fields found in ``table.schema``.

    Returns one of ``RALLY_ID``, ``RALLY_ID_TOP_LEVEL`` or ``PIONEER_ID``.
    For an ``analysis`` table that exposes none of them, an error is logged
    and ``None`` is returned. ``study_name`` is accepted but not consulted
    by this function.
    """
    dataset = table.dataset_id

    if dataset.startswith("rally_"):
        # `rally_zero_one` is special: it uses the top-level rally_id in both
        # the ping tables and the deletion_requests view. For every other
        # rally dataset only the deletion request views expose rally_id as a
        # top-level field.
        is_zero_one = dataset in ("rally_zero_one_stable", "rally_zero_one_derived")
        if is_zero_one or deletion_request_view:
            return RALLY_ID_TOP_LEVEL
        return RALLY_ID

    if dataset == "analysis":
        # Rally analysis tables have no schema fixed upfront, so analysts may
        # have used either the nested or the top-level rally_id. Shared
        # datasets, like attention stream, may also have derived datasets
        # with rally IDs.
        # See https://github.com/mozilla-services/cloudops-infra/blob/master/projects/data-pioneer/tf/prod/envs/prod/study-projects/main.tf#L60-L67 # noqa
        #
        # Probes are tried in order; the first one matching any schema field
        # decides the column.
        probes = (
            (_has_nested_rally_id, RALLY_ID),
            (lambda field: field.name == RALLY_ID_TOP_LEVEL, RALLY_ID_TOP_LEVEL),
            # Pioneer derived tables will have a PIONEER_ID
            (lambda field: field.name == PIONEER_ID, PIONEER_ID),
        )
        for matches, column in probes:
            if any(matches(field) for field in table.schema):
                return column

        logging.error(f"Failed to find client_id field for {table}")
        return None

    return PIONEER_ID