bigquery_etl/shredder/config.py (852 lines of code) (raw):
#!/usr/bin/env python3
"""Meta data about tables and ids for self serve deletion."""
import logging
import re
from collections import defaultdict
from dataclasses import dataclass
from functools import partial
from itertools import chain
from multiprocessing.pool import ThreadPool
from typing import Dict, List
import requests
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
from ..util.bigquery_id import qualified_table_id
MOZDATA = "mozdata"
SHARED_PROD = "moz-fx-data-shared-prod"
GLEAN_SCHEMA_ID = "glean_ping_1"
GLEAN_MIN_SCHEMA_ID = "glean-min_ping_1"
GLEAN_APP_LISTINGS_URL = "https://probeinfo.telemetry.mozilla.org/v2/glean/app-listings"
@dataclass(frozen=True)
class DeleteSource:
"""Data class for deletion request source."""
table: str
field: str
project: str = SHARED_PROD
conditions: tuple[str, ...] = ()
def __post_init__(self):
"""Validate the table string."""
if len(self.table.split(".")) != 2:
raise ValueError(
"DeleteSource table must be in the 'dataset.table' format."
)
@property
def table_id(self):
"""Table Id."""
return self.table.split(".", 1)[-1]
@property
def dataset_id(self):
"""Dataset Id."""
return self.table.split(".", 1)[0]
@dataclass(frozen=True)
class DeleteTarget:
"""Data class for deletion request target.
Rows will be removed using either one DELETE statement for the whole table,
or one DELETE statement per partition if the table is larger than some
configurable threshold.
"""
table: str
field: str | tuple[str, ...]
project: str = SHARED_PROD
def __post_init__(self):
"""Validate the table string."""
if len(self.table.split(".")) != 2:
raise ValueError(
"DeleteTarget table must be in the 'dataset.table' format."
)
@property
def table_id(self):
"""Table Id."""
return self.table.partition(".")[2]
@property
def dataset_id(self):
"""Dataset Id."""
return self.table.partition(".")[0]
@property
def fields(self) -> tuple[str, ...]:
"""Fields."""
if isinstance(self.field, tuple):
return self.field
return (self.field,)
DeleteIndex = dict[DeleteTarget, DeleteSource | tuple[DeleteSource, ...]]
CLIENT_ID = "client_id"
GLEAN_CLIENT_ID = "client_info.client_id"
GLEAN_USAGE_PROFILE_ID = "metrics.uuid.usage_profile_id"
IMPRESSION_ID = "impression_id"
USER_ID = "user_id"
POCKET_ID = "pocket_id"
SHIELD_ID = "shield_id"
PIONEER_ID = "pioneer_id"
RALLY_ID = "metrics.uuid.rally_id"
RALLY_ID_TOP_LEVEL = "rally_id"
ID = "id"
FXA_USER_ID = "jsonPayload.fields.user_id"
# these must be in the same order as SYNC_SOURCES
SYNC_IDS = ("SUBSTR(payload.device_id, 0, 32)", "payload.uid")
CONTEXT_ID = "context_id"
QUICK_SUGGEST_CONTEXT_ID = "metrics.uuid.quick_suggest_context_id"
USER_CHARACTERISTICS_ID = "metrics.uuid.characteristics_client_identifier"
DESKTOP_SRC = DeleteSource(
table="telemetry_stable.deletion_request_v4", field=CLIENT_ID
)
DESKTOP_GLEAN_SRC = DeleteSource(
table="firefox_desktop_stable.deletion_request_v1", field=GLEAN_CLIENT_ID
)
IMPRESSION_SRC = DeleteSource(
table="telemetry_stable.deletion_request_v4",
field="payload.scalars.parent.deletion_request_impression_id",
)
CONTEXTUAL_SERVICES_SRC = DeleteSource(
table="telemetry_stable.deletion_request_v4",
field="payload.scalars.parent.deletion_request_context_id",
)
QUICK_SUGGEST_SRC = DeleteSource(
table="firefox_desktop_stable.quick_suggest_deletion_request_v1",
field=QUICK_SUGGEST_CONTEXT_ID,
)
FXA_HMAC_SRC = DeleteSource(
table="firefox_accounts.fxa_delete_events", field="hmac_user_id"
)
FXA_SRC = DeleteSource(table="firefox_accounts.fxa_delete_events", field=USER_ID)
FXA_UNHASHED_SRC = DeleteSource(
table="firefox_accounts.fxa_delete_events", field="user_id_unhashed"
)
FXA_FRONTEND_GLEAN_SRC = DeleteSource(
table="accounts_frontend_stable.deletion_request_v1", field=GLEAN_CLIENT_ID
)
REGRETS_SRC = DeleteSource(
table="regrets_reporter_stable.regrets_reporter_update_v1",
field="data_deletion_request.extension_installation_uuid",
conditions=("data_deletion_request IS NOT NULL",),
)
# these must be in the same order as SYNC_IDS
SYNC_SOURCES = (
DeleteSource(
table="telemetry_stable.deletion_request_v4",
field="payload.scalars.parent.deletion_request_sync_device_id",
),
DeleteSource(
table="firefox_accounts.fxa_delete_events",
field="SUBSTR(hmac_user_id, 0, 32)",
),
)
LEGACY_MOBILE_SOURCES = tuple(
DeleteSource(
table=f"{product}_stable.deletion_request_v1",
field="metrics.uuid.legacy_ids_client_id",
)
for product in (
"org_mozilla_ios_fennec",
"org_mozilla_ios_firefox",
"org_mozilla_ios_firefoxbeta",
"org_mozilla_tv_firefox",
"mozilla_lockbox",
)
)
FOCUS_ADDITIONAL_DELETIONS = tuple(
DeleteSource(
table=f"{product}_derived.additional_deletion_requests_v1",
field="client_id",
)
for product in (
"org_mozilla_focus",
"org_mozilla_focus_beta",
"org_mozilla_focus_nightly",
)
)
USER_CHARACTERISTICS_SRC = DeleteSource(
table="firefox_desktop_stable.deletion_request_v1",
field=USER_CHARACTERISTICS_ID,
)
SOURCES = (
[
DESKTOP_SRC,
IMPRESSION_SRC,
CONTEXTUAL_SERVICES_SRC,
FXA_HMAC_SRC,
FXA_SRC,
]
+ list(SYNC_SOURCES)
+ list(LEGACY_MOBILE_SOURCES)
)
LEGACY_MOBILE_IDS = tuple(CLIENT_ID for _ in LEGACY_MOBILE_SOURCES)
client_id_target = partial(DeleteTarget, field=CLIENT_ID)
glean_target = partial(DeleteTarget, field=GLEAN_CLIENT_ID)
impression_id_target = partial(DeleteTarget, field=IMPRESSION_ID)
fxa_user_id_target = partial(DeleteTarget, field=FXA_USER_ID)
user_id_target = partial(DeleteTarget, field=USER_ID)
context_id_target = partial(DeleteTarget, field=CONTEXT_ID)
DELETE_TARGETS: DeleteIndex = {
# Other
client_id_target(table="search_derived.acer_cohort_v1"): DESKTOP_SRC,
client_id_target(
table="search_derived.mobile_search_clients_daily_v1"
): DESKTOP_SRC,
client_id_target(
table="search_derived.mobile_search_clients_daily_v2"
): DESKTOP_SRC,
client_id_target(table="search_derived.search_clients_daily_v8"): DESKTOP_SRC,
client_id_target(
table="telemetry_derived.desktop_engagement_clients_v1"
): DESKTOP_SRC,
client_id_target(table="search_derived.search_clients_last_seen_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.clients_daily_v6"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.clients_daily_joined_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.clients_last_seen_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.clients_last_seen_v2"): DESKTOP_SRC,
client_id_target(
table="telemetry_derived.clients_last_seen_joined_v1"
): DESKTOP_SRC,
client_id_target(table="telemetry_derived.core_clients_daily_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.core_clients_last_seen_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.event_events_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.main_events_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.main_1pct_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.main_remainder_1pct_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_derived.main_use_counter_1pct_v1"): DESKTOP_SRC,
client_id_target(
table="telemetry_derived.desktop_retention_clients_v1"
): DESKTOP_SRC,
client_id_target(
table="ltv_derived.firefox_desktop_new_profile_ltv_v1"
): DESKTOP_SRC,
client_id_target(table="telemetry_stable.block_autoplay_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.crash_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.downgrade_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.event_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.first_shutdown_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.first_shutdown_v5"): DESKTOP_SRC,
client_id_target(
table="telemetry_stable.first_shutdown_use_counter_v4"
): DESKTOP_SRC,
client_id_target(table="telemetry_stable.focus_event_v1"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.frecency_update_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.health_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.heartbeat_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.main_v5"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.main_use_counter_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.modules_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.new_profile_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.saved_session_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.saved_session_v5"): DESKTOP_SRC,
client_id_target(
table="telemetry_stable.saved_session_use_counter_v4"
): DESKTOP_SRC,
client_id_target(table="telemetry_stable.shield_icq_v1_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.shield_study_addon_v3"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.shield_study_error_v3"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.shield_study_v3"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.testpilot_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.third_party_modules_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.untrusted_modules_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.update_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.voice_v4"): DESKTOP_SRC,
DeleteTarget(
table="telemetry_derived.mobile_engagement_clients_v1",
field=(CLIENT_ID, CLIENT_ID),
): (
DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID),
),
DeleteTarget(
table="ltv_derived.fenix_new_profile_ltv_v1",
field=(CLIENT_ID),
): (DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID)),
DeleteTarget(
table="ltv_derived.firefox_ios_new_profile_ltv_v1",
field=(CLIENT_ID),
): (DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID)),
DeleteTarget(
table="telemetry_derived.fx_accounts_active_daily_clients_v1",
field=(CLIENT_ID),
): (DESKTOP_GLEAN_SRC),
DeleteTarget(
table="telemetry_derived.fx_accounts_linked_clients_staging_v1",
field=(CLIENT_ID, "linked_client_id"),
): (DESKTOP_GLEAN_SRC, DESKTOP_GLEAN_SRC),
DeleteTarget(
table="telemetry_derived.fx_accounts_linked_clients_v1",
field=(CLIENT_ID, "linked_client_id"),
): (DESKTOP_GLEAN_SRC, DESKTOP_GLEAN_SRC),
DeleteTarget(
table="telemetry_derived.fx_accounts_linked_clients_ordered_v1",
field=(CLIENT_ID, "linked_client_id"),
): (DESKTOP_GLEAN_SRC, DESKTOP_GLEAN_SRC),
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_desktop_telem_current",
field=CLIENT_ID,
project=MOZDATA,
): DESKTOP_SRC,
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_desktop_telem_old",
field=CLIENT_ID,
project=MOZDATA,
): DESKTOP_SRC,
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_desktop_telem_temp",
field=CLIENT_ID,
project=MOZDATA,
): DESKTOP_SRC,
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_mobile_telem_current",
field=(CLIENT_ID, CLIENT_ID),
project=MOZDATA,
): (
DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID),
),
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_mobile_telem_old",
field=(CLIENT_ID, CLIENT_ID),
project=MOZDATA,
): (
DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID),
),
DeleteTarget(
table="fx_quant_user_research_analysis.viewpoint_mobile_telem_temp",
field=(CLIENT_ID, CLIENT_ID),
project=MOZDATA,
): (
DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID),
),
DeleteTarget(
table="telemetry_derived.rolling_cohorts_v2",
field=(CLIENT_ID,) * 15,
): (
DESKTOP_SRC,
DeleteSource(table="focus_android.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="firefox_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="fenix.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="klar_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="focus_ios.deletion_request", field=GLEAN_CLIENT_ID),
DeleteSource(table="klar_android.deletion_request", field=GLEAN_CLIENT_ID),
*FOCUS_ADDITIONAL_DELETIONS,
*LEGACY_MOBILE_SOURCES,
),
# activity stream
DeleteTarget(
table="messaging_system_stable.cfr_v1", field=(CLIENT_ID, IMPRESSION_ID)
): (DESKTOP_SRC, IMPRESSION_SRC),
DeleteTarget(
table="messaging_system_derived.cfr_users_daily_v1",
field=(CLIENT_ID, IMPRESSION_ID),
): (DESKTOP_SRC, IMPRESSION_SRC),
DeleteTarget(
table="messaging_system_derived.cfr_users_last_seen_v1",
field=(CLIENT_ID, IMPRESSION_ID),
): (DESKTOP_SRC, IMPRESSION_SRC),
client_id_target(table="activity_stream_stable.events_v1"): DESKTOP_SRC,
client_id_target(table="messaging_system_stable.onboarding_v1"): DESKTOP_SRC,
client_id_target(table="messaging_system_stable.snippets_v1"): DESKTOP_SRC,
client_id_target(table="activity_stream_stable.sessions_v1"): DESKTOP_SRC,
client_id_target(
table="messaging_system_derived.onboarding_users_daily_v1"
): DESKTOP_SRC,
client_id_target(
table="messaging_system_derived.onboarding_users_last_seen_v1"
): DESKTOP_SRC,
client_id_target(
table="messaging_system_derived.snippets_users_daily_v1"
): DESKTOP_SRC,
client_id_target(
table="messaging_system_derived.snippets_users_last_seen_v1"
): DESKTOP_SRC,
impression_id_target(
table="activity_stream_stable.impression_stats_v1"
): IMPRESSION_SRC,
impression_id_target(table="activity_stream_stable.spoc_fills_v1"): IMPRESSION_SRC,
impression_id_target(
table="messaging_system_stable.undesired_events_v1"
): IMPRESSION_SRC,
impression_id_target(
table="messaging_system_stable.personalization_experiment_v1"
): IMPRESSION_SRC,
# sync
DeleteTarget(table="telemetry_stable.sync_v4", field=SYNC_IDS): SYNC_SOURCES,
DeleteTarget(table="telemetry_stable.sync_v5", field=SYNC_IDS): SYNC_SOURCES,
# fxa
user_id_target(
table="firefox_accounts_derived.fxa_amplitude_export_v1"
): FXA_HMAC_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_amplitude_user_ids_v1"
): FXA_HMAC_SRC,
fxa_user_id_target(
table="firefox_accounts_derived.fxa_auth_bounce_events_v1"
): FXA_SRC,
fxa_user_id_target(table="firefox_accounts_derived.fxa_auth_events_v1"): FXA_SRC,
fxa_user_id_target(table="firefox_accounts_derived.fxa_content_events_v1"): FXA_SRC,
fxa_user_id_target(
table="firefox_accounts_derived.fxa_gcp_stderr_events_v1"
): FXA_SRC,
fxa_user_id_target(
table="firefox_accounts_derived.fxa_gcp_stdout_events_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_log_device_command_events_v1"
): FXA_HMAC_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_log_device_command_events_v2"
): FXA_HMAC_SRC,
fxa_user_id_target(table="firefox_accounts_derived.fxa_oauth_events_v1"): FXA_SRC,
fxa_user_id_target(table="firefox_accounts_derived.fxa_stdout_events_v1"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_daily_v1"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_daily_v2"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_first_seen_v1"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_first_seen_v2"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_last_seen_v1"): FXA_SRC,
user_id_target(table="firefox_accounts_derived.fxa_users_last_seen_v2"): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_daily_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_daily_v2"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_first_seen_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_first_seen_v2"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_last_seen_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_last_seen_v2"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_devices_daily_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_devices_first_seen_v1"
): FXA_SRC,
user_id_target(
table="firefox_accounts_derived.fxa_users_services_devices_last_seen_v1"
): FXA_SRC,
context_id_target(
table="contextual_services_stable.topsites_click_v1"
): CONTEXTUAL_SERVICES_SRC,
context_id_target(
table="contextual_services_stable.topsites_impression_v1"
): CONTEXTUAL_SERVICES_SRC,
context_id_target(
table="contextual_services_stable.quicksuggest_click_v1"
): CONTEXTUAL_SERVICES_SRC,
context_id_target(
table="contextual_services_stable.quicksuggest_impression_v1"
): CONTEXTUAL_SERVICES_SRC,
DeleteTarget(
table="firefox_desktop_stable.quick_suggest_v1",
field=QUICK_SUGGEST_CONTEXT_ID,
): QUICK_SUGGEST_SRC,
# client association ping
DeleteTarget(
table="firefox_desktop_stable.fx_accounts_v1",
field=("metrics.string.client_association_uid", GLEAN_CLIENT_ID),
): (FXA_UNHASHED_SRC, DESKTOP_GLEAN_SRC),
# FxA on Glean
DeleteTarget(
table="accounts_backend_stable.events_v1",
field="metrics.string.account_user_id_sha256",
): FXA_SRC,
DeleteTarget(
table="accounts_backend_stable.accounts_events_v1",
field="metrics.string.account_user_id_sha256",
): FXA_SRC,
DeleteTarget(
table="accounts_backend_derived.events_stream_v1",
field="metrics.string.account_user_id_sha256",
): FXA_SRC,
DeleteTarget(
table="accounts_backend_derived.users_services_daily_v1",
field="user_id_sha256",
): FXA_SRC,
DeleteTarget(
table="accounts_backend_derived.users_services_last_seen_v1",
field="user_id_sha256",
): FXA_SRC,
DeleteTarget(
table="accounts_frontend_stable.events_v1",
field=("metrics.string.account_user_id_sha256", GLEAN_CLIENT_ID),
): (FXA_SRC, FXA_FRONTEND_GLEAN_SRC),
DeleteTarget(
table="accounts_frontend_stable.accounts_events_v1",
field=("metrics.string.account_user_id_sha256", GLEAN_CLIENT_ID),
): (FXA_SRC, FXA_FRONTEND_GLEAN_SRC),
DeleteTarget(
table="accounts_frontend_derived.events_stream_v1",
field=("metrics.string.account_user_id_sha256", CLIENT_ID),
): (FXA_SRC, FXA_FRONTEND_GLEAN_SRC),
DeleteTarget(
table="relay_backend_stable.events_v1",
# Temporary workaround for identifier nested in event extras
# This triggers custom query override in `delete.py`
# We'll be able to remove this once fxa_id is migrated to string metric
# See https://mozilla-hub.atlassian.net/browse/DENG-7965 and 7964
field="events[*].extra.fxa_id",
): FXA_SRC,
# legacy mobile
DeleteTarget(
table="telemetry_stable.core_v1",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v2",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v3",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v4",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v5",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v6",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v7",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v8",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v9",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.core_v10",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table="telemetry_stable.mobile_event_v1",
field=LEGACY_MOBILE_IDS,
): LEGACY_MOBILE_SOURCES,
DeleteTarget(
table=REGRETS_SRC.table,
field="event_metadata.extension_installation_uuid",
): REGRETS_SRC,
DeleteTarget(
table="firefox_desktop_stable.user_characteristics_v1",
field=USER_CHARACTERISTICS_ID,
): USER_CHARACTERISTICS_SRC,
# tables in Glean derived datasets that use different sources than the find_glean_targets defaults
client_id_target(table="firefox_desktop_derived.adclick_history_v1"): DESKTOP_SRC,
client_id_target(table="firefox_desktop_derived.client_ltv_v1"): DESKTOP_SRC,
client_id_target(table="firefox_desktop_derived.ltv_states_v1"): DESKTOP_SRC,
}
SEARCH_IGNORE_TABLES = {source.table for source in SOURCES}
SEARCH_IGNORE_TABLES |= {target.table for target in DELETE_TARGETS}
# these tables have a known user identifier, but do not yet have associated
# deletion requests, or do not keep data for older than 30 days
SEARCH_IGNORE_TABLES |= {
target.table
for target in [
# glean migration
glean_target(table="org_mozilla_fenix_nightly_stable.migration_v1"),
glean_target(table="org_mozilla_fenix_stable.migration_v1"),
glean_target(table="org_mozilla_fennec_aurora_stable.migration_v1"),
glean_target(table="org_mozilla_firefox_beta_stable.migration_v1"),
glean_target(table="org_mozilla_firefox_stable.migration_v1"),
# pocket
DeleteTarget(table="pocket_stable.fire_tv_events_v1", field=POCKET_ID),
# mobile
client_id_target(table="mobile_stable.activation_v1"),
client_id_target(table="telemetry_stable.core_v1"),
client_id_target(table="telemetry_stable.core_v2"),
client_id_target(table="telemetry_stable.core_v3"),
client_id_target(table="telemetry_stable.core_v4"),
client_id_target(table="telemetry_stable.core_v5"),
client_id_target(table="telemetry_stable.core_v6"),
client_id_target(table="telemetry_stable.core_v7"),
client_id_target(table="telemetry_stable.core_v8"),
client_id_target(table="telemetry_stable.core_v9"),
client_id_target(table="telemetry_stable.core_v10"),
client_id_target(table="telemetry_stable.mobile_event_v1"),
client_id_target(table="telemetry_stable.mobile_metrics_v1"),
# internal
client_id_target(table="eng_workflow_stable.build_v1"),
# other
DeleteTarget(table="telemetry_stable.pioneer_study_v4", field=PIONEER_ID),
]
}
# these fields should be ignored by search because they are not user identifiers
SEARCH_IGNORE_FIELDS = {
# id is the source for document_id in these tables
("firefox_launcher_process_stable.launcher_process_failure_v1", ID),
("telemetry_stable.anonymous_v4", ID),
("telemetry_stable.optout_v4", ID),
("telemetry_stable.pre_account_v4", ID),
("telemetry_stable.prio_v4", ID),
}
# list of dataset_id.table_id to ignore in find_glean_targets function
GLEAN_IGNORE_LIST = {
# deletion request table
"firefox_desktop_derived.migration_esr_incorrect_deletion_request_v1",
# subset of firefox_desktop_stable.pageload_v1 which doesn't have client ids
"firefox_desktop_derived.pageload_1pct_v1",
"firefox_desktop_derived.pageload_nightly_v1",
}
def find_glean_targets(
pool: ThreadPool, client: bigquery.Client, project: str = SHARED_PROD
) -> DeleteIndex:
"""Return a dict like DELETE_TARGETS for glean tables.
Note that dict values *must* be either DeleteSource or tuple[DeleteSource, ...],
and other iterable types, e.g. list[DeleteSource] are not allowed or supported.
"""
datasets = {dataset.dataset_id for dataset in client.list_datasets(project)}
def stable_tables_by_schema(schema_id):
return [
table
for tables in pool.map(
client.list_tables,
[
bigquery.DatasetReference(project, dataset_id)
for dataset_id in datasets
if dataset_id.endswith("_stable")
],
chunksize=1,
)
for table in tables
if table.labels.get("schema_id") == schema_id
]
glean_stable_tables = stable_tables_by_schema(GLEAN_SCHEMA_ID)
channel_to_app_name = get_glean_channel_to_app_name_mapping()
# create mapping of dataset -> (tables containing associated deletion requests)
# construct values as tuples because that is what they must be in the return type
sources: dict[str, tuple[DeleteSource, ...]] = defaultdict(tuple)
app_names = set()
source_doctype = "deletion_request"
for table in glean_stable_tables:
if table.table_id.startswith(source_doctype):
source = DeleteSource(qualified_table_id(table), GLEAN_CLIENT_ID, project)
channel_name = re.sub("_stable$", "", table.dataset_id)
derived_dataset = channel_name + "_derived"
app_name = channel_to_app_name.get(channel_name)
# append to tuple to use every version of deletion request tables
sources[table.dataset_id] += (source,)
sources[derived_dataset] += (source,)
# find the name of all apps that have a dataset of combined channels
if app_name is not None and app_name != channel_name:
app_names.add(app_name)
sources[app_name + "_derived"] += (source,)
# Use deletion request view containing all channels if found, otherwise use per-channel
# tables as delete sources. Some apps don't have views generated by glean_usage
# because they are skipped in bqetl_project.yaml
for app_name in app_names:
try:
source_view = client.get_table(f"{project}.{app_name}.{source_doctype}")
except NotFound:
pass
else:
source = DeleteSource(
qualified_table_id(source_view), GLEAN_CLIENT_ID, project
)
sources[app_name + "_derived"] = (source,)
glean_derived_tables = [
table
for table in pool.map(
client.get_table,
chain(
*pool.starmap(
_list_tables,
[
(bigquery.DatasetReference(project, dataset_id), client)
for dataset_id in sources
if dataset_id.endswith("_derived")
],
chunksize=1,
)
),
chunksize=1,
)
if table.table_type == "TABLE"
]
# handle additional source for deletion requests for things like
# https://bugzilla.mozilla.org/show_bug.cgi?id=1810236
# table must contain client_id at the top level and be partitioned on
# submission_timestamp
derived_source_prefix = "additional_deletion_requests"
for table in glean_derived_tables:
if table.table_id.startswith(derived_source_prefix):
source = DeleteSource(qualified_table_id(table), CLIENT_ID, project)
channel_name = re.sub("_derived$", "", table.dataset_id)
app_name = channel_to_app_name.get(channel_name)
sources[channel_name + "_stable"] += (source,)
sources[table.dataset_id] += (source,)
if app_name is not None and app_name != channel_name:
sources[app_name + "_derived"] += (source,)
# skip tables already added to DELETE_TARGETS
manually_added_tables = {target.table for target in DELETE_TARGETS.keys()}
skipped_tables = manually_added_tables | GLEAN_IGNORE_LIST
# Handle custom deletion requests for usage_reporting pings
glean_min_stable_tables = stable_tables_by_schema(GLEAN_MIN_SCHEMA_ID)
usage_reporting_sources: dict[str, tuple[DeleteSource, ...]] = defaultdict(tuple)
for table in glean_min_stable_tables:
if table.table_id == "usage_deletion_request_v1":
# usage_deletion_request ping is defined in application code,
# so we need to confirm that it has `usage_profile_id` metric
table = client.get_table(table)
if any(
field.name == "metrics"
and any(
metric_type_field.name == "uuid"
and any(
[
metric_field.name == "usage_profile_id"
for metric_field in metric_type_field.fields
]
)
for metric_type_field in field.fields
)
for field in table.schema
):
source = DeleteSource(
qualified_table_id(table), GLEAN_USAGE_PROFILE_ID, project
)
usage_reporting_sources[table.dataset_id] += (source,)
return {
**{
# glean stable tables that have a source
DeleteTarget(
table=qualified_table_id(table),
# field must be repeated for each deletion source
field=(GLEAN_CLIENT_ID,) * len(sources[table.dataset_id]),
): sources[table.dataset_id]
for table in glean_stable_tables
if table.dataset_id in sources
and not table.table_id.startswith(source_doctype)
# migration tables not yet supported
and not table.table_id.startswith("migration")
# skip tables with explicitly excluded client ids
and table.labels.get("include_client_id", "true").lower() != "false"
and qualified_table_id(table) not in skipped_tables
},
**{
# glean derived tables that contain client_id
DeleteTarget(
table=qualified_table_id(table),
# field must be repeated for each deletion source
field=(CLIENT_ID,) * len(sources[table.dataset_id]),
): sources[table.dataset_id]
for table in glean_derived_tables
if any(field.name == CLIENT_ID for field in table.schema)
and not table.table_id.startswith(derived_source_prefix)
and qualified_table_id(table) not in skipped_tables
},
**{
# glean derived tables that contain client_info.client_id but not client_id
DeleteTarget(
table=qualified_table_id(table),
# field must be repeated for each deletion source
field=(GLEAN_CLIENT_ID,) * len(sources[table.dataset_id]),
): sources[table.dataset_id]
for table in glean_derived_tables
if any(
field.name == "client_info"
and any(
[nested_field.name == "client_id" for nested_field in field.fields]
)
for field in table.schema
)
and all(field.name != CLIENT_ID for field in table.schema)
and not table.table_id.startswith(derived_source_prefix)
and qualified_table_id(table) not in skipped_tables
},
**{
# usage_reporting tables via custom usage_deletion_request ping
DeleteTarget(
table=qualified_table_id(table),
# field must be repeated for each deletion source
field=(GLEAN_USAGE_PROFILE_ID,)
* len(usage_reporting_sources[table.dataset_id]),
): usage_reporting_sources[table.dataset_id]
for table in glean_min_stable_tables
if table.table_id == "usage_reporting_v1"
and table.dataset_id in usage_reporting_sources
},
}
def get_glean_channel_to_app_name_mapping() -> Dict[str, str]:
"""Return a dict where key is the channel app id and the value is the shared app name.
e.g. {
"org_mozilla_firefox": "fenix",
"org_mozilla_firefox_beta": "fenix",
"org_mozilla_ios_firefox": "firefox_ios",
"org_mozilla_ios_firefoxbeta": "firefox_ios",
}
"""
response = requests.get(GLEAN_APP_LISTINGS_URL)
response.raise_for_status()
app_listings = response.json()
return {
app["bq_dataset_family"]: app["app_name"]
for app in app_listings
if "bq_dataset_family" in app and "app_name" in app
}
def _list_tables(
dataset_ref: bigquery.DatasetReference,
client: bigquery.Client,
) -> List:
"""Wrap bigquery list_tables and return an empty list for non-existent datasets.
Intended to be used with thread pool map function. Some glean apps do not have
derived datasets so this wrapper handles exceptions when listing them.
"""
try:
return list(client.list_tables(dataset_ref))
except NotFound:
return []
EXPERIMENT_ANALYSIS = "moz-fx-data-experiments"
def find_experiment_analysis_targets(
client: bigquery.Client, project: str = EXPERIMENT_ANALYSIS
) -> DeleteIndex:
"""Return a dict like DELETE_TARGETS for experiment analysis tables.
Note that dict values *must* be either DeleteSource or tuple[DeleteSource, ...],
and other iterable types, e.g. list[DeleteSource] are not allowed or supported.
"""
table_query = f"""
SELECT
table_schema AS dataset_id,
table_name,
FROM
`{project}.region-us.INFORMATION_SCHEMA.TABLES`
WHERE
table_type = 'BASE TABLE'
AND NOT STARTS_WITH(table_name, "statistics")
AND ddl LIKE '%client_id STRING,%'
"""
target_tables = client.query_and_wait(query=table_query)
return {
client_id_target(
table=f"{table.dataset_id}.{table.table_name}", project=project
): DESKTOP_SRC
for table in target_tables
}
PIONEER_PROD = "moz-fx-data-pioneer-prod"
def find_pioneer_targets(
pool: ThreadPool,
client: bigquery.Client,
project: str = PIONEER_PROD,
study_projects: list[str] = [],
) -> DeleteIndex:
"""Return a dict like DELETE_TARGETS for Pioneer tables.
Note that dict values *must* be either DeleteSource or tuple[DeleteSource, ...],
and other iterable types, e.g. list[DeleteSource] are not allowed or supported.
"""
def _has_nested_rally_id(field):
"""Check if any of the fields contains nested `metrics.uuid_rally_id`."""
if field.name == "metrics" and field.field_type == "RECORD":
uuid_field = next(filter(lambda f: f.name == "uuid", field.fields), None)
if uuid_field and uuid_field.field_type == "RECORD":
return any(field.name == "rally_id" for field in uuid_field.fields)
return False
def _get_tables_with_pioneer_id(dataset):
tables_with_pioneer_id = []
for table in client.list_tables(dataset):
table_ref = client.get_table(table)
if (
any(field.name == PIONEER_ID for field in table_ref.schema)
or any(field.name == RALLY_ID_TOP_LEVEL for field in table_ref.schema)
or any(_has_nested_rally_id(field) for field in table_ref.schema)
) and table_ref.table_type != "VIEW":
tables_with_pioneer_id.append(table_ref)
return tables_with_pioneer_id
def _get_client_id_field(table, deletion_request_view=False, study_name=None):
"""Determine which column should be used as client id for a given table."""
if table.dataset_id.startswith("rally_"):
# `rally_zero_one` is a special case where top-level rally_id is used
# both in the ping tables and the deletion_requests view
if table.dataset_id in ["rally_zero_one_stable", "rally_zero_one_derived"]:
return RALLY_ID_TOP_LEVEL
# deletion request views expose rally_id as a top-level field
if deletion_request_view:
return RALLY_ID_TOP_LEVEL
else:
return RALLY_ID
elif table.dataset_id == "analysis":
# Rally analysis tables do not have schemas specified upfront,
# analysts might decide to use either nested or top-level rally_id.
# Shared datasets, like attention stream, may also have derived
# datasets with rally IDs
# See https://github.com/mozilla-services/cloudops-infra/blob/master/projects/data-pioneer/tf/prod/envs/prod/study-projects/main.tf#L60-L67 # noqa
if any(_has_nested_rally_id(field) for field in table.schema):
return RALLY_ID
elif any(field.name == RALLY_ID_TOP_LEVEL for field in table.schema):
return RALLY_ID_TOP_LEVEL
# Pioneer derived tables will have a PIONEER_ID
elif any(field.name == PIONEER_ID for field in table.schema):
return PIONEER_ID
else:
logging.error(f"Failed to find client_id field for {table}")
else:
return PIONEER_ID
datasets = {
dataset.reference
for dataset in client.list_datasets(project)
if dataset.reference.dataset_id.startswith("pioneer_")
or dataset.reference.dataset_id.startswith("rally_")
}
# There should be a single stable and derived dataset per study
stable_datasets = {dr for dr in datasets if dr.dataset_id.endswith("_stable")}
derived_datasets = {dr for dr in datasets if dr.dataset_id.endswith("_derived")}
stable_tables = [
table
for tables in pool.map(client.list_tables, stable_datasets, chunksize=1)
for table in tables
]
# Each derived deletion request view is a union of:
# * corresponding deletion_request table from the _stable dataset
# * uninstall_deletion pings from pioneer_core_stable dataset
derived_deletion_request_views = [
table
for tables in pool.map(client.list_tables, derived_datasets, chunksize=1)
for table in tables
if (table.table_type == "VIEW" and table.table_id == "deletion_requests")
]
# There is a derived dataset for each stable one
# For simplicity when accessing this map later on, keys are changed to `_stable` here
sources = {
table.dataset_id.replace("_derived", "_stable"): DeleteSource(
qualified_table_id(table),
_get_client_id_field(table, deletion_request_view=True),
project,
)
# dict comprehension will only keep the last value for a given key, so
# sort by table_id to use the latest version
for table in sorted(derived_deletion_request_views, key=lambda t: t.table_id)
}
# Dictionary mapping analysis dataset names to corresponding study names.
# We expect analysis tables to be created only under `analysis` datasets
# in study projects. These datasets are labeled with study names which
# we use for discovering corresponding delete request tables later on.
analysis_datasets = {}
for project in study_projects:
analysis_dataset = bigquery.DatasetReference(project, "analysis")
labels = client.get_dataset(analysis_dataset).labels
# study names in labels are not normalized (contain '-', not '_')
study_name = labels.get("study_name")
if study_name is None:
logging.error(
f"Dataset {analysis_dataset} does not have `study_name` label, skipping..."
)
else:
analysis_datasets[analysis_dataset] = study_name
return {
**{
# stable tables
DeleteTarget(
table=qualified_table_id(table),
field=_get_client_id_field(table),
project=PIONEER_PROD,
): sources[table.dataset_id]
for table in stable_tables
if not table.table_id.startswith("deletion_request_")
and not table.table_id.startswith("pioneer_enrollment_")
and not table.table_id.startswith("enrollment_")
and not table.table_id.startswith("study_enrollment_")
and not table.table_id.startswith("study_unenrollment_")
and not table.table_id.startswith("unenrollment_")
},
**{
# derived tables with pioneer_id
DeleteTarget(
table=qualified_table_id(table),
field=_get_client_id_field(table),
project=PIONEER_PROD,
): sources[table.dataset_id]
for dataset in derived_datasets
for table in _get_tables_with_pioneer_id(dataset)
},
**{
# tables with pioneer_id located in study analysis projects
DeleteTarget(
table=qualified_table_id(table),
field=_get_client_id_field(table, study_name=study),
project=table.project,
): sources[study.replace("-", "_") + "_stable"]
for dataset, study in analysis_datasets.items()
for table in _get_tables_with_pioneer_id(dataset)
},
}