in data_validation.py [0:0]
import re

from google.cloud import bigquery


def calculate_data_validation_metrics(metadata_source, languages_source, project=None):
"""
Calculate metrics for determining whether our search volume is changing in ways that might invalidate our current sanitization model.
Arguments:
- metadata_source: a string. The name of the table containing the metadata to be fetched.
- languages_source: a string. The name of the table containing language distributions for search term jobs.
Returns: A dataframe of the data validation metrics for the sanitization jobs.
"""
    # Allow only characters that can appear in a fully qualified BigQuery table name,
    # since the table names are interpolated into the query below and cannot be parametrized.
    if re.fullmatch(r"[A-Za-z0-9._-]+", metadata_source):
        metadata_source_no_injection = metadata_source
    else:
        raise ValueError(
            "metadata_source is in an incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
        )
    if re.fullmatch(r"[A-Za-z0-9._-]+", languages_source):
        languages_source_no_injection = languages_source
    else:
        raise ValueError(
            "languages_source is in an incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
        )
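    # For example, "my-project.my_dataset.sanitization_job_metadata" passes this check,
    # while a value such as "metadata; DROP TABLE users" is rejected.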
    # We are using f-strings here because BQ does not allow table names to be parametrized,
    # and we need to be able to run the same script in the staging and prod db environments
    # for reliable testing outcomes.
    SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA = f"""
    SELECT
        finished_at,
        SAFE_DIVIDE(total_search_terms_removed_by_sanitization_job, total_search_terms_analyzed) AS pct_sanitized_search_terms,
        SAFE_DIVIDE(contained_at, total_search_terms_analyzed) AS pct_sanitized_contained_at,
        SAFE_DIVIDE(contained_numbers, total_search_terms_analyzed) AS pct_sanitized_contained_numbers,
        SAFE_DIVIDE(contained_name, total_search_terms_analyzed) AS pct_sanitized_contained_name,
        SAFE_DIVIDE(sum_terms_containing_us_census_surname, total_search_terms_analyzed) AS pct_terms_containing_us_census_surname,
        SAFE_DIVIDE(sum_uppercase_chars_all_search_terms, sum_chars_all_search_terms) AS pct_uppercase_chars_all_search_terms,
        SAFE_DIVIDE(sum_words_all_search_terms, total_search_terms_analyzed) AS avg_words_all_search_terms,
        1 - SAFE_DIVIDE(languages.english_count, languages.all_languages_count) AS pct_terms_non_english
    FROM `{metadata_source_no_injection}` AS metadata
    JOIN
        (
            SELECT
                job_start_time,
                MAX(CASE WHEN language_code = 'en' THEN search_term_count END) AS english_count,
                SUM(search_term_count) AS all_languages_count
            FROM `{languages_source_no_injection}`
            GROUP BY job_start_time
        ) AS languages
        ON metadata.started_at = languages.job_start_time
    WHERE metadata.status = 'SUCCESS'
    ORDER BY finished_at ASC;
    """
    client = bigquery.Client(project=project)
    query_job = client.query(SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA)
    results_as_dataframe = query_job.result().to_dataframe()
    return results_as_dataframe
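

# A minimal usage sketch, assuming the module is run directly. The table names and project
# below are hypothetical placeholders; substitute the fully qualified metadata and
# language-distribution tables (and billing project) for your environment.
if __name__ == "__main__":
    validation_metrics = calculate_data_validation_metrics(
        metadata_source="my-project.my_dataset.sanitization_job_metadata",
        languages_source="my-project.my_dataset.sanitization_job_languages",
        project="my-project",
    )
    print(validation_metrics.head())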