def calculate_data_validation_metrics()

in data_validation.py


import re

from google.cloud import bigquery


def calculate_data_validation_metrics(metadata_source, languages_source):
    """
    Calculate metrics for determining whether our search volume is changing in ways that might invalidate our current sanitization model.

    Arguments:

    - metadata_source: a string. The name of the table containing the metadata to be fetched.
    - languages_source: a string. The name of the table containing language distributions for search term jobs.

    Returns: A dataframe of the data validation metrics for the sanitization jobs.
    """
    if re.fullmatch(r"[A-Za-z0-9._-]+", metadata_source):
        metadata_source_no_injection = metadata_source
    else:
        raise ValueError(
            "metadata_source is in an incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
        )

    if re.fullmatch(r"[A-Za-z0-9._-]+", languages_source):
        languages_source_no_injection = languages_source
    else:
        raise ValueError(
            "languages_source is in an incorrect format. This should be a fully qualified table name like myproject.mydataset.my_table"
        )

    # We use f-strings here because BigQuery does not allow table names to be parameterized,
    # and we need to run the same script against the staging and prod databases for reliable testing outcomes.
    SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA = f"""
    SELECT
        finished_at,
        SAFE_DIVIDE(total_search_terms_removed_by_sanitization_job, total_search_terms_analyzed) AS pct_sanitized_search_terms,
        SAFE_DIVIDE(contained_at, total_search_terms_analyzed) AS pct_sanitized_contained_at,
        SAFE_DIVIDE(contained_numbers, total_search_terms_analyzed) AS pct_sanitized_contained_numbers,
        SAFE_DIVIDE(contained_name, total_search_terms_analyzed) AS pct_sanitized_contained_name,
        SAFE_DIVIDE(sum_terms_containing_us_census_surname, total_search_terms_analyzed) AS pct_terms_containing_us_census_surname,
        SAFE_DIVIDE(sum_uppercase_chars_all_search_terms, sum_chars_all_search_terms) AS pct_uppercase_chars_all_search_terms,
        SAFE_DIVIDE(sum_words_all_search_terms, total_search_terms_analyzed) AS avg_words_all_search_terms,
        1 - SAFE_DIVIDE(languages.english_count, languages.all_languages_count) AS pct_terms_non_english
    FROM `{metadata_source_no_injection}` AS metadata
    JOIN
    (
        SELECT
            job_start_time,
            MAX(CASE WHEN language_code = 'en' THEN search_term_count END) AS english_count,
            SUM(search_term_count) AS all_languages_count
        FROM `{languages_source_no_injection}`
        GROUP BY job_start_time
    ) AS languages
    ON metadata.started_at = languages.job_start_time
    WHERE metadata.status = 'SUCCESS'
    ORDER BY finished_at ASC;
    """
    # NOTE: `project` is assumed to be defined at module level (the GCP project the query runs under).
    client = bigquery.Client(project=project)
    query_job = client.query(SUCCESSFUL_SANITIZATION_JOB_RUN_METADATA)
    results_as_dataframe = query_job.result().to_dataframe()

    return results_as_dataframe
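
For reference, a minimal usage sketch follows. The table names are hypothetical placeholders, and the call assumes the google-cloud-bigquery client library is installed and that the module-level `project` variable points at a project you are allowed to query.

# Hypothetical example: fetch the validation metrics and inspect the most recent job runs.
metrics = calculate_data_validation_metrics(
    metadata_source="myproject.mydataset.sanitization_job_metadata",
    languages_source="myproject.mydataset.sanitization_job_languages",
)
print(metrics.tail())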