in data_validation.py [0:0]
def record_validation_results(val_df, destination_table):
    """Run the range and mean checks for each tracked metric and record them.

    For every metric configured below, ``range_check`` and ``mean_check`` are
    called on ``val_df`` and one row holding their outcomes, plus the
    parameters the checks were run with, is sent to ``destination_table``
    through ``bigquery.Client.insert_rows_json``.

    Insert errors are printed rather than raised, so a failed insert for one
    metric does not prevent the remaining metrics from being processed.

    Args:
        val_df: Validation data handed unchanged to ``range_check`` and
            ``mean_check`` (presumably a DataFrame holding one column per
            metric name -- confirm against those helpers).
        destination_table: Table identifier passed to ``insert_rows_json``.

    Returns:
        None.
    """
    print(f"Recording validation results to destination table: {destination_table}")

    InputSet = namedtuple(
        "InputSet",
        [
            "name",
            "full_lookback_window",
            "range_test_window",
            "range_lower_bound",
            "range_upper_bound",
            "mean_test_window",
            "mean_lower_bound",
            "mean_upper_bound",
            "moving_average_window",
        ],
    )

    # Per-metric check parameters; the same values are echoed into each
    # recorded row under the *_num_days / *_percentile_* keys.
    metrics = [
        InputSet(
            name="pct_sanitized_search_terms",
            full_lookback_window=90,
            range_test_window=4,
            range_lower_bound=0.125,
            range_upper_bound=0.875,
            mean_test_window=8,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=7,
        ),
        InputSet(
            name="pct_sanitized_contained_at",
            full_lookback_window=90,
            range_test_window=4,
            range_lower_bound=0.125,
            range_upper_bound=0.875,
            mean_test_window=8,
            mean_lower_bound=0.025,
            mean_upper_bound=0.975,
            moving_average_window=7,
        ),
        InputSet(
            name="pct_sanitized_contained_numbers",
            full_lookback_window=90,
            range_test_window=3,
            range_lower_bound=0.075,
            range_upper_bound=0.925,
            mean_test_window=8,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=7,
        ),
        InputSet(
            name="pct_sanitized_contained_name",
            full_lookback_window=90,
            range_test_window=5,
            range_lower_bound=0.025,
            range_upper_bound=0.975,
            mean_test_window=7,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=7,
        ),
        InputSet(
            name="pct_terms_containing_us_census_surname",
            full_lookback_window=90,
            range_test_window=3,
            range_lower_bound=0.1,
            range_upper_bound=0.9,
            mean_test_window=8,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=9,
        ),
        InputSet(
            name="pct_uppercase_chars_all_search_terms",
            full_lookback_window=90,
            range_test_window=4,
            range_lower_bound=0.075,
            range_upper_bound=0.925,
            mean_test_window=8,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=7,
        ),
        InputSet(
            name="avg_words_all_search_terms",
            full_lookback_window=90,
            range_test_window=4,
            range_lower_bound=0.125,
            range_upper_bound=0.875,
            mean_test_window=8,
            mean_lower_bound=0.025,
            mean_upper_bound=0.975,
            moving_average_window=7,
        ),
        InputSet(
            name="pct_terms_non_english",
            full_lookback_window=90,
            range_test_window=4,
            range_lower_bound=0.125,
            range_upper_bound=0.875,
            mean_test_window=8,
            mean_lower_bound=0.01,
            mean_upper_bound=0.99,
            moving_average_window=5,
        ),
    ]

    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # `project` is read from module scope.
    client = bigquery.Client(project=project)
    # A single start time is shared by every row written during this call.
    started_at = datetime.utcnow()

    for metric in metrics:
        # range_check also returns a finished_at value as its first element;
        # it was always overwritten by mean_check's value before being used,
        # so it is discarded explicitly here.
        (
            _,
            num_ranges_compared,
            range_alarm,
            range_low,
            range_high,
            range_test_vals,
        ) = range_check(
            val_df,
            metric.name,
            metric.full_lookback_window,
            metric.range_test_window,
            metric.range_lower_bound,
            metric.range_upper_bound,
        )
        # The sixth element returned by mean_check was never used, so it is
        # discarded as well.
        (
            finished_at,
            num_moving_averages_compared,
            mean_alarm,
            mean_low,
            mean_high,
            _,
            mean_test_vals,
        ) = mean_check(
            val_df,
            metric.name,
            metric.full_lookback_window,
            metric.mean_test_window,
            metric.moving_average_window,
            metric.mean_lower_bound,
            metric.mean_upper_bound,
        )

        rows_to_insert = [
            {
                "from_sanitization_job_finished_at": finished_at.strftime(
                    timestamp_format
                ),
                "started_at": started_at.strftime(timestamp_format),
                "range_alarm": range_alarm,
                "range_low": range_low,
                "range_high": range_high,
                "num_ranges_compared": num_ranges_compared,
                "range_test_vals": str(range_test_vals),
                "mean_alarm": mean_alarm,
                "mean_low": mean_low,
                "mean_high": mean_high,
                "num_moving_averages_compared": num_moving_averages_compared,
                "mean_test_vals": str(mean_test_vals),
                "metric": metric.name,
                "full_lookback_window_num_days": metric.full_lookback_window,
                "range_test_window_num_days": metric.range_test_window,
                "mean_test_window_num_days": metric.mean_test_window,
                "moving_average_window_num_days": metric.moving_average_window,
                "range_percentile_lower_bound": metric.range_lower_bound,
                "range_percentile_upper_bound": metric.range_upper_bound,
                # Record the bounds the mean check was actually run with;
                # these keys previously received the range bounds by mistake.
                "mean_percentile_lower_bound": metric.mean_lower_bound,
                "mean_percentile_upper_bound": metric.mean_upper_bound,
            },
        ]

        # insert_rows_json's return value is treated as a collection of
        # errors: falsy means the row was accepted.
        errors = client.insert_rows_json(destination_table, rows_to_insert)
        if errors:
            # Best-effort: report the failure and continue with the next metric.
            print(f"Problem recording data validation results: {errors}")
        else:
            print("Data validation results recorded successfully!")