in repos/build_pipeline/pipelines/xgboost_pipeline.py [0:0]
def generate_query(dataset_dict: Dict, sagemaker_session: sagemaker.Session):
    """Build the SQL query spec used to assemble the training dataset.

    The query selects the distinct label and feature columns from the claims
    feature group table, left-joined to the customers feature group table on
    ``policy_id``.

    Args:
        dataset_dict: Mapping that must provide the keys
            ``"customers_fg_name"``, ``"claims_fg_name"``, ``"label_name"``
            and ``"features_names"`` (a list of column names).
        sagemaker_session: Session forwarded to ``get_fg_info`` for both
            feature group lookups.

    Returns:
        A dict with the keys ``"catalog"`` and ``"database"`` (both taken
        from the claims feature group info) and ``"query_string"`` (the
        rendered SQL text).
    """
    # Resolve the customers feature group first, then the claims one.
    customers_info = get_fg_info(
        dataset_dict["customers_fg_name"],
        sagemaker_session=sagemaker_session,
    )
    claims_info = get_fg_info(
        dataset_dict["claims_fg_name"],
        sagemaker_session=sagemaker_session,
    )

    # The label column comes first, followed by the features in given order.
    selected_columns = [dataset_dict["label_name"]] + dataset_dict["features_names"]
    quoted_columns = ", ".join([f'"{column}"' for column in selected_columns])

    # Claims is the left side of the join, so every claim row is kept even
    # when no customer row shares its policy_id.
    sql_text = (
        f"SELECT DISTINCT {quoted_columns}\n"
        f'FROM "{claims_info.table_name}" claims '
        f'LEFT JOIN "{customers_info.table_name}" customers\n'
        "ON claims.policy_id = customers.policy_id\n"
    )

    return {
        "catalog": claims_info.catalog,
        "database": claims_info.database,
        "query_string": sql_text,
    }