def generate_query()

in repos/build_pipeline/pipelines/xgboost_pipeline.py [0:0]


def generate_query(dataset_dict: Dict, sagemaker_session: sagemaker.Session):
    """Build the query that joins the claims and customers tables.

    The query selects the distinct label and feature columns from the
    claims table LEFT JOINed to the customers table on ``policy_id``.

    Args:
        dataset_dict: Mapping read for the keys ``"customers_fg_name"``,
            ``"claims_fg_name"``, ``"label_name"`` and ``"features_names"``.
            ``features_names`` is concatenated onto a list, so it is
            expected to be a list itself.
        sagemaker_session: Session forwarded unchanged to ``get_fg_info``.

    Returns:
        A dict with ``catalog`` and ``database`` (both taken from the claims
        info object) and ``query_string`` (the generated SQL text).
    """
    # Look up customers first, then claims; each result must expose
    # ``table_name`` (claims additionally ``catalog`` and ``database``).
    customers_name = dataset_dict["customers_fg_name"]
    customer_fg_info = get_fg_info(customers_name, sagemaker_session=sagemaker_session)
    claims_name = dataset_dict["claims_fg_name"]
    claims_fg_info = get_fg_info(claims_name, sagemaker_session=sagemaker_session)

    # The label column leads the projection, followed by the features.
    selected_columns = [dataset_dict["label_name"]] + dataset_dict["features_names"]
    quoted_columns = [f'"{c}"' for c in selected_columns]
    training_columns_string = ", ".join(quoted_columns)

    query_string = f"""SELECT DISTINCT {training_columns_string}
        FROM "{claims_fg_info.table_name}" claims LEFT JOIN "{customer_fg_info.table_name}" customers
        ON claims.policy_id = customers.policy_id
    """
    return {
        "catalog": claims_fg_info.catalog,
        "database": claims_fg_info.database,
        "query_string": query_string,
    }