def main()

in src/kg_validator.py [0:0]


def main(golden_queries_file, k=2):
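    """Evaluate retrieval against golden queries (falling back to the places DB
    when the CSV is missing), running each keyword/topic/tag configuration and
    writing per-configuration results under {DATA_PATH}/kg_results/."""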
    conn = get_sql_connection_for_validation()
    if os.path.exists(golden_queries_file):
        golden_queries = pd.read_csv(golden_queries_file, usecols=['search_query', 'url'])
        golden_queries['search_query'] = golden_queries['search_query'].str.lower()
        logger.info(f"Number of golden queries = {len(golden_queries)}")
        logger.info(golden_queries.head().T)

        # Map each golden URL to its hash, keep only queries whose URL resolved,
        # and collapse to one row per query with its list of relevant url hashes.
        url_hashes = get_url_hash_batch(golden_queries, conn)
        queries_with_ground_truth = golden_queries.merge(url_hashes, on='url', how='inner')
        queries_with_ground_truth = (queries_with_ground_truth
                                     .groupby('search_query')['url_hash'].agg(list)
                                     .reset_index()
                                     .rename(columns={'search_query': 'keyword'}))
    else:
        logger.warn(f"{golden_queries_file} does not exist and hence using the places DB")
        queries_with_ground_truth = fetch_ground_truths(conn)
        
    os.makedirs(f"{DATA_PATH}/kg_results", exist_ok=True)
    metric_cols = [f'precision@{k}', f'recall@{k}', f'ndcg@{k}', 'reciprocal_rank', 'average_precision']

    logger.info("\n Use keywords + topics + tags")
    golden_eval_df = perform_traditional_evals(conn, queries_with_ground_truth, k=k)
    if golden_eval_df is None or golden_eval_df.empty:
        logger.warning("No matches using keywords + topics + tag. Please check.")
    else:
        logger.info(golden_eval_df[metric_cols].mean())
        logger.info(len(golden_eval_df))
        golden_eval_df.to_csv(f"{DATA_PATH}/kg_results/golden_eval_df_keywords_topics_tags.csv", index=False)

    logger.info("\n Use keywords + tags and no topics")
    golden_eval_df = perform_traditional_evals(conn, queries_with_ground_truth, k=k, use_topics=False)
    if golden_eval_df is None or golden_eval_df.empty:
        logger.warning("No matches using keywords + tags and no topics. Please check.")
    else:
        logger.info(golden_eval_df[metric_cols].mean())
        logger.info(len(golden_eval_df))
        golden_eval_df.to_csv(f"{DATA_PATH}/kg_results/golden_eval_df_keywords_tags.csv", index=False)

    logger.info("\n Use keywords + topics and no tags")
    golden_eval_df = perform_traditional_evals(conn, queries_with_ground_truth, k=k, use_tags=False)
    if golden_eval_df is None or golden_eval_df.empty:
        logger.warning("No matches using keywords + topics and no tags. Please check.")
    else:
        logger.info(golden_eval_df[metric_cols].mean())
        logger.info(len(golden_eval_df))
        golden_eval_df.to_csv(f"{DATA_PATH}/kg_results/golden_eval_df_keywords_topics.csv", index=False)

    logger.info("\n Use keywords only")
    golden_eval_df = perform_traditional_evals(conn, queries_with_ground_truth,
                                               k=k, use_tags=False, use_topics=False)
    if golden_eval_df is None or golden_eval_df.empty:
        logger.warning("No matches using  keywords only. Please check.")
    else:
        logger.info(golden_eval_df[metric_cols].mean())
        logger.info(len(golden_eval_df))
        golden_eval_df.to_csv(f"{DATA_PATH}/kg_results/golden_eval_df_keywords.csv", index=False)