in src/kg_validator.py [0:0]
def _evaluate_and_save(conn, queries_with_ground_truth, k, metric_cols,
                       description, no_match_warning, output_name, eval_kwargs):
    """Run one evaluation variant, log its summary and save it as CSV.

    Args:
        conn: Connection object forwarded to ``perform_traditional_evals``.
        queries_with_ground_truth: Ground-truth frame forwarded unchanged to
            ``perform_traditional_evals``.
        k: Cut-off forwarded as ``k=k``.
        metric_cols: Columns of the result frame whose means are logged.
        description: Message logged before the evaluation starts.
        no_match_warning: Message logged when the evaluation returns ``None``
            or an empty frame.
        output_name: File name written under ``{DATA_PATH}/kg_results``.
        eval_kwargs: Extra keyword arguments for this variant (e.g.
            ``{'use_topics': False}``); an empty dict leaves the callee's
            defaults in effect.
    """
    logger.info(description)
    eval_df = perform_traditional_evals(conn, queries_with_ground_truth, k=k, **eval_kwargs)
    if eval_df is None or eval_df.empty:
        # Nothing to summarise or persist for this variant.
        logger.warning(no_match_warning)
        return
    logger.info(eval_df[metric_cols].mean())
    logger.info(len(eval_df))
    eval_df.to_csv(f"{DATA_PATH}/kg_results/{output_name}", index=False)


def main(golden_queries_file, k=2):
    """Evaluate retrieval quality for four keyword/topic/tag combinations.

    Ground truth is built from ``golden_queries_file`` when that path exists
    (a CSV read with the columns ``search_query`` and ``url``); otherwise it
    is fetched via ``fetch_ground_truths``. Each variant is then evaluated
    with ``perform_traditional_evals``; the mean of the metric columns and
    the row count are logged, and non-empty results are written to
    ``{DATA_PATH}/kg_results``.

    Args:
        golden_queries_file: Path to the golden-queries CSV.
        k: Cut-off used in the ``precision@k``/``recall@k``/``ndcg@k`` column
            names and forwarded to ``perform_traditional_evals``.
    """
    # NOTE(review): the connection is never closed here; confirm whether the
    # object returned by get_sql_connection_for_validation needs explicit cleanup.
    conn = get_sql_connection_for_validation()

    if os.path.exists(golden_queries_file):
        golden_queries = pd.read_csv(golden_queries_file, usecols=['search_query', 'url'])
        golden_queries['search_query'] = golden_queries['search_query'].str.lower()
        logger.info(f"Number of golden queries = {len(golden_queries)}")
        logger.info(golden_queries.head().T)
        url_hashes = get_url_hash_batch(golden_queries, conn)
        # Inner join: golden rows whose url has no entry in url_hashes are dropped.
        matched = golden_queries.merge(url_hashes, on='url', how='inner')
        # One row per query, holding the list of its url_hash values.
        queries_with_ground_truth = (
            matched.groupby('search_query')['url_hash']
            .agg(list)
            .reset_index()
            .rename(columns={'search_query': 'keyword'})
        )
    else:
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning(f"{golden_queries_file} does not exist and hence using the places DB")
        queries_with_ground_truth = fetch_ground_truths(conn)

    os.makedirs(f"{DATA_PATH}/kg_results", exist_ok=True)
    metric_cols = [f'precision@{k}', f'recall@{k}', f'ndcg@{k}', 'reciprocal_rank', 'average_precision']

    # (description, no-match warning, output file, extra evaluation kwargs).
    # Only the flags each variant explicitly disables are passed on, so the
    # callee's own defaults apply to everything else.
    variants = (
        (
            "\n Use keywords + topics + tags",
            "No matches using keywords + topics + tag. Please check.",
            "golden_eval_df_keywords_topics_tags.csv",
            {},
        ),
        (
            "\n Use keywords + tags and no topics",
            "No matches using keywords + tags and no topics. Please check.",
            "golden_eval_df_keywords_tags.csv",
            {'use_topics': False},
        ),
        (
            "\n Use keywords + topics and no tags",
            "No matches using keywords + topics and no tags. Please check.",
            "golden_eval_df_keywords_topics.csv",
            {'use_tags': False},
        ),
        (
            "\n Use keywords only",
            "No matches using keywords only. Please check.",
            "golden_eval_df_keywords.csv",
            {'use_tags': False, 'use_topics': False},
        ),
    )
    for description, no_match_warning, output_name, eval_kwargs in variants:
        _evaluate_and_save(
            conn, queries_with_ground_truth, k, metric_cols,
            description, no_match_warning, output_name, eval_kwargs,
        )