def process_history()

in evaluation_pipeline/retrieval.py [0:0]


def process_history(row_limit, history_file_path):
    """Load browsing history from a CSV file and prepare text columns.

    Reads the first ``row_limit`` rows of ``history_file_path``, converts
    ``last_visit_date`` to datetimes, builds the ``combined_text`` and
    ``combined_text_url`` columns, and drops rows that have neither a title
    nor a description.

    Args:
        row_limit: Maximum number of rows to keep from the top of the CSV.
        history_file_path: Path of the CSV file; it must provide the
            ``title``, ``description``, ``url`` and ``last_visit_date``
            columns.

    Returns:
        The processed DataFrame with a fresh 0-based index.
    """
    browsing_history = pd.read_csv(history_file_path).head(row_limit)

    # Values are interpreted as microseconds since the Unix epoch; missing
    # values fall back to the epoch itself ("1970-01-01").
    last_visit = pd.to_datetime(browsing_history['last_visit_date'], unit='us')
    browsing_history['last_visit_date'] = last_visit.fillna(pd.to_datetime("1970-01-01"))

    title = browsing_history['title'].fillna('')
    description = browsing_history['description'].fillna('')
    url = browsing_history['url'].fillna('')

    browsing_history['combined_text'] = title + " " + description
    # NOTE(review): the URL is appended without a separator after the
    # description - confirm this is intended before changing it.
    browsing_history['combined_text_url'] = browsing_history['combined_text'] + url

    # combined_text always contains the joining space, so comparing it to ''
    # can never match; strip whitespace first so rows with no title and no
    # description are actually removed.
    has_text = browsing_history['combined_text'].str.strip() != ''
    browsing_history = browsing_history.loc[has_text].reset_index(drop=True)

    print(len(browsing_history))

    return browsing_history