This notebook benchmarks ML models (downloaded into the `models/` directory, or `models_onnx/` for the quantized ONNX variants) against several validation sets.

The multi-tab 'all_users' dataset lives in a private data directory and may not be publicly available, but the single-tab validation dataset is public.


In [None]:
import pandas as pd
from pandas import DataFrame
from functools import partial

In [None]:
# Show the current working directory, then switch to the repository checkout
# so that the relative data/ and model paths used by later cells resolve.
%pwd
%cd "~/Documents/GitHub/smart-tab-grouping"


In [None]:
from rouge_score import rouge_scorer

In [None]:
# Load the validation sets (paths are relative to the repository root).
multitab_tests = pd.read_csv("data/individual_tests/private/all_users2.csv")
single_tab_tests = pd.read_csv("data/individual_tests/single_tab_validation.csv")
# Use item assignment: attribute assignment (`df.keywords = ""`) only updates an
# existing column and would silently set a plain attribute if the CSV has no
# "keywords" column, leaving the frame without the column the scorers read.
single_tab_tests["keywords"] = ""

garbled_tests = pd.read_csv("data/individual_tests/garbled.csv")



In [None]:
# Give every garbled test row an empty "keywords" string.
garbled_tests.loc[:, "keywords"] = ""

In [None]:
# Display the garbled test set for a visual check.
garbled_tests

In [None]:
from spellchecker import SpellChecker
# Shared spell checker used by is_clean_string.
spell = SpellChecker()
# Register brand / search-engine names with the checker's word list
# (presumably so they are not reported as unknown words — confirm against
# the pyspellchecker documentation).
spell.word_frequency.load_words(['microsoft', 'apple', 'google', 'bing', 'search', 'duckduckgo', 'yahoo'])


def is_clean_string(s: str) -> bool:
    """Return True when every whitespace-separated word of *s* looks well formed.

    A word is rejected (and the function returns False) when:

    * it contains more than one apostrophe, or a single apostrophe with more
      than one character on both sides (e.g. ``ab'cd``);
    * it has no apostrophe and the module-level ``spell`` checker reports it
      as unknown;
    * it has no apostrophe and a cased-down letter is directly followed by a
      cased-up letter (a switch to uppercase inside the word, e.g. ``fooBar``).

    Words with an acceptable apostrophe (``it's``, ``don't``, ``o'clock``)
    are accepted without a spelling or casing check. An empty string is clean.
    """
    for word in s.split():
        if "'" in word:
            # Splitting a word that contains an apostrophe always yields at
            # least two segments, so only the "too many" case needs a guard.
            segments = word.split("'")
            if len(segments) > 2:
                return False
            if len(segments[0]) > 1 and len(segments[1]) > 1:
                return False
            continue  # don't check spelling with 's
        if spell.unknown([word]):
            return False
        # Compare each character with its predecessor; non-letter neighbours
        # (digits, punctuation) never count as a case switch.
        for prev_char, cur_char in zip(word, word[1:]):
            if not (prev_char.isalpha() and cur_char.isalpha()):
                continue
            if cur_char.upper() == cur_char and prev_char.lower() == prev_char:  # switch to uppercase
                return False
    return True
            
        
    

In [None]:
# ROUGE scorer (rouge1, rouge2 and rougeL, with stemming enabled) used by compute_scores.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline
import numpy as np

In [None]:
# Feature-extraction pipeline whose output cos_sim averages into one vector per string.
# NOTE(review): device=-1 presumably selects the CPU — confirm against the transformers docs.
embedder = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2", device=-1)


In [None]:
def cos_sim(s1, s2):
    """Return the cosine similarity between the embeddings of two strings.

    Each string is run through the module-level ``embedder``; the first
    element of its output is averaged along axis 0 to get one vector per
    string, and the two vectors are compared with scikit-learn's
    ``cosine_similarity``. The 1x1 result is squeezed before being returned.
    """
    first_vec, second_vec = (
        np.mean(embedder(text)[0], axis=0) for text in (s1, s2)
    )
    pairwise = cosine_similarity(
        first_vec.reshape(1, -1),
        second_vec.reshape(1, -1),
    )
    return pairwise.squeeze()

    

In [None]:
# Quick sanity check of cos_sim on two short example strings.
cos_sim("Dogs", "Apple")

In [None]:
def compute_scores(row, pred_key=None):
    """Score one row's prediction against its ``'label'`` entry.

    Args:
        row: Row-like object indexable by ``'label'`` and by *pred_key*.
        pred_key: Key of the entry in *row* holding the predicted text.

    Returns:
        Dict with the ROUGE-1/2/L F-measures, the character lengths of the
        prediction and the label, their embedding cosine similarity, and a
        0/1 ``clean`` flag from ``is_clean_string``.
    """
    reference = row['label']
    prediction = row[pred_key]
    rouge = scorer.score(reference, prediction)

    result = {
        metric: rouge[metric].fmeasure
        for metric in ('rouge1', 'rouge2', 'rougeL')
    }
    result['pred_len'] = len(prediction)
    result['label_len'] = len(reference)
    result['cos_sim'] = cos_sim(reference, prediction)
    result['clean'] = 1 if is_clean_string(prediction) else 0
    return result

def compute_scores_no_label(row, pred_key=None):
    """Score one row that has no reference label.

    Only the 0/1 ``clean`` flag is produced, from ``is_clean_string`` applied
    to the entry of *row* stored under *pred_key*.
    """
    cleanliness = 1 if is_clean_string(row[pred_key]) else 0
    return {'clean': cleanliness}



In [None]:
def get_avg_scores(input_df: DataFrame, compare_column: str):
    """Return the per-metric averages over all rows of *input_df*.

    Rows are scored with ``compute_scores`` when the frame has a ``'label'``
    column and with ``compute_scores_no_label`` otherwise; *compare_column*
    names the column holding the predictions. The result is a dict mapping
    each metric name to its mean.
    """
    if 'label' in input_df.columns:
        score_row = compute_scores
    else:
        score_row = compute_scores_no_label

    per_row_scores = input_df.apply(
        lambda row: score_row(row, pred_key=compare_column),
        axis=1,
        result_type='expand',
    )
    return per_row_scores.mean().to_dict()


In [None]:
import os
import sys

# Make the repository's src/ package importable. The path is expressed relative
# to the home directory (the same "~/Documents/GitHub/smart-tab-grouping"
# checkout the %cd cell uses) rather than one user's absolute home path, so the
# notebook also runs under other accounts.
sys.path.append(os.path.expanduser("~/Documents/GitHub/smart-tab-grouping/src"))
from util.tab_titles import T5TopicGenerator, OnnxT5TopicGenerator

In [None]:
def compute_topic_keywords(row, legacy=False, prob_limit=None):
    """Generate a topic for a multi-tab row with the current ``topic_gen``.

    The row's ``"three_titles"`` entry is split on newlines into documents and
    its ``"keywords"`` entry on commas; both are handed to
    ``topic_gen.get_topic_with_keywords`` together with *legacy* and
    *prob_limit*, and that call's result is returned unchanged.
    """
    request = {
        "documents": row["three_titles"].split('\n'),
        "keywords": row["keywords"].split(','),
    }
    return topic_gen.get_topic_with_keywords(
        request,
        legacy=legacy,
        prob_limit=prob_limit,
    )

In [None]:
# Default generator read (as a global) by the compute_topic* helpers;
# the benchmark loop below rebinds it for each model.
topic_gen = T5TopicGenerator("./models/still-durian-309")

In [None]:
# Spot-check one entry of the model's generation_config.bad_words_ids by
# converting its token ids back to tokens (the index is an arbitrary sample).
#topic_gen.tokenizer.decode(topic_gen.model.generation_config.bad_words_ids[88])

topic_gen.tokenizer.convert_ids_to_tokens(topic_gen.model.generation_config.bad_words_ids[600])

In [None]:
def compute_topic_keywords_single(row, legacy=False, prob_limit=None):
    """Generate a topic for a single-tab row with the current ``topic_gen``.

    The row's ``"title"`` entry becomes the only document and its
    ``"keywords"`` entry is split on commas; both are handed to
    ``topic_gen.get_topic_with_keywords`` together with *legacy* and
    *prob_limit*, and that call's result is returned unchanged.
    """
    request = {
        "documents": [row["title"]],
        "keywords": row["keywords"].split(','),
    }
    return topic_gen.get_topic_with_keywords(
        request,
        legacy=legacy,
        prob_limit=prob_limit,
    )

In [None]:
def compute_topic(row):
    """Generate a topic from a row's titles only (no keywords).

    The ``"three_titles"`` entry is split on newlines and passed as the
    documents to ``topic_gen.get_topic``; that call's result is returned.
    """
    titles = row["three_titles"].split('\n')
    return topic_gen.get_topic({"documents": titles})

In [None]:

# Regenerate a topic for every multi-tab row with the default generator.
multitab_tests["recomputed_titles_keywords"] = multitab_tests.apply(compute_topic_keywords, axis=1)


In [None]:

# PyTorch checkpoints to benchmark; none of them use the legacy data format.
torch_models = [
    {"name": model_name, "legacy_data_format": False}
    for model_name in (
        "cool-yogurt-98",
        "dainty-blaze-127",
        "dainty-river-189",
        "gallant-sunset-190",
        "upbeat-eon-195",
        "devoted-puddle-246",
        "genial-tree-283",
        "major-elevator-302",
        "olive-silence-303",
        "sandy-forest-305",
        "still-durian-309",
        "eager-plant-323",
        "dulcet-durian-136",
        "lively-planet-17",
        "eager-fog-84",
        "dry-meadow-86",
        "classic-forest-87",
        "laced-terrain-88",
        "drawn-water-93",
    )
]

# Quantized ONNX exports to benchmark; none of them use the legacy data format.
onnx_quantized_models = [
    {"name": model_name, "legacy_data_format": False}
    for model_name in (
        "cool-yogurt-98",
        "dainty-blaze-127",
        "devoted-puddle-246",
        "sandy-forest-305",
        "still-durian-309",
        "eager-plant-323",
        "eager-fog-84",
        "dry-meadow-86",
        "classic-forest-87",
        "drawn-water-93",
    )
]




In [None]:
# Toggle: True benchmarks the quantized ONNX models, False the PyTorch checkpoints.
TEST_ONNX = True

In [None]:
# Pick the model list that matches the TEST_ONNX toggle.
models = onnx_quantized_models if TEST_ONNX else torch_models

In [None]:
# Reset "keywords" for the single-tab set: assigning an empty Series leaves the
# column with missing values in every row, and fillna("") then turns those
# (and any other missing values in the frame) into empty strings, so the
# row["keywords"].split(',') call in compute_topic_keywords_single works.
single_tab_tests["keywords"] = pd.Series(dtype=str)
single_tab_tests = single_tab_tests.fillna("")

In [None]:
# Per-model average scores, one dict per model, for each test set.
single_tab_score = []
multi_tab_score = []

for model_info in models:
    name = model_info["name"]
    legacy = model_info["legacy_data_format"]

    # Rebind the global generator that the compute_topic_keywords* helpers read.
    if TEST_ONNX:
        topic_gen = OnnxT5TopicGenerator(model_name=f"./models_onnx/{name}")
    else:
        topic_gen = T5TopicGenerator(model_name=f"./models/{name}")

    # The same per-model column name is used in both test frames.
    col = f"recomputed_title_keywords_{name}"

    multitab_tests[col] = multitab_tests.apply(
        partial(compute_topic_keywords, legacy=legacy), axis=1
    )
    print(f"{name} - MultiTab Tests")
    score = {**get_avg_scores(multitab_tests, col), "model": name}
    multi_tab_score.append(score)

    single_tab_tests[col] = single_tab_tests.apply(
        partial(compute_topic_keywords_single, legacy=legacy), axis=1
    )
    print(f"{name} - Single Tab Tests")
    score = {**get_avg_scores(single_tab_tests, col), "model": name}
    single_tab_score.append(score)
    
    

In [None]:
# List the titles of the garbled test set.
garbled_tests.title.to_list()

In [None]:
# Turn the per-model score dicts into one table per test set (one row per model).
single_tab_df = pd.DataFrame(single_tab_score)
multi_tab_df = pd.DataFrame(multi_tab_score)


In [None]:
# Display the multi-tab benchmark results.
multi_tab_df

In [None]:
# Display the single-tab benchmark results.
single_tab_df