def benchmark_experimental_vocab()

in benchmark/benchmark_pytext_vocab.py [0:0]


def benchmark_experimental_vocab():
    """Benchmark construction and lookup of pytext and experimental vocabs.

    Steps, in order:

    1. Load the AG_NEWS ``train`` split and map every id of every example
       back to its string token through ``vocab.itos``, producing both a
       flat token list and a per-example list of token lists.
    2. Derive a frequency-sorted vocabulary from the flat token list.
    3. Time the construction of ``PytextVocabulary``,
       ``PytextScriptVocabulary`` and ``ExperimentalScriptVocabulary``.
       The two script vocabularies are additionally compiled with
       ``torch.jit.script``; that compilation happens after the
       construction time has been printed, so it is not part of the
       reported figure.
    4. Hand each vocabulary to ``_run_benchmark_lookup`` /
       ``_run_benchmark_lookup_jit_for_loop`` with three inputs: the flat
       token list, the flat list wrapped in a single-element list, and
       the per-example list of lists.

    All results are reported through ``print``; nothing is returned.
    """
    (train,) = AG_NEWS(data_select='train')
    vocab = train.get_vocab()

    tokens: List[str] = []
    tokens_lists: List[List[str]] = []
    for _, text in train:
        # Translate the example's ids back to string tokens.
        sentence = [vocab.itos[token_id] for token_id in text.tolist()]
        tokens_lists.append(sentence)
        tokens.extend(sentence)

    print("Tokens size:", len(tokens))
    print("Tokens list size:", len(tokens_lists))

    # (token, count) pairs, most frequent first; ties keep first-seen order.
    sorted_by_freq_tuples = Counter(tokens).most_common()

    # The pytext vocabularies get a plain token list with "<unk>" in front;
    # the experimental vocabulary gets the token -> count mapping instead.
    vocab_list = ["<unk>"] + [token for token, _ in sorted_by_freq_tuples]
    ordered_dict = OrderedDict(sorted_by_freq_tuples)

    # pytext vocab construction
    print("Pytext Vocabulary")
    start = time.monotonic()
    pytext_vocab = PytextVocabulary(vocab_list)
    print("Construction time:", time.monotonic() - start)

    # pytext ScriptVocab construction (scripting is done outside the timing)
    print("Pytext Script Vocabulary")
    start = time.monotonic()
    pytext_script_vocab = PytextScriptVocabulary(vocab_list)
    print("Construction time:", time.monotonic() - start)
    jit_pytext_script_vocab = torch.jit.script(pytext_script_vocab)

    # experimental ScriptVocab construction (scripting is done outside the timing)
    print("Experimental Script Vocabulary")
    start = time.monotonic()
    experimental_script_vocab = ExperimentalScriptVocabulary(ordered_dict, unk_token="<unk>")
    print("Construction time:", time.monotonic() - start)
    jit_experimental_script_vocab = torch.jit.script(experimental_script_vocab)

    # One row per benchmark section: (heading, runner, vocabulary under test).
    lookup_runs = (
        ("Pytext Vocabulary - Eager Mode",
         _run_benchmark_lookup, pytext_vocab),
        ("Pytext ScriptVocab - Eager Mode",
         _run_benchmark_lookup, pytext_script_vocab),
        ("Experimental ScriptVocab - Eager Mode",
         _run_benchmark_lookup, experimental_script_vocab),
        ("Pytext ScriptVocab - Jit Mode",
         _run_benchmark_lookup, jit_pytext_script_vocab),
        ("Experimental ScriptVocab - Jit Mode",
         _run_benchmark_lookup, jit_experimental_script_vocab),
        ("Pytext ScriptVocab - Jit For Loop",
         _run_benchmark_lookup_jit_for_loop, jit_pytext_script_vocab),
        ("Experimental ScriptVocab - Jit For Loop",
         _run_benchmark_lookup_jit_for_loop, jit_experimental_script_vocab),
    )

    for heading, run_lookup, vocab_under_test in lookup_runs:
        print(heading)
        # A fresh ``[tokens]`` wrapper is built for every section.
        for lookup_input in (tokens, [tokens], tokens_lists):
            run_lookup(lookup_input, vocab_under_test)