def get_word_embeddings()

in custom_tensorflow_keras_nlp/util/preprocessing.py


import gzip
import os
import subprocess

import numpy as np

# `wait_for_file_stable` is assumed to be defined or imported elsewhere in this module.


def get_word_embeddings(t, folder, lang="en"):
    """Download pre-trained fastText word vectors and build an embedding matrix for Keras tokenizer `t`.

    Vectors are downloaded to (and cached in) `folder`; any token in `t` not found among the
    pre-trained vectors is mapped to an all-zeros row in the returned matrix.
    """
    vecs_url = f"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.{lang}.300.vec.gz"
    vecs_gz_filename = vecs_url.rpartition("/")[2]
    os.makedirs(folder, exist_ok=True)
    vecs_gz_filepath = os.path.join(folder, vecs_gz_filename)

    # Tokenizer.num_words may be None and word_index already includes the OOV token, so size
    # the vocabulary from word_index; +1 because Keras word indices start at 1 (0 is reserved).
    tokenizer_vocab_size = len(t.word_index) + 1

    if wait_for_file_stable(vecs_gz_filepath):
        print("Using existing embeddings file")
    else:
        print("Downloading word vectors...")
        subprocess.run([" ".join(["wget", "-NP", folder, vecs_url])], check=True, shell=True)

    print("Loading into memory...")
    embeddings_index = dict()
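    # fastText .vec format: a header line "<vocab_size> <dim>", then one token per line
    # followed by its <dim> float components, all space-separated.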
    with gzip.open(vecs_gz_filepath, "rt", encoding="utf-8") as zipf:
        firstline = zipf.readline()
        emb_vocab_size, emb_d = (int(n) for n in firstline.split())
        for line in zipf:
            values = line.split()
            word = values[0]
            # Only load subset of the embeddings recognised by the tokenizer:
            if word in t.word_index:
                coefs = np.asarray(values[1:], dtype="float32")
                embeddings_index[word] = coefs
    print("Loaded {} of {} word vectors for tokenizer vocabulary length {}".format(
        len(embeddings_index),
        emb_vocab_size,
        tokenizer_vocab_size,
    ))

    # create a weight matrix for words in training docs
    embedding_matrix = np.zeros((tokenizer_vocab_size, emb_d))
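    # Row 0 (Keras word indices start at 1) and any word without a pre-trained
    # vector are left as all-zeros.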
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix
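
Example usage (a minimal sketch, not part of the module: it assumes a tf.keras `Tokenizer` fitted on your own training texts; the names `train_texts` and "data/embeddings" are placeholders):

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a tokenizer on the training corpus (`train_texts` is a placeholder list of strings).
t = Tokenizer(oov_token="<unk>")
t.fit_on_texts(train_texts)

# Build the (vocab_size, 300) fastText embedding matrix, caching the download locally.
embedding_matrix = get_word_embeddings(t, folder="data/embeddings", lang="en")

# Initialise a frozen Embedding layer from the matrix; trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)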