def tokenize_and_pad_docs()

in pytorch_alternatives/custom_pytorch_nlp/util/preprocessing.py


import numpy as np
import torchtext


def tokenize_and_pad_docs(df, columns, max_length=40):
    """Tokenize, pad, and numericalize the documents in a DataFrame column."""
    docs = df[columns].values

    # torchtext.data.Field is the legacy API (moved to torchtext.legacy.data
    # in v0.9 and removed in v0.12); pin an older torchtext to run this as-is.
    t = torchtext.data.Field(
        lower=True,
        tokenize="basic_english",
        fix_length=max_length,
    )
    # Tokenize each document, then pad (or truncate) every one to fix_length tokens.
    docs = list(map(t.preprocess, docs))
    padded_docs = t.pad(docs)
    # Build the token-to-index vocabulary from the padded documents.
    t.build_vocab(padded_docs)
    print(f"Vocabulary size: {len(t.vocab)}")
    # Map each token to its integer index (stoi = "string to int").
    numericalized_docs = [
        [t.vocab.stoi[token] for token in doc] for doc in padded_docs
    ]
    print(f"Number of headlines: {len(numericalized_docs)}")
    return np.array(numericalized_docs), t
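
A minimal usage sketch, assuming a pandas DataFrame with a hypothetical "headline" column (the sample data and column name are illustrative, not from the source):

import pandas as pd

df = pd.DataFrame({"headline": [
    "Markets rally on strong earnings",
    "Storm delays hundreds of flights",
]})
encoded, field = tokenize_and_pad_docs(df, "headline", max_length=40)
print(encoded.shape)              # (2, 40): one row per headline, padded to max_length
print(field.vocab.stoi["<pad>"])  # integer index the Field uses for padding tokens

Returning the Field alongside the array matters because the same vocabulary is needed later to numericalize new text consistently (e.g. at inference time).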