in pytorch_alternatives/custom_pytorch_nlp/util/preprocessing.py [0:0]
import numpy as np
import torchtext


def tokenize_and_pad_docs(df, columns, max_length=40):
    """Tokenize the text in `columns`, pad/truncate each document to
    `max_length` tokens, and map tokens to integer IDs via a freshly
    built vocabulary.

    Returns the encoded documents as a numpy array together with the
    fitted Field (needed later to numericalize new text the same way).
    """
    docs = df[columns].values

    # Legacy torchtext Field API (torchtext < 0.9, or torchtext.legacy.data
    # in 0.9-0.11): lowercase, basic_english tokenizer, fixed-length padding.
    t = torchtext.data.Field(
        lower=True,
        tokenize="basic_english",
        fix_length=max_length,
    )
    docs = list(map(t.preprocess, docs))
    padded_docs = t.pad(docs)

    # Build the token -> index vocabulary from the padded documents.
    t.build_vocab(padded_docs)
    print(f"Vocabulary size: {len(t.vocab)}")

    # Numericalize: replace every token with its vocabulary index.
    numericalized_docs = [
        [t.vocab.stoi[token] for token in doc] for doc in padded_docs
    ]
    print(f"Number of headlines: {len(numericalized_docs)}")

    return np.array(numericalized_docs), t
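

# A minimal usage sketch, not part of the original file: the DataFrame and
# the "headline" column below are hypothetical, and it assumes a torchtext
# release that still ships the legacy Field API (see note above).
if __name__ == "__main__":
    import pandas as pd

    sample = pd.DataFrame({"headline": [
        "markets rally after earnings beat",
        "heavy rain expected across the region tomorrow",
    ]})
    encoded, field = tokenize_and_pad_docs(sample, "headline", max_length=40)
    print(encoded.shape)         # (2, 40): one padded ID sequence per headline
    print(field.vocab.itos[:4])  # special tokens first, e.g. '<unk>', '<pad>'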