# custom_tensorflow_keras_nlp/util/preprocessing.py
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def tokenize_and_pad_docs(df, columns, max_length=40):
    """Fit a Keras Tokenizer on a text column; return padded sequences and the tokenizer."""
    docs = df[columns].values

    # Prepare the tokenizer and build the vocabulary from the raw documents
    t = Tokenizer()
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1  # +1 because index 0 is reserved for padding
    print(f"Vocabulary size: {vocab_size}")

    # Integer-encode the documents
    encoded_docs = t.texts_to_sequences(docs)

    # Pad (or truncate) every sequence to exactly max_length tokens
    n_truncated = sum(1 for doc in encoded_docs if len(doc) > max_length)
    print(f"Padding docs to max_length={max_length} (truncating {n_truncated} docs)")
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding="post")
    print(f"Number of headlines: {len(padded_docs)}")
    return padded_docs, t
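

# A minimal usage sketch (not part of the original module): the DataFrame,
# column name, and max_length below are hypothetical, chosen only to show
# the expected call pattern and return types.
if __name__ == "__main__":
    import pandas as pd

    sample_df = pd.DataFrame(
        {"headline": ["markets rally on tech earnings", "storm delays coastal flights"]}
    )
    padded, tokenizer = tokenize_and_pad_docs(sample_df, "headline", max_length=8)
    print(padded.shape)          # (2, 8): two headlines, each padded to 8 token ids
    print(tokenizer.word_index)  # learned word -> integer index mapping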