def tokenize_and_pad_docs()

in custom_tensorflow_keras_nlp/util/preprocessing.py


from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


def tokenize_and_pad_docs(df, columns, max_length=40):
    """Tokenize the text in `columns` of `df` and pad/truncate each sequence to `max_length`."""
    docs = df[columns].values
    # fit a word-level tokenizer on the documents
    t = Tokenizer()
    t.fit_on_texts(docs)
    vocab_size = len(t.word_index) + 1
    # integer-encode the documents
    encoded_docs = t.texts_to_sequences(docs)
    print(f"Vocabulary size: {vocab_size}")
    n_truncated = sum(1 for doc in encoded_docs if len(doc) > max_length)
    print(f"Padding docs to max_length={max_length} (truncating {n_truncated} docs)")
    # zero-pad (or truncate) every sequence at the end so all have the same length
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding="post")
    print(f"Number of headlines: {len(padded_docs)}")
    return padded_docs, t
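
A minimal usage sketch (the DataFrame, the "headline" column name, and the sample strings are illustrative assumptions, not taken from the repo):

import pandas as pd

df = pd.DataFrame({"headline": [
    "stocks rally as markets open higher",
    "local team wins championship after overtime thriller",
]})
padded_docs, tokenizer = tokenize_and_pad_docs(df, "headline", max_length=40)
vocab_size = len(tokenizer.word_index) + 1  # typically needed later to size an Embedding layer
print(padded_docs.shape)  # (2, 40): one fixed-length integer sequence per headline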