in ludwig/features/text_feature.py [0:0]
def feature_data(column, metadata, preprocessing_parameters):
    """Encode a text column at character level and at word level.

    ``build_sequence_matrix`` is invoked twice on the same ``column``:
    first with the ``char_*`` entries, then with the ``word_*`` entries.

    Keys read from ``metadata`` (for both ``char`` and ``word``):
    ``<level>_str2idx``, ``<level>_max_sequence_length``,
    ``<level>_pad_symbol``, ``<level>_unk_symbol``.

    Keys read from ``preprocessing_parameters``:
    ``<level>_tokenizer`` and ``<level>_vocab_file`` (per level), plus the
    shared ``padding``, ``lowercase`` and ``pretrained_model_name_or_path``.

    :param column: the sequences to encode, forwarded unchanged as
        ``sequences``.
    :param metadata: mapping holding the per-level entries listed above.
    :param preprocessing_parameters: mapping holding the tokenizer and
        padding settings listed above.
    :return: a ``(char_data, word_data)`` tuple with whatever
        ``build_sequence_matrix`` returns for each level.
    :raises KeyError: if any of the keys listed above is missing.
    """

    def _encode(level):
        # `level` is the key prefix ('char' or 'word'); concatenating it
        # with the fixed suffixes yields exactly the keys listed in the
        # docstring, e.g. 'char_str2idx' or 'word_vocab_file'.
        return build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata[level + '_str2idx'],
            tokenizer_type=preprocessing_parameters[level + '_tokenizer'],
            length_limit=metadata[level + '_max_sequence_length'],
            padding_symbol=metadata[level + '_pad_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=metadata[level + '_unk_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters[
                level + '_vocab_file'
            ],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'
            ],
        )

    # Tuple elements are evaluated left to right, so the character-level
    # matrix is fully built before any word-level lookup happens.
    return _encode('char'), _encode('word')