in ludwig/features/text_feature.py [0:0]
def feature_meta(column, preprocessing_parameters):
    """Build character- and word-level vocabulary metadata for a text column.

    ``create_vocabulary`` is invoked twice on ``column``: first with the
    fixed ``'characters'`` tokenizer, then with the tokenizer named by
    ``preprocessing_parameters['word_tokenizer']`` (only this second call
    is given ``word_vocab_file``). Each call must yield exactly seven
    values, otherwise the tuple unpacking fails.

    Args:
        column: The text data, forwarded unchanged to ``create_vocabulary``.
        preprocessing_parameters: Mapping that provides the keys
            ``char_most_common``, ``word_most_common``, ``word_tokenizer``,
            ``word_vocab_file``, ``lowercase``, ``unknown_symbol``,
            ``padding_symbol`` and ``pretrained_model_name_or_path``.

    Returns:
        A flat 14-tuple: the seven character-level results followed by the
        seven word-level results, each group ordered as
        ``(idx2str, str2idx, str2freq, max_len, pad_idx, pad_symbol,
        unk_symbol)``. What each element contains is defined by
        ``create_vocabulary``, which is not visible from here.
    """
    params = preprocessing_parameters

    # Character-level pass: the tokenizer is fixed and no vocab file is given.
    char_result = create_vocabulary(
        column,
        tokenizer_type='characters',
        num_most_frequent=params['char_most_common'],
        lowercase=params['lowercase'],
        unknown_symbol=params['unknown_symbol'],
        padding_symbol=params['padding_symbol'],
        pretrained_model_name_or_path=params['pretrained_model_name_or_path'],
    )
    # Unpack immediately so a result of the wrong length fails before the
    # word-level pass is started.
    (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_pad_idx, c_pad_symbol, c_unk_symbol,
    ) = char_result

    # Word-level pass: tokenizer and optional vocab file come from the config.
    word_result = create_vocabulary(
        column,
        tokenizer_type=params['word_tokenizer'],
        num_most_frequent=params['word_most_common'],
        lowercase=params['lowercase'],
        vocab_file=params['word_vocab_file'],
        unknown_symbol=params['unknown_symbol'],
        padding_symbol=params['padding_symbol'],
        pretrained_model_name_or_path=params['pretrained_model_name_or_path'],
    )
    (
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_pad_idx, w_pad_symbol, w_unk_symbol,
    ) = word_result

    char_meta = (
        c_idx2str, c_str2idx, c_str2freq, c_max_len,
        c_pad_idx, c_pad_symbol, c_unk_symbol,
    )
    word_meta = (
        w_idx2str, w_str2idx, w_str2freq, w_max_len,
        w_pad_idx, w_pad_symbol, w_unk_symbol,
    )
    # Character entries first, then word entries, as one flat tuple.
    return char_meta + word_meta