in tzrec/features/tokenize_feature.py [0:0]
def fg_json(self) -> List[Dict[str, Any]]:
    """Build the fg (feature-generation) json configs for this feature.

    Produces an optional ``text_normalizer`` stub config (when the feature
    config declares one) followed by the ``tokenize_feature`` config itself.
    When a normalizer is present, the tokenizer's ``expression`` is rewired
    to consume the normalizer's output feature.

    Returns:
        List[Dict[str, Any]]: fg json configs, in pipeline order.

    Raises:
        AssertionError: if ``tokenizer_type`` is not ``bpe`` or
            ``sentencepiece``.
    """
    # Fail fast: validate tokenizer_type before building any configs.
    # NOTE(review): assert is stripped under `python -O`; a future
    # (breaking) change could raise ValueError instead.
    assert self.config.tokenizer_type in [
        "bpe",
        "sentencepiece",
    ], "tokenizer_type only support [bpe, sentencepiece] now."

    fg_cfgs = []
    expression = self.config.expression
    if self.config.HasField("text_normalizer"):
        norm_cfg = self.config.text_normalizer
        # Chain the two fg ops: the tokenizer reads the normalizer's output.
        norm_fg_name = self.name + "__text_norm"
        expression = "feature:" + norm_fg_name
        norm_fg_cfg = {
            "feature_type": "text_normalizer",
            "feature_name": norm_fg_name,
            "expression": self.config.expression,
            "is_gbk_input": False,
            "is_gbk_output": False,
            # Intermediate feature only; not emitted as a model input.
            "stub_type": True,
        }
        if norm_cfg.HasField("max_length"):
            norm_fg_cfg["max_length"] = norm_cfg.max_length
        if len(self.stop_char_file) > 0:
            norm_fg_cfg["stop_char_file"] = self.stop_char_file
        if len(norm_cfg.norm_options) > 0:
            # Fold the selected normalize options into a single bitmask
            # ("parameter"); TEXT_REMOVE_SPACE additionally sets its own flag.
            parameter = 0
            for norm_option in norm_cfg.norm_options:
                if norm_option in NORM_OPTION_MAPPING:
                    parameter += NORM_OPTION_MAPPING[norm_option]
                if norm_option == TextNormalizeOption.TEXT_REMOVE_SPACE:
                    norm_fg_cfg["remove_space"] = True
            norm_fg_cfg["parameter"] = parameter
        fg_cfgs.append(norm_fg_cfg)

    fg_cfg = {
        "feature_type": "tokenize_feature",
        "feature_name": self.name,
        "default_value": self.config.default_value,
        "vocab_file": self.vocab_file,
        "expression": expression,
        "tokenizer_type": self.config.tokenizer_type,
        "output_type": "word_id",
        "output_delim": self._fg_encoded_multival_sep,
    }
    fg_cfgs.append(fg_cfg)
    return fg_cfgs