in tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc [881:949]
// Serializes the builder's state into a `FastWordpieceTokenizerConfig`
// flatbuffer and returns the finished buffer's bytes as a std::string.
//
// The exported config contains the trie, the failure structures, the failure
// pops pool, the precomputed result for the suffix indicator, the suffix
// indicator, the unknown token and its id, plus scalar settings. When
// `support_detokenization_` is true, the vocabulary tokens and a parallel
// "is suffix" flag array are exported as well; otherwise both arrays are
// written empty.
//
// Returns:
//   * The serialized flatbuffer on success.
//   * FailedPreconditionError if `vocab_->LookupWord(i)` yields no value for
//     some id `i` in [0, vocab_->Size()).
absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
  flatbuffers::FlatBufferBuilder builder;

  // All vectors and strings are serialized up front: FlatBuffers does not
  // allow creating nested objects while a table is under construction, so
  // every Create*() call must precede the `FastWordpieceTokenizerConfigBuilder`
  // below.
  const auto trie_array = builder.CreateVector(trie_array_);

  // Convert the in-memory failure structs into their flatbuffer struct form.
  std::vector<tensorflow::text::FailureStruct> failure_struct_fbs_vector;
  failure_struct_fbs_vector.reserve(failure_struct_array_.size());
  for (const auto& item : failure_struct_array_) {
    failure_struct_fbs_vector.emplace_back(item.failure_link,
                                           item.failure_pops_offset_length);
  }
  const auto failure_structure_array =
      builder.CreateVectorOfStructs(failure_struct_fbs_vector);

  const auto failure_pops_pool = builder.CreateVector(failure_pops_pool_);
  const auto precomputed_result_for_suffix_indicator =
      builder.CreateVector(precomputed_result_for_suffix_indicator_);
  const auto suffix_indicator = builder.CreateString(suffix_indicator_);
  const auto unk_token = builder.CreateString(unk_token_);

  // Vocabulary data is only populated when detokenization is supported; the
  // two vectors are filled in lockstep (one entry per vocab id).
  std::vector<flatbuffers::Offset<flatbuffers::String>> vocab_fbs_vector;
  std::vector<bool> vocab_is_suffix_fbs_vector;
  if (support_detokenization_) {
    const auto vocab_size = vocab_->Size();
    vocab_fbs_vector.reserve(vocab_size);
    vocab_is_suffix_fbs_vector.reserve(vocab_size);
    for (int i = 0; i < vocab_size; ++i) {
      const absl::optional<absl::string_view> word = vocab_->LookupWord(i);
      if (!word.has_value()) {
        return absl::FailedPreconditionError(
            "Impossible. `token_id` is definitely within the range of vocab "
            "token ids; hence LookupWord() should always succeed.");
      }
      absl::string_view token = word.value();
      bool is_suffix_token = false;
      // A token that is exactly the suffix indicator is not treated as a
      // suffix token and is stored unchanged.
      if (!suffix_indicator_.empty() && token != suffix_indicator_ &&
          absl::StartsWith(token, suffix_indicator_)) {
        is_suffix_token = true;
        // For suffix tokens, we remove the suffix indicator to save space and
        // for ease of use in detokenization (where the suffix indicator will
        // be stripped anyway).
        token = token.substr(suffix_indicator_.size());
      }
      vocab_fbs_vector.emplace_back(builder.CreateString(token));
      vocab_is_suffix_fbs_vector.push_back(is_suffix_token);
    }
  }
  auto vocab_array = builder.CreateVector(vocab_fbs_vector);
  auto vocab_is_suffix_array =
      builder.CreateVector(vocab_is_suffix_fbs_vector);

  // Assemble the root table from the offsets and scalar fields.
  FastWordpieceTokenizerConfigBuilder wtcb(builder);
  wtcb.add_trie_array(trie_array);
  wtcb.add_failure_struct_array(failure_structure_array);
  wtcb.add_failure_pops_pool(failure_pops_pool);
  wtcb.add_trie_suffix_root(trie_suffix_root_);
  wtcb.add_trie_punct_failure_link_node(trie_punct_failure_link_node_);
  wtcb.add_max_bytes_per_token(max_bytes_per_token_);
  wtcb.add_suffix_indicator(suffix_indicator);
  wtcb.add_unk_token(unk_token);
  wtcb.add_unk_token_id(unk_token_id_);
  wtcb.add_precomputed_result_for_suffix_indicator(
      precomputed_result_for_suffix_indicator);
  // The config stores the positive form of the `no_pretokenization_` flag.
  wtcb.add_end_to_end(!no_pretokenization_);
  wtcb.add_support_detokenization(support_detokenization_);
  wtcb.add_vocab_array(vocab_array);
  wtcb.add_vocab_is_suffix_array(vocab_is_suffix_array);
  FinishFastWordpieceTokenizerConfigBuffer(builder, wtcb.Finish());

  // Copy the finished buffer out of the builder, which owns the memory.
  return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                     builder.GetSize());
}