absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const

in tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc [881:949]


absl::StatusOr<std::string> FastWordpieceBuilder::ExportToFlatBuffer() const {
  flatbuffers::FlatBufferBuilder builder;

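  // Serialize the trie array, and convert each failure struct into its
  // flatbuffer counterpart before writing the failure-link table and the
  // failure-pops pool.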
  const auto trie_array = builder.CreateVector(trie_array_);
  std::vector<tensorflow::text::FailureStruct> failure_struct_fbs_vector;
  failure_struct_fbs_vector.reserve(failure_struct_array_.size());
  for (const auto& item : failure_struct_array_) {
    failure_struct_fbs_vector.emplace_back(item.failure_link,
                                           item.failure_pops_offset_length);
  }
  const auto failure_structure_array =
      builder.CreateVectorOfStructs(failure_struct_fbs_vector);
  const auto failure_pops_pool = builder.CreateVector(failure_pops_pool_);
  const auto precomputed_result_for_suffix_indicator =
      builder.CreateVector(precomputed_result_for_suffix_indicator_);
  const auto suffix_indicator = builder.CreateString(suffix_indicator_);
  const auto unk_token = builder.CreateString(unk_token_);

  std::vector<flatbuffers::Offset<flatbuffers::String>> vocab_fbs_vector;
  std::vector<bool> vocab_is_suffix_fbs_vector;

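  // If detokenization is supported, serialize the vocabulary as well. Suffix
  // tokens are stored without the suffix indicator and flagged in a parallel
  // bool array.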
  if (support_detokenization_) {
    vocab_fbs_vector.reserve(vocab_->Size());
    for (int i = 0; i < vocab_->Size(); ++i) {
      const absl::optional<absl::string_view> word = vocab_->LookupWord(i);
      if (!word.has_value()) {
        return absl::FailedPreconditionError(
            "Impossible. `token_id` is definitely within the range of vocab "
            "token ids; hence LookupWord() should always succeed.");
      }
      absl::string_view token = word.value();
      bool is_suffix_token = false;
      if (!suffix_indicator_.empty() && token != suffix_indicator_ &&
          absl::StartsWith(token, suffix_indicator_)) {
        is_suffix_token = true;
        // For suffix tokens, we remove the suffix indicator to save space and
        // for ease of use in detokenization (where the suffix indicator will be
        // stripped anyway).
        token = token.substr(suffix_indicator_.size());
      }
      vocab_fbs_vector.emplace_back(builder.CreateString(token));
      vocab_is_suffix_fbs_vector.emplace_back(is_suffix_token);
    }
  }

  auto vocab_array = builder.CreateVector(vocab_fbs_vector);
  auto vocab_is_suffix_array = builder.CreateVector(vocab_is_suffix_fbs_vector);

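  // Assemble the FastWordpieceTokenizerConfig table from the serialized
  // pieces.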
  FastWordpieceTokenizerConfigBuilder wtcb(builder);
  wtcb.add_trie_array(trie_array);
  wtcb.add_failure_struct_array(failure_structure_array);
  wtcb.add_failure_pops_pool(failure_pops_pool);
  wtcb.add_trie_suffix_root(trie_suffix_root_);
  wtcb.add_trie_punct_failure_link_node(trie_punct_failure_link_node_);

  wtcb.add_max_bytes_per_token(max_bytes_per_token_);
  wtcb.add_suffix_indicator(suffix_indicator);
  wtcb.add_unk_token(unk_token);
  wtcb.add_unk_token_id(unk_token_id_);
  wtcb.add_precomputed_result_for_suffix_indicator(
      precomputed_result_for_suffix_indicator);
  wtcb.add_end_to_end(!no_pretokenization_);
  wtcb.add_support_detokenization(support_detokenization_);
  wtcb.add_vocab_array(vocab_array);
  wtcb.add_vocab_is_suffix_array(vocab_is_suffix_array);
  FinishFastWordpieceTokenizerConfigBuffer(builder, wtcb.Finish());
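  // Copy the finished flatbuffer into a std::string and return it.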
  return std::string(reinterpret_cast<const char*>(builder.GetBufferPointer()),
                     builder.GetSize());
}
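
The returned string owns the serialized FastWordpieceTokenizerConfig flatbuffer. Below is a minimal consumption sketch; it assumes the builder is driven through the BuildModelAndExportToFlatBuffer() helper declared in fast_wordpiece_tokenizer_model_builder.h and that FastWordpieceTokenizer::Create() accepts a pointer to the config buffer. Verify both signatures against the headers before relying on it.

// Sketch only: the helper name, parameter order, and Create() signature are
// assumptions taken from the surrounding tensorflow_text headers.
#include <string>
#include <vector>

#include "absl/status/status.h"
#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer.h"
#include "tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.h"

absl::Status BuildAndLoad() {
  const std::vector<std::string> vocab = {"[UNK]", "hello", "world", "##ing"};

  // Builds the trie/failure structures and ends in the ExportToFlatBuffer()
  // call shown above.
  const auto model_buffer = tensorflow::text::BuildModelAndExportToFlatBuffer(
      vocab, /*max_bytes_per_token=*/100, /*suffix_indicator=*/"##",
      /*unk_token=*/"[UNK]", /*no_pretokenization=*/false,
      /*support_detokenization=*/true);
  if (!model_buffer.ok()) return model_buffer.status();

  // The config is read in place from the string's data pointer, so the string
  // must outlive the tokenizer.
  const auto tokenizer =
      tensorflow::text::FastWordpieceTokenizer::Create(model_buffer->data());
  if (!tokenizer.ok()) return tokenizer.status();

  return absl::OkStatus();
}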