absl::Status FastWordpieceTokenizeWithOffsetsOp<Rt>::Invoke(InvokeContext* context)

in tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h [96:181]


template <tflite::shim::Runtime Rt>
absl::Status FastWordpieceTokenizeWithOffsetsOp<Rt>::Invoke(
    InvokeContext* context) {
  SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues));
  const auto& values_vec = input_values->template As<tstring, 1>();

  SH_ASSIGN_OR_RETURN(const auto wp_model, context->GetInput(kWpModel));
  // OK to create on every call because FastWordpieceTokenizer is a
  // lightweight, memory-mapped wrapper on the `wp_model` tensor, and thus
  // Create() is very cheap.
  auto fast_wordpiece_tokenizer =
      ::tensorflow::text::FastWordpieceTokenizer::Create(
          wp_model->template Data<uint8>().data());
  SH_RETURN_IF_ERROR(fast_wordpiece_tokenizer.status());

  // TODO(xysong): Optimize based on which information below is requested.
  std::vector<std::string> subwords;
  std::vector<int> subword_ids;
  std::vector<int> begin_offset;
  std::vector<int> end_offset;
  std::vector<int> row_splits;

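  // `row_splits` follows the RaggedTensor convention: a leading 0 plus one
  // entry per input string, so row i spans [row_splits[i], row_splits[i+1]).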
  row_splits.push_back(0);

  // Iterate through all the values and wordpiece tokenize them.
  for (int i = 0; i < values_vec.Dim(0); ++i) {
    // Tokenize into subwords and record the offset locations.
    const int original_num_wordpieces = subwords.size();
    fast_wordpiece_tokenizer->Tokenize(values_vec(i), &subwords, &subword_ids,
                                       &begin_offset, &end_offset);
    const int delta_num_wordpieces = subwords.size() - original_num_wordpieces;

    // Record the row splits.
    row_splits.push_back(delta_num_wordpieces + row_splits.back());
  }

  const int subwords_size = subwords.size();
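  // Allocate the output tensors now that the total number of wordpieces is
  // known.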
  SH_ASSIGN_OR_RETURN(
      auto output_subwords,
      context->GetOutput(kOutputSubwords, Shape({subwords_size})));
  auto output_subwords_vec =
      output_subwords->template As<tensorflow::tstring, 1>();

  SH_ASSIGN_OR_RETURN(
      auto output_ids,
      context->GetOutput(
          kOutputIds,
          Shape({static_cast<int>(
              subword_ids.size())}))); /* same shape as `output_subwords` */
  auto output_ids_vec = output_ids->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(
      auto output_row_splits,
      context->GetOutput(kOutputRowSplits,
                         Shape({static_cast<int>(row_splits.size())})));
  auto output_row_splits_vec = output_row_splits->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(auto start_values,
                      context->GetOutput(kStartValues, Shape({subwords_size})));
  auto start_values_vec = start_values->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(auto end_values,
                      context->GetOutput(kEndValues, Shape({subwords_size})));
  auto end_values_vec = end_values->template As<int64, 1>();

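  // Copy the tokenization results (subwords, ids, row splits, and start/end
  // offsets) into the output tensors.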
  for (int i = 0; i < subwords.size(); ++i) {
    output_subwords_vec(i) = subwords[i];
  }

  for (int i = 0; i < subword_ids.size(); ++i) {
    output_ids_vec(i) = subword_ids[i];
  }

  for (int i = 0; i < row_splits.size(); ++i) {
    output_row_splits_vec(i) = row_splits[i];
  }

  for (int i = 0; i < begin_offset.size(); ++i) {
    start_values_vec(i) = begin_offset[i];
  }

  for (int i = 0; i < end_offset.size(); ++i) {
    end_values_vec(i) = end_offset[i];
  }

  return absl::OkStatus();
}
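
The flat outputs are tied together by `row_splits` in the standard RaggedTensor encoding: the wordpieces produced from input string `i` occupy the half-open range `[row_splits[i], row_splits[i + 1])` of `output_subwords`, `output_ids`, `start_values`, and `end_values`. A minimal, self-contained sketch of that convention (the `GroupByRowSplits` helper and the sample data are illustrative only, not part of this kernel):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Illustrative helper (not part of the kernel): groups the flat wordpiece
// list back into per-input rows using the row_splits convention above.
std::vector<std::vector<std::string>> GroupByRowSplits(
    const std::vector<std::string>& subwords,
    const std::vector<int64_t>& row_splits) {
  std::vector<std::vector<std::string>> rows;
  for (size_t i = 0; i + 1 < row_splits.size(); ++i) {
    rows.emplace_back(subwords.begin() + row_splits[i],
                      subwords.begin() + row_splits[i + 1]);
  }
  return rows;
}

int main() {
  // Example: two input strings that produced 3 and 2 wordpieces, respectively.
  std::vector<std::string> subwords = {"un", "##afford", "##able", "hello", "!"};
  std::vector<int64_t> row_splits = {0, 3, 5};
  for (const auto& row : GroupByRowSplits(subwords, row_splits)) {
    for (const auto& piece : row) std::cout << piece << ' ';
    std::cout << '\n';
  }
  return 0;
}

On the TensorFlow side the same reassembly is typically done with `tf.RaggedTensor.from_row_splits`.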