in tensorflow_text/core/kernels/fast_wordpiece_tokenizer_kernel_template.h [96:181]
absl::Status FastWordpieceTokenizeWithOffsetsOp<Rt>::Invoke(
    InvokeContext* context) {
  // Tokenizes every string of the `kInputValues` input with a
  // FastWordpieceTokenizer built from the `kWpModel` input, then writes five
  // rank-1 outputs:
  //   kOutputSubwords  - the wordpiece strings of all inputs, concatenated.
  //   kOutputIds       - the wordpiece ids filled in by Tokenize() (int64).
  //   kOutputRowSplits - cumulative wordpiece counts, starting at 0, with one
  //                      extra entry per input string (int64).
  //   kStartValues     - the begin offsets filled in by Tokenize() (int64).
  //   kEndValues       - the end offsets filled in by Tokenize() (int64).
  // A failure to fetch an input, create the tokenizer or obtain an output is
  // handed to the SH_* status macros; otherwise absl::OkStatus() is returned.
  SH_ASSIGN_OR_RETURN(const auto input_values, context->GetInput(kInputValues));
  const auto& values_vec = input_values->template As<tstring, 1>();

  SH_ASSIGN_OR_RETURN(const auto wp_model, context->GetInput(kWpModel));
  // OK to create on every call because FastWordpieceTokenizer is a
  // lightweight, memory-mapped wrapper on `wp_model` tensor, and thus
  // Create() is very cheap.
  auto fast_wordpiece_tokenizer =
      ::tensorflow::text::FastWordpieceTokenizer::Create(
          wp_model->template Data<uint8>().data());
  SH_RETURN_IF_ERROR(fast_wordpiece_tokenizer.status());

  // TODO(xysong): Optimize based on which information below is requested.
  std::vector<std::string> subwords;
  std::vector<int> subword_ids;
  std::vector<int> begin_offset;
  std::vector<int> end_offset;
  std::vector<int> row_splits;

  // The number of input strings is loop-invariant, so read it once.
  const auto num_values = values_vec.Dim(0);
  // `row_splits` ends up with exactly one entry per input plus the leading 0,
  // so allocate it up front instead of growing it push by push.
  if (num_values > 0) {
    row_splits.reserve(
        static_cast<std::vector<int>::size_type>(num_values) + 1);
  }
  row_splits.push_back(0);

  // Iterate through all the values and wordpiece tokenize them. Tokenize()
  // appends to the shared vectors, so the growth of `subwords` across one call
  // is the number of wordpieces produced for that input.
  for (int i = 0; i < num_values; ++i) {
    const int num_before = static_cast<int>(subwords.size());
    fast_wordpiece_tokenizer->Tokenize(values_vec(i), &subwords, &subword_ids,
                                       &begin_offset, &end_offset);
    const int num_added = static_cast<int>(subwords.size()) - num_before;
    // Record the row splits.
    row_splits.push_back(row_splits.back() + num_added);
  }

  // Obtain the outputs. Shapes and request order match the original code.
  const int subwords_size = static_cast<int>(subwords.size());
  SH_ASSIGN_OR_RETURN(
      auto output_subwords,
      context->GetOutput(kOutputSubwords, Shape({subwords_size})));
  auto output_subwords_vec =
      output_subwords->template As<tensorflow::tstring, 1>();

  SH_ASSIGN_OR_RETURN(
      auto output_ids,
      context->GetOutput(
          kOutputIds,
          Shape({static_cast<int>(
              subword_ids.size())}))); /* same shape as `output_subwords` */
  auto output_ids_vec = output_ids->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(
      auto output_row_splits,
      context->GetOutput(kOutputRowSplits,
                         Shape({static_cast<int>(row_splits.size())})));
  auto output_row_splits_vec = output_row_splits->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(auto start_values,
                      context->GetOutput(kStartValues, Shape({subwords_size})));
  auto start_values_vec = start_values->template As<int64, 1>();

  SH_ASSIGN_OR_RETURN(auto end_values,
                      context->GetOutput(kEndValues, Shape({subwords_size})));
  auto end_values_vec = end_values->template As<int64, 1>();

  // Copies every element of `src` into the tensor view `dst`, element by
  // element. The element count is converted to `int` once so the loop
  // compares int with int instead of mixing a signed index with an unsigned
  // size().
  const auto copy_to_output = [](const auto& src, auto& dst) {
    const int count = static_cast<int>(src.size());
    for (int i = 0; i < count; ++i) {
      dst(i) = src[i];
    }
  };

  copy_to_output(subwords, output_subwords_vec);
  copy_to_output(subword_ids, output_ids_vec);
  copy_to_output(row_splits, output_row_splits_vec);
  copy_to_output(begin_offset, start_values_vec);
  copy_to_output(end_offset, end_values_vec);

  return absl::OkStatus();
}