in tensorflow_text/core/kernels/wordpiece_kernel.cc [198:286]
void Compute(OpKernelContext* ctx) override {
  // Tokenizes each element of the "input_values" string tensor into
  // wordpieces using the vocab lookup table, producing four aligned outputs:
  //   output_values       - flat vector of wordpiece strings
  //   output_row_lengths  - row partition (lengths or splits, per
  //                         row_partition_type_) mapping wordpieces back to
  //                         their source value
  //   start_values        - begin offset of each wordpiece in its source
  //   limit_values        - end offset of each wordpiece in its source
  const Tensor* input_values;
  OP_REQUIRES_OK(ctx, ctx->input("input_values", &input_values));
  const auto& values_vec = input_values->flat<tstring>();

  lookup::LookupInterface* lookup_table;
  OP_REQUIRES_OK(ctx,
                 GetLookupTable("vocab_lookup_table", ctx, &lookup_table));
  core::ScopedUnref unref_me(lookup_table);
  LookupTableVocab vocab_map(lookup_table, ctx);

  std::vector<string> subwords;
  std::vector<int> begin_offset;
  std::vector<int> end_offset;
  std::vector<int> row_partition;
  // Row splits carry a leading 0 (the start offset of the first row);
  // row lengths do not.
  if (row_partition_type_ == ROW_SPLITS) {
    row_partition.push_back(0);
  }

  // Tokenize every input value. The tokenizer appends wordpieces and their
  // offsets to the accumulator vectors and reports how many wordpieces this
  // particular value produced, which drives the row partition below.
  // Use the container's own (signed) index type to avoid signed/unsigned
  // comparison issues on large inputs.
  const int64 num_values = values_vec.size();
  for (int64 i = 0; i < num_values; ++i) {
    int num_wordpieces = 0;
    OP_REQUIRES_OK(
        ctx, ToStatus(WordpieceTokenize(
                 values_vec(i), max_bytes_per_word_, max_chars_per_token_,
                 suffix_indicator_, use_unknown_token_, unknown_token_,
                 split_unknown_characters_, &vocab_map, &subwords,
                 &begin_offset, &end_offset, &num_wordpieces)));
    // Record this value's contribution to the row partition.
    switch (row_partition_type_) {
      case ROW_LENGTHS:
        row_partition.push_back(num_wordpieces);
        break;
      case ROW_SPLITS:
        row_partition.push_back(num_wordpieces + row_partition.back());
        break;
    }
  }

  // Allocate flat (rank-1) outputs now that total sizes are known. The
  // three per-wordpiece outputs share one shape.
  const TensorShape subwords_shape({static_cast<int64>(subwords.size())});
  const TensorShape row_partition_shape(
      {static_cast<int64>(row_partition.size())});

  Tensor* output_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("output_values", subwords_shape,
                                           &output_values));
  auto output_values_vec = output_values->vec<tstring>();

  Tensor* output_row_partition;
  OP_REQUIRES_OK(ctx,
                 ctx->allocate_output("output_row_lengths",
                                      row_partition_shape,
                                      &output_row_partition));
  auto output_row_partition_vec = output_row_partition->vec<int64>();

  Tensor* start_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values", subwords_shape,
                                           &start_values));
  auto start_values_vec = start_values->vec<int64>();

  Tensor* limit_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values", subwords_shape,
                                           &limit_values));
  auto limit_values_vec = limit_values->vec<int64>();

  // Copy the accumulated results into the output tensors.
  for (size_t i = 0; i < subwords.size(); ++i) {
    output_values_vec(i) = subwords[i];
  }
  for (size_t i = 0; i < row_partition.size(); ++i) {
    output_row_partition_vec(i) = row_partition[i];
  }
  for (size_t i = 0; i < begin_offset.size(); ++i) {
    start_values_vec(i) = begin_offset[i];
  }
  for (size_t i = 0; i < end_offset.size(); ++i) {
    limit_values_vec(i) = end_offset[i];
  }
}