in tensorflow_text/core/kernels/split_merge_tokenize_kernel.cc [124:206]
// Tokenizes every string in `input_values` with TokenizeByLabel, using the
// matching slice of `labels`, and writes the tokens and their offsets to the
// op outputs.
//
// Inputs (fetched from `ctx` by name):
//   input_values: vector of tstring, one entry per string to tokenize.
//   labels:       vector holding the labels of all strings back to back; the
//                 labels for string i are labels[row_splits[i],
//                 row_splits[i + 1]).
//   row_splits:   vector of int32 with one more entry than `input_values`.
//
// Outputs (allocated on `ctx` by name):
//   output_values:     tstring vector with the tokens of all strings.
//   output_row_splits: int64 vector of cumulative token counts; entry 0 is 0
//                      and entry i + 1 is the number of tokens produced for
//                      strings 0..i.
//   start_values:      int64 vector filled from the begin offsets reported by
//                      TokenizeByLabel.
//   limit_values:      int64 vector filled from the end offsets reported by
//                      TokenizeByLabel.
//
// Any validation or tokenization failure is reported on `ctx` and the method
// returns without producing the remaining outputs.
void Compute(OpKernelContext* ctx) override {
  const Tensor* input_values;
  OP_REQUIRES_OK(ctx, ctx->input("input_values", &input_values));
  const Tensor* labels;
  OP_REQUIRES_OK(ctx, ctx->input("labels", &labels));
  const Tensor* row_splits;
  OP_REQUIRES_OK(ctx, ctx->input("row_splits", &row_splits));

  // All three inputs must be vectors. The loop below walks the flattened
  // values, so a higher-rank `input_values` would otherwise run past the end
  // of `row_splits`, and a scalar has no dimension 0 to query.
  OP_REQUIRES(ctx, input_values->dims() == 1,
              errors::InvalidArgument(
                  "Expecting input_values to be a vector, got rank ",
                  input_values->dims()));
  OP_REQUIRES(ctx, labels->dims() == 1,
              errors::InvalidArgument(
                  "Expecting labels to be a vector, got rank ",
                  labels->dims()));
  OP_REQUIRES(ctx, row_splits->dims() == 1,
              errors::InvalidArgument(
                  "Expecting row_splits to be a vector, got rank ",
                  row_splits->dims()));
  OP_REQUIRES(ctx, input_values->dim_size(0) == row_splits->dim_size(0) - 1,
              errors::InvalidArgument("Expecting row_splits have ",
                                      input_values->dim_size(0) + 1,
                                      " elements, got ",
                                      row_splits->dim_size(0)));

  std::vector<string> tokens;
  std::vector<int> begin_offset;
  std::vector<int> end_offset;
  std::vector<int> output_row_splits(1, 0);

  // Iterate through all the values and tokenize them.
  const auto& values_vec = input_values->flat<tstring>();
  const auto& row_splits_vec = row_splits->flat<int32>();
  const auto num_labels = labels->dim_size(0);
  for (int i = 0; i < values_vec.size(); ++i) {
    const auto row_start = row_splits_vec(i);
    const auto row_limit = row_splits_vec(i + 1);
    // Tensor::Slice CHECK-fails (aborting the process) on bounds that are
    // negative, decreasing, or past the end of `labels`, so reject such
    // row_splits here with a regular error status instead.
    OP_REQUIRES(
        ctx,
        row_start >= 0 && row_start <= row_limit && row_limit <= num_labels,
        errors::InvalidArgument(
            "Expecting row_splits to be non-decreasing and within [0, ",
            num_labels, "], got row_splits[", i, "] = ", row_start,
            " and row_splits[", i + 1, "] = ", row_limit));

    // Tokenize into tokens and record the offset locations.
    int num_tokens = 0;
    OP_REQUIRES_OK(
        ctx, TokenizeByLabel(values_vec(i), labels->Slice(row_start, row_limit),
                             force_split_at_break_character_, &tokens,
                             &begin_offset, &end_offset, &num_tokens));

    // Record the row splits.
    output_row_splits.push_back(num_tokens + output_row_splits.back());
  }

  std::vector<int64> output_tokens_shape;
  output_tokens_shape.push_back(tokens.size());
  std::vector<int64> output_row_splits_shape;
  output_row_splits_shape.push_back(output_row_splits.size());

  Tensor* output_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("output_values",
                                           TensorShape(output_tokens_shape),
                                           &output_values));
  auto output_values_vec = output_values->vec<tstring>();

  Tensor* output_row_splits_tensor;
  OP_REQUIRES_OK(ctx,
                 ctx->allocate_output("output_row_splits",
                                      TensorShape(output_row_splits_shape),
                                      &output_row_splits_tensor));
  auto output_row_splits_vec = output_row_splits_tensor->vec<int64>();

  // The offset outputs are sized by the token count.
  // NOTE(review): this presumes TokenizeByLabel appends exactly one begin and
  // one end offset per token - confirm against its implementation.
  Tensor* start_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("start_values",
                                           TensorShape(output_tokens_shape),
                                           &start_values));
  auto start_values_vec = start_values->vec<int64>();

  Tensor* limit_values;
  OP_REQUIRES_OK(ctx, ctx->allocate_output("limit_values",
                                           TensorShape(output_tokens_shape),
                                           &limit_values));
  auto limit_values_vec = limit_values->vec<int64>();

  // Copy the accumulated results into the output tensors.
  for (size_t i = 0; i < tokens.size(); ++i) {
    output_values_vec(i) = tokens[i];
  }
  for (size_t i = 0; i < output_row_splits.size(); ++i) {
    output_row_splits_vec(i) = output_row_splits[i];
  }
  for (size_t i = 0; i < begin_offset.size(); ++i) {
    start_values_vec(i) = begin_offset[i];
  }
  for (size_t i = 0; i < end_offset.size(); ++i) {
    limit_values_vec(i) = end_offset[i];
  }
}