void Compute()

in tensorflow_text/core/kernels/whitespace_tokenize_kernel.cc [54:145]


  // Splits each input string (given as a run of codepoints) into
  // whitespace-delimited tokens.
  //
  // Inputs read from `context`:
  //   input(0): flat int32 values; each value is tested with u_isUWhiteSpace.
  //   input(1): splits of type SPLITS_TYPE; consecutive entries delimit one
  //             string inside the flat values.
  //
  // Outputs written to `context`:
  //   "output_outer_splits":        same shape as the input splits; entry i is
  //                                 the index of the first token of string i,
  //                                 and the last entry is the token count.
  //   "output_values":              the non-whitespace input values, in order.
  //   "output_values_inner_splits": start index of every token within
  //                                 "output_values", plus a closing entry.
  //   "output_offset_starts":       int64 start of every token, counted in
  //                                 input values from the start of its string.
  //   "output_offset_limits":       int64 exclusive end of every token, in the
  //                                 same units.
  //
  // Fails with InvalidArgument when the splits tensor is empty or when a pair
  // of consecutive splits does not describe a range inside the input values.
  void Compute(OpKernelContext* context) override {
    // Get inputs
    const Tensor& input_values_tensor = context->input(0);
    const auto input_values_flat = input_values_tensor.flat<int32>();
    const Tensor& input_splits_tensor = context->input(1);
    const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>();

    const int64 num_values = input_values_flat.size();
    const int64 num_splits = input_splits_flat.size();

    // The closing outer split is written at index num_splits - 1, so an empty
    // splits tensor would otherwise cause a write at index -1.
    OP_REQUIRES(context, num_splits >= 1,
                errors::InvalidArgument(
                    "Input splits must contain at least one element, got ",
                    num_splits));

    // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
    // tensor), our output dimension will always be 3-D (flat_values of rank 1
    // with two splits - inner for the tokenized values and the outer for those
    // grouped by the original strings).
    // A few things to note:
    // 1) The values and inner splits of the tokenized strings have an unknown
    // length, as well as the offsets, so we allocate them at the end.
    // 2) The outer splits of the tokenized strings matches that of the offset
    // splits. Thus, we will only return one set and use it for all of them.
    // 3) The outer splits shape will match the original input_splits.
    Tensor* output_outer_splits_tensor = nullptr;
    OP_REQUIRES_OK(context,
                   context->allocate_output("output_outer_splits",
                                            input_splits_tensor.shape(),
                                            &output_outer_splits_tensor));
    auto output_outer_splits_flat =
        output_outer_splits_tensor->flat<SPLITS_TYPE>();

    std::vector<int32> output_values;
    std::vector<SPLITS_TYPE> output_values_inner_splits;
    std::vector<int64> output_offset_starts;
    std::vector<int64> output_offset_limits;

    // Loop over the codepoints (a split at a time) and create splits of tokens.
    // Indices are 64-bit so that 64-bit splits are never truncated.
    for (int64 splits_idx = 0; splits_idx < num_splits - 1; ++splits_idx) {
      const int64 str_begin = input_splits_flat(splits_idx);
      const int64 str_end = input_splits_flat(splits_idx + 1);
      // Reject ranges that would read outside of the input values.
      OP_REQUIRES(
          context,
          0 <= str_begin && str_begin <= str_end && str_end <= num_values,
          errors::InvalidArgument(
              "Invalid input splits: [", str_begin, ", ", str_end,
              ") at splits index ", splits_idx,
              " is not a valid range within the ", num_values,
              " input values"));

      output_outer_splits_flat(splits_idx) = output_offset_starts.size();
      bool token_has_start_set = false;
      for (int64 values_idx = str_begin; values_idx < str_end; ++values_idx) {
        const int32 codepoint = input_values_flat(values_idx);
        if (u_isUWhiteSpace(codepoint)) {
          // Whitespace ends the current token (if any). The limit is
          // exclusive: it is the position of this first whitespace after the
          // token, relative to the start of the current string.
          if (token_has_start_set) {
            output_offset_limits.push_back(values_idx - str_begin);
            token_has_start_set = false;
          }
        } else {
          // Non whitespace. Start a new token if needed, and append the
          // codepoint to our current token.
          if (!token_has_start_set) {
            // Set token start offset relative to current string.
            output_offset_starts.push_back(values_idx - str_begin);
            // Set split to indicate start of a new token.
            output_values_inner_splits.push_back(output_values.size());
            token_has_start_set = true;
          }
          output_values.push_back(codepoint);
        }
      }
      // Done with the codepoints of the current string. A token that is still
      // open runs up to the end of the string.
      if (token_has_start_set) {
        output_offset_limits.push_back(str_end - str_begin);
      }
    }
    // Now set the closing value of our splits.
    output_outer_splits_flat(num_splits - 1) = output_offset_starts.size();
    output_values_inner_splits.push_back(output_values.size());

// Allocate a rank-1 output named after the vector `name` and copy the vector
// into it. The copy is skipped for empty vectors so that memcpy is never
// handed a null pointer.
#define DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(name, dtype)                 \
  int64 name##_size = name.size();                                           \
  Tensor* name##_tensor = nullptr;                                           \
  OP_REQUIRES_OK(context,                                                    \
                 context->allocate_output(#name, TensorShape({name##_size}), \
                                          &name##_tensor));                  \
  if (name##_size > 0) {                                                     \
    memcpy(name##_tensor->flat<dtype>().data(), name.data(),                 \
           name##_size * sizeof(dtype));                                     \
  }

    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits,
                                            SPLITS_TYPE);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64);

#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR
  }