void Compute()

in tensorflow_text/core/kernels/unicode_script_tokenize_kernel.cc [58:171]

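Compute() is the kernel behind tensorflow_text's UnicodeScriptTokenizer: it walks a ragged batch of Unicode codepoints and starts a new token wherever ICU reports a change of script (uscript_getScript), optionally treating whitespace runs as tokens of their own (keep_whitespace_). It emits the token codepoints as a rank-3 ragged tensor plus per-token start/limit offsets relative to each input string.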

  void Compute(OpKernelContext* context) override {
    // Get inputs
    const Tensor& input_values_tensor = context->input(0);
    const auto input_values_flat = input_values_tensor.flat<int32>();
    const Tensor& input_splits_tensor = context->input(1);
    const auto input_splits_flat = input_splits_tensor.flat<SPLITS_TYPE>();

    // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
    // tensor), our output will always be 3-D (flat_values of rank 1 with two
    // splits tensors - an inner one delimiting the tokenized values and an
    // outer one grouping tokens by their original strings).
    // A few things to note:
    // 1) The values, inner splits, and offsets of the tokenized strings all
    // have unknown lengths, so we allocate them at the end.
    // 2) The outer splits of the tokenized strings match those of the offset
    // splits. Thus, we only return one set and use it for all of them.
    // 3) The outer splits shape will match the original input_splits.
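    // As a concrete example (with keep_whitespace_ = false), the single input
    // string "hi!" arrives as flat_values [104, 105, 33] with splits [0, 3].
    // Since '!' is script COMMON while 'h'/'i' are LATIN, the outputs become:
    //   output_values              = [104, 105, 33]
    //   output_values_inner_splits = [0, 2, 3]   ("hi" and "!")
    //   output_offset_starts       = [0, 2]
    //   output_offset_limits       = [2, 3]
    //   output_outer_splits        = [0, 2]   (both tokens come from string 0)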
    Tensor* output_outer_splits_tensor;
    OP_REQUIRES_OK(context,
                   context->allocate_output("output_outer_splits",
                                            input_splits_tensor.shape(),
                                            &output_outer_splits_tensor));
    auto output_outer_splits_flat =
        output_outer_splits_tensor->flat<SPLITS_TYPE>();

    std::vector<int32> output_values;
    std::vector<SPLITS_TYPE> output_values_inner_splits;
    std::vector<int64> output_offset_starts;
    std::vector<int64> output_offset_limits;

    // Loop over the codepoints (one split at a time) and create splits of tokens.
    icu::ErrorCode status;
    for (int splits_idx = 0; splits_idx < input_splits_flat.size() - 1;
         splits_idx++) {
      output_outer_splits_flat(splits_idx) = output_offset_starts.size();
      UScriptCode prev_script = USCRIPT_INVALID_CODE;
      bool token_has_start_set = false;
      int32 curr_skipped_spaces = 0;  // Used when computing the end of a token
      const int curr_word_start_idx = input_splits_flat(splits_idx);
      bool was_space = false;
      for (int values_idx = curr_word_start_idx;
           values_idx < input_splits_flat(splits_idx + 1); values_idx++) {
        const int32 input_value = input_values_flat(values_idx);
        const bool is_space = u_isUWhiteSpace(input_value);
        UScriptCode script = uscript_getScript(input_value, status);
        // Treat lookup failures as a distinct script code and ignore the
        // error.
        if (status.isFailure()) {
          status.reset();
          script = USCRIPT_INVALID_CODE;
        }
        // Split out a new token if the Unicode script changes from the
        // previous codepoint (or, when keeping whitespace, at a
        // space/non-space boundary).
        if (script != prev_script ||
            (keep_whitespace_ && is_space != was_space)) {
          if (token_has_start_set) {
            output_offset_limits.push_back(values_idx - curr_word_start_idx -
                                           curr_skipped_spaces);
          }
          prev_script = script;
          token_has_start_set = false;
        }
        // Only copy characters other than whitespace (unless keep_whitespace_
        // is set). Because of this, also do not start a new token until a
        // non-space character is reached.
        if (!is_space || keep_whitespace_) {
          if (!token_has_start_set) {
            // Set token start offset relative to current string.
            output_offset_starts.push_back(values_idx - curr_word_start_idx);
            // Set split to indicate start of a new token.
            output_values_inner_splits.push_back(output_values.size());
            token_has_start_set = true;
          }
          output_values.push_back(input_value);
        }
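        // Count trailing whitespace so token limits exclude skipped spaces;
        // reset the count whenever a non-space codepoint appears.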
        if (!keep_whitespace_) {
          if (is_space) {
            curr_skipped_spaces++;
          } else {
            curr_skipped_spaces = 0;
          }
        }
        was_space = is_space;
      }
      // Looping through the codepoints of the current string is complete. Now
      // set the limit of its last token (if we found a start earlier).
      if (token_has_start_set) {
        output_offset_limits.push_back(input_splits_flat(splits_idx + 1) -
                                       curr_word_start_idx -
                                       curr_skipped_spaces);
      }
    }
    // Now set the closing value of the outer splits.
    output_outer_splits_flat(input_splits_flat.size() - 1) =
        output_offset_starts.size();
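    // Close the inner splits with the total number of output values.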
    output_values_inner_splits.push_back(output_values.size());

// Allocate output & fill output tensors.
#define DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(name, dtype)                 \
  int64 name##_size = name.size();                                           \
  Tensor* name##_tensor = nullptr;                                           \
  OP_REQUIRES_OK(context,                                                    \
                 context->allocate_output(#name, TensorShape({name##_size}), \
                                          &name##_tensor));                  \
  auto name##_data = name##_tensor->flat<dtype>().data();                    \
  memcpy(name##_data, name.data(), name##_size * sizeof(dtype));

    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values, int32);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_values_inner_splits,
                                            SPLITS_TYPE);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_starts, int64);
    DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR(output_offset_limits, int64);

#undef DECLARE_ALLOCATE_AND_FILL_OUTPUT_TENSOR
  }
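
For reference, here is a minimal standalone sketch of the same script-boundary rule, assuming ICU4C is available (compile with -licuuc). It mirrors the kernel's inner loop for the keep_whitespace_ = false case, but returns per-token [start, limit) offsets instead of filling tensors; TokenizeByScript and its signature are illustrative, not part of the library.

#include <unicode/uchar.h>
#include <unicode/uscript.h>

#include <cstdio>
#include <utility>
#include <vector>

// Splits a run of codepoints whenever the ICU script code changes, dropping
// whitespace, and returns per-token [start, limit) offsets.
std::vector<std::pair<int, int>> TokenizeByScript(
    const std::vector<UChar32>& codepoints) {
  std::vector<std::pair<int, int>> tokens;
  UScriptCode prev_script = USCRIPT_INVALID_CODE;
  int token_start = -1;    // -1 means no token is currently open.
  int skipped_spaces = 0;  // Trailing spaces excluded from a token's limit.
  for (int i = 0; i < static_cast<int>(codepoints.size()); ++i) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode script = uscript_getScript(codepoints[i], &status);
    if (U_FAILURE(status)) script = USCRIPT_INVALID_CODE;
    const bool is_space = u_isUWhiteSpace(codepoints[i]);
    if (script != prev_script) {
      // A script change closes the currently open token, if any.
      if (token_start >= 0) {
        tokens.emplace_back(token_start, i - skipped_spaces);
      }
      token_start = -1;
      prev_script = script;
    }
    if (!is_space && token_start < 0) token_start = i;  // Open a new token.
    skipped_spaces = is_space ? skipped_spaces + 1 : 0;
  }
  if (token_start >= 0) {
    tokens.emplace_back(token_start,
                        static_cast<int>(codepoints.size()) - skipped_spaces);
  }
  return tokens;
}

int main() {
  // "hi! 你好": Latin letters, Common punctuation, a space, then Han.
  const std::vector<UChar32> input = {0x68, 0x69, 0x21, 0x20, 0x4F60, 0x597D};
  for (const auto& t : TokenizeByScript(input)) {
    std::printf("token at [%d, %d)\n", t.first, t.second);
  }
  return 0;  // Prints [0, 2), [2, 3), [4, 6).
}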