void TextProcessor::process()

in inference/src/translator/text_processor.cpp [110:216]


void TextProcessor::process(std::string&& input, AnnotatedText& source, Segments& segments) const {
  /**
   * This is an RAII guard for auto-freeing a malloc-allocated pointer when it leaves scope.
   *
   * The address of the internal pointer is passed to JavaScript where it is allocated
   * via `malloc`, however we are responsible for deallocating the memory via `free` on
   * the C++ side.
   *
   * Wrapping the deallocation in the checked destructor ensures that once the memory has been 
   * allocated on the JS side, it is guaranteed to be deallocated when this stack frame is popped,
   * whether from returning or from an exception being thrown.
   */
  struct ScopedPtr {
    int32_t* ptr = nullptr;

    const int32_t operator[](const size_t index) const {
      return ptr[index];
    }

    ~ScopedPtr() {
      if (ptr) {
        free(ptr);
        ptr = nullptr;
      }
    }
  };

  ScopedPtr starts;
  ScopedPtr ends;
  int32_t sentenceCount = 0;

  source = std::move(AnnotatedText(std::move(input)));
  std::string_view input_converted(source.text.data(), source.text.size());

  EM_ASM({
    // Attach a lazily initialized cache of the Intl.Segmenter to the WASM Module.
    // This is critical to prevent creating one instance of Intl.Segmenter per
    // translation request.
    if (!Module.getOrCreateSentenceSegmenter) {
      Module.getOrCreateSentenceSegmenter = (function() {
        let segmenters = new Map();

        return function(lang) {
          let segmenter = segmenters.get(lang);

          if (!segmenter) {
            segmenter = new Intl.Segmenter(lang, { granularity: "sentence" });
            segmenters.set(lang, segmenter);
          }

          return segmenter;
        };
      })();
    }

    // Convert the UTF-8 C++ strings into UTF-16 for JavaScript.
    const inputUTF16 = UTF8ToString($0);
    const lang = UTF8ToString($1);

    // Segment the UTF-16 input with the Intl.Segmenter.
    const segmenter = Module.getOrCreateSentenceSegmenter(lang);
    const sentencesUTF16 = Array.from(segmenter.segment(inputUTF16));
    const sentenceCount = sentencesUTF16.length;

    // Allocate enough space to mark the start and end UTF-8 byte indices for each sentence.
    const bytesPerInt = 4;
    const startsPtr = _malloc(sentenceCount * bytesPerInt);
    const endsPtr = _malloc(sentenceCount * bytesPerInt);

    if (!startsPtr || !endsPtr) {
      throw new Error("Failed to allocate WASM memory for segmentation.");
    }

    // Iterate through all of the segments and map the start and end of each
    // sentence as UTF-8 byte ranges so that the C++ code can operate on them.
    let sentenceEndUTF8 = 0;
    sentencesUTF16.forEach(({ segment: sentenceUTF16 }, index) => {
      const sentenceStartUTF8 = sentenceEndUTF8;
      sentenceEndUTF8 += lengthBytesUTF8(sentenceUTF16);

      setValue(startsPtr + index * bytesPerInt, sentenceStartUTF8, 'i32');
      setValue(endsPtr + index * bytesPerInt, sentenceEndUTF8, 'i32');
    });

    setValue($2, sentenceCount, 'i32');
    setValue($3, startsPtr, 'i32');
    setValue($4, endsPtr, 'i32');
  }, input_converted.data(), sourceLanguage_.c_str(), &sentenceCount, &starts.ptr, &ends.ptr);

  for (int32_t idx = 0; idx < sentenceCount; idx++) {
    int32_t start = starts[idx];
    int32_t end = ends[idx];
    int32_t length = end - start;

    marian::string_view sentence(source.text.data() + start, length);
    std::vector<string_view> wordRanges;
    Segment segment = tokenize(sentence, wordRanges);

    // There are some cases where SentencePiece or vocab returns no words
    // after normalization. 0 prevents any empty entries from being added.
    if (segment.size() > 0) {
      // Wrap segment into sentences of at most maxLengthBreak_ tokens and
      // tell source about them.
      wrap(segment, wordRanges, segments, source);
    }
  }
}