in inference/src/translator/text_processor.cpp [110:216]
void TextProcessor::process(std::string&& input, AnnotatedText& source, Segments& segments) const {
/**
* An RAII guard that frees a malloc-allocated pointer when it goes out of scope.
*
* The address of the internal pointer is passed to JavaScript, where the memory is
* allocated via `malloc`; we are responsible for deallocating it via `free` on the
* C++ side.
*
* Wrapping the deallocation in the null-checked destructor guarantees that once the
* memory has been allocated on the JS side, it is freed when this stack frame is
* popped, whether by a normal return or by an exception being thrown.
*/
struct ScopedPtr {
int32_t* ptr = nullptr;
int32_t operator[](size_t index) const {
return ptr[index];
}
~ScopedPtr() {
if (ptr) {
free(ptr);
ptr = nullptr;
}
}
};
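// Out-parameters written by the EM_ASM block below: the number of sentences found,
// plus malloc-allocated arrays of each sentence's start and end UTF-8 byte offsets.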
ScopedPtr starts;
ScopedPtr ends;
int32_t sentenceCount = 0;
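// Move the input into the AnnotatedText so it owns the backing storage; the view
// below must reference source.text, since `input` has been moved from.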
source = AnnotatedText(std::move(input));
std::string_view input_converted(source.text.data(), source.text.size());
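// Delegate sentence segmentation to JavaScript: run Intl.Segmenter over the text
// and report the sentence boundaries back as UTF-8 byte offsets.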
EM_ASM({
// Attach a lazily initialized cache of the Intl.Segmenter to the WASM Module.
// This is critical to prevent creating one instance of Intl.Segmenter per
// translation request.
if (!Module.getOrCreateSentenceSegmenter) {
Module.getOrCreateSentenceSegmenter = (function() {
let segmenters = new Map();
return function(lang) {
let segmenter = segmenters.get(lang);
if (!segmenter) {
segmenter = new Intl.Segmenter(lang, { granularity: "sentence" });
segmenters.set(lang, segmenter);
}
return segmenter;
};
})();
}
// Convert the UTF-8 C++ strings into UTF-16 for JavaScript.
const inputUTF16 = UTF8ToString($0);
const lang = UTF8ToString($1);
// Segment the UTF-16 input with the Intl.Segmenter.
const segmenter = Module.getOrCreateSentenceSegmenter(lang);
const sentencesUTF16 = Array.from(segmenter.segment(inputUTF16));
const sentenceCount = sentencesUTF16.length;
// Allocate enough space to mark the start and end UTF-8 byte indices for each sentence.
const bytesPerInt = 4;
const startsPtr = _malloc(sentenceCount * bytesPerInt);
const endsPtr = _malloc(sentenceCount * bytesPerInt);
if (!startsPtr || !endsPtr) {
// Free a partially successful allocation before aborting (assumes `_free` is
// exported alongside `_malloc`).
if (startsPtr) _free(startsPtr);
if (endsPtr) _free(endsPtr);
throw new Error("Failed to allocate WASM memory for segmentation.");
}
// Iterate through all of the segments and map the start and end of each
// sentence as UTF-8 byte ranges so that the C++ code can operate on them.
let sentenceEndUTF8 = 0;
sentencesUTF16.forEach(({ segment: sentenceUTF16 }, index) => {
const sentenceStartUTF8 = sentenceEndUTF8;
sentenceEndUTF8 += lengthBytesUTF8(sentenceUTF16);
setValue(startsPtr + index * bytesPerInt, sentenceStartUTF8, 'i32');
setValue(endsPtr + index * bytesPerInt, sentenceEndUTF8, 'i32');
});
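// Report the results back through the out-pointers supplied by the C++ caller:
// $2 receives the sentence count, $3 and $4 receive the malloc'd offset arrays,
// which the ScopedPtr guards will free.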
setValue($2, sentenceCount, 'i32');
setValue($3, startsPtr, 'i32');
setValue($4, endsPtr, 'i32');
}, input_converted.data(), sourceLanguage_.c_str(), &sentenceCount, &starts.ptr, &ends.ptr);
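// Tokenize each sentence range reported by the segmenter.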
for (int32_t idx = 0; idx < sentenceCount; idx++) {
int32_t start = starts[idx];
int32_t end = ends[idx];
int32_t length = end - start;
marian::string_view sentence(source.text.data() + start, length);
std::vector<string_view> wordRanges;
Segment segment = tokenize(sentence, wordRanges);
// There are some cases where SentencePiece or the vocab returns no words
// after normalization. Skipping empty segments prevents empty entries from
// being added.
if (segment.size() > 0) {
// Wrap the segment into sub-sentences of at most maxLengthBreak_ tokens and
// record them in `source` and `segments`.
wrap(segment, wordRanges, segments, source);
}
}
}