in blingfiretools/blingfiretokdll/blingfiretokdll.cpp [182:373]
// Splits a UTF-8 paragraph into sentences using the sentence-breaking engine
// of the given model and returns them as one '\n'-separated, 0-terminated
// UTF-8 string.
//
// Parameters:
//   pInUtf8Str             - input text, UTF-8 encoded (need not be 0-terminated)
//   InUtf8StrByteCount     - input size in bytes
//   pOutUtf8Str            - receives the sentences; written only when the whole
//                            result (including the trailing 0) fits
//   pStartOffsets          - optional; per-sentence start offset taken from the
//                            offsets reported by FAStrUtf8ToArray
//   pEndOffsets            - optional; per-sentence offset of the last byte of the
//                            last character of the sentence
//   MaxOutUtf8StrByteCount - capacity of pOutUtf8Str in bytes; the same number is
//                            used as the capacity (in ints) of both offset arrays
//   hModel                 - model handle; NULL selects the default model unless
//                            SIZE_OPTIMIZATION is defined, in which case NULL is
//                            an error
//
// Returns:
//   -1 on error, 0 for empty input, otherwise the number of bytes of the result
//   including the trailing 0. If the returned value is larger than
//   MaxOutUtf8StrByteCount nothing was copied into pOutUtf8Str.
const int TextToSentencesWithOffsetsWithModel(const char * pInUtf8Str, int InUtf8StrByteCount,
    char * pOutUtf8Str, int * pStartOffsets, int * pEndOffsets, const int MaxOutUtf8StrByteCount,
    void * hModel)
{
#ifdef SIZE_OPTIMIZATION
    // no built-in default model in this configuration
    if (NULL == hModel) {
        return -1;
    }
#else
    // check if the initialization is needed
    // NOTE(review): the unlocked read below is only race-free if g_fInitialized
    // is an atomic type; its declaration is not visible here - confirm.
    if (false == g_fInitialized) {
        // make sure only one thread can get the mutex
        std::lock_guard<std::mutex> guard(g_InitializationMutex);
        // see if the g_fInitialized is still false
        if (false == g_fInitialized) {
            InitializeWbdSbd();
            g_fInitialized = true;
        }
    }
    // use the default model if it was not provided
    if (NULL == hModel) {
        hModel = &g_DefaultSbd;
    }
#endif

    // get the types right, hModel is always defined
    const FAModelData * pModel = (const FAModelData *) hModel;

    // validate the parameters
    if (0 == InUtf8StrByteCount) {
        return 0;
    }
    if (0 > InUtf8StrByteCount || InUtf8StrByteCount > FALimits::MaxArrSize) {
        return -1;
    }
    if (NULL == pInUtf8Str) {
        return -1;
    }

    // buffers for the UTF-32 text and for the offset reported for each UTF-32
    // element; both are non-empty here, so data() cannot be NULL (an allocation
    // failure is reported by std::vector via an exception, not a NULL pointer)
    std::vector< int > utf32input(InUtf8StrByteCount);
    int * pBuff = utf32input.data();
    std::vector< int > utf32offsets(InUtf8StrByteCount);
    int * pOffsets = utf32offsets.data();

    // make sure there are no uninitialized offsets; a non-positive capacity must
    // be skipped, otherwise the byte count wraps around when converted to size_t
    if (0 < MaxOutUtf8StrByteCount) {
        const size_t OffsetsByteCount = sizeof(int) * static_cast<size_t>(MaxOutUtf8StrByteCount);
        if (pStartOffsets) {
            memset(pStartOffsets, 0, OffsetsByteCount);
        }
        if (pEndOffsets) {
            memset(pEndOffsets, 0, OffsetsByteCount);
        }
    }

    // convert input to UTF-32
    const int MaxBuffSize = ::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff, pOffsets, InUtf8StrByteCount);
    if (MaxBuffSize <= 0 || MaxBuffSize > InUtf8StrByteCount) {
        return -1;
    }

    // make sure the utf32input does not contain 'U+0000' elements
    std::replace(pBuff, pBuff + MaxBuffSize, 0, 0x20);

    // temporary buffer for the UTF-8 text of one sentence (+1 for the trailing 0)
    std::vector< char > utf8output(InUtf8StrByteCount + 1);
    char * pTmpUtf8 = utf8output.data();

    // keep sentence boundary information here
    // NOTE(review): MaxBuffSize * 3 is computed in int; confirm that
    // FALimits::MaxArrSize keeps the product below INT_MAX.
    std::vector< int > SbdRes(MaxBuffSize * 3);
    int * pSbdRes = SbdRes.data();

    // get the sentence breaking results, they are consumed as triplets below
    const int SbdOutSize = pModel->m_Engine.Process(pBuff, MaxBuffSize, pSbdRes, MaxBuffSize * 3);
    if (SbdOutSize > MaxBuffSize * 3 || 0 != SbdOutSize % 3) {
        return -1;
    }

    // number of sentences
    int SentCount = 0;
    // accumulate the output here
    std::ostringstream Os;
    // keep track if a sentence was already added
    bool fAdded = false;

    // Appends the text in pBuff[From..To] (inclusive) to the output as one
    // sentence and records its offsets. Leading whitespace is skipped; a range
    // that is empty or whitespace-only is ignored. Returns false only if the
    // UTF-8 conversion reports an unexpected size.
    auto EmitSentence = [&](const int From, const int To) -> bool {
        const int Len = To - From + 1;
        // adjust sentence start if needed
        const int Delta = FAGetFirstNonWhiteSpace(pBuff + From, Len);
        if (Delta >= Len) {
            return true;
        }
        // convert the sentence to UTF-8 using the temporary buffer
        const int StrOutSize = ::FAArrayToStrUtf8(pBuff + From + Delta, Len - Delta, pTmpUtf8, InUtf8StrByteCount);
        // check the output size
        if (0 > StrOutSize || StrOutSize > InUtf8StrByteCount) {
            // should never happen, but happened :-(
            return false;
        }
        // offsets are stored only while there is room in the caller's arrays
        if (pStartOffsets && SentCount < MaxOutUtf8StrByteCount) {
            pStartOffsets[SentCount] = pOffsets[From + Delta];
        }
        if (pEndOffsets && SentCount < MaxOutUtf8StrByteCount) {
            // point at the last byte of the last character of the sentence
            const int ToCharSize = ::FAUtf8Size(pInUtf8Str + pOffsets[To]);
            pEndOffsets[SentCount] = pOffsets[To] + (0 < ToCharSize ? ToCharSize - 1 : 0);
        }
        SentCount++;
        // add a new line separator
        if (fAdded) {
            Os << '\n';
        }
        // make sure this buffer does not contain '\n' since it is a delimiter
        std::replace(pTmpUtf8, pTmpUtf8 + StrOutSize, '\n', ' ');
        // actually copy the data into the string builder
        pTmpUtf8[StrOutSize] = 0;
        Os << pTmpUtf8;
        fAdded = true;
        return true;
    };

    // set previous sentence end to -1
    int PrevEnd = -1;

    for (int i = 0; i < SbdOutSize; i += 3) {
        // we don't care about Tag or From for p2s task, a sentence starts right
        // after the previous one ends
        const int From = PrevEnd + 1;
        const int To = pSbdRes[i + 2];
        // the boundary is used as an index into pBuff and pOffsets, so reject
        // values that do not refer to an element of the converted text
        if (0 > To || To >= MaxBuffSize) {
            return -1;
        }
        PrevEnd = To;
        if (!EmitSentence(From, To)) {
            return -1;
        }
    }

    // always use the end of paragraph as the end of sentence
    if (PrevEnd + 1 < MaxBuffSize) {
        if (!EmitSentence(PrevEnd + 1, MaxBuffSize - 1)) {
            return -1;
        }
    }

    // we will include the 0 just in case some scripting languages expect 0-terminated buffers and cannot use the size
    Os << char(0);

    // get the actual output buffer as one string
    const std::string OsStr = Os.str();
    const int StrLen = (int) OsStr.length();

    // copy only if there is a destination and everything fits; the required
    // size is returned either way so the caller can retry with a larger buffer
    if (NULL != pOutUtf8Str && StrLen <= MaxOutUtf8StrByteCount) {
        memcpy(pOutUtf8Str, OsStr.c_str(), StrLen);
    }
    return StrLen;
}