in blingfiretools/blingfiretokdll/blingfiretokdll.cpp [1368:1553]
const int TextToIdsWithOffsets_sp(
void* ModelPtr,
const char * pInUtf8Str,
int InUtf8StrByteCount,
int32_t * pIdsArr,
int * pStartOffsets,
int * pEndOffsets,
const int MaxIdsArrLength,
const int UnkId = 0
)
{
// validate the parameters
if (0 >= InUtf8StrByteCount || InUtf8StrByteCount > FALimits::MaxArrSize || NULL == pInUtf8Str || 0 == ModelPtr) {
return 0;
}
// allocate buffer for UTF-8 --> UTF-32 conversion
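// worst case: one UTF-32 code point (or byte) per input byte, plus one slot for the prepended space mark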
std::vector< int > utf32input(InUtf8StrByteCount + 1);
int * pBuff = utf32input.data();
if (NULL == pBuff) {
return 0;
}
pBuff[0] = __FASpDelimiter__; // always add a space at the beginning; SP uses U+2581 as the space mark
// a container for the offsets
std::vector< int > utf32offsets;
int * pOffsets = NULL;
// flag to alter the logic in case we don't need the offsets
const bool fNeedOffsets = NULL != pStartOffsets && NULL != pEndOffsets;
if (fNeedOffsets) {
utf32offsets.resize(InUtf8StrByteCount + 1);
pOffsets = utf32offsets.data();
if (NULL == pOffsets) {
return 0;
}
pOffsets[0] = -1; // the offset of the prepended space (it is not part of the input)
}
// get the model data
const FAModelData * pModelData = (const FAModelData *)ModelPtr;
const FADictConfKeeper * pConf = &(pModelData->m_DictConf);
const FAMultiMapCA * pCharMap = pConf->GetCharMap ();
// see if we need a dummy space added at the beginning
const int BUFF_DATA_OFFSET = pConf->GetNoDummyPrefix() ? 0 : 1;
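// 0 means no dummy prefix (the conversion below overwrites the space written at pBuff[0]), 1 keeps the prepended space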
// convert input to UTF-32 or bytes (write output past the added first space)
int BuffSize;
if(false == pModelData->m_useRawBytes) {
BuffSize = fNeedOffsets ?
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, pOffsets + BUFF_DATA_OFFSET, InUtf8StrByteCount) :
::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, InUtf8StrByteCount);
} else {
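// raw-byte mode: copy each input byte as a separate element instead of decoding UTF-8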
BuffSize = fNeedOffsets ?
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, pOffsets + BUFF_DATA_OFFSET, InUtf8StrByteCount) :
::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, InUtf8StrByteCount);
}
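// the element count can never exceed the input byte count, so a larger value indicates a conversion error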
if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
return 0;
}
BuffSize += BUFF_DATA_OFFSET; // to accommodate the first space
// needed for normalization
std::vector< int > utf32input_norm;
int * pNormBuff = NULL;
std::vector< int > utf32norm_offsets;
int * pNormOffsets = NULL;
// do normalization, if needed
if (NULL != pCharMap) {
const int MaxNormBuffSize = (InUtf8StrByteCount + 1) * 2;
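// the char map may expand one character into several, hence twice the input size is reserved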
utf32input_norm.resize(MaxNormBuffSize);
pNormBuff = utf32input_norm.data();
if (NULL == pNormBuff) {
return 0;
}
if (fNeedOffsets) {
utf32norm_offsets.resize(MaxNormBuffSize);
pNormOffsets = utf32norm_offsets.data();
if (NULL == pNormOffsets) {
return 0;
}
}
// do the normalization for the entire input
const int ActualNormBuffSize = fNeedOffsets ?
::FANormalize(pBuff, BuffSize, pNormBuff, pNormOffsets, MaxNormBuffSize, pCharMap) :
::FANormalize(pBuff, BuffSize, pNormBuff, MaxNormBuffSize, pCharMap);
if (ActualNormBuffSize <= 0 || ActualNormBuffSize > MaxNormBuffSize) {
pCharMap = NULL;
// don't proceed without normalization; TODO: in ~99% of cases normalization changes nothing, so it would be ok to proceed
return 0;
} else {
BuffSize = ActualNormBuffSize;
pBuff = pNormBuff;
}
}
// Replace every space sequence with U+2581 in-place
//
// Note: this operation affects the offsets. Since the output sequence is always
// the same length or shorter, the offsets can be updated in-place.
// If normalization is enabled, the final offsets are a composition of the
// normalization and UTF-32 offsets, so the transformation is applied to the
// normalization offsets; otherwise it is applied to the UTF-32 offsets.
//
int * pAdjustedOffsets = fNeedOffsets ? (NULL != pCharMap ? pNormOffsets : pOffsets) : NULL;
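// pAdjustedOffsets tracks positions for pBuff: normalization offsets if a char map was applied, UTF-32 offsets otherwise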
int i = 0; // index for reading
int j = 0; // index for writing
while (i < BuffSize) {
const int Ci = pBuff[i];
// check if the Ci is not a space
if (!__FAIsWhiteSpace__(Ci)) {
// copy it
pBuff[j] = Ci;
if (fNeedOffsets) {
pAdjustedOffsets[j] = pAdjustedOffsets[i];
}
j++;
// Ci is a space: emit a normalized space only if the previous output character is not already one
} else if (0 == j || __FASpDelimiter__ != pBuff[j - 1]) {
// copy normalized space
pBuff[j] = __FASpDelimiter__;
if (fNeedOffsets) {
pAdjustedOffsets[j] = pAdjustedOffsets[i];
}
j++;
}
i++;
} // of while ...
// trim the trailing space if there were no content characters after it
if (1 < j && pBuff[j - 1] == __FASpDelimiter__) {
j--;
}
// adjust the length
BuffSize = j;
// do the segmentation
const int WbdResMaxSize = BuffSize * 3;
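// the tokenizer emits 3 ints per token: id, token start and token end positions within pBuff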
std::vector< int > WbdResults(WbdResMaxSize);
int * pWbdResults = WbdResults.data ();
// tokenize input with a selected algorithm
const int WbdOutSize = pModelData->m_pAlgo->Process (pBuff, BuffSize, pWbdResults, WbdResMaxSize, UnkId);
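// the result must fit into the buffer and consist of whole (id, from, to) triplets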
if (WbdOutSize > WbdResMaxSize || 0 != WbdOutSize % 3) {
return 0;
}
int OutSize = 0;
int IdOffset = pConf->GetIdOffset (); // see if we need to shift output IDs by a constant
// return the ids only
for (int i = 0; i < WbdOutSize && OutSize < MaxIdsArrLength; i += 3) {
// copy id
const int id = pWbdResults [i];
pIdsArr [OutSize] = id + IdOffset;
// copy offsets if needed
if (fNeedOffsets) {
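// map token positions back to UTF-8 byte offsets: a normalized position maps through pNormOffsets to a UTF-32 position, which maps through pOffsets to a byte offset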
const int TokenFrom = pWbdResults [i + 1];
const int FromOffset = pOffsets[(pCharMap) ? pNormOffsets [TokenFrom] : TokenFrom];
pStartOffsets[OutSize] = FromOffset;
const int TokenTo = pWbdResults [i + 2];
const int ToOffset = pOffsets[(pCharMap) ? pNormOffsets [TokenTo] : TokenTo];
const int ToCharSize = ::FAUtf8Size(pInUtf8Str + ToOffset);
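// the end offset points at the last byte of the token's final UTF-8 character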
pEndOffsets[OutSize] = ToOffset + (0 < ToCharSize ? ToCharSize - 1 : 0);
}
OutSize++;
}
return OutSize;
}