const int TextToIdsWithOffsets_sp()

in blingfiretools/blingfiretokdll/blingfiretokdll.cpp [1368:1553]


//
// Converts a UTF-8 string into a sequence of token ids using the model behind
// ModelPtr (SentencePiece-style preprocessing: optional leading space mark,
// optional character normalization, whitespace runs collapsed into a single
// U+2581 mark), and optionally reports per-token offsets into the input.
//
// Parameters:
//  ModelPtr           - opaque handle, reinterpreted as a const FAModelData *
//  pInUtf8Str         - input text, InUtf8StrByteCount bytes long
//  InUtf8StrByteCount - input length in bytes, must be in 1..FALimits::MaxArrSize
//  pIdsArr            - output array for the ids (each id is shifted by the
//                       model's id offset), must hold MaxIdsArrLength elements
//  pStartOffsets,
//  pEndOffsets        - optional output arrays (MaxIdsArrLength elements each);
//                       offsets are computed only if BOTH are non-NULL. For a
//                       token the start value is the byte offset of its first
//                       character and the end value is the byte offset of the
//                       last byte of its last character. A boundary that falls
//                       on the automatically prepended space is reported as -1,
//                       since that space is not a part of the input.
//  MaxIdsArrLength    - maximum number of tokens to write to the output arrays
//  UnkId              - id passed to the tokenization algorithm for unknown input
//
// Returns the number of tokens written (never more than MaxIdsArrLength), or 0
// if the parameters are invalid or any of the processing steps fails.
//
const int TextToIdsWithOffsets_sp(
        void* ModelPtr,
        const char * pInUtf8Str,
        int InUtf8StrByteCount,
        int32_t * pIdsArr,
        int * pStartOffsets, 
        int * pEndOffsets,
        const int MaxIdsArrLength,
        const int UnkId = 0
)
{
    // validate the parameters
    if (0 >= InUtf8StrByteCount || InUtf8StrByteCount > FALimits::MaxArrSize || NULL == pInUtf8Str || 0 == ModelPtr) {
        return 0;
    }
    // the ids are always written, so there is nothing we can return without the output array
    if (NULL == pIdsArr) {
        return 0;
    }

    // allocate buffer for UTF-8 --> UTF-32 conversion
    std::vector< int > utf32input(InUtf8StrByteCount + 1);
    int * pBuff = utf32input.data();
    if (NULL == pBuff) {
        return 0;
    }
    pBuff[0] = __FASpDelimiter__; // always add a space in the beginning, SP uses U+2581 as a space mark

    // a container for the offsets
    std::vector< int > utf32offsets;
    int * pOffsets = NULL;

    // flag to alter the logic in case we don't need the offsets
    const bool fNeedOffsets = NULL != pStartOffsets && NULL != pEndOffsets;

    if (fNeedOffsets) {
        utf32offsets.resize(InUtf8StrByteCount + 1);
        pOffsets = utf32offsets.data();
        if (NULL == pOffsets) {
            return 0;
        }
        pOffsets[0] = -1; // an offset of the appended space (it is not a part of the input)
    }

    // get the model data
    const FAModelData * pModelData = (const FAModelData *)ModelPtr;
    const FADictConfKeeper * pConf = &(pModelData->m_DictConf);
    const FAMultiMapCA * pCharMap = pConf->GetCharMap ();

    // see if we need to get a dummy space added; if not, the converted text
    // is written from position 0 and overwrites the space mark set above
    const int BUFF_DATA_OFFSET = pConf->GetNoDummyPrefix() ? 0 : 1;

    // convert input to UTF-32 or bytes (write output past the added first space)
    int BuffSize;
    if(false == pModelData->m_useRawBytes) {
        BuffSize = fNeedOffsets ? 
            ::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, pOffsets + BUFF_DATA_OFFSET, InUtf8StrByteCount) :
            ::FAStrUtf8ToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, InUtf8StrByteCount);
    } else {
        BuffSize = fNeedOffsets ? 
            ::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, pOffsets + BUFF_DATA_OFFSET, InUtf8StrByteCount) :
            ::FAStrUtf8AsBytesToArray(pInUtf8Str, InUtf8StrByteCount, pBuff + BUFF_DATA_OFFSET, InUtf8StrByteCount);
    }
    if (BuffSize <= 0 || BuffSize > InUtf8StrByteCount) {
        return 0;
    }
    BuffSize += BUFF_DATA_OFFSET; // to accommodate the first space

    // needed for normalization
    std::vector< int > utf32input_norm;
    int * pNormBuff = NULL;
    std::vector< int > utf32norm_offsets;
    int * pNormOffsets = NULL;

    // do normalization, if needed
    if (NULL != pCharMap) {

        const int MaxNormBuffSize = (InUtf8StrByteCount + 1) * 2;
        utf32input_norm.resize(MaxNormBuffSize);
        pNormBuff = utf32input_norm.data();
        if (NULL == pNormBuff) {
            return 0;
        }
        if (fNeedOffsets) {
            utf32norm_offsets.resize(MaxNormBuffSize);
            pNormOffsets = utf32norm_offsets.data();
            if (NULL == pNormOffsets) {
                return 0;
            }
        }

        // do the normalization for the entire input
        const int ActualNormBuffSize = fNeedOffsets ? 
            ::FANormalize(pBuff, BuffSize, pNormBuff, pNormOffsets, MaxNormBuffSize, pCharMap) :
            ::FANormalize(pBuff, BuffSize, pNormBuff, MaxNormBuffSize, pCharMap);

        if (ActualNormBuffSize <= 0 || ActualNormBuffSize > MaxNormBuffSize) {
            // don't proceed without normalization, TODO: 99% times it does not change anything... so it is ok to proceed
            return 0;
        }

        // from now on work with the normalized text
        BuffSize = ActualNormBuffSize;
        pBuff = pNormBuff;
    }

    // Replace every space sequence with U+2581 in-place
    //
    // Note: This operation affects offsets. Since the output sequence is always the 
    //       same length or shorter we can update the offsets in-place.
    //       If normalization is enabled the offsets are computed as a superposition of
    //       normalization and utf-32 offsets, then transformation should be applied to 
    //       normalization offsets, otherwise to utf-32 offsets.
    //
    int * pAdjustedOffsets = fNeedOffsets ? (NULL != pCharMap ? pNormOffsets : pOffsets) : NULL;

    int i = 0; // index for reading
    int j = 0; // index for writing
    while (i < BuffSize) {

        const int Ci = pBuff[i];

        // check if the Ci is not a space
        if (!__FAIsWhiteSpace__(Ci)) {
            // copy it
            pBuff[j] = Ci;
            if (fNeedOffsets) {
                pAdjustedOffsets[j] = pAdjustedOffsets[i];
            }
            j++;
        // if Ci is a space, check if the previous character was not a space
        } else if (0 == j || __FASpDelimiter__ != pBuff[j - 1]) {
            // copy normalized space
            pBuff[j] = __FASpDelimiter__;
            if (fNeedOffsets) {
                pAdjustedOffsets[j] = pAdjustedOffsets[i];
            }
            j++;
        }

        i++;

    } // of while ...

    // trim the final space if there was no content characters after
    // (a lone space mark at position 0 is kept, so the text is never empty)
    if (1 < j && pBuff[j - 1] == __FASpDelimiter__) {
        j--;
    }

    // adjust the length
    BuffSize = j;

    // do the segmentation, the results come as triplets: <id, from, to>
    const int WbdResMaxSize = BuffSize * 3;
    std::vector< int > WbdResults(WbdResMaxSize);
    int * pWbdResults = WbdResults.data ();

    // tokenize input with a selected algorithm
    const int WbdOutSize = pModelData->m_pAlgo->Process (pBuff, BuffSize, pWbdResults, WbdResMaxSize, UnkId);
    if (WbdOutSize > WbdResMaxSize || 0 != WbdOutSize % 3) {
        return 0;
    }

    int OutSize = 0;
    const int IdOffset = pConf->GetIdOffset (); // see if we need to shift output IDs by a constant

    // copy the ids (and the offsets, if requested) until the output arrays are full
    for (int k = 0; k < WbdOutSize && OutSize < MaxIdsArrLength; k += 3) {

        // copy id
        const int id = pWbdResults [k];
        pIdsArr [OutSize] = id + IdOffset;

        // copy offsets if needed
        if (fNeedOffsets) {

            // map the token boundaries back to the input: with normalization
            // enabled go through the normalization offsets first
            const int TokenFrom = pWbdResults [k + 1];
            const int FromOffset = pOffsets[(pCharMap) ? pNormOffsets [TokenFrom] : TokenFrom];
            pStartOffsets[OutSize] = FromOffset;

            const int TokenTo = pWbdResults [k + 2];
            const int ToOffset = pOffsets[(pCharMap) ? pNormOffsets [TokenTo] : TokenTo];

            // a token which ends at the prepended space has a negative offset (-1),
            // there is no input character behind it, so we must not read
            // pInUtf8Str at that position; the end offset stays -1 in this case
            const int ToCharSize = (0 <= ToOffset) ? ::FAUtf8Size(pInUtf8Str + ToOffset) : 0;

            // the end offset points to the last byte of the last character
            pEndOffsets[OutSize] = ToOffset + (0 < ToCharSize ? ToCharSize - 1 : 0);
        }

        OutSize++;
    }

    return OutSize;
}