void ProcessorFilterNativeUnittest::TestFilterNoneUtf8()

in core/unittest/processor/ProcessorFilterNativeUnittest.cpp [736:1089]


// Randomised check of ProcessorFilterNative::FilterNoneUtf8.
//
// The test builds a table of 80 byte sequences ("characters"): for each width from one to four bytes
// there are ten well-formed UTF-8 encodings followed by ten malformed ones. For every byte of every
// entry it records whether the test expects that byte to read back as ' ' after filtering. Long
// strings are then assembled from randomly chosen entries, passed through FilterNoneUtf8, and
// compared byte by byte against the recorded expectation.
//
// Table layout (slot ranges):
//   [ 0,10)  one byte, valid            [10,20)  one byte, invalid (lone continuation byte)
//   [20,30)  two bytes, valid           [30,40)  two bytes, invalid
//   [40,50)  three bytes, valid         [50,60)  three bytes, invalid
//   [60,70)  four bytes, valid          [70,80)  four bytes, invalid
//
// rand() is not seeded inside this function, and the order of rand() calls is part of what defines
// the generated data; keep that in mind when reordering statements.
void ProcessorFilterNativeUnittest::TestFilterNoneUtf8() {
    const int caseCharacterNum = 80; // ten characters for every situation
    std::string characterSet[caseCharacterNum];
    // isBlank[slot][k] is true when byte k of characterSet[slot] is expected to become ' '.
    bool isBlank[caseCharacterNum][4] = {}; // every character has 4 bytes at most

    // Draws one random byte, forces the bits in `forcedOnes` to 1 and clears every bit outside `allowedBits`.
    const auto randomByte = [](int forcedOnes, int allowedBits) -> char {
        return static_cast<char>(((rand() & 0xff) | forcedOnes) & allowedBits);
    };
    // Stores `bytes` in `slot` together with the per-byte expectation (true = replaced by a blank).
    const auto defineCase = [&](int slot, const std::string& bytes, std::initializer_list<bool> expected) {
        characterSet[slot] = bytes;
        int byteIndex = 0;
        for (const bool blank : expected) {
            isBlank[slot][byteIndex++] = blank;
        }
    };

    // one byte, valid: 0xxxxxxx
    for (int i = 0; i < 10; ++i) {
        char ascii;
        do {
            ascii = randomByte(0x00, 0x7f);
        } while (ascii == ' ' || ascii == '\t'); // must not be a space or a tab
        defineCase(i, std::string(1, ascii), {false});
    }

    // one byte, invalid: a continuation byte 10xxxxxx without a leading byte
    for (int i = 10; i < 20; ++i) {
        defineCase(i, std::string(1, randomByte(0x80, 0xbf)), {true});
    }

    // two bytes, valid: 110xxxxx 10xxxxxx with a code point in [0x80, 0x7ff]
    for (int i = 20; i < 30; ++i) {
        char lead;
        char second;
        uint16_t unicode;
        do {
            lead = randomByte(0xc0, 0xdf);
            second = randomByte(0x80, 0xbf);
            unicode = (((lead & 0x1f) << 6) | (second & 0x3f));
        } while (!(unicode >= 0x80 && unicode <= 0x7ff));
        defineCase(i, std::string{lead, second}, {false, false});
    }

    // Scratch buffers for the invalid multi-byte cases. Each section first fills them with ten
    // correctly shaped sequences and then damages individual bytes.
    char leadBytes[10], secondBytes[10], thirdBytes[10], fourthBytes[10];

    // two bytes, invalid
    for (int k = 0; k < 10; ++k) {
        leadBytes[k] = randomByte(0xc0, 0xdf);
        secondBytes[k] = randomByte(0x80, 0xbf);
    }
    // 110xxxxx 0xxxxxxx: the leading byte is blanked, the ASCII byte after it is kept
    for (int k = 0; k < 5; ++k) {
        do {
            secondBytes[k] = randomByte(0x00, 0x7f);
        } while (secondBytes[k] == ' ');
        defineCase(30 + k, std::string{leadBytes[k], secondBytes[k]}, {true, false});
    }
    // Well-formed shape but not a legal code point: two bytes carry 11 payload bits and must encode
    // [0x80, 0x7ff], so the only illegal values are < 0x80. Masking the leading byte down to
    // 1100000x produces exactly those.
    for (int k = 5; k < 10; ++k) {
        leadBytes[k] &= 0xe1;
        defineCase(30 + k, std::string{leadBytes[k], secondBytes[k]}, {true, true});
    }

    // three bytes, valid: 1110xxxx 10xxxxxx 10xxxxxx with a code point >= 0x800
    for (int i = 40; i < 50; ++i) {
        char lead;
        char second;
        char third;
        uint16_t unicode;
        do {
            lead = randomByte(0xe0, 0xef);
            second = randomByte(0x80, 0xbf);
            third = randomByte(0x80, 0xbf);
            unicode = (((lead & 0x0f) << 12) | ((second & 0x3f) << 6) | (third & 0x3f));
        } while (!(unicode >= 0x800));
        defineCase(i, std::string{lead, second, third}, {false, false, false});
    }

    // three bytes, invalid
    for (int k = 0; k < 10; ++k) {
        leadBytes[k] = randomByte(0xe0, 0xef);
        secondBytes[k] = randomByte(0x80, 0xbf);
        thirdBytes[k] = randomByte(0x80, 0xbf);
    }
    // 1110xxxx 0xxxxxxx 10xxxxxx: only the ASCII second byte is kept
    for (int k = 0; k < 2; ++k) {
        do {
            secondBytes[k] = randomByte(0x00, 0x7f);
        } while (secondBytes[k] == ' ');
        defineCase(50 + k, std::string{leadBytes[k], secondBytes[k], thirdBytes[k]}, {true, false, true});
    }
    // 1110xxxx 10xxxxxx 0xxxxxxx: only the ASCII third byte is kept
    for (int k = 2; k < 4; ++k) {
        do {
            thirdBytes[k] = randomByte(0x00, 0x7f);
        } while (thirdBytes[k] == ' ');
        defineCase(50 + k, std::string{leadBytes[k], secondBytes[k], thirdBytes[k]}, {true, true, false});
    }
    // 1110xxxx 0xxxxxxx 0xxxxxxx: both ASCII bytes are kept
    for (int k = 4; k < 6; ++k) {
        do {
            secondBytes[k] = randomByte(0x00, 0x7f);
            thirdBytes[k] = randomByte(0x00, 0x7f);
        } while (thirdBytes[k] == ' ' || secondBytes[k] == ' ');
        defineCase(50 + k, std::string{leadBytes[k], secondBytes[k], thirdBytes[k]}, {true, false, false});
    }
    // Well-formed shape but overlong: 11100000 100xxxxx 10xxxxxx encodes a code point < 0x800.
    for (int k = 6; k < 10; ++k) {
        leadBytes[k] &= 0xf0;
        secondBytes[k] &= 0xdf;
        defineCase(50 + k, std::string{leadBytes[k], secondBytes[k], thirdBytes[k]}, {true, true, true});
    }

    // four bytes, valid: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx with a code point in [0x10000, 0x10ffff]
    for (int i = 60; i < 70; ++i) {
        char lead;
        char second;
        char third;
        char fourth;
        uint32_t unicode;
        do {
            lead = randomByte(0xf0, 0xf7);
            second = randomByte(0x80, 0xbf);
            third = randomByte(0x80, 0xbf);
            fourth = randomByte(0x80, 0xbf);
            unicode = ((lead & 0x07) << 18) | ((second & 0x3f) << 12) | ((third & 0x3f) << 6) | (fourth & 0x3f);
        } while (!(unicode >= 0x00010000 && unicode <= 0x0010ffff));
        defineCase(i, std::string{lead, second, third, fourth}, {false, false, false, false});
    }

    // four bytes, invalid
    for (int k = 0; k < 10; ++k) {
        leadBytes[k] = randomByte(0xf0, 0xf7);
        secondBytes[k] = randomByte(0x80, 0xbf);
        thirdBytes[k] = randomByte(0x80, 0xbf);
        fourthBytes[k] = randomByte(0x80, 0xbf);
    }
    // 11110xxx 0xxxxxxx 10xxxxxx 10xxxxxx: only the ASCII second byte is kept
    for (int k = 0; k < 2; ++k) {
        do {
            secondBytes[k] = randomByte(0x00, 0x7f);
        } while (secondBytes[k] == ' ');
        defineCase(70 + k,
                   std::string{leadBytes[k], secondBytes[k], thirdBytes[k], fourthBytes[k]},
                   {true, false, true, true});
    }
    // 11110xxx 10xxxxxx 0xxxxxxx 10xxxxxx: only the ASCII third byte is kept
    for (int k = 2; k < 4; ++k) {
        do {
            thirdBytes[k] = randomByte(0x00, 0x7f);
        } while (thirdBytes[k] == ' ');
        defineCase(70 + k,
                   std::string{leadBytes[k], secondBytes[k], thirdBytes[k], fourthBytes[k]},
                   {true, true, false, true});
    }
    // 11110xxx 0xxxxxxx 0xxxxxxx 0xxxxxxx: all three ASCII bytes are kept
    for (int k = 4; k < 6; ++k) {
        do {
            secondBytes[k] = randomByte(0x00, 0x7f);
            thirdBytes[k] = randomByte(0x00, 0x7f);
            fourthBytes[k] = randomByte(0x00, 0x7f);
        } while (fourthBytes[k] == ' ' || secondBytes[k] == ' ' || thirdBytes[k] == ' ');
        defineCase(70 + k,
                   std::string{leadBytes[k], secondBytes[k], thirdBytes[k], fourthBytes[k]},
                   {true, false, false, false});
    }

    // Well-formed shape, but the decoded code point is outside [0x10000, 0x10ffff].

    // Below the range: 11110000 1000xxxx 10xxxxxx 10xxxxxx decodes to a value < 0x10000.
    for (int k = 6; k < 8; ++k) {
        leadBytes[k] &= 0xf0;
        secondBytes[k] &= 0x8f;
        defineCase(70 + k,
                   std::string{leadBytes[k], secondBytes[k], thirdBytes[k], fourthBytes[k]},
                   {true, true, true, true});
    }
    // Above the range: 111101xx 10x1xxxx 10xxxxxx 10xxxxxx decodes to a value >= 0x110000.
    for (int k = 8; k < 10; ++k) {
        leadBytes[k] |= 0x04;
        secondBytes[k] |= 0x10;
        defineCase(70 + k,
                   std::string{leadBytes[k], secondBytes[k], thirdBytes[k], fourthBytes[k]},
                   {true, true, true, true});
    }

    for (int round = 0; round < 10; ++round) {
        std::string testStr;
        const int CHARACTER_COUNT = 8192;
        bool expectBlank[CHARACTER_COUNT * 4]; // expectation for every byte of testStr
        int byteCount = 0; // number of entries of expectBlank filled so far
        // Build the test string from randomly chosen characters and record, byte by byte,
        // whether the filter is expected to replace it with a blank.
        for (int j = 0; j < CHARACTER_COUNT; ++j) {
            const int pick = rand() % caseCharacterNum;
            const std::string& character = characterSet[pick];
            const int width = static_cast<int>(character.size());
            testStr += character;
            for (int k = 0; k < width; ++k) {
                expectBlank[byteCount++] = isBlank[pick][k];
            }

            // When the very last character is a valid multi-byte one, cut off its final byte: the
            // string then ends in an incomplete sequence, so every remaining byte of that
            // character is expected to be blanked.
            const bool isValidMultiByte = pick >= 20 && pick % 20 < 10;
            if (j == (CHARACTER_COUNT - 1) && isValidMultiByte) {
                testStr.pop_back();
                for (int k = byteCount - width; k < byteCount - 1; ++k) {
                    expectBlank[k] = true;
                }
            }
        }
        ProcessorFilterNative processor;
        processor.FilterNoneUtf8(testStr);
        for (uint32_t indexOfString = 0; indexOfString < testStr.size(); ++indexOfString) {
            if (expectBlank[indexOfString]) {
                APSARA_TEST_EQUAL_FATAL(testStr[indexOfString], ' ');
            } else {
                APSARA_TEST_NOT_EQUAL_FATAL(testStr[indexOfString], ' ');
            }
        }
    }
} // end of case