in core/unittest/processor/ProcessorFilterNativeUnittest.cpp [736:1089]
// Randomized test for ProcessorFilterNative::FilterNoneUtf8.
//
// Phase 1 builds a pool of 80 byte sequences ("characters"), 10 per category, and
// records in isBlunk, byte by byte, whether that byte is expected to be a space
// after filtering ("blunk" is this file's spelling of "blank"):
//
//   [ 0,10)  valid 1-byte            0xxxxxxx (never ' ' or '\t')
//   [10,20)  lone continuation byte  10xxxxxx
//   [20,30)  valid 2-byte            code point in [0x80, 0x7ff]
//   [30,35)  malformed 2-byte        110xxxxx 0xxxxxxx
//   [35,40)  overlong 2-byte         well-formed bytes, code point < 0x80
//   [40,50)  valid 3-byte            code point >= 0x800
//   [50,52)  malformed 3-byte        1110xxxx 0xxxxxxx 10xxxxxx
//   [52,54)  malformed 3-byte        1110xxxx 10xxxxxx 0xxxxxxx
//   [54,56)  malformed 3-byte        1110xxxx 0xxxxxxx 0xxxxxxx
//   [56,60)  overlong 3-byte         well-formed bytes, code point < 0x800
//   [60,70)  valid 4-byte            code point in [0x10000, 0x10ffff]
//   [70,72)  malformed 4-byte        11110xxx 0xxxxxxx 10xxxxxx 10xxxxxx
//   [72,74)  malformed 4-byte        11110xxx 10xxxxxx 0xxxxxxx 10xxxxxx
//   [74,76)  malformed 4-byte        11110xxx 0xxxxxxx 0xxxxxxx 0xxxxxxx
//   [76,78)  overlong 4-byte         well-formed bytes, code point < 0x10000
//   [78,80)  out-of-range 4-byte     well-formed bytes, code point > 0x10ffff
//
// Phase 2 runs 10 rounds: each round concatenates 8192 randomly chosen pool entries,
// calls FilterNoneUtf8 on the string, and asserts for every byte that it is ' ' exactly
// when the recorded expectation says so.
//
// rand() is not seeded in this function, so the generated data depends on whatever
// seeding (if any) happened earlier in the process. The order of rand() calls and the
// in-place reuse of randArr1..4 between sections determine the fixture; do not reorder.
void ProcessorFilterNativeUnittest::TestFilterNoneUtf8() {
const int caseCharacterNum = 80; // 8 categories x 10 pool entries each
std::string characterSet[caseCharacterNum];
// isBlunk[i][k]: byte k of pool entry i is expected to become ' '. An entry has at most
// 4 bytes; only the slots for bytes that actually exist are initialized and read.
bool isBlunk[caseCharacterNum][4];
// [0,10): valid one-byte characters (0xxxxxxx).
for (int i = 0; i < 10; ++i) {
char tmp;
do {
tmp = rand() & 0xff;
tmp &= 0x7f;
} while (tmp == 32 || tmp == 9); // reject space and tab so a kept byte is never ' '
characterSet[i] = std::string(1, tmp);
isBlunk[i][0] = false;
}
// [10,20): a single continuation byte (10xxxxxx) with no lead byte; expected to be blanked.
for (int i = 10; i < 20; ++i) {
char tmp;
do {
tmp = rand() & 0xff;
tmp |= 0x80;
tmp &= 0xbf;
} while (tmp == 32 || tmp == 9); // bit 7 is always set here, so this never repeats
characterSet[i] = std::string(1, tmp);
isBlunk[i][0] = true;
}
// [20,30): valid two-byte characters; retry until the decoded value is not overlong.
for (int i = 20; i < 30; ++i) {
char tmp1, tmp2;
uint16_t unicode;
do {
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp1 |= 0xc0;
tmp1 &= 0xdf; // tmp1 is now 110x xxxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
unicode = (((tmp1 & 0x1f) << 6) | (tmp2 & 0x3f));
} while (!(unicode >= 0x80 && unicode <= 0x7ff));
characterSet[i] = std::string(1, tmp1) + std::string(1, tmp2);
isBlunk[i][0] = false;
isBlunk[i][1] = false;
}
// [30,40): invalid two-byte sequences.
// randArr1..4 are scratch buffers holding byte 1..4 of ten candidate sequences; they are
// refilled and reused by the three- and four-byte invalid sections further down.
// Start from ten well-formed pairs (110xxxxx 10xxxxxx), then break them below.
char randArr1[10], randArr2[10], randArr3[10], randArr4[10];
for (int i = 0; i < 10; ++i) {
char tmp1, tmp2;
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp1 |= 0xc0;
tmp1 &= 0xdf; // tmp1 is now 110x xxxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
randArr1[i] = tmp1;
randArr2[i] = tmp2;
}
// Slots 0..4: replace the second byte with a non-space 0xxxxxxx byte.
for (int i = 0; i < 5; ++i) {
do {
randArr2[i] = rand() & 0xff;
randArr2[i] &= 0x7f;
} while (randArr2[i] == 32);
}
// [30,35): 110xxxxx 0xxxxxxx -> lead byte blanked, the trailing one-byte character kept.
for (int index = 30; index < 35; ++index) {
characterSet[index] = std::string(1, randArr1[index - 30]) + std::string(1, randArr2[index - 30]);
isBlunk[index][0] = true;
isBlunk[index][1] = false;
}
// [35,40): bytes are well-formed (110xxxxx 10xxxxxx) but the encoding is overlong.
for (int index = 35; index < 40; ++index) {
// Masking the lead byte to 1100 000x leaves at most 7 payload bits, so the decoded
// value is < 0x80, below the valid two-byte range [0x80, 0x7ff].
randArr1[index - 30] &= 0xe1;
characterSet[index] = std::string(1, randArr1[index - 30]) + std::string(1, randArr2[index - 30]);
isBlunk[index][0] = true;
isBlunk[index][1] = true;
}
// [40,50): valid three-byte characters; retry until the decoded value is not overlong.
// The 16 payload bits cannot exceed 0xffff, so only the lower bound is checked.
// NOTE(review): the surrogate range 0xD800-0xDFFF is not excluded, so this test expects
// such sequences to be kept - confirm that matches the intended FilterNoneUtf8 behavior.
for (int i = 40; i < 50; ++i) {
char tmp1, tmp2, tmp3;
uint16_t unicode;
do {
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp3 = rand() & 0xff;
tmp1 |= 0xe0;
tmp1 &= 0xef; // tmp1 is now 1110 xxxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
tmp3 |= 0x80;
tmp3 &= 0xbf; // tmp3 is now 10xx xxxx
unicode = (((tmp1 & 0x0f) << 12) | ((tmp2 & 0x3f) << 6) | (tmp3 & 0x3f));
} while (!(unicode >= 0x800));
characterSet[i] = std::string(1, tmp1) + std::string(1, tmp2) + std::string(1, tmp3);
isBlunk[i][0] = false;
isBlunk[i][1] = false;
isBlunk[i][2] = false;
}
// [50,60): invalid three-byte sequences.
// Refill the scratch buffers with ten well-formed triples (1110xxxx 10xxxxxx 10xxxxxx);
// the sub-cases below each break a different pair of slots.
for (int i = 50; i < 60; ++i) {
char tmp1, tmp2, tmp3;
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp3 = rand() & 0xff;
tmp1 |= 0xe0;
tmp1 &= 0xef; // tmp1 is now 1110 xxxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
tmp3 |= 0x80;
tmp3 &= 0xbf; // tmp3 is now 10xx xxxx
randArr1[i - 50] = tmp1;
randArr2[i - 50] = tmp2;
randArr3[i - 50] = tmp3;
}
// [50,52): 1110xxxx 0xxxxxxx 10xxxxxx -> lead blanked, middle byte kept, stray continuation blanked.
for (int i = 50; i < 52; ++i) {
do {
randArr2[i - 50] = rand() & 0xff;
randArr2[i - 50] &= 0x7f; // second byte becomes 0xxx xxxx
} while (randArr2[i - 50] == 32);
characterSet[i]
= std::string(1, randArr1[i - 50]) + std::string(1, randArr2[i - 50]) + std::string(1, randArr3[i - 50]);
isBlunk[i][0] = true;
isBlunk[i][1] = false;
isBlunk[i][2] = true;
}
// [52,54): 1110xxxx 10xxxxxx 0xxxxxxx -> first two bytes blanked, trailing one-byte character kept.
for (int i = 52; i < 54; ++i) {
do {
randArr3[i - 50] = rand() & 0xff;
randArr3[i - 50] &= 0x7f; // third byte becomes 0xxx xxxx
} while (randArr3[i - 50] == 32);
characterSet[i]
= std::string(1, randArr1[i - 50]) + std::string(1, randArr2[i - 50]) + std::string(1, randArr3[i - 50]);
isBlunk[i][0] = true;
isBlunk[i][1] = true;
isBlunk[i][2] = false;
}
// [54,56): 1110xxxx 0xxxxxxx 0xxxxxxx -> lead blanked, both one-byte characters kept.
for (int i = 54; i < 56; ++i) {
do {
randArr2[i - 50] = rand() & 0xff;
randArr2[i - 50] &= 0x7f; // second byte becomes 0xxx xxxx
randArr3[i - 50] = rand() & 0xff;
randArr3[i - 50] &= 0x7f; // third byte becomes 0xxx xxxx
} while (randArr3[i - 50] == 32 || randArr2[i - 50] == 32);
characterSet[i]
= std::string(1, randArr1[i - 50]) + std::string(1, randArr2[i - 50]) + std::string(1, randArr3[i - 50]);
isBlunk[i][0] = true;
isBlunk[i][1] = false;
isBlunk[i][2] = false;
}
// [56,60): bytes are well-formed but the encoding is overlong (decoded value < 0x800).
for (int i = 56; i < 60; ++i) {
randArr1[i - 50] &= 0xf0; // lead byte becomes 1110 0000
randArr2[i - 50] &= 0xdf; // second byte becomes 100x xxxx -> 1110 0000 100xxxxx 10xxxxxx
characterSet[i]
= std::string(1, randArr1[i - 50]) + std::string(1, randArr2[i - 50]) + std::string(1, randArr3[i - 50]);
isBlunk[i][0] = true;
isBlunk[i][1] = true;
isBlunk[i][2] = true;
}
// [60,70): valid four-byte characters; retry until the decoded value is in [0x10000, 0x10ffff].
for (int i = 60; i < 70; ++i) {
char tmp1, tmp2, tmp3, tmp4;
uint32_t unicode;
do {
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp3 = rand() & 0xff;
tmp4 = rand() & 0xff;
tmp1 |= 0xf0;
tmp1 &= 0xf7; // tmp1 is now 1111 0xxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
tmp3 |= 0x80;
tmp3 &= 0xbf; // tmp3 is now 10xx xxxx
tmp4 |= 0x80;
tmp4 &= 0xbf; // tmp4 is now 10xx xxxx
unicode = ((tmp1 & 0x07) << 18) | ((tmp2 & 0x3f) << 12) | ((tmp3 & 0x3f) << 6) | (tmp4 & 0x3f);
} while (!(unicode >= 0x00010000 && unicode <= 0x0010ffff));
characterSet[i] = std::string(1, tmp1) + std::string(1, tmp2) + std::string(1, tmp3) + std::string(1, tmp4);
isBlunk[i][0] = false;
isBlunk[i][1] = false;
isBlunk[i][2] = false;
isBlunk[i][3] = false;
}
// [70,80): invalid four-byte sequences.
// Refill the scratch buffers with ten well-formed quadruples
// (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx); the sub-cases below each break a different pair of slots.
for (int i = 70; i < 80; ++i) {
char tmp1, tmp2, tmp3, tmp4;
tmp1 = rand() & 0xff;
tmp2 = rand() & 0xff;
tmp3 = rand() & 0xff;
tmp4 = rand() & 0xff;
tmp1 |= 0xf0;
tmp1 &= 0xf7; // tmp1 is now 1111 0xxx
tmp2 |= 0x80;
tmp2 &= 0xbf; // tmp2 is now 10xx xxxx
tmp3 |= 0x80;
tmp3 &= 0xbf; // tmp3 is now 10xx xxxx
tmp4 |= 0x80;
tmp4 &= 0xbf; // tmp4 is now 10xx xxxx
randArr1[i - 70] = tmp1;
randArr2[i - 70] = tmp2;
randArr3[i - 70] = tmp3;
randArr4[i - 70] = tmp4;
}
// [70,72): 11110xxx 0xxxxxxx 10xxxxxx 10xxxxxx -> only the one-byte character in position 1 is kept.
for (int i = 70; i < 72; ++i) {
do {
randArr2[i - 70] = rand() & 0xff;
randArr2[i - 70] &= 0x7f; // second byte becomes 0xxx xxxx
} while (randArr2[i - 70] == 32);
characterSet[i] = std::string(1, randArr1[i - 70]) + std::string(1, randArr2[i - 70])
+ std::string(1, randArr3[i - 70]) + std::string(1, randArr4[i - 70]);
isBlunk[i][0] = true;
isBlunk[i][1] = false;
isBlunk[i][2] = true;
isBlunk[i][3] = true;
}
// [72,74): 11110xxx 10xxxxxx 0xxxxxxx 10xxxxxx -> only the one-byte character in position 2 is kept.
for (int i = 72; i < 74; ++i) {
do {
randArr3[i - 70] = rand() & 0xff;
randArr3[i - 70] &= 0x7f; // third byte becomes 0xxx xxxx
} while (randArr3[i - 70] == 32);
characterSet[i] = std::string(1, randArr1[i - 70]) + std::string(1, randArr2[i - 70])
+ std::string(1, randArr3[i - 70]) + std::string(1, randArr4[i - 70]);
isBlunk[i][0] = true;
isBlunk[i][1] = true;
isBlunk[i][2] = false;
isBlunk[i][3] = true;
}
// [74,76): 11110xxx 0xxxxxxx 0xxxxxxx 0xxxxxxx -> lead blanked, the three one-byte characters kept.
for (int i = 74; i < 76; ++i) {
do {
randArr2[i - 70] = rand() & 0xff;
randArr2[i - 70] &= 0x7f; // second byte becomes 0xxx xxxx
randArr3[i - 70] = rand() & 0xff;
randArr3[i - 70] &= 0x7f; // third byte becomes 0xxx xxxx
randArr4[i - 70] = rand() & 0xff;
randArr4[i - 70] &= 0x7f; // fourth byte becomes 0xxx xxxx
} while (randArr4[i - 70] == 32 || randArr2[i - 70] == 32 || randArr3[i - 70] == 32);
characterSet[i] = std::string(1, randArr1[i - 70]) + std::string(1, randArr2[i - 70])
+ std::string(1, randArr3[i - 70]) + std::string(1, randArr4[i - 70]);
isBlunk[i][0] = true;
isBlunk[i][1] = false;
isBlunk[i][2] = false;
isBlunk[i][3] = false;
}
// Remaining slots keep well-formed bytes but decode to a value outside [0x10000, 0x10ffff].
// [76,78): below the range (overlong encoding, decoded value < 0x10000).
for (int i = 76; i < 78; ++i) {
randArr1[i - 70] &= 0xf0; // lead byte becomes 1111 0000
randArr2[i - 70] &= 0x8f; // second byte becomes 1000 xxxx -> 11110000 1000xxxx 10xxxxxx 10xxxxxx
characterSet[i] = std::string(1, randArr1[i - 70]) + std::string(1, randArr2[i - 70])
+ std::string(1, randArr3[i - 70]) + std::string(1, randArr4[i - 70]);
isBlunk[i][0] = true;
isBlunk[i][1] = true;
isBlunk[i][2] = true;
isBlunk[i][3] = true;
}
// [78,80): above the range (decoded value >= 0x110000).
for (int i = 78; i < 80; ++i) {
randArr1[i - 70] |= 0x04; // lead byte becomes 1111 01xx
randArr2[i - 70] |= 0x10; // second byte becomes 10x1 xxxx -> 111101xx 10x1xxxx 10xxxxxx 10xxxxxx
characterSet[i] = std::string(1, randArr1[i - 70]) + std::string(1, randArr2[i - 70])
+ std::string(1, randArr3[i - 70]) + std::string(1, randArr4[i - 70]);
isBlunk[i][0] = true;
isBlunk[i][1] = true;
isBlunk[i][2] = true;
isBlunk[i][3] = true;
}
// Phase 2: ten rounds of build-random-string / filter / verify.
for (int i = 0; i < 10; ++i) {
std::string testStr;
const int CHARACTER_COUNT = 8192;
// flow[p]: byte p of testStr is expected to be ' ' after filtering.
// Sized for the worst case of four bytes per pool entry.
bool flow[CHARACTER_COUNT * 4];
int index = 0; // next free position in flow == number of bytes appended so far
// Append randomly chosen pool entries and copy their per-byte expectations into flow.
// Pool indices come in blocks of 20 per byte length (1, 2, 3, 4 bytes).
for (int j = 0; j < CHARACTER_COUNT; ++j) {
int randIndex = rand() % 80;
testStr += characterSet[randIndex];
if (randIndex >= 0 && randIndex < 20) {
flow[index] = isBlunk[randIndex][0];
index++;
} else if (randIndex >= 20 && randIndex < 40) {
flow[index] = isBlunk[randIndex][0];
flow[index + 1] = isBlunk[randIndex][1];
index += 2;
} else if (randIndex >= 40 && randIndex < 60) {
flow[index] = isBlunk[randIndex][0];
flow[index + 1] = isBlunk[randIndex][1];
flow[index + 2] = isBlunk[randIndex][2];
index += 3;
} else if (randIndex >= 60 && randIndex < 80) {
flow[index] = isBlunk[randIndex][0];
flow[index + 1] = isBlunk[randIndex][1];
flow[index + 2] = isBlunk[randIndex][2];
flow[index + 3] = isBlunk[randIndex][3];
index += 4;
}
// Truncated-tail case: if the final entry is a valid multi-byte character
// ([20,30), [40,50) or [60,70)), drop its last byte so the string ends mid-sequence.
// Every remaining byte of that sequence is then expected to be blanked.
// flow[index - 1] belongs to the removed byte and is never read by the check below.
if (j == (CHARACTER_COUNT - 1) && randIndex >= 20
&& randIndex % 20 < 10) // last entry of the string, and a valid multi-byte character
{
testStr = testStr.substr(0, testStr.size() - 1);
if (randIndex >= 20 && randIndex < 30)
flow[index - 2] = true;
else if (randIndex >= 40 && randIndex < 50)
flow[index - 3] = flow[index - 2] = true;
else if (randIndex >= 60 && randIndex < 70)
flow[index - 4] = flow[index - 3] = flow[index - 2] = true;
}
}
ProcessorFilterNative processor;
processor.FilterNoneUtf8(testStr); // filters testStr in place; the checks below read the same string
// Every byte must be ' ' exactly where an invalid byte was expected, and non-space elsewhere.
for (uint32_t indexOfString = 0; indexOfString < testStr.size(); ++indexOfString) {
if (flow[indexOfString] == true) {
APSARA_TEST_EQUAL_FATAL(testStr[indexOfString], ' ');
} else {
APSARA_TEST_NOT_EQUAL_FATAL(testStr[indexOfString], ' ');
}
}
}
} // end of case