in subversion/libsvn_subr/utf8proc/utf8proc.c [558:668]
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored */
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0;
utf8proc_int32_t uc;
for (rpos = 0; rpos < length; rpos++) {
uc = buffer[rpos];
if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
if (options & UTF8PROC_NLF2LS) {
if (options & UTF8PROC_NLF2PS) {
buffer[wpos++] = 0x000A;
} else {
buffer[wpos++] = 0x2028;
}
} else {
if (options & UTF8PROC_NLF2PS) {
buffer[wpos++] = 0x2029;
} else {
buffer[wpos++] = 0x0020;
}
}
} else if ((options & UTF8PROC_STRIPCC) &&
(uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
if (uc == 0x0009) buffer[wpos++] = 0x0020;
} else {
buffer[wpos++] = uc;
}
}
length = wpos;
}
if (options & UTF8PROC_COMPOSE) {
utf8proc_int32_t *starter = NULL;
utf8proc_int32_t current_char;
const utf8proc_property_t *starter_property = NULL, *current_property;
utf8proc_propval_t max_combining_class = -1;
utf8proc_ssize_t rpos;
utf8proc_ssize_t wpos = 0;
utf8proc_int32_t composition;
for (rpos = 0; rpos < length; rpos++) {
current_char = buffer[rpos];
current_property = unsafe_get_property(current_char);
if (starter && current_property->combining_class > max_combining_class) {
/* combination perhaps possible */
utf8proc_int32_t hangul_lindex;
utf8proc_int32_t hangul_sindex;
hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
utf8proc_int32_t hangul_vindex;
hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
*starter = UTF8PROC_HANGUL_SBASE +
(hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
UTF8PROC_HANGUL_TCOUNT;
starter_property = NULL;
continue;
}
}
hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
utf8proc_int32_t hangul_tindex;
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
*starter += hangul_tindex;
starter_property = NULL;
continue;
}
}
if (!starter_property) {
starter_property = unsafe_get_property(*starter);
}
if (starter_property->comb_index < 0x8000 &&
current_property->comb_index != UINT16_MAX &&
current_property->comb_index >= 0x8000) {
int sidx = starter_property->comb_index;
int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
idx += sidx + 2;
if (current_property->comb_index & 0x4000) {
composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
} else
composition = utf8proc_combinations[idx];
if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
!(unsafe_get_property(composition)->comp_exclusion))) {
*starter = composition;
starter_property = NULL;
continue;
}
}
}
}
buffer[wpos] = current_char;
if (current_property->combining_class) {
if (current_property->combining_class > max_combining_class) {
max_combining_class = current_property->combining_class;
}
} else {
starter = buffer + wpos;
starter_property = NULL;
max_combining_class = -1;
}
wpos++;
}
length = wpos;
}
return length;
}