static_inline bool read_string()

in include/yyjson/yyjson.c [5186:5679]


static_inline bool read_string(u8 **ptr,
                               u8 *lst,
                               bool inv,
                               yyjson_val *val,
                               const char **msg) {
    /*
     Each unicode code point is encoded as 1 to 4 bytes in UTF-8 encoding,
     we use 4-byte mask and pattern value to validate UTF-8 byte sequence,
     this requires the input data to have 4-byte zero padding.
     ---------------------------------------------------
     1 byte
     unicode range [U+0000, U+007F]
     unicode min   [.......0]
     unicode max   [.1111111]
     bit pattern   [0.......]
     ---------------------------------------------------
     2 byte
     unicode range [U+0080, U+07FF]
     unicode min   [......10 ..000000]
     unicode max   [...11111 ..111111]
     bit require   [...xxxx. ........] (1E 00)
     bit mask      [xxx..... xx......] (E0 C0)
     bit pattern   [110..... 10......] (C0 80)
     ---------------------------------------------------
     3 byte
     unicode range [U+0800, U+FFFF]
     unicode min   [........ ..100000 ..000000]
     unicode max   [....1111 ..111111 ..111111]
     bit require   [....xxxx ..x..... ........] (0F 20 00)
     bit mask      [xxxx.... xx...... xx......] (F0 C0 C0)
     bit pattern   [1110.... 10...... 10......] (E0 80 80)
     ---------------------------------------------------
     3 byte invalid (reserved for surrogate halves)
     unicode range [U+D800, U+DFFF]
     unicode min   [....1101 ..100000 ..000000]
     unicode max   [....1101 ..111111 ..111111]
     bit mask      [....xxxx ..x..... ........] (0F 20 00)
     bit pattern   [....1101 ..1..... ........] (0D 20 00)
     ---------------------------------------------------
     4 byte
     unicode range [U+10000, U+10FFFF]
     unicode min   [........ ...10000 ..000000 ..000000]
     unicode max   [.....100 ..001111 ..111111 ..111111]
     bit require   [.....xxx ..xx.... ........ ........] (07 30 00 00)
     bit mask      [xxxxx... xx...... xx...... xx......] (F8 C0 C0 C0)
     bit pattern   [11110... 10...... 10...... 10......] (F0 80 80 80)
     ---------------------------------------------------
     */
#if YYJSON_ENDIAN == YYJSON_BIG_ENDIAN
    const u32 b1_mask = 0x80000000UL;
    const u32 b1_patt = 0x00000000UL;
    const u32 b2_mask = 0xE0C00000UL;
    const u32 b2_patt = 0xC0800000UL;
    const u32 b2_requ = 0x1E000000UL;
    const u32 b3_mask = 0xF0C0C000UL;
    const u32 b3_patt = 0xE0808000UL;
    const u32 b3_requ = 0x0F200000UL;
    const u32 b3_erro = 0x0D200000UL;
    const u32 b4_mask = 0xF8C0C0C0UL;
    const u32 b4_patt = 0xF0808080UL;
    const u32 b4_requ = 0x07300000UL;
    const u32 b4_err0 = 0x04000000UL;
    const u32 b4_err1 = 0x03300000UL;
#elif YYJSON_ENDIAN == YYJSON_LITTLE_ENDIAN
    const u32 b1_mask = 0x00000080UL;
    const u32 b1_patt = 0x00000000UL;
    const u32 b2_mask = 0x0000C0E0UL;
    const u32 b2_patt = 0x000080C0UL;
    const u32 b2_requ = 0x0000001EUL;
    const u32 b3_mask = 0x00C0C0F0UL;
    const u32 b3_patt = 0x008080E0UL;
    const u32 b3_requ = 0x0000200FUL;
    const u32 b3_erro = 0x0000200DUL;
    const u32 b4_mask = 0xC0C0C0F8UL;
    const u32 b4_patt = 0x808080F0UL;
    const u32 b4_requ = 0x00003007UL;
    const u32 b4_err0 = 0x00000004UL;
    const u32 b4_err1 = 0x00003003UL;
#else
    /* this should be evaluated at compile-time */
    v32_uni b1_mask_uni = {{ 0x80, 0x00, 0x00, 0x00 }};
    v32_uni b1_patt_uni = {{ 0x00, 0x00, 0x00, 0x00 }};
    v32_uni b2_mask_uni = {{ 0xE0, 0xC0, 0x00, 0x00 }};
    v32_uni b2_patt_uni = {{ 0xC0, 0x80, 0x00, 0x00 }};
    v32_uni b2_requ_uni = {{ 0x1E, 0x00, 0x00, 0x00 }};
    v32_uni b3_mask_uni = {{ 0xF0, 0xC0, 0xC0, 0x00 }};
    v32_uni b3_patt_uni = {{ 0xE0, 0x80, 0x80, 0x00 }};
    v32_uni b3_requ_uni = {{ 0x0F, 0x20, 0x00, 0x00 }};
    v32_uni b3_erro_uni = {{ 0x0D, 0x20, 0x00, 0x00 }};
    v32_uni b4_mask_uni = {{ 0xF8, 0xC0, 0xC0, 0xC0 }};
    v32_uni b4_patt_uni = {{ 0xF0, 0x80, 0x80, 0x80 }};
    v32_uni b4_requ_uni = {{ 0x07, 0x30, 0x00, 0x00 }};
    v32_uni b4_err0_uni = {{ 0x04, 0x00, 0x00, 0x00 }};
    v32_uni b4_err1_uni = {{ 0x03, 0x30, 0x00, 0x00 }};
    u32 b1_mask = b1_mask_uni.u;
    u32 b1_patt = b1_patt_uni.u;
    u32 b2_mask = b2_mask_uni.u;
    u32 b2_patt = b2_patt_uni.u;
    u32 b2_requ = b2_requ_uni.u;
    u32 b3_mask = b3_mask_uni.u;
    u32 b3_patt = b3_patt_uni.u;
    u32 b3_requ = b3_requ_uni.u;
    u32 b3_erro = b3_erro_uni.u;
    u32 b4_mask = b4_mask_uni.u;
    u32 b4_patt = b4_patt_uni.u;
    u32 b4_requ = b4_requ_uni.u;
    u32 b4_err0 = b4_err0_uni.u;
    u32 b4_err1 = b4_err1_uni.u;
#endif
    
#define is_valid_seq_1(uni) ( \
    ((uni & b1_mask) == b1_patt) \
)

#define is_valid_seq_2(uni) ( \
    ((uni & b2_mask) == b2_patt) && \
    ((uni & b2_requ)) \
)
    
#define is_valid_seq_3(uni) ( \
    ((uni & b3_mask) == b3_patt) && \
    ((tmp = (uni & b3_requ))) && \
    ((tmp != b3_erro)) \
)
    
#define is_valid_seq_4(uni) ( \
    ((uni & b4_mask) == b4_patt) && \
    ((tmp = (uni & b4_requ))) && \
    ((tmp & b4_err0) == 0 || (tmp & b4_err1) == 0) \
)
    
#define return_err(_end, _msg) do { \
    *msg = _msg; \
    *end = _end; \
    return false; \
} while (false)
    
    u8 *cur = *ptr;
    u8 **end = ptr;
    u8 *src = ++cur, *dst, *pos;
    u16 hi, lo;
    u32 uni, tmp;
    
skip_ascii:
    /* Most strings have no escaped characters, so we can jump them quickly. */
    
skip_ascii_begin:
    /*
     We want to make loop unrolling, as shown in the following code. Some
     compiler may not generate instructions as expected, so we rewrite it with
     explicit goto statements. We hope the compiler can generate instructions
     like this: https://godbolt.org/z/8vjsYq
     
         while (true) repeat16({
            if (likely(!(char_is_ascii_stop(*src)))) src++;
            else break;
         })
     */
#define expr_jump(i) \
    if (likely(!char_is_ascii_stop(src[i]))) {} \
    else goto skip_ascii_stop##i;
    
#define expr_stop(i) \
    skip_ascii_stop##i: \
    src += i; \
    goto skip_ascii_end;
    
    repeat16_incr(expr_jump)
    src += 16;
    goto skip_ascii_begin;
    repeat16_incr(expr_stop)
    
#undef expr_jump
#undef expr_stop
    
skip_ascii_end:
    
    /*
     GCC may store src[i] in a register at each line of expr_jump(i) above.
     These instructions are useless and will degrade performance.
     This inline asm is a hint for gcc: "the memory has been modified,
     do not cache it".
     
     MSVC, Clang, ICC can generate expected instructions without this hint.
     */
#if YYJSON_IS_REAL_GCC
    __asm__ volatile("":"=m"(*src));
#endif
    if (likely(*src == '"')) {
        val->tag = ((u64)(src - cur) << YYJSON_TAG_BIT) |
                    (u64)(YYJSON_TYPE_STR);
        val->uni.str = (const char *)cur;
        *src = '\0';
        *end = src + 1;
        return true;
    }
    
skip_utf8:
    if (*src & 0x80) { /* non-ASCII character */
        /*
         Non-ASCII character appears here, which means that the text is likely
         to be written in non-English or emoticons. According to some common
         data set statistics, byte sequences of the same length may appear
         consecutively. We process the byte sequences of the same length in each
         loop, which is more friendly to branch prediction.
         */
        pos = src;
#if YYJSON_DISABLE_UTF8_VALIDATION
        while (true) repeat8({
            if (likely((*src & 0xF0) == 0xE0)) src += 3;
            else break;
        })
        if (*src < 0x80) goto skip_ascii;
        while (true) repeat8({
            if (likely((*src & 0xE0) == 0xC0)) src += 2;
            else break;
        })
        while (true) repeat8({
            if (likely((*src & 0xF8) == 0xF0)) src += 4;
            else break;
        })
#else
        uni = byte_load_4(src);
        while (is_valid_seq_3(uni)) {
            src += 3;
            uni = byte_load_4(src);
        }
        if (is_valid_seq_1(uni)) goto skip_ascii;
        while (is_valid_seq_2(uni)) {
            src += 2;
            uni = byte_load_4(src);
        }
        while (is_valid_seq_4(uni)) {
            src += 4;
            uni = byte_load_4(src);
        }
#endif
        if (false) {
            if (!inv) return_err(src, "invalid UTF-8 encoding in string");
            ++src;
        }
        goto skip_ascii;
    }
    
    /* The escape character appears, we need to copy it. */
    dst = src;
copy_escape:
    if (likely(*src == '\\')) {
        switch (*++src) {
            case '"':  *dst++ = '"';  src++; break;
            case '\\': *dst++ = '\\'; src++; break;
            case '/':  *dst++ = '/';  src++; break;
            case 'b':  *dst++ = '\b'; src++; break;
            case 'f':  *dst++ = '\f'; src++; break;
            case 'n':  *dst++ = '\n'; src++; break;
            case 'r':  *dst++ = '\r'; src++; break;
            case 't':  *dst++ = '\t'; src++; break;
            case 'u':
                if (unlikely(!read_hex_u16(++src, &hi))) {
                    return_err(src - 2, "invalid escaped sequence in string");
                }
                src += 4;
                if (likely((hi & 0xF800) != 0xD800)) {
                    /* a BMP character */
                    if (hi >= 0x800) {
                        *dst++ = (u8)(0xE0 | (hi >> 12));
                        *dst++ = (u8)(0x80 | ((hi >> 6) & 0x3F));
                        *dst++ = (u8)(0x80 | (hi & 0x3F));
                    } else if (hi >= 0x80) {
                        *dst++ = (u8)(0xC0 | (hi >> 6));
                        *dst++ = (u8)(0x80 | (hi & 0x3F));
                    } else {
                        *dst++ = (u8)hi;
                    }
                } else {
                    /* a non-BMP character, represented as a surrogate pair */
                    if (unlikely((hi & 0xFC00) != 0xD800)) {
                        return_err(src - 6, "invalid high surrogate in string");
                    }
                    if (unlikely(!byte_match_2(src, "\\u"))) {
                        return_err(src, "no low surrogate in string");
                    }
                    if (unlikely(!read_hex_u16(src + 2, &lo))) {
                        return_err(src, "invalid escaped sequence in string");
                    }
                    if (unlikely((lo & 0xFC00) != 0xDC00)) {
                        return_err(src, "invalid low surrogate in string");
                    }
                    uni = ((((u32)hi - 0xD800) << 10) |
                            ((u32)lo - 0xDC00)) + 0x10000;
                    *dst++ = (u8)(0xF0 | (uni >> 18));
                    *dst++ = (u8)(0x80 | ((uni >> 12) & 0x3F));
                    *dst++ = (u8)(0x80 | ((uni >> 6) & 0x3F));
                    *dst++ = (u8)(0x80 | (uni & 0x3F));
                    src += 6;
                }
                break;
            default: return_err(src, "invalid escaped character in string");
        }
    } else if (likely(*src == '"')) {
        val->tag = ((u64)(dst - cur) << YYJSON_TAG_BIT) | YYJSON_TYPE_STR;
        val->uni.str = (const char *)cur;
        *dst = '\0';
        *end = src + 1;
        return true;
    } else {
        if (!inv) return_err(src, "unexpected control character in string");
        if (src >= lst) return_err(src, "unclosed string");
        *dst++ = *src++;
    }
    
copy_ascii:
    /*
     Copy continuous ASCII, loop unrolling, same as the following code:
     
         while (true) repeat16({
            if (unlikely(char_is_ascii_stop(*src))) break;
            *dst++ = *src++;
         })
     */
#if YYJSON_IS_REAL_GCC
#   define expr_jump(i) \
    if (likely(!(char_is_ascii_stop(src[i])))) {} \
    else { __asm__ volatile("":"=m"(src[i])); goto copy_ascii_stop_##i; }
#else
#   define expr_jump(i) \
    if (likely(!(char_is_ascii_stop(src[i])))) {} \
    else { goto copy_ascii_stop_##i; }
#endif
    repeat16_incr(expr_jump)
#undef expr_jump
    
    byte_move_16(dst, src);
    src += 16;
    dst += 16;
    goto copy_ascii;
    
    /*
     The memory will be moved forward by at least 1 byte. So the `byte_move`
     can be one byte more than needed to reduce the number of instructions.
     */
copy_ascii_stop_0:
    goto copy_utf8;
copy_ascii_stop_1:
    byte_move_2(dst, src);
    src += 1;
    dst += 1;
    goto copy_utf8;
copy_ascii_stop_2:
    byte_move_2(dst, src);
    src += 2;
    dst += 2;
    goto copy_utf8;
copy_ascii_stop_3:
    byte_move_4(dst, src);
    src += 3;
    dst += 3;
    goto copy_utf8;
copy_ascii_stop_4:
    byte_move_4(dst, src);
    src += 4;
    dst += 4;
    goto copy_utf8;
copy_ascii_stop_5:
    byte_move_4(dst, src);
    byte_move_2(dst + 4, src + 4);
    src += 5;
    dst += 5;
    goto copy_utf8;
copy_ascii_stop_6:
    byte_move_4(dst, src);
    byte_move_2(dst + 4, src + 4);
    src += 6;
    dst += 6;
    goto copy_utf8;
copy_ascii_stop_7:
    byte_move_8(dst, src);
    src += 7;
    dst += 7;
    goto copy_utf8;
copy_ascii_stop_8:
    byte_move_8(dst, src);
    src += 8;
    dst += 8;
    goto copy_utf8;
copy_ascii_stop_9:
    byte_move_8(dst, src);
    byte_move_2(dst + 8, src + 8);
    src += 9;
    dst += 9;
    goto copy_utf8;
copy_ascii_stop_10:
    byte_move_8(dst, src);
    byte_move_2(dst + 8, src + 8);
    src += 10;
    dst += 10;
    goto copy_utf8;
copy_ascii_stop_11:
    byte_move_8(dst, src);
    byte_move_4(dst + 8, src + 8);
    src += 11;
    dst += 11;
    goto copy_utf8;
copy_ascii_stop_12:
    byte_move_8(dst, src);
    byte_move_4(dst + 8, src + 8);
    src += 12;
    dst += 12;
    goto copy_utf8;
copy_ascii_stop_13:
    byte_move_8(dst, src);
    byte_move_4(dst + 8, src + 8);
    byte_move_2(dst + 12, src + 12);
    src += 13;
    dst += 13;
    goto copy_utf8;
copy_ascii_stop_14:
    byte_move_8(dst, src);
    byte_move_4(dst + 8, src + 8);
    byte_move_2(dst + 12, src + 12);
    src += 14;
    dst += 14;
    goto copy_utf8;
copy_ascii_stop_15:
    byte_move_16(dst, src);
    src += 15;
    dst += 15;
    goto copy_utf8;
    
copy_utf8:
    if (*src & 0x80) { /* non-ASCII character */
        pos = src;
        uni = byte_load_4(src);
#if YYJSON_DISABLE_UTF8_VALIDATION
        while (true) repeat4({
            if ((uni & b3_mask) == b3_patt) {
                byte_copy_4(dst, &uni);
                dst += 3;
                src += 3;
                uni = byte_load_4(src);
            } else break;
        })
        if ((uni & b1_mask) == b1_patt) goto copy_ascii;
        while (true) repeat4({
            if ((uni & b2_mask) == b2_patt) {
                byte_copy_2(dst, &uni);
                dst += 2;
                src += 2;
                uni = byte_load_4(src);
            } else break;
        })
        while (true) repeat4({
            if ((uni & b4_mask) == b4_patt) {
                byte_copy_4(dst, &uni);
                dst += 4;
                src += 4;
                uni = byte_load_4(src);
            } else break;
        })
#else
        while (is_valid_seq_3(uni)) {
            byte_copy_4(dst, &uni);
            dst += 3;
            src += 3;
            uni = byte_load_4(src);
        }
        if (is_valid_seq_1(uni)) goto copy_ascii;
        while (is_valid_seq_2(uni)) {
            byte_copy_2(dst, &uni);
            dst += 2;
            src += 2;
            uni = byte_load_4(src);
        }
        while (is_valid_seq_4(uni)) {
            byte_copy_4(dst, &uni);
            dst += 4;
            src += 4;
            uni = byte_load_4(src);
        }
#endif
        if (false) {
            if (!inv) return_err(src, "invalid UTF-8 encoding in string");
            goto copy_ascii_stop_1;
        }
        goto copy_ascii;
    }
    goto copy_escape;
    
#undef return_err
#undef is_valid_seq_1
#undef is_valid_seq_2
#undef is_valid_seq_3
#undef is_valid_seq_4
}