static inline void agtype_lex_string()

in src/backend/utils/adt/agtype_parser.c [735:1011]


static inline void agtype_lex_string(agtype_lex_context *lex)
{
    char *s;
    int len;
    int hi_surrogate = -1;

    if (lex->strval != NULL)
        resetStringInfo(lex->strval);

    Assert(lex->input_length > 0);
    s = lex->token_start;
    len = lex->token_start - lex->input;
    for (;;)
    {
        s++;
        len++;
        /* Premature end of the string. */
        if (len >= lex->input_length)
        {
            lex->token_terminator = s;
            report_invalid_token(lex);
        }
        else if (*s == '"')
        {
            break;
        }
        else if ((unsigned char)*s < 32)
        {
            /* Per RFC4627, these characters MUST be escaped. */
            /* Since *s isn't printable, exclude it from the context string */
            lex->token_terminator = s;
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                     errmsg("invalid input syntax for type %s", "agtype"),
                     errdetail("Character with value 0x%02x must be escaped.",
                               (unsigned char)*s),
                     report_agtype_context(lex)));
        }
        else if (*s == '\\')
        {
            /* OK, we have an escape character. */
            s++;
            len++;
            if (len >= lex->input_length)
            {
                lex->token_terminator = s;
                report_invalid_token(lex);
            }
            else if (*s == 'u')
            {
                int i;
                int ch = 0;

                for (i = 1; i <= 4; i++)
                {
                    s++;
                    len++;
                    if (len >= lex->input_length)
                    {
                        lex->token_terminator = s;
                        report_invalid_token(lex);
                    }
                    else if (*s >= '0' && *s <= '9')
                    {
                        ch = (ch * 16) + (*s - '0');
                    }
                    else if (*s >= 'a' && *s <= 'f')
                    {
                        ch = (ch * 16) + (*s - 'a') + 10;
                    }
                    else if (*s >= 'A' && *s <= 'F')
                    {
                        ch = (ch * 16) + (*s - 'A') + 10;
                    }
                    else
                    {
                        lex->token_terminator = s + pg_mblen(s);
                        ereport(
                            ERROR,
                            (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                             errmsg("invalid input syntax for type %s",
                                    "agtype"),
                             errdetail(
                                 "\"\\u\" must be followed by four hexadecimal digits."),
                             report_agtype_context(lex)));
                    }
                }
                if (lex->strval != NULL)
                {
                    char utf8str[5];
                    int utf8len;

                    if (ch >= 0xd800 && ch <= 0xdbff)
                    {
                        if (hi_surrogate != -1)
                        {
                            ereport(
                                ERROR,
                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                 errmsg("invalid input syntax for type %s",
                                        "agtype"),
                                 errdetail(
                                     "Unicode high surrogate must not follow a high surrogate."),
                                 report_agtype_context(lex)));
                        }
                        hi_surrogate = (ch & 0x3ff) << 10;
                        continue;
                    }
                    else if (ch >= 0xdc00 && ch <= 0xdfff)
                    {
                        if (hi_surrogate == -1)
                        {
                            ereport(
                                ERROR,
                                (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                                 errmsg("invalid input syntax for type %s",
                                        "agtype"),
                                 errdetail(
                                     "Unicode low surrogate must follow a high surrogate."),
                                 report_agtype_context(lex)));
                        }
                        ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
                        hi_surrogate = -1;
                    }

                    if (hi_surrogate != -1)
                    {
                        ereport(
                            ERROR,
                            (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                             errmsg("invalid input syntax for type %s",
                                    "agtype"),
                             errdetail(
                                 "Unicode low surrogate must follow a high surrogate."),
                             report_agtype_context(lex)));
                    }

                    /*
                     * For UTF8, replace the escape sequence by the actual
                     * utf8 character in lex->strval. Do this also for other
                     * encodings if the escape designates an ASCII character,
                     * otherwise raise an error.
                     */

                    if (ch == 0)
                    {
                        /* We can't allow this, since our TEXT type doesn't */
                        ereport(
                            ERROR,
                            (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
                             errmsg("unsupported Unicode escape sequence"),
                             errdetail("\\u0000 cannot be converted to text."),
                             report_agtype_context(lex)));
                    }
                    else if (GetDatabaseEncoding() == PG_UTF8)
                    {
                        unicode_to_utf8(ch, (unsigned char *)utf8str);
                        utf8len = pg_utf_mblen((unsigned char *)utf8str);
                        appendBinaryStringInfo(lex->strval, utf8str, utf8len);
                    }
                    else if (ch <= 0x007f)
                    {
                        /*
                         * This is the only way to designate things like a
                         * form feed character in agtype, so it's useful in all
                         * encodings.
                         */
                        appendStringInfoChar(lex->strval, (char)ch);
                    }
                    else
                    {
                        ereport(
                            ERROR,
                            (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
                             errmsg("unsupported Unicode escape sequence"),
                             errdetail(
                                 "Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                             report_agtype_context(lex)));
                    }
                }
            }
            else if (lex->strval != NULL)
            {
                if (hi_surrogate != -1)
                {
                    ereport(
                        ERROR,
                        (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                         errmsg("invalid input syntax for type %s", "agtype"),
                         errdetail(
                             "Unicode low surrogate must follow a high surrogate."),
                         report_agtype_context(lex)));
                }

                switch (*s)
                {
                case '"':
                case '\\':
                case '/':
                    appendStringInfoChar(lex->strval, *s);
                    break;
                case 'b':
                    appendStringInfoChar(lex->strval, '\b');
                    break;
                case 'f':
                    appendStringInfoChar(lex->strval, '\f');
                    break;
                case 'n':
                    appendStringInfoChar(lex->strval, '\n');
                    break;
                case 'r':
                    appendStringInfoChar(lex->strval, '\r');
                    break;
                case 't':
                    appendStringInfoChar(lex->strval, '\t');
                    break;
                default:
                    /* Not a valid string escape, so error out. */
                    lex->token_terminator = s + pg_mblen(s);
                    ereport(
                        ERROR,
                        (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                         errmsg("invalid input syntax for type %s", "agtype"),
                         errdetail("Escape sequence \"\\%s\" is invalid.",
                                   extract_mb_char(s)),
                         report_agtype_context(lex)));
                }
            }
            else if (strchr("\"\\/bfnrt", *s) == NULL)
            {
                /*
                 * Simpler processing if we're not bothered about de-escaping
                 *
                 * It's very tempting to remove the strchr() call here and
                 * replace it with a switch statement, but testing so far has
                 * shown it's not a performance win.
                 */
                lex->token_terminator = s + pg_mblen(s);
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                         errmsg("invalid input syntax for type %s", "agtype"),
                         errdetail("Escape sequence \"\\%s\" is invalid.",
                                   extract_mb_char(s)),
                         report_agtype_context(lex)));
            }
        }
        else if (lex->strval != NULL)
        {
            if (hi_surrogate != -1)
            {
                ereport(
                    ERROR,
                    (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
                     errmsg("invalid input syntax for type %s", "agtype"),
                     errdetail(
                         "Unicode low surrogate must follow a high surrogate."),
                     report_agtype_context(lex)));
            }

            appendStringInfoChar(lex->strval, *s);
        }
    }

    if (hi_surrogate != -1)
    {
        ereport(
            ERROR,
            (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
             errmsg("invalid input syntax for type %s", "agtype"),
             errdetail("Unicode low surrogate must follow a high surrogate."),
             report_agtype_context(lex)));
    }

    /* Hooray, we found the end of the string! */
    lex->prev_token_terminator = lex->token_terminator;
    lex->token_terminator = s + 1;
}