public static void encodeURIAttribute()

in impl/src/main/java/org/apache/myfaces/renderkit/html/util/HTMLEncoder.java [931:1161]
95 lines of code
34 McCabe index (conditional complexity)

    /**
     * Writes {@code string}, treated as a URI destined for an HTML attribute, to
     * {@code writer}, percent-encoding the characters that may not appear literally.
     *
     * <p>The part before the first {@code '?'} or {@code '#'} is handled here:
     * excluded characters are passed to {@code percentEncode} and everything else is
     * copied unchanged. A {@code '%'} that already starts a valid escape (followed by
     * two hexadecimal digits) is left alone so existing escapes are not encoded twice;
     * any other {@code '%'} is encoded.</p>
     *
     * <p>When a {@code '?'} or {@code '#'} is found and is not the last character, it
     * is written as-is and the rest of the string is delegated to
     * {@code encodeURIQuery} together with {@code characterEncoding}.</p>
     *
     * @param writer            destination of the encoded output
     * @param string            the URI to encode; must not be {@code null}
     * @param characterEncoding encoding name forwarded to {@code encodeURIQuery} for the
     *                          query/fragment part
     * @throws IOException if writing to {@code writer} fails
     */
    public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding)
        throws IOException
    {
        // These are the guidelines taken into account by this algorithm:
        //
        // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
        //
        // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
        // space       = <US-ASCII coded character 20 hexadecimal>
        // delims      = "<" | ">" | "#" | "%" | <">
        //               %3C   %3E   %23   %25   %22
        // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
        //               %7B   %7D   %7C   %5C   %5E   %5B   %5D   %60
        //
        // ".... Data corresponding to excluded characters must be escaped in order to
        // be properly represented within a URI....."
        //
        // RFC 3986 Section 3.  Syntax Components
        //
        // "... The generic URI syntax consists of a hierarchical sequence of
        // components referred to as the scheme, authority, path, query, and
        // fragment.
        //
        //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
        //
        //   hier-part   = "//" authority path-abempty
        //               / path-absolute
        //               / path-rootless
        //               / path-empty
        // ...."
        //
        // RFC 3986 Section 2.1:
        //   pct-encoded = "%" HEXDIG HEXDIG
        //
        // RFC 3986 Section 2.2:
        // Reserved characters (should not be percent-encoded)
        // reserved    = gen-delims / sub-delims
        // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
        //               %3A   %2F   %3F   %23   %5B   %5D   %40
        // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
        //
        // Note that "[" and "]" had to be escaped according to RFC 2396, but
        // RFC 3986 appendix D (Changes from RFC 2396) says about these chars (used for IPv6)
        // "...those rules were redefined to directly specify the characters allowed...."
        // Other characters were also moved from the excluded list to reserved:
        // "[" / "]" / "#"
        //
        // RFC 3986 Section 2.3:
        // "... for consistency, percent-encoded octets in the ranges of ALPHA
        // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
        // underscore (%5F), or tilde (%7E) should not be created by URI
        // producers...."
        //
        // RFC 3986 Section 3.2.2.  Host
        //
        // host = IP-literal / IPv4address / reg-name
        //
        // The reg-name syntax allows percent-encoded octets in order to
        // represent non-ASCII registered names in a uniform way that is
        // independent of the underlying name resolution technology.  Non-ASCII
        // characters must first be encoded according to UTF-8 [STD63], and then
        // each octet of the corresponding UTF-8 sequence must be percent-
        // encoded to be represented as URI characters.  URI producing
        // applications must not use percent-encoding in host unless it is used
        // to represent a UTF-8 character sequence.
        //
        // RFC 3986 Section 3.4 Query
        //         query       = *( pchar / "/" / "?" )
        //
        // "...  However, as query components are often used to carry identifying information
        // in the form of "key=value" pairs and one frequently used value is a reference to
        // another URI, it is sometimes better for usability to avoid percent-encoding those
        // characters....."
        //
        // RFC 3986 Section 2.5 Identifying Data (applies to the query section)
        //
        // When a new URI scheme defines a component that represents textual
        // data consisting of characters from the Universal Character Set [UCS],
        // the data should first be encoded as octets according to the UTF-8
        // character encoding [STD63]; then only those octets that do not
        // correspond to characters in the unreserved set should be percent-
        // encoded.  For example, the character A would be represented as "A",
        // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
        // as "%C3%80", and the character KATAKANA LETTER A would be represented
        // as "%E3%82%A2".
        //
        // RFC 3986 Section 3.5 Fragment
        //         fragment    = *( pchar / "/" / "?" )
        //
        // Note that it follows the same rules as query.
        //
        // Based on these extracts, the strategy applied by this method is:
        //
        // On scheme ":" hier-part, percent-encode these chars:
        //
        // - From %00 to %20,
        // - <"> %22, "%" %25 (encoding "%" risks double encoding, so it is only
        //                     encoded when it does not already start an escape)
        // - "<" %3C, ">" %3E
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F onwards (characters above %FF should not be used in this
        //   part of a URI, but it is preferable to encode them than to omit them).
        //
        // The remaining characters must not be encoded.
        //
        // Characters after ? or # should be percent-encoded, but only the necessary ones:
        //
        // - From %00 to %20 (' ' could be encoded as +, but %20 also works, so we keep %20)
        // - <"> %22, "%" %25 (same double-encoding caveat as above)
        // - "<" %3C, ">" %3E,
        // - "\" %5C, "^" %5E, "`" %60
        // - "{" %7B, "|" %7C, "}" %7D
        // - From %7F onwards (each character as many bytes as necessary; a single char
        //   may need 2, 3 or more bytes. This data should be encoded translating from the
        //   document character encoding to percent encoding, because these values
        //   could be retrieved from httpRequest.getParameter(), which uses the current
        //   character encoding to decode values)
        //
        // "&" should be encoded as "&amp;" because this link is inside an html page, and
        // a bare & is invalid in that context.
        // NOTE(review): this method itself copies '&' unchanged before '?'/'#';
        // presumably encodeURIQuery takes care of it for the query part - confirm.

        final int length = string.length();
        // Index of the first character that has not been copied to the writer yet.
        // Runs of unchanged characters are written in one call instead of char by char.
        int start = 0;

        for (int i = 0; i < length; ++i)
        {
            final char c = string.charAt(i);

            if (c <= (char)0x20 || c >= (char)0x7F ||
                    c == '"' || c == '<' ||
                    c == '>' || c == '\\' || c == '^' || c == '`' ||
                    c == '{' || c == '|' || c == '}')
            {
                // The percent encoding on this part should be done using the UTF-8 charset
                // as RFC 3986 Section 3.2.2 says.
                // There is also a reference on
                // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                // that recommends UTF-8 instead of the document character encoding.
                // Jetty uses UTF-8 by default (see http://jira.codehaus.org/browse/JETTY-113)
                if (start < i)
                {
                    writer.write(string, start, i - start);
                }
                start = i + 1;
                percentEncode(writer, c, "UTF-8");
            }
            else if (c == '%')
            {
                // A '%' followed by two hex digits is an existing escape (RFC 3986
                // Section 2.1) and is kept verbatim to avoid encoding it twice.
                // Anything else - including a '%' too close to the end of the
                // string - is a literal percent sign and has to be escaped.
                final boolean alreadyEncoded = i + 2 < length
                        && isAsciiHexDigit(string.charAt(i + 1))
                        && isAsciiHexDigit(string.charAt(i + 2));
                if (!alreadyEncoded)
                {
                    if (start < i)
                    {
                        writer.write(string, start, i - start);
                    }
                    start = i + 1;
                    percentEncode(writer, c, UTF8);
                }
            }
            else if ((c == '?' || c == '#') && i + 1 < length)
            {
                // The remaining part of the URI is data that should be encoded
                // using the document character encoding. A trailing '?' or '#'
                // (nothing after it) is not handled here and is copied as a plain char.
                if (start < i)
                {
                    writer.write(string, start, i - start);
                }
                writer.write(c);
                encodeURIQuery(writer, string, i + 1, characterEncoding);
                // encodeURIQuery is given the rest of the string, so nothing is left to copy.
                start = length;
                break;
            }
            // Any other char needs no encoding; it is copied with the pending run.
        }

        // Copy whatever unchanged tail is still pending.
        if (start < length)
        {
            writer.write(string, start, length - start);
        }
    }

    /**
     * Tells whether {@code ch} is an ASCII hexadecimal digit
     * ({@code 0-9}, {@code A-F} or {@code a-f}).
     */
    private static boolean isAsciiHexDigit(final char ch)
    {
        return (ch >= '0' && ch <= '9')
                || (ch >= 'A' && ch <= 'F')
                || (ch >= 'a' && ch <= 'f');
    }