in impl/src/main/java/org/apache/myfaces/renderkit/html/util/HTMLEncoder.java [931:1161]
/**
 * Writes {@code string} to {@code writer} as a URI, percent-encoding the characters that
 * must not appear literally.
 *
 * <p>The part of the URI before the first {@code ?} or {@code #} is handled here. As soon
 * as a {@code ?} or {@code #} is found that is not the last character, that character is
 * written unchanged and everything after it is handed to
 * {@code encodeURIQuery(writer, string, index, characterEncoding)}; this method then
 * returns. A {@code ?} or {@code #} that is the very last character is written unchanged.
 *
 * <p>A {@code %} is left untouched only when it is followed by two ASCII hexadecimal
 * digits (an existing percent-escape, which must not be encoded twice); otherwise it is
 * percent-encoded.
 *
 * @param writer            destination of the encoded URI
 * @param string            the URI to encode; must not be {@code null}
 * @param characterEncoding encoding passed on to {@code encodeURIQuery} for the query or
 *                          fragment part; it is not used for the part handled here
 * @throws IOException if writing to {@code writer} fails
 */
public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding)
        throws IOException
{
    // Rationale for the set of characters encoded below.
    //
    // RFC 2396 Section 2.4.3 (excluded US-ASCII characters):
    //   control = US-ASCII 00-1F and 7F, space = 20
    //   delims  = "<" | ">" | "#" | "%" | <">
    //   unwise  = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
    //   "Data corresponding to excluded characters must be escaped in order to be
    //   properly represented within a URI."
    //
    // RFC 3986 Section 2.2 (reserved characters, which should not be percent-encoded):
    //   gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    //   sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
    //   RFC 3986 moved "[", "]" and "#" from the excluded list of RFC 2396 to the
    //   reserved list ("[" and "]" are needed for IPv6 literals), so they are not
    //   encoded here.
    //
    // RFC 3986 Section 2.3: ALPHA, DIGIT, "-", ".", "_" and "~" should never be
    // percent-encoded by URI producers.
    //
    // RFC 3986 Section 3.2.2: non-ASCII characters in a host must be encoded as UTF-8
    // and then percent-encoded. The HTML 4 notes
    // (http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars) likewise
    // recommend UTF-8 rather than the document encoding, which is why UTF-8 is used
    // for everything before "?" / "#".
    //
    // RFC 3986 Sections 3.4 and 3.5: query and fragment are *( pchar / "/" / "?" ) and
    // carry data, so what follows "?" or "#" is delegated to encodeURIQuery together
    // with the document character encoding.
    //
    // Resulting strategy for the scheme ":" hier-part section - percent-encode:
    //   - 0x00 to 0x20
    //   - <"> (%22), "<" (%3C), ">" (%3E)
    //   - "\" (%5C), "^" (%5E), "`" (%60)
    //   - "{" (%7B), "|" (%7C), "}" (%7D)
    //   - 0x7F and above (such characters should not be used in this part of a URI,
    //     but encoding them is preferable to dropping them)
    //   - "%" (%25), but only when it does not already start a percent-escape, to
    //     avoid encoding it twice
    // All remaining characters are written unchanged.
    //
    // NOTE(review): '&' is not treated specially by this method; any handling required
    // because the URI is embedded in an HTML page would have to happen in
    // encodeURIQuery or in the caller - confirm there.
    // NOTE(review): characters are handed to percentEncode one at a time, so the two
    // halves of a surrogate pair are encoded independently - confirm percentEncode
    // copes with that.

    final int length = string.length();
    // Index of the first character that has not been written to the writer yet.
    int start = 0;

    for (int i = 0; i < length; ++i)
    {
        final char c = string.charAt(i);

        if (c <= (char) 0x20 || c >= (char) 0x7F
                || c == '"' || c == '<' || c == '>'
                || c == '\\' || c == '^' || c == '`'
                || c == '{' || c == '|' || c == '}')
        {
            // Excluded character: flush the pending literal run, then escape it.
            if (start < i)
            {
                writer.write(string, start, i - start);
            }
            start = i + 1;
            percentEncode(writer, c, "UTF-8");
        }
        else if (c == '%')
        {
            // "%" HEXDIG HEXDIG is an existing escape (RFC 3986 Section 2.1) and is
            // copied verbatim; any other "%" is a literal percent sign.
            final boolean alreadyEscaped = i + 2 < length
                    && isAsciiHexDigit(string.charAt(i + 1))
                    && isAsciiHexDigit(string.charAt(i + 2));
            if (!alreadyEscaped)
            {
                if (start < i)
                {
                    writer.write(string, start, i - start);
                }
                start = i + 1;
                percentEncode(writer, c, UTF8);
            }
        }
        else if ((c == '?' || c == '#') && i + 1 < length)
        {
            // Start of the query or fragment: the remainder is data that is encoded
            // with the document character encoding by encodeURIQuery.
            if (start < i)
            {
                writer.write(string, start, i - start);
            }
            writer.write(c);
            encodeURIQuery(writer, string, i + 1, characterEncoding);
            // Everything has been written; nothing is left to flush.
            return;
        }
        // Any other character is left pending and written with the next flush.
    }

    // Flush whatever literal run is still pending.
    if (start == 0)
    {
        writer.write(string);
    }
    else if (start < length)
    {
        writer.write(string, start, length - start);
    }
}

/** Returns whether {@code c} is one of the ASCII hexadecimal digits 0-9, A-F or a-f. */
private static boolean isAsciiHexDigit(final char c)
{
    return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
}