in serializer/src/main/java/org/apache/xml/serializer/ToHTMLStream.java [1140:1374]
public void writeAttrURI(
final java.io.Writer writer, String string, boolean doURLEscaping)
throws IOException
{
// http://www.ietf.org/rfc/rfc2396.txt says:
// A URI is always in an "escaped" form, since escaping or unescaping a
// completed URI might change its semantics. Normally, the only time
// escape encodings can safely be made is when the URI is being created
// from its component parts; each component may have its own set of
// characters that are reserved, so only the mechanism responsible for
// generating or interpreting that component can determine whether or
// not escaping a character will change its semantics. Likewise, a URI
// must be separated into its components before the escaped characters
// within those components can be safely decoded.
//
// ...So we do our best to do limited escaping of the URL, without
// causing damage. If the URL is already properly escaped, in theory, this
// function should not change the string value.
final int end = string.length();
if (end > m_attrBuff.length)
{
m_attrBuff = new char[end*2 + 1];
}
string.getChars(0,end, m_attrBuff, 0);
final char[] chars = m_attrBuff;
int cleanStart = 0;
int cleanLength = 0;
char ch = 0;
for (int i = 0; i < end; i++)
{
ch = chars[i];
if ((ch < 32) || (ch > 126))
{
if (cleanLength > 0)
{
writer.write(chars, cleanStart, cleanLength);
cleanLength = 0;
}
if (doURLEscaping)
{
// Encode UTF16 to UTF8.
// Reference is Unicode, A Primer, by Tony Graham.
// Page 92.
// Note that Kay doesn't escape 0x20...
// if(ch == 0x20) // Not sure about this... -sb
// {
// writer.write(ch);
// }
// else
if (ch <= 0x7F)
{
writer.write('%');
writer.write(makeHHString(ch));
}
else if (ch <= 0x7FF)
{
// Clear low 6 bits before rotate, put high 4 bits in low byte,
// and set two high bits.
int high = (ch >> 6) | 0xC0;
int low = (ch & 0x3F) | 0x80;
// First 6 bits, + high bit
writer.write('%');
writer.write(makeHHString(high));
writer.write('%');
writer.write(makeHHString(low));
}
else if (Encodings.isHighUTF16Surrogate(ch)) // high surrogate
{
// I'm sure this can be done in 3 instructions, but I choose
// to try and do it exactly like it is done in the book, at least
// until we are sure this is totally clean. I don't think performance
// is a big issue with this particular function, though I could be
// wrong. Also, the stuff below clearly does more masking than
// it needs to do.
// Clear high 6 bits.
int highSurrogate = ((int) ch) & 0x03FF;
// Middle 4 bits (wwww) + 1
// "Note that the value of wwww from the high surrogate bit pattern
// is incremented to make the uuuuu bit pattern in the scalar value
// so the surrogate pair don't address the BMP."
int wwww = ((highSurrogate & 0x03C0) >> 6);
int uuuuu = wwww + 1;
// next 4 bits
int zzzz = (highSurrogate & 0x003C) >> 2;
// low 2 bits
int yyyyyy = ((highSurrogate & 0x0003) << 4) & 0x30;
// Get low surrogate character.
ch = chars[++i];
// Clear high 6 bits.
int lowSurrogate = ((int) ch) & 0x03FF;
// put the middle 4 bits into the bottom of yyyyyy (byte 3)
yyyyyy = yyyyyy | ((lowSurrogate & 0x03C0) >> 6);
// bottom 6 bits.
int xxxxxx = (lowSurrogate & 0x003F);
int byte1 = 0xF0 | (uuuuu >> 2); // top 3 bits of uuuuu
int byte2 =
0x80 | (((uuuuu & 0x03) << 4) & 0x30) | zzzz;
int byte3 = 0x80 | yyyyyy;
int byte4 = 0x80 | xxxxxx;
writer.write('%');
writer.write(makeHHString(byte1));
writer.write('%');
writer.write(makeHHString(byte2));
writer.write('%');
writer.write(makeHHString(byte3));
writer.write('%');
writer.write(makeHHString(byte4));
}
else
{
int high = (ch >> 12) | 0xE0; // top 4 bits
int middle = ((ch & 0x0FC0) >> 6) | 0x80;
// middle 6 bits
int low = (ch & 0x3F) | 0x80;
// First 6 bits, + high bit
writer.write('%');
writer.write(makeHHString(high));
writer.write('%');
writer.write(makeHHString(middle));
writer.write('%');
writer.write(makeHHString(low));
}
}
else if (escapingNotNeeded(ch))
{
writer.write(ch);
}
else if (Encodings.isHighUTF16Surrogate(ch))
{
writeUTF16Surrogate(ch, chars, i, end);
i++; // two input characters processed
// this increments by one and the for()
// loop itself increments by another one.
}
else
{
writer.write("&#");
writer.write(Integer.toString(ch));
writer.write(';');
}
// In this character range we have first written out any previously accumulated
// "clean" characters, then processed the current more complicated character,
// which may have incremented "i".
// We now we reset the next possible clean character.
cleanStart = i + 1;
}
// Since http://www.ietf.org/rfc/rfc2396.txt refers to the URI grammar as
// not allowing quotes in the URI proper syntax, nor in the fragment
// identifier, we believe that it's OK to double escape quotes.
else if (ch == '"')
{
// If the character is a '%' number number, try to avoid double-escaping.
// There is a question if this is legal behavior.
// Dmitri Ilyin: to check if '%' number number is invalid. It must be checked if %xx is a sign, that would be encoded
// The encoded signes are in Hex form. So %xx my be in form %3C that is "<" sign. I will try to change here a little.
// if( ((i+2) < len) && isASCIIDigit(stringArray[i+1]) && isASCIIDigit(stringArray[i+2]) )
// We are no longer escaping '%'
if (cleanLength > 0)
{
writer.write(chars, cleanStart, cleanLength);
cleanLength = 0;
}
// Mike Kay encodes this as ", so he may know something I don't?
if (doURLEscaping)
writer.write("%22");
else
writer.write("""); // we have to escape this, I guess.
// We have written out any clean characters, then the escaped '%' and now we
// We now we reset the next possible clean character.
cleanStart = i + 1;
}
else if (ch == '&')
{
// HTML 4.01 reads, "Authors should use "&" (ASCII decimal 38)
// instead of "&" to avoid confusion with the beginning of a character
// reference (entity reference open delimiter).
if (cleanLength > 0)
{
writer.write(chars, cleanStart, cleanLength);
cleanLength = 0;
}
writer.write("&");
cleanStart = i + 1;
}
else
{
// no processing for this character, just count how
// many characters in a row that we have that need no processing
cleanLength++;
}
}
// are there any clean characters at the end of the array
// that we haven't processed yet?
if (cleanLength > 1)
{
// if the whole string can be written out as-is do so
// otherwise write out the clean chars at the end of the
// array
if (cleanStart == 0)
writer.write(string);
else
writer.write(chars, cleanStart, cleanLength);
}
else if (cleanLength == 1)
{
// a little optimization for 1 clean character
// (we could have let the previous if(...) handle them all)
writer.write(ch);
}
}