public void writeAttrURI()

in serializer/src/main/java/org/apache/xml/serializer/ToHTMLStream.java [1140:1374]


    /**
     * Writes the value of a URI-valued attribute to the given writer,
     * escaping only what has to be escaped.
     *
     * <p>Characters in the printable ASCII range (32..126) are copied through
     * unchanged, except that {@code "} is written as {@code %22} (when
     * {@code doURLEscaping} is true) or {@code &quot;} (otherwise), and
     * {@code &} is always written as {@code &amp;}. In particular {@code %}
     * is not escaped, so an already-escaped URI is not double-escaped.</p>
     *
     * <p>Characters outside that range are handled as follows:</p>
     * <ul>
     *   <li>If {@code doURLEscaping} is true, the character (or surrogate
     *       pair) is converted to its UTF-8 byte sequence and each byte is
     *       written as {@code '%'} followed by {@code makeHHString(byte)}
     *       (presumably two hex digits; the helper is defined elsewhere).
     *       A surrogate that is not part of a well-formed high/low pair is
     *       written in the three-byte form.</li>
     *   <li>Otherwise the character is written as-is if
     *       {@code escapingNotNeeded(ch)} says so, handed to
     *       {@code writeUTF16Surrogate} if it is a high surrogate, or
     *       written as a decimal character reference {@code &#N;}.</li>
     * </ul>
     *
     * <p>The shared buffer {@code m_attrBuff} is reused (and grown when too
     * small) as scratch space for the characters of {@code string}.</p>
     *
     * @param writer the destination for the escaped attribute value
     * @param string the attribute value (a URI) to write; must not be null
     * @param doURLEscaping true to percent-encode non-ASCII and control
     *        characters, false to use character references instead
     * @throws IOException if the writer (or a helper writing to it) fails
     */
    public void writeAttrURI(
        final java.io.Writer writer, String string, boolean doURLEscaping)
        throws IOException
    {
        // http://www.ietf.org/rfc/rfc2396.txt says:
        // A URI is always in an "escaped" form, since escaping or unescaping a
        // completed URI might change its semantics.  Normally, the only time
        // escape encodings can safely be made is when the URI is being created
        // from its component parts; each component may have its own set of
        // characters that are reserved, so only the mechanism responsible for
        // generating or interpreting that component can determine whether or
        // not escaping a character will change its semantics. Likewise, a URI
        // must be separated into its components before the escaped characters
        // within those components can be safely decoded.
        //
        // ...So we do our best to do limited escaping of the URL, without
        // causing damage.  If the URL is already properly escaped, in theory, this
        // function should not change the string value.

        final int end = string.length();
        if (end > m_attrBuff.length)
        {
           // Grow with headroom so that slightly longer values can reuse the buffer.
           m_attrBuff = new char[end*2 + 1];
        }
        string.getChars(0,end, m_attrBuff, 0);
        final char[] chars = m_attrBuff;

        // [cleanStart, cleanStart + cleanLength) is the current run of characters
        // that need no processing and have not been written out yet.
        int cleanStart = 0;
        int cleanLength = 0;


        char ch = 0;
        for (int i = 0; i < end; i++)
        {
            ch = chars[i];

            if ((ch < 32) || (ch > 126))
            {
                // Flush the pending clean run before emitting the escaped form.
                if (cleanLength > 0)
                {
                    writer.write(chars, cleanStart, cleanLength);
                    cleanLength = 0;
                }
                if (doURLEscaping)
                {
                    // Encode UTF16 to UTF8.
                    // Reference is Unicode, A Primer, by Tony Graham.
                    // Page 92.

                    // Note that 0x20 (space) never reaches this point because
                    // of the range test above, so it is never percent-encoded.
                    if (ch <= 0x7F)
                    {
                        // Control characters and DEL: a single byte.
                        writer.write('%');
                        writer.write(makeHHString(ch));
                    }
                    else if (ch <= 0x7FF)
                    {
                        // Two-byte form: 110xxxxx 10xxxxxx
                        int high = (ch >> 6) | 0xC0;
                        int low = (ch & 0x3F) | 0x80;
                        writer.write('%');
                        writer.write(makeHHString(high));
                        writer.write('%');
                        writer.write(makeHHString(low));
                    }
                    else if (Encodings.isHighUTF16Surrogate(ch)
                             && (i + 1) < end
                             && Character.isLowSurrogate(chars[i + 1]))
                    {
                        // Well-formed surrogate pair. The bounds and low-surrogate
                        // checks above ensure that we neither read past the end of
                        // the input (the buffer may hold stale data from an earlier
                        // call, or may be exactly 'end' long) nor swallow an
                        // unrelated following character.

                        // I'm sure this can be done in 3 instructions, but I choose
                        // to try and do it exactly like it is done in the book, at least
                        // until we are sure this is totally clean.  I don't think performance
                        // is a big issue with this particular function, though I could be
                        // wrong.  Also, the stuff below clearly does more masking than
                        // it needs to do.

                        // Clear high 6 bits.
                        int highSurrogate = ((int) ch) & 0x03FF;

                        // Top 4 bits (wwww) + 1
                        // "Note that the value of wwww from the high surrogate bit pattern
                        // is incremented to make the uuuuu bit pattern in the scalar value
                        // so the surrogate pair don't address the BMP."
                        int wwww = ((highSurrogate & 0x03C0) >> 6);
                        int uuuuu = wwww + 1;

                        // next 4 bits
                        int zzzz = (highSurrogate & 0x003C) >> 2;

                        // low 2 bits become the top 2 bits of yyyyyy
                        int yyyyyy = ((highSurrogate & 0x0003) << 4) & 0x30;

                        // Get low surrogate character (consumes one extra input char).
                        ch = chars[++i];

                        // Clear high 6 bits.
                        int lowSurrogate = ((int) ch) & 0x03FF;

                        // put the top 4 bits into the bottom of yyyyyy (byte 3)
                        yyyyyy = yyyyyy | ((lowSurrogate & 0x03C0) >> 6);

                        // bottom 6 bits.
                        int xxxxxx = (lowSurrogate & 0x003F);

                        // Four-byte form: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                        int byte1 = 0xF0 | (uuuuu >> 2); // top 3 bits of uuuuu
                        int byte2 =
                            0x80 | (((uuuuu & 0x03) << 4) & 0x30) | zzzz;
                        int byte3 = 0x80 | yyyyyy;
                        int byte4 = 0x80 | xxxxxx;

                        writer.write('%');
                        writer.write(makeHHString(byte1));
                        writer.write('%');
                        writer.write(makeHHString(byte2));
                        writer.write('%');
                        writer.write(makeHHString(byte3));
                        writer.write('%');
                        writer.write(makeHHString(byte4));
                    }
                    else
                    {
                        // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx
                        // An unpaired surrogate (high or low) also ends up here and
                        // is emitted on its own, leaving the next character to be
                        // processed normally by the loop.
                        int high = (ch >> 12) | 0xE0; // top 4 bits
                        int middle = ((ch & 0x0FC0) >> 6) | 0x80;
                        // middle 6 bits
                        int low = (ch & 0x3F) | 0x80;
                        // bottom 6 bits
                        writer.write('%');
                        writer.write(makeHHString(high));
                        writer.write('%');
                        writer.write(makeHHString(middle));
                        writer.write('%');
                        writer.write(makeHHString(low));
                    }

                }
                else if (escapingNotNeeded(ch))
                {
                    writer.write(ch);
                }
                else if (Encodings.isHighUTF16Surrogate(ch))
                {
                    writeUTF16Surrogate(ch, chars, i, end);
                    i++; // two input characters processed
                         // this increments by one and the for()
                         // loop itself increments by another one.
                }
                else
                {
                    // Decimal numeric character reference.
                    writer.write("&#");
                    writer.write(Integer.toString(ch));
                    writer.write(';');
                }
                // In this character range we have first written out any previously accumulated
                // "clean" characters, then processed the current more complicated character,
                // which may have incremented "i".
                // Now reset the start of the next possible clean run.
                cleanStart = i + 1;
            }
            // Since http://www.ietf.org/rfc/rfc2396.txt refers to the URI grammar as
            // not allowing quotes in the URI proper syntax, nor in the fragment
            // identifier, we believe that it's OK to double escape quotes.
            else if (ch == '"')
            {
                // Note: '%' itself is deliberately not escaped anywhere in this
                // method, so existing %xx sequences pass through untouched.

                if (cleanLength > 0)
                {
                    writer.write(chars, cleanStart, cleanLength);
                    cleanLength = 0;
                }


                // Mike Kay encodes this as &#34;, so he may know something I don't?
                if (doURLEscaping)
                    writer.write("%22");
                else
                    writer.write("&quot;"); // we have to escape this, I guess.

                // We have written out any clean characters, then the escaped quote;
                // now reset the start of the next possible clean run.
                cleanStart = i + 1;
            }
            else if (ch == '&')
            {
                // HTML 4.01 reads, "Authors should use "&amp;" (ASCII decimal 38)
                // instead of "&" to avoid confusion with the beginning of a character
                // reference (entity reference open delimiter).
                if (cleanLength > 0)
                {
                    writer.write(chars, cleanStart, cleanLength);
                    cleanLength = 0;
                }
                writer.write("&amp;");
                cleanStart = i + 1;
            }
            else
            {
                // no processing for this character, just count how
                // many characters in a row that we have that need no processing
                cleanLength++;
            }
        }

        // are there any clean characters at the end of the array
        // that we haven't processed yet?
        if (cleanLength > 1)
        {
            // if the whole string can be written out as-is do so
            // otherwise write out the clean chars at the end of the
            // array
            if (cleanStart == 0)
                writer.write(string);
            else
                writer.write(chars, cleanStart, cleanLength);
        }
        else if (cleanLength == 1)
        {
            // a little optimization for 1 clean character
            // (we could have let the previous if(...) handle them all);
            // a non-zero cleanLength means the last character seen was clean,
            // so "ch" is exactly that character.
            writer.write(ch);
        }
    }