internal static string GetXmlNodeInnerContents()

in src/Deprecated/Engine/Engine/Utilities.cs [327:454]
118 lines of code
18 McCabe index (conditional complexity)

        internal static string GetXmlNodeInnerContents(XmlNode node)
        {
            // XmlNode.InnerXml gives back a string that consists of the set of characters
            // in between the opening and closing elements of the XML node, without doing any
            // unescaping.  Any "strange" character sequences (like "<![CDATA[...]]>" will remain 
            // exactly so and will not be translated or interpreted.  The only modification that
            // .InnerXml will do is that it will normalize any Xml contained within.  This means
            // normalizing whitespace between XML attributes and quote characters that surround XML 
            // attributes.  If PreserveWhitespace is false, then it will also normalize whitespace
            // between elements.
            //
            // XmlNode.InnerText strips out any Xml contained within, and then unescapes the rest
            // of the text.  So if the remaining text contains certain character sequences such as
            // "&amp;" or "<![CDATA[...]]>", these will be translated into their equivalent representations.
            //
            // It's hard to explain, but much easier to demonstrate with examples:
            //
            // Original XML                     XmlNode.InnerText               XmlNode.InnerXml
            // ===========================      ==============================  ======================================
            //
            // <a><![CDATA[whatever]]></a>      whatever                        <![CDATA[whatever]]>
            //
            // <a>123<MyNode/>456</a>           123456                          123<MyNode />456
            //
            // <a>123456</a>                    123456                          123456
            //
            // <a>123<MyNode b='&lt;'/>456</a>  123456                          123<MyNode b="&lt;" />456
            //
            // <a>123&amp;456</a>               123&456                         123&amp;456

            // So the trick for MSBuild when interpreting a property value is to know which one to
            // use ... InnerXml or InnerText.  There are two basic scenarios we care about.
            //
            // 1.)  The first scenario is that the user is trying to create a property whose
            //      contents are actually XML.  That is to say that the contents may be written 
            //      to a XML file, or may be passed in as a string to XmlDocument.LoadXml.
            //      In this case, we would want to use XmlNode.InnerXml, because we DO NOT want 
            //      character sequences to be unescaped.  If we did unescape them, then whatever 
            //      XML parser tried to read in the stream as XML later on would totally barf.
            //
            // 2.)  The second scenario is the the user is trying to create a property that
            //      is just intended to be treated as a string.  That string may be very large
            //      and could contain all sorts of whitespace, carriage returns, special characters,
            //      etc.  But in the end, it's just a big string.  In this case, whatever 
            //      task is actually processing this string ... it's not going to know anything
            //      about character sequences such as &amp; and &lt;.  These character sequences
            //      are specific to XML markup.  So, here we want to use XmlNode.InnerText so that 
            //      the character sequences get unescaped into their actual character before
            //      the string is passed to the task (or wherever else the property is used).
            //      Of course, if the string value of the property needs to contain characters
            //      like <, >, &, etc., then the user must XML escape these characters otherwise
            //      the XML parser reading the project file will croak.  Or if the user doesn't
            //      want to escape every instance of these characters, he can surround the whole
            //      thing with a CDATA tag.  Again, if he does this, we don't want the task to
            //      receive the C, D, A, T, A as part of the string ... this should be stripped off.
            //      Again, using XmlNode.InnerText takes care of this.
            //
            // 2b.) A variation of the second scenario is that the user is trying to create a property
            //      that is just intended to be a string, but wants to comment out part of the string.
            //      For example, it's a semicolon separated list that's going ultimately to end up in a list.
            //      eg. (DDB #56841)
            //
            //     <BuildDirectories>
            //        <!--
            //              env\TestTools\tshell\pkg;
            //        -->
            //                ndp\fx\src\VSIP\FrameWork;
            //                ndp\fx\src\xmlTools;
            //                ddsuites\src\vs\xmlTools;
            //     </BuildDirectories>
            //
            //      In this case, we want to treat the string as text, so that we don't retrieve the comment.
            //      We only want to retrieve the comment if there's some other XML in there. The
            //      mere presence of an XML comment shouldn't make us think the value is XML.
            //
            // Given these two scenarios, how do we know whether the user intended to treat
            // a property value as XML or text?  We use a simple heuristic which is that if
            // XmlNode.InnerXml contains any "<" characters, then there pretty much has to be
            // XML in there, so we'll just use XmlNode.InnerXml.  If there are no "<" characters that aren't merely comments,
            // then we assume it's to be treated as text and we use XmlNode.InnerText.  Also, if
            // it looks like the whole thing is one big CDATA block, then we also use XmlNode.InnerText.

            // XmlNode.InnerXml is much more expensive than InnerText. Don't use it for trivial cases.
            // (single child node with a trivial value or no child nodes)
            if (!node.HasChildNodes)
            {
                return string.Empty;
            }

            if (node.ChildNodes.Count == 1 && (node.FirstChild.NodeType == XmlNodeType.Text || node.FirstChild.NodeType == XmlNodeType.CDATA))
            {
                return node.InnerText;
            }

            string innerXml = node.InnerXml;

            // If there is no markup under the XML node (detected by the presence
            // of a '<' sign
            int firstLessThan = innerXml.IndexOf('<');
            if (firstLessThan == -1)
            {
                // return the inner text so it gets properly unescaped
                return node.InnerText;
            }

            bool containsNoTagsOtherThanComments = ContainsNoTagsOtherThanComments(innerXml, firstLessThan);

            // ... or if the only XML is comments,
            if (containsNoTagsOtherThanComments)
            {
                // return the inner text so the comments are stripped
                // (this is how one might comment out part of a list in a property value)
                return node.InnerText;
            }

            // ...or it looks like the whole thing is a big CDATA tag ...
            bool startsWithCData = (innerXml.IndexOf("<![CDATA[", StringComparison.Ordinal) == 0);

            if (startsWithCData)
            {
                // return the inner text so it gets properly extracted from the CDATA
                return node.InnerText;
            }

            // otherwise, it looks like genuine XML; return the inner XML so that
            // tags and comments are preserved and any XML escaping is preserved
            return innerXml;
        }