in src/Deprecated/Engine/Engine/Utilities.cs [327:454]
internal static string GetXmlNodeInnerContents(XmlNode node)
{
    // Returns the contents of the given XML node as a string, choosing between
    // XmlNode.InnerXml (raw, still-escaped markup) and XmlNode.InnerText (unescaped text)
    // depending on whether the contents look like XML or like plain text.
    //
    // XmlNode.InnerXml gives back a string that consists of the set of characters
    // in between the opening and closing elements of the XML node, without doing any
    // unescaping.  Any "strange" character sequences (like "<![CDATA[...]]>") will remain
    // exactly so and will not be translated or interpreted.  The only modification that
    // .InnerXml will do is that it will normalize any Xml contained within.  This means
    // normalizing whitespace between XML attributes and quote characters that surround XML
    // attributes.  If PreserveWhitespace is false, then it will also normalize whitespace
    // between elements.
    //
    // XmlNode.InnerText strips out any Xml contained within, and then unescapes the rest
    // of the text.  So if the remaining text contains certain character sequences such as
    // "&amp;" or "<![CDATA[...]]>", these will be translated into their equivalent
    // representations.
    //
    // It's hard to explain, but much easier to demonstrate with examples:
    //
    //      Original XML                     XmlNode.InnerText    XmlNode.InnerXml
    //      ===============================  ===================  ==========================
    //
    //      <a><![CDATA[whatever]]></a>      whatever             <![CDATA[whatever]]>
    //
    //      <a>123<MyNode/>456</a>           123456               123<MyNode />456
    //
    //      <a>123456</a>                    123456               123456
    //
    //      <a>123<MyNode b='&lt;'/>456</a>  123456               123<MyNode b="&lt;" />456
    //
    //      <a>123&amp;456</a>               123&456              123&amp;456
    //
    // So the trick for MSBuild when interpreting a property value is to know which one to
    // use ... InnerXml or InnerText.  There are two basic scenarios we care about.
    //
    //      1.) The first scenario is that the user is trying to create a property whose
    //          contents are actually XML.  That is to say that the contents may be written
    //          to a XML file, or may be passed in as a string to XmlDocument.LoadXml.
    //          In this case, we would want to use XmlNode.InnerXml, because we DO NOT want
    //          character sequences to be unescaped.  If we did unescape them, then whatever
    //          XML parser tried to read in the stream as XML later on would totally barf.
    //
    //      2.) The second scenario is that the user is trying to create a property that
    //          is just intended to be treated as a string.  That string may be very large
    //          and could contain all sorts of whitespace, carriage returns, special characters,
    //          etc.  But in the end, it's just a big string.  In this case, whatever
    //          task is actually processing this string ... it's not going to know anything
    //          about character sequences such as &amp; and &lt;.  These character sequences
    //          are specific to XML markup.  So, here we want to use XmlNode.InnerText so that
    //          the character sequences get unescaped into their actual character before
    //          the string is passed to the task (or wherever else the property is used).
    //          Of course, if the string value of the property needs to contain characters
    //          like <, >, &, etc., then the user must XML escape these characters otherwise
    //          the XML parser reading the project file will croak.  Or if the user doesn't
    //          want to escape every instance of these characters, he can surround the whole
    //          thing with a CDATA tag.  Again, if he does this, we don't want the task to
    //          receive the C, D, A, T, A as part of the string ... this should be stripped off.
    //          Again, using XmlNode.InnerText takes care of this.
    //
    //      2b.) A variation of the second scenario is that the user is trying to create a property
    //          that is just intended to be a string, but wants to comment out part of the string.
    //          For example, it's a semicolon separated list that's going ultimately to end up in a list.
    //          eg. (DDB #56841)
    //
    //          <BuildDirectories>
    //              <!--
    //                  env\TestTools\tshell\pkg;
    //              -->
    //                  ndp\fx\src\VSIP\FrameWork;
    //                  ndp\fx\src\xmlTools;
    //                  ddsuites\src\vs\xmlTools;
    //          </BuildDirectories>
    //
    //          In this case, we want to treat the string as text, so that we don't retrieve the comment.
    //          We only want to retrieve the comment if there's some other XML in there.  The
    //          mere presence of an XML comment shouldn't make us think the value is XML.
    //
    // Given these two scenarios, how do we know whether the user intended to treat
    // a property value as XML or text?  We use a simple heuristic which is that if
    // XmlNode.InnerXml contains any "<" characters that aren't merely comments, then there
    // pretty much has to be XML in there, so we'll just use XmlNode.InnerXml.  If there are
    // no such "<" characters, then we assume it's to be treated as text and we use
    // XmlNode.InnerText.  Also, if the inner XML begins with a CDATA section (i.e. it looks
    // like the whole thing is one big CDATA block; only the start is checked), then we also
    // use XmlNode.InnerText.

    // XmlNode.InnerXml is much more expensive than InnerText.  Don't use it for trivial cases
    // (no child nodes, or a single child node with a trivial value).
    if (!node.HasChildNodes)
    {
        return string.Empty;
    }

    XmlNode firstChild = node.FirstChild;
    if (node.ChildNodes.Count == 1 &&
        (firstChild.NodeType == XmlNodeType.Text || firstChild.NodeType == XmlNodeType.CDATA))
    {
        return node.InnerText;
    }

    string innerXml = node.InnerXml;

    // If there is no markup under the XML node (detected by the absence of a '<' sign) ...
    int firstLessThan = innerXml.IndexOf('<');
    if (firstLessThan == -1)
    {
        // return the inner text so it gets properly unescaped
        return node.InnerText;
    }

    // ... or if the only XML is comments,
    if (ContainsNoTagsOtherThanComments(innerXml, firstLessThan))
    {
        // return the inner text so the comments are stripped
        // (this is how one might comment out part of a list in a property value)
        return node.InnerText;
    }

    // ... or it looks like the whole thing is a big CDATA tag ...
    // StartsWith only compares the prefix; IndexOf(...) == 0 would needlessly scan the
    // entire string whenever the prefix is absent.
    if (innerXml.StartsWith("<![CDATA[", StringComparison.Ordinal))
    {
        // return the inner text so it gets properly extracted from the CDATA
        return node.InnerText;
    }

    // otherwise, it looks like genuine XML; return the inner XML so that
    // tags and comments are preserved and any XML escaping is preserved
    return innerXml;
}