in src/parser/html/PhutilHTMLParser.php [7:119]
/**
 * Parse a raw markup string into a tree of DOM nodes.
 *
 * The corpus is first divided into "tag" segments (text enclosed by "<" and
 * ">") and "non-tag" segments (everything else). Each segment is then passed
 * to newTagDOMNode(); any segment for which that call returns null is
 * appended to the node at the current cursor as a literal content node.
 *
 * @param string $corpus Raw text to parse.
 * @return PhutilDOMNode Root node created for this parse.
 */
public function parseDocument($corpus) {
  // The length is needed by both the scanning loop and the slicing pass, so
  // compute it once up front.
  $corpus_length = strlen($corpus);

  // Divide the block into sequences of "tag" and "non-tag" content. Tag
  // content is anything between angle brackets ("<" and ">"). Non-tag
  // content is anything else.
  //
  // Each segment records a half-open byte range: "pos" is the first byte of
  // the segment and "end" is one past its last byte.
  $segment_pos = 0;
  $segments = array();
  $in_tag = false;
  for ($ii = 0; $ii < $corpus_length; $ii++) {
    $c = $corpus[$ii];

    // A ">" only closes a tag if a "<" opened one; otherwise it is ordinary
    // content and falls through to the checks below.
    if ($in_tag && ($c === '>')) {
      $segments[] = array(
        'tag' => true,
        'pos' => $segment_pos,
        'end' => $ii + 1,
      );

      $segment_pos = $ii + 1;
      $in_tag = false;
      continue;
    }

    // When we encounter a "<", we start a new tag whether we're already in
    // a tag or not. We want to parse "<x>1 < 2</x>" as a single tag with
    // the content "1 < 2". Whatever was pending before this "<" (including
    // an earlier, unclosed "<") is emitted as non-tag content.
    if ($c === '<') {
      $segments[] = array(
        'tag' => false,
        'pos' => $segment_pos,
        'end' => $ii,
      );

      $segment_pos = $ii;
      $in_tag = true;
      continue;
    }
  }

  // Add whatever content was left at the end of the string. If we were in
  // a tag but did not find a closing ">", we treat this as normal content.
  $segments[] = array(
    'tag' => false,
    'pos' => $segment_pos,
    'end' => $corpus_length,
  );

  // Slice the marked segments out of the raw corpus so we get a list of
  // "tag" strings and a list of "non-tag" strings.
  $parts = array();
  foreach ($segments as $segment) {
    $tag = $segment['tag'];
    $pos = $segment['pos'];
    $len = $segment['end'] - $pos;

    // If this is a tag, we'll drop the "<" at the beginning and the ">"
    // at the end here.
    if ($tag) {
      $slice_pos = $pos + 1;
      $slice_len = $len - 2;
    } else {
      $slice_pos = $pos;
      $slice_len = $len;
    }

    // Zero-length segments (for example, the one emitted when the corpus
    // starts with "<") and empty tags ("<>") yield an empty string rather
    // than being passed to substr().
    if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
      $content = substr($corpus, $slice_pos, $slice_len);
    } else {
      $content = '';
    }

    // "pos" and "len" describe the whole segment, including any angle
    // brackets; "content" has the brackets removed for tag segments.
    $parts[] = array(
      'tag' => $tag,
      'pos' => $pos,
      'len' => $len,
      'content' => $content,
    );
  }

  $root = new PhutilDOMNode();
  $this->setCursor($root);

  foreach ($parts as $part) {
    // A non-null result means newTagDOMNode() dealt with this part, so
    // there is nothing left to append here.
    $tag = $this->newTagDOMNode($part);
    if ($tag !== null) {
      continue;
    }

    $content = $part['content'];

    // If this part is a tag, restore the angle brackets.
    if ($part['tag']) {
      $content = '<'.$content.'>';
    }

    $node = id(new PhutilDOMNode())
      ->setContent($content)
      ->setRawHead($content);

    $this->getCursor()->appendChild($node);
  }

  // NOTE(review): mergeContent() is defined outside this block; presumably
  // it coalesces the adjacent content nodes appended above -- confirm.
  $root->mergeContent();

  return $root;
}