public function parseDocument()

in src/parser/html/PhutilHTMLParser.php [7:119]


  /**
   * Parse a raw markup string into a tree of DOM nodes.
   *
   * The corpus is scanned one byte at a time and divided into "tag" segments
   * (a "<" through the next ">") and "non-tag" segments (everything else).
   * Each segment is then offered to newTagDOMNode(); any segment for which it
   * returns null is appended to the current cursor node as a literal content
   * node. Finally, mergeContent() is invoked on the root.
   *
   * @param string $corpus Raw markup to parse.
   * @return PhutilDOMNode Root node of the parsed document.
   */
  public function parseDocument($corpus) {
    // Divide the block into sequences of "tag" and "non-tag" content. Tag
    // content is anything between angle brackets ("<" and ">"). Non-tag
    // content is anything else.

    // The corpus is never modified, so measure it once up front instead of
    // on every loop iteration.
    $corpus_length = strlen($corpus);

    $segment_pos = 0;
    $segments = array();
    $in_tag = false;

    for ($ii = 0; $ii < $corpus_length; $ii++) {
      $c = $corpus[$ii];

      // A ">" only closes a segment if we have seen an opening "<". A bare
      // ">" in non-tag content is left alone as ordinary text.
      if ($in_tag && ($c === '>')) {
        $segments[] = array(
          'tag' => true,
          'pos' => $segment_pos,
          'end' => $ii + 1,
        );

        $segment_pos = $ii + 1;
        $in_tag = false;
        continue;
      }

      // When we encounter a "<", we start a new tag whether we're already in
      // a tag or not. Everything accumulated so far, including an earlier
      // "<" which was never closed, is emitted as non-tag content. This is
      // what lets "<x>1 < 2</x>" keep "1 < 2" as plain text between the two
      // tags. Note that the emitted segment may be empty.

      if ($c === '<') {
        $segments[] = array(
          'tag' => false,
          'pos' => $segment_pos,
          'end' => $ii,
        );

        $segment_pos = $ii;
        $in_tag = true;
        continue;
      }
    }

    // Add whatever content was left at the end of the string. If we were in
    // a tag but did not find a closing ">", we treat this as normal content.
    $segments[] = array(
      'tag' => false,
      'pos' => $segment_pos,
      'end' => $corpus_length,
    );

    // Slice the marked segments out of the raw corpus so we get a list of
    // "tag" strings and a list of "non-tag" strings.

    $parts = array();
    foreach ($segments as $segment) {
      $tag = $segment['tag'];
      $pos = $segment['pos'];
      $len = $segment['end'] - $pos;

      // If this is a tag, we'll drop the "<" at the beginning and the ">"
      // at the end here.
      if ($tag) {
        $slice_pos = $pos + 1;
        $slice_len = $len - 2;
      } else {
        $slice_pos = $pos;
        $slice_len = $len;
      }

      // Empty segments (for example "<>", or the zero-length run between two
      // adjacent tags) produce an empty string rather than calling substr()
      // with an out-of-range offset or a non-positive length.
      if (($slice_pos < $corpus_length) && ($slice_len > 0)) {
        $content = substr($corpus, $slice_pos, $slice_len);
      } else {
        $content = '';
      }

      $parts[] = array(
        'tag' => $tag,
        'pos' => $pos,
        'len' => $len,
        'content' => $content,
      );
    }

    $root = new PhutilDOMNode();
    $this->setCursor($root);

    foreach ($parts as $part) {
      // A non-null result means newTagDOMNode() dealt with this part, so
      // there is nothing left to do for it here.
      // NOTE(review): presumably it also attaches the node and moves the
      // cursor itself; confirm against newTagDOMNode().
      $tag = $this->newTagDOMNode($part);

      if ($tag !== null) {
        continue;
      }

      $content = $part['content'];

      // If this part is a tag, restore the angle brackets.
      if ($part['tag']) {
        $content = '<'.$content.'>';
      }

      $node = id(new PhutilDOMNode())
        ->setContent($content)
        ->setRawHead($content);

      $this->getCursor()->appendChild($node);
    }

    $root->mergeContent();

    return $root;
  }