protected function parseChangeset()

in src/parser/ArcanistDiffParser.php [875:1043]


  /**
   * Parse one or more consecutive unified-diff hunks (blocks introduced by a
   * "@@ -a,b +c,d @@" header), starting at the current line, and attach them
   * to a change.
   *
   * Any hunks already present on the change are dropped first. After each
   * hunk, parsing continues for as long as the next line begins another
   * "@@ " header. When binary detection is enabled and a hunk's text is
   * judged to be binary, the change is marked as a binary file and that hunk
   * is not attached.
   *
   * @param ArcanistDiffChange Change to populate with the parsed hunks.
   * @return mixed Result of parsePropertyHunk() when an svn1.7-style
   *               "Property changes on:" block is found where a hunk header
   *               was expected; otherwise no value is returned.
   */
  protected function parseChangeset(ArcanistDiffChange $change) {
    // If a diff includes two sets of changes to the same file, let the
    // second one win. In particular, this occurs when adding subdirectories
    // in Subversion that contain files: the file text will be present in
    // both the directory diff and the file diff. See T5555. Dropping the
    // hunks lets whichever one shows up later win instead of showing changes
    // twice.
    $change->dropHunks();

    do {
      $hunk = new ArcanistDiffHunk();
      $line = $this->getLineTrimmed();
      $real = array();

      // In the case where only one line is changed, the length is omitted.
      // The final group is for git, which appends a guess at the function
      // context to the diff.
      $matches = null;
      $ok = preg_match(
        '/^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(?: .*?)?$/U',
        $line,
        $matches);

      if (!$ok) {
        // It's possible we hit the style of an svn1.7 property change.
        // This is a 4-line Index block, followed by an empty line, followed
        // by a "Property changes on:" section similar to svn1.6.
        if ($line == '') {
          $line = $this->nextNonemptyLine();
          $ok = preg_match('/^Property changes on:/', $line);
          if (!$ok) {
            $this->didFailParse(pht('Confused by empty line'));
          }
          // Step the cursor forward before handing off; the line it returns
          // is not needed here.
          $this->nextLine();
          return $this->parsePropertyHunk($change);
        }
        $this->didFailParse(pht(
          "Expected hunk header '%s'.",
          '@@ -NN,NN +NN,NN @@'));
      }

      $hunk->setOldOffset($matches[1]);
      $hunk->setNewOffset($matches[3]);

      // Cover for the cases where length wasn't present (implying one line).
      // An unmatched trailing capture group is absent from $matches entirely,
      // so default to '' rather than null: passing null to strlen() is
      // deprecated as of PHP 8.1.
      $old_len = idx($matches, 2, '');
      if (!strlen($old_len)) {
        $old_len = 1;
      }
      $new_len = idx($matches, 4, '');
      if (!strlen($new_len)) {
        $new_len = 1;
      }

      $hunk->setOldLength($old_len);
      $hunk->setNewLength($new_len);

      // From here on, $old_len and $new_len count DOWN the old-side and
      // new-side lines still expected in this hunk.
      $add = 0;
      $del = 0;

      // Set when the loop stops on a line that belongs to whatever follows
      // this hunk, so that line must not be skipped afterwards.
      $hit_next_hunk = false;
      while ((($line = $this->nextLine()) !== null)) {
        if (strlen(rtrim($line, "\r\n"))) {
          $char = $line[0];
        } else {
          // Normally, we do not encounter empty lines in diffs, because
          // unchanged lines have an initial space. However, in Git, with
          // the option `diff.suppress-blank-empty` set, unchanged blank lines
          // emit as completely empty. If we encounter a completely empty line,
          // treat it as a ' ' (i.e., unchanged empty line) line.
          $char = ' ';
        }
        switch ($char) {
          case '\\':
            if (!preg_match('@\\ No newline at end of file@', $line)) {
              $this->didFailParse(
                pht("Expected '\ No newline at end of file'."));
            }
            $real[] = $line;
            if ($new_len) {
              // New-side lines are still outstanding, so the marker is
              // attributed to the old side and parsing of this hunk goes on.
              $hunk->setIsMissingOldNewline(true);
            } else {
              // No new-side lines remain: attribute the marker to the new
              // side and finish the hunk.
              $hunk->setIsMissingNewNewline(true);
              break 2;
            }
            break;
          case '+':
            ++$add;
            --$new_len;
            $real[] = $line;
            break;
          case '-':
            if (!$old_len) {
              // In this case, we've hit "---" from a new file. So don't
              // advance the line cursor.
              $hit_next_hunk = true;
              break 2;
            }
            ++$del;
            --$old_len;
            $real[] = $line;
            break;
          case ' ':
            if (!$old_len && !$new_len) {
              // Both sides are fully consumed; this context-looking line is
              // not part of the hunk.
              break 2;
            }
            --$old_len;
            --$new_len;
            $real[] = $line;
            break;
          default:
            // We hit something, likely another hunk.
            $hit_next_hunk = true;
            break 2;
        }
      }

      // Both counters must have reached exactly zero; anything else means
      // the body did not match the lengths declared in the header.
      if ($old_len || $new_len) {
        $this->didFailParse(pht('Found the wrong number of hunk lines.'));
      }

      $corpus = implode('', $real);

      $is_binary = false;
      if ($this->detectBinaryFiles) {
        $is_binary = !phutil_is_utf8($corpus);
        $try_encoding = $this->tryEncoding;

        if ($is_binary && $try_encoding) {
          // Not valid UTF-8, but a fallback encoding is configured: only
          // treat the hunk as binary if the heuristic agrees, otherwise
          // convert it to UTF-8.
          $is_binary = ArcanistDiffUtils::isHeuristicBinaryFile($corpus);
          if (!$is_binary) {
            $corpus = phutil_utf8_convert($corpus, 'UTF-8', $try_encoding);
            if (!phutil_is_utf8($corpus)) {
              throw new Exception(
                pht(
                  "Failed to convert a hunk from '%s' to UTF-8. ".
                  "Check that the specified encoding is correct.",
                  $try_encoding));
            }
          }
        }

      }

      if ($is_binary) {
        // SVN happily treats binary files which aren't marked with the right
        // mime type as text files. Detect that junk here and mark the file
        // binary. We'll catch stuff with unicode too, but that's verboten
        // anyway. If there are too many false positives with this we might
        // need to make it threshold-triggered instead of triggering on any
        // unprintable byte.
        $change->setFileType(ArcanistDiffChangeType::FILE_BINARY);
      } else {
        $hunk->setCorpus($corpus);
        $hunk->setAddLines($add);
        $hunk->setDelLines($del);
        $change->addHunk($hunk);
      }

      if (!$hit_next_hunk) {
        $line = $this->nextNonemptyLine();
      }

      // $line may be null once the input is exhausted (nextLine() is compared
      // against null above); guard so null is never passed to preg_match(),
      // which is deprecated as of PHP 8.1.
    } while ($line !== null && preg_match('/^@@ /', $line));
  }