internal/gitaly/diff/diff.go (404 lines of code) (raw):
package diff
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"path/filepath"
"regexp"
"strconv"
"gitlab.com/gitlab-org/gitaly/v16/internal/git"
"gitlab.com/gitlab-org/gitaly/v16/internal/git/gitcmd"
"gitlab.com/gitlab-org/gitaly/v16/internal/helper"
)
// Diff represents a single parsed diff entry
type Diff struct {
Binary bool
OverflowMarker bool
Collapsed bool
TooLarge bool
CollectAllPaths bool
Status byte
lineCount int
byteCount int
FromID string
ToID string
OldMode int32
NewMode int32
PatchSize int32
FromPath []byte
ToPath []byte
Patch []byte
}
// Reset clears all fields of d in a way that lets the underlying memory
// allocations of the []byte fields be reused.
func (d *Diff) Reset() {
*d = Diff{
FromPath: d.FromPath[:0],
ToPath: d.ToPath[:0],
Patch: d.Patch[:0],
}
}
// ClearPatch clears only patch field of Diff d
func (d *Diff) ClearPatch() {
// Clear Patch data, but preserve underlying memory allocation
// to reduce allocation/gc cycles of the []bytes field and improve performance
d.Patch = d.Patch[:0]
}
// Parser holds necessary state for parsing a diff stream
type Parser struct {
objectHash git.ObjectHash
limits Limits
patchReader *bufio.Reader
rawLines [][]byte
currentDiff Diff
nextPatchFromPath []byte
filesProcessed int
linesProcessed int
bytesProcessed int
finished bool
stopPatchCollection bool
err error
}
// Limits holds the limits at which either parsing stops or patches are collapsed
type Limits struct {
// If true, Max{Files,Lines,Bytes} will cause parsing to stop if any of these limits is reached
EnforceLimits bool
// If true, SafeMax{Files,Lines,Bytes} will cause diffs to collapse (i.e. patches are emptied) after any of these limits reached
CollapseDiffs bool
// If true, all diffs are parsed, processed, and returned but info outside of path may be empty
CollectAllPaths bool
// Number of maximum files to parse. The file parsed after this limit is reached is marked as the overflow.
MaxFiles int
// Number of diffs lines to parse (including lines preceded with --- or +++).
// The file in which this limit is reached is discarded and marked as the overflow.
MaxLines int
// Number of bytes to parse (including lines preceded with --- or +++).
// The file in which this limit is reached is discarded and marked as the overflow.
MaxBytes int
// Number of files to parse, after which all subsequent files are collapsed.
SafeMaxFiles int
// Number of lines to parse (including lines preceded with --- or +++), after which all subsequent files are collapsed.
SafeMaxLines int
// Number of bytes to parse (including lines preceded with --- or +++), after which all subsequent files are collapsed.
SafeMaxBytes int
// Number of bytes a single patch can have. Patches surpassing this limit are pruned / nullified.
MaxPatchBytes int
// Max number of bytes for specific file type. This overrides MaxPatchBytes
MaxPatchBytesForFileExtension map[string]int
// PatchLimitsOnly uses only the MaxPatchBytes and MaxPatchBytesForFileExtension limits when
// enabled. Cumulative size limits, such as Max{Files,Lines,Bytes}, are ignored.
PatchLimitsOnly bool
}
const (
// maxFilesUpperBound controls how much MaxFiles limit can reach
maxFilesUpperBound = 5000
// maxLinesUpperBound controls how much MaxLines limit can reach
maxLinesUpperBound = 250000
// maxBytesUpperBound controls how much MaxBytes limit can reach
maxBytesUpperBound = 5000 * 5120 // 24MB
// safeMaxFilesUpperBound controls how much SafeMaxBytes limit can reach
safeMaxFilesUpperBound = 500
// safeMaxLinesUpperBound controls how much SafeMaxLines limit can reach
safeMaxLinesUpperBound = 25000
// safeMaxBytesUpperBound controls how much SafeMaxBytes limit can reach
safeMaxBytesUpperBound = 500 * 5120 // 2.4MB
// maxPatchBytesUpperBound controls how much MaxPatchBytes limit can reach
maxPatchBytesUpperBound = 512000 // 500KB
)
func init() {
if maxBytesUpperBound > gitcmd.BigFileThresholdMB*1024*1024 {
panic("big file threshold smaller than maximum upper bytes bound")
}
}
var (
rawSHA1LineRegexp = regexp.MustCompile(`(?m)^:(\d+) (\d+) ([[:xdigit:]]{40}) ([[:xdigit:]]{40}) ([ADTUXMRC]\d*)\t(.*?)(?:\t(.*?))?$`)
rawSHA256LineRegexp = regexp.MustCompile(`(?m)^:(\d+) (\d+) ([[:xdigit:]]{64}) ([[:xdigit:]]{64}) ([ADTUXMRC]\d*)\t(.*?)(?:\t(.*?))?$`)
diffHeaderRegexp = regexp.MustCompile(`(?m)^diff --git "?a/(.*?)"? "?b/(.*?)"?$`)
)
// NewDiffParser returns a new Parser
func NewDiffParser(objectHash git.ObjectHash, src io.Reader, limits Limits) *Parser {
limits.enforceUpperBound()
parser := &Parser{}
reader := bufio.NewReader(src)
parser.objectHash = objectHash
parser.cacheRawLines(reader)
parser.patchReader = reader
parser.limits = limits
return parser
}
// Parse parses a single diff. It returns true if successful, false if it finished
// parsing all diffs or when it encounters an error, in which case use Parser.Err()
// to get the error.
func (parser *Parser) Parse() bool {
if parser.finished || len(parser.rawLines) == 0 {
// In case we didn't consume the whole output due to reaching limitations
_, _ = io.Copy(io.Discard, parser.patchReader)
return false
}
if err := parser.initializeCurrentDiff(); err != nil {
return false
}
// The patch being parsed may not correspond to the current expected diff. In such cases, this
// indicates the current diff has no patch output and is empty. Since we can only go forward
// when reading from a bufio.Reader, we save the matched FromPath we parsed until we reach its
// counterpart in raw diff.
if parser.nextPatchFromPath == nil {
path, err := readDiffHeaderFromPath(parser.patchReader)
if err != nil {
parser.err = err
return false
}
parser.nextPatchFromPath = path
}
if !bytes.Equal(parser.nextPatchFromPath, parser.currentDiff.FromPath) {
// The current diff has an empty patch
return true
}
// We are consuming this patch so it is no longer 'next'
parser.nextPatchFromPath = nil
if err := readNextDiff(parser.patchReader, &parser.currentDiff, parser.stopPatchCollection); err != nil {
parser.err = err
return false
}
// Update parser line and byte counts.
parser.linesProcessed += parser.currentDiff.lineCount
parser.bytesProcessed += parser.currentDiff.byteCount
// PatchSize is needed for clients to determine if patch exceeded the soft or hard limit when patch was pruned.
parser.currentDiff.PatchSize = int32(len(parser.currentDiff.Patch))
if parser.limits.CollapseDiffs && parser.isOverSafeLimits() && parser.currentDiff.lineCount > 0 {
parser.prunePatch()
parser.currentDiff.Collapsed = true
if parser.limits.CollectAllPaths {
parser.currentDiff.CollectAllPaths = true
}
}
if parser.limits.EnforceLimits {
// Apply single-file size limit
maxPatchBytesExceeded := len(parser.currentDiff.Patch) >= parser.maxPatchBytesForCurrentFile()
if maxPatchBytesExceeded {
parser.prunePatch()
parser.currentDiff.TooLarge = true
}
maxFilesExceeded := parser.filesProcessed - parser.limits.MaxFiles
maxLinesExceeded := parser.linesProcessed - parser.limits.MaxLines
maxBytesExceeded := parser.bytesProcessed - parser.limits.MaxBytes
maxLimitsExceeded := maxLinesExceeded >= 0 || maxBytesExceeded >= 0 || maxFilesExceeded > 0
if maxLimitsExceeded && !parser.limits.PatchLimitsOnly {
if parser.limits.CollectAllPaths {
parser.currentDiff.CollectAllPaths = true
// Do allow parser to finish, but since limits are hit
// do not allow it to continue collecting patches
// only info about patches
parser.currentDiff.ClearPatch()
parser.stopPatchCollection = true
} else {
parser.finished = true
parser.currentDiff.Reset()
}
parser.currentDiff.OverflowMarker = true
}
}
return true
}
func readNextDiff(reader *bufio.Reader, diff *Diff, skipPatch bool) error {
for currentPatchDone := false; !currentPatchDone || reader.Buffered() > 0; {
// We cannot use bufio.Scanner because the line may be very long.
line, err := reader.Peek(10)
if errors.Is(err, io.EOF) {
// If the last diff has an empty patch (e.g. --ignore-space-change),
// patchReader will read EOF, but Parser not finished.
currentPatchDone = true
} else if err != nil {
return fmt.Errorf("peek diff line: %w", err)
}
switch {
case bytes.HasPrefix(line, []byte("diff --git")):
// If the next diff header is detected, the current patch is complete.
return nil
case helper.ByteSliceHasAnyPrefix(line, "---", "+++") && len(diff.Patch) == 0:
// File headers occur before the first hunk header and therefore before any patch data
// has been recorded.
if err := discardLine(reader); err != nil {
return err
}
case bytes.HasPrefix(line, []byte("@@")):
// Hunk headers do not count towards limits.
if err := consumeChunkLine(reader, diff, skipPatch, false); err != nil {
return err
}
case bytes.HasPrefix(line, []byte("Binary")):
diff.Binary = true
fallthrough
case bytes.HasPrefix(line, []byte("~\n")),
helper.ByteSliceHasAnyPrefix(line, "-", "+", " ", "\\"):
if err := consumeChunkLine(reader, diff, skipPatch, true); err != nil {
return err
}
default:
if err := discardLine(reader); err != nil {
return err
}
}
}
return nil
}
// enforceUpperBound ensures every limit value is within its corresponding upperbound
func (limit *Limits) enforceUpperBound() {
limit.MaxFiles = min(limit.MaxFiles, maxFilesUpperBound)
limit.MaxLines = min(limit.MaxLines, maxLinesUpperBound)
limit.MaxBytes = min(limit.MaxBytes, maxBytesUpperBound)
limit.SafeMaxFiles = min(limit.SafeMaxFiles, safeMaxFilesUpperBound)
limit.SafeMaxLines = min(limit.SafeMaxLines, safeMaxLinesUpperBound)
limit.SafeMaxBytes = min(limit.SafeMaxBytes, safeMaxBytesUpperBound)
limit.MaxPatchBytes = min(limit.MaxPatchBytes, maxPatchBytesUpperBound)
}
// prunePatch nullifies the current diff patch and reduce lines and bytes processed
// according to it.
func (parser *Parser) prunePatch() {
parser.linesProcessed -= parser.currentDiff.lineCount
parser.bytesProcessed -= len(parser.currentDiff.Patch)
parser.currentDiff.ClearPatch()
}
// Diff returns a successfully parsed diff. It should be called only when Parser.Parse()
// returns true. The return value is valid only until the next call to Parser.Parse().
func (parser *Parser) Diff() *Diff {
return &parser.currentDiff
}
// Err returns the error encountered (if any) when parsing the diff stream. It should be called only when Parser.Parse()
// returns false.
func (parser *Parser) Err() error {
return parser.err
}
func (parser *Parser) isOverSafeLimits() bool {
return parser.filesProcessed > parser.limits.SafeMaxFiles ||
parser.linesProcessed > parser.limits.SafeMaxLines ||
parser.bytesProcessed > parser.limits.SafeMaxBytes
}
func (parser *Parser) maxPatchBytesForCurrentFile() int {
return maxPatchBytesFor(parser.limits.MaxPatchBytes, parser.limits.MaxPatchBytesForFileExtension, parser.Diff().ToPath)
}
func maxPatchBytesFor(maxPatchBytes int, maxBytesForExtension map[string]int, toPath []byte) int {
if len(maxBytesForExtension) > 0 {
fileName := filepath.Base(string(toPath))
key := filepath.Ext(fileName)
if key == "" {
key = fileName
}
if limitForExtension, ok := maxBytesForExtension[key]; ok {
return limitForExtension
}
}
return maxPatchBytes
}
func min(x, y int) int {
if x < y {
return x
}
return y
}
func (parser *Parser) cacheRawLines(reader *bufio.Reader) {
for {
line, err := reader.ReadBytes('\n')
if err != nil {
if !errors.Is(err, io.EOF) {
parser.err = err
parser.finished = true
}
// According to the documentation of bufio.Reader.ReadBytes we cannot get
// both an error and a line ending in '\n'. So the current line cannot be
// valid and we want to discard it.
return
}
if !bytes.HasPrefix(line, []byte(":")) {
// Discard the current line and stop reading: we expect a blank line
// between raw lines and patch data.
return
}
parser.rawLines = append(parser.rawLines, line)
}
}
func (parser *Parser) nextRawLine() []byte {
if len(parser.rawLines) == 0 {
return nil
}
line := parser.rawLines[0]
parser.rawLines = parser.rawLines[1:]
return line
}
func (parser *Parser) initializeCurrentDiff() error {
parser.currentDiff.Reset()
// Raw and regular diff formats don't necessarily have the same files, since some flags (e.g. --ignore-space-change)
// can suppress certain kinds of diffs from showing in regular format, but raw format will always have all the files.
if err := parseRawLine(parser.objectHash, parser.nextRawLine(), &parser.currentDiff); err != nil {
parser.err = err
return err
}
if parser.currentDiff.Status == 'T' {
parser.handleTypeChangeDiff()
}
parser.filesProcessed++
return nil
}
func readDiffHeaderFromPath(reader *bufio.Reader) ([]byte, error) {
line, err := reader.ReadBytes('\n')
if errors.Is(err, io.EOF) {
return nil, nil
} else if err != nil {
return nil, fmt.Errorf("read diff header line: %w", err)
}
if matches := diffHeaderRegexp.FindSubmatch(line); len(matches) > 0 {
return unescape(matches[1]), nil
}
return nil, fmt.Errorf("diff header regexp mismatch")
}
func (parser *Parser) handleTypeChangeDiff() {
// GitLab wants to display the type change in the current diff as a removal followed by an addition.
// To make this happen we add a new raw line, which will become the addition on the next iteration of the parser.
// We change the current diff in-place so that it becomes a deletion.
newRawLine := fmt.Sprintf(
":%o %o %s %s A\t%s\n",
0,
parser.currentDiff.NewMode,
parser.objectHash.ZeroOID,
parser.currentDiff.ToID,
parser.currentDiff.FromPath,
)
parser.currentDiff.NewMode = 0
parser.currentDiff.ToID = parser.objectHash.ZeroOID.String()
parser.rawLines = append([][]byte{[]byte(newRawLine)}, parser.rawLines...)
}
func parseRawLine(objectHash git.ObjectHash, line []byte, diff *Diff) error {
var regexp *regexp.Regexp
switch objectHash.Format {
case "sha1":
regexp = rawSHA1LineRegexp
case "sha256":
regexp = rawSHA256LineRegexp
default:
return fmt.Errorf("cannot parse raw diff line with unknown object format %q", objectHash.Format)
}
matches := regexp.FindSubmatch(line)
if len(matches) == 0 {
return fmt.Errorf("raw line regexp mismatch")
}
mode, err := strconv.ParseInt(string(matches[1]), 8, 0)
if err != nil {
return fmt.Errorf("raw old mode: %w", err)
}
diff.OldMode = int32(mode)
mode, err = strconv.ParseInt(string(matches[2]), 8, 0)
if err != nil {
return fmt.Errorf("raw new mode: %w", err)
}
diff.NewMode = int32(mode)
diff.FromID = string(matches[3])
diff.ToID = string(matches[4])
diff.Status = matches[5][0]
diff.FromPath = unescape(helper.UnquoteBytes(matches[6]))
if diff.Status == 'C' || diff.Status == 'R' {
diff.ToPath = unescape(helper.UnquoteBytes(matches[7]))
} else {
diff.ToPath = diff.FromPath
}
return nil
}
func consumeChunkLine(reader *bufio.Reader, diff *Diff, skipPatch, updateStats bool) error {
// The code that follows would be much simpler if we used
// bufio.Reader.ReadBytes, but that allocates an intermediate copy of
// each line which adds up to a lot of allocations. By using ReadSlice we
// can copy bytes into currentDiff.Patch without intermediate
// allocations.
var byteCount int
for done := false; !done; {
line, err := reader.ReadSlice('\n')
byteCount += len(line)
switch {
case errors.Is(err, bufio.ErrBufferFull):
// long line: keep reading
case err != nil && !errors.Is(err, io.EOF):
return fmt.Errorf("read chunk line: %w", err)
default:
done = true
}
if !skipPatch {
diff.Patch = append(diff.Patch, line...)
}
}
if updateStats {
diff.byteCount += byteCount
diff.lineCount++
}
return nil
}
func discardLine(reader *bufio.Reader) error {
_, err := reader.ReadBytes('\n')
if err != nil && !errors.Is(err, io.EOF) {
return fmt.Errorf("read line: %w", err)
}
return nil
}
// unescape unescapes the escape codes used by 'git diff'
func unescape(s []byte) []byte {
var unescaped []byte
for i := 0; i < len(s); i++ {
if s[i] == '\\' {
if i+3 < len(s) && helper.IsNumber(s[i+1:i+4]) {
octalByte, err := strconv.ParseUint(string(s[i+1:i+4]), 8, 8)
if err == nil {
unescaped = append(unescaped, byte(octalByte))
i += 3
continue
}
}
if i+1 < len(s) {
var unescapedByte byte
switch s[i+1] {
case '"', '\\', '/', '\'':
unescapedByte = s[i+1]
case 'b':
unescapedByte = '\b'
case 'f':
unescapedByte = '\f'
case 'n':
unescapedByte = '\n'
case 'r':
unescapedByte = '\r'
case 't':
unescapedByte = '\t'
default:
unescaped = append(unescaped, '\\')
unescapedByte = s[i+1]
}
unescaped = append(unescaped, unescapedByte)
i++
continue
}
}
unescaped = append(unescaped, s[i])
}
return unescaped
}