internal/mode/advanced/indexer/blob.go

package indexer

import (
	"bytes"
	"crypto/sha1" //nolint:gosec
	"encoding/json"
	"fmt"
	"io"
	"path"
	"strconv"

	"github.com/go-enry/go-enry/v2"

	"gitlab.com/gitlab-org/gitlab-elasticsearch-indexer/internal/mode/advanced/git"
)

var (
	NoCodeContentMsgHolder = ""
)

const (
	binarySearchLimit = 8 * 1024 // 8 KiB, same as git
	defaultLanguage   = "Text"
)

type Blob struct {
	Type      string `json:"type"`
	ID        string `json:"-"`
	OID       string `json:"oid"`
	RepoID    string `json:"rid"`
	CommitSHA string `json:"commit_sha"`
	Content   string `json:"content"`
	Path      string `json:"path"`

	// Message copied from gitlab-elasticsearch-git:
	//
	// We're duplicating the file_name parameter here because we need another
	// analyzer for it.
	//
	// Ideally this would be done with the copy_to: 'blob.file_name' option,
	// but that does not work in ES v2.3.*. We do it this way so users don't
	// have to install newer versions:
	//
	// https://github.com/elastic/elasticsearch-mapper-attachments/issues/124
	Filename string `json:"file_name"`

	Language string `json:"language"`
}

func (c *Blob) ToMap() (newMap map[string]interface{}, err error) {
	data, err := json.Marshal(c) // Convert to a JSON string
	if err != nil {
		return
	}

	err = json.Unmarshal(data, &newMap) // Convert to a map
	return
}

// GenerateBlobID avoids IDs that exceed the Elasticsearch limit of 512 bytes.
// The path is hashed if the resulting blob ID exceeds 512 bytes. This keeps
// existing blobs in the index usable without regenerating the ID for each
// indexed document.
func GenerateBlobID(parentID int64, path string) string {
	blobID := fmt.Sprintf("%v_%s", parentID, path)
	if len(blobID) > 512 {
		blobID = fmt.Sprintf("%v_%s", parentID, hashStr(path))
	}
	return blobID
}

func GenerateWikiBlobId(parentID int64, path string, isProjectDocument bool) string {
	parentIDWithPrefix := ""
	if isProjectDocument {
		parentIDWithPrefix = fmt.Sprintf("p_%v", parentID)
	} else {
		parentIDWithPrefix = fmt.Sprintf("g_%v", parentID)
	}

	blobID := fmt.Sprintf("%s_%s", parentIDWithPrefix, path)
	if len(blobID) > 512 {
		blobID = fmt.Sprintf("%s_%s", parentIDWithPrefix, hashStr(path))
	}
	return blobID
}

func hashStr(s string) string {
	sBytes := []byte(s)
	return fmt.Sprintf("%x", sha1.Sum(sBytes)) //nolint:gosec
}

func BuildBlob(file *git.File, parentID int64, commitSHA string, blobType string, encoder *Encoder, isProjectDocument bool) (*Blob, error) {
	content := NoCodeContentMsgHolder
	language := defaultLanguage
	filename := file.Path

	// Do not read files that are too large
	if !file.SkipTooLarge {
		reader, err := file.Blob()
		if err != nil {
			return nil, err
		}
		defer reader.Close() //nolint:errcheck

		// FIXME(nick): This doesn't look cheap. Check the RAM & CPU pressure,
		// especially for large blobs.
		b, err := io.ReadAll(reader)
		if err != nil {
			return nil, err
		}

		if !DetectBinary(b) {
			content = encoder.tryEncodeBytes(b)
		}

		language = DetectLanguage(filename, b)
	}

	var id string
	if blobType == "wiki_blob" {
		id = GenerateWikiBlobId(parentID, filename, isProjectDocument)
	} else {
		id = GenerateBlobID(parentID, filename)
	}

	blob := &Blob{
		ID:        id,
		OID:       file.Oid,
		CommitSHA: commitSHA,
		Content:   content,
		Path:      filename,
		Filename:  path.Base(filename),
		Language:  language,
	}

	switch blobType {
	case "blob":
		blob.Type = "blob"
		blob.RepoID = strconv.FormatInt(parentID, 10)
	case "wiki_blob":
		blob.Type = "wiki_blob"
		blob.RepoID = fmt.Sprintf("wiki_%d", parentID)
	}

	return blob, nil
}
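// Usage sketch (editorial addition, not part of the upstream file): the
// parent ID 42 and commit SHA "deadbeef" below are hypothetical, and `file`
// and `encoder` are assumed to come from the surrounding indexer pipeline:
//
//	blob, err := BuildBlob(file, 42, "deadbeef", "blob", encoder, false)
//	if err != nil {
//		return err
//	}
//	doc, err := blob.ToMap() // generic map, ready for the bulk indexer
//
// The resulting document has Type "blob", RepoID "42", and ID "42_<path>"
// (or "42_<sha1-of-path>" once the raw ID would exceed 512 bytes).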
// DetectLanguage returns a string describing the language of the file. This is
// the programming language, rather than the natural language.
//
// If no language is detected, "Text" is returned.
func DetectLanguage(filename string, data []byte) string {
	lang := enry.GetLanguage(filename, data)
	if len(lang) != 0 {
		return lang
	}

	return defaultLanguage
}

// DetectBinary checks whether the passed-in data contains a NUL byte. Only the
// start of large blobs is scanned. This is the same test git performs to
// distinguish text from binary.
func DetectBinary(data []byte) bool {
	searchLimit := binarySearchLimit
	if len(data) < searchLimit {
		searchLimit = len(data)
	}

	return bytes.Contains(data[:searchLimit], []byte{0})
}
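// A minimal sketch of the two detectors (editorial addition; inputs are
// hypothetical, and go-enry recognises common extensions such as ".go"):
//
//	data := []byte("package main\n")
//	DetectBinary(data)              // false: no NUL byte in the scanned prefix
//	DetectLanguage("main.go", data) // "Go"
//	DetectBinary([]byte{0})         // true: NUL byte found
//	DetectLanguage("notes", nil)    // "Text": the defaultLanguage fallback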
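// Hashing-threshold sketch for the ID generators (editorial addition with
// hypothetical paths): an ID over 512 bytes is rebuilt around the
// 40-character hex SHA-1 of the path, so it always fits the Elasticsearch
// limit:
//
//	long := strings.Repeat("a/", 300)       // 600-byte path
//	GenerateBlobID(7, long)                 // "7_" + hashStr(long): 42 bytes
//	GenerateBlobID(7, "README.md")          // "7_README.md": kept verbatim
//	GenerateWikiBlobId(7, "home.md", true)  // "p_7_home.md" (project wiki)
//	GenerateWikiBlobId(7, "home.md", false) // "g_7_home.md" (group wiki)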