pkg/linguist/linguist.go (268 lines of code) (raw):

package linguist

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strconv"
	"strings"

	"github.com/Azure/draft/pkg/osutil"
	log "github.com/sirupsen/logrus"
)

var (
	// isIgnored reports whether a path is excluded by the patterns collected from .gitignore and
	// .gitattributes. It is nil until initLinguistAttributes has run.
	isIgnored func(string) bool
	// isDetectedInGitAttributes returns the language assigned to a path by a linguist-language
	// attribute, or "" when none applies. It is nil until initLinguistAttributes has run.
	isDetectedInGitAttributes func(filename string) string
)

// used for displaying results
type (
	// Language is the programming language and the percentage on how sure linguist feels about its
	// decision.
	Language struct {
		Language string  `json:"language"`
		Percent  float64 `json:"percent"`
		// Color represents the color associated with the language in HTML hex notation.
		Color string `json:"color"`
	}
)

// sortableResult is a list of programming languages, sorted based on the likelihood of the
// primary programming language the application was written in.
type sortableResult []*Language

func (s sortableResult) Len() int {
	return len(s)
}

func (s sortableResult) Less(i, j int) bool {
	return s[i].Percent < s[j].Percent
}

func (s sortableResult) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

// languageOverride pairs a .gitattributes path pattern with the language assigned to it by a
// linguist-language attribute.
type languageOverride struct {
	pattern  string
	language string
}

// loadGitignore reads dir/.gitignore, if present, and returns the ignore patterns and the negated
// ("!"-prefixed) exception patterns it contains. Blank lines and "#" comments are skipped, and
// leading/trailing path separators are trimmed from every pattern.
func loadGitignore(dir string) ([]string, []string, error) {
	name := filepath.Join(dir, ".gitignore")
	exists, err := osutil.Exists(name)
	if err != nil {
		return nil, nil, err
	}
	if !exists {
		return nil, nil, nil
	}
	log.Debugln("found .gitignore")

	f, err := os.Open(name)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()

	var ignore, except []string
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		pattern := strings.TrimSpace(scanner.Text())
		// if it's whitespace or a comment
		if pattern == "" || strings.HasPrefix(pattern, "#") {
			continue
		}
		negated := strings.HasPrefix(pattern, "!")
		if negated {
			pattern = pattern[1:]
		}
		pattern = strings.Trim(pattern, string(filepath.Separator))
		if negated {
			except = append(except, pattern)
		} else {
			ignore = append(ignore, pattern)
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, nil, fmt.Errorf("error reading .gitignore: %v", err)
	}
	return ignore, except, nil
}

// loadGitattributes reads dir/.gitattributes, if present. It returns the patterns marked as
// linguist-documentation, linguist-vendored or linguist-generated (unless the attribute ends in
// "false"), and the linguist-language overrides in file order.
func loadGitattributes(dir string) ([]string, []languageOverride, error) {
	name := filepath.Join(dir, ".gitattributes")
	exists, err := osutil.Exists(name)
	if err != nil {
		return nil, nil, err
	}
	if !exists {
		return nil, nil, nil
	}
	log.Debugln("found .gitattributes")

	f, err := os.Open(name)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()

	cutset := string(filepath.Separator)
	if runtime.GOOS == "windows" {
		// on Windows, we also accept / as a path separator, so let's strip those as well
		cutset += "/"
	}

	var (
		ignore     []string
		overrides  []languageOverride
		lineNumber int
	)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		lineNumber++
		line := strings.TrimSpace(scanner.Text())
		// blank lines and comments are legal in .gitattributes; don't report them as invalid
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		words := strings.Fields(line)
		if len(words) != 2 {
			log.Printf("invalid line in .gitattributes at L%d: '%s'\n", lineNumber, line)
			continue
		}
		pattern := strings.Trim(words[0], cutset)
		attribute := words[1]

		switch {
		case strings.HasPrefix(attribute, "linguist-documentation"),
			strings.HasPrefix(attribute, "linguist-vendored"),
			strings.HasPrefix(attribute, "linguist-generated"):
			if !strings.HasSuffix(strings.ToLower(attribute), "false") {
				ignore = append(ignore, pattern)
			}
		case strings.HasPrefix(attribute, "linguist-language"):
			attr := strings.Split(attribute, "=")
			if len(attr) != 2 {
				log.Printf("invalid line in .gitattributes at L%d: '%s'\n", lineNumber, line)
				continue
			}
			overrides = append(overrides, languageOverride{pattern: pattern, language: attr[1]})
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, nil, fmt.Errorf("error reading .gitattributes: %v", err)
	}
	return ignore, overrides, nil
}

// initLinguistAttributes loads the .gitignore and .gitattributes files found directly in dir and
// installs the package-level isIgnored and isDetectedInGitAttributes functions based on them.
// Both functions match patterns with filepath.Match against the path relative to dir.
func initLinguistAttributes(dir string) error {
	ignore, except, err := loadGitignore(dir)
	if err != nil {
		return err
	}
	attrIgnore, overrides, err := loadGitattributes(dir)
	if err != nil {
		return err
	}
	ignore = append(ignore, attrIgnore...)

	isIgnored = func(filename string) bool {
		if len(ignore) == 0 {
			return false
		}
		cleanPath, err := filepath.Rel(dir, filename)
		if err != nil {
			log.Debugf("could not get relative path: %v", err)
			return false
		}
		for _, p := range ignore {
			// a malformed pattern yields an error and simply never matches
			if m, _ := filepath.Match(p, cleanPath); !m {
				continue
			}
			for _, e := range except {
				if m, _ := filepath.Match(e, cleanPath); m {
					return false
				}
			}
			return true
		}
		return false
	}

	isDetectedInGitAttributes = func(filename string) string {
		if len(overrides) == 0 {
			return ""
		}
		cleanPath, err := filepath.Rel(dir, filename)
		if err != nil {
			log.Debugf("could not get relative path: %v", err)
			return ""
		}
		// walk backwards so that, when several patterns match, the last line in the file wins
		for i := len(overrides) - 1; i >= 0; i-- {
			if m, _ := filepath.Match(overrides[i].pattern, cleanPath); m {
				return overrides[i].language
			}
		}
		return ""
	}
	return nil
}

// shoutouts to php
//
// fileGetContents returns at most the first 512 bytes of filename. The returned slice is exactly
// as long as the number of bytes read, so short files are not padded with zero bytes.
func fileGetContents(filename string) ([]byte, error) {
	log.Debugln("reading contents of", filename)

	f, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// read only first 512 bytes of files
	contents := make([]byte, 512)
	n, err := io.ReadFull(f, contents)
	// io.EOF / io.ErrUnexpectedEOF only mean the file is shorter than the buffer
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return nil, err
	}
	return contents[:n], nil
}

// ProcessDir walks through a directory and returns a list of sorted languages within that directory.
//
// Each non-empty regular file contributes its size in bytes to the language determined, in order,
// by .gitattributes, by filename, and finally by content; files for which no language is found are
// counted as "(unknown)". Ignored paths, .git directories and symlinks are skipped. The result is
// sorted by descending percentage, with ties ordered by language name. An error encountered while
// walking the tree or reading a file is returned.
func ProcessDir(dirname string) ([]*Language, error) {
	var (
		langs     = make(map[string]int)
		totalSize int
	)
	exists, err := osutil.Exists(dirname)
	if err != nil {
		return nil, err
	}
	if !exists {
		return nil, os.ErrNotExist
	}
	if err := initLinguistAttributes(dirname); err != nil {
		return nil, err
	}

	// record attributes size bytes to lang
	record := func(lang string, size int) {
		langs[lang] += size
		totalSize += size
	}

	err = filepath.Walk(dirname, func(path string, file os.FileInfo, err error) error {
		if err != nil {
			// file may be nil when Walk reports an error, so it must not be touched
			return err
		}
		size := int(file.Size())
		log.Debugf("with file: %s", path)
		log.Debugln(path, "is", size, "bytes")
		if isIgnored(path) {
			log.Debugln(path, "is ignored, skipping")
			if file.IsDir() {
				return filepath.SkipDir
			}
			return nil
		}
		// handle directories before the size check: a directory may report a size of zero
		if file.IsDir() {
			if file.Name() == ".git" {
				log.Debugln(".git directory, skipping")
				return filepath.SkipDir
			}
			return nil
		}
		if size == 0 {
			log.Debugln(path, "is empty file, skipping")
			return nil
		}
		if (file.Mode() & os.ModeSymlink) != 0 {
			return nil
		}

		ignoreName := ShouldIgnoreFilename(path)
		log.Debugf("%s: filename to be ignored: %s", path, strconv.FormatBool(ignoreName))
		if ignoreName {
			log.Debugf("%s: filename should be ignored, skipping", path)
			return nil
		}
		if byGitAttr := isDetectedInGitAttributes(path); byGitAttr != "" {
			log.Debugln(path, "got result by .gitattributes: ", byGitAttr)
			record(byGitAttr, size)
			return nil
		}
		if byName := LanguageByFilename(path); byName != "" {
			log.Debugln(path, "got result by name: ", byName)
			record(byName, size)
			return nil
		}
		contents, err := fileGetContents(path)
		if err != nil {
			return err
		}
		if ShouldIgnoreContents(contents) {
			log.Debugln(path, ": contents should be ignored, skipping")
			return nil
		}
		hints := LanguageHints(path)
		log.Debugf("%s got language hints: %#v\n", path, hints)
		if byData := LanguageByContents(contents, hints); byData != "" {
			log.Debugln(path, "got result by data: ", byData)
			record(byData, size)
			return nil
		}
		log.Debugln(path, "got no result!!")
		record("(unknown)", size)
		return nil
	})
	if err != nil {
		return nil, err
	}

	// iterate in name order so that languages with equal percentages come out deterministically
	names := make([]string, 0, len(langs))
	for name := range langs {
		names = append(names, name)
	}
	sort.Strings(names)

	results := make([]*Language, 0, len(names))
	for _, name := range names {
		l := &Language{
			Language: name,
			Percent:  (float64(langs[name]) / float64(totalSize)) * 100.0,
			Color:    LanguageColor(name),
		}
		results = append(results, l)
		log.Debugf("language: %s percent: %f color: %s", l.Language, l.Percent, l.Color)
	}
	sort.Stable(sort.Reverse(sortableResult(results)))
	return results, nil
}

// Alias returns the language name for a given known alias.
//
// Occasionally linguist comes up with odd language names, or determines a Java app as a "Maven POM"
// app, which in essence is the same thing for Draft's intent.
//
// The lookup is case-insensitive. lang is modified in place and returned; a nil lang is returned
// unchanged.
func Alias(lang *Language) *Language {
	if lang == nil {
		return nil
	}
	switch strings.ToLower(lang.Language) {
	case "maven pom":
		lang.Language = "Java"
	case "c#":
		lang.Language = "csharp"
	case "go module":
		lang.Language = "gomodule"
	case "typescript":
		lang.Language = "javascript"
	case "java server pages":
		lang.Language = "java-tomcat"
	}
	return lang
}