pkg/linguist/util.go (159 lines of code) (raw):

package linguist import ( "bufio" "bytes" "path/filepath" "regexp" "strings" "github.com/ghodss/yaml" log "github.com/sirupsen/logrus" ) var ( vendorRE *regexp.Regexp doxRE *regexp.Regexp extensions = map[string][]string{} filenames = map[string][]string{} interpreters = map[string][]string{} colors = map[string]string{} shebangRE = regexp.MustCompile(`^#!\s*(\S+)(?:\s+(\S+))?.*`) scriptVersionRE = regexp.MustCompile(`((?:\d+\.?)+)`) configurationSuffixes = []string{".yaml", ".yml", ".xml", ".toml"} ) func init() { var regexps []string bytes := []byte(files["data/vendor.yml"]) if err := yaml.Unmarshal(bytes, &regexps); err != nil { log.Fatal(err) return } vendorRE = regexp.MustCompile(strings.Join(regexps, "|")) var moreregex []string bytes = []byte(files["data/documentation.yml"]) if err := yaml.Unmarshal(bytes, &moreregex); err != nil { log.Fatal(err) return } doxRE = regexp.MustCompile(strings.Join(moreregex, "|")) type language struct { Extensions []string `yaml:"extensions,omitempty"` Filenames []string `yaml:"filenames,omitempty"` Interpreters []string `yaml:"interpreters,omitempty"` Color string `yaml:"color,omitempty"` } languages := map[string]*language{} bytes = []byte(files["data/languages.yml"]) if err := yaml.Unmarshal(bytes, &languages); err != nil { log.Fatal(err) } for n, l := range languages { for _, e := range l.Extensions { extensions[e] = append(extensions[e], n) log.Debugf("loading ext: %s for lang %s", e, l) } for _, f := range l.Filenames { filenames[f] = append(filenames[f], n) } for _, i := range l.Interpreters { interpreters[i] = append(interpreters[i], n) } colors[n] = l.Color } } // LanguageColor is a convenience function that returns the color associated // with the language, in HTML Hex notation (e.g. "#123ABC") // from the languages.yml file provided by https://github.com/github/linguist // // Returns the empty string if there is no associated color for the language. func LanguageColor(language string) string { if c, ok := colors[language]; ok { return c } return "" } // LanguageByFilename attempts to determine the language of a source file based solely on // common naming conventions and file extensions // from the languages.yml file provided by https://github.com/github/linguist // // Returns the empty string in ambiguous or unrecognized cases. func LanguageByFilename(filename string) string { if l := filenames[filename]; len(l) == 1 { return l[0] } ext := filepath.Ext(filename) log.Debugf("filename extension lookup '%s'", ext) if ext != "" { log.Debugf("extension lookup: %v", extensions[ext]) if l := extensions[ext]; len(l) == 1 { return l[0] } } return "" } // LanguageHints attempts to detect all possible languages of a source file based solely on // common naming conventions and file extensions // from the languages.yml file provided by https://github.com/github/linguist // // Intended to be used with LanguageByContents. // // May return an empty slice. func LanguageHints(filename string) (hints []string) { if l, ok := filenames[filename]; ok { hints = append(hints, l...) } if ext := filepath.Ext(filename); ext != "" { if l, ok := extensions[ext]; ok { hints = append(hints, l...) } } return hints } // LanguageByContents attempts to detect the language of a source file based on its // contents and a slice of hints to the possible answer. // // Obtain hints with LanguageHints() // // Returns the empty string a language could not be determined. func LanguageByContents(contents []byte, hints []string) string { interpreter := detectInterpreter(contents) if interpreter != "" { if l := interpreters[interpreter]; len(l) == 1 { return l[0] } } return Analyse(contents, hints) } func detectInterpreter(contents []byte) string { scanner := bufio.NewScanner(bytes.NewReader(contents)) scanner.Scan() line := scanner.Text() m := shebangRE.FindStringSubmatch(line) if m == nil || len(m) != 3 { return "" } base := filepath.Base(m[1]) if base == "env" && m[2] != "" { base = m[2] } // Strip suffixed version number. return scriptVersionRE.ReplaceAllString(base, "") } // ShouldIgnoreFilename checks if filename should not be passed to LanguageByFilename. // // (this simply calls IsVendored and IsDocumentation) func ShouldIgnoreFilename(filename string) bool { vendored := IsVendored(filename) documentation := IsDocumentation(filename) isConfiguration := IsConfiguration(filename) return vendored || documentation || isConfiguration // return IsVendored(filename) || IsDocumentation(filename) } // ShouldIgnoreContents checks if contents should not be passed to LangugeByContents. // // (this simply calls IsBinary) func ShouldIgnoreContents(contents []byte) bool { return IsBinary(contents) } // IsVendored checks if path contains a filename commonly belonging to configuration files. func IsVendored(path string) bool { return vendorRE.MatchString(path) } // IsDocumentation checks if path contains a filename commonly belonging to documentation. func IsDocumentation(path string) bool { return doxRE.MatchString(path) } func IsConfiguration(path string) bool { isConfig := false for _, suffix := range configurationSuffixes { isConfig = isConfig || strings.HasSuffix(path, suffix) } return isConfig } // IsBinary checks contents for known character escape codes which // frequently show up in binary files but rarely (if ever) in text. // // Use this check before using LanguageFromContents to reduce likelihood // of passing binary data into it which can cause inaccurate results. func IsBinary(contents []byte) bool { // NOTE(tso): preliminary testing on this method of checking for binary // contents were promising, having fed a document consisting of all // utf-8 codepoints from 0000 to FFFF with satisfactory results. Thanks // to robpike.io/cmd/unicode: // ``` // unicode -c $(seq 0 65535 | xargs printf "%04x ") | tr -d '\n' > unicode_test // ``` // // However, the intentional presence of character escape codes to throw // this function off is entirely possible, as is, potentially, a binary // file consisting entirely of the 4 exceptions to the rule for the first // 512 bytes. It is also possible that more character escape codes need // to be added. // // Further analysis and real world testing of this is required. for n, b := range contents { if n >= 512 { break } if b < 32 { switch b { case 0: fallthrough case 9: fallthrough case 10: fallthrough case 13: continue default: return true } } } return false }