pkg/linguist/analyse.go (50 lines of code) (raw):
package linguist
import (
"bytes"
"log"
"math"
"github.com/Azure/draft/pkg/linguist/data"
"github.com/Azure/draft/pkg/linguist/tokenizer"
"github.com/jbrukh/bayesian"
)
// Package-level cache for the lazily loaded classifier; both variables are
// written and read by getClassifier.
var (
	// classifier holds the decoded Bayesian model once it has been loaded.
	classifier *bayesian.Classifier
	// classifierInitialized reports whether classifier has been loaded
	// successfully. The zero value (false) means "not loaded yet".
	classifierInitialized bool
)
// getClassifier returns the shared bayesian.Classifier, which has been trained
// on programming language samples from github.com/github/linguist by the
// generator.
//
// The classifier is decoded from the embedded "classifier" asset on the first
// call and cached in package-level state; later calls return the cached value.
// Any failure to read the asset or decode the classifier panics via
// log.Panicln, leaving the initialized flag unset.
//
// No locking is performed around the cached state.
//
// See also cmd/generate-classifier
func getClassifier() *bayesian.Classifier {
	// NOTE(tso): this could probably go into an init() function instead,
	// but this lazy loading approach works, and it's conceivable that the
	// analyse() function might not be invoked in an actual runtime anyway.
	if classifierInitialized {
		return classifier
	}

	raw, err := data.Asset("classifier")
	if err != nil {
		log.Panicln(err)
	}

	// Assign straight to the package-level variable; the flag below is only
	// set once decoding has succeeded.
	classifier, err = bayesian.NewClassifierFromReader(bytes.NewReader(raw))
	if err != nil {
		log.Panicln(err)
	}

	classifierInitialized = true
	return classifier
}
// Analyse returns the name of a programming language, or the empty string if
// one could not be determined.
//
// The contents are tokenized and scored with Naive Bayesian Classification.
// When hints is empty, the class at the index reported by the classifier is
// returned. Otherwise only classes whose names appear in hints are considered
// and the one with the highest log score wins (on a tie, the class that comes
// later in the classifier's class list); if no class matches any hint the
// result is the empty string.
//
// It is recommended to use LanguageByContents() instead of calling this
// function directly.
//
// Obtain hints from LanguageHints().
//
// NOTE(tso): May yield inaccurate results
func Analyse(contents []byte, hints []string) (language string) {
	tokens := tokenizer.Tokenize(contents)
	model := getClassifier()

	// The third return value of LogScores is intentionally discarded.
	logScores, top, _ := model.LogScores(tokens)

	if len(hints) == 0 {
		return string(model.Classes[top])
	}

	// Turn the hints into a set for constant-time membership checks.
	allowed := make(map[string]struct{}, len(hints))
	for _, h := range hints {
		allowed[h] = struct{}{}
	}

	best, winner := math.Inf(-1), ""
	for i, s := range logScores {
		name := string(model.Classes[i])
		// >= (rather than >) so that a hinted class is still chosen when its
		// score equals the -Inf starting value, and later ties replace
		// earlier ones.
		if _, hinted := allowed[name]; hinted && s >= best {
			best, winner = s, name
		}
	}
	return winner
}