in pkg/linguist/data/generate_classifier.go [31:171]
func main() {
	const (
		sourcePath = "./linguist/samples"
		outfile    = "./classifier"
		quiet      = false
	)
	log.SetFlags(0)
	if quiet {
		log.SetOutput(ioutil.Discard)
	}
	// first we read the paths of all the sample files
	// and their corresponding language names into:
	sampleFiles := []*sampleFile{}
	// and collect all the language names into:
	languages := []string{}
	/*
		github/linguist has the directory structure:
		...
		├── samples
		│   ├── (name of programming language)
		│   │   ├── (sample file in language)
		│   │   ├── (sample file in language)
		│   │   └── (sample file in language)
		│   ├── (name of another programming language)
		│   │   └── (sample file)
		...
		the following hard-coded logic expects this layout
	*/
log.Println("Scanning", sourcePath, "...")
srcDir, err := os.Open(sourcePath)
checkErr(err)
subDirs, err := srcDir.Readdir(-1)
checkErr(err)
for _, langDir := range subDirs {
lang := langDir.Name()
if !langDir.IsDir() {
log.Println("unexpected file:", lang)
continue
}
languages = append(languages, lang)
samplePath := sourcePath + "/" + lang
sampleDir, err := os.Open(samplePath)
checkErr(err)
files, err := sampleDir.Readdir(-1)
checkErr(err)
for _, file := range files {
fp := samplePath + "/" + file.Name()
if file.IsDir() {
// Skip subdirectories
continue
}
sampleFiles = append(sampleFiles, &sampleFile{lang, fp, nil})
}
sampleDir.Close()
}
log.Println("Found", len(languages), "languages in", len(sampleFiles), "files")
// simple progress bar
progress := 0.0
total := float64(len(sampleFiles)) * 2.0
progressBar := func() {
progress++
fmt.Printf("Processing files ... %.2f%%\r", progress/total*100.0)
}
	// then we concurrently read and tokenize the samples
	sampleChan := make(chan *sampleFile)
	readyChan := make(chan struct{})
	received := 0
	tokenize := func(s *sampleFile) {
		f, err := os.Open(s.fp)
		checkErr(err)
		contents, err := ioutil.ReadAll(f)
		f.Close()
		checkErr(err)
		s.tokens = tokenizer.Tokenize(contents)
		sampleChan <- s
	}
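	// tokenize is invoked by the workload balancer below (whose
	// implementation is at the end of the file); results funnel through
	// sampleChan so the maps below are only touched by one goroutine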
	// dox maps each language name to the accumulated tokens of its samples
	dox := map[string][]string{}
	for _, lang := range languages {
		dox[lang] = []string{}
	}
	// this receives the processed files and appends each file's tokens
	// to its language's document
	go func() {
		for {
			s := <-sampleChan
			dox[s.lang] = append(dox[s.lang], s.tokens...)
			received++
			progressBar()
			if received == len(sampleFiles) {
				close(readyChan)
				return
			}
		}
	}()
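	// only the goroutine above mutates dox and received, so no locking
	// is needed; closing readyChan signals completion to main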
	// this balances the workload (implementation at end of file)
	requests := getRequestsChan(len(sampleFiles))
	for i := range sampleFiles {
		requests <- &request{
			workFn: tokenize,
			arg:    sampleFiles[i],
		}
		progressBar()
	}
	// wait until every sample has been received, then shut down the workers
	<-readyChan
	close(requests)
	fmt.Println() // for the progress bar
	// we train the classifier in the arbitrary manner that its API demands;
	// note the slice must start empty, or a spurious zero-value class
	// would be registered at index 0
	classes := make([]bayesian.Class, 0, len(languages))
	documents := make(map[bayesian.Class][]string)
	for _, lang := range languages {
		class := bayesian.Class(lang)
		classes = append(classes, class)
		documents[class] = dox[lang]
	}
	log.Println("Creating bayesian.Classifier ...")
	clsf := bayesian.NewClassifier(classes...)
	for cls, docs := range documents {
		clsf.Learn(docs, cls)
	}
	// and write the data to disk
	log.Println("Serializing and exporting bayesian.Classifier to", outfile, "...")
	checkErr(clsf.WriteToFile(outfile))
	log.Println("Done.")
}
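// A minimal sketch of reading the generated file back, assuming the bayesian
// package here is github.com/jbrukh/bayesian (whose NewClassifier, Learn, and
// WriteToFile match the usage above); contents is a hypothetical []byte:
//
//	clsf, err := bayesian.NewClassifierFromFile("./classifier")
//	checkErr(err)
//	_, likely, _ := clsf.LogScores(tokenizer.Tokenize(contents))
//	fmt.Println("detected language:", clsf.Classes[likely])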