pkg/linguist/data/generate_classifier.go
// +build ignore

/*
This program trains a naive Bayesian classifier
provided by https://github.com/jbrukh/bayesian
on a set of source code files
provided by https://github.com/github/linguist.

This file is meant to be run by go generate;
refer to generate.go for its intended invocation.
*/
package main

import (
	"container/heap"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"runtime"

	"github.com/Azure/draft/pkg/linguist/tokenizer"
	"github.com/jbrukh/bayesian"
)
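
// sampleFile pairs one sample's language name and file path with the token
// stream produced from its contents (filled in later by the tokenize step).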
type sampleFile struct {
	lang, fp string
	tokens   []string
}

func main() {
	const (
		sourcePath = "./linguist/samples"
		outfile    = "./classifier"
		quiet      = false
	)
	log.SetFlags(0)
	if quiet {
		log.SetOutput(ioutil.Discard)
	}

	// first we only record the paths of the sample files
	// and their corresponding language names into:
	sampleFiles := []*sampleFile{}
	// and collect all of the language names into:
	languages := []string{}
	/*
		github/linguist has the following directory structure:

		...
		├── samples
		│   ├── (name of programming language)
		│   │   ├── (sample file in language)
		│   │   ├── (sample file in language)
		│   │   └── (sample file in language)
		│   ├── (name of another programming language)
		│   │   └── (sample file)
		...

		The hard-coded logic below expects this layout.
	*/
log.Println("Scanning", sourcePath, "...")
srcDir, err := os.Open(sourcePath)
checkErr(err)
subDirs, err := srcDir.Readdir(-1)
checkErr(err)
for _, langDir := range subDirs {
lang := langDir.Name()
if !langDir.IsDir() {
log.Println("unexpected file:", lang)
continue
}
languages = append(languages, lang)
samplePath := sourcePath + "/" + lang
sampleDir, err := os.Open(samplePath)
checkErr(err)
files, err := sampleDir.Readdir(-1)
checkErr(err)
for _, file := range files {
fp := samplePath + "/" + file.Name()
if file.IsDir() {
// Skip subdirectories
continue
}
sampleFiles = append(sampleFiles, &sampleFile{lang, fp, nil})
}
sampleDir.Close()
}
log.Println("Found", len(languages), "languages in", len(sampleFiles), "files")

	// simple progress bar
	progress := 0.0
	total := float64(len(sampleFiles)) * 2.0
	progressBar := func() {
		progress++
		fmt.Printf("Processing files ... %.2f%%\r", progress/total*100.0)
	}
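	// Each sample advances the bar twice: once when it is handed to a worker
	// below and once when its tokens are received, hence the doubled total.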

	// then we concurrently read and tokenize the samples
	sampleChan := make(chan *sampleFile)
	readyChan := make(chan struct{})
	received := 0
	tokenize := func(s *sampleFile) {
		f, err := os.Open(s.fp)
		checkErr(err)
		contents, err := ioutil.ReadAll(f)
		f.Close()
		checkErr(err)
		s.tokens = tokenizer.Tokenize(contents)
		sampleChan <- s
	}

	// dox maps each language name to the pooled tokens of all of its samples
	dox := map[string][]string{}
	for _, lang := range languages {
		dox[lang] = []string{}
	}

	// this goroutine receives each tokenized file and pools its tokens
	// under its language
	go func() {
		for {
			s := <-sampleChan
			dox[s.lang] = append(dox[s.lang], s.tokens...)
			received++
			progressBar()
			if received == len(sampleFiles) {
				close(readyChan)
				return
			}
		}
	}()

	// this balances the workload (implementation at the end of this file)
	requests := getRequestsChan(len(sampleFiles))
	for i := range sampleFiles {
		requests <- &request{
			workFn: tokenize,
			arg:    sampleFiles[i],
		}
		progressBar()
	}
	// wait for the receiver to see every sample, then shut the balancer down
	<-readyChan
	close(requests)
	fmt.Println() // for the progress bar

	// we train the classifier in the arbitrary manner that its API demands
	classes := make([]bayesian.Class, 0, len(languages)) // length 0: avoid including an empty zero-value class
	documents := make(map[bayesian.Class][]string)
	for _, lang := range languages {
		class := bayesian.Class(lang)
		classes = append(classes, class)
		documents[class] = dox[lang]
	}
log.Println("Creating bayesian.Classifier ...")
clsf := bayesian.NewClassifier(classes...)
for cls, dox := range documents {
clsf.Learn(dox, cls)
}
// and write the data to disk
log.Println("Serializing and exporting bayesian.Classifier to", outfile, "...")
checkErr(clsf.WriteToFile("classifier"))
log.Println("Done.")
}
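
// checkErr aborts the generator on the first error; panicking is acceptable
// here because this program only ever runs under go generate.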
func checkErr(err error) {
	if err != nil {
		log.Panicln(err)
	}
}

// What follows is a simple load balancer adapted from Rob Pike's
// "Concurrency is not Parallelism" talk.

// request pairs a unit of work with its argument.
type request struct {
	workFn func(s *sampleFile)
	arg    *sampleFile
}

// worker owns a queue of requests; pending counts requests in flight,
// and index is the worker's current position in the pool's heap.
type worker struct {
	requests       chan *request
	pending, index int
}

// work executes requests forever, reporting itself on done after each one.
func (w *worker) work(done chan *worker) {
	for {
		req := <-w.requests
		req.workFn(req.arg)
		done <- w
	}
}

// pool is a min-heap of workers ordered by pending count.
type pool []*worker

func (p pool) Less(i, j int) bool { return p[i].pending < p[j].pending }
func (p pool) Len() int           { return len(p) }
func (p pool) Swap(i, j int) {
	p[i], p[j] = p[j], p[i]
	p[i].index = i
	p[j].index = j
}
func (p *pool) Push(x interface{}) {
	w := x.(*worker)
	w.index = len(*p)
	*p = append(*p, w)
}
func (p *pool) Pop() interface{} {
	old := *p
	n := len(old)
	x := old[n-1]
	*p = old[0 : n-1]
	return x
}
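
// Note on index bookkeeping: container/heap identifies elements by position,
// so heap.Remove (used in completed below) needs each worker's current slot.
// Swap and Push above therefore keep worker.index in sync with the heap;
// without that, Remove would evict an arbitrary worker.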

// balancer dispatches incoming requests to the least-loaded worker.
type balancer struct {
	workers pool
	done    chan *worker
}

// balance runs until the work channel is closed, interleaving new
// requests with completion notifications from the workers.
func (b *balancer) balance(work chan *request) {
	for {
		select {
		case req, ok := <-work:
			if ok {
				b.dispatch(req)
			} else {
				return
			}
		case w := <-b.done:
			b.completed(w)
		}
	}
}

// dispatch hands req to the worker with the fewest pending requests.
func (b *balancer) dispatch(req *request) {
	w := heap.Pop(&b.workers).(*worker)
	w.requests <- req
	w.pending++
	heap.Push(&b.workers, w)
}

// completed re-sorts a worker within the heap after it finishes a request.
func (b *balancer) completed(w *worker) {
	w.pending--
	heap.Remove(&b.workers, w.index)
	heap.Push(&b.workers, w)
}
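
// getRequestsChan starts the worker pool and the balancer and returns the
// channel on which main submits tokenization requests; closing that channel
// stops the balancer.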
func getRequestsChan(jobs int) chan *request {
	done := make(chan *worker)
	workers := make(pool, runtime.GOMAXPROCS(0)*4) // I don't know how many workers there should be
	for i := 0; i < len(workers); i++ {
		w := &worker{make(chan *request, jobs), 0, i}
		go w.work(done)
		workers[i] = w
	}
	heap.Init(&workers)
	b := &balancer{workers, done}
	requests := make(chan *request)
	go b.balance(requests)
	return requests
}
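
/*
For reference, a minimal sketch of how a consumer might load and query the
generated file with the same jbrukh/bayesian API (the path and the sample
snippet are illustrative, not taken from this repository):

	clsf, err := bayesian.NewClassifierFromFile("./classifier")
	if err != nil {
		log.Fatal(err)
	}
	tokens := tokenizer.Tokenize([]byte("package main\n\nfunc main() {}\n"))
	_, best, _ := clsf.LogScores(tokens)
	fmt.Println("most likely language:", clsf.Classes[best])
*/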