func main()

in pkg/linguist/data/generate_classifier.go [31:171]


func main() {
	const (
		sourcePath = "./linguist/samples"
		outfile    = "./classifier"
		quiet      = false
	)

	log.SetFlags(0)
	if quiet {
		log.SetOutput(ioutil.Discard)
	}

	// first we only read the paths of the sample files
	// and their corresponding language names into:
	sampleFiles := []*sampleFile{}
	// and store all the language names into:
	languages := []string{}

	/*
	   github/linguist has the directory structure:

	   ...
	   ├── samples
	   │   ├── (name of programming language)
	   │   │   ├── (sample file in language)
	   │   │   ├── (sample file in language)
	   │   │   └── (sample file in language)
	   │   ├── (name of another programming language)
	   │   │   └── (sample file)
	   ...

	   the following hard-coded logic expects this layout
	*/

	log.Println("Scanning", sourcePath, "...")
	srcDir, err := os.Open(sourcePath)
	checkErr(err)

	subDirs, err := srcDir.Readdir(-1)
	checkErr(err)
	srcDir.Close()

	for _, langDir := range subDirs {
		lang := langDir.Name()
		if !langDir.IsDir() {
			log.Println("unexpected file:", lang)
			continue
		}

		languages = append(languages, lang)

		samplePath := sourcePath + "/" + lang
		sampleDir, err := os.Open(samplePath)
		checkErr(err)
		files, err := sampleDir.Readdir(-1)
		checkErr(err)
		for _, file := range files {
			fp := samplePath + "/" + file.Name()
			if file.IsDir() {
				// Skip subdirectories
				continue
			}
			sampleFiles = append(sampleFiles, &sampleFile{lang, fp, nil})
		}
		sampleDir.Close()
	}
	log.Println("Found", len(languages), "languages in", len(sampleFiles), "files")

	// simple progress bar: every file is counted twice, once when it is
	// enqueued and once when its tokens arrive; the counter is atomic
	// because both main and the collector goroutine update it
	var progress uint64
	total := float64(len(sampleFiles)) * 2.0
	progressBar := func() {
		p := atomic.AddUint64(&progress, 1)
		fmt.Printf("Processing files ... %.2f%%\r", float64(p)/total*100.0)
	}

	// then we concurrently read and tokenize the samples
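	// sampleChan carries tokenized samples to the collector goroutine;
	// readyChan is closed once every sample has been received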
	sampleChan := make(chan *sampleFile)
	readyChan := make(chan struct{})
	received := 0
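	// tokenize reads one sample file, tokenizes its contents,
	// and hands the result to the collector via sampleChan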
	tokenize := func(s *sampleFile) {
		f, err := os.Open(s.fp)
		checkErr(err)
		contents, err := ioutil.ReadAll(f)
		f.Close()
		checkErr(err)
		s.tokens = tokenizer.Tokenize(contents)
		sampleChan <- s
	}
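	// dox accumulates, per language, the tokens from all of its samples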
	dox := map[string][]string{}
	for _, lang := range languages {
		dox[lang] = []string{}
	}
	// this goroutine receives the tokenized files, appends their tokens
	// under their language, and signals readyChan when all have arrived
	go func() {
		for {
			s := <-sampleChan
			dox[s.lang] = append(dox[s.lang], s.tokens...)
			received++
			progressBar()
			if received == len(sampleFiles) {
				close(readyChan)
				return
			}
		}
	}()

	// this balances the workload (implementation at end of file)
	requests := getRequestsChan(len(sampleFiles))
	for i := range sampleFiles {
		requests <- &request{
			workFn: tokenize,
			arg:    sampleFiles[i],
		}
		progressBar()
	}

	// once that's done
	<-readyChan
	close(requests)
	fmt.Println() // for the progress bar

	// we train the classifier in the arbitrary manner that its API demands
	classes := make([]bayesian.Class, 0, len(languages))
	documents := make(map[bayesian.Class][]string)
	for _, lang := range languages {
		var class = bayesian.Class(lang)
		classes = append(classes, class)
		documents[class] = dox[lang]
	}
	log.Println("Creating bayesian.Classifier ...")
	clsf := bayesian.NewClassifier(classes...)
	for class, docs := range documents {
		clsf.Learn(docs, class)
	}

	// and write the data to disk
	log.Println("Serializing and exporting bayesian.Classifier to", outfile, "...")
	checkErr(clsf.WriteToFile(outfile))

	log.Println("Done.")
}
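
The helpers used above — checkErr, the sampleFile type, and the request / getRequestsChan pair — are defined elsewhere in generate_classifier.go, outside the [31:171] excerpt. What follows is a minimal sketch of compatible definitions, not the file's actual code, assuming a fixed pool of runtime.NumCPU() workers draining a buffered channel:

package main // same package as the excerpt above

import (
	"log"
	"runtime"
)

// sampleFile pairs a sample's language and path with its tokens,
// which are filled in once the file has been read and tokenized.
type sampleFile struct {
	lang   string   // language name, taken from the directory name
	fp     string   // path to the sample file on disk
	tokens []string // output of tokenizer.Tokenize, nil until processed
}

// request bundles a unit of work with its argument.
type request struct {
	workFn func(*sampleFile)
	arg    *sampleFile
}

// getRequestsChan starts one worker goroutine per CPU; each worker
// drains the returned channel, calling workFn on arg for every request,
// and exits when the channel is closed.
func getRequestsChan(buffer int) chan *request {
	requests := make(chan *request, buffer)
	for i := 0; i < runtime.NumCPU(); i++ {
		go func() {
			for r := range requests {
				r.workFn(r.arg)
			}
		}()
	}
	return requests
}

// checkErr aborts on the first error; fine for a one-shot generator.
func checkErr(err error) {
	if err != nil {
		log.Fatalln(err)
	}
}

Under these assumptions, main's call getRequestsChan(len(sampleFiles)) makes the request channel large enough to absorb every send without blocking, while the unbuffered sampleChan serializes results into the single collector goroutine. If the classifier comes from a package such as jbrukh/bayesian (an assumption, inferred from the API shown), the file written by WriteToFile can later be restored with bayesian.NewClassifierFromFile.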