in lib/linguist/samples.rb [66:96]
def self.data
db = {}
db['extnames'] = {}
db['filenames'] = {}
each do |sample|
language_name = sample[:language]
if sample[:extname]
db['extnames'][language_name] ||= []
if !db['extnames'][language_name].include?(sample[:extname])
db['extnames'][language_name] << sample[:extname]
db['extnames'][language_name].sort!
end
end
if sample[:filename]
db['filenames'][language_name] ||= []
db['filenames'][language_name] << sample[:filename]
db['filenames'][language_name].sort!
end
data = File.read(sample[:path])
Classifier.train!(db, language_name, data)
end
db['md5'] = Linguist::MD5.hexdigest(db)
db
end