in lib/linguist/tokenizer.rb [56:120]
# Internal: Scan source text and collect the tokens of interest.
#
# The scanner walks `data` from left to right and, at each position, tries
# the following rules in order (first match wins):
#
#   1. Shebang line      - emits "SHEBANG#!<name>" when extract_shebang
#                          returns a name; otherwise the line is dropped.
#   2. Single-line comment (only at the beginning of a line) - skipped.
#   3. Multi-line comment - skipped up to and including its closing marker.
#   4. Double/single quoted strings - skipped.
#   5. Number literals    - skipped.
#   6. SGML-style tags    - delegated to extract_sgml_tokens.
#   7. Common punctuation - emitted as-is.
#   8. Regular tokens     - emitted as-is.
#   9. Common operators   - emitted as-is.
#
# Any other character is consumed and ignored. Scanning stops once the
# scanner position reaches BYTE_LIMIT.
#
# NOTE(review): the shebang pattern and the regular-token character class
# were restored from regex literals that had been truncated at the `#`
# character; confirm both against the canonical version of this file.
#
# data - String to scan.
#
# Returns an Array of tokens (Strings for the rules visible here; the
# elements yielded by extract_sgml_tokens are presumably Strings as well).
def extract_tokens(data)
  s = StringScanner.new(data)
  tokens = []

  until s.eos?
    # Cap the amount of input inspected.
    break if s.pos >= BYTE_LIMIT

    # Shebang line: `^` only matches at the start of a line, `.+$` stops
    # before the newline.
    if (token = s.scan(/^#!.+$/))
      if (name = extract_shebang(token))
        tokens << "SHEBANG#!#{name}"
      end

    # Single-line comment: discard the rest of the line.
    elsif s.beginning_of_line? && s.scan(START_SINGLE_LINE_COMMENT)
      s.skip_until(/\n|\Z/)

    # Multi-line comment: look up the closing marker paired with the
    # opening one and skip past it. If no closing marker is found,
    # skip_until returns nil and scanning resumes right after the opener.
    elsif (token = s.scan(START_MULTI_LINE_COMMENT))
      close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
      s.skip_until(Regexp.compile(Regexp.escape(close_token)))

    # Double-quoted string: an immediately following quote means an empty
    # string; otherwise skip to the next quote not preceded by a backslash.
    elsif s.scan(/"/)
      if s.peek(1) == "\""
        s.getch
      else
        s.skip_until(/[^\\]"/)
      end

    # Single-quoted string: same handling as above.
    elsif s.scan(/'/)
      if s.peek(1) == "'"
        s.getch
      else
        s.skip_until(/[^\\]'/)
      end

    # Number literal: consumed and ignored.
    elsif s.scan(/(0x)?\d(\d|\.)*/)

    # SGML-style tag: the helper decides which tokens it contributes.
    elsif (token = s.scan(/<[^\s<>][^<>]*>/))
      extract_sgml_tokens(token).each { |t| tokens << t }

    # Common programming punctuation.
    elsif (token = s.scan(/;|\{|\}|\(|\)|\[|\]/))
      tokens << token

    # Regular token: word characters plus `.`, `@`, `#`, `/` and `*`.
    elsif (token = s.scan(/[\w\.@#\/\*]+/))
      tokens << token

    # Common operators.
    elsif (token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/))
      tokens << token

    # Anything else: drop a single character and continue.
    else
      s.getch
    end
  end

  tokens
end