extract_tokens

in lib/linguist/tokenizer.rb [56:120]


    # Internal: Extract generic tokens from data.
    #
    # Walks the input with a StringScanner. Shebang lines, SGML-style tags,
    # punctuation, word-like tokens and common operators are emitted;
    # comments, quoted strings and number literals are consumed without
    # emitting anything. Scanning stops once the scanner position reaches
    # BYTE_LIMIT.
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        # Shebang line: emit a single synthetic token naming the interpreter.
        # NOTE(review): this literal was truncated at the '#' in the reviewed
        # source; restored as the shebang pattern - confirm against history.
        if (token = s.scan(/^#!.+$/))
          if (name = extract_shebang(token))
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment: only recognised at the start of a line; the
        # rest of the line is discarded.
        elsif s.beginning_of_line? && s.scan(START_SINGLE_LINE_COMMENT)
          s.skip_until(/\n|\Z/)

        # Multiline comment: look up the closing delimiter paired with the
        # opener that matched and skip past it.
        elsif (token = s.scan(START_MULTI_LINE_COMMENT))
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))

        # Skip single or double quoted strings. An immediately repeated quote
        # is an empty string; otherwise skip to the next quote that is not
        # preceded by a backslash.
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals (checked before regular tokens, which would
        # otherwise swallow them via \w).
        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif (token = s.scan(/<[^\s<>][^<>]*>/))
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif (token = s.scan(/;|\{|\}|\(|\)|\[|\]/))
          tokens << token

        # Regular token
        # NOTE(review): this character class was truncated at the '#' in the
        # reviewed source; the tail (# / *) is a reconstruction - confirm.
        elsif (token = s.scan(/[\w\.@#\/\*]+/))
          tokens << token

        # Common operators
        elsif (token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/))
          tokens << token

        # Anything else: drop one character and keep scanning.
        else
          s.getch
        end
      end

      tokens
    end