extract_sgml_tokens

in lib/linguist/tokenizer.rb [158:196]


    # Internal: Extract tokens from SGML-style tag text.
    #
    # Emits the tag name (normalized to "<name>" or "</name>"), each
    # attribute name that is followed by "=" (as "name="), and any bare
    # words. Attribute values are skipped rather than emitted. Scanning
    # stops at the first ">" reached outside a tag name or attribute value.
    #
    # data - String to scan (presumably a single tag such as
    #        "<a href='x'>" - confirm against the caller).
    #
    # Examples
    #
    #   extract_sgml_tokens('<a href="" class="foo">')
    #   # => ["<a>", "href=", "class="]
    #
    # Returns an Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Opening or closing tag name; close the bracket so "<a" => "<a>"
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Attribute name followed by "="
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Skip over the attribute value. A quote immediately after the
          # opening quote is an empty value and must be consumed on its own:
          # /[^\\]"/ needs one character before the closing quote, so it
          # would otherwise run on to the next quote and swallow the
          # following attribute.
          if s.scan(/"/)
            s.scan(/"/) || s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.scan(/'/) || s.skip_until(/[^\\]'/)
          else
            # Unquoted value: skip through the next run of word characters
            s.skip_until(/\w+/)
          end

        # Bare word (e.g. a valueless attribute)
        elsif token = s.scan(/\w+/)
          tokens << token

        # End of the tag: ignore whatever follows
        elsif s.scan(/>/)
          s.terminate

        # Anything else (whitespace, punctuation) is dropped one char at a time
        else
          s.getch
        end
      end

      tokens
    end