def __iter__()

in bleach/html5lib_shim.py


    def __iter__(self):
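        # A ParseError token gets held here for one iteration so the token
        # that follows it can be inspected and fixed up before anything is
        # emitted.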
        last_error_token = None

        for token in super().__iter__():
            if last_error_token is not None:
                if (
                    last_error_token["data"] == "invalid-character-in-attribute-name"
                    and token["type"] in TAG_TOKEN_TYPES
                    and token.get("data")
                ):
                    # token["data"] is an html5lib attributeMap
                    # (OrderedDict 3.7+ and dict otherwise)
                    # of attr name to attr value
                    #
                    # Remove attribute names that have ', " or < in them
                    # because those characters are invalid for attribute names.
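                    # e.g. in <a href="/x" y'z="1">, the y'z attribute is
                    # dropped while href is kept.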
                    token["data"] = attributeMap(
                        (attr_name, attr_value)
                        for attr_name, attr_value in token["data"].items()
                        if (
                            '"' not in attr_name
                            and "'" not in attr_name
                            and "<" not in attr_name
                        )
                    )
                    last_error_token = None
                    yield token

                elif (
                    last_error_token["data"] == "expected-closing-tag-but-got-char"
                    and self.parser.tags is not None
                    and token["data"].lower().strip() not in self.parser.tags
                ):
                    # We've got either a malformed tag, a pseudo-tag, or
                    # something that html5lib wants to turn into a malformed
                    # comment, which Bleach clean() will drop, so we
                    # interfere with the token stream to handle it more
                    # correctly.
                    #
                    # If this is an allowed tag, it's merely malformed and
                    # we let the html5lib parser deal with it; we don't
                    # enter this block.
                    #
                    # If this is not an allowed tag, then we convert it to
                    # characters and it'll get escaped in the sanitizer.
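                    # get_tag() returns the raw text seen since the most
                    # recent "<" (bleach's InputStreamWithMemory keeps that
                    # history), so e.g. the pseudo-tag "</3" is re-emitted
                    # as the characters "</3" and escaped downstream.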
                    token["data"] = self.stream.get_tag()
                    token["type"] = TAG_TOKEN_TYPE_CHARACTERS

                    last_error_token = None
                    yield token

                elif token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                    # If the token is a parse error, then let the last_error_token
                    # go, and make token the new last_error_token
                    yield last_error_token
                    last_error_token = token

                else:
                    yield last_error_token
                    yield token
                    last_error_token = None

                continue

            # If the token is a ParseError, we hold on to it so we can get the
            # next token and potentially fix it.
            if token["type"] == TAG_TOKEN_TYPE_PARSEERROR:
                last_error_token = token
                continue

            yield token

        if last_error_token:
            if last_error_token["data"] == "eof-in-tag-name":
                # Handle the case where the text being parsed ends with <
                # followed by a series of characters. It's treated as a tag
                # name that ends abruptly, but we should treat it like
                # character data.
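                # e.g. clean("some text <abc") should yield
                # "some text &lt;abc" rather than dropping "<abc".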
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            elif last_error_token["data"] in (
                "duplicate-attribute",
                "eof-in-attribute-name",
                "eof-in-attribute-value-no-quotes",
                "expected-end-of-tag-but-got-eof",
            ):
                # Handle the case where the text being parsed ends with <
                # followed by characters, then a space, and then:
                #
                # * more characters
                # * the same characters repeated with a space between them
                #   (e.g. "abc abc")
                # * more characters, then a space, then an EOF
                #   (e.g. "abc def ")
                #
                # These cases are treated as a tag name followed by an
                # attribute that ends abruptly, but we should treat them
                # like character data instead.
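                # e.g. for input "text <abc def ", get_tag() returns
                # "<abc def " and the sanitizer escapes it to
                # "text &lt;abc def ".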
                yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}

            else:
                yield last_error_token
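
For context, here is a rough sketch of what these fixups look like at the public API level. This assumes the standard bleach.clean() entry point; treat the expected outputs as approximate, since exact escaping can vary across bleach versions.

    import bleach

    # A quote inside an attribute name triggers
    # "invalid-character-in-attribute-name"; the offending attribute is
    # dropped while the tag and its allowed attributes are kept.
    bleach.clean('<a href="/x" y\'z="1">link</a>', tags=["a"], attributes=["href"])
    # expected: '<a href="/x">link</a>'

    # "</3" is a pseudo-tag ("expected-closing-tag-but-got-char"); it is
    # re-emitted as character data and escaped instead of becoming a
    # malformed comment that clean() would drop.
    bleach.clean("</3")
    # expected: '&lt;/3'

    # Text that ends mid-tag ("eof-in-tag-name") is also kept as
    # characters rather than dropped.
    bleach.clean("some text <abc")
    # expected: 'some text &lt;abc'

Note that in the first call the attributes allow list would drop y'z on its own; the tokenizer fixup still matters because it removes invalid names before they ever reach serialization, independent of the allow list.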