def extract_functions()

in codegen_sources/preprocessing/lang_processors/cpp_processor.py [0:0]
121 lines of code
32 McCabe index (conditional complexity)

    def extract_functions(self, code):
        """Extract function definitions from tokenized C++ code.

        Args:
            code: Either a list of tokens (joined here with single spaces) or
                an already space-joined string of tokens.

        Returns:
            A ``(functions_standalone, functions_class)`` tuple of two lists of
            strings. A definition goes to ``functions_standalone`` when
            ``static`` appears before its opening brace, or when there is no
            ``::`` before its parameter list and its name token is not typed
            ``field_identifier``; every other definition goes to
            ``functions_class``. Both lists are empty when preprocessing or
            tokenization fails, or when there are no tokens.

        The scan is best-effort: any error raised while walking the tokens
        (running past the end of the token list included) ends the scan, and
        whatever was collected up to that point is returned.
        """
        if isinstance(code, list):
            code = " ".join(code)
        else:
            assert isinstance(code, str)

        try:
            code = self.clean_hashtags_function(code)
            code = (
                code.replace("ENDCOM", "\n")
                .replace("▁", "SPACETOKEN")
                .replace(NEW_LINE, "\n")
            )
            tokens, token_types = self.get_tokens_and_types(code)
            tokens = list(zip(tokens, token_types))
        except Exception:
            # Best effort: input that cannot be preprocessed or tokenized
            # yields no functions. KeyboardInterrupt and SystemExit are not
            # Exception subclasses, so they still propagate.
            return [], []

        i = ind_iter(len(tokens))
        try:
            token = tokens[i.i][0]
        except IndexError:
            # Nothing was tokenized, so there is nothing to extract.
            return [], []

        # Patterns and marker set are loop-invariant; build them once.
        doc_tag = re.compile("[<][ ][D][O][C][U][M][E][N][T].*?[>] ")
        # Matches text shaped like `name ( w , w )`: a token followed by a
        # parenthesised, comma-separated list of single words.
        bare_call = re.compile(r"[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]")
        start_markers = {";", "}", "{", NEW_LINE, "\n"}

        def mark_comment(tok, tok_type):
            # Comment tokens are stripped and suffixed with the ENDCOM marker.
            if tok_type == "comment":
                return tok.strip() + " ENDCOM"
            return tok

        functions_standalone = []
        functions_class = []
        while True:
            try:
                # A definition is recognised at a ")" that is directly followed
                # by "{" and something other than "}", or by a "throw" clause
                # whose "{" sits four tokens after the ")".
                if token == ")" and (
                    (tokens[i.i + 1][0] == "{" and tokens[i.i + 2][0] != "}")
                    or (
                        tokens[i.i + 1][0] == "throw"
                        and tokens[i.i + 4][0] == "{"
                        and tokens[i.i + 5][0] != "}"
                    )
                ):
                    # Walk back to the marker that precedes the definition (or
                    # to the start of the token list).
                    while token not in start_markers:
                        try:
                            i.prev()
                        except StopIteration:
                            break
                        token = tokens[i.i][0]
                    # Step onto the first token of the definition.
                    i.next()
                    token, token_type = tokens[i.i]
                    token = mark_comment(token, token_type)
                    function = [token]
                    header_types = [token_type]
                    # Collect the header up to and including the opening brace.
                    while token != "{":
                        i.next()
                        token, token_type = tokens[i.i]
                        token = mark_comment(token, token_type)
                        function.append(token)
                        header_types.append(token_type)

                    # The token right before the first "(" is taken as the
                    # function name; skip constructs where it is not an
                    # identifier.
                    name_type = header_types[function.index("(") - 1]
                    if name_type not in IDENTIFIERS:
                        continue
                    field_identifier = name_type == "field_identifier"

                    # Collect the body until the brace opened by the header is
                    # closed again.
                    depth = 1
                    while not (token == "}" and depth == 0):
                        try:
                            i.next()
                            token, token_type = tokens[i.i]
                            if token == "{":
                                depth += 1
                            elif token == "}":
                                depth -= 1
                            token = mark_comment(token, token_type)
                            function.append(token)
                        except StopIteration:
                            # Unterminated body: keep what was read so far.
                            break

                    is_standalone = "static" in function[: function.index("{")] or (
                        "::" not in function[: function.index("(")]
                        and not field_identifier
                    )

                    text = " ".join(function)
                    text = doc_tag.sub("", text)
                    text = self.clean_hashtags_function(text)
                    text = text.strip()
                    text = text.replace("\n", "ENDCOM").replace("SPACETOKEN", "▁")

                    header = text[: text.index("{")]
                    # NOTE(review): `header` ends before the first "{", so the
                    # startswith("{") test below can never be true and only the
                    # "#" test filters anything. Kept as-is to preserve the
                    # extraction results; confirm the intended filter.
                    if not bare_call.sub("", header).strip().startswith(
                        "{"
                    ) and not text.startswith("#"):
                        if is_standalone:
                            functions_standalone.append(text)
                        else:
                            functions_class.append(text)
                i.next()
                token = tokens[i.i][0]
            except Exception:
                # Reaching the end of the tokens (StopIteration / IndexError)
                # or hitting a malformed construct ends the scan.
                break

        return functions_standalone, functions_class