in codegen_sources/preprocessing/lang_processors/cpp_processor.py [0:0]
def extract_functions(self, code):
    """Extract functions from tokenized C++ code.

    The token stream is scanned for a ``)`` that is directly followed by a
    non-empty ``{ ... }`` body, or by a ``throw`` token with ``{`` three
    tokens later and a non-empty body. From there the cursor is rewound to
    the previous statement/block boundary and every token up to the brace
    that closes the body is collected.

    Before tokenizing, ``ENDCOM`` and ``NEW_LINE`` markers are turned into
    newlines and ``▁`` into ``SPACETOKEN``; the emitted functions have
    newlines turned back into ``ENDCOM`` and ``SPACETOKEN`` back into ``▁``.

    Args:
        code: Tokenized C++ source, either a single string or a list of
            tokens (which is joined with single spaces).

    Returns:
        A ``(functions_standalone, functions_class)`` tuple of two lists of
        strings. A function is "standalone" when ``static`` appears before
        its opening brace, or when there is no ``::`` before its first
        ``(`` and the token preceding that ``(`` is not typed
        ``"field_identifier"``; every other function goes to the class
        list. Extraction is best-effort: if tokenization fails the result
        is ``([], [])``, and any error during scanning stops the scan and
        returns whatever was collected so far.

    Raises:
        AssertionError: if ``code`` is neither a list nor a string.
    """
    if isinstance(code, list):
        code = " ".join(code)
    else:
        assert isinstance(code, str)

    try:
        code = self.clean_hashtags_function(code)
        code = (
            code.replace("ENDCOM", "\n")
            .replace("▁", "SPACETOKEN")
            .replace(NEW_LINE, "\n")
        )
        tokens, token_types = self.get_tokens_and_types(code)
        tokens = list(zip(tokens, token_types))
    except Exception:
        # Best effort: input that cannot be cleaned/tokenized yields nothing.
        # KeyboardInterrupt / SystemExit are not Exception subclasses and
        # therefore still propagate.
        return [], []

    i = ind_iter(len(tokens))
    functions_standalone = []
    functions_class = []
    try:
        token, token_type = tokens[i.i]
    except Exception:
        # Typically an empty token list.
        return [], []

    # Raw strings: the second pattern contains ``\w``, which is an invalid
    # escape sequence in a normal string literal.
    doc_tag_pattern = r"[<][ ][D][O][C][U][M][E][N][T].*?[>] "
    call_like_pattern = r"[^ ]*[ ][(][ ]\w*([ ][,][ ]\w*)*[ ][)]"
    # Tokens that end the backwards search for the start of a declaration.
    declaration_boundaries = {";", "}", "{", NEW_LINE, "\n"}

    while True:
        try:
            # detect function
            if token == ")" and (
                (tokens[i.i + 1][0] == "{" and tokens[i.i + 2][0] != "}")
                or (
                    tokens[i.i + 1][0] == "throw"
                    and tokens[i.i + 4][0] == "{"
                    and tokens[i.i + 5][0] != "}"
                )
            ):
                # go previous until the start of function
                while token not in declaration_boundaries:
                    try:
                        i.prev()
                    except StopIteration:
                        break
                    token = tokens[i.i][0]
                # Step forward onto the first token of the declaration.
                i.next()
                token, token_type = tokens[i.i]
                if token_type == "comment":
                    token = token.strip() + " ENDCOM"
                function = [token]
                function_types = [token_type]

                # Collect the header, up to and including the opening brace.
                while token != "{":
                    i.next()
                    token, token_type = tokens[i.i]
                    if token_type == "comment":
                        token = token.strip() + " ENDCOM"
                    function.append(token)
                    function_types.append(token_type)

                # The token right before the first "(" must be an identifier,
                # otherwise this is not a function definition. The cursor is
                # left on "{", so scanning resumes inside the body.
                name_type = function_types[function.index("(") - 1]
                if name_type not in IDENTIFIERS:
                    continue
                field_identifier = name_type == "field_identifier"

                # Collect the body until the brace matching the opening one.
                # Running out of tokens keeps the partial function.
                number_indent = 1
                while not (token == "}" and number_indent == 0):
                    try:
                        i.next()
                        token, token_type = tokens[i.i]
                        if token == "{":
                            number_indent += 1
                        elif token == "}":
                            number_indent -= 1
                        if token_type == "comment":
                            token = token.strip() + " ENDCOM"
                        function.append(token)
                    except StopIteration:
                        break

                # Decide the destination on the token list, before it is
                # flattened to a string.
                if "static" in function[0 : function.index("{")] or (
                    "::" not in function[0 : function.index("(")]
                    and not field_identifier
                ):
                    destination = functions_standalone
                else:
                    destination = functions_class

                function = " ".join(function)
                function = re.sub(doc_tag_pattern, "", function)
                function = self.clean_hashtags_function(function)
                function = function.strip()
                function = function.replace("\n", "ENDCOM").replace(
                    "SPACETOKEN", "▁"
                )
                # NOTE(review): the slice below stops before the first "{",
                # so the startswith("{") test looks like it can never match;
                # kept as-is to preserve existing output - confirm intent.
                header_rest = re.sub(
                    call_like_pattern, "", function[: function.index("{")]
                ).strip()
                if not header_rest.startswith("{") and not function.startswith(
                    "#"
                ):
                    destination.append(function)
            i.next()
            token = tokens[i.i][0]
        except Exception:
            # End of the token stream (StopIteration / IndexError) or any
            # unexpected shape: stop and return what has been found.
            break
    return functions_standalone, functions_class