in preprocessing/src/code_tokenizer.py [0:0]
import re

# Relies on module-level helpers defined elsewhere in this file:
# get_cpp_tokens_and_types, STRINGS_AND_COMMENTS_TOKEN_KINDS, TokenKind,
# TOK_NO_SPACE_BEFORE, indent_lines.


def detokenize_cpp(s):
    assert isinstance(s, (str, list))
    if isinstance(s, list):
        s = ' '.join(s)
    # The ▁ character created bugs in the cpp tokenizer, so protect it
    # behind a SPACETOKEN placeholder before re-tokenizing.
    s = s.replace('ENDCOM', '\n').replace('▁', ' SPACETOKEN ')
    try:
        tokens_and_types = get_cpp_tokens_and_types(s)
    except Exception:
        # Unparseable input: return an empty string rather than crash.
        return ''
    new_tokens = []
    i = 0
    while i < len(tokens_and_types):
        token, token_type = tokens_and_types[i]
        if token_type in STRINGS_AND_COMMENTS_TOKEN_KINDS:
            # Restore the whitespace placeholders inside strings and comments.
            new_tokens.append(token.replace('STRNEWLINE', '\n')
                              .replace('TABSYMBOL', '\t')
                              .replace(' ', '')
                              .replace('SPACETOKEN', ' '))
            if token_type == TokenKind.COMMENT:
                new_tokens.append('NEW_LINE')
        elif token == '}':
            # Fold a following ';' or ',' into the closing brace so the
            # indenter later sees '};' and '},' as single units.
            if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] == ';':
                new_tokens += ['CB_COLON', 'NEW_LINE']
                i += 2
                continue
            if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] == ',':
                new_tokens += ['CB_COMA', 'NEW_LINE']
                i += 2
                continue
            new_tokens += ['CB_', 'NEW_LINE']
        elif token == '{':
            new_tokens += ['OB_', 'NEW_LINE']
        elif token == '*/':
            new_tokens += ['*/', 'NEW_LINE']
        elif token == ';':
            new_tokens += [';', 'NEW_LINE']
        else:
            new_tokens.append(token)
        # Glue punctuation that must not be preceded by a space (e.g. ';')
        # onto the previous token.
        if i < len(tokens_and_types) - 1 and tokens_and_types[i + 1][0] in TOK_NO_SPACE_BEFORE:
            next_token = tokens_and_types[i + 1][0]
            new_tokens[-1] += next_token
            if next_token == ';':
                new_tokens.append('NEW_LINE')
            i += 2
            continue
        i += 1
    lines = re.split('NEW_LINE', ' '.join(new_tokens))
    untok_s = indent_lines(lines)
    # Replace the brace placeholders once indentation has been computed.
    untok_s = (untok_s.replace('CB_COLON', '};')
               .replace('CB_COMA', '},')
               .replace('CB_', '}')
               .replace('OB_', '{'))
    return untok_s
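
# Usage sketch (hypothetical input, not from this repo): the function accepts
# either a space-tokenized string or a token list, and the placeholder names
# above (ENDCOM, STRNEWLINE, TABSYMBOL, SPACETOKEN) are the ones this
# pipeline's tokenizer emits. Exact spacing and indent width depend on
# indent_lines and TOK_NO_SPACE_BEFORE, so the output shown is approximate.
#
#     tokens = ['int', 'main', '(', ')', '{', 'return', '0', ';', '}']
#     print(detokenize_cpp(tokens))
#     # roughly:
#     # int main ( ) {
#     #   return 0;
#     # }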