# preprocessing/src/code_tokenizer.py
import tokenize
from io import BytesIO


def tokenize_python(s, keep_comments=False):
    """Tokenize Python source `s` into a list of string tokens.

    Logical newlines, indents and dedents become the markers 'NEW_LINE',
    'INDENT' and 'DEDENT'. Comments and docstrings are dropped unless
    `keep_comments` is True. Returns an empty list if tokenization fails.
    `process_string`, `PYTHON_CHAR2TOKEN` and `PYTHON_TOKEN2CHAR` are
    defined elsewhere in this module.
    """
    try:
        assert isinstance(s, str)
        s = s.replace('\r', '')  # strip carriage returns
        tokens = []

        try:
            iterator = tokenize.tokenize(BytesIO(s.encode('utf-8')).readline)
        except SyntaxError as excep:
            raise SyntaxError(excep)

        removed_docstr = 0
        while True:
            try:
                toktype, tok, _, _, line = next(iterator)
            except (tokenize.TokenError, IndentationError, SyntaxError, UnicodeDecodeError):
                raise Exception(
                    f"Impossible to parse tokens because of incorrect source code \"{s[0:30]}\" ...")
            except StopIteration:
                raise Exception("End of iterator before ENDMARKER token.")
            if toktype == tokenize.ENCODING or toktype == tokenize.NL:
                continue
            elif toktype == tokenize.NEWLINE:
                if removed_docstr == 1:
                    removed_docstr = 0
                    continue
                tokens.append('NEW_LINE')
            elif toktype == tokenize.COMMENT:
                if keep_comments:
                    com = process_string(
                        tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, True)
                    if len(com) > 0:
                        tokens.append(com)
                else:
                    continue
            elif toktype == tokenize.STRING:
                if tok == line.strip():  # docstring
                    if not keep_comments:
                        removed_docstr = 1
                        continue
                    else:
                        coms = process_string(
                            tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, True)
                        if len(coms) > 0:
                            tokens.append(coms)
                        else:
                            removed_docstr = 1
                else:
                    tokens.append(process_string(
                        tok, PYTHON_CHAR2TOKEN, PYTHON_TOKEN2CHAR, False))
            elif toktype == tokenize.INDENT:
                tokens.append('INDENT')
            elif toktype == tokenize.DEDENT:
                # empty block
                if tokens[-1] == 'INDENT':
                    tokens = tokens[:-1]
                else:
                    tokens.append('DEDENT')
            elif toktype == tokenize.ENDMARKER:
                tokens.append('ENDMARKER')
                break
            else:
                tokens.append(tok)

        assert (tokens[-1] == 'ENDMARKER'), "Error, no end marker"
        return tokens[:-1]
    except KeyboardInterrupt:
        raise
    except:
        # fall back to an empty token list on any tokenization failure
        return []
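
# Minimal usage sketch (an illustration, not part of the original module).
# With keep_comments=False and input containing no string or comment
# literals, process_string is never invoked, so this runs with only the
# imports and the function above.
if __name__ == '__main__':
    src = "def add(a, b):\n    return a + b\n"
    print(tokenize_python(src))
    # ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'NEW_LINE', 'INDENT',
    #  'return', 'a', '+', 'b', 'NEW_LINE', 'DEDENT']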