in preprocessing/src/javalang_tokenizer.py [0:0]
    def tokenize(self, keep_comments=False):
        self.reset()

        # Convert unicode escapes
        self.pre_tokenize()
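        # (Per JLS §3.3, \uXXXX escapes are decoded before lexing; e.g. the
        # raw characters "\u0041" reach the tokenizer below as the
        # identifier "A".)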
        while self.i < self.length:
            token_type = None

            c = self.data[self.i]
            c_next = None
            startswith = c

            if self.i + 1 < self.length:
                c_next = self.data[self.i + 1]
                startswith = c + c_next

            if c.isspace():
                self.consume_whitespace()
                continue

            elif startswith in ("//", "/*"):
                comment = self.read_comment()
                if comment.startswith("/**"):
                    self.javadoc = comment
                if keep_comments:
                    # Comment tokens are yielded eagerly, without position info
                    token_type = Comment
                    token = token_type(comment)
                    yield token
                continue

            elif startswith == '..' and self.try_operator():
                # Ensure we don't mistake a '...' operator for a sequence of
                # three '.' separators. This is done as an optimization instead
                # of moving try_operator higher in the chain because operators
                # aren't as common and try_operator is expensive.
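                # For example, the varargs declaration "void f(String... args)"
                # must lex '...' as a single Operator token, not three
                # Separator tokens.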
                token_type = Operator

            elif c == '@':
                token_type = Annotation
                self.j = self.i + 1

            elif c == '.' and c_next and c_next.isdigit():
                token_type = self.read_decimal_float_or_integer()

            elif self.try_separator():
                token_type = Separator

            elif c in ("'", '"'):
                token_type = String
                self.read_string()

            elif c in '0123456789':
                token_type = self.read_integer_or_float(c, c_next)

            elif self.is_java_identifier_start(c):
                token_type = self.read_identifier()

            elif self.try_operator():
                token_type = Operator

            else:
                # Unrecognized character: report it, skip it, and keep lexing
                self.error('Could not process token', c)
                self.i = self.i + 1
                continue

            position = Position(self.current_line, self.i - self.start_of_line)
            token = token_type(
                self.data[self.i:self.j], position, self.javadoc)
            yield token

            if self.javadoc:
                self.javadoc = None

            self.i = self.j
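
    # Usage sketch (not part of this file): assuming this method lives on a
    # javalang-style Tokenizer class constructed from the raw source string,
    # iterating the generator yields typed tokens in source order, e.g.:
    #
    #     for tok in Tokenizer('int x = 42; // init').tokenize(keep_comments=True):
    #         print(type(tok).__name__, repr(tok.value))
    #
    # which would emit the BasicType, Identifier, Operator, DecimalInteger,
    # and Separator tokens, followed by the trailing Comment.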