in utils/span_heuristic.py [0:0]
def _normalize_tokens(tokens: List[str], keep_empty_str=True) -> List[str]:
    """
    Return a normalized copy of every token in *tokens*.

    Each token is lowercased, stripped of any character found in
    EXCLUDED_PUNCTS, and has article words (matched by ARTICLES_RE)
    removed. A token may normalize all the way down to the empty string;
    when keep_empty_str is True such tokens are retained, so the output
    has exactly as many entries as the input. When False, empty results
    are dropped.
    """
    result: List[str] = []
    for raw in tokens:
        stripped = ''.join(c for c in raw.lower() if c not in EXCLUDED_PUNCTS)
        cleaned = re.sub(ARTICLES_RE, '', stripped)
        if keep_empty_str or cleaned:
            result.append(cleaned)
    return result