in distant_supervision/text_preprocessor.py [0:0]
def get_phrases(*, entities, noun_chunks):
    """
    Combine entities with the noun chunks that are not already covered.

    The result starts with a deep copy of ``entities`` and is followed, in
    their original order, by every noun chunk whose lowercased text is
    neither a lowercased entity string nor a member of ``STOPWORDS``.

    :param entities: list of pairs (ent_str, ent_category)
    :param noun_chunks: list of pairs; the first item of each pair is the
        chunk text
    :return: a new list; the ``entities`` argument itself is not modified
    """
    merged = copy.deepcopy(entities)

    # Lowercased texts that must not be added again as noun chunks.
    excluded = {text.lower() for text, _ in entities} | STOPWORDS

    for chunk in noun_chunks:
        # Unpacking doubles as a format check: anything that is not a pair
        # fails here.
        chunk_text, _ = chunk
        if chunk_text.lower() in excluded:
            continue
        merged.append(chunk)

    return merged