in distant_supervision/text_preprocessor.py [0:0]
def compute_ner_and_noun_chunks(self, text):
    """Run spaCy over *text* and collect named entities and noun chunks.

    See https://spacy.io/usage/linguistic-features#noun-chunks

    Example results:
        ents: [('today', 'DATE'), ('Patrick', 'PERSON')]
        noun_chunks: e.g. [('Autonomous cars', 'nsubj'), ('insurance liability', 'dobj')]

    :return: (ents, noun_chunks) — each deduplicated and sorted.
    """
    # Skip over-long inputs entirely instead of parsing them.
    if len(text) > ULIM_CHAR_PER_SENTENCE:
        return [], []
    # SpacyMagic wraps model loading because spacy has memory leaks:
    # https://github.com/explosion/spaCy/issues/3618
    pipeline = SpacyMagic.load('my_english', 'en_core_web_sm', disable=[])
    parsed = pipeline(text)
    # Set comprehensions dedupe before sorting into deterministic order.
    unique_ents = sorted({(ent.text, ent.label_) for ent in parsed.ents})
    unique_chunks = sorted({(chunk.text, chunk.root.dep_) for chunk in parsed.noun_chunks})
    return unique_ents, unique_chunks