in common_ml/src/common_ml/utils.py [0:0]
def generate_entities(doc, assessment_type=None):
"""Collect named-entity and adjective/noun-phrase candidates from a spaCy doc.

Named entities are taken from ``doc.ents``. Non-named entities are built by
walking the tokens and chaining an adjective or a noun with the nouns,
hyphens and possessive markers that follow it.

Args:
doc: Spacy object fitted on the input sentence. The code reads
``doc.ents``, ``doc.text`` and, per token, ``pos_``, ``text``,
``text_with_ws`` and ``idx``.
assessment_type: "external_distractors" - for CTF Type 0; in that mode
an adjective listed in ``quantitative_adjectives`` does not start a
candidate at all. None - for the other assessment types.

Returns:
A tuple ``(named_entities, non_named_entities)``. Both are lists of
dicts with "text" and "start_idx" keys; named entities also carry a
"label" key. The non-named list is passed through ``clean_candidates``
and ``remove_overlapping``, and both lists are passed through
``remove_bracket_entities`` (presumably dropping entities that sit
inside brackets - confirm against that helper).

NOTE(review): the previous docstring documented a ``discard_ner_types``
argument, but the signature has no such parameter.
"""
# Keep only the entities whose last token has a POS tag listed in
# valid_named_entity_pos (defined elsewhere in this module).
named_entities = []
for ent in doc.ents:
if ent[-1].pos_ in valid_named_entity_pos:
named_entities.append({
"text": ent.text,
"start_idx": ent.start_char,
"label": ent.label_
})
# Surface text of every entity in doc.ents (not filtered by POS); used in
# the NOUN branch below to detect a noun that is itself an entity.
doc_ents = [i.text for i in doc.ents]
non_named_entities = []
for ind, _ in enumerate(doc):
# Branch 1: a phrase led by an adjective.
if doc[ind].pos_ == "ADJ":
# For "external_distractors", quantitative adjectives are skipped.
if not (assessment_type == "external_distractors" and \
doc[ind].text in quantitative_adjectives):
curr = ind
# flag records whether at least one following token was appended.
flag = False
text = doc[curr].text_with_ws
# A quantitative adjective is left out of the candidate text; only the
# tokens appended after it are kept.
if doc[curr].text in quantitative_adjectives:
text = ""
# Keep extending while the next token is a NOUN or "-", or while the
# current (non-first) token is "-".
# NOTE(review): the second alternative is not guarded by
# curr < len(doc) - 1, so doc[curr + 1] below looks like it can go out of
# range when "-" is the last token - confirm.
while ((curr < len(doc) - 1) and
((doc[curr + 1].pos_ == "NOUN") or
(doc[curr + 1].text.strip() == "-"))) or (
(curr > 0) and (doc[curr].text.strip() == "-")):
flag = True
text += "" + str(doc[curr + 1].text_with_ws)
curr += 1
# When the current token is "'s" or "-" tagged PART/PUNCT, the token after
# it is appended as well.
# NOTE(review): doc[curr + 1] is not bounds-checked here either - confirm.
if doc[curr].pos_ in ["PART", "PUNCT"
] and doc[curr].text in ["'s", "-"]:
text += "" + str(doc[curr + 1].text_with_ws)
curr += 1
# A lone adjective (nothing appended) is not recorded.
# NOTE(review): start_idx is taken from the adjective token even when the
# adjective itself was dropped from text above - verify this is intended.
if flag:
non_named_entities.append({
"text": text.rstrip(),
"start_idx": doc[ind].idx
})
# Branch 2: a NOUN that starts a phrase, i.e. it is the first token or the
# previous token is not a NOUN, not an ADJ and not "'s" / "-".
elif (ind == 0 and doc[ind].pos_ == "NOUN") or (
doc[ind].pos_ == "NOUN" and doc[ind - 1].pos_ != "NOUN" and
doc[ind - 1].pos_ != "ADJ" and doc[ind - 1].text not in ["'s", "-"]):
curr = ind
text = doc[curr].text_with_ws
# Keep extending while the next token is a NOUN, or is "'s" / "-" tagged
# PART/PUNCT, or while the current token is "-".
# NOTE(review): the trailing `or doc[curr].text == "-"` is not guarded by
# the length check, so doc[curr + 1] looks like it can go out of range when
# "-" is the last token - confirm.
while (curr < len(doc) - 1 and
(doc[curr + 1].pos_ == "NOUN" or
(doc[curr + 1].pos_ in ["PART", "PUNCT"]) and
doc[curr + 1].text in ["'s", "-"])) or doc[curr].text == "-":
text += "" + str(doc[curr + 1].text_with_ws)
curr += 1
# The collected text is cleared when the token before curr is a NOUN whose
# text equals a named-entity text and the token at curr is a PART.
if doc[curr-1].pos_ == "NOUN" and doc[curr-1].text in doc_ents and\
doc[curr].pos_ == "PART":
text = ""
# NOTE(review): text may be "" at this point; presumably clean_candidates
# filters such entries out - verify against that helper.
non_named_entities.append({
"text": text.rstrip(),
"start_idx": doc[ind].idx
})
# Post-processing by helpers defined elsewhere; their exact behaviour is not
# visible from this function.
non_named_entities = clean_candidates(non_named_entities)
non_named_entities = remove_overlapping(non_named_entities)
non_named_entities = remove_bracket_entities(non_named_entities, doc.text)
named_entities = remove_bracket_entities(named_entities, doc.text)
return (named_entities, non_named_entities)