def generate_entities()

in common_ml/src/common_ml/utils.py [0:0]


def generate_entities(doc, assessment_type=None):
  """Function to get named and non-named entity candidates using spacy

  Named entities are read from `doc.ents`. Non-named entities are phrases
  assembled by scanning the tokens: either an adjective followed by nouns /
  hyphenated parts, or a run of nouns joined by "'s" or "-".

  Args:
    doc: Spacy object fitted on input sentence
    assessment_type: "external_distractors" - for CTF Type 0 (an adjective
                     listed in `quantitative_adjectives` never starts a
                     phrase)
                     OR "None" - For others (such an adjective may start a
                     phrase, but its own text is left out of it)
  Returns:
    (named_entities, non_named_entities):
    named_entities is a list of dictionaries with the keys "text",
    "start_idx" and "label"; non_named_entities is built from dictionaries
    with the keys "text" and "start_idx". Both lists are returned after
    removing the entities inside brackets; the non-named list is
    additionally passed through `clean_candidates` and `remove_overlapping`.
  """
  # Keep only the entities whose final token has an accepted part of speech.
  named_entities = [
      {
          "text": ent.text,
          "start_idx": ent.start_char,
          "label": ent.label_
      }
      for ent in doc.ents
      if ent[-1].pos_ in valid_named_entity_pos
  ]
  # Only used for membership tests, so a set is enough.
  doc_ents = {ent.text for ent in doc.ents}

  # Index of the final token; every `doc[curr + 1]` lookahead below is
  # guarded by `curr < last_idx` so a sentence ending in "-" or "'s" can no
  # longer index past the end of the doc.
  last_idx = len(doc) - 1
  joiner_pos = ("PART", "PUNCT")
  joiner_text = ("'s", "-")

  non_named_entities = []
  for ind, token in enumerate(doc):
    if token.pos_ == "ADJ":
      is_quantitative = token.text in quantitative_adjectives
      if assessment_type == "external_distractors" and is_quantitative:
        continue
      curr = ind
      extended = False
      # A quantitative adjective may anchor a phrase but is not part of it.
      text = "" if is_quantitative else token.text_with_ws
      # Extend while the next token is a noun or a hyphen, or while the
      # current token is a hyphen (so the word after the hyphen is taken).
      while curr < last_idx and (
          doc[curr + 1].pos_ == "NOUN" or
          doc[curr + 1].text.strip() == "-" or
          (curr > 0 and doc[curr].text.strip() == "-")):
        extended = True
        curr += 1
        text += doc[curr].text_with_ws
        # After a "'s" / "-" joiner, also take the token that follows it,
        # provided there is one.
        if (curr < last_idx and doc[curr].pos_ in joiner_pos and
            doc[curr].text in joiner_text):
          curr += 1
          text += doc[curr].text_with_ws
      # A lone adjective that was never extended is not a candidate.
      if extended:
        non_named_entities.append({
            "text": text.rstrip(),
            "start_idx": token.idx
        })
    elif token.pos_ == "NOUN" and (ind == 0 or (
        doc[ind - 1].pos_ not in ("NOUN", "ADJ") and
        doc[ind - 1].text not in joiner_text)):
      # This noun starts a phrase: the previous token (if any) is not a
      # noun, an adjective or a joiner, so no earlier phrase covers it.
      curr = ind
      text = token.text_with_ws
      while curr < last_idx and (
          doc[curr + 1].pos_ == "NOUN" or
          (doc[curr + 1].pos_ in joiner_pos and
           doc[curr + 1].text in joiner_text) or
          doc[curr].text == "-"):
        curr += 1
        text += doc[curr].text_with_ws
        # A PART token directly after a noun whose text equals a named
        # entity resets the phrase: everything collected so far, including
        # the PART token itself, is dropped.
        if (doc[curr - 1].pos_ == "NOUN" and
            doc[curr - 1].text in doc_ents and
            doc[curr].pos_ == "PART"):
          text = ""
      # NOTE(review): `text` can be empty here when the reset above was the
      # last step; presumably clean_candidates filters that out - confirm.
      non_named_entities.append({
          "text": text.rstrip(),
          "start_idx": token.idx
      })

  non_named_entities = clean_candidates(non_named_entities)
  non_named_entities = remove_overlapping(non_named_entities)
  non_named_entities = remove_bracket_entities(non_named_entities, doc.text)
  named_entities = remove_bracket_entities(named_entities, doc.text)
  return (named_entities, non_named_entities)