def process_elements_pdf()

in microservices/course_ingestion/services/parsers/custom/custom_pdf_parser.py [0:0]


  def process_elements_pdf(self, elements):
    """Build the output JSON structures from tagged PDF elements.

    Elements are walked in order:

    * An ``<h2>`` element containing "Content" is treated as the table of
      contents. The elements from that heading up to and including the next
      ``<h2>`` are scanned: each ``<h7>`` entry becomes a competency and each
      ``<s1>`` entry containing "Chapter" becomes a sub-competency of the
      most recently seen competency. Entries without a ":" separator, and
      chapter entries seen before any competency, are skipped.
    * Any other ``<h2>`` element that contains a known sub-competency title
      starts a new structure obtained from ``self.get_structure()``.
    * ``<h3>``/``<h4>`` elements add a learning unit to the latest structure
      once a sub-competency heading has been seen.
    * ``<p>``/``<s3>`` elements are appended to the text of the latest
      learning unit; they are skipped while no learning unit exists.

    Args:
      elements: list of tagged strings obtained from the headers_para method.

    Returns:
      list which contains the output structures for json, one per
      sub-competency heading found, in document order.
    """

    output_json_list = []
    subcompetencies = []
    toc = []
    on_subcom_level = False
    word_incomp = False
    for i, element in enumerate(elements):
      element = element.replace("|", "")
      element = element.replace("   ", " ")
      element = element.replace("  ", " ")
      if "<h2>" in element:
        if "Content" in element:
          # The table of contents spans up to and including the next <h2>
          # (or to the end of the list when there is no further <h2>).
          count = 0
          for count_element in elements[i + 1:]:
            count += 1
            if "<h2>" in count_element:
              break
          for toc_element in elements[i:i + count + 1]:
            if "<h7>" in toc_element:
              parts = self.postprocess_element(toc_element).split(":")
              if len(parts) < 2:
                # No "label: title" separator, nothing usable on this line.
                continue
              toc.append({parts[1].strip(): []})
            elif "<s1>" in toc_element and "Chapter" in toc_element:
              parts = toc_element.split(":")
              if len(parts) < 2 or not toc:
                # Malformed chapter line, or no competency to attach it to.
                continue
              chapter = self.postprocess_element(parts[1].strip())
              # Each toc entry is a single-key dict: {competency: [chapters]}.
              next(iter(toc[-1].values())).append(chapter)

          # Rebuild from toc rather than flattening in place: a second
          # heading containing "Content" would otherwise split the already
          # flat titles into single characters.
          subcompetencies = [
              title for entry in toc for titles in entry.values()
              for title in titles
          ]

        elif any(x in element for x in subcompetencies):
          structure = self.get_structure()
          competency = self.find_competency(toc, element)
          structure["competency"] = competency
          structure["sub_competency"]["title"] = self.postprocess_element(
              element)
          structure["sub_competency"]["label"] = self.postprocess_element(
              element)
          on_subcom_level = True
          output_json_list.append(structure)
      elif (("<h3>" in element) or ("<h4>" in element)) and on_subcom_level:
        lu_dict = {"text": "", "title": ""}
        lu_dict["title"] = self.postprocess_element(element)
        output_json_list[-1]["sub_competency"]["learning_objectives"][
            "learning_units"].append(lu_dict)
      elif ("<p>" in element) or ("<s3>" in element):
        element = element.strip()
        try:
          unit = output_json_list[-1]["sub_competency"][
              "learning_objectives"]["learning_units"][-1]
          prev = unit["text"]
        except (IndexError, KeyError):
          # Paragraph before any sub-competency / learning unit: there is
          # nowhere to attach it, so it is dropped.
          continue
        if any(mark in prev[-5:] for mark in (".", "?", ":")):
          # Previous text ended a sentence: start a new line.
          append_to_text = "\n"
        elif word_incomp:
          # Previous fragment ended in a hyphenated word: join directly.
          append_to_text = ""
        else:
          append_to_text = " "
        # Must be computed on the raw element, before postprocessing.
        word_incomp = "-" in element[-5:]
        unit["text"] = (
            prev + append_to_text + self.postprocess_element(element))
    return output_json_list