in microservices/course_ingestion/services/parsers/custom/custom_pdf_parser.py [0:0]
def process_elements_pdf(self, elements):
    """Convert tagged PDF elements into the list of output JSON structures.

    The elements are scanned once, in order:

    * An ``<h2>`` element containing "Content" opens a table of contents
      that runs up to and including the next ``<h2>`` element. Inside it,
      every ``<h7>`` entry becomes a competency and every ``<s1>`` entry
      containing "Chapter" becomes a sub-competency of the most recent
      competency. The title kept is the part following the first ":"
      (up to the next ":", if any); entries without a ":" are skipped.
    * Any other ``<h2>`` element that contains a known sub-competency
      title starts a new output structure (built by ``get_structure``).
    * ``<h3>``/``<h4>`` elements seen after the first sub-competency
      start a new learning unit in the latest structure.
    * ``<p>``/``<s3>`` elements are appended to the text of the latest
      learning unit; they are dropped while no learning unit exists.

    Args:
        elements: elements obtained from headers_para method
            type: list
    Returns:
        list which contains the output structure for json
    """
    output_json_list = []
    subcompetencies = []
    toc = []
    on_subcom_level = False
    # True when the previous paragraph ended in a hyphen, i.e. its last word
    # continues in the next paragraph and must be joined without a space.
    word_incomp = False

    for index, element in enumerate(elements):
        element = element.replace("|", "")
        # Collapse doubled spaces; two passes shrink runs of up to four.
        # NOTE(review): the literal was restored as a double space because a
        # single-space replace is a no-op - confirm against the intended text.
        element = element.replace("  ", " ")
        element = element.replace("  ", " ")

        if "<h2>" in element:
            if "Content" in element:
                # The table of contents spans from this header up to and
                # including the next <h2> (or the end of the document).
                toc_end = len(elements)
                for position in range(index + 1, len(elements)):
                    if "<h2>" in elements[position]:
                        toc_end = position + 1
                        break

                for toc_element in elements[index:toc_end]:
                    if "<h7>" in toc_element:
                        parts = self.postprocess_element(toc_element).split(":")
                        if len(parts) < 2:
                            # No "label: title" separator - nothing to record.
                            continue
                        toc.append({parts[1].strip(): []})
                    elif "<s1>" in toc_element and "Chapter" in toc_element:
                        parts = toc_element.split(":")
                        if len(parts) < 2 or not toc:
                            # Malformed line, or a chapter listed before any
                            # competency it could belong to.
                            continue
                        title = self.postprocess_element(parts[1].strip())
                        if title:
                            # An empty title would match every later header.
                            next(iter(toc[-1].values())).append(title)

                # Rebuild from the whole table of contents so that a second
                # "Content" header cannot corrupt the titles collected so far.
                subcompetencies = [
                    title
                    for entry in toc
                    for titles in entry.values()
                    for title in titles
                ]
            elif any(title in element for title in subcompetencies):
                structure = self.get_structure()
                structure["competency"] = self.find_competency(toc, element)
                structure["sub_competency"]["title"] = self.postprocess_element(
                    element)
                structure["sub_competency"]["label"] = self.postprocess_element(
                    element)
                on_subcom_level = True
                output_json_list.append(structure)
        elif (("<h3>" in element) or ("<h4>" in element)) and on_subcom_level:
            lu_dict = {"text": "", "title": self.postprocess_element(element)}
            output_json_list[-1]["sub_competency"]["learning_objectives"][
                "learning_units"].append(lu_dict)
        elif ("<p>" in element) or ("<s3>" in element):
            element = element.strip()
            try:
                unit = output_json_list[-1]["sub_competency"][
                    "learning_objectives"]["learning_units"][-1]
                prev = unit["text"]
            except (IndexError, KeyError):
                # No learning unit has been opened yet, so there is nowhere
                # to attach this text; it is intentionally dropped.
                continue

            if any(mark in prev[-5:] for mark in (".", "?", ":")):
                # The previous text looks finished: start on a new line.
                append_to_text = "\n"
            elif word_incomp:
                append_to_text = ""
            else:
                append_to_text = " "
            # Checked on the raw element, before tags are post-processed.
            word_incomp = "-" in element[-5:]
            unit["text"] = prev + append_to_text + self.postprocess_element(
                element)

    return output_json_list