in microservices/course_ingestion/services/parsers/custom/custom_pdf_paragraph_parser.py [0:0]
def generate_paragraph_elements(self, elements, size_tag):
    """Build tagged paragraph strings from the text spans of parsed PDF blocks.

    Every span of every line of every block is visited in order. Each span
    is classified with ``self.check_if_header``; the result is written in
    place under the ``"is_header"`` key of the span, of its line and of its
    block (so a line/block ends up holding the flag of its last span).
    Header spans contribute no text. The text of the remaining spans is
    obtained from ``self.get_text_from_span`` and merged into paragraph
    strings, each starting with the tag that ``size_tag`` maps the span
    size to.

    Args:
        elements: indexable sequence of block dicts. Each block provides
            ``"page"`` and ``"lines"``; each line provides ``"spans"``;
            each span provides ``"size"`` and ``"origin"`` (only
            ``origin[0]`` is read). Mutated in place as described above.
        size_tag: mapping from a span ``"size"`` value to the tag string
            prepended when a new paragraph string is started.

    Returns:
        list of paragraph strings with prepended element tags. ``"##"`` is
        appended to the running string at the end of every line, and the
        running string is emitted at the end of every block, whenever the
        span size changes, and whenever a line starts further right than
        the previously processed span's line.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been lost -- confirm that the prev_span/previous_origin
    # updates belong to the non-header branch, that "##" is added once per
    # line, and that the final append happens once per block.
    raw_paragraphs = []
    prev_span = {}
    previous_origin = None
    for b, block in enumerate(elements):
        page = block["page"]
        block_string = ""
        for l, line in enumerate(block["lines"]):
            for ls, line_span in enumerate(line["spans"]):
                if ls == 0:
                    # x coordinate of the first span of the line
                    line_origin = round(line_span["origin"][0], 2)
                is_header = self.check_if_header(elements, b, l, ls, size_tag)
                # Record the classification on the block, line and span.
                block["is_header"] = is_header
                line["is_header"] = is_header
                line_span["is_header"] = is_header
                if is_header:
                    continue

                span_text = self.get_text_from_span(page, line_span)
                # False while the running string is empty or holds nothing
                # but "#" line markers.
                has_text = any(c != "#" for c in block_string)

                if prev_span and line_span["size"] != prev_span["size"]:
                    # Size changed: emit what we have and start a new string.
                    raw_paragraphs.append(block_string)
                    block_string = size_tag[line_span["size"]] + span_text
                elif not has_text:
                    # Nothing but markers so far: (re)start with the size tag.
                    block_string = size_tag[line_span["size"]] + span_text
                elif not prev_span:
                    block_string += span_text
                elif previous_origin is not None and previous_origin < line_origin:
                    # Line starts further right than the previous one: treat
                    # it as a new paragraph. Compared against None explicitly
                    # so that an origin of 0.0 is still honoured.
                    raw_paragraphs.append(block_string)
                    block_string = size_tag[line_span["size"]] + span_text
                else:
                    # Same size, same paragraph: concatenate.
                    block_string += " " + span_text

                prev_span = line_span
                previous_origin = line_origin
            # End of line marker.
            block_string += "##"
        raw_paragraphs.append(block_string)
    return raw_paragraphs