def generate_paragraph_elements()

in microservices/course_ingestion/services/parsers/custom/custom_pdf_paragraph_parser.py [0:0]


  def generate_paragraph_elements(self, elements, size_tag):
    """Build tagged paragraph strings from parsed PDF text blocks.

    Walks every block -> line -> span of ``elements``. Each span is first
    classified with ``self.check_if_header`` and the result is stored under
    the ``"is_header"`` key of the span, of its line and of its block (the
    line and the block therefore end up with the value of the last span
    visited). Header spans contribute no text.

    Text of the remaining spans (via ``self.get_text_from_span``) is
    accumulated into paragraph strings. A new paragraph is started when:
      * the span size differs from the previous non-header span,
      * the current line starts further right than the previous line, or
      * a new block begins.
    Every paragraph starts with ``size_tag[span["size"]]`` and ``"##"`` is
    appended at the end of every line.

    Args:
      elements: indexable sequence of block dicts. Each block provides
        ``"page"`` and ``"lines"``; each line provides ``"spans"``; each
        span provides ``"origin"`` and ``"size"``. Presumably PyMuPDF
        "dict" text output -- confirm against the caller. The dicts are
        annotated in place with ``"is_header"``.
      size_tag: mapping from span size to the tag string to prepend.

    Returns:
      list of paragraph strings with pre-pended element tags. One entry is
      always emitted per block, so entries may be empty or consist only of
      ``"##"`` markers when a block holds no non-header text.
    """
    raw_paragraphs = []
    prev_span = {}
    previous_origin = None
    # Bound up front: a line without spans never assigns it in the loop, and
    # reading it after the loop used to raise UnboundLocalError.
    line_origin = None
    for block_idx, block in enumerate(elements):
      page = block["page"]
      block_string = ""
      for line_idx, line in enumerate(block["lines"]):
        for span_idx, line_span in enumerate(line["spans"]):
          if span_idx == 0:
            # x coordinate of the first span marks where the line starts
            line_origin = round(line_span["origin"][0], 2)
          is_header = self.check_if_header(
              elements, block_idx, line_idx, span_idx, size_tag)
          # adding info to blocks:
          block["is_header"] = is_header
          line["is_header"] = is_header
          line_span["is_header"] = is_header
          if is_header:
            continue

          span_text = self.get_text_from_span(page, line_span)
          size = line_span["size"]
          # True when nothing but "##" line markers (or nothing at all) has
          # been collected so far, i.e. the paragraph has no text yet.
          only_markers = not block_string.strip("#")
          if not prev_span:
            # very first non-header span of the whole document
            if only_markers:
              block_string = size_tag[size] + span_text
            else:
              block_string += span_text
          elif size != prev_span["size"]:
            # size changed: close the running paragraph, open a new one
            raw_paragraphs.append(block_string)
            block_string = size_tag[size] + span_text
          elif only_markers:
            # new block (or header-only lines so far): start with size tag
            block_string = size_tag[size] + span_text
          elif previous_origin is not None and previous_origin < line_origin:
            # line is indented relative to the previous one: new paragraph.
            # Compared against None explicitly so an origin of 0.0 counts.
            raw_paragraphs.append(block_string)
            block_string = size_tag[size] + span_text
            previous_origin = line_origin
          else:
            # in the same paragraph, so concatenate strings
            block_string += " " + span_text
          prev_span = line_span
        previous_origin = line_origin
        block_string += "##"
      raw_paragraphs.append(block_string)
    return raw_paragraphs