def xlsx_processor()

in components/processing/libs/processor-xlsx/src/processors/xlsx/xlsx_processor.py [0:0]


def xlsx_processor(source: GCSPath, output_dir: GCSPath) -> Dict:
    """Write each sheet of a spreadsheet out as a Markdown table file.

    The workbook at ``source`` is loaded through pyexcel, with the file type
    forced to the source's suffix minus its first character. For every sheet
    a ``<sheet name>.txt`` file is written under ``output_dir`` containing a
    level-1 header with the sheet name followed by a left-aligned table.

    Args:
        source: Location of the spreadsheet to read.
        output_dir: Location under which the per-sheet files are written.

    Returns:
        An empty dict.
    """
    logging.info(f"Extracting spreadsheet {str(source)}")

    # The workbook is loaded and fully processed while the source stays open.
    with source.read_as_file() as workbook_file:
        workbook = pyexcel.get_book(
            file_name=workbook_file,
            force_file_type=source.suffix[1:],
        )

        for sheet_name in workbook.sheet_names():
            worksheet = workbook.sheet_by_name(sheet_name)

            # Row 0 is assumed to hold the column headers.
            worksheet.name_columns_by_row(0)

            # One Markdown document per sheet.
            with (
                GCSPath(output_dir, sheet_name + ".txt").write_as_file() as out_file,
                MarkdownGenerator(filename=out_file) as markdown,
            ):
                markdown.addHeader(1, sheet_name)

                # Discard the first row of the array form (treated as the
                # header row; the table headers come from ``colnames``
                # instead) and cleanse every remaining cell.
                remaining_rows = iter(worksheet.to_array())
                next(remaining_rows, None)
                table_rows = [
                    [cleanse_string(cell) for cell in record]
                    for record in remaining_rows
                ]

                markdown.addTable(
                    header_names=worksheet.colnames,
                    alignment="left",
                    row_elements=table_rows,
                )

    return {}