in components/processing/libs/processor-xlsx/src/processors/xlsx/xlsx_processor.py [0:0]
def xlsx_processor(source: GCSPath, output_dir: GCSPath) -> Dict:
    """Render every sheet of a spreadsheet as a Markdown table.

    The workbook at ``source`` is opened with pyexcel, forcing the file type
    to ``source.suffix`` with its first character dropped. For each sheet,
    a file is written at ``GCSPath(output_dir, <sheet name> + ".txt")``
    containing a level-1 header with the sheet name followed by one
    left-aligned table of the sheet's cleansed cell values.

    Args:
        source: Location of the spreadsheet to extract.
        output_dir: Location under which the per-sheet files are written.

    Returns:
        An empty dict.
    """
    logging.info(f"Extracting spreadsheet {str(source)}")

    with source.read_as_file() as local_file:
        workbook = pyexcel.get_book(
            file_name=local_file,
            force_file_type=source.suffix[1:],
        )

        for sheet_name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(sheet_name)
            # Treat row 0 as the column headers for this sheet.
            sheet.name_columns_by_row(0)

            target = GCSPath(output_dir, sheet_name + ".txt")
            with (
                target.write_as_file() as out_file,
                MarkdownGenerator(filename=out_file) as md,
            ):
                md.addHeader(1, sheet_name)

                # The first row of to_array() is dropped (presumably the
                # header row); the column names are passed separately below.
                body_rows = [
                    [cleanse_string(cell) for cell in row]
                    for row in sheet.to_array()[1:]
                ]

                md.addTable(
                    header_names=sheet.colnames,
                    alignment="left",
                    row_elements=body_rows,
                )

    return {}