in services/file.py [0:0]
def extract_text_from_file(file: BufferedReader, mimetype: str) -> str:
    """Extract the textual content of an open binary file based on its MIME type.

    Supported MIME types and how each is rendered:

    * ``application/pdf`` -- text of every page, pages joined by one space.
    * ``text/plain`` / ``text/markdown`` -- the whole file decoded as UTF-8.
    * ``...wordprocessingml.document`` (docx) -- whatever ``docx2txt.process``
      returns for the file.
    * ``text/csv`` -- one output line per CSV row, cells joined by one space.
    * ``...presentationml.presentation`` (pptx) -- one output line per shape
      that has a text frame; every run in the shape is followed by a space.

    Args:
        file: Binary file object to read the document from.
        mimetype: MIME type selecting the extraction strategy.

    Returns:
        The extracted text.

    Raises:
        ValueError: If ``mimetype`` is not one of the supported types.
        UnicodeDecodeError: If a plain-text, markdown or CSV file is not
            valid UTF-8.
    """
    if mimetype == "application/pdf":
        # Extract text from pdf using PyPDF2
        reader = PdfReader(file)
        extracted_text = " ".join(page.extract_text() for page in reader.pages)
    elif mimetype in ("text/plain", "text/markdown"):
        # Read text from plain text file
        extracted_text = file.read().decode("utf-8")
    elif (
        mimetype
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    ):
        # Extract text from docx using docx2txt
        extracted_text = docx2txt.process(file)
    elif mimetype == "text/csv":
        # Extract text from csv using csv module. The file yields bytes, so
        # each line is decoded lazily before it reaches the csv parser.
        decoded_lines = (line.decode("utf-8") for line in file)
        # Build the result with join instead of repeated ``+=`` so the cost
        # stays linear in the size of the file.
        extracted_text = "".join(
            " ".join(row) + "\n" for row in csv.reader(decoded_lines)
        )
    elif (
        mimetype
        == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    ):
        # Extract text from pptx using python-pptx
        presentation = pptx.Presentation(file)
        shape_lines = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                if not shape.has_text_frame:
                    continue
                # Every run keeps a trailing space; each text-bearing shape
                # ends with a newline, even when it contains no runs.
                shape_text = "".join(
                    run.text + " "
                    for paragraph in shape.text_frame.paragraphs
                    for run in paragraph.runs
                )
                shape_lines.append(shape_text + "\n")
        extracted_text = "".join(shape_lines)
    else:
        # Unsupported file type
        raise ValueError("Unsupported file type: {}".format(mimetype))
    return extracted_text