| import gradio as gr | |
| from pdf2image import convert_from_path | |
| import pdfplumber | |
| from docx import Document | |
| import subprocess | |
| import os | |
def convert_pdf_to_image(file):
    """Render a PDF into images.

    Args:
        file: The uploaded PDF, passed unchanged to
            ``pdf2image.convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns for ``file``.
    """
    return convert_from_path(file)
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: The uploaded PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages in page order, each page followed by a
        newline. A page that yields no text contributes an empty line.
    """
    with pdfplumber.open(file) as pdf:
        # NOTE(review): some pdfplumber releases return None instead of ""
        # for a page without extractable text (e.g. a scanned page), which
        # would make the "+" below raise TypeError; fall back to "".
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )
def extract_text_from_docx(file):
    """Extract the paragraph text of a ``.docx`` document.

    Args:
        file: Uploaded file object; the document is opened from its
            ``name`` attribute.

    Returns:
        The text of every paragraph in document order, each followed by a
        newline.
    """
    document = Document(file.name)
    return "".join(para.text + "\n" for para in document.paragraphs)
def convert_doc_to_text(doc_path):
    """Convert a legacy ``.doc`` file to plain text by running ``unoconv``.

    ``unoconv --format txt`` is invoked on ``doc_path``. The text file it is
    expected to leave next to the input (same base name, ``.txt`` extension)
    is then read, deleted, and its contents returned.

    Args:
        doc_path: Path of the document to convert.

    Returns:
        The converted text with any leading byte-order mark removed, or
        ``""`` when ``unoconv`` exits with a non-zero status.
    """
    try:
        subprocess.run(
            ["unoconv", "--format", "txt", doc_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting {doc_path} to text: {e}")
        return ""

    # Swap only the final extension. str.replace(".doc", ".txt") would also
    # rewrite ".doc" inside directory names, and would leave the path
    # unchanged when the extension is spelled differently (e.g. ".DOC"),
    # causing the source document itself to be read and then deleted.
    txt_file_path = os.path.splitext(doc_path)[0] + ".txt"
    try:
        # The BOM stripping below presumes UTF-8 output, so decode it
        # explicitly instead of depending on the locale's default encoding.
        with open(txt_file_path, "r", encoding="utf-8") as f:
            text = f.read()
    finally:
        # Remove the intermediate file even when reading it failed.
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
    return text.lstrip("\ufeff")
def extract_text_from_doc_or_docx(file):
    """Extract text from an uploaded Word document, chosen by extension.

    Args:
        file: Uploaded file object whose ``name`` attribute holds its path.

    Returns:
        The extracted text for names ending in ``.docx`` or ``.doc``;
        otherwise a message saying the file type is unsupported.
    """
    filename = file.name
    if filename.endswith(".docx"):
        return extract_text_from_docx(file)
    if filename.endswith(".doc"):
        return convert_doc_to_text(filename)
    return "Unsupported file type. Please upload a .doc or .docx file."
# One single-purpose interface per conversion function.
pdf_to_img = gr.Interface(
    fn=convert_pdf_to_image,
    inputs=gr.File(),
    outputs=gr.Gallery(),
    api_name="pdf_to_img",
)

pdf_to_text = gr.Interface(
    fn=extract_text_from_pdf,
    inputs=gr.File(),
    outputs=gr.Textbox(placeholder="Extracted text will appear here"),
    api_name="pdf_to_text",
)

doc_or_docx_to_text = gr.Interface(
    fn=extract_text_from_doc_or_docx,
    inputs=gr.File(),
    outputs=gr.Textbox(
        placeholder="Extracted text from DOC or DOCX will appear here"
    ),
    api_name="doc_or_docx_to_text",
)

# Combine the three interfaces into one app, one tab each; the tab titles
# are listed in the same order as the interfaces.
demo = gr.TabbedInterface(
    interface_list=[pdf_to_img, pdf_to_text, doc_or_docx_to_text],
    tab_names=["PDF to Image", "Extract PDF Text", "Extract DOC/DOCX Text"],
)

# Launched at module level, exactly as before.
demo.launch(debug=True)