Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

App Files Files Community

Doc-Reader-and-Chat / app.py

KingNish

Update app.py

e27d06b verified about 1 year ago

raw

history blame contribute delete

5.81 kB

	import PyPDF2
	from openpyxl import load_workbook
	from pptx import Presentation
	import gradio as gr
	import io
	import re
	import zipfile
	import xml.etree.ElementTree as ET
	import filetype

	# Constants
	CHUNK_SIZE = 32000

	# --- Utility Functions ---

	def xml2text(xml):
	    """Return the text of every element in an XML document.

	    The document is parsed with ElementTree and walked in document order.
	    Each element whose ``text`` is not ``None`` contributes that text
	    followed by one space; elements without text contribute nothing.
	    Only ``text`` is read, so tail text (content that follows an element's
	    closing tag) is not part of the result.
	    """
	    tree_root = ET.fromstring(xml)
	    pieces = [
	        node.text + " "
	        for node in tree_root.iter()
	        if node.text is not None
	    ]
	    return u''.join(pieces)

	def clean_text(content):
	    """Collapse every run of whitespace in ``content`` into a single space.

	    Newlines, carriage returns and tabs are whitespace as far as the
	    regular expression is concerned, so one substitution pass normalises
	    them together with repeated spaces. Leading and trailing whitespace is
	    collapsed to one space as well; it is not stripped.

	    Args:
	        content: The text to normalise.

	    Returns:
	        The text with each whitespace run replaced by one space.
	    """
	    # `\s+` already matches '\n', '\r' and '\t', so no separate
	    # replace() passes are needed before collapsing.
	    return re.sub(r'\s+', ' ', content)


	def split_content(content, chunk_size=CHUNK_SIZE):
	    """Cut ``content`` into consecutive slices of at most ``chunk_size`` items.

	    Every slice except possibly the last has exactly ``chunk_size`` items;
	    empty content yields an empty list.
	    """
	    return [
	        content[start:start + chunk_size]
	        for start in range(0, len(content), chunk_size)
	    ]

	# --- Document Reading Functions ---

	def extract_text_from_docx(docx_data, clean=True):
	    """Extract the text of a DOCX file supplied as raw bytes.

	    A DOCX file is a ZIP archive of XML parts. Text is gathered from the
	    header parts, then the main document body, then the footer parts, in
	    that order.

	    Args:
	        docx_data: The complete DOCX file contents as bytes.
	        clean: When true, the result is passed through ``clean_text``.

	    Returns:
	        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

	    Raises:
	        zipfile.BadZipFile: If ``docx_data`` is not a ZIP archive.
	        KeyError: If the archive has no ``word/document.xml`` member.
	    """
	    # Escaped dot + fullmatch: only exact part names such as
	    # 'word/header1.xml' qualify, not names that merely start like one.
	    header_re = re.compile(r'word/header[0-9]*\.xml')
	    footer_re = re.compile(r'word/footer[0-9]*\.xml')
	    doc_xml = 'word/document.xml'

	    # The context manager closes the archive even when a part is missing
	    # or its XML fails to parse.
	    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
	        filelist = zipf.namelist()

	        parts = [
	            xml2text(zipf.read(fname))
	            for fname in filelist
	            if header_re.fullmatch(fname)
	        ]
	        parts.append(xml2text(zipf.read(doc_xml)))
	        parts.extend(
	            xml2text(zipf.read(fname))
	            for fname in filelist
	            if footer_re.fullmatch(fname)
	        )

	    text = u''.join(parts)
	    if clean:
	        text = clean_text(text)
	    return text, len(text)

	def extract_text_from_pptx(pptx_data, clean=True):
	    """Extract the text of a PPTX file supplied as raw bytes.

	    A PPTX file is a ZIP archive of XML parts. Text is gathered from the
	    notes-slide parts first and then from the slide parts, each group in
	    the order the archive lists its members.

	    Args:
	        pptx_data: The complete PPTX file contents as bytes.
	        clean: When true, the result is passed through ``clean_text``.

	    Returns:
	        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

	    Raises:
	        zipfile.BadZipFile: If ``pptx_data`` is not a ZIP archive.
	    """
	    # Escaped dot + fullmatch: only exact part names such as
	    # 'ppt/slides/slide3.xml' qualify, not names that merely start like one.
	    notes_re = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
	    slide_re = re.compile(r'ppt/slides/slide[0-9]*\.xml')

	    # The context manager closes the archive even when a part's XML
	    # fails to parse.
	    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
	        filelist = zipf.namelist()

	        # Speaker notes
	        parts = [
	            xml2text(zipf.read(fname))
	            for fname in filelist
	            if notes_re.fullmatch(fname)
	        ]
	        # Slide content (shapes and text boxes)
	        parts.extend(
	            xml2text(zipf.read(fname))
	            for fname in filelist
	            if slide_re.fullmatch(fname)
	        )

	    text = u''.join(parts)
	    if clean:
	        text = clean_text(text)
	    return text, len(text)

	def read_document(file, clean=True):
	    """Read an uploaded document and return its text and character count.

	    The file type is guessed from the file's bytes with ``filetype.guess``.
	    PDF, XLSX, DOCX and PPTX files go to their dedicated readers; every
	    other file, including one whose type cannot be guessed, is decoded as
	    UTF-8 text.

	    Args:
	        file: The uploaded file: either an object whose ``name`` attribute
	            is the path on disk, or the path itself.
	        clean: When true, whitespace in the extracted text is normalised
	            with ``clean_text``.

	    Returns:
	        A ``(content, length)`` tuple where ``length`` is ``len(content)``.
	        When the document cannot be parsed, ``content`` is a message
	        starting with "Error reading" and ``length`` is 0.

	    Raises:
	        OSError: If the file at the given path cannot be opened.
	    """
	    if file is None:
	        # Submitting the form without an upload passes None here.
	        return "Error reading file: no file was provided", 0

	    # Accept both an upload object exposing `.name` and a bare path.
	    file_path = getattr(file, "name", file)

	    with open(file_path, "rb") as f:
	        file_content = f.read()

	    kind = filetype.guess(file_content)
	    mime = "text" if kind is None else kind.mime

	    if mime == "application/pdf":
	        try:
	            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
	            # `or ''` keeps a page that yields no text from aborting
	            # the whole document.
	            content = ''.join(
	                page.extract_text() or '' for page in pdf_reader.pages
	            )
	            if clean:
	                content = clean_text(content)
	            return content, len(content)
	        except Exception as e:
	            return f"Error reading PDF: {e}", 0

	    if mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
	        try:
	            wb = load_workbook(io.BytesIO(file_content))
	            # Every non-empty cell, sheet by sheet and row by row,
	            # each followed by one space.
	            content = ''.join(
	                str(cell.value) + ' '
	                for sheet in wb.worksheets
	                for row in sheet.rows
	                for cell in row
	                if cell.value is not None
	            )
	            if clean:
	                content = clean_text(content)
	            return content, len(content)
	        except Exception as e:
	            return f"Error reading XLSX: {e}", 0

	    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
	        try:
	            return extract_text_from_docx(file_content, clean)
	        except Exception as e:
	            return f"Error reading DOCX: {e}", 0

	    if mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
	        try:
	            return extract_text_from_pptx(file_content, clean)
	        except Exception as e:
	            return f"Error reading PPTX: {e}", 0

	    # Everything else is treated as UTF-8 text; only the wording of the
	    # error message depends on the guessed type.
	    text_labels = {"text/plain": "TXT file", "text/csv": "CSV file"}
	    label = text_labels.get(mime, "file")
	    try:
	        content = file_content.decode('utf-8')
	        if clean:
	            content = clean_text(content)
	        # Count the characters of the text itself, matching what the
	        # DOCX and PPTX readers report.
	        return content, len(content)
	    except Exception as e:
	        return f"Error reading {label}: {e}", 0


	# --- Gradio Interface ---

	# Web UI: one file upload plus a "clean" toggle in; extracted text and
	# its length out.
	iface = gr.Interface(
	    fn=read_document,
	    inputs=[
	        gr.File(label="Upload a Document"),
	        gr.Checkbox(label="Clean Text", value=True),
	    ],
	    outputs=[
	        gr.Textbox(label="Document Content"),
	        gr.Number(label="Document Length (characters)"),
	    ],
	    title="Better Document Reader for Hugging Face Chat Tools",
	    # Adjacent literals are concatenated as-is, so the first one ends
	    # with a space to keep the two sentences apart.
	    description=(
	        "Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content. "
	        "This tool is designed for use with Hugging Face Chat Tools: "
	        "[https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2](https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2)"
	    ),
	    concurrency_limit=None,
	)

	iface.launch()