Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

App Files Files Community

KingNish committed on Sep 19, 2024

Commit

a74d94b

verified ·

1 Parent(s): 3e87e84

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -94

app.py CHANGED Viewed

@@ -10,16 +10,13 @@ import xml.etree.ElementTree as ET
 # Constants
 CHUNK_SIZE = 32000
-SYSTEM_PROMPT = """
-You are a helpful and informative assistant that can answer questions based on the content of documents.
-You will receive the content of a document and a question about it.
-Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-"""
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 def xml2text(xml):
     """Extracts text from XML data."""
     text = u''
@@ -28,37 +25,54 @@ def xml2text(xml):
         text += child.text + " " if child.text is not None else ''
     return text
-def extract_text_from_docx(docx_data, strip_content):
-    """Extracts text from a DOCX file."""
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
     filelist = zipf.namelist()
     for fname in filelist:
-        if re.match('word/header[0-9]*.xml', fname):
-            text += xml2text(zipf.read(fname))
-        elif re.match('word/footer[0-9]*.xml', fname):
             text += xml2text(zipf.read(fname))
-    text += xml2text(zipf.read('word/document.xml'))
-    zipf.close()
-    if strip_content:
-        text = strip_text(text)
-    return f"{text}\n\n**Document Length:** {len(text)} characters"
-def strip_text(text):
-    """Strips unnecessary characters from text."""
-    content = text.replace('\n', ' ')
-    content = content.replace('\r', ' ')
-    content = content.replace('\t', ' ')
-    content = content.replace('  ', '')
-    return content.strip()
-def read_document(file, strip_content):
-    """Reads the content of a document based on its file type."""
     file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
@@ -71,11 +85,11 @@ def read_document(file, strip_content):
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading PDF: {e}"
     elif file_extension == 'xlsx':
         try:
@@ -84,13 +98,13 @@ def read_document(file, strip_content):
             for sheet in wb.worksheets:
                 for row in sheet.rows:
                     for cell in row:
-                        if cell.value is not None:
                             content += str(cell.value) + ' '
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading XLSX: {e}"
     elif file_extension == 'pptx':
         try:
@@ -100,74 +114,90 @@ def read_document(file, strip_content):
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         content += shape.text + ' '
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading PPTX: {e}"
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
-            return extract_text_from_docx(file_content, strip_content)
         except Exception as e:
-            return f"Error reading DOC/DOCX: {e}"
     else:
         try:
-            content = file_content.decode('utf-8')
-            if strip_content:
-                content = strip_text(content)
-            return content
         except Exception as e:
-            return f"Error reading file: {e}"
-def split_content(content):
-    """Splits content into chunks for processing."""
-    chunks = []
-    for i in range(0, len(content), CHUNK_SIZE):
-        chunks.append(content[i:i + CHUNK_SIZE])
-    return chunks
-def chat_document(file, question, strip_content):
-    """Handles chat with a document using Mistral."""
-    content = str(read_document(file, strip_content))
-    if len(content) > CHUNK_SIZE:
-        content = content[:CHUNK_SIZE]
-    message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
     Document Content: {content}
     Question: {question}
     Answer:"""
-    stream = client.text_generation(message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        if not response.token.text == "</s>":
-            output += response.token.text
-        yield output
-def chat_document_v2(file, question, strip_content):
-    """Handles chat with a document using Mistral and chunk-based approach."""
-    content = str(read_document(file, strip_content))
     chunks = split_content(content)
     all_answers = []
     for chunk in chunks:
-        message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
-        Document Content: {chunk[:CHUNK_SIZE]}
         Question: {question}
         Answer:"""
-        stream = client.text_generation(message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-        output = ""
-        for response in stream:
-            if not response.token.text == "</s>":
-                output += response.token.text
-        all_answers.append(output)
     # Summarize all answers using Mistral
     summary_prompt = """
@@ -177,45 +207,56 @@ def chat_document_v2(file, question, strip_content):
     Answers:
     """
     all_answers_str = "\n".join(all_answers)
-    print(all_answers_str)
     summary_message = f"""[INST] [SYSTEM] {summary_prompt}
-    {all_answers_str[:30000]}
     Summary:"""
-    stream = client.text_generation(summary_message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
-    output = ""
-    for response in stream:
-        if not response.token.text == "</s>":
-            output += response.token.text
-        yield output
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
-                inputs=[gr.File(label="Upload a Document"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Textbox(label="Document Content"),
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
             )
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
             )
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
-                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
-demo.launch()

 # Constants
 CHUNK_SIZE = 32000
+MAX_NEW_TOKENS = 4096
 # Initialize the Mistral chat model
 client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
+# --- Utility Functions ---
 def xml2text(xml):
     """Extracts text from XML data."""
     text = u''
         text += child.text + " " if child.text is not None else ''
     return text
+def clean_text(content, clean=True):
+    """Cleans text content based on the 'clean' parameter.
+
+    Args:
+        content: The text to normalize.
+        clean: When true (the default), newlines, carriage returns and tabs
+            are turned into spaces, runs of spaces are collapsed to a single
+            space, and surrounding whitespace is stripped. When false, the
+            text is returned unchanged.
+
+    Returns:
+        The normalized (or untouched) text.
+    """
+    # 'clean' used to be read without being defined anywhere, which raised
+    # NameError on every call; it is now an explicit, defaulted parameter so
+    # existing one-argument calls keep working.
+    if not clean:
+        return content
+    for whitespace in ('\n', '\r', '\t'):
+        content = content.replace(whitespace, ' ')
+    # A single replace('  ', ' ') pass leaves double spaces behind whenever a
+    # run has three or more spaces; splitting on ' ' and dropping the empty
+    # pieces collapses runs of any length.
+    content = ' '.join(piece for piece in content.split(' ') if piece)
+    return content.strip()
+def split_content(content, chunk_size=CHUNK_SIZE):
+    """Splits content into chunks of a specified size."""
+    chunks = []
+    for i in range(0, len(content), chunk_size):
+        chunks.append(content[i:i + chunk_size])
+    return chunks
+# --- Document Reading Functions ---
+def extract_text_from_docx(docx_data, clean=True):
+    """Extracts text from DOCX files."""
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
     filelist = zipf.namelist()
+    header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
+        if re.match(header_xmls, fname):
             text += xml2text(zipf.read(fname))
+    doc_xml = 'word/document.xml'
+    text += xml2text(zipf.read(doc_xml))
+    footer_xmls = 'word/footer[0-9]*.xml'
+    for fname in filelist:
+        if re.match(footer_xmls, fname):
+            text += xml2text(zipf.read(fname))
+    zipf.close()
+    if clean:
+        text = clean_text(text)
+    return text, len(text)
+def read_document(file, clean=True):
+    """Reads content from various document formats."""
     file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading PDF: {e}", 0
     elif file_extension == 'xlsx':
         try:
             for sheet in wb.worksheets:
                 for row in sheet.rows:
                     for cell in row:
+                        if cell.value is not None:
                             content += str(cell.value) + ' '
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading XLSX: {e}", 0
     elif file_extension == 'pptx':
         try:
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         content += shape.text + ' '
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading PPTX: {e}", 0
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
+            return extract_text_from_docx(file_content, clean)
         except Exception as e:
+            return f"Error reading DOC/DOCX: {e}", 0
     else:
         try:
+            content = file_content.decode('utf-8')
+            if clean:
+                content = clean_text(content)
+            return content, len(content)
         except Exception as e:
+            return f"Error reading file: {e}", 0
+# --- Chat Functions ---
+def generate_mistral_response(message):
+    """Generates a response from the Mistral API."""
+    stream = client.text_generation(
+        message,
+        max_new_tokens=MAX_NEW_TOKENS,
+        stream=True,
+        details=True,
+        return_full_text=False
+    )
+    output = ""
+    for response in stream:
+        if not response.token.text == "</s>":
+            output += response.token.text
+        yield output
+def chat_document(file, question, clean=True):
+    """Chats with a document using a single Mistral API call."""
+    content, length = read_document(file, clean)
+    if length > CHUNK_SIZE:
+        content = content[:CHUNK_SIZE]  # Limit to max chunk size
+    system_prompt = """
+    You are a helpful and informative assistant that can answer questions based on the content of documents.
+    You will receive the content of a document and a question about it.
+    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+    """
+    message = f"""[INST] [SYSTEM] {system_prompt}
     Document Content: {content}
     Question: {question}
     Answer:"""
+    yield from generate_mistral_response(message)
+def chat_document_v2(file, question, clean=True):
+    """Chats with a document using chunk-based Mistral API calls and summarizes the answers."""
+    content, length = read_document(file, clean)
     chunks = split_content(content)
+    system_prompt = """
+    You are a helpful and informative assistant that can answer questions based on the content of documents.
+    You will receive the content of a document and a question about it.
+    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+    """
     all_answers = []
     for chunk in chunks:
+        message = f"""[INST] [SYSTEM] {system_prompt}
+        Document Content: {chunk[:CHUNK_SIZE]}
         Question: {question}
         Answer:"""
+        response = ""
+        for stream_response in generate_mistral_response(message):
+            response = stream_response  # Update with latest response
+        all_answers.append(response)
     # Summarize all answers using Mistral
     summary_prompt = """
     Answers:
     """
     all_answers_str = "\n".join(all_answers)
     summary_message = f"""[INST] [SYSTEM] {summary_prompt}
+    {all_answers_str[:30000]}
     Summary:"""
+    yield from generate_mistral_response(summary_message)
+# --- Gradio Interface ---
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Checkbox(label="Clean Text", value=True),
+                ],
+                outputs=[
+                    gr.Textbox(label="Document Content"),
+                    gr.Number(label="Document Length (characters)"),
+                ],
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
             )
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Textbox(label="Question"),
+                    gr.Checkbox(label="Clean and Compress Text", value=True),
+                ],
+                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
             )
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
+                inputs=[
+                    gr.File(label="Upload a Document"),
+                    gr.Textbox(label="Question"),
+                    gr.Checkbox(label="Clean Text", value=True),
+                ],
+                outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
+demo.launch()