Optimize for Hugging Face Inference API with streaming support and RAG integration
- app.py +18 -17
- modules/analyzer.py +7 -1
- modules/rag/rag_chain.py +5 -4
app.py
CHANGED
@@ -34,11 +34,11 @@ def get_cat_greeting():
         "Please ask me anything, and I'll pounce on the best information for you!"
     )
 
-# Startup check function
+# Startup check function optimized for Hugging Face endpoint
 async def perform_startup_check():
-    """Perform startup checks to verify …"""
+    """Perform startup checks to verify Hugging Face endpoint status"""
     try:
-        # Check 1: Verify …
+        # Check 1: Verify Hugging Face endpoint is responding
         test_prompt = "Hello, this is a startup check. Please respond with 'OK' if you're operational."
 
         # Use a short timeout for the startup check
@@ -59,14 +59,14 @@ async def perform_startup_check():
         if full_response:
             return {
                 "status": "operational",
-                "message": "✅ …",
+                "message": "✅ Hugging Face endpoint is operational and ready to assist!",
                 "details": f"Received response: {full_response[:50]}..."
             }
         else:
             return {
                 "status": "warning",
-                "message": "⚠️ …",
-                "details": "…",
+                "message": "⚠️ Endpoint responded but with empty content. May need attention.",
+                "details": "Endpoint connection established but no content returned."
             }
 
     except Exception as e:
@@ -74,19 +74,19 @@ async def perform_startup_check():
         if "503" in error_msg:
             return {
                 "status": "initializing",
-                "message": "⏳ …",
-                "details": "The …",
+                "message": "⏳ Hugging Face endpoint is currently initializing (503 error detected)",
+                "details": "The model server is warming up. Please wait approximately 5 minutes before asking questions."
             }
         elif "timeout" in error_msg.lower():
             return {
                 "status": "timeout",
-                "message": "⏰ …",
-                "details": "Connection to the …",
+                "message": "⏰ Endpoint connection timed out",
+                "details": "Connection to the Hugging Face model timed out. This may indicate server initialization."
             }
         else:
             return {
                 "status": "error",
-                "message": "❌ …",
+                "message": "❌ Endpoint check failed",
                 "details": f"Error during startup check: {error_msg}"
             }
 
@@ -211,7 +211,7 @@ async def research_assistant(query, history, use_rag=False):
             wait_time = server_status["estimated_wait"]
             response = (
                 f"⏳ **Server Initializing** ⏳\n\n"
-                f"The …"
+                f"The Hugging Face model server is currently starting up. This happens automatically after periods of inactivity.\n\n"
                 f"**Estimated wait time: {wait_time} minutes**\n\n"
                 f"**What you can do:**\n"
                 f"- Wait for {wait_time} minutes and try again\n"
@@ -224,7 +224,7 @@ async def research_assistant(query, history, use_rag=False):
         return
 
     try:
-        history[-1] = (query, "🧠 Analyzing information...")
+        history[-1] = (query, "🧠 Analyzing information with Hugging Face model...")
         yield history
 
         stream = analyze_with_model(enriched_input)
@@ -397,8 +397,8 @@ with gr.Blocks(
     - 🌤️ Context-aware weather data (only when relevant)
     - 🌌 Context-aware space weather data (only when relevant)
     - 📚 RAG (Retrieval-Augmented Generation) with document database
+    - ⚡ Real-time streaming from Hugging Face endpoint
     - 📖 Real-time citations
-    - ⚡ Streaming output
     """)
 
     with gr.Column(scale=2):
@@ -469,7 +469,7 @@ with gr.Blocks(
     if result["status"] == "operational":
         cat_greeting = get_cat_greeting()
         status_md = f"""
-        ✅ **…**
+        ✅ **Hugging Face endpoint is operational and ready to assist!**
 
         🐾 **Cat Greeting:**
         *{cat_greeting}*
@@ -478,7 +478,7 @@ with gr.Blocks(
         """
     elif result["status"] == "initializing":
         status_md = f"""
-        ⏳ **…**
+        ⏳ **Hugging Face endpoint is currently initializing (503 error detected)**
 
         ⏳ **Estimated wait time:** 5 minutes
 
@@ -488,7 +488,7 @@ While you wait, why not prepare some treats? I'll be ready to hunt for knowledge...
         status_md = "🔄 Performing startup checks..."
     else:
         status_md = f"""
-        ❌ **…**
+        ❌ **Endpoint check failed**
 
         📝 **Details:** {result["details"]}
         """
@@ -543,6 +543,7 @@ if __name__ == "__main__":
     # Print public link information to logs
     print("===== Application Starting =====")
     print("Creating public link for Hugging Face Space...")
+    print("Using Hugging Face Inference API endpoint for optimal performance")
     print("Once the app launches, a public link will be available")
     print("================================")
 
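For context on how these streaming edits reach the UI: `research_assistant` treats `analyze_with_model` as a generator and re-yields the chat history as chunks arrive. A minimal sketch of that loop, assuming the tuple-style history shown in the hunks above (names besides `analyze_with_model` are illustrative):

    # Minimal sketch of the streaming loop implied by the diff above; assumes
    # tuple-style Gradio chat history. Only analyze_with_model comes from the repo.
    async def stream_answer(query, history, enriched_input):
        history[-1] = (query, "🧠 Analyzing information with Hugging Face model...")
        yield history

        partial = ""
        for token in analyze_with_model(enriched_input):
            partial += token
            history[-1] = (query, partial)  # overwrite the pending turn in place
            yield history                   # each yield triggers a UI re-render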
modules/analyzer.py
CHANGED
@@ -2,6 +2,7 @@ from openai import OpenAI
 import os
 import time
 
+# Use your existing Hugging Face endpoint
 client = OpenAI(
     base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
     api_key=os.getenv("HF_TOKEN")
@@ -10,15 +11,17 @@ client = OpenAI(
 def analyze_with_model(prompt):
     """Analyze prompt with LLM, returning a generator for streaming"""
     try:
+        # Use the Hugging Face Inference API with proper streaming
         response = client.chat.completions.create(
             model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
             messages=[{"role": "user", "content": prompt}],
-            stream=True,  # Enable streaming
+            stream=True,  # Enable streaming for real-time responses
             temperature=0.7,
             max_tokens=8192,  # Increased token limit
             timeout=120  # Increased timeout for longer responses
         )
 
+        # Stream the response chunks
         for chunk in response:
             content = chunk.choices[0].delta.content
             if content:
@@ -27,11 +30,14 @@ def analyze_with_model(prompt):
 
     except Exception as e:
         error_msg = str(e)
+        # Enhanced error detection for common Hugging Face issues
         if "503" in error_msg:
             yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "timeout" in error_msg.lower():
             yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "connection" in error_msg.lower():
             yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
+        elif "limit" in error_msg.lower():
+            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
         else:
             yield f"Error during analysis: {error_msg}"
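Because every branch above advises waiting and retrying while the endpoint warms up, a caller-side retry helper pairs naturally with this generator. A sketch under the assumption that a retryable failure surfaces as a first chunk beginning with "Error during analysis:"; the wrapper itself is hypothetical, not part of this commit:

    import time

    # Hypothetical helper: retry analyze_with_model when its first yielded chunk
    # is one of the error strings above, sleeping per the "wait 5 minutes" advice.
    def analyze_with_retry(prompt, attempts=3, wait_seconds=300):
        for attempt in range(attempts):
            chunks = analyze_with_model(prompt)
            first = next(chunks, "")
            retryable = first.startswith("Error during analysis:")
            if retryable and attempt < attempts - 1:
                time.sleep(wait_seconds)
                continue
            yield first
            yield from chunks
            return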
modules/rag/rag_chain.py
CHANGED
@@ -13,11 +13,12 @@ class RAGChain:
             search_kwargs={"k": 5}
         )
 
-        # Custom prompt template
+        # Custom prompt template optimized for your model
         self.prompt_template = """
-        You are an …
-        Use the following …
-        If …
+        You are an expert research assistant with access to relevant documents.
+        Use the following context to answer the question accurately.
+        If the context doesn't contain enough information, say so.
+        Always cite specific parts of the context in your response.
 
         Context: {context}
 
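The hunk ends at `Context: {context}`, so the remainder of the template is not shown in this commit. Purely as an illustration, this is how such a template is typically filled before being passed to the model; the trailing `{question}` placeholder and the sample data are assumptions, not part of the diff:

    # Illustrative only: the real template continues past "Context: {context}";
    # a {question} placeholder is assumed here for the sake of the example.
    prompt_template = """
    You are an expert research assistant with access to relevant documents.
    Use the following context to answer the question accurately.
    If the context doesn't contain enough information, say so.
    Always cite specific parts of the context in your response.

    Context: {context}

    Question: {question}
    """

    retrieved_chunks = ["Doc 1: ...", "Doc 2: ..."]  # stand-in for retriever output
    filled_prompt = prompt_template.format(
        context="\n\n".join(retrieved_chunks),
        question="What does the context say about solar storms?",
    )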