Optimize for Hugging Face Inference API with streaming support and RAG integration
- app.py +18 -17
- modules/analyzer.py +7 -1
- modules/rag/rag_chain.py +5 -4
app.py
CHANGED
@@ -34,11 +34,11 @@ def get_cat_greeting():
         "Please ask me anything, and I'll pounce on the best information for you!"
     )
 
-# Startup check function
+# Startup check function optimized for Hugging Face endpoint
 async def perform_startup_check():
-    """Perform startup checks to verify …"""
+    """Perform startup checks to verify Hugging Face endpoint status"""
     try:
-        # Check 1: Verify …
+        # Check 1: Verify Hugging Face endpoint is responding
         test_prompt = "Hello, this is a startup check. Please respond with 'OK' if you're operational."
 
         # Use a short timeout for the startup check
@@ -59,14 +59,14 @@ async def perform_startup_check():
         if full_response:
             return {
                 "status": "operational",
-                "message": "✅ …",
+                "message": "✅ Hugging Face endpoint is operational and ready to assist!",
                 "details": f"Received response: {full_response[:50]}..."
             }
         else:
             return {
                 "status": "warning",
-                "message": "⚠️ …",
-                "details": "…",
+                "message": "⚠️ Endpoint responded but with empty content. May need attention.",
+                "details": "Endpoint connection established but no content returned."
             }
 
     except Exception as e:
@@ -74,19 +74,19 @@ async def perform_startup_check():
         if "503" in error_msg:
             return {
                 "status": "initializing",
-                "message": "⏳ …",
-                "details": "The …",
+                "message": "⏳ Hugging Face endpoint is currently initializing (503 error detected)",
+                "details": "The model server is warming up. Please wait approximately 5 minutes before asking questions."
             }
         elif "timeout" in error_msg.lower():
             return {
                 "status": "timeout",
-                "message": "⏰ …",
-                "details": "Connection to the …",
+                "message": "⏰ Endpoint connection timed out",
+                "details": "Connection to the Hugging Face model timed out. This may indicate server initialization."
             }
         else:
             return {
                 "status": "error",
-                "message": "❌ …",
+                "message": "❌ Endpoint check failed",
                 "details": f"Error during startup check: {error_msg}"
             }
 
@@ -211,7 +211,7 @@ async def research_assistant(query, history, use_rag=False):
             wait_time = server_status["estimated_wait"]
             response = (
                 f"⏳ **Server Initializing** ⏳\n\n"
-                f"The …"
+                f"The Hugging Face model server is currently starting up. This happens automatically after periods of inactivity.\n\n"
                 f"**Estimated wait time: {wait_time} minutes**\n\n"
                 f"**What you can do:**\n"
                 f"- Wait for {wait_time} minutes and try again\n"
@@ -224,7 +224,7 @@ async def research_assistant(query, history, use_rag=False):
         return
 
     try:
-        history[-1] = (query, "🧠 Analyzing information...")
+        history[-1] = (query, "🧠 Analyzing information with Hugging Face model...")
         yield history
 
         stream = analyze_with_model(enriched_input)
@@ -397,8 +397,8 @@ with gr.Blocks(
     - 🌤️ Context-aware weather data (only when relevant)
     - 🌌 Context-aware space weather data (only when relevant)
     - 📚 RAG (Retrieval-Augmented Generation) with document database
+    - ⚡ Real-time streaming from Hugging Face endpoint
     - 📖 Real-time citations
-    - ⚡ Streaming output
     """)
 
     with gr.Column(scale=2):
@@ -469,7 +469,7 @@ with gr.Blocks(
     if result["status"] == "operational":
         cat_greeting = get_cat_greeting()
         status_md = f"""
-        ✅ **…**
+        ✅ **Hugging Face endpoint is operational and ready to assist!**
 
         🐾 **Cat Greeting:**
         *{cat_greeting}*
@@ -478,7 +478,7 @@ with gr.Blocks(
         """
     elif result["status"] == "initializing":
         status_md = f"""
-        ⏳ **…**
+        ⏳ **Hugging Face endpoint is currently initializing (503 error detected)**
 
         ⏳ **Estimated wait time:** 5 minutes
 
@@ -488,7 +488,7 @@ While you wait, why not prepare some treats? I'll be ready to hunt for knowledge...
         status_md = "🔄 Performing startup checks..."
     else:
         status_md = f"""
-        ❌ **…**
+        ❌ **Endpoint check failed**
 
         📝 **Details:** {result["details"]}
         """
@@ -543,6 +543,7 @@ if __name__ == "__main__":
     # Print public link information to logs
     print("===== Application Starting =====")
     print("Creating public link for Hugging Face Space...")
+    print("Using Hugging Face Inference API endpoint for optimal performance")
     print("Once the app launches, a public link will be available")
     print("================================")
 
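For context on how these streaming edits reach the UI: `research_assistant` treats `analyze_with_model` as a generator and re-yields the chat history as chunks arrive. A minimal sketch of that loop, assuming the tuple-style history shown in the hunks above (names besides `analyze_with_model` are illustrative):

    # Minimal sketch of the streaming loop implied by the diff above; assumes
    # tuple-style Gradio chat history. Only analyze_with_model comes from the repo.
    async def stream_answer(query, history, enriched_input):
        history[-1] = (query, "🧠 Analyzing information with Hugging Face model...")
        yield history

        partial = ""
        for token in analyze_with_model(enriched_input):
            partial += token
            history[-1] = (query, partial)  # overwrite the pending turn in place
            yield history                   # each yield triggers a UI re-render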
modules/analyzer.py
CHANGED
@@ -2,6 +2,7 @@ from openai import OpenAI
 import os
 import time
 
+# Use your existing Hugging Face endpoint
 client = OpenAI(
     base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
     api_key=os.getenv("HF_TOKEN")
@@ -10,15 +11,17 @@ client = OpenAI(
 def analyze_with_model(prompt):
     """Analyze prompt with LLM, returning a generator for streaming"""
     try:
+        # Use the Hugging Face Inference API with proper streaming
         response = client.chat.completions.create(
             model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
             messages=[{"role": "user", "content": prompt}],
-            stream=True,  # Enable streaming
+            stream=True,  # Enable streaming for real-time responses
             temperature=0.7,
             max_tokens=8192,  # Increased token limit
             timeout=120  # Increased timeout for longer responses
         )
 
+        # Stream the response chunks
         for chunk in response:
             content = chunk.choices[0].delta.content
             if content:
@@ -27,11 +30,14 @@ def analyze_with_model(prompt):
 
     except Exception as e:
         error_msg = str(e)
+        # Enhanced error detection for common Hugging Face issues
         if "503" in error_msg:
             yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "timeout" in error_msg.lower():
             yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "connection" in error_msg.lower():
             yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
+        elif "limit" in error_msg.lower():
+            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
         else:
             yield f"Error during analysis: {error_msg}"
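Because every branch above advises waiting and retrying while the endpoint warms up, a caller-side retry helper pairs naturally with this generator. A sketch under the assumption that a retryable failure surfaces as a first chunk beginning with "Error during analysis:"; the wrapper itself is hypothetical, not part of this commit:

    import time

    # Hypothetical helper: retry analyze_with_model when its first yielded chunk
    # is one of the error strings above, sleeping per the "wait 5 minutes" advice.
    def analyze_with_retry(prompt, attempts=3, wait_seconds=300):
        for attempt in range(attempts):
            chunks = analyze_with_model(prompt)
            first = next(chunks, "")
            retryable = first.startswith("Error during analysis:")
            if retryable and attempt < attempts - 1:
                time.sleep(wait_seconds)
                continue
            yield first
            yield from chunks
            return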
modules/rag/rag_chain.py
CHANGED
@@ -13,11 +13,12 @@ class RAGChain:
             search_kwargs={"k": 5}
         )
 
-        # Custom prompt template
+        # Custom prompt template optimized for your model
         self.prompt_template = """
-        You are an …
-        Use the following …
-        If …
+        You are an expert research assistant with access to relevant documents.
+        Use the following context to answer the question accurately.
+        If the context doesn't contain enough information, say so.
+        Always cite specific parts of the context in your response.
 
         Context: {context}
 
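The hunk ends at `Context: {context}`, so the remainder of the template is not shown in this commit. Purely as an illustration, this is how such a template is typically filled before being passed to the model; the trailing `{question}` placeholder and the sample data are assumptions, not part of the diff:

    # Illustrative only: the real template continues past "Context: {context}";
    # a {question} placeholder is assumed here for the sake of the example.
    prompt_template = """
    You are an expert research assistant with access to relevant documents.
    Use the following context to answer the question accurately.
    If the context doesn't contain enough information, say so.
    Always cite specific parts of the context in your response.

    Context: {context}

    Question: {question}
    """

    retrieved_chunks = ["Doc 1: ...", "Doc 2: ..."]  # stand-in for retriever output
    filled_prompt = prompt_template.format(
        context="\n\n".join(retrieved_chunks),
        question="What does the context say about solar storms?",
    )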