rdune71 committed
Commit 03da349 · 1 Parent(s): bb60cf1

Optimize for Hugging Face Inference API with streaming support and RAG integration

Files changed (3):
  1. app.py +18 -17
  2. modules/analyzer.py +7 -1
  3. modules/rag/rag_chain.py +5 -4
app.py CHANGED
@@ -34,11 +34,11 @@ def get_cat_greeting():
         "Please ask me anything, and I'll pounce on the best information for you!"
     )
 
-# Startup check function
+# Startup check function optimized for Hugging Face endpoint
 async def perform_startup_check():
-    """Perform startup checks to verify server status"""
+    """Perform startup checks to verify Hugging Face endpoint status"""
     try:
-        # Check 1: Verify server is not returning 503
+        # Check 1: Verify Hugging Face endpoint is responding
         test_prompt = "Hello, this is a startup check. Please respond with 'OK' if you're operational."
 
         # Use a short timeout for the startup check
@@ -59,14 +59,14 @@ async def perform_startup_check():
         if full_response:
             return {
                 "status": "operational",
-                "message": "✅ Server is operational and ready to assist!",
+                "message": "✅ Hugging Face endpoint is operational and ready to assist!",
                 "details": f"Received response: {full_response[:50]}..."
             }
         else:
             return {
                 "status": "warning",
-                "message": "⚠️ Server responded but with empty content. May need attention.",
-                "details": "Server connection established but no content returned."
+                "message": "⚠️ Endpoint responded but with empty content. May need attention.",
+                "details": "Endpoint connection established but no content returned."
             }
 
     except Exception as e:
@@ -74,19 +74,19 @@ async def perform_startup_check():
         if "503" in error_msg:
             return {
                 "status": "initializing",
-                "message": "⏳ Server is currently initializing (503 error detected)",
-                "details": "The AI model server is warming up. Please wait approximately 5 minutes before asking questions."
+                "message": "⏳ Hugging Face endpoint is currently initializing (503 error detected)",
+                "details": "The model server is warming up. Please wait approximately 5 minutes before asking questions."
             }
         elif "timeout" in error_msg.lower():
             return {
                 "status": "timeout",
-                "message": "⏰ Server connection timed out",
-                "details": "Connection to the AI model timed out. This may indicate server initialization."
+                "message": "⏰ Endpoint connection timed out",
+                "details": "Connection to the Hugging Face model timed out. This may indicate server initialization."
             }
         else:
             return {
                 "status": "error",
-                "message": "❌ Server check failed",
+                "message": "❌ Endpoint check failed",
                 "details": f"Error during startup check: {error_msg}"
             }
 
@@ -211,7 +211,7 @@ async def research_assistant(query, history, use_rag=False):
         wait_time = server_status["estimated_wait"]
         response = (
             f"⏳ **Server Initializing** ⏳\n\n"
-            f"The AI model server is currently starting up. This happens automatically after periods of inactivity.\n\n"
+            f"The Hugging Face model server is currently starting up. This happens automatically after periods of inactivity.\n\n"
             f"**Estimated wait time: {wait_time} minutes**\n\n"
             f"**What you can do:**\n"
             f"- Wait for {wait_time} minutes and try again\n"
@@ -224,7 +224,7 @@ async def research_assistant(query, history, use_rag=False):
         return
 
     try:
-        history[-1] = (query, "🧠 Analyzing information...")
+        history[-1] = (query, "🧠 Analyzing information with Hugging Face model...")
        yield history
 
         stream = analyze_with_model(enriched_input)
@@ -397,8 +397,8 @@ with gr.Blocks(
     - 🌤️ Context-aware weather data (only when relevant)
     - 🌌 Context-aware space weather data (only when relevant)
     - 📚 RAG (Retrieval-Augmented Generation) with document database
+    - ⚡ Real-time streaming from Hugging Face endpoint
     - 📚 Real-time citations
-    - ⚡ Streaming output
     """)
 
     with gr.Column(scale=2):
@@ -469,7 +469,7 @@ with gr.Blocks(
     if result["status"] == "operational":
         cat_greeting = get_cat_greeting()
         status_md = f"""
-✅ **Server is operational and ready to assist!**
+✅ **Hugging Face endpoint is operational and ready to assist!**
 
 🐾 **Cat Greeting:**
 *{cat_greeting}*
@@ -478,7 +478,7 @@ with gr.Blocks(
     """
     elif result["status"] == "initializing":
         status_md = f"""
-⏳ **Server is currently initializing (503 error detected)**
+⏳ **Hugging Face endpoint is currently initializing (503 error detected)**
 
 ⏳ **Estimated wait time:** 5 minutes
 
@@ -488,7 +488,7 @@ While you wait, why not prepare some treats? I'll be ready to hunt for knowledge!
         status_md = "🔄 Performing startup checks..."
     else:
         status_md = f"""
-❌ **Server check failed**
+❌ **Endpoint check failed**
 
 📝 **Details:** {result["details"]}
 """
@@ -543,6 +543,7 @@ if __name__ == "__main__":
     # Print public link information to logs
     print("===== Application Starting =====")
     print("Creating public link for Hugging Face Space...")
+    print("Using Hugging Face Inference API endpoint for optimal performance")
     print("Once the app launches, a public link will be available")
     print("================================")
 
modules/analyzer.py CHANGED
@@ -2,6 +2,7 @@ from openai import OpenAI
 import os
 import time
 
+# Use your existing Hugging Face endpoint
 client = OpenAI(
     base_url="https://zxzbfrlg3ssrk7d9.us-east-1.aws.endpoints.huggingface.cloud/v1/",
     api_key=os.getenv("HF_TOKEN")
@@ -10,15 +11,17 @@ client = OpenAI(
 def analyze_with_model(prompt):
     """Analyze prompt with LLM, returning a generator for streaming"""
     try:
+        # Use the Hugging Face Inference API with proper streaming
         response = client.chat.completions.create(
             model="DavidAU/OpenAi-GPT-oss-20b-abliterated-uncensored-NEO-Imatrix-gguf",
             messages=[{"role": "user", "content": prompt}],
-            stream=True,  # Enable streaming
+            stream=True,  # Enable streaming for real-time responses
             temperature=0.7,
             max_tokens=8192,  # Increased token limit
             timeout=120  # Increased timeout for longer responses
         )
 
+        # Stream the response chunks
         for chunk in response:
             content = chunk.choices[0].delta.content
             if content:
@@ -27,11 +30,14 @@ def analyze_with_model(prompt):
 
     except Exception as e:
         error_msg = str(e)
+        # Enhanced error detection for common Hugging Face issues
         if "503" in error_msg:
             yield f"Error during analysis: Service temporarily unavailable (503). The model server is likely initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "timeout" in error_msg.lower():
             yield f"Error during analysis: Request timed out. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
         elif "connection" in error_msg.lower():
             yield f"Error during analysis: Connection error. The model server may be initializing. Please wait 5 minutes and try again. Details: {error_msg}"
+        elif "limit" in error_msg.lower():
+            yield f"Error during analysis: Rate limit exceeded. Please wait a moment and try again. Details: {error_msg}"
         else:
             yield f"Error during analysis: {error_msg}"
modules/rag/rag_chain.py CHANGED
@@ -13,11 +13,12 @@ class RAGChain:
             search_kwargs={"k": 5}
         )
 
-        # Custom prompt template
+        # Custom prompt template optimized for your model
         self.prompt_template = """
-        You are an AI research assistant with access to a document database.
-        Use the following pieces of context to answer the question at the end.
-        If you don't know the answer, just say that you don't know, don't try to make up an answer.
+        You are an expert research assistant with access to relevant documents.
+        Use the following context to answer the question accurately.
+        If the context doesn't contain enough information, say so.
+        Always cite specific parts of the context in your response.
 
         Context: {context}
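
For context, here is how a template like this is typically rendered before being handed to the model. The hunk ends at `Context: {context}`, so the `{question}` slot and the `.format()` call in this sketch are illustrative assumptions about the rest of `RAGChain`, not code from this commit:

```python
# Hypothetical rendering step; RAGChain's actual chain wiring is not shown
# in this commit, so the {question} slot and .format() call are assumptions.
template = """
You are an expert research assistant with access to relevant documents.
Use the following context to answer the question accurately.
If the context doesn't contain enough information, say so.
Always cite specific parts of the context in your response.

Context: {context}

Question: {question}
"""

docs = ["[Doc 1] Retrieved passage text...", "[Doc 2] Another retrieved passage..."]
prompt = template.format(context="\n".join(docs),
                         question="What do the retrieved documents say?")
print(prompt)
```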