Runtime error fixes

- app.py +8 -9
- llm_backend.py +4 -7
app.py
CHANGED
@@ -16,12 +16,11 @@ llm = LlmBackend()
 _lock = threading.Lock()

 SYSTEM_PROMPT = os.environ.get('SYSTEM_PROMPT') or "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык."
-CONTEXT_SIZE = os.environ.get('CONTEXT_SIZE'
+CONTEXT_SIZE = int(os.environ.get('CONTEXT_SIZE', '500'))
 HF_CACHE_DIR = os.environ.get('HF_CACHE_DIR') or '/home/user/app/.cache'
-USE_SYSTEM_PROMPT = os.environ.get('USE_SYSTEM_PROMPT') or False
-ENABLE_GPU = os.environ.get('ENABLE_GPU') or False
-GPU_LAYERS = os.environ.get('GPU_LAYERS'
-N_GQA = os.environ.get('N_GQA') or None #must be set to 8 for 70b models
+USE_SYSTEM_PROMPT = os.environ.get('USE_SYSTEM_PROMPT', '').lower() == "true" or False
+ENABLE_GPU = os.environ.get('ENABLE_GPU', '').lower() == "true" or False
+GPU_LAYERS = int(os.environ.get('GPU_LAYERS', '0'))
 CHAT_FORMAT = os.environ.get('CHAT_FORMAT') or 'llama-2'
 REPO_NAME = os.environ.get('REPO_NAME') or 'IlyaGusev/saiga2_7b_gguf'
 MODEL_NAME = os.environ.get('MODEL_NAME') or 'model-q4_K.gguf'
@@ -154,7 +153,7 @@ def generate_response():
     return Response(generate_and_log_tokens(user_request='1', generator=generator), content_type='text/plain', status=200, direct_passthrough=True)

 def init_model():
-    llm.load_model(model_path=MODEL_PATH, context_size=CONTEXT_SIZE, enable_gpu=ENABLE_GPU, gpu_layer_number=GPU_LAYERS
+    llm.load_model(model_path=MODEL_PATH, context_size=CONTEXT_SIZE, enable_gpu=ENABLE_GPU, gpu_layer_number=GPU_LAYERS)

 # Function to check if no requests were made in the last 5 minutes
 def check_last_request_time():
@@ -171,9 +170,9 @@ if __name__ == "__main__":

     init_model()

-    scheduler = BackgroundScheduler()
-    scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
-    scheduler.start()
+    # scheduler = BackgroundScheduler()
+    # scheduler.add_job(check_last_request_time, trigger='interval', minutes=1)
+    # scheduler.start()

     app.run(host="0.0.0.0", port=7860, debug=True, threaded=True)

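The config block is the core of this fix: os.environ.get always returns a string (or None), so the old "X or False" pattern could hand a truthy string like "false" to code expecting a bool, and a string context size to code expecting an int. A minimal sketch of the difference, not part of the commit, with illustrative values (the removed CONTEXT_SIZE line is truncated in this view, so its exact fallback is assumed):

import os

# Illustrative values; the variable names match app.py, the values do not.
os.environ["ENABLE_GPU"] = "false"
os.environ["CONTEXT_SIZE"] = "500"

# Old pattern: any non-empty string is truthy, and numeric settings stay strings.
enable_gpu_old = os.environ.get("ENABLE_GPU") or False   # -> "false" (truthy!)
context_old = os.environ.get("CONTEXT_SIZE")              # -> "500" (str, not int)

# New pattern from this commit: explicit bool/int conversion with defaults.
enable_gpu_new = os.environ.get("ENABLE_GPU", "").lower() == "true"   # -> False
context_new = int(os.environ.get("CONTEXT_SIZE", "500"))              # -> 500

print(bool(enable_gpu_old), enable_gpu_new, type(context_old), type(context_new))

The trailing "or False" in the committed USE_SYSTEM_PROMPT and ENABLE_GPU lines is redundant (the == comparison already yields a bool) but harmless.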
llm_backend.py
CHANGED
@@ -34,14 +34,13 @@ class LlmBackend:
     def is_model_loaded(self):
         return self._model is not None

-    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35,
+    def load_model(self, model_path, context_size=2000, enable_gpu=True, gpu_layer_number=35, chat_format='llama-2'):
         log.info('load_model - started')
         self._model_params = {}
         self._model_params['model_path'] = model_path
         self._model_params['context_size'] = context_size
         self._model_params['enable_gpu'] = enable_gpu
         self._model_params['gpu_layer_number'] = gpu_layer_number
-        self._model_params['n_gqa'] = n_gqa
         self._model_params['chat_format'] = chat_format

         if self._model is not None:
@@ -57,9 +56,8 @@ class LlmBackend:
                 #n_batch=100,
                 logits_all=True,
                 #n_threads=12,
-                verbose=
-                n_gpu_layers=gpu_layer_number
-                n_gqa=n_gqa #must be set for 70b models
+                verbose=True,
+                n_gpu_layers=gpu_layer_number
             )
             log.info('load_model - finished')
             return self._model
@@ -72,8 +70,7 @@ class LlmBackend:
                 #n_batch=100,
                 logits_all=True,
                 #n_threads=12,
-                verbose=
-                n_gqa=n_gqa #must be set for 70b models
+                verbose=True
             )
             log.info('load_model - finished')
             return self._model
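With n_gqa dropped from both the load_model signature and the Llama(...) calls, the backend leaves grouped-query-attention settings to the model file itself (GGUF files carry this in their metadata), and any caller still passing n_gqa now fails. A minimal usage sketch of the new signature, assuming LlmBackend is importable from llm_backend and using an illustrative model path (values are not taken from the repo):

from llm_backend import LlmBackend

backend = LlmBackend()

# New signature: n_gqa is gone; chat_format keeps its 'llama-2' default.
model = backend.load_model(
    model_path="/home/user/app/.cache/model-q4_K.gguf",  # illustrative path
    context_size=500,
    enable_gpu=False,        # CPU path: the second Llama(...) block, no n_gpu_layers
    gpu_layer_number=0,
    chat_format="llama-2",
)

# Passing n_gqa=8 as before now raises a TypeError (unexpected keyword argument).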