
Commit 31fa7dd
small changes
mobius committed Jan 11, 2024
1 parent 7e2edc8
Showing 2 changed files with 10 additions and 5 deletions.
.gitignore (6 changes: 4 additions & 2 deletions)
@@ -16,8 +16,10 @@ node_modules/
 
 
 # Misc
-test.md
-test.txt
+/test*.md
+/test*.txt
+/test*.py
+/sandbox
 *.log
 debug_transcription.wav
 t12_outputs
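Note: the leading / added to these patterns anchors them to the repository root, so files matching test*.md, test*.txt, or test*.py are only ignored at the top level, not in subdirectories.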
uniteai/llm_server.py (9 changes: 6 additions & 3 deletions)
@@ -49,7 +49,7 @@
 import llama_cpp
 import queue
 import time
-
+import traceback
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'  # note: this isn't relevant to GGUF models
 
@@ -78,7 +78,8 @@ def load_model(args):
     from llama_cpp import Llama
     model = Llama(
         model_path=name_or_path,
-        verbose=False
+        verbose=False,
+        n_ctx=2048,
     )
     tokenizer = None
     return tokenizer, model
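For context, n_ctx sets the context window (in tokens) that llama.cpp allocates; without it, llama-cpp-python falls back to a much smaller default (512 in versions from this period), and prompts longer than the window get truncated or rejected. A minimal sketch of the resulting call, assuming a hypothetical local GGUF file at ./model.gguf:

```python
# Minimal sketch: load a GGUF model with a 2048-token context window.
# Assumes llama-cpp-python is installed; the model path is hypothetical.
from llama_cpp import Llama

model = Llama(
    model_path='./model.gguf',  # hypothetical path to a GGUF model file
    verbose=False,              # silence llama.cpp's banner and per-call logs
    n_ctx=2048,                 # context window in tokens (prompt + generated)
)
```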
@@ -201,7 +202,8 @@ def f(input_ids: torch.LongTensor,
 
     stream = model(
         request.text,
-        max_tokens=200,
+        # max_tokens=200,
+        max_tokens=999999999,
         stream=True,
         echo=False,  # echo the prompt back as output
         stopping_criteria=stopping_criteria,
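Setting max_tokens this high effectively removes the per-request cap, so generation ends on an EOS token, the stopping_criteria passed above, or exhaustion of the n_ctx window. A minimal sketch of consuming such a stream, reusing the model from the previous sketch with a made-up prompt:

```python
# Minimal sketch: stream a completion from llama-cpp-python.
# 'model' is the Llama instance from the sketch above; the prompt is made up.
stream = model(
    'Write a haiku about text editors.',
    max_tokens=999999999,  # effectively uncapped; EOS or the context window ends generation
    stream=True,
    echo=False,            # don't echo the prompt back as output
)
for chunk in stream:
    # each chunk is a completion fragment in OpenAI-style shape
    print(chunk['choices'][0]['text'], end='', flush=True)
```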
@@ -250,6 +252,7 @@ def f(input_ids: torch.LongTensor,
     try:
         model.generate(**generation_kwargs)  # blocks
     except RuntimeError as e:
+        traceback.print_exc()
         streamer.on_finalized_text(f'\n<LLM SERVER ERROR: {e}>', stream_end=True)
     print('DONE GENERATING')
 
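This pairs with the import traceback added above: the client only ever sees str(e) via the streamer, so printing the full stack trace server-side preserves the debugging context. A small illustration of the difference, with a made-up error:

```python
# Small illustration: str(e) vs. the full traceback.
import traceback

try:
    raise RuntimeError('CUDA out of memory')  # stand-in for a failing model.generate()
except RuntimeError as e:
    traceback.print_exc()  # full stack trace to stderr, kept in the server log
    print(f'\n<LLM SERVER ERROR: {e}>')  # the short message the client receives
```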
