From 0676c8f41ad6ac7fd2f3ca730d3b41163a92fe02 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Thu, 2 May 2024 22:34:43 -0400 Subject: [PATCH 1/9] better messaging on timeout --- custom_components/llama_conversation/agent.py | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/custom_components/llama_conversation/agent.py b/custom_components/llama_conversation/agent.py index b07a701..d55234c 100644 --- a/custom_components/llama_conversation/agent.py +++ b/custom_components/llama_conversation/agent.py @@ -894,15 +894,17 @@ def _generate(self, conversation: dict) -> str: if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" - result = requests.post( - f"{self.api_host}{endpoint}", - json=request_params, - timeout=timeout, - headers=headers, - ) - try: + result = requests.post( + f"{self.api_host}{endpoint}", + json=request_params, + timeout=timeout, + headers=headers, + ) + result.raise_for_status() + except requests.exceptions.Timeout: + return f"The generation request timed out! Please increase the timeout in settings or decrease the number of exposed entities." except requests.RequestException as err: _LOGGER.debug(f"Err was: {err}") _LOGGER.debug(f"Request was: {request_params}") @@ -1141,15 +1143,17 @@ def _generate(self, conversation: dict) -> str: if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" - result = requests.post( - f"{self.api_host}{endpoint}", - json=request_params, - timeout=timeout, - headers=headers, - ) - try: + result = requests.post( + f"{self.api_host}{endpoint}", + json=request_params, + timeout=timeout, + headers=headers, + ) + result.raise_for_status() + except requests.exceptions.Timeout: + return f"The generation request timed out! Please increase the timeout in settings or decrease the number of exposed entities." except requests.RequestException as err: _LOGGER.debug(f"Err was: {err}") _LOGGER.debug(f"Request was: {request_params}") From 4b9f9ed2fa23a0692af34bce9dd04ae37855ad6f Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Thu, 2 May 2024 22:46:59 -0400 Subject: [PATCH 2/9] properly validate install if we are re-installing --- custom_components/llama_conversation/agent.py | 1 + custom_components/llama_conversation/utils.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/custom_components/llama_conversation/agent.py b/custom_components/llama_conversation/agent.py index d55234c..8730335 100644 --- a/custom_components/llama_conversation/agent.py +++ b/custom_components/llama_conversation/agent.py @@ -548,6 +548,7 @@ def _load_model(self, entry: ConfigEntry) -> None: if not install_result == True: raise ConfigEntryError("llama-cpp-python was not installed on startup and re-installing it led to an error!") + validate_llama_cpp_python_installation() self.llama_cpp_module = importlib.import_module("llama_cpp") Llama = getattr(self.llama_cpp_module, "Llama") diff --git a/custom_components/llama_conversation/utils.py b/custom_components/llama_conversation/utils.py index a736906..884a7b9 100644 --- a/custom_components/llama_conversation/utils.py +++ b/custom_components/llama_conversation/utils.py @@ -68,9 +68,16 @@ def download_model_from_hf(model_name: str, quantization_type: str, storage_fold ) def _load_extension(): - """This needs to be at the root file level because we are using the 'spawn' start method""" + """ + Makes sure it is possible to load llama-cpp-python without crashing Home Assistant. 
+ This needs to be at the root file level because we are using the 'spawn' start method. + Also ignore ModuleNotFoundError because that just means it's not installed. Not that it will crash HA + """ import importlib - importlib.import_module("llama_cpp") + try: + importlib.import_module("llama_cpp") + except ModuleNotFoundError: + pass def validate_llama_cpp_python_installation(): """ From 7a649546ff9e636e4b1acc8cfd39b1bca9a6f246 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Thu, 2 May 2024 22:56:45 -0400 Subject: [PATCH 3/9] fix multiprocessing error --- custom_components/llama_conversation/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/custom_components/llama_conversation/utils.py b/custom_components/llama_conversation/utils.py index 884a7b9..a42654e 100644 --- a/custom_components/llama_conversation/utils.py +++ b/custom_components/llama_conversation/utils.py @@ -3,6 +3,7 @@ import sys import platform import logging +import multiprocessing import voluptuous as vol import webcolors from importlib.metadata import version @@ -83,9 +84,8 @@ def validate_llama_cpp_python_installation(): """ Spawns another process and tries to import llama.cpp to avoid crashing the main process """ - import multiprocessing - multiprocessing.set_start_method('spawn') # required because of aio - process = multiprocessing.Process(target=_load_extension) + mp_ctx = multiprocessing.get_context('spawn') # required because of aio + process = mp_ctx.Process(target=_load_extension) process.start() process.join() From cdd7e8415a478b1482322cfaf5449bab35d85c40 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Thu, 2 May 2024 23:05:43 -0400 Subject: [PATCH 4/9] hook up flash attention --- custom_components/llama_conversation/agent.py | 13 ++++++++++--- custom_components/llama_conversation/config_flow.py | 7 +++++++ custom_components/llama_conversation/const.py | 3 +++ .../llama_conversation/translations/en.json | 2 ++ 4 files changed, 22 insertions(+), 3 deletions(-) diff --git a/custom_components/llama_conversation/agent.py b/custom_components/llama_conversation/agent.py index 8730335..d18c588 100644 --- a/custom_components/llama_conversation/agent.py +++ b/custom_components/llama_conversation/agent.py @@ -41,6 +41,7 @@ CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, CONF_ALLOWED_SERVICE_CALL_ARGUMENTS, CONF_PROMPT_TEMPLATE, + CONF_ENABLE_FLASH_ATTENTION, CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, CONF_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -75,6 +76,7 @@ DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_ALLOWED_SERVICE_CALL_ARGUMENTS, DEFAULT_PROMPT_TEMPLATE, + DEFAULT_ENABLE_FLASH_ATTENTION, DEFAULT_USE_GBNF_GRAMMAR, DEFAULT_GBNF_GRAMMAR_FILE, DEFAULT_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -559,13 +561,15 @@ def _load_model(self, entry: ConfigEntry) -> None: self.loaded_model_settings[CONF_BATCH_SIZE] = entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE) self.loaded_model_settings[CONF_THREAD_COUNT] = entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT) self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] = entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT) + self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] = entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION) self.llm = Llama( model_path=self.model_path, n_ctx=int(self.loaded_model_settings[CONF_CONTEXT_LENGTH]), n_batch=int(self.loaded_model_settings[CONF_BATCH_SIZE]), n_threads=int(self.loaded_model_settings[CONF_THREAD_COUNT]), - 
n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]) + n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]), + flash_attn=self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION], ) _LOGGER.debug("Model loaded") @@ -614,13 +618,15 @@ def _update_options(self): if self.loaded_model_settings[CONF_CONTEXT_LENGTH] != self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH) or \ self.loaded_model_settings[CONF_BATCH_SIZE] != self.entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE) or \ self.loaded_model_settings[CONF_THREAD_COUNT] != self.entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT) or \ - self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] != self.entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT): + self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] != self.entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT) or \ + self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] != self.entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION): _LOGGER.debug(f"Reloading model '{self.model_path}'...") self.loaded_model_settings[CONF_CONTEXT_LENGTH] = self.entry.options.get(CONF_CONTEXT_LENGTH, DEFAULT_CONTEXT_LENGTH) self.loaded_model_settings[CONF_BATCH_SIZE] = self.entry.options.get(CONF_BATCH_SIZE, DEFAULT_BATCH_SIZE) self.loaded_model_settings[CONF_THREAD_COUNT] = self.entry.options.get(CONF_THREAD_COUNT, DEFAULT_THREAD_COUNT) self.loaded_model_settings[CONF_BATCH_THREAD_COUNT] = self.entry.options.get(CONF_BATCH_THREAD_COUNT, DEFAULT_BATCH_THREAD_COUNT) + self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION] = self.entry.options.get(CONF_ENABLE_FLASH_ATTENTION, DEFAULT_ENABLE_FLASH_ATTENTION) Llama = getattr(self.llama_cpp_module, "Llama") self.llm = Llama( @@ -628,7 +634,8 @@ def _update_options(self): n_ctx=int(self.loaded_model_settings[CONF_CONTEXT_LENGTH]), n_batch=int(self.loaded_model_settings[CONF_BATCH_SIZE]), n_threads=int(self.loaded_model_settings[CONF_THREAD_COUNT]), - n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]) + n_threads_batch=int(self.loaded_model_settings[CONF_BATCH_THREAD_COUNT]), + flash_attn=self.loaded_model_settings[CONF_ENABLE_FLASH_ATTENTION], ) _LOGGER.debug("Model loaded") model_reloaded = True diff --git a/custom_components/llama_conversation/config_flow.py b/custom_components/llama_conversation/config_flow.py index c75eb82..ed7e17f 100644 --- a/custom_components/llama_conversation/config_flow.py +++ b/custom_components/llama_conversation/config_flow.py @@ -54,6 +54,7 @@ CONF_DOWNLOADED_MODEL_QUANTIZATION, CONF_DOWNLOADED_MODEL_QUANTIZATION_OPTIONS, CONF_PROMPT_TEMPLATE, + CONF_ENABLE_FLASH_ATTENTION, CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, @@ -93,6 +94,7 @@ DEFAULT_BACKEND_TYPE, DEFAULT_DOWNLOADED_MODEL_QUANTIZATION, DEFAULT_PROMPT_TEMPLATE, + DEFAULT_ENABLE_FLASH_ATTENTION, DEFAULT_USE_GBNF_GRAMMAR, DEFAULT_GBNF_GRAMMAR_FILE, DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE, @@ -811,6 +813,11 @@ def local_llama_config_option_schema(options: MappingProxyType[str, Any], backen description={"suggested_value": options.get(CONF_BATCH_THREAD_COUNT)}, default=DEFAULT_BATCH_THREAD_COUNT, ): NumberSelector(NumberSelectorConfig(min=1, max=(os.cpu_count() * 2), step=1)), + vol.Required( + CONF_ENABLE_FLASH_ATTENTION, + description={"suggested_value": options.get(CONF_ENABLE_FLASH_ATTENTION)}, + default=DEFAULT_ENABLE_FLASH_ATTENTION, + ): BooleanSelector(BooleanSelectorConfig()), vol.Required( 
CONF_USE_GBNF_GRAMMAR, description={"suggested_value": options.get(CONF_USE_GBNF_GRAMMAR)}, diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py index e235015..b333fab 100644 --- a/custom_components/llama_conversation/const.py +++ b/custom_components/llama_conversation/const.py @@ -120,6 +120,8 @@ "generation_prompt": "<|start_header_id|>assistant<|end_header_id|>\n\n" } } +CONF_ENABLE_FLASH_ATTENTION = "enable_flash_attention" +DEFAULT_ENABLE_FLASH_ATTENTION = False CONF_USE_GBNF_GRAMMAR = "gbnf_grammar" DEFAULT_USE_GBNF_GRAMMAR = False CONF_GBNF_GRAMMAR_FILE = "gbnf_grammar_file" @@ -178,6 +180,7 @@ CONF_TEMPERATURE: DEFAULT_TEMPERATURE, CONF_REQUEST_TIMEOUT: DEFAULT_REQUEST_TIMEOUT, CONF_PROMPT_TEMPLATE: DEFAULT_PROMPT_TEMPLATE, + CONF_ENABLE_FLASH_ATTENTION: DEFAULT_ENABLE_FLASH_ATTENTION, CONF_USE_GBNF_GRAMMAR: DEFAULT_USE_GBNF_GRAMMAR, CONF_EXTRA_ATTRIBUTES_TO_EXPOSE: DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE, CONF_ALLOWED_SERVICE_CALL_ARGUMENTS: DEFAULT_ALLOWED_SERVICE_CALL_ARGUMENTS, diff --git a/custom_components/llama_conversation/translations/en.json b/custom_components/llama_conversation/translations/en.json index adfb594..24afee4 100644 --- a/custom_components/llama_conversation/translations/en.json +++ b/custom_components/llama_conversation/translations/en.json @@ -63,6 +63,7 @@ "ollama_json_mode": "JSON Output Mode", "extra_attributes_to_expose": "Additional attribute to expose in the context", "allowed_service_call_arguments": "Arguments allowed to be pass to service calls", + "enable_flash_attention": "Enable Flash Attention", "gbnf_grammar": "Enable GBNF Grammar", "gbnf_grammar_file": "GBNF Grammar Filename", "openai_api_key": "API Key", @@ -115,6 +116,7 @@ "ollama_json_mode": "JSON Output Mode", "extra_attributes_to_expose": "Additional attribute to expose in the context", "allowed_service_call_arguments": "Arguments allowed to be pass to service calls", + "enable_flash_attention": "Enable Flash Attention", "gbnf_grammar": "Enable GBNF Grammar", "gbnf_grammar_file": "GBNF Grammar Filename", "openai_api_key": "API Key", From 9b9de48ad497ccce452cd02b24044a6c534390b4 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Fri, 3 May 2024 22:52:34 -0400 Subject: [PATCH 5/9] fix tests --- tests/llama_conversation/test_agent.py | 5 +++++ tests/llama_conversation/test_config_flow.py | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/llama_conversation/test_agent.py b/tests/llama_conversation/test_agent.py index dd6e312..49df0ce 100644 --- a/tests/llama_conversation/test_agent.py +++ b/tests/llama_conversation/test_agent.py @@ -20,6 +20,7 @@ CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, CONF_ALLOWED_SERVICE_CALL_ARGUMENTS, CONF_PROMPT_TEMPLATE, + CONF_ENABLE_FLASH_ATTENTION, CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, CONF_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -55,6 +56,7 @@ DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_ALLOWED_SERVICE_CALL_ARGUMENTS, DEFAULT_PROMPT_TEMPLATE, + DEFAULT_ENABLE_FLASH_ATTENTION, DEFAULT_USE_GBNF_GRAMMAR, DEFAULT_GBNF_GRAMMAR_FILE, DEFAULT_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -208,6 +210,7 @@ async def test_local_llama_agent(local_llama_agent_fixture): n_batch=local_llama_agent.entry.options.get(CONF_BATCH_SIZE), n_threads=local_llama_agent.entry.options.get(CONF_THREAD_COUNT), n_threads_batch=local_llama_agent.entry.options.get(CONF_BATCH_THREAD_COUNT), + flash_attn=local_llama_agent.entry.options.get(CONF_ENABLE_FLASH_ATTENTION) ) all_mocks["tokenize"].assert_called_once() @@ -231,6 +234,7 
@@ async def test_local_llama_agent(local_llama_agent_fixture): local_llama_agent.entry.options[CONF_THREAD_COUNT] = 24 local_llama_agent.entry.options[CONF_BATCH_THREAD_COUNT] = 24 local_llama_agent.entry.options[CONF_TEMPERATURE] = 2.0 + local_llama_agent.entry.options[CONF_ENABLE_FLASH_ATTENTION] = True local_llama_agent.entry.options[CONF_TOP_K] = 20 local_llama_agent.entry.options[CONF_TOP_P] = 0.9 local_llama_agent.entry.options[CONF_MIN_P] = 0.2 @@ -244,6 +248,7 @@ async def test_local_llama_agent(local_llama_agent_fixture): n_batch=local_llama_agent.entry.options.get(CONF_BATCH_SIZE), n_threads=local_llama_agent.entry.options.get(CONF_THREAD_COUNT), n_threads_batch=local_llama_agent.entry.options.get(CONF_BATCH_THREAD_COUNT), + flash_attn=local_llama_agent.entry.options.get(CONF_ENABLE_FLASH_ATTENTION) ) # do another turn of the same conversation diff --git a/tests/llama_conversation/test_config_flow.py b/tests/llama_conversation/test_config_flow.py index a28a0db..13fb4b5 100644 --- a/tests/llama_conversation/test_config_flow.py +++ b/tests/llama_conversation/test_config_flow.py @@ -26,6 +26,7 @@ CONF_EXTRA_ATTRIBUTES_TO_EXPOSE, CONF_ALLOWED_SERVICE_CALL_ARGUMENTS, CONF_PROMPT_TEMPLATE, + CONF_ENABLE_FLASH_ATTENTION, CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, CONF_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -67,6 +68,7 @@ DEFAULT_EXTRA_ATTRIBUTES_TO_EXPOSE, DEFAULT_ALLOWED_SERVICE_CALL_ARGUMENTS, DEFAULT_PROMPT_TEMPLATE, + DEFAULT_ENABLE_FLASH_ATTENTION, DEFAULT_USE_GBNF_GRAMMAR, DEFAULT_GBNF_GRAMMAR_FILE, DEFAULT_USE_IN_CONTEXT_LEARNING_EXAMPLES, @@ -304,7 +306,7 @@ def test_validate_options_schema(): options_llama_hf = local_llama_config_option_schema(None, BACKEND_TYPE_LLAMA_HF) assert set(options_llama_hf.keys()) == set(universal_options + [ CONF_TOP_K, CONF_TEMPERATURE, CONF_TOP_P, CONF_MIN_P, CONF_TYPICAL_P, # supports all sampling parameters - CONF_BATCH_SIZE, CONF_THREAD_COUNT, CONF_BATCH_THREAD_COUNT, # llama.cpp specific + CONF_BATCH_SIZE, CONF_THREAD_COUNT, CONF_BATCH_THREAD_COUNT, CONF_ENABLE_FLASH_ATTENTION, # llama.cpp specific CONF_CONTEXT_LENGTH, # supports context length CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, # supports GBNF CONF_PROMPT_CACHING_ENABLED, CONF_PROMPT_CACHING_INTERVAL # supports prompt caching @@ -313,7 +315,7 @@ def test_validate_options_schema(): options_llama_existing = local_llama_config_option_schema(None, BACKEND_TYPE_LLAMA_EXISTING) assert set(options_llama_existing.keys()) == set(universal_options + [ CONF_TOP_K, CONF_TEMPERATURE, CONF_TOP_P, CONF_MIN_P, CONF_TYPICAL_P, # supports all sampling parameters - CONF_BATCH_SIZE, CONF_THREAD_COUNT, CONF_BATCH_THREAD_COUNT, # llama.cpp specific + CONF_BATCH_SIZE, CONF_THREAD_COUNT, CONF_BATCH_THREAD_COUNT, CONF_ENABLE_FLASH_ATTENTION, # llama.cpp specific CONF_CONTEXT_LENGTH, # supports context length CONF_USE_GBNF_GRAMMAR, CONF_GBNF_GRAMMAR_FILE, # supports GBNF CONF_PROMPT_CACHING_ENABLED, CONF_PROMPT_CACHING_INTERVAL # supports prompt caching From 0bd969f851d0593206c6efe0d2c4f2f1df09e859 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Sat, 4 May 2024 07:30:53 -0400 Subject: [PATCH 6/9] re-enable armhf builds --- .github/workflows/create-release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/create-release.yml b/.github/workflows/create-release.yml index d6a0308..3f41784 100644 --- a/.github/workflows/create-release.yml +++ b/.github/workflows/create-release.yml @@ -19,7 +19,7 @@ jobs: fail-fast: false matrix: home_assistant_version: ["2023.12.4", 
"2024.2.1"] - arch: [aarch64, amd64, i386] # armhf + arch: [aarch64, armhf, amd64, i386] suffix: [""] include: - home_assistant_version: "2024.2.1" From f95793433ed2fd49869fb85b7768d8528e376d11 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Sat, 4 May 2024 07:31:50 -0400 Subject: [PATCH 7/9] one other tweak to timeout warning --- custom_components/llama_conversation/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_components/llama_conversation/agent.py b/custom_components/llama_conversation/agent.py index d18c588..5dc99fd 100644 --- a/custom_components/llama_conversation/agent.py +++ b/custom_components/llama_conversation/agent.py @@ -912,7 +912,7 @@ def _generate(self, conversation: dict) -> str: result.raise_for_status() except requests.exceptions.Timeout: - return f"The generation request timed out! Please increase the timeout in settings or decrease the number of exposed entities." + return f"The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities." except requests.RequestException as err: _LOGGER.debug(f"Err was: {err}") _LOGGER.debug(f"Request was: {request_params}") From 9eacd3edb296c4f8b9022184fc15e9239ff08f56 Mon Sep 17 00:00:00 2001 From: Alex O'Connell Date: Sat, 4 May 2024 07:33:39 -0400 Subject: [PATCH 8/9] Release v0.2.15 --- README.md | 1 + custom_components/llama_conversation/const.py | 2 +- custom_components/llama_conversation/manifest.json | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a69c1e1..cd06d7b 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ In order to facilitate running the project entirely on the system where Home Ass ## Version History | Version | Description | |---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| v0.2.15 | Fix startup error when using llama.cpp backend and add flash attention to llama.cpp backend | | v0.2.14 | Fix llama.cpp wheels + AVX detection | | v0.2.13 | Add support for Llama 3, build llama.cpp wheels that are compatible with non-AVX systems, fix an error with exposing script entities, fix multiple small Ollama backend issues, and add basic multi-language support | | v0.2.12 | Fix cover ICL examples, allow setting number of ICL examples, add min P and typical P sampler options, recommend models during setup, add JSON mode for Ollama backend, fix missing default options | diff --git a/custom_components/llama_conversation/const.py b/custom_components/llama_conversation/const.py index b333fab..f6e8e47 100644 --- a/custom_components/llama_conversation/const.py +++ b/custom_components/llama_conversation/const.py @@ -274,5 +274,5 @@ } } -INTEGRATION_VERSION = "0.2.14" +INTEGRATION_VERSION = "0.2.15" EMBEDDED_LLAMA_CPP_PYTHON_VERSION = "0.2.69" \ No newline at end of file diff --git a/custom_components/llama_conversation/manifest.json b/custom_components/llama_conversation/manifest.json index 6ddf608..3079207 100644 --- a/custom_components/llama_conversation/manifest.json +++ b/custom_components/llama_conversation/manifest.json @@ -1,7 +1,7 @@ { "domain": "llama_conversation", "name": "LLaMA Conversation", - "version": "0.2.14", + "version": "0.2.15", "codeowners": ["@acon96"], "config_flow": true, "dependencies": ["conversation"], From 26be7d7dcd97988d84e4959b2b6f69029d57939a Mon 
Sep 17 00:00:00 2001 From: Alex O'Connell Date: Sat, 4 May 2024 07:36:15 -0400 Subject: [PATCH 9/9] fix other error message --- custom_components/llama_conversation/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_components/llama_conversation/agent.py b/custom_components/llama_conversation/agent.py index 5dc99fd..fe2b8df 100644 --- a/custom_components/llama_conversation/agent.py +++ b/custom_components/llama_conversation/agent.py @@ -1161,7 +1161,7 @@ def _generate(self, conversation: dict) -> str: result.raise_for_status() except requests.exceptions.Timeout: - return f"The generation request timed out! Please increase the timeout in settings or decrease the number of exposed entities." + return f"The generation request timed out! Please check your connection settings, increase the timeout in settings, or decrease the number of exposed entities." except requests.RequestException as err: _LOGGER.debug(f"Err was: {err}") _LOGGER.debug(f"Request was: {request_params}")
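
Illustration (not part of the series): the timeout handling added in PATCH 1/9 and reworded in PATCH 7/9 and PATCH 9/9 depends on except-clause ordering, because requests.exceptions.Timeout is a subclass of requests.RequestException and must therefore be caught first. Below is a minimal, self-contained sketch of that pattern; the function name, the fallback return values, and the placeholder host are illustrative assumptions, not code from the integration.

import requests

def post_generation_request(api_host: str, endpoint: str, request_params: dict,
                            timeout: float, headers: dict) -> str:
    """Sketch of the request/error-handling shape used by _generate() in the patches above."""
    try:
        result = requests.post(
            f"{api_host}{endpoint}",
            json=request_params,
            timeout=timeout,
            headers=headers,
        )
        result.raise_for_status()
    except requests.exceptions.Timeout:
        # Must precede the generic handler: Timeout subclasses RequestException,
        # so a RequestException clause listed first would swallow timeouts.
        return ("The generation request timed out! Please check your connection "
                "settings, increase the timeout in settings, or decrease the "
                "number of exposed entities.")
    except requests.RequestException as err:
        # The integration logs the error and the request at debug level here;
        # this sketch just returns a generic message.
        return f"Failed to communicate with the backend: {err}"
    return result.text  # the real agent parses the JSON body; omitted in this sketch

print(post_generation_request("http://localhost:5000", "/v1/completions",
                              {"prompt": "turn on the kitchen light"}, 90.0, {}))

In practice the timeout surfaces as requests.exceptions.ReadTimeout (a Timeout subclass): the backend accepts the connection but takes longer than the configured timeout to stream a completion back, which is the common failure mode with large prompts or many exposed entities.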
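
Illustration (not part of the series): PATCH 2/9 and PATCH 3/9 validate llama-cpp-python by importing it in a spawned child process, so that a native crash during import (for example an illegal-instruction fault from a wheel built for the wrong CPU) kills the child instead of Home Assistant. PATCH 3/9 then switches from multiprocessing.set_start_method(), which raises RuntimeError once a start method has already been chosen elsewhere in the process (presumably the "multiprocessing error" the commit message refers to), to multiprocessing.get_context('spawn'). The sketch below is a standalone rendering of that pattern; the exit-code check and the __main__ harness are assumptions for illustration, since the hunks above end before validate_llama_cpp_python_installation() reports its result.

import importlib
import multiprocessing


def _load_extension():
    """Kept at module level: the 'spawn' start method re-imports the module and looks the target up by name."""
    try:
        importlib.import_module("llama_cpp")
    except ModuleNotFoundError:
        # Not installed is acceptable; only a hard crash during import is a problem.
        pass


def llama_cpp_import_is_safe() -> bool:
    # get_context() returns a separate context without touching the global
    # start method, so it cannot hit the RuntimeError that set_start_method()
    # raises when a start method is already set.
    ctx = multiprocessing.get_context("spawn")
    process = ctx.Process(target=_load_extension)
    process.start()
    process.join()
    # A child killed by a signal reports a negative exit code; 0 means the
    # import either succeeded or was cleanly skipped.
    return process.exitcode == 0


if __name__ == "__main__":
    # The guard matters with 'spawn': the child re-imports this module, and
    # without it the validation would recurse.
    print("llama-cpp-python can be imported safely:", llama_cpp_import_is_safe())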