Merge branch 'branch-24.03-doca' of github.com:e-ago/MorpheusDoca into branch-24.03-doca
nv-morpheus · Mar 13, 2024 · f081ceb · f081ceb
2 parents 6b80c63 + faa3d1e
commit f081ceb
Show file tree

Hide file tree

Showing 23 changed files with 517 additions and 169 deletions.
diff --git a/conda/environments/all_cuda-121_arch-x86_64.yaml b/conda/environments/all_cuda-121_arch-x86_64.yaml
@@ -93,7 +93,9 @@ dependencies:
 - pytorch=*=*cuda*
 - rapidjson=1.1.0
 - rdma-core>=48
+- requests
 - requests-cache=1.1
+- requests-toolbelt
 - s3fs=2023.12.2
 - scikit-build=0.17.6
 - scikit-learn=1.3.2
@@ -117,7 +119,7 @@ dependencies:
   - --find-links https://data.dgl.ai/wheels/cu121/repo.html
   - PyMuPDF==1.23.21
   - databricks-connect
-  - dgl
+  - dgl==2.0.0
   - dglgo
   - google-search-results==2.4
   - langchain==0.1.9

diff --git a/conda/environments/dev_cuda-121_arch-x86_64.yaml b/conda/environments/dev_cuda-121_arch-x86_64.yaml
@@ -73,7 +73,9 @@ dependencies:
 - pytorch-cuda
 - pytorch=*=*cuda*
 - rapidjson=1.1.0
+- requests
 - requests-cache=1.1
+- requests-toolbelt
 - scikit-build=0.17.6
 - scikit-learn=1.3.2
 - sphinx

diff --git a/conda/environments/examples_cuda-121_arch-x86_64.yaml b/conda/environments/examples_cuda-121_arch-x86_64.yaml
@@ -46,7 +46,9 @@ dependencies:
 - python=3.10
 - pytorch-cuda
 - pytorch=*=*cuda*
+- requests
 - requests-cache=1.1
+- requests-toolbelt
 - s3fs=2023.12.2
 - scikit-learn=1.3.2
 - sentence-transformers
@@ -61,7 +63,7 @@ dependencies:
   - --find-links https://data.dgl.ai/wheels/cu121/repo.html
   - PyMuPDF==1.23.21
   - databricks-connect
-  - dgl
+  - dgl==2.0.0
   - dglgo
   - google-search-results==2.4
   - langchain==0.1.9

diff --git a/conda/environments/runtime_cuda-121_arch-x86_64.yaml b/conda/environments/runtime_cuda-121_arch-x86_64.yaml
@@ -28,7 +28,9 @@ dependencies:
 - python=3.10
 - pytorch-cuda
 - pytorch=*=*cuda*
+- requests
 - requests-cache=1.1
+- requests-toolbelt
 - scikit-learn=1.3.2
 - sqlalchemy
 - tqdm=4

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -261,7 +261,9 @@ dependencies:
           - python-graphviz
           - pytorch-cuda
           - pytorch=*=*cuda*
+          - requests
           - requests-cache=1.1
+          - requests-toolbelt # Transitive dep needed by nemollm, specified here to ensure we get a compatible version
           - sqlalchemy
           - tqdm=4
           - typing_utils=0.1
@@ -311,7 +313,7 @@ dependencies:
           - pip:
             - --find-links https://data.dgl.ai/wheels/cu121/repo.html
             - --find-links https://data.dgl.ai/wheels-test/repo.html
-            - dgl
+            - dgl==2.0.0
             - dglgo
 
   example-llm-agents:

diff --git a/morpheus.code-workspace b/morpheus.code-workspace
@@ -83,7 +83,7 @@
                 "program": "${workspaceFolder}/morpheus/cli/run.py",
                 "request": "launch",
                 "subProcess": true,
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -139,7 +139,7 @@
                 "program": "${workspaceFolder}/morpheus/cli/run.py",
                 "request": "launch",
                 "subProcess": true,
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -201,7 +201,7 @@
                 "program": "${workspaceFolder}/morpheus/cli/run.py",
                 "request": "launch",
                 "subProcess": true,
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -266,7 +266,7 @@
                 "program": "${workspaceFolder}/morpheus/cli/run.py",
                 "request": "launch",
                 "subProcess": true,
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -285,7 +285,7 @@
                 "name": "Python: Anomaly Detection Example",
                 "program": "${workspaceFolder}/examples/abp_pcap_detection/run.py",
                 "request": "launch",
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -303,7 +303,7 @@
                 "module": "sphinx.cmd.build",
                 "name": "Python: Sphinx",
                 "request": "launch",
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "MIMode": "gdb",
@@ -598,7 +598,7 @@
                 "name": "Python: GNN DGL inference",
                 "program": "${workspaceFolder}/examples/gnn_fraud_detection_pipeline/run.py",
                 "request": "launch",
-                "type": "python"
+                "type": "debugpy"
             },
             {
                 "args": [
@@ -614,7 +614,7 @@
                 "name": "Python: GNN model training",
                 "program": "${workspaceFolder}/models/training-tuning-scripts/fraud-detection-models/training.py",
                 "request": "launch",
-                "type": "python"
+                "type": "debugpy"
             }
         ]
     },

diff --git a/morpheus/_lib/doca/src/doca_context.cpp b/morpheus/_lib/doca/src/doca_context.cpp
@@ -61,7 +61,7 @@ static doca_error_t open_doca_device_with_pci(const char* pcie_value, struct doc
     res = doca_devinfo_create_list(&dev_list, &nb_devs);
     if (res != DOCA_SUCCESS)
     {
-        MORPHEUS_FAIL("Failed to load doca devices list");
+        LOG(ERROR) << "Failed to load doca devices list";
         return res;
     }
 

diff --git a/morpheus/_lib/doca/src/doca_source.cpp b/morpheus/_lib/doca/src/doca_source.cpp
@@ -23,6 +23,8 @@
 #include "doca_semaphore.hpp"
 #include "doca_source_kernels.hpp"
 
+#include "morpheus/utilities/error.hpp"
+
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
@@ -38,7 +40,6 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rte_byteorder.h>
-#include <time.h>
 
 #include <iostream>
 #include <memory>
@@ -128,8 +129,7 @@ DocaSourceStage::subscriber_fn_t DocaSourceStage::build()
 
         if (thread_idx >= MAX_QUEUE)
         {
-            MORPHEUS_LOCAL(MORPHEUS_CONCAT_STR("Thread ID " << thread_idx << " bigger than MAX_QUEUE " << MAX_QUEUE));
-            return;
+            MORPHEUS_FAIL(MORPHEUS_CONCAT_STR("Thread ID " << thread_idx << " bigger than MAX_QUEUE " << MAX_QUEUE));
         }
 
         payload_buffer_d.reserve(MAX_SEM_X_QUEUE);

diff --git a/morpheus/llm/nodes/langchain_agent_node.py b/morpheus/llm/nodes/langchain_agent_node.py
@@ -66,9 +66,14 @@ async def _run_single(self, **kwargs: dict[str, typing.Any]) -> dict[str, typing
             return results
 
         # We are not dealing with a list, so run single
-        return await self._agent_executor.arun(**kwargs)
-
-    async def execute(self, context: LLMContext) -> LLMContext:
+        try:
+            return await self._agent_executor.arun(**kwargs)
+        except Exception as e:
+            error_msg = f"Error running agent: {e}"
+            logger.exception(error_msg)
+            return error_msg
+
+    async def execute(self, context: LLMContext) -> LLMContext:  # pylint: disable=invalid-overridden-method
 
         input_dict = context.get_inputs()
 

diff --git a/morpheus/llm/services/llm_service.py b/morpheus/llm/services/llm_service.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+import typing
 from abc import ABC
 from abc import abstractmethod
 
@@ -33,7 +34,7 @@ def get_input_names(self) -> list[str]:
         pass
 
     @abstractmethod
-    def generate(self, input_dict: dict[str, str]) -> str:
+    def generate(self, **input_dict) -> str:
         """
         Issue a request to generate a response based on a given prompt.
 
@@ -45,7 +46,7 @@ def generate(self, input_dict: dict[str, str]) -> str:
         pass
 
     @abstractmethod
-    async def generate_async(self, input_dict: dict[str, str]) -> str:
+    async def generate_async(self, **input_dict) -> str:
         """
         Issue an asynchronous request to generate a response based on a given prompt.
 
@@ -56,27 +57,59 @@ async def generate_async(self, input_dict: dict[str, str]) -> str:
         """
         pass
 
+    @typing.overload
     @abstractmethod
-    def generate_batch(self, inputs: dict[str, list[str]]) -> list[str]:
+    def generate_batch(self,
+                       inputs: dict[str, list],
+                       return_exceptions: typing.Literal[True] = True) -> list[str | BaseException]:
+        ...
+
+    @typing.overload
+    @abstractmethod
+    def generate_batch(self, inputs: dict[str, list], return_exceptions: typing.Literal[False] = False) -> list[str]:
+        ...
+
+    @abstractmethod
+    def generate_batch(self, inputs: dict[str, list], return_exceptions=False) -> list[str] | list[str | BaseException]:
         """
         Issue a request to generate a list of responses based on a list of prompts.
 
         Parameters
         ----------
         inputs : dict
             Inputs containing prompt data.
+        return_exceptions : bool
+            Whether to return exceptions in the output list or raise them immediately.
         """
         pass
 
+    @typing.overload
+    @abstractmethod
+    async def generate_batch_async(self,
+                                   inputs: dict[str, list],
+                                   return_exceptions: typing.Literal[True] = True) -> list[str | BaseException]:
+        ...
+
+    @typing.overload
+    @abstractmethod
+    async def generate_batch_async(self,
+                                   inputs: dict[str, list],
+                                   return_exceptions: typing.Literal[False] = False) -> list[str]:
+        ...
+
     @abstractmethod
-    async def generate_batch_async(self, inputs: dict[str, list[str]]) -> list[str]:
+    async def generate_batch_async(self,
+                                   inputs: dict[str, list],
+                                   return_exceptions=False) -> list[str] | list[str | BaseException]:
         """
         Issue an asynchronous request to generate a list of responses based on a list of prompts.
 
         Parameters
         ----------
         inputs : dict
             Inputs containing prompt data.
+        return_exceptions : bool
+            Whether to return exceptions in the output list or raise them immediately.
         """
         pass