From 3dc470dc2ed66507172eac8ff442059df3578995 Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Thu, 29 May 2025 14:40:53 -0400 Subject: [PATCH 01/12] Update jailbreak detection compatibility for NIM to allow providing an API key. --- nemoguardrails/library/jailbreak_detection/actions.py | 6 +++++- nemoguardrails/library/jailbreak_detection/request.py | 11 +++++++---- nemoguardrails/rails/llm/config.py | 6 +++++- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/nemoguardrails/library/jailbreak_detection/actions.py b/nemoguardrails/library/jailbreak_detection/actions.py index c535a31f9..ae879a188 100644 --- a/nemoguardrails/library/jailbreak_detection/actions.py +++ b/nemoguardrails/library/jailbreak_detection/actions.py @@ -96,6 +96,7 @@ async def jailbreak_detection_model( jailbreak_api_url = jailbreak_config.server_endpoint nim_url = jailbreak_config.nim_url nim_port = jailbreak_config.nim_port + nim_auth_token = jailbreak_config.nim_auth_token if context is not None: prompt = context.get("user_message", "") @@ -116,7 +117,10 @@ async def jailbreak_detection_model( if nim_url: jailbreak = await jailbreak_nim_request( - prompt=prompt, nim_url=nim_url, nim_port=nim_port + prompt=prompt, + nim_url=nim_url, + nim_port=nim_port, + nim_auth_token=nim_auth_token, ) elif jailbreak_api_url: jailbreak = await jailbreak_detection_model_request( diff --git a/nemoguardrails/library/jailbreak_detection/request.py b/nemoguardrails/library/jailbreak_detection/request.py index 0420b4350..a674b0bc2 100644 --- a/nemoguardrails/library/jailbreak_detection/request.py +++ b/nemoguardrails/library/jailbreak_detection/request.py @@ -96,10 +96,9 @@ async def jailbreak_detection_model_request( async def jailbreak_nim_request( - prompt: str, - nim_url: str, - nim_port: int, + prompt: str, nim_url: str, nim_port: int, nim_auth_token: str ): + headers = {"Content-Type": "application/json", "Accept": "application/json"} payload = { "input": prompt, } @@ -108,7 +107,11 @@ async def jailbreak_nim_request( try: async with aiohttp.ClientSession() as session: try: - async with session.post(endpoint, json=payload, timeout=30) as resp: + if nim_auth_token is not None: + headers["Authorization: Bearer"] = nim_auth_token + async with session.post( + endpoint, json=payload, headers=headers, timeout=30 + ) as resp: if resp.status != 200: log.error( f"NemoGuard JailbreakDetect NIM request failed with status {resp.status}" diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index a9d9c8cb5..729f5c562 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -556,7 +556,7 @@ class JailbreakDetectionConfig(BaseModel): server_endpoint: Optional[str] = Field( default=None, - description="The endpoint for the jailbreak detection heuristics server.", + description="The endpoint for the jailbreak detection server.", ) length_per_perplexity_threshold: float = Field( default=89.79, description="The length/perplexity threshold." @@ -572,6 +572,10 @@ class JailbreakDetectionConfig(BaseModel): default=8000, description="Port the NemoGuard JailbreakDetect NIM is listening on.", ) + nim_auth_token: Optional[str] = Field( + default=None, + description="API key for JailbreakDetect NIM", + ) embedding: Optional[str] = Field( default="nvidia/nv-embedqa-e5-v5", description="DEPRECATED: Model to use for embedding-based detections. Use NIM instead.", From bb311770bbf9f2e7badea4565d01d7c443f9b061 Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Thu, 29 May 2025 14:56:04 -0400 Subject: [PATCH 02/12] Allow configurable classification path. --- nemoguardrails/library/jailbreak_detection/actions.py | 2 ++ nemoguardrails/library/jailbreak_detection/request.py | 8 ++++++-- nemoguardrails/rails/llm/config.py | 3 +++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/nemoguardrails/library/jailbreak_detection/actions.py b/nemoguardrails/library/jailbreak_detection/actions.py index ae879a188..41491669a 100644 --- a/nemoguardrails/library/jailbreak_detection/actions.py +++ b/nemoguardrails/library/jailbreak_detection/actions.py @@ -97,6 +97,7 @@ async def jailbreak_detection_model( nim_url = jailbreak_config.nim_url nim_port = jailbreak_config.nim_port nim_auth_token = jailbreak_config.nim_auth_token + nim_classification_path = jailbreak_config.nim_classification_path if context is not None: prompt = context.get("user_message", "") @@ -121,6 +122,7 @@ async def jailbreak_detection_model( nim_url=nim_url, nim_port=nim_port, nim_auth_token=nim_auth_token, + nim_classification_path=nim_classification_path, ) elif jailbreak_api_url: jailbreak = await jailbreak_detection_model_request( diff --git a/nemoguardrails/library/jailbreak_detection/request.py b/nemoguardrails/library/jailbreak_detection/request.py index a674b0bc2..d51a2877a 100644 --- a/nemoguardrails/library/jailbreak_detection/request.py +++ b/nemoguardrails/library/jailbreak_detection/request.py @@ -96,14 +96,18 @@ async def jailbreak_detection_model_request( async def jailbreak_nim_request( - prompt: str, nim_url: str, nim_port: int, nim_auth_token: str + prompt: str, + nim_url: str, + nim_port: int, + nim_auth_token: str, + nim_classification_path: str, ): headers = {"Content-Type": "application/json", "Accept": "application/json"} payload = { "input": prompt, } - endpoint = f"http://{nim_url}:{nim_port}/v1/classify" + endpoint = f"http://{nim_url}:{nim_port}{nim_classification_path}" try: async with aiohttp.ClientSession() as session: try: diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index 729f5c562..b39d75bc9 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -576,6 +576,9 @@ class JailbreakDetectionConfig(BaseModel): default=None, description="API key for JailbreakDetect NIM", ) + nim_classification_path: Optional[str] = Field( + default="/v1/classify", description="Classification path uri" + ) embedding: Optional[str] = Field( default="nvidia/nv-embedqa-e5-v5", description="DEPRECATED: Model to use for embedding-based detections. Use NIM instead.", From d60a3bcf19a1f3d1e95ee8be262e2519ade692a2 Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Fri, 13 Jun 2025 13:37:27 -0400 Subject: [PATCH 03/12] Clean up unused dependencies. Update `JailbreakDetectionConfig` object to use base_url and endpoints. Refactor checks to align with base_uri and api_key_env_var approaches. Add additional error handling and logging. Fix tests to reflect changes. Signed-off-by: Erick Galinkin --- .../library/jailbreak_detection/actions.py | 45 +++++++++++++------ .../jailbreak_detection/model_based/checks.py | 25 ++++++----- .../jailbreak_detection/model_based/models.py | 25 +---------- .../library/jailbreak_detection/request.py | 5 +-- .../library/jailbreak_detection/server.py | 4 ++ nemoguardrails/rails/llm/config.py | 24 ++++------ tests/test_configs/jailbreak_nim/config.yml | 5 ++- tests/test_jailbreak_nim.py | 21 ++++++--- 8 files changed, 80 insertions(+), 74 deletions(-) diff --git a/nemoguardrails/library/jailbreak_detection/actions.py b/nemoguardrails/library/jailbreak_detection/actions.py index 41491669a..6fffa450f 100644 --- a/nemoguardrails/library/jailbreak_detection/actions.py +++ b/nemoguardrails/library/jailbreak_detection/actions.py @@ -28,6 +28,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import logging from typing import Optional @@ -94,15 +95,22 @@ async def jailbreak_detection_model( jailbreak_config = llm_task_manager.config.rails.config.jailbreak_detection jailbreak_api_url = jailbreak_config.server_endpoint - nim_url = jailbreak_config.nim_url - nim_port = jailbreak_config.nim_port - nim_auth_token = jailbreak_config.nim_auth_token - nim_classification_path = jailbreak_config.nim_classification_path + nim_base_url = jailbreak_config.nim_base_url + nim_classification_path = jailbreak_config.nim_server_endpoint + if jailbreak_config.api_key_env_var is not None: + nim_auth_token = os.getenv(jailbreak_config.api_key_env_var) + if nim_auth_token is None: + log.warning( + "Specified a value for jailbreak config api_key_env var at %s but the environment variable was not set!" + % jailbreak_config.api_key_env_var + ) + else: + nim_auth_token = None if context is not None: prompt = context.get("user_message", "") - if not jailbreak_api_url and not nim_url: + if not jailbreak_api_url and not nim_base_url: from nemoguardrails.library.jailbreak_detection.model_based.checks import ( check_jailbreak, initialize_model, @@ -111,16 +119,27 @@ async def jailbreak_detection_model( log.warning( "No jailbreak detection endpoint set. Running in-process, NOT RECOMMENDED FOR PRODUCTION." ) - classifier = initialize_model() - jailbreak = check_jailbreak(prompt=prompt, classifier=classifier) - - return jailbreak["jailbreak"] - - if nim_url: + try: + classifier = initialize_model() + if classifier is None: + log.error( + "No model initialized because EMBEDDING_CLASSIFIER_PATH is not set." + ) + return False + jailbreak = check_jailbreak(prompt=prompt, classifier=classifier) + log.info(f"Local model jailbreak detection result: {jailbreak}") + return jailbreak["jailbreak"] + except ImportError as e: + log.error( + f"Failed to import required dependencies for local model. Install scikit-learn and torch, or use NIM-based approach", + exc_info=e, + ) + return False + + if nim_base_url: jailbreak = await jailbreak_nim_request( prompt=prompt, - nim_url=nim_url, - nim_port=nim_port, + nim_url=nim_base_url, nim_auth_token=nim_auth_token, nim_classification_path=nim_classification_path, ) diff --git a/nemoguardrails/library/jailbreak_detection/model_based/checks.py b/nemoguardrails/library/jailbreak_detection/model_based/checks.py index 09e760509..c76bd6e6d 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/checks.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/checks.py @@ -13,18 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from functools import lru_cache from pathlib import Path -from typing import Tuple, Union +from typing import Union -import numpy as np - -models_path = os.environ.get("EMBEDDING_CLASSIFIER_PATH") +from nemoguardrails.library.jailbreak_detection.model_based.models import ( + JailbreakClassifier, +) +logger = logging.getLogger(__name__) @lru_cache() -def initialize_model(classifier_path: str = models_path) -> "JailbreakClassifier": +def initialize_model() -> Union[None, JailbreakClassifier]: """ Initialize the global classifier model according to the configuration provided. Args @@ -32,14 +34,15 @@ def initialize_model(classifier_path: str = models_path) -> "JailbreakClassifier Returns jailbreak_classifier: JailbreakClassifier object combining embedding model and NemoGuard JailbreakDetect RF """ + + classifier_path = os.environ.get("EMBEDDING_CLASSIFIER_PATH") + if classifier_path is None: - raise EnvironmentError( - "Please set the EMBEDDING_CLASSIFIER_PATH environment variable to point to the Classifier model_based folder" + # Log a warning, but do not throw an exception + logger.warning( + "No embedding classifier path set. Server /model endpoint will not work." ) - - from nemoguardrails.library.jailbreak_detection.model_based.models import ( - JailbreakClassifier, - ) + return None jailbreak_classifier = JailbreakClassifier( str(Path(classifier_path).joinpath("snowflake.pkl")) diff --git a/nemoguardrails/library/jailbreak_detection/model_based/models.py b/nemoguardrails/library/jailbreak_detection/model_based/models.py index b8cd89911..8699449ed 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/models.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/models.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os +import pickle from typing import Tuple import numpy as np @@ -46,29 +46,6 @@ def __call__(self, text: str): return embeddings.detach().cpu().squeeze(0).numpy() -class NvEmbedE5: - def __init__(self): - self.api_key = os.environ.get("NVIDIA_API_KEY", None) - if self.api_key is None: - raise ValueError("No NVIDIA API key set!") - - from openai import OpenAI - - self.client = OpenAI( - api_key=self.api_key, - base_url="https://integrate.api.nvidia.com/v1", - ) - - def __call__(self, text: str): - response = self.client.embeddings.create( - input=[text], - model="nvidia/nv-embedqa-e5-v5", - encoding_format="float", - extra_body={"input_type": "query", "truncate": "END"}, - ) - return np.array(response.data[0].embedding, dtype="float32") - - class JailbreakClassifier: def __init__(self, random_forest_path: str): import pickle diff --git a/nemoguardrails/library/jailbreak_detection/request.py b/nemoguardrails/library/jailbreak_detection/request.py index d51a2877a..88d91141a 100644 --- a/nemoguardrails/library/jailbreak_detection/request.py +++ b/nemoguardrails/library/jailbreak_detection/request.py @@ -98,7 +98,6 @@ async def jailbreak_detection_model_request( async def jailbreak_nim_request( prompt: str, nim_url: str, - nim_port: int, nim_auth_token: str, nim_classification_path: str, ): @@ -107,12 +106,12 @@ async def jailbreak_nim_request( "input": prompt, } - endpoint = f"http://{nim_url}:{nim_port}{nim_classification_path}" + endpoint = f"{nim_url}/{nim_classification_path}" try: async with aiohttp.ClientSession() as session: try: if nim_auth_token is not None: - headers["Authorization: Bearer"] = nim_auth_token + headers["Authorization"] = f"Bearer {nim_auth_token}" async with session.post( endpoint, json=payload, headers=headers, timeout=30 ) as resp: diff --git a/nemoguardrails/library/jailbreak_detection/server.py b/nemoguardrails/library/jailbreak_detection/server.py index 7ed8a7613..234ba7195 100644 --- a/nemoguardrails/library/jailbreak_detection/server.py +++ b/nemoguardrails/library/jailbreak_detection/server.py @@ -112,6 +112,10 @@ def run_all_heuristics(request: JailbreakHeuristicRequest): @app.post("/model") def run_model_check(request: JailbreakModelRequest): classifier = mc.initialize_model() + if classifier is None: + raise TypeError( + "No model initialized because EMBEDDING_CLASSIFIER_PATH environment variable is not set." + ) result = mc.check_jailbreak(request.prompt, classifier=classifier) jailbreak = result["jailbreak"] score = result["score"] diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index b39d75bc9..902f49f8b 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -556,7 +556,7 @@ class JailbreakDetectionConfig(BaseModel): server_endpoint: Optional[str] = Field( default=None, - description="The endpoint for the jailbreak detection server.", + description="The endpoint for the jailbreak detection heuristics/model container.", ) length_per_perplexity_threshold: float = Field( default=89.79, description="The length/perplexity threshold." @@ -564,25 +564,17 @@ class JailbreakDetectionConfig(BaseModel): prefix_suffix_perplexity_threshold: float = Field( default=1845.65, description="The prefix/suffix perplexity threshold." ) - nim_url: Optional[str] = Field( + nim_base_url: Optional[str] = Field( default=None, - description="Location of the NemoGuard JailbreakDetect NIM.", + description="Base URL for jailbreak detection model. Example: http://localhost:8000/v1", ) - nim_port: int = Field( - default=8000, - description="Port the NemoGuard JailbreakDetect NIM is listening on.", + nim_server_endpoint: Optional[str] = Field( + default="classify", + description="Classification path uri. Defaults to 'classify' for NemoGuard JailbreakDetect.", ) - nim_auth_token: Optional[str] = Field( + api_key_env_var: Optional[str] = Field( default=None, - description="API key for JailbreakDetect NIM", - ) - nim_classification_path: Optional[str] = Field( - default="/v1/classify", description="Classification path uri" - ) - embedding: Optional[str] = Field( - default="nvidia/nv-embedqa-e5-v5", - description="DEPRECATED: Model to use for embedding-based detections. Use NIM instead.", - deprecated=True, + description="Environment variable containing API key for jailbreak detection model", ) diff --git a/tests/test_configs/jailbreak_nim/config.yml b/tests/test_configs/jailbreak_nim/config.yml index c5c14dbf9..97c0752c8 100644 --- a/tests/test_configs/jailbreak_nim/config.yml +++ b/tests/test_configs/jailbreak_nim/config.yml @@ -2,8 +2,9 @@ rails: config: jailbreak_detection: server_endpoint: "" - nim_url: "0.0.0.0" - nim_port: 8000 + nim_base_url: "http://0.0.0.0:8000/v1" + nim_server_endpoint: "classify" + api_key_env_var: "JB_NIM_TEST" input: flows: diff --git a/tests/test_jailbreak_nim.py b/tests/test_jailbreak_nim.py index 90d19548b..45e283df3 100644 --- a/tests/test_jailbreak_nim.py +++ b/tests/test_jailbreak_nim.py @@ -50,14 +50,25 @@ def check_jailbreak_nim_availability(): llm_task_manager = LLMTaskManager(config=config) # Check if NIM URL is configured - nim_url = llm_task_manager.config.rails.config.jailbreak_detection.nim_url + nim_url = llm_task_manager.config.rails.config.jailbreak_detection.nim_base_url if nim_url is None: return False, "JailbreakDetect NIM URL is not configured in the test config" - # Check if NIM port is configured correctly - nim_port = llm_task_manager.config.rails.config.jailbreak_detection.nim_port - if nim_port is None or nim_port < 1 or nim_port > 65535: - return False, f"Invalid JailbreakDetect NIM port: {nim_port}" + # Check if NIM endpoint is configured correctly + nim_endpoint = ( + llm_task_manager.config.rails.config.jailbreak_detection.nim_server_endpoint + ) + if not isinstance(nim_endpoint, str): + return False, f"Invalid JailbreakDetect NIM server endpoint: {nim_endpoint}" + + # Check that NIM api_key_env_var is set up correctly + test_key = "test_key" + os.environ["JB_NIM_TEST"] = test_key + api_key_env_var = ( + llm_task_manager.config.rails.config.jailbreak_detection.api_key_env_var + ) + if not os.getenv(api_key_env_var) == test_key: + return False, f"Invalid JailbreakDetect environment variable: {api_key_env_var}" # Basic availability check passed return True, "" From d394361f06f8c6c95dc866d78e0ec177e54fa198 Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Fri, 13 Jun 2025 13:53:06 -0400 Subject: [PATCH 04/12] apply black Signed-off-by: Erick Galinkin --- tests/test_jailbreak_nim.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_jailbreak_nim.py b/tests/test_jailbreak_nim.py index 45e283df3..f0bd9d942 100644 --- a/tests/test_jailbreak_nim.py +++ b/tests/test_jailbreak_nim.py @@ -68,7 +68,10 @@ def check_jailbreak_nim_availability(): llm_task_manager.config.rails.config.jailbreak_detection.api_key_env_var ) if not os.getenv(api_key_env_var) == test_key: - return False, f"Invalid JailbreakDetect environment variable: {api_key_env_var}" + return ( + False, + f"Invalid JailbreakDetect environment variable: {api_key_env_var}", + ) # Basic availability check passed return True, "" From c4f7136abab36f549a609ebf45df9d8915b5b901 Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Thu, 19 Jun 2025 10:39:58 +0200 Subject: [PATCH 05/12] style: apply pre-commits --- nemoguardrails/library/jailbreak_detection/actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemoguardrails/library/jailbreak_detection/actions.py b/nemoguardrails/library/jailbreak_detection/actions.py index 6fffa450f..c4c4f5c79 100644 --- a/nemoguardrails/library/jailbreak_detection/actions.py +++ b/nemoguardrails/library/jailbreak_detection/actions.py @@ -28,8 +28,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import logging +import os from typing import Optional from nemoguardrails.actions import action From b27a3c4e05c062ddf0278b5d3b481da72cc643bd Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Mon, 23 Jun 2025 10:32:30 -0400 Subject: [PATCH 06/12] Support deprecated `nim_url` and `nim_port` fields. Signed-off-by: Erick Galinkin --- .../library/jailbreak_detection/request.py | 4 +++- nemoguardrails/rails/llm/config.py | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/nemoguardrails/library/jailbreak_detection/request.py b/nemoguardrails/library/jailbreak_detection/request.py index 88d91141a..722ab84cb 100644 --- a/nemoguardrails/library/jailbreak_detection/request.py +++ b/nemoguardrails/library/jailbreak_detection/request.py @@ -101,12 +101,14 @@ async def jailbreak_nim_request( nim_auth_token: str, nim_classification_path: str, ): + from urllib.parse import urljoin + headers = {"Content-Type": "application/json", "Accept": "application/json"} payload = { "input": prompt, } - endpoint = f"{nim_url}/{nim_classification_path}" + endpoint = urljoin(nim_url, nim_classification_path) try: async with aiohttp.ClientSession() as session: try: diff --git a/nemoguardrails/rails/llm/config.py b/nemoguardrails/rails/llm/config.py index 902f49f8b..5e3e6f6ea 100644 --- a/nemoguardrails/rails/llm/config.py +++ b/nemoguardrails/rails/llm/config.py @@ -576,6 +576,29 @@ class JailbreakDetectionConfig(BaseModel): default=None, description="Environment variable containing API key for jailbreak detection model", ) + # legacy fields, keep for backward comp with deprecation warnings + nim_url: Optional[str] = Field( + default=None, + deprecated="Use 'nim_base_url' instead. This field will be removed in a future version.", + description="DEPRECATED: Use nim_base_url instead", + ) + nim_port: Optional[int] = Field( + default=None, + deprecated="Include port in 'nim_base_url' instead. This field will be removed in a future version.", + description="DEPRECATED: Include port in nim_base_url instead", + ) + embedding: Optional[str] = Field( + default=None, + deprecated="This field is no longer used.", + ) + + @model_validator(mode="after") + def migrate_deprecated_fields(self) -> "JailbreakDetectionConfig": + """Migrate deprecated nim_url/nim_port fields to nim_base_url format.""" + if self.nim_url and not self.nim_base_url: + port = self.nim_port or 8000 + self.nim_base_url = f"http://{self.nim_url}:{port}/v1" + return self class AutoAlignOptions(BaseModel): From 019bd8294969856abcac0ffd279432cbbeef9ccf Mon Sep 17 00:00:00 2001 From: Erick Galinkin Date: Mon, 23 Jun 2025 12:13:22 -0400 Subject: [PATCH 07/12] Push test update for deprecated parameters Signed-off-by: Erick Galinkin --- tests/test_jailbreak_nim.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/test_jailbreak_nim.py b/tests/test_jailbreak_nim.py index f0bd9d942..7724c2b7e 100644 --- a/tests/test_jailbreak_nim.py +++ b/tests/test_jailbreak_nim.py @@ -52,7 +52,10 @@ def check_jailbreak_nim_availability(): # Check if NIM URL is configured nim_url = llm_task_manager.config.rails.config.jailbreak_detection.nim_base_url if nim_url is None: - return False, "JailbreakDetect NIM URL is not configured in the test config" + return ( + False, + "JailbreakDetect NIM base URL is not configured in the test config", + ) # Check if NIM endpoint is configured correctly nim_endpoint = ( @@ -80,6 +83,29 @@ def check_jailbreak_nim_availability(): return False, f"Error checking JailbreakDetect NIM availability: {str(e)}" +def test_jailbreak_nim_deprecated(): + """Check if the deprecated JailbreakDetect config options work properly.""" + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + server_endpoint: "" + nim_url: "0.0.0.0" + nim_port: "8000" + """, + ) + llm_task_manager = LLMTaskManager(config=config) + nim_url = llm_task_manager.config.rails.config.jailbreak_detection.nim_base_url + assert ( + nim_url == "http://0.0.0.0:8000/v1" + ), "NIM deprecated url/port setup not loaded!" + + JAILBREAK_SETUP_PRESENT, JAILBREAK_SKIP_REASON = check_jailbreak_nim_availability() From 92994fb15ac6aa4e1439f213b30f71d1378ce196 Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:21:58 +0200 Subject: [PATCH 08/12] fix: improve error handling in check_jailbreak function - Fix TypeError when classifier is None by adding defensive programming - Replace silent failure with clear RuntimeError and descriptive message - Simplify calling code by removing redundant null checks from actions.py and server.py - Update tests to match new function signature and behavior - Add test coverage for new RuntimeError path This resolves the critical bug where check_jailbreak(prompt) would crash with "TypeError: 'NoneType' object is not callable" when EMBEDDING_CLASSIFIER_PATH is not set. Now it raises a clear RuntimeError with guidance on how to fix it. --- .../library/jailbreak_detection/actions.py | 11 ++---- .../jailbreak_detection/model_based/checks.py | 10 +++++ .../library/jailbreak_detection/server.py | 7 +--- tests/test_jailbreak_model_based.py | 37 +++++++++++++++---- 4 files changed, 44 insertions(+), 21 deletions(-) diff --git a/nemoguardrails/library/jailbreak_detection/actions.py b/nemoguardrails/library/jailbreak_detection/actions.py index c4c4f5c79..a82ff30b2 100644 --- a/nemoguardrails/library/jailbreak_detection/actions.py +++ b/nemoguardrails/library/jailbreak_detection/actions.py @@ -120,15 +120,12 @@ async def jailbreak_detection_model( "No jailbreak detection endpoint set. Running in-process, NOT RECOMMENDED FOR PRODUCTION." ) try: - classifier = initialize_model() - if classifier is None: - log.error( - "No model initialized because EMBEDDING_CLASSIFIER_PATH is not set." - ) - return False - jailbreak = check_jailbreak(prompt=prompt, classifier=classifier) + jailbreak = check_jailbreak(prompt=prompt) log.info(f"Local model jailbreak detection result: {jailbreak}") return jailbreak["jailbreak"] + except RuntimeError as e: + log.error(f"Jailbreak detection model not available: {e}") + return False except ImportError as e: log.error( f"Failed to import required dependencies for local model. Install scikit-learn and torch, or use NIM-based approach", diff --git a/nemoguardrails/library/jailbreak_detection/model_based/checks.py b/nemoguardrails/library/jailbreak_detection/model_based/checks.py index c76bd6e6d..be0e06a71 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/checks.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/checks.py @@ -25,6 +25,7 @@ logger = logging.getLogger(__name__) + @lru_cache() def initialize_model() -> Union[None, JailbreakClassifier]: """ @@ -60,10 +61,19 @@ def check_jailbreak( Args: prompt: User utterance to classify classifier: Instantiated JailbreakClassifier object + + Raises: + RuntimeError: If no classifier is available and EMBEDDING_CLASSIFIER_PATH is not set """ if classifier is None: classifier = initialize_model() + if classifier is None: + raise RuntimeError( + "No jailbreak classifier available. Please set the EMBEDDING_CLASSIFIER_PATH " + "environment variable to point to the classifier model directory." + ) + classification, score = classifier(prompt) # classification will be 1 or 0 -- cast to boolean. return {"jailbreak": classification, "score": score} diff --git a/nemoguardrails/library/jailbreak_detection/server.py b/nemoguardrails/library/jailbreak_detection/server.py index 234ba7195..e956c0deb 100644 --- a/nemoguardrails/library/jailbreak_detection/server.py +++ b/nemoguardrails/library/jailbreak_detection/server.py @@ -111,12 +111,7 @@ def run_all_heuristics(request: JailbreakHeuristicRequest): @app.post("/model") def run_model_check(request: JailbreakModelRequest): - classifier = mc.initialize_model() - if classifier is None: - raise TypeError( - "No model initialized because EMBEDDING_CLASSIFIER_PATH environment variable is not set." - ) - result = mc.check_jailbreak(request.prompt, classifier=classifier) + result = mc.check_jailbreak(request.prompt) jailbreak = result["jailbreak"] score = result["score"] model_checks = {"jailbreak": jailbreak, "score": score} diff --git a/tests/test_jailbreak_model_based.py b/tests/test_jailbreak_model_based.py index 5544d1507..1c4462c43 100644 --- a/tests/test_jailbreak_model_based.py +++ b/tests/test_jailbreak_model_based.py @@ -91,21 +91,21 @@ def test_model_based_classifier_missing_deps(monkeypatch): models.JailbreakClassifier("fake_model_path.pkl") -# Test 4: Error when classifier_path is None +# Test 4: Return None when EMBEDDING_CLASSIFIER_PATH is not set -def test_initialize_model_with_none_classifier_path(): +def test_initialize_model_with_none_classifier_path(monkeypatch): """ - initialize_model should raise EnvironmentError when classifier_path is None. + initialize_model should return None when EMBEDDING_CLASSIFIER_PATH is not set. """ import nemoguardrails.library.jailbreak_detection.model_based.checks as checks - with pytest.raises(EnvironmentError) as exc_info: - checks.initialize_model(classifier_path=None) + # Mock environment variable to be None + monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", "") + monkeypatch.delenv("EMBEDDING_CLASSIFIER_PATH", raising=False) - assert "Please set the EMBEDDING_CLASSIFIER_PATH environment variable" in str( - exc_info.value - ) + result = checks.initialize_model() + assert result is None # Test 5: SnowflakeEmbed initialization and call with torch imports @@ -202,3 +202,24 @@ def test_check_jailbreak_without_classifier(monkeypatch): assert result == {"jailbreak": False, "score": -0.5} mock_initialize_model.assert_called_once() mock_classifier.assert_called_once_with("safe prompt") + + +# Test 8: Check jailbreak raises RuntimeError when no classifier available + + +def test_check_jailbreak_no_classifier_available(monkeypatch): + """ + Test check_jailbreak function raises RuntimeError when initialize_model returns None. + """ + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + + # Mock initialize_model to return None (no classifier available) + mock_initialize_model = mock.MagicMock(return_value=None) + monkeypatch.setattr(checks, "initialize_model", mock_initialize_model) + + with pytest.raises(RuntimeError) as exc_info: + checks.check_jailbreak("test prompt") + + assert "No jailbreak classifier available" in str(exc_info.value) + assert "EMBEDDING_CLASSIFIER_PATH" in str(exc_info.value) + mock_initialize_model.assert_called_once() From 37f3ea3f24e2ee90c9938edcc71452a04c41fa11 Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:26:57 +0200 Subject: [PATCH 09/12] fix fix --- .../library/jailbreak_detection/model_based/checks.py | 10 +++++----- .../library/jailbreak_detection/model_based/models.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nemoguardrails/library/jailbreak_detection/model_based/checks.py b/nemoguardrails/library/jailbreak_detection/model_based/checks.py index be0e06a71..b59bfa1e1 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/checks.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/checks.py @@ -19,15 +19,11 @@ from pathlib import Path from typing import Union -from nemoguardrails.library.jailbreak_detection.model_based.models import ( - JailbreakClassifier, -) - logger = logging.getLogger(__name__) @lru_cache() -def initialize_model() -> Union[None, JailbreakClassifier]: +def initialize_model() -> Union[None, "JailbreakClassifier"]: """ Initialize the global classifier model according to the configuration provided. Args @@ -45,6 +41,10 @@ def initialize_model() -> Union[None, JailbreakClassifier]: ) return None + from nemoguardrails.library.jailbreak_detection.model_based.models import ( + JailbreakClassifier, + ) + jailbreak_classifier = JailbreakClassifier( str(Path(classifier_path).joinpath("snowflake.pkl")) ) diff --git a/nemoguardrails/library/jailbreak_detection/model_based/models.py b/nemoguardrails/library/jailbreak_detection/model_based/models.py index 8699449ed..80dc23a5c 100644 --- a/nemoguardrails/library/jailbreak_detection/model_based/models.py +++ b/nemoguardrails/library/jailbreak_detection/model_based/models.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pickle from typing import Tuple import numpy as np From f918e8008cff00c50c7685b836c4f3764c6bb5d5 Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:54:40 +0200 Subject: [PATCH 10/12] fix(request): make nim_auth_token optional in request --- nemoguardrails/library/jailbreak_detection/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemoguardrails/library/jailbreak_detection/request.py b/nemoguardrails/library/jailbreak_detection/request.py index 722ab84cb..64d5a0b1a 100644 --- a/nemoguardrails/library/jailbreak_detection/request.py +++ b/nemoguardrails/library/jailbreak_detection/request.py @@ -98,7 +98,7 @@ async def jailbreak_detection_model_request( async def jailbreak_nim_request( prompt: str, nim_url: str, - nim_auth_token: str, + nim_auth_token: Optional[str], nim_classification_path: str, ): from urllib.parse import urljoin From f640d298dc8546642b6c75bc45e8c4275eb6902a Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Wed, 25 Jun 2025 16:38:59 +0200 Subject: [PATCH 11/12] test: add more tests --- tests/test_jailbreak_actions.py | 415 ++++++++++++++++++++++++++++ tests/test_jailbreak_config.py | 130 +++++++++ tests/test_jailbreak_model_based.py | 118 ++++++++ tests/test_jailbreak_request.py | 88 ++++++ 4 files changed, 751 insertions(+) create mode 100644 tests/test_jailbreak_actions.py create mode 100644 tests/test_jailbreak_config.py create mode 100644 tests/test_jailbreak_request.py diff --git a/tests/test_jailbreak_actions.py b/tests/test_jailbreak_actions.py new file mode 100644 index 000000000..08d99eec2 --- /dev/null +++ b/tests/test_jailbreak_actions.py @@ -0,0 +1,415 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from unittest import mock + +from nemoguardrails import RailsConfig +from nemoguardrails.llm.taskmanager import LLMTaskManager + + +class TestJailbreakDetectionActions: + """Test suite for jailbreak detection actions with comprehensive coverage of PR changes.""" + + async def test_jailbreak_detection_model_with_nim_base_url(self, monkeypatch): + """Test jailbreak_detection_model action with nim_base_url config.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=True) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + nim_server_endpoint: "classify" + api_key_env_var: "TEST_API_KEY" + """, + ) + + monkeypatch.setenv("TEST_API_KEY", "test_token_123") + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is True + + mock_nim_request.assert_called_once_with( + prompt="test prompt", + nim_url="http://localhost:8000/v1", + nim_auth_token="test_token_123", + nim_classification_path="classify", + ) + + async def test_jailbreak_detection_model_api_key_not_set(self, monkeypatch, caplog): + """Test warning when api_key_env_var is configured but environment variable is not set.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=False) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + # create config with api_key_env_var but don't set the environment variable + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + api_key_env_var: "MISSING_API_KEY" + """, + ) + + # ensure env var is not set + monkeypatch.delenv("MISSING_API_KEY", raising=False) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + # verify warning was logged + assert ( + "api_key_env var at MISSING_API_KEY but the environment variable was not set" + in caplog.text + ) + + # verify nim request was called with None token + mock_nim_request.assert_called_once_with( + prompt="test prompt", + nim_url="http://localhost:8000/v1", + nim_auth_token=None, + nim_classification_path="classify", + ) + + async def test_jailbreak_detection_model_no_api_key_env_var(self, monkeypatch): + """Test that None token is used when api_key_env_var is not configured.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=False) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + # create config without api_key_env_var + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + mock_nim_request.assert_called_once_with( + prompt="test prompt", + nim_url="http://localhost:8000/v1", + nim_auth_token=None, + nim_classification_path="classify", + ) + + async def test_jailbreak_detection_model_local_runtime_error( + self, monkeypatch, caplog + ): + """Test RuntimeError handling when local model is not available.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_check_jailbreak = mock.MagicMock( + side_effect=RuntimeError("No classifier available") + ) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.model_based.checks.check_jailbreak", + mock_check_jailbreak, + ) + + # create config with no endpoints (forces local mode) + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: {} + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + assert "Jailbreak detection model not available" in caplog.text + assert "No classifier available" in caplog.text + + async def test_jailbreak_detection_model_local_import_error( + self, monkeypatch, caplog + ): + """Test ImportError handling when dependencies are missing.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + # mock check_jailbreak to raise ImportError + mock_check_jailbreak = mock.MagicMock( + side_effect=ImportError("No module named 'sklearn'") + ) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.model_based.checks.check_jailbreak", + mock_check_jailbreak, + ) + + # create config with no endpoints (forces local mode) + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: {} + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + assert "Failed to import required dependencies for local model" in caplog.text + assert ( + "Install scikit-learn and torch, or use NIM-based approach" in caplog.text + ) + + async def test_jailbreak_detection_model_local_success(self, monkeypatch, caplog): + """Test successful local model execution.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_check_jailbreak = mock.MagicMock( + return_value={"jailbreak": True, "score": 0.95} + ) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.model_based.checks.check_jailbreak", + mock_check_jailbreak, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: {} + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "malicious prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is True + + assert "Local model jailbreak detection result" in caplog.text + mock_check_jailbreak.assert_called_once_with(prompt="malicious prompt") + + async def test_jailbreak_detection_model_empty_context(self, monkeypatch): + """Test handling of empty context.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=False) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + + result = await jailbreak_detection_model(llm_task_manager, None) + assert result is False + + mock_nim_request.assert_called_once_with( + prompt="", + nim_url="http://localhost:8000/v1", + nim_auth_token=None, + nim_classification_path="classify", + ) + + async def test_jailbreak_detection_model_context_without_user_message( + self, monkeypatch + ): + """Test handling of context without user_message key.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=False) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"other_key": "other_value"} # No user_message key + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + mock_nim_request.assert_called_once_with( + prompt="", + nim_url="http://localhost:8000/v1", + nim_auth_token=None, + nim_classification_path="classify", + ) + + async def test_jailbreak_detection_model_legacy_server_endpoint(self, monkeypatch): + """Test fallback to legacy server_endpoint when nim_base_url is not set.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_model_request = mock.AsyncMock(return_value=True) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_detection_model_request", + mock_model_request, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + server_endpoint: "http://legacy-server:1337/model" + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is True + + mock_model_request.assert_called_once_with( + prompt="test prompt", api_url="http://legacy-server:1337/model" + ) + + async def test_jailbreak_detection_model_none_response_handling( + self, monkeypatch, caplog + ): + """Test handling when external service returns None.""" + from nemoguardrails.library.jailbreak_detection.actions import ( + jailbreak_detection_model, + ) + + mock_nim_request = mock.AsyncMock(return_value=None) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.actions.jailbreak_nim_request", + mock_nim_request, + ) + + config = RailsConfig.from_content( + """ + define user express greeting + "hello" + """, + """ + rails: + config: + jailbreak_detection: + nim_base_url: "http://localhost:8000/v1" + """, + ) + + llm_task_manager = LLMTaskManager(config=config) + context = {"user_message": "test prompt"} + + result = await jailbreak_detection_model(llm_task_manager, context) + assert result is False + + assert "Jailbreak endpoint not set up properly" in caplog.text diff --git a/tests/test_jailbreak_config.py b/tests/test_jailbreak_config.py new file mode 100644 index 000000000..978117a33 --- /dev/null +++ b/tests/test_jailbreak_config.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from nemoguardrails.rails.llm.config import JailbreakDetectionConfig + + +class TestJailbreakDetectionConfig: + def test_new_configuration_fields(self): + config = JailbreakDetectionConfig( + nim_base_url="http://localhost:8000/v1", + nim_server_endpoint="classify", + api_key_env_var="MY_API_KEY", + ) + + assert config.nim_base_url == "http://localhost:8000/v1" + assert config.nim_server_endpoint == "classify" + assert config.api_key_env_var == "MY_API_KEY" + + def test_default_values(self): + config = JailbreakDetectionConfig() + + assert config.nim_base_url is None + assert config.nim_server_endpoint == "classify" # Default value + assert config.api_key_env_var is None + + def test_deprecated_field_migration(self): + """Test that deprecated nim_url and nim_port fields are migrated to nim_base_url.""" + config = JailbreakDetectionConfig(nim_url="localhost", nim_port=8000) + + # The model validator should migrate these to nim_base_url + assert config.nim_base_url == "http://localhost:8000/v1" + assert config.nim_url == "localhost" # Original value preserved + assert config.nim_port == 8000 # Original value preserved + + def test_deprecated_field_migration_with_string_port(self): + """Test migration when port is provided as string.""" + config = JailbreakDetectionConfig(nim_url="localhost", nim_port="9000") + + # The model validator should migrate these to nim_base_url + assert config.nim_base_url == "http://localhost:9000/v1" + + def test_deprecated_field_migration_no_port(self): + """Test migration when only nim_url is provided (default port should be used).""" + config = JailbreakDetectionConfig(nim_url="localhost") + + # Should use default port 8000 + assert config.nim_base_url == "http://localhost:8000/v1" + + def test_no_migration_when_nim_base_url_already_set(self): + """Test that migration doesn't occur when nim_base_url is already set.""" + config = JailbreakDetectionConfig( + nim_base_url="http://existing:9999/v1", nim_url="localhost", nim_port=8000 + ) + + # Should not override existing nim_base_url + assert config.nim_base_url == "http://existing:9999/v1" + + def test_embedding_field_deprecated(self): + """Test that embedding field defaults to None (deprecated).""" + config = JailbreakDetectionConfig() + assert config.embedding is None + + def test_server_endpoint_description_updated(self): + """Test that server_endpoint description includes model container.""" + config = JailbreakDetectionConfig(server_endpoint="http://localhost:1337/model") + assert config.server_endpoint == "http://localhost:1337/model" + + def test_configuration_with_all_new_fields(self): + config = JailbreakDetectionConfig( + server_endpoint="http://legacy:1337/heuristics", + nim_base_url="http://nim:8000/v1", + nim_server_endpoint="custom-classify", + api_key_env_var="CUSTOM_API_KEY", + length_per_perplexity_threshold=100.0, + prefix_suffix_perplexity_threshold=2000.0, + ) + + assert config.server_endpoint == "http://legacy:1337/heuristics" + assert config.nim_base_url == "http://nim:8000/v1" + assert config.nim_server_endpoint == "custom-classify" + assert config.api_key_env_var == "CUSTOM_API_KEY" + assert config.length_per_perplexity_threshold == 100.0 + assert config.prefix_suffix_perplexity_threshold == 2000.0 + + def test_backward_compatibility(self): + """Test that old configuration still works with migration.""" + # simulate old config format + config = JailbreakDetectionConfig( + server_endpoint="http://old-server:1337/heuristics", + nim_url="old-nim-host", + nim_port=8888, + length_per_perplexity_threshold=89.79, + prefix_suffix_perplexity_threshold=1845.65, + ) + + # legacy fields should work + assert config.server_endpoint == "http://old-server:1337/heuristics" + assert config.length_per_perplexity_threshold == 89.79 + assert config.prefix_suffix_perplexity_threshold == 1845.65 + + # deprecated fields should be migrated + assert config.nim_base_url == "http://old-nim-host:8888/v1" + + def test_empty_configuration(self): + """Test that completely empty config works with defaults.""" + + config = JailbreakDetectionConfig() + + assert config.server_endpoint is None + assert config.nim_base_url is None + assert config.nim_server_endpoint == "classify" + assert config.api_key_env_var is None + assert config.length_per_perplexity_threshold == 89.79 + assert config.prefix_suffix_perplexity_threshold == 1845.65 + assert config.nim_url is None + assert config.nim_port is None + assert config.embedding is None diff --git a/tests/test_jailbreak_model_based.py b/tests/test_jailbreak_model_based.py index 1c4462c43..0e113cab6 100644 --- a/tests/test_jailbreak_model_based.py +++ b/tests/test_jailbreak_model_based.py @@ -100,6 +100,9 @@ def test_initialize_model_with_none_classifier_path(monkeypatch): """ import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + # Clear the LRU cache to ensure fresh test + checks.initialize_model.cache_clear() + # Mock environment variable to be None monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", "") monkeypatch.delenv("EMBEDDING_CLASSIFIER_PATH", raising=False) @@ -223,3 +226,118 @@ def test_check_jailbreak_no_classifier_available(monkeypatch): assert "No jailbreak classifier available" in str(exc_info.value) assert "EMBEDDING_CLASSIFIER_PATH" in str(exc_info.value) mock_initialize_model.assert_called_once() + + +# Test 9: Test initialize_model with valid path + + +def test_initialize_model_with_valid_path(monkeypatch): + """ + Test initialize_model with a valid classifier path. + """ + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + + checks.initialize_model.cache_clear() + + # mock environment variable + monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", "/fake/path/to/model") + + # mock JailbreakClassifier + mock_classifier = mock.MagicMock() + mock_jailbreak_classifier_class = mock.MagicMock(return_value=mock_classifier) + monkeypatch.setattr( + "nemoguardrails.library.jailbreak_detection.model_based.models.JailbreakClassifier", + mock_jailbreak_classifier_class, + ) + + result = checks.initialize_model() + + assert result == mock_classifier + mock_jailbreak_classifier_class.assert_called_once_with( + "/fake/path/to/model/snowflake.pkl" + ) + + +# Test 10: Test that NvEmbedE5 class no longer exists + + +def test_nv_embed_e5_removed(): + """ + Test that NvEmbedE5 class has been removed from the models module. + """ + import nemoguardrails.library.jailbreak_detection.model_based.models as models + + assert not hasattr(models, "NvEmbedE5") + + +# Test 11: Test SnowflakeEmbed still exists and works + + +def test_snowflake_embed_still_available(): + """ + Test that SnowflakeEmbed class is still available. + """ + import nemoguardrails.library.jailbreak_detection.model_based.models as models + + # This class should still exist + assert hasattr(models, "SnowflakeEmbed") + + +# Test 12: Test initialize_model with logging + + +def test_initialize_model_logging(monkeypatch, caplog): + """ + Test that initialize_model logs warning when path is not set. + """ + import logging + + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + + # clear the LRU cache to ensure fresh test + checks.initialize_model.cache_clear() + + # set log level to capture warnings + caplog.set_level(logging.WARNING) + + # mock environment variable to be None + monkeypatch.delenv("EMBEDDING_CLASSIFIER_PATH", raising=False) + + result = checks.initialize_model() + + assert result is None + assert "No embedding classifier path set" in caplog.text + assert "Server /model endpoint will not work" in caplog.text + + +# Test 13: Test check_jailbreak with explicit None classifier + + +def test_check_jailbreak_explicit_none_classifier(): + """ + Test check_jailbreak when explicitly passed None as classifier. + """ + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + + with pytest.raises(RuntimeError) as exc_info: + checks.check_jailbreak("test prompt", classifier=None) + + assert "No jailbreak classifier available" in str(exc_info.value) + + +# Test 14: Test check_jailbreak preserves original behavior with valid classifier + + +def test_check_jailbreak_valid_classifier_preserved(): + """ + Test that check_jailbreak still works normally with a valid classifier. + """ + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks + + mock_classifier = mock.MagicMock() + mock_classifier.return_value = (True, 0.95) + + result = checks.check_jailbreak("malicious prompt", classifier=mock_classifier) + + assert result == {"jailbreak": True, "score": 0.95} + mock_classifier.assert_called_once_with("malicious prompt") diff --git a/tests/test_jailbreak_request.py b/tests/test_jailbreak_request.py new file mode 100644 index 000000000..c5227d516 --- /dev/null +++ b/tests/test_jailbreak_request.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from urllib.parse import urljoin + +import pytest + + +class TestJailbreakRequestChanges: + """Test jailbreak request function changes introduced in this PR.""" + + def test_url_joining_logic(self): + """Test that URL joining works correctly using urljoin.""" + test_cases = [ + ( + "http://localhost:8000/v1", + "classify", + "http://localhost:8000/classify", + ), # v1 replaced by classify + ( + "http://localhost:8000/v1/", + "classify", + "http://localhost:8000/v1/classify", + ), # trailing slash preserves v1 + ( + "http://localhost:8000", + "v1/classify", + "http://localhost:8000/v1/classify", + ), + ("http://localhost:8000/", "/classify", "http://localhost:8000/classify"), + ] + + for base_url, path, expected_url in test_cases: + result = urljoin(base_url, path) + assert ( + result == expected_url + ), f"urljoin({base_url}, {path}) should equal {expected_url}" + + def test_auth_header_logic(self): + """Test the authorization header logic.""" + headers = {"Content-Type": "application/json", "Accept": "application/json"} + + nim_auth_token = "test_token_123" + if nim_auth_token is not None: + headers["Authorization"] = f"Bearer {nim_auth_token}" + + assert headers["Authorization"] == "Bearer test_token_123" + + headers2 = {"Content-Type": "application/json", "Accept": "application/json"} + nim_auth_token = None + if nim_auth_token is not None: + headers2["Authorization"] = f"Bearer {nim_auth_token}" + + assert "Authorization" not in headers2 + + @pytest.mark.asyncio + async def test_nim_request_signature(self): + import inspect + + from nemoguardrails.library.jailbreak_detection.request import ( + jailbreak_nim_request, + ) + + sig = inspect.signature(jailbreak_nim_request) + params = list(sig.parameters.keys()) + + expected_params = [ + "prompt", + "nim_url", + "nim_auth_token", + "nim_classification_path", + ] + assert params == expected_params, f"Expected {expected_params}, got {params}" From b23863b9f316c66f7a59c73e8b84b53e74644b9a Mon Sep 17 00:00:00 2001 From: Pouyanpi <13303554+Pouyanpi@users.noreply.github.com> Date: Wed, 25 Jun 2025 17:05:58 +0200 Subject: [PATCH 12/12] fix model path mocking and assertion for windows --- tests/test_jailbreak_model_based.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/test_jailbreak_model_based.py b/tests/test_jailbreak_model_based.py index 0e113cab6..3c1d065e5 100644 --- a/tests/test_jailbreak_model_based.py +++ b/tests/test_jailbreak_model_based.py @@ -235,12 +235,15 @@ def test_initialize_model_with_valid_path(monkeypatch): """ Test initialize_model with a valid classifier path. """ + from pathlib import Path + import nemoguardrails.library.jailbreak_detection.model_based.checks as checks checks.initialize_model.cache_clear() # mock environment variable - monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", "/fake/path/to/model") + test_path = "/fake/path/to/model" + monkeypatch.setenv("EMBEDDING_CLASSIFIER_PATH", test_path) # mock JailbreakClassifier mock_classifier = mock.MagicMock() @@ -253,9 +256,9 @@ def test_initialize_model_with_valid_path(monkeypatch): result = checks.initialize_model() assert result == mock_classifier - mock_jailbreak_classifier_class.assert_called_once_with( - "/fake/path/to/model/snowflake.pkl" - ) + + expected_path = str(Path(test_path).joinpath("snowflake.pkl")) + mock_jailbreak_classifier_class.assert_called_once_with(expected_path) # Test 10: Test that NvEmbedE5 class no longer exists