Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions nemoguardrails/library/jailbreak_detection/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# limitations under the License.

import logging
import os
from typing import Optional

from nemoguardrails.actions import action
Expand Down Expand Up @@ -94,13 +95,22 @@ async def jailbreak_detection_model(
jailbreak_config = llm_task_manager.config.rails.config.jailbreak_detection

jailbreak_api_url = jailbreak_config.server_endpoint
nim_url = jailbreak_config.nim_url
nim_port = jailbreak_config.nim_port
nim_base_url = jailbreak_config.nim_base_url
nim_classification_path = jailbreak_config.nim_server_endpoint
if jailbreak_config.api_key_env_var is not None:
nim_auth_token = os.getenv(jailbreak_config.api_key_env_var)
if nim_auth_token is None:
log.warning(
"Specified a value for jailbreak config api_key_env var at %s but the environment variable was not set!"
% jailbreak_config.api_key_env_var
)
else:
nim_auth_token = None

if context is not None:
prompt = context.get("user_message", "")

if not jailbreak_api_url and not nim_url:
if not jailbreak_api_url and not nim_base_url:
from nemoguardrails.library.jailbreak_detection.model_based.checks import (
check_jailbreak,
initialize_model,
Expand All @@ -109,14 +119,26 @@ async def jailbreak_detection_model(
log.warning(
"No jailbreak detection endpoint set. Running in-process, NOT RECOMMENDED FOR PRODUCTION."
)
classifier = initialize_model()
jailbreak = check_jailbreak(prompt=prompt, classifier=classifier)

return jailbreak["jailbreak"]

if nim_url:
try:
jailbreak = check_jailbreak(prompt=prompt)
log.info(f"Local model jailbreak detection result: {jailbreak}")
return jailbreak["jailbreak"]
except RuntimeError as e:
log.error(f"Jailbreak detection model not available: {e}")
return False
except ImportError as e:
log.error(
f"Failed to import required dependencies for local model. Install scikit-learn and torch, or use NIM-based approach",
exc_info=e,
)
return False

if nim_base_url:
jailbreak = await jailbreak_nim_request(
prompt=prompt, nim_url=nim_url, nim_port=nim_port
prompt=prompt,
nim_url=nim_base_url,
nim_auth_token=nim_auth_token,
nim_classification_path=nim_classification_path,
)
elif jailbreak_api_url:
jailbreak = await jailbreak_detection_model_request(
Expand Down
27 changes: 20 additions & 7 deletions nemoguardrails/library/jailbreak_detection/model_based/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,33 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
from functools import lru_cache
from pathlib import Path
from typing import Tuple, Union
from typing import Union

import numpy as np

models_path = os.environ.get("EMBEDDING_CLASSIFIER_PATH")
logger = logging.getLogger(__name__)


@lru_cache()
def initialize_model(classifier_path: str = models_path) -> "JailbreakClassifier":
def initialize_model() -> Union[None, "JailbreakClassifier"]:
"""
Initialize the global classifier model according to the configuration provided.
Args
classifier_path: Path to the classifier model
Returns
jailbreak_classifier: JailbreakClassifier object combining embedding model and NemoGuard JailbreakDetect RF
"""

classifier_path = os.environ.get("EMBEDDING_CLASSIFIER_PATH")

if classifier_path is None:
raise EnvironmentError(
"Please set the EMBEDDING_CLASSIFIER_PATH environment variable to point to the Classifier model_based folder"
# Log a warning, but do not throw an exception
logger.warning(
"No embedding classifier path set. Server /model endpoint will not work."
)
return None

from nemoguardrails.library.jailbreak_detection.model_based.models import (
JailbreakClassifier,
Expand All @@ -57,10 +61,19 @@ def check_jailbreak(
Args:
prompt: User utterance to classify
classifier: Instantiated JailbreakClassifier object

Raises:
RuntimeError: If no classifier is available and EMBEDDING_CLASSIFIER_PATH is not set
"""
if classifier is None:
classifier = initialize_model()

if classifier is None:
raise RuntimeError(
"No jailbreak classifier available. Please set the EMBEDDING_CLASSIFIER_PATH "
"environment variable to point to the classifier model directory."
)

classification, score = classifier(prompt)
# classification is expected to be 1 or 0; note that it is returned as-is (no boolean cast is applied here).
return {"jailbreak": classification, "score": score}
24 changes: 0 additions & 24 deletions nemoguardrails/library/jailbreak_detection/model_based/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Tuple

import numpy as np
Expand Down Expand Up @@ -46,29 +45,6 @@ def __call__(self, text: str):
return embeddings.detach().cpu().squeeze(0).numpy()


class NvEmbedE5:
    """Embed text with the hosted ``nvidia/nv-embedqa-e5-v5`` model.

    The API key is read from the ``NVIDIA_API_KEY`` environment variable and
    requests are sent through an ``openai.OpenAI`` client whose base URL is
    ``https://integrate.api.nvidia.com/v1``.
    """

    def __init__(self):
        """Read the API key and build the client.

        Raises:
            ValueError: If ``NVIDIA_API_KEY`` is unset, empty, or contains
                only whitespace.
        """
        api_key = os.environ.get("NVIDIA_API_KEY")
        # A blank value is as unusable as a missing one; reject it here so the
        # failure is reported at construction time rather than on first request.
        if api_key is None or not api_key.strip():
            raise ValueError("No NVIDIA API key set!")
        self.api_key = api_key

        # Imported inside the constructor so `openai` is only required when
        # this class is actually instantiated.
        from openai import OpenAI

        self.client = OpenAI(
            api_key=self.api_key,
            base_url="https://integrate.api.nvidia.com/v1",
        )

    def __call__(self, text: str):
        """Return the embedding of ``text`` as a ``float32`` NumPy array.

        Args:
            text: The string to embed; it is sent as a single-item input list.

        Returns:
            The first embedding in the response, converted with
            ``np.array(..., dtype="float32")``.
        """
        response = self.client.embeddings.create(
            input=[text],
            model="nvidia/nv-embedqa-e5-v5",
            encoding_format="float",
            # Extra request fields passed through to the endpoint unchanged.
            extra_body={"input_type": "query", "truncate": "END"},
        )
        return np.array(response.data[0].embedding, dtype="float32")


class JailbreakClassifier:
def __init__(self, random_forest_path: str):
import pickle
Expand Down
14 changes: 11 additions & 3 deletions nemoguardrails/library/jailbreak_detection/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,17 +98,25 @@ async def jailbreak_detection_model_request(
async def jailbreak_nim_request(
prompt: str,
nim_url: str,
nim_port: int,
nim_auth_token: Optional[str],
nim_classification_path: str,
):
from urllib.parse import urljoin

headers = {"Content-Type": "application/json", "Accept": "application/json"}
payload = {
"input": prompt,
}

endpoint = f"http://{nim_url}:{nim_port}/v1/classify"
endpoint = urljoin(nim_url, nim_classification_path)
try:
async with aiohttp.ClientSession() as session:
try:
async with session.post(endpoint, json=payload, timeout=30) as resp:
if nim_auth_token is not None:
headers["Authorization"] = f"Bearer {nim_auth_token}"
async with session.post(
endpoint, json=payload, headers=headers, timeout=30
) as resp:
if resp.status != 200:
log.error(
f"NemoGuard JailbreakDetect NIM request failed with status {resp.status}"
Expand Down
3 changes: 1 addition & 2 deletions nemoguardrails/library/jailbreak_detection/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,7 @@ def run_all_heuristics(request: JailbreakHeuristicRequest):

@app.post("/model")
def run_model_check(request: JailbreakModelRequest):
classifier = mc.initialize_model()
result = mc.check_jailbreak(request.prompt, classifier=classifier)
result = mc.check_jailbreak(request.prompt)
jailbreak = result["jailbreak"]
score = result["score"]
model_checks = {"jailbreak": jailbreak, "score": score}
Expand Down
38 changes: 30 additions & 8 deletions nemoguardrails/rails/llm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,28 +556,50 @@ class JailbreakDetectionConfig(BaseModel):

server_endpoint: Optional[str] = Field(
default=None,
description="The endpoint for the jailbreak detection heuristics server.",
description="The endpoint for the jailbreak detection heuristics/model container.",
)
length_per_perplexity_threshold: float = Field(
default=89.79, description="The length/perplexity threshold."
)
prefix_suffix_perplexity_threshold: float = Field(
default=1845.65, description="The prefix/suffix perplexity threshold."
)
nim_base_url: Optional[str] = Field(
default=None,
description="Base URL for jailbreak detection model. Example: http://localhost:8000/v1",
)
nim_server_endpoint: Optional[str] = Field(
default="classify",
description="Classification path uri. Defaults to 'classify' for NemoGuard JailbreakDetect.",
)
api_key_env_var: Optional[str] = Field(
default=None,
description="Environment variable containing API key for jailbreak detection model",
)
# legacy fields, keep for backward comp with deprecation warnings
nim_url: Optional[str] = Field(
default=None,
description="Location of the NemoGuard JailbreakDetect NIM.",
deprecated="Use 'nim_base_url' instead. This field will be removed in a future version.",
description="DEPRECATED: Use nim_base_url instead",
)
nim_port: int = Field(
default=8000,
description="Port the NemoGuard JailbreakDetect NIM is listening on.",
nim_port: Optional[int] = Field(
default=None,
deprecated="Include port in 'nim_base_url' instead. This field will be removed in a future version.",
description="DEPRECATED: Include port in nim_base_url instead",
)
embedding: Optional[str] = Field(
default="nvidia/nv-embedqa-e5-v5",
description="DEPRECATED: Model to use for embedding-based detections. Use NIM instead.",
deprecated=True,
default=None,
deprecated="This field is no longer used.",
)

@model_validator(mode="after")
def migrate_deprecated_fields(self) -> "JailbreakDetectionConfig":
"""Migrate deprecated nim_url/nim_port fields to nim_base_url format."""
if self.nim_url and not self.nim_base_url:
port = self.nim_port or 8000
self.nim_base_url = f"http://{self.nim_url}:{port}/v1"
return self


class AutoAlignOptions(BaseModel):
"""List of guardrails that are activated"""
Expand Down
5 changes: 3 additions & 2 deletions tests/test_configs/jailbreak_nim/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ rails:
config:
jailbreak_detection:
server_endpoint: ""
nim_url: "0.0.0.0"
nim_port: 8000
nim_base_url: "http://0.0.0.0:8000/v1"
nim_server_endpoint: "classify"
api_key_env_var: "JB_NIM_TEST"

input:
flows:
Expand Down
Loading