diff --git a/README.md b/README.md
index d866a1c1b..b3ac96786 100644
--- a/README.md
+++ b/README.md
@@ -164,7 +164,7 @@ rails:
     flows:
       - self check facts
       - self check hallucination
-      - activefence moderation
+      - activefence moderation on output
 
   config:
     # Configure the types of entities that should be masked on user input.
diff --git a/docs/user-guides/community/active-fence.md b/docs/user-guides/community/active-fence.md
index 0c02b0d88..bf1507910 100644
--- a/docs/user-guides/community/active-fence.md
+++ b/docs/user-guides/community/active-fence.md
@@ -1,24 +1,24 @@
 # ActiveFence Integration
 
-NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
+NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input and output rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
 
 ```yaml
 rails:
   input:
     flows:
       # The simplified version
-      - activefence moderation
+      - activefence moderation on input
 
       # The detailed version with individual risk scores
-      # - activefence moderation detailed
+      # - activefence moderation on input detailed
 ```
 
-The `activefence moderation` flow uses the maximum risk score with an 0.85 threshold to decide if the input should be allowed or not (i.e., if the risk score is above the threshold, it is considered a violation). The `activefence moderation detailed` has individual scores per category of violation.
+The `activefence moderation on input` flow uses the maximum risk score with a 0.7 threshold to decide whether the text should be allowed (i.e., if the risk score is above the threshold, the text is considered a violation). The `activefence moderation on input detailed` flow checks individual risk scores per violation category.
 
-To customize the scores, you have to overwrite the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/activefence/flows.co) in your config. For example, to change the threshold for `activefence moderation` you can add the following flow to your config:
+To customize the thresholds, you have to override the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/activefence/flows.co) in your config. For example, to change the threshold for `activefence moderation on input`, you can add the following flow to your config:
 
 ```colang
-define subflow activefence moderation
+define subflow activefence moderation on input
   """Guardrail based on the maximum risk score."""
-  $result = execute call activefence api
+  $result = execute call_activefence_api(text=$user_message)
 
diff --git a/docs/user-guides/guardrails-library.md b/docs/user-guides/guardrails-library.md
index 901dcc223..99c69b07f 100644
--- a/docs/user-guides/guardrails-library.md
+++ b/docs/user-guides/guardrails-library.md
@@ -684,7 +684,7 @@ This category of rails relies on 3rd party APIs for various guardrailing tasks.
 
 ### ActiveFence
 
-NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
+NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input and output rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
 
 #### Example usage
 
@@ -692,7 +692,10 @@ NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.ac
 rails:
   input:
     flows:
-      - activefence moderation
+      - activefence moderation on input
+  output:
+    flows:
+      - activefence moderation on output
 ```
 
 For more details, check out the [ActiveFence Integration](./community/active-fence.md) page.
diff --git a/examples/sample_config.yml b/examples/sample_config.yml
index 254a8d3fb..fb5887eeb 100644
--- a/examples/sample_config.yml
+++ b/examples/sample_config.yml
@@ -7,7 +7,7 @@ rails:
     flows:
       - self check input
       - check sensitive data
-      - activefence moderation
+      - activefence moderation on input
       - check toxicity
       - some other moderation thing
 
@@ -35,7 +35,7 @@ rails:
     flows:
       - self check facts
       - self check hallucination
-      - activefence moderation
+      - activefence moderation on output
       - check sensitive data
 
 # Execution rails are triggered before and after an action is invoked
diff --git a/nemoguardrails/library/activefence/actions.py b/nemoguardrails/library/activefence/actions.py
index 2b01974ad..a9ee04c4d 100644
--- a/nemoguardrails/library/activefence/actions.py
+++ b/nemoguardrails/library/activefence/actions.py
@@ -25,19 +25,17 @@
 log = logging.getLogger(__name__)
 
 
-@action(name="call activefence api", is_system_action=True)
-async def call_activefence_api(context: Optional[dict] = None):
+@action(is_system_action=True)
+async def call_activefence_api(text: Optional[str] = None):
     api_key = os.environ.get("ACTIVEFENCE_API_KEY")
 
     if api_key is None:
         raise ValueError("ACTIVEFENCE_API_KEY environment variable not set.")
 
-    user_message = context.get("user_message")
-
     url = "https://apis.activefence.com/sync/v3/content/text"
     headers = {"af-api-key": api_key, "af-source": "nemo-guardrails"}
     data = {
-        "text": user_message,
+        "text": text,
         "content_id": "ng-" + new_uuid(),
     }
diff --git a/nemoguardrails/library/activefence/flows.co b/nemoguardrails/library/activefence/flows.co
index 05fdc3d0b..7c41d19ff 100644
--- a/nemoguardrails/library/activefence/flows.co
+++ b/nemoguardrails/library/activefence/flows.co
@@ -13,9 +13,9 @@ Adult Content
 General
 Privacy Violation
 PII
 """
-flow activefence moderation
+flow activefence moderation on input
   """Guardrail based on the maximum risk score."""
-  $result = await CallActivefenceApiAction()
+  $result = await CallActivefenceApiAction(text=$user_message)
 
   if $result.max_risk_score > 0.7
     if $system.config.enable_rails_exceptions
@@ -24,9 +24,20 @@
     bot refuse to respond
     abort
 
-flow activefence moderation detailed
+flow activefence moderation on output
+  """Guardrail based on the maximum risk score, applied to the bot message."""
+  $result = await CallActivefenceApiAction(text=$bot_message)
+
+  if $result.max_risk_score > 0.7
+    if $system.config.enable_rails_exceptions
+      send ActiveFenceModerationRailException(message="ActiveFence moderation triggered. The maximum risk score exceeded the threshold.")
+    else
+      bot refuse to respond
+      abort
+
+flow activefence moderation on input detailed
   """Guardrail based on individual risk scores."""
-  $result = await CallActivefenceApiAction()
+  $result = await CallActivefenceApiAction(text=$user_message)
 
   if $result.violations.get("abusive_or_harmful.harassment_or_bullying", 0) > 0.8
     if $system.config.enable_rails_exceptions
diff --git a/nemoguardrails/library/activefence/flows.v1.co b/nemoguardrails/library/activefence/flows.v1.co
index 02d6ffd57..efebe4abd 100644
--- a/nemoguardrails/library/activefence/flows.v1.co
+++ b/nemoguardrails/library/activefence/flows.v1.co
@@ -13,9 +13,9 @@ Adult Content
 General
 Privacy Violation
 PII
 """
-define subflow activefence moderation
+define subflow activefence moderation on input
   """Guardrail based on the maximum risk score."""
-  $result = execute call activefence api
+  $result = execute call_activefence_api(text=$user_message)
 
   if $result.max_risk_score > 0.7
     if $config.enable_rails_exceptions
@@ -24,9 +24,21 @@
     bot refuse to respond
     stop
 
-define subflow activefence moderation detailed
+define subflow activefence moderation on output
+  """Guardrail based on the maximum risk score, applied to the bot message."""
+  $result = execute call_activefence_api(text=$bot_message)
+
+  if $result.max_risk_score > 0.7
+    if $config.enable_rails_exceptions
+      create event ActiveFenceModerationRailException(message="ActiveFence moderation triggered. The maximum risk score exceeded the threshold.")
+    else
+      bot refuse to respond
+      stop
+
+
+define subflow activefence moderation on input detailed
   """Guardrail based on individual risk scores."""
-  $result = execute call activefence api
+  $result = execute call_activefence_api(text=$user_message)
 
   if $result.violations.get("abusive_or_harmful.harassment_or_bullying", 0) > 0.8
     if $config.enable_rails_exceptions
diff --git a/tests/test_active_fence_input_rail.py b/tests/test_activefence_rail.py
similarity index 68%
rename from tests/test_active_fence_input_rail.py
rename to tests/test_activefence_rail.py
index 41a76047c..51c6fbc9c 100644
--- a/tests/test_active_fence_input_rail.py
+++ b/tests/test_activefence_rail.py
@@ -19,7 +19,7 @@
 from tests.utils import TestChat
 
 
-def test_1(monkeypatch):
+def test_input(monkeypatch):
     monkeypatch.setenv("ACTIVEFENCE_API_KEY", "xxx")
 
     config = RailsConfig.from_content(
@@ -43,7 +43,7 @@
         rails:
           input:
             flows:
-              - activefence moderation
+              - activefence moderation on input
         """,
     )
     chat = TestChat(
@@ -88,3 +88,47 @@
     chat >> "you are stupid!"
 
     chat << "I'm sorry, I can't respond to that."
+
+
+def test_output(monkeypatch):
+    monkeypatch.setenv("ACTIVEFENCE_API_KEY", "xxx")
+
+    config = RailsConfig.from_content(
+        yaml_content="""
+        models:
+          - type: main
+            engine: openai
+            model: gpt-3.5-turbo-instruct
+
+        rails:
+          output:
+            flows:
+              - activefence moderation on output
+        """,
+    )
+    chat = TestChat(
+        config,
+        llm_completions=[
+            " You are stupid!",
+        ],
+    )
+
+    with aioresponses() as m:
+        m.post(
+            "https://apis.activefence.com/sync/v3/content/text",
+            payload={
+                "response_id": "36f76a43-ddbe-4308-bc86-1a2b068a00ea",
+                "entity_id": "59fe8fe0-5036-494f-970c-8e28305a3716",
+                "entity_type": "content",
+                "violations": [
+                    {
+                        "violation_type": "abusive_or_harmful.profanity",
+                        "risk_score": 0.95,
+                    }
+                ],
+                "errors": [],
+            },
+        )
+
+        chat >> "Hello!"
+
+        chat << "I'm sorry, I can't respond to that."
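
Putting the pieces together, a minimal end-to-end sketch of the renamed flows in use (not part of the patch above; the `openai` model entry and the greeting are illustrative, and `ACTIVEFENCE_API_KEY` must be set in the environment):

```python
from nemoguardrails import LLMRails, RailsConfig

# Both rails share the same action: the input rail passes $user_message and
# the output rail passes $bot_message as the `text` argument.
YAML_CONFIG = """
models:
  - type: main
    engine: openai
    model: gpt-3.5-turbo-instruct

rails:
  input:
    flows:
      - activefence moderation on input
  output:
    flows:
      - activefence moderation on output
"""

config = RailsConfig.from_content(yaml_content=YAML_CONFIG)
rails = LLMRails(config)

# Either rail triggers "bot refuse to respond" when the maximum risk score
# returned by the ActiveScore API exceeds the default 0.7 threshold.
response = rails.generate(messages=[{"role": "user", "content": "Hello!"}])
print(response["content"])
```

Passing the text explicitly, rather than reading `user_message` from the context inside the action, is what lets the single `call_activefence_api` action back both the input and the output rail.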