Update action to support output rail #895

Closed
2 changes: 1 addition & 1 deletion README.md
@@ -164,7 +164,7 @@ rails:
    flows:
      - self check facts
      - self check hallucination
-     - activefence moderation
+     - activefence moderation on input

  config:
    # Configure the types of entities that should be masked on user input.
12 changes: 6 additions & 6 deletions docs/user-guides/community/active-fence.md
@@ -1,24 +1,24 @@
# ActiveFence Integration

-NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
+NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input and output rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).

```yaml
rails:
  input:
    flows:
      # The simplified version
-     - activefence moderation
+     - activefence moderation on input

      # The detailed version with individual risk scores
-     # - activefence moderation detailed
+     # - activefence moderation on input detailed
```

-The `activefence moderation` flow uses the maximum risk score with a 0.85 threshold to decide if the input should be allowed or not (i.e., if the risk score is above the threshold, it is considered a violation). The `activefence moderation detailed` flow reports individual scores per violation category.
+The `activefence moderation on input` flow uses the maximum risk score with a 0.85 threshold to decide if the text should be allowed or not (i.e., if the risk score is above the threshold, it is considered a violation). The `activefence moderation on input detailed` flow reports individual scores per violation category.

-To customize the scores, you have to overwrite the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/activefence/flows.co) in your config. For example, to change the threshold for `activefence moderation` you can add the following flow to your config:
+To customize the scores, you have to overwrite the [default flows](https://github.com/NVIDIA/NeMo-Guardrails/tree/develop/nemoguardrails/library/activefence/flows.co) in your config. For example, to change the threshold for `activefence moderation on input` you can add the following flow to your config:

```colang
-define subflow activefence moderation
+define subflow activefence moderation on input
    """Guardrail based on the maximum risk score."""
    $result = execute call activefence api

```
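The threshold decision described above can be sketched in plain Python (a minimal illustration with hypothetical helper names `max_risk_score` and `should_block`; the real decision lives in the Colang flows, and the ActiveFence response fields are taken from the test payload in this PR):

```python
def max_risk_score(violations):
    """Return the highest risk score across the reported violations."""
    return max((v["risk_score"] for v in violations), default=0.0)


def should_block(violations, threshold=0.85):
    """Flag the text as a violation when the maximum risk score exceeds the threshold."""
    return max_risk_score(violations) > threshold


# A response reporting a single high-risk violation is blocked at the default threshold.
violations = [{"violation_type": "abusive_or_harmful.profanity", "risk_score": 0.95}]
print(should_block(violations))  # True
```

The same comparison serves input and output rails; only the text being scored (user message vs. bot message) differs.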
7 changes: 5 additions & 2 deletions docs/user-guides/guardrails-library.md
@@ -684,15 +684,18 @@ This category of rails relies on 3rd party APIs for various guardrailing tasks.

### ActiveFence

-NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).
+NeMo Guardrails supports using the [ActiveFence ActiveScore API](https://docs.activefence.com/index.html) as an input and output rail out-of-the-box (you need to have the `ACTIVEFENCE_API_KEY` environment variable set).

#### Example usage

```yaml
rails:
  input:
    flows:
-     - activefence moderation
+     - activefence moderation on input
+  output:
+    flows:
+      - activefence moderation on output
```

For more details, check out the [ActiveFence Integration](./community/active-fence.md) page.
4 changes: 2 additions & 2 deletions examples/sample_config.yml
@@ -7,7 +7,7 @@ rails:
    flows:
      - self check input
      - check sensitive data
-     - activefence moderation
+     - activefence moderation on input
      - check toxicity
      - some other moderation thing

@@ -35,7 +35,7 @@ rails:
    flows:
      - self check facts
      - self check hallucination
-     - activefence moderation
+     - activefence moderation on output
      - check sensitive data

# Execution rails are triggered before and after an action is invoked
8 changes: 3 additions & 5 deletions nemoguardrails/library/activefence/actions.py
@@ -25,19 +25,17 @@
log = logging.getLogger(__name__)


-@action(name="call activefence api", is_system_action=True)
-async def call_activefence_api(context: Optional[dict] = None):
+@action(is_system_action=True)
+async def call_activefence_api(text: Optional[str] = None):
    api_key = os.environ.get("ACTIVEFENCE_API_KEY")

    if api_key is None:
        raise ValueError("ACTIVEFENCE_API_KEY environment variable not set.")

-    user_message = context.get("user_message")
-
    url = "https://apis.activefence.com/sync/v3/content/text"
    headers = {"af-api-key": api_key, "af-source": "nemo-guardrails"}
    data = {
-        "text": user_message,
+        "text": text,
        "content_id": "ng-" + new_uuid(),
    }

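The refactored action receives the text to moderate directly instead of reading `user_message` from the context, so the same code path can serve both input and output rails. A stdlib-only sketch of the request it assembles (field names taken from the diff above; `uuid.uuid4` stands in for the library's `new_uuid` helper, and `build_activefence_request` is a hypothetical name):

```python
import uuid


def build_activefence_request(text, api_key):
    """Assemble the headers and JSON body for the ActiveFence sync text endpoint."""
    headers = {"af-api-key": api_key, "af-source": "nemo-guardrails"}
    data = {
        "text": text,
        "content_id": "ng-" + str(uuid.uuid4()),  # unique id per moderated text
    }
    return headers, data


headers, data = build_activefence_request("you are stupid!", "xxx")
print(data["content_id"][:3])  # ng-
```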
19 changes: 15 additions & 4 deletions nemoguardrails/library/activefence/flows.co
@@ -13,9 +13,9 @@ Adult Content General
Privacy Violation PII
"""

-flow activefence moderation
+flow activefence moderation on input
"""Guardrail based on the maximum risk score."""
$result = await CallActivefenceApiAction()
$result = await CallActivefenceApiAction(text=$user_message)

    if $result.max_risk_score > 0.7
        if $system.config.enable_rails_exceptions
@@ -24,9 +24,20 @@ flow activefence moderation
            bot refuse to respond
            abort

-flow activefence moderation detailed
+flow activefence moderation on output
+    """Guardrail based on the maximum risk score."""
+    $result = await CallActivefenceApiAction(text=$bot_message)
+
+    if $result.max_risk_score > 0.7
+        if $system.config.enable_rails_exceptions
+            send ActiveFenceModerationRailException(message="ActiveFence moderation triggered. The maximum risk score exceeded the threshold.")
+        else
+            bot refuse to respond
+            abort
+
+flow activefence moderation on input detailed
     """Guardrail based on individual risk scores."""
-    $result = await CallActivefenceApiAction()
+    $result = await CallActivefenceApiAction(text=$user_message)

    if $result.violations.get("abusive_or_harmful.harassment_or_bullying", 0) > 0.8
        if $system.config.enable_rails_exceptions
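The `detailed` variants check each violation category against its own threshold rather than the maximum score. That lookup can be sketched in Python (only the harassment threshold of 0.8 appears in the diff; the mapping name and any further entries are hypothetical):

```python
CATEGORY_THRESHOLDS = {
    "abusive_or_harmful.harassment_or_bullying": 0.8,
    # further categories would map to their own thresholds
}


def exceeded_categories(violations, thresholds=CATEGORY_THRESHOLDS):
    """Return the categories whose individual risk score exceeds its threshold."""
    return [
        category
        for category, limit in thresholds.items()
        if violations.get(category, 0) > limit
    ]


print(exceeded_categories({"abusive_or_harmful.harassment_or_bullying": 0.9}))
# ['abusive_or_harmful.harassment_or_bullying']
```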
20 changes: 16 additions & 4 deletions nemoguardrails/library/activefence/flows.v1.co
@@ -13,9 +13,9 @@ Adult Content General
Privacy Violation PII
"""

-define subflow activefence moderation
+define subflow activefence moderation on input
"""Guardrail based on the maximum risk score."""
$result = execute call activefence api
$result = execute call_activefence_api(text=$user_message)

    if $result.max_risk_score > 0.7
        if $config.enable_rails_exceptions
@@ -24,9 +24,21 @@ define subflow activefence moderation
            bot refuse to respond
            stop

-define subflow activefence moderation detailed
+define subflow activefence moderation on output
+    """Guardrail based on the maximum risk score."""
+    $result = execute call_activefence_api(text=$bot_message)
+
+    if $result.max_risk_score > 0.7
+        if $config.enable_rails_exceptions
+            create event ActiveFenceModerationRailException(message="ActiveFence moderation triggered. The maximum risk score exceeded the threshold.")
+        else
+            bot refuse to respond
+            stop
+
+
+define subflow activefence moderation on input detailed
     """Guardrail based on individual risk scores."""
-    $result = execute call activefence api
+    $result = execute call_activefence_api(text=$user_message)

    if $result.violations.get("abusive_or_harmful.harassment_or_bullying", 0) > 0.8
        if $config.enable_rails_exceptions
@@ -19,7 +19,7 @@
from tests.utils import TestChat


-def test_1(monkeypatch):
+def test_input(monkeypatch):
    monkeypatch.setenv("ACTIVEFENCE_API_KEY", "xxx")

    config = RailsConfig.from_content(
@@ -43,7 +43,7 @@ def test_input(monkeypatch):
        rails:
          input:
            flows:
-             - activefence moderation
+             - activefence moderation on input
        """,
    )
    chat = TestChat(
@@ -88,3 +88,47 @@ def test_input(monkeypatch):

    chat >> "you are stupid!"
    chat << "I'm sorry, I can't respond to that."


+def test_output(monkeypatch):
+    monkeypatch.setenv("ACTIVEFENCE_API_KEY", "xxx")
+
+    config = RailsConfig.from_content(
+        yaml_content="""
+        models:
+          - type: main
+            engine: openai
+            model: gpt-3.5-turbo-instruct
+
+        rails:
+          output:
+            flows:
+              - activefence moderation on output
+        """,
+    )
+    chat = TestChat(
+        config,
+        llm_completions=[
+            " You are stupid!",
+        ],
+    )
+
+    with aioresponses() as m:
+        m.post(
+            "https://apis.activefence.com/sync/v3/content/text",
+            payload={
+                "response_id": "36f76a43-ddbe-4308-bc86-1a2b068a00ea",
+                "entity_id": "59fe8fe0-5036-494f-970c-8e28305a3716",
+                "entity_type": "content",
+                "violations": [
+                    {
+                        "violation_type": "abusive_or_harmful.profanity",
+                        "risk_score": 0.95,
+                    }
+                ],
+                "errors": [],
+            },
+        )
+
+        chat >> "Hello!"
+        chat << "I'm sorry, I can't respond to that."