evidentlyai · emeli-dral · Oct 17, 2024 · Oct 1, 2024 · Oct 1, 2024 · Oct 8, 2024
diff --git a/docs/book/reference/all-metrics.md b/docs/book/reference/all-metrics.md
@@ -274,6 +274,7 @@ Check for regular expression matches.
 | **ExcludesWords()** <ul><li>Checks if the text excludes all specified words.</li><li> Considers only vocabulary words (from NLTK vocabulary). </li><li>By default, considers inflected and variant forms of the same word. </li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExcludesWords(words_list=['buy', 'sell', 'bet']`| **Required:** <br>`words_list: List[str]` <br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li><li>`lemmatize = True` or `False`</li></ul> |
 | **ItemMatch()** <ul><li>Checks whether the text contains **any** (default) or **all** specified  items that are specific to each row (represented as tuples) </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="expected")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li></li><li>`case_sensitive = True` or `False`</li></ul> |
 | **ItemNoMatch()** <ul><li>Checks whether the text excludes **any** (default) or **all** specified  items that are specific to each row (represented as tuples) </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ItemMatch(with_column="forbidden")`| **Required:** <br>`with_column: str`<br><br>**Optional:**<ul><li>`display_name`</li><li>`mode = 'all'` or `'any'`</li></li><li>`case_sensitive = True` or `False`</li></ul> |
+| **IsValidJSON()** <ul><li>Checks if the text in the column is valid JSON.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidJSON()`| **Required:** <br>n/a <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
 | **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact=True**) or minimal (**exact=False**) matching, with optional strict type validation (**validate_types=True**).  </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |
 
 ## Descriptors: Text stats

diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py
@@ -3,6 +3,7 @@
 from .custom_descriptor import CustomPairColumnEval
 from .hf_descriptor import HuggingFaceModel
 from .hf_descriptor import HuggingFaceToxicityModel
+from .is_valid_json_descriptor import IsValidJSON
 from .json_schema_match_descriptor import JSONSchemaMatch
 from .llm_judges import BiasLLMEval
 from .llm_judges import ContextQualityLLMEval
@@ -60,6 +61,7 @@
     "SentenceCount",
     "Sentiment",
     "RegExp",
+    "IsValidJSON",
     "JSONSchemaMatch",
     "_registry",
 ]
diff --git a/src/evidently/descriptors/_registry.py b/src/evidently/descriptors/_registry.py
@@ -111,3 +111,6 @@
     "evidently.descriptors.custom_descriptor.CustomPairColumnEval",
     "evidently:descriptor:CustomPairColumnEval",
 )
+register_type_alias(
+    FeatureDescriptor, "evidently.descriptors.is_valid_json_descriptor.IsValidJSON", "evidently:descriptor:IsValidJSON"
+)
diff --git a/src/evidently/descriptors/is_valid_json_descriptor.py b/src/evidently/descriptors/is_valid_json_descriptor.py
@@ -0,0 +1,20 @@
+from evidently.features import is_valid_json_feature
+from evidently.features.generated_features import FeatureDescriptor
+from evidently.features.generated_features import GeneratedFeature
+
+
+class IsValidJSON(FeatureDescriptor):
+    """Descriptor that flags whether each text value parses as JSON.
+
+    Delegates the per-row check to the ``IsValidJSON`` generated feature.
+    """
+
+    class Config:
+        type_alias = "evidently:descriptor:IsValidJSON"
+
+    def feature(self, column_name: str) -> GeneratedFeature:
+        """Build the generated feature for ``column_name``, forwarding ``display_name``."""
+        return is_valid_json_feature.IsValidJSON(
+            column_name=column_name,
+            display_name=self.display_name,
+        )
diff --git a/src/evidently/features/_registry.py b/src/evidently/features/_registry.py
@@ -85,3 +85,6 @@
 register_type_alias(
     GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence"
 )
+register_type_alias(
+    GeneratedFeatures, "evidently.features.is_valid_json_feature.IsValidJSON", "evidently:feature:IsValidJSON"
+)
diff --git a/src/evidently/features/is_valid_json_feature.py b/src/evidently/features/is_valid_json_feature.py
@@ -0,0 +1,38 @@
+import json
+from typing import Any
+from typing import ClassVar
+from typing import Optional
+
+from evidently import ColumnType
+from evidently.features.generated_features import ApplyColumnGeneratedFeature
+
+
+class IsValidJSON(ApplyColumnGeneratedFeature):
+    """Generated feature that marks each value of a column as valid JSON or not."""
+
+    class Config:
+        type_alias = "evidently:feature:IsValidJSON"
+
+    __feature_type__: ClassVar = ColumnType.Categorical
+    display_name_template: ClassVar = "JSON valid for {column_name}"
+    column_name: str
+
+    def __init__(self, column_name: str, display_name: Optional[str] = None):
+        """Bind the feature to ``column_name`` with an optional custom ``display_name``."""
+        self.column_name = column_name
+        self.display_name = display_name
+        super().__init__()
+
+    def apply(self, value: Any) -> bool:
+        """Return True if ``value`` parses as JSON, otherwise False.
+
+        Non-string inputs such as ``None`` or ``NaN`` (missing cells) are reported
+        as invalid instead of raising.
+        """
+        try:
+            json.loads(value)
+        except (ValueError, TypeError):
+            # ValueError covers json.JSONDecodeError (malformed text);
+            # TypeError is raised for anything that is not str/bytes/bytearray.
+            return False
+        return True
diff --git a/tests/features/test_is_valid_json_feature.py b/tests/features/test_is_valid_json_feature.py
@@ -0,0 +1,26 @@
+import pandas as pd
+import pytest
+
+from evidently.features.is_valid_json_feature import IsValidJSON
+from evidently.pipeline.column_mapping import ColumnMapping
+from evidently.utils.data_preprocessing import create_data_definition
+
+
+@pytest.mark.parametrize(
+    ("item", "expected"),
+    [
+        ('{"test": "abc"}', True),
+        ("[1, 2, 3]", True),
+        ("not json", False),
+        ('{"test": "abc"', False),
+    ],
+)
+def test_is_valid_json_feature(item: str, expected: bool):
+    """A single-row column is flagged True only when its text parses as JSON."""
+    feature_generator = IsValidJSON("column_1")
+    data = pd.DataFrame(dict(column_1=[item]))
+    result = feature_generator.generate_feature(
+        data=data,
+        data_definition=create_data_definition(None, data, ColumnMapping()),
+    )
+    assert result.equals(pd.DataFrame(dict(column_1=[expected])))