From a2e15b1009e348efabf95835a105d518630d19de Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 5 Oct 2024 00:25:36 +0200 Subject: [PATCH 1/7] implement logic --- src/evidently/features/_registry.py | 3 ++ .../features/contains_link_feature.py | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 src/evidently/features/contains_link_feature.py diff --git a/src/evidently/features/_registry.py b/src/evidently/features/_registry.py index ba2e101f5f..e276752bd0 100644 --- a/src/evidently/features/_registry.py +++ b/src/evidently/features/_registry.py @@ -74,3 +74,6 @@ register_type_alias( GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence" ) +register_type_alias( + GeneratedFeatures, "evidently.features.text_contains_link_feature.ContainsLink", "evidently:feature:ContainsLink" +) diff --git a/src/evidently/features/contains_link_feature.py b/src/evidently/features/contains_link_feature.py new file mode 100644 index 0000000000..3d1978363a --- /dev/null +++ b/src/evidently/features/contains_link_feature.py @@ -0,0 +1,36 @@ +from typing import Any +from typing import ClassVar +from typing import Optional +from urllib.parse import urlparse + +import numpy as np + +from evidently import ColumnType +from evidently.features.generated_features import ApplyColumnGeneratedFeature + + +class ContainsLink(ApplyColumnGeneratedFeature): + class Config: + type_alias = "evidently:feature:ContainsLink" + + __feature_type__: ClassVar = ColumnType.Categorical + display_name_template: ClassVar = "{column_name} contains link" + column_name: str + + def __init__(self, column_name: str, display_name: Optional[str] = None): + self.column_name = column_name + self.display_name = display_name + super().__init__() + + def apply(self, value: Any): + if value is None or (isinstance(value, float) and np.isnan(value)): + return 0 + # Split the text into words + words = str(value).split() + + # Check if any word is a valid 
URL using urlparse + for word in words: + parsed = urlparse(word) + if parsed.scheme and parsed.netloc: + return True + return False From aec2b4adb836bc0c7bd2ccbeeba9ddfee0381b46 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 5 Oct 2024 00:30:18 +0200 Subject: [PATCH 2/7] implement descriptor --- src/evidently/descriptors/__init__.py | 2 ++ src/evidently/descriptors/_registry.py | 5 +++++ src/evidently/descriptors/contains_link_descriptor.py | 11 +++++++++++ 3 files changed, 18 insertions(+) create mode 100644 src/evidently/descriptors/contains_link_descriptor.py diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py index 2520abdbe3..32fb6ee096 100644 --- a/src/evidently/descriptors/__init__.py +++ b/src/evidently/descriptors/__init__.py @@ -1,4 +1,5 @@ from . import _registry +from .contains_link_descriptor import ContainsLink from .custom_descriptor import CustomColumnEval from .custom_descriptor import CustomPairColumnEval from .hf_descriptor import HuggingFaceModel @@ -55,5 +56,6 @@ "SentenceCount", "Sentiment", "RegExp", + "ContainsLink", "_registry", ] diff --git a/src/evidently/descriptors/_registry.py b/src/evidently/descriptors/_registry.py index 0f912a86fe..3c860a2f73 100644 --- a/src/evidently/descriptors/_registry.py +++ b/src/evidently/descriptors/_registry.py @@ -100,3 +100,8 @@ "evidently.descriptors.custom_descriptor.CustomPairColumnEval", "evidently:descriptor:CustomPairColumnEval", ) +register_type_alias( + FeatureDescriptor, + "evidently.descriptors.contains_link_descriptor.ContainsLink", + "evidently:descriptor:ContainsLink", +) diff --git a/src/evidently/descriptors/contains_link_descriptor.py b/src/evidently/descriptors/contains_link_descriptor.py new file mode 100644 index 0000000000..dfc78f7e6a --- /dev/null +++ b/src/evidently/descriptors/contains_link_descriptor.py @@ -0,0 +1,11 @@ +from evidently.features import contains_link_feature +from evidently.features.generated_features import 
FeatureDescriptor +from evidently.features.generated_features import GeneratedFeature + + +class ContainsLink(FeatureDescriptor): + class Config: + type_alias = "evidently:descriptor:ContainsLink" + + def feature(self, column_name: str) -> GeneratedFeature: + return contains_link_feature.ContainsLink(column_name, self.display_name) From 88811329e2108bca61276e1b362f481c0e8586a4 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sat, 5 Oct 2024 00:33:03 +0200 Subject: [PATCH 3/7] implement tests --- tests/features/test_contains_link_feature.py | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tests/features/test_contains_link_feature.py diff --git a/tests/features/test_contains_link_feature.py b/tests/features/test_contains_link_feature.py new file mode 100644 index 0000000000..ab95f05d3a --- /dev/null +++ b/tests/features/test_contains_link_feature.py @@ -0,0 +1,35 @@ +import pandas as pd + +from evidently.features.contains_link_feature import ContainsLink +from evidently.pipeline.column_mapping import ColumnMapping +from evidently.utils.data_preprocessing import create_data_definition + + +def test_contains_link_feature(): + # Initialize the ContainsLink feature generator for column_1 + feature_generator = ContainsLink("column_1") + + # Sample data with varying texts that contain or don't contain links + data = pd.DataFrame( + dict( + column_1=[ + "Check out https://example.com for more info", # Contains a valid link + "Visit our website at http://www.test.com.", # Contains a valid link + "No link here, just plain text", # No link + "Another string without a link", # No link + "Here is a malformed link: www.test.com", # Invalid link (missing scheme) + ] + ) + ) + + # Generate the feature + result = feature_generator.generate_feature( + data=data, + data_definition=create_data_definition(None, data, ColumnMapping()), + ) + + # Expected result: True for valid links, False otherwise + expected_result = pd.DataFrame(dict(column_1=[True, True, 
False, False, False])) + + # Assert that the generated result matches the expected result + assert result.equals(expected_result) From a2c530f95891e59d41dfde5193f762d5cfff6ad0 Mon Sep 17 00:00:00 2001 From: Your Name Date: Mon, 7 Oct 2024 21:19:51 +0200 Subject: [PATCH 4/7] Fix typo --- src/evidently/features/_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evidently/features/_registry.py b/src/evidently/features/_registry.py index e276752bd0..0989012682 100644 --- a/src/evidently/features/_registry.py +++ b/src/evidently/features/_registry.py @@ -75,5 +75,5 @@ GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence" ) register_type_alias( - GeneratedFeatures, "evidently.features.text_contains_link_feature.ContainsLink", "evidently:feature:ContainsLink" + GeneratedFeatures, "evidently.features.contains_link_feature.ContainsLink", "evidently:feature:ContainsLink" ) From 94897b95eef520f429add59eb4322e6b9d01e613 Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 16 Oct 2024 23:12:29 +0200 Subject: [PATCH 5/7] Add ContainsLink to text_evals --- src/evidently/metric_preset/text_evals.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/evidently/metric_preset/text_evals.py b/src/evidently/metric_preset/text_evals.py index 8750baadc4..7081473d0f 100644 --- a/src/evidently/metric_preset/text_evals.py +++ b/src/evidently/metric_preset/text_evals.py @@ -4,6 +4,7 @@ from typing import Optional from evidently.descriptors import OOV +from evidently.descriptors import ContainsLink from evidently.descriptors import NonLetterCharacterPercentage from evidently.descriptors import SentenceCount from evidently.descriptors import Sentiment @@ -36,5 +37,6 @@ def generate_metrics( Sentiment(), OOV(), NonLetterCharacterPercentage(), + ContainsLink(), ] return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors] From 7b74501d675d19da0669bacfb6cdd03188f59a77 Mon Sep 17 00:00:00 2001 
From: Your Name Date: Mon, 21 Oct 2024 22:15:53 +0200 Subject: [PATCH 6/7] Update all-metrics.md --- docs/book/reference/all-metrics.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/book/reference/all-metrics.md b/docs/book/reference/all-metrics.md index ba78b2f570..8db08ceb75 100644 --- a/docs/book/reference/all-metrics.md +++ b/docs/book/reference/all-metrics.md @@ -277,6 +277,7 @@ Check for regular expression matches. | **ExactMatch()**
  • Checks if the text matches between two columns.
  • Returns True/False for every input.
Example use:
`ExactMatch(column_name='column_1')`| **Required:**
`with_column`

**Optional:**
  • `display_name`
| | **IsValidJSON()**
  • Checks if the text in a specified column is a valid JSON.
  • Returns True/False for every input.
Example use:
`IsValidJSON(column_name='column_1')`| **Required:**
`column_name`

**Optional:**
  • `display_name`
| | **JSONSchemaMatch()**
  • Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact=True**) or minimal (**exact=False**) matching, with optional strict type validation (**validate_types=True**).
  • Returns True/False for each row.
Example use:
`JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:**
`expected_schema: Dict[str, type]`

**Optional:**
  • `exact_match = True` or `False`
  • `validate_types = True` or `False`
| +| **ContainsLink()**
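  • Checks if the text contains at least one valid URL: a whitespace-separated token that has both a scheme and a network location (e.g. `https://example.com`). A bare domain such as `www.example.com` is not counted as a link.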
  • Returns True/False for each row.
Example use:
`ContainsLink(column_name='column_1')`| **Required:**
`column_name: str`

**Optional:**
  • `display_name`
| ## Descriptors: Text stats From 1e6e09a4b0ec9a9879672d9d9333cb99e7c8cc08 Mon Sep 17 00:00:00 2001 From: Sifr'un <36736908+Rayryu@users.noreply.github.com> Date: Mon, 21 Oct 2024 22:16:59 +0200 Subject: [PATCH 7/7] Update text_evals.py --- src/evidently/metric_preset/text_evals.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/evidently/metric_preset/text_evals.py b/src/evidently/metric_preset/text_evals.py index 7081473d0f..8750baadc4 100644 --- a/src/evidently/metric_preset/text_evals.py +++ b/src/evidently/metric_preset/text_evals.py @@ -4,7 +4,6 @@ from typing import Optional from evidently.descriptors import OOV -from evidently.descriptors import ContainsLink from evidently.descriptors import NonLetterCharacterPercentage from evidently.descriptors import SentenceCount from evidently.descriptors import Sentiment @@ -37,6 +36,5 @@ def generate_metrics( Sentiment(), OOV(), NonLetterCharacterPercentage(), - ContainsLink(), ] return [ColumnSummaryMetric(desc.on(self.column_name)) for desc in descriptors]