Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/add contains link descriptor #1333

Merged
merged 10 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/book/reference/all-metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ Check for regular expression matches.
| **ExactMatch()** <ul><li>Checks if the text matches between two columns.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `ExactMatch(column_name='column_1')`| **Required:** <br>`with_column` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **IsValidJSON()** <ul><li>Checks if the text in a specified column is a valid JSON.</li><li>Returns True/False for every input. </li></ul> Example use:<br> `IsValidJSON(column_name='column_1')`| **Required:** <br>`column_name` <br><br>**Optional:**<ul><li>`display_name`</li></ul> |
| **JSONSchemaMatch()** <ul><li>Checks if the text contains a JSON object matching the **expected_schema**. Supports exact (**exact_match=True**) or minimal (**exact_match=False**) matching, with optional strict type validation (**validate_types=True**). </li><li>Returns True/False for each row. </li></ul> Example use:<br> `JSONSchemaMatch(expected_schema={"name": str, "age": int}, exact_match=False, validate_types=True)`| **Required:** <br>`expected_schema: Dict[str, type]`<br><br>**Optional:**<ul><li>`exact_match = True` or `False`</li><li>`validate_types = True` or `False`</li></ul> |
| **ContainsLink()** <ul><li>Checks if the text contains at least one valid URL. A URL must include both a scheme and a host (for example `https://example.com`); a bare `www.example.com` is not counted as a link. </li><li>Returns True/False for each row. </li></ul> Example use:<br> `ContainsLink(column_name='column_1')`| **Required:** <br>`column_name: str`<br><br>**Optional:**<ul><li>`display_name`</li></ul> |

## Descriptors: Text stats

Expand Down
2 changes: 2 additions & 0 deletions src/evidently/descriptors/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from . import _registry
from .contains_link_descriptor import ContainsLink
from .custom_descriptor import CustomColumnEval
from .custom_descriptor import CustomPairColumnEval
from .exact_match_descriptor import ExactMatch
Expand Down Expand Up @@ -65,6 +66,7 @@
"Sentiment",
"ExactMatch",
"RegExp",
"ContainsLink",
"WordMatch",
"WordNoMatch",
"IsValidJSON",
Expand Down
5 changes: 5 additions & 0 deletions src/evidently/descriptors/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@
"evidently.descriptors.custom_descriptor.CustomPairColumnEval",
"evidently:descriptor:CustomPairColumnEval",
)
# Register the ContainsLink descriptor: dotted class path paired with the alias
# "evidently:descriptor:ContainsLink", the same string the descriptor class
# declares as its `Config.type_alias`.
# NOTE(review): presumably the two strings must stay in sync — confirm against
# `register_type_alias`.
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.contains_link_descriptor.ContainsLink",
"evidently:descriptor:ContainsLink",
)
# Register the ExactMatch descriptor under its alias in the same way.
register_type_alias(
FeatureDescriptor, "evidently.descriptors.exact_match_descriptor.ExactMatch", "evidently:descriptor:ExactMatch"
)
Expand Down
11 changes: 11 additions & 0 deletions src/evidently/descriptors/contains_link_descriptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from evidently.features import contains_link_feature
from evidently.features.generated_features import FeatureDescriptor
from evidently.features.generated_features import GeneratedFeature


class ContainsLink(FeatureDescriptor):
    """Descriptor that produces the link-detection generated feature for a column."""

    class Config:
        type_alias = "evidently:descriptor:ContainsLink"

    def feature(self, column_name: str) -> GeneratedFeature:
        """Build the ``ContainsLink`` feature for ``column_name``.

        The descriptor's own ``display_name`` is forwarded to the feature.
        """
        link_feature = contains_link_feature.ContainsLink(
            column_name=column_name,
            display_name=self.display_name,
        )
        return link_feature
3 changes: 3 additions & 0 deletions src/evidently/features/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@
# Register the WordsPresence feature: dotted class path paired with its alias.
register_type_alias(
GeneratedFeatures, "evidently.features.words_feature.WordsPresence", "evidently:feature:WordsPresence"
)
# Register the ContainsLink feature. The alias is the same string the feature
# class declares as its `Config.type_alias`.
# NOTE(review): presumably the two strings must stay in sync — confirm against
# `register_type_alias`.
register_type_alias(
GeneratedFeatures, "evidently.features.contains_link_feature.ContainsLink", "evidently:feature:ContainsLink"
)
# Register the ExactMatchFeature feature under its alias in the same way.
register_type_alias(
GeneratedFeatures, "evidently.features.exact_match_feature.ExactMatchFeature", "evidently:feature:ExactMatchFeature"
)
Expand Down
36 changes: 36 additions & 0 deletions src/evidently/features/contains_link_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from typing import Any
from typing import ClassVar
from typing import Optional
from urllib.parse import urlparse

import numpy as np

from evidently import ColumnType
from evidently.features.generated_features import ApplyColumnGeneratedFeature


class ContainsLink(ApplyColumnGeneratedFeature):
    """Generated feature telling whether a text value contains at least one URL.

    The value is split on whitespace and each token is parsed with
    ``urllib.parse.urlparse``. A token counts as a link only when it has both
    a scheme and a network location, so ``https://example.com`` matches while
    a bare ``www.example.com`` does not.
    """

    class Config:
        type_alias = "evidently:feature:ContainsLink"

    __feature_type__: ClassVar = ColumnType.Categorical
    display_name_template: ClassVar = "{column_name} contains link"
    column_name: str

    def __init__(self, column_name: str, display_name: Optional[str] = None):
        """Store the source column name and the optional display name."""
        self.column_name = column_name
        self.display_name = display_name
        super().__init__()

    def apply(self, value: Any) -> bool:
        """Return ``True`` if ``value`` contains a token that parses as a URL.

        Args:
            value: A single cell value. Non-string values are converted with
                ``str()`` before being inspected.

        Returns:
            ``True`` when at least one whitespace-separated token has both a
            scheme and a netloc; ``False`` otherwise, including for ``None``
            and float NaN.
        """
        # Missing values yield a real bool (not the int 0) so every row of the
        # result has the same type.
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return False

        for token in str(value).split():
            try:
                parsed = urlparse(token)
            except ValueError:
                # urlparse raises on a malformed netloc (e.g. an unmatched
                # "[" as in "http://[::1"); such a token is not a valid link
                # and must not abort processing of the whole column.
                continue
            if parsed.scheme and parsed.netloc:
                return True
        return False
35 changes: 35 additions & 0 deletions tests/features/test_contains_link_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd

from evidently.features.contains_link_feature import ContainsLink
from evidently.pipeline.column_mapping import ColumnMapping
from evidently.utils.data_preprocessing import create_data_definition


def test_contains_link_feature():
    """ContainsLink is True only for rows holding a URL with scheme and host."""
    texts = [
        "Check out https://example.com for more info",  # https link
        "Visit our website at http://www.test.com.",  # http link
        "No link here, just plain text",  # plain text
        "Another string without a link",  # plain text
        "Here is a malformed link: www.test.com",  # no scheme, so not a link
    ]
    expected_flags = [True, True, False, False, False]

    frame = pd.DataFrame({"column_1": texts})
    definition = create_data_definition(None, frame, ColumnMapping())

    # Run the feature over column_1 and compare against the expected flags.
    generated = ContainsLink("column_1").generate_feature(
        data=frame,
        data_definition=definition,
    )

    assert generated.equals(pd.DataFrame({"column_1": expected_flags}))