-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Validate output format and edge cases #149
base: main
Are you sure you want to change the base?
Changes from all commits
acbfe7d
7e36120
4d09cac
a48a185
c42aa31
3f5f0a7
b318954
44dfe75
9f9d999
2c39963
0dca776
f8fca37
281fa79
adf45f7
fd093c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
import random | ||
from datetime import datetime, timedelta | ||
from typing import List | ||
from uuid import uuid4 | ||
|
||
from pydantic import ValidationError | ||
|
||
from poprox_concepts import AccountInterest, Click, InterestProfile | ||
from poprox_concepts.api.recommendations import RecommendationRequest | ||
from poprox_recommender.data.mind import MindData | ||
|
||
|
||
class RequestGenerator:
    """
    Build a ``RecommendationRequest`` from synthetic click history, onboarding
    topics, and candidate articles drawn from MIND.

    Call ``add_candidates``, ``add_clicks``, ``add_topics`` and ``set_num_recs``
    as needed, then ``get_request`` to assemble the request. Every request built
    by one generator describes the same profile (``self.profile_id``).
    """

    def __init__(self, mind_data: MindData):
        """
        Args:
            mind_data: MIND data source; its ``news_df`` index supplies article
                IDs and it resolves them to articles.
        """
        self.mind_data = mind_data
        self.candidate_articles = list()
        self.past_articles = list()
        self.added_topics = list()
        self.clicks = list()
        # A single identity shared by the interest profile and all of its
        # onboarding topics, so the topics actually belong to the profile.
        self.profile_id = uuid4()
        # Stays None until set_num_recs() is called; get_request() checks it.
        self.num_recs: int | None = None

    def set_num_recs(self, num_recs: int):
        """Set how many recommendations the generated request asks for."""
        self.num_recs = num_recs

    def add_clicks(self, num_clicks: int, num_days: int | None = None):
        """
        Replace the click history with ``num_clicks`` clicks on random articles.

        Args:
            num_clicks: number of clicks to generate.
            num_days: when truthy, each click is dated on a random day within
                the last ``num_days`` days (today included); when ``None`` or
                ``0``, every click is timestamped now.
        """
        all_articles = list(self.mind_data.news_df.index)

        if num_days:
            # Earliest allowed day, so that offsets 0..num_days-1 end at today.
            start_date = datetime.now() - timedelta(days=num_days - 1)
            timestamps = [start_date + timedelta(days=random.randint(0, num_days - 1)) for _ in range(num_clicks)]
            random.shuffle(timestamps)
        else:
            timestamps = [datetime.now()] * num_clicks

        # generate click history
        self.clicks = [
            Click(
                article_id=self.mind_data.news_uuid_for_id(random.choice(all_articles)),
                newsletter_id=uuid4(),
                timestamp=timestamp,
            )
            for timestamp in timestamps
        ]

        # The request also carries the full article behind every click.
        self.past_articles = [self.mind_data.lookup_article(uuid=click.article_id) for click in self.clicks]

    def add_topics(self, topics: List[str]):
        """
        Replace the onboarding topics with one interest per name in ``topics``.

        Each interest gets a random preference between 1 and 5 and no frequency.
        """
        self.added_topics = [
            AccountInterest(
                # Review note: this must be the profile ID, not a fresh random
                # UUID per topic, otherwise the interests belong to nobody.
                account_id=self.profile_id,
                entity_id=uuid4(),
                entity_name=topic,
                preference=random.randint(1, 5),
                frequency=None,
            )
            for topic in topics
        ]

    def add_candidates(self, num_candidates: int):
        """
        Replace the candidate pool with ``num_candidates`` distinct random articles.

        Raises:
            ValueError: from ``random.sample`` if more candidates are requested
                than there are articles.
        """
        all_articles = list(self.mind_data.news_df.index)
        selected_candidates = random.sample(all_articles, num_candidates)

        self.candidate_articles = [self.mind_data.lookup_article(id=article_id) for article_id in selected_candidates]

    def get_request(self) -> RecommendationRequest:
        """
        Assemble the recommendation request from everything added so far.

        Raises:
            ValueError: if ``set_num_recs`` was never called, or if the
                assembled request fails validation.
        """
        if self.num_recs is None:
            raise ValueError("num_recs is not set; call set_num_recs() before get_request()")

        interest_profile = InterestProfile(
            profile_id=self.profile_id,
            click_history=self.clicks,
            onboarding_topics=self.added_topics,
        )

        try:
            return RecommendationRequest(
                past_articles=self.past_articles,
                todays_articles=self.candidate_articles,
                interest_profile=interest_profile,
                num_recs=self.num_recs,
            )
        except ValidationError as e:
            # Keep the validation error as the cause for debugging.
            raise ValueError(f"Generated request is invalid: {e}") from e
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
from pytest import fixture | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.data.mind import MindData | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -128,3 +129,8 @@ def auto_service() -> Generator[TestService, None, None]: | |
# use already-running docker | ||
port = os.environ.get("POPROX_TEST_PORT", "9000") | ||
yield DockerTestService(f"http://localhost:{port}/2015-03-31/functions/function/invocations") | ||
|
||
|
||
@fixture(scope="session")
def mind_data():
    """
    Session-scoped fixture giving tests access to MIND data.

    NOTE(review): this yields the ``MindData`` class itself rather than an
    instance, so a test has to call the fixture value (``mind_data()``) to get
    a usable object. Switching to an instance must be done together with the
    tests that call it.
    """
    yield MindData
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should be |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
""" | ||
Test basic request by initializing test data. | ||
""" | ||
|
||
import logging | ||
import warnings | ||
|
||
from pydantic import ValidationError | ||
from pytest import mark, skip | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.config import allow_data_test_failures | ||
from poprox_recommender.recommenders import recommendation_pipelines | ||
from poprox_recommender.request_generator import RequestGenerator | ||
from poprox_recommender.testing import auto_service as service | ||
from poprox_recommender.testing import mind_data | ||
|
||
logger = logging.getLogger(__name__)

# Resolve the pipeline names once at import time so they can parametrize the
# test below; a failure here usually means the model files are missing.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception as e:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if not allow_data_test_failures():
        raise e
    # Missing data is tolerated in this configuration: skip the whole module.
    skip("recommendation pipelines unavailable", allow_module_level=True)
|
||
|
||
@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_basic_request(service, mind_data, pipeline):  # noqa: F811
    """
    Typical user: a week of clicks plus a few onboarding topics.

    Checks that each pipeline answers for exactly one profile with the
    requested number of distinct articles.
    """
    # The mind_data fixture yields the MindData class, so instantiate it here.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=37, num_days=7)
    request_generator.add_topics(
        [
            "Science",
            "Technology",
            "Sports",
            "Lifestyle",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # One profile was requested, so exactly one recommendation list may come
    # back; the service must not recommend for users we did not ask about.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    # No article may be recommended twice.
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
""" | ||
Validate edge case: a user with long history of clicks and onboarding topics | ||
""" | ||
|
||
import logging | ||
import warnings | ||
|
||
from pydantic import ValidationError | ||
from pytest import mark, skip | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.config import allow_data_test_failures | ||
from poprox_recommender.recommenders import recommendation_pipelines | ||
from poprox_recommender.request_generator import RequestGenerator | ||
from poprox_recommender.testing import auto_service as service | ||
from poprox_recommender.testing import mind_data | ||
|
||
logger = logging.getLogger(__name__)

# Pipeline names are needed at import time to parametrize the test; loading
# them fails when the model files have not been fetched.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception as e:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if not allow_data_test_failures():
        raise e
    # Data failures are allowed here, so skip this module instead of erroring.
    skip("recommendation pipelines unavailable", allow_module_level=True)
|
||
|
||
@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_heavy_interaction_history(service, mind_data, pipeline):  # noqa: F811
    """
    Edge case: a user with a long click history and many onboarding topics.

    Checks that each pipeline answers for exactly one profile with the
    requested number of distinct articles.
    """
    # The mind_data fixture yields the MindData class, so instantiate it here.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=100, num_days=10)
    request_generator.add_topics(
        [
            "U.S. news",
            "World news",
            "Politics",
            "Business",
            "Entertainment",
            "Sports",
            "Health",
            "Science",
            "Technology",
            "Lifestyle",
            "Religion",
            "Climate and environment",
            "Education",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # One profile was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    # No article may be recommended twice.
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
""" | ||
Validate edge case: user has no click history | ||
""" | ||
|
||
import logging | ||
import warnings | ||
|
||
from pydantic import ValidationError | ||
from pytest import mark, skip | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.config import allow_data_test_failures | ||
from poprox_recommender.recommenders import recommendation_pipelines | ||
from poprox_recommender.request_generator import RequestGenerator | ||
from poprox_recommender.testing import auto_service as service | ||
from poprox_recommender.testing import mind_data | ||
|
||
logger = logging.getLogger(__name__)

# The parametrized test needs the pipeline names at import time; this lookup
# fails when the model files are not available locally.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception as e:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if not allow_data_test_failures():
        raise e
    # Tolerated failure: skip every test in this module.
    skip("recommendation pipelines unavailable", allow_module_level=True)
|
||
|
||
@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_clicks(service, mind_data, pipeline):  # noqa: F811
    """
    Edge case: a user with onboarding topics but an empty click history.

    Checks that each pipeline still answers for exactly one profile with the
    requested number of distinct articles.
    """
    # The mind_data fixture yields the MindData class, so instantiate it here.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=0, num_days=10)
    request_generator.add_topics(
        [
            "Science",
            "Technology",
            "Sports",
            "Lifestyle",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # One profile was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    # No article may be recommended twice.
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
""" | ||
Validate edge case: user has no click history & no onboarding preferences | ||
""" | ||
|
||
import logging | ||
import warnings | ||
|
||
from pydantic import ValidationError | ||
from pytest import mark, skip | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.config import allow_data_test_failures | ||
from poprox_recommender.recommenders import recommendation_pipelines | ||
from poprox_recommender.request_generator import RequestGenerator | ||
from poprox_recommender.testing import auto_service as service | ||
from poprox_recommender.testing import mind_data | ||
|
||
logger = logging.getLogger(__name__)

# Collect the pipeline names up front for parametrization; if they cannot be
# loaded, the model files are most likely missing.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception as e:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if not allow_data_test_failures():
        raise e
    # Failure is acceptable in this configuration: skip the module.
    skip("recommendation pipelines unavailable", allow_module_level=True)
|
||
|
||
@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_clicks_no_onboarding(service, mind_data, pipeline):  # noqa: F811
    """
    Edge case: a user with neither click history nor onboarding topics.

    Checks that each pipeline still answers for exactly one profile with the
    requested number of distinct articles.
    """
    # The mind_data fixture yields the MindData class, so instantiate it here.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=0, num_days=0)
    request_generator.add_topics([])
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # One profile was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    # No article may be recommended twice.
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
""" | ||
Validate edge case: user has no on boarding topic | ||
""" | ||
|
||
import logging | ||
import warnings | ||
|
||
from pydantic import ValidationError | ||
from pytest import mark, skip | ||
|
||
from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse | ||
from poprox_recommender.config import allow_data_test_failures | ||
from poprox_recommender.recommenders import recommendation_pipelines | ||
from poprox_recommender.request_generator import RequestGenerator | ||
from poprox_recommender.testing import auto_service as service | ||
from poprox_recommender.testing import mind_data | ||
|
||
logger = logging.getLogger(__name__)

# Pipeline names drive the parametrized test, so they are loaded at import
# time; a failure typically means the model files were never pulled.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception as e:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if not allow_data_test_failures():
        raise e
    # Allowed to fail here, so skip the module rather than error out.
    skip("recommendation pipelines unavailable", allow_module_level=True)
|
||
|
||
@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_onboarding(service, mind_data, pipeline):  # noqa: F811
    """
    Edge case: a user with click history but no onboarding topics.

    Checks that each pipeline still answers for exactly one profile with the
    requested number of distinct articles.
    """
    # The mind_data fixture yields the MindData class, so instantiate it here.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=37, num_days=7)
    request_generator.add_topics([])
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # One profile was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    # No article may be recommended twice.
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should this live in `testing`, or does it make more sense as a separate module?