Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate output format and edge cases #149

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
83 changes: 83 additions & 0 deletions src/poprox_recommender/request_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import random
from datetime import datetime, timedelta
from typing import List
from uuid import uuid4

from pydantic import ValidationError

from poprox_concepts import AccountInterest, Click, InterestProfile
from poprox_concepts.api.recommendations import RecommendationRequest
from poprox_recommender.data.mind import MindData


class RequestGenerator:
    """
    Build synthetic recommendation requests from MIND data.

    One generator stands for one simulated user. Random click history,
    onboarding topics and candidate articles are attached with the ``add_*``
    methods, and ``get_request`` assembles them into a ``RecommendationRequest``.
    """

    # NOTE(review): open question from code review -- should this class live in
    # ``poprox_recommender.testing`` rather than in a module of its own?

    def __init__(self, mind_data: MindData):
        """
        Args:
            mind_data: MIND dataset wrapper; this class reads ``news_df`` and calls
                ``news_uuid_for_id`` and ``lookup_article`` on it.
        """
        self.mind_data = mind_data
        self.candidate_articles = []
        self.past_articles = []
        self.added_topics = []
        self.clicks = []
        # One identity for the simulated user, shared by the interest profile
        # and by every onboarding topic attached to it.
        self.profile_id = uuid4()
        # Filled in by set_num_recs(); get_request() refuses to run until it is.
        self.num_recs = None

    def set_num_recs(self, num_recs: int):
        """Set how many recommendations the generated request asks for."""
        self.num_recs = num_recs

    def add_clicks(self, num_clicks: int, num_days: int | None = None):
        """
        Replace the click history with ``num_clicks`` randomly chosen articles.

        Args:
            num_clicks: number of clicks to generate.
            num_days: when truthy, each click is dated on a random day within the
                last ``num_days`` days (today included); when ``None`` or ``0``,
                every click is stamped with the current time.
        """
        all_articles = list(self.mind_data.news_df.index)
        now = datetime.now()

        if num_days:
            start_date = now - timedelta(days=num_days - 1)
            # Each offset is drawn independently, so the list is already in random order.
            timestamps = [start_date + timedelta(days=random.randint(0, num_days - 1)) for _ in range(num_clicks)]
        else:
            timestamps = [now] * num_clicks

        # generate click history
        self.clicks = [
            Click(
                article_id=self.mind_data.news_uuid_for_id(random.choice(all_articles)),
                newsletter_id=uuid4(),
                timestamp=timestamp,
            )
            for timestamp in timestamps
        ]

        self.past_articles = [self.mind_data.lookup_article(uuid=click.article_id) for click in self.clicks]

    def add_topics(self, topics: List[str]):
        """
        Replace the onboarding topics with one ``AccountInterest`` per topic name.

        Every interest is attached to this generator's profile ID (as requested in
        review) and receives a random preference between 1 and 5.
        """
        self.added_topics = [
            AccountInterest(
                account_id=self.profile_id,
                entity_id=uuid4(),
                entity_name=topic,
                preference=random.randint(1, 5),
                frequency=None,
            )
            for topic in topics
        ]

    def add_candidates(self, num_candidates):
        """
        Replace the candidate articles with ``num_candidates`` distinct random articles.

        Raises:
            ValueError: from ``random.sample`` when more candidates are requested
                than there are articles in ``news_df``.
        """
        all_articles = list(self.mind_data.news_df.index)
        selected_candidates = random.sample(all_articles, num_candidates)

        self.candidate_articles = [self.mind_data.lookup_article(id=article_id) for article_id in selected_candidates]

    def get_request(self) -> RecommendationRequest:
        """
        Assemble the accumulated data into a ``RecommendationRequest``.

        Raises:
            ValueError: if ``set_num_recs`` has not been called, or if the assembled
                request fails validation.
        """
        if self.num_recs is None:
            raise ValueError("num_recs is not set; call set_num_recs() before get_request()")

        interest_profile = InterestProfile(
            profile_id=self.profile_id,
            click_history=self.clicks,
            onboarding_topics=self.added_topics,
        )

        try:
            return RecommendationRequest(
                past_articles=self.past_articles,
                todays_articles=self.candidate_articles,
                interest_profile=interest_profile,
                num_recs=self.num_recs,
            )
        except ValidationError as e:
            # Chain the cause so the underlying validation traceback is preserved.
            raise ValueError(f"Generated request is invalid: {e}") from e
6 changes: 6 additions & 0 deletions src/poprox_recommender/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pytest import fixture

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.data.mind import MindData

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -128,3 +129,8 @@ def auto_service() -> Generator[TestService, None, None]:
# use already-running docker
port = os.environ.get("POPROX_TEST_PORT", "9000")
yield DockerTestService(f"http://localhost:{port}/2015-03-31/functions/function/invocations")


@fixture(scope="session")
def mind_data():
    """
    Give tests session-wide access to the MIND dataset.

    Yields a zero-argument factory. Calling it returns a ``MindData`` instance
    that is built on the first call and reused afterwards, so tests that call
    ``mind_data()`` keep working without each one constructing its own copy.
    """
    # Imported locally so the fixture does not depend on the module's import block.
    from functools import cache

    yield cache(MindData)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be yield MindData() — we want to actually instantiate the MIND data object.

59 changes: 59 additions & 0 deletions tests/web_service/test_basic_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Test basic request by initializing test data.
"""

import logging
import warnings

from pydantic import ValidationError
from pytest import mark, skip

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.config import allow_data_test_failures
from poprox_recommender.recommenders import recommendation_pipelines
from poprox_recommender.request_generator import RequestGenerator
from poprox_recommender.testing import auto_service as service
from poprox_recommender.testing import mind_data

logger = logging.getLogger(__name__)

# Pipeline names are resolved at import time because they feed ``mark.parametrize`` below.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if allow_data_test_failures():
        skip("recommendation pipelines unavailable", allow_module_level=True)
    else:
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise


@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_basic_request(service, mind_data, pipeline):  # noqa: F811
    """
    A typical user (click history plus onboarding topics) gets a full, duplicate-free slate.
    """
    # NOTE(review): reviewer asked whether ``mark.docker`` is needed on this test and what it is for.
    # NOTE(review): the ``mind_data`` fixture yields a factory rather than an instance, hence the
    # call; if the fixture is changed to yield a ``MindData`` instance, pass it without parentheses.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=37, num_days=7)
    request_generator.add_topics(
        [
            "Science",
            "Technology",
            "Sports",
            "Lifestyle",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # Exactly one user was requested, so exactly one recommendation list may come back.
    # NOTE(review): reviewer also suggested asserting that the single key is the request's
    # account ID -- TODO confirm the key type of ``response.recommendations`` before adding it.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
68 changes: 68 additions & 0 deletions tests/web_service/test_heavy_interaction_history.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""
Validate edge case: a user with long history of clicks and onboarding topics
"""

import logging
import warnings

from pydantic import ValidationError
from pytest import mark, skip

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.config import allow_data_test_failures
from poprox_recommender.recommenders import recommendation_pipelines
from poprox_recommender.request_generator import RequestGenerator
from poprox_recommender.testing import auto_service as service
from poprox_recommender.testing import mind_data

logger = logging.getLogger(__name__)

# Pipeline names are resolved at import time because they feed ``mark.parametrize`` below.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if allow_data_test_failures():
        skip("recommendation pipelines unavailable", allow_module_level=True)
    else:
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise


@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_heavy_interaction_history(service, mind_data, pipeline):  # noqa: F811
    """
    A user with a long click history and many onboarding topics gets a full, duplicate-free slate.
    """
    # The ``mind_data`` fixture yields a factory rather than an instance, hence the call.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=100, num_days=10)
    request_generator.add_topics(
        [
            "U.S. news",
            "World news",
            "Politics",
            "Business",
            "Entertainment",
            "Sports",
            "Health",
            "Science",
            "Technology",
            "Lifestyle",
            "Religion",
            "Climate and environment",
            "Education",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # Exactly one user was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
59 changes: 59 additions & 0 deletions tests/web_service/test_no_clicks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Validate edge case: user has no click history
"""

import logging
import warnings

from pydantic import ValidationError
from pytest import mark, skip

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.config import allow_data_test_failures
from poprox_recommender.recommenders import recommendation_pipelines
from poprox_recommender.request_generator import RequestGenerator
from poprox_recommender.testing import auto_service as service
from poprox_recommender.testing import mind_data

logger = logging.getLogger(__name__)

# Pipeline names are resolved at import time because they feed ``mark.parametrize`` below.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if allow_data_test_failures():
        skip("recommendation pipelines unavailable", allow_module_level=True)
    else:
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise


@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_clicks(service, mind_data, pipeline):  # noqa: F811
    """
    A user with onboarding topics but no click history still gets a full, duplicate-free slate.
    """
    # The ``mind_data`` fixture yields a factory rather than an instance, hence the call.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=0, num_days=10)
    request_generator.add_topics(
        [
            "Science",
            "Technology",
            "Sports",
            "Lifestyle",
            "Oddities",
        ]
    )
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # Exactly one user was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
51 changes: 51 additions & 0 deletions tests/web_service/test_no_clicks_no_onboarding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Validate edge case: user has no click history & no onboarding preferences
"""

import logging
import warnings

from pydantic import ValidationError
from pytest import mark, skip

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.config import allow_data_test_failures
from poprox_recommender.recommenders import recommendation_pipelines
from poprox_recommender.request_generator import RequestGenerator
from poprox_recommender.testing import auto_service as service
from poprox_recommender.testing import mind_data

logger = logging.getLogger(__name__)

# Pipeline names are resolved at import time because they feed ``mark.parametrize`` below.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if allow_data_test_failures():
        skip("recommendation pipelines unavailable", allow_module_level=True)
    else:
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise


@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_clicks_no_onboarding(service, mind_data, pipeline):  # noqa: F811
    """
    A user with neither click history nor onboarding topics still gets a full, duplicate-free slate.
    """
    # The ``mind_data`` fixture yields a factory rather than an instance, hence the call.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=0, num_days=0)
    request_generator.add_topics([])
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # Exactly one user was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
51 changes: 51 additions & 0 deletions tests/web_service/test_no_onboarding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""
Validate edge case: user has no onboarding topics
"""

import logging
import warnings

from pydantic import ValidationError
from pytest import mark, skip

from poprox_concepts.api.recommendations import RecommendationRequest, RecommendationResponse
from poprox_recommender.config import allow_data_test_failures
from poprox_recommender.recommenders import recommendation_pipelines
from poprox_recommender.request_generator import RequestGenerator
from poprox_recommender.testing import auto_service as service
from poprox_recommender.testing import mind_data

logger = logging.getLogger(__name__)

# Pipeline names are resolved at import time because they feed ``mark.parametrize`` below.
try:
    PIPELINES = recommendation_pipelines().keys()
except Exception:
    warnings.warn("failed to load models, did you run `dvc pull`?")
    if allow_data_test_failures():
        skip("recommendation pipelines unavailable", allow_module_level=True)
    else:
        # Bare ``raise`` re-raises the active exception with its original traceback.
        raise


@mark.docker
@mark.parametrize("pipeline", PIPELINES)
def test_no_onboarding(service, mind_data, pipeline):  # noqa: F811
    """
    A user with click history but no onboarding topics still gets a full, duplicate-free slate.
    """
    # The ``mind_data`` fixture yields a factory rather than an instance, hence the call.
    request_generator = RequestGenerator(mind_data())
    request_generator.add_candidates(100)
    request_generator.add_clicks(num_clicks=37, num_days=7)
    request_generator.add_topics([])
    request_generator.set_num_recs(10)
    req_body = request_generator.get_request()

    logger.info("sending request")
    response = service.request(req_body, pipeline)
    logger.info("response: %s", response.model_dump_json(indent=2))

    # Exactly one user was requested, so exactly one recommendation list may come back.
    assert len(response.recommendations) == 1
    recs = next(iter(response.recommendations.values()))
    assert len(recs) == request_generator.num_recs
    article_ids = [article.article_id for article in recs]
    assert len(article_ids) == len(set(article_ids))
Loading
Loading