Skip to content

Commit

Permalink
🚸(backend) improve users similarity search and sort results
Browse files Browse the repository at this point in the history
In some edge cases, when the domain part of email addresses is
longer than the name part, user searches by email similarity
can return a lot of results.

We can improve this by being more demanding on similarity when
the query looks like an email. Sorting results by the similarity
score is also an obvious improvement.

At the moment, we still think it is good to propose results with
a weak similarity on the name part because we want to avoid,
as much as possible, creating duplicate users by inviting, via one
of their many emails, a user who is already in our database.

Fixes 399
  • Loading branch information
AntoLC authored and sampaccoud committed Nov 2, 2024
1 parent 50891af commit f9de9c7
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 0 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ and this project adheres to

## [Unreleased]

## Changed

- 🚸(backend) improve users similarity search and sort results #391

## [1.7.0] - 2024-10-24

## Added
Expand Down
14 changes: 14 additions & 0 deletions src/backend/core/api/viewsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from django.conf import settings
from django.contrib.postgres.aggregates import ArrayAgg
from django.contrib.postgres.search import TrigramSimilarity
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.db.models import (
Expand Down Expand Up @@ -156,8 +157,21 @@ def get_queryset(self):

# Filter users by email similarity
if query := self.request.GET.get("q", ""):
# For performance reasons we filter first by similarity, which relies on an index,
# then only calculate precise similarity scores for sorting purposes
queryset = queryset.filter(email__trigram_word_similar=query)

queryset = queryset.annotate(
similarity=TrigramSimilarity("email", query)
)
# When the query only is on the name part, we should try to make many proposals
# But when the query looks like an email we should only propose serious matches
threshold = 0.6 if "@" in query else 0.1

queryset = queryset.filter(similarity__gt=threshold).order_by(
"-similarity"
)

return queryset

@decorators.action(
Expand Down
28 changes: 28 additions & 0 deletions src/backend/core/tests/test_api_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,34 @@ def test_api_users_list_query_email():
assert user_ids == [str(nicole.id), str(frank.id)]


def test_api_users_list_query_email_matching():
    """
    Searching users by email should keep only sufficiently similar addresses
    and return them ordered by decreasing similarity.
    """
    requester = factories.UserFactory()

    api_client = APIClient()
    api_client.force_login(requester)

    # Candidate users sharing the same domain; only those bound to a name are
    # expected to show up in the results asserted below.
    alice = factories.UserFactory(email="alice.johnson@example.gouv.fr")
    factories.UserFactory(email="jane.smith@example.gouv.fr")
    michael_wilson = factories.UserFactory(email="michael.wilson@example.gouv.fr")
    factories.UserFactory(email="david.jones@example.gouv.fr")
    michael_brown = factories.UserFactory(email="michael.brown@example.gouv.fr")
    factories.UserFactory(email="sophia.taylor@example.gouv.fr")

    # Each scenario pairs a request URL with the users expected in the
    # response, in the exact order they must be returned.
    scenarios = (
        (
            "/api/v1.0/users/?q=michael.johnson@example.gouv.f",
            [michael_wilson],
        ),
        (
            "/api/v1.0/users/?q=michael.johnson@example.gouv.fr",
            [michael_wilson, alice, michael_brown],
        ),
    )

    for url, expected_users in scenarios:
        response = api_client.get(url)

        assert response.status_code == 200
        returned_ids = [entry["id"] for entry in response.json()["results"]]
        assert returned_ids == [str(expected.id) for expected in expected_users]


def test_api_users_list_query_email_exclude_doc_user():
"""
Authenticated users should be able to list users
Expand Down

0 comments on commit f9de9c7

Please sign in to comment.