From 43547b183364d9f4b5953b748e812a4b0caba9c9 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 14 Dec 2023 10:50:54 -0800 Subject: [PATCH 1/2] feat: Count implicit tags --- openedx_tagging/core/tagging/api.py | 15 ++++++++--- .../core/tagging/rest_api/v1/views.py | 5 +++- .../openedx_tagging/core/tagging/test_api.py | 26 +++++++++++++++++++ .../core/tagging/test_views.py | 26 ++++++++++++++++--- 4 files changed, 64 insertions(+), 8 deletions(-) diff --git a/openedx_tagging/core/tagging/api.py b/openedx_tagging/core/tagging/api.py index 64d5c6a83..52e4f84bb 100644 --- a/openedx_tagging/core/tagging/api.py +++ b/openedx_tagging/core/tagging/api.py @@ -196,7 +196,7 @@ def get_object_tags( return tags -def get_object_tag_counts(object_id_pattern: str) -> dict[str, int]: +def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[str, int]: """ Given an object ID, a "starts with" glob pattern like "course-v1:foo+bar+baz@*", or a list of "comma,separated,IDs", return a @@ -217,8 +217,17 @@ def get_object_tag_counts(object_id_pattern: str) -> dict[str, int]: qs = qs.exclude(taxonomy_id=None) # The whole taxonomy was deleted qs = qs.exclude(taxonomy__enabled=False) # The whole taxonomy is disabled qs = qs.exclude(tag_id=None, taxonomy__allow_free_text=False) # The taxonomy exists but the tag is deleted - qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id") - return {row["object_id"]: row["num_tags"] for row in qs} + if count_implicit: + tags = Tag.annotate_depth(Tag.objects.filter(pk=models.OuterRef("tag_id"))) + qs = qs.annotate(tag_depth=models.Subquery(tags.values('depth'))) + qs = qs.values("object_id").annotate( + num_tags=models.Count("id"), + num_implicit_tags=models.Sum("tag_depth"), + ).order_by("object_id") + return {row["object_id"]: row["num_tags"] + (row["num_implicit_tags"] or 0) for row in qs} + else: + qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id") + return {row["object_id"]: row["num_tags"] for row in qs} def delete_object_tags(object_id: str): diff --git a/openedx_tagging/core/tagging/rest_api/v1/views.py b/openedx_tagging/core/tagging/rest_api/v1/views.py index 591e95e15..99b6a59be 100644 --- a/openedx_tagging/core/tagging/rest_api/v1/views.py +++ b/openedx_tagging/core/tagging/rest_api/v1/views.py @@ -496,9 +496,11 @@ class ObjectTagCountsView( **Retrieve Parameters** * object_id_pattern (required): - The Object ID to retrieve ObjectTags for. Can contain '*' at the end for wildcard matching, or use ',' to separate multiple object IDs. + * count_implicit (optional): If present, implicit parent/grandparent tags will be included in the counts **Retrieve Example Requests** GET api/tagging/v1/object_tag_counts/:object_id_pattern + GET api/tagging/v1/object_tag_counts/:object_id_pattern?count_implicit **Retrieve Query Returns** * 200 - Success @@ -517,8 +519,9 @@ def retrieve(self, request, *args, **kwargs) -> Response: """ # This API does NOT bother doing any permission checks as the # of tags is not considered sensitive information. object_id_pattern = self.kwargs["object_id_pattern"] + count_implicit = "count_implicit" in request.query_params try: - return Response(get_object_tag_counts(object_id_pattern)) + return Response(get_object_tag_counts(object_id_pattern, count_implicit=count_implicit)) except ValueError as err: raise ValidationError(err.args[0]) from err diff --git a/tests/openedx_tagging/core/tagging/test_api.py b/tests/openedx_tagging/core/tagging/test_api.py index 5ad7e3fa4..400073cd8 100644 --- a/tests/openedx_tagging/core/tagging/test_api.py +++ b/tests/openedx_tagging/core/tagging/test_api.py @@ -704,6 +704,29 @@ def test_get_object_tag_counts(self) -> None: assert tagging_api.get_object_tag_counts(f"{obj1},{obj2}") == {obj1: 1, obj2: 2} assert tagging_api.get_object_tag_counts("object_*") == {obj1: 1, obj2: 2} + def test_get_object_tag_counts_implicit(self) -> None: + """ + Basic test of get_object_tag_counts, including implicit (parent) tags + + Note that: + - "DPANN" is "Archaea > DPANN" (2 tags, 1 implicit), and + - "Chordata" is "Eukaryota > Animalia > Chordata" (3 tags, 2 implicit) + """ + obj1 = "object_id1" + obj2 = "object_id2" + other = "other_object" + # Give each object 1-2 tags: + tagging_api.tag_object(object_id=obj1, taxonomy=self.taxonomy, tags=["DPANN"]) + tagging_api.tag_object(object_id=obj2, taxonomy=self.taxonomy, tags=["Chordata"]) + tagging_api.tag_object(object_id=obj2, taxonomy=self.free_text_taxonomy, tags=["has a notochord"]) + tagging_api.tag_object(object_id=other, taxonomy=self.free_text_taxonomy, tags=["other"]) + + assert tagging_api.get_object_tag_counts(obj1, count_implicit=True) == {obj1: 2} + assert tagging_api.get_object_tag_counts(obj2, count_implicit=True) == {obj2: 4} + assert tagging_api.get_object_tag_counts(f"{obj1},{obj2}", count_implicit=True) == {obj1: 2, obj2: 4} + assert tagging_api.get_object_tag_counts("object_*", count_implicit=True) == {obj1: 2, obj2: 4} + assert tagging_api.get_object_tag_counts(other, count_implicit=True) == {other: 1} + def test_get_object_tag_counts_deleted_disabled(self) -> None: """ Test that get_object_tag_counts doesn't "count" disabled taxonomies or @@ -726,6 +749,9 @@ def test_get_object_tag_counts_deleted_disabled(self) -> None: self.free_text_taxonomy.enabled = False self.free_text_taxonomy.save() assert tagging_api.get_object_tag_counts("object_*") == {obj1: 1, obj2: 1} + # Also check the result with count_implicit: + # "English" has no implicit tags but "Chordata" has two, so we expect these totals: + assert tagging_api.get_object_tag_counts("object_*", count_implicit=True) == {obj1: 1, obj2: 3} # But, by the way, if we re-enable the taxonomy and restore the tag, the counts return: self.free_text_taxonomy.enabled = True diff --git a/tests/openedx_tagging/core/tagging/test_views.py b/tests/openedx_tagging/core/tagging/test_views.py index 00151f307..ae1eef4fd 100644 --- a/tests/openedx_tagging/core/tagging/test_views.py +++ b/tests/openedx_tagging/core/tagging/test_views.py @@ -1028,10 +1028,14 @@ def test_get_counts(self): # Course 7 Unit 2 api.tag_object(object_id="course07-unit02-problem01", taxonomy=self.free_text_taxonomy, tags=["b"]) api.tag_object(object_id="course07-unit02-problem02", taxonomy=self.free_text_taxonomy, tags=["c", "d"]) - api.tag_object(object_id="course07-unit02-problem03", taxonomy=self.free_text_taxonomy, tags=["N", "M", "x"]) - - def check(object_id_pattern: str): - result = self.client.get(OBJECT_TAG_COUNTS_URL.format(object_id_pattern=object_id_pattern)) + api.tag_object(object_id="course07-unit02-problem03", taxonomy=self.free_text_taxonomy, tags=["N", "M"]) + api.tag_object(object_id="course07-unit02-problem03", taxonomy=self.taxonomy, tags=["Mammalia"]) + + def check(object_id_pattern: str, count_implicit=False): + url = OBJECT_TAG_COUNTS_URL.format(object_id_pattern=object_id_pattern) + if count_implicit: + url += "?count_implicit" + result = self.client.get(url) assert result.status_code == status.HTTP_200_OK return result.data @@ -1044,6 +1048,20 @@ def check(object_id_pattern: str): "course07-unit01-problem01": 3, "course07-unit01-problem02": 2, } + with self.assertNumQueries(1): + assert check(object_id_pattern="course07-unit02-*") == { + "course07-unit02-problem01": 1, + "course07-unit02-problem02": 2, + "course07-unit02-problem03": 3, + } + with self.assertNumQueries(1): + assert check(object_id_pattern="course07-unit02-*", count_implicit=True) == { + "course07-unit02-problem01": 1, + "course07-unit02-problem02": 2, + # "Mammalia" includes 1 explicit + 3 implicit tags: "Eukaryota > Animalia > Chordata > Mammalia" + # so problem03 has 2 free text tags and "4" life on earth tags: + "course07-unit02-problem03": 6, + } with self.assertNumQueries(1): assert check(object_id_pattern="course07-unit*") == { "course07-unit01-problem01": 3, From eb845c4e56338738379436495fca112be0a23209 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Mon, 18 Dec 2023 12:59:34 -0800 Subject: [PATCH 2/2] fix: Incorrect count of implicit tags when object had multiple tags --- openedx_tagging/core/tagging/api.py | 29 +++++++++++++++---- openedx_tagging/core/tagging/models/utils.py | 22 +++++++++++++- .../openedx_tagging/core/tagging/test_api.py | 13 +++++++-- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/openedx_tagging/core/tagging/api.py b/openedx_tagging/core/tagging/api.py index 52e4f84bb..9cde39a47 100644 --- a/openedx_tagging/core/tagging/api.py +++ b/openedx_tagging/core/tagging/api.py @@ -21,7 +21,7 @@ from .data import TagDataQuerySet from .models import ObjectTag, Tag, Taxonomy -from .models.utils import ConcatNull +from .models.utils import ConcatNull, StringAgg # Export this as part of the API TagDoesNotExist = Tag.DoesNotExist @@ -218,13 +218,32 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[ qs = qs.exclude(taxonomy__enabled=False) # The whole taxonomy is disabled qs = qs.exclude(tag_id=None, taxonomy__allow_free_text=False) # The taxonomy exists but the tag is deleted if count_implicit: - tags = Tag.annotate_depth(Tag.objects.filter(pk=models.OuterRef("tag_id"))) - qs = qs.annotate(tag_depth=models.Subquery(tags.values('depth'))) + # Counting the implicit tags is tricky, because if two "grandchild" tags have the same implicit parent tag, we + # need to count that parent tag only once. To do that, we collect all the ancestor tag IDs into an aggregate + # string, and then count the unique values using python qs = qs.values("object_id").annotate( num_tags=models.Count("id"), - num_implicit_tags=models.Sum("tag_depth"), + tag_ids_str_1=StringAgg("tag_id"), + tag_ids_str_2=StringAgg("tag__parent_id"), + tag_ids_str_3=StringAgg("tag__parent__parent_id"), + tag_ids_str_4=StringAgg("tag__parent__parent__parent_id"), ).order_by("object_id") - return {row["object_id"]: row["num_tags"] + (row["num_implicit_tags"] or 0) for row in qs} + result = {} + for row in qs: + # ObjectTags for free text taxonomies will be included in "num_tags" count, but not "tag_ids_str_1" since + # they have no tag ID. We can compute how many free text tags each object has now: + if row["tag_ids_str_1"]: + num_free_text_tags = row["num_tags"] - len(row["tag_ids_str_1"].split(",")) + else: + num_free_text_tags = row["num_tags"] + # Then we count the total number of *unique* Tags for this object, both implicit and explicit: + other_tag_ids = set() + for field in ("tag_ids_str_1", "tag_ids_str_2", "tag_ids_str_3", "tag_ids_str_4"): + if row[field] is not None: + for tag_id in row[field].split(","): + other_tag_ids.add(int(tag_id)) + result[row["object_id"]] = num_free_text_tags + len(other_tag_ids) + return result else: qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id") return {row["object_id"]: row["num_tags"] for row in qs} diff --git a/openedx_tagging/core/tagging/models/utils.py b/openedx_tagging/core/tagging/models/utils.py index 1f7dcb920..c2e2201e8 100644 --- a/openedx_tagging/core/tagging/models/utils.py +++ b/openedx_tagging/core/tagging/models/utils.py @@ -1,7 +1,7 @@ """ Utilities for tagging and taxonomy models """ - +from django.db.models import Aggregate, CharField from django.db.models.expressions import Func @@ -22,3 +22,23 @@ def as_sqlite(self, compiler, connection, **extra_context): arg_joiner=" || ", **extra_context, ) + + +class StringAgg(Aggregate): # pylint: disable=abstract-method + """ + Aggregate function that collects the values of some column across all rows, + and creates a string by concatenating those values, with "," as a separator. + + This is the same as Django's django.contrib.postgres.aggregates.StringAgg, + but this version works with MySQL and SQLite. + """ + function = 'GROUP_CONCAT' + template = '%(function)s(%(distinct)s%(expressions)s)' + + def __init__(self, expression, distinct=False, **extra): + super().__init__( + expression, + distinct='DISTINCT ' if distinct else '', + output_field=CharField(), + **extra, + ) diff --git a/tests/openedx_tagging/core/tagging/test_api.py b/tests/openedx_tagging/core/tagging/test_api.py index 400073cd8..5befef5c9 100644 --- a/tests/openedx_tagging/core/tagging/test_api.py +++ b/tests/openedx_tagging/core/tagging/test_api.py @@ -711,20 +711,27 @@ def test_get_object_tag_counts_implicit(self) -> None: Note that: - "DPANN" is "Archaea > DPANN" (2 tags, 1 implicit), and - "Chordata" is "Eukaryota > Animalia > Chordata" (3 tags, 2 implicit) + - "Arthropoda" is "Eukaryota > Animalia > Arthropoda" (same) """ - obj1 = "object_id1" - obj2 = "object_id2" + self.taxonomy.allow_multiple = True + self.taxonomy.save() + obj1, obj2, obj3 = "object_id1", "object_id2", "object_id3" other = "other_object" # Give each object 1-2 tags: tagging_api.tag_object(object_id=obj1, taxonomy=self.taxonomy, tags=["DPANN"]) tagging_api.tag_object(object_id=obj2, taxonomy=self.taxonomy, tags=["Chordata"]) tagging_api.tag_object(object_id=obj2, taxonomy=self.free_text_taxonomy, tags=["has a notochord"]) + tagging_api.tag_object(object_id=obj3, taxonomy=self.taxonomy, tags=["Chordata", "Arthropoda"]) tagging_api.tag_object(object_id=other, taxonomy=self.free_text_taxonomy, tags=["other"]) assert tagging_api.get_object_tag_counts(obj1, count_implicit=True) == {obj1: 2} assert tagging_api.get_object_tag_counts(obj2, count_implicit=True) == {obj2: 4} assert tagging_api.get_object_tag_counts(f"{obj1},{obj2}", count_implicit=True) == {obj1: 2, obj2: 4} - assert tagging_api.get_object_tag_counts("object_*", count_implicit=True) == {obj1: 2, obj2: 4} + assert tagging_api.get_object_tag_counts("object_*", count_implicit=True) == { + obj1: 2, + obj2: 4, + obj3: 4, # obj3 has 2 explicit tags and 2 implicit tags (not 4 because the implicit tags are the same) + } assert tagging_api.get_object_tag_counts(other, count_implicit=True) == {other: 1} def test_get_object_tag_counts_deleted_disabled(self) -> None: