
feat: add initial version of learning analytics computation and dashboard illustrations #4211

Merged: 58 commits, Dec 23, 2024
Changes shown from 50 of 58 commits
1b628dd
chore: rename sessions to quizzes and start working on new navbar com…
rschlaefli Aug 26, 2024
70c203f
chore(apps/analytics): add test notebook illustrating basic computati…
rschlaefli Aug 26, 2024
ed7e745
enhance: add python logic to compute daily participant analytics base…
sjschlapbach Aug 26, 2024
a1736ff
feat: add scripts to compute periodic participant learning analytics …
sjschlapbach Aug 27, 2024
b1a1b88
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
rschlaefli Aug 27, 2024
f74966e
Merge branch 'v3' into v3-analytics
rschlaefli Aug 27, 2024
97e658e
Merge branch 'v3' into v3-analytics
rschlaefli Aug 28, 2024
9a3af74
Merge branch 'v3' into v3-analytics
rschlaefli Aug 28, 2024
01eee07
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
sjschlapbach Aug 30, 2024
fe6204a
Merge branch 'v3' into v3-analytics
sjschlapbach Aug 30, 2024
7b349e7
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
rschlaefli Sep 16, 2024
fb30f90
Merge branch 'v3' into v3-analytics
sjschlapbach Sep 27, 2024
1372aa6
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
sjschlapbach Nov 11, 2024
71aeaa5
chore: improve documentation and structure of analytics service
rschlaefli Nov 11, 2024
85ad9df
chore: fix test suite after combination of extended tests with new li…
sjschlapbach Nov 11, 2024
1f81e8c
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
rschlaefli Nov 12, 2024
61754cd
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
sjschlapbach Dec 3, 2024
4ca93c7
fix(apps/analytics): ensure that free text questions without sample s…
sjschlapbach Dec 3, 2024
76c2300
enhance(apps/analytics): add scripts for the computation of aggregate…
sjschlapbach Dec 4, 2024
e8f2791
fix(apps/analytics): ensure that correct response data is queried for…
sjschlapbach Dec 4, 2024
4b60b7e
enhance(apps/analytics): add logic for computation of participant cou…
sjschlapbach Dec 4, 2024
4e11fa1
enhance(apps/analytics): add logic for the computation of aggregated …
sjschlapbach Dec 4, 2024
5f402d7
chore(apps/analytics): add database tables for learning analytics pro…
sjschlapbach Dec 5, 2024
cb471c3
enhance(apps/analytics): add computation logic for participant course…
sjschlapbach Dec 5, 2024
a04be0a
enhance(apps/analytics): add computation logic for instance and activ…
sjschlapbach Dec 6, 2024
28d64f6
enhance(apps/analytics): add computation logic for activity progress …
sjschlapbach Dec 6, 2024
b8a134a
enhance: add first version of activity dashboard with daily and weekl…
sjschlapbach Dec 6, 2024
a799972
enhance(apps/analytics): add course data comparison for weekly studen…
sjschlapbach Dec 7, 2024
189cfed
enhance(apps/frontend-manage): add loading state for comparison cours…
sjschlapbach Dec 9, 2024
69cc23b
enhance(apps/analytics): add student activity to learning analytics a…
sjschlapbach Dec 12, 2024
1310b0c
enhance(apps/analytics): add illustrations for asynchronous activity …
sjschlapbach Dec 13, 2024
dd3014f
enhance(apps/analytics): add illustrations for performance rates on a…
sjschlapbach Dec 18, 2024
5630619
enhance(apps/analytics): add illustration of individual student perfo…
sjschlapbach Dec 19, 2024
990cd6f
enhance(apps/analytics): add overview of element feedbacks on activit…
sjschlapbach Dec 20, 2024
a1d190c
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
sjschlapbach Dec 20, 2024
4f6a3d0
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
rschlaefli Dec 20, 2024
c5f02ce
chore: add pnpm install --frozen-lockfile to pre-commit hook
rschlaefli Dec 20, 2024
09b14b1
style(apps/analytics): organize performance analytics in tab structur…
sjschlapbach Dec 20, 2024
1a7b2eb
deps: upgrade to prisma 6.1.0 (#4403)
rschlaefli Dec 20, 2024
6fa3bb6
enhance(apps/analytics): implement quiz analytics dashboard for async…
sjschlapbach Dec 22, 2024
4a6f88e
enhance(apps/analytics): allow switching between courses and activiti…
sjschlapbach Dec 22, 2024
2041e65
chore(apps/analytics): make sure that analytics pages do not break fo…
sjschlapbach Dec 22, 2024
c8e79fc
enhance(apps/frontend-manage): add links from and to learning analyti…
sjschlapbach Dec 22, 2024
1e721db
fix(apps/frontend-manage): ensure that activity evaluation sidebar re…
sjschlapbach Dec 22, 2024
9817d84
Merge branch 'v3' into v3-analytics
rschlaefli Dec 22, 2024
cbfd255
chore: remove duplicate prisma v6 migration
rschlaefli Dec 22, 2024
62f66b3
Merge branch 'v3' into v3-analytics
sjschlapbach Dec 22, 2024
82577c0
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
sjschlapbach Dec 22, 2024
21be1e8
Merge branch 'v3' into v3-analytics
sjschlapbach Dec 22, 2024
a47b11f
Merge branch 'v3' into v3-analytics
sjschlapbach Dec 22, 2024
00be5b1
Merge branch 'v3' into v3-analytics
sjschlapbach Dec 22, 2024
ebbb581
enhance(apps/analytics): add computation logic for participant activi…
sjschlapbach Dec 23, 2024
593cef6
chore(apps/analytics): create executable python scripts for initial l…
sjschlapbach Dec 23, 2024
08acc27
Merge branch 'v3' of https://github.com/uzh-bf/klicker-uzh into v3-an…
rschlaefli Dec 23, 2024
c4b22e0
fix(cypress): resolve translation issue in microlearning test workflo…
sjschlapbach Dec 23, 2024
30c5afd
chore: update app version in learning analytics package file
sjschlapbach Dec 23, 2024
645930c
chore: add shell scripts to execute all learning analytics initializa…
sjschlapbach Dec 23, 2024
f8c8cb1
enhance(apps/analytics): add illustration of participant activity per…
sjschlapbach Dec 23, 2024
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -22,4 +22,5 @@ packages/prisma/src/seed
.turbo

out/
!out/.gitkeep
.rollup.cache/
24 changes: 24 additions & 0 deletions apps/analytics/README.md
@@ -0,0 +1,24 @@
# KlickerUZH Analytics

This service computes learning analytics for KlickerUZH, providing insights into student learning patterns and performance metrics.

## Requirements

- Python 3.12.x (e.g., installed through `asdf`)
- Node.js 20.x.x
- Poetry

## Setup

- The project uses Poetry for dependency management and environment isolation. Make sure you have Poetry installed before proceeding. Then run `poetry install` in this folder to prepare the virtual environment.
- The project uses PNPM to simplify the execution of scripts and to provide a watch mode for execution. Make sure that you have executed `pnpm install` in the repository before trying to run the commands below.
- Make sure that all `.prisma` files are available in `prisma/`. If this is not the case, run the `util/sync-schema.sh` script first.
- Make sure that a valid Python environment (3.12) is used. If Poetry tries to use an environment that does not match the specification, the install command or script execution might fail. The Python binary to be used can be set explicitly using `poetry env use /Users/.../bin/python` (after which `poetry install` has to be run again). Tools like `asdf` allow clean management of multiple Python versions on a single machine.

## Available Commands

The following commands are available through PNPM:

- `pnpm generate` - Generate the Prisma client for database access in Python
- `pnpm main` - Run the analytics service
- `pnpm dev` - Start the service in watch mode for development
13 changes: 13 additions & 0 deletions apps/analytics/package.json
@@ -0,0 +1,13 @@
{
"name": "@klicker-uzh/analytics",
"version": "3.3.0-alpha.8",
"license": "AGPL-3.0",
"devDependencies": {
"nodemon": "~3.1.7"
},
"scripts": {
"dev": "doppler run --config dev -- nodemon --exec 'poetry run poe main' --watch src,prisma --ext py,prisma",
"generate": "poetry run poe generate",
"main": "doppler run --config dev -- poetry run poe main"
}
}
1,053 changes: 548 additions & 505 deletions apps/analytics/poetry.lock

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions apps/analytics/pyproject.toml
@@ -2,25 +2,31 @@
name = "@klicker-uzh/analytics"
version = "0.0.1"
description = ""
authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>"]
authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>", "Julius Schlapbach <julius.schlapbach@df.uzh.ch>"]
license = "AGPL-3.0"
readme = "README.md"
packages = [{include = "@klicker_uzh"}]
package-mode = false

[tool.poetry.dependencies]
python = "^3.12"
pandas = "2.2.2"
prisma = "0.14.0"
xlsxwriter = "^3.2.0"
prisma = "0.15.0"
xlsxwriter = "3.2.0"

[tool.poetry.dev-dependencies]
poethepoet = "0.27.0"
ipykernel = "6.29.5"

[tool.poetry.group.dev.dependencies]
pyright = "1.1.376"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poe.tasks]
generate = "prisma generate"
main = "doppler run --config dev -- python main.py"
main = "doppler run --config dev -- python -m src.main"

[tool.pyright]
typeCheckingMode = "strict"
2 changes: 2 additions & 0 deletions apps/analytics/src/__init__.py
@@ -0,0 +1,2 @@
from .modules import *
from .notebooks import *
File renamed without changes.
7 changes: 7 additions & 0 deletions apps/analytics/src/modules/__init__.py
@@ -0,0 +1,7 @@
from .participant_analytics import *
from .aggregated_analytics import *
from .participant_course_analytics import *
from .aggregated_course_analytics import *
from .participant_performance import *
from .instance_activity_performance import *
from .activity_progress import *
4 changes: 4 additions & 0 deletions apps/analytics/src/modules/activity_progress/__init__.py
@@ -0,0 +1,4 @@
from .get_course_progress_activities import get_course_progress_activities
from .compute_progress_counts import compute_progress_counts
from .save_practice_quiz_progress import save_practice_quiz_progress
from .save_microlearning_progress import save_microlearning_progress
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd


def compute_progress_counts(activity):
started_count = 0
completed_count = 0
repeated_count = 0

if len(activity["responses"]) != 0:
# count number of elements in activity stacks
num_elements = 0
for stack in activity["stacks"]:
num_elements += len(stack["elements"])

# group the activity responses by participant and count them
df_responses = pd.DataFrame(activity["responses"])
df_statistics = (
df_responses[["id", "trialsCount", "participantId"]]
.groupby("participantId")
.agg({"id": "count", "trialsCount": "min"})
.rename(columns={"id": "count", "trialsCount": "min_trials"})
)

# compute number of participants that have started the activity
started_count = len(df_statistics[df_statistics["count"] <= num_elements])

# compute number of participants that have completed the activity
completed_count = len(df_statistics[df_statistics["count"] == num_elements])

# count the number of participants that have repeated the activity (completed and min_trials >= 2)
repeated_count = len(
df_statistics[
(df_statistics["count"] == num_elements)
& (df_statistics["min_trials"] >= 2)
]
)

else:
print("No responses found for activity", activity["id"])

return started_count, completed_count, repeated_count
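The grouping step above can be exercised standalone with hypothetical response data (the `responses` rows and `num_elements` below are invented for this sketch; the pandas logic mirrors `compute_progress_counts`):

```python
import pandas as pd

# hypothetical activity with 2 elements in a single stack
num_elements = 2
responses = [
    {"id": "r1", "trialsCount": 2, "participantId": "p1"},
    {"id": "r2", "trialsCount": 3, "participantId": "p1"},
    {"id": "r3", "trialsCount": 1, "participantId": "p2"},
]

# group responses by participant: count responses, take the minimum trial count
df_statistics = (
    pd.DataFrame(responses)[["id", "trialsCount", "participantId"]]
    .groupby("participantId")
    .agg({"id": "count", "trialsCount": "min"})
    .rename(columns={"id": "count", "trialsCount": "min_trials"})
)

started = len(df_statistics[df_statistics["count"] <= num_elements])
completed = len(df_statistics[df_statistics["count"] == num_elements])
repeated = len(
    df_statistics[
        (df_statistics["count"] == num_elements)
        & (df_statistics["min_trials"] >= 2)
    ]
)
print(started, completed, repeated)  # 2 1 1
```

Here `p1` answered both elements with at least two trials each (started, completed, repeated), while `p2` answered only one element (started only).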
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
def get_course_progress_activities(db, course_id):
pqs = db.practicequiz.find_many(
where={"courseId": course_id},
include={"stacks": {"include": {"elements": True}}, "responses": True},
)
pqs = list(map(lambda x: x.dict(), pqs))

mls = db.microlearning.find_many(
where={"courseId": course_id},
include={"stacks": {"include": {"elements": True}}, "responses": True},
)
mls = list(map(lambda x: x.dict(), mls))

return pqs, mls
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
def save_microlearning_progress(
db,
course_participants,
started_count,
completed_count,
course_id,
ml_id,
):
values = {
"totalCourseParticipants": course_participants,
"startedCount": started_count,
"completedCount": completed_count,
}
creation_values = values.copy()
creation_values["course"] = {"connect": {"id": course_id}}
creation_values["microLearning"] = {"connect": {"id": ml_id}}

db.activityprogress.upsert(
where={"microLearningId": ml_id},
data={"create": creation_values, "update": values},
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
def save_practice_quiz_progress(
db,
course_participants,
started_count,
completed_count,
repeated_count,
course_id,
quiz_id,
):
values = {
"totalCourseParticipants": course_participants,
"startedCount": started_count,
"completedCount": completed_count,
"repeatedCount": repeated_count,
}
creation_values = values.copy()
creation_values["course"] = {"connect": {"id": course_id}}
creation_values["practiceQuiz"] = {"connect": {"id": quiz_id}}

db.activityprogress.upsert(
where={"practiceQuizId": quiz_id},
data={"create": creation_values, "update": values},
)
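The upsert payload built above can be sketched without a database connection (the counts and IDs are hypothetical). The point of the `copy()` is that the `create` payload carries the relation `connect` blocks while the `update` payload only touches the counters:

```python
# hypothetical counter values for one practice quiz
values = {
    "totalCourseParticipants": 120,
    "startedCount": 80,
    "completedCount": 45,
    "repeatedCount": 10,
}

# shallow copy so the relation connects are added only to the create payload
creation_values = values.copy()
creation_values["course"] = {"connect": {"id": "course-123"}}
creation_values["practiceQuiz"] = {"connect": {"id": "quiz-456"}}

payload = {"create": creation_values, "update": values}
print("course" in payload["create"], "course" in payload["update"])  # True False
```

An update must not try to re-connect relations that were fixed at creation time, which is why the two payloads diverge.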
4 changes: 4 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/__init__.py
@@ -0,0 +1,4 @@
from .compute_aggregated_analytics import compute_aggregated_analytics
from .load_participant_analytics import load_participant_analytics
from .aggregate_participant_analytics import aggregate_participant_analytics
from .save_aggregated_analytics import save_aggregated_analytics
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
def aggregate_participant_analytics(df_participant_analytics, verbose=False):
# if the dataframe is empty, return None
if df_participant_analytics.empty:
if verbose:
print("No participant analytics to aggregate")

return None

# aggregate all participant analytics for the specified time range, separated by course
df_aggregated_analytics = (
df_participant_analytics.groupby("courseId")
.agg(
{
"id": "count",
"responseCount": "sum",
"totalScore": "sum",
"totalPoints": "sum",
"totalXp": "sum",
}
)
.reset_index()
.rename(
columns={
"id": "participantCount",
}
)
)

return df_aggregated_analytics
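The aggregation above can be illustrated with a small synthetic frame (all values hypothetical): one row per participant analytics entry, grouped into one row per course, with the `id` count becoming `participantCount`:

```python
import pandas as pd

# hypothetical daily participant analytics rows for two courses
df = pd.DataFrame(
    [
        {"id": 1, "courseId": "A", "responseCount": 10, "totalScore": 50, "totalPoints": 40, "totalXp": 5},
        {"id": 2, "courseId": "A", "responseCount": 20, "totalScore": 70, "totalPoints": 60, "totalXp": 7},
        {"id": 3, "courseId": "B", "responseCount": 5, "totalScore": 30, "totalPoints": 20, "totalXp": 3},
    ]
)

df_aggregated = (
    df.groupby("courseId")
    .agg(
        {
            "id": "count",
            "responseCount": "sum",
            "totalScore": "sum",
            "totalPoints": "sum",
            "totalXp": "sum",
        }
    )
    .reset_index()
    .rename(columns={"id": "participantCount"})
)
print(df_aggregated)
```

Course A ends up with `participantCount` 2 and `responseCount` 30; course B with 1 and 5.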
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from .load_participant_analytics import load_participant_analytics
from .aggregate_participant_analytics import aggregate_participant_analytics
from .save_aggregated_analytics import save_aggregated_analytics


def compute_aggregated_analytics(
db, start_date, end_date, timestamp, analytics_type="DAILY", verbose=False
):
# load all participant analytics for the given timestamp and analytics time range
df_participant_analytics = load_participant_analytics(
db, timestamp, analytics_type, verbose
)

# aggregate all participant analytics values by course
df_aggregated_analytics = aggregate_participant_analytics(
df_participant_analytics, verbose
)

if df_aggregated_analytics is not None and verbose:
print("Aggregated analytics for time range: " + start_date + " to " + end_date)
print(df_aggregated_analytics.head())
elif df_aggregated_analytics is None:
print(
"No aggregated analytics to compute for time range: "
+ start_date
+ " to "
+ end_date
)

# store the computed aggregated analytics in the database
if df_aggregated_analytics is not None:
save_aggregated_analytics(
db, df_aggregated_analytics, timestamp, analytics_type
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas as pd


def convert_to_df(analytics):
# convert the database query result into a pandas dataframe
rows = []
for item in analytics:
rows.append(dict(item))

return pd.DataFrame(rows)


def load_participant_analytics(db, timestamp, analytics_type, verbose=False):
participant_analytics = db.participantanalytics.find_many(
where={"timestamp": timestamp, "type": analytics_type},
)

if verbose:
# print a summary and the first loaded record
print(
"Found {} analytics of type {} for timestamp {}".format(
len(participant_analytics), analytics_type, timestamp
)
)
if participant_analytics:
print(participant_analytics[0])

# convert the analytics to a dataframe
df_loaded_analytics = convert_to_df(participant_analytics)

return df_loaded_analytics
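`convert_to_df` only relies on each record being convertible via `dict(...)`, so it can be exercised with plain mappings (the records below are hypothetical; the function is re-stated in condensed form for a runnable sketch):

```python
import pandas as pd

def convert_to_df(analytics):
    # condensed re-statement: one dataframe row per record
    return pd.DataFrame([dict(item) for item in analytics])

records = [
    {"id": 1, "responseCount": 4, "type": "DAILY"},
    {"id": 2, "responseCount": 9, "type": "DAILY"},
]
df = convert_to_df(records)
print(df.shape)  # (2, 3)
```

The same pattern works for the Prisma model instances returned by `find_many`, since they support `dict(...)` conversion in the same way.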