refactor: update dataset extraction script based on new schema (#120)
jarvis8x7b authored Feb 12, 2025
1 parent 78e23dd commit deac196
Showing 8 changed files with 531 additions and 168 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -77,6 +77,9 @@ dojo-cli:
 extract-dataset:
 	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset
 
+fill-score-column:
+	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans fill-score-column
+
 migration:
 	docker compose --env-file .env.validator -f docker-compose.validator.yaml run --rm migration
 
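For reference, the new `fill-score-column` target simply wraps the one-off compose service added in `docker-compose.validator.yaml` below; a minimal invocation, assuming `.env.validator` is already configured, looks like:

```bash
# run the score backfill through the new Makefile target
make fill-score-column

# or call the underlying compose service directly
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans fill-score-column
```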
12 changes: 12 additions & 0 deletions README.md
@@ -52,6 +52,7 @@
 - [Auto-updater](#auto-updater)
 - [Dojo CLI](#dojo-cli)
 - [For Dojo developers](#for-dojo-developers)
+- [Dataset Extraction](#dataset-extraction)
 - [License](#license)
 
 </details>
@@ -575,6 +576,17 @@ make install-dev
 make install-test
 ```
 
+## Dataset Extraction
+
+The dataset is exported in multiple parts; `MAX_CHUNK_SIZE_MB` is currently set to 50 MB on the dataset service due to limitations on the load balancer. Use the following commands to combine all parts into a single dataset file:
+
+```bash
+aws s3 cp s3://amzn-s3-demo-bucket1/ <PATH_ON_LOCAL> --recursive --exclude "*" --include "hotkey_<vali_hotkey>_dataset_20250212*.jsonl"
+cd <PATH_ON_LOCAL>
+# merge all chunks into a single dataset file
+cat *.jsonl > hotkey_<vali_hotkey>_dataset_combined.jsonl
+```
+
 # License
 
 This repository is licensed under the MIT License.
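A quick sanity check for the merge step, assuming one JSON record per line in every chunk: total line counts before and after combining should match.

```bash
# wc prints a "total" row across all chunks...
wc -l hotkey_<vali_hotkey>_dataset_20250212*.jsonl
# ...which should equal the line count of the merged file
wc -l hotkey_<vali_hotkey>_dataset_combined.jsonl
```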
11 changes: 6 additions & 5 deletions commons/orm.py
@@ -273,7 +273,7 @@ async def update_miner_task_results(
                     )
                 else:
                     logger.warning(
-                        f"Retrying update, attempt {attempt+2}/{max_retries}"
+                        f"Retrying update, attempt {attempt + 2}/{max_retries}"
                     )
                     await asyncio.sleep(2**attempt)
 
@@ -282,7 +282,7 @@ async def update_miner_task_results(
                     logger.error(f"Error updating task results: {e}")
                 else:
                     logger.warning(
-                        f"Error during attempt {attempt+1}, retrying: {e}"
+                        f"Error during attempt {attempt + 1}, retrying: {e}"
                     )
                     await asyncio.sleep(2**attempt)
 
@@ -300,6 +300,7 @@ async def update_miner_raw_scores(
         max_retries: int = 20,
     ) -> tuple[bool, list[int]]:
         """Update the miner's provided raw scores for a list of miner responses.
+        NOTE: this is to be used when the task is first saved to validator's database.
 
         Args:
             miner_responses: List of TaskSynapseObject containing miner responses
@@ -403,18 +404,18 @@ async def update_miner_raw_scores(
                 )
 
                 logger.debug(
-                    f"Updating completion responses: updated batch {batch_id+1}/{num_batches}"
+                    f"Updating completion responses: updated batch {batch_id + 1}/{num_batches}"
                 )
                 break
             except Exception as e:
                 if attempt == max_retries - 1:
                     logger.error(
-                        f"Failed to update batch {batch_id+1}/{num_batches} after {max_retries} attempts: {e}"
+                        f"Failed to update batch {batch_id + 1}/{num_batches} after {max_retries} attempts: {e}"
                     )
                     failed_batch_indices.extend(range(start_idx, end_idx))
                 else:
                     logger.warning(
-                        f"Retrying batch {batch_id+1}/{num_batches}, attempt {attempt+2}/{max_retries}"
+                        f"Retrying batch {batch_id + 1}/{num_batches}, attempt {attempt + 2}/{max_retries}"
                     )
                     await asyncio.sleep(2**attempt)
 
16 changes: 16 additions & 0 deletions docker-compose.validator.yaml
@@ -35,6 +35,7 @@ services:
     logging: *default-logging
 
   synthetic-api:
+    container_name: synthetic-api
     image: ghcr.io/tensorplex-labs/dojo-synthetic-api:main
     env_file:
       - .env.validator
@@ -201,3 +202,18 @@ services:
       - prisma-pip-cache:/root/.cache/pip
       - $HOME/.bittensor:/root/.bittensor
     logging: *default-logging
+
+  fill-score-column:
+    container_name: fill-score-column
+    image: ghcr.io/tensorplex-labs/dojo:main
+    env_file:
+      - .env.validator
+    command: ["fill-score-column"]
+    networks:
+      - dojo-validator
+    volumes:
+      - ./:/app
+      - ./.env.validator:/app/.env
+      - prisma-binary:/root/prisma-python
+      - $HOME/.bittensor:/root/.bittensor
+    logging: *default-logging
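To check that the new service definition resolves as intended against `.env.validator`, compose's `config` subcommand can render it (a sketch; output shape depends on the compose version):

```bash
# print the fully resolved definition of the fill-score-column service
docker compose -f docker-compose.validator.yaml --env-file .env.validator config fill-score-column
```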
7 changes: 7 additions & 0 deletions docker/Dockerfile.dataset
@@ -2,6 +2,11 @@ FROM python:3.11-slim-bookworm
 
 WORKDIR /app
 
+# Prisma-specific environment variables
+ENV PRISMA_USE_NODEJS_BIN=true
+ENV PRISMA_BINARY_PLATFORM=debian-openssl-3.0.x
+ENV PRISMA_BINARY_CACHE_DIR=/root/prisma-python
+
 ENV PATH="/root/.cargo/bin/:$PATH"
 ENV UV_SYSTEM_PYTHON=true
 ENV NVM_DIR=/root/.nvm
@@ -15,6 +20,8 @@ RUN apt-get update \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+RUN mkdir -p /root/prisma-python
+
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
 COPY . .
 
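A sketch of building this image locally from the repo root; the `dojo-dataset` tag is an arbitrary name chosen here, not one used by the repository:

```bash
# build the dataset-extraction image defined above
docker build -f docker/Dockerfile.dataset -t dojo-dataset .
```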
6 changes: 6 additions & 0 deletions entrypoints.sh
@@ -116,3 +116,9 @@ if [ "$1" = 'validate-migration' ]; then
     echo "Starting migration validation..."
     python scripts/validate_migration.py
 fi
+
+if [ "$1" = 'fill-score-column' ]; then
+    echo "Environment variables:"
+    echo "DATABASE_URL: ${DATABASE_URL}"
+    python scripts/fill_score_column.py --logging.info
+fi
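The branch above fires when the container's first argument is `fill-score-column`, which is exactly what the compose service's `command` supplies; run manually, a minimal equivalent (a sketch, assuming the published image and a reachable database) would be:

```bash
docker run --rm --env-file .env.validator ghcr.io/tensorplex-labs/dojo:main fill-score-column
```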
(2 additional changed files were not loaded in this view.)
