Skip to content

Commit

Permalink
Merge pull request #122 from tensorplex-labs/dev
Browse files Browse the repository at this point in the history
chore(release): v1.6.2
  • Loading branch information
codebender37 authored Feb 13, 2025
2 parents e8bfcf8 + deac196 commit c8b50e5
Show file tree
Hide file tree
Showing 22 changed files with 959 additions and 544 deletions.
10 changes: 6 additions & 4 deletions .env.validator.example
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,6 @@ SUBTENSOR_ENDPOINT=wss://entrypoint-finney.opentensor.ai:443
# NETUID=98
# SUBTENSOR_NETWORK=test
# SUBTENSOR_ENDPOINT=ws://testnet-lite:9944
# WANDB_PROJECT_NAME=dojo-testnet

WANDB_API_KEY=
WANDB_PROJECT_NAME=dojo-mainnet

# for dojo-synthetic-api
OPENROUTER_API_KEY=
Expand All @@ -43,3 +39,9 @@ DB_NAME=db
DB_USERNAME=
DB_PASSWORD=
DATABASE_URL=postgresql://${DB_USERNAME}:${DB_PASSWORD}@${DB_HOST}/${DB_NAME}


#dojo loki
DOJO_LOKI_URL=<GET_FROM_TPLX_TEAM>
# hotkey for loki external label
VALIDATOR_HOTKEY=
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ dojo-cli:
extract-dataset:
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset

fill-score-column:
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans fill-score-column

migration:
docker compose --env-file .env.validator -f docker-compose.validator.yaml run --rm migration

Expand Down
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
- [Auto-updater](#auto-updater)
- [Dojo CLI](#dojo-cli)
- [For Dojo developers](#for-dojo-developers)
- [Dataset Extraction](#dataset-extraction)
- [License](#license)

</details>
Expand Down Expand Up @@ -102,7 +103,6 @@ By creating an open platform for gathering human-generated datasets, Tensorplex
- docker
- GNU make
- openrouter api key
- wandb api key

### System Requirements

Expand Down Expand Up @@ -203,6 +203,9 @@ For Docker Compose installation, see https://docs.docker.com/compose/install/lin
# verify both docker and docker compose are installed
docker --version
docker compose version

# validators: install the Docker Loki logging plugin
docker plugin install grafana/loki-docker-driver:3.3.2-amd64 --alias loki --grant-all-permissions
```

4. Start local subtensor node (**optional**)
Expand Down Expand Up @@ -420,8 +423,6 @@ WALLET_COLDKEY=# the name of the coldkey
WALLET_HOTKEY=# the name of the hotkey
DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai

# head to https://wandb.ai/authorize to get your API key
WANDB_API_KEY="<wandb_key>"

# for dojo-synthetic-api
OPENROUTER_API_KEY="sk-or-v1-<KEY>"
Expand All @@ -438,6 +439,10 @@ DB_NAME=db
DB_USERNAME=#set a non-default username
DB_PASSWORD=#generate and set a secure password
DATABASE_URL=postgresql://${DB_USERNAME}:${DB_PASSWORD}@${DB_HOST}/${DB_NAME}

# dojo loki
DOJO_LOKI_URL=# get from TPLX TEAM
VALIDATOR_HOTKEY=# your running validator hotkey address
```

> **Note:** To ensure your validator runs smoothly, enable the auto top-up feature for Openrouter; this ensures that your validator will not fail to call the synthetic API during task generation. The estimated cost of generating a task is approximately $0.20 USD.
Expand Down Expand Up @@ -571,6 +576,17 @@ make install-dev
make install-test
```

## Dataset Extraction

The dataset is exported in multiple parts: `MAX_CHUNK_SIZE_MB` is currently set to 50MB on the dataset service, due to limitations on the load balancer. Use the following commands to combine all parts into a single dataset file:

```bash
aws s3 cp s3://amzn-s3-demo-bucket1/ <PATH_ON_LOCAL> --recursive --exclude "*" --include "hotkey_<vali_hotkey>_dataset_20250212*.jsonl"
cd <PATH_ON_LOCAL>
# to merge all chunks into a single dataset file
cat *.jsonl > hotkey_<vali_hotkey>_dataset_combined.jsonl
```

# License

This repository is licensed under the MIT License.
Expand Down
77 changes: 0 additions & 77 deletions commons/logging/wandb.py

This file was deleted.

11 changes: 6 additions & 5 deletions commons/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ async def update_miner_task_results(
)
else:
logger.warning(
f"Retrying update, attempt {attempt+2}/{max_retries}"
f"Retrying update, attempt {attempt + 2}/{max_retries}"
)
await asyncio.sleep(2**attempt)

Expand All @@ -282,7 +282,7 @@ async def update_miner_task_results(
logger.error(f"Error updating task results: {e}")
else:
logger.warning(
f"Error during attempt {attempt+1}, retrying: {e}"
f"Error during attempt {attempt + 1}, retrying: {e}"
)
await asyncio.sleep(2**attempt)

Expand All @@ -300,6 +300,7 @@ async def update_miner_raw_scores(
max_retries: int = 20,
) -> tuple[bool, list[int]]:
"""Update the miner's provided raw scores for a list of miner responses.
NOTE: this is to be used when the task is first saved to validator's database.
Args:
miner_responses: List of TaskSynapseObject containing miner responses
Expand Down Expand Up @@ -403,18 +404,18 @@ async def update_miner_raw_scores(
)

logger.debug(
f"Updating completion responses: updated batch {batch_id+1}/{num_batches}"
f"Updating completion responses: updated batch {batch_id + 1}/{num_batches}"
)
break
except Exception as e:
if attempt == max_retries - 1:
logger.error(
f"Failed to update batch {batch_id+1}/{num_batches} after {max_retries} attempts: {e}"
f"Failed to update batch {batch_id + 1}/{num_batches} after {max_retries} attempts: {e}"
)
failed_batch_indices.extend(range(start_idx, end_idx))
else:
logger.warning(
f"Retrying batch {batch_id+1}/{num_batches}, attempt {attempt+2}/{max_retries}"
f"Retrying batch {batch_id + 1}/{num_batches}, attempt {attempt + 2}/{max_retries}"
)
await asyncio.sleep(2**attempt)

Expand Down
9 changes: 9 additions & 0 deletions database/mappers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json

import bittensor as bt
from pydantic import BaseModel

from commons.utils import datetime_to_iso8601_str, iso8601_str_to_datetime
from database.prisma import Json
Expand All @@ -13,6 +14,7 @@
MinerResponseCreateInput,
ValidatorTaskCreateInput,
)
from dojo import get_commit_hash, get_latest_git_tag
from dojo.protocol import (
CompletionResponse,
CriteriaType,
Expand All @@ -21,6 +23,11 @@
)


class Metadata(BaseModel):
git_tag: str
commit_hash: str


# ---------------------------------------------------------------------------- #
# MAP PROTOCOL OBJECTS TO DATABASE MODEL INPUTS #
# ---------------------------------------------------------------------------- #
Expand Down Expand Up @@ -49,6 +56,7 @@ def map_task_synapse_object_to_validator_task(
if synapse.ground_truth
else []
)
metadata = Metadata(git_tag=get_latest_git_tag(), commit_hash=get_commit_hash())

return ValidatorTaskCreateInput(
id=synapse.task_id,
Expand All @@ -59,6 +67,7 @@ def map_task_synapse_object_to_validator_task(
is_processed=False,
miner_responses={"create": []},
ground_truth={"create": ground_truths},
metadata=Json(json.dumps(metadata.model_dump())),
)


Expand Down
23 changes: 22 additions & 1 deletion docker-compose.validator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ services:
logging: *default-logging

synthetic-api:
container_name: synthetic-api
image: ghcr.io/tensorplex-labs/dojo-synthetic-api:main
env_file:
- .env.validator
Expand Down Expand Up @@ -134,7 +135,12 @@ services:
condition: service_healthy
prisma-setup-vali:
condition: service_completed_successfully
logging: *default-logging
logging:
driver: loki
options:
mode: non-blocking
loki-url: "https://${DOJO_LOKI_URL}"
loki-external-labels: "validator=${VALIDATOR_HOTKEY}"

dataset-service:
container_name: dataset-service
Expand Down Expand Up @@ -196,3 +202,18 @@ services:
- prisma-pip-cache:/root/.cache/pip
- $HOME/.bittensor:/root/.bittensor
logging: *default-logging

fill-score-column:
container_name: fill-score-column
image: ghcr.io/tensorplex-labs/dojo:main
env_file:
- .env.validator
command: ["fill-score-column"]
networks:
- dojo-validator
volumes:
- ./:/app
- ./.env.validator:/app/.env
- prisma-binary:/root/prisma-python
- $HOME/.bittensor:/root/.bittensor
logging: *default-logging
7 changes: 7 additions & 0 deletions docker/Dockerfile.dataset
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ FROM python:3.11-slim-bookworm

WORKDIR /app

# Prisma-specific environment variables
ENV PRISMA_USE_NODEJS_BIN=true
ENV PRISMA_BINARY_PLATFORM=debian-openssl-3.0.x
ENV PRISMA_BINARY_CACHE_DIR=/root/prisma-python

ENV PATH="/root/.cargo/bin/:$PATH"
ENV UV_SYSTEM_PYTHON=true
ENV NVM_DIR=/root/.nvm
Expand All @@ -15,6 +20,8 @@ RUN apt-get update \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

RUN mkdir -p /root/prisma-python

COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY . .

Expand Down
14 changes: 14 additions & 0 deletions dojo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@ def get_latest_git_tag():
raise RuntimeError("Failed to get latest Git tag")


def get_commit_hash() -> str:
    """Return the full SHA of the current git HEAD.

    Returns:
        str: the commit hash reported by ``git rev-parse HEAD``,
        with surrounding whitespace stripped.

    Raises:
        RuntimeError: if the ``git`` invocation fails (e.g. not run inside
            a git repository); chained from the underlying
            ``subprocess.CalledProcessError``.
    """
    try:
        # text=True decodes the output for us; strip the trailing newline.
        return subprocess.check_output(
            ["git", "rev-parse", "HEAD"], text=True
        ).strip()
    except subprocess.CalledProcessError as e:
        print(f"Error getting the latest Git commit hash: {e}")
        # Chain the original failure so callers see the real cause.
        raise RuntimeError("Failed to get latest Git commit hash") from e


# Define the version of the template module.
__version__ = get_latest_git_tag()
version_split = __version__.split(".")
Expand Down
8 changes: 7 additions & 1 deletion dojo/mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ def __init__(self, netuid, n=16, wallet=None, network="mock"):
self.create_subnet(netuid)


class MockTerminalInfo(bt.TerminalInfo):
def __init__(self, hotkey):
super().__init__()
self.hotkey = hotkey


class MockMetagraph(bt.metagraph):
def __init__(self, netuid=1, network="mock", subtensor=None):
super().__init__(netuid=netuid, network=network, sync=False)
Expand Down Expand Up @@ -50,7 +56,7 @@ def hotkeys(self, value):
# self.total_stake = np.array(stakes, dtype=np.float32)


class MockDendrite(bt.dendrite):
class MockDendrite(bt.Dendrite):
"""
Replaces a real bittensor network request with a mock request that just returns some static response for all axons that are passed and adds some random delay.
"""
Expand Down
8 changes: 0 additions & 8 deletions dojo/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,6 @@ def add_args(parser):
default=0.3,
)

wandb_project_names = ["dojo-devnet", "dojo-testnet", "dojo-mainnet"]
parser.add_argument(
"--wandb.project_name",
type=str,
choices=wandb_project_names,
help="Name of the wandb project to use.",
)

elif neuron_type == "miner":
pass

Expand Down
Loading

0 comments on commit c8b50e5

Please sign in to comment.