Merge branch 'dev' into refactor/redesign-schema

karootplx committed Dec 9, 2024
2 parents 68eeb48 + 0d8b93d commit 48caf6b
Showing 23 changed files with 1,340 additions and 33 deletions.
1 change: 1 addition & 0 deletions .env.miner.example
@@ -17,6 +17,7 @@ SUBTENSOR_ENDPOINT=wss://entrypoint-finney.opentensor.ai:443
# NETUID=98
# SUBTENSOR_NETWORK=test
# SUBTENSOR_ENDPOINT=ws://testnet-lite:9944
# VALIDATOR_MIN_STAKE=20000

# Task related config
# this means a maximum of 4 workers may submit responses for a single task
1 change: 1 addition & 0 deletions .env.validator.example
@@ -1,5 +1,6 @@
WALLET_COLDKEY=
WALLET_HOTKEY=
DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai

# Mainnet related config
NETUID=52
3 changes: 2 additions & 1 deletion .github/workflows/docker_build.yaml
@@ -6,6 +6,7 @@ on:
- dev
- staging
- main
- simulator

jobs:
docker_publish:
@@ -28,7 +29,7 @@ jobs:
echo "BRANCH_NAME=$SANITIZED_BRANCH_NAME" >> $GITHUB_ENV
- name: Build and Push Docker Image with Branch Tag
if: github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main'
if: github.ref == 'refs/heads/dev' || github.ref == 'refs/heads/staging' || github.ref == 'refs/heads/main' || github.ref == 'refs/heads/simulator'
uses: macbre/push-to-ghcr@master
with:
image_name: ${{ github.repository }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -188,3 +188,6 @@ testing/

# prisma
database/prisma/

# scores data
scores/*.pt
24 changes: 24 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,27 @@
## [1.4.2](https://github.com/tensorplex-labs/dojo/compare/v1.4.1...v1.4.2) (2024-11-22)

### Performance Improvements

* add retries for dojo api calls ([461214a](https://github.com/tensorplex-labs/dojo/commit/461214aef4b51f42574d920effeaa7cbb58f5a92))
* extend delay ([97a9b04](https://github.com/tensorplex-labs/dojo/commit/97a9b0417106c4e3cfc11501c9408f15574daf23))
* extend delay ([69e1974](https://github.com/tensorplex-labs/dojo/commit/69e19742f23fd1a2ac81f197f4e2f07fe9018049))
* removed unused completions in miner's response to optimize network traffic ([e76ce98](https://github.com/tensorplex-labs/dojo/commit/e76ce98dd2c6e7ef3f52b70225cd0f0e07abf996))

## [1.4.1](https://github.com/tensorplex-labs/dojo/compare/v1.4.0...v1.4.1) (2024-11-07)

### Bug Fixes

* add assert check ([f849130](https://github.com/tensorplex-labs/dojo/commit/f849130670abdc8614c302dbb73a1af053aa00ed))
* add unit test for reward_cubic ([7a8483a](https://github.com/tensorplex-labs/dojo/commit/7a8483ae11569287fdb2474615781c3fa458de35))
* change nans to -1 instead of 0 ([f15815c](https://github.com/tensorplex-labs/dojo/commit/f15815c9613935857cc7c649cb3dcb8d185c6bd9))
* potential scoring issue ([335b40b](https://github.com/tensorplex-labs/dojo/commit/335b40b2493a67fdc19aa5366daac9f0e0031c01))
* remove error log ([e0b1a85](https://github.com/tensorplex-labs/dojo/commit/e0b1a8570457765a395d995b94d936c0e37f3618))
* scoring shapes ([dc15bb9](https://github.com/tensorplex-labs/dojo/commit/dc15bb93cd9b61bb1219c4766fbc8938e110c1b3))

### Performance Improvements

* refactor logging, and clean synapase_history ([8e36b5c](https://github.com/tensorplex-labs/dojo/commit/8e36b5c6694909506a04f002389fc106b6a4f87d))

## [1.4.0](https://github.com/tensorplex-labs/dojo/compare/v1.3.3...v1.4.0) (2024-10-31)

### Features
3 changes: 3 additions & 0 deletions Makefile
@@ -74,6 +74,9 @@ miner-worker-api:
dojo-cli:
docker compose --env-file .env.miner -f docker-compose.miner.yaml run --rm dojo-cli

extract-dataset:
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset

# ---------------------------------------------------------------------------- #
# CORE SERVICE LOGGING #
# ---------------------------------------------------------------------------- #
12 changes: 12 additions & 0 deletions README.md
@@ -48,6 +48,7 @@
- [Option 2: Decentralised Method](#option-2-decentralised-method)
- [Setup Subscription Key for Labellers on UI to connect to Dojo Subnet for scoring](#setup-subscription-key-for-labellers-on-ui-to-connect-to-dojo-subnet-for-scoring)
- [Validating](#validating)
- [Data Collection](#data-collection)
- [Auto-updater](#auto-updater)
- [Dojo CLI](#dojo-cli)
- [For Dojo developers](#for-dojo-developerss)
@@ -278,6 +279,7 @@ DOJO_API_KEY= # blank for now
WALLET_COLDKEY=# the name of the coldkey
WALLET_HOTKEY=# the name of the hotkey
AXON_PORT=8888 # port to serve requests over the public network for validators to call
VALIDATOR_MIN_STAKE=20000 # minimum stake required for validators; defaults to 20000 TAO (set a lower value to bypass the blacklist function on testnet)
# Task related config
TASK_MAX_RESULT=4 # this means that each miner can have up to 4 workers fill in responses
```
@@ -416,6 +418,7 @@ cp .env.validator.example .env.validator

WALLET_COLDKEY=# the name of the coldkey
WALLET_HOTKEY=# the name of the hotkey
DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai

# head to https://wandb.ai/authorize to get your API key
WANDB_API_KEY="<wandb_key>"
@@ -448,6 +451,15 @@ make validator

To start with autoupdate for validators (**strongly recommended**), see the [Auto-updater](#auto-updater) section.

## Data Collection

To export all data that has been collected by the validator, ensure that the environment variables are set up properly as in [validator-setup](#validating), then run the following:

```bash
make validator-pull
make extract-dataset
```

# Auto-updater

> [!WARNING]
24 changes: 18 additions & 6 deletions commons/objects.py
@@ -9,18 +9,30 @@ class ObjectManager:

@classmethod
def get_miner(cls):
from neurons.miner import Miner
if get_config().simulation:
from simulator.miner import MinerSim

if cls._miner is None:
cls._miner = Miner()
if cls._miner is None:
cls._miner = MinerSim()
else:
from neurons.miner import Miner

if cls._miner is None:
cls._miner = Miner()
return cls._miner

@classmethod
def get_validator(cls):
from neurons.validator import Validator
if get_config().simulation:
from simulator.validator import ValidatorSim

if cls._validator is None:
cls._validator = ValidatorSim()
else:
from neurons.validator import Validator

if cls._validator is None:
cls._validator = Validator()
if cls._validator is None:
cls._validator = Validator()
return cls._validator

@classmethod
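The refactored `ObjectManager` methods read `get_config().simulation` before importing, so the simulator package is only loaded in simulation mode, while the instance is still created lazily exactly once. A minimal self-contained sketch of the same config-gated singleton pattern (the class names below are illustrative stand-ins, not the repo's actual imports):

```python
class RealService:
    """Stand-in for the real Miner/Validator."""
    name = "real"


class SimService:
    """Stand-in for MinerSim/ValidatorSim."""
    name = "sim"


class LazyFactory:
    """Build either the real or simulated service once, then reuse it,
    mirroring ObjectManager's config-gated lazy singleton."""

    _instance = None

    @classmethod
    def get(cls, simulation: bool):
        if cls._instance is None:
            # In the repo, the import itself is also deferred into this
            # branch so the simulator package loads only when needed.
            cls._instance = SimService() if simulation else RealService()
        return cls._instance


first = LazyFactory.get(simulation=True)
second = LazyFactory.get(simulation=False)  # flag ignored: already built
print(first.name, first is second)  # sim True
```

Note that, as in the diff, the flag only matters on first construction; later calls return the cached instance regardless of the argument.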
89 changes: 89 additions & 0 deletions commons/score_storage.py
@@ -0,0 +1,89 @@
import json
from pathlib import Path

import torch
from bittensor.btlogging import logging as logger

from database.client import connect_db, disconnect_db
from database.prisma.models import Score_Model


class ScoreStorage:
    """Handles persistence of validator scores"""

    SCORES_DIR = Path("scores")
    SCORES_FILE = SCORES_DIR / "miner_scores.pt"

    @classmethod
    async def migrate_from_db(cls) -> bool:
        """One-time migration of scores from database to .pt file

        Returns:
            bool: True if migration successful or file already exists, False if migration failed
        """
        try:
            if cls.SCORES_FILE.exists():
                logger.info("Scores file already exists, skipping migration")
                return True

            # Connect to database first
            await connect_db()

            try:
                # Get scores from database
                score_record = await Score_Model.prisma().find_first(
                    order={"created_at": "desc"}
                )
                if not score_record:
                    logger.warning("No scores found in database to migrate")
                    return True  # Not an error, just no scores yet

                scores = torch.tensor(json.loads(score_record.score))

                # Create scores directory if it doesn't exist
                cls.SCORES_DIR.mkdir(exist_ok=True)

                # Save scores to .pt file
                torch.save(scores, cls.SCORES_FILE)
                logger.success(f"Successfully migrated scores to {cls.SCORES_FILE}")

                # Verify the migration
                loaded_scores = torch.load(cls.SCORES_FILE)
                if torch.equal(scores, loaded_scores):
                    logger.success("Migration verification successful - scores match")
                    return True
                else:
                    logger.error("Migration verification failed - scores do not match")
                    return False

            finally:
                await disconnect_db()

        except Exception as e:
            logger.error(f"Failed to migrate scores: {e}")
            return False

    @classmethod
    async def save(cls, scores: torch.Tensor) -> None:
        """Save validator scores to .pt file"""
        try:
            cls.SCORES_DIR.mkdir(exist_ok=True)
            torch.save(scores, cls.SCORES_FILE)
            logger.success("Successfully saved validator scores to file")
        except Exception as e:
            logger.error(f"Failed to save validator scores: {e}")
            raise

    @classmethod
    async def load(cls) -> torch.Tensor | None:
        """Load validator scores from .pt file"""
        try:
            if not cls.SCORES_FILE.exists():
                logger.warning("No validator scores file found")
                return None

            scores = torch.load(cls.SCORES_FILE)
            logger.success("Successfully loaded validator scores from file")
            return scores
        except Exception as e:
            logger.error(f"Failed to load validator scores: {e}")
            return None
19 changes: 13 additions & 6 deletions commons/scoring.py
@@ -79,8 +79,11 @@ def _reward_cubic(
# shape: (num_miners,)
# number range [-1, 1]
x = F.cosine_similarity(
torch.from_numpy(miner_outputs), torch.from_numpy(ground_truth), dim=1
torch.from_numpy(miner_outputs.copy()),
torch.from_numpy(ground_truth.copy()),
dim=1,
).numpy()

# Convert nans to -1 to send it to the bottom
x = np.where(np.isnan(x), -1, x)
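The `.copy()` calls added here matter because `torch.from_numpy` cannot wrap NumPy views with negative strides, which is exactly what reversing an array with `[::-1]` (as `ground_truth_score_V1` does further down) produces. A small demonstration:

```python
import numpy as np
import torch

arr = np.array([0.0, 0.33, 0.67, 1.0])
view = arr[::-1]  # reversal creates a negative-stride view, no data copied

try:
    torch.from_numpy(view)  # raises: negative strides are unsupported
    converted_directly = True
except ValueError:
    converted_directly = False

t = torch.from_numpy(view.copy())  # a contiguous copy converts fine
print(converted_directly, t.tolist())  # False [1.0, 0.67, 0.33, 0.0]
```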

@@ -411,13 +414,17 @@ def ground_truth_score_V1(
)

miner_outputs = miner_outputs_normalised
logger.debug(f"scoring: raw miner outputs with nans\n{miner_outputs}")

# use minmax scale to ensure ground truth is in the range [0, 1]
ground_truth_arr = minmax_scale(
np.array([rank for _, rank in cid_with_rank_sorted])
).numpy()
logger.debug(f"scoring: ground truth\n{ground_truth_arr}")

# reverse order here, because the lowest rank is the best
# e.g. ranks: ('cid1', 0), ('cid2', 1), ('cid3', 2), ('cid4', 3)
# after minmax scale: [0, 0.33, 0.667, 1]
# but we want the reverse, so: [1, 0.667, 0.33, 0], since cid1 is the best
ground_truth_arr = ground_truth_arr[::-1]

logger.info(f"scoring: Miner outputs\n{miner_outputs}")
logger.info(f"scoring: Ground truth\n{ground_truth_arr}")
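The comments added in this hunk explain why the scaled ranks are reversed: after min-max scaling, the best rank (0) sits at the bottom of the range, so the array is flipped to give it the highest score. The same arithmetic as a worked sketch, with a hand-rolled min-max scale standing in for the repo's helper:

```python
import numpy as np

# rank 0 is the best completion, as in cid_with_rank_sorted
ranks = np.array([0, 1, 2, 3], dtype=float)

# min-max scale into [0, 1]; the best rank now maps to 0
scaled = (ranks - ranks.min()) / (ranks.max() - ranks.min())

# flip so the best rank gets the highest ground-truth score instead
ground_truth = scaled[::-1]
print(np.round(ground_truth, 3).tolist())  # [1.0, 0.667, 0.333, 0.0]
```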
@@ -449,7 +456,7 @@ def ground_truth_score_V1(
logger.debug(f"scoring: error calculating segment sums: {e}")
pass

return torch.from_numpy(cubic_reward)
return torch.from_numpy(cubic_reward.copy())

@staticmethod
def cmp_ground_truth(
@@ -634,8 +641,8 @@ def calculate_score(
ground_truth = gt_score[i]

# NOTE: just use ground truth for now
hotkey_to_final_score[r.axon.hotkey] = ground_truth / len(
criteria_types
hotkey_to_final_score[r.axon.hotkey] = float(
ground_truth / len(criteria_types)
)

criteria_to_miner_scores[criteria.type] = Score(
23 changes: 23 additions & 0 deletions docker-compose.validator.yaml
@@ -131,3 +131,26 @@ services:
prisma-setup-vali:
condition: service_completed_successfully
logging: *default-logging

dataset-service:
image: ghcr.io/tensorplex-labs/dojo:dataset
env_file:
- .env.validator
ports:
- "127.0.0.1:9999:9999"
command: ["dataset-service"]
logging: *default-logging

extract-dataset:
image: ghcr.io/tensorplex-labs/dojo:dataset
env_file:
- .env.validator
command: ["extract-dataset"]
networks:
- dojo-validator
volumes:
- ./:/app
- ./.env.validator:/app/.env
- prisma-binary:/root/prisma-python
- $HOME/.bittensor:/root/.bittensor
logging: *default-logging
35 changes: 35 additions & 0 deletions docker/Dockerfile.dataset
@@ -0,0 +1,35 @@
FROM python:3.11-slim-bookworm

WORKDIR /app

ENV PATH="/root/.cargo/bin/:$PATH"
ENV UV_SYSTEM_PYTHON=true
ENV NVM_DIR=/root/.nvm
ENV NODE_VERSION=v20.11.1
ENV NODE_PATH=$NVM_DIR/versions/node/$NODE_VERSION/lib/node_modules
ENV PATH=$NVM_DIR/versions/node/$NODE_VERSION/bin:$PATH

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential curl git ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY . .

ARG TARGETPLATFORM

RUN echo "Building for TARGETPLATFORM: $TARGETPLATFORM"

RUN git config --global --add safe.directory /app

# jank because pytorch has different versions for cpu for darwin VS linux, see pyproject.toml for specifics
# RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
# uv pip install --no-cache -e .[dataset] --find-links https://download.pytorch.org/whl/torch_stable.html; \
# else \
# uv pip install --no-cache -e .[dataset]; \
# fi
RUN uv pip install --no-cache -e ".[dataset]" --find-links https://download.pytorch.org/whl/torch_stable.html;

ENTRYPOINT ["./entrypoints.sh"]
4 changes: 2 additions & 2 deletions dojo/__init__.py
@@ -30,7 +30,7 @@ def get_latest_git_tag():
)


VALIDATOR_MIN_STAKE = 20000
VALIDATOR_MIN_STAKE = int(os.getenv("VALIDATOR_MIN_STAKE", "20000"))
TASK_DEADLINE = 6 * 60 * 60

# Define the time intervals for various tasks.
@@ -44,7 +44,7 @@

if get_config().fast_mode:
print("Running in fast mode for testing purposes...")
VALIDATOR_MIN_STAKE = 20000
VALIDATOR_MIN_STAKE = int(os.getenv("VALIDATOR_MIN_STAKE", "20000"))
TASK_DEADLINE = 180
VALIDATOR_RUN = 60
VALIDATOR_HEARTBEAT = 15
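`VALIDATOR_MIN_STAKE` is now read from the environment with 20000 as the fallback, so testnet operators can lower it via `.env` files without editing code. The pattern in isolation:

```python
import os

# With the variable set (e.g. via .env.miner on testnet), the override wins
os.environ["VALIDATOR_MIN_STAKE"] = "1000"
min_stake = int(os.getenv("VALIDATOR_MIN_STAKE", "20000"))
print(min_stake)  # 1000

# Without it, the default of 20000 TAO applies
del os.environ["VALIDATOR_MIN_STAKE"]
min_stake = int(os.getenv("VALIDATOR_MIN_STAKE", "20000"))
print(min_stake)  # 20000
```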
12 changes: 12 additions & 0 deletions dojo/utils/config.py
@@ -178,6 +178,18 @@ def add_args(parser):
help="Whether to run in fast mode, for developers to test locally.",
)

    parser.add_argument(
        "--simulation",
        action="store_true",
        help="Whether to run the validator in simulation mode",
    )

    parser.add_argument(
        "--simulation_bad_miner",
        action="store_true",
        help="Set miner simulation to a bad one",
    )

    epoch_length = 100
    known_args, _ = parser.parse_known_args()
    if known_args := vars(known_args):
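Both new flags are plain `store_true` switches, so they default to `False` and take no value on the command line. A self-contained sketch of how they parse (the second flag's `help` text is paraphrased here):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--simulation",
    action="store_true",
    help="Whether to run the validator in simulation mode",
)
parser.add_argument(
    "--simulation_bad_miner",
    action="store_true",
    help="Simulate a misbehaving miner",
)

# only --simulation passed; the other flag keeps its False default
args = parser.parse_args(["--simulation"])
print(args.simulation, args.simulation_bad_miner)  # True False
```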