feat: add validator api dataset service, extract dataset script
build: add deps for dataset service
build: add docker compose services & entrypoints
jarvis8x7b committed Dec 3, 2024
1 parent 17b0068 commit b6eb648
Showing 8 changed files with 657 additions and 9 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -74,6 +74,9 @@ miner-worker-api:
dojo-cli:
	docker compose --env-file .env.miner -f docker-compose.miner.yaml run --rm dojo-cli

extract-dataset:
	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset

# ---------------------------------------------------------------------------- #
# CORE SERVICE LOGGING #
# ---------------------------------------------------------------------------- #
23 changes: 23 additions & 0 deletions docker-compose.validator.yaml
@@ -131,3 +131,26 @@ services:
      prisma-setup-vali:
        condition: service_completed_successfully
    logging: *default-logging

  dataset-service:
    image: ghcr.io/tensorplex-labs/dojo:dataset
    env_file:
      - .env.validator
    ports:
      - "127.0.0.1:9999:9999"
    command: ["dataset-service"]
    logging: *default-logging

  extract-dataset:
    image: ghcr.io/tensorplex-labs/dojo:dataset
    env_file:
      - .env.validator
    command: ["extract-dataset"]
    networks:
      - dojo-validator
    volumes:
      - ./:/app
      - ./.env.validator:/app/.env
      - prisma-binary:/root/prisma-python
      - $HOME/.bittensor:/root/.bittensor
    logging: *default-logging
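
With these services defined, a typical flow is to start the upload API and then run the extraction job against it. A minimal sketch, assuming the dataset image is already built or pulled and .env.validator is populated:

# start the dataset upload API (bound to 127.0.0.1:9999)
docker compose -f docker-compose.validator.yaml up -d dataset-service

# run the one-off extraction job (equivalent to the new make extract-dataset target)
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset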
35 changes: 35 additions & 0 deletions docker/Dockerfile.dataset
@@ -0,0 +1,35 @@
FROM python:3.11-slim-bookworm

WORKDIR /app

ENV PATH="/root/.cargo/bin/:$PATH"
ENV UV_SYSTEM_PYTHON=true
ENV NVM_DIR=/root/.nvm
ENV NODE_VERSION=v20.11.1
ENV NODE_PATH=$NVM_DIR/versions/node/$NODE_VERSION/lib/node_modules
ENV PATH=$NVM_DIR/versions/node/$NODE_VERSION/bin:$PATH

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential curl git ca-certificates \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY . .

ARG TARGETPLATFORM

RUN echo "Building for TARGETPLATFORM: $TARGETPLATFORM"

RUN git config --global --add safe.directory /app

# workaround: PyTorch publishes different CPU wheels for macOS (darwin) and Linux, see pyproject.toml for specifics
# RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
# uv pip install --no-cache -e .[dataset] --find-links https://download.pytorch.org/whl/torch_stable.html; \
# else \
# uv pip install --no-cache -e .[dataset]; \
# fi
RUN uv pip install --no-cache -e ".[dataset]" --find-links https://download.pytorch.org/whl/torch_stable.html;

ENTRYPOINT ["./entrypoints.sh"]
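
Both compose services reference the same image tag. A hedged build sketch, assuming the ghcr.io/tensorplex-labs/dojo:dataset tag used in docker-compose.validator.yaml (docker buildx populates TARGETPLATFORM automatically):

docker buildx build \
  -f docker/Dockerfile.dataset \
  -t ghcr.io/tensorplex-labs/dojo:dataset \
  --platform linux/amd64 \
  .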
23 changes: 23 additions & 0 deletions entrypoints.sh
@@ -53,3 +53,26 @@ if [ "$1" = 'validator' ]; then
        --neuron.type validator \
        --wandb.project_name ${WANDB_PROJECT_NAME}
fi

if [ "$1" = 'extract-dataset' ]; then
echo "Environment variables:"
echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
echo "DATABASE_URL: ${DATABASE_URL}"
echo "DATASET_SERVICE_BASE_URL: ${DATASET_SERVICE_BASE_URL}"
echo "WALLET_COLDKEY: ${WALLET_COLDKEY}"
echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
python scripts/extract_dataset.py \
--wallet.name ${WALLET_COLDKEY} \
--wallet.hotkey ${WALLET_HOTKEY}
fi

if [ "$1" = 'dataset-service' ]; then
echo "Environment variables:"
echo "PORT: ${PORT}"
echo "S3_BUCKET_NAME: ${S3_BUCKET_NAME}"
echo "AWS_REGION: ${AWS_REGION}"
echo "MAX_CHUNK_SIZE_MB: ${MAX_CHUNK_SIZE_MB}"
python entrypoints/dataset_service.py \
--netuid 52 \
--subtensor.network finney
fi
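
Both new entrypoints read their configuration from .env.validator (mounted as /app/.env for extract-dataset). An illustrative sketch of the keys they echo above; every value here is a placeholder, not a recommendation:

# dataset-service
PORT=9999
S3_BUCKET_NAME=example-dataset-bucket
AWS_REGION=us-east-1
MAX_CHUNK_SIZE_MB=50

# extract-dataset
WALLET_COLDKEY=example-coldkey
WALLET_HOTKEY=example-hotkey
DATABASE_URL=postgresql://user:password@example-db:5432/db
DATASET_SERVICE_BASE_URL=http://127.0.0.1:9999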
156 changes: 156 additions & 0 deletions entrypoints/dataset_service.py
@@ -0,0 +1,156 @@
import asyncio
import os
from typing import List

import aioboto3
import aiofiles
import bittensor as bt
import httpx
import uvicorn
from bittensor.btlogging import logging as logger
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from substrateinterface import Keypair

from commons.objects import ObjectManager
from dojo import VALIDATOR_MIN_STAKE

app = FastAPI(title="Dataset Upload Service")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
config = ObjectManager.get_config()
subtensor = bt.subtensor(config=config)
metagraph = subtensor.metagraph(netuid=52, lite=True)
AWS_REGION = os.getenv("AWS_REGION")
BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
MAX_CHUNK_SIZE_MB = int(os.getenv("MAX_CHUNK_SIZE_MB", 50))


def verify_signature(hotkey: str, signature: str, message: str) -> bool:
    keypair = Keypair(ss58_address=hotkey, ss58_format=42)
    if not keypair.verify(data=message, signature=signature):
        logger.error(f"Invalid signature for address={hotkey}")
        return False

    logger.success(f"Signature verified, signed by {hotkey}")
    return True


def check_stake(hotkey: str) -> bool:
    uid = -1
    try:
        uid = metagraph.hotkeys.index(hotkey)
    except ValueError:
        logger.error(f"Hotkey {hotkey} not found in metagraph")
        return False

    # Check if stake meets minimum threshold
    stake = metagraph.S[uid].item()

    if stake < VALIDATOR_MIN_STAKE:
        logger.error(
            f"Insufficient stake for hotkey {hotkey}: {stake} < {VALIDATOR_MIN_STAKE}"
        )
        return False

    logger.info(f"Stake check passed for {hotkey} with stake {stake}")
    return True


@app.post("/upload_dataset")
async def upload_dataset(
hotkey: str = Form(...),
signature: str = Form(...),
message: str = Form(...),
files: List[UploadFile] = File(...),
):
try:
if not signature.startswith("0x"):
raise HTTPException(
status_code=401, detail="Invalid signature format, must be hex."
)

session = aioboto3.Session(region_name=AWS_REGION)
async with session.resource("s3") as s3:
bucket = await s3.Bucket(BUCKET_NAME)
for file in files:
content = await file.read()
file_size = len(content)
if file_size > MAX_CHUNK_SIZE_MB * 1024 * 1024: # 50MB in bytes
raise HTTPException(
status_code=413,
detail=f"File too large. Maximum size is {MAX_CHUNK_SIZE_MB}MB",
)

filename = f"{file.filename}"

await bucket.put_object(
Key=filename,
Body=content,
)
except Exception as e:
logger.error(f"Error uploading dataset: {e}")
raise HTTPException(status_code=500, detail=f"Error uploading dataset: {e}")

return {
"success": True,
"message": "Files uploaded successfully",
"filenames": [file.filename for file in files],
}


async def server():
    config = uvicorn.Config(app, host="0.0.0.0", port=9999)
    server = uvicorn.Server(config)
    await server.serve()


async def test_endpoint():
    # Create test data
    test_data = {
        "hotkey": "asdfg",
        "signature": "0xasdfg",
        "message": "<Bytes>On 2024-12-02 18:15:23.663947 +08 Tensorplex is awesome</Bytes>",
    }
    # Create a temporary test file
    test_filename = "dataset_20241202.jsonl"

    # Build form data similar to how dojo.py does it
    files = []

    # Add file to form data if it exists
    if os.path.exists(test_filename):
        async with aiofiles.open(test_filename, "rb") as f:
            file_content = await f.read()
            files.append(("files", (test_filename, file_content, "application/json")))
    else:
        raise FileNotFoundError(f"Test file {test_filename} not found")

    # Make request using httpx
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:9999/upload_dataset",
            data={
                "hotkey": test_data["hotkey"],
                "signature": test_data["signature"],
                "message": test_data["message"],
            },
            files=files,
            timeout=30.0,
        )
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")


if __name__ == "__main__":
import sys

if "--test" in sys.argv:
asyncio.run(test_endpoint())
else:
asyncio.run(server())
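
For a quick manual check against a running instance, the endpoint accepts multipart form data. A hedged curl sketch with placeholder values; a real signature is the hotkey's signature over the exact message string, hex-encoded with a 0x prefix:

curl -X POST http://127.0.0.1:9999/upload_dataset \
  -F "hotkey=5Fexample...placeholder" \
  -F "signature=0xexampleplaceholder" \
  -F "message=<Bytes>On 2024-12-02 18:15:23 +08 Tensorplex is awesome</Bytes>" \
  -F "files=@dataset_20241202.jsonl"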
19 changes: 10 additions & 9 deletions pyproject.toml
Expand Up @@ -8,11 +8,11 @@ name = "dojo"
description = "dojo subnet"
readme = "README.md"
authors = [
{name = "jarvis8x7b"},
{name = "karootplx"},
{name = "codebender37"}
{ name = "jarvis8x7b" },
{ name = "karootplx" },
{ name = "codebender37" }
]
license = {text = "MIT"}
license = { text = "MIT" }
classifiers = [
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
@@ -61,6 +61,7 @@ dependencies = [
[project.optional-dependencies]
dev = ["commitizen", "curlify2", "pytest", "ruff"]
test = ["pytest", "nox"]
dataset = ["aioboto3", "aiofiles", "python-multipart"]

[project.scripts]
dojo = "dojo_cli:main"
@@ -143,15 +144,15 @@ select = [
"E4", # import-related errors (e.g., unused imports, import order)
"E7", # statement-related errors (e.g., multiple statements on one line)
"E9", # runtime errors (e.g., syntax errors, undefined names)
"F", # pyflakes errors (e.g., unused variables, undefined names)
"F", # pyflakes errors (e.g., unused variables, undefined names)
"UP", # pyupgrade rules (suggests newer python syntax)
"I" # isort rules (sorts and organizes imports)
"I" # isort rules (sorts and organizes imports)
]
ignore = [
"UP006", # Preserve 'typing.Tuple' instead of 'tuple'
"UP035", # Preserve 'typing.Dict' instead of 'dict'
"C901", # Ignore McCabe complexity (if you use flake8 complexity checks)
"E203" # Ignore whitespace before ':', conflicts with Black] # Ignore specific pyupgrade rules that prevent the project from running
"C901", # Ignore McCabe complexity (if you use flake8 complexity checks)
"E203" # Ignore whitespace before ':', conflicts with Black] # Ignore specific pyupgrade rules that prevent the project from running
]
# Allow fix for all enabled rules (when `--fix`) is provided.
fixable = ["ALL"]
@@ -162,7 +163,7 @@ unfixable = []
known-third-party = ["wandb"]

[tool.setuptools]
packages = {find = {}}
packages = { find = {} }
include-package-data = true
# allows us to use CLI even though it is a standalone script
py-modules = ["dojo_cli"]