feat: add dataset collection endpoint, script to scrape validator db (#78)

* feat: add validator api dataset service, extract dataset script

build: add deps for dataset service
build: add docker compose services & entrypoints

* chore: remove test script

* chore: fix comment

* chore: update readme and env example

* fix: add auth

* refactor: add hotkey to filename

* chore: format toml
jarvis8x7b authored and karootplx committed Dec 9, 2024
1 parent 553ad66 commit 1b4ef9e
Showing 9 changed files with 600 additions and 0 deletions.
1 change: 1 addition & 0 deletions .env.validator.example
@@ -1,5 +1,6 @@
WALLET_COLDKEY=
WALLET_HOTKEY=
DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai

# Mainnet related config
NETUID=52
3 changes: 3 additions & 0 deletions Makefile
@@ -74,6 +74,9 @@ miner-worker-api:
dojo-cli:
	docker compose --env-file .env.miner -f docker-compose.miner.yaml run --rm dojo-cli

extract-dataset:
	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset

# ---------------------------------------------------------------------------- #
# CORE SERVICE LOGGING #
# ---------------------------------------------------------------------------- #
11 changes: 11 additions & 0 deletions README.md
@@ -48,6 +48,7 @@
- [Option 2: Decentralised Method](#option-2-decentralised-method)
- [Setup Subscription Key for Labellers on UI to connect to Dojo Subnet for scoring](#setup-subscription-key-for-labellers-on-ui-to-connect-to-dojo-subnet-for-scoring)
- [Validating](#validating)
- [Data Collection](#data-collection)
- [Auto-updater](#auto-updater)
- [Dojo CLI](#dojo-cli)
- [For Dojo developers](#for-dojo-developers)
@@ -417,6 +418,7 @@ cp .env.validator.example .env.validator

WALLET_COLDKEY=# the name of the coldkey
WALLET_HOTKEY=# the name of the hotkey
DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai

# head to https://wandb.ai/authorize to get your API key
WANDB_API_KEY="<wandb_key>"
@@ -449,6 +451,15 @@ make validator

To start with autoupdate for validators (**strongly recommended**), see the [Auto-updater](#auto-updater) section.

## Data Collection

To export all data that your validator has collected, ensure the environment variables are set up properly as described in [validator setup](#validating), then run the following:

```bash
make validator-pull
make extract-dataset
```
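
Under the hood, `make extract-dataset` runs the `extract-dataset` service defined in `docker-compose.validator.yaml`, whose entrypoint invokes `scripts/extract_dataset.py` with your `WALLET_COLDKEY` and `WALLET_HOTKEY`; the script presumably uploads the export to the dataset service at `DATASET_SERVICE_BASE_URL` (the script itself is not part of this diff).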

# Auto-updater

> [!WARNING]
23 changes: 23 additions & 0 deletions docker-compose.validator.yaml
@@ -131,3 +131,26 @@ services:
      prisma-setup-vali:
        condition: service_completed_successfully
    logging: *default-logging

  dataset-service:
    image: ghcr.io/tensorplex-labs/dojo:dataset
    env_file:
      - .env.validator
    ports:
      - "127.0.0.1:9999:9999"
    command: ["dataset-service"]
    logging: *default-logging

  extract-dataset:
    image: ghcr.io/tensorplex-labs/dojo:dataset
    env_file:
      - .env.validator
    command: ["extract-dataset"]
    networks:
      - dojo-validator
    volumes:
      - ./:/app
      - ./.env.validator:/app/.env
      - prisma-binary:/root/prisma-python
      - $HOME/.bittensor:/root/.bittensor
    logging: *default-logging
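
The `dataset-service` container publishes port 9999 on localhost only. A quick reachability check is to hit FastAPI's auto-generated docs route; a minimal sketch, assuming the default `/docs` endpoint has not been disabled:

```python
import httpx

# FastAPI serves interactive API docs at /docs by default;
# a 200 response means the dataset-service container is up and reachable.
resp = httpx.get("http://127.0.0.1:9999/docs", timeout=5.0)
print(resp.status_code)
```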
35 changes: 35 additions & 0 deletions docker/Dockerfile.dataset
@@ -0,0 +1,35 @@
FROM python:3.11-slim-bookworm

WORKDIR /app

ENV PATH="/root/.cargo/bin/:$PATH"
ENV UV_SYSTEM_PYTHON=true
ENV NVM_DIR=/root/.nvm
ENV NODE_VERSION=v20.11.1
ENV NODE_PATH=$NVM_DIR/versions/node/$NODE_VERSION/lib/node_modules
ENV PATH=$NVM_DIR/versions/node/$NODE_VERSION/bin:$PATH

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
    build-essential curl git ca-certificates \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
COPY . .

ARG TARGETPLATFORM

RUN echo "Building for TARGETPLATFORM: $TARGETPLATFORM"

RUN git config --global --add safe.directory /app

# jank because pytorch ships different CPU builds for darwin vs linux, see pyproject.toml for specifics
# RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
#     uv pip install --no-cache -e .[dataset] --find-links https://download.pytorch.org/whl/torch_stable.html; \
#     else \
#     uv pip install --no-cache -e .[dataset]; \
#     fi
RUN uv pip install --no-cache -e ".[dataset]" --find-links https://download.pytorch.org/whl/torch_stable.html;

ENTRYPOINT ["./entrypoints.sh"]
23 changes: 23 additions & 0 deletions entrypoints.sh
@@ -74,3 +74,26 @@ if [ "$1" = 'validator' ]; then
        --wandb.project_name ${WANDB_PROJECT_NAME} \
        ${EXTRA_ARGS}
fi

if [ "$1" = 'extract-dataset' ]; then
echo "Environment variables:"
echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
echo "DATABASE_URL: ${DATABASE_URL}"
echo "DATASET_SERVICE_BASE_URL: ${DATASET_SERVICE_BASE_URL}"
echo "WALLET_COLDKEY: ${WALLET_COLDKEY}"
echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
python scripts/extract_dataset.py \
--wallet.name ${WALLET_COLDKEY} \
--wallet.hotkey ${WALLET_HOTKEY}
fi

if [ "$1" = 'dataset-service' ]; then
echo "Environment variables:"
echo "PORT: ${PORT}"
echo "S3_BUCKET_NAME: ${S3_BUCKET_NAME}"
echo "AWS_REGION: ${AWS_REGION}"
echo "MAX_CHUNK_SIZE_MB: ${MAX_CHUNK_SIZE_MB}"
python entrypoints/dataset_service.py \
--netuid 52 \
--subtensor.network finney
fi
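
`scripts/extract_dataset.py` itself is not part of this diff. A minimal sketch of how a bittensor script typically consumes the `--wallet.name` and `--wallet.hotkey` flags passed above (an assumption about the script's internals, not its actual code):

```python
import argparse

import bittensor as bt

# bt.wallet.add_args registers --wallet.name / --wallet.hotkey (among others)
parser = argparse.ArgumentParser()
bt.wallet.add_args(parser)
config = bt.config(parser)

# Loads the named keys from ~/.bittensor (mounted into the container above)
wallet = bt.wallet(config=config)
print(wallet.hotkey.ss58_address)
```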
176 changes: 176 additions & 0 deletions entrypoints/dataset_service.py
@@ -0,0 +1,176 @@
import asyncio
import os
from typing import List

import aioboto3
import aiofiles
import bittensor as bt
import httpx
import uvicorn
from bittensor.btlogging import logging as logger
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from substrateinterface import Keypair

from commons.objects import ObjectManager
from dojo import VALIDATOR_MIN_STAKE

app = FastAPI(title="Dataset Upload Service")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
config = ObjectManager.get_config()
subtensor = bt.subtensor(config=config)
metagraph = subtensor.metagraph(netuid=52, lite=True)
AWS_REGION = os.getenv("AWS_REGION")
BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
MAX_CHUNK_SIZE_MB = int(os.getenv("MAX_CHUNK_SIZE_MB", 50))


def verify_hotkey_in_metagraph(hotkey: str) -> bool:
    return hotkey in metagraph.hotkeys


def verify_signature(hotkey: str, signature: str, message: str) -> bool:
    keypair = Keypair(ss58_address=hotkey, ss58_format=42)
    if not keypair.verify(data=message, signature=signature):
        logger.error(f"Invalid signature for address={hotkey}")
        return False

    logger.success(f"Signature verified, signed by {hotkey}")
    return True


def check_stake(hotkey: str) -> bool:
    uid = -1
    try:
        uid = metagraph.hotkeys.index(hotkey)
    except ValueError:
        logger.error(f"Hotkey {hotkey} not found in metagraph")
        return False

    # Check if stake meets minimum threshold
    stake = metagraph.S[uid].item()

    if stake < VALIDATOR_MIN_STAKE:
        logger.error(
            f"Insufficient stake for hotkey {hotkey}: {stake} < {VALIDATOR_MIN_STAKE}"
        )
        return False

    logger.info(f"Stake check passed for {hotkey} with stake {stake}")
    return True


@app.post("/upload_dataset")
async def upload_dataset(
    hotkey: str = Form(...),
    signature: str = Form(...),
    message: str = Form(...),
    files: List[UploadFile] = File(...),
):
    try:
        if not signature.startswith("0x"):
            raise HTTPException(
                status_code=401, detail="Invalid signature format, must be hex."
            )

        if not verify_signature(hotkey, signature, message):
            logger.error(f"Invalid signature for address={hotkey}")
            raise HTTPException(status_code=401, detail="Invalid signature.")

        if not verify_hotkey_in_metagraph(hotkey):
            logger.error(f"Hotkey {hotkey} not found in metagraph")
            raise HTTPException(
                status_code=401, detail="Hotkey not found in metagraph."
            )

        if not check_stake(hotkey):
            logger.error(f"Insufficient stake for hotkey {hotkey}")
            raise HTTPException(
                status_code=401, detail="Insufficient stake for hotkey."
            )

        session = aioboto3.Session(region_name=AWS_REGION)
        async with session.resource("s3") as s3:
            bucket = await s3.Bucket(BUCKET_NAME)
            for file in files:
                content = await file.read()
                file_size = len(content)
                if file_size > MAX_CHUNK_SIZE_MB * 1024 * 1024:  # MAX_CHUNK_SIZE_MB converted to bytes
                    raise HTTPException(
                        status_code=413,
                        detail=f"File too large. Maximum size is {MAX_CHUNK_SIZE_MB}MB",
                    )

                filename = f"hotkey_{hotkey}_{file.filename}"

                await bucket.put_object(
                    Key=filename,
                    Body=content,
                )
    except HTTPException:
        # Re-raise auth/size errors as-is instead of collapsing them into a 500
        raise
    except Exception as e:
        logger.error(f"Error uploading dataset: {e}")
        raise HTTPException(status_code=500, detail=f"Error uploading dataset: {e}")

    return {
        "success": True,
        "message": "Files uploaded successfully",
        "filenames": [file.filename for file in files],
    }


async def server():
    config = uvicorn.Config(app, host="0.0.0.0", port=9999)
    server = uvicorn.Server(config)
    await server.serve()


async def test_endpoint():
    # Create test data
    test_data = {
        "hotkey": "asdfg",
        "signature": "0xasdfg",
        "message": "<Bytes>On 2024-12-02 18:15:23.663947 +08 Tensorplex is awesome</Bytes>",
    }
    # Create a temporary test file
    test_filename = "dataset_20241202.jsonl"

    # Build form data similar to how dojo.py does it
    files = []

    # Add file to form data if it exists
    if os.path.exists(test_filename):
        async with aiofiles.open(test_filename, "rb") as f:
            file_content = await f.read()
            files.append(("files", (test_filename, file_content, "application/json")))
    else:
        raise FileNotFoundError(f"Test file {test_filename} not found")

    # Make request using httpx
    async with httpx.AsyncClient() as client:
        response = await client.post(
            "http://localhost:9999/upload_dataset",  # match the port server() binds
            data={
                "hotkey": test_data["hotkey"],
                "signature": test_data["signature"],
                "message": test_data["message"],
            },
            files=files,
            timeout=30.0,
        )
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")


if __name__ == "__main__":
    import sys

    if "--test" in sys.argv:
        asyncio.run(test_endpoint())
    else:
        asyncio.run(server())
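
For reference, a client must sign the exact `message` string with its hotkey and submit the signature as 0x-prefixed hex, matching `verify_signature` above. A minimal sketch of such an upload, using a throwaway keypair for illustration (a real validator would sign with its registered hotkey; a throwaway key passes signature verification but fails the metagraph and stake checks):

```python
import httpx
from substrateinterface import Keypair

# Illustration only: a real client loads its registered validator hotkey instead.
keypair = Keypair.create_from_mnemonic(Keypair.generate_mnemonic())
message = "<Bytes>On 2024-12-02 18:15:23 Tensorplex is awesome</Bytes>"
signature = "0x" + keypair.sign(message.encode()).hex()  # hex format the service expects

with open("dataset_20241202.jsonl", "rb") as f:
    response = httpx.post(
        "http://127.0.0.1:9999/upload_dataset",
        data={"hotkey": keypair.ss58_address, "signature": signature, "message": message},
        files=[("files", ("dataset_20241202.jsonl", f.read(), "application/json"))],
        timeout=30.0,
    )
print(response.status_code, response.json())
```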
1 change: 1 addition & 0 deletions pyproject.toml
@@ -61,6 +61,7 @@ dependencies = [
[project.optional-dependencies]
dev = ["commitizen", "curlify2", "pytest", "ruff"]
test = ["pytest", "nox"]
dataset = ["aioboto3", "aiofiles", "python-multipart"]

[project.scripts]
dojo = "dojo_cli:main"