refactor: update dataset extraction script based on new schema (#120)
jarvis8x7b authored Feb 12, 2025
1 parent 78e23dd commit deac196
Showing 8 changed files with 531 additions and 168 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -77,6 +77,9 @@ dojo-cli:
 extract-dataset:
 	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset
 
+fill-score-column:
+	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans fill-score-column
+
 migration:
 	docker compose --env-file .env.validator -f docker-compose.validator.yaml run --rm migration
 
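For reference, the new `fill-score-column` target simply wraps the one-off compose service added in `docker-compose.validator.yaml` below; a minimal invocation, assuming `.env.validator` is already configured, looks like:

```bash
# run the score backfill through the new Makefile target
make fill-score-column

# or call the underlying compose service directly
docker compose -f docker-compose.validator.yaml run --rm --remove-orphans fill-score-column
```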
12 changes: 12 additions & 0 deletions README.md
@@ -52,6 +52,7 @@
 - [Auto-updater](#auto-updater)
 - [Dojo CLI](#dojo-cli)
 - [For Dojo developers](#for-dojo-developers)
+- [Dataset Extraction](#dataset-extraction)
 - [License](#license)
 
 </details>
@@ -575,6 +576,17 @@ make install-dev
 make install-test
 ```
 
+## Dataset Extraction
+
+The dataset is exported in multiple parts; `MAX_CHUNK_SIZE_MB` is currently set to 50 MB on the dataset service due to limitations on the load balancer. Use the following commands to combine all parts into a single dataset file:
+
+```bash
+aws s3 cp s3://amzn-s3-demo-bucket1/ <PATH_ON_LOCAL> --recursive --exclude "*" --include "hotkey_<vali_hotkey>_dataset_20250212*.jsonl"
+cd <PATH_ON_LOCAL>
+# merge all chunks into a single dataset file
+cat *.jsonl > hotkey_<vali_hotkey>_dataset_combined.jsonl
+```
+
 # License
 
 This repository is licensed under the MIT License.
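A quick sanity check for the merge step, assuming one JSON record per line in every chunk: total line counts before and after combining should match.

```bash
# wc prints a "total" row across all chunks...
wc -l hotkey_<vali_hotkey>_dataset_20250212*.jsonl
# ...which should equal the line count of the merged file
wc -l hotkey_<vali_hotkey>_dataset_combined.jsonl
```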
11 changes: 6 additions & 5 deletions commons/orm.py
@@ -273,7 +273,7 @@ async def update_miner_task_results(
                     )
                 else:
                     logger.warning(
-                        f"Retrying update, attempt {attempt+2}/{max_retries}"
+                        f"Retrying update, attempt {attempt + 2}/{max_retries}"
                     )
                     await asyncio.sleep(2**attempt)
 
@@ -282,7 +282,7 @@ async def update_miner_task_results(
                     logger.error(f"Error updating task results: {e}")
                 else:
                     logger.warning(
-                        f"Error during attempt {attempt+1}, retrying: {e}"
+                        f"Error during attempt {attempt + 1}, retrying: {e}"
                     )
                     await asyncio.sleep(2**attempt)
 
@@ -300,6 +300,7 @@ async def update_miner_raw_scores(
         max_retries: int = 20,
     ) -> tuple[bool, list[int]]:
         """Update the miner's provided raw scores for a list of miner responses.
+        NOTE: this is to be used when the task is first saved to validator's database.
 
         Args:
             miner_responses: List of TaskSynapseObject containing miner responses
@@ -403,18 +404,18 @@ async def update_miner_raw_scores(
                 )
 
                 logger.debug(
-                    f"Updating completion responses: updated batch {batch_id+1}/{num_batches}"
+                    f"Updating completion responses: updated batch {batch_id + 1}/{num_batches}"
                 )
                 break
             except Exception as e:
                 if attempt == max_retries - 1:
                     logger.error(
-                        f"Failed to update batch {batch_id+1}/{num_batches} after {max_retries} attempts: {e}"
+                        f"Failed to update batch {batch_id + 1}/{num_batches} after {max_retries} attempts: {e}"
                     )
                     failed_batch_indices.extend(range(start_idx, end_idx))
                 else:
                     logger.warning(
-                        f"Retrying batch {batch_id+1}/{num_batches}, attempt {attempt+2}/{max_retries}"
+                        f"Retrying batch {batch_id + 1}/{num_batches}, attempt {attempt + 2}/{max_retries}"
                     )
                     await asyncio.sleep(2**attempt)
 
16 changes: 16 additions & 0 deletions docker-compose.validator.yaml
@@ -35,6 +35,7 @@ services:
     logging: *default-logging
 
   synthetic-api:
+    container_name: synthetic-api
     image: ghcr.io/tensorplex-labs/dojo-synthetic-api:main
     env_file:
       - .env.validator
@@ -201,3 +202,18 @@ services:
       - prisma-pip-cache:/root/.cache/pip
       - $HOME/.bittensor:/root/.bittensor
     logging: *default-logging
+
+  fill-score-column:
+    container_name: fill-score-column
+    image: ghcr.io/tensorplex-labs/dojo:main
+    env_file:
+      - .env.validator
+    command: ["fill-score-column"]
+    networks:
+      - dojo-validator
+    volumes:
+      - ./:/app
+      - ./.env.validator:/app/.env
+      - prisma-binary:/root/prisma-python
+      - $HOME/.bittensor:/root/.bittensor
+    logging: *default-logging
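To check that the new service definition resolves as intended against `.env.validator`, compose's `config` subcommand can render it (a sketch; output shape depends on the compose version):

```bash
# print the fully resolved definition of the fill-score-column service
docker compose -f docker-compose.validator.yaml --env-file .env.validator config fill-score-column
```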
7 changes: 7 additions & 0 deletions docker/Dockerfile.dataset
@@ -2,6 +2,11 @@ FROM python:3.11-slim-bookworm
 
 WORKDIR /app
 
+# Prisma-specific environment variables
+ENV PRISMA_USE_NODEJS_BIN=true
+ENV PRISMA_BINARY_PLATFORM=debian-openssl-3.0.x
+ENV PRISMA_BINARY_CACHE_DIR=/root/prisma-python
+
 ENV PATH="/root/.cargo/bin/:$PATH"
 ENV UV_SYSTEM_PYTHON=true
 ENV NVM_DIR=/root/.nvm
@@ -15,6 +20,8 @@ RUN apt-get update \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
+RUN mkdir -p /root/prisma-python
+
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
 COPY . .
 
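A sketch of building this image locally from the repo root; the `dojo-dataset` tag is an arbitrary name chosen here, not one used by the repository:

```bash
# build the dataset-extraction image defined above
docker build -f docker/Dockerfile.dataset -t dojo-dataset .
```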
6 changes: 6 additions & 0 deletions entrypoints.sh
@@ -116,3 +116,9 @@ if [ "$1" = 'validate-migration' ]; then
     echo "Starting migration validation..."
     python scripts/validate_migration.py
 fi
+
+if [ "$1" = 'fill-score-column' ]; then
+    echo "Environment variables:"
+    echo "DATABASE_URL: ${DATABASE_URL}"
+    python scripts/fill_score_column.py --logging.info
+fi
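The branch above fires when the container's first argument is `fill-score-column`, which is exactly what the compose service's `command` supplies; run manually, a minimal equivalent (a sketch, assuming the published image and a reachable database) would be:

```bash
docker run --rm --env-file .env.validator ghcr.io/tensorplex-labs/dojo:main fill-score-column
```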
(2 additional changed files were not loaded in this view.)
