From 4369cf15a0c8abd71cba6b9a48b10efcab2b63d9 Mon Sep 17 00:00:00 2001
From: jarvis8x7b <157810922+jarvis8x7b@users.noreply.github.com>
Date: Wed, 4 Dec 2024 20:01:35 +0800
Subject: [PATCH] feat: add dataset collection endpoint, script to scrape
 validator db (#78)

* feat: add validator api dataset service, extract dataset script

build: add deps for dataset service
build: add docker compose services & entrypoints

* chore: remove test script

* chore: fix comment

* chore: update readme and env example

* fix: add auth

* refactor: add hotkey to filename

* chore: format toml
---
 .env.validator.example         |   1 +
 Makefile                       |   3 +
 README.md                      |  11 ++
 docker-compose.validator.yaml  |  23 +++
 docker/Dockerfile.dataset      |  35 ++++
 entrypoints.sh                 |  23 +++
 entrypoints/dataset_service.py | 176 ++++++++++++++++++
 pyproject.toml                 |   1 +
 scripts/extract_dataset.py     | 327 +++++++++++++++++++++++++++++++++
 9 files changed, 600 insertions(+)
 create mode 100644 docker/Dockerfile.dataset
 create mode 100644 entrypoints/dataset_service.py
 create mode 100644 scripts/extract_dataset.py

diff --git a/.env.validator.example b/.env.validator.example
index 332e8a41..7b41ec36 100644
--- a/.env.validator.example
+++ b/.env.validator.example
@@ -1,5 +1,6 @@
 WALLET_COLDKEY=
 WALLET_HOTKEY=
+DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai
 
 # Mainnet related config
 NETUID=52
diff --git a/Makefile b/Makefile
index 8156bdbd..b98ba784 100644
--- a/Makefile
+++ b/Makefile
@@ -74,6 +74,9 @@ miner-worker-api:
 dojo-cli:
 	docker compose --env-file .env.miner -f docker-compose.miner.yaml run --rm dojo-cli
 
+extract-dataset:
+	docker compose -f docker-compose.validator.yaml run --rm --remove-orphans extract-dataset
+
 # ---------------------------------------------------------------------------- #
 #                             CORE SERVICE LOGGING                             #
 # ---------------------------------------------------------------------------- #
diff --git a/README.md b/README.md
index 99871d29..f707712e 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,7 @@
     - [Option 2: Decentralised Method](#option-2-decentralised-method)
     - [Setup Subscription Key for Labellers on UI to connect to Dojo Subnet for scoring](#setup-subscription-key-for-labellers-on-ui-to-connect-to-dojo-subnet-for-scoring)
   - [Validating](#validating)
+  - [Data Collection](#data-collection)
 - [Auto-updater](#auto-updater)
 - [Dojo CLI](#dojo-cli)
 - [For Dojo developerss](#for-dojo-developerss)
@@ -417,6 +418,7 @@ cp .env.validator.example .env.validator
 
 WALLET_COLDKEY=# the name of the coldkey
 WALLET_HOTKEY=# the name of the hotkey
+DATASET_SERVICE_BASE_URL=https://dojo-validator-api.tensorplex.ai
 
 # head to https://wandb.ai/authorize to get your API key
 WANDB_API_KEY="<wandb_key>"
@@ -449,6 +451,15 @@ make validator
 
 To start with autoupdate for validators (**strongly recommended**), see the [Auto-updater](#auto-updater) section.
 
+## Data Collection
+
+To export all data that has been collected from the validator, ensure that you have the environment variables setup properly as in [validator-setup](#validating), then run the following:
+
+```bash
+make validator-pull
+make extract-dataset
+```
+
 # Auto-updater
 
 > [!WARNING]
diff --git a/docker-compose.validator.yaml b/docker-compose.validator.yaml
index 02598809..29bfef01 100644
--- a/docker-compose.validator.yaml
+++ b/docker-compose.validator.yaml
@@ -131,3 +131,26 @@ services:
       prisma-setup-vali:
         condition: service_completed_successfully
     logging: *default-logging
+
+  dataset-service:
+    image: ghcr.io/tensorplex-labs/dojo:dataset
+    env_file:
+      - .env.validator
+    ports:
+      - "127.0.0.1:9999:9999"
+    command: ["dataset-service"]
+    logging: *default-logging
+
+  extract-dataset:
+    image: ghcr.io/tensorplex-labs/dojo:dataset
+    env_file:
+      - .env.validator
+    command: ["extract-dataset"]
+    networks:
+      - dojo-validator
+    volumes:
+      - ./:/app
+      - ./.env.validator:/app/.env
+      - prisma-binary:/root/prisma-python
+      - $HOME/.bittensor:/root/.bittensor
+    logging: *default-logging
diff --git a/docker/Dockerfile.dataset b/docker/Dockerfile.dataset
new file mode 100644
index 00000000..5cc80942
--- /dev/null
+++ b/docker/Dockerfile.dataset
@@ -0,0 +1,35 @@
+FROM python:3.11-slim-bookworm
+
+WORKDIR /app
+
+ENV PATH="/root/.cargo/bin/:$PATH"
+ENV UV_SYSTEM_PYTHON=true
+ENV NVM_DIR=/root/.nvm
+ENV NODE_VERSION=v20.11.1
+ENV NODE_PATH=$NVM_DIR/versions/node/$NODE_VERSION/lib/node_modules
+ENV PATH=$NVM_DIR/versions/node/$NODE_VERSION/bin:$PATH
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+    build-essential curl git ca-certificates \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv
+COPY . .
+
+ARG TARGETPLATFORM
+
+RUN echo "Building for TARGETPLATFORM: $TARGETPLATFORM"
+
+RUN git config --global --add safe.directory /app
+
+# jank because pytorch has different versions for cpu for darwin VS linux, see pyproject.toml for specifics
+# RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+#     uv pip install --no-cache -e .[dataset] --find-links https://download.pytorch.org/whl/torch_stable.html; \
+#     else \
+#     uv pip install --no-cache -e .[dataset]; \
+#     fi
+RUN uv pip install --no-cache -e ".[dataset]" --find-links https://download.pytorch.org/whl/torch_stable.html;
+
+ENTRYPOINT ["./entrypoints.sh"]
diff --git a/entrypoints.sh b/entrypoints.sh
index 33001a25..8a2e2fea 100755
--- a/entrypoints.sh
+++ b/entrypoints.sh
@@ -74,3 +74,26 @@ if [ "$1" = 'validator' ]; then
     --wandb.project_name ${WANDB_PROJECT_NAME} \
     ${EXTRA_ARGS}
 fi
+
+if [ "$1" = 'extract-dataset' ]; then
+    echo "Environment variables:"
+    echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
+    echo "DATABASE_URL: ${DATABASE_URL}"
+    echo "DATASET_SERVICE_BASE_URL: ${DATASET_SERVICE_BASE_URL}"
+    echo "WALLET_COLDKEY: ${WALLET_COLDKEY}"
+    echo "WALLET_HOTKEY: ${WALLET_HOTKEY}"
+    python scripts/extract_dataset.py \
+    --wallet.name ${WALLET_COLDKEY} \
+    --wallet.hotkey ${WALLET_HOTKEY}
+fi
+
+if [ "$1" = 'dataset-service' ]; then
+    echo "Environment variables:"
+    echo "PORT: ${PORT}"
+    echo "S3_BUCKET_NAME: ${S3_BUCKET_NAME}"
+    echo "AWS_REGION: ${AWS_REGION}"
+    echo "MAX_CHUNK_SIZE_MB: ${MAX_CHUNK_SIZE_MB}"
+    python entrypoints/dataset_service.py \
+    --netuid 52 \
+    --subtensor.network finney
+fi
diff --git a/entrypoints/dataset_service.py b/entrypoints/dataset_service.py
new file mode 100644
index 00000000..d712af32
--- /dev/null
+++ b/entrypoints/dataset_service.py
@@ -0,0 +1,176 @@
+import asyncio
+import os
+from typing import List
+
+import aioboto3
+import aiofiles
+import bittensor as bt
+import httpx
+import uvicorn
+from bittensor.btlogging import logging as logger
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from substrateinterface import Keypair
+
+from commons.objects import ObjectManager
+from dojo import VALIDATOR_MIN_STAKE
+
+app = FastAPI(title="Dataset Upload Service")
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+config = ObjectManager.get_config()
+subtensor = bt.subtensor(config=config)
+metagraph = subtensor.metagraph(netuid=52, lite=True)
+AWS_REGION = os.getenv("AWS_REGION")
+BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
+MAX_CHUNK_SIZE_MB = int(os.getenv("MAX_CHUNK_SIZE_MB", 50))
+
+
+def verify_hotkey_in_metagraph(hotkey: str) -> bool:
+    return hotkey in metagraph.hotkeys
+
+
+def verify_signature(hotkey: str, signature: str, message: str) -> bool:
+    keypair = Keypair(ss58_address=hotkey, ss58_format=42)
+    if not keypair.verify(data=message, signature=signature):
+        logger.error(f"Invalid signature for address={hotkey}")
+        return False
+
+    logger.success(f"Signature verified, signed by {hotkey}")
+    return True
+
+
+def check_stake(hotkey: str) -> bool:
+    uid = -1
+    try:
+        uid = metagraph.hotkeys.index(hotkey)
+    except ValueError:
+        logger.error(f"Hotkey {hotkey} not found in metagraph")
+        return False
+
+    # Check if stake meets minimum threshold
+    stake = metagraph.S[uid].item()
+
+    if stake < VALIDATOR_MIN_STAKE:
+        logger.error(
+            f"Insufficient stake for hotkey {hotkey}: {stake} < {VALIDATOR_MIN_STAKE}"
+        )
+        return False
+
+    logger.info(f"Stake check passed for {hotkey} with stake {stake}")
+    return True
+
+
+@app.post("/upload_dataset")
+async def upload_dataset(
+    hotkey: str = Form(...),
+    signature: str = Form(...),
+    message: str = Form(...),
+    files: List[UploadFile] = File(...),
+):
+    try:
+        if not signature.startswith("0x"):
+            raise HTTPException(
+                status_code=401, detail="Invalid signature format, must be hex."
+            )
+
+        if not verify_signature(hotkey, signature, message):
+            logger.error(f"Invalid signature for address={hotkey}")
+            raise HTTPException(status_code=401, detail="Invalid signature.")
+
+        if not verify_hotkey_in_metagraph(hotkey):
+            logger.error(f"Hotkey {hotkey} not found in metagraph")
+            raise HTTPException(
+                status_code=401, detail="Hotkey not found in metagraph."
+            )
+
+        if not check_stake(hotkey):
+            logger.error(f"Insufficient stake for hotkey {hotkey}")
+            raise HTTPException(
+                status_code=401, detail="Insufficient stake for hotkey."
+            )
+
+        session = aioboto3.Session(region_name=AWS_REGION)
+        async with session.resource("s3") as s3:
+            bucket = await s3.Bucket(BUCKET_NAME)
+            for file in files:
+                content = await file.read()
+                file_size = len(content)
+                if file_size > MAX_CHUNK_SIZE_MB * 1024 * 1024:  # 50MB in bytes
+                    raise HTTPException(
+                        status_code=413,
+                        detail=f"File too large. Maximum size is {MAX_CHUNK_SIZE_MB}MB",
+                    )
+
+                filename = f"hotkey_{hotkey}_{file.filename}"
+
+                await bucket.put_object(
+                    Key=filename,
+                    Body=content,
+                )
+    except Exception as e:
+        logger.error(f"Error uploading dataset: {e}")
+        raise HTTPException(status_code=500, detail=f"Error uploading dataset: {e}")
+
+    return {
+        "success": True,
+        "message": "Files uploaded successfully",
+        "filenames": [file.filename for file in files],
+    }
+
+
+async def server():
+    config = uvicorn.Config(app, host="0.0.0.0", port=9999)
+    server = uvicorn.Server(config)
+    await server.serve()
+
+
+async def test_endpoint():
+    # Create test data
+    test_data = {
+        "hotkey": "asdfg",
+        "signature": "0xasdfg",
+        "message": "<Bytes>On 2024-12-02 18:15:23.663947 +08 Tensorplex is awesome</Bytes>",
+    }
+    # Create a temporary test file
+    test_filename = "dataset_20241202.jsonl"
+
+    # Build form data similar to how dojo.py does it
+    files = []
+
+    # Add file to form data if it exists
+    if os.path.exists(test_filename):
+        async with aiofiles.open(test_filename, "rb") as f:
+            file_content = await f.read()
+            files.append(("files", (test_filename, file_content, "application/json")))
+    else:
+        raise FileNotFoundError(f"Test file {test_filename} not found")
+
+    # Make request using httpx
+    async with httpx.AsyncClient() as client:
+        response = await client.post(
+            "http://localhost:8000/upload_dataset",
+            data={
+                "hotkey": test_data["hotkey"],
+                "signature": test_data["signature"],
+                "message": test_data["message"],
+            },
+            files=files,
+            timeout=30.0,
+        )
+        print(f"Status: {response.status_code}")
+        print(f"Response: {response.json()}")
+
+
+if __name__ == "__main__":
+    import sys
+
+    if "--test" in sys.argv:
+        asyncio.run(test_endpoint())
+    else:
+        asyncio.run(server())
diff --git a/pyproject.toml b/pyproject.toml
index bb8b092e..0d842d9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,6 +61,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = ["commitizen", "curlify2", "pytest", "ruff"]
 test = ["pytest", "nox"]
+dataset = ["aioboto3", "aiofiles", "python-multipart"]
 
 [project.scripts]
 dojo = "dojo_cli:main"
diff --git a/scripts/extract_dataset.py b/scripts/extract_dataset.py
new file mode 100644
index 00000000..29a95f18
--- /dev/null
+++ b/scripts/extract_dataset.py
@@ -0,0 +1,327 @@
+import asyncio
+import os
+from datetime import datetime
+from typing import AsyncGenerator, List
+
+import aiofiles
+import bittensor as bt
+import httpx
+import numpy as np
+from bittensor.btlogging import logging as logger
+from pydantic import BaseModel, model_serializer
+
+from commons.exceptions import (
+    NoNewExpiredTasksYet,
+)
+from commons.objects import ObjectManager
+from database.client import connect_db, disconnect_db
+from database.mappers import (
+    map_feedback_request_model_to_feedback_request,
+)
+from database.prisma.models import (
+    Feedback_Request_Model,
+)
+from database.prisma.types import (
+    Feedback_Request_ModelInclude,
+    Feedback_Request_ModelWhereInput,
+)
+from dojo import TASK_DEADLINE
+from dojo.protocol import (
+    CompletionResponses,
+    DendriteQueryResponse,
+)
+from dojo.utils.config import source_dotenv
+
+source_dotenv()
+
+DATASET_SERVICE_BASE_URL = os.getenv("DATASET_SERVICE_BASE_URL")
+MAX_CHUNK_SIZE_MB = int(os.getenv("MAX_CHUNK_SIZE_MB", 50))
+
+if DATASET_SERVICE_BASE_URL is None:
+    raise ValueError("DATASET_SERVICE_BASE_URL must be set")
+if MAX_CHUNK_SIZE_MB is None:
+    raise ValueError("MAX_CHUNK_SIZE_MB must be set")
+
+
+# represents a row in the jsonl dataset
+class Row(BaseModel):
+    prompt: str
+    completions: list[CompletionResponses]
+    # shape (num_miners, num_completions)
+    raw_scores: list[list[float]]
+    # shape (num_completions)
+    mean_scores: list[float]
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    @model_serializer
+    def serialize_model(self):
+        return {
+            "prompt": self.prompt,
+            "completions": self.completions,
+            "raw_scores": self.raw_scores,
+            "mean_scores": self.mean_scores,
+        }
+
+
+async def build_jsonl(filename: str):
+    with open(filename, "w") as file:
+        batch_size = 10
+        task_count = 0
+        async for task_batch, has_more_batches in get_processed_tasks(batch_size):
+            if not has_more_batches and not task_batch:
+                break
+
+            for task in task_batch:
+                # Extract prompt from validator request
+                prompt = task.request.prompt
+
+                # Extract completions from miner responses
+                completions = task.request.completion_responses
+
+                raw_scores = []
+                for miner_response in task.miner_responses:
+                    miner_ratings = [
+                        c.score for c in miner_response.completion_responses
+                    ]
+                    if any(rating is None for rating in miner_ratings):
+                        continue
+                    raw_scores.append(miner_ratings)
+
+                # shape (num_completions, num_miners)
+                raw_scores_vec = np.array(raw_scores)
+                logger.info(f"raw_scores_vec shape: {raw_scores_vec.shape}")
+                logger.info(f"raw_scores_vec: {raw_scores_vec}")
+
+                if raw_scores_vec.size > 0:
+                    mean_scores = raw_scores_vec.mean(axis=1)
+                    logger.info(f"mean_scores shape: {mean_scores.shape}")
+                    jsonl_row = Row(
+                        prompt=prompt,
+                        completions=completions,
+                        raw_scores=raw_scores,
+                        mean_scores=mean_scores.tolist(),
+                    )
+                else:
+                    jsonl_row = Row(
+                        prompt=prompt,
+                        completions=completions,
+                        raw_scores=[],
+                        mean_scores=[],
+                    )
+
+                # Write the entry as a JSON line
+                file.write(jsonl_row.model_dump_json() + "\n")
+
+            task_count += len(task_batch)
+            logger.info(f"Scraped task count: {task_count}")
+
+
+async def get_processed_tasks(
+    batch_size: int = 10,
+) -> AsyncGenerator[tuple[List[DendriteQueryResponse], bool], None]:
+    """Yields batches of processed Feedback_Request_Model records along with a boolean flag indicating the presence of additional batches.
+
+    This function retrieves tasks that have been fully processed. The batch size can be specified to control the number of tasks returned in each batch.
+
+    Args:
+        batch_size (int, optional): The number of tasks to include in each batch. Defaults to 10.
+
+    Raises:
+        NoNewExpiredTasksYet: Raised if no processed tasks are available for retrieval.
+
+    Yields:
+        AsyncGenerator[tuple[List[DendriteQueryResponse], bool], None]: An asynchronous generator yielding a tuple containing a list of DendriteQueryResponse objects and a boolean indicating if more batches are available.
+    """
+
+    # find all validator requests first
+    include_query = Feedback_Request_ModelInclude(
+        {
+            "completions": True,
+            "criteria_types": True,
+            "ground_truths": True,
+            "parent_request": True,
+        }
+    )
+
+    vali_where_query = Feedback_Request_ModelWhereInput(
+        {
+            "parent_id": None,  # no parent means it's a validator request
+            # only check for tasks that are completely done
+            "is_processed": {"equals": True},
+        }
+    )
+
+    # count first total including non
+    task_count = await Feedback_Request_Model.prisma().count(
+        where=vali_where_query,
+    )
+
+    logger.info(f"Count of processed tasks: {task_count}")
+
+    if not task_count:
+        raise NoNewExpiredTasksYet(
+            f"No expired tasks found for processing, please wait for tasks to pass the task deadline of {TASK_DEADLINE} seconds."
+        )
+
+    for i in range(0, task_count, batch_size):
+        # find all unprocesed validator requests
+        validator_requests = await Feedback_Request_Model.prisma().find_many(
+            include=include_query,
+            where=vali_where_query,
+            order={"created_at": "desc"},
+            skip=i,
+            take=batch_size,
+        )
+
+        # find all miner responses
+        processed_vali_request_ids = [r.id for r in validator_requests]
+        miner_responses = await Feedback_Request_Model.prisma().find_many(
+            include=include_query,
+            where={
+                "parent_id": {"in": processed_vali_request_ids},
+                "is_processed": {"equals": True},
+            },
+            order={"created_at": "desc"},
+        )
+
+        # NOTE: technically a DendriteQueryResponse represents a task
+        tasks: list[DendriteQueryResponse] = []
+        for validator_request in validator_requests:
+            vali_request = map_feedback_request_model_to_feedback_request(
+                validator_request
+            )
+
+            m_responses = list(
+                map(
+                    lambda x: map_feedback_request_model_to_feedback_request(
+                        x, is_miner=True
+                    ),
+                    [m for m in miner_responses if m.parent_id == validator_request.id],
+                )
+            )
+
+            tasks.append(
+                DendriteQueryResponse(request=vali_request, miner_responses=m_responses)
+            )
+
+        # yield responses, so caller can do something
+        has_more_batches = True
+        yield tasks, has_more_batches
+
+    yield [], False
+
+
+async def upload(hotkey: str, signature: str, message: str, filename: str):
+    if not signature.startswith("0x"):
+        signature = f"0x{signature}"
+
+    # Build form data similar to how dojo.py does it
+    form_body = {
+        "hotkey": hotkey,
+        "signature": signature,
+        "message": message,
+    }
+    # Add file to form data if it exists
+    if os.path.exists(filename):
+        chunks = await chunk_file(filename, MAX_CHUNK_SIZE_MB)
+
+        # Make request using httpx
+        async with httpx.AsyncClient() as client:
+            for chunk_filename, chunk_content in chunks:
+                # Append to files list with correct format
+                files = [("files", (chunk_filename, chunk_content, "application/json"))]
+                response = await client.post(
+                    f"{DATASET_SERVICE_BASE_URL}/upload_dataset",
+                    data=form_body,
+                    files=files,
+                    timeout=60.0,
+                )
+                logger.info(f"Status: {response.status_code}")
+                response_json = response.json()
+                logger.info(f"Response: {response_json}")
+                is_success = response.status_code == 200 and response_json.get(
+                    "success"
+                )
+                if not is_success:
+                    raise Exception(f"Failed to upload file {chunk_filename}")
+                await asyncio.sleep(1)
+
+
+async def chunk_file(filename: str, chunk_size_mb: int = 50):
+    chunk_size = chunk_size_mb * 1024 * 1024  # Convert MB to bytes
+
+    if os.path.exists(filename):
+        async with aiofiles.open(filename) as f:  # Open in text mode
+            chunks = []
+            current_chunk = []
+            current_chunk_size = 0
+
+            # ensure that when we chunk, we don't split across lines
+            async for line in f:
+                line_size = len(line.encode("utf-8"))  # Get size of line in bytes
+                if current_chunk_size + line_size > chunk_size:
+                    # Use consistent format
+                    base, ext = os.path.splitext(filename)
+                    chunk_filename = f"{base}_part{len(chunks) + 1}{ext}"
+                    chunks.append((chunk_filename, "".join(current_chunk)))
+                    current_chunk = []
+                    current_chunk_size = 0
+
+                current_chunk.append(line)
+                current_chunk_size += line_size
+
+            # Use same format for last chunk
+            if current_chunk:
+                base, ext = os.path.splitext(filename)
+                chunk_filename = f"{base}_part{len(chunks) + 1}{ext}"
+                chunks.append((chunk_filename, "".join(current_chunk)))
+
+            return chunks
+    else:
+        raise FileNotFoundError(f"Test file {filename} not found")
+
+
+async def main():
+    await connect_db()
+    config = ObjectManager.get_config()
+    wallet = bt.wallet(config=config)
+    hotkey = wallet.hotkey.ss58_address
+    message = f"Uploading dataset for validator with hotkey: {hotkey}"
+    signature = wallet.hotkey.sign(message).hex()  # Convert signature to hex string
+
+    # Create filename with current date
+    current_date = datetime.now().strftime("%Y%m%d")
+    filename = f"dataset_{current_date}.jsonl"
+    # Check if file already exists
+    if os.path.exists(filename):
+        logger.warning(f"File {filename} already exists, skipping scrape db step")
+    else:
+        await build_jsonl(filename)
+
+    try:
+        upload_success = await upload(hotkey, signature, message, filename)
+        if upload_success:
+            logger.info("Upload successful! Removing local dataset file.")
+            os.remove(filename)
+    except Exception as e:
+        logger.error(f"Error occurred while trying to upload dataset: {e}")
+    finally:
+        await disconnect_db()
+
+
+async def _test_chunking():
+    filename = "dummy_dataset.jsonl"
+    chunks = await chunk_file(filename, MAX_CHUNK_SIZE_MB)
+    logger.info(f"number of chunks: {len(chunks)}")
+    for i, (chunk_filename, chunk_content) in enumerate(chunks, 1):
+        logger.info(f"\nSaving chunk {i} to {chunk_filename}")
+        async with aiofiles.open(chunk_filename, "w") as f:
+            await f.write(chunk_content)
+        logger.info(f"Saved chunk {i} ({len(chunk_content)} bytes)")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+    # asyncio.run(_test_chunking())