Skip to content

Commit

Permalink
chore: add test chunking function
Browse files Browse the repository at this point in the history
  • Loading branch information
jarvis8x7b committed Dec 2, 2024
1 parent 9ba6bb5 commit a2c454c
Showing 1 changed file with 15 additions and 0 deletions.
15 changes: 15 additions & 0 deletions scripts/extract_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
CompletionResponses,
DendriteQueryResponse,
)
from dojo.utils.config import source_dotenv

# Runs before the os.getenv() reads below. Presumably loads variables from a
# .env file into the process environment — confirm in dojo.utils.config.
source_dotenv()

# Base URL of the dataset service; None when the variable is not set.
DATASET_SERVICE_BASE_URL = os.getenv("DATASET_SERVICE_BASE_URL")
# Chunk size limit (megabytes, going by the name); falls back to 50 when unset.
MAX_CHUNK_SIZE_MB = int(os.getenv("MAX_CHUNK_SIZE_MB", 50))
Expand Down Expand Up @@ -302,5 +305,17 @@ async def main():
await disconnect_db()


async def _test_chunking():
    """Manually exercise ``chunk_file`` against a local dummy dataset.

    Chunks ``dummy_dataset.jsonl`` using ``MAX_CHUNK_SIZE_MB``, logs how many
    chunks were returned, then writes each chunk's content to the filename
    paired with it, logging before and after every write.
    """
    source_path = "dummy_dataset.jsonl"
    pieces = await chunk_file(source_path, MAX_CHUNK_SIZE_MB)
    logger.info(f"number of chunks: {len(pieces)}")

    # Chunk numbering in the log output is 1-based.
    for position, piece in enumerate(pieces, start=1):
        target_path, payload = piece
        logger.info(f"\nSaving chunk {position} to {target_path}")
        async with aiofiles.open(target_path, "w") as sink:
            await sink.write(payload)
        # NOTE(review): the file is opened in text mode, so len(payload) is
        # likely a character count rather than a byte count — confirm.
        logger.info(f"Saved chunk {position} ({len(payload)} bytes)")


if __name__ == "__main__":
    # Script entry point: drive the async main() coroutine to completion.
    asyncio.run(main())
    # To run the manual chunking check instead, swap in the line below.
    # asyncio.run(_test_chunking())

0 comments on commit a2c454c

Please sign in to comment.