Skip to content

Commit

Permalink
Add test data generation tool. (#217)
Browse files Browse the repository at this point in the history
Added a tool to populate AIPscan with randomly generated example data.
  • Loading branch information
mcantelon committed Oct 11, 2023
1 parent a64364e commit 57351bf
Show file tree
Hide file tree
Showing 9 changed files with 308 additions and 0 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,29 @@ Shut down the AIPscan Docker containers and remove the rabbitmq volumes:
docker-compose down --volumes
```

## Tools

The `tools` directory contains scripts that can be run by developers and system
administrators.

### Test data generator

The test data generator tool, `tools/generate-test-data`, populates
AIPscan's database with randomly generated example data.

### Running tools

These should be run using the same system user and virtual environment that
AIPscan is running under.

Here's how you would run the `generate-test-data` tool, for example:

$ cd <path to AIPscan base directory>
$ sudo -u <AIPscan system user> /bin/bash
$ source <path to AIPscan virtual environment>/bin/activate
$ ./tools/generate-test-data


# Usage

* Ensure that the Flask Server, RabbitMQ server, and Celery worker queue are up and running.
Expand Down
1 change: 1 addition & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-r base.txt

faker==14.2.1
flake8==5.0.4
pytest==5.4.3
pytest_cov==2.11.1
Expand Down
Empty file added tools/__init__.py
Empty file.
Empty file added tools/app/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions tools/app/init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os
import sys

# Make the AIPscan code importable by the tools: append the "AIPscan"
# directory found three levels above this file's directory to the module
# search path.
relpath = "/".join((os.path.dirname(__file__), "..", "..", "..", "AIPscan"))
aipscan_path = os.path.abspath(relpath)
sys.path.append(aipscan_path)

# Name of the configuration the tools select from the CONFIGS mapping.
config_name = "default"
71 changes: 71 additions & 0 deletions tools/generate-test-data
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
from app import init
from faker import Faker
from flask import Flask
from helpers import data

from AIPscan import db
from AIPscan.models import FetchJob
from config import CONFIGS

# Build a minimal Flask application so the AIPscan database models can be
# used from the command line; the configuration is chosen by init.config_name.
app = Flask(__name__)
app.config.from_object(CONFIGS[init.config_name])

db.init_app(app)

fake = Faker()
randint = fake.random.randint

with app.app_context():
    # Add example storage services
    ss_to_create = 2

    print(f"Creating pipeline and {ss_to_create} storage services...")
    pipeline = data.create_pipeline()

    ss_ids = []
    fetch_jobs = {}  # storage service ID -> ID of that service's fetch job

    for _ in range(ss_to_create):
        # Only the first storage service created is flagged as the default
        is_default = not ss_ids

        ss = data.create_storage_service(is_default)
        ss_ids.append(ss.id)

        fetch_job = data.create_fetch_job(ss.id)
        fetch_jobs[ss.id] = fetch_job.id

    # Populate storage service locations
    storage_locations_per_ss = 2
    ss_locations_to_create = ss_to_create * storage_locations_per_ss

    print(
        f"Creating {ss_locations_to_create} storage service locations (and their AIPs)..."
    )

    aip_batches_created = 0
    total_aip_batches = len(ss_ids) * storage_locations_per_ss
    for ss_id in ss_ids:
        for _ in range(storage_locations_per_ss):
            # Add location
            sl = data.create_location(ss_id)

            # Add AIPs
            aip_batches_created += 1

            print(f"Creating AIPs ({aip_batches_created}/{total_aip_batches})...")

            aipcount = 0
            for _ in range(1, randint(100, 300)):
                # Look the fetch job up by the storage service currently being
                # populated (ss_id). Using the leftover `ss` variable from the
                # loop above would attach every AIP to the last service's job.
                aip = data.create_aip(pipeline.id, ss_id, sl.id, fetch_jobs[ss_id])
                data.create_aip_files(100, 300, aip.id)
                aipcount += 1

            # Update package/AIP counts in fetch job
            fetch_job = FetchJob.query.get(fetch_jobs[ss_id])
            fetch_job.total_packages += aipcount
            fetch_job.total_aips += aipcount
            db.session.commit()

print("Done.")
Empty file added tools/helpers/__init__.py
Empty file.
118 changes: 118 additions & 0 deletions tools/helpers/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
import os
from datetime import date

from faker import Faker

from AIPscan import db
from AIPscan.models import (
AIP,
FetchJob,
File,
Pipeline,
StorageLocation,
StorageService,
)

# Shared Faker instance used by the helpers in this module to generate values.
fake = Faker()
# Shortcut to the randint function of the Faker instance's `random` attribute.
randint = fake.random.randint


def create_pipeline():
    """Create, persist and return a Pipeline filled with fake values.

    The pipeline gets a random UUID as its origin pipeline and a random URL
    as its dashboard URL, and is committed to the database before returning.
    """
    fields = {
        "origin_pipeline": fake.uuid4(),
        "dashboard_url": fake.url(),
    }
    new_pipeline = Pipeline(**fields)

    session = db.session
    session.add(new_pipeline)
    session.commit()

    return new_pipeline


def create_storage_service(default):
    """Create, persist and return a StorageService filled with fake values.

    Args:
        default: Value stored in the storage service's ``default`` field.

    Returns:
        The committed StorageService instance.
    """
    fields = {
        # Drop the last character of the generated text
        "name": fake.text(20)[:-1],
        "url": fake.url(),
        "user_name": fake.profile()["username"],
        "api_key": fake.password(),
        "download_limit": 0,
        "download_offset": 0,
        "default": default,
    }
    storage_service = StorageService(**fields)

    session = db.session
    session.add(storage_service)
    session.commit()

    return storage_service


def create_fetch_job(storage_service_id):
    """Create, persist and return an empty FetchJob for a storage service.

    All counters start at zero, both download dates are set to today and the
    download directory is a fake file path.

    Args:
        storage_service_id: ID of the storage service the job belongs to.

    Returns:
        The committed FetchJob instance.
    """
    job = FetchJob(
        total_packages=0,
        total_aips=0,
        total_deleted_aips=0,
        download_start=date.today(),
        download_end=date.today(),
        download_directory=fake.file_path(),
        storage_service_id=storage_service_id,
    )

    # These counters are assigned after construction rather than being passed
    # to the constructor.
    for counter in ("total_dips", "total_sips", "total_replicas"):
        setattr(job, counter, 0)

    session = db.session
    session.add(job)
    session.commit()

    return job


def create_location(storage_service_id):
    """Create, persist and return a StorageLocation filled with fake values.

    The location's path is the directory part of a fake file path (generated
    with depth 3) joined with a random UUID.

    Args:
        storage_service_id: ID of the storage service the location belongs to.

    Returns:
        The committed StorageLocation instance.
    """
    # Needs the module-level `import os`; without it this line raises NameError.
    current_location = os.path.join(os.path.dirname(fake.file_path(3)), fake.uuid4())

    location = StorageLocation(
        current_location=current_location,
        # Drop the last character of the generated text
        description=fake.text(20)[:-1],
        storage_service_id=storage_service_id,
    )

    db.session.add(location)
    db.session.commit()

    return location


def create_aip(pipeline_id, storage_service_id, storage_location_id, fetch_job_id):
    """Create, persist and return an AIP filled with fake values.

    Args:
        pipeline_id: Stored as the AIP's ``origin_pipeline_id``.
        storage_service_id: ID of the storage service holding the AIP.
        storage_location_id: ID of the storage location holding the AIP.
        fetch_job_id: ID of the fetch job the AIP is attributed to.

    Returns:
        The committed AIP instance.
    """
    fields = {
        "uuid": fake.uuid4(),
        # Drop the last character of the generated text
        "transfer_name": fake.text(20)[:-1],
        "create_date": date.today(),
        "mets_sha256": fake.sha256(),
        "size": randint(10000, 100_000_000),
        "storage_service_id": storage_service_id,
        "storage_location_id": storage_location_id,
        "fetch_job_id": fetch_job_id,
        "origin_pipeline_id": pipeline_id,
    }
    new_aip = AIP(**fields)

    session = db.session
    session.add(new_aip)
    session.commit()

    return new_aip


def create_aip_files(min, max, aip_id):
    """Create and persist a random number of fake files belonging to an AIP.

    Args:
        min: Smallest number of files to create (inclusive).
        max: Largest number of files to create (inclusive).
        aip_id: ID of the AIP the files are attached to.
    """
    # NOTE: `min` and `max` shadow the builtins; the names are kept because
    # they are part of this function's signature.
    #
    # range(randint(min, max)) yields between min and max files inclusive;
    # range(1, n) would yield one fewer than the number drawn.
    for _ in range(randint(min, max)):
        aipfile = File(
            aip_id=aip_id,
            name=fake.text(20)[:-1],
            filepath=fake.file_path(),
            uuid=fake.uuid4(),
            file_type="original",
            size=randint(1000, 1_000_000),
            date_created=date.today(),
            puid=fake.text(20)[:-1],
            file_format=fake.text(20)[:-1],
            format_version=fake.text(20)[:-1],
            checksum_type=fake.text(20)[:-1],
            checksum_value=fake.text(20)[:-1],
            premis_object="",
        )

        db.session.add(aipfile)

    # Commit once for the whole batch instead of once per file.
    db.session.commit()
88 changes: 88 additions & 0 deletions tools/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import datetime

import pytest

from tools.helpers import data


@pytest.fixture
def mock_db_add(mocker):
    """Patch the database session's add and commit so tests do not touch a database."""
    for target in ("AIPscan.db.session.add", "AIPscan.db.session.commit"):
        mocker.patch(target)


def test_create_storage_service(mock_db_add):
    """A generated storage service has non-empty string fields and honours `default`."""
    ss = data.create_storage_service(True)

    assert ss.name
    assert isinstance(ss.name, str)

    assert ss.url
    assert isinstance(ss.url, str)

    assert ss.user_name
    assert isinstance(ss.user_name, str)

    assert ss.api_key
    assert isinstance(ss.api_key, str)

    assert ss.default
    assert isinstance(ss.default, bool)

    # The flag passed in must be stored as given
    ss = data.create_storage_service(False)
    assert not ss.default


def test_create_fetch_job(mock_db_add):
    """A generated fetch job has dates, a download directory and the given service ID."""
    ss = data.create_storage_service(True)
    # The session is mocked, so no ID is assigned on commit; set one by hand
    ss.id = 1

    fetch_job = data.create_fetch_job(ss.id)

    assert fetch_job.download_start
    assert isinstance(fetch_job.download_start, datetime.date)

    assert fetch_job.download_end
    assert isinstance(fetch_job.download_end, datetime.date)

    assert fetch_job.download_directory
    assert isinstance(fetch_job.download_directory, str)

    assert fetch_job.storage_service_id == ss.id


def test_create_location(mock_db_add):
    """A generated location has a path, a description and the given service ID."""
    location = data.create_location(1)

    assert location.current_location
    assert isinstance(location.current_location, str)

    assert location.description
    assert isinstance(location.description, str)

    assert location.storage_service_id == 1


def test_create_aip(mock_db_add):
    """A generated AIP has populated fields and the IDs it was created with."""
    aip = data.create_aip(1, 2, 3, 4)

    assert aip.uuid
    assert isinstance(aip.uuid, str)

    assert aip.transfer_name
    assert isinstance(aip.transfer_name, str)

    assert aip.create_date
    assert isinstance(aip.create_date, datetime.date)

    assert aip.mets_sha256
    assert isinstance(aip.mets_sha256, str)

    assert aip.size
    assert isinstance(aip.size, int)

    # Arguments are (pipeline, storage service, storage location, fetch job)
    assert aip.origin_pipeline_id == 1
    assert aip.storage_service_id == 2
    assert aip.storage_location_id == 3
    assert aip.fetch_job_id == 4

0 comments on commit 57351bf

Please sign in to comment.