Skip to content

Commit

Permalink
Add test data generation tool (#217) (#219)
Browse files Browse the repository at this point in the history
Added a tool to populate AIPscan with randomly generated example data.
  • Loading branch information
mcantelon authored Oct 31, 2023
1 parent 80adfe0 commit 1657ff7
Show file tree
Hide file tree
Showing 9 changed files with 360 additions and 0 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,29 @@ Shut down the AIPscan Docker containers and remove the rabbitmq volumes:
docker-compose down --volumes
```

## Tools

The `tools` directory contains scripts that can be run by developers and system
administrators.

### Test data generator

The test data generator tool, `tools/generate-test-data`, populates
AIPscan's database with randomly generated example data.

### Running tools

These should be run using the same system user and virtual environment that
AIPscan is running under.

Here's how you would run the `generate-test-data` tool, for example:

$ cd <path to AIPscan base directory>
$ sudo -u <AIPscan system user> /bin/bash
$ source <path to AIPscan virtual environment>/bin/activate
    $ ./tools/generate-test-data


# Usage

* Ensure that the Flask Server, RabbitMQ server, and Celery worker queue are up and running.
Expand Down
1 change: 1 addition & 0 deletions requirements/test.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
-r base.txt

faker==14.2.1
flake8==5.0.4
pytest==6.2.5
pytest_cov==2.11.1
Expand Down
Empty file added tools/__init__.py
Empty file.
Empty file added tools/app/__init__.py
Empty file.
19 changes: 19 additions & 0 deletions tools/app/init.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os
import sys

from flask import Flask

# Alter path so tools can import AIPscan's modules.
# This file lives at tools/app/init.py, so three levels up from here is the
# directory containing the AIPscan package checkout — TODO confirm this
# matches the deployed directory layout.
relpath = f"{os.path.dirname(__file__)}/../../../AIPscan"
sys.path.append(os.path.abspath(relpath))

# Name of the configuration entry (in AIPscan's CONFIGS mapping) that tools use.
config_name = "default"


def create_app_instance(configuration, db):
    """Build and return a Flask app wired to the given configuration and DB.

    :param configuration: configuration object passed to
        ``app.config.from_object``.
    :param db: SQLAlchemy-style database object exposing ``init_app``.
    :returns: the configured Flask application instance.
    """
    flask_app = Flask(__name__)
    flask_app.config.from_object(configuration)
    db.init_app(flask_app)
    return flask_app
99 changes: 99 additions & 0 deletions tools/generate-test-data
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
import sys

import click
from app import init
from helpers import data

from AIPscan import db
from AIPscan.models import FetchJob
from config import CONFIGS


@click.command()
@click.option("--storage-services-to-create", default=2)
@click.option("--locations-per-storage-service", default=2)
@click.option("--locations-min-aip-count", default=10)
@click.option("--locations-max-aip-count", default=30)
@click.option("--aip-min-file-count", default=10)
@click.option("--aip-max-file-count", default=30)
@click.option("--seed", default=0)
def main(
    storage_services_to_create,
    locations_per_storage_service,
    locations_min_aip_count,
    locations_max_aip_count,
    aip_min_file_count,
    aip_max_file_count,
    seed,
):
    """Populate AIPscan's database with randomly generated example data.

    Creates (or reuses) a pipeline, then creates the requested number of
    storage services, each with its own fetch job and storage locations,
    and fills each location with fake AIPs and AIP files.
    """
    # Initialize Flask app context
    app = init.create_app_instance(CONFIGS[init.config_name], db)

    # Change seed (the default of 0 leaves Faker unseeded for varied output)
    if seed > 0:
        data.seed(seed)

    with app.app_context():
        # Add example pipeline and storage services
        print(
            f"Creating/fetching pipeline and creating {storage_services_to_create} storage services..."
        )
        pipeline = data.create_or_fetch_fake_pipeline()

        ss_ids = []
        fetch_jobs = {}  # maps storage service ID -> its fetch job ID

        for _ in range(storage_services_to_create):
            # The first storage service created is flagged as the default
            is_default = len(ss_ids) == 0

            ss = data.create_fake_storage_service(is_default)
            ss_ids.append(ss.id)

            fetch_job = data.create_fake_fetch_job(ss.id)
            fetch_jobs[ss.id] = fetch_job.id

        # Populate storage service locations
        ss_locations_to_create = (
            storage_services_to_create * locations_per_storage_service
        )

        print(
            f"Creating {ss_locations_to_create} storage service locations (and their AIPs)..."
        )

        aip_batches_created = 0
        total_aip_batches = len(ss_ids) * locations_per_storage_service
        for ss_id in ss_ids:
            for _ in range(locations_per_storage_service):
                # Add location
                sl = data.create_fake_location(ss_id)

                # Add AIPs and AIP files
                aip_batches_created += 1

                print(f"Creating AIPs ({aip_batches_created}/{total_aip_batches})...")

                aipcount = 0
                # Fix: start range() at 0 so exactly randint(min, max) AIPs
                # are created; the old range(1, n) form created one fewer
                # than requested (possibly zero).
                for _ in range(
                    data.randint(locations_min_aip_count, locations_max_aip_count)
                ):
                    # Fix: key fetch_jobs by ss_id (the service currently
                    # being populated); the old fetch_jobs[ss.id] always
                    # referenced the *last* service created above, so every
                    # AIP was attached to the wrong fetch job.
                    aip = data.create_fake_aip(
                        pipeline.id, ss_id, sl.id, fetch_jobs[ss_id]
                    )
                    data.create_fake_aip_files(
                        aip_min_file_count, aip_max_file_count, aip.id
                    )
                    aipcount += 1

                # Update package/AIP counts in fetch job
                fetch_job = FetchJob.query.get(fetch_jobs[ss_id])
                fetch_job.total_packages += aipcount
                fetch_job.total_aips += aipcount
                db.session.commit()

    print("Done.")


if __name__ == "__main__":
    sys.exit(main())
Empty file added tools/helpers/__init__.py
Empty file.
130 changes: 130 additions & 0 deletions tools/helpers/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import os
from datetime import date

from faker import Faker

from AIPscan import db
from AIPscan.models import (
AIP,
FetchJob,
File,
Pipeline,
StorageLocation,
StorageService,
)

# Initialize Faker instance
fake = Faker()


def seed(seed):
    """Seed the shared Faker instance so generated data is reproducible."""
    fake.seed_instance(seed)


def randint(start, end):
    """Return a random integer N with start <= N <= end, from Faker's RNG.

    Using Faker's RNG (rather than the ``random`` module) keeps results
    reproducible under ``seed()``.
    """
    return fake.random.randint(start, end)


def create_or_fetch_fake_pipeline():
    """Return the first Pipeline in the database, creating a fake one if absent."""
    existing = db.session.query(Pipeline).first()
    if existing is not None:
        return existing

    new_pipeline = Pipeline(origin_pipeline=fake.uuid4(), dashboard_url=fake.url())
    db.session.add(new_pipeline)
    db.session.commit()

    return new_pipeline


def create_fake_storage_service(default):
    """Create, persist, and return a StorageService with fake attributes.

    :param default: whether this storage service is flagged as the default.
    :returns: the committed StorageService instance.
    """
    # fake.text(20) ends with a period; slice it off for a cleaner name
    storage_service = StorageService(
        name=fake.text(20)[:-1],
        url=fake.url(),
        user_name=fake.profile()["username"],
        api_key=fake.password(),
        download_limit=0,
        download_offset=0,
        default=default,
    )

    db.session.add(storage_service)
    db.session.commit()

    return storage_service


def create_fake_fetch_job(storage_service_id):
    """Create, persist, and return a FetchJob with zeroed counters.

    :param storage_service_id: ID of the storage service the job belongs to.
    :returns: the committed FetchJob instance.
    """
    job = FetchJob(
        total_packages=0,
        total_aips=0,
        total_deleted_aips=0,
        download_start=date.today(),
        download_end=date.today(),
        download_directory=fake.file_path(),
        storage_service_id=storage_service_id,
    )

    # Remaining counters are assigned as attributes after construction,
    # mirroring how the model is populated elsewhere — TODO confirm these
    # are not accepted as constructor keywords.
    job.total_dips = 0
    job.total_sips = 0
    job.total_replicas = 0

    db.session.add(job)
    db.session.commit()

    return job


def create_fake_location(storage_service_id):
    """Create, persist, and return a StorageLocation with fake attributes.

    :param storage_service_id: ID of the owning storage service.
    :returns: the committed StorageLocation instance.
    """
    # Build a fake path: the directory part of a random 3-deep file path,
    # with a UUID as the final component.
    fake_dir = os.path.dirname(fake.file_path(3))
    location_path = os.path.join(fake_dir, fake.uuid4())

    storage_location = StorageLocation(
        current_location=location_path,
        description=fake.text(20)[:-1],
        storage_service_id=storage_service_id,
    )

    db.session.add(storage_location)
    db.session.commit()

    return storage_location


def create_fake_aip(pipeline_id, storage_service_id, storage_location_id, fetch_job_id):
    """Create, persist, and return an AIP with fake metadata.

    :param pipeline_id: ID of the originating pipeline.
    :param storage_service_id: ID of the storage service holding the AIP.
    :param storage_location_id: ID of the storage location holding the AIP.
    :param fetch_job_id: ID of the fetch job the AIP was "fetched" by.
    :returns: the committed AIP instance.
    """
    attributes = dict(
        uuid=fake.uuid4(),
        transfer_name=fake.text(20)[:-1],
        create_date=date.today(),
        mets_sha256=fake.sha256(),
        size=randint(10000, 100_000_000),
        storage_service_id=storage_service_id,
        storage_location_id=storage_location_id,
        fetch_job_id=fetch_job_id,
        origin_pipeline_id=pipeline_id,
    )
    aip = AIP(**attributes)

    db.session.add(aip)
    db.session.commit()

    return aip


def create_fake_aip_files(min, max, aip_id):
    """Create and persist between ``min`` and ``max`` fake Files for an AIP.

    NOTE(review): the parameter names shadow the ``min``/``max`` builtins;
    they are kept unchanged so any keyword-argument callers keep working.

    :param min: minimum number of files to create (inclusive).
    :param max: maximum number of files to create (inclusive).
    :param aip_id: ID of the AIP the files belong to.
    """
    # Fix: range(randint(...)) yields exactly N iterations; the previous
    # range(1, randint(...)) form created one file fewer than requested
    # (and could create none at all), violating the min-count contract.
    for _ in range(randint(min, max)):
        aipfile = File(
            aip_id=aip_id,
            name=fake.text(20)[:-1],
            filepath=fake.file_path(),
            uuid=fake.uuid4(),
            file_type="original",
            size=randint(1000, 1_000_000),
            date_created=date.today(),
            puid=fake.text(20)[:-1],
            file_format=fake.text(20)[:-1],
            format_version=fake.text(20)[:-1],
            checksum_type=fake.text(20)[:-1],
            checksum_value=fake.text(20)[:-1],
            premis_object="",
        )

        db.session.add(aipfile)
        db.session.commit()
88 changes: 88 additions & 0 deletions tools/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import datetime

import pytest

from tools.helpers import data


@pytest.fixture
def mock_db_add(mocker):
    # Stub out SQLAlchemy session writes so the data helpers can run
    # without a real database connection.
    mocker.patch("AIPscan.db.session.add")
    mocker.patch("AIPscan.db.session.commit")


def test_create_fake_storage_service(mock_db_add):
    """Generated storage services have populated, correctly typed fields."""
    ss = data.create_fake_storage_service(True)

    # Each string attribute must be a non-empty str
    for attribute in ("name", "url", "user_name", "api_key"):
        value = getattr(ss, attribute)
        assert value
        assert type(value) is str

    assert ss.default
    assert type(ss.default) is bool

    # The default flag must be honoured when False as well
    assert not data.create_fake_storage_service(False).default


def test_create_fake_fetch_job(mock_db_add):
    """Generated fetch jobs have date fields, a directory, and the right FK."""
    ss = data.create_fake_storage_service(True)
    ss.id = 1

    fetch_job = data.create_fake_fetch_job(ss.id)

    # Both download timestamps must be populated dates
    for attribute in ("download_start", "download_end"):
        value = getattr(fetch_job, attribute)
        assert value
        assert type(value) is datetime.date

    assert fetch_job.download_directory
    assert type(fetch_job.download_directory) is str

    assert fetch_job.storage_service_id == ss.id


def test_create_fake_location(mock_db_add):
    """Generated locations have a path, a description, and the right FK."""
    location = data.create_fake_location(1)

    for attribute in ("current_location", "description"):
        value = getattr(location, attribute)
        assert value
        assert type(value) is str

    assert location.storage_service_id == 1


def test_create_fake_aip(mock_db_add):
    """Generated AIPs have fake metadata and carry the foreign keys passed in."""
    aip = data.create_fake_aip(1, 2, 3, 4)

    assert aip.uuid
    assert type(aip.uuid) is str

    assert aip.transfer_name
    assert type(aip.transfer_name) is str

    assert aip.create_date
    assert type(aip.create_date) is datetime.date

    assert aip.mets_sha256
    assert type(aip.mets_sha256) is str

    assert aip.size
    assert type(aip.size) is int

    # Foreign keys round-trip exactly as passed (a duplicated
    # origin_pipeline_id assertion was removed here).
    assert aip.origin_pipeline_id == 1
    assert aip.storage_service_id == 2
    assert aip.storage_location_id == 3
    assert aip.fetch_job_id == 4

0 comments on commit 1657ff7

Please sign in to comment.