diff --git a/data-serving/scripts/prune-uploads/hooks/aggregate.py b/data-serving/scripts/prune-uploads/hooks/aggregate.py index 7df652397..ffefdb41f 100644 --- a/data-serving/scripts/prune-uploads/hooks/aggregate.py +++ b/data-serving/scripts/prune-uploads/hooks/aggregate.py @@ -8,8 +8,8 @@ AWS_REGION = os.getenv("GDH_AGGREGATE_AWS_REGION", "eu-central-1") # Job definition names are of the form PREFIX- -PREFIX = "gdh-map-aggregation" -JOB_QUEUE = "gdh-map-aggregation" +PREFIX = os.getenv("JOB_DEF_PREFIX", "gdh-map-aggregation") +JOB_QUEUE = os.getenv("AGG_JOB_QUEUE", "gdh-map-aggregation-fargate") def run(sources: list[dict[str, Any]], env: str, dry_run: bool = False): diff --git a/data-serving/scripts/prune-uploads/hooks/country_export.py b/data-serving/scripts/prune-uploads/hooks/country_export.py index 4043e7546..7a5f8703d 100644 --- a/data-serving/scripts/prune-uploads/hooks/country_export.py +++ b/data-serving/scripts/prune-uploads/hooks/country_export.py @@ -3,6 +3,7 @@ from functools import cache import logging +import os from typing import Any import unicodedata @@ -10,6 +11,9 @@ import pycountry +JOB_QUEUE = os.getenv("EXP_JOB_QUEUE", "export-queue") + + # We do not always use the pycountry names, here's a list of exceptions _QUIRKS = { "DEMOCRATIC REPUBLIC OF THE CONGO": "CD", @@ -84,7 +88,7 @@ def run(sources: list[dict[str, Any]], env: str, dry_run: bool = False): logging.info(f"Submitting job for {jobdef} ...") if not dry_run: batch.submit_job( - jobName=jobdef, jobDefinition=jobdef, jobQueue="export-queue" + jobName=jobdef, jobDefinition=jobdef, jobQueue=JOB_QUEUE ) except Exception as e: logging.exception(f"Error occurred while trying to submit {jobdef}") diff --git a/docs/data_landscape.md b/docs/data_landscape.md index ca64903c3..ef0078c6c 100644 --- a/docs/data_landscape.md +++ b/docs/data_landscape.md @@ -6,68 +6,22 @@ What we have, and where it's stored. 
This is organised by somewhat-physical, som MongoDB Atlas stores line list case data (including revision history), user records, data ingestion source records including ingestion histories, maps of Mapbox administrative area codes to names and front-end session tokens for the line list portal. There are two projects: - 1. Covid19Map-Dev has one cluster, cluster-0, which is hosted in AWS us-east-1. It holds development data which is mostly based on historical snapshots of production. - 2. Covid19Map-Prod has one cluster, covid19-map-cluster01, also hosted in AWS us-east-1. It holds production data. + 1. Covid19Map-Dev has one cluster, cluster-0, which is hosted in AWS eu-central-1. It holds development data which is mostly based on historical snapshots of production. + 2. Covid19Map-Prod has one cluster, covid19-map-cluster01, also hosted in AWS eu-central-1. It holds production data. ## S3 stores -Various buckets (data containers) are used for both temporary and long-term storage of G.h data. Unless otherwise noted, all S3 buckets are in eu-central-1. - -### Unknown use - -* config-bucket-612888738066 (contains logs relating to secrets management of the AWS Lambda infrastructure. This doesn't only relate to the old ADI implementation, so check whether this is still needed. In us-east-2) -* dev-vocviz-sample (old map code, probably not required, in us-east-2) -* ncov19 (us-east-1) - -### Aggregates - -Aggregated data from the line list used by the map visualisation. 
- -* covid-19-aggregates -* covid-19-aggregates-dev - -### Export - -Country specific (country-) and full (data-) export files in various formats - -* covid-19-country-export -* covid-19-country-export-dev -* covid-19-data-export -* covid-19-data-export-dev - -### Map - -Map is a static site exported to an S3 bucket - -* dev-covid-19.global.health (only one of dev/dev-map is used, in us-east-2) -* dev-map.covid-19.global.health -* map.covid-19.global.health -* dev-react-map.covid-19.global.health (us-east-2, should move to dev-map) -* react-map.covid-19.global.health (should move to map.covid-19.global.health) -* qa-covid-19.global.health - -### Ingestion - -* gdh-credentials (used to authenticate against backend, should move to API keys) -* gdh-sources (raw files downloaded from source URLs, was epid-ingestion-raw) - -### Miscellaneous - -* gdh-terraform-state-main (terraform state for our stack) -* gdh-metrics (telemetry on UI and Map) -* h1n1.global.health (us-east-2, H1N1 map) +Various buckets (data containers) are used for both temporary and long-term storage of G.h data. All S3 buckets except the one storing Terraform state are in eu-central-1. ## Application logs -All of the "backend" components log to CloudWatch log streams in us-east-1 with no automatic rotation or expiration. +All of the "backend" components log to CloudWatch log streams in eu-central-1 with no automatic rotation or expiration. ## Computing servers -The kubernetes cluster (i.e. the backend services for the line list app) runs on four EC2 instances in us-east-1. No application data is stored here. - -Ingestion and export both run on AWS Batch "serverless" architecture, both in us-east-1. No application data is stored here. +The Kubernetes cluster (i.e. the backend services for the line list app) runs on Fargate in eu-central-1. No application data is stored here. -Data export has until recently run on AWS Lambda, again no application data is stored here. 
This is on its way out but mentioned for completeness. +Ingestion and export both run on AWS Batch in eu-central-1. No application data is stored here. ## Anything else? diff --git a/ingestion/monitoring/completeness.py b/ingestion/monitoring/completeness.py index ae018c669..c9f951870 100644 --- a/ingestion/monitoring/completeness.py +++ b/ingestion/monitoring/completeness.py @@ -114,8 +114,8 @@ def setup_logger(): setup_logger() endpoint_url = os.getenv("ENDPOINT_URL") objects = data_files( - os.getenv("COUNTRY_EXPORT_BUCKET", "covid-19-country-export"), + os.getenv("COUNTRY_EXPORT_BUCKET", "covid-19-country-export-eu"), endpoint_url=endpoint_url, ) data = completeness_s3_many(objects, endpoint_url) - upload(data, os.getenv("METRICS_BUCKET", "covid-19-aggregates"), endpoint_url) + upload(data, os.getenv("METRICS_BUCKET", "covid-19-aggregates-eu"), endpoint_url) diff --git a/ingestion/monitoring/daily_metrics.py b/ingestion/monitoring/daily_metrics.py index a4d64b512..0854c64c9 100644 --- a/ingestion/monitoring/daily_metrics.py +++ b/ingestion/monitoring/daily_metrics.py @@ -11,7 +11,7 @@ import boto3 -BUCKET = "covid-19-aggregates" +BUCKET = "covid-19-aggregates-eu" WEBHOOK_URL = os.environ.get("SLACK_WEBHOOK_METRICS_URL", None) logger = logging.getLogger(__name__) diff --git a/ingestion/monitoring/freshness.py b/ingestion/monitoring/freshness.py index 3e629d799..bb2b8d7ca 100644 --- a/ingestion/monitoring/freshness.py +++ b/ingestion/monitoring/freshness.py @@ -91,7 +91,7 @@ def setup_logger(): setup_logger() if not (api_key := os.getenv("GDH_API_KEY")): raise ValueError("Set GDH_API_KEY to your Global.health API key") - bucket = os.getenv("BUCKET", "covid-19-aggregates") + bucket = os.getenv("BUCKET", "covid-19-aggregates-eu") s3_endpoint = os.getenv("S3_ENDPOINT") instance = os.getenv("GDH_URL", DEFAULT_INSTANCE) if sources := fetch_sources(api_key, instance): diff --git a/verification/curator-service/ui/cypress/integration/components/BulkCaseForm.spec.ts 
b/verification/curator-service/ui/cypress/integration/components/BulkCaseForm.spec.ts index 90230cc2b..a45a7ac6a 100644 --- a/verification/curator-service/ui/cypress/integration/components/BulkCaseForm.spec.ts +++ b/verification/curator-service/ui/cypress/integration/components/BulkCaseForm.spec.ts @@ -54,7 +54,6 @@ describe('Bulk upload form', function () { // Case data cy.contains('www.bulksource.com'); - cy.contains('sourceEntryId'); cy.contains('superuser@test.com'); cy.contains('Data upload IDs') .parent() diff --git a/verification/curator-service/ui/cypress/integration/components/Curator.spec.ts b/verification/curator-service/ui/cypress/integration/components/Curator.spec.ts index 48f7eb15f..6221f3d1b 100644 --- a/verification/curator-service/ui/cypress/integration/components/Curator.spec.ts +++ b/verification/curator-service/ui/cypress/integration/components/Curator.spec.ts @@ -200,10 +200,6 @@ describe('Curator', function () { 'www.example.com', ); }); - cy.get('input[name="caseReference.sourceEntryId"]').should( - 'have.value', - 'testSourceEntryID123', - ); // Demographics. cy.get('input[name="gender"]').should('have.value', 'Female'); @@ -308,7 +304,6 @@ describe('Curator', function () { cy.contains('td', 'www.example.com').click({ force: true }); // Case data. cy.contains('www.example.com'); - cy.contains('testSourceEntryID123'); cy.contains('superuser@test.com'); cy.contains('VERIFIED'); // Demographics.