From 0fc51706ae406013f0df3a593d5396d2a19d869c Mon Sep 17 00:00:00 2001 From: Anthony Miyaguchi Date: Fri, 26 Apr 2019 16:23:10 -0700 Subject: [PATCH 1/4] Create documentation on testing BigQuery from ingestion-beam. --- docs/architecture/diagram.svg | 12 ++--- docs/diagrams/workflow.mmd | 27 ++++++++++ docs/diagrams/workflow.svg | 4 ++ docs/ingestion_testing_workflow.md | 85 ++++++++++++++++++++++++++++++ ingestion-beam/README.md | 3 ++ 5 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 docs/diagrams/workflow.mmd create mode 100644 docs/diagrams/workflow.svg create mode 100644 docs/ingestion_testing_workflow.md diff --git a/docs/architecture/diagram.svg b/docs/architecture/diagram.svg index 4006fd861..1a48990bb 100644 --- a/docs/architecture/diagram.svg +++ b/docs/architecture/diagram.svg @@ -1,8 +1,8 @@ -
Colors
Dataflow jobs are green
Kubernetes services are magenta
Producers are orange
PubSub topics are cyan
Google Cloud services are purple
Producers
Ingestion Edge
Raw Topics
Landfill Sink
Cloud Storage
Decoder
Cloud Memorystore
Decoded Topics
BigQuery Sink
BigQuery
Dataset Sink
Cloud Storage
Republisher
Per DocType Topics
Monitoring Sample Topics
\ No newline at end of file diff --git a/docs/diagrams/workflow.mmd b/docs/diagrams/workflow.mmd new file mode 100644 index 000000000..1bdb52249 --- /dev/null +++ b/docs/diagrams/workflow.mmd @@ -0,0 +1,27 @@ +graph TD + +subgraph dataops/sandbox/my-project + dataflow + bigquery + pubsub + subscription + + pubsub --> |gcloud pubsub subscriptions create| subscription + subscription --> dataflow + dataflow --> bigquery +end + +subgraph mozilla-pipeline-schemas + mps[repository archive] +end + +subgraph ingestion-beam + src[src/] + schemas[schemas.tar.gz] + bq-schemas[bq-schemas/] + + src --> |mvn compile exec:java| dataflow + mps --> |download-schemas| schemas + schemas --> |generate-bq-schemas| bq-schemas + bq-schemas --> |update-bq-table| bigquery +end diff --git a/docs/diagrams/workflow.svg b/docs/diagrams/workflow.svg new file mode 100644 index 000000000..eb17819d9 --- /dev/null +++ b/docs/diagrams/workflow.svg @@ -0,0 +1,4 @@ +
ingestion-beam
mozilla-pipeline-schemas
dataops/sandbox/my-project
gcloud pubsub subscriptions create
mvn compile exec:java
download-schemas
generate-bq-schemas
update-bq-table
src/
schemas.tar.gz
bq-schemas/
dataflow
repository archive
bigquery
pubsub
subscription
\ No newline at end of file diff --git a/docs/ingestion_testing_workflow.md b/docs/ingestion_testing_workflow.md new file mode 100644 index 000000000..e96be0934 --- /dev/null +++ b/docs/ingestion_testing_workflow.md @@ -0,0 +1,85 @@ + + + + +- [Ingestion Testing Workflow](#ingestion-testing-workflow) + - [Setting up the GCS project](#setting-up-the-gcs-project) + - [Bootstrapping schemas from `mozilla-pipeline-schemas`](#bootstrapping-schemas-from-mozilla-pipeline-schemas) + - [Building the project](#building-the-project) + + + +# Ingestion Testing Workflow + +The ingestion-beam handles dataflow of documents from the edge into various +sinks. You may be interested in standing up a small testing instance to validate +the integration of the various components. + +![diagrams/workflow.mmd](diagrams/workflow.svg) +__Figure__: _An overview of the various components necessary to query BigQuery +against data from a pubsub subscription._ + +## Setting up the GCS project + +Read through [`whd/gcp-quickstart`](https://github.com/whd/gcp-quickstart) for details +about the sandboxing environment that is provided by data operations. + +* Install the [Google Cloud SDK](https://cloud.google.com/sdk/) +* Navigate to the [Google Cloud Console](https://console.cloud.google.com/) +* Create a new project under `firefox.gcp.mozilla.com/dataops/sandbox` + - `gcloud config set project <PROJECT>` +* Create a PubSub subscription (see `gcp-quickstart/pubsub.sh`) +* Create a GCS bucket + - `gsutil mb gs://<PROJECT>` +* Enable the [DataFlow API](https://console.cloud.google.com/marketplace/details/google/dataflow.googleapis.com) +* Create a service account and store the key locally + + +## Bootstrapping schemas from `mozilla-pipeline-schemas` + +* Download the latest schemas from `mozilla-pipeline-schemas` using `bin/download-schemas`. + - This script may also inject testing resources into the resulting archive. + - A `schemas.tar.gz` will appear at the project root. 
+* Generate BigQuery schemas using `bin/generate-bq-schemas`. + - Schemas will be written to `bq-schemas/`. + ``` + bq-schemas/ + ├── activity-stream.impression-stats.1.bigquery.json + ├── coverage.coverage.1.bigquery.json + ├── edge-validator.error-report.1.bigquery.json + ├── eng-workflow.bmobugs.1.bigquery.json + .... + ``` +* Update the BigQuery table in the current project using `bin/update-bq-table`. + - This may take several minutes. Read the script for usage information. +* Verify that tables have been updated by viewing the BigQuery console. + + +## Building the project + +Follow the instructions of the project README. Here is a quick-reference for running a job from a set of files in GCS. + +```bash +export GOOGLE_APPLICATION_CREDENTIALS=keys.json +PROJECT=$(gcloud config get-value project) +BUCKET="gs://$PROJECT" + +path="$BUCKET/data/*.ndjson" +mvn compile exec:java -Dexec.args="\ + --runner=Dataflow \ + --project=$PROJECT \ + --autoscalingAlgorithm=NONE \ + --workerMachineType=n1-standard-1 \ + --numWorkers=1 \ + --gcpTempLocation=$BUCKET/tmp \ + --inputFileFormat=json \ + --inputType=file \ + --input=$path \ + --outputType=bigquery \ + --output=$PROJECT:\${document_namespace}.\${document_type}_v\${document_version} \ + --bqWriteMethod=file_loads \ + --tempLocation=$BUCKET/temp/bq-loads \ + --errorOutputType=file \ + --errorOutput=$BUCKET/error/ \ +" +``` diff --git a/ingestion-beam/README.md b/ingestion-beam/README.md index 2cf153b48..7990ac7c1 100644 --- a/ingestion-beam/README.md +++ b/ingestion-beam/README.md @@ -562,6 +562,9 @@ use the `bin/mvn` executable to run maven in docker: ./bin/mvn clean test ``` +To run the project in a sandbox against production data, see this document on +[configuring an integration testing workflow](../docs/ingestion_testing_workflow.md). 
+ # License This Source Code Form is subject to the terms of the Mozilla Public From 6614dfcce3bb5a33423c33501ef414d2165ef70b Mon Sep 17 00:00:00 2001 From: Anthony Miyaguchi Date: Mon, 29 Apr 2019 10:44:45 -0700 Subject: [PATCH 2/4] Add a mermaid config file to the docs directory for overflow --- bin/update-diagrams | 1 + docs/architecture/diagram.svg | 13 +++++++------ docs/diagrams/workflow.mmd | 4 ++-- docs/diagrams/workflow.svg | 5 +++-- docs/mermaid-config.json | 3 +++ 5 files changed, 16 insertions(+), 10 deletions(-) create mode 100644 docs/mermaid-config.json diff --git a/bin/update-diagrams b/bin/update-diagrams index b8fbd9502..76f0928ed 100755 --- a/bin/update-diagrams +++ b/bin/update-diagrams @@ -12,5 +12,6 @@ for f in $(find . -name "*.mmd"); do --volume $PWD:/root/project \ --workdir /root/project \ $IMAGE \ + -c docs/mermaid-config.json \ -i ${f} -o ${f/.mmd/.svg} done diff --git a/docs/architecture/diagram.svg b/docs/architecture/diagram.svg index 1a48990bb..99afe2c38 100644 --- a/docs/architecture/diagram.svg +++ b/docs/architecture/diagram.svg @@ -1,8 +1,9 @@ -
Colors
Dataflow jobs are green
Kubernetes services are magenta
Producers are orange
PubSub topics are cyan
Google Cloud services are purple
Producers
Ingestion Edge
Raw Topics
Landfill Sink
Cloud Storage
Decoder
Cloud Memorystore
Decoded Topics
BigQuery Sink
BigQuery
Dataset Sink
Cloud Storage
Republisher
Per DocType Topics
Monitoring Sample Topics
\ No newline at end of file diff --git a/docs/diagrams/workflow.mmd b/docs/diagrams/workflow.mmd index 1bdb52249..c2be14393 100644 --- a/docs/diagrams/workflow.mmd +++ b/docs/diagrams/workflow.mmd @@ -6,7 +6,7 @@ subgraph dataops/sandbox/my-project pubsub subscription - pubsub --> |gcloud pubsub subscriptions create| subscription + pubsub --> |gcloud pubsub subscriptions| subscription subscription --> dataflow dataflow --> bigquery end @@ -20,7 +20,7 @@ subgraph ingestion-beam schemas[schemas.tar.gz] bq-schemas[bq-schemas/] - src --> |mvn compile exec:java| dataflow + src --> |mvn compile| dataflow mps --> |download-schemas| schemas schemas --> |generate-bq-schemas| bq-schemas bq-schemas --> |update-bq-table| bigquery diff --git a/docs/diagrams/workflow.svg b/docs/diagrams/workflow.svg index eb17819d9..01c92da1f 100644 --- a/docs/diagrams/workflow.svg +++ b/docs/diagrams/workflow.svg @@ -1,4 +1,5 @@ -
ingestion-beam
mozilla-pipeline-schemas
dataops/sandbox/my-project
gcloud pubsub subscriptions
mvn compile
download-schemas
generate-bq-schemas
update-bq-table
src/
schemas.tar.gz
bq-schemas/
dataflow
repository archive
bigquery
pubsub
subscription
\ No newline at end of file diff --git a/docs/mermaid-config.json b/docs/mermaid-config.json new file mode 100644 index 000000000..50fe1ba70 --- /dev/null +++ b/docs/mermaid-config.json @@ -0,0 +1,3 @@ +{ + "themeCSS": ".label foreignObject { overflow: visible; }" +} \ No newline at end of file From 6a17c2f80178d784891d452596d1e864962dbd8c Mon Sep 17 00:00:00 2001 From: Anthony Miyaguchi Date: Mon, 29 Apr 2019 10:57:38 -0700 Subject: [PATCH 3/4] Fix spelling mistakes --- .spelling | 3 +++ docs/ingestion_testing_workflow.md | 14 +++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.spelling b/.spelling index 2a3f20276..1b328c866 100644 --- a/.spelling +++ b/.spelling @@ -29,6 +29,7 @@ failsafe featureful filesystem GCP +GCS GeoIP GKE GroupByKey @@ -58,11 +59,13 @@ protobuf PubSub PubsubMessage Q4 +readme Redis Republisher runtime S3 schemas +SDK sharding SQLite stderr diff --git a/docs/ingestion_testing_workflow.md b/docs/ingestion_testing_workflow.md index e96be0934..bba0ca04a 100644 --- a/docs/ingestion_testing_workflow.md +++ b/docs/ingestion_testing_workflow.md @@ -11,18 +11,18 @@ # Ingestion Testing Workflow -The ingestion-beam handles dataflow of documents from the edge into various +The ingestion-beam handles data flow of documents from the edge into various sinks. You may be interested in standing up a small testing instance to validate the integration of the various components. ![diagrams/workflow.mmd](diagrams/workflow.svg) __Figure__: _An overview of the various components necessary to query BigQuery -against data from a pubsub subscription._ +against data from a PubSub subscription._ ## Setting up the GCS project Read through [`whd/gcp-quickstart`](https://github.com/whd/gcp-quickstart) for details -about the sandboxing environment that is provided by data operations. +about the sandbox environment that is provided by data operations. 
* Install the [Google Cloud SDK](https://cloud.google.com/sdk/) * Navigate to the [Google Cloud Console](https://console.cloud.google.com/) @@ -31,17 +31,17 @@ about the sandboxing environment that is provided by data operations. * Create a PubSub subscription (see `gcp-quickstart/pubsub.sh`) * Create a GCS bucket - `gsutil mb gs://<PROJECT>` -* Enable the [DataFlow API](https://console.cloud.google.com/marketplace/details/google/dataflow.googleapis.com) +* Enable the [Dataflow API](https://console.cloud.google.com/marketplace/details/google/dataflow.googleapis.com) * Create a service account and store the key locally ## Bootstrapping schemas from `mozilla-pipeline-schemas` -* Download the latest schemas from `mozilla-pipeline-schemas` using `bin/download-schemas`. +* Download the latest schemas from `mozilla-pipeline-schemas` using `bin/download-schemas`. - This script may also inject testing resources into the resulting archive. - A `schemas.tar.gz` will appear at the project root. * Generate BigQuery schemas using `bin/generate-bq-schemas`. - - Schemas will be written to `bq-schemas/`. + - Schemas will be written to `bq-schemas/`. ``` bq-schemas/ ├── activity-stream.impression-stats.1.bigquery.json @@ -57,7 +57,7 @@ about the sandboxing environment that is provided by data operations. ## Building the project -Follow the instructions of the project README. Here is a quick-reference for running a job from a set of files in GCS. +Follow the instructions of the project readme. Here is a quick-reference for running a job from a set of files in GCS. 
```bash export GOOGLE_APPLICATION_CREDENTIALS=keys.json From 77a776d42d254b50316c6bdf50068904766c2691 Mon Sep 17 00:00:00 2001 From: Anthony Miyaguchi Date: Mon, 29 Apr 2019 11:02:17 -0700 Subject: [PATCH 4/4] Move mermaid config to top-level .mermaid --- docs/mermaid-config.json => .mermaid | 2 +- bin/update-diagrams | 2 +- docs/architecture/diagram.svg | 14 +++++++------- docs/diagrams/workflow.svg | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) rename docs/mermaid-config.json => .mermaid (96%) diff --git a/docs/mermaid-config.json b/.mermaid similarity index 96% rename from docs/mermaid-config.json rename to .mermaid index 50fe1ba70..cb9fc3580 100644 --- a/docs/mermaid-config.json +++ b/.mermaid @@ -1,3 +1,3 @@ { "themeCSS": ".label foreignObject { overflow: visible; }" -} \ No newline at end of file +} diff --git a/bin/update-diagrams b/bin/update-diagrams index 76f0928ed..412e54304 100755 --- a/bin/update-diagrams +++ b/bin/update-diagrams @@ -12,6 +12,6 @@ for f in $(find . -name "*.mmd"); do --volume $PWD:/root/project \ --workdir /root/project \ $IMAGE \ - -c docs/mermaid-config.json \ + -c .mermaid \ -i ${f} -o ${f/.mmd/.svg} done diff --git a/docs/architecture/diagram.svg b/docs/architecture/diagram.svg index 99afe2c38..6c058d444 100644 --- a/docs/architecture/diagram.svg +++ b/docs/architecture/diagram.svg @@ -1,9 +1,9 @@ -
Colors
Dataflow jobs are green
Kubernetes services are magenta
Producers are orange
PubSub topics are cyan
Google Cloud services are purple
Producers
Ingestion Edge
Raw Topics
Landfill Sink
Cloud Storage
Decoder
Cloud Memorystore
Decoded Topics
BigQuery Sink
BigQuery
Dataset Sink
Cloud Storage
Republisher
Per DocType Topics
Monitoring Sample Topics
\ No newline at end of file diff --git a/docs/diagrams/workflow.svg b/docs/diagrams/workflow.svg index 01c92da1f..5dc79fb9b 100644 --- a/docs/diagrams/workflow.svg +++ b/docs/diagrams/workflow.svg @@ -1,5 +1,5 @@ -
ingestion-beam
mozilla-pipeline-schemas
dataops/sandbox/my-project
gcloud pubsub subscriptions
mvn compile
download-schemas
generate-bq-schemas
update-bq-table
src/
schemas.tar.gz
bq-schemas/
dataflow
repository archive
bigquery
pubsub
subscription
\ No newline at end of file