diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9c1987132..de2f8f8c1 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -9,14 +9,23 @@ jobs:
         name: Spell Check
         command: mdspell --ignore-numbers --en-us --report '**/*.md'
-  doctoc:
+  docs:
    docker:
-      - image: node:8.10.0
+      - image: circleci/python:3.7
    steps:
-      - checkout
-      - run:
-          name: Ensure markdown tables of contents are up to date
-          command: ./.circleci/doctoc-check.sh
+      - checkout
+      - run:
+          name: Install dependencies
+          command: sudo pip install mkdocs markdown-include
+      - add_ssh_keys:
+          fingerprints:
+            - "84:b0:66:dd:ec:68:b1:45:9d:5d:66:fd:4a:4f:1b:57"
+      - run:
+          name: Build and deploy docs
+          command: |
+            if [ "$CIRCLE_BRANCH" == "master" ]; then
+              mkdocs gh-deploy
+            fi

  ingestion-edge: &ingestion-edge
    working_directory: /root/project/ingestion-edge
@@ -173,7 +182,10 @@ workflows:
  build:
    jobs:
      - spelling
-      - doctoc
+      - docs:
+          filters:
+            tags:
+              only: /.*/
      - ingestion-edge
      - ingestion-edge-release:
          filters:
diff --git a/.circleci/doctoc-check.sh b/.circleci/doctoc-check.sh
deleted file mode 100755
index 28f164bdf..000000000
--- a/.circleci/doctoc-check.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-set -e
-
-bash "$(dirname $0)/doctoc-run.sh"
-
-# Exit with success code if doctoc modified no files.
-git diff --name-only | grep '.md$' || exit 0
-
-# Print instructions and fail this test.
-echo "Some markdown files have outdated Table of Contents!"
-echo "To fix, run ./bin/update-toc"
-exit 1
diff --git a/.circleci/doctoc-run.sh b/.circleci/doctoc-run.sh
deleted file mode 100755
index 44198af53..000000000
--- a/.circleci/doctoc-run.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-# Run doctoc to update tables of contents in markdown files.
-# https://www.npmjs.com/package/doctoc
-
-set -e
-
-npm install -g --silent doctoc
-doctoc . --notitle
diff --git a/.spelling b/.spelling
index 0b5adfeed..561b19dda 100644
--- a/.spelling
+++ b/.spelling
@@ -16,6 +16,7 @@ CircleCI
 CLI
 cron
 Dataflow
+datapipeline
 dataset
 deduplicate
 deduplication
@@ -28,6 +29,8 @@ encodings
 failsafe
 featureful
 filesystem
+fx-metrics
+gcp-ingestion
 GCP
 GCS
 GeoIP
@@ -41,6 +44,7 @@ HTTPS
 hyperloglog
 IAM
 IPs
+irc.mozilla.org
 Javadoc
 JSON
 JVM
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index f5464bccd..f0ef43f39 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -1,12 +1,3 @@
-
-
-
-
-- [Community Participation Guidelines](#community-participation-guidelines)
-  - [How to Report](#how-to-report)
-
-
-
 # Community Participation Guidelines

 This repository is governed by Mozilla's code of conduct and etiquette guidelines.
diff --git a/README.md b/README.md
index ff84b0cfd..1c87cf68b 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,16 @@
 # Telemetry Ingestion on Google Cloud Platform

-
-
-
+[![CircleCI](https://circleci.com/gh/mozilla/gcp-ingestion.svg?style=svg&circle-token=d98a470269580907d5c6d74d0e67612834a21be7)](https://circleci.com/gh/mozilla/gcp-ingestion)

 A monorepo for documentation and implementation of the Mozilla telemetry
 ingestion system deployed to Google Cloud Platform (GCP).

-The overall architecture is described in [docs/architecture](docs/architecture)
-along with commentary on design decisions.
-Individual components are specified under [docs](docs) and implemented -under the various `ingestion-*` service directories: +There are currently two components: - [ingestion-edge](ingestion-edge): a simple Python service for accepting HTTP messages and delivering to Google Cloud Pub/Sub - [ingestion-beam](ingestion-beam): a Java module defining [Apache Beam](https://beam.apache.org/) jobs for streaming and batch transformations of ingested messages + +For more information, see [the documentation](https://mozilla.github.io/gcp-ingestion). diff --git a/bin/update-toc b/bin/update-toc deleted file mode 100755 index 2a061f16b..000000000 --- a/bin/update-toc +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Updates the Table of Contents in README.md; see - -set -e -cd "$(dirname "$0")/.." -IMAGE=node:8.12.0 - -docker run -it --rm \ - --volume $PWD:/root/project \ - --workdir /root/project \ - $IMAGE \ - /bin/bash .circleci/doctoc-run.sh diff --git a/docs/bigquery_sink.md b/docs/architecture/bigquery_sink_specification.md similarity index 82% rename from docs/bigquery_sink.md rename to docs/architecture/bigquery_sink_specification.md index 5ac91267c..07bc20572 100644 --- a/docs/bigquery_sink.md +++ b/docs/architecture/bigquery_sink_specification.md @@ -3,22 +3,8 @@ This document specifies the behavior of the service that delivers decoded messages into BigQuery. - - -- [Data Flow](#data-flow) - - [Implementation](#implementation) - - [Configuration](#configuration) - - [Coerce Types](#coerce-types) - - [Accumulate Unknown Values As `additional_properties`](#accumulate-unknown-values-as-additional_properties) - - [Errors](#errors) - - [Error Message Schema](#error-message-schema) -- [Other Considerations](#other-considerations) - - [Message Acks](#message-acks) - - - ## Data Flow Consume messages from a PubSub topic or Cloud Storage location and insert them @@ -86,7 +72,7 @@ retries are handled automatically and all errors returned are non-transient. #### Error Message Schema Always include the error attributes specified in the [Decoded Error Message -Schema](decoder.md#error-message-schema). +Schema](decoder_service_specification.md#error-message-schema). Encode errors received as type `TableRow` as JSON in the payload of a `PubsubMessage`, and add error attributes. diff --git a/docs/decoder.md b/docs/architecture/decoder_service_specification.md similarity index 88% rename from docs/decoder.md rename to docs/architecture/decoder_service_specification.md index 6717fb981..8d132bcff 100644 --- a/docs/decoder.md +++ b/docs/architecture/decoder_service_specification.md @@ -3,22 +3,6 @@ This document specifies the behavior of the service that decodes messages in the Structured Ingestion pipeline. - - - - -- [Data Flow](#data-flow) - - [Implementation](#implementation) - - [Decoding Errors](#decoding-errors) - - [Error message schema](#error-message-schema) - - [Raw message schema](#raw-message-schema) - - [Decoded message metadata schema](#decoded-message-metadata-schema) -- [Other Considerations](#other-considerations) - - [Message Acks](#message-acks) - - [Deduplication](#deduplication) - - - ## Data Flow 1. Consume messages from Google Cloud PubSub raw topic @@ -69,7 +53,7 @@ required group attributes { ### Raw message schema -See [Edge Server PubSub Message Schema](edge.md#edge-server-pubsub-message-schema). +See [Edge Service PubSub Message Schema](edge_service_specification.md#pubsub-message-schema). 
### Decoded message metadata schema diff --git a/docs/architecture/differences_from_aws.md b/docs/architecture/differences_from_aws.md index 0fbef6892..e81485f6a 100644 --- a/docs/architecture/differences_from_aws.md +++ b/docs/architecture/differences_from_aws.md @@ -1,23 +1,8 @@ -# Differences from AWS Architecture +# Differences from AWS This document explains how GCP Ingestion differs from the [AWS Data Platform Architecture](https://mana.mozilla.org/wiki/display/SVCOPS/Telemetry+-+Data+Pipeline+Architecture). - - - - -- [Replace Heka Framed Protobuf with newline delimited JSON](#replace-heka-framed-protobuf-with-newline-delimited-json) -- [Replace EC2 Edge with Kubernetes Edge](#replace-ec2-edge-with-kubernetes-edge) -- [Replace Kafka with PubSub](#replace-kafka-with-pubsub) -- [Replace Hindsight Data Warehouse Loaders with Dataflow](#replace-hindsight-data-warehouse-loaders-with-dataflow) -- [Replace S3 with Cloud Storage](#replace-s3-with-cloud-storage) -- [Messages Always Delivered to Message Queue](#messages-always-delivered-to-message-queue) -- [Landfill is Downstream from Message Queue](#landfill-is-downstream-from-message-queue) - - - - ## Replace Heka Framed Protobuf with newline delimited JSON Heka framed protobuf requires special code to read and write. Newline delimited diff --git a/docs/edge_migration.md b/docs/architecture/edge_migration_plan.md similarity index 93% rename from docs/edge_migration.md rename to docs/architecture/edge_migration_plan.md index 83d9aa86e..b0d6ce7e7 100644 --- a/docs/edge_migration.md +++ b/docs/architecture/edge_migration_plan.md @@ -2,19 +2,8 @@ This document outlines plans to migrate edge traffic from AWS to GCP using the code in this repository. - - -- [Current state](#current-state) -- [Phase 1](#phase-1) -- [Phase 2](#phase-2) -- [Phase 3](#phase-3) -- [Phase 3 (alternative)](#phase-3-alternative) -- [Phase 4](#phase-4) - - - ## Current state Today, data producers send data to the ingestion stack on AWS as described [here](https://github.com/mozilla/firefox-data-docs/blob/042fddcbf27aa5993ee5578224200a3ef65fd7c7/src/concepts/pipeline/data_pipeline_detail.md#ingestion). diff --git a/docs/edge.md b/docs/architecture/edge_service_specification.md similarity index 86% rename from docs/edge.md rename to docs/architecture/edge_service_specification.md index 15a2b720e..8d25f0ff1 100644 --- a/docs/edge.md +++ b/docs/architecture/edge_service_specification.md @@ -3,31 +3,6 @@ This document specifies the behavior of the server that accepts submissions from HTTP clients e.g. Firefox telemetry. - - - - -- [General Data Flow](#general-data-flow) - - [Namespaces](#namespaces) - - [Forwarding to the pipeline](#forwarding-to-the-pipeline) - - [Edge Server PubSub Message Schema](#edge-server-pubsub-message-schema) -- [Server Request/Response](#server-requestresponse) - - [GET Request](#get-request) - - [GET Response codes](#get-response-codes) - - [POST/PUT Request](#postput-request) - - [Legacy Systems](#legacy-systems) - - [POST/PUT Response codes](#postput-response-codes) - - [Other Response codes](#other-response-codes) -- [Other Considerations](#other-considerations) - - [Compression](#compression) - - [Bad Messages](#bad-messages) - - [PubSub Topics](#pubsub-topics) - - [GeoIP Lookups](#geoip-lookups) - - [Data Retention](#data-retention) - - [Submission Timestamp Format](#submission-timestamp-format) - - - ## General Data Flow HTTP submissions come in from the wild, hit a load balancer, then optionally an @@ -54,7 +29,7 @@ configuration options. 
The message is written to PubSub. If the message cannot be written to PubSub it is written to a disk queue that will periodically retry writing to PubSub. -### Edge Server PubSub Message Schema +### PubSub Message Schema ``` required string data // base64 encoded body diff --git a/docs/landfill.md b/docs/architecture/landfill_service_specification.md similarity index 69% rename from docs/landfill.md rename to docs/architecture/landfill_service_specification.md index 19679865a..5bdb4b926 100644 --- a/docs/landfill.md +++ b/docs/architecture/landfill_service_specification.md @@ -3,18 +3,8 @@ This document specifies the behavior of the service that batches raw messages into long term storage. - - -- [Data Flow](#data-flow) - - [Implementation](#implementation) - - [Latency](#latency) -- [Other Considerations](#other-considerations) - - [Message Acks](#message-acks) - - - ## Data Flow Consume messages from a Google Cloud PubSub topic and write in batches to diff --git a/docs/architecture/README.md b/docs/architecture/overview.md similarity index 90% rename from docs/architecture/README.md rename to docs/architecture/overview.md index c95fb33ec..288c07699 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/overview.md @@ -2,31 +2,6 @@ This document specifies the architecture for GCP Ingestion as a whole. - - - - -- [Architecture Diagram](#architecture-diagram) -- [Architecture Components](#architecture-components) - - [Ingestion Edge](#ingestion-edge) - - [Landfill Sink](#landfill-sink) - - [Decoder](#decoder) - - [Republisher](#republisher) - - [BigQuery Sink](#bigquery-sink) - - [Dataset Sink](#dataset-sink) - - [Notes](#notes) -- [Design Decisions](#design-decisions) - - [Kubernetes Engine and PubSub](#kubernetes-engine-and-pubsub) - - [Different topics for "raw" and "validated" data](#different-topics-for-raw-and-validated-data) - - [BigQuery](#bigquery) - - [Save messages as newline delimited JSON](#save-messages-as-newline-delimited-json) - - [Use destination tables](#use-destination-tables) - - [Use views for user-facing data](#use-views-for-user-facing-data) -- [Known Issues](#known-issues) -- [Further Reading](#further-reading) - - - ## Architecture Diagram ![diagram.mmd](diagram.svg "Architecture Diagram") diff --git a/docs/pain_points.md b/docs/architecture/pain_points.md similarity index 86% rename from docs/pain_points.md rename to docs/architecture/pain_points.md index cec6fb6e1..caae2c66d 100644 --- a/docs/pain_points.md +++ b/docs/architecture/pain_points.md @@ -1,21 +1,7 @@ -# Overview +# Pain points A running list of things that are suboptimal in GCP. - - - - -- [App Engine](#app-engine) -- [Dataflow](#dataflow) - - [`BigQueryIO.Write`](#bigqueryiowrite) - - [`FileIO.Write`](#fileiowrite) - - [`PubsubIO.Write`](#pubsubiowrite) - - [Templates](#templates) -- [PubSub](#pubsub) - - - # App Engine For network-bound applications it can be prohibitively expensive. A PubSub push diff --git a/docs/reliability.md b/docs/architecture/reliability.md similarity index 88% rename from docs/reliability.md rename to docs/architecture/reliability.md index f3bc5f0d7..16b4e4f9e 100644 --- a/docs/reliability.md +++ b/docs/architecture/reliability.md @@ -5,18 +5,6 @@ Percentage determined by the Reliability Target below. If a component does not meet that then a Stability Work Period should be assigned to each software engineer supporting the component. 
-
-
-
-
-- [Disclaimer and Purpose](#disclaimer-and-purpose)
-- [Reliability Target](#reliability-target)
-- [Definitions](#definitions)
-- [Exclusions](#exclusions)
-- [Additional Information](#additional-information)
-
-
-
 ## Disclaimer and Purpose

 **This document is intended solely for those directly running, writing, and
diff --git a/docs/test_requirements.md b/docs/architecture/test_requirements.md
similarity index 93%
rename from docs/test_requirements.md
rename to docs/architecture/test_requirements.md
index 788472caa..c177f4d24 100644
--- a/docs/test_requirements.md
+++ b/docs/architecture/test_requirements.md
@@ -2,20 +2,8 @@

 This document specifies the testing required for GCP Ingestion components.

-
-
-- [Exceptions](#exceptions)
-- [Test Phases](#test-phases)
-- [Test Categories](#test-categories)
-  - [Unit Tests](#unit-tests)
-  - [Integration Tests](#integration-tests)
-  - [Load Tests](#load-tests)
-  - [Slow Load Tests](#slow-load-tests)
-
-
-
 ## Exceptions

 Code that does not comply with this standard before it is deployed to
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..e5ce47fd9
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,21 @@
+# GCP Ingestion
+
+[GCP Ingestion](https://github.com/mozilla/gcp-ingestion/) is a monorepo for
+documentation and implementation of the Mozilla telemetry ingestion system
+deployed to Google Cloud Platform (GCP).
+
+There are currently two components:
+
+- [ingestion-edge](./ingestion-edge/index.md): a simple Python service for accepting HTTP
+  messages and delivering to Google Cloud Pub/Sub
+- [ingestion-beam](./ingestion-beam/index.md): a Java module defining
+  [Apache Beam](https://beam.apache.org/) jobs for streaming and batch
+  transformations of ingested messages
+
+The design behind the system, along with various trade-offs, is documented in
+the architecture section. Note that as of this writing (August 2019) GCP
+Ingestion is changing quickly, so some parts of this documentation may be out
+of date.
+
+Feel free to ask us in #datapipeline on irc.mozilla.org or in #fx-metrics
+on Slack if you have specific questions.
diff --git a/ingestion-beam/README.md b/docs/ingestion-beam/index.md
similarity index 87%
rename from ingestion-beam/README.md
rename to docs/ingestion-beam/index.md
index c108cf648..bd4b83e5e 100644
--- a/ingestion-beam/README.md
+++ b/docs/ingestion-beam/index.md
@@ -1,67 +1,17 @@
-[![CircleCI](https://circleci.com/gh/mozilla/gcp-ingestion.svg?style=svg&circle-token=d98a470269580907d5c6d74d0e67612834a21be7)](https://circleci.com/gh/mozilla/gcp-ingestion)
-
 # Apache Beam Jobs for Ingestion

-This java module contains our Apache Beam jobs for use in Ingestion.
+This ingestion-beam Java module contains our [Apache Beam](https://beam.apache.org/) jobs for use in Ingestion.
 Google Cloud Dataflow is a Google Cloud Platform service that natively runs
 Apache Beam jobs.
- - - - -- [Code Formatting](#code-formatting) -- [Sink Job](#sink-job) - - [Supported Input and Outputs](#supported-input-and-outputs) - - [Encoding](#encoding) - - [Output Path Specification](#output-path-specification) - - [BigQuery](#bigquery) - - [Protocol](#protocol) - - [Attribute placeholders](#attribute-placeholders) - - [File prefix](#file-prefix) - - [Executing Jobs](#executing-jobs) - - [Locally](#locally) - - [On Dataflow](#on-dataflow) - - [On Dataflow with templates](#on-dataflow-with-templates) - - [In streaming mode](#in-streaming-mode) -- [Decoder Job](#decoder-job) - - [Transforms](#transforms) - - [Parse URI](#parse-uri) - - [Decompress](#decompress) - - [GeoIP Lookup](#geoip-lookup) - - [Parse User Agent](#parse-user-agent) - - [Executing Decoder Jobs](#executing-decoder-jobs) -- [Republisher Job](#republisher-job) - - [Capabilities](#capabilities) - - [Marking Messages As Seen](#marking-messages-as-seen) - - [Debug Republishing](#debug-republishing) - - [Per-`docType` Republishing](#per-doctype-republishing) - - [Per-Channel Sampled Republishing](#per-channel-sampled-republishing) - - [Executing Republisher Jobs](#executing-republisher-jobs) -- [Testing](#testing) -- [License](#license) - - - -# Code Formatting - -Use spotless to automatically reformat code: - -```bash -mvn spotless:apply -``` - -or use just check what changes it requires: - -```bash -mvn spotless:check -``` +The source code lives in the [ingestion-beam](https://github.com/mozilla/gcp-ingestion/tree/master/ingestion-beam) +subdirectory of the gcp-ingestion repository. -# Sink Job +## Sink Job A job for delivering messages between Google Cloud services. -## Supported Input and Outputs +### Supported Input and Outputs Supported inputs: @@ -83,7 +33,7 @@ Supported error outputs, must include attributes and must not validate messages: * stdout with JSON encoding * stderr with JSON encoding -## Encoding +### Encoding Internally messages are stored and transported as [PubsubMessage](https://beam.apache.org/documentation/sdks/javadoc/2.6.0/org/apache/beam/sdk/io/gcp/pubsub/PubsubMessage.html). @@ -120,12 +70,12 @@ The above file when stored in the `text` format: Note that the newline embedded at the end of the second JSON message results in two text messages, one of which is blank. -## Output Path Specification +### Output Path Specification Depending on the specified output type, the `--output` path that you provide controls several aspects of the behavior. -### BigQuery +#### BigQuery When `--outputType=bigquery`, `--output` is a `tableSpec` of form `dataset.tablename` or the more verbose `projectId:dataset.tablename`. The values can contain @@ -146,7 +96,7 @@ payloads. Instead, records missing an attribute required by a placeholder will be redirected to error output if no default is provided. 
-### Protocol +#### Protocol When `--outputType=file`, `--output` may be prefixed by a protocol specifier to determine the @@ -156,7 +106,7 @@ Cloud Storage, use a `gs://` path like: --output=gs://mybucket/somdir/myfileprefix -### Attribute placeholders +#### Attribute placeholders We support `FileIO`'s "Dynamic destinations" feature (`FileIO.writeDynamic`) where it's possible to route individual messages to different output locations based @@ -204,7 +154,7 @@ on attribute names and default values used in placeholders: - attribute names may not contain curly braces (`{` or `}`) - default values may not contain curly braces (`{` or `}`) -### File prefix +#### File prefix Individual files are named by replacing `:` with `-` in the default format discussed in the "File naming" section of Beam's @@ -226,12 +176,12 @@ An output file might be: /tmp/output/out--290308-12-21T20-00-00.000Z--290308-12-21T20-10-00.000Z-00000-of-00001.ndjson -## Executing Jobs +### Executing Jobs Note: `-Dexec.args` does not handle newlines gracefully, but bash will remove `\` escaped newlines in `"`s. -### Locally +#### Locally If you install Java and maven, you can invoke `mvn` directly in the following commands; be aware, though, that Java 8 is the target JVM and some reflection warnings may be thrown on @@ -277,7 +227,7 @@ cat tmp/output/* ./bin/mvn compile exec:java -Dexec.args=--help=SinkOptions ``` -### On Dataflow +#### On Dataflow ```bash # Pick a bucket to store files in @@ -309,7 +259,7 @@ gcloud dataflow jobs list gsutil cat $BUCKET/output/* ``` -### On Dataflow with templates +#### On Dataflow with templates Dataflow templates make a distinction between [runtime parameters that implement the `ValueProvider` interface](https://cloud.google.com/dataflow/docs/guides/templates/creating-templates#runtime-parameters-and-the-valueprovider-interface) @@ -358,7 +308,7 @@ gcloud dataflow jobs show "$JOB_ID" gsutil cat $BUCKET/output/* ``` -### In streaming mode +#### In streaming mode If `--inputType=pubsub`, Beam will execute in streaming mode, requiring some extra configuration for file-based outputs. You will need to specify sharding like: @@ -378,25 +328,25 @@ As codified in [apache/beam/pull/1952](https://github.com/apache/beam/pull/1952) the Dataflow runner suggests a reasonable starting point `numShards` is `2 * maxWorkers` or 10 if `--maxWorkers` is unspecified. -# Decoder Job +## Decoder Job A job for normalizing ingestion messages. -## Transforms +### Transforms These transforms are currently executed against each message in order. -### Parse URI +#### Parse URI Attempt to extract attributes from `uri`, on failure send messages to the configured error output. -### Decompress +#### Decompress Attempt to decompress payload with gzip, on failure pass the message through unmodified. -### GeoIP Lookup +#### GeoIP Lookup 1. Extract `ip` from the `x_forwarded_for` attribute * when the `x_pipeline_proxy` attribute is not present, use the @@ -416,12 +366,12 @@ unmodified. 1. Remove the `x_forwarded_for` and `remote_addr` attributes 1. Remove any `null` values added to attributes -### Parse User Agent +#### Parse User Agent Attempt to extract browser, browser version, and os from the `user_agent` attribute, drop any nulls, and remove `user_agent` from attributes. 
-## Executing Decoder Jobs +### Executing Decoder Jobs Decoder jobs are executed the same way as [executing sink jobs](#executing-jobs) but with a few extra flags: @@ -458,7 +408,7 @@ echo '{"payload":"dGVzdA==","attributeMap":{"remote_addr":"63.245.208.195"}}' > " ``` -# Republisher Job +## Republisher Job A job for republishing subsets of decoded messages to new destinations. @@ -471,28 +421,28 @@ in `Cloud MemoryStore` for deduplication purposes. That functionality exists here to avoid the expense of an additional separate consumer of the full decoded topic. -## Capabilities +### Capabilities -### Marking Messages As Seen +#### Marking Messages As Seen The job needs to connect to Redis in order to mark `document_id`s of consumed messages as seen. The Decoder is able to use that information to drop duplicate messages flowing through the pipeline. -### Debug Republishing +#### Debug Republishing If `--enableDebugDestination` is set, messages containing an `x_debug_id` attribute will be republished to a destination that's configurable at runtime. This is currently expected to be a feature specific to structured ingestion, so should not be set for `telemetry-decoded` input. -### Per-`docType` Republishing +#### Per-`docType` Republishing If `--perDocTypeEnabledList` is provided, a separate producer will be created for each `docType` specified in the given comma-separated list. See the `--help` output for details on format. -### Per-Channel Sampled Republishing +#### Per-Channel Sampled Republishing If `--perChannelSampleRatios` is provided, a separate producer will be created for each specified release channel. The messages will be randomly sampled @@ -501,7 +451,7 @@ This is currently intended as a feature only for telemetry data, so should not be set for `structured-decoded` input. See the `--help` output for details on format. -## Executing Republisher Jobs +### Executing Republisher Jobs Republisher jobs are executed the same way as [executing sink jobs](#executing-jobs) but with a few differences in flags. You'll need to set the `mainClass`: @@ -551,7 +501,7 @@ echo '{"payload":"dGVzdA==","attributeMap":{"x_debug_id":"mysession"}}' > tmp/in " ``` -# Testing +## Testing Before anything else, be sure to download the test data: @@ -575,10 +525,18 @@ use the `bin/mvn` executable to run maven in docker: ``` To run the project in a sandbox against production data, see this document on -![configuring an integration testing workflow](../docs/ingestion_testing_workflow.md). +[configuring an integration testing workflow](./ingestion_testing_workflow.md). + +## Code Formatting -# License +Use spotless to automatically reformat code: -This Source Code Form is subject to the terms of the Mozilla Public -License, v. 2.0. If a copy of the MPL was not distributed with this -file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+```bash
+mvn spotless:apply
+```
+
+or just check what changes it requires:
+
+```bash
+mvn spotless:check
+```
diff --git a/docs/ingestion_testing_workflow.md b/docs/ingestion-beam/ingestion_testing_workflow.md
similarity index 83%
rename from docs/ingestion_testing_workflow.md
rename to docs/ingestion-beam/ingestion_testing_workflow.md
index bba0ca04a..cddfcb6f9 100644
--- a/docs/ingestion_testing_workflow.md
+++ b/docs/ingestion-beam/ingestion_testing_workflow.md
@@ -1,21 +1,10 @@
-
-
-
-
-- [Ingestion Testing Workflow](#ingestion-testing-workflow)
-  - [Setting up the GCS project](#setting-up-the-gcs-project)
-  - [Bootstrapping schemas from `mozilla-pipeline-schemas`](#bootstrapping-schemas-from-mozilla-pipeline-schemas)
-  - [Building the project](#building-the-project)
-
-
-
 # Ingestion Testing Workflow

 The ingestion-beam handles data flow of documents from the edge into various
 sinks. You may be interested in standing up a small testing instance to
 validate the integration of the various components.

-![diagrams/workflow.mmd](diagrams/workflow.svg)
+![diagrams/workflow.mmd](../diagrams/workflow.svg)
 __Figure__: _An overview of the various components necessary to query BigQuery
 against data from a PubSub subscription._
diff --git a/ingestion-edge/README.md b/docs/ingestion-edge/index.md
similarity index 89%
rename from ingestion-edge/README.md
rename to docs/ingestion-edge/index.md
index d8afe20cb..52715a970 100644
--- a/ingestion-edge/README.md
+++ b/docs/ingestion-edge/index.md
@@ -1,28 +1,16 @@
-[![CircleCI](https://circleci.com/gh/mozilla/gcp-ingestion.svg?style=svg&circle-token=d98a470269580907d5c6d74d0e67612834a21be7)](https://circleci.com/gh/mozilla/gcp-ingestion)
-
 # Ingestion Edge Server

 A simple service for delivering HTTP messages to Google Cloud PubSub

-
-
-
-
-  - [Building](#building)
-  - [Running](#running)
-  - [Configuration](#configuration)
-  - [Testing](#testing)
-    - [Style Checks](#style-checks)
-    - [Unit Tests](#unit-tests)
-    - [Integration Tests](#integration-tests)
-    - [Load Tests](#load-tests)
-- [License](#license)
-
-
+The source code lives in the [ingestion-edge](https://github.com/mozilla/gcp-ingestion/tree/master/ingestion-edge)
+subdirectory of the gcp-ingestion repository.

 ## Building

-Install and update dependencies as-needed
+We assume that you have [docker-compose](https://docs.docker.com/compose/)
+installed.
+
+From inside the `ingestion-edge` subdirectory:

 ```bash
 # docker-compose
@@ -230,8 +218,3 @@ Load test options (from `./bin/test -h`)
       when --no-generator is specified
 ```

-# License
-
-This Source Code Form is subject to the terms of the Mozilla Public
-License, v. 2.0. If a copy of the MPL was not distributed with this
-file, You can obtain one at http://mozilla.org/MPL/2.0/.
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..fb666620e
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,20 @@
+site_name: GCP Ingestion
+site_description: Mozilla Telemetry ingestion on Google Cloud Platform
+site_author: Mozilla Data Platform Team
+nav:
+  - Home: index.md
+  - ingestion-edge: ingestion-edge/index.md
+  - ingestion-beam:
+      - Overview: ingestion-beam/index.md
+      - Ingestion testing workflow: ingestion-beam/ingestion_testing_workflow.md
+  - Architecture:
+      - Overview: architecture/overview.md
+      - Differences from AWS: architecture/differences_from_aws.md
+      - Pain Points: architecture/pain_points.md
+      - Edge Migration Plan: architecture/edge_migration_plan.md
+      - Reliability: architecture/reliability.md
+      - Test requirements: architecture/test_requirements.md
+      - Landfill Service Specification: architecture/landfill_service_specification.md
+      - Edge Service Specification: architecture/edge_service_specification.md
+      - BigQuery Sink Specification: architecture/bigquery_sink_specification.md
+      - Decoder Service Specification: architecture/decoder_service_specification.md
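With this configuration in place, the documentation site can be previewed locally before the CircleCI `docs` job deploys it. A minimal sketch, assuming a working Python 3 environment with `pip` on `PATH` (package versions unpinned, matching what the CI job installs):

```bash
# Run from the repository root, where mkdocs.yml lives.
pip install mkdocs markdown-include

# Serve the site with live reload at http://127.0.0.1:8000/
mkdocs serve

# Or build the static site into ./site without serving it.
mkdocs build
```

On master, CI publishes the same build to GitHub Pages via `mkdocs gh-deploy`.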