diff --git a/packages/google-cloud-dlp/.flake8 b/packages/google-cloud-dlp/.flake8
index 20fe9bda2ee4..ed9316381c9c 100644
--- a/packages/google-cloud-dlp/.flake8
+++ b/packages/google-cloud-dlp/.flake8
@@ -21,6 +21,8 @@ exclude =
# Exclude generated code.
**/proto/**
**/gapic/**
+ **/services/**
+ **/types/**
*_pb2.py
# Standard linting exemptions.
diff --git a/packages/google-cloud-dlp/.github/CODEOWNERS b/packages/google-cloud-dlp/.github/CODEOWNERS
new file mode 100644
index 000000000000..d2cf5a107cfc
--- /dev/null
+++ b/packages/google-cloud-dlp/.github/CODEOWNERS
@@ -0,0 +1,8 @@
+# Code owners file.
+# This file controls who is tagged for review for any given pull request.
+#
+# For syntax help see:
+# https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax
+
+
+/samples/**/*.py @googleapis/python-samples-owners
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.gitignore b/packages/google-cloud-dlp/.gitignore
index 3fb06e09ce74..b9daa52f118d 100644
--- a/packages/google-cloud-dlp/.gitignore
+++ b/packages/google-cloud-dlp/.gitignore
@@ -10,6 +10,7 @@
dist
build
eggs
+.eggs
parts
bin
var
@@ -45,14 +46,16 @@ pip-log.txt
# Built documentation
docs/_build
bigquery/docs/generated
+docs.metadata
# Virtual environment
env/
coverage.xml
+sponge_log.xml
# System test environment variables.
system_tests/local_test_setup
# Make sure a generated file isn't accidentally committed.
pylintrc
-pylintrc.test
\ No newline at end of file
+pylintrc.test
diff --git a/packages/google-cloud-dlp/.kokoro/build.sh b/packages/google-cloud-dlp/.kokoro/build.sh
index 01c4069d6f89..2ed36f658bbe 100755
--- a/packages/google-cloud-dlp/.kokoro/build.sh
+++ b/packages/google-cloud-dlp/.kokoro/build.sh
@@ -36,4 +36,10 @@ python3.6 -m pip uninstall --yes --quiet nox-automation
python3.6 -m pip install --upgrade --quiet nox
python3.6 -m nox --version
-python3.6 -m nox
+# If NOX_SESSION is set, run only the specified session;
+# otherwise run all the sessions.
+if [[ -n "${NOX_SESSION:-}" ]]; then
+ python3.6 -m nox -s "${NOX_SESSION:-}"
+else
+ python3.6 -m nox
+fi
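
For local verification, the branch above can be exercised by exporting NOX_SESSION before invoking the build script. A minimal sketch, assuming the directory layout the script expects and that the session name matches one defined in noxfile.py:

    # Run a single nox session (the session name "unit-3.8" is an
    # assumption; it must exist in noxfile.py).
    NOX_SESSION="unit-3.8" .kokoro/build.sh

    # With NOX_SESSION unset or empty, the else branch runs every session.
    .kokoro/build.sh
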
diff --git a/packages/google-cloud-dlp/.kokoro/docker/docs/Dockerfile b/packages/google-cloud-dlp/.kokoro/docker/docs/Dockerfile
new file mode 100644
index 000000000000..412b0b56a921
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/docker/docs/Dockerfile
@@ -0,0 +1,98 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:20.04
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Ensure local Python is preferred over distribution Python.
+ENV PATH /usr/local/bin:$PATH
+
+# Install dependencies.
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ apt-transport-https \
+ build-essential \
+ ca-certificates \
+ curl \
+ dirmngr \
+ git \
+ gpg-agent \
+ graphviz \
+ libbz2-dev \
+ libdb5.3-dev \
+ libexpat1-dev \
+ libffi-dev \
+ liblzma-dev \
+ libreadline-dev \
+ libsnappy-dev \
+ libssl-dev \
+ libsqlite3-dev \
+ portaudio19-dev \
+ redis-server \
+ software-properties-common \
+ ssh \
+ sudo \
+ tcl \
+ tcl-dev \
+ tk \
+ tk-dev \
+ uuid-dev \
+ wget \
+ zlib1g-dev \
+ && add-apt-repository universe \
+ && apt-get update \
+ && apt-get -y install jq \
+ && apt-get clean autoclean \
+ && apt-get autoremove -y \
+ && rm -rf /var/lib/apt/lists/* \
+ && rm -f /var/cache/apt/archives/*.deb
+
+
+COPY fetch_gpg_keys.sh /tmp
+# Install the desired versions of Python.
+RUN set -ex \
+ && export GNUPGHOME="$(mktemp -d)" \
+ && echo "disable-ipv6" >> "${GNUPGHOME}/dirmngr.conf" \
+ && /tmp/fetch_gpg_keys.sh \
+ && for PYTHON_VERSION in 3.7.8 3.8.5; do \
+ wget --no-check-certificate -O python-${PYTHON_VERSION}.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz" \
+ && wget --no-check-certificate -O python-${PYTHON_VERSION}.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc" \
+ && gpg --batch --verify python-${PYTHON_VERSION}.tar.xz.asc python-${PYTHON_VERSION}.tar.xz \
+ && rm -r python-${PYTHON_VERSION}.tar.xz.asc \
+ && mkdir -p /usr/src/python-${PYTHON_VERSION} \
+ && tar -xJC /usr/src/python-${PYTHON_VERSION} --strip-components=1 -f python-${PYTHON_VERSION}.tar.xz \
+ && rm python-${PYTHON_VERSION}.tar.xz \
+ && cd /usr/src/python-${PYTHON_VERSION} \
+ && ./configure \
+ --enable-shared \
+ # This works only on Python 2.7 and throws a warning on every other
+ # version, but seems otherwise harmless.
+ --enable-unicode=ucs4 \
+ --with-system-ffi \
+ --without-ensurepip \
+ && make -j$(nproc) \
+ && make install \
+ && ldconfig \
+ ; done \
+ && rm -rf "${GNUPGHOME}" \
+ && rm -rf /usr/src/python* \
+ && rm -rf ~/.cache/
+
+RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \
+ && python3.7 /tmp/get-pip.py \
+ && python3.8 /tmp/get-pip.py \
+ && rm /tmp/get-pip.py
+
+CMD ["python3.7"]
diff --git a/packages/google-cloud-dlp/.kokoro/docker/docs/fetch_gpg_keys.sh b/packages/google-cloud-dlp/.kokoro/docker/docs/fetch_gpg_keys.sh
new file mode 100755
index 000000000000..d653dd868e4b
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/docker/docs/fetch_gpg_keys.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# A script to fetch gpg keys with retry.
+# Avoid jinja parsing the file.
+#
+
+function retry {
+ if [[ "${#}" -le 1 ]]; then
+ echo "Usage: ${0} retry_count commands.."
+ exit 1
+ fi
+ local retries=${1}
+ local command="${@:2}"
+ until [[ "${retries}" -le 0 ]]; do
+ $command && return 0
+ if [[ $? -ne 0 ]]; then
+ echo "command failed, retrying"
+ ((retries--))
+ fi
+ done
+ return 1
+}
+
+# 3.6.9, 3.7.5 (Ned Deily)
+retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \
+ 0D96DF4D4110E5C43FBFB17F2D347EA6AA65421D
+
+# 3.8.0 (Łukasz Langa)
+retry 3 gpg --keyserver ha.pool.sks-keyservers.net --recv-keys \
+ E3FF2839C048B25C084DEBE9B26995E310250568
+
+#
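
The retry helper is generic: the first argument is the attempt count and the remaining arguments form the command, so it can wrap any flaky network call. A hedged sketch, assuming the function is defined in the current shell; the URL is a placeholder:

    # Retry an arbitrary command up to 3 times; retry returns 1 once all
    # attempts are exhausted, so the caller can still react to failure.
    retry 3 wget -q https://example.com/artifact.tar.gz || echo "download failed"
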
diff --git a/packages/google-cloud-dlp/.kokoro/docs/common.cfg b/packages/google-cloud-dlp/.kokoro/docs/common.cfg
index 12e00f8a14db..aa9c96fefea3 100644
--- a/packages/google-cloud-dlp/.kokoro/docs/common.cfg
+++ b/packages/google-cloud-dlp/.kokoro/docs/common.cfg
@@ -11,12 +11,12 @@ action {
gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline"
# Use the trampoline script to run in docker.
-build_file: "python-dlp/.kokoro/trampoline.sh"
+build_file: "python-dlp/.kokoro/trampoline_v2.sh"
# Configure the docker image for kokoro-trampoline.
env_vars: {
key: "TRAMPOLINE_IMAGE"
- value: "gcr.io/cloud-devrel-kokoro-resources/python-multi"
+ value: "gcr.io/cloud-devrel-kokoro-resources/python-lib-docs"
}
env_vars: {
key: "TRAMPOLINE_BUILD_FILE"
@@ -28,6 +28,23 @@ env_vars: {
value: "docs-staging"
}
+env_vars: {
+ key: "V2_STAGING_BUCKET"
+ value: "docs-staging-v2-staging"
+}
+
+# Upload the docker image after successful builds.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE_UPLOAD"
+ value: "true"
+}
+
+# Always build the docker image.
+env_vars: {
+ key: "TRAMPOLINE_DOCKERFILE"
+ value: ".kokoro/docker/docs/Dockerfile"
+}
+
# Fetch the token needed for reporting release status to GitHub
before_action {
fetch_keystore {
diff --git a/packages/google-cloud-dlp/.kokoro/docs/docs-presubmit.cfg b/packages/google-cloud-dlp/.kokoro/docs/docs-presubmit.cfg
new file mode 100644
index 000000000000..1118107829b7
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/docs/docs-presubmit.cfg
@@ -0,0 +1,17 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "STAGING_BUCKET"
+ value: "gcloud-python-test"
+}
+
+env_vars: {
+ key: "V2_STAGING_BUCKET"
+ value: "gcloud-python-test"
+}
+
+# We only upload the image in the main `docs` build.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE_UPLOAD"
+ value: "false"
+}
diff --git a/packages/google-cloud-dlp/.kokoro/publish-docs.sh b/packages/google-cloud-dlp/.kokoro/publish-docs.sh
index b21550213455..8acb14e802b0 100755
--- a/packages/google-cloud-dlp/.kokoro/publish-docs.sh
+++ b/packages/google-cloud-dlp/.kokoro/publish-docs.sh
@@ -18,26 +18,16 @@ set -eo pipefail
# Disable buffering, so that the logs stream through.
export PYTHONUNBUFFERED=1
-cd github/python-dlp
-
-# Remove old nox
-python3.6 -m pip uninstall --yes --quiet nox-automation
+export PATH="${HOME}/.local/bin:${PATH}"
# Install nox
-python3.6 -m pip install --upgrade --quiet nox
-python3.6 -m nox --version
+python3 -m pip install --user --upgrade --quiet nox
+python3 -m nox --version
# build docs
nox -s docs
-python3 -m pip install gcp-docuploader
-
-# install a json parser
-sudo apt-get update
-sudo apt-get -y install software-properties-common
-sudo add-apt-repository universe
-sudo apt-get update
-sudo apt-get -y install jq
+python3 -m pip install --user gcp-docuploader
# create metadata
python3 -m docuploader create-metadata \
@@ -52,4 +42,23 @@ python3 -m docuploader create-metadata \
cat docs.metadata
# upload docs
-python3 -m docuploader upload docs/_build/html --metadata-file docs.metadata --staging-bucket docs-staging
+python3 -m docuploader upload docs/_build/html --metadata-file docs.metadata --staging-bucket "${STAGING_BUCKET}"
+
+
+# docfx yaml files
+nox -s docfx
+
+# create metadata.
+python3 -m docuploader create-metadata \
+ --name=$(jq --raw-output '.name // empty' .repo-metadata.json) \
+ --version=$(python3 setup.py --version) \
+ --language=$(jq --raw-output '.language // empty' .repo-metadata.json) \
+ --distribution-name=$(python3 setup.py --name) \
+ --product-page=$(jq --raw-output '.product_documentation // empty' .repo-metadata.json) \
+ --github-repository=$(jq --raw-output '.repo // empty' .repo-metadata.json) \
+ --issue-tracker=$(jq --raw-output '.issue_tracker // empty' .repo-metadata.json)
+
+cat docs.metadata
+
+# upload docs
+python3 -m docuploader upload docs/_build/html/docfx_yaml --metadata-file docs.metadata --destination-prefix docfx --staging-bucket "${V2_STAGING_BUCKET}"
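
The jq expressions above use the `// empty` alternative operator, so a key missing from .repo-metadata.json produces no output at all (an empty flag value) rather than the literal string "null". A quick illustration:

    # Key present: the raw value is printed.
    echo '{"name": "dlp"}' | jq --raw-output '.name // empty'   # prints: dlp

    # Key absent: '// empty' suppresses output instead of printing null.
    echo '{}' | jq --raw-output '.name // empty'                # prints nothing
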
diff --git a/packages/google-cloud-dlp/.kokoro/samples/lint/common.cfg b/packages/google-cloud-dlp/.kokoro/samples/lint/common.cfg
new file mode 100644
index 000000000000..fb6a3fd4a93e
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/lint/common.cfg
@@ -0,0 +1,34 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Build logs will be here
+action {
+ define_artifacts {
+ regex: "**/*sponge_log.xml"
+ }
+}
+
+# Specify which tests to run
+env_vars: {
+ key: "RUN_TESTS_SESSION"
+ value: "lint"
+}
+
+env_vars: {
+ key: "TRAMPOLINE_BUILD_FILE"
+ value: "github/python-dlp/.kokoro/test-samples.sh"
+}
+
+# Configure the docker image for kokoro-trampoline.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE"
+ value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker"
+}
+
+# Download secrets for samples
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples"
+
+# Download trampoline resources.
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline"
+
+# Use the trampoline script to run in docker.
+build_file: "python-dlp/.kokoro/trampoline.sh"
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/lint/continuous.cfg b/packages/google-cloud-dlp/.kokoro/samples/lint/continuous.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/lint/continuous.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/lint/periodic.cfg b/packages/google-cloud-dlp/.kokoro/samples/lint/periodic.cfg
new file mode 100644
index 000000000000..50fec9649732
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/lint/periodic.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "False"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/lint/presubmit.cfg b/packages/google-cloud-dlp/.kokoro/samples/lint/presubmit.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/lint/presubmit.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.6/common.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.6/common.cfg
new file mode 100644
index 000000000000..79abd4ccede8
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.6/common.cfg
@@ -0,0 +1,34 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Build logs will be here
+action {
+ define_artifacts {
+ regex: "**/*sponge_log.xml"
+ }
+}
+
+# Specify which tests to run
+env_vars: {
+ key: "RUN_TESTS_SESSION"
+ value: "py-3.6"
+}
+
+env_vars: {
+ key: "TRAMPOLINE_BUILD_FILE"
+ value: "github/python-dlp/.kokoro/test-samples.sh"
+}
+
+# Configure the docker image for kokoro-trampoline.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE"
+ value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker"
+}
+
+# Download secrets for samples
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples"
+
+# Download trampoline resources.
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline"
+
+# Use the trampoline script to run in docker.
+build_file: "python-dlp/.kokoro/trampoline.sh"
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.6/continuous.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.6/continuous.cfg
new file mode 100644
index 000000000000..7218af1499e5
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.6/continuous.cfg
@@ -0,0 +1,7 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
+
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.6/periodic.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.6/periodic.cfg
new file mode 100644
index 000000000000..50fec9649732
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.6/periodic.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "False"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.6/presubmit.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.6/presubmit.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.6/presubmit.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.7/common.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.7/common.cfg
new file mode 100644
index 000000000000..357e36be5d1c
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.7/common.cfg
@@ -0,0 +1,34 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Build logs will be here
+action {
+ define_artifacts {
+ regex: "**/*sponge_log.xml"
+ }
+}
+
+# Specify which tests to run
+env_vars: {
+ key: "RUN_TESTS_SESSION"
+ value: "py-3.7"
+}
+
+env_vars: {
+ key: "TRAMPOLINE_BUILD_FILE"
+ value: "github/python-dlp/.kokoro/test-samples.sh"
+}
+
+# Configure the docker image for kokoro-trampoline.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE"
+ value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker"
+}
+
+# Download secrets for samples
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples"
+
+# Download trampoline resources.
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline"
+
+# Use the trampoline script to run in docker.
+build_file: "python-dlp/.kokoro/trampoline.sh"
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.7/continuous.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.7/continuous.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.7/continuous.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.7/periodic.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.7/periodic.cfg
new file mode 100644
index 000000000000..50fec9649732
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.7/periodic.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "False"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.7/presubmit.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.7/presubmit.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.7/presubmit.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.8/common.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.8/common.cfg
new file mode 100644
index 000000000000..aefdbeace9fc
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.8/common.cfg
@@ -0,0 +1,34 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+# Build logs will be here
+action {
+ define_artifacts {
+ regex: "**/*sponge_log.xml"
+ }
+}
+
+# Specify which tests to run
+env_vars: {
+ key: "RUN_TESTS_SESSION"
+ value: "py-3.8"
+}
+
+env_vars: {
+ key: "TRAMPOLINE_BUILD_FILE"
+ value: "github/python-dlp/.kokoro/test-samples.sh"
+}
+
+# Configure the docker image for kokoro-trampoline.
+env_vars: {
+ key: "TRAMPOLINE_IMAGE"
+ value: "gcr.io/cloud-devrel-kokoro-resources/python-samples-testing-docker"
+}
+
+# Download secrets for samples
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/python-docs-samples"
+
+# Download trampoline resources.
+gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline"
+
+# Use the trampoline script to run in docker.
+build_file: "python-dlp/.kokoro/trampoline.sh"
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.8/continuous.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.8/continuous.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.8/continuous.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.8/periodic.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.8/periodic.cfg
new file mode 100644
index 000000000000..50fec9649732
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.8/periodic.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "False"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/samples/python3.8/presubmit.cfg b/packages/google-cloud-dlp/.kokoro/samples/python3.8/presubmit.cfg
new file mode 100644
index 000000000000..a1c8d9759c88
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/samples/python3.8/presubmit.cfg
@@ -0,0 +1,6 @@
+# Format: //devtools/kokoro/config/proto/build.proto
+
+env_vars: {
+ key: "INSTALL_LIBRARY_FROM_SOURCE"
+ value: "True"
+}
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/.kokoro/test-samples.sh b/packages/google-cloud-dlp/.kokoro/test-samples.sh
new file mode 100755
index 000000000000..0d3b153ac9f2
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/test-samples.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# `-e` enables the script to automatically fail when a command fails
+# `-o pipefail` sets the exit code to that of the rightmost command to exit with a non-zero status
+set -eo pipefail
+# Enables `**` to include files nested inside sub-folders
+shopt -s globstar
+
+cd github/python-dlp
+
+# Run periodic samples tests at latest release
+if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then
+ LATEST_RELEASE=$(git describe --abbrev=0 --tags)
+ git checkout $LATEST_RELEASE
+fi
+
+# Disable buffering, so that the logs stream through.
+export PYTHONUNBUFFERED=1
+
+# Debug: show build environment
+env | grep KOKORO
+
+# Install nox
+python3.6 -m pip install --upgrade --quiet nox
+
+# Use secrets accessor service account to get secrets
+if [[ -f "${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" ]]; then
+ gcloud auth activate-service-account \
+ --key-file="${KOKORO_GFILE_DIR}/secrets_viewer_service_account.json" \
+ --project="cloud-devrel-kokoro-resources"
+fi
+
+# This script will create 3 files:
+# - testing/test-env.sh
+# - testing/service-account.json
+# - testing/client-secrets.json
+./scripts/decrypt-secrets.sh
+
+source ./testing/test-env.sh
+export GOOGLE_APPLICATION_CREDENTIALS=$(pwd)/testing/service-account.json
+
+# For cloud-run session, we activate the service account for gcloud sdk.
+gcloud auth activate-service-account \
+ --key-file "${GOOGLE_APPLICATION_CREDENTIALS}"
+
+export GOOGLE_CLIENT_SECRETS=$(pwd)/testing/client-secrets.json
+
+echo -e "\n******************** TESTING PROJECTS ********************"
+
+# Switch to 'fail at end' to allow all tests to complete before exiting.
+set +e
+# Use RTN to return a non-zero value if the test fails.
+RTN=0
+ROOT=$(pwd)
+# Find all requirements.txt in the samples directory (may break on whitespace).
+for file in samples/**/requirements.txt; do
+ cd "$ROOT"
+ # Navigate to the project folder.
+ file=$(dirname "$file")
+ cd "$file"
+
+ echo "------------------------------------------------------------"
+ echo "- testing $file"
+ echo "------------------------------------------------------------"
+
+ # Use nox to execute the tests for the project.
+ python3.6 -m nox -s "$RUN_TESTS_SESSION"
+ EXIT=$?
+
+ # If this is a periodic build, send the test log to the Build Cop Bot.
+ # See https://github.com/googleapis/repo-automation-bots/tree/master/packages/buildcop.
+ if [[ $KOKORO_BUILD_ARTIFACTS_SUBDIR = *"periodic"* ]]; then
+ chmod +x $KOKORO_GFILE_DIR/linux_amd64/buildcop
+ $KOKORO_GFILE_DIR/linux_amd64/buildcop
+ fi
+
+ if [[ $EXIT -ne 0 ]]; then
+ RTN=1
+ echo -e "\n Testing failed: Nox returned a non-zero exit code. \n"
+ else
+ echo -e "\n Testing completed.\n"
+ fi
+
+done
+cd "$ROOT"
+
+# Workaround for Kokoro permissions issue: delete secrets
+rm testing/{test-env.sh,client-secrets.json,service-account.json}
+
+exit "$RTN"
\ No newline at end of file
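
The `shopt -s globstar` near the top of this script is what lets the `samples/**/requirements.txt` glob match at any depth. A small demonstration, assuming a samples tree exists; `echo` is used so an empty match prints the literal pattern instead of erroring:

    shopt -u globstar; echo samples/**/requirements.txt   # ** acts like *, one level only
    shopt -s globstar; echo samples/**/requirements.txt   # matches at any depth, as used above
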
diff --git a/packages/google-cloud-dlp/.kokoro/trampoline_v2.sh b/packages/google-cloud-dlp/.kokoro/trampoline_v2.sh
new file mode 100755
index 000000000000..719bcd5ba84d
--- /dev/null
+++ b/packages/google-cloud-dlp/.kokoro/trampoline_v2.sh
@@ -0,0 +1,487 @@
+#!/usr/bin/env bash
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# trampoline_v2.sh
+#
+# This script does 3 things.
+#
+# 1. Prepare the Docker image for the test
+# 2. Run the Docker container with appropriate flags to run the test
+# 3. Upload the newly built Docker image
+#
+# in a way that is somewhat compatible with trampoline_v1.
+#
+# To run this script, first download a few files from GCS to /dev/shm.
+# (/dev/shm is passed into the container as KOKORO_GFILE_DIR).
+#
+# gsutil cp gs://cloud-devrel-kokoro-resources/python-docs-samples/secrets_viewer_service_account.json /dev/shm
+# gsutil cp gs://cloud-devrel-kokoro-resources/python-docs-samples/automl_secrets.txt /dev/shm
+#
+# Then run the script.
+# .kokoro/trampoline_v2.sh
+#
+# These environment variables are required:
+# TRAMPOLINE_IMAGE: The docker image to use.
+# TRAMPOLINE_DOCKERFILE: The location of the Dockerfile.
+#
+# You can optionally change these environment variables:
+# TRAMPOLINE_IMAGE_UPLOAD:
+# (true|false): Whether to upload the Docker image after
+# successful builds.
+# TRAMPOLINE_BUILD_FILE: The script to run in the docker container.
+# TRAMPOLINE_WORKSPACE: The workspace path in the docker container.
+# Defaults to /workspace.
+# There may also be repo-specific env vars defined in .trampolinerc at
+# the project root.
+
+
+set -euo pipefail
+
+TRAMPOLINE_VERSION="2.0.5"
+
+if command -v tput >/dev/null && [[ -n "${TERM:-}" ]]; then
+ readonly IO_COLOR_RED="$(tput setaf 1)"
+ readonly IO_COLOR_GREEN="$(tput setaf 2)"
+ readonly IO_COLOR_YELLOW="$(tput setaf 3)"
+ readonly IO_COLOR_RESET="$(tput sgr0)"
+else
+ readonly IO_COLOR_RED=""
+ readonly IO_COLOR_GREEN=""
+ readonly IO_COLOR_YELLOW=""
+ readonly IO_COLOR_RESET=""
+fi
+
+function function_exists {
+ [ $(LC_ALL=C type -t $1)"" == "function" ]
+}
+
+# Logs a message using the given color. The first argument must be one
+# of the IO_COLOR_* variables defined above, such as
+# "${IO_COLOR_YELLOW}". The remaining arguments will be logged in the
+# given color. The log message will also have an RFC-3339 timestamp
+# prepended (in UTC). You can disable the color output by setting
+# TERM=vt100.
+function log_impl() {
+ local color="$1"
+ shift
+ local timestamp="$(date -u "+%Y-%m-%dT%H:%M:%SZ")"
+ echo "================================================================"
+ echo "${color}${timestamp}:" "$@" "${IO_COLOR_RESET}"
+ echo "================================================================"
+}
+
+# Logs the given message with normal coloring and a timestamp.
+function log() {
+ log_impl "${IO_COLOR_RESET}" "$@"
+}
+
+# Logs the given message in green with a timestamp.
+function log_green() {
+ log_impl "${IO_COLOR_GREEN}" "$@"
+}
+
+# Logs the given message in yellow with a timestamp.
+function log_yellow() {
+ log_impl "${IO_COLOR_YELLOW}" "$@"
+}
+
+# Logs the given message in red with a timestamp.
+function log_red() {
+ log_impl "${IO_COLOR_RED}" "$@"
+}
+
+readonly tmpdir=$(mktemp -d -t ci-XXXXXXXX)
+readonly tmphome="${tmpdir}/h"
+mkdir -p "${tmphome}"
+
+function cleanup() {
+ rm -rf "${tmpdir}"
+}
+trap cleanup EXIT
+
+RUNNING_IN_CI="${RUNNING_IN_CI:-false}"
+
+# The workspace in the container, defaults to /workspace.
+TRAMPOLINE_WORKSPACE="${TRAMPOLINE_WORKSPACE:-/workspace}"
+
+pass_down_envvars=(
+ # TRAMPOLINE_V2 variables.
+ # Tells scripts whether they are running as part of CI or not.
+ "RUNNING_IN_CI"
+ # Indicates which CI system we're in.
+ "TRAMPOLINE_CI"
+ # Indicates the version of the script.
+ "TRAMPOLINE_VERSION"
+)
+
+log_yellow "Building with Trampoline ${TRAMPOLINE_VERSION}"
+
+# Detect which CI system we're in. If we're in any of the CI systems
+# we support, `RUNNING_IN_CI` will be true and `TRAMPOLINE_CI` will be
+# the name of the CI system. Both env vars are passed down to the
+# container to indicate which CI system we're in.
+if [[ -n "${KOKORO_BUILD_ID:-}" ]]; then
+ # Descriptive env var indicating we're running on CI.
+ RUNNING_IN_CI="true"
+ TRAMPOLINE_CI="kokoro"
+ if [[ "${TRAMPOLINE_USE_LEGACY_SERVICE_ACCOUNT:-}" == "true" ]]; then
+ if [[ ! -f "${KOKORO_GFILE_DIR}/kokoro-trampoline.service-account.json" ]]; then
+ log_red "${KOKORO_GFILE_DIR}/kokoro-trampoline.service-account.json does not exist. Did you forget to mount cloud-devrel-kokoro-resources/trampoline? Aborting."
+ exit 1
+ fi
+ # This service account will be activated later.
+ TRAMPOLINE_SERVICE_ACCOUNT="${KOKORO_GFILE_DIR}/kokoro-trampoline.service-account.json"
+ else
+ if [[ "${TRAMPOLINE_VERBOSE:-}" == "true" ]]; then
+ gcloud auth list
+ fi
+ log_yellow "Configuring Container Registry access"
+ gcloud auth configure-docker --quiet
+ fi
+ pass_down_envvars+=(
+ # KOKORO dynamic variables.
+ "KOKORO_BUILD_NUMBER"
+ "KOKORO_BUILD_ID"
+ "KOKORO_JOB_NAME"
+ "KOKORO_GIT_COMMIT"
+ "KOKORO_GITHUB_COMMIT"
+ "KOKORO_GITHUB_PULL_REQUEST_NUMBER"
+ "KOKORO_GITHUB_PULL_REQUEST_COMMIT"
+ # For Build Cop Bot
+ "KOKORO_GITHUB_COMMIT_URL"
+ "KOKORO_GITHUB_PULL_REQUEST_URL"
+ )
+elif [[ "${TRAVIS:-}" == "true" ]]; then
+ RUNNING_IN_CI="true"
+ TRAMPOLINE_CI="travis"
+ pass_down_envvars+=(
+ "TRAVIS_BRANCH"
+ "TRAVIS_BUILD_ID"
+ "TRAVIS_BUILD_NUMBER"
+ "TRAVIS_BUILD_WEB_URL"
+ "TRAVIS_COMMIT"
+ "TRAVIS_COMMIT_MESSAGE"
+ "TRAVIS_COMMIT_RANGE"
+ "TRAVIS_JOB_NAME"
+ "TRAVIS_JOB_NUMBER"
+ "TRAVIS_JOB_WEB_URL"
+ "TRAVIS_PULL_REQUEST"
+ "TRAVIS_PULL_REQUEST_BRANCH"
+ "TRAVIS_PULL_REQUEST_SHA"
+ "TRAVIS_PULL_REQUEST_SLUG"
+ "TRAVIS_REPO_SLUG"
+ "TRAVIS_SECURE_ENV_VARS"
+ "TRAVIS_TAG"
+ )
+elif [[ -n "${GITHUB_RUN_ID:-}" ]]; then
+ RUNNING_IN_CI="true"
+ TRAMPOLINE_CI="github-workflow"
+ pass_down_envvars+=(
+ "GITHUB_WORKFLOW"
+ "GITHUB_RUN_ID"
+ "GITHUB_RUN_NUMBER"
+ "GITHUB_ACTION"
+ "GITHUB_ACTIONS"
+ "GITHUB_ACTOR"
+ "GITHUB_REPOSITORY"
+ "GITHUB_EVENT_NAME"
+ "GITHUB_EVENT_PATH"
+ "GITHUB_SHA"
+ "GITHUB_REF"
+ "GITHUB_HEAD_REF"
+ "GITHUB_BASE_REF"
+ )
+elif [[ "${CIRCLECI:-}" == "true" ]]; then
+ RUNNING_IN_CI="true"
+ TRAMPOLINE_CI="circleci"
+ pass_down_envvars+=(
+ "CIRCLE_BRANCH"
+ "CIRCLE_BUILD_NUM"
+ "CIRCLE_BUILD_URL"
+ "CIRCLE_COMPARE_URL"
+ "CIRCLE_JOB"
+ "CIRCLE_NODE_INDEX"
+ "CIRCLE_NODE_TOTAL"
+ "CIRCLE_PREVIOUS_BUILD_NUM"
+ "CIRCLE_PROJECT_REPONAME"
+ "CIRCLE_PROJECT_USERNAME"
+ "CIRCLE_REPOSITORY_URL"
+ "CIRCLE_SHA1"
+ "CIRCLE_STAGE"
+ "CIRCLE_USERNAME"
+ "CIRCLE_WORKFLOW_ID"
+ "CIRCLE_WORKFLOW_JOB_ID"
+ "CIRCLE_WORKFLOW_UPSTREAM_JOB_IDS"
+ "CIRCLE_WORKFLOW_WORKSPACE_ID"
+ )
+fi
+
+# Configure the service account for pulling the docker image.
+function repo_root() {
+ local dir="$1"
+ while [[ ! -d "${dir}/.git" ]]; do
+ dir="$(dirname "$dir")"
+ done
+ echo "${dir}"
+}
+
+# Detect the project root. In CI builds, we assume the script is in
+# the git tree and traverse from there; otherwise, traverse from `pwd`
+# to find the `.git` directory.
+if [[ "${RUNNING_IN_CI:-}" == "true" ]]; then
+ PROGRAM_PATH="$(realpath "$0")"
+ PROGRAM_DIR="$(dirname "${PROGRAM_PATH}")"
+ PROJECT_ROOT="$(repo_root "${PROGRAM_DIR}")"
+else
+ PROJECT_ROOT="$(repo_root $(pwd))"
+fi
+
+log_yellow "Changing to the project root: ${PROJECT_ROOT}."
+cd "${PROJECT_ROOT}"
+
+# To support relative path for `TRAMPOLINE_SERVICE_ACCOUNT`, we need
+# to use this environment variable in `PROJECT_ROOT`.
+if [[ -n "${TRAMPOLINE_SERVICE_ACCOUNT:-}" ]]; then
+
+ mkdir -p "${tmpdir}/gcloud"
+ gcloud_config_dir="${tmpdir}/gcloud"
+
+ log_yellow "Using isolated gcloud config: ${gcloud_config_dir}."
+ export CLOUDSDK_CONFIG="${gcloud_config_dir}"
+
+ log_yellow "Using ${TRAMPOLINE_SERVICE_ACCOUNT} for authentication."
+ gcloud auth activate-service-account \
+ --key-file "${TRAMPOLINE_SERVICE_ACCOUNT}"
+ log_yellow "Configuring Container Registry access"
+ gcloud auth configure-docker --quiet
+fi
+
+required_envvars=(
+ # The basic trampoline configurations.
+ "TRAMPOLINE_IMAGE"
+ "TRAMPOLINE_BUILD_FILE"
+)
+
+if [[ -f "${PROJECT_ROOT}/.trampolinerc" ]]; then
+ source "${PROJECT_ROOT}/.trampolinerc"
+fi
+
+log_yellow "Checking environment variables."
+for e in "${required_envvars[@]}"
+do
+ if [[ -z "${!e:-}" ]]; then
+ log "Missing ${e} env var. Aborting."
+ exit 1
+ fi
+done
+
+# We want to support legacy style TRAMPOLINE_BUILD_FILE used with V1
+# script: e.g. "github/repo-name/.kokoro/run_tests.sh"
+TRAMPOLINE_BUILD_FILE="${TRAMPOLINE_BUILD_FILE#github/*/}"
+log_yellow "Using TRAMPOLINE_BUILD_FILE: ${TRAMPOLINE_BUILD_FILE}"
+
+# Ignore errors on docker operations and test execution
+set +e
+
+log_yellow "Preparing Docker image."
+# We only download the docker image in CI builds.
+if [[ "${RUNNING_IN_CI:-}" == "true" ]]; then
+ # Download the docker image specified by `TRAMPOLINE_IMAGE`
+
+ # We may want to add --max-concurrent-downloads flag.
+
+ log_yellow "Start pulling the Docker image: ${TRAMPOLINE_IMAGE}."
+ if docker pull "${TRAMPOLINE_IMAGE}"; then
+ log_green "Finished pulling the Docker image: ${TRAMPOLINE_IMAGE}."
+ has_image="true"
+ else
+ log_red "Failed pulling the Docker image: ${TRAMPOLINE_IMAGE}."
+ has_image="false"
+ fi
+else
+ # For local run, check if we have the image.
+ if docker images "${TRAMPOLINE_IMAGE}:latest" | grep "${TRAMPOLINE_IMAGE}"; then
+ has_image="true"
+ else
+ has_image="false"
+ fi
+fi
+
+
+# The default user for a Docker container has uid 0 (root). To avoid
+# creating root-owned files in the build directory, we tell docker to
+# use the current user ID.
+user_uid="$(id -u)"
+user_gid="$(id -g)"
+user_name="$(id -un)"
+
+# To allow docker in docker, we add the user to the docker group in
+# the host OS.
+docker_gid=$(cut -d: -f3 < <(getent group docker))
+
+update_cache="false"
+if [[ "${TRAMPOLINE_DOCKERFILE:-none}" != "none" ]]; then
+ # Build the Docker image from the source.
+ context_dir=$(dirname "${TRAMPOLINE_DOCKERFILE}")
+ docker_build_flags=(
+ "-f" "${TRAMPOLINE_DOCKERFILE}"
+ "-t" "${TRAMPOLINE_IMAGE}"
+ "--build-arg" "UID=${user_uid}"
+ "--build-arg" "USERNAME=${user_name}"
+ )
+ if [[ "${has_image}" == "true" ]]; then
+ docker_build_flags+=("--cache-from" "${TRAMPOLINE_IMAGE}")
+ fi
+
+ log_yellow "Start building the docker image."
+ if [[ "${TRAMPOLINE_VERBOSE:-false}" == "true" ]]; then
+ echo "docker build" "${docker_build_flags[@]}" "${context_dir}"
+ fi
+
+ # On CI systems, we want to suppress docker build logs and only
+ # output them when the build fails.
+ if [[ "${RUNNING_IN_CI:-}" == "true" ]]; then
+ if docker build "${docker_build_flags[@]}" "${context_dir}" \
+ > "${tmpdir}/docker_build.log" 2>&1; then
+ if [[ "${TRAMPOLINE_VERBOSE:-}" == "true" ]]; then
+ cat "${tmpdir}/docker_build.log"
+ fi
+
+ log_green "Finished building the docker image."
+ update_cache="true"
+ else
+ log_red "Failed to build the Docker image, aborting."
+ log_yellow "Dumping the build logs:"
+ cat "${tmpdir}/docker_build.log"
+ exit 1
+ fi
+ else
+ if docker build "${docker_build_flags[@]}" "${context_dir}"; then
+ log_green "Finished building the docker image."
+ update_cache="true"
+ else
+ log_red "Failed to build the Docker image, aborting."
+ exit 1
+ fi
+ fi
+else
+ if [[ "${has_image}" != "true" ]]; then
+ log_red "We do not have ${TRAMPOLINE_IMAGE} locally, aborting."
+ exit 1
+ fi
+fi
+
+# We use an array for the flags so they are easier to document.
+docker_flags=(
+ # Remove the container after it exits.
+ "--rm"
+
+ # Use the host network.
+ "--network=host"
+
+ # Run in privileged mode. We are not using docker for sandboxing or
+ # isolation, just for packaging our dev tools.
+ "--privileged"
+
+ # Run the container as the current user id. Because the container gets to
+ # write in ${PWD}, you typically want this to be your user id.
+ # To allow docker in docker, we need to use the docker gid on the host.
+ "--user" "${user_uid}:${docker_gid}"
+
+ # Pass down the USER.
+ "--env" "USER=${user_name}"
+
+ # Mount the project directory inside the Docker container.
+ "--volume" "${PROJECT_ROOT}:${TRAMPOLINE_WORKSPACE}"
+ "--workdir" "${TRAMPOLINE_WORKSPACE}"
+ "--env" "PROJECT_ROOT=${TRAMPOLINE_WORKSPACE}"
+
+ # Mount the temporary home directory.
+ "--volume" "${tmphome}:/h"
+ "--env" "HOME=/h"
+
+ # Allow docker in docker.
+ "--volume" "/var/run/docker.sock:/var/run/docker.sock"
+
+ # Mount /tmp so that docker in docker can mount files
+ # there correctly.
+ "--volume" "/tmp:/tmp"
+ # Pass down the KOKORO_GFILE_DIR and KOKORO_KEYSTORE_DIR
+ # TODO(tmatsuo): This part is not portable.
+ "--env" "TRAMPOLINE_SECRET_DIR=/secrets"
+ "--volume" "${KOKORO_GFILE_DIR:-/dev/shm}:/secrets/gfile"
+ "--env" "KOKORO_GFILE_DIR=/secrets/gfile"
+ "--volume" "${KOKORO_KEYSTORE_DIR:-/dev/shm}:/secrets/keystore"
+ "--env" "KOKORO_KEYSTORE_DIR=/secrets/keystore"
+)
+
+# Add an option for nicer output if the build gets a tty.
+if [[ -t 0 ]]; then
+ docker_flags+=("-it")
+fi
+
+# Passing down env vars
+for e in "${pass_down_envvars[@]}"
+do
+ if [[ -n "${!e:-}" ]]; then
+ docker_flags+=("--env" "${e}=${!e}")
+ fi
+done
+
+# If arguments are given, all arguments will become the commands run
+# in the container; otherwise, run TRAMPOLINE_BUILD_FILE.
+if [[ $# -ge 1 ]]; then
+ log_yellow "Running the given commands '" "${@:1}" "' in the container."
+ readonly commands=("${@:1}")
+ if [[ "${TRAMPOLINE_VERBOSE:-}" == "true" ]]; then
+ echo docker run "${docker_flags[@]}" "${TRAMPOLINE_IMAGE}" "${commands[@]}"
+ fi
+ docker run "${docker_flags[@]}" "${TRAMPOLINE_IMAGE}" "${commands[@]}"
+else
+ log_yellow "Running the tests in a Docker container."
+ docker_flags+=("--entrypoint=${TRAMPOLINE_BUILD_FILE}")
+ if [[ "${TRAMPOLINE_VERBOSE:-}" == "true" ]]; then
+ echo docker run "${docker_flags[@]}" "${TRAMPOLINE_IMAGE}"
+ fi
+ docker run "${docker_flags[@]}" "${TRAMPOLINE_IMAGE}"
+fi
+
+
+test_retval=$?
+
+if [[ ${test_retval} -eq 0 ]]; then
+ log_green "Build finished with ${test_retval}"
+else
+ log_red "Build finished with ${test_retval}"
+fi
+
+# Only upload it when the test passes.
+if [[ "${update_cache}" == "true" ]] && \
+ [[ $test_retval == 0 ]] && \
+ [[ "${TRAMPOLINE_IMAGE_UPLOAD:-false}" == "true" ]]; then
+ log_yellow "Uploading the Docker image."
+ if docker push "${TRAMPOLINE_IMAGE}"; then
+ log_green "Finished uploading the Docker image."
+ else
+ log_red "Failed uploading the Docker image."
+ fi
+ # Call trampoline_after_upload_hook if it's defined.
+ if function_exists trampoline_after_upload_hook; then
+ trampoline_after_upload_hook
+ fi
+
+fi
+
+exit "${test_retval}"
diff --git a/packages/google-cloud-dlp/.trampolinerc b/packages/google-cloud-dlp/.trampolinerc
new file mode 100644
index 000000000000..995ee29111e1
--- /dev/null
+++ b/packages/google-cloud-dlp/.trampolinerc
@@ -0,0 +1,51 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Template for .trampolinerc
+
+# Add required env vars here.
+required_envvars+=(
+ "STAGING_BUCKET"
+ "V2_STAGING_BUCKET"
+)
+
+# Add env vars which are passed down into the container here.
+pass_down_envvars+=(
+ "STAGING_BUCKET"
+ "V2_STAGING_BUCKET"
+)
+
+# Prevent unintentional override of the default image.
+if [[ "${TRAMPOLINE_IMAGE_UPLOAD:-false}" == "true" ]] && \
+ [[ -z "${TRAMPOLINE_IMAGE:-}" ]]; then
+ echo "Please set TRAMPOLINE_IMAGE if you want to upload the Docker image."
+ exit 1
+fi
+
+# Define the default value if it makes sense.
+if [[ -z "${TRAMPOLINE_IMAGE_UPLOAD:-}" ]]; then
+ TRAMPOLINE_IMAGE_UPLOAD=""
+fi
+
+if [[ -z "${TRAMPOLINE_IMAGE:-}" ]]; then
+ TRAMPOLINE_IMAGE=""
+fi
+
+if [[ -z "${TRAMPOLINE_DOCKERFILE:-}" ]]; then
+ TRAMPOLINE_DOCKERFILE=""
+fi
+
+if [[ -z "${TRAMPOLINE_BUILD_FILE:-}" ]]; then
+ TRAMPOLINE_BUILD_FILE=""
+fi
diff --git a/packages/google-cloud-dlp/MANIFEST.in b/packages/google-cloud-dlp/MANIFEST.in
index 68855abc3f02..e9e29d12033d 100644
--- a/packages/google-cloud-dlp/MANIFEST.in
+++ b/packages/google-cloud-dlp/MANIFEST.in
@@ -20,3 +20,6 @@ recursive-include google *.json *.proto
recursive-include tests *
global-exclude *.py[co]
global-exclude __pycache__
+
+# Exclude scripts for samples readmegen
+prune scripts/readme-gen
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/docs/_templates/layout.html b/packages/google-cloud-dlp/docs/_templates/layout.html
index 228529efe2d2..6316a537f72b 100644
--- a/packages/google-cloud-dlp/docs/_templates/layout.html
+++ b/packages/google-cloud-dlp/docs/_templates/layout.html
@@ -21,8 +21,8 @@
- On January 1, 2020 this library will no longer support Python 2 on the latest released version.
- Previously released library versions will continue to be available. For more information please
+ As of January 1, 2020 this library no longer supports Python 2 on the latest released version.
+ Library versions released prior to that date will continue to be available. For more information please
visit
Python 2 support on Google Cloud.
{% block body %} {% endblock %}
diff --git a/packages/google-cloud-dlp/docs/conf.py b/packages/google-cloud-dlp/docs/conf.py
index fc9991d1ec40..cc9cc3485b21 100644
--- a/packages/google-cloud-dlp/docs/conf.py
+++ b/packages/google-cloud-dlp/docs/conf.py
@@ -20,6 +20,10 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, os.path.abspath(".."))
+# For plugins that cannot read conf.py.
+# See also: https://github.com/docascode/sphinx-docfx-yaml/issues/85
+sys.path.insert(0, os.path.abspath("."))
+
__version__ = ""
# -- General configuration ------------------------------------------------
@@ -38,21 +42,18 @@
"sphinx.ext.napoleon",
"sphinx.ext.todo",
"sphinx.ext.viewcode",
+ "recommonmark",
]
# autodoc/autosummary flags
autoclass_content = "both"
-autodoc_default_flags = ["members"]
+autodoc_default_options = {"members": True}
autosummary_generate = True
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
-# Allow markdown includes (so releases.md can include CHANGLEOG.md)
-# http://www.sphinx-doc.org/en/master/markdown.html
-source_parsers = {".md": "recommonmark.parser.CommonMarkParser"}
-
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
@@ -93,7 +94,12 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
-exclude_patterns = ["_build"]
+exclude_patterns = [
+ "_build",
+ "samples/AUTHORING_GUIDE.md",
+ "samples/CONTRIBUTING.md",
+ "samples/snippets/README.rst",
+]
# The reST default role (used for this markup: `text`) to use for all
# documents.
@@ -293,7 +299,7 @@
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
- (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1)
+ (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1,)
]
# If true, show URL addresses after external links.
@@ -334,7 +340,7 @@
intersphinx_mapping = {
"python": ("http://python.readthedocs.org/en/latest/", None),
"google-auth": ("https://google-auth.readthedocs.io/en/stable", None),
- "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None),
+ "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None,),
"grpc": ("https://grpc.io/grpc/python/", None),
}
diff --git a/packages/google-cloud-dlp/noxfile.py b/packages/google-cloud-dlp/noxfile.py
index cfaff4be5040..e27f448fbad6 100644
--- a/packages/google-cloud-dlp/noxfile.py
+++ b/packages/google-cloud-dlp/noxfile.py
@@ -23,14 +23,15 @@
import nox
-BLACK_VERSION = "black==19.3b0"
+BLACK_VERSION = "black==19.10b0"
BLACK_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"]
-if os.path.exists("samples"):
- BLACK_PATHS.append("samples")
+DEFAULT_PYTHON_VERSION = "3.8"
+SYSTEM_TEST_PYTHON_VERSIONS = ["2.7", "3.8"]
+UNIT_TEST_PYTHON_VERSIONS = ["2.7", "3.5", "3.6", "3.7", "3.8"]
-@nox.session(python="3.7")
+@nox.session(python=DEFAULT_PYTHON_VERSION)
def lint(session):
"""Run linters.
@@ -38,7 +39,9 @@ def lint(session):
serious code quality issues.
"""
session.install("flake8", BLACK_VERSION)
- session.run("black", "--check", *BLACK_PATHS)
+ session.run(
+ "black", "--check", *BLACK_PATHS,
+ )
session.run("flake8", "google", "tests")
@@ -53,10 +56,12 @@ def blacken(session):
check the state of the `gcp_ubuntu_config` we use for that Kokoro run.
"""
session.install(BLACK_VERSION)
- session.run("black", *BLACK_PATHS)
+ session.run(
+ "black", *BLACK_PATHS,
+ )
-@nox.session(python="3.7")
+@nox.session(python=DEFAULT_PYTHON_VERSION)
def lint_setup_py(session):
"""Verify that setup.py is valid (including RST check)."""
session.install("docutils", "pygments")
@@ -84,17 +89,21 @@ def default(session):
)
-@nox.session(python=["2.7", "3.5", "3.6", "3.7", "3.8"])
+@nox.session(python=UNIT_TEST_PYTHON_VERSIONS)
def unit(session):
"""Run the unit test suite."""
default(session)
-@nox.session(python=["2.7", "3.7"])
+@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS)
def system(session):
"""Run the system test suite."""
system_test_path = os.path.join("tests", "system.py")
system_test_folder_path = os.path.join("tests", "system")
+
+ # Check the value of `RUN_SYSTEM_TESTS` env var. It defaults to true.
+ if os.environ.get("RUN_SYSTEM_TESTS", "true") == "false":
+ session.skip("RUN_SYSTEM_TESTS is set to false, skipping")
# Sanity check: Only run tests if the environment variable is set.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""):
session.skip("Credentials must be set via environment variable")
@@ -110,7 +119,9 @@ def system(session):
# Install all test dependencies, then install this package into the
# virtualenv's dist-packages.
- session.install("mock", "pytest", "google-cloud-testutils")
+ session.install(
+ "mock", "pytest", "google-cloud-testutils",
+ )
session.install("-e", "test_utils")
session.install("-e", ".")
@@ -121,7 +132,7 @@ def system(session):
session.run("py.test", "--quiet", system_test_folder_path, *session.posargs)
-@nox.session(python="3.7")
+@nox.session(python=DEFAULT_PYTHON_VERSION)
def cover(session):
"""Run the final coverage report.
@@ -134,19 +145,52 @@ def cover(session):
session.run("coverage", "erase")
-@nox.session(python="3.7")
+@nox.session(python=DEFAULT_PYTHON_VERSION)
def docs(session):
"""Build the docs for this library."""
session.install("-e", ".")
- session.install("sphinx<3.0.0", "alabaster", "recommonmark")
+ session.install("sphinx", "alabaster", "recommonmark")
+
+ shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
+ session.run(
+ "sphinx-build",
+ # "-W", # warnings as errors
+ "-T", # show full traceback on exception
+ "-N", # no colors
+ "-b",
+ "html",
+ "-d",
+ os.path.join("docs", "_build", "doctrees", ""),
+ os.path.join("docs", ""),
+ os.path.join("docs", "_build", "html", ""),
+ )
+
+
+@nox.session(python=DEFAULT_PYTHON_VERSION)
+def docfx(session):
+ """Build the docfx yaml files for this library."""
+
+ session.install("-e", ".")
+ session.install("sphinx", "alabaster", "recommonmark", "sphinx-docfx-yaml")
shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
session.run(
"sphinx-build",
- "-W", # warnings as errors
"-T", # show full traceback on exception
"-N", # no colors
+ "-D",
+ (
+ "extensions=sphinx.ext.autodoc,"
+ "sphinx.ext.autosummary,"
+ "docfx_yaml.extension,"
+ "sphinx.ext.intersphinx,"
+ "sphinx.ext.coverage,"
+ "sphinx.ext.napoleon,"
+ "sphinx.ext.todo,"
+ "sphinx.ext.viewcode,"
+ "recommonmark"
+ ),
"-b",
"html",
"-d",
diff --git a/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md
new file mode 100644
index 000000000000..55c97b32f4c1
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md
@@ -0,0 +1 @@
+See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/samples/CONTRIBUTING.md b/packages/google-cloud-dlp/samples/CONTRIBUTING.md
new file mode 100644
index 000000000000..34c882b6f1a3
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/CONTRIBUTING.md
@@ -0,0 +1 @@
+See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/CONTRIBUTING.md
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst b/packages/google-cloud-dlp/samples/snippets/README.rst
new file mode 100644
index 000000000000..0b25cc7acde0
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/README.rst
@@ -0,0 +1,405 @@
+
+.. This file is automatically generated. Do not edit this file directly.
+
+Google Data Loss Prevention Python Samples
+===============================================================================
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/README.rst
+
+
+This directory contains samples for Google Data Loss Prevention. `Google Data Loss Prevention`_ provides programmatic access to a powerful detection engine for personally identifiable information and other privacy-sensitive data in unstructured data streams.
+
+
+
+
+.. _Google Data Loss Prevention: https://cloud.google.com/dlp/docs/
+
+
+Setup
+-------------------------------------------------------------------------------
+
+
+
+Authentication
+++++++++++++++
+
+This sample requires you to have authentication set up. Refer to the
+`Authentication Getting Started Guide`_ for instructions on setting up
+credentials for applications.
+
+.. _Authentication Getting Started Guide:
+ https://cloud.google.com/docs/authentication/getting-started
+
+
+
+
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+ .. code-block:: bash
+
+ $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+ .. _Python Development Environment Setup Guide:
+ https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 3.6+.
+
+ .. code-block:: bash
+
+ $ virtualenv env
+ $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+ .. code-block:: bash
+
+ $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
+
+
+
+
+
+
+Samples
+-------------------------------------------------------------------------------
+
+
+Quickstart
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/quickstart.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python quickstart.py
+
+
+
+
+Inspect Content
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/inspect_content.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python inspect_content.py
+
+
+ usage: inspect_content.py [-h] {string,table,file,gcs,datastore,bigquery} ...
+
+ Sample app that uses the Data Loss Prevention API to inspect a string, a local
+ file or a file on Google Cloud Storage.
+
+ positional arguments:
+ {string,table,file,gcs,datastore,bigquery}
+ Select how to submit content to the API.
+ string Inspect a string.
+ table Inspect a table.
+ file Inspect a local file.
+ gcs Inspect files on Google Cloud Storage.
+ datastore Inspect files on Google Datastore.
+ bigquery Inspect files on Google BigQuery.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+Redact Content
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/redact.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python redact.py
+
+
+ usage: redact.py [-h] {info_types,all_text} ...
+
+ Sample app that uses the Data Loss Prevention API to redact the contents of an
+ image file.
+
+ positional arguments:
+ {info_types,all_text}
+ Select which content should be redacted.
+ info_types Redact specific infoTypes from an image.
+ all_text Redact all text from an image. The MIME type of the
+ file is inferred via the Python standard library's
+ mimetypes module.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+Metadata
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/metadata.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python metadata.py
+
+
+ usage: metadata.py [-h] [--language_code LANGUAGE_CODE] [--filter FILTER]
+
+ Sample app that queries the Data Loss Prevention API for supported categories
+ and info types.
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --language_code LANGUAGE_CODE
+ The BCP-47 language code to use, e.g. 'en-US'.
+ --filter FILTER An optional filter to only return info types supported
+ by certain parts of the API. Defaults to
+ "supported_by=INSPECT".
+
+
+
+
+
+Jobs
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/jobs.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python jobs.py
+
+
+ usage: jobs.py [-h] {list,delete} ...
+
+ Sample app to list and delete DLP jobs using the Data Loss Prevention API.
+
+ positional arguments:
+ {list,delete} Select how to submit content to the API.
+ list List Data Loss Prevention API jobs corresponding to a given
+ filter.
+ delete Delete results of a Data Loss Prevention API job.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+Templates
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/templates.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python templates.py
+
+
+ usage: templates.py [-h] {create,list,delete} ...
+
+ Sample app that sets up Data Loss Prevention API inspect templates.
+
+ positional arguments:
+ {create,list,delete} Select which action to perform.
+ create Create a template.
+ list List all templates.
+ delete Delete a template.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+Triggers
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/triggers.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python triggers.py
+
+
+ usage: triggers.py [-h] {create,list,delete} ...
+
+ Sample app that sets up Data Loss Prevention API automation triggers.
+
+ positional arguments:
+ {create,list,delete} Select which action to perform.
+ create Create a trigger.
+ list List all triggers.
+ delete Delete a trigger.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+Risk Analysis
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/risk.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python risk.py
+
+
+ usage: risk.py [-h] {numerical,categorical,k_anonymity,l_diversity,k_map} ...
+
+    Sample app that uses the Data Loss Prevention API to perform risk analysis.
+
+ positional arguments:
+ {numerical,categorical,k_anonymity,l_diversity,k_map}
+ Select how to submit content to the API.
+ numerical
+ categorical
+    k_anonymity         Computes the k-anonymity of a column set in a Google
+                        BigQuery table.
+    l_diversity         Computes the l-diversity of a column set in a Google
+                        BigQuery table.
+    k_map               Computes the k-map risk estimation of a column set in
+                        a Google BigQuery table.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
+
+
+
+DeID
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/deid.py,dlp/README.rst
+
+
+
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python deid.py
+
+
+ usage: deid.py [-h]
+ {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype}
+ ...
+
+ Uses of the Data Loss Prevention API for deidentifying sensitive data.
+
+ positional arguments:
+ {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype}
+ Select how to submit content to the API.
+ deid_mask Deidentify sensitive data in a string by masking it
+ with a character.
+ deid_replace Deidentify sensitive data in a string by replacing it
+ with another string.
+ deid_fpe Deidentify sensitive data in a string using Format
+ Preserving Encryption (FPE).
+ reid_fpe Reidentify sensitive data in a string using Format
+ Preserving Encryption (FPE).
+ deid_date_shift Deidentify dates in a CSV file by pseudorandomly
+ shifting them.
+ replace_with_infotype
+ Deidentify sensitive data in a string by replacing it
+ with the info type of the data.
+
+ optional arguments:
+ -h, --help show this help message and exit
+
+
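+For example, to mask U.S. Social Security numbers with ``#`` (the project id
+below is a placeholder, and the SSN info type must be passed explicitly since
+the default info types are names and email addresses):
+
+.. code-block:: bash
+
+    $ python deid.py deid_mask --info_types US_SOCIAL_SECURITY_NUMBER YOUR_PROJECT "My SSN is 372819127" -m "#"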
+
+
+
+
+
+
+
+The client library
+-------------------------------------------------------------------------------
+
+This sample uses the `Google Cloud Client Library for Python`_.
+You can read the documentation for more details on API usage and use GitHub
+to `browse the source`_ and `report issues`_.
+
+.. _Google Cloud Client Library for Python:
+ https://googlecloudplatform.github.io/google-cloud-python/
+.. _browse the source:
+ https://github.com/GoogleCloudPlatform/google-cloud-python
+.. _report issues:
+ https://github.com/GoogleCloudPlatform/google-cloud-python/issues
+
+
+
+.. _Google Cloud SDK: https://cloud.google.com/sdk/
diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst.in b/packages/google-cloud-dlp/samples/snippets/README.rst.in
new file mode 100644
index 000000000000..708e870fa08a
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/README.rst.in
@@ -0,0 +1,52 @@
+# This file is used to generate README.rst
+
+product:
+ name: Google Data Loss Prevention
+ short_name: Data Loss Prevention
+ url: https://cloud.google.com/dlp/docs/
+ description: >
+ `Google Data Loss Prevention`_ provides programmatic access to a powerful
+ detection engine for personally identifiable information and other
+ privacy-sensitive data in unstructured data streams.
+
+setup:
+- auth
+- install_deps
+
+required_api_url: https://console.cloud.google.com/apis/library/dlp.googleapis.com
+
+required_roles:
+- DLP Administrator
+- DLP API Service Agent
+
+samples:
+- name: Quickstart
+ file: quickstart.py
+- name: Inspect Content
+ file: inspect_content.py
+ show_help: true
+- name: Redact Content
+ file: redact.py
+ show_help: true
+- name: Metadata
+ file: metadata.py
+ show_help: true
+- name: Jobs
+ file: jobs.py
+ show_help: true
+- name: Templates
+ file: templates.py
+ show_help: true
+- name: Triggers
+ file: triggers.py
+ show_help: true
+- name: Risk Analysis
+ file: risk.py
+ show_help: true
+- name: DeID
+ file: deid.py
+ show_help: true
+
+cloud_client_library: true
+
+folder: dlp
diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py
new file mode 100644
index 000000000000..565fed6994c6
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py
@@ -0,0 +1,302 @@
+# Copyright 2020 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Custom infoType snippets.
+
+This file contains sample code that uses the Data Loss Prevention API to create
+custom infoType detectors to refine scan results.
+"""
+
+
+# [START dlp_omit_name_if_also_email]
+def omit_name_if_also_email(
+ project,
+ content_string,
+):
+ """Marches PERSON_NAME and EMAIL_ADDRESS, but not both.
+
+ Uses the Data Loss Prevention API omit matches on PERSON_NAME if the
+ EMAIL_ADDRESS detector also matches.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Construct a list of infoTypes for DLP to locate in `content_string`. See
+ # https://cloud.google.com/dlp/docs/concepts-infotypes for more information
+ # about supported infoTypes.
+ info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}]
+
+ # Construct the configuration dictionary that will only match on PERSON_NAME
+ # if the EMAIL_ADDRESS doesn't also match. This configuration helps reduce
+ # the total number of findings when there is a large overlap between different
+ # infoTypes.
+    inspect_config = {
+        "info_types": info_types_to_locate,
+        "rule_set": [
+            {
+                "info_types": [{"name": "PERSON_NAME"}],
+                "rules": [
+                    {
+                        "exclusion_rule": {
+                            "exclude_info_types": {
+                                "info_types": [{"name": "EMAIL_ADDRESS"}]
+                            },
+                            "matching_type": "MATCHING_TYPE_PARTIAL_MATCH",
+                        }
+                    }
+                ],
+            }
+        ],
+    }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ return [f.info_type.name for f in response.result.findings]
+
+
+# [END dlp_omit_name_if_also_email]
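+
+# A minimal usage sketch (hypothetical project id; assumes application
+# default credentials are configured). For input that matches both
+# detectors, only the EMAIL_ADDRESS finding survives the exclusion rule:
+#
+#     findings = omit_name_if_also_email("my-project", "alice@example.com")
+#     print(findings)  # ["EMAIL_ADDRESS"]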
+
+
+# [START inspect_with_person_name_w_custom_hotword]
+def inspect_with_person_name_w_custom_hotword(
+ project,
+ content_string,
+ custom_hotword="patient"
+):
+ """Uses the Data Loss Prevention API increase likelihood for matches on
+ PERSON_NAME if the user specified custom hotword is present. Only
+ includes findings with the increased likelihood by setting a minimum
+ likelihood threshold of VERY_LIKELY.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+ custom_hotword: The custom hotword used for likelihood boosting.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Construct a rule set with the caller-provided hotword, boosting the
+    # likelihood to VERY_LIKELY when the hotword appears within the
+    # 50-character window preceding the PII finding.
+ hotword_rule = {
+ "hotword_regex": {"pattern": custom_hotword},
+ "likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"},
+ "proximity": {"window_before": 50},
+ }
+
+ rule_set = [
+ {
+ "info_types": [{"name": "PERSON_NAME"}],
+ "rules": [{"hotword_rule": hotword_rule}],
+ }
+ ]
+
+    # Construct the configuration dictionary with the hotword rule set and
+    # the minimum likelihood threshold.
+ inspect_config = {
+ "rule_set": rule_set,
+ "min_likelihood": "VERY_LIKELY",
+ }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ if finding.quote:
+ print(f"Quote: {finding.quote}")
+ except AttributeError:
+ pass
+ print(f"Info type: {finding.info_type.name}")
+ print(f"Likelihood: {finding.likelihood}")
+ else:
+ print("No findings.")
+
+# [END inspect_with_person_name_w_custom_hotword]
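+
+# A minimal usage sketch (hypothetical project id): with the hotword
+# "patient" preceding the name, the PERSON_NAME finding is reported at
+# likelihood VERY_LIKELY:
+#
+#     inspect_with_person_name_w_custom_hotword(
+#         "my-project", "patient's name is John Doe.", "patient")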
+
+
+# [START dlp_inspect_with_medical_record_number_custom_regex_detector]
+def inspect_with_medical_record_number_custom_regex_detector(
+ project,
+ content_string,
+):
+ """Uses the Data Loss Prevention API to analyze string with medical record
+ number custom regex detector
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Construct a custom regex detector info type called "C_MRN",
+ # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+ # The detector has a detection likelihood of POSSIBLE.
+ custom_info_types = [
+ {
+ "info_type": {"name": "C_MRN"},
+ "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+ "likelihood": "POSSIBLE",
+ }
+ ]
+
+ # Construct the configuration dictionary with the custom regex info type.
+ inspect_config = {
+ "custom_info_types": custom_info_types,
+ }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ if finding.quote:
+ print(f"Quote: {finding.quote}")
+ except AttributeError:
+ pass
+ print(f"Info type: {finding.info_type.name}")
+ print(f"Likelihood: {finding.likelihood}")
+ else:
+ print("No findings.")
+
+# [END dlp_inspect_with_medical_record_number_custom_regex_detector]
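+
+# Example (hypothetical project id): a string such as "Patient MRN
+# 444-5-22222" matches the [1-9]{3}-[1-9]{1}-[1-9]{5} pattern and yields a
+# C_MRN finding:
+#
+#     inspect_with_medical_record_number_custom_regex_detector(
+#         "my-project", "Patient MRN 444-5-22222")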
+
+
+# [START dlp_inspect_with_medical_record_number_w_custom_hotwords]
+def inspect_with_medical_record_number_w_custom_hotwords(
+ project,
+ content_string,
+):
+ """Uses the Data Loss Prevention API to analyze string with medical record
+ number custom regex detector, with custom hotwords rules to boost finding
+ certainty under some circumstances.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Construct a custom regex detector info type called "C_MRN",
+ # with ###-#-##### pattern, where each # represents a digit from 1 to 9.
+ # The detector has a detection likelihood of POSSIBLE.
+ custom_info_types = [
+ {
+ "info_type": {"name": "C_MRN"},
+ "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"},
+ "likelihood": "POSSIBLE",
+ }
+ ]
+
+    # Construct a rule set with hotwords "mrn" and "medical", boosting the
+    # likelihood to VERY_LIKELY when a hotword appears within the
+    # 10-character window preceding the PII finding.
+    hotword_rule = {
+        "hotword_regex": {"pattern": "(?i)(mrn|medical)(?-i)"},
+        "likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"},
+        "proximity": {"window_before": 10},
+    }
+
+ rule_set = [
+ {
+ "info_types": [{"name": "C_MRN"}],
+ "rules": [{"hotword_rule": hotword_rule}],
+ }
+ ]
+
+    # Construct the configuration dictionary with the custom regex info
+    # type and the hotword rule set.
+ inspect_config = {
+ "custom_info_types": custom_info_types,
+ "rule_set": rule_set,
+ }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ if finding.quote:
+ print(f"Quote: {finding.quote}")
+ except AttributeError:
+ pass
+ print(f"Info type: {finding.info_type.name}")
+ print(f"Likelihood: {finding.likelihood}")
+ else:
+ print("No findings.")
+
+# [END dlp_inspect_with_medical_record_number_w_custom_hotwords]
diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py
new file mode 100644
index 000000000000..4a81df60adbc
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py
@@ -0,0 +1,65 @@
+# Copyright 2020 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import custom_infotype
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+
+def test_omit_name_if_also_email(capsys):
+ info_types = custom_infotype.omit_name_if_also_email(
+ GCLOUD_PROJECT, "alice@example.com")
+
+ # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME.
+ assert len(info_types) == 1
+ assert info_types[0] == "EMAIL_ADDRESS"
+
+
+def test_inspect_with_person_name_w_custom_hotword(capsys):
+ custom_infotype.inspect_with_person_name_w_custom_hotword(
+ GCLOUD_PROJECT, "patient's name is John Doe.", "patient")
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PERSON_NAME" in out
+ assert "Likelihood: 5" in out
+
+
+def test_inspect_with_medical_record_number_custom_regex_detector(capsys):
+ custom_infotype.inspect_with_medical_record_number_custom_regex_detector(
+ GCLOUD_PROJECT, "Patients MRN 444-5-22222")
+
+ out, _ = capsys.readouterr()
+ assert "Info type: C_MRN" in out
+
+
+def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords(
+ capsys):
+ custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
+ GCLOUD_PROJECT, "just a number 444-5-22222")
+
+ out, _ = capsys.readouterr()
+ assert "Info type: C_MRN" in out
+ assert "Likelihood: 3" in out
+
+
+def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords(
+ capsys):
+ custom_infotype.inspect_with_medical_record_number_w_custom_hotwords(
+ GCLOUD_PROJECT, "Patients MRN 444-5-22222")
+
+ out, _ = capsys.readouterr()
+ assert "Info type: C_MRN" in out
+ assert "Likelihood: 5" in out
diff --git a/packages/google-cloud-dlp/samples/snippets/deid.py b/packages/google-cloud-dlp/samples/snippets/deid.py
new file mode 100644
index 000000000000..70bd162385b6
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/deid.py
@@ -0,0 +1,1073 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Uses of the Data Loss Prevention API for deidentifying sensitive data."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_deidentify_masking]
+def deidentify_with_mask(
+ project, input_str, info_types, masking_character=None, number_to_mask=0
+):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string by masking it with a character.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to deidentify (will be treated as text).
+        info_types: A list of strings representing info types to look for.
+        masking_character: The character to mask matching sensitive data with.
+ number_to_mask: The maximum number of sensitive characters to mask in
+ a match. If omitted or set to zero, the API will default to no
+ maximum.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "character_mask_config": {
+ "masking_character": masking_character,
+ "number_to_mask": number_to_mask,
+ }
+ }
+ }
+ ]
+ }
+ }
+
+ # Construct item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item=item,
+ )
+
+ # Print out the results.
+ print(response.item.value)
+
+
+# [END dlp_deidentify_masking]
+
+# [START dlp_deidentify_redact]
+def deidentify_with_redact(
+ project,
+ input_str,
+ info_types,
+):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string by redacting matched input values.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ input_str: The string to deidentify (will be treated as text).
+ info_types: A list of strings representing info types to look for.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "redact_config": {}
+ }
+ }
+ ]
+ }
+ }
+
+ # Construct item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item=item,
+ )
+
+ # Print out the results.
+ print(response.item.value)
+
+
+# [END dlp_deidentify_redact]
+
+# [START dlp_deidentify_replace]
+def deidentify_with_replace(
+ project,
+ input_str,
+ info_types,
+ replacement_str="REPLACEMENT_STR",
+):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string by replacing matched input values with a value you specify.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ input_str: The string to deidentify (will be treated as text).
+ info_types: A list of strings representing info types to look for.
+ replacement_str: The string to replace all values that match given
+ info types.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "replace_config": {
+ "new_value": {
+ "string_value": replacement_str,
+ }
+ }
+ }
+ }
+ ]
+ }
+ }
+
+ # Construct item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item=item,
+ )
+
+ # Print out the results.
+ print(response.item.value)
+
+# [END dlp_deidentify_replace]
+
+# [START dlp_deidentify_fpe]
+
+
+def deidentify_with_fpe(
+ project,
+ input_str,
+ info_types,
+ alphabet=None,
+ surrogate_type=None,
+ key_name=None,
+ wrapped_key=None,
+):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string using Format Preserving Encryption (FPE).
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to deidentify (will be treated as text).
+        info_types: A list of strings representing info types to look for.
+        alphabet: The set of characters to replace sensitive ones with. For
+ more information, see https://cloud.google.com/dlp/docs/reference/
+ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+ surrogate_type: The name of the surrogate custom info type to use. Only
+ necessary if you want to reverse the deidentification process. Can
+ be essentially any arbitrary string, as long as it doesn't appear
+ in your dataset otherwise.
+ key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
+ AES-256 key. Example:
+ key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
+ keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
+ wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
+ should be encrypted using the Cloud KMS key specified by key_name.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # The wrapped key is base64-encoded, but the library expects a binary
+ # string, so decode it here.
+ import base64
+
+ wrapped_key = base64.b64decode(wrapped_key)
+
+ # Construct FPE configuration dictionary
+ crypto_replace_ffx_fpe_config = {
+ "crypto_key": {
+ "kms_wrapped": {
+ "wrapped_key": wrapped_key,
+ "crypto_key_name": key_name,
+ }
+ },
+ "common_alphabet": alphabet,
+ }
+
+ # Add surrogate type
+ if surrogate_type:
+ crypto_replace_ffx_fpe_config["surrogate_info_type"] = {
+ "name": surrogate_type
+ }
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config
+ }
+ }
+ ]
+ }
+ }
+
+ # Convert string to item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item=item,
+ )
+
+ # Print results
+ print(response.item.value)
+
+
+# [END dlp_deidentify_fpe]
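+
+# A hedged sketch of producing `wrapped_key` outside this module (assumes an
+# existing Cloud KMS key; the resource names below are placeholders):
+#
+#     openssl rand 32 > aes_key.bin          # random 256-bit AES key
+#     gcloud kms encrypt --location=global --keyring=YOUR_KEYRING \
+#         --key=YOUR_KEY --plaintext-file=aes_key.bin \
+#         --ciphertext-file=aes_key.enc
+#     base64 aes_key.enc                     # pass the output as wrapped_key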
+
+
+# [START dlp_reidentify_fpe]
+def reidentify_with_fpe(
+ project,
+ input_str,
+ alphabet=None,
+ surrogate_type=None,
+ key_name=None,
+ wrapped_key=None,
+):
+ """Uses the Data Loss Prevention API to reidentify sensitive data in a
+ string that was encrypted by Format Preserving Encryption (FPE).
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to reidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+ key_name: The name of the Cloud KMS key used to encrypt ('wrap') the
+ AES-256 key. Example:
+ keyName = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
+ keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
+ wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key
+ should be encrypted using the Cloud KMS key specified by key_name.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # The wrapped key is base64-encoded, but the library expects a binary
+ # string, so decode it here.
+ import base64
+
+ wrapped_key = base64.b64decode(wrapped_key)
+
+ # Construct Deidentify Config
+ reidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "crypto_replace_ffx_fpe_config": {
+ "crypto_key": {
+ "kms_wrapped": {
+ "wrapped_key": wrapped_key,
+ "crypto_key_name": key_name,
+ }
+ },
+ "common_alphabet": alphabet,
+ "surrogate_info_type": {"name": surrogate_type},
+ }
+ }
+ }
+ ]
+ }
+ }
+
+ inspect_config = {
+ "custom_info_types": [
+ {"info_type": {"name": surrogate_type}, "surrogate_type": {}}
+ ]
+ }
+
+ # Convert string to item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.reidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ reidentify_config=reidentify_config,
+ item=item,
+ )
+
+ # Print results
+ print(response.item.value)
+
+
+# [END dlp_reidentify_fpe]
+
+
+# [START dlp_deidentify_free_text_with_fpe_using_surrogate]
+def deidentify_free_text_with_fpe_using_surrogate(
+ project,
+ input_str,
+ alphabet="NUMERIC",
+ info_type="PHONE_NUMBER",
+ surrogate_type="PHONE_TOKEN",
+ unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
+):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string using Format Preserving Encryption (FPE).
+ The encryption is performed with an unwrapped key.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ input_str: The string to deidentify (will be treated as text).
+ alphabet: The set of characters to replace sensitive ones with. For
+ more information, see https://cloud.google.com/dlp/docs/reference/
+ rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+ info_type: The name of the info type to de-identify
+ surrogate_type: The name of the surrogate custom info type to use. Can
+ be essentially any arbitrary string, as long as it doesn't appear
+ in your dataset otherwise.
+ unwrapped_key: The base64-encoded AES-256 key to use.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # The unwrapped key is base64-encoded, but the library expects a binary
+ # string, so decode it here.
+ import base64
+
+ unwrapped_key = base64.b64decode(unwrapped_key)
+
+ # Construct de-identify config
+ transformation = {
+ "info_types": [{"name": info_type}],
+ "primitive_transformation": {
+ "crypto_replace_ffx_fpe_config": {
+ "crypto_key": {
+ "unwrapped": {"key": unwrapped_key}
+ },
+ "common_alphabet": alphabet,
+ "surrogate_info_type": {"name": surrogate_type},
+ }
+ }
+ }
+
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [transformation]
+ }
+ }
+
+    # Construct the inspect config, trying to find all PII with a likelihood
+    # higher than UNLIKELY.
+ inspect_config = {
+ "info_types": [{"name": info_type}],
+ "min_likelihood": "UNLIKELY"
+ }
+
+ # Convert string to item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item=item,
+ )
+
+ # Print results
+ print(response.item.value)
+
+
+# [END dlp_deidentify_free_text_with_fpe_using_surrogate]
+
+
+# [START dlp_reidentify_free_text_with_fpe_using_surrogate]
+def reidentify_free_text_with_fpe_using_surrogate(
+ project,
+ input_str,
+ alphabet="NUMERIC",
+ surrogate_type="PHONE_TOKEN",
+ unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==",
+):
+ """Uses the Data Loss Prevention API to reidentify sensitive data in a
+ string that was encrypted by Format Preserving Encryption (FPE) with
+ surrogate type. The encryption is performed with an unwrapped key.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+        input_str: The string to reidentify (will be treated as text).
+        alphabet: The set of characters to replace sensitive ones with. For
+            more information, see https://cloud.google.com/dlp/docs/reference/
+            rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet
+        surrogate_type: The name of the surrogate custom info type used
+            during the encryption process.
+ unwrapped_key: The base64-encoded AES-256 key to use.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # The unwrapped key is base64-encoded, but the library expects a binary
+ # string, so decode it here.
+ import base64
+
+ unwrapped_key = base64.b64decode(unwrapped_key)
+
+ # Construct Deidentify Config
+ transformation = {
+ "primitive_transformation": {
+ "crypto_replace_ffx_fpe_config": {
+ "crypto_key": {
+ "unwrapped": {"key": unwrapped_key}
+ },
+ "common_alphabet": alphabet,
+ "surrogate_info_type": {"name": surrogate_type},
+ }
+ }
+ }
+
+ reidentify_config = {
+ "info_type_transformations": {
+ "transformations": [transformation]
+ }
+ }
+
+ inspect_config = {
+ "custom_info_types": [
+ {"info_type": {"name": surrogate_type}, "surrogate_type": {}}
+ ]
+ }
+
+ # Convert string to item
+ item = {"value": input_str}
+
+ # Call the API
+ response = dlp.reidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ reidentify_config=reidentify_config,
+ item=item,
+ )
+
+ # Print results
+ print(response.item.value)
+
+
+# [END dlp_reidentify_free_text_with_fpe_using_surrogate]
+
+
+# [START dlp_deidentify_date_shift]
+def deidentify_with_date_shift(
+ project,
+ input_csv_file=None,
+ output_csv_file=None,
+ date_fields=None,
+ lower_bound_days=None,
+ upper_bound_days=None,
+ context_field_id=None,
+ wrapped_key=None,
+ key_name=None,
+):
+ """Uses the Data Loss Prevention API to deidentify dates in a CSV file by
+ pseudorandomly shifting them.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ input_csv_file: The path to the CSV file to deidentify. The first row
+ of the file must specify column names, and all other rows must
+ contain valid values.
+ output_csv_file: The path to save the date-shifted CSV file.
+ date_fields: The list of (date) fields in the CSV file to date shift.
+ Example: ['birth_date', 'register_date']
+ lower_bound_days: The maximum number of days to shift a date backward
+ upper_bound_days: The maximum number of days to shift a date forward
+        context_field_id: (Optional) The column to determine the date shift
+            amount based on. If this is not specified, a random shift amount
+            will be used for every row. If this is specified, then
+            'wrapped_key' and 'key_name' must also be set. Example:
+            context_field_id = 'user_id'
+ key_name: (Optional) The name of the Cloud KMS key used to encrypt
+ ('wrap') the AES-256 key. Example:
+ key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/
+ keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'
+ wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use.
+ This key should be encrypted using the Cloud KMS key specified by
+ key_name.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Convert date field list to Protobuf type
+ def map_fields(field):
+ return {"name": field}
+
+ if date_fields:
+ date_fields = map(map_fields, date_fields)
+ else:
+ date_fields = []
+
+ # Read and parse the CSV file
+ import csv
+ from datetime import datetime
+
+ f = []
+ with open(input_csv_file, "r") as csvfile:
+ reader = csv.reader(csvfile)
+ for row in reader:
+ f.append(row)
+
+ # Helper function for converting CSV rows to Protobuf types
+ def map_headers(header):
+ return {"name": header}
+
+ def map_data(value):
+ try:
+ date = datetime.strptime(value, "%m/%d/%Y")
+ return {
+ "date_value": {
+ "year": date.year,
+ "month": date.month,
+ "day": date.day,
+ }
+ }
+ except ValueError:
+ return {"string_value": value}
+
+ def map_rows(row):
+ return {"values": map(map_data, row)}
+
+ # Using the helper functions, convert CSV rows to protobuf-compatible
+ # dictionaries.
+ csv_headers = map(map_headers, f[0])
+ csv_rows = map(map_rows, f[1:])
+
+ # Construct the table dict
+ table_item = {"table": {"headers": csv_headers, "rows": csv_rows}}
+
+ # Construct date shift config
+ date_shift_config = {
+ "lower_bound_days": lower_bound_days,
+ "upper_bound_days": upper_bound_days,
+ }
+
+ # If using a Cloud KMS key, add it to the date_shift_config.
+ # The wrapped key is base64-encoded, but the library expects a binary
+ # string, so decode it here.
+ if context_field_id and key_name and wrapped_key:
+ import base64
+
+ date_shift_config["context"] = {"name": context_field_id}
+ date_shift_config["crypto_key"] = {
+ "kms_wrapped": {
+ "wrapped_key": base64.b64decode(wrapped_key),
+ "crypto_key_name": key_name,
+ }
+ }
+ elif context_field_id or key_name or wrapped_key:
+        raise ValueError(
+            "You must set either ALL or NONE of "
+            "[context_field_id, key_name, wrapped_key]!"
+        )
+
+ # Construct Deidentify Config
+ deidentify_config = {
+ "record_transformations": {
+ "field_transformations": [
+ {
+ "fields": date_fields,
+ "primitive_transformation": {
+ "date_shift_config": date_shift_config
+ },
+ }
+ ]
+ }
+ }
+
+ # Write to CSV helper methods
+ def write_header(header):
+ return header.name
+
+ def write_data(data):
+ return data.string_value or "%s/%s/%s" % (
+ data.date_value.month,
+ data.date_value.day,
+ data.date_value.year,
+ )
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent, deidentify_config=deidentify_config, item=table_item
+ )
+
+ # Write results to CSV file
+ with open(output_csv_file, "w") as csvfile:
+ write_file = csv.writer(csvfile, delimiter=",")
+ write_file.writerow(map(write_header, response.item.table.headers))
+ for row in response.item.table.rows:
+ write_file.writerow(map(write_data, row.values))
+ # Print status
+ print("Successfully saved date-shift output to {}".format(output_csv_file))
+
+
+# [END dlp_deidentify_date_shift]
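+
+# Input sketch: the parser above expects dates in %m/%d/%Y form. A minimal
+# input file (the rows below are illustrative) could look like:
+#
+#     name,birth_date,register_date
+#     Alice,01/21/1970,07/04/1996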
+
+
+# [START dlp_deidentify_replace_infotype]
+def deidentify_with_replace_infotype(project, item, info_types):
+ """Uses the Data Loss Prevention API to deidentify sensitive data in a
+ string by replacing it with the info type.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ item: The string to deidentify (will be treated as text).
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Construct inspect configuration dictionary
+ inspect_config = {
+ "info_types": [{"name": info_type} for info_type in info_types]
+ }
+
+ # Construct deidentify configuration dictionary
+ deidentify_config = {
+ "info_type_transformations": {
+ "transformations": [
+ {
+ "primitive_transformation": {
+ "replace_with_info_type_config": {}
+ }
+ }
+ ]
+ }
+ }
+
+ # Call the API
+ response = dlp.deidentify_content(
+ parent,
+ inspect_config=inspect_config,
+ deidentify_config=deidentify_config,
+ item={"value": item},
+ )
+
+ # Print out the results.
+ print(response.item.value)
+
+
+# [END dlp_deidentify_replace_infotype]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select how to submit content to the API."
+ )
+ subparsers.required = True
+
+ mask_parser = subparsers.add_parser(
+ "deid_mask",
+ help="Deidentify sensitive data in a string by masking it with a "
+ "character.",
+ )
+ mask_parser.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ mask_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ mask_parser.add_argument("item", help="The string to deidentify.")
+ mask_parser.add_argument(
+ "-n",
+ "--number_to_mask",
+ type=int,
+ default=0,
+ help="The maximum number of sensitive characters to mask in a match. "
+ "If omitted the request or set to 0, the API will mask any mathcing "
+ "characters.",
+ )
+ mask_parser.add_argument(
+ "-m",
+ "--masking_character",
+ help="The character to mask matching sensitive data with.",
+ )
+
+ replace_parser = subparsers.add_parser(
+ "deid_replace",
+ help="Deidentify sensitive data in a string by replacing it with "
+ "another string.",
+ )
+ replace_parser.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ replace_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ replace_parser.add_argument("item", help="The string to deidentify.")
+ replace_parser.add_argument("replacement_str", help="The string to "
+ "replace all matched values with.")
+
+ fpe_parser = subparsers.add_parser(
+ "deid_fpe",
+ help="Deidentify sensitive data in a string using Format Preserving "
+ "Encryption (FPE).",
+ )
+ fpe_parser.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ fpe_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ fpe_parser.add_argument(
+ "item",
+ help="The string to deidentify. "
+ "Example: string = 'My SSN is 372819127'",
+ )
+ fpe_parser.add_argument(
+ "key_name",
+ help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+ "AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ fpe_parser.add_argument(
+ "wrapped_key",
+ help="The encrypted ('wrapped') AES-256 key to use. This key should "
+ "be encrypted using the Cloud KMS key specified by key_name.",
+ )
+ fpe_parser.add_argument(
+ "-a",
+ "--alphabet",
+ default="ALPHA_NUMERIC",
+ help="The set of characters to replace sensitive ones with. Commonly "
+ 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+ '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+ '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+ )
+ fpe_parser.add_argument(
+ "-s",
+ "--surrogate_type",
+ help="The name of the surrogate custom info type to use. Only "
+ "necessary if you want to reverse the deidentification process. Can "
+ "be essentially any arbitrary string, as long as it doesn't appear "
+ "in your dataset otherwise.",
+ )
+
+ reid_parser = subparsers.add_parser(
+ "reid_fpe",
+ help="Reidentify sensitive data in a string using Format Preserving "
+ "Encryption (FPE).",
+ )
+ reid_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ reid_parser.add_argument(
+ "item",
+ help="The string to deidentify. "
+ "Example: string = 'My SSN is 372819127'",
+ )
+ reid_parser.add_argument(
+ "surrogate_type",
+ help="The name of the surrogate custom info type to use. Only "
+ "necessary if you want to reverse the deidentification process. Can "
+ "be essentially any arbitrary string, as long as it doesn't appear "
+ "in your dataset otherwise.",
+ )
+ reid_parser.add_argument(
+ "key_name",
+ help="The name of the Cloud KMS key used to encrypt ('wrap') the "
+ "AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ reid_parser.add_argument(
+ "wrapped_key",
+ help="The encrypted ('wrapped') AES-256 key to use. This key should "
+ "be encrypted using the Cloud KMS key specified by key_name.",
+ )
+ reid_parser.add_argument(
+ "-a",
+ "--alphabet",
+ default="ALPHA_NUMERIC",
+ help="The set of characters to replace sensitive ones with. Commonly "
+ 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", '
+ '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", '
+ '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"',
+ )
+
+ date_shift_parser = subparsers.add_parser(
+ "deid_date_shift",
+ help="Deidentify dates in a CSV file by pseudorandomly shifting them.",
+ )
+ date_shift_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ date_shift_parser.add_argument(
+ "input_csv_file",
+ help="The path to the CSV file to deidentify. The first row of the "
+ "file must specify column names, and all other rows must contain "
+ "valid values.",
+ )
+ date_shift_parser.add_argument(
+ "output_csv_file", help="The path to save the date-shifted CSV file."
+ )
+ date_shift_parser.add_argument(
+ "lower_bound_days",
+ type=int,
+ help="The maximum number of days to shift a date backward",
+ )
+ date_shift_parser.add_argument(
+ "upper_bound_days",
+ type=int,
+ help="The maximum number of days to shift a date forward",
+ )
+ date_shift_parser.add_argument(
+ "date_fields",
+ nargs="+",
+ help="The list of date fields in the CSV file to date shift. Example: "
+ "['birth_date', 'register_date']",
+ )
+ date_shift_parser.add_argument(
+ "--context_field_id",
+ help="(Optional) The column to determine date shift amount based on. "
+ "If this is not specified, a random shift amount will be used for "
+ "every row. If this is specified, then 'wrappedKey' and 'keyName' "
+ "must also be set.",
+ )
+ date_shift_parser.add_argument(
+ "--key_name",
+ help="(Optional) The name of the Cloud KMS key used to encrypt "
+ "('wrap') the AES-256 key. Example: "
+ "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/"
+ "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'",
+ )
+ date_shift_parser.add_argument(
+ "--wrapped_key",
+ help="(Optional) The encrypted ('wrapped') AES-256 key to use. This "
+ "key should be encrypted using the Cloud KMS key specified by"
+ "key_name.",
+ )
+
+ replace_with_infotype_parser = subparsers.add_parser(
+ "replace_with_infotype",
+ help="Deidentify sensitive data in a string by replacing it with the "
+ "info type of the data."
+ )
+ replace_with_infotype_parser.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ replace_with_infotype_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ replace_with_infotype_parser.add_argument(
+ "item",
+ help="The string to deidentify."
+ "Example: 'My credit card is 4242 4242 4242 4242'",
+ )
+
+ args = parser.parse_args()
+
+ if args.content == "deid_mask":
+ deidentify_with_mask(
+ args.project,
+ args.item,
+ args.info_types,
+ masking_character=args.masking_character,
+ number_to_mask=args.number_to_mask,
+ )
+ elif args.content == "deid_replace":
+ deidentify_with_replace(
+ args.project,
+ args.item,
+ args.info_types,
+ replacement_str=args.replacement_str,
+ )
+ elif args.content == "deid_fpe":
+ deidentify_with_fpe(
+ args.project,
+ args.item,
+ args.info_types,
+ alphabet=args.alphabet,
+ wrapped_key=args.wrapped_key,
+ key_name=args.key_name,
+ surrogate_type=args.surrogate_type,
+ )
+ elif args.content == "reid_fpe":
+ reidentify_with_fpe(
+ args.project,
+ args.item,
+ surrogate_type=args.surrogate_type,
+ wrapped_key=args.wrapped_key,
+ key_name=args.key_name,
+ alphabet=args.alphabet,
+ )
+ elif args.content == "deid_date_shift":
+ deidentify_with_date_shift(
+ args.project,
+ input_csv_file=args.input_csv_file,
+ output_csv_file=args.output_csv_file,
+ lower_bound_days=args.lower_bound_days,
+ upper_bound_days=args.upper_bound_days,
+ date_fields=args.date_fields,
+ context_field_id=args.context_field_id,
+ wrapped_key=args.wrapped_key,
+ key_name=args.key_name,
+ )
+ elif args.content == "replace_with_infotype":
+ deidentify_with_replace_infotype(
+ args.project,
+ item=args.item,
+ info_types=args.info_types,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/deid_test.py b/packages/google-cloud-dlp/samples/snippets/deid_test.py
new file mode 100644
index 000000000000..7d886c51e362
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/deid_test.py
@@ -0,0 +1,257 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import tempfile
+
+import pytest
+
+import deid
+
+HARMFUL_STRING = "My SSN is 372819127"
+HARMLESS_STRING = "My favorite color is blue"
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA=="
+WRAPPED_KEY = (
+ "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy"
+ "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL"
+ "rotx7Chxz/4z7SIpXFOBY61z0/U="
+)
+KEY_NAME = (
+ "projects/python-docs-samples-tests/locations/global/keyRings/"
+ "dlp-test/cryptoKeys/dlp-test"
+)
+SURROGATE_TYPE = "SSN_TOKEN"
+CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv")
+DATE_SHIFTED_AMOUNT = 30
+DATE_FIELDS = ["birth_date", "register_date"]
+CSV_CONTEXT_FIELD = "name"
+
+
+@pytest.fixture(scope="module")
+def tempdir():
+ tempdir = tempfile.mkdtemp()
+ yield tempdir
+ shutil.rmtree(tempdir)
+
+
+def test_deidentify_with_mask(capsys):
+ deid.deidentify_with_mask(
+ GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"]
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is *********" in out
+
+
+def test_deidentify_with_mask_ignore_insensitive_data(capsys):
+ deid.deidentify_with_mask(
+ GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"]
+ )
+
+ out, _ = capsys.readouterr()
+ assert HARMLESS_STRING in out
+
+
+def test_deidentify_with_mask_masking_character_specified(capsys):
+ deid.deidentify_with_mask(
+ GCLOUD_PROJECT,
+ HARMFUL_STRING,
+ ["US_SOCIAL_SECURITY_NUMBER"],
+ masking_character="#",
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is #########" in out
+
+
+def test_deidentify_with_mask_masking_number_specified(capsys):
+ deid.deidentify_with_mask(
+ GCLOUD_PROJECT,
+ HARMFUL_STRING,
+ ["US_SOCIAL_SECURITY_NUMBER"],
+ number_to_mask=7,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is *******27" in out
+
+
+def test_deidentify_with_redact(capsys):
+ deid.deidentify_with_redact(
+ GCLOUD_PROJECT, HARMFUL_STRING + "!", ["US_SOCIAL_SECURITY_NUMBER"]
+ )
+ out, _ = capsys.readouterr()
+ assert "My SSN is !" in out
+
+
+def test_deidentify_with_replace(capsys):
+ deid.deidentify_with_replace(
+ GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"],
+ replacement_str="REPLACEMENT_STR"
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is REPLACEMENT_STR" in out
+
+
+def test_deidentify_with_fpe(capsys):
+ deid.deidentify_with_fpe(
+ GCLOUD_PROJECT,
+ HARMFUL_STRING,
+ ["US_SOCIAL_SECURITY_NUMBER"],
+ alphabet="NUMERIC",
+ wrapped_key=WRAPPED_KEY,
+ key_name=KEY_NAME,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is" in out
+ assert "372819127" not in out
+
+
+def test_deidentify_with_fpe_uses_surrogate_info_types(capsys):
+ deid.deidentify_with_fpe(
+ GCLOUD_PROJECT,
+ HARMFUL_STRING,
+ ["US_SOCIAL_SECURITY_NUMBER"],
+ alphabet="NUMERIC",
+ wrapped_key=WRAPPED_KEY,
+ key_name=KEY_NAME,
+ surrogate_type=SURROGATE_TYPE,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "My SSN is SSN_TOKEN" in out
+ assert "372819127" not in out
+
+
+def test_deidentify_with_fpe_ignores_insensitive_data(capsys):
+ deid.deidentify_with_fpe(
+ GCLOUD_PROJECT,
+ HARMLESS_STRING,
+ ["US_SOCIAL_SECURITY_NUMBER"],
+ alphabet="NUMERIC",
+ wrapped_key=WRAPPED_KEY,
+ key_name=KEY_NAME,
+ )
+
+ out, _ = capsys.readouterr()
+ assert HARMLESS_STRING in out
+
+
+def test_deidentify_with_date_shift(tempdir, capsys):
+ output_filepath = os.path.join(tempdir, "dates-shifted.csv")
+
+ deid.deidentify_with_date_shift(
+ GCLOUD_PROJECT,
+ input_csv_file=CSV_FILE,
+ output_csv_file=output_filepath,
+ lower_bound_days=DATE_SHIFTED_AMOUNT,
+ upper_bound_days=DATE_SHIFTED_AMOUNT,
+ date_fields=DATE_FIELDS,
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "Successful" in out
+
+
+def test_deidentify_with_date_shift_using_context_field(tempdir, capsys):
+ output_filepath = os.path.join(tempdir, "dates-shifted.csv")
+
+ deid.deidentify_with_date_shift(
+ GCLOUD_PROJECT,
+ input_csv_file=CSV_FILE,
+ output_csv_file=output_filepath,
+ lower_bound_days=DATE_SHIFTED_AMOUNT,
+ upper_bound_days=DATE_SHIFTED_AMOUNT,
+ date_fields=DATE_FIELDS,
+ context_field_id=CSV_CONTEXT_FIELD,
+ wrapped_key=WRAPPED_KEY,
+ key_name=KEY_NAME,
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "Successful" in out
+
+
+def test_reidentify_with_fpe(capsys):
+ labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681"
+
+ deid.reidentify_with_fpe(
+ GCLOUD_PROJECT,
+ labeled_fpe_string,
+ surrogate_type=SURROGATE_TYPE,
+ wrapped_key=WRAPPED_KEY,
+ key_name=KEY_NAME,
+ alphabet="NUMERIC",
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "731997681" not in out
+
+
+def test_deidentify_free_text_with_fpe_using_surrogate(capsys):
+ labeled_fpe_string = "My phone number is 4359916732"
+
+ deid.deidentify_free_text_with_fpe_using_surrogate(
+ GCLOUD_PROJECT,
+ labeled_fpe_string,
+ info_type="PHONE_NUMBER",
+ surrogate_type="PHONE_TOKEN",
+ unwrapped_key=UNWRAPPED_KEY,
+ alphabet="NUMERIC",
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "PHONE_TOKEN" in out
+ assert "My phone number is" in out
+ assert "4359916732" not in out
+
+
+def test_reidentify_free_text_with_fpe_using_surrogate(capsys):
+ labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398"
+
+ deid.reidentify_free_text_with_fpe_using_surrogate(
+ GCLOUD_PROJECT,
+ labeled_fpe_string,
+ surrogate_type="PHONE_TOKEN",
+ unwrapped_key=UNWRAPPED_KEY,
+ alphabet="NUMERIC",
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert "PHONE_TOKEN" not in out
+ assert "9617256398" not in out
+ assert "My phone number is" in out
+
+
+def test_deidentify_with_replace_infotype(capsys):
+ url_to_redact = "https://cloud.google.com"
+ deid.deidentify_with_replace_infotype(
+ GCLOUD_PROJECT,
+ "My favorite site is " + url_to_redact,
+ ["URL"],
+ )
+
+ out, _ = capsys.readouterr()
+
+ assert url_to_redact not in out
+ assert "My favorite site is [URL]" in out
diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content.py b/packages/google-cloud-dlp/samples/snippets/inspect_content.py
new file mode 100644
index 000000000000..fb2573e4bc8a
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/inspect_content.py
@@ -0,0 +1,1424 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevention API to inspect a string, a
+local file or a file on Google Cloud Storage."""
+
+from __future__ import print_function
+
+import argparse
+import json
+import os
+
+
+# [START dlp_inspect_string_basic]
+def inspect_string_basic(
+ project,
+ content_string,
+ info_types=["PHONE_NUMBER"],
+):
+ """Uses the Data Loss Prevention API to analyze strings for protected data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Construct the configuration dictionary.
+ inspect_config = {
+ "info_types": info_types,
+ "include_quote": True,
+ }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ print("Quote: {}".format(finding.quote))
+ print("Info type: {}".format(finding.info_type.name))
+ print("Likelihood: {}".format(finding.likelihood))
+ else:
+ print("No findings.")
+
+
+# [END dlp_inspect_string_basic]
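+
+
+def _inspect_string_basic_example():
+    # Illustrative usage sketch, not part of the published snippet region
+    # above. Assumes GOOGLE_CLOUD_PROJECT is set and application default
+    # credentials are available; the sample string is hypothetical.
+    inspect_string_basic(
+        os.environ["GOOGLE_CLOUD_PROJECT"],
+        "My phone number is 234-555-6789",
+    )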
+
+
+# [START dlp_inspect_string]
+def inspect_string(
+ project,
+ content_string,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ min_likelihood=None,
+ max_findings=None,
+ include_quote=True,
+):
+ """Uses the Data Loss Prevention API to analyze strings for protected data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ content_string: The string to inspect.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ include_quote: Boolean for whether to display a quote of the detected
+ information in the results.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "include_quote": include_quote,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct the `item`.
+ item = {"value": content_string}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ if finding.quote:
+ print("Quote: {}".format(finding.quote))
+ except AttributeError:
+ pass
+ print("Info type: {}".format(finding.info_type.name))
+ print("Likelihood: {}".format(finding.likelihood))
+ else:
+ print("No findings.")
+
+
+# [END dlp_inspect_string]
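+
+
+def _custom_info_types_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # shows the shape of the custom_info_types list that inspect_string
+    # builds from comma-delimited dictionaries and regex patterns. Pure
+    # data transform, no API calls; the values are hypothetical.
+    custom_dictionaries = ["Gary Smith,Jane Doe"]
+    custom_regexes = ["\\w+@\\w+\\.com"]
+    dictionaries = [
+        {
+            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+        }
+        for i, custom_dict in enumerate(custom_dictionaries)
+    ]
+    regexes = [
+        {
+            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+            "regex": {"pattern": custom_regex},
+        }
+        for i, custom_regex in enumerate(custom_regexes)
+    ]
+    print(dictionaries + regexes)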
+
+# [START dlp_inspect_table]
+
+
+def inspect_table(
+ project,
+ data,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ min_likelihood=None,
+ max_findings=None,
+ include_quote=True,
+):
+ """Uses the Data Loss Prevention API to analyze strings for protected data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+        data: Dictionary representing table data (see Example below).
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ include_quote: Boolean for whether to display a quote of the detected
+ information in the results.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ Example:
+ data = {
+ "header":[
+ "email",
+ "phone number"
+ ],
+ "rows":[
+ [
+ "robertfrost@xyz.com",
+ "4232342345"
+ ],
+ [
+ "johndoe@pqr.com",
+ "4253458383"
+ ]
+ ]
+ }
+
+ >> $ python inspect_content.py table \
+ '{"header": ["email", "phone number"],
+ "rows": [["robertfrost@xyz.com", "4232342345"],
+ ["johndoe@pqr.com", "4253458383"]]}'
+ >> Quote: robertfrost@xyz.com
+ Info type: EMAIL_ADDRESS
+ Likelihood: 4
+ Quote: johndoe@pqr.com
+ Info type: EMAIL_ADDRESS
+ Likelihood: 4
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "include_quote": include_quote,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct the `table`. For more details on the table schema, please see
+ # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table
+ headers = [{"name": val} for val in data["header"]]
+ rows = []
+ for row in data["rows"]:
+ rows.append(
+ {"values": [{"string_value": cell_val} for cell_val in row]}
+ )
+
+ table = {}
+ table["headers"] = headers
+ table["rows"] = rows
+ item = {"table": table}
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ if finding.quote:
+ print("Quote: {}".format(finding.quote))
+ except AttributeError:
+ pass
+ print("Info type: {}".format(finding.info_type.name))
+ print("Likelihood: {}".format(finding.likelihood))
+ else:
+ print("No findings.")
+
+
+# [END dlp_inspect_table]
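+
+
+def _table_item_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # converts the {"header": [...], "rows": [...]} shape accepted by
+    # inspect_table into the ContentItem table structure sent to the API.
+    # Pure data transform, no API calls; the values are hypothetical.
+    data = {
+        "header": ["email", "phone number"],
+        "rows": [["robertfrost@xyz.com", "4232342345"]],
+    }
+    headers = [{"name": val} for val in data["header"]]
+    rows = [
+        {"values": [{"string_value": cell_val} for cell_val in row]}
+        for row in data["rows"]
+    ]
+    print({"table": {"headers": headers, "rows": rows}})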
+
+# [START dlp_inspect_file]
+
+
+def inspect_file(
+ project,
+ filename,
+ info_types,
+ min_likelihood=None,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ max_findings=None,
+ include_quote=True,
+ mime_type=None,
+):
+ """Uses the Data Loss Prevention API to analyze a file for protected data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ filename: The path to the file to inspect.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ include_quote: Boolean for whether to display a quote of the detected
+ information in the results.
+ mime_type: The MIME type of the file. If not specified, the type is
+ inferred via the Python standard library's mimetypes module.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ import mimetypes
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ if not info_types:
+ info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # If mime_type is not specified, guess it from the filename.
+ if mime_type is None:
+ mime_guess = mimetypes.MimeTypes().guess_type(filename)
+ mime_type = mime_guess[0]
+
+ # Select the content type index from the list of supported types.
+ supported_content_types = {
+ None: 0, # "Unspecified"
+ "image/jpeg": 1,
+ "image/bmp": 2,
+ "image/png": 3,
+ "image/svg": 4,
+ "text/plain": 5,
+ }
+ content_type_index = supported_content_types.get(mime_type, 0)
+
+ # Construct the item, containing the file's byte data.
+ with open(filename, mode="rb") as f:
+ item = {"byte_item": {"type": content_type_index, "data": f.read()}}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ print("Quote: {}".format(finding.quote))
+ except AttributeError:
+ pass
+ print("Info type: {}".format(finding.info_type.name))
+ print("Likelihood: {}".format(finding.likelihood))
+ else:
+ print("No findings.")
+
+
+# [END dlp_inspect_file]
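+
+
+def _content_type_index_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # shows how inspect_file maps a guessed MIME type onto the byte item
+    # type index, falling back to 0 ("unspecified") for unknown types.
+    # The filename is hypothetical.
+    import mimetypes
+
+    supported_content_types = {
+        None: 0,
+        "image/jpeg": 1,
+        "image/bmp": 2,
+        "image/png": 3,
+        "image/svg+xml": 4,
+        "text/plain": 5,
+    }
+    mime_type = mimetypes.MimeTypes().guess_type("photo.png")[0]
+    print(supported_content_types.get(mime_type, 0))  # prints 3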
+
+
+# [START dlp_inspect_gcs]
+def inspect_gcs_file(
+ project,
+ bucket,
+ filename,
+ topic_id,
+ subscription_id,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ min_likelihood=None,
+ max_findings=None,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to analyze a file on GCS.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ bucket: The name of the GCS bucket containing the file, as a string.
+ filename: The name of the file in the bucket, including the path, as a
+ string; e.g. 'images/myfile.png'.
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will
+ broadcast job completion. The topic must already exist.
+ subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+ while waiting for job completion. The subscription must already
+ exist and be subscribed to the topic.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ timeout: The number of seconds to wait for a response from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # This sample also uses threading.Event() to wait for the job to finish.
+ import threading
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ if not info_types:
+ info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct a storage_config containing the file's URL.
+ url = "gs://{}/{}".format(bucket, filename)
+ storage_config = {"cloud_storage_options": {"file_set": {"url": url}}}
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Construct the inspect_job, which defines the entire inspect content task.
+ inspect_job = {
+ "inspect_config": inspect_config,
+ "storage_config": storage_config,
+ "actions": actions,
+ }
+
+ operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
+ print("Inspection operation started: {}".format(operation.name))
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+
+ # Set up a callback to acknowledge a message. This closes around an event
+ # so that it can signal that it is done and the main thread can continue.
+ job_done = threading.Event()
+
+ def callback(message):
+ try:
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ if job.inspect_details.result.info_type_stats:
+ for finding in job.inspect_details.result.info_type_stats:
+ print(
+ "Info type: {}; Count: {}".format(
+ finding.info_type.name, finding.count
+ )
+ )
+ else:
+ print("No findings.")
+
+ # Signal to the main thread that we can exit.
+ job_done.set()
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+ except Exception as e:
+ # Because this is executing in a thread, an exception won't be
+ # noted unless we print it manually.
+ print(e)
+ raise
+
+ subscriber.subscribe(subscription_path, callback=callback)
+ finished = job_done.wait(timeout=timeout)
+ if not finished:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+
+
+# [END dlp_inspect_gcs]
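+
+
+def _job_done_wait_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # the Pub/Sub handshake in the storage samples reduces to a
+    # threading.Event that the subscriber callback sets and the main
+    # thread waits on with a timeout. No API calls here.
+    import threading
+
+    job_done = threading.Event()
+
+    def callback():
+        # In the real samples this runs on a subscriber thread once the
+        # message's DlpJobName attribute matches the started job.
+        job_done.set()
+
+    callback()
+    print(job_done.wait(timeout=1))  # prints True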
+
+
+# [START dlp_inspect_datastore]
+def inspect_datastore(
+ project,
+ datastore_project,
+ kind,
+ topic_id,
+ subscription_id,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ namespace_id=None,
+ min_likelihood=None,
+ max_findings=None,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to analyze Datastore data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ datastore_project: The Google Cloud project id of the target Datastore.
+ kind: The kind of the Datastore entity to inspect, e.g. 'Person'.
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will
+ broadcast job completion. The topic must already exist.
+ subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+ while waiting for job completion. The subscription must already
+ exist and be subscribed to the topic.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ namespace_id: The namespace of the Datastore document, if applicable.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ timeout: The number of seconds to wait for a response from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # This sample also uses threading.Event() to wait for the job to finish.
+ import threading
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ if not info_types:
+ info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct a storage_config containing the target Datastore info.
+ storage_config = {
+ "datastore_options": {
+ "partition_id": {
+ "project_id": datastore_project,
+ "namespace_id": namespace_id,
+ },
+ "kind": {"name": kind},
+ }
+ }
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Construct the inspect_job, which defines the entire inspect content task.
+ inspect_job = {
+ "inspect_config": inspect_config,
+ "storage_config": storage_config,
+ "actions": actions,
+ }
+
+ operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
+ print("Inspection operation started: {}".format(operation.name))
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+
+ # Set up a callback to acknowledge a message. This closes around an event
+ # so that it can signal that it is done and the main thread can continue.
+ job_done = threading.Event()
+
+ def callback(message):
+ try:
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ if job.inspect_details.result.info_type_stats:
+ for finding in job.inspect_details.result.info_type_stats:
+ print(
+ "Info type: {}; Count: {}".format(
+ finding.info_type.name, finding.count
+ )
+ )
+ else:
+ print("No findings.")
+
+ # Signal to the main thread that we can exit.
+ job_done.set()
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+ except Exception as e:
+ # Because this is executing in a thread, an exception won't be
+ # noted unless we print it manually.
+ print(e)
+ raise
+
+ # Register the callback and wait on the event.
+ subscriber.subscribe(subscription_path, callback=callback)
+
+ finished = job_done.wait(timeout=timeout)
+ if not finished:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+
+
+# [END dlp_inspect_datastore]
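+
+
+def _datastore_storage_config_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # the Datastore storage_config scopes the scan by project, optional
+    # namespace, and kind. Pure data, no API calls; the names are
+    # hypothetical.
+    storage_config = {
+        "datastore_options": {
+            "partition_id": {
+                "project_id": "my-project",
+                "namespace_id": None,  # None selects the default namespace
+            },
+            "kind": {"name": "Person"},
+        }
+    }
+    print(storage_config)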
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(
+ project,
+ bigquery_project,
+ dataset_id,
+ table_id,
+ topic_id,
+ subscription_id,
+ info_types,
+ custom_dictionaries=None,
+ custom_regexes=None,
+ min_likelihood=None,
+ max_findings=None,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to analyze BigQuery data.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ bigquery_project: The Google Cloud project id of the target table.
+ dataset_id: The id of the target BigQuery dataset.
+ table_id: The id of the target BigQuery table.
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will
+ broadcast job completion. The topic must already exist.
+ subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+ while waiting for job completion. The subscription must already
+ exist and be subscribed to the topic.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ timeout: The number of seconds to wait for a response from the API.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # This sample also uses threading.Event() to wait for the job to finish.
+ import threading
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ if not info_types:
+ info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare custom_info_types by parsing the dictionary word lists and
+ # regex patterns.
+ if custom_dictionaries is None:
+ custom_dictionaries = []
+ dictionaries = [
+ {
+ "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+ "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+ }
+ for i, custom_dict in enumerate(custom_dictionaries)
+ ]
+ if custom_regexes is None:
+ custom_regexes = []
+ regexes = [
+ {
+ "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+ "regex": {"pattern": custom_regex},
+ }
+ for i, custom_regex in enumerate(custom_regexes)
+ ]
+ custom_info_types = dictionaries + regexes
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "custom_info_types": custom_info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+    # Construct a storage_config containing the target BigQuery info.
+ storage_config = {
+ "big_query_options": {
+ "table_reference": {
+ "project_id": bigquery_project,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+ }
+ }
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Construct the inspect_job, which defines the entire inspect content task.
+ inspect_job = {
+ "inspect_config": inspect_config,
+ "storage_config": storage_config,
+ "actions": actions,
+ }
+
+ operation = dlp.create_dlp_job(parent, inspect_job=inspect_job)
+ print("Inspection operation started: {}".format(operation.name))
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+
+ # Set up a callback to acknowledge a message. This closes around an event
+ # so that it can signal that it is done and the main thread can continue.
+ job_done = threading.Event()
+
+ def callback(message):
+ try:
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ if job.inspect_details.result.info_type_stats:
+ for finding in job.inspect_details.result.info_type_stats:
+ print(
+ "Info type: {}; Count: {}".format(
+ finding.info_type.name, finding.count
+ )
+ )
+ else:
+ print("No findings.")
+
+ # Signal to the main thread that we can exit.
+ job_done.set()
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+ except Exception as e:
+ # Because this is executing in a thread, an exception won't be
+ # noted unless we print it manually.
+ print(e)
+ raise
+
+ # Register the callback and wait on the event.
+ subscriber.subscribe(subscription_path, callback=callback)
+ finished = job_done.wait(timeout=timeout)
+ if not finished:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+
+
+# [END dlp_inspect_bigquery]
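+
+
+def _inspect_job_example():
+    # Illustrative sketch, not part of the published snippet region above:
+    # the storage-backed samples all build the same three-part inspect_job
+    # dictionary; only storage_config varies by backend. Pure data, no API
+    # calls; the project, dataset, table, and topic names are hypothetical.
+    inspect_job = {
+        "inspect_config": {"info_types": [{"name": "EMAIL_ADDRESS"}]},
+        "storage_config": {
+            "big_query_options": {
+                "table_reference": {
+                    "project_id": "my-project",
+                    "dataset_id": "my_dataset",
+                    "table_id": "my_table",
+                }
+            }
+        },
+        "actions": [
+            {"pub_sub": {"topic": "projects/my-project/topics/dlp-topic"}}
+        ],
+    }
+    print(inspect_job)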
+
+
+if __name__ == "__main__":
+ default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select how to submit content to the API."
+ )
+ subparsers.required = True
+
+ parser_string = subparsers.add_parser("string", help="Inspect a string.")
+ parser_string.add_argument("item", help="The string to inspect.")
+ parser_string.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_string.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_string.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_string.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_string.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_string.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_string.add_argument(
+ "--include_quote",
+ type=bool,
+ help="A boolean for whether to display a quote of the detected "
+ "information in the results.",
+ default=True,
+ )
+
+ parser_table = subparsers.add_parser("table", help="Inspect a table.")
+ parser_table.add_argument(
+ "data", help="Json string representing a table.", type=json.loads
+ )
+ parser_table.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_table.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_table.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_table.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_table.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_table.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_table.add_argument(
+ "--include_quote",
+ type=bool,
+ help="A boolean for whether to display a quote of the detected "
+ "information in the results.",
+ default=True,
+ )
+
+ parser_file = subparsers.add_parser("file", help="Inspect a local file.")
+ parser_file.add_argument(
+ "filename", help="The path to the file to inspect."
+ )
+ parser_file.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_file.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_file.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_file.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_file.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_file.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_file.add_argument(
+ "--include_quote",
+ type=bool,
+ help="A boolean for whether to display a quote of the detected "
+ "information in the results.",
+ default=True,
+ )
+ parser_file.add_argument(
+ "--mime_type",
+ help="The MIME type of the file. If not specified, the type is "
+ "inferred via the Python standard library's mimetypes module.",
+ )
+
+ parser_gcs = subparsers.add_parser(
+ "gcs", help="Inspect files on Google Cloud Storage."
+ )
+ parser_gcs.add_argument(
+ "bucket", help="The name of the GCS bucket containing the file."
+ )
+ parser_gcs.add_argument(
+ "filename",
+ help="The name of the file in the bucket, including the path, e.g. "
+ '"images/myfile.png". Wildcards are permitted.',
+ )
+ parser_gcs.add_argument(
+ "topic_id",
+ help="The id of the Cloud Pub/Sub topic to use to report that the job "
+ 'is complete, e.g. "dlp-sample-topic".',
+ )
+ parser_gcs.add_argument(
+ "subscription_id",
+ help="The id of the Cloud Pub/Sub subscription to monitor for job "
+ 'completion, e.g. "dlp-sample-subscription". The subscription must '
+ "already be subscribed to the topic. See the test files or the Cloud "
+ "Pub/Sub sample files for examples on how to create the subscription.",
+ )
+ parser_gcs.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_gcs.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_gcs.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_gcs.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_gcs.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_gcs.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_gcs.add_argument(
+ "--timeout",
+ type=int,
+ help="The maximum number of seconds to wait for a response from the "
+ "API. The default is 300 seconds.",
+ default=300,
+ )
+
+ parser_datastore = subparsers.add_parser(
+ "datastore", help="Inspect files on Google Datastore."
+ )
+ parser_datastore.add_argument(
+ "datastore_project",
+ help="The Google Cloud project id of the target Datastore.",
+ )
+ parser_datastore.add_argument(
+ "kind",
+ help='The kind of the Datastore entity to inspect, e.g. "Person".',
+ )
+ parser_datastore.add_argument(
+ "topic_id",
+ help="The id of the Cloud Pub/Sub topic to use to report that the job "
+ 'is complete, e.g. "dlp-sample-topic".',
+ )
+ parser_datastore.add_argument(
+ "subscription_id",
+ help="The id of the Cloud Pub/Sub subscription to monitor for job "
+ 'completion, e.g. "dlp-sample-subscription". The subscription must '
+ "already be subscribed to the topic. See the test files or the Cloud "
+ "Pub/Sub sample files for examples on how to create the subscription.",
+ )
+ parser_datastore.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_datastore.add_argument(
+ "--info_types",
+ action="append",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_datastore.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_datastore.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_datastore.add_argument(
+ "--namespace_id", help="The Datastore namespace to use, if applicable."
+ )
+ parser_datastore.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_datastore.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_datastore.add_argument(
+ "--timeout",
+ type=int,
+ help="The maximum number of seconds to wait for a response from the "
+ "API. The default is 300 seconds.",
+ default=300,
+ )
+
+ parser_bigquery = subparsers.add_parser(
+ "bigquery", help="Inspect files on Google BigQuery."
+ )
+ parser_bigquery.add_argument(
+ "bigquery_project",
+ help="The Google Cloud project id of the target table.",
+ )
+ parser_bigquery.add_argument(
+ "dataset_id", help="The ID of the target BigQuery dataset."
+ )
+ parser_bigquery.add_argument(
+ "table_id", help="The ID of the target BigQuery table."
+ )
+ parser_bigquery.add_argument(
+ "topic_id",
+ help="The id of the Cloud Pub/Sub topic to use to report that the job "
+ 'is complete, e.g. "dlp-sample-topic".',
+ )
+ parser_bigquery.add_argument(
+ "subscription_id",
+ help="The id of the Cloud Pub/Sub subscription to monitor for job "
+ 'completion, e.g. "dlp-sample-subscription". The subscription must '
+ "already be subscribed to the topic. See the test files or the Cloud "
+ "Pub/Sub sample files for examples on how to create the subscription.",
+ )
+ parser_bigquery.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_bigquery.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_bigquery.add_argument(
+ "--custom_dictionaries",
+ action="append",
+ help="Strings representing comma-delimited lists of dictionary words"
+ " to search for as custom info types. Each string is a comma "
+ "delimited list of words representing a distinct dictionary.",
+ default=None,
+ )
+ parser_bigquery.add_argument(
+ "--custom_regexes",
+ action="append",
+ help="Strings representing regex patterns to search for as custom "
+ " info types.",
+ default=None,
+ )
+ parser_bigquery.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_bigquery.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_bigquery.add_argument(
+ "--timeout",
+ type=int,
+ help="The maximum number of seconds to wait for a response from the "
+ "API. The default is 300 seconds.",
+ default=300,
+ )
+
+ args = parser.parse_args()
+
+ if args.content == "string":
+ inspect_string(
+ args.project,
+ args.item,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ include_quote=args.include_quote,
+ )
+ elif args.content == "table":
+ inspect_table(
+ args.project,
+ args.data,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ include_quote=args.include_quote,
+ )
+ elif args.content == "file":
+ inspect_file(
+ args.project,
+ args.filename,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ include_quote=args.include_quote,
+ mime_type=args.mime_type,
+ )
+ elif args.content == "gcs":
+ inspect_gcs_file(
+ args.project,
+ args.bucket,
+ args.filename,
+ args.topic_id,
+ args.subscription_id,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ timeout=args.timeout,
+ )
+ elif args.content == "datastore":
+ inspect_datastore(
+ args.project,
+ args.datastore_project,
+ args.kind,
+ args.topic_id,
+ args.subscription_id,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ namespace_id=args.namespace_id,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ timeout=args.timeout,
+ )
+ elif args.content == "bigquery":
+ inspect_bigquery(
+ args.project,
+ args.bigquery_project,
+ args.dataset_id,
+ args.table_id,
+ args.topic_id,
+ args.subscription_id,
+ args.info_types,
+ custom_dictionaries=args.custom_dictionaries,
+ custom_regexes=args.custom_regexes,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ timeout=args.timeout,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
new file mode 100644
index 000000000000..bdabda265c1b
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
@@ -0,0 +1,467 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.datastore
+import google.cloud.dlp_v2
+import google.cloud.exceptions
+import google.cloud.pubsub
+import google.cloud.storage
+import pytest
+
+import inspect_content
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+DATASTORE_KIND = "DLP test kind"
+DATASTORE_NAME = "DLP test object" + UNIQUE_STRING
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+
+TIMEOUT = 900 # 15 minutes
+
+
+@pytest.fixture(scope="module")
+def bucket():
+ # Creates a GCS bucket, uploads files required for the test, and tears down
+ # the entire bucket afterwards.
+
+ client = google.cloud.storage.Client()
+ try:
+ bucket = client.get_bucket(TEST_BUCKET_NAME)
+ except google.cloud.exceptions.NotFound:
+ bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+    # Upload the blobs and keep track of them in a list.
+ blobs = []
+ for name in RESOURCE_FILE_NAMES:
+ path = os.path.join(RESOURCE_DIRECTORY, name)
+ blob = bucket.blob(name)
+ blob.upload_from_filename(path)
+ blobs.append(blob)
+
+ # Yield the object to the test; lines after this execute as a teardown.
+ yield bucket
+
+ # Delete the files.
+ for blob in blobs:
+ try:
+ blob.delete()
+ except google.cloud.exceptions.NotFound:
+ print("Issue during teardown, missing blob")
+
+ # Attempt to delete the bucket; this will only work if it is empty.
+ bucket.delete()
+
+
+@pytest.fixture(scope="module")
+def topic_id():
+ # Creates a pubsub topic, and tears it down.
+ publisher = google.cloud.pubsub.PublisherClient()
+ topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+ try:
+ publisher.create_topic(topic_path)
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield TOPIC_ID
+
+ publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+ # Subscribes to a topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+ subscription_path = subscriber.subscription_path(
+ GCLOUD_PROJECT, SUBSCRIPTION_ID)
+ try:
+ subscriber.create_subscription(subscription_path, topic_path)
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield SUBSCRIPTION_ID
+
+ subscriber.delete_subscription(subscription_path)
+
+
+@pytest.fixture(scope="module")
+def datastore_project():
+ # Adds test Datastore data, yields the project ID and then tears down.
+ datastore_client = google.cloud.datastore.Client()
+
+ kind = DATASTORE_KIND
+ name = DATASTORE_NAME
+ key = datastore_client.key(kind, name)
+ item = google.cloud.datastore.Entity(key=key)
+ item["payload"] = "My name is Gary Smith and my email is gary@example.com"
+
+ datastore_client.put(item)
+
+ yield GCLOUD_PROJECT
+
+ datastore_client.delete(key)
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+    # Adds test BigQuery data, yields the project ID and then tears down.
+ bigquery_client = google.cloud.bigquery.Client()
+
+ dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+ dataset = google.cloud.bigquery.Dataset(dataset_ref)
+ try:
+ dataset = bigquery_client.create_dataset(dataset)
+ except google.api_core.exceptions.Conflict:
+ dataset = bigquery_client.get_dataset(dataset)
+
+ table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+ table = google.cloud.bigquery.Table(table_ref)
+
+    # A minimal two-column schema is sufficient for this test.
+ table.schema = (
+ google.cloud.bigquery.SchemaField("Name", "STRING"),
+ google.cloud.bigquery.SchemaField("Comment", "STRING"),
+ )
+
+ try:
+ table = bigquery_client.create_table(table)
+ except google.api_core.exceptions.Conflict:
+ table = bigquery_client.get_table(table)
+
+ rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")]
+
+ bigquery_client.insert_rows(table, rows_to_insert)
+
+ yield GCLOUD_PROJECT
+
+ bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+
+def test_inspect_string_basic(capsys):
+ test_string = "String with a phone number: 234-555-6789"
+
+ inspect_content.inspect_string_basic(GCLOUD_PROJECT, test_string)
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PHONE_NUMBER" in out
+ assert "Quote: 234-555-6789" in out
+
+
+def test_inspect_string(capsys):
+ test_string = "My name is Gary Smith and my email is gary@example.com"
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: FIRST_NAME" in out
+ assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_table(capsys):
+ test_tabular_data = {
+ "header": ["email", "phone number"],
+ "rows": [
+ ["robertfrost@xyz.com", "4232342345"],
+ ["johndoe@pqr.com", "4253458383"],
+ ],
+ }
+
+ inspect_content.inspect_table(
+ GCLOUD_PROJECT,
+ test_tabular_data,
+ ["PHONE_NUMBER", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PHONE_NUMBER" in out
+ assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_string_with_custom_info_types(capsys):
+ test_string = "My name is Gary Smith and my email is gary@example.com"
+ dictionaries = ["Gary Smith"]
+ regexes = ["\\w+@\\w+.com"]
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ [],
+ custom_dictionaries=dictionaries,
+ custom_regexes=regexes,
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: CUSTOM_DICTIONARY_0" in out
+ assert "Info type: CUSTOM_REGEX_0" in out
+
+
+def test_inspect_string_no_results(capsys):
+ test_string = "Nothing to see here"
+
+ inspect_content.inspect_string(
+ GCLOUD_PROJECT,
+ test_string,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "No findings" in out
+
+
+def test_inspect_file(capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt")
+
+ inspect_content.inspect_file(
+ GCLOUD_PROJECT,
+ test_filepath,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: EMAIL_ADDRESS" in out
+
+
+def test_inspect_file_with_custom_info_types(capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt")
+ dictionaries = ["gary@somedomain.com"]
+ regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"]
+
+ inspect_content.inspect_file(
+ GCLOUD_PROJECT,
+ test_filepath,
+ [],
+ custom_dictionaries=dictionaries,
+ custom_regexes=regexes,
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: CUSTOM_DICTIONARY_0" in out
+ assert "Info type: CUSTOM_REGEX_0" in out
+
+
+def test_inspect_file_no_results(capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt")
+
+ inspect_content.inspect_file(
+ GCLOUD_PROJECT,
+ test_filepath,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "No findings" in out
+
+
+def test_inspect_image_file(capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
+
+ inspect_content.inspect_file(
+ GCLOUD_PROJECT,
+ test_filepath,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ include_quote=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: PHONE_NUMBER" in out
+
+
+def cancel_operation(out):
+ if "Inspection operation started" in out:
+ # Cancel the operation
+ operation_id = out.split(
+ "Inspection operation started: ")[1].split("\n")[0]
+ client = google.cloud.dlp_v2.DlpServiceClient()
+ client.cancel_dlp_job(operation_id)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "test.txt",
+ topic_id,
+ subscription_id,
+ ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Info type: EMAIL_ADDRESS" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_file_with_custom_info_types(
+ bucket, topic_id, subscription_id, capsys):
+ try:
+ dictionaries = ["gary@somedomain.com"]
+ regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"]
+
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "test.txt",
+ topic_id,
+ subscription_id,
+ [],
+ custom_dictionaries=dictionaries,
+ custom_regexes=regexes,
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+
+ assert "Info type: EMAIL_ADDRESS" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_file_no_results(
+ bucket, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "harmless.txt",
+ topic_id,
+ subscription_id,
+ ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+
+ assert "No findings" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_image_file(bucket, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "test.png",
+ topic_id,
+ subscription_id,
+ ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+ assert "Info type: EMAIL_ADDRESS" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_gcs_file(
+ GCLOUD_PROJECT,
+ bucket.name,
+ "*",
+ topic_id,
+ subscription_id,
+ ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+
+ assert "Info type: EMAIL_ADDRESS" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore(
+ datastore_project, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_datastore(
+ GCLOUD_PROJECT,
+ datastore_project,
+ DATASTORE_KIND,
+ topic_id,
+ subscription_id,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+ assert "Info type: EMAIL_ADDRESS" in out
+ finally:
+ cancel_operation(out)
+
+
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_inspect_datastore_no_results(
+ datastore_project, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_datastore(
+ GCLOUD_PROJECT,
+ datastore_project,
+ DATASTORE_KIND,
+ topic_id,
+ subscription_id,
+ ["PHONE_NUMBER"],
+ timeout=TIMEOUT)
+
+ out, _ = capsys.readouterr()
+ assert "No findings" in out
+ finally:
+ cancel_operation(out)
+
+
+def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys):
+ try:
+ inspect_content.inspect_bigquery(
+ GCLOUD_PROJECT,
+ bigquery_project,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_TABLE_ID,
+ topic_id,
+ subscription_id,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ timeout=1)
+
+ out, _ = capsys.readouterr()
+ assert "Inspection operation started" in out
+ finally:
+ cancel_operation(out)
diff --git a/packages/google-cloud-dlp/samples/snippets/jobs.py b/packages/google-cloud-dlp/samples/snippets/jobs.py
new file mode 100644
index 000000000000..a8ac0b43c5e0
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/jobs.py
@@ -0,0 +1,167 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app to list and delete DLP jobs using the Data Loss Prevent API. """
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_list_jobs]
+def list_dlp_jobs(project, filter_string=None, job_type=None):
+ """Uses the Data Loss Prevention API to lists DLP jobs that match the
+ specified filter in the request.
+ Args:
+ project: The project id to use as a parent resource.
+        filter_string: (Optional) Allows filtering.
+ Supported syntax:
+ * Filter expressions are made up of one or more restrictions.
+ * Restrictions can be combined by 'AND' or 'OR' logical operators.
+ A sequence of restrictions implicitly uses 'AND'.
+            * A restriction has the form of '<field> <operator> <value>'.
+ * Supported fields/values for inspect jobs:
+ - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED
+ - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY
+ - `trigger_name` - The resource name of the trigger that
+ created job.
+ * Supported fields for risk analysis jobs:
+ - `state` - RUNNING|CANCELED|FINISHED|FAILED
+ * The operator must be '=' or '!='.
+ Examples:
+ * inspected_storage = cloud_storage AND state = done
+ * inspected_storage = cloud_storage OR inspected_storage = bigquery
+ * inspected_storage = cloud_storage AND
+ (state = done OR state = canceled)
+        job_type: (Optional) The type of job. Defaults to 'INSPECT'.
+ Choices:
+ DLP_JOB_TYPE_UNSPECIFIED
+ INSPECT_JOB: The job inspected content for sensitive data.
+ RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Job type dictionary
+ job_type_to_int = {
+ "DLP_JOB_TYPE_UNSPECIFIED":
+ google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED,
+ "INSPECT_JOB": google.cloud.dlp.enums.DlpJobType.INSPECT_JOB,
+ "RISK_ANALYSIS_JOB": google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB,
+ }
+ # If job type is specified, convert job type to number through enums.
+ if job_type:
+ job_type = job_type_to_int[job_type]
+
+ # Call the API to get a list of jobs.
+ response = dlp.list_dlp_jobs(parent, filter_=filter_string, type_=job_type)
+
+ # Iterate over results.
+ for job in response:
+ print("Job: %s; status: %s" % (job.name, job.JobState.Name(job.state)))
+
+
+# [END dlp_list_jobs]
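+
+# A minimal usage sketch (the project id is a placeholder):
+#
+#   list_dlp_jobs("my-project", filter_string="state=RUNNING",
+#                 job_type="INSPECT_JOB")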
+
+
+# [START dlp_delete_job]
+def delete_dlp_job(project, job_name):
+ """Uses the Data Loss Prevention API to delete a long-running DLP job.
+ Args:
+ project: The project id to use as a parent resource.
+ job_name: The name of the DlpJob resource to be deleted.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id and job name into a full resource id.
+ name = dlp.dlp_job_path(project, job_name)
+
+ # Call the API to delete job.
+ dlp.delete_dlp_job(name)
+
+ print("Successfully deleted %s" % job_name)
+
+
+# [END dlp_delete_job]
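+
+# A minimal usage sketch (both ids are placeholders; pass the short job id,
+# not the full resource path):
+#
+#   delete_dlp_job("my-project", "i-1234567890123456789")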
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select how to submit content to the API."
+ )
+ subparsers.required = True
+
+ list_parser = subparsers.add_parser(
+ "list",
+ help="List Data Loss Prevention API jobs corresponding to a given "
+ "filter.",
+ )
+ list_parser.add_argument(
+ "project", help="The project id to use as a parent resource."
+ )
+ list_parser.add_argument(
+ "-f",
+ "--filter",
+ help="Filter expressions are made up of one or more restrictions.",
+ )
+ list_parser.add_argument(
+ "-t",
+ "--type",
+ choices=[
+ "DLP_JOB_TYPE_UNSPECIFIED",
+ "INSPECT_JOB",
+ "RISK_ANALYSIS_JOB",
+ ],
+ help='The type of job. API defaults to "INSPECT"',
+ )
+
+ delete_parser = subparsers.add_parser(
+ "delete", help="Delete results of a Data Loss Prevention API job."
+ )
+ delete_parser.add_argument(
+ "project", help="The project id to use as a parent resource."
+ )
+ delete_parser.add_argument(
+ "job_name",
+ help="The name of the DlpJob resource to be deleted. "
+ "Example: X-#####",
+ )
+
+ args = parser.parse_args()
+
+ if args.content == "list":
+ list_dlp_jobs(
+ args.project, filter_string=args.filter, job_type=args.type
+ )
+ elif args.content == "delete":
+ delete_dlp_job(args.project, args.job_name)
diff --git a/packages/google-cloud-dlp/samples/snippets/jobs_test.py b/packages/google-cloud-dlp/samples/snippets/jobs_test.py
new file mode 100644
index 000000000000..89997bc5097c
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/jobs_test.py
@@ -0,0 +1,89 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import pytest
+
+import jobs
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_COLUMN_NAME = "zip_code"
+TEST_TABLE_PROJECT_ID = "bigquery-public-data"
+TEST_DATASET_ID = "san_francisco"
+TEST_TABLE_ID = "bikeshare_trips"
+test_job_id = "test-job-{}".format(uuid.uuid4())
+
+
+@pytest.fixture(scope="module")
+def test_job_name():
+    import google.api_core.exceptions
+    import google.cloud.dlp
+
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ parent = dlp.project_path(GCLOUD_PROJECT)
+
+ # Construct job request
+ risk_job = {
+ "privacy_metric": {
+ "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}}
+ },
+ "source_table": {
+ "project_id": TEST_TABLE_PROJECT_ID,
+ "dataset_id": TEST_DATASET_ID,
+ "table_id": TEST_TABLE_ID,
+ },
+ }
+
+ response = dlp.create_dlp_job(parent, risk_job=risk_job, job_id=test_job_id)
+ full_path = response.name
+ # API expects only job name, not full project path
+ job_name = full_path[full_path.rfind("/") + 1:]
+ yield job_name
+
+ # clean up job if not deleted
+ try:
+ dlp.delete_dlp_job(full_path)
+ except google.api_core.exceptions.NotFound:
+ print("Issue during teardown, missing job")
+
+
+def test_list_dlp_jobs(test_job_name, capsys):
+ jobs.list_dlp_jobs(GCLOUD_PROJECT)
+
+ out, _ = capsys.readouterr()
+    assert test_job_name not in out  # default listing is inspect jobs only
+
+
+def test_list_dlp_jobs_with_filter(test_job_name, capsys):
+ jobs.list_dlp_jobs(
+ GCLOUD_PROJECT,
+ filter_string="state=RUNNING OR state=DONE",
+ job_type="RISK_ANALYSIS_JOB",
+ )
+
+ out, _ = capsys.readouterr()
+ assert test_job_name in out
+
+
+def test_list_dlp_jobs_with_job_type(test_job_name, capsys):
+ jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB")
+
+ out, _ = capsys.readouterr()
+ assert test_job_name not in out # job created is a risk analysis job
+
+
+def test_delete_dlp_job(test_job_name, capsys):
+    jobs.delete_dlp_job(GCLOUD_PROJECT, test_job_name)
+
+    out, _ = capsys.readouterr()
+    assert test_job_name in out
diff --git a/packages/google-cloud-dlp/samples/snippets/metadata.py b/packages/google-cloud-dlp/samples/snippets/metadata.py
new file mode 100644
index 000000000000..7a65941d622a
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/metadata.py
@@ -0,0 +1,72 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that queries the Data Loss Prevention API for supported
+categories and info types."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_list_info_types]
+def list_info_types(language_code=None, result_filter=None):
+ """List types of sensitive information within a category.
+ Args:
+ language_code: The BCP-47 language code to use, e.g. 'en-US'.
+        result_filter: An optional filter to only return info types supported by
+ certain parts of the API. Defaults to "supported_by=INSPECT".
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Make the API call.
+ response = dlp.list_info_types(language_code, result_filter)
+
+ # Print the results to the console.
+ print("Info types:")
+ for info_type in response.info_types:
+ print(
+ u"{name}: {display_name}".format(
+ name=info_type.name, display_name=info_type.display_name
+ )
+ )
+
+
+# [END dlp_list_info_types]
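+
+# A minimal usage sketch; both arguments are optional:
+#
+#   list_info_types(language_code="en-US",
+#                   result_filter="supported_by=INSPECT")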
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument(
+ "--language_code",
+ help="The BCP-47 language code to use, e.g. 'en-US'.",
+ )
+ parser.add_argument(
+ "--filter",
+ help="An optional filter to only return info types supported by "
+ 'certain parts of the API. Defaults to "supported_by=INSPECT".',
+ )
+
+ args = parser.parse_args()
+
+ list_info_types(
+ language_code=args.language_code, result_filter=args.filter
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/metadata_test.py b/packages/google-cloud-dlp/samples/snippets/metadata_test.py
new file mode 100644
index 000000000000..bde63fd3e8fb
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/metadata_test.py
@@ -0,0 +1,22 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import metadata
+
+
+def test_fetch_info_types(capsys):
+ metadata.list_info_types()
+
+ out, _ = capsys.readouterr()
+ assert "EMAIL_ADDRESS" in out
diff --git a/packages/google-cloud-dlp/samples/snippets/noxfile.py b/packages/google-cloud-dlp/samples/snippets/noxfile.py
new file mode 100644
index 000000000000..ba55d7ce53ca
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/noxfile.py
@@ -0,0 +1,224 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+from pathlib import Path
+import sys
+
+import nox
+
+
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# DO NOT EDIT THIS FILE EVER!
+# WARNING - WARNING - WARNING - WARNING - WARNING
+# WARNING - WARNING - WARNING - WARNING - WARNING
+
+# Copy `noxfile_config.py` to your directory and modify it instead.
+
+
+# `TEST_CONFIG` dict is a configuration hook that allows users to
+# modify the test configurations. The values here should be in sync
+# with `noxfile_config.py`. Users will copy `noxfile_config.py` into
+# their directory and modify it.
+
+TEST_CONFIG = {
+ # You can opt out from the test for specific Python versions.
+ 'ignored_versions': ["2.7"],
+
+ # An envvar key for determining the project id to use. Change it
+ # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
+ # build specific Cloud project. You can also use your own string
+ # to use your own Cloud project.
+ 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
+ # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
+
+ # A dictionary you want to inject into your test. Don't put any
+ # secrets here. These values will override predefined values.
+ 'envs': {},
+}
+
+
+try:
+ # Ensure we can import noxfile_config in the project's directory.
+ sys.path.append('.')
+ from noxfile_config import TEST_CONFIG_OVERRIDE
+except ImportError as e:
+ print("No user noxfile_config found: detail: {}".format(e))
+ TEST_CONFIG_OVERRIDE = {}
+
+# Update the TEST_CONFIG with the user supplied values.
+TEST_CONFIG.update(TEST_CONFIG_OVERRIDE)
+
+
+def get_pytest_env_vars():
+ """Returns a dict for pytest invocation."""
+ ret = {}
+
+ # Override the GCLOUD_PROJECT and the alias.
+ env_key = TEST_CONFIG['gcloud_project_env']
+ # This should error out if not set.
+ ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
+
+ # Apply user supplied envs.
+ ret.update(TEST_CONFIG['envs'])
+ return ret
+
+
+# DO NOT EDIT - automatically generated.
+# All versions used to test samples.
+ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]
+
+# Any default versions that should be ignored.
+IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
+
+TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
+
+INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False))
+#
+# Style Checks
+#
+
+
+def _determine_local_import_names(start_dir):
+ """Determines all import names that should be considered "local".
+
+    This is used when running the linter to ensure that import order is
+ properly checked.
+ """
+ file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)]
+ return [
+ basename
+ for basename, extension in file_ext_pairs
+ if extension == ".py"
+ or os.path.isdir(os.path.join(start_dir, basename))
+ and basename not in ("__pycache__")
+ ]
+
+
+# Linting with flake8.
+#
+# We ignore the following rules:
+# E203: whitespace before ':'
+# E266: too many leading '#' for block comment
+# E501: line too long
+# I202: Additional newline in a section of imports
+#
+# We also need to specify the rules which are ignored by default:
+# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121']
+FLAKE8_COMMON_ARGS = [
+ "--show-source",
+ "--builtin=gettext",
+ "--max-complexity=20",
+ "--import-order-style=google",
+ "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py",
+ "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202",
+ "--max-line-length=88",
+]
+
+
+@nox.session
+def lint(session):
+ session.install("flake8", "flake8-import-order")
+
+ local_names = _determine_local_import_names(".")
+ args = FLAKE8_COMMON_ARGS + [
+ "--application-import-names",
+ ",".join(local_names),
+ "."
+ ]
+ session.run("flake8", *args)
+
+
+#
+# Sample Tests
+#
+
+
+PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"]
+
+
+def _session_tests(session, post_install=None):
+ """Runs py.test for a particular project."""
+ if os.path.exists("requirements.txt"):
+ session.install("-r", "requirements.txt")
+
+ if os.path.exists("requirements-test.txt"):
+ session.install("-r", "requirements-test.txt")
+
+ if INSTALL_LIBRARY_FROM_SOURCE:
+ session.install("-e", _get_repo_root())
+
+ if post_install:
+ post_install(session)
+
+ session.run(
+ "pytest",
+ *(PYTEST_COMMON_ARGS + session.posargs),
+ # Pytest will return 5 when no tests are collected. This can happen
+ # on travis where slow and flaky tests are excluded.
+ # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html
+ success_codes=[0, 5],
+ env=get_pytest_env_vars()
+ )
+
+
+@nox.session(python=ALL_VERSIONS)
+def py(session):
+ """Runs py.test for a sample using the specified version of Python."""
+ if session.python in TESTED_VERSIONS:
+ _session_tests(session)
+ else:
+ session.skip("SKIPPED: {} tests are disabled for this sample.".format(
+ session.python
+ ))
+
+
+#
+# Readmegen
+#
+
+
+def _get_repo_root():
+ """ Returns the root folder of the project. """
+ # Get root of this repository. Assume we don't have directories nested deeper than 10 items.
+ p = Path(os.getcwd())
+ for i in range(10):
+ if p is None:
+ break
+ if Path(p / ".git").exists():
+ return str(p)
+ p = p.parent
+ raise Exception("Unable to detect repository root.")
+
+
+GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")])
+
+
+@nox.session
+@nox.parametrize("path", GENERATED_READMES)
+def readmegen(session, path):
+ """(Re-)generates the readme for a sample."""
+ session.install("jinja2", "pyyaml")
+ dir_ = os.path.dirname(path)
+
+ if os.path.exists(os.path.join(dir_, "requirements.txt")):
+ session.install("-r", os.path.join(dir_, "requirements.txt"))
+
+ in_file = os.path.join(dir_, "README.rst.in")
+ session.run(
+ "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart.py b/packages/google-cloud-dlp/samples/snippets/quickstart.py
new file mode 100644
index 000000000000..ec929b45f541
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/quickstart.py
@@ -0,0 +1,98 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that queries the Data Loss Prevention API for supported
+categories and info types."""
+
+from __future__ import print_function
+
+import argparse
+import sys
+
+
+def quickstart(project_id):
+ """Demonstrates use of the Data Loss Prevention API client library."""
+
+ # [START dlp_quickstart]
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp_client = google.cloud.dlp_v2.DlpServiceClient()
+
+ # The string to inspect
+ content = "Robert Frost"
+
+ # Construct the item to inspect.
+ item = {"value": content}
+
+ # The info types to search for in the content. Required.
+ info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}]
+
+ # The minimum likelihood to constitute a match. Optional.
+ min_likelihood = "LIKELIHOOD_UNSPECIFIED"
+
+ # The maximum number of findings to report (0 = server maximum). Optional.
+ max_findings = 0
+
+ # Whether to include the matching string in the results. Optional.
+ include_quote = True
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "min_likelihood": min_likelihood,
+ "include_quote": include_quote,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Convert the project id into a full resource id.
+ parent = dlp_client.project_path(project_id)
+
+ # Call the API.
+ response = dlp_client.inspect_content(parent, inspect_config, item)
+
+ # Print out the results.
+ if response.result.findings:
+ for finding in response.result.findings:
+ try:
+ print("Quote: {}".format(finding.quote))
+ except AttributeError:
+ pass
+ print("Info type: {}".format(finding.info_type.name))
+            # Convert likelihood value to its string representation.
+ likelihood = (
+ google.cloud.dlp.types.Finding.DESCRIPTOR.fields_by_name[
+ "likelihood"
+ ]
+ .enum_type.values_by_number[finding.likelihood]
+ .name
+ )
+ print("Likelihood: {}".format(likelihood))
+ else:
+ print("No findings.")
+ # [END dlp_quickstart]
+
+
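+# Example invocation (the project id is a placeholder):
+#
+#   python quickstart.py my-project
+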
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "project_id", help="Enter your GCP project id.", type=str
+ )
+    if len(sys.argv) == 1:
+        parser.print_usage()
+        sys.exit(1)
+    args = parser.parse_args()
+    quickstart(args.project_id)
diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart_test.py b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py
new file mode 100644
index 000000000000..1814497c1660
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py
@@ -0,0 +1,37 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import google.cloud.dlp
+import mock
+
+import quickstart
+
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+
+
+def test_quickstart(capsys):
+ # Mock out project_path to use the test runner's project ID.
+ with mock.patch.object(
+ google.cloud.dlp.DlpServiceClient,
+ "project_path",
+ return_value="projects/{}".format(GCLOUD_PROJECT),
+ ):
+ quickstart.quickstart(GCLOUD_PROJECT)
+
+ out, _ = capsys.readouterr()
+ assert "FIRST_NAME" in out
+ assert "LAST_NAME" in out
diff --git a/packages/google-cloud-dlp/samples/snippets/redact.py b/packages/google-cloud-dlp/samples/snippets/redact.py
new file mode 100644
index 000000000000..8a1650a262db
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/redact.py
@@ -0,0 +1,255 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevent API to redact the contents of
+an image file."""
+
+from __future__ import print_function
+
+import argparse
+
+# [START dlp_redact_image]
+import mimetypes
+
+# [END dlp_redact_image]
+import os
+
+
+# [START dlp_redact_image]
+
+
+def redact_image(
+ project,
+ filename,
+ output_filename,
+ info_types,
+ min_likelihood=None,
+ mime_type=None,
+):
+ """Uses the Data Loss Prevention API to redact protected data in an image.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ filename: The path to the file to inspect.
+ output_filename: The path to which the redacted image will be written.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ mime_type: The MIME type of the file. If not specified, the type is
+ inferred via the Python standard library's mimetypes module.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Prepare image_redaction_configs, a list of dictionaries. Each dictionary
+ # contains an info_type and optionally the color used for the replacement.
+ # The color is omitted in this sample, so the default (black) will be used.
+ image_redaction_configs = []
+
+    for info_type in info_types:
+        image_redaction_configs.append({"info_type": info_type})
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "min_likelihood": min_likelihood,
+ "info_types": info_types,
+ }
+
+ # If mime_type is not specified, guess it from the filename.
+ if mime_type is None:
+ mime_guess = mimetypes.MimeTypes().guess_type(filename)
+ mime_type = mime_guess[0] or "application/octet-stream"
+
+ # Select the content type index from the list of supported types.
+ supported_content_types = {
+ None: 0, # "Unspecified"
+ "image/jpeg": 1,
+ "image/bmp": 2,
+ "image/png": 3,
+ "image/svg": 4,
+ "text/plain": 5,
+ }
+ content_type_index = supported_content_types.get(mime_type, 0)
+
+ # Construct the byte_item, containing the file's byte data.
+ with open(filename, mode="rb") as f:
+ byte_item = {"type": content_type_index, "data": f.read()}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.redact_image(
+ parent,
+ inspect_config=inspect_config,
+ image_redaction_configs=image_redaction_configs,
+ byte_item=byte_item,
+ )
+
+ # Write out the results.
+ with open(output_filename, mode="wb") as f:
+ f.write(response.redacted_image)
+ print(
+ "Wrote {byte_count} to {filename}".format(
+ byte_count=len(response.redacted_image), filename=output_filename
+ )
+ )
+
+
+# [END dlp_redact_image]
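+
+# A minimal usage sketch; the project id and file paths are placeholders:
+#
+#   redact_image("my-project", "test.png", "test-redacted.png",
+#                ["FIRST_NAME", "EMAIL_ADDRESS"])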
+
+# [START dlp_redact_image_all_text]
+
+
+def redact_image_all_text(
+ project,
+ filename,
+ output_filename,
+):
+ """Uses the Data Loss Prevention API to redact all text in an image.
+
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ filename: The path to the file to inspect.
+ output_filename: The path to which the redacted image will be written.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Construct the image_redaction_configs, indicating to DLP that all text in
+ # the input image should be redacted.
+ image_redaction_configs = [{
+ "redact_all_text": True,
+ }]
+
+ # Construct the byte_item, containing the file's byte data.
+ with open(filename, mode="rb") as f:
+ byte_item = {"type": "IMAGE", "data": f.read()}
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.redact_image(
+ parent,
+ image_redaction_configs=image_redaction_configs,
+ byte_item=byte_item,
+ )
+
+ # Write out the results.
+ with open(output_filename, mode="wb") as f:
+ f.write(response.redacted_image)
+
+ print("Wrote {byte_count} to {filename}".format(
+ byte_count=len(response.redacted_image), filename=output_filename))
+
+
+# [END dlp_redact_image_all_text]
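+
+# A minimal usage sketch; the project id and file paths are placeholders:
+#
+#   redact_image_all_text("my-project", "test.png", "test-redacted.png")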
+
+if __name__ == "__main__":
+ default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+ common_args_parser = argparse.ArgumentParser(add_help=False)
+ common_args_parser.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ common_args_parser.add_argument(
+ "filename", help="The path to the file to inspect.")
+ common_args_parser.add_argument(
+ "output_filename",
+ help="The path to which the redacted image will be written.",
+ )
+
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select which content should be redacted.")
+ subparsers.required = True
+
+ info_types_parser = subparsers.add_parser(
+ "info_types",
+ help="Redact specific infoTypes from an image.",
+ parents=[common_args_parser],
+ )
+ info_types_parser.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ info_types_parser.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ info_types_parser.add_argument(
+ "--mime_type",
+ help="The MIME type of the file. If not specified, the type is "
+ "inferred via the Python standard library's mimetypes module.",
+ )
+
+ all_text_parser = subparsers.add_parser(
+ "all_text",
+ help="Redact all text from an image. The MIME type of the file is "
+ "inferred via the Python standard library's mimetypes module.",
+ parents=[common_args_parser],
+ )
+
+ args = parser.parse_args()
+
+ if args.content == "info_types":
+ redact_image(
+ args.project,
+ args.filename,
+ args.output_filename,
+ args.info_types,
+ min_likelihood=args.min_likelihood,
+ mime_type=args.mime_type,
+ )
+ elif args.content == "all_text":
+ redact_image_all_text(
+ args.project,
+ args.filename,
+ args.output_filename,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/redact_test.py b/packages/google-cloud-dlp/samples/snippets/redact_test.py
new file mode 100644
index 000000000000..0cce514eb1a6
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/redact_test.py
@@ -0,0 +1,60 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import tempfile
+
+import pytest
+
+import redact
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+
+
+@pytest.fixture(scope="module")
+def tempdir():
+ tempdir = tempfile.mkdtemp()
+ yield tempdir
+ shutil.rmtree(tempdir)
+
+
+def test_redact_image_file(tempdir, capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
+ output_filepath = os.path.join(tempdir, "redacted.png")
+
+ redact.redact_image(
+ GCLOUD_PROJECT,
+ test_filepath,
+ output_filepath,
+ ["FIRST_NAME", "EMAIL_ADDRESS"],
+ )
+
+ out, _ = capsys.readouterr()
+ assert output_filepath in out
+
+
+def test_redact_image_all_text(tempdir, capsys):
+ test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png")
+ output_filepath = os.path.join(tempdir, "redacted.png")
+
+ redact.redact_image_all_text(
+ GCLOUD_PROJECT,
+ test_filepath,
+ output_filepath,
+ )
+
+ out, _ = capsys.readouterr()
+ assert output_filepath in out
diff --git a/packages/google-cloud-dlp/samples/snippets/requirements-test.txt b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt
new file mode 100644
index 000000000000..d0c01cc98c5f
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt
@@ -0,0 +1,4 @@
+pytest==6.0.1
+flaky==3.7.0
+mock==4.0.2
+
diff --git a/packages/google-cloud-dlp/samples/snippets/requirements.txt b/packages/google-cloud-dlp/samples/snippets/requirements.txt
new file mode 100644
index 000000000000..08b72bbe1fdf
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/requirements.txt
@@ -0,0 +1,5 @@
+google-cloud-dlp==1.0.0
+google-cloud-storage==1.30.0
+google-cloud-pubsub==1.7.0
+google-cloud-datastore==1.13.2
+google-cloud-bigquery==1.25.0
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt
new file mode 100644
index 000000000000..2763cd0ab820
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt
@@ -0,0 +1 @@
+My credit card number is 1234 5678 9012 3456, and my CVV is 789.
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/dates.csv b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv
new file mode 100644
index 000000000000..056fccb328ea
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv
@@ -0,0 +1,5 @@
+name,birth_date,register_date,credit_card
+Ann,01/01/1970,07/21/1996,4532908762519852
+James,03/06/1988,04/09/2001,4301261899725540
+Dan,08/14/1945,11/15/2011,4620761856015295
+Laura,11/03/1992,01/04/2017,4564981067258901
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt
new file mode 100644
index 000000000000..5666de37ab23
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt
@@ -0,0 +1 @@
+This file is mostly harmless.
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.png b/packages/google-cloud-dlp/samples/snippets/resources/test.png
new file mode 100644
index 000000000000..8f32c8258842
Binary files /dev/null and b/packages/google-cloud-dlp/samples/snippets/resources/test.png differ
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.txt b/packages/google-cloud-dlp/samples/snippets/resources/test.txt
new file mode 100644
index 000000000000..c2ee3815bc9b
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/resources/test.txt
@@ -0,0 +1 @@
+My phone number is (223) 456-7890 and my email address is gary@somedomain.com.
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/samples/snippets/risk.py b/packages/google-cloud-dlp/samples/snippets/risk.py
new file mode 100644
index 000000000000..518f947eee6b
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/risk.py
@@ -0,0 +1,947 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that uses the Data Loss Prevent API to perform risk anaylsis."""
+
+from __future__ import print_function
+
+import argparse
+
+
+# [START dlp_numerical_stats]
+def numerical_risk_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ column_name,
+ topic_id,
+ subscription_id,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute risk metrics of a column
+ of numerical data in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ column_name: The name of the column to compute risk metrics for.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure risk analysis job
+ # Give the name of the numeric column to compute risk metrics for
+ risk_job = {
+ "privacy_metric": {
+ "numerical_stats_config": {"field": {"name": column_name}}
+ },
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call API to start risk analysis job
+ operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ results = job.risk_details.numerical_stats_result
+ print(
+ "Value Range: [{}, {}]".format(
+ results.min_value.integer_value,
+ results.max_value.integer_value,
+ )
+ )
+ prev_value = None
+ for percent, result in enumerate(results.quantile_values):
+ value = result.integer_value
+ if prev_value != value:
+ print("Value at {}% quantile: {}".format(percent, value))
+ prev_value = value
+ subscription.set_result(None)
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+ subscription = subscriber.subscribe(subscription_path, callback)
+
+ try:
+ subscription.result(timeout=timeout)
+ except TimeoutError:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+ subscription.close()
+
+
+# [END dlp_numerical_stats]
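+
+# A minimal usage sketch; all ids are placeholders, and the Pub/Sub
+# subscription must already be attached to the topic:
+#
+#   numerical_risk_analysis(
+#       "my-project", "bigquery-public-data", "san_francisco",
+#       "bikeshare_trips", "duration_sec", "my-topic", "my-subscription")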
+
+
+# [START dlp_categorical_stats]
+def categorical_risk_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ column_name,
+ topic_id,
+ subscription_id,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute risk metrics of a column
+ of categorical data in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ column_name: The name of the column to compute risk metrics for.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure risk analysis job
+ # Give the name of the numeric column to compute risk metrics for
+ risk_job = {
+ "privacy_metric": {
+ "categorical_stats_config": {"field": {"name": column_name}}
+ },
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call API to start risk analysis job
+ operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ histogram_buckets = (
+ job.risk_details.categorical_stats_result.value_frequency_histogram_buckets # noqa: E501
+ )
+ # Print bucket stats
+ for i, bucket in enumerate(histogram_buckets):
+ print("Bucket {}:".format(i))
+ print(
+ " Most common value occurs {} time(s)".format(
+ bucket.value_frequency_upper_bound
+ )
+ )
+ print(
+ " Least common value occurs {} time(s)".format(
+ bucket.value_frequency_lower_bound
+ )
+ )
+ print(" {} unique values total.".format(bucket.bucket_size))
+ for value in bucket.bucket_values:
+ print(
+ " Value {} occurs {} time(s)".format(
+ value.value.integer_value, value.count
+ )
+ )
+ subscription.set_result(None)
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+ subscription = subscriber.subscribe(subscription_path, callback)
+
+ try:
+ subscription.result(timeout=timeout)
+ except TimeoutError:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+ subscription.close()
+
+
+# [END dlp_categorical_stats]
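+
+# A minimal usage sketch (ids are placeholders, as in numerical_risk_analysis):
+#
+#   categorical_risk_analysis(
+#       "my-project", "bigquery-public-data", "san_francisco",
+#       "bikeshare_trips", "zip_code", "my-topic", "my-subscription")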
+
+
+# [START dlp_k_anonymity]
+def k_anonymity_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ topic_id,
+ subscription_id,
+ quasi_ids,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute the k-anonymity of a
+ column set in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ quasi_ids: A set of columns that form a composite key.
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Create helper function for unpacking values
+ def get_values(obj):
+ return int(obj.integer_value)
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Convert quasi id list to Protobuf type
+ def map_fields(field):
+ return {"name": field}
+
+ quasi_ids = map(map_fields, quasi_ids)
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure risk analysis job
+ # Give the name of the numeric column to compute risk metrics for
+ risk_job = {
+ "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}},
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call API to start risk analysis job
+ operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ histogram_buckets = (
+ job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets
+ )
+ # Print bucket stats
+ for i, bucket in enumerate(histogram_buckets):
+ print("Bucket {}:".format(i))
+ if bucket.equivalence_class_size_lower_bound:
+ print(
+ " Bucket size range: [{}, {}]".format(
+ bucket.equivalence_class_size_lower_bound,
+ bucket.equivalence_class_size_upper_bound,
+ )
+ )
+ for value_bucket in bucket.bucket_values:
+ print(
+ " Quasi-ID values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+ )
+ )
+ print(
+ " Class size: {}".format(
+ value_bucket.equivalence_class_size
+ )
+ )
+ subscription.set_result(None)
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+ subscription = subscriber.subscribe(subscription_path, callback)
+
+ try:
+ subscription.result(timeout=timeout)
+ except TimeoutError:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+ subscription.close()
+
+
+# [END dlp_k_anonymity]
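+
+# A minimal usage sketch; ids and quasi-id column names are placeholders:
+#
+#   k_anonymity_analysis(
+#       "my-project", "my-table-project", "my_dataset", "my_table",
+#       "my-topic", "my-subscription", ["zip_code", "birth_date"])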
+
+
+# [START dlp_l_diversity]
+def l_diversity_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ topic_id,
+ subscription_id,
+ sensitive_attribute,
+ quasi_ids,
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute the l-diversity of a
+ column set in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ sensitive_attribute: The column to measure l-diversity relative to.
+ quasi_ids: A set of columns that form a composite key.
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Create helper function for unpacking values
+ def get_values(obj):
+ return int(obj.integer_value)
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Convert quasi id list to Protobuf type
+ def map_fields(field):
+ return {"name": field}
+
+ quasi_ids = map(map_fields, quasi_ids)
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure risk analysis job
+ # Give the name of the numeric column to compute risk metrics for
+ risk_job = {
+ "privacy_metric": {
+ "l_diversity_config": {
+ "quasi_ids": quasi_ids,
+ "sensitive_attribute": {"name": sensitive_attribute},
+ }
+ },
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call API to start risk analysis job
+ operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ histogram_buckets = (
+ job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets # noqa: E501
+ )
+ # Print bucket stats
+ for i, bucket in enumerate(histogram_buckets):
+ print("Bucket {}:".format(i))
+ print(
+ " Bucket size range: [{}, {}]".format(
+ bucket.sensitive_value_frequency_lower_bound,
+ bucket.sensitive_value_frequency_upper_bound,
+ )
+ )
+ for value_bucket in bucket.bucket_values:
+ print(
+ " Quasi-ID values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+ )
+ )
+ print(
+ " Class size: {}".format(
+ value_bucket.equivalence_class_size
+ )
+ )
+ for value in value_bucket.top_sensitive_values:
+ print(
+ (
+ " Sensitive value {} occurs {} time(s)".format(
+ value.value, value.count
+ )
+ )
+ )
+ subscription.set_result(None)
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+ subscription = subscriber.subscribe(subscription_path, callback)
+
+ try:
+ subscription.result(timeout=timeout)
+ except TimeoutError:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+ subscription.close()
+
+
+# [END dlp_l_diversity]
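+
+# A minimal usage sketch; ids and column names are placeholders (the
+# sensitive attribute precedes the quasi-id list):
+#
+#   l_diversity_analysis(
+#       "my-project", "my-table-project", "my_dataset", "my_table",
+#       "my-topic", "my-subscription", "last_name",
+#       ["zip_code", "birth_date"])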
+
+
+# [START dlp_k_map]
+def k_map_estimate_analysis(
+ project,
+ table_project_id,
+ dataset_id,
+ table_id,
+ topic_id,
+ subscription_id,
+ quasi_ids,
+ info_types,
+ region_code="US",
+ timeout=300,
+):
+ """Uses the Data Loss Prevention API to compute the k-map risk estimation
+ of a column set in a Google BigQuery table.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ table_project_id: The Google Cloud project id where the BigQuery table
+ is stored.
+ dataset_id: The id of the dataset to inspect.
+ table_id: The id of the table to inspect.
+ topic_id: The name of the Pub/Sub topic to notify once the job
+ completes.
+ subscription_id: The name of the Pub/Sub subscription to use when
+ listening for job completion notifications.
+ quasi_ids: A set of columns that form a composite key and optionally
+ their reidentification distributions.
+ info_types: Type of information of the quasi_id in order to provide a
+ statistical model of population.
+ region_code: The ISO 3166-1 region code that the data is representative
+ of. Can be omitted if using a region-specific infoType (such as
+ US_ZIP_5)
+ timeout: The number of seconds to wait for a response from the API.
+
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library.
+ import google.cloud.dlp
+
+ # This sample additionally uses Cloud Pub/Sub to receive results from
+ # potentially long-running operations.
+ import google.cloud.pubsub
+
+ # Create helper function for unpacking values
+ def get_values(obj):
+ return int(obj.integer_value)
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+ parent = dlp.location_path(project, 'global')
+
+ # Location info of the BigQuery table.
+ source_table = {
+ "project_id": table_project_id,
+ "dataset_id": dataset_id,
+ "table_id": table_id,
+ }
+
+ # Check that numbers of quasi-ids and info types are equal
+ if len(quasi_ids) != len(info_types):
+        raise ValueError(
+            "Number of infoTypes and number of quasi-identifiers must be "
+            "equal!"
+        )
+
+ # Convert quasi id list to Protobuf type
+ def map_fields(quasi_id, info_type):
+ return {"field": {"name": quasi_id}, "info_type": {"name": info_type}}
+
+ quasi_ids = map(map_fields, quasi_ids, info_types)
+
+ # Tell the API where to send a notification when the job is complete.
+ actions = [{"pub_sub": {"topic": topic}}]
+
+ # Configure risk analysis job
+ # Give the name of the numeric column to compute risk metrics for
+ risk_job = {
+ "privacy_metric": {
+ "k_map_estimation_config": {
+ "quasi_ids": quasi_ids,
+ "region_code": region_code,
+ }
+ },
+ "source_table": source_table,
+ "actions": actions,
+ }
+
+ # Call API to start risk analysis job
+ operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+ def callback(message):
+ if message.attributes["DlpJobName"] == operation.name:
+ # This is the message we're looking for, so acknowledge it.
+ message.ack()
+
+ # Now that the job is done, fetch the results and print them.
+ job = dlp.get_dlp_job(operation.name)
+ histogram_buckets = (
+ job.risk_details.k_map_estimation_result.k_map_estimation_histogram
+ )
+ # Print bucket stats
+ for i, bucket in enumerate(histogram_buckets):
+ print("Bucket {}:".format(i))
+ print(
+ " Anonymity range: [{}, {}]".format(
+ bucket.min_anonymity, bucket.max_anonymity
+ )
+ )
+ print(" Size: {}".format(bucket.bucket_size))
+ for value_bucket in bucket.bucket_values:
+ print(
+ " Values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+ )
+ )
+ print(
+ " Estimated k-map anonymity: {}".format(
+ value_bucket.estimated_anonymity
+ )
+ )
+ subscription.set_result(None)
+ else:
+ # This is not the message we're looking for.
+ message.drop()
+
+ # Create a Pub/Sub client and find the subscription. The subscription is
+ # expected to already be listening to the topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ subscription_path = subscriber.subscription_path(project, subscription_id)
+ subscription = subscriber.subscribe(subscription_path, callback)
+
+ try:
+ subscription.result(timeout=timeout)
+ except TimeoutError:
+ print(
+ "No event received before the timeout. Please verify that the "
+ "subscription provided is subscribed to the topic provided."
+ )
+ subscription.close()
+
+
+# [END dlp_k_map]
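+
+# A minimal usage sketch; ids are placeholders, and each quasi-id column is
+# paired positionally with an infoType:
+#
+#   k_map_estimate_analysis(
+#       "my-project", "my-table-project", "my_dataset", "my_table",
+#       "my-topic", "my-subscription", ["gender", "age"],
+#       ["GENDER", "AGE"], region_code="US")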
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="content", help="Select how to submit content to the API."
+ )
+ subparsers.required = True
+
+ numerical_parser = subparsers.add_parser("numerical", help="")
+ numerical_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ numerical_parser.add_argument(
+ "table_project_id",
+ help="The Google Cloud project id where the BigQuery table is stored.",
+ )
+ numerical_parser.add_argument(
+ "dataset_id", help="The id of the dataset to inspect."
+ )
+ numerical_parser.add_argument(
+ "table_id", help="The id of the table to inspect."
+ )
+ numerical_parser.add_argument(
+ "column_name",
+ help="The name of the column to compute risk metrics for.",
+ )
+ numerical_parser.add_argument(
+ "topic_id",
+ help="The name of the Pub/Sub topic to notify once the job completes.",
+ )
+ numerical_parser.add_argument(
+ "subscription_id",
+ help="The name of the Pub/Sub subscription to use when listening for"
+ "job completion notifications.",
+ )
+ numerical_parser.add_argument(
+ "--timeout",
+ type=int,
+ help="The number of seconds to wait for a response from the API.",
+ )
+
+ categorical_parser = subparsers.add_parser("categorical", help="")
+ categorical_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ categorical_parser.add_argument(
+ "table_project_id",
+ help="The Google Cloud project id where the BigQuery table is stored.",
+ )
+ categorical_parser.add_argument(
+ "dataset_id", help="The id of the dataset to inspect."
+ )
+ categorical_parser.add_argument(
+ "table_id", help="The id of the table to inspect."
+ )
+ categorical_parser.add_argument(
+ "column_name",
+ help="The name of the column to compute risk metrics for.",
+ )
+ categorical_parser.add_argument(
+ "topic_id",
+ help="The name of the Pub/Sub topic to notify once the job completes.",
+ )
+ categorical_parser.add_argument(
+ "subscription_id",
+ help="The name of the Pub/Sub subscription to use when listening for"
+ "job completion notifications.",
+ )
+ categorical_parser.add_argument(
+ "--timeout",
+ type=int,
+ help="The number of seconds to wait for a response from the API.",
+ )
+
+ k_anonymity_parser = subparsers.add_parser(
+ "k_anonymity",
+ help="Computes the k-anonymity of a column set in a Google BigQuery"
+ "table.",
+ )
+ k_anonymity_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ k_anonymity_parser.add_argument(
+ "table_project_id",
+ help="The Google Cloud project id where the BigQuery table is stored.",
+ )
+ k_anonymity_parser.add_argument(
+ "dataset_id", help="The id of the dataset to inspect."
+ )
+ k_anonymity_parser.add_argument(
+ "table_id", help="The id of the table to inspect."
+ )
+ k_anonymity_parser.add_argument(
+ "topic_id",
+ help="The name of the Pub/Sub topic to notify once the job completes.",
+ )
+ k_anonymity_parser.add_argument(
+ "subscription_id",
+ help="The name of the Pub/Sub subscription to use when listening for"
+ "job completion notifications.",
+ )
+ k_anonymity_parser.add_argument(
+ "quasi_ids",
+ nargs="+",
+ help="A set of columns that form a composite key.",
+ )
+ k_anonymity_parser.add_argument(
+ "--timeout",
+ type=int,
+ help="The number of seconds to wait for a response from the API.",
+ )
+
+ l_diversity_parser = subparsers.add_parser(
+ "l_diversity",
+ help="Computes the l-diversity of a column set in a Google BigQuery"
+ "table.",
+ )
+ l_diversity_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ l_diversity_parser.add_argument(
+ "table_project_id",
+ help="The Google Cloud project id where the BigQuery table is stored.",
+ )
+ l_diversity_parser.add_argument(
+ "dataset_id", help="The id of the dataset to inspect."
+ )
+ l_diversity_parser.add_argument(
+ "table_id", help="The id of the table to inspect."
+ )
+ l_diversity_parser.add_argument(
+ "topic_id",
+ help="The name of the Pub/Sub topic to notify once the job completes.",
+ )
+ l_diversity_parser.add_argument(
+ "subscription_id",
+ help="The name of the Pub/Sub subscription to use when listening for"
+ "job completion notifications.",
+ )
+ l_diversity_parser.add_argument(
+ "sensitive_attribute",
+ help="The column to measure l-diversity relative to.",
+ )
+ l_diversity_parser.add_argument(
+ "quasi_ids",
+ nargs="+",
+ help="A set of columns that form a composite key.",
+ )
+ l_diversity_parser.add_argument(
+ "--timeout",
+ type=int,
+ help="The number of seconds to wait for a response from the API.",
+ )
+
+ k_map_parser = subparsers.add_parser(
+ "k_map",
+ help="Computes the k-map risk estimation of a column set in a Google"
+ "BigQuery table.",
+ )
+ k_map_parser.add_argument(
+ "project",
+ help="The Google Cloud project id to use as a parent resource.",
+ )
+ k_map_parser.add_argument(
+ "table_project_id",
+ help="The Google Cloud project id where the BigQuery table is stored.",
+ )
+ k_map_parser.add_argument(
+ "dataset_id", help="The id of the dataset to inspect."
+ )
+ k_map_parser.add_argument(
+ "table_id", help="The id of the table to inspect."
+ )
+ k_map_parser.add_argument(
+ "topic_id",
+ help="The name of the Pub/Sub topic to notify once the job completes.",
+ )
+ k_map_parser.add_argument(
+ "subscription_id",
+ help="The name of the Pub/Sub subscription to use when listening for"
+ "job completion notifications.",
+ )
+ k_map_parser.add_argument(
+ "quasi_ids",
+ nargs="+",
+ help="A set of columns that form a composite key.",
+ )
+ k_map_parser.add_argument(
+ "-t",
+ "--info-types",
+ nargs="+",
+ help="Type of information of the quasi_id in order to provide a"
+ "statistical model of population.",
+ required=True,
+ )
+ k_map_parser.add_argument(
+ "-r",
+ "--region-code",
+ default="US",
+ help="The ISO 3166-1 region code that the data is representative of.",
+ )
+ k_map_parser.add_argument(
+ "--timeout",
+ type=int,
+ help="The number of seconds to wait for a response from the API.",
+ )
+
+ args = parser.parse_args()
+
+ if args.content == "numerical":
+ numerical_risk_analysis(
+ args.project,
+ args.table_project_id,
+ args.dataset_id,
+ args.table_id,
+ args.column_name,
+ args.topic_id,
+ args.subscription_id,
+ timeout=args.timeout,
+ )
+ elif args.content == "categorical":
+ categorical_risk_analysis(
+ args.project,
+ args.table_project_id,
+ args.dataset_id,
+ args.table_id,
+ args.column_name,
+ args.topic_id,
+ args.subscription_id,
+ timeout=args.timeout,
+ )
+ elif args.content == "k_anonymity":
+ k_anonymity_analysis(
+ args.project,
+ args.table_project_id,
+ args.dataset_id,
+ args.table_id,
+ args.topic_id,
+ args.subscription_id,
+ args.quasi_ids,
+ timeout=args.timeout,
+ )
+ elif args.content == "l_diversity":
+ l_diversity_analysis(
+ args.project,
+ args.table_project_id,
+ args.dataset_id,
+ args.table_id,
+ args.topic_id,
+ args.subscription_id,
+ args.sensitive_attribute,
+ args.quasi_ids,
+ timeout=args.timeout,
+ )
+ elif args.content == "k_map":
+ k_map_estimate_analysis(
+ args.project,
+ args.table_project_id,
+ args.dataset_id,
+ args.table_id,
+ args.topic_id,
+ args.subscription_id,
+ args.quasi_ids,
+ args.info_types,
+ region_code=args.region_code,
+ timeout=args.timeout,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/risk_test.py b/packages/google-cloud-dlp/samples/snippets/risk_test.py
new file mode 100644
index 000000000000..25d9575d4b0f
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/risk_test.py
@@ -0,0 +1,368 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.pubsub
+import pytest
+
+import risk
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TABLE_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+UNIQUE_FIELD = "Name"
+REPEATED_FIELD = "Mystery"
+NUMERIC_FIELD = "Age"
+STRING_BOOLEAN_FIELD = "Gender"
+
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING
+
+TIMEOUT = 120 # 2 minutes
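+
+# These tests assume that GOOGLE_CLOUD_PROJECT is set and that application
+# default credentials are available; a typical invocation (a sketch) is:
+#
+#   pytest risk_test.py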
+
+
+# Create a new custom topic/subscription.
+# We sometimes observe all of the tests in this file failing. One
+# hypothesis is that the DLP service somehow loses its connection to
+# the topic, so we keep the Pub/Sub fixtures narrowly scoped.
+@pytest.fixture(scope="module")
+def topic_id():
+ # Creates a Pub/Sub topic and tears it down.
+ publisher = google.cloud.pubsub.PublisherClient()
+ topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+ try:
+ publisher.create_topic(topic_path)
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield TOPIC_ID
+
+ publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+ # Subscribes to a topic.
+ subscriber = google.cloud.pubsub.SubscriberClient()
+ topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+ subscription_path = subscriber.subscription_path(
+ GCLOUD_PROJECT, SUBSCRIPTION_ID
+ )
+ try:
+ subscriber.create_subscription(subscription_path, topic_path)
+ except google.api_core.exceptions.AlreadyExists:
+ pass
+
+ yield SUBSCRIPTION_ID
+
+ subscriber.delete_subscription(subscription_path)
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+ # Adds test BigQuery data, yields the project ID, and then tears down.
+
+ bigquery_client = google.cloud.bigquery.Client()
+
+ dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID)
+ dataset = google.cloud.bigquery.Dataset(dataset_ref)
+ try:
+ dataset = bigquery_client.create_dataset(dataset)
+ except google.api_core.exceptions.Conflict:
+ dataset = bigquery_client.get_dataset(dataset)
+ table_ref = dataset_ref.table(BIGQUERY_TABLE_ID)
+ table = google.cloud.bigquery.Table(table_ref)
+
+ harmful_table_ref = dataset_ref.table(BIGQUERY_HARMFUL_TABLE_ID)
+ harmful_table = google.cloud.bigquery.Table(harmful_table_ref)
+
+ table.schema = (
+ google.cloud.bigquery.SchemaField("Name", "STRING"),
+ google.cloud.bigquery.SchemaField("Comment", "STRING"),
+ )
+
+ harmful_table.schema = (
+ google.cloud.bigquery.SchemaField("Name", "STRING", "REQUIRED"),
+ google.cloud.bigquery.SchemaField(
+ "TelephoneNumber", "STRING", "REQUIRED"
+ ),
+ google.cloud.bigquery.SchemaField("Mystery", "STRING", "REQUIRED"),
+ google.cloud.bigquery.SchemaField("Age", "INTEGER", "REQUIRED"),
+ google.cloud.bigquery.SchemaField("Gender", "STRING"),
+ google.cloud.bigquery.SchemaField("RegionCode", "STRING"),
+ )
+
+ try:
+ table = bigquery_client.create_table(table)
+ except google.api_core.exceptions.Conflict:
+ table = bigquery_client.get_table(table)
+
+ try:
+ harmful_table = bigquery_client.create_table(harmful_table)
+ except google.api_core.exceptions.Conflict:
+ harmful_table = bigquery_client.get_table(harmful_table)
+
+ rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")]
+ harmful_rows_to_insert = [
+ (
+ u"Gandalf",
+ u"(123) 456-7890",
+ "4231 5555 6781 9876",
+ 27,
+ "Male",
+ "US",
+ ),
+ (
+ u"Dumbledore",
+ u"(313) 337-1337",
+ "6291 8765 1095 7629",
+ 27,
+ "Male",
+ "US",
+ ),
+ (u"Joe", u"(452) 123-1234", "3782 2288 1166 3030", 35, "Male", "US"),
+ (u"James", u"(567) 890-1234", "8291 3627 8250 1234", 19, "Male", "US"),
+ (
+ u"Marie",
+ u"(452) 123-1234",
+ "8291 3627 8250 1234",
+ 35,
+ "Female",
+ "US",
+ ),
+ (
+ u"Carrie",
+ u"(567) 890-1234",
+ "2253 5218 4251 4526",
+ 35,
+ "Female",
+ "US",
+ ),
+ ]
+
+ bigquery_client.insert_rows(table, rows_to_insert)
+ bigquery_client.insert_rows(harmful_table, harmful_rows_to_insert)
+ yield GCLOUD_PROJECT
+
+ bigquery_client.delete_dataset(dataset_ref, delete_contents=True)
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_numerical_risk_analysis(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.numerical_risk_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ NUMERIC_FIELD,
+ topic_id,
+ subscription_id,
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Value Range:" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_categorical_risk_analysis_on_string_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.categorical_risk_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ UNIQUE_FIELD,
+ topic_id,
+ subscription_id,
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Most common value occurs" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_categorical_risk_analysis_on_number_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.categorical_risk_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ NUMERIC_FIELD,
+ topic_id,
+ subscription_id,
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Most common value occurs" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_k_anonymity_analysis_single_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.k_anonymity_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ [NUMERIC_FIELD],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Quasi-ID values:" in out
+ assert "Class size:" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_k_anonymity_analysis_multiple_fields(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.k_anonymity_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ [NUMERIC_FIELD, REPEATED_FIELD],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Quasi-ID values:" in out
+ assert "Class size:" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_l_diversity_analysis_single_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.l_diversity_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ UNIQUE_FIELD,
+ [NUMERIC_FIELD],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Quasi-ID values:" in out
+ assert "Class size:" in out
+ assert "Sensitive value" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_l_diversity_analysis_multiple_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.l_diversity_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ UNIQUE_FIELD,
+ [NUMERIC_FIELD, REPEATED_FIELD],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Quasi-ID values:" in out
+ assert "Class size:" in out
+ assert "Sensitive value" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_k_map_estimate_analysis_single_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.k_map_estimate_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ [NUMERIC_FIELD],
+ ["AGE"],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Anonymity range:" in out
+ assert "Size:" in out
+ assert "Values" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_k_map_estimate_analysis_multiple_field(
+ topic_id, subscription_id, bigquery_project, capsys
+):
+ risk.k_map_estimate_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ [NUMERIC_FIELD, STRING_BOOLEAN_FIELD],
+ ["AGE", "GENDER"],
+ timeout=TIMEOUT,
+ )
+
+ out, _ = capsys.readouterr()
+ assert "Anonymity range:" in out
+ assert "Size:" in out
+ assert "Values" in out
+
+
+@pytest.mark.flaky(max_runs=3, min_passes=1)
+def test_k_map_estimate_analysis_quasi_ids_info_types_equal(
+ topic_id, subscription_id, bigquery_project
+):
+ with pytest.raises(ValueError):
+ risk.k_map_estimate_analysis(
+ GCLOUD_PROJECT,
+ TABLE_PROJECT,
+ BIGQUERY_DATASET_ID,
+ BIGQUERY_HARMFUL_TABLE_ID,
+ topic_id,
+ subscription_id,
+ [NUMERIC_FIELD, STRING_BOOLEAN_FIELD],
+ ["AGE"],
+ timeout=TIMEOUT,
+ )
diff --git a/packages/google-cloud-dlp/samples/snippets/templates.py b/packages/google-cloud-dlp/samples/snippets/templates.py
new file mode 100644
index 000000000000..2d9f8137d5d1
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/templates.py
@@ -0,0 +1,266 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that sets up Data Loss Prevention API inspect templates."""
+
+from __future__ import print_function
+
+import argparse
+import os
+import time
+
+
+# [START dlp_create_template]
+def create_inspect_template(
+ project,
+ info_types,
+ template_id=None,
+ display_name=None,
+ min_likelihood=None,
+ max_findings=None,
+ include_quote=None,
+):
+ """Creates a Data Loss Prevention API inspect template.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ template_id: The id of the template. If omitted, an id will be randomly
+ generated.
+ display_name: The optional display name of the template.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ include_quote: Boolean for whether to display a quote of the detected
+ information in the results.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "min_likelihood": min_likelihood,
+ "include_quote": include_quote,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ inspect_template = {
+ "inspect_config": inspect_config,
+ "display_name": display_name,
+ }
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.create_inspect_template(
+ parent, inspect_template=inspect_template, template_id=template_id
+ )
+
+ print("Successfully created template {}".format(response.name))
+
+
+# [END dlp_create_template]
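+
+# For example (a sketch; the project and template ids are placeholders):
+#
+#   create_inspect_template(
+#       "my-project",
+#       ["EMAIL_ADDRESS", "PHONE_NUMBER"],
+#       template_id="my-inspect-template",
+#   )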
+
+
+# [START dlp_list_templates]
+def list_inspect_templates(project):
+ """Lists all Data Loss Prevention API inspect templates.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.list_inspect_templates(parent)
+
+ # Define a helper function to convert the API's "seconds since the epoch"
+ # time format into a human-readable string.
+ def human_readable_time(timestamp):
+ return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp.seconds))
+
+ for template in response:
+ print("Template {}:".format(template.name))
+ if template.display_name:
+ print(" Display Name: {}".format(template.display_name))
+ print(
+ " Created: {}".format(human_readable_time(template.create_time))
+ )
+ print(
+ " Updated: {}".format(human_readable_time(template.update_time))
+ )
+
+ config = template.inspect_config
+ print(
+ " InfoTypes: {}".format(
+ ", ".join([it.name for it in config.info_types])
+ )
+ )
+ print(" Minimum likelihood: {}".format(config.min_likelihood))
+ print(" Include quotes: {}".format(config.include_quote))
+ print(
+ " Max findings per request: {}".format(
+ config.limits.max_findings_per_request
+ )
+ )
+
+
+# [END dlp_list_templates]
+
+
+# [START dlp_delete_template]
+def delete_inspect_template(project, template_id):
+ """Deletes a Data Loss Prevention API template.
+ Args:
+ project: The id of the Google Cloud project which owns the template.
+ template_id: The id of the template to delete.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Combine the template id with the parent id.
+ template_resource = "{}/inspectTemplates/{}".format(parent, template_id)
+
+ # Call the API.
+ dlp.delete_inspect_template(template_resource)
+
+ print("Template {} successfully deleted.".format(template_resource))
+
+
+# [END dlp_delete_template]
+
+
+if __name__ == "__main__":
+ default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="action", help="Select which action to perform."
+ )
+ subparsers.required = True
+
+ parser_create = subparsers.add_parser("create", help="Create a template.")
+ parser_create.add_argument(
+ "--template_id",
+ help="The id of the template. If omitted, an id will be randomly "
+ "generated",
+ )
+ parser_create.add_argument(
+ "--display_name", help="The optional display name of the template."
+ )
+ parser_create.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_create.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_create.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_create.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_create.add_argument(
+ "--include_quote",
+ type=bool,
+ help="A boolean for whether to display a quote of the detected "
+ "information in the results.",
+ default=True,
+ )
+
+ parser_list = subparsers.add_parser("list", help="List all templates.")
+ parser_list.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+
+ parser_delete = subparsers.add_parser("delete", help="Delete a template.")
+ parser_delete.add_argument(
+ "template_id", help="The id of the template to delete."
+ )
+ parser_delete.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+
+ args = parser.parse_args()
+
+ if args.action == "create":
+ create_inspect_template(
+ args.project,
+ args.info_types,
+ template_id=args.template_id,
+ display_name=args.display_name,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ include_quote=args.include_quote,
+ )
+ elif args.action == "list":
+ list_inspect_templates(args.project)
+ elif args.action == "delete":
+ delete_inspect_template(args.project, args.template_id)
diff --git a/packages/google-cloud-dlp/samples/snippets/templates_test.py b/packages/google-cloud-dlp/samples/snippets/templates_test.py
new file mode 100644
index 000000000000..f8d22118bfcd
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/templates_test.py
@@ -0,0 +1,60 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+
+import templates
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_TEMPLATE_ID = "test-template" + UNIQUE_STRING
+
+
+def test_create_list_and_delete_template(capsys):
+ try:
+ templates.create_inspect_template(
+ GCLOUD_PROJECT,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ template_id=TEST_TEMPLATE_ID,
+ )
+ except google.api_core.exceptions.InvalidArgument:
+ # Template already exists, perhaps due to a previous interrupted test.
+ templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TEMPLATE_ID in out
+
+ # Try again and move on.
+ templates.create_inspect_template(
+ GCLOUD_PROJECT,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ template_id=TEST_TEMPLATE_ID,
+ )
+
+ out, _ = capsys.readouterr()
+ assert TEST_TEMPLATE_ID in out
+
+ templates.list_inspect_templates(GCLOUD_PROJECT)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TEMPLATE_ID in out
+
+ templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TEMPLATE_ID in out
diff --git a/packages/google-cloud-dlp/samples/snippets/triggers.py b/packages/google-cloud-dlp/samples/snippets/triggers.py
new file mode 100644
index 000000000000..7548ab893db8
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/triggers.py
@@ -0,0 +1,297 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Sample app that sets up Data Loss Prevention API automation triggers."""
+
+from __future__ import print_function
+
+import argparse
+import os
+import time
+
+
+# [START dlp_create_trigger]
+def create_trigger(
+ project,
+ bucket,
+ scan_period_days,
+ info_types,
+ trigger_id=None,
+ display_name=None,
+ description=None,
+ min_likelihood=None,
+ max_findings=None,
+ auto_populate_timespan=False,
+):
+ """Creates a scheduled Data Loss Prevention API inspect_content trigger.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ bucket: The name of the GCS bucket to scan. This sample scans all
+ files in the bucket using a wildcard.
+ scan_period_days: How often to repeat the scan, in days.
+ The minimum is 1 day.
+ info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API.
+ trigger_id: The id of the trigger. If omitted, an id will be randomly
+ generated.
+ display_name: The optional display name of the trigger.
+ description: The optional description of the trigger.
+ min_likelihood: A string representing the minimum likelihood threshold
+ that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+ 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+ max_findings: The maximum number of findings to report; 0 = no maximum.
+ auto_populate_timespan: Automatically populates time span config start
+ and end times in order to scan new content only.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Prepare info_types by converting the list of strings into a list of
+ # dictionaries (protos are also accepted).
+ info_types = [{"name": info_type} for info_type in info_types]
+
+ # Construct the configuration dictionary. Keys which are None may
+ # optionally be omitted entirely.
+ inspect_config = {
+ "info_types": info_types,
+ "min_likelihood": min_likelihood,
+ "limits": {"max_findings_per_request": max_findings},
+ }
+
+ # Construct a cloud_storage_options dictionary with the bucket's URL.
+ url = "gs://{}/*".format(bucket)
+ storage_config = {
+ "cloud_storage_options": {"file_set": {"url": url}},
+ # Time-based configuration for each storage object.
+ "timespan_config": {
+ # Auto-populate start and end times in order to scan new objects
+ # only.
+ "enable_auto_population_of_timespan_config": auto_populate_timespan
+ },
+ }
+
+ # Construct the job definition.
+ job = {"inspect_config": inspect_config, "storage_config": storage_config}
+
+ # Construct the schedule definition:
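+ # (recurrence is expressed in seconds, so scan_period_days is scaled
+ # by 60 * 60 * 24; e.g. scan_period_days=1 gives 86400 seconds.)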
+ schedule = {
+ "recurrence_period_duration": {
+ "seconds": scan_period_days * 60 * 60 * 24
+ }
+ }
+
+ # Construct the trigger definition.
+ job_trigger = {
+ "inspect_job": job,
+ "display_name": display_name,
+ "description": description,
+ "triggers": [{"schedule": schedule}],
+ "status": "HEALTHY",
+ }
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.create_job_trigger(
+ parent, job_trigger=job_trigger, trigger_id=trigger_id
+ )
+
+ print("Successfully created trigger {}".format(response.name))
+
+
+# [END dlp_create_trigger]
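+
+# For example (a sketch; the bucket and ids are placeholders):
+#
+#   create_trigger(
+#       "my-project",
+#       "my-bucket",
+#       7,
+#       ["EMAIL_ADDRESS"],
+#       trigger_id="my-trigger",
+#       auto_populate_timespan=True,
+#   )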
+
+
+# [START dlp_list_triggers]
+def list_triggers(project):
+ """Lists all Data Loss Prevention API triggers.
+ Args:
+ project: The Google Cloud project id to use as a parent resource.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Call the API.
+ response = dlp.list_job_triggers(parent)
+
+ # Define a helper function to convert the API's "seconds since the epoch"
+ # time format into a human-readable string.
+ def human_readable_time(timestamp):
+ return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp.seconds))
+
+ for trigger in response:
+ print("Trigger {}:".format(trigger.name))
+ print(" Created: {}".format(human_readable_time(trigger.create_time)))
+ print(" Updated: {}".format(human_readable_time(trigger.update_time)))
+ if trigger.display_name:
+ print(" Display Name: {}".format(trigger.display_name))
+ if trigger.description:
+ print(" Description: {}".format(trigger.discription))
+ print(" Status: {}".format(trigger.status))
+ print(" Error count: {}".format(len(trigger.errors)))
+
+
+# [END dlp_list_triggers]
+
+
+# [START dlp_delete_trigger]
+def delete_trigger(project, trigger_id):
+ """Deletes a Data Loss Prevention API trigger.
+ Args:
+ project: The id of the Google Cloud project which owns the trigger.
+ trigger_id: The id of the trigger to delete.
+ Returns:
+ None; the response from the API is printed to the terminal.
+ """
+
+ # Import the client library
+ import google.cloud.dlp
+
+ # Instantiate a client.
+ dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+ # Convert the project id into a full resource id.
+ parent = dlp.project_path(project)
+
+ # Combine the trigger id with the parent id.
+ trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id)
+
+ # Call the API.
+ dlp.delete_job_trigger(trigger_resource)
+
+ print("Trigger {} successfully deleted.".format(trigger_resource))
+
+
+# [END dlp_delete_trigger]
+
+
+if __name__ == "__main__":
+ default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+ parser = argparse.ArgumentParser(description=__doc__)
+ subparsers = parser.add_subparsers(
+ dest="action", help="Select which action to perform."
+ )
+ subparsers.required = True
+
+ parser_create = subparsers.add_parser("create", help="Create a trigger.")
+ parser_create.add_argument(
+ "bucket", help="The name of the GCS bucket containing the file."
+ )
+ parser_create.add_argument(
+ "scan_period_days",
+ type=int,
+ help="How often to repeat the scan, in days. The minimum is 1 day.",
+ )
+ parser_create.add_argument(
+ "--trigger_id",
+ help="The id of the trigger. If omitted, an id will be randomly "
+ "generated",
+ )
+ parser_create.add_argument(
+ "--display_name", help="The optional display name of the trigger."
+ )
+ parser_create.add_argument(
+ "--description", help="The optional description of the trigger."
+ )
+ parser_create.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+ parser_create.add_argument(
+ "--info_types",
+ nargs="+",
+ help="Strings representing info types to look for. A full list of "
+ "info categories and types is available from the API. Examples "
+ 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
+ "If unspecified, the three above examples will be used.",
+ default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
+ )
+ parser_create.add_argument(
+ "--min_likelihood",
+ choices=[
+ "LIKELIHOOD_UNSPECIFIED",
+ "VERY_UNLIKELY",
+ "UNLIKELY",
+ "POSSIBLE",
+ "LIKELY",
+ "VERY_LIKELY",
+ ],
+ help="A string representing the minimum likelihood threshold that "
+ "constitutes a match.",
+ )
+ parser_create.add_argument(
+ "--max_findings",
+ type=int,
+ help="The maximum number of findings to report; 0 = no maximum.",
+ )
+ parser_create.add_argument(
+ "--auto_populate_timespan",
+ action="store_true",
+ help="Limit scan to new content only.",
+ )
+
+ parser_list = subparsers.add_parser("list", help="List all triggers.")
+ parser_list.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+
+ parser_delete = subparsers.add_parser("delete", help="Delete a trigger.")
+ parser_delete.add_argument(
+ "trigger_id", help="The id of the trigger to delete."
+ )
+ parser_delete.add_argument(
+ "--project",
+ help="The Google Cloud project id to use as a parent resource.",
+ default=default_project,
+ )
+
+ args = parser.parse_args()
+
+ if args.action == "create":
+ create_trigger(
+ args.project,
+ args.bucket,
+ args.scan_period_days,
+ args.info_types,
+ trigger_id=args.trigger_id,
+ display_name=args.display_name,
+ description=args.description,
+ min_likelihood=args.min_likelihood,
+ max_findings=args.max_findings,
+ auto_populate_timespan=args.auto_populate_timespan,
+ )
+ elif args.action == "list":
+ list_triggers(args.project)
+ elif args.action == "delete":
+ delete_trigger(args.project, args.trigger_id)
diff --git a/packages/google-cloud-dlp/samples/snippets/triggers_test.py b/packages/google-cloud-dlp/samples/snippets/triggers_test.py
new file mode 100644
index 000000000000..dc219d88c7a9
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/triggers_test.py
@@ -0,0 +1,103 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.exceptions
+import google.cloud.storage
+
+import pytest
+
+import triggers
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TEST_TRIGGER_ID = "test-trigger" + UNIQUE_STRING
+
+
+@pytest.fixture(scope="module")
+def bucket():
+ # Creates a GCS bucket, uploads files required for the test, and tears down
+ # the entire bucket afterwards.
+
+ client = google.cloud.storage.Client()
+ try:
+ bucket = client.get_bucket(TEST_BUCKET_NAME)
+ except google.cloud.exceptions.NotFound:
+ bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+ # Upload the blobs and keep track of them in a list.
+ blobs = []
+ for name in RESOURCE_FILE_NAMES:
+ path = os.path.join(RESOURCE_DIRECTORY, name)
+ blob = bucket.blob(name)
+ blob.upload_from_filename(path)
+ blobs.append(blob)
+
+ # Yield the object to the test; lines after this execute as a teardown.
+ yield bucket
+
+ # Delete the files.
+ for blob in blobs:
+ try:
+ blob.delete()
+ except google.cloud.exceptions.NotFound:
+ print("Issue during teardown, missing blob")
+
+ # Attempt to delete the bucket; this will only work if it is empty.
+ bucket.delete()
+
+
+def test_create_list_and_delete_trigger(bucket, capsys):
+ try:
+ triggers.create_trigger(
+ GCLOUD_PROJECT,
+ bucket.name,
+ 7,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ trigger_id=TEST_TRIGGER_ID,
+ )
+ except google.api_core.exceptions.InvalidArgument:
+ # Trigger already exists, perhaps due to a previous interrupted test.
+ triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TRIGGER_ID in out
+
+ # Try again and move on.
+ triggers.create_trigger(
+ GCLOUD_PROJECT,
+ bucket.name,
+ 7,
+ ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+ trigger_id=TEST_TRIGGER_ID,
+ auto_populate_timespan=True,
+ )
+
+ out, _ = capsys.readouterr()
+ assert TEST_TRIGGER_ID in out
+
+ triggers.list_triggers(GCLOUD_PROJECT)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TRIGGER_ID in out
+
+ triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+ out, _ = capsys.readouterr()
+ assert TEST_TRIGGER_ID in out
diff --git a/packages/google-cloud-dlp/scripts/decrypt-secrets.sh b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
new file mode 100755
index 000000000000..ff599eb2af25
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ROOT=$( dirname "$DIR" )
+
+# Work from the project root.
+cd "$ROOT"
+
+# Use SECRET_MANAGER_PROJECT if set, fallback to cloud-devrel-kokoro-resources.
+PROJECT_ID="${SECRET_MANAGER_PROJECT:-cloud-devrel-kokoro-resources}"
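+
+# The three files fetched below are consumed by the sample tests;
+# testing/.gitignore keeps them out of version control.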
+
+gcloud secrets versions access latest --secret="python-docs-samples-test-env" \
+ > testing/test-env.sh
+gcloud secrets versions access latest \
+ --secret="python-docs-samples-service-account" \
+ > testing/service-account.json
+gcloud secrets versions access latest \
+ --secret="python-docs-samples-client-secrets" \
+ > testing/client-secrets.json
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
new file mode 100644
index 000000000000..d309d6e97518
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Google Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generates READMEs using configuration defined in yaml."""
+
+import argparse
+import io
+import os
+import subprocess
+
+import jinja2
+import yaml
+
+
+jinja_env = jinja2.Environment(
+ trim_blocks=True,
+ loader=jinja2.FileSystemLoader(
+ os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates'))))
+
+README_TMPL = jinja_env.get_template('README.tmpl.rst')
+
+
+def get_help(file):
+ return subprocess.check_output(['python', file, '--help']).decode()
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('source')
+ parser.add_argument('--destination', default='README.rst')
+
+ args = parser.parse_args()
+
+ source = os.path.abspath(args.source)
+ root = os.path.dirname(source)
+ destination = os.path.join(root, args.destination)
+
+ jinja_env.globals['get_help'] = get_help
+
+ with io.open(source, 'r') as f:
+ config = yaml.safe_load(f)
+
+ # This allows get_help to execute in the right directory.
+ os.chdir(root)
+
+ output = README_TMPL.render(config)
+
+ with io.open(destination, 'w') as f:
+ f.write(output)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
new file mode 100644
index 000000000000..4fd239765b0a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
@@ -0,0 +1,87 @@
+{# The following line is a lie. BUT! Once jinja2 is done with it, it will
+ become truth! #}
+.. This file is automatically generated. Do not edit this file directly.
+
+{{product.name}} Python Samples
+===============================================================================
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/README.rst
+
+
+This directory contains samples for {{product.name}}. {{product.description}}
+
+{{description}}
+
+.. _{{product.name}}: {{product.url}}
+
+{% if required_api_url %}
+To run the sample, you need to enable the API at: {{required_api_url}}
+{% endif %}
+
+{% if required_role %}
+To run the sample, you need to have `{{required_role}}` role.
+{% endif %}
+
+{{other_required_steps}}
+
+{% if setup %}
+Setup
+-------------------------------------------------------------------------------
+
+{% for section in setup %}
+
+{% include section + '.tmpl.rst' %}
+
+{% endfor %}
+{% endif %}
+
+{% if samples %}
+Samples
+-------------------------------------------------------------------------------
+
+{% for sample in samples %}
+{{sample.name}}
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+
+{% if not sample.hide_cloudshell_button %}
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+ :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/{{sample.file}},{{folder}}/README.rst
+{% endif %}
+
+
+{{sample.description}}
+
+To run this sample:
+
+.. code-block:: bash
+
+ $ python {{sample.file}}
+{% if sample.show_help %}
+
+ {{get_help(sample.file)|indent}}
+{% endif %}
+
+
+{% endfor %}
+{% endif %}
+
+{% if cloud_client_library %}
+
+The client library
+-------------------------------------------------------------------------------
+
+This sample uses the `Google Cloud Client Library for Python`_.
+You can read the documentation for more details on API usage and use GitHub
+to `browse the source`_ and `report issues`_.
+
+.. _Google Cloud Client Library for Python:
+ https://googlecloudplatform.github.io/google-cloud-python/
+.. _browse the source:
+ https://github.com/GoogleCloudPlatform/google-cloud-python
+.. _report issues:
+ https://github.com/GoogleCloudPlatform/google-cloud-python/issues
+
+{% endif %}
+
+.. _Google Cloud SDK: https://cloud.google.com/sdk/
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst
new file mode 100644
index 000000000000..1446b94a5e3a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst
@@ -0,0 +1,9 @@
+Authentication
+++++++++++++++
+
+This sample requires you to have authentication set up. Refer to the
+`Authentication Getting Started Guide`_ for instructions on setting up
+credentials for applications.
+
+.. _Authentication Getting Started Guide:
+ https://cloud.google.com/docs/authentication/getting-started
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst
new file mode 100644
index 000000000000..11957ce2714a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst
@@ -0,0 +1,14 @@
+Authentication
+++++++++++++++
+
+Authentication for this service is done via an `API Key`_. To obtain an API
+Key:
+
+1. Open the `Cloud Platform Console`_
+2. Make sure that billing is enabled for your project.
+3. From the **Credentials** page, create a new **API Key** or use an existing
+ one for your project.
+
+.. _API Key:
+ https://developers.google.com/api-client-library/python/guide/aaa_apikeys
+.. _Cloud Platform Console: https://console.cloud.google.com/project?_
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
new file mode 100644
index 000000000000..a0406dba8c84
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
@@ -0,0 +1,29 @@
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+ .. code-block:: bash
+
+ $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+ .. _Python Development Environment Setup Guide:
+ https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+.
+
+ .. code-block:: bash
+
+ $ virtualenv env
+ $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+ .. code-block:: bash
+
+ $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
new file mode 100644
index 000000000000..5ea33d18c00c
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
@@ -0,0 +1,35 @@
+Install PortAudio
++++++++++++++++++
+
+Install `PortAudio`_. This is required by the `PyAudio`_ library to stream
+audio from your computer's microphone. PyAudio depends on PortAudio for cross-platform compatibility, and is installed differently depending on the
+platform.
+
+* For Mac OS X, you can use `Homebrew`_::
+
+ brew install portaudio
+
+ **Note**: if you encounter an error when running `pip install` that indicates
+ it can't find `portaudio.h`, try running `pip install` with the following
+ flags::
+
+ pip install --global-option='build_ext' \
+ --global-option='-I/usr/local/include' \
+ --global-option='-L/usr/local/lib' \
+ pyaudio
+
+* For Debian / Ubuntu Linux::
+
+ apt-get install portaudio19-dev python-all-dev
+
+* Windows may work without having to install PortAudio explicitly (it will get
+ installed with PyAudio).
+
+For more details, see the `PyAudio installation`_ page.
+
+
+.. _PyAudio: https://people.csail.mit.edu/hubert/pyaudio/
+.. _PortAudio: http://www.portaudio.com/
+.. _PyAudio installation:
+ https://people.csail.mit.edu/hubert/pyaudio/#downloads
+.. _Homebrew: http://brew.sh
diff --git a/packages/google-cloud-dlp/synth.metadata b/packages/google-cloud-dlp/synth.metadata
index be2c13723c6f..0ebb8d417d79 100644
--- a/packages/google-cloud-dlp/synth.metadata
+++ b/packages/google-cloud-dlp/synth.metadata
@@ -4,22 +4,21 @@
"git": {
"name": ".",
"remote": "https://github.com/googleapis/python-dlp.git",
- "sha": "7973a441ae2226ce7c597cb5e7eebfa0e38cd94b"
+ "sha": "973bcc3783029e9b45b23fa13e52bcab4b6f2630"
}
},
{
"git": {
- "name": "googleapis",
- "remote": "https://github.com/googleapis/googleapis.git",
- "sha": "dec3204175104cef49bf21d685d5517caaf0058f",
- "internalRef": "312689208"
+ "name": "synthtool",
+ "remote": "https://github.com/googleapis/synthtool.git",
+ "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e"
}
},
{
"git": {
"name": "synthtool",
"remote": "https://github.com/googleapis/synthtool.git",
- "sha": "d2364eb80b840a36136c8ce12f1c6efabcc9600e"
+ "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e"
}
}
],
diff --git a/packages/google-cloud-dlp/synth.py b/packages/google-cloud-dlp/synth.py
index a6daaa883338..802c4faa7c7a 100644
--- a/packages/google-cloud-dlp/synth.py
+++ b/packages/google-cloud-dlp/synth.py
@@ -16,6 +16,7 @@
import synthtool as s
import synthtool.gcp as gcp
+from synthtool.languages import python
import logging
logging.basicConfig(level=logging.DEBUG)
@@ -259,8 +260,17 @@
# Add templated files
# ----------------------------------------------------------------------------
templated_files = common.py_library(
- cov_level=73, system_test_dependencies=["test_utils"]
+ cov_level=73, system_test_dependencies=["test_utils"], samples=True
)
s.move(templated_files)
+# ----------------------------------------------------------------------------
+# Samples templates
+# ----------------------------------------------------------------------------
+python.py_samples()
+
+# Temporarily disable warnings due to
+# https://github.com/googleapis/gapic-generator-python/issues/525
+s.replace("noxfile.py", '[\"\']-W[\"\']', '# "-W"')
+
s.shell.run(["nox", "-s", "blacken"], hide_output=False)
diff --git a/packages/google-cloud-dlp/testing/.gitignore b/packages/google-cloud-dlp/testing/.gitignore
new file mode 100644
index 000000000000..b05fbd630881
--- /dev/null
+++ b/packages/google-cloud-dlp/testing/.gitignore
@@ -0,0 +1,3 @@
+test-env.sh
+service-account.json
+client-secrets.json
\ No newline at end of file