- On January 1, 2020 this library will no longer support Python 2 on the latest released version. - Previously released library versions will continue to be available. For more information please + As of January 1, 2020 this library no longer supports Python 2 on the latest released version. + Library versions released prior to that date will continue to be available. For more information please visit Python 2 support on Google Cloud.
{% block body %} {% endblock %} diff --git a/packages/google-cloud-dlp/docs/conf.py b/packages/google-cloud-dlp/docs/conf.py index fc9991d1ec40..cc9cc3485b21 100644 --- a/packages/google-cloud-dlp/docs/conf.py +++ b/packages/google-cloud-dlp/docs/conf.py @@ -20,6 +20,10 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath("..")) +# For plugins that can not read conf.py. +# See also: https://github.com/docascode/sphinx-docfx-yaml/issues/85 +sys.path.insert(0, os.path.abspath(".")) + __version__ = "" # -- General configuration ------------------------------------------------ @@ -38,21 +42,18 @@ "sphinx.ext.napoleon", "sphinx.ext.todo", "sphinx.ext.viewcode", + "recommonmark", ] # autodoc/autosummary flags autoclass_content = "both" -autodoc_default_flags = ["members"] +autodoc_default_options = {"members": True} autosummary_generate = True # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] -# Allow markdown includes (so releases.md can include CHANGLEOG.md) -# http://www.sphinx-doc.org/en/master/markdown.html -source_parsers = {".md": "recommonmark.parser.CommonMarkParser"} - # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] @@ -93,7 +94,12 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build"] +exclude_patterns = [ + "_build", + "samples/AUTHORING_GUIDE.md", + "samples/CONTRIBUTING.md", + "samples/snippets/README.rst", +] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -293,7 +299,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1) + (master_doc, "google-cloud-dlp", u"google-cloud-dlp Documentation", [author], 1,) ] # If true, show URL addresses after external links. @@ -334,7 +340,7 @@ intersphinx_mapping = { "python": ("http://python.readthedocs.org/en/latest/", None), "google-auth": ("https://google-auth.readthedocs.io/en/stable", None), - "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None), + "google.api_core": ("https://googleapis.dev/python/google-api-core/latest/", None,), "grpc": ("https://grpc.io/grpc/python/", None), } diff --git a/packages/google-cloud-dlp/noxfile.py b/packages/google-cloud-dlp/noxfile.py index cfaff4be5040..e27f448fbad6 100644 --- a/packages/google-cloud-dlp/noxfile.py +++ b/packages/google-cloud-dlp/noxfile.py @@ -23,14 +23,15 @@ import nox -BLACK_VERSION = "black==19.3b0" +BLACK_VERSION = "black==19.10b0" BLACK_PATHS = ["docs", "google", "tests", "noxfile.py", "setup.py"] -if os.path.exists("samples"): - BLACK_PATHS.append("samples") +DEFAULT_PYTHON_VERSION = "3.8" +SYSTEM_TEST_PYTHON_VERSIONS = ["2.7", "3.8"] +UNIT_TEST_PYTHON_VERSIONS = ["2.7", "3.5", "3.6", "3.7", "3.8"] -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint(session): """Run linters. @@ -38,7 +39,9 @@ def lint(session): serious code quality issues. 
""" session.install("flake8", BLACK_VERSION) - session.run("black", "--check", *BLACK_PATHS) + session.run( + "black", "--check", *BLACK_PATHS, + ) session.run("flake8", "google", "tests") @@ -53,10 +56,12 @@ def blacken(session): check the state of the `gcp_ubuntu_config` we use for that Kokoro run. """ session.install(BLACK_VERSION) - session.run("black", *BLACK_PATHS) + session.run( + "black", *BLACK_PATHS, + ) -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def lint_setup_py(session): """Verify that setup.py is valid (including RST check).""" session.install("docutils", "pygments") @@ -84,17 +89,21 @@ def default(session): ) -@nox.session(python=["2.7", "3.5", "3.6", "3.7", "3.8"]) +@nox.session(python=UNIT_TEST_PYTHON_VERSIONS) def unit(session): """Run the unit test suite.""" default(session) -@nox.session(python=["2.7", "3.7"]) +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def system(session): """Run the system test suite.""" system_test_path = os.path.join("tests", "system.py") system_test_folder_path = os.path.join("tests", "system") + + # Check the value of `RUN_SYSTEM_TESTS` env var. It defaults to true. + if os.environ.get("RUN_SYSTEM_TESTS", "true") == "false": + session.skip("RUN_SYSTEM_TESTS is set to false, skipping") # Sanity check: Only run tests if the environment variable is set. if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""): session.skip("Credentials must be set via environment variable") @@ -110,7 +119,9 @@ def system(session): # Install all test dependencies, then install this package into the # virtualenv's dist-packages. - session.install("mock", "pytest", "google-cloud-testutils") + session.install( + "mock", "pytest", "google-cloud-testutils", + ) session.install("-e", "test_utils") session.install("-e", ".") @@ -121,7 +132,7 @@ def system(session): session.run("py.test", "--quiet", system_test_folder_path, *session.posargs) -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def cover(session): """Run the final coverage report. 
@@ -134,19 +145,52 @@ def cover(session): session.run("coverage", "erase") -@nox.session(python="3.7") +@nox.session(python=DEFAULT_PYTHON_VERSION) def docs(session): """Build the docs for this library.""" session.install("-e", ".") - session.install("sphinx<3.0.0", "alabaster", "recommonmark") + session.install("sphinx", "alabaster", "recommonmark") + + shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) + session.run( + "sphinx-build", + # "-W", # warnings as errors + "-T", # show full traceback on exception + "-N", # no colors + "-b", + "html", + "-d", + os.path.join("docs", "_build", "doctrees", ""), + os.path.join("docs", ""), + os.path.join("docs", "_build", "html", ""), + ) + + +@nox.session(python=DEFAULT_PYTHON_VERSION) +def docfx(session): + """Build the docfx yaml files for this library.""" + + session.install("-e", ".") + session.install("sphinx", "alabaster", "recommonmark", "sphinx-docfx-yaml") shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True) session.run( "sphinx-build", - "-W", # warnings as errors "-T", # show full traceback on exception "-N", # no colors + "-D", + ( + "extensions=sphinx.ext.autodoc," + "sphinx.ext.autosummary," + "docfx_yaml.extension," + "sphinx.ext.intersphinx," + "sphinx.ext.coverage," + "sphinx.ext.napoleon," + "sphinx.ext.todo," + "sphinx.ext.viewcode," + "recommonmark" + ), "-b", "html", "-d", diff --git a/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md new file mode 100644 index 000000000000..55c97b32f4c1 --- /dev/null +++ b/packages/google-cloud-dlp/samples/AUTHORING_GUIDE.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/AUTHORING_GUIDE.md \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/CONTRIBUTING.md b/packages/google-cloud-dlp/samples/CONTRIBUTING.md new file mode 100644 index 000000000000..34c882b6f1a3 --- /dev/null +++ b/packages/google-cloud-dlp/samples/CONTRIBUTING.md @@ -0,0 +1 @@ +See https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/CONTRIBUTING.md \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst b/packages/google-cloud-dlp/samples/snippets/README.rst new file mode 100644 index 000000000000..0b25cc7acde0 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/README.rst @@ -0,0 +1,405 @@ + +.. This file is automatically generated. Do not edit this file directly. + +Google Data Loss Prevention Python Samples +=============================================================================== + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/README.rst + + +This directory contains samples for Google Data Loss Prevention. `Google Data Loss Prevention`_ provides programmatic access to a powerful detection engine for personally identifiable information and other privacy-sensitive data in unstructured data streams. + + + + +.. _Google Data Loss Prevention: https://cloud.google.com/dlp/docs/ + + +Setup +------------------------------------------------------------------------------- + + + +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. 
_Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started + + + + +Install Dependencies +++++++++++++++++++++ + +#. Clone python-docs-samples and change directory to the sample directory you want to use. + + .. code-block:: bash + + $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git + +#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions. + + .. _Python Development Environment Setup Guide: + https://cloud.google.com/python/setup + +#. Create a virtualenv. Samples are compatible with Python 3.6+. + + .. code-block:: bash + + $ virtualenv env + $ source env/bin/activate + +#. Install the dependencies needed to run the samples. + + .. code-block:: bash + + $ pip install -r requirements.txt + +.. _pip: https://pip.pypa.io/ +.. _virtualenv: https://virtualenv.pypa.io/ + + + + + + +Samples +------------------------------------------------------------------------------- + + +Quickstart ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/quickstart.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python quickstart.py + + + + +Inspect Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/inspect_content.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python inspect_content.py + + + usage: inspect_content.py [-h] {string,table,file,gcs,datastore,bigquery} ... + + Sample app that uses the Data Loss Prevention API to inspect a string, a local + file or a file on Google Cloud Storage. + + positional arguments: + {string,table,file,gcs,datastore,bigquery} + Select how to submit content to the API. + string Inspect a string. + table Inspect a table. + file Inspect a local file. + gcs Inspect files on Google Cloud Storage. + datastore Inspect files on Google Datastore. + bigquery Inspect files on Google BigQuery. + + optional arguments: + -h, --help show this help message and exit + + + + + +Redact Content ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/redact.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python redact.py + + + usage: redact.py [-h] {info_types,all_text} ... + + Sample app that uses the Data Loss Prevention API to redact the contents of an + image file. + + positional arguments: + {info_types,all_text} + Select which content should be redacted. + info_types Redact specific infoTypes from an image. + all_text Redact all text from an image. The MIME type of the + file is inferred via the Python standard library's + mimetypes module.
+ + optional arguments: + -h, --help show this help message and exit + + + + + +Metadata ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/metadata.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python metadata.py + + + usage: metadata.py [-h] [--language_code LANGUAGE_CODE] [--filter FILTER] + + Sample app that queries the Data Loss Prevention API for supported categories + and info types. + + optional arguments: + -h, --help show this help message and exit + --language_code LANGUAGE_CODE + The BCP-47 language code to use, e.g. 'en-US'. + --filter FILTER An optional filter to only return info types supported + by certain parts of the API. Defaults to + "supported_by=INSPECT". + + + + + +Jobs ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/jobs.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python jobs.py + + + usage: jobs.py [-h] {list,delete} ... + + Sample app to list and delete DLP jobs using the Data Loss Prevention API. + + positional arguments: + {list,delete} Select how to submit content to the API. + list List Data Loss Prevention API jobs corresponding to a given + filter. + delete Delete results of a Data Loss Prevention API job. + + optional arguments: + -h, --help show this help message and exit + + + + + +Templates ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/templates.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python templates.py + + + usage: templates.py [-h] {create,list,delete} ... + + Sample app that sets up Data Loss Prevention API inspect templates. + + positional arguments: + {create,list,delete} Select which action to perform. + create Create a template. + list List all templates. + delete Delete a template. + + optional arguments: + -h, --help show this help message and exit + + + + + +Triggers ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/triggers.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python triggers.py + + + usage: triggers.py [-h] {create,list,delete} ... + + Sample app that sets up Data Loss Prevention API automation triggers. + + positional arguments: + {create,list,delete} Select which action to perform. + create Create a trigger. + list List all triggers. + delete Delete a trigger. + + optional arguments: + -h, --help show this help message and exit + + + + + +Risk Analysis ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +..
image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/risk.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python risk.py + + + usage: risk.py [-h] {numerical,categorical,k_anonymity,l_diversity,k_map} ... + + Sample app that uses the Data Loss Prevention API to perform risk analysis. + + positional arguments: + {numerical,categorical,k_anonymity,l_diversity,k_map} + Select how to submit content to the API. + numerical + categorical + k_anonymity Computes the k-anonymity of a column set in a Google + BigQuery table. + l_diversity Computes the l-diversity of a column set in a Google + BigQuery table. + k_map Computes the k-map risk estimation of a column set in + a Google BigQuery table. + + optional arguments: + -h, --help show this help message and exit + + + + + +DeID ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dlp/deid.py,dlp/README.rst + + + + +To run this sample: + +.. code-block:: bash + + $ python deid.py + + + usage: deid.py [-h] + {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype} + ... + + Uses of the Data Loss Prevention API for deidentifying sensitive data. + + positional arguments: + {deid_mask,deid_replace,deid_fpe,reid_fpe,deid_date_shift,replace_with_infotype} + Select how to submit content to the API. + deid_mask Deidentify sensitive data in a string by masking it + with a character. + deid_replace Deidentify sensitive data in a string by replacing it + with another string. + deid_fpe Deidentify sensitive data in a string using Format + Preserving Encryption (FPE). + reid_fpe Reidentify sensitive data in a string using Format + Preserving Encryption (FPE). + deid_date_shift Deidentify dates in a CSV file by pseudorandomly + shifting them. + replace_with_infotype + Deidentify sensitive data in a string by replacing it + with the info type of the data. + + optional arguments: + -h, --help show this help message and exit + + + + + + + + + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + + + +..
_Google Cloud SDK: https://cloud.google.com/sdk/ diff --git a/packages/google-cloud-dlp/samples/snippets/README.rst.in b/packages/google-cloud-dlp/samples/snippets/README.rst.in new file mode 100644 index 000000000000..708e870fa08a --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/README.rst.in @@ -0,0 +1,52 @@ +# This file is used to generate README.rst + +product: + name: Google Data Loss Prevention + short_name: Data Loss Prevention + url: https://cloud.google.com/dlp/docs/ + description: > + `Google Data Loss Prevention`_ provides programmatic access to a powerful + detection engine for personally identifiable information and other + privacy-sensitive data in unstructured data streams. + +setup: +- auth +- install_deps + +required_api_url: https://console.cloud.google.com/apis/library/dlp.googleapis.com + +required_roles: +- DLP Administrator +- DLP API Service Agent + +samples: +- name: Quickstart + file: quickstart.py +- name: Inspect Content + file: inspect_content.py + show_help: true +- name: Redact Content + file: redact.py + show_help: true +- name: Metadata + file: metadata.py + show_help: true +- name: Jobs + file: jobs.py + show_help: true +- name: Templates + file: templates.py + show_help: true +- name: Triggers + file: triggers.py + show_help: true +- name: Risk Analysis + file: risk.py + show_help: true +- name: DeID + file: deid.py + show_help: true + +cloud_client_library: true + +folder: dlp diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py new file mode 100644 index 000000000000..565fed6994c6 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype.py @@ -0,0 +1,302 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Custom infoType snippets. + +This file contains sample code that uses the Data Loss Prevention API to create +custom infoType detectors to refine scan results. +""" + + +# [START dlp_omit_name_if_also_email] +def omit_name_if_also_email( + project, + content_string, +): + """Matches PERSON_NAME and EMAIL_ADDRESS, but not both. + + Uses the Data Loss Prevention API to omit matches on PERSON_NAME if the + EMAIL_ADDRESS detector also matches. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + + Returns: + A list of the infoType names found in the string. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a list of infoTypes for DLP to locate in `content_string`. See + # https://cloud.google.com/dlp/docs/concepts-infotypes for more information + # about supported infoTypes. + info_types_to_locate = [{"name": "PERSON_NAME"}, {"name": "EMAIL_ADDRESS"}] + + # Construct the configuration dictionary that will only match on PERSON_NAME + # if the EMAIL_ADDRESS doesn't also match.
This configuration helps reduce + # the total number of findings when there is a large overlap between different + # infoTypes. + inspect_config = { + "info_types": + info_types_to_locate, + "rule_set": [{ + "info_types": [{ + "name": "PERSON_NAME" + }], + "rules": [{ + "exclusion_rule": { + "exclude_info_types": { + "info_types": [{ + "name": "EMAIL_ADDRESS" + }] + }, + "matching_type": "MATCHING_TYPE_PARTIAL_MATCH" + } + }] + }] + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + return [f.info_type.name for f in response.result.findings] + + +# [END dlp_omit_name_if_also_email] + + +# [START inspect_with_person_name_w_custom_hotword] +def inspect_with_person_name_w_custom_hotword( + project, + content_string, + custom_hotword="patient" +): + """Uses the Data Loss Prevention API to increase the likelihood of matches + on PERSON_NAME if the user-specified custom hotword is present. Only + includes findings with the increased likelihood by setting a minimum + likelihood threshold of VERY_LIKELY. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + custom_hotword: The custom hotword used for likelihood boosting. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a rule set with the caller-provided hotword, with a likelihood + # boost to VERY_LIKELY when the hotword is present within the 50-character + # window preceding the PII finding. + hotword_rule = { + "hotword_regex": {"pattern": custom_hotword}, + "likelihood_adjustment": {"fixed_likelihood": "VERY_LIKELY"}, + "proximity": {"window_before": 50}, + } + + rule_set = [ + { + "info_types": [{"name": "PERSON_NAME"}], + "rules": [{"hotword_rule": hotword_rule}], + } + ] + + # Construct the configuration dictionary with the hotword rule set and the + # minimum likelihood threshold. + inspect_config = { + "rule_set": rule_set, + "min_likelihood": "VERY_LIKELY", + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print(f"Quote: {finding.quote}") + except AttributeError: + pass + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + +# [END inspect_with_person_name_w_custom_hotword] + + +# [START dlp_inspect_with_medical_record_number_custom_regex_detector] +def inspect_with_medical_record_number_custom_regex_detector( + project, + content_string, +): + """Uses the Data Loss Prevention API to analyze a string with a medical + record number custom regex detector. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client.
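+ # Note: `import google.cloud.dlp` above also loads the `google.cloud.dlp_v2` + # module referenced on the next line (the package re-exports the v2 client), + # so no separate import is needed.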
+ dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a custom regex detector info type called "C_MRN", + # with ###-#-##### pattern, where each # represents a digit from 1 to 9. + # The detector has a detection likelihood of POSSIBLE. + custom_info_types = [ + { + "info_type": {"name": "C_MRN"}, + "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"}, + "likelihood": "POSSIBLE", + } + ] + + # Construct the configuration dictionary with the custom regex info type. + inspect_config = { + "custom_info_types": custom_info_types, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print(f"Quote: {finding.quote}") + except AttributeError: + pass + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + +# [END dlp_inspect_with_medical_record_number_custom_regex_detector] + + +# [START dlp_inspect_with_medical_record_number_w_custom_hotwords] +def inspect_with_medical_record_number_w_custom_hotwords( + project, + content_string, +): + """Uses the Data Loss Prevention API to analyze a string with a medical + record number custom regex detector, with custom hotword rules to boost + finding certainty under some circumstances. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a custom regex detector info type called "C_MRN", + # with ###-#-##### pattern, where each # represents a digit from 1 to 9. + # The detector has a detection likelihood of POSSIBLE. + custom_info_types = [ + { + "info_type": {"name": "C_MRN"}, + "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"}, + "likelihood": "POSSIBLE", + } + ] + + # Construct a rule set with hotwords "mrn" and "medical", with a likelihood + # boost to VERY_LIKELY when hotwords are present within the 10-character + # window preceding the PII finding. + hotword_rule = { + "hotword_regex": { + "pattern": "(?i)(mrn|medical)(?-i)" + }, + "likelihood_adjustment": { + "fixed_likelihood": "VERY_LIKELY" + }, + "proximity": { + "window_before": 10 + } + } + + rule_set = [ + { + "info_types": [{"name": "C_MRN"}], + "rules": [{"hotword_rule": hotword_rule}], + } + ] + + # Construct the configuration dictionary with the custom regex info type + # and the rule set. + inspect_config = { + "custom_info_types": custom_info_types, + "rule_set": rule_set, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results.
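+ # (`finding.quote` is typically populated only when `include_quote` is + # enabled in the inspect config; the AttributeError guard below keeps the + # printing loop working when the attribute is absent.)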
+ if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print(f"Quote: {finding.quote}") + except AttributeError: + pass + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + +# [END dlp_inspect_with_medical_record_number_w_custom_hotwords] diff --git a/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py new file mode 100644 index 000000000000..4a81df60adbc --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/custom_infotype_test.py @@ -0,0 +1,65 @@ +# Copyright 2020 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import custom_infotype + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") + + +def test_omit_name_if_also_email(capsys): + info_types = custom_infotype.omit_name_if_also_email( + GCLOUD_PROJECT, "alice@example.com") + + # Ensure we found only EMAIL_ADDRESS, and not PERSON_NAME. + assert len(info_types) == 1 + assert info_types[0] == "EMAIL_ADDRESS" + + +def test_inspect_with_person_name_w_custom_hotword(capsys): + custom_infotype.inspect_with_person_name_w_custom_hotword( + GCLOUD_PROJECT, "patient's name is John Doe.", "patient") + + out, _ = capsys.readouterr() + assert "Info type: PERSON_NAME" in out + assert "Likelihood: 5" in out + + +def test_inspect_with_medical_record_number_custom_regex_detector(capsys): + custom_infotype.inspect_with_medical_record_number_custom_regex_detector( + GCLOUD_PROJECT, "Patients MRN 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "just a number 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + assert "Likelihood: 3" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "Patients MRN 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + assert "Likelihood: 5" in out diff --git a/packages/google-cloud-dlp/samples/snippets/deid.py b/packages/google-cloud-dlp/samples/snippets/deid.py new file mode 100644 index 000000000000..70bd162385b6 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/deid.py @@ -0,0 +1,1073 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Uses of the Data Loss Prevention API for deidentifying sensitive data.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_deidentify_masking] +def deidentify_with_mask( + project, input_str, info_types, masking_character=None, number_to_mask=0 +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by masking it with a character. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + masking_character: The character to mask matching sensitive data with. + number_to_mask: The maximum number of sensitive characters to mask in + a match. If omitted or set to zero, the API will default to no + maximum. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "character_mask_config": { + "masking_character": masking_character, + "number_to_mask": number_to_mask, + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_masking] + +# [START dlp_deidentify_redact] +def deidentify_with_redact( + project, + input_str, + info_types, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by redacting matched input values. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "redact_config": {} + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_redact] + +# [START dlp_deidentify_replace] +def deidentify_with_replace( + project, + input_str, + info_types, + replacement_str="REPLACEMENT_STR", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing matched input values with a value you specify.
+ Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + replacement_str: The string to replace all values that match given + info types. + Returns: + None; the response from the API is printed to the terminal. + """ + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_config": { + "new_value": { + "string_value": replacement_str, + } + } + } + } + ] + } + } + + # Construct item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print out the results. + print(response.item.value) + +# [END dlp_deidentify_replace] + +# [START dlp_deidentify_fpe] + + +def deidentify_with_fpe( + project, + input_str, + info_types, + alphabet=None, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type to use. Only + necessary if you want to reverse the deidentification process. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here.
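+ # (A wrapped key is typically produced by encrypting a raw AES-256 key + # with the Cloud KMS key named by `key_name`, e.g. via `gcloud kms encrypt`.)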
+ import base64 + + wrapped_key = base64.b64decode(wrapped_key) + + # Construct FPE configuration dictionary + crypto_replace_ffx_fpe_config = { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, + } + }, + "common_alphabet": alphabet, + } + + # Add surrogate type + if surrogate_type: + crypto_replace_ffx_fpe_config["surrogate_info_type"] = { + "name": surrogate_type + } + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": crypto_replace_ffx_fpe_config + } + } + ] + } + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_deidentify_fpe] + + +# [START dlp_reidentify_fpe] +def reidentify_with_fpe( + project, + input_str, + alphabet=None, + surrogate_type=None, + key_name=None, + wrapped_key=None, +): + """Uses the Data Loss Prevention API to reidentify sensitive data in a + string that was encrypted by Format Preserving Encryption (FPE). + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to reidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type used + during the encryption process. + key_name: The name of the Cloud KMS key used to encrypt ('wrap') the + AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: The encrypted ('wrapped') AES-256 key to use. This key + should be encrypted using the Cloud KMS key specified by key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here.
+ import base64 + + wrapped_key = base64.b64decode(wrapped_key) + + # Construct Deidentify Config + reidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "kms_wrapped": { + "wrapped_key": wrapped_key, + "crypto_key_name": key_name, + } + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + ] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_fpe] + + +# [START dlp_deidentify_free_text_with_fpe_using_surrogate] +def deidentify_free_text_with_fpe_using_surrogate( + project, + input_str, + alphabet="NUMERIC", + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", +): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string using Format Preserving Encryption (FPE). + The encryption is performed with an unwrapped key. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to deidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + info_type: The name of the info type to de-identify + surrogate_type: The name of the surrogate custom info type to use. Can + be essentially any arbitrary string, as long as it doesn't appear + in your dataset otherwise. + unwrapped_key: The base64-encoded AES-256 key to use. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The unwrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. 
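+ # (An unwrapped, hard-coded key keeps this sample self-contained; for + # production use, prefer a KMS-wrapped key as in `deidentify_with_fpe`.)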
+ import base64 + + unwrapped_key = base64.b64decode(unwrapped_key) + + # Construct de-identify config + transformation = { + "info_types": [{"name": info_type}], + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "unwrapped": {"key": unwrapped_key} + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + + deidentify_config = { + "info_type_transformations": { + "transformations": [transformation] + } + } + + # Construct the inspect config, trying to find all PII with likelihood + # higher than UNLIKELY + inspect_config = { + "info_types": [{"name": info_type}], + "min_likelihood": "UNLIKELY" + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_deidentify_free_text_with_fpe_using_surrogate] + + +# [START dlp_reidentify_free_text_with_fpe_using_surrogate] +def reidentify_free_text_with_fpe_using_surrogate( + project, + input_str, + alphabet="NUMERIC", + surrogate_type="PHONE_TOKEN", + unwrapped_key="YWJjZGVmZ2hpamtsbW5vcA==", +): + """Uses the Data Loss Prevention API to reidentify sensitive data in a + string that was encrypted by Format Preserving Encryption (FPE) with + surrogate type. The encryption is performed with an unwrapped key. + Args: + project: The Google Cloud project id to use as a parent resource. + input_str: The string to reidentify (will be treated as text). + alphabet: The set of characters to replace sensitive ones with. For + more information, see https://cloud.google.com/dlp/docs/reference/ + rest/v2beta2/organizations.deidentifyTemplates#ffxcommonnativealphabet + surrogate_type: The name of the surrogate custom info type used + during the encryption process. + unwrapped_key: The base64-encoded AES-256 key to use. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # The unwrapped key is base64-encoded, but the library expects a binary + # string, so decode it here.
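+ # (Reidentification only works on surrogates produced by the matching + # deidentify call, which take the form `SURROGATE_NAME(length):ciphertext`, + # e.g. `PHONE_TOKEN(10):9617256398`.)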
+ import base64 + + unwrapped_key = base64.b64decode(unwrapped_key) + + # Construct Deidentify Config + transformation = { + "primitive_transformation": { + "crypto_replace_ffx_fpe_config": { + "crypto_key": { + "unwrapped": {"key": unwrapped_key} + }, + "common_alphabet": alphabet, + "surrogate_info_type": {"name": surrogate_type}, + } + } + } + + reidentify_config = { + "info_type_transformations": { + "transformations": [transformation] + } + } + + inspect_config = { + "custom_info_types": [ + {"info_type": {"name": surrogate_type}, "surrogate_type": {}} + ] + } + + # Convert string to item + item = {"value": input_str} + + # Call the API + response = dlp.reidentify_content( + parent, + inspect_config=inspect_config, + reidentify_config=reidentify_config, + item=item, + ) + + # Print results + print(response.item.value) + + +# [END dlp_reidentify_free_text_with_fpe_using_surrogate] + + +# [START dlp_deidentify_date_shift] +def deidentify_with_date_shift( + project, + input_csv_file=None, + output_csv_file=None, + date_fields=None, + lower_bound_days=None, + upper_bound_days=None, + context_field_id=None, + wrapped_key=None, + key_name=None, +): + """Uses the Data Loss Prevention API to deidentify dates in a CSV file by + pseudorandomly shifting them. + Args: + project: The Google Cloud project id to use as a parent resource. + input_csv_file: The path to the CSV file to deidentify. The first row + of the file must specify column names, and all other rows must + contain valid values. + output_csv_file: The path to save the date-shifted CSV file. + date_fields: The list of (date) fields in the CSV file to date shift. + Example: ['birth_date', 'register_date'] + lower_bound_days: The maximum number of days to shift a date backward + upper_bound_days: The maximum number of days to shift a date forward + context_field_id: (Optional) The column to determine date shift amount + based on. If this is not specified, a random shift amount will be + used for every row. If this is specified, then 'wrapped_key' and + 'key_name' must also be set. Example: + context_field_id = 'user_id' + key_name: (Optional) The name of the Cloud KMS key used to encrypt + ('wrap') the AES-256 key. Example: + key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/ + keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME' + wrapped_key: (Optional) The encrypted ('wrapped') AES-256 key to use. + This key should be encrypted using the Cloud KMS key specified by + key_name. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id.
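+ # (`project_path` formats the id as the full resource name, + # i.e. `projects/<project-id>`.)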
+ parent = dlp.project_path(project) + + # Convert date field list to Protobuf type + def map_fields(field): + return {"name": field} + + if date_fields: + date_fields = map(map_fields, date_fields) + else: + date_fields = [] + + # Read and parse the CSV file + import csv + from datetime import datetime + + f = [] + with open(input_csv_file, "r") as csvfile: + reader = csv.reader(csvfile) + for row in reader: + f.append(row) + + # Helper function for converting CSV rows to Protobuf types + def map_headers(header): + return {"name": header} + + def map_data(value): + try: + date = datetime.strptime(value, "%m/%d/%Y") + return { + "date_value": { + "year": date.year, + "month": date.month, + "day": date.day, + } + } + except ValueError: + return {"string_value": value} + + def map_rows(row): + return {"values": map(map_data, row)} + + # Using the helper functions, convert CSV rows to protobuf-compatible + # dictionaries. + csv_headers = map(map_headers, f[0]) + csv_rows = map(map_rows, f[1:]) + + # Construct the table dict + table_item = {"table": {"headers": csv_headers, "rows": csv_rows}} + + # Construct date shift config + date_shift_config = { + "lower_bound_days": lower_bound_days, + "upper_bound_days": upper_bound_days, + } + + # If using a Cloud KMS key, add it to the date_shift_config. + # The wrapped key is base64-encoded, but the library expects a binary + # string, so decode it here. + if context_field_id and key_name and wrapped_key: + import base64 + + date_shift_config["context"] = {"name": context_field_id} + date_shift_config["crypto_key"] = { + "kms_wrapped": { + "wrapped_key": base64.b64decode(wrapped_key), + "crypto_key_name": key_name, + } + } + elif context_field_id or key_name or wrapped_key: + raise ValueError( + """You must set either ALL or NONE of + [context_field_id, key_name, wrapped_key]!""" + ) + + # Construct Deidentify Config + deidentify_config = { + "record_transformations": { + "field_transformations": [ + { + "fields": date_fields, + "primitive_transformation": { + "date_shift_config": date_shift_config + }, + } + ] + } + } + + # Write to CSV helper methods + def write_header(header): + return header.name + + def write_data(data): + return data.string_value or "%s/%s/%s" % ( + data.date_value.month, + data.date_value.day, + data.date_value.year, + ) + + # Call the API + response = dlp.deidentify_content( + parent, deidentify_config=deidentify_config, item=table_item + ) + + # Write results to CSV file + with open(output_csv_file, "w") as csvfile: + write_file = csv.writer(csvfile, delimiter=",") + write_file.writerow(map(write_header, response.item.table.headers)) + for row in response.item.table.rows: + write_file.writerow(map(write_data, row.values)) + # Print status + print("Successfully saved date-shift output to {}".format(output_csv_file)) + + +# [END dlp_deidentify_date_shift] + + +# [START dlp_deidentify_replace_infotype] +def deidentify_with_replace_infotype(project, item, info_types): + """Uses the Data Loss Prevention API to deidentify sensitive data in a + string by replacing it with the info type. + Args: + project: The Google Cloud project id to use as a parent resource. + item: The string to deidentify (will be treated as text). + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. 
+ """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Construct inspect configuration dictionary + inspect_config = { + "info_types": [{"name": info_type} for info_type in info_types] + } + + # Construct deidentify configuration dictionary + deidentify_config = { + "info_type_transformations": { + "transformations": [ + { + "primitive_transformation": { + "replace_with_info_type_config": {} + } + } + ] + } + } + + # Call the API + response = dlp.deidentify_content( + parent, + inspect_config=inspect_config, + deidentify_config=deidentify_config, + item={"value": item}, + ) + + # Print out the results. + print(response.item.value) + + +# [END dlp_deidentify_replace_infotype] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + mask_parser = subparsers.add_parser( + "deid_mask", + help="Deidentify sensitive data in a string by masking it with a " + "character.", + ) + mask_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + mask_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + mask_parser.add_argument("item", help="The string to deidentify.") + mask_parser.add_argument( + "-n", + "--number_to_mask", + type=int, + default=0, + help="The maximum number of sensitive characters to mask in a match. " + "If omitted from the request or set to 0, the API will mask all " + "matching characters.", + ) + mask_parser.add_argument( + "-m", + "--masking_character", + help="The character to mask matching sensitive data with.", + ) + + replace_parser = subparsers.add_parser( + "deid_replace", + help="Deidentify sensitive data in a string by replacing it with " + "another string.", + ) + replace_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + replace_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + replace_parser.add_argument("item", help="The string to deidentify.") + replace_parser.add_argument("replacement_str", help="The string to " + "replace all matched values with.") + + fpe_parser = subparsers.add_parser( + "deid_fpe", + help="Deidentify sensitive data in a string using Format Preserving " + "Encryption (FPE).", + ) + fpe_parser.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS".
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + fpe_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + fpe_parser.add_argument( + "item", + help="The string to deidentify. " + "Example: string = 'My SSN is 372819127'", + ) + fpe_parser.add_argument( + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) + fpe_parser.add_argument( + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) + fpe_parser.add_argument( + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) + fpe_parser.add_argument( + "-s", + "--surrogate_type", + help="The name of the surrogate custom info type to use. Only " + "necessary if you want to reverse the deidentification process. Can " + "be essentially any arbitrary string, as long as it doesn't appear " + "in your dataset otherwise.", + ) + + reid_parser = subparsers.add_parser( + "reid_fpe", + help="Reidentify sensitive data in a string using Format Preserving " + "Encryption (FPE).", + ) + reid_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + reid_parser.add_argument( + "item", + help="The string to deidentify. " + "Example: string = 'My SSN is 372819127'", + ) + reid_parser.add_argument( + "surrogate_type", + help="The name of the surrogate custom info type to use. Only " + "necessary if you want to reverse the deidentification process. Can " + "be essentially any arbitrary string, as long as it doesn't appear " + "in your dataset otherwise.", + ) + reid_parser.add_argument( + "key_name", + help="The name of the Cloud KMS key used to encrypt ('wrap') the " + "AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) + reid_parser.add_argument( + "wrapped_key", + help="The encrypted ('wrapped') AES-256 key to use. This key should " + "be encrypted using the Cloud KMS key specified by key_name.", + ) + reid_parser.add_argument( + "-a", + "--alphabet", + default="ALPHA_NUMERIC", + help="The set of characters to replace sensitive ones with. Commonly " + 'used subsets of the alphabet include "NUMERIC", "HEXADECIMAL", ' + '"UPPER_CASE_ALPHA_NUMERIC", "ALPHA_NUMERIC", ' + '"FFX_COMMON_NATIVE_ALPHABET_UNSPECIFIED"', + ) + + date_shift_parser = subparsers.add_parser( + "deid_date_shift", + help="Deidentify dates in a CSV file by pseudorandomly shifting them.", + ) + date_shift_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + date_shift_parser.add_argument( + "input_csv_file", + help="The path to the CSV file to deidentify. The first row of the " + "file must specify column names, and all other rows must contain " + "valid values.", + ) + date_shift_parser.add_argument( + "output_csv_file", help="The path to save the date-shifted CSV file." 
+ ) + date_shift_parser.add_argument( + "lower_bound_days", + type=int, + help="The maximum number of days to shift a date backward", + ) + date_shift_parser.add_argument( + "upper_bound_days", + type=int, + help="The maximum number of days to shift a date forward", + ) + date_shift_parser.add_argument( + "date_fields", + nargs="+", + help="The list of date fields in the CSV file to date shift. Example: " + "['birth_date', 'register_date']", + ) + date_shift_parser.add_argument( + "--context_field_id", + help="(Optional) The column to determine date shift amount based on. " + "If this is not specified, a random shift amount will be used for " + "every row. If this is specified, then 'wrappedKey' and 'keyName' " + "must also be set.", + ) + date_shift_parser.add_argument( + "--key_name", + help="(Optional) The name of the Cloud KMS key used to encrypt " + "('wrap') the AES-256 key. Example: " + "key_name = 'projects/YOUR_GCLOUD_PROJECT/locations/YOUR_LOCATION/" + "keyRings/YOUR_KEYRING_NAME/cryptoKeys/YOUR_KEY_NAME'", + ) + date_shift_parser.add_argument( + "--wrapped_key", + help="(Optional) The encrypted ('wrapped') AES-256 key to use. This " + "key should be encrypted using the Cloud KMS key specified by" + "key_name.", + ) + + replace_with_infotype_parser = subparsers.add_parser( + "replace_with_infotype", + help="Deidentify sensitive data in a string by replacing it with the " + "info type of the data." + ) + replace_with_infotype_parser.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + replace_with_infotype_parser.add_argument( + "project", + help="The Google Cloud project id to use as a parent resource.", + ) + replace_with_infotype_parser.add_argument( + "item", + help="The string to deidentify." 
+ "Example: 'My credit card is 4242 4242 4242 4242'", + ) + + args = parser.parse_args() + + if args.content == "deid_mask": + deidentify_with_mask( + args.project, + args.item, + args.info_types, + masking_character=args.masking_character, + number_to_mask=args.number_to_mask, + ) + elif args.content == "deid_replace": + deidentify_with_replace( + args.project, + args.item, + args.info_types, + replacement_str=args.replacement_str, + ) + elif args.content == "deid_fpe": + deidentify_with_fpe( + args.project, + args.item, + args.info_types, + alphabet=args.alphabet, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + surrogate_type=args.surrogate_type, + ) + elif args.content == "reid_fpe": + reidentify_with_fpe( + args.project, + args.item, + surrogate_type=args.surrogate_type, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + alphabet=args.alphabet, + ) + elif args.content == "deid_date_shift": + deidentify_with_date_shift( + args.project, + input_csv_file=args.input_csv_file, + output_csv_file=args.output_csv_file, + lower_bound_days=args.lower_bound_days, + upper_bound_days=args.upper_bound_days, + date_fields=args.date_fields, + context_field_id=args.context_field_id, + wrapped_key=args.wrapped_key, + key_name=args.key_name, + ) + elif args.content == "replace_with_infotype": + deidentify_with_replace_infotype( + args.project, + item=args.item, + info_types=args.info_types, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/deid_test.py b/packages/google-cloud-dlp/samples/snippets/deid_test.py new file mode 100644 index 000000000000..7d886c51e362 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/deid_test.py @@ -0,0 +1,257 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +import tempfile + +import pytest + +import deid + +HARMFUL_STRING = "My SSN is 372819127" +HARMLESS_STRING = "My favorite color is blue" +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +UNWRAPPED_KEY = "YWJjZGVmZ2hpamtsbW5vcA==" +WRAPPED_KEY = ( + "CiQAz0hX4+go8fJwn80Fr8pVImwx+tmZdqU7JL+7TN/S5JxBU9gSSQDhFHpFVy" + "uzJps0YH9ls480mU+JLG7jI/0lL04i6XJRWqmI6gUSZRUtECYcLH5gXK4SXHlL" + "rotx7Chxz/4z7SIpXFOBY61z0/U=" +) +KEY_NAME = ( + "projects/python-docs-samples-tests/locations/global/keyRings/" + "dlp-test/cryptoKeys/dlp-test" +) +SURROGATE_TYPE = "SSN_TOKEN" +CSV_FILE = os.path.join(os.path.dirname(__file__), "resources/dates.csv") +DATE_SHIFTED_AMOUNT = 30 +DATE_FIELDS = ["birth_date", "register_date"] +CSV_CONTEXT_FIELD = "name" + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_deidentify_with_mask(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert "My SSN is *********" in out + + +def test_deidentify_with_mask_ignore_insensitive_data(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, HARMLESS_STRING, ["US_SOCIAL_SECURITY_NUMBER"] + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_mask_masking_character_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + masking_character="#", + ) + + out, _ = capsys.readouterr() + assert "My SSN is #########" in out + + +def test_deidentify_with_mask_masking_number_specified(capsys): + deid.deidentify_with_mask( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + number_to_mask=7, + ) + + out, _ = capsys.readouterr() + assert "My SSN is *******27" in out + + +def test_deidentify_with_redact(capsys): + deid.deidentify_with_redact( + GCLOUD_PROJECT, HARMFUL_STRING + "!", ["US_SOCIAL_SECURITY_NUMBER"] + ) + out, _ = capsys.readouterr() + assert "My SSN is !" 
in out + + +def test_deidentify_with_replace(capsys): + deid.deidentify_with_replace( + GCLOUD_PROJECT, HARMFUL_STRING, ["US_SOCIAL_SECURITY_NUMBER"], + replacement_str="REPLACEMENT_STR" + ) + + out, _ = capsys.readouterr() + assert "My SSN is REPLACEMENT_STR" in out + + +def test_deidentify_with_fpe(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert "My SSN is" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_uses_surrogate_info_types(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMFUL_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + surrogate_type=SURROGATE_TYPE, + ) + + out, _ = capsys.readouterr() + assert "My SSN is SSN_TOKEN" in out + assert "372819127" not in out + + +def test_deidentify_with_fpe_ignores_insensitive_data(capsys): + deid.deidentify_with_fpe( + GCLOUD_PROJECT, + HARMLESS_STRING, + ["US_SOCIAL_SECURITY_NUMBER"], + alphabet="NUMERIC", + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + assert HARMLESS_STRING in out + + +def test_deidentify_with_date_shift(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_deidentify_with_date_shift_using_context_field(tempdir, capsys): + output_filepath = os.path.join(tempdir, "dates-shifted.csv") + + deid.deidentify_with_date_shift( + GCLOUD_PROJECT, + input_csv_file=CSV_FILE, + output_csv_file=output_filepath, + lower_bound_days=DATE_SHIFTED_AMOUNT, + upper_bound_days=DATE_SHIFTED_AMOUNT, + date_fields=DATE_FIELDS, + context_field_id=CSV_CONTEXT_FIELD, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + ) + + out, _ = capsys.readouterr() + + assert "Successful" in out + + +def test_reidentify_with_fpe(capsys): + labeled_fpe_string = "My SSN is SSN_TOKEN(9):731997681" + + deid.reidentify_with_fpe( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type=SURROGATE_TYPE, + wrapped_key=WRAPPED_KEY, + key_name=KEY_NAME, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "731997681" not in out + + +def test_deidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My phone number is 4359916732" + + deid.deidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + info_type="PHONE_NUMBER", + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" in out + assert "My phone number is" in out + assert "4359916732" not in out + + +def test_reidentify_free_text_with_fpe_using_surrogate(capsys): + labeled_fpe_string = "My phone number is PHONE_TOKEN(10):9617256398" + + deid.reidentify_free_text_with_fpe_using_surrogate( + GCLOUD_PROJECT, + labeled_fpe_string, + surrogate_type="PHONE_TOKEN", + unwrapped_key=UNWRAPPED_KEY, + alphabet="NUMERIC", + ) + + out, _ = capsys.readouterr() + + assert "PHONE_TOKEN" not in out + assert "9617256398" not in out + assert "My phone number is" in out + + +def test_deidentify_with_replace_infotype(capsys): + url_to_redact = 
"https://cloud.google.com" + deid.deidentify_with_replace_infotype( + GCLOUD_PROJECT, + "My favorite site is " + url_to_redact, + ["URL"], + ) + + out, _ = capsys.readouterr() + + assert url_to_redact not in out + assert "My favorite site is [URL]" in out diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content.py b/packages/google-cloud-dlp/samples/snippets/inspect_content.py new file mode 100644 index 000000000000..fb2573e4bc8a --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/inspect_content.py @@ -0,0 +1,1424 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to inspect a string, a +local file or a file on Google Cloud Storage.""" + +from __future__ import print_function + +import argparse +import json +import os + + +# [START dlp_inspect_string_basic] +def inspect_string_basic( + project, + content_string, + info_types=["PHONE_NUMBER"], +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. + inspect_config = { + "info_types": info_types, + "include_quote": True, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + print("Quote: {}".format(finding.quote)) + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string_basic] + + +# [START dlp_inspect_string] +def inspect_string( + project, + content_string, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. 
One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_string] + +# [START dlp_inspect_table] + + +def inspect_table( + project, + data, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + include_quote=True, +): + """Uses the Data Loss Prevention API to analyze strings for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + data: Json string representing table data. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. 
+ Example: + data = { + "header":[ + "email", + "phone number" + ], + "rows":[ + [ + "robertfrost@xyz.com", + "4232342345" + ], + [ + "johndoe@pqr.com", + "4253458383" + ] + ] + } + + >> $ python inspect_content.py table \ + '{"header": ["email", "phone number"], + "rows": [["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"]]}' + >> Quote: robertfrost@xyz.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + Quote: johndoe@pqr.com + Info type: EMAIL_ADDRESS + Likelihood: 4 + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct the `table`. For more details on the table schema, please see + # https://cloud.google.com/dlp/docs/reference/rest/v2/ContentItem#Table + headers = [{"name": val} for val in data["header"]] + rows = [] + for row in data["rows"]: + rows.append( + {"values": [{"string_value": cell_val} for cell_val in row]} + ) + + table = {} + table["headers"] = headers + table["rows"] = rows + item = {"table": table} + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_table] + +# [START dlp_inspect_file] + + +def inspect_file( + project, + filename, + info_types, + min_likelihood=None, + custom_dictionaries=None, + custom_regexes=None, + max_findings=None, + include_quote=True, + mime_type=None, +): + """Uses the Data Loss Prevention API to analyze a file for protected data. + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. 
+ max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + + import mimetypes + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the item, containing the file's byte data. + with open(filename, mode="rb") as f: + item = {"byte_item": {"type": content_type_index, "data": f.read()}} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + print("Likelihood: {}".format(finding.likelihood)) + else: + print("No findings.") + + +# [END dlp_inspect_file] + + +# [START dlp_inspect_gcs] +def inspect_gcs_file( + project, + bucket, + filename, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze a file on GCS. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket containing the file, as a string. + filename: The name of the file in the bucket, including the path, as a + string; e.g. 'images/myfile.png'. 
+ topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. + if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the file's URL. + url = "gs://{}/{}".format(bucket, filename) + storage_config = {"cloud_storage_options": {"file_set": {"url": url}}} + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. 
This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + + +# [END dlp_inspect_gcs] + + +# [START dlp_inspect_datastore] +def inspect_datastore( + project, + datastore_project, + kind, + topic_id, + subscription_id, + info_types, + custom_dictionaries=None, + custom_regexes=None, + namespace_id=None, + min_likelihood=None, + max_findings=None, + timeout=300, +): + """Uses the Data Loss Prevention API to analyze Datastore data. + Args: + project: The Google Cloud project id to use as a parent resource. + datastore_project: The Google Cloud project id of the target Datastore. + kind: The kind of the Datastore entity to inspect, e.g. 'Person'. + topic_id: The id of the Cloud Pub/Sub topic to which the API will + broadcast job completion. The topic must already exist. + subscription_id: The id of the Cloud Pub/Sub subscription to listen on + while waiting for job completion. The subscription must already + exist and be subscribed to the topic. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + namespace_id: The namespace of the Datastore document, if applicable. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + timeout: The number of seconds to wait for a response from the API. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # This sample also uses threading.Event() to wait for the job to finish. + import threading + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + if not info_types: + info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"] + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare custom_info_types by parsing the dictionary word lists and + # regex patterns. 
+ if custom_dictionaries is None: + custom_dictionaries = [] + dictionaries = [ + { + "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)}, + "dictionary": {"word_list": {"words": custom_dict.split(",")}}, + } + for i, custom_dict in enumerate(custom_dictionaries) + ] + if custom_regexes is None: + custom_regexes = [] + regexes = [ + { + "info_type": {"name": "CUSTOM_REGEX_{}".format(i)}, + "regex": {"pattern": custom_regex}, + } + for i, custom_regex in enumerate(custom_regexes) + ] + custom_info_types = dictionaries + regexes + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "custom_info_types": custom_info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a storage_config containing the target Datastore info. + storage_config = { + "datastore_options": { + "partition_id": { + "project_id": datastore_project, + "namespace_id": namespace_id, + }, + "kind": {"name": kind}, + } + } + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscriber.subscribe(subscription_path, callback=callback) + + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." 
+        )
+
+
+# [END dlp_inspect_datastore]
+
+
+# [START dlp_inspect_bigquery]
+def inspect_bigquery(
+    project,
+    bigquery_project,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    info_types,
+    custom_dictionaries=None,
+    custom_regexes=None,
+    min_likelihood=None,
+    max_findings=None,
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to analyze BigQuery data.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        bigquery_project: The Google Cloud project id of the target table.
+        dataset_id: The id of the target BigQuery dataset.
+        table_id: The id of the target BigQuery table.
+        topic_id: The id of the Cloud Pub/Sub topic to which the API will
+            broadcast job completion. The topic must already exist.
+        subscription_id: The id of the Cloud Pub/Sub subscription to listen on
+            while waiting for job completion. The subscription must already
+            exist and be subscribed to the topic.
+        info_types: A list of strings representing info types to look for.
+            A full list of info type categories can be fetched from the API.
+        min_likelihood: A string representing the minimum likelihood threshold
+            that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED',
+            'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'.
+        max_findings: The maximum number of findings to report; 0 = no maximum.
+        timeout: The number of seconds to wait for a response from the API.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # This sample also uses threading.Event() to wait for the job to finish.
+    import threading
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Prepare info_types by converting the list of strings into a list of
+    # dictionaries (protos are also accepted).
+    if not info_types:
+        info_types = ["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"]
+    info_types = [{"name": info_type} for info_type in info_types]
+
+    # Prepare custom_info_types by parsing the dictionary word lists and
+    # regex patterns.
+    if custom_dictionaries is None:
+        custom_dictionaries = []
+    dictionaries = [
+        {
+            "info_type": {"name": "CUSTOM_DICTIONARY_{}".format(i)},
+            "dictionary": {"word_list": {"words": custom_dict.split(",")}},
+        }
+        for i, custom_dict in enumerate(custom_dictionaries)
+    ]
+    if custom_regexes is None:
+        custom_regexes = []
+    regexes = [
+        {
+            "info_type": {"name": "CUSTOM_REGEX_{}".format(i)},
+            "regex": {"pattern": custom_regex},
+        }
+        for i, custom_regex in enumerate(custom_regexes)
+    ]
+    custom_info_types = dictionaries + regexes
+
+    # Construct the configuration dictionary. Keys which are None may
+    # optionally be omitted entirely.
+    inspect_config = {
+        "info_types": info_types,
+        "custom_info_types": custom_info_types,
+        "min_likelihood": min_likelihood,
+        "limits": {"max_findings_per_request": max_findings},
+    }
+
+    # Construct a storage_config containing the target BigQuery info.
+    storage_config = {
+        "big_query_options": {
+            "table_reference": {
+                "project_id": bigquery_project,
+                "dataset_id": dataset_id,
+                "table_id": table_id,
+            }
+        }
+    }
+
+    # Convert the project id into full resource ids.
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Construct the inspect_job, which defines the entire inspect content task. + inspect_job = { + "inspect_config": inspect_config, + "storage_config": storage_config, + "actions": actions, + } + + operation = dlp.create_dlp_job(parent, inspect_job=inspect_job) + print("Inspection operation started: {}".format(operation.name)) + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + + # Set up a callback to acknowledge a message. This closes around an event + # so that it can signal that it is done and the main thread can continue. + job_done = threading.Event() + + def callback(message): + try: + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + if job.inspect_details.result.info_type_stats: + for finding in job.inspect_details.result.info_type_stats: + print( + "Info type: {}; Count: {}".format( + finding.info_type.name, finding.count + ) + ) + else: + print("No findings.") + + # Signal to the main thread that we can exit. + job_done.set() + else: + # This is not the message we're looking for. + message.drop() + except Exception as e: + # Because this is executing in a thread, an exception won't be + # noted unless we print it manually. + print(e) + raise + + # Register the callback and wait on the event. + subscriber.subscribe(subscription_path, callback=callback) + finished = job_done.wait(timeout=timeout) + if not finished: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + + +# [END dlp_inspect_bigquery] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + parser_string = subparsers.add_parser("string", help="Inspect a string.") + parser_string.add_argument("item", help="The string to inspect.") + parser_string.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_string.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_string.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_string.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_string.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_string.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_string.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_table = subparsers.add_parser("table", help="Inspect a table.") + parser_table.add_argument( + "data", help="Json string representing a table.", type=json.loads + ) + parser_table.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_table.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_table.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_table.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_table.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_table.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_table.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_file = subparsers.add_parser("file", help="Inspect a local file.") + parser_file.add_argument( + "filename", help="The path to the file to inspect." + ) + parser_file.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_file.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_file.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. 
Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_file.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_file.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_file.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_file.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + parser_file.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + parser_gcs = subparsers.add_parser( + "gcs", help="Inspect files on Google Cloud Storage." + ) + parser_gcs.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_gcs.add_argument( + "filename", + help="The name of the file in the bucket, including the path, e.g. " + '"images/myfile.png". Wildcards are permitted.', + ) + parser_gcs.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_gcs.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_gcs.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_gcs.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_gcs.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_gcs.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_gcs.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_gcs.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_gcs.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. 
The default is 300 seconds.", + default=300, + ) + + parser_datastore = subparsers.add_parser( + "datastore", help="Inspect files on Google Datastore." + ) + parser_datastore.add_argument( + "datastore_project", + help="The Google Cloud project id of the target Datastore.", + ) + parser_datastore.add_argument( + "kind", + help='The kind of the Datastore entity to inspect, e.g. "Person".', + ) + parser_datastore.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_datastore.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_datastore.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_datastore.add_argument( + "--info_types", + action="append", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_datastore.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_datastore.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_datastore.add_argument( + "--namespace_id", help="The Datastore namespace to use, if applicable." + ) + parser_datastore.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_datastore.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_datastore.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + parser_bigquery = subparsers.add_parser( + "bigquery", help="Inspect files on Google BigQuery." + ) + parser_bigquery.add_argument( + "bigquery_project", + help="The Google Cloud project id of the target table.", + ) + parser_bigquery.add_argument( + "dataset_id", help="The ID of the target BigQuery dataset." + ) + parser_bigquery.add_argument( + "table_id", help="The ID of the target BigQuery table." + ) + parser_bigquery.add_argument( + "topic_id", + help="The id of the Cloud Pub/Sub topic to use to report that the job " + 'is complete, e.g. "dlp-sample-topic".', + ) + parser_bigquery.add_argument( + "subscription_id", + help="The id of the Cloud Pub/Sub subscription to monitor for job " + 'completion, e.g. "dlp-sample-subscription". The subscription must ' + "already be subscribed to the topic. 
See the test files or the Cloud " + "Pub/Sub sample files for examples on how to create the subscription.", + ) + parser_bigquery.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_bigquery.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_bigquery.add_argument( + "--custom_dictionaries", + action="append", + help="Strings representing comma-delimited lists of dictionary words" + " to search for as custom info types. Each string is a comma " + "delimited list of words representing a distinct dictionary.", + default=None, + ) + parser_bigquery.add_argument( + "--custom_regexes", + action="append", + help="Strings representing regex patterns to search for as custom " + " info types.", + default=None, + ) + parser_bigquery.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_bigquery.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_bigquery.add_argument( + "--timeout", + type=int, + help="The maximum number of seconds to wait for a response from the " + "API. The default is 300 seconds.", + default=300, + ) + + args = parser.parse_args() + + if args.content == "string": + inspect_string( + args.project, + args.item, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "table": + inspect_table( + args.project, + args.data, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.content == "file": + inspect_file( + args.project, + args.filename, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + mime_type=args.mime_type, + ) + elif args.content == "gcs": + inspect_gcs_file( + args.project, + args.bucket, + args.filename, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "datastore": + inspect_datastore( + args.project, + args.datastore_project, + args.kind, + args.topic_id, + args.subscription_id, + args.info_types, + custom_dictionaries=args.custom_dictionaries, + custom_regexes=args.custom_regexes, + namespace_id=args.namespace_id, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + timeout=args.timeout, + ) + elif args.content == "bigquery": + inspect_bigquery( + args.project, + args.bigquery_project, + args.dataset_id, + args.table_id, + args.topic_id, + 
            args.subscription_id,
+            args.info_types,
+            custom_dictionaries=args.custom_dictionaries,
+            custom_regexes=args.custom_regexes,
+            min_likelihood=args.min_likelihood,
+            max_findings=args.max_findings,
+            timeout=args.timeout,
+        )
diff --git a/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
new file mode 100644
index 000000000000..bdabda265c1b
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/inspect_content_test.py
@@ -0,0 +1,467 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.datastore
+import google.cloud.dlp_v2
+import google.cloud.exceptions
+import google.cloud.pubsub
+import google.cloud.storage
+import pytest
+
+import inspect_content
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+DATASTORE_KIND = "DLP test kind"
+DATASTORE_NAME = "DLP test object" + UNIQUE_STRING
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+
+TIMEOUT = 900  # 15 minutes
+
+
+@pytest.fixture(scope="module")
+def bucket():
+    # Creates a GCS bucket, uploads files required for the test, and tears down
+    # the entire bucket afterwards.
+
+    client = google.cloud.storage.Client()
+    try:
+        bucket = client.get_bucket(TEST_BUCKET_NAME)
+    except google.cloud.exceptions.NotFound:
+        bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+    # Upload the blobs and keep track of them in a list.
+    blobs = []
+    for name in RESOURCE_FILE_NAMES:
+        path = os.path.join(RESOURCE_DIRECTORY, name)
+        blob = bucket.blob(name)
+        blob.upload_from_filename(path)
+        blobs.append(blob)
+
+    # Yield the object to the test; lines after this execute as a teardown.
+    yield bucket
+
+    # Delete the files.
+    for blob in blobs:
+        try:
+            blob.delete()
+        except google.cloud.exceptions.NotFound:
+            print("Issue during teardown, missing blob")
+
+    # Attempt to delete the bucket; this will only work if it is empty.
+    bucket.delete()
+
+
+@pytest.fixture(scope="module")
+def topic_id():
+    # Creates a pubsub topic, and tears it down.
+    publisher = google.cloud.pubsub.PublisherClient()
+    topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+    try:
+        publisher.create_topic(topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield TOPIC_ID
+
+    publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+    # Subscribes to a topic.
+ subscriber = google.cloud.pubsub.SubscriberClient() + topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id) + subscription_path = subscriber.subscription_path( + GCLOUD_PROJECT, SUBSCRIPTION_ID) + try: + subscriber.create_subscription(subscription_path, topic_path) + except google.api_core.exceptions.AlreadyExists: + pass + + yield SUBSCRIPTION_ID + + subscriber.delete_subscription(subscription_path) + + +@pytest.fixture(scope="module") +def datastore_project(): + # Adds test Datastore data, yields the project ID and then tears down. + datastore_client = google.cloud.datastore.Client() + + kind = DATASTORE_KIND + name = DATASTORE_NAME + key = datastore_client.key(kind, name) + item = google.cloud.datastore.Entity(key=key) + item["payload"] = "My name is Gary Smith and my email is gary@example.com" + + datastore_client.put(item) + + yield GCLOUD_PROJECT + + datastore_client.delete(key) + + +@pytest.fixture(scope="module") +def bigquery_project(): + # Adds test BigQuery data, yields the project ID and then tears down. + bigquery_client = google.cloud.bigquery.Client() + + dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) + dataset = google.cloud.bigquery.Dataset(dataset_ref) + try: + dataset = bigquery_client.create_dataset(dataset) + except google.api_core.exceptions.Conflict: + dataset = bigquery_client.get_dataset(dataset) + + table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) + table = google.cloud.bigquery.Table(table_ref) + + # A minimal schema with two string columns is enough for these tests. + table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING"), + google.cloud.bigquery.SchemaField("Comment", "STRING"), + ) + + try: + table = bigquery_client.create_table(table) + except google.api_core.exceptions.Conflict: + table = bigquery_client.get_table(table) + + rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")] + + bigquery_client.insert_rows(table, rows_to_insert) + + yield GCLOUD_PROJECT + + bigquery_client.delete_dataset(dataset_ref, delete_contents=True) + + +def test_inspect_string_basic(capsys): + test_string = "String with a phone number: 234-555-6789" + + inspect_content.inspect_string_basic(GCLOUD_PROJECT, test_string) + + out, _ = capsys.readouterr() + assert "Info type: PHONE_NUMBER" in out + assert "Quote: 234-555-6789" in out + + +def test_inspect_string(capsys): + test_string = "My name is Gary Smith and my email is gary@example.com" + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: FIRST_NAME" in out + assert "Info type: EMAIL_ADDRESS" in out + + +def test_inspect_table(capsys): + test_tabular_data = { + "header": ["email", "phone number"], + "rows": [ + ["robertfrost@xyz.com", "4232342345"], + ["johndoe@pqr.com", "4253458383"], + ], + } + + inspect_content.inspect_table( + GCLOUD_PROJECT, + test_tabular_data, + ["PHONE_NUMBER", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: PHONE_NUMBER" in out + assert "Info type: EMAIL_ADDRESS" in out + + +def test_inspect_string_with_custom_info_types(capsys): + test_string = "My name is Gary Smith and my email is gary@example.com" + dictionaries = ["Gary Smith"] + regexes = ["\\w+@\\w+.com"] + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: 
CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out + + +def test_inspect_string_no_results(capsys): + test_string = "Nothing to see here" + + inspect_content.inspect_string( + GCLOUD_PROJECT, + test_string, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "No findings" in out + + +def test_inspect_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + + +def test_inspect_file_with_custom_info_types(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.txt") + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: CUSTOM_DICTIONARY_0" in out + assert "Info type: CUSTOM_REGEX_0" in out + + +def test_inspect_file_no_results(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "harmless.txt") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "No findings" in out + + +def test_inspect_image_file(capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + + inspect_content.inspect_file( + GCLOUD_PROJECT, + test_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + include_quote=True, + ) + + out, _ = capsys.readouterr() + assert "Info type: PHONE_NUMBER" in out + + +def cancel_operation(out): + if "Inspection operation started" in out: + # Cancel the operation + operation_id = out.split( + "Inspection operation started: ")[1].split("\n")[0] + client = google.cloud.dlp_v2.DlpServiceClient() + client.cancel_dlp_job(operation_id) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file(bucket, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT + ) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_with_custom_info_types( + bucket, topic_id, subscription_id, capsys): + try: + dictionaries = ["gary@somedomain.com"] + regexes = ["\\(\\d{3}\\) \\d{3}-\\d{4}"] + + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.txt", + topic_id, + subscription_id, + [], + custom_dictionaries=dictionaries, + custom_regexes=regexes, + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_file_no_results( + bucket, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "harmless.txt", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + + assert "No findings" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_image_file(bucket, topic_id, 
subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "test.png", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_gcs_multiple_files(bucket, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_gcs_file( + GCLOUD_PROJECT, + bucket.name, + "*", + topic_id, + subscription_id, + ["EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_datastore( + datastore_project, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_datastore( + GCLOUD_PROJECT, + datastore_project, + DATASTORE_KIND, + topic_id, + subscription_id, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + assert "Info type: EMAIL_ADDRESS" in out + finally: + cancel_operation(out) + + +@pytest.mark.flaky(max_runs=2, min_passes=1) +def test_inspect_datastore_no_results( + datastore_project, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_datastore( + GCLOUD_PROJECT, + datastore_project, + DATASTORE_KIND, + topic_id, + subscription_id, + ["PHONE_NUMBER"], + timeout=TIMEOUT) + + out, _ = capsys.readouterr() + assert "No findings" in out + finally: + cancel_operation(out) + + +def test_inspect_bigquery(bigquery_project, topic_id, subscription_id, capsys): + try: + inspect_content.inspect_bigquery( + GCLOUD_PROJECT, + bigquery_project, + BIGQUERY_DATASET_ID, + BIGQUERY_TABLE_ID, + topic_id, + subscription_id, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + timeout=1) + + out, _ = capsys.readouterr() + assert "Inspection operation started" in out + finally: + cancel_operation(out) diff --git a/packages/google-cloud-dlp/samples/snippets/jobs.py b/packages/google-cloud-dlp/samples/snippets/jobs.py new file mode 100644 index 000000000000..a8ac0b43c5e0 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/jobs.py @@ -0,0 +1,167 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app to list and delete DLP jobs using the Data Loss Prevention API.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_list_jobs] +def list_dlp_jobs(project, filter_string=None, job_type=None): + """Uses the Data Loss Prevention API to list DLP jobs that match the + specified filter in the request. + Args: + project: The project id to use as a parent resource. + filter_string: (Optional) Allows filtering. + Supported syntax: + * Filter expressions are made up of one or more restrictions. + * Restrictions can be combined by 'AND' or 'OR' logical operators. + A sequence of restrictions implicitly uses 'AND'. + * A restriction has the form of '<field> <operator> <value>'.
+ * Supported fields/values for inspect jobs: + - `state` - PENDING|RUNNING|CANCELED|FINISHED|FAILED + - `inspected_storage` - DATASTORE|CLOUD_STORAGE|BIGQUERY + - `trigger_name` - The resource name of the trigger that + created the job. + * Supported fields for risk analysis jobs: + - `state` - RUNNING|CANCELED|FINISHED|FAILED + * The operator must be '=' or '!='. + Examples: + * inspected_storage = cloud_storage AND state = done + * inspected_storage = cloud_storage OR inspected_storage = bigquery + * inspected_storage = cloud_storage AND + (state = done OR state = canceled) + job_type: (Optional) The type of job. Defaults to 'INSPECT'. + Choices: + DLP_JOB_TYPE_UNSPECIFIED + INSPECT_JOB: The job inspected content for sensitive data. + RISK_ANALYSIS_JOB: The job executed a Risk Analysis computation. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Job type dictionary + job_type_to_int = { + "DLP_JOB_TYPE_UNSPECIFIED": + google.cloud.dlp.enums.DlpJobType.DLP_JOB_TYPE_UNSPECIFIED, + "INSPECT_JOB": google.cloud.dlp.enums.DlpJobType.INSPECT_JOB, + "RISK_ANALYSIS_JOB": google.cloud.dlp.enums.DlpJobType.RISK_ANALYSIS_JOB, + } + # If job type is specified, convert job type to number through enums. + if job_type: + job_type = job_type_to_int[job_type] + + # Call the API to get a list of jobs. + response = dlp.list_dlp_jobs(parent, filter_=filter_string, type_=job_type) + + # Iterate over results. + for job in response: + print("Job: %s; status: %s" % (job.name, job.JobState.Name(job.state))) + + +# [END dlp_list_jobs] + + +# [START dlp_delete_job] +def delete_dlp_job(project, job_name): + """Uses the Data Loss Prevention API to delete a long-running DLP job. + Args: + project: The project id to use as a parent resource. + job_name: The name of the DlpJob resource to be deleted. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id and job name into a full resource id. + name = dlp.dlp_job_path(project, job_name) + + # Call the API to delete the job. + dlp.delete_dlp_job(name) + + print("Successfully deleted %s" % job_name) + + +# [END dlp_delete_job] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select how to submit content to the API." + ) + subparsers.required = True + + list_parser = subparsers.add_parser( + "list", + help="List Data Loss Prevention API jobs corresponding to a given " + "filter.", + ) + list_parser.add_argument( + "project", help="The project id to use as a parent resource." + ) + list_parser.add_argument( + "-f", + "--filter", + help="Filter expressions are made up of one or more restrictions.", + ) + list_parser.add_argument( + "-t", + "--type", + choices=[ + "DLP_JOB_TYPE_UNSPECIFIED", + "INSPECT_JOB", + "RISK_ANALYSIS_JOB", + ], + help='The type of job. API defaults to "INSPECT".', + ) + + delete_parser = subparsers.add_parser( + "delete", help="Delete results of a Data Loss Prevention API job." + ) + delete_parser.add_argument( + "project", help="The project id to use as a parent resource."
+ ) + delete_parser.add_argument( + "job_name", + help="The name of the DlpJob resource to be deleted. " + "Example: X-#####", + ) + + args = parser.parse_args() + + if args.content == "list": + list_dlp_jobs( + args.project, filter_string=args.filter, job_type=args.type + ) + elif args.content == "delete": + delete_dlp_job(args.project, args.job_name) diff --git a/packages/google-cloud-dlp/samples/snippets/jobs_test.py b/packages/google-cloud-dlp/samples/snippets/jobs_test.py new file mode 100644 index 000000000000..89997bc5097c --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/jobs_test.py @@ -0,0 +1,89 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import pytest + +import jobs + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_COLUMN_NAME = "zip_code" +TEST_TABLE_PROJECT_ID = "bigquery-public-data" +TEST_DATASET_ID = "san_francisco" +TEST_TABLE_ID = "bikeshare_trips" +test_job_id = "test-job-{}".format(uuid.uuid4()) + + +@pytest.fixture(scope="module") +def test_job_name(): + import google.cloud.dlp + + dlp = google.cloud.dlp_v2.DlpServiceClient() + + parent = dlp.project_path(GCLOUD_PROJECT) + + # Construct job request + risk_job = { + "privacy_metric": { + "categorical_stats_config": {"field": {"name": TEST_COLUMN_NAME}} + }, + "source_table": { + "project_id": TEST_TABLE_PROJECT_ID, + "dataset_id": TEST_DATASET_ID, + "table_id": TEST_TABLE_ID, + }, + } + + response = dlp.create_dlp_job(parent, risk_job=risk_job, job_id=test_job_id) + full_path = response.name + # API expects only job name, not full project path + job_name = full_path[full_path.rfind("/") + 1:] + yield job_name + + # clean up job if not deleted + try: + dlp.delete_dlp_job(full_path) + except google.api_core.exceptions.NotFound: + print("Issue during teardown, missing job") + + +def test_list_dlp_jobs(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert test_job_name not in out + + +def test_list_dlp_jobs_with_filter(test_job_name, capsys): + jobs.list_dlp_jobs( + GCLOUD_PROJECT, + filter_string="state=RUNNING OR state=DONE", + job_type="RISK_ANALYSIS_JOB", + ) + + out, _ = capsys.readouterr() + assert test_job_name in out + + +def test_list_dlp_jobs_with_job_type(test_job_name, capsys): + jobs.list_dlp_jobs(GCLOUD_PROJECT, job_type="INSPECT_JOB") + + out, _ = capsys.readouterr() + assert test_job_name not in out # job created is a risk analysis job + + +def test_delete_dlp_job(test_job_name, capsys): + jobs.delete_dlp_job(GCLOUD_PROJECT, test_job_name) diff --git a/packages/google-cloud-dlp/samples/snippets/metadata.py b/packages/google-cloud-dlp/samples/snippets/metadata.py new file mode 100644 index 000000000000..7a65941d622a --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/metadata.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that queries the Data Loss Prevention API for supported +categories and info types.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_list_info_types] +def list_info_types(language_code=None, result_filter=None): + """List types of sensitive information within a category. + Args: + language_code: The BCP-47 language code to use, e.g. 'en-US'. + result_filter: An optional filter to only return info types supported by + certain parts of the API. Defaults to "supported_by=INSPECT". + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Make the API call. + response = dlp.list_info_types(language_code, result_filter) + + # Print the results to the console. + print("Info types:") + for info_type in response.info_types: + print( + u"{name}: {display_name}".format( + name=info_type.name, display_name=info_type.display_name + ) + ) + + +# [END dlp_list_info_types] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--language_code", + help="The BCP-47 language code to use, e.g. 'en-US'.", + ) + parser.add_argument( + "--filter", + help="An optional filter to only return info types supported by " + 'certain parts of the API. Defaults to "supported_by=INSPECT".', + ) + + args = parser.parse_args() + + list_info_types( + language_code=args.language_code, result_filter=args.filter + ) diff --git a/packages/google-cloud-dlp/samples/snippets/metadata_test.py b/packages/google-cloud-dlp/samples/snippets/metadata_test.py new file mode 100644 index 000000000000..bde63fd3e8fb --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/metadata_test.py @@ -0,0 +1,22 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
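+ +# A minimal sketch of the call this test exercises, assuming the +# DlpServiceClient surface used throughout these samples: +# +#     import google.cloud.dlp +#     dlp = google.cloud.dlp_v2.DlpServiceClient() +#     response = dlp.list_info_types("en-US", "supported_by=INSPECT") +#     print([info_type.name for info_type in response.info_types])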
+ + import metadata + + +def test_fetch_info_types(capsys): + metadata.list_info_types() + + out, _ = capsys.readouterr() + assert "EMAIL_ADDRESS" in out diff --git a/packages/google-cloud-dlp/samples/snippets/noxfile.py b/packages/google-cloud-dlp/samples/snippets/noxfile.py new file mode 100644 index 000000000000..ba55d7ce53ca --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/noxfile.py @@ -0,0 +1,224 @@ +# Copyright 2019 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +from pathlib import Path +import sys + +import nox + + +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING +# DO NOT EDIT THIS FILE EVER! +# WARNING - WARNING - WARNING - WARNING - WARNING +# WARNING - WARNING - WARNING - WARNING - WARNING + +# Copy `noxfile_config.py` to your directory and modify it instead. + + +# `TEST_CONFIG` dict is a configuration hook that allows users to +# modify the test configurations. The values here should be in sync +# with `noxfile_config.py`. Users will copy `noxfile_config.py` into +# their directory and modify it. + +TEST_CONFIG = { + # You can opt out from the test for specific Python versions. + 'ignored_versions': ["2.7"], + + # An envvar key for determining the project id to use. Change it + # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a + # build specific Cloud project. You can also use your own string + # to use your own Cloud project. + 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT', + # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT', + + # A dictionary you want to inject into your test. Don't put any + # secrets here. These values will override predefined values. + 'envs': {}, +} + + +try: + # Ensure we can import noxfile_config in the project's directory. + sys.path.append('.') + from noxfile_config import TEST_CONFIG_OVERRIDE +except ImportError as e: + print("No user noxfile_config found: detail: {}".format(e)) + TEST_CONFIG_OVERRIDE = {} + +# Update the TEST_CONFIG with the user supplied values. +TEST_CONFIG.update(TEST_CONFIG_OVERRIDE) + + +def get_pytest_env_vars(): + """Returns a dict for pytest invocation.""" + ret = {} + + # Override the GCLOUD_PROJECT and the alias. + env_key = TEST_CONFIG['gcloud_project_env'] + # This should error out if not set. + ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key] + + # Apply user supplied envs. + ret.update(TEST_CONFIG['envs']) + return ret + + +# DO NOT EDIT - automatically generated. +# All versions used to test samples. +ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"] + +# Any default versions that should be ignored. +IGNORED_VERSIONS = TEST_CONFIG['ignored_versions'] + +TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS]) + +INSTALL_LIBRARY_FROM_SOURCE = bool(os.environ.get("INSTALL_LIBRARY_FROM_SOURCE", False)) +# +# Style Checks +# + + +def _determine_local_import_names(start_dir): + """Determines all import names that should be considered "local".
+ + This is used when running the linter to ensure that import order is + properly checked. + """ + file_ext_pairs = [os.path.splitext(path) for path in os.listdir(start_dir)] + return [ + basename + for basename, extension in file_ext_pairs + if extension == ".py" + or os.path.isdir(os.path.join(start_dir, basename)) + and basename not in ("__pycache__",) + ] + + +# Linting with flake8. +# +# We ignore the following rules: +# E203: whitespace before ':' +# E266: too many leading '#' for block comment +# E501: line too long +# I202: Additional newline in a section of imports +# +# We also need to specify the rules which are ignored by default: +# ['E226', 'W504', 'E126', 'E123', 'W503', 'E24', 'E704', 'E121'] +FLAKE8_COMMON_ARGS = [ + "--show-source", + "--builtin=gettext", + "--max-complexity=20", + "--import-order-style=google", + "--exclude=.nox,.cache,env,lib,generated_pb2,*_pb2.py,*_pb2_grpc.py", + "--ignore=E121,E123,E126,E203,E226,E24,E266,E501,E704,W503,W504,I202", + "--max-line-length=88", +] + + +@nox.session +def lint(session): + session.install("flake8", "flake8-import-order") + + local_names = _determine_local_import_names(".") + args = FLAKE8_COMMON_ARGS + [ + "--application-import-names", + ",".join(local_names), + "." + ] + session.run("flake8", *args) + + +# +# Sample Tests +# + + +PYTEST_COMMON_ARGS = ["--junitxml=sponge_log.xml"] + + +def _session_tests(session, post_install=None): + """Runs py.test for a particular project.""" + if os.path.exists("requirements.txt"): + session.install("-r", "requirements.txt") + + if os.path.exists("requirements-test.txt"): + session.install("-r", "requirements-test.txt") + + if INSTALL_LIBRARY_FROM_SOURCE: + session.install("-e", _get_repo_root()) + + if post_install: + post_install(session) + + session.run( + "pytest", + *(PYTEST_COMMON_ARGS + session.posargs), + # Pytest will return 5 when no tests are collected. This can happen + # on travis where slow and flaky tests are excluded. + # See http://doc.pytest.org/en/latest/_modules/_pytest/main.html + success_codes=[0, 5], + env=get_pytest_env_vars() + ) + + +@nox.session(python=ALL_VERSIONS) +def py(session): + """Runs py.test for a sample using the specified version of Python.""" + if session.python in TESTED_VERSIONS: + _session_tests(session) + else: + session.skip("SKIPPED: {} tests are disabled for this sample.".format( + session.python + )) + + +# +# Readmegen +# + + +def _get_repo_root(): + """ Returns the root folder of the project. """ + # Get root of this repository. Assume we don't have directories nested deeper than 10 items.
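+ # Starting from the current working directory, walk up one parent at a + # time until a ".git" directory is found, giving up after ten levels.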
+ p = Path(os.getcwd()) + for i in range(10): + if p is None: + break + if Path(p / ".git").exists(): + return str(p) + p = p.parent + raise Exception("Unable to detect repository root.") + + +GENERATED_READMES = sorted([x for x in Path(".").rglob("*.rst.in")]) + + +@nox.session +@nox.parametrize("path", GENERATED_READMES) +def readmegen(session, path): + """(Re-)generates the readme for a sample.""" + session.install("jinja2", "pyyaml") + dir_ = os.path.dirname(path) + + if os.path.exists(os.path.join(dir_, "requirements.txt")): + session.install("-r", os.path.join(dir_, "requirements.txt")) + + in_file = os.path.join(dir_, "README.rst.in") + session.run( + "python", _get_repo_root() + "/scripts/readme-gen/readme_gen.py", in_file + ) diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart.py b/packages/google-cloud-dlp/samples/snippets/quickstart.py new file mode 100644 index 000000000000..ec929b45f541 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/quickstart.py @@ -0,0 +1,98 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to inspect a string.""" + +from __future__ import print_function + +import argparse +import sys + + +def quickstart(project_id): + """Demonstrates use of the Data Loss Prevention API client library.""" + + # [START dlp_quickstart] + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp_client = google.cloud.dlp_v2.DlpServiceClient() + + # The string to inspect + content = "Robert Frost" + + # Construct the item to inspect. + item = {"value": content} + + # The info types to search for in the content. Required. + info_types = [{"name": "FIRST_NAME"}, {"name": "LAST_NAME"}] + + # The minimum likelihood to constitute a match. Optional. + min_likelihood = "LIKELIHOOD_UNSPECIFIED" + + # The maximum number of findings to report (0 = server maximum). Optional. + max_findings = 0 + + # Whether to include the matching string in the results. Optional. + include_quote = True + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + # Convert the project id into a full resource id. + parent = dlp_client.project_path(project_id) + + # Call the API. + response = dlp_client.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + print("Quote: {}".format(finding.quote)) + except AttributeError: + pass + print("Info type: {}".format(finding.info_type.name)) + # Convert likelihood value to string representation.
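+ # The protobuf field descriptor maps the numeric enum value back to + # its name, e.g. 5 -> "VERY_LIKELY".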
+ likelihood = ( + google.cloud.dlp.types.Finding.DESCRIPTOR.fields_by_name[ + "likelihood" + ] + .enum_type.values_by_number[finding.likelihood] + .name + ) + print("Likelihood: {}".format(likelihood)) + else: + print("No findings.") + # [END dlp_quickstart] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "project_id", help="Enter your GCP project id.", type=str + ) + args = parser.parse_args() + if len(sys.argv) == 1: + parser.print_usage() + sys.exit(1) + quickstart(args.project_id) diff --git a/packages/google-cloud-dlp/samples/snippets/quickstart_test.py b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py new file mode 100644 index 000000000000..1814497c1660 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/quickstart_test.py @@ -0,0 +1,37 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import google.cloud.dlp +import mock + +import quickstart + + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") + + +def test_quickstart(capsys): + # Mock out project_path to use the test runner's project ID. + with mock.patch.object( + google.cloud.dlp.DlpServiceClient, + "project_path", + return_value="projects/{}".format(GCLOUD_PROJECT), + ): + quickstart.quickstart(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert "FIRST_NAME" in out + assert "LAST_NAME" in out diff --git a/packages/google-cloud-dlp/samples/snippets/redact.py b/packages/google-cloud-dlp/samples/snippets/redact.py new file mode 100644 index 000000000000..8a1650a262db --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/redact.py @@ -0,0 +1,255 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to redact the contents of +an image file.""" + +from __future__ import print_function + +import argparse + +# [START dlp_redact_image] +import mimetypes + +# [END dlp_redact_image] +import os + + +# [START dlp_redact_image] + + +def redact_image( + project, + filename, + output_filename, + info_types, + min_likelihood=None, + mime_type=None, +): + """Uses the Data Loss Prevention API to redact protected data in an image. + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + output_filename: The path to which the redacted image will be written. + info_types: A list of strings representing info types to look for.
+ A full list of info type categories can be fetched from the API. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + mime_type: The MIME type of the file. If not specified, the type is + inferred via the Python standard library's mimetypes module. + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Prepare image_redaction_configs, a list of dictionaries. Each dictionary + # contains an info_type and optionally the color used for the replacement. + # The color is omitted in this sample, so the default (black) will be used. + image_redaction_configs = [] + + if info_types is not None: + for info_type in info_types: + image_redaction_configs.append({"info_type": info_type}) + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "min_likelihood": min_likelihood, + "info_types": info_types, + } + + # If mime_type is not specified, guess it from the filename. + if mime_type is None: + mime_guess = mimetypes.MimeTypes().guess_type(filename) + mime_type = mime_guess[0] or "application/octet-stream" + + # Select the content type index from the list of supported types. + supported_content_types = { + None: 0, # "Unspecified" + "image/jpeg": 1, + "image/bmp": 2, + "image/png": 3, + "image/svg": 4, + "text/plain": 5, + } + content_type_index = supported_content_types.get(mime_type, 0) + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type": content_type_index, "data": f.read()} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.redact_image( + parent, + inspect_config=inspect_config, + image_redaction_configs=image_redaction_configs, + byte_item=byte_item, + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + print( + "Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename + ) + ) + + +# [END dlp_redact_image] + +# [START dlp_redact_image_all_text] + + +def redact_image_all_text( + project, + filename, + output_filename, +): + """Uses the Data Loss Prevention API to redact all text in an image. + + Args: + project: The Google Cloud project id to use as a parent resource. + filename: The path to the file to inspect. + output_filename: The path to which the redacted image will be written. + + Returns: + None; the response from the API is printed to the terminal. + """ + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct the image_redaction_configs, indicating to DLP that all text in + # the input image should be redacted. + image_redaction_configs = [{ + "redact_all_text": True, + }] + + # Construct the byte_item, containing the file's byte data. + with open(filename, mode="rb") as f: + byte_item = {"type": "IMAGE", "data": f.read()} + + # Convert the project id into a full resource id. 
+ parent = dlp.project_path(project) + + # Call the API. + response = dlp.redact_image( + parent, + image_redaction_configs=image_redaction_configs, + byte_item=byte_item, + ) + + # Write out the results. + with open(output_filename, mode="wb") as f: + f.write(response.redacted_image) + + print("Wrote {byte_count} to {filename}".format( + byte_count=len(response.redacted_image), filename=output_filename)) + + +# [END dlp_redact_image_all_text] + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + common_args_parser = argparse.ArgumentParser(add_help=False) + common_args_parser.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + common_args_parser.add_argument( + "filename", help="The path to the file to inspect.") + common_args_parser.add_argument( + "output_filename", + help="The path to which the redacted image will be written.", + ) + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="content", help="Select which content should be redacted.") + subparsers.required = True + + info_types_parser = subparsers.add_parser( + "info_types", + help="Redact specific infoTypes from an image.", + parents=[common_args_parser], + ) + info_types_parser.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + info_types_parser.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + info_types_parser.add_argument( + "--mime_type", + help="The MIME type of the file. If not specified, the type is " + "inferred via the Python standard library's mimetypes module.", + ) + + all_text_parser = subparsers.add_parser( + "all_text", + help="Redact all text from an image. The MIME type of the file is " + "inferred via the Python standard library's mimetypes module.", + parents=[common_args_parser], + ) + + args = parser.parse_args() + + if args.content == "info_types": + redact_image( + args.project, + args.filename, + args.output_filename, + args.info_types, + min_likelihood=args.min_likelihood, + mime_type=args.mime_type, + ) + elif args.content == "all_text": + redact_image_all_text( + args.project, + args.filename, + args.output_filename, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/redact_test.py b/packages/google-cloud-dlp/samples/snippets/redact_test.py new file mode 100644 index 000000000000..0cce514eb1a6 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/redact_test.py @@ -0,0 +1,60 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import tempfile + +import pytest + +import redact + +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources") + + +@pytest.fixture(scope="module") +def tempdir(): + tempdir = tempfile.mkdtemp() + yield tempdir + shutil.rmtree(tempdir) + + +def test_redact_image_file(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ["FIRST_NAME", "EMAIL_ADDRESS"], + ) + + out, _ = capsys.readouterr() + assert output_filepath in out + + +def test_redact_image_all_text(tempdir, capsys): + test_filepath = os.path.join(RESOURCE_DIRECTORY, "test.png") + output_filepath = os.path.join(tempdir, "redacted.png") + + redact.redact_image_all_text( + GCLOUD_PROJECT, + test_filepath, + output_filepath, + ) + + out, _ = capsys.readouterr() + assert output_filepath in out diff --git a/packages/google-cloud-dlp/samples/snippets/requirements-test.txt b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt new file mode 100644 index 000000000000..d0c01cc98c5f --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/requirements-test.txt @@ -0,0 +1,4 @@ +pytest==6.0.1 +flaky==3.7.0 +mock==4.0.2 + diff --git a/packages/google-cloud-dlp/samples/snippets/requirements.txt b/packages/google-cloud-dlp/samples/snippets/requirements.txt new file mode 100644 index 000000000000..08b72bbe1fdf --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/requirements.txt @@ -0,0 +1,5 @@ +google-cloud-dlp==1.0.0 +google-cloud-storage==1.30.0 +google-cloud-pubsub==1.7.0 +google-cloud-datastore==1.13.2 +google-cloud-bigquery==1.25.0 diff --git a/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt new file mode 100644 index 000000000000..2763cd0ab820 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/accounts.txt @@ -0,0 +1 @@ +My credit card number is 1234 5678 9012 3456, and my CVV is 789. \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/resources/dates.csv b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv new file mode 100644 index 000000000000..056fccb328ea --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/dates.csv @@ -0,0 +1,5 @@ +name,birth_date,register_date,credit_card +Ann,01/01/1970,07/21/1996,4532908762519852 +James,03/06/1988,04/09/2001,4301261899725540 +Dan,08/14/1945,11/15/2011,4620761856015295 +Laura,11/03/1992,01/04/2017,4564981067258901 \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt new file mode 100644 index 000000000000..5666de37ab23 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/harmless.txt @@ -0,0 +1 @@ +This file is mostly harmless. 
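For readers who want to exercise the snippets above without the argparse wrappers, the following is a minimal, illustrative sketch of the core inspect flow. It mirrors inspect_content.inspect_string, and assumes the google-cloud-dlp==1.0.0 surface pinned in requirements.txt plus a project id in the GOOGLE_CLOUD_PROJECT environment variable; it is not part of the diff itself.

import os

import google.cloud.dlp

dlp = google.cloud.dlp_v2.DlpServiceClient()
parent = dlp.project_path(os.environ["GOOGLE_CLOUD_PROJECT"])

# Look for one built-in info type plus one custom regex; custom info types
# surface in findings under the name assigned here (CUSTOM_REGEX_0, matching
# the naming convention the samples above use).
inspect_config = {
    "info_types": [{"name": "EMAIL_ADDRESS"}],
    "custom_info_types": [
        {
            "info_type": {"name": "CUSTOM_REGEX_0"},
            "regex": {"pattern": "\\(\\d{3}\\) \\d{3}-\\d{4}"},
        }
    ],
    "include_quote": True,
}
item = {"value": "Call (223) 456-7890 or write gary@somedomain.com."}

response = dlp.inspect_content(parent, inspect_config, item)
for finding in response.result.findings:
    print(finding.info_type.name, finding.quote)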
diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.png b/packages/google-cloud-dlp/samples/snippets/resources/test.png new file mode 100644 index 000000000000..8f32c8258842 Binary files /dev/null and b/packages/google-cloud-dlp/samples/snippets/resources/test.png differ diff --git a/packages/google-cloud-dlp/samples/snippets/resources/test.txt b/packages/google-cloud-dlp/samples/snippets/resources/test.txt new file mode 100644 index 000000000000..c2ee3815bc9b --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/resources/test.txt @@ -0,0 +1 @@ +My phone number is (223) 456-7890 and my email address is gary@somedomain.com. \ No newline at end of file diff --git a/packages/google-cloud-dlp/samples/snippets/risk.py b/packages/google-cloud-dlp/samples/snippets/risk.py new file mode 100644 index 000000000000..518f947eee6b --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/risk.py @@ -0,0 +1,947 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that uses the Data Loss Prevention API to perform risk analysis.""" + +from __future__ import print_function + +import argparse + + +# [START dlp_numerical_stats] +def numerical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): + """Uses the Data Loss Prevention API to compute risk metrics of a column + of numerical data in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + column_name: The name of the column to compute risk metrics for. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Location info of the BigQuery table. + source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Tell the API where to send a notification when the job is complete.
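+ # The completion message carries the job name in its "DlpJobName" + # attribute; the callback below matches it against the created job.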
+ actions = [{"pub_sub": {"topic": topic}}] + + # Configure risk analysis job + # Give the name of the numeric column to compute risk metrics for + risk_job = { + "privacy_metric": { + "numerical_stats_config": {"field": {"name": column_name}} + }, + "source_table": source_table, + "actions": actions, + } + + # Call API to start risk analysis job + operation = dlp.create_dlp_job(parent, risk_job=risk_job) + + def callback(message): + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + results = job.risk_details.numerical_stats_result + print( + "Value Range: [{}, {}]".format( + results.min_value.integer_value, + results.max_value.integer_value, + ) + ) + prev_value = None + for percent, result in enumerate(results.quantile_values): + value = result.integer_value + if prev_value != value: + print("Value at {}% quantile: {}".format(percent, value)) + prev_value = value + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_numerical_stats] + + +# [START dlp_categorical_stats] +def categorical_risk_analysis( + project, + table_project_id, + dataset_id, + table_id, + column_name, + topic_id, + subscription_id, + timeout=300, +): + """Uses the Data Loss Prevention API to compute risk metrics of a column + of categorical data in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + column_name: The name of the column to compute risk metrics for. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into full resource ids. + topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Location info of the BigQuery table. + source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Tell the API where to send a notification when the job is complete. 
+ actions = [{"pub_sub": {"topic": topic}}] + + # Configure risk analysis job + # Give the name of the numeric column to compute risk metrics for + risk_job = { + "privacy_metric": { + "categorical_stats_config": {"field": {"name": column_name}} + }, + "source_table": source_table, + "actions": actions, + } + + # Call API to start risk analysis job + operation = dlp.create_dlp_job(parent, risk_job=risk_job) + + def callback(message): + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + histogram_buckets = ( + job.risk_details.categorical_stats_result.value_frequency_histogram_buckets # noqa: E501 + ) + # Print bucket stats + for i, bucket in enumerate(histogram_buckets): + print("Bucket {}:".format(i)) + print( + " Most common value occurs {} time(s)".format( + bucket.value_frequency_upper_bound + ) + ) + print( + " Least common value occurs {} time(s)".format( + bucket.value_frequency_lower_bound + ) + ) + print(" {} unique values total.".format(bucket.bucket_size)) + for value in bucket.bucket_values: + print( + " Value {} occurs {} time(s)".format( + value.value.integer_value, value.count + ) + ) + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_categorical_stats] + + +# [START dlp_k_anonymity] +def k_anonymity_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + quasi_ids, + timeout=300, +): + """Uses the Data Loss Prevention API to compute the k-anonymity of a + column set in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + quasi_ids: A set of columns that form a composite key. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # This sample additionally uses Cloud Pub/Sub to receive results from + # potentially long-running operations. + import google.cloud.pubsub + + # Create helper function for unpacking values + def get_values(obj): + return int(obj.integer_value) + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. 
+ topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id) + parent = dlp.location_path(project, 'global') + + # Location info of the BigQuery table. + source_table = { + "project_id": table_project_id, + "dataset_id": dataset_id, + "table_id": table_id, + } + + # Convert quasi id list to Protobuf type + def map_fields(field): + return {"name": field} + + quasi_ids = map(map_fields, quasi_ids) + + # Tell the API where to send a notification when the job is complete. + actions = [{"pub_sub": {"topic": topic}}] + + # Configure risk analysis job + # List the quasi-identifier columns to compute k-anonymity over + risk_job = { + "privacy_metric": {"k_anonymity_config": {"quasi_ids": quasi_ids}}, + "source_table": source_table, + "actions": actions, + } + + # Call API to start risk analysis job + operation = dlp.create_dlp_job(parent, risk_job=risk_job) + + def callback(message): + if message.attributes["DlpJobName"] == operation.name: + # This is the message we're looking for, so acknowledge it. + message.ack() + + # Now that the job is done, fetch the results and print them. + job = dlp.get_dlp_job(operation.name) + histogram_buckets = ( + job.risk_details.k_anonymity_result.equivalence_class_histogram_buckets + ) + # Print bucket stats + for i, bucket in enumerate(histogram_buckets): + print("Bucket {}:".format(i)) + if bucket.equivalence_class_size_lower_bound: + print( + " Bucket size range: [{}, {}]".format( + bucket.equivalence_class_size_lower_bound, + bucket.equivalence_class_size_upper_bound, + ) + ) + for value_bucket in bucket.bucket_values: + print( + " Quasi-ID values: {}".format( + map(get_values, value_bucket.quasi_ids_values) + ) + ) + print( + " Class size: {}".format( + value_bucket.equivalence_class_size + ) + ) + subscription.set_result(None) + else: + # This is not the message we're looking for. + message.drop() + + # Create a Pub/Sub client and find the subscription. The subscription is + # expected to already be listening to the topic. + subscriber = google.cloud.pubsub.SubscriberClient() + subscription_path = subscriber.subscription_path(project, subscription_id) + subscription = subscriber.subscribe(subscription_path, callback) + + try: + subscription.result(timeout=timeout) + except TimeoutError: + print( + "No event received before the timeout. Please verify that the " + "subscription provided is subscribed to the topic provided." + ) + subscription.close() + + +# [END dlp_k_anonymity] + + +# [START dlp_l_diversity] +def l_diversity_analysis( + project, + table_project_id, + dataset_id, + table_id, + topic_id, + subscription_id, + sensitive_attribute, + quasi_ids, + timeout=300, +): + """Uses the Data Loss Prevention API to compute the l-diversity of a + column set in a Google BigQuery table. + Args: + project: The Google Cloud project id to use as a parent resource. + table_project_id: The Google Cloud project id where the BigQuery table + is stored. + dataset_id: The id of the dataset to inspect. + table_id: The id of the table to inspect. + topic_id: The name of the Pub/Sub topic to notify once the job + completes. + subscription_id: The name of the Pub/Sub subscription to use when + listening for job completion notifications. + sensitive_attribute: The column to measure l-diversity relative to. + quasi_ids: A set of columns that form a composite key. + timeout: The number of seconds to wait for a response from the API. + + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = dlp.location_path(project, "global")
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(field):
+        return {"name": field}
+
+    # Materialize the map() into a list so this also works under Python 3.
+    quasi_ids = list(map(map_fields, quasi_ids))
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    # Give the quasi-identifier columns and the sensitive column to compute
+    # l-diversity for
+    risk_job = {
+        "privacy_metric": {
+            "l_diversity_config": {
+                "quasi_ids": quasi_ids,
+                "sensitive_attribute": {"name": sensitive_attribute},
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(operation.name)
+            histogram_buckets = (
+                job.risk_details.l_diversity_result.sensitive_value_frequency_histogram_buckets  # noqa: E501
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Bucket size range: [{}, {}]".format(
+                        bucket.sensitive_value_frequency_lower_bound,
+                        bucket.sensitive_value_frequency_upper_bound,
+                    )
+                )
+                for value_bucket in bucket.bucket_values:
+                    print(
+                        "   Quasi-ID values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+                        )
+                    )
+                    print(
+                        "   Class size: {}".format(
+                            value_bucket.equivalence_class_size
+                        )
+                    )
+                    for value in value_bucket.top_sensitive_values:
+                        print(
+                            "   Sensitive value {} occurs {} time(s)".format(
+                                value.value, value.count
+                            )
+                        )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_l_diversity]
+
+
+# [START dlp_k_map]
+def k_map_estimate_analysis(
+    project,
+    table_project_id,
+    dataset_id,
+    table_id,
+    topic_id,
+    subscription_id,
+    quasi_ids,
+    info_types,
+    region_code="US",
+    timeout=300,
+):
+    """Uses the Data Loss Prevention API to compute the k-map risk estimation
+       of a column set in a Google BigQuery table.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+        table_project_id: The Google Cloud project id where the BigQuery table
+            is stored.
+        dataset_id: The id of the dataset to inspect.
+        table_id: The id of the table to inspect.
+        topic_id: The name of the Pub/Sub topic to notify once the job
+            completes.
+        subscription_id: The name of the Pub/Sub subscription to use when
+            listening for job completion notifications.
+        quasi_ids: A set of columns that form a composite key and optionally
+            their reidentification distributions.
+        info_types: Type of information of the quasi_id in order to provide a
+            statistical model of population.
+        region_code: The ISO 3166-1 region code that the data is
+            representative of. Can be omitted if using a region-specific
+            infoType (such as US_ZIP_5).
+        timeout: The number of seconds to wait for a response from the API.
+
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library.
+    import google.cloud.dlp
+
+    # This sample additionally uses Cloud Pub/Sub to receive results from
+    # potentially long-running operations.
+    import google.cloud.pubsub
+
+    # Create helper function for unpacking values
+    def get_values(obj):
+        return int(obj.integer_value)
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into full resource ids.
+    topic = google.cloud.pubsub.PublisherClient.topic_path(project, topic_id)
+    parent = dlp.location_path(project, "global")
+
+    # Location info of the BigQuery table.
+    source_table = {
+        "project_id": table_project_id,
+        "dataset_id": dataset_id,
+        "table_id": table_id,
+    }
+
+    # Check that numbers of quasi-ids and info types are equal
+    if len(quasi_ids) != len(info_types):
+        raise ValueError(
+            "Number of infoTypes and number of quasi-identifiers must be "
+            "equal!"
+        )
+
+    # Convert quasi id list to Protobuf type
+    def map_fields(quasi_id, info_type):
+        return {"field": {"name": quasi_id}, "info_type": {"name": info_type}}
+
+    # Pair each quasi-identifier with its info type; materialize the map()
+    # into a list so this also works under Python 3.
+    quasi_ids = list(map(map_fields, quasi_ids, info_types))
+
+    # Tell the API where to send a notification when the job is complete.
+    actions = [{"pub_sub": {"topic": topic}}]
+
+    # Configure risk analysis job
+    risk_job = {
+        "privacy_metric": {
+            "k_map_estimation_config": {
+                "quasi_ids": quasi_ids,
+                "region_code": region_code,
+            }
+        },
+        "source_table": source_table,
+        "actions": actions,
+    }
+
+    # Call API to start risk analysis job
+    operation = dlp.create_dlp_job(parent, risk_job=risk_job)
+
+    def callback(message):
+        if message.attributes["DlpJobName"] == operation.name:
+            # This is the message we're looking for, so acknowledge it.
+            message.ack()
+
+            # Now that the job is done, fetch the results and print them.
+            job = dlp.get_dlp_job(operation.name)
+            histogram_buckets = (
+                job.risk_details.k_map_estimation_result.k_map_estimation_histogram
+            )
+            # Print bucket stats
+            for i, bucket in enumerate(histogram_buckets):
+                print("Bucket {}:".format(i))
+                print(
+                    "   Anonymity range: [{}, {}]".format(
+                        bucket.min_anonymity, bucket.max_anonymity
+                    )
+                )
+                print("   Size: {}".format(bucket.bucket_size))
+                for value_bucket in bucket.bucket_values:
+                    print(
+                        "   Values: {}".format(
+                            list(map(get_values, value_bucket.quasi_ids_values))
+                        )
+                    )
+                    print(
+                        "   Estimated k-map anonymity: {}".format(
+                            value_bucket.estimated_anonymity
+                        )
+                    )
+            subscription.set_result(None)
+        else:
+            # This is not the message we're looking for.
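+            # (With the older pubsub API these samples are written against,
+            # drop() is assumed to release the message without acknowledging
+            # it, so it can be redelivered to a listener that wants it.)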
+            message.drop()
+
+    # Create a Pub/Sub client and find the subscription. The subscription is
+    # expected to already be listening to the topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    subscription_path = subscriber.subscription_path(project, subscription_id)
+    subscription = subscriber.subscribe(subscription_path, callback)
+
+    try:
+        subscription.result(timeout=timeout)
+    except TimeoutError:
+        print(
+            "No event received before the timeout. Please verify that the "
+            "subscription provided is subscribed to the topic provided."
+        )
+        subscription.close()
+
+
+# [END dlp_k_map]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="content", help="Select how to submit content to the API."
+    )
+    subparsers.required = True
+
+    numerical_parser = subparsers.add_parser("numerical", help="")
+    numerical_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    numerical_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    numerical_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    numerical_parser.add_argument(
+        "table_id", help="The id of the table to inspect."
+    )
+    numerical_parser.add_argument(
+        "column_name",
+        help="The name of the column to compute risk metrics for.",
+    )
+    numerical_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    numerical_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    numerical_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    categorical_parser = subparsers.add_parser("categorical", help="")
+    categorical_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    categorical_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    categorical_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    categorical_parser.add_argument(
+        "table_id", help="The id of the table to inspect."
+    )
+    categorical_parser.add_argument(
+        "column_name",
+        help="The name of the column to compute risk metrics for.",
+    )
+    categorical_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    categorical_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    categorical_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    k_anonymity_parser = subparsers.add_parser(
+        "k_anonymity",
+        help="Computes the k-anonymity of a column set in a Google BigQuery "
+        "table.",
+    )
+    k_anonymity_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    k_anonymity_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    k_anonymity_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    k_anonymity_parser.add_argument(
+        "table_id", help="The id of the table to inspect."
+    )
+    k_anonymity_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    k_anonymity_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    k_anonymity_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    k_anonymity_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    l_diversity_parser = subparsers.add_parser(
+        "l_diversity",
+        help="Computes the l-diversity of a column set in a Google BigQuery "
+        "table.",
+    )
+    l_diversity_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    l_diversity_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    l_diversity_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    l_diversity_parser.add_argument(
+        "table_id", help="The id of the table to inspect."
+    )
+    l_diversity_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    l_diversity_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    l_diversity_parser.add_argument(
+        "sensitive_attribute",
+        help="The column to measure l-diversity relative to.",
+    )
+    l_diversity_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    l_diversity_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    k_map_parser = subparsers.add_parser(
+        "k_map",
+        help="Computes the k-map risk estimation of a column set in a Google "
+        "BigQuery table.",
+    )
+    k_map_parser.add_argument(
+        "project",
+        help="The Google Cloud project id to use as a parent resource.",
+    )
+    k_map_parser.add_argument(
+        "table_project_id",
+        help="The Google Cloud project id where the BigQuery table is stored.",
+    )
+    k_map_parser.add_argument(
+        "dataset_id", help="The id of the dataset to inspect."
+    )
+    k_map_parser.add_argument(
+        "table_id", help="The id of the table to inspect."
+    )
+    k_map_parser.add_argument(
+        "topic_id",
+        help="The name of the Pub/Sub topic to notify once the job completes.",
+    )
+    k_map_parser.add_argument(
+        "subscription_id",
+        help="The name of the Pub/Sub subscription to use when listening for "
+        "job completion notifications.",
+    )
+    k_map_parser.add_argument(
+        "quasi_ids",
+        nargs="+",
+        help="A set of columns that form a composite key.",
+    )
+    k_map_parser.add_argument(
+        "-t",
+        "--info-types",
+        nargs="+",
+        help="Type of information of the quasi_id in order to provide a "
+        "statistical model of population.",
+        required=True,
+    )
+    k_map_parser.add_argument(
+        "-r",
+        "--region-code",
+        default="US",
+        help="The ISO 3166-1 region code that the data is representative of.",
+    )
+    k_map_parser.add_argument(
+        "--timeout",
+        type=int,
+        help="The number of seconds to wait for a response from the API.",
+    )
+
+    args = parser.parse_args()
+
+    if args.content == "numerical":
+        numerical_risk_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.column_name,
+            args.topic_id,
+            args.subscription_id,
+            timeout=args.timeout,
+        )
+    elif args.content == "categorical":
+        categorical_risk_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.column_name,
+            args.topic_id,
+            args.subscription_id,
+            timeout=args.timeout,
+        )
+    elif args.content == "k_anonymity":
+        k_anonymity_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.topic_id,
+            args.subscription_id,
+            args.quasi_ids,
+            timeout=args.timeout,
+        )
+    elif args.content == "l_diversity":
+        l_diversity_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.topic_id,
+            args.subscription_id,
+            args.sensitive_attribute,
+            args.quasi_ids,
+            timeout=args.timeout,
+        )
+    elif args.content == "k_map":
+        k_map_estimate_analysis(
+            args.project,
+            args.table_project_id,
+            args.dataset_id,
+            args.table_id,
+            args.topic_id,
+            args.subscription_id,
+            args.quasi_ids,
+            args.info_types,
+            region_code=args.region_code,
+            timeout=args.timeout,
+        )
diff --git a/packages/google-cloud-dlp/samples/snippets/risk_test.py b/packages/google-cloud-dlp/samples/snippets/risk_test.py
new file mode 100644
index 000000000000..25d9575d4b0f
--- /dev/null
+++ b/packages/google-cloud-dlp/samples/snippets/risk_test.py
@@ -0,0 +1,368 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
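+
+# Note: these tests create and delete real BigQuery datasets and Pub/Sub
+# topics/subscriptions, so GOOGLE_CLOUD_PROJECT is assumed to point at a
+# project where the caller has permission to do so.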
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.bigquery
+import google.cloud.pubsub
+import pytest
+
+import risk
+
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TABLE_PROJECT = os.environ.get("GOOGLE_CLOUD_PROJECT")
+TOPIC_ID = "dlp-test" + UNIQUE_STRING
+SUBSCRIPTION_ID = "dlp-test-subscription" + UNIQUE_STRING
+UNIQUE_FIELD = "Name"
+REPEATED_FIELD = "Mystery"
+NUMERIC_FIELD = "Age"
+STRING_BOOLEAN_FIELD = "Gender"
+
+BIGQUERY_DATASET_ID = "dlp_test_dataset" + UNIQUE_STRING
+BIGQUERY_TABLE_ID = "dlp_test_table" + UNIQUE_STRING
+BIGQUERY_HARMFUL_TABLE_ID = "harmful" + UNIQUE_STRING
+
+TIMEOUT = 120  # 2 minutes
+
+
+# Create a new custom topic/subscription for this test module.
+# We have observed that sometimes all the tests in this file fail, possibly
+# because the DLP service loses its connection to the topic; the fixtures
+# below therefore create dedicated Pub/Sub resources and tear them down
+# afterwards.
+@pytest.fixture(scope="module")
+def topic_id():
+    # Creates a pubsub topic, and tears it down.
+    publisher = google.cloud.pubsub.PublisherClient()
+    topic_path = publisher.topic_path(GCLOUD_PROJECT, TOPIC_ID)
+    try:
+        publisher.create_topic(topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield TOPIC_ID
+
+    publisher.delete_topic(topic_path)
+
+
+@pytest.fixture(scope="module")
+def subscription_id(topic_id):
+    # Subscribes to a topic.
+    subscriber = google.cloud.pubsub.SubscriberClient()
+    topic_path = subscriber.topic_path(GCLOUD_PROJECT, topic_id)
+    subscription_path = subscriber.subscription_path(
+        GCLOUD_PROJECT, SUBSCRIPTION_ID
+    )
+    try:
+        subscriber.create_subscription(subscription_path, topic_path)
+    except google.api_core.exceptions.AlreadyExists:
+        pass
+
+    yield SUBSCRIPTION_ID
+
+    subscriber.delete_subscription(subscription_path)
+
+
+@pytest.fixture(scope="module")
+def bigquery_project():
+    # Adds test BigQuery data, yields the project ID and then tears down.
+ + bigquery_client = google.cloud.bigquery.Client() + + dataset_ref = bigquery_client.dataset(BIGQUERY_DATASET_ID) + dataset = google.cloud.bigquery.Dataset(dataset_ref) + try: + dataset = bigquery_client.create_dataset(dataset) + except google.api_core.exceptions.Conflict: + dataset = bigquery_client.get_dataset(dataset) + table_ref = dataset_ref.table(BIGQUERY_TABLE_ID) + table = google.cloud.bigquery.Table(table_ref) + + harmful_table_ref = dataset_ref.table(BIGQUERY_HARMFUL_TABLE_ID) + harmful_table = google.cloud.bigquery.Table(harmful_table_ref) + + table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING"), + google.cloud.bigquery.SchemaField("Comment", "STRING"), + ) + + harmful_table.schema = ( + google.cloud.bigquery.SchemaField("Name", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField( + "TelephoneNumber", "STRING", "REQUIRED" + ), + google.cloud.bigquery.SchemaField("Mystery", "STRING", "REQUIRED"), + google.cloud.bigquery.SchemaField("Age", "INTEGER", "REQUIRED"), + google.cloud.bigquery.SchemaField("Gender", "STRING"), + google.cloud.bigquery.SchemaField("RegionCode", "STRING"), + ) + + try: + table = bigquery_client.create_table(table) + except google.api_core.exceptions.Conflict: + table = bigquery_client.get_table(table) + + try: + harmful_table = bigquery_client.create_table(harmful_table) + except google.api_core.exceptions.Conflict: + harmful_table = bigquery_client.get_table(harmful_table) + + rows_to_insert = [(u"Gary Smith", u"My email is gary@example.com")] + harmful_rows_to_insert = [ + ( + u"Gandalf", + u"(123) 456-7890", + "4231 5555 6781 9876", + 27, + "Male", + "US", + ), + ( + u"Dumbledore", + u"(313) 337-1337", + "6291 8765 1095 7629", + 27, + "Male", + "US", + ), + (u"Joe", u"(452) 123-1234", "3782 2288 1166 3030", 35, "Male", "US"), + (u"James", u"(567) 890-1234", "8291 3627 8250 1234", 19, "Male", "US"), + ( + u"Marie", + u"(452) 123-1234", + "8291 3627 8250 1234", + 35, + "Female", + "US", + ), + ( + u"Carrie", + u"(567) 890-1234", + "2253 5218 4251 4526", + 35, + "Female", + "US", + ), + ] + + bigquery_client.insert_rows(table, rows_to_insert) + bigquery_client.insert_rows(harmful_table, harmful_rows_to_insert) + yield GCLOUD_PROJECT + + bigquery_client.delete_dataset(dataset_ref, delete_contents=True) + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_numerical_risk_analysis( + topic_id, subscription_id, bigquery_project, capsys +): + risk.numerical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Value Range:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_string_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + UNIQUE_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_categorical_risk_analysis_on_number_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.categorical_risk_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + NUMERIC_FIELD, + topic_id, + subscription_id, + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Most common value occurs" in out 
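+
+
+# A minimal usage sketch (illustrative only): with the fixtures above in
+# place, any of the risk.* samples can also be invoked directly, e.g.
+#
+#   risk.numerical_risk_analysis(
+#       GCLOUD_PROJECT,
+#       TABLE_PROJECT,
+#       BIGQUERY_DATASET_ID,
+#       BIGQUERY_HARMFUL_TABLE_ID,
+#       NUMERIC_FIELD,
+#       "my-topic",          # assumed existing Pub/Sub topic
+#       "my-subscription",   # assumed existing subscription on that topic
+#       timeout=TIMEOUT,
+#   )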
+ + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_anonymity_analysis_multiple_fields( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_anonymity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, REPEATED_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_l_diversity_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.l_diversity_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + UNIQUE_FIELD, + [NUMERIC_FIELD, REPEATED_FIELD], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Quasi-ID values:" in out + assert "Class size:" in out + assert "Sensitive value" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_single_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD], + ["AGE"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_multiple_field( + topic_id, subscription_id, bigquery_project, capsys +): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE", "GENDER"], + timeout=TIMEOUT, + ) + + out, _ = capsys.readouterr() + assert "Anonymity range:" in out + assert "Size:" in out + assert "Values" in out + + +@pytest.mark.flaky(max_runs=3, min_passes=1) +def test_k_map_estimate_analysis_quasi_ids_info_types_equal( + topic_id, subscription_id, bigquery_project +): + with pytest.raises(ValueError): + risk.k_map_estimate_analysis( + GCLOUD_PROJECT, + TABLE_PROJECT, + BIGQUERY_DATASET_ID, + BIGQUERY_HARMFUL_TABLE_ID, + topic_id, + subscription_id, + [NUMERIC_FIELD, STRING_BOOLEAN_FIELD], + ["AGE"], + timeout=TIMEOUT, + ) diff --git a/packages/google-cloud-dlp/samples/snippets/templates.py b/packages/google-cloud-dlp/samples/snippets/templates.py new file mode 100644 index 000000000000..2d9f8137d5d1 --- /dev/null +++ 
b/packages/google-cloud-dlp/samples/snippets/templates.py @@ -0,0 +1,266 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API inspect templates.""" + +from __future__ import print_function + +import argparse +import os +import time + + +# [START dlp_create_template] +def create_inspect_template( + project, + info_types, + template_id=None, + display_name=None, + min_likelihood=None, + max_findings=None, + include_quote=None, +): + """Creates a Data Loss Prevention API inspect template. + Args: + project: The Google Cloud project id to use as a parent resource. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + template_id: The id of the template. If omitted, an id will be randomly + generated. + display_name: The optional display name of the template. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + include_quote: Boolean for whether to display a quote of the detected + information in the results. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "include_quote": include_quote, + "limits": {"max_findings_per_request": max_findings}, + } + + inspect_template = { + "inspect_config": inspect_config, + "display_name": display_name, + } + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.create_inspect_template( + parent, inspect_template=inspect_template, template_id=template_id + ) + + print("Successfully created template {}".format(response.name)) + + +# [END dlp_create_template] + + +# [START dlp_list_templates] +def list_inspect_templates(project): + """Lists all Data Loss Prevention API inspect templates. + Args: + project: The Google Cloud project id to use as a parent resource. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. 
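+    # (The response is expected to be a pageable iterator over the project's
+    # inspect templates; the loop below yields each template in turn.)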
+ response = dlp.list_inspect_templates(parent) + + # Define a helper function to convert the API's "seconds since the epoch" + # time format into a human-readable string. + def human_readable_time(timestamp): + return str(time.localtime(timestamp.seconds)) + + for template in response: + print("Template {}:".format(template.name)) + if template.display_name: + print(" Display Name: {}".format(template.display_name)) + print( + " Created: {}".format(human_readable_time(template.create_time)) + ) + print( + " Updated: {}".format(human_readable_time(template.update_time)) + ) + + config = template.inspect_config + print( + " InfoTypes: {}".format( + ", ".join([it.name for it in config.info_types]) + ) + ) + print(" Minimum likelihood: {}".format(config.min_likelihood)) + print(" Include quotes: {}".format(config.include_quote)) + print( + " Max findings per request: {}".format( + config.limits.max_findings_per_request + ) + ) + + +# [END dlp_list_templates] + + +# [START dlp_delete_template] +def delete_inspect_template(project, template_id): + """Deletes a Data Loss Prevention API template. + Args: + project: The id of the Google Cloud project which owns the template. + template_id: The id of the template to delete. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Combine the template id with the parent id. + template_resource = "{}/inspectTemplates/{}".format(parent, template_id) + + # Call the API. + dlp.delete_inspect_template(template_resource) + + print("Template {} successfully deleted.".format(template_resource)) + + +# [END dlp_delete_template] + + +if __name__ == "__main__": + default_project = os.environ.get("GOOGLE_CLOUD_PROJECT") + + parser = argparse.ArgumentParser(description=__doc__) + subparsers = parser.add_subparsers( + dest="action", help="Select which action to perform." + ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a template.") + parser_create.add_argument( + "--template_id", + help="The id of the template. If omitted, an id will be randomly " + "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the template." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". 
' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--include_quote", + type=bool, + help="A boolean for whether to display a quote of the detected " + "information in the results.", + default=True, + ) + + parser_list = subparsers.add_parser("list", help="List all templates.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a template.") + parser_delete.add_argument( + "template_id", help="The id of the template to delete." + ) + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_inspect_template( + args.project, + args.info_types, + template_id=args.template_id, + display_name=args.display_name, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + include_quote=args.include_quote, + ) + elif args.action == "list": + list_inspect_templates(args.project) + elif args.action == "delete": + delete_inspect_template(args.project, args.template_id) diff --git a/packages/google-cloud-dlp/samples/snippets/templates_test.py b/packages/google-cloud-dlp/samples/snippets/templates_test.py new file mode 100644 index 000000000000..f8d22118bfcd --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/templates_test.py @@ -0,0 +1,60 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid + +import google.api_core.exceptions +import google.cloud.storage + +import templates + +UNIQUE_STRING = str(uuid.uuid4()).split("-")[0] +GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT") +TEST_TEMPLATE_ID = "test-template" + UNIQUE_STRING + + +def test_create_list_and_delete_template(capsys): + try: + templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + except google.api_core.exceptions.InvalidArgument: + # Template already exists, perhaps due to a previous interrupted test. + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + # Try again and move on. 
+ templates.create_inspect_template( + GCLOUD_PROJECT, + ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"], + template_id=TEST_TEMPLATE_ID, + ) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.list_inspect_templates(GCLOUD_PROJECT) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out + + templates.delete_inspect_template(GCLOUD_PROJECT, TEST_TEMPLATE_ID) + + out, _ = capsys.readouterr() + assert TEST_TEMPLATE_ID in out diff --git a/packages/google-cloud-dlp/samples/snippets/triggers.py b/packages/google-cloud-dlp/samples/snippets/triggers.py new file mode 100644 index 000000000000..7548ab893db8 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/triggers.py @@ -0,0 +1,297 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sample app that sets up Data Loss Prevention API automation triggers.""" + +from __future__ import print_function + +import argparse +import os +import time + + +# [START dlp_create_trigger] +def create_trigger( + project, + bucket, + scan_period_days, + info_types, + trigger_id=None, + display_name=None, + description=None, + min_likelihood=None, + max_findings=None, + auto_populate_timespan=False, +): + """Creates a scheduled Data Loss Prevention API inspect_content trigger. + Args: + project: The Google Cloud project id to use as a parent resource. + bucket: The name of the GCS bucket to scan. This sample scans all + files in the bucket using a wildcard. + scan_period_days: How often to repeat the scan, in days. + The minimum is 1 day. + info_types: A list of strings representing info types to look for. + A full list of info type categories can be fetched from the API. + trigger_id: The id of the trigger. If omitted, an id will be randomly + generated. + display_name: The optional display name of the trigger. + description: The optional description of the trigger. + min_likelihood: A string representing the minimum likelihood threshold + that constitutes a match. One of: 'LIKELIHOOD_UNSPECIFIED', + 'VERY_UNLIKELY', 'UNLIKELY', 'POSSIBLE', 'LIKELY', 'VERY_LIKELY'. + max_findings: The maximum number of findings to report; 0 = no maximum. + auto_populate_timespan: Automatically populates time span config start + and end times in order to scan new content only. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Prepare info_types by converting the list of strings into a list of + # dictionaries (protos are also accepted). + info_types = [{"name": info_type} for info_type in info_types] + + # Construct the configuration dictionary. Keys which are None may + # optionally be omitted entirely. + inspect_config = { + "info_types": info_types, + "min_likelihood": min_likelihood, + "limits": {"max_findings_per_request": max_findings}, + } + + # Construct a cloud_storage_options dictionary with the bucket's URL. 
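+    # For example, a bucket named "my-bucket" (hypothetical) yields the URL
+    # "gs://my-bucket/*", which puts every object in the bucket in scope.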
+    url = "gs://{}/*".format(bucket)
+    storage_config = {
+        "cloud_storage_options": {"file_set": {"url": url}},
+        # Time-based configuration for each storage object.
+        "timespan_config": {
+            # Auto-populate start and end times in order to scan new objects
+            # only.
+            "enable_auto_population_of_timespan_config": auto_populate_timespan
+        },
+    }
+
+    # Construct the job definition.
+    job = {"inspect_config": inspect_config, "storage_config": storage_config}
+
+    # Construct the schedule definition.
+    schedule = {
+        "recurrence_period_duration": {
+            "seconds": scan_period_days * 60 * 60 * 24
+        }
+    }
+
+    # Construct the trigger definition.
+    job_trigger = {
+        "inspect_job": job,
+        "display_name": display_name,
+        "description": description,
+        "triggers": [{"schedule": schedule}],
+        "status": "HEALTHY",
+    }
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.create_job_trigger(
+        parent, job_trigger=job_trigger, trigger_id=trigger_id
+    )
+
+    print("Successfully created trigger {}".format(response.name))
+
+
+# [END dlp_create_trigger]
+
+
+# [START dlp_list_triggers]
+def list_triggers(project):
+    """Lists all Data Loss Prevention API triggers.
+    Args:
+        project: The Google Cloud project id to use as a parent resource.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Call the API.
+    response = dlp.list_job_triggers(parent)
+
+    # Define a helper function to convert the API's "seconds since the epoch"
+    # time format into a human-readable string.
+    def human_readable_time(timestamp):
+        return str(time.localtime(timestamp.seconds))
+
+    for trigger in response:
+        print("Trigger {}:".format(trigger.name))
+        print("  Created: {}".format(human_readable_time(trigger.create_time)))
+        print("  Updated: {}".format(human_readable_time(trigger.update_time)))
+        if trigger.display_name:
+            print("  Display Name: {}".format(trigger.display_name))
+        if trigger.description:
+            print("  Description: {}".format(trigger.description))
+        print("  Status: {}".format(trigger.status))
+        print("  Error count: {}".format(len(trigger.errors)))
+
+
+# [END dlp_list_triggers]
+
+
+# [START dlp_delete_trigger]
+def delete_trigger(project, trigger_id):
+    """Deletes a Data Loss Prevention API trigger.
+    Args:
+        project: The id of the Google Cloud project which owns the trigger.
+        trigger_id: The id of the trigger to delete.
+    Returns:
+        None; the response from the API is printed to the terminal.
+    """
+
+    # Import the client library
+    import google.cloud.dlp
+
+    # Instantiate a client.
+    dlp = google.cloud.dlp_v2.DlpServiceClient()
+
+    # Convert the project id into a full resource id.
+    parent = dlp.project_path(project)
+
+    # Combine the trigger id with the parent id.
+    trigger_resource = "{}/jobTriggers/{}".format(parent, trigger_id)
+
+    # Call the API.
+    dlp.delete_job_trigger(trigger_resource)
+
+    print("Trigger {} successfully deleted.".format(trigger_resource))
+
+
+# [END dlp_delete_trigger]
+
+
+if __name__ == "__main__":
+    default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    subparsers = parser.add_subparsers(
+        dest="action", help="Select which action to perform."
+ ) + subparsers.required = True + + parser_create = subparsers.add_parser("create", help="Create a trigger.") + parser_create.add_argument( + "bucket", help="The name of the GCS bucket containing the file." + ) + parser_create.add_argument( + "scan_period_days", + type=int, + help="How often to repeat the scan, in days. The minimum is 1 day.", + ) + parser_create.add_argument( + "--trigger_id", + help="The id of the trigger. If omitted, an id will be randomly " + "generated", + ) + parser_create.add_argument( + "--display_name", help="The optional display name of the trigger." + ) + parser_create.add_argument( + "--description", help="The optional description of the trigger." + ) + parser_create.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + parser_create.add_argument( + "--info_types", + nargs="+", + help="Strings representing info types to look for. A full list of " + "info categories and types is available from the API. Examples " + 'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". ' + "If unspecified, the three above examples will be used.", + default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"], + ) + parser_create.add_argument( + "--min_likelihood", + choices=[ + "LIKELIHOOD_UNSPECIFIED", + "VERY_UNLIKELY", + "UNLIKELY", + "POSSIBLE", + "LIKELY", + "VERY_LIKELY", + ], + help="A string representing the minimum likelihood threshold that " + "constitutes a match.", + ) + parser_create.add_argument( + "--max_findings", + type=int, + help="The maximum number of findings to report; 0 = no maximum.", + ) + parser_create.add_argument( + "--auto_populate_timespan", + type=bool, + help="Limit scan to new content only.", + ) + + parser_list = subparsers.add_parser("list", help="List all triggers.") + parser_list.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + parser_delete = subparsers.add_parser("delete", help="Delete a trigger.") + parser_delete.add_argument( + "trigger_id", help="The id of the trigger to delete." + ) + parser_delete.add_argument( + "--project", + help="The Google Cloud project id to use as a parent resource.", + default=default_project, + ) + + args = parser.parse_args() + + if args.action == "create": + create_trigger( + args.project, + args.bucket, + args.scan_period_days, + args.info_types, + trigger_id=args.trigger_id, + display_name=args.display_name, + description=args.description, + min_likelihood=args.min_likelihood, + max_findings=args.max_findings, + auto_populate_timespan=args.auto_populate_timespan, + ) + elif args.action == "list": + list_triggers(args.project) + elif args.action == "delete": + delete_trigger(args.project, args.trigger_id) diff --git a/packages/google-cloud-dlp/samples/snippets/triggers_test.py b/packages/google-cloud-dlp/samples/snippets/triggers_test.py new file mode 100644 index 000000000000..dc219d88c7a9 --- /dev/null +++ b/packages/google-cloud-dlp/samples/snippets/triggers_test.py @@ -0,0 +1,103 @@ +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import uuid
+
+import google.api_core.exceptions
+import google.cloud.storage
+
+import pytest
+
+import triggers
+
+UNIQUE_STRING = str(uuid.uuid4()).split("-")[0]
+GCLOUD_PROJECT = os.getenv("GOOGLE_CLOUD_PROJECT")
+TEST_BUCKET_NAME = GCLOUD_PROJECT + "-dlp-python-client-test" + UNIQUE_STRING
+RESOURCE_DIRECTORY = os.path.join(os.path.dirname(__file__), "resources")
+RESOURCE_FILE_NAMES = ["test.txt", "test.png", "harmless.txt", "accounts.txt"]
+TEST_TRIGGER_ID = "test-trigger" + UNIQUE_STRING
+
+
+@pytest.fixture(scope="module")
+def bucket():
+    # Creates a GCS bucket, uploads files required for the test, and tears
+    # down the entire bucket afterwards.
+
+    client = google.cloud.storage.Client()
+    try:
+        bucket = client.get_bucket(TEST_BUCKET_NAME)
+    except google.cloud.exceptions.NotFound:
+        bucket = client.create_bucket(TEST_BUCKET_NAME)
+
+    # Upload the blobs and keep track of them in a list.
+    blobs = []
+    for name in RESOURCE_FILE_NAMES:
+        path = os.path.join(RESOURCE_DIRECTORY, name)
+        blob = bucket.blob(name)
+        blob.upload_from_filename(path)
+        blobs.append(blob)
+
+    # Yield the object to the test; lines after this execute as a teardown.
+    yield bucket
+
+    # Delete the files.
+    for blob in blobs:
+        try:
+            blob.delete()
+        except google.cloud.exceptions.NotFound:
+            print("Issue during teardown, missing blob")
+
+    # Attempt to delete the bucket; this will only work if it is empty.
+    bucket.delete()
+
+
+def test_create_list_and_delete_trigger(bucket, capsys):
+    try:
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+        )
+    except google.api_core.exceptions.InvalidArgument:
+        # Trigger already exists, perhaps due to a previous interrupted test.
+        triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+        out, _ = capsys.readouterr()
+        assert TEST_TRIGGER_ID in out
+
+        # Try again and move on.
+        triggers.create_trigger(
+            GCLOUD_PROJECT,
+            bucket.name,
+            7,
+            ["FIRST_NAME", "EMAIL_ADDRESS", "PHONE_NUMBER"],
+            trigger_id=TEST_TRIGGER_ID,
+            auto_populate_timespan=True,
+        )
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.list_triggers(GCLOUD_PROJECT)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
+
+    triggers.delete_trigger(GCLOUD_PROJECT, TEST_TRIGGER_ID)
+
+    out, _ = capsys.readouterr()
+    assert TEST_TRIGGER_ID in out
diff --git a/packages/google-cloud-dlp/scripts/decrypt-secrets.sh b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
new file mode 100755
index 000000000000..ff599eb2af25
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/decrypt-secrets.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+ROOT=$( dirname "$DIR" )
+
+# Work from the project root.
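+# (DIR is the directory containing this script, so ROOT resolves to the
+# package root one level above it.)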
+cd $ROOT
+
+# Use SECRET_MANAGER_PROJECT if set, fallback to cloud-devrel-kokoro-resources.
+PROJECT_ID="${SECRET_MANAGER_PROJECT:-cloud-devrel-kokoro-resources}"
+
+gcloud secrets versions access latest \
+    --secret="python-docs-samples-test-env" \
+    --project="${PROJECT_ID}" \
+    > testing/test-env.sh
+gcloud secrets versions access latest \
+    --secret="python-docs-samples-service-account" \
+    --project="${PROJECT_ID}" \
+    > testing/service-account.json
+gcloud secrets versions access latest \
+    --secret="python-docs-samples-client-secrets" \
+    --project="${PROJECT_ID}" \
+    > testing/client-secrets.json
\ No newline at end of file
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
new file mode 100644
index 000000000000..d309d6e97518
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/readme_gen.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+
+# Copyright 2016 Google Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Generates READMEs using configuration defined in yaml."""
+
+import argparse
+import io
+import os
+import subprocess
+
+import jinja2
+import yaml
+
+
+jinja_env = jinja2.Environment(
+    trim_blocks=True,
+    loader=jinja2.FileSystemLoader(
+        os.path.abspath(os.path.join(os.path.dirname(__file__), 'templates'))))
+
+README_TMPL = jinja_env.get_template('README.tmpl.rst')
+
+
+def get_help(file):
+    return subprocess.check_output(['python', file, '--help']).decode()
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('source')
+    parser.add_argument('--destination', default='README.rst')
+
+    args = parser.parse_args()
+
+    source = os.path.abspath(args.source)
+    root = os.path.dirname(source)
+    destination = os.path.join(root, args.destination)
+
+    jinja_env.globals['get_help'] = get_help
+
+    with io.open(source, 'r') as f:
+        config = yaml.safe_load(f)
+
+    # This allows get_help to execute in the right directory.
+    os.chdir(root)
+
+    output = README_TMPL.render(config)
+
+    with io.open(destination, 'w') as f:
+        f.write(output)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
new file mode 100644
index 000000000000..4fd239765b0a
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/README.tmpl.rst
@@ -0,0 +1,87 @@
+{# The following line is a lie. BUT! Once jinja2 is done with it, it will
+   become truth! #}
+.. This file is automatically generated. Do not edit this file directly.
+
+{{product.name}} Python Samples
+===============================================================================
+
+.. image:: https://gstatic.com/cloudssh/images/open-btn.png
+   :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/README.rst
+
+
+This directory contains samples for {{product.name}}. {{product.description}}
+
+{{description}}
+
+..
_{{product.name}}: {{product.url}} + +{% if required_api_url %} +To run the sample, you need to enable the API at: {{required_api_url}} +{% endif %} + +{% if required_role %} +To run the sample, you need to have `{{required_role}}` role. +{% endif %} + +{{other_required_steps}} + +{% if setup %} +Setup +------------------------------------------------------------------------------- + +{% for section in setup %} + +{% include section + '.tmpl.rst' %} + +{% endfor %} +{% endif %} + +{% if samples %} +Samples +------------------------------------------------------------------------------- + +{% for sample in samples %} +{{sample.name}} ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +{% if not sample.hide_cloudshell_button %} +.. image:: https://gstatic.com/cloudssh/images/open-btn.png + :target: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor={{folder}}/{{sample.file}},{{folder}}/README.rst +{% endif %} + + +{{sample.description}} + +To run this sample: + +.. code-block:: bash + + $ python {{sample.file}} +{% if sample.show_help %} + + {{get_help(sample.file)|indent}} +{% endif %} + + +{% endfor %} +{% endif %} + +{% if cloud_client_library %} + +The client library +------------------------------------------------------------------------------- + +This sample uses the `Google Cloud Client Library for Python`_. +You can read the documentation for more details on API usage and use GitHub +to `browse the source`_ and `report issues`_. + +.. _Google Cloud Client Library for Python: + https://googlecloudplatform.github.io/google-cloud-python/ +.. _browse the source: + https://github.com/GoogleCloudPlatform/google-cloud-python +.. _report issues: + https://github.com/GoogleCloudPlatform/google-cloud-python/issues + +{% endif %} + +.. _Google Cloud SDK: https://cloud.google.com/sdk/ \ No newline at end of file diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst new file mode 100644 index 000000000000..1446b94a5e3a --- /dev/null +++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth.tmpl.rst @@ -0,0 +1,9 @@ +Authentication +++++++++++++++ + +This sample requires you to have authentication setup. Refer to the +`Authentication Getting Started Guide`_ for instructions on setting up +credentials for applications. + +.. _Authentication Getting Started Guide: + https://cloud.google.com/docs/authentication/getting-started diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst new file mode 100644 index 000000000000..11957ce2714a --- /dev/null +++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/auth_api_key.tmpl.rst @@ -0,0 +1,14 @@ +Authentication +++++++++++++++ + +Authentication for this service is done via an `API Key`_. To obtain an API +Key: + +1. Open the `Cloud Platform Console`_ +2. Make sure that billing is enabled for your project. +3. From the **Credentials** page, create a new **API Key** or use an existing + one for your project. + +.. _API Key: + https://developers.google.com/api-client-library/python/guide/aaa_apikeys +.. 
_Cloud Platform Console: https://console.cloud.google.com/
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
new file mode 100644
index 000000000000..a0406dba8c84
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_deps.tmpl.rst
@@ -0,0 +1,29 @@
+Install Dependencies
+++++++++++++++++++++
+
+#. Clone python-docs-samples and change directory to the sample directory you want to use.
+
+   .. code-block:: bash
+
+       $ git clone https://github.com/GoogleCloudPlatform/python-docs-samples.git
+
+#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
+
+   .. _Python Development Environment Setup Guide:
+       https://cloud.google.com/python/setup
+
+#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+.
+
+   .. code-block:: bash
+
+       $ virtualenv env
+       $ source env/bin/activate
+
+#. Install the dependencies needed to run the samples.
+
+   .. code-block:: bash
+
+       $ pip install -r requirements.txt
+
+.. _pip: https://pip.pypa.io/
+.. _virtualenv: https://virtualenv.pypa.io/
diff --git a/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
new file mode 100644
index 000000000000..5ea33d18c00c
--- /dev/null
+++ b/packages/google-cloud-dlp/scripts/readme-gen/templates/install_portaudio.tmpl.rst
@@ -0,0 +1,35 @@
+Install PortAudio
++++++++++++++++++
+
+Install `PortAudio`_. This is required by the `PyAudio`_ library to stream
+audio from your computer's microphone. PyAudio depends on PortAudio for
+cross-platform compatibility, and is installed differently depending on the
+platform.
+
+* For Mac OS X, you can use `Homebrew`_::
+
+    brew install portaudio
+
+  **Note**: if you encounter an error when running `pip install` that indicates
+  it can't find `portaudio.h`, try running `pip install` with the following
+  flags::
+
+    pip install --global-option='build_ext' \
+        --global-option='-I/usr/local/include' \
+        --global-option='-L/usr/local/lib' \
+        pyaudio
+
+* For Debian / Ubuntu Linux::
+
+    apt-get install portaudio19-dev python-all-dev
+
+* Windows may work without having to install PortAudio explicitly (it will get
+  installed with PyAudio).
+
+For more details, see the `PyAudio installation`_ page.
+
+
+.. _PyAudio: https://people.csail.mit.edu/hubert/pyaudio/
+.. _PortAudio: http://www.portaudio.com/
+.. _PyAudio installation:
+    https://people.csail.mit.edu/hubert/pyaudio/#downloads
+..
_Homebrew: http://brew.sh diff --git a/packages/google-cloud-dlp/synth.metadata b/packages/google-cloud-dlp/synth.metadata index be2c13723c6f..0ebb8d417d79 100644 --- a/packages/google-cloud-dlp/synth.metadata +++ b/packages/google-cloud-dlp/synth.metadata @@ -4,22 +4,21 @@ "git": { "name": ".", "remote": "https://github.com/googleapis/python-dlp.git", - "sha": "7973a441ae2226ce7c597cb5e7eebfa0e38cd94b" + "sha": "973bcc3783029e9b45b23fa13e52bcab4b6f2630" } }, { "git": { - "name": "googleapis", - "remote": "https://github.com/googleapis/googleapis.git", - "sha": "dec3204175104cef49bf21d685d5517caaf0058f", - "internalRef": "312689208" + "name": "synthtool", + "remote": "https://github.com/googleapis/synthtool.git", + "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e" } }, { "git": { "name": "synthtool", "remote": "https://github.com/googleapis/synthtool.git", - "sha": "d2364eb80b840a36136c8ce12f1c6efabcc9600e" + "sha": "5747555f7620113d9a2078a48f4c047a99d31b3e" } } ], diff --git a/packages/google-cloud-dlp/synth.py b/packages/google-cloud-dlp/synth.py index a6daaa883338..802c4faa7c7a 100644 --- a/packages/google-cloud-dlp/synth.py +++ b/packages/google-cloud-dlp/synth.py @@ -16,6 +16,7 @@ import synthtool as s import synthtool.gcp as gcp +from synthtool.languages import python import logging logging.basicConfig(level=logging.DEBUG) @@ -259,8 +260,17 @@ # Add templated files # ---------------------------------------------------------------------------- templated_files = common.py_library( - cov_level=73, system_test_dependencies=["test_utils"] + cov_level=73, system_test_dependencies=["test_utils"], samples=True ) s.move(templated_files) +# ---------------------------------------------------------------------------- +# Samples templates +# ---------------------------------------------------------------------------- +python.py_samples() + +# Temporarily disable warnings due to +# https://github.com/googleapis/gapic-generator-python/issues/525 +s.replace("noxfile.py", '[\"\']-W[\"\']', '# "-W"') + s.shell.run(["nox", "-s", "blacken"], hide_output=False) diff --git a/packages/google-cloud-dlp/testing/.gitignore b/packages/google-cloud-dlp/testing/.gitignore new file mode 100644 index 000000000000..b05fbd630881 --- /dev/null +++ b/packages/google-cloud-dlp/testing/.gitignore @@ -0,0 +1,3 @@ +test-env.sh +service-account.json +client-secrets.json \ No newline at end of file