Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add code sample for string replacement based deidentification. #3956

Merged
merged 7 commits into from
Jun 9, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions dlp/deid.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,72 @@ def deidentify_with_mask(

# [END dlp_deidentify_masking]

# [START dlp_deidentify_replace]
def deidentify_with_replace(
project,
string,
ackul marked this conversation as resolved.
Show resolved Hide resolved
info_types,
replacement_str=None,
):
"""Uses the Data Loss Prevention API to deidentify sensitive data in a
string by replacing matched input values with a value you specify.
Args:
project: The Google Cloud project id to use as a parent resource.
string: The string to deidentify (will be treated as text).
ackul marked this conversation as resolved.
Show resolved Hide resolved
ackul marked this conversation as resolved.
Show resolved Hide resolved
replacement_str: The string to replace all values that match given
info types.
Returns:
None; the response from the API is printed to the terminal.
"""
import google.cloud.dlp

# Instantiate a client
dlp = google.cloud.dlp_v2.DlpServiceClient()

# Convert the project id into a full resource id.
parent = dlp.project_path(project)

# Construct inspect configuration dictionary
inspect_config = {
"info_types": [{"name": info_type} for info_type in info_types]
}

# Construct deidentify configuration dictionary
deidentify_config = {
"info_type_transformations": {
"transformations": [
{
"primitive_transformation": {
"replace_config": {
"new_value": {
"string_value": replacement_str,
ackul marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}
]
}
}
Comment on lines +118 to +132
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's hard for me to wrap my head around this because of how deeply this is nested.

Could we perhaps use two dictionaries?

transformation = {
    "primitive_transformation": {
        "replace_config": {
            "new_value": {
                "string_value": replacement_str,
            }
        }
    }
}

deidentify_config = {
    "info_type_transformations": {
        "transformations": [transformation]
    }
}

Copy link
Contributor Author

@ackul ackul Jun 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review. I was trying to follow surrounding code for consistency. Do you feel strongly about the nesting?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, I didn't notice the existing function. I'm fine with keeping this as is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 to breaking this out - I think there is significantly more nesting going on here than in the other function, so it feels necessary.


# Construct item
item = {"value": string}

# Call the API
response = dlp.deidentify_content(
parent,
inspect_config=inspect_config,
deidentify_config=deidentify_config,
item=item,
)

# Print out the results.
print(response.item.value)

# [END dlp_deidentify_replace]

# [START dlp_deidentify_fpe]


def deidentify_with_fpe(
project,
string,
Expand Down Expand Up @@ -476,6 +540,28 @@ def write_data(data):
help="The character to mask matching sensitive data with.",
)

# Sub-command "deid_replace": replace matched sensitive values with a
# caller-supplied string.
replace_parser = subparsers.add_parser(
    "deid_replace",
    help="Deidentify sensitive data in a string by replacing it with "
    "another string.",
)

# Optional list of info types; when omitted, the three defaults below apply.
replace_parser.add_argument(
    "--info_types",
    default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
    nargs="+",
    help="Strings representing info types to look for. A full list of "
    "info categories and types is available from the API. Examples "
    'include "FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS". '
    "If unspecified, the three above examples will be used.",
)

# Positional arguments, registered in the order they are expected on the
# command line: project, item, replacement_str.
replace_parser.add_argument(
    "project",
    help="The Google Cloud project id to use as a parent resource.",
)
replace_parser.add_argument(
    "item",
    help="The string to deidentify.",
)
replace_parser.add_argument(
    "replacement_str",
    help="The string to " "replace all matched values with.",
)

fpe_parser = subparsers.add_parser(
"deid_fpe",
help="Deidentify sensitive data in a string using Format Preserving "
Expand Down Expand Up @@ -636,6 +722,13 @@ def write_data(data):
masking_character=args.masking_character,
number_to_mask=args.number_to_mask,
)
elif args.content == "deid_replace":
deidentify_with_replace(
args.project,
args.item,
args.info_types,
replacement_str=args.replacement_str,
)
elif args.content == "deid_fpe":
deidentify_with_fpe(
args.project,
Expand Down
10 changes: 10 additions & 0 deletions dlp/deid_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,16 @@ def test_deidentify_with_mask_masking_number_specified(capsys):
assert "My SSN is *******27" in out


def test_deidentify_with_replace(capsys):
    """Check that deidentify_with_replace prints the replaced string.

    Calls the sample with the US_SOCIAL_SECURITY_NUMBER info type and a
    fixed replacement string, then verifies the captured stdout contains
    the expected text.
    """
    info_types = ["US_SOCIAL_SECURITY_NUMBER"]

    deid.deidentify_with_replace(
        GCLOUD_PROJECT,
        HARMFUL_STRING,
        info_types,
        replacement_str="REPLACEMENT_STR",
    )

    # Only stdout matters here; stderr is not inspected.
    captured = capsys.readouterr()
    assert "My SSN is REPLACEMENT_STR" in captured.out


def test_deidentify_with_fpe(capsys):
deid.deidentify_with_fpe(
GCLOUD_PROJECT,
Expand Down