Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DLP sample for redacting all image text #4018

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 7 additions & 23 deletions dlp/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -136,37 +136,21 @@ To run this sample:

$ python redact.py

usage: redact.py [-h] [--project PROJECT]
[--info_types INFO_TYPES [INFO_TYPES ...]]
[--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}]
[--mime_type MIME_TYPE]
filename output_filename
usage: redact.py [-h] {info_types,all_text} ...

Sample app that uses the Data Loss Prevention API to redact the contents of an
image file.

positional arguments:
filename The path to the file to inspect.
output_filename The path to which the redacted image will be written.
{info_types,all_text}
Select which content should be redacted.
info_types Redact specific infoTypes from an image.
all_text Redact all text from an image. The MIME type of the
file is inferred via the Python standard library's
mimetypes module.

optional arguments:
-h, --help show this help message and exit
--project PROJECT The Google Cloud project id to use as a parent
resource.
--info_types INFO_TYPES [INFO_TYPES ...]
Strings representing info types to look for. A full
list of info categories and types is available from
the API. Examples include "FIRST_NAME", "LAST_NAME",
"EMAIL_ADDRESS". If unspecified, the three above
examples will be used.
--min_likelihood {LIKELIHOOD_UNSPECIFIED,VERY_UNLIKELY,UNLIKELY,POSSIBLE,LIKELY,VERY_LIKELY}
gguuss marked this conversation as resolved.
Show resolved Hide resolved
A string representing the minimum likelihood threshold
that constitutes a match.
--mime_type MIME_TYPE
The MIME type of the file. If not specified, the type
is inferred via the Python standard library's
mimetypes module.



Metadata
Expand Down
116 changes: 97 additions & 19 deletions dlp/redact.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,23 +121,87 @@ def redact_image(

# [END dlp_redact_image]

# [START dlp_redact_image_all_text]

if __name__ == "__main__":
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

parser = argparse.ArgumentParser(description=__doc__)
def redact_image_all_text(
    project,
    filename,
    output_filename,
):
    """Uses the Data Loss Prevention API to redact all text in an image.

    Args:
        project: The Google Cloud project id to use as a parent resource.
        filename: The path to the file to inspect.
        output_filename: The path to which the redacted image will be written.

    Returns:
        None; the redacted image is written to output_filename and a
        one-line summary of the bytes written is printed to the terminal.
    """
    # Import the client library
    import google.cloud.dlp

    # Instantiate a client.
    dlp = google.cloud.dlp_v2.DlpServiceClient()

    # Construct the image_redaction_configs, indicating to DLP that all text in
    # the input image should be redacted.
    image_redaction_configs = [{
        "redact_all_text": True,
    }]

    # Construct the byte_item, containing the file's byte data. The type is
    # always sent as the generic "IMAGE" value; this function does not derive
    # a MIME type from the filename.
    with open(filename, mode="rb") as f:
        byte_item = {"type": "IMAGE", "data": f.read()}

    # Convert the project id into a full resource id.
    parent = dlp.project_path(project)

    # Call the API.
    response = dlp.redact_image(
        parent,
        image_redaction_configs=image_redaction_configs,
        byte_item=byte_item,
    )

    # Write out the results.
    with open(output_filename, mode="wb") as f:
        f.write(response.redacted_image)

    # The printed summary includes the output path; callers (and the sample's
    # test) rely on that path appearing in stdout.
    print("Wrote {byte_count} to {filename}".format(
        byte_count=len(response.redacted_image), filename=output_filename))


# [END dlp_redact_image_all_text]

if __name__ == "__main__":
default_project = os.environ.get("GOOGLE_CLOUD_PROJECT")

common_args_parser = argparse.ArgumentParser(add_help=False)
common_args_parser.add_argument(
"--project",
help="The Google Cloud project id to use as a parent resource.",
default=default_project,
)
parser.add_argument(
common_args_parser.add_argument(
"filename", help="The path to the file to inspect.")
common_args_parser.add_argument(
"output_filename",
help="The path to which the redacted image will be written.",
)

parser = argparse.ArgumentParser(description=__doc__)
subparsers = parser.add_subparsers(
dest="content", help="Select which content should be redacted.")
subparsers.required = True

info_types_parser = subparsers.add_parser(
"info_types",
help="Redact specific infoTypes from an image.",
parents=[common_args_parser],
)
info_types_parser.add_argument(
"--info_types",
nargs="+",
help="Strings representing info types to look for. A full list of "
Expand All @@ -146,7 +210,7 @@ def redact_image(
"If unspecified, the three above examples will be used.",
default=["FIRST_NAME", "LAST_NAME", "EMAIL_ADDRESS"],
)
parser.add_argument(
info_types_parser.add_argument(
"--min_likelihood",
choices=[
"LIKELIHOOD_UNSPECIFIED",
Expand All @@ -159,19 +223,33 @@ def redact_image(
help="A string representing the minimum likelihood threshold that "
"constitutes a match.",
)
parser.add_argument(
info_types_parser.add_argument(
"--mime_type",
help="The MIME type of the file. If not specified, the type is "
"inferred via the Python standard library's mimetypes module.",
)

all_text_parser = subparsers.add_parser(
"all_text",
help="Redact all text from an image. The MIME type of the file is "
"inferred via the Python standard library's mimetypes module.",
parents=[common_args_parser],
)

args = parser.parse_args()

redact_image(
args.project,
args.filename,
args.output_filename,
args.info_types,
min_likelihood=args.min_likelihood,
mime_type=args.mime_type,
)
if args.content == "info_types":
redact_image(
args.project,
args.filename,
args.output_filename,
args.info_types,
min_likelihood=args.min_likelihood,
mime_type=args.mime_type,
)
elif args.content == "all_text":
redact_image_all_text(
args.project,
args.filename,
args.output_filename,
)
14 changes: 14 additions & 0 deletions dlp/redact_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,17 @@ def test_redact_image_file(tempdir, capsys):

out, _ = capsys.readouterr()
assert output_filepath in out


def test_redact_image_all_text(tempdir, capsys):
    """Redact all text in the sample PNG and check the output path is reported.

    The redacted image is written into the temporary directory supplied by
    the ``tempdir`` fixture; the test passes when the path of that file
    appears in the text captured from stdout.
    """
    source_image = os.path.join(RESOURCE_DIRECTORY, "test.png")
    redacted_path = os.path.join(tempdir, "redacted.png")

    redact.redact_image_all_text(
        project=GCLOUD_PROJECT,
        filename=source_image,
        output_filename=redacted_path,
    )

    # Only stdout matters here; stderr is read and discarded.
    captured = capsys.readouterr()
    assert redacted_path in captured.out