Skip to content

Commit a38ca19

Browse files
authored
fix: Redact sensitive data from OTEL traces and fix env var parsing (#1553)
fix: Redact sensitive data from OTEL traces and fix env var parsing (#1553)
1 parent 93ce515 commit a38ca19

File tree

6 files changed

+201
-57
lines changed

6 files changed

+201
-57
lines changed

google/cloud/storage/_experimental/asyncio/async_multi_range_downloader.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -270,9 +270,10 @@ async def download_ranges(
270270
client_checksum = int.from_bytes(client_crc32c, "big")
271271

272272
if server_checksum != client_checksum:
273-
raise DataCorruption(response,
273+
raise DataCorruption(
274+
response,
274275
f"Checksum mismatch for read_id {object_data_range.read_range.read_id}. "
275-
f"Server sent {server_checksum}, client calculated {client_checksum}."
276+
f"Server sent {server_checksum}, client calculated {client_checksum}.",
276277
)
277278

278279
read_id = object_data_range.read_range.read_id

google/cloud/storage/_opentelemetry_tracing.py

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import os
1919

2020
from contextlib import contextmanager
21-
21+
from urllib.parse import urlparse
2222
from google.api_core import exceptions as api_exceptions
2323
from google.api_core import retry as api_retry
2424
from google.cloud.storage import __version__
@@ -28,7 +28,15 @@
2828
ENABLE_OTEL_TRACES_ENV_VAR = "ENABLE_GCS_PYTHON_CLIENT_OTEL_TRACES"
2929
_DEFAULT_ENABLE_OTEL_TRACES_VALUE = False
3030

31-
enable_otel_traces = os.environ.get(
31+
32+
def _parse_bool_env(name: str, default: bool = False) -> bool:
33+
val = os.environ.get(name, None)
34+
if val is None:
35+
return default
36+
return str(val).strip().lower() in {"1", "true", "yes", "on"}
37+
38+
39+
enable_otel_traces = _parse_bool_env(
3240
ENABLE_OTEL_TRACES_ENV_VAR, _DEFAULT_ENABLE_OTEL_TRACES_VALUE
3341
)
3442
logger = logging.getLogger(__name__)
@@ -105,15 +113,37 @@ def _set_api_request_attr(request, client):
105113
if request.get("method"):
106114
attr["http.request.method"] = request.get("method")
107115
if request.get("path"):
108-
path = request.get("path")
109-
full_path = f"{client._connection.API_BASE_URL}{path}"
110-
attr["url.full"] = full_path
111-
if request.get("timeout"):
112-
attr["connect_timeout,read_timeout"] = request.get("timeout")
116+
full_url = client._connection.build_api_url(request.get("path"))
117+
attr.update(_get_opentelemetry_attributes_from_url(full_url, strip_query=True))
118+
if "timeout" in request:
119+
attr["connect_timeout,read_timeout"] = str(request.get("timeout"))
113120
return attr
114121

115122

116123
def _set_retry_attr(retry, conditional_predicate=None):
117124
predicate = conditional_predicate if conditional_predicate else retry._predicate
118125
retry_info = f"multiplier{retry._multiplier}/deadline{retry._deadline}/max{retry._maximum}/initial{retry._initial}/predicate{predicate}"
119126
return {"retry": retry_info}
127+
128+
129+
def _get_opentelemetry_attributes_from_url(url, strip_query=True):
130+
"""Helper to assemble OpenTelemetry span attributes from a URL."""
131+
u = urlparse(url)
132+
netloc = u.netloc
133+
# u.hostname is always lowercase. We parse netloc to preserve casing.
134+
# netloc format: [userinfo@]host[:port]
135+
if "@" in netloc:
136+
netloc = netloc.split("@", 1)[1]
137+
if ":" in netloc and not netloc.endswith("]"): # Handle IPv6 literal
138+
netloc = netloc.split(":", 1)[0]
139+
140+
attributes = {
141+
"server.address": netloc,
142+
"server.port": u.port,
143+
"url.scheme": u.scheme,
144+
"url.path": u.path,
145+
}
146+
if not strip_query:
147+
attributes["url.query"] = u.query
148+
149+
return attributes

google/cloud/storage/blob.py

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@
4747
from google.cloud._helpers import _rfc3339_nanos_to_datetime
4848
from google.cloud._helpers import _to_bytes
4949
from google.cloud.exceptions import NotFound
50+
from google.cloud.storage._opentelemetry_tracing import (
51+
_get_opentelemetry_attributes_from_url,
52+
)
5053
from google.cloud.storage._helpers import _add_etag_match_headers
5154
from google.cloud.storage._helpers import _add_generation_match_parameters
5255
from google.cloud.storage._helpers import _PropertyMixin
@@ -1055,13 +1058,11 @@ def _do_download(
10551058
Please enable this as per your use case.
10561059
"""
10571060

1058-
extra_attributes = {
1059-
"url.full": download_url,
1060-
"download.chunk_size": f"{self.chunk_size}",
1061-
"download.raw_download": raw_download,
1062-
"upload.checksum": f"{checksum}",
1063-
"download.single_shot_download": single_shot_download,
1064-
}
1061+
extra_attributes = _get_opentelemetry_attributes_from_url(download_url)
1062+
extra_attributes["download.chunk_size"] = f"{self.chunk_size}"
1063+
extra_attributes["download.raw_download"] = raw_download
1064+
extra_attributes["upload.checksum"] = f"{checksum}"
1065+
extra_attributes["download.single_shot_download"] = single_shot_download
10651066
args = {"timeout": timeout}
10661067

10671068
if self.chunk_size is None:
@@ -2048,10 +2049,8 @@ def _do_multipart_upload(
20482049
upload_url, headers=headers, checksum=checksum, retry=retry
20492050
)
20502051

2051-
extra_attributes = {
2052-
"url.full": upload_url,
2053-
"upload.checksum": f"{checksum}",
2054-
}
2052+
extra_attributes = _get_opentelemetry_attributes_from_url(upload_url)
2053+
extra_attributes["upload.checksum"] = f"{checksum}"
20552054
args = {"timeout": timeout}
20562055
with create_trace_span(
20572056
name="Storage.MultipartUpload/transmit",
@@ -2448,11 +2447,10 @@ def _do_resumable_upload(
24482447
command=command,
24492448
crc32c_checksum_value=crc32c_checksum_value,
24502449
)
2451-
extra_attributes = {
2452-
"url.full": upload.resumable_url,
2453-
"upload.chunk_size": upload.chunk_size,
2454-
"upload.checksum": f"{checksum}",
2455-
}
2450+
extra_attributes = _get_opentelemetry_attributes_from_url(upload.resumable_url)
2451+
extra_attributes["upload.chunk_size"] = upload.chunk_size
2452+
extra_attributes["upload.checksum"] = f"{checksum}"
2453+
24562454
args = {"timeout": timeout}
24572455
with create_trace_span(
24582456
name="Storage.ResumableUpload/transmitNextChunk",

tests/unit/asyncio/test_async_multi_range_downloader.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -260,25 +260,37 @@ async def test_downloading_without_opening_should_throw_error(
260260
assert str(exc.value) == "Underlying bidi-gRPC stream is not open"
261261
assert not mrd.is_stream_open
262262

263-
@mock.patch("google.cloud.storage._experimental.asyncio.async_multi_range_downloader.google_crc32c")
264-
@mock.patch("google.cloud.storage._experimental.asyncio.async_grpc_client.AsyncGrpcClient.grpc_client")
263+
@mock.patch(
264+
"google.cloud.storage._experimental.asyncio.async_multi_range_downloader.google_crc32c"
265+
)
266+
@mock.patch(
267+
"google.cloud.storage._experimental.asyncio.async_grpc_client.AsyncGrpcClient.grpc_client"
268+
)
265269
def test_init_raises_if_crc32c_c_extension_is_missing(
266270
self, mock_grpc_client, mock_google_crc32c
267271
):
268272
mock_google_crc32c.implementation = "python"
269273

270274
with pytest.raises(exceptions.NotFound) as exc_info:
271-
AsyncMultiRangeDownloader(
272-
mock_grpc_client, "bucket", "object"
273-
)
275+
AsyncMultiRangeDownloader(mock_grpc_client, "bucket", "object")
274276

275-
assert "The google-crc32c package is not installed with C support" in str(exc_info.value)
277+
assert "The google-crc32c package is not installed with C support" in str(
278+
exc_info.value
279+
)
276280

277281
@pytest.mark.asyncio
278-
@mock.patch("google.cloud.storage._experimental.asyncio.async_multi_range_downloader.Checksum")
279-
@mock.patch("google.cloud.storage._experimental.asyncio.async_grpc_client.AsyncGrpcClient.grpc_client")
280-
async def test_download_ranges_raises_on_checksum_mismatch(self, mock_client, mock_checksum_class):
281-
mock_stream = mock.AsyncMock(spec=async_read_object_stream._AsyncReadObjectStream)
282+
@mock.patch(
283+
"google.cloud.storage._experimental.asyncio.async_multi_range_downloader.Checksum"
284+
)
285+
@mock.patch(
286+
"google.cloud.storage._experimental.asyncio.async_grpc_client.AsyncGrpcClient.grpc_client"
287+
)
288+
async def test_download_ranges_raises_on_checksum_mismatch(
289+
self, mock_client, mock_checksum_class
290+
):
291+
mock_stream = mock.AsyncMock(
292+
spec=async_read_object_stream._AsyncReadObjectStream
293+
)
282294

283295
test_data = b"some-data"
284296
server_checksum = 12345
@@ -299,9 +311,7 @@ async def test_download_ranges_raises_on_checksum_mismatch(self, mock_client, mo
299311

300312
mock_stream.recv.side_effect = [mock_response, None]
301313

302-
mrd = AsyncMultiRangeDownloader(
303-
mock_client, "bucket", "object"
304-
)
314+
mrd = AsyncMultiRangeDownloader(mock_client, "bucket", "object")
305315
mrd.read_obj_str = mock_stream
306316
mrd._is_stream_open = True
307317

@@ -310,4 +320,3 @@ async def test_download_ranges_raises_on_checksum_mismatch(self, mock_client, mo
310320

311321
assert "Checksum mismatch" in str(exc_info.value)
312322
mock_checksum_class.assert_called_once_with(test_data)
313-

tests/unit/test__opentelemetry_tracing.py

Lines changed: 122 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ def setup_optin(mock_os_environ):
5858
importlib.reload(_opentelemetry_tracing)
5959

6060

61+
@pytest.fixture()
62+
def setup_optout(mock_os_environ):
63+
"""Mock envar to opt-in tracing for storage client."""
64+
mock_os_environ["ENABLE_GCS_PYTHON_CLIENT_OTEL_TRACES"] = "False"
65+
importlib.reload(_opentelemetry_tracing)
66+
67+
6168
def test_opentelemetry_not_installed(setup, monkeypatch):
6269
monkeypatch.setitem(sys.modules, "opentelemetry", None)
6370
importlib.reload(_opentelemetry_tracing)
@@ -83,6 +90,13 @@ def test_enable_trace_yield_span(setup, setup_optin):
8390
assert span is not None
8491

8592

93+
def test_disable_traces(setup, setup_optout):
94+
assert _opentelemetry_tracing.HAS_OPENTELEMETRY
95+
assert not _opentelemetry_tracing.enable_otel_traces
96+
with _opentelemetry_tracing.create_trace_span("No-ops for opentelemetry") as span:
97+
assert span is None
98+
99+
86100
def test_enable_trace_call(setup, setup_optin):
87101
from opentelemetry import trace as trace_api
88102

@@ -136,7 +150,7 @@ def test_get_final_attributes(setup, setup_optin):
136150
}
137151
api_request = {
138152
"method": "GET",
139-
"path": "/foo/bar/baz",
153+
"path": "/foo/bar/baz?sensitive=true",
140154
"timeout": (100, 100),
141155
}
142156
retry_obj = api_retry.Retry()
@@ -147,15 +161,19 @@ def test_get_final_attributes(setup, setup_optin):
147161
"rpc.system": "http",
148162
"user_agent.original": f"gcloud-python/{__version__}",
149163
"http.request.method": "GET",
150-
"url.full": "https://testOtel.org/foo/bar/baz",
151-
"connect_timeout,read_timeout": (100, 100),
164+
"server.address": "testOtel.org",
165+
"url.path": "/foo/bar/baz",
166+
"url.scheme": "https",
167+
"connect_timeout,read_timeout": str((100, 100)),
152168
"retry": f"multiplier{retry_obj._multiplier}/deadline{retry_obj._deadline}/max{retry_obj._maximum}/initial{retry_obj._initial}/predicate{retry_obj._predicate}",
153169
}
154170
expected_attributes.update(_opentelemetry_tracing._cloud_trace_adoption_attrs)
155171

156172
with mock.patch("google.cloud.storage.client.Client") as test_client:
157173
test_client.project = "test_project"
158-
test_client._connection.API_BASE_URL = "https://testOtel.org"
174+
test_client._connection.build_api_url.return_value = (
175+
"https://testOtel.org/foo/bar/baz?sensitive=true"
176+
)
159177
with _opentelemetry_tracing.create_trace_span(
160178
test_span_name,
161179
attributes=test_span_attributes,
@@ -165,6 +183,7 @@ def test_get_final_attributes(setup, setup_optin):
165183
) as span:
166184
assert span is not None
167185
assert span.name == test_span_name
186+
assert "url.query" not in span.attributes
168187
assert span.attributes == expected_attributes
169188

170189

@@ -196,23 +215,108 @@ def test_set_conditional_retry_attr(setup, setup_optin):
196215
assert span.attributes == expected_attributes
197216

198217

199-
def test_set_api_request_attr():
200-
from google.cloud.storage import Client
218+
def test__get_opentelemetry_attributes_from_url():
219+
url = "https://example.com:8080/path?query=true"
220+
expected = {
221+
"server.address": "example.com",
222+
"server.port": 8080,
223+
"url.scheme": "https",
224+
"url.path": "/path",
225+
}
226+
# Test stripping query
227+
attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url(
228+
url, strip_query=True
229+
)
230+
assert attrs == expected
231+
assert "url.query" not in attrs
232+
233+
# Test not stripping query
234+
expected["url.query"] = "query=true"
235+
attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url(
236+
url, strip_query=False
237+
)
238+
assert attrs == expected
201239

202-
test_client = Client()
203-
args_method = {"method": "GET"}
204-
expected_attributes = {"http.request.method": "GET"}
205-
attr = _opentelemetry_tracing._set_api_request_attr(args_method, test_client)
206-
assert attr == expected_attributes
207240

208-
args_path = {"path": "/foo/bar/baz"}
209-
expected_attributes = {"url.full": "https://storage.googleapis.com/foo/bar/baz"}
210-
attr = _opentelemetry_tracing._set_api_request_attr(args_path, test_client)
211-
assert attr == expected_attributes
241+
def test__get_opentelemetry_attributes_from_url_with_query():
242+
url = "https://example.com/path?query=true&another=false"
243+
expected = {
244+
"server.address": "example.com",
245+
"server.port": None,
246+
"url.scheme": "https",
247+
"url.path": "/path",
248+
"url.query": "query=true&another=false",
249+
}
250+
# Test not stripping query
251+
attrs = _opentelemetry_tracing._get_opentelemetry_attributes_from_url(
252+
url, strip_query=False
253+
)
254+
assert attrs == expected
212255

213-
args_timeout = {"timeout": (100, 100)}
256+
257+
def test_set_api_request_attr_with_pii_in_query():
258+
client = mock.Mock()
259+
client._connection.build_api_url.return_value = (
260+
"https://example.com/path?sensitive=true&token=secret"
261+
)
262+
263+
request = {
264+
"method": "GET",
265+
"path": "/path?sensitive=true&token=secret",
266+
"timeout": 60,
267+
}
214268
expected_attributes = {
215-
"connect_timeout,read_timeout": (100, 100),
269+
"http.request.method": "GET",
270+
"server.address": "example.com",
271+
"server.port": None,
272+
"url.scheme": "https",
273+
"url.path": "/path",
274+
"connect_timeout,read_timeout": "60",
216275
}
217-
attr = _opentelemetry_tracing._set_api_request_attr(args_timeout, test_client)
276+
attr = _opentelemetry_tracing._set_api_request_attr(request, client)
218277
assert attr == expected_attributes
278+
assert "url.query" not in attr # Ensure query with PII is not captured
279+
280+
281+
def test_set_api_request_attr_no_timeout():
282+
client = mock.Mock()
283+
client._connection.build_api_url.return_value = "https://example.com/path"
284+
285+
request = {"method": "GET", "path": "/path"}
286+
attr = _opentelemetry_tracing._set_api_request_attr(request, client)
287+
assert "connect_timeout,read_timeout" not in attr
288+
289+
290+
@pytest.mark.parametrize(
291+
"env_value, default, expected",
292+
[
293+
# Test default values when env var is not set
294+
(None, False, False),
295+
(None, True, True),
296+
# Test truthy values
297+
("1", False, True),
298+
("true", False, True),
299+
("yes", False, True),
300+
("on", False, True),
301+
("TRUE", False, True),
302+
(" Yes ", False, True),
303+
# Test falsy values
304+
("0", False, False),
305+
("false", False, False),
306+
("no", False, False),
307+
("off", False, False),
308+
("any_other_string", False, False),
309+
("", False, False),
310+
# Test with default=True and falsy values
311+
("false", True, False),
312+
("0", True, False),
313+
],
314+
)
315+
def test__parse_bool_env(monkeypatch, env_value, default, expected):
316+
env_var_name = "TEST_ENV_VAR"
317+
monkeypatch.setenv(
318+
env_var_name, str(env_value)
319+
) if env_value is not None else monkeypatch.delenv(env_var_name, raising=False)
320+
321+
result = _opentelemetry_tracing._parse_bool_env(env_var_name, default)
322+
assert result is expected

0 commit comments

Comments
 (0)