From 61d39d8610cdc9542d0f0ab35980959a562617bb Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 30 Jun 2017 11:14:42 -0400 Subject: [PATCH 01/10] Fixes non UTF-8 surrogateescapes Lone surrogate code points in a Unicode body (which cannot be encoded as strict UTF-8) will be escaped with backslashes when encountered, instead of breaking the transport layer. --- elasticsearch/transport.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/elasticsearch/transport.py b/elasticsearch/transport.py index 012c5a3c3..3ab901827 100644 --- a/elasticsearch/transport.py +++ b/elasticsearch/transport.py @@ -293,6 +293,9 @@ def perform_request(self, method, url, params=None, body=None): if body is not None: try: body = body.encode('utf-8') + except UnicodeEncodeError as e: + if e.reason == 'surrogates not allowed': + body = body.encode('utf-8', "backslashreplace").decode('utf-8') except (UnicodeDecodeError, AttributeError): # bytes/str - no need to re-encode pass From 047085874202817b8c53e3e9b0c21f5fc45d5b7b Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 30 Jun 2017 11:26:38 -0400 Subject: [PATCH 02/10] Removes erroneous bytes decode and reraises Fixes to re-raise exceptions with different reasons Removes erroneous bytes decode where bytes are desired --- elasticsearch/transport.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/elasticsearch/transport.py b/elasticsearch/transport.py index 3ab901827..f0534e85d 100644 --- a/elasticsearch/transport.py +++ b/elasticsearch/transport.py @@ -295,7 +295,10 @@ def perform_request(self, method, url, params=None, body=None): body = body.encode('utf-8') except UnicodeEncodeError as e: if e.reason == 'surrogates not allowed': - body = body.encode('utf-8', "backslashreplace").decode('utf-8') + body = body.encode('utf-8', "backslashreplace") + pass + + raise e except (UnicodeDecodeError, AttributeError): # bytes/str - no need to re-encode pass From c6fa87bc3fb950dcf7c47abdc1b9fe31a4ed7753 Mon Sep 17 00:00:00 2001 From: Tyler 
James Harden Date: Fri, 30 Jun 2017 11:52:18 -0400 Subject: [PATCH 03/10] Adds test for surrogate escapes in body Tests that a surrogate escape sequence is properly escaped with backslashes to produce valid UTF-8. --- test_elasticsearch/test_transport.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test_elasticsearch/test_transport.py b/test_elasticsearch/test_transport.py index 74f52fd43..de5f849c1 100644 --- a/test_elasticsearch/test_transport.py +++ b/test_elasticsearch/test_transport.py @@ -96,7 +96,7 @@ def test_body_gets_encoded_into_bytes(self): t.perform_request('GET', '/', body='你好') self.assertEquals(1, len(t.get_connection().calls)) self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd'), t.get_connection().calls[0][0]) - + def test_body_bytes_get_passed_untouched(self): t = Transport([{}], connection_class=DummyConnection) @@ -105,6 +105,13 @@ def test_body_bytes_get_passed_untouched(self): self.assertEquals(1, len(t.get_connection().calls)) self.assertEquals(('GET', '/', None, body), t.get_connection().calls[0][0]) + def test_body_surrogates_replaced_encoded_into_bytes(self): + t = Transport([{}], connection_class=DummyConnection) + + t.perform_request('GET', '/', body='你好\udd9e') + self.assertEquals(1, len(t.get_connection().calls)) + self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\\udd9e'), t.get_connection().calls[0][0]) + def test_kwargs_passed_on_to_connections(self): t = Transport([{'host': 'google.com'}], port=123) self.assertEquals(1, len(t.connection_pool.connections)) From 1d8c0e9573ddbaeab722aec583c0a21f07d14fcc Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 30 Jun 2017 12:01:37 -0400 Subject: [PATCH 04/10] Use proper byte sequence for surrogate --- test_elasticsearch/test_transport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_elasticsearch/test_transport.py b/test_elasticsearch/test_transport.py index de5f849c1..f3d79ce80 100644 --- 
a/test_elasticsearch/test_transport.py +++ b/test_elasticsearch/test_transport.py @@ -110,7 +110,7 @@ def test_body_surrogates_replaced_encoded_into_bytes(self): t.perform_request('GET', '/', body='你好\udd9e') self.assertEquals(1, len(t.get_connection().calls)) - self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\\udd9e'), t.get_connection().calls[0][0]) + self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\xed\xb6\x9e'), t.get_connection().calls[0][0]) def test_kwargs_passed_on_to_connections(self): t = Transport([{'host': 'google.com'}], port=123) From 4fa702341ea00aa00acdd9961213ee0bcfb1d19f Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 30 Jun 2017 12:14:41 -0400 Subject: [PATCH 05/10] Use if/else versus pass --- elasticsearch/transport.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/elasticsearch/transport.py b/elasticsearch/transport.py index f0534e85d..47fda2ff8 100644 --- a/elasticsearch/transport.py +++ b/elasticsearch/transport.py @@ -296,9 +296,8 @@ def perform_request(self, method, url, params=None, body=None): except UnicodeEncodeError as e: if e.reason == 'surrogates not allowed': body = body.encode('utf-8', "backslashreplace") - pass - - raise e + else: + raise e except (UnicodeDecodeError, AttributeError): # bytes/str - no need to re-encode pass From b90231f3852e9e29d6974297cefa663668b7a840 Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 7 Jul 2017 11:01:42 -0400 Subject: [PATCH 06/10] Proper Unicode surrogate escape Use a Unicode Surrogate that properly escapes in both Python2 and Python3 --- test_elasticsearch/test_transport.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_elasticsearch/test_transport.py b/test_elasticsearch/test_transport.py index f3d79ce80..f6fa5d459 100644 --- a/test_elasticsearch/test_transport.py +++ b/test_elasticsearch/test_transport.py @@ -108,9 +108,9 @@ def test_body_bytes_get_passed_untouched(self): def 
test_body_surrogates_replaced_encoded_into_bytes(self): t = Transport([{}], connection_class=DummyConnection) - t.perform_request('GET', '/', body='你好\udd9e') + t.perform_request('GET', '/', body='你好\uda6a') self.assertEquals(1, len(t.get_connection().calls)) - self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\xed\xb6\x9e'), t.get_connection().calls[0][0]) + self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\\uda6a'), t.get_connection().calls[0][0]) def test_kwargs_passed_on_to_connections(self): t = Transport([{'host': 'google.com'}], port=123) From cf0672d1ffc54823220cde1b897a153298b48d10 Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 7 Jul 2017 11:49:20 -0400 Subject: [PATCH 07/10] Passing test once surrogatepass is used Updating test to pass once surrogatepass is used --- test_elasticsearch/test_transport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_elasticsearch/test_transport.py b/test_elasticsearch/test_transport.py index f6fa5d459..de5ec6d53 100644 --- a/test_elasticsearch/test_transport.py +++ b/test_elasticsearch/test_transport.py @@ -110,7 +110,7 @@ def test_body_surrogates_replaced_encoded_into_bytes(self): t.perform_request('GET', '/', body='你好\uda6a') self.assertEquals(1, len(t.get_connection().calls)) - self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\\uda6a'), t.get_connection().calls[0][0]) + self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd\xed\xa9\xaa'), t.get_connection().calls[0][0]) def test_kwargs_passed_on_to_connections(self): t = Transport([{'host': 'google.com'}], port=123) From 6259cb370401d9778d36352a60bd6aa4cd574929 Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 7 Jul 2017 11:50:06 -0400 Subject: [PATCH 08/10] Use surrogatepass instead of backslashreplace This replicates behavior between Python 2 and Python 3 --- elasticsearch/transport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/elasticsearch/transport.py b/elasticsearch/transport.py index 47fda2ff8..df0b35933 100644 --- a/elasticsearch/transport.py +++ b/elasticsearch/transport.py @@ -295,7 +295,7 @@ def perform_request(self, method, url, params=None, body=None): body = body.encode('utf-8') except UnicodeEncodeError as e: if e.reason == 'surrogates not allowed': - body = body.encode('utf-8', "backslashreplace") + body = body.encode('utf-8', "surrogatepass") else: raise e except (UnicodeDecodeError, AttributeError): From 05c2b0acad96c73ee67838124d7d08847a632cc0 Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 7 Jul 2017 13:33:40 -0400 Subject: [PATCH 09/10] Fixes whitespace --- test_elasticsearch/test_transport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_elasticsearch/test_transport.py b/test_elasticsearch/test_transport.py index de5ec6d53..328325c1c 100644 --- a/test_elasticsearch/test_transport.py +++ b/test_elasticsearch/test_transport.py @@ -96,7 +96,7 @@ def test_body_gets_encoded_into_bytes(self): t.perform_request('GET', '/', body='你好') self.assertEquals(1, len(t.get_connection().calls)) self.assertEquals(('GET', '/', None, b'\xe4\xbd\xa0\xe5\xa5\xbd'), t.get_connection().calls[0][0]) - + def test_body_bytes_get_passed_untouched(self): t = Transport([{}], connection_class=DummyConnection) From 41fb14550c9732dc3708c82e74d5658c78c35d6e Mon Sep 17 00:00:00 2001 From: Tyler James Harden Date: Fri, 7 Jul 2017 14:24:47 -0400 Subject: [PATCH 10/10] Simplifies with no exception block Since the `surrogatepass` error handler only takes effect when surrogate code points are encountered, there is no need to let the error be raised and then catch it. Also uses single quotes for consistency. 
--- elasticsearch/transport.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/elasticsearch/transport.py b/elasticsearch/transport.py index df0b35933..dc8cd8911 100644 --- a/elasticsearch/transport.py +++ b/elasticsearch/transport.py @@ -292,12 +292,7 @@ def perform_request(self, method, url, params=None, body=None): if body is not None: try: - body = body.encode('utf-8') - except UnicodeEncodeError as e: - if e.reason == 'surrogates not allowed': - body = body.encode('utf-8', "surrogatepass") - else: - raise e + body = body.encode('utf-8', 'surrogatepass') except (UnicodeDecodeError, AttributeError): # bytes/str - no need to re-encode pass