Skip to content

Commit 5ff0818

Browse files
committed
Patch: Add __str__ and __bytes__ for undecoded content.
Patch.patch assumes all content to be encoded in UTF-8 and forcefully replaces any non-decodable sequences. This can lead to corruption for content that either does not conform to any specific encoding at all, or uses an encoding that is incompatible with, or ambiguous with, UTF-8. This change adds __str__ and __bytes__ implementations to Patch that return the unmodified, raw bytes.
1 parent 795adc7 commit 5ff0818

File tree

3 files changed

+118
-2
lines changed

3 files changed

+118
-2
lines changed

src/patch.c

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,11 @@ Patch_create_from(PyObject *self, PyObject *args, PyObject *kwds)
182182

183183

184184
PyDoc_STRVAR(Patch_patch__doc__,
185-
"Patch diff string. Can be None in some cases, such as empty commits.");
185+
"Patch diff string. Can be None in some cases, such as empty commits. "
186+
"Note that this decodes the content to unicode assuming UTF-8 encoding. "
187+
"For non-UTF-8 content that can lead be a lossy, non-reversible process. "
188+
"To access the raw, un-decoded patch, use `str(patch)` (Python 2), or "
189+
"`bytes(patch)` (Python 3).");
186190

187191
PyObject *
188192
Patch_patch__get__(Patch *self)
@@ -201,9 +205,55 @@ Patch_patch__get__(Patch *self)
201205
return py_patch;
202206
}
203207

208+
/*
 * tp_str implementation for Patch.
 *
 * Renders the wrapped git_patch into a git_buf via git_patch_to_buf() and
 * converts the result to a Python object:
 *   - Python 2: a byte string built from exactly buf.size raw bytes.
 *   - Python 3: whatever to_unicode(buf.ptr, NULL, NULL) produces (the
 *     accompanying tests expect UTF-8 decoding with errors='replace');
 *     use bytes(patch) to get the undecoded content.
 *
 * Returns a new reference, or the result of Error_set(err) when
 * git_patch_to_buf() reports a negative error code.
 */
PyObject *
Patch__str__(PyObject *self)
{
    git_buf buf = {NULL};
    PyObject *ret;
    int err;

    /* `self` arrives as a plain PyObject*, so it has to be cast to Patch*
     * before the `patch` member can be reached. */
    assert(((Patch *)self)->patch);

    err = git_patch_to_buf(&buf, ((Patch *)self)->patch);
    if (err < 0)
        return Error_set(err);

#if PY_MAJOR_VERSION == 2
    /* Same call Patch__bytes__ uses on Python 3; takes an explicit length
     * instead of relying on how the "s#" format interprets its argument. */
    ret = PyBytes_FromStringAndSize(buf.ptr, (Py_ssize_t)buf.size);
#else
    /* NOTE(review): only the pointer is handed over here, so the decoded
     * length presumably depends on NUL termination -- confirm. */
    ret = to_unicode(buf.ptr, NULL, NULL);
#endif

    /* The Python object owns its own copy; release the libgit2 buffer. */
    git_buf_free(&buf);
    return ret;
}
228+
229+
PyDoc_STRVAR(Patch__bytes____doc__, "The raw bytes of the patch's contents.");
230+
231+
/*
 * Patch.__bytes__(): return the patch content as raw, undecoded bytes.
 *
 * On Python 2 this simply forwards to Patch__str__(). On Python 3 the
 * patch is rendered with git_patch_to_buf() and copied into a bytes object
 * of exactly buf.size bytes.
 *
 * Returns a new reference, or the result of Error_set(err) when
 * git_patch_to_buf() reports a negative error code.
 */
PyObject *
Patch__bytes__(PyObject *self)
{
#if PY_MAJOR_VERSION == 2
    return Patch__str__(self);
#else
    git_buf buf = {NULL};
    PyObject *bytes;
    int err;

    /* `self` arrives as a plain PyObject*, so it has to be cast to Patch*
     * before the `patch` member can be reached. */
    assert(((Patch *)self)->patch);

    err = git_patch_to_buf(&buf, ((Patch *)self)->patch);
    if (err < 0)
        return Error_set(err);

    bytes = PyBytes_FromStringAndSize(buf.ptr, buf.size);

    /* The bytes object holds its own copy; release the libgit2 buffer. */
    git_buf_free(&buf);
    return bytes;
#endif
}
251+
204252
PyMethodDef Patch_methods[] = {
205253
{"create_from", (PyCFunction) Patch_create_from,
206254
METH_KEYWORDS | METH_VARARGS | METH_STATIC, Patch_create_from__doc__},
255+
{"__bytes__", (PyCFunction) Patch__bytes__,
256+
METH_NOARGS, Patch__bytes____doc__},
207257
{NULL}
208258
};
209259

@@ -237,7 +287,7 @@ PyTypeObject PatchType = {
237287
0, /* tp_as_mapping */
238288
0, /* tp_hash */
239289
0, /* tp_call */
240-
0, /* tp_str */
290+
Patch__str__, /* tp_str */
241291
0, /* tp_getattro */
242292
0, /* tp_setattro */
243293
0, /* tp_as_buffer */

test/data/encoding.tar

49 KB
Binary file not shown.

test/test_patch.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from __future__ import absolute_import
2929
from __future__ import unicode_literals
3030

31+
import six
32+
3133
import pygit2
3234
from . import utils
3335

@@ -84,6 +86,70 @@
8486
"""
8587

8688

89+
class PatchEncodingTest(utils.AutoRepoTestCase):
    """Check that non-UTF-8 patch content can be read back undecoded."""

    repo_spec = 'tar', 'encoding'

    # Raw expected diff. The byte \xf8 comes from the ISO-8859-1 test
    # content and is kept as-is, so this literal must stay a bytes object.
    expected_diff = b"""diff --git a/iso-8859-1.txt b/iso-8859-1.txt
index e84e339..201e0c9 100644
--- a/iso-8859-1.txt
+++ b/iso-8859-1.txt
@@ -1 +1,2 @@
 Kristian H\xf8gsberg
+foo
"""

    def _assert_raw_content_preserved(self, patch):
        """Run the encoding assertions shared by every test in this class."""
        # `patch.patch` corrupted the ISO-8859-1 content as it forced UTF-8
        # decoding, so assert that we cannot get the original content back:
        self.assertNotEqual(patch.patch.encode('utf8'), self.expected_diff)

        if six.PY2:
            # On Python 2 both str() and __bytes__() yield the raw bytes.
            self.assertIsInstance(str(patch), str)
            self.assertEqual(str(patch), self.expected_diff)

            self.assertIsInstance(patch.__bytes__(), str)
            self.assertEqual(patch.__bytes__(), self.expected_diff)
        else:
            # On Python 3 only bytes() is lossless; str() is a decoded view
            # with undecodable sequences replaced.
            self.assertIsInstance(str(patch), str)
            self.assertEqual(bytes(patch), self.expected_diff)
            self.assertEqual(
                str(patch),
                str(self.expected_diff, 'utf8', errors='replace'),
            )

    def test_patch_from_non_utf8(self):
        # blobs encoded in ISO-8859-1
        old_content = b'Kristian H\xf8gsberg\n'
        new_content = old_content + b'foo\n'
        patch = pygit2.Patch.create_from(
            old_content,
            new_content,
            old_as_path='iso-8859-1.txt',
            new_as_path='iso-8859-1.txt',
        )
        self._assert_raw_content_preserved(patch)

    def test_patch_create_from_blobs(self):
        patch = pygit2.Patch.create_from(
            self.repo['e84e339ac7fcc823106efa65a6972d7a20016c85'],
            self.repo['201e0c908e3d9f526659df3e556c3d06384ef0df'],
            old_as_path='iso-8859-1.txt',
            new_as_path='iso-8859-1.txt',
        )
        self._assert_raw_content_preserved(patch)
151+
152+
87153
class PatchTest(utils.RepoTestCase):
88154

89155
def test_patch_create_from_buffers(self):

0 commit comments

Comments
 (0)