Merge pull request #120 from lubomir/unicode-boundary

Fix #119
release-engineering · Jan 10, 2020 · 047e9ed · 047e9ed
2 parents 507cecd + bb830f2
commit 047e9ed
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 1 deletion.
diff --git a/kobo/shortcuts.py b/kobo/shortcuts.py
@@ -338,6 +338,8 @@ def run(self):
 
         output = "" if is_text_mode else b""
         sentinel = "" if is_text_mode else b""
+        leftover = None
+        exception = None
         while True:
             if buffer_size == -1:
                 lines = proc.stdout.readline()
@@ -353,16 +355,42 @@ def run(self):
 
             if lines == sentinel:
                 break
+
+            if leftover:
+                lines = leftover + lines
+                leftover = None
+
             if stdout:
                 if not is_text_mode:
-                    sys.stdout.write(lines.decode(encoding))
+                    try:
+                        sys.stdout.write(lines.decode(encoding))
+                    except UnicodeDecodeError as exc:
+                        if exc.reason != "unexpected end of data":
+                            # This error was not caused by us. If there is an
+                            # incomplete sequence in the middle of the string,
+                            # we would get "invalid continuation byte".
+                            raise
+                        # We split the chunk in the middle of a multibyte
+                        # sequence. Print text until this character, and save
+                        # the rest for later. It will be prepended to the next
+                        # chunk. If there is no next chunk, we will re-raise
+                        # the error.
+                        exception = exc
+                        leftover = lines[exc.start:]
+                        lines = lines[:exc.start]
+                        sys.stdout.write(lines.decode(encoding))
                 else:
                     sys.stdout.write(lines)
             if logfile:
                 log.write(lines)
             if return_stdout:
                 output += lines
         proc.wait()
+        if leftover:
+            # There is some data left over. That means there was an unfinished
+            # multibyte sequence not caused by our splitting. Let's raise the
+            # stored exception to report it.
+            raise exception
 
     finally:
         if logfile:

diff --git a/tests/test_shortcuts.py b/tests/test_shortcuts.py
@@ -221,6 +221,24 @@ def test_run_show_cmd_logfile_stdout(self, mock_out):
         self.assertEqual(mock_out.getvalue(),
                          'COMMAND: echo foo\n-----------------\nfoo\n')
 
+    def test_run_split_in_middle_of_utf8_sequence(self):  # valid multibyte output must pass through run() intact
+        cmd = "printf ' ' && bash -c \"printf 'č%.0s' {1..10000}\""  # one space, then 'č' (2 bytes) 10000 times
+        ret, out = run(cmd, stdout=True)  # NOTE(review): the space presumably misaligns chunk boundaries - confirm
+        self.assertEqual(ret, 0)
+        self.assertEqual(out, b" " + b"\xc4\x8d" * 10000)  # returned bytes are complete and unmodified
+
+    def test_run_chunk_ends_with_incomplete_char(self):  # output ends with \xc4, a sequence that is never completed
+        cmd = "bash -c \"printf 'a b \\xc4'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)  # the dangling byte must raise, not vanish
+
+    def test_run_chunk_with_incomplete_char_in_middle(self):  # \xc4 is followed by a space, not a continuation byte
+        cmd = "bash -c \"printf 'a \\xc4 b'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)  # mid-stream corruption must still raise
+
+    def test_run_other_unicode_decode_error(self):  # \x80 cannot start a UTF-8 sequence
+        cmd = "bash -c \"printf 'a \\x80 b'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)  # unrelated decode errors still propagate
+
     @mock.patch('sys.stdout', new_callable=StringIO)
     def test_run_univ_nl_logfile_stdout(self, mock_out):
         logfile = os.path.join(self.tmp_dir, 'output.log')