From e7c9cb0fc3152a33b84a2ab7229cd6a3e9754e67 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Sat, 26 Aug 2017 12:09:49 +0200
Subject: [PATCH] Attempt to improve the `EI` detection heuristics, for inline
 images, in streams containing `NUL` bytes (issue 8823)

Since this patch will now treat (some) `NUL` bytes as "ASCII", the number of `followingBytes` checked is thus increased to (hopefully) reduce the risk of introducing new false positives.

Fixes 8823.
---
 src/core/parser.js      |  20 +++++++++++++++++---
 test/pdfs/.gitignore    |   1 +
 test/pdfs/issue8823.pdf | Bin 0 -> 1771 bytes
 test/test_manifest.json |  10 ++++++++--
 4 files changed, 26 insertions(+), 5 deletions(-)
 create mode 100644 test/pdfs/issue8823.pdf

diff --git a/src/core/parser.js b/src/core/parser.js
index db0c4533d61433..954bec8ba9b75c 100644
--- a/src/core/parser.js
+++ b/src/core/parser.js
@@ -148,7 +148,8 @@ var Parser = (function ParserClosure() {
      * @returns {number} The inline stream length.
      */
     findDefaultInlineStreamEnd(stream) {
-      const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD, n = 5;
+      const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD;
+      const n = 10, NUL = 0x0;
       let startPos = stream.pos, state = 0, ch, maybeEIPos;
       while ((ch = stream.getByte()) !== -1) {
         if (state === 0) {
@@ -159,10 +160,23 @@ var Parser = (function ParserClosure() {
           assert(state === 2);
           if (ch === SPACE || ch === LF || ch === CR) {
             maybeEIPos = stream.pos;
-            // Let's check the next `n` bytes are ASCII... just be sure.
+            // Let's check that the next `n` bytes are ASCII... just to be sure.
             let followingBytes = stream.peekBytes(n);
-            for (let i = 0; i < n; i++) {
+            for (let i = 0, ii = followingBytes.length; i < ii; i++) {
               ch = followingBytes[i];
+              if (ch === NUL && followingBytes[i + 1] !== NUL) {
+                // NUL bytes are not supposed to occur *outside* of inline
+                // images, but some PDF generators violate that assumption,
+                // thus breaking the EI detection heuristics used below.
+                //
+                // However, we can't unconditionally treat NUL bytes as "ASCII",
+                // since that *could* result in inline images being truncated.
+                //
+                // To attempt to address this, we'll still treat any *sequence*
+                // of NUL bytes as non-ASCII, but for a *single* NUL byte we'll
+                // continue checking the `followingBytes` (fixes issue8823.pdf).
+                continue;
+              }
               if (ch !== LF && ch !== CR && (ch < SPACE || ch > 0x7F)) {
                 // Not a LF, CR, SPACE or any visible ASCII character, i.e.
                 // it's binary stuff. Resetting the state.
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 8e19cdd2851710..78c790ade606c9 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -58,6 +58,7 @@
 !issue8697.pdf
 !issue8707.pdf
 !issue8798r.pdf
+!issue8823.pdf
 !bad-PageLabels.pdf
 !filled-background.pdf
 !ArabicCIDTrueType.pdf
diff --git a/test/pdfs/issue8823.pdf b/test/pdfs/issue8823.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..9878a62ba342cf5336d6b80ddc9a7f86bedd7d12
GIT binary patch
literal 1771
zcmdT_&2G~`5YBm|yu(~_r2FT!9Vv=b#|c<ak){X^Q4dS94Mwd^*A8lVBwmV3pP?rX
z%sM14$pQ63=}Pu`XJ^Lq?R+!ao2HW&;wXUL{oT(WAdqAE@iPntz~0?{vB*ZIHkaiJ
z*n8`$yeyD;Z8kM4TcITN84QPDi={X7>p#sKJz%f%WrcI))z5*A%5_nrcwp#v4h?~w
z*kx{#@&@Pph@*c(A?Y3U=}n%d##wT;iG%zWH7zR)nz-1^R^{3)C{W!9+CaU+_J40|
zv8q3z5OLo}RMpO!t1euF@~6R3MkqALfOA}2L6QNo52yr9uV{~^(HXMTe$5y5r8BoA
z$WE|!YfE3rbVN9_6u#5w(D)Z~7seTOPO)>19Sr^1aMVEqLO&Z2gaJ+|2}1f@aTGjm
z5QNVTK}#d`I2xrm_Y6FLdPZ%N(UQqSc?6mMKPV4*<-r{AYE#|&#m#m|c2)i~MZQT%
z*LU7c$J+K_-|Bw1wzl8hIC}vE68LUlsI}5~(cQ@enfif6D~sX-nINluJsCv%G7<Et
zZnyqwXYxzynu}#tmel<z%o(tZ&U3&MoyJM1r5Xp~Sn+srtd-JQY8lD69}MyDDs&l~
X<=-qpY<jqHb%R-GA)wbAzn#D@vD)S7

literal 0
HcmV?d00001

diff --git a/test/test_manifest.json b/test/test_manifest.json
index 1b39b8ad67aa31..de8aa12000f604 100644
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@@ -2976,9 +2976,15 @@
     {  "id": "issue8798",
        "file": "pdfs/issue8798r.pdf",
        "md5": "3a0e29f013d9edcceb5d852e37738a77",
+       "link": false,
+       "rounds": 1,
+       "type": "eq"
+    },
+    {  "id": "issue8823",
+       "file": "pdfs/issue8823.pdf",
+       "md5": "ad02d4aa374b315bf1766038d002d57a",
+       "link": false,
        "rounds": 1,
-       "lastPage": 1,
-       "link": true,
        "type": "eq"
     },
     {  "id": "issue8613",