From 433ff938fe2c76d12c24b7c58e7e8974a6014bf4 Mon Sep 17 00:00:00 2001
From: sl2c <akakii.fx@gmail.com>
Date: Thu, 28 Nov 2024 02:23:25 +0400
Subject: [PATCH] Version 2.5b: CCITT G4 ByteAlign: bugfix

---
 ccitt.py     | 63 +++++++++++++++++++++++++++++-----------------------
 pdffilter.py |  6 ++---
 pdffont.py   | 16 ++++++++-----
 pdfimage.py  | 57 ++++++++++++++++++++++++++++-------------------
 4 files changed, 82 insertions(+), 60 deletions(-)

diff --git a/ccitt.py b/ccitt.py
index daaa686..a0482ca 100644
--- a/ccitt.py
+++ b/ccitt.py
@@ -1,6 +1,5 @@
 # CCITT Group4 Decoder
 
-
 class Group4Decoder(object):
     '''
     An implementation of the CCITT Group 4 (T.6) decoder. See:
@@ -9,9 +8,6 @@ class Group4Decoder(object):
     CONTROL FUNCTIONS FOR GROUP 4 FACSIMILE APPARATUS
     '''
 
-    WHITE = 0
-    BLACK = 1
-
     EOFB = '000000000001000000000001'
 
     MODES_ENCODE = {
@@ -79,32 +75,36 @@ class Group4Decoder(object):
 
     # ---------------------------------------------------------------------------------------- decode()
 
-    def decode(self, data:bytes, columns:int, byteAlign:bool = False):
+    def decode(self, data:bytes, Columns:int, EncodedByteAlign:bool = False):
         '''
         Decodes a CCITT Group 4 (T.6) encoded bytes stream. Returns decoded
         bitonal image pixel data as a bytes stream, which consists of a sequence of lines,
         each line consisting of a sequence of bits, contiguously packed,
         with ends of lines padded with 0-bits to whole bytes, if necessary.
         '''
+
         MODES = self.MODES_DECODE
+        WHITE, BLACK = 0, 1
+
+        toBytes = lambda bits: b''.join(int(bits[i:i+8],2).to_bytes(1,'big') for i in range(0,len(bits),8))
 
         # Bit streams
         inBits = ''.join(f'{d:08b}' for d in data)
         inPos = 0
         outBits = ''
         peek = lambda i: inBits[inPos:inPos+i]
-        getBit = lambda color: '0' if color == self.WHITE else '1'
+        getBit = lambda color: '0' if color == WHITE else '1'
 
         b = []
         a = []
-        a0 = -1
-        color = self.WHITE
+        a0 = 0
+        color = WHITE
         line = 0
 
         while True:
 
-            b1 = next((b1 for n,b1 in enumerate(b) if b1 > a0 and n%2 == color), columns)
-            b2 = next((b2 for n,b2 in enumerate(b) if b2 >= b1 and n%2 == color^1), columns)
+            b1 = next((b1 for n,b1 in enumerate(b) if b1 > a0 and n%2 == color), Columns)
+            b2 = next((b2 for n,b2 in enumerate(b) if b2 >= b1 and n%2 == color^1), Columns)
 
             l = None
             for i in range(1,8):
@@ -131,36 +131,42 @@ def decode(self, data:bytes, columns:int, byteAlign:bool = False):
                         # Vertical mode (flips color)
                         outBits += getBit(color)*(b1 + l - a0)
                         a0 = b1 + l
-                        if a0 < columns:
+                        if a0 < Columns:
                             a.append(a0)
                         color ^= 1
                     elif l == 'EXT':
-                        # Extensions, incl. uncompressed mode — implement this later when sample files are available
+                        # Extensions, incl. uncompressed mode: implement later when sample files are available
                         l = peek(3)
                         raise ValueError(f"Extension code not implemented: E{l:03b}")
                     
                     break
                     
             if l is None:
-                # Check for EOFB at the end of file
-                if peek(24) == self.EOFB:
-                    if res := len(outBits) % 8:
-                        outBits += '0'*(8-res)
-                    result = b''.join(int(outBits[i:i+8],2).to_bytes(1,'big') for i in range(0,len(outBits),8))
-                    return result
-                else:
-                    raise ValueError(f'cannot read this: {peek(24):0{24}b}')
-
-            if a0 > columns:
+
+                if peek(24) != self.EOFB:
+
+                    nBytes = (Columns + 7 ) // 8
+                    if a0 < nBytes * 8:
+                        outBits += '0' * (nBytes * 8 - a0)
+                    from PIL import Image, ImageChops
+                    pil = Image.frombytes('1',(Columns, line+1), toBytes(outBits))
+                    ImageChops.invert(pil).save('dump.tif')
+                    raise ValueError(f'cannot read at line = {line}, a0 = {a0}: {peek(24)}; see dump.tif')
+
+                if res := len(outBits) % 8:
+                    outBits += '0'*(8-res)
+                return toBytes(outBits)
+
+            if a0 > Columns:
                 raise ValueError(f'extra bits in row')
             
-            if a0 == columns:
+            if a0 == Columns:
                 a0 = 0
-                color = self.WHITE
+                color = WHITE
                 b = a
                 a = []
                 line += 1
-                if byteAlign:
+                if EncodedByteAlign:
                     if res := inPos % 8:
                         inPos += 8-res
                 if res := len(outBits) % 8:
@@ -172,8 +178,9 @@ def decode(self, data:bytes, columns:int, byteAlign:bool = False):
     def get_run_length(self, inBits:str, inPos:int, color:int):
         '''
         '''
-        MAKEUP = self.MAKEUP_WHITE_DECODE if color == self.WHITE else self.MAKEUP_BLACK_DECODE
-        TERMINAL = self.TERMINALS_WHITE_DECODE if color == self.WHITE else self.TERMINALS_BLACK_DECODE
+        WHITE, BLACK = 0, 1
+        MAKEUP = self.MAKEUP_WHITE_DECODE if color == WHITE else self.MAKEUP_BLACK_DECODE
+        TERMINAL = self.TERMINALS_WHITE_DECODE if color == WHITE else self.TERMINALS_BLACK_DECODE
 
         peek = lambda i: inBits[pos:pos+i]
 
@@ -200,5 +207,5 @@ def get_run_length(self, inBits:str, inPos:int, color:int):
 
             if l is None:
                 if bits == 0:
-                    raise ValueError(f'failed to get run length')
+                    raise ValueError(f'failed to get {"white" if color == WHITE else "black"} run length: {peek(24)}')
                 return bits, pos
diff --git a/pdffilter.py b/pdffilter.py
index 6f0d1e7..598f92c 100755
--- a/pdffilter.py
+++ b/pdffilter.py
@@ -301,12 +301,12 @@ def rle_decode(string:bytes):
             if runLength == 128: break
             elif 0 <= runLength < 128:
                 j = (i + 1) + (runLength + 1)
-                s += string[i+1, j]
+                s += string[i+1:j]
                 i = j
             else:
-                s += string[i+1] * (257 - runLength)
+                s += string[i+1].to_bytes(1,'big') * (257 - runLength)
                 i += 2
-        string = s
+        return s
 
     # -------------------------------------------------------------------- rle_encode()
 
diff --git a/pdffont.py b/pdffont.py
index 04f1948..6fddbdf 100755
--- a/pdffont.py
+++ b/pdffont.py
@@ -400,6 +400,7 @@ def read_ttf_otf_info(self):
         except:
             warn(f'missing or corrupt \'name\' table in font')
 
+        fontName = info.get('FontName')
 
         info['numGlyphs'] = ttFont['maxp'].numGlyphs
 
@@ -425,7 +426,7 @@ def read_ttf_otf_info(self):
                 info['StemV'] = 50 + int(weight*weight + 0.5)
         except:
             os2 = None
-            warn(f'failed to get OS/2 table from font: {info.get("FontName")}')
+            warn(f'failed to get OS/2 table from font: {fontName}')
 
         # Stylistic parameters
         if post := ttFont.get('post'):
@@ -440,14 +441,17 @@ def read_ttf_otf_info(self):
         # info['DefaultWidth'] = int(round(ttf.metrics.defaultWidth, 0))
 
         # Set maps
-        try: glyphSet = ttFont.getGlyphSet()
+        try:
+            glyphSet = ttFont.getGlyphSet()
         except:
             glyphSet = None
-            warn(f'failed to get glyphSet from font: {info.get("FontName")}')
+            warn(f'failed to get glyphSet from font: {fontName}')
 
-        if glyphSet:
-            info['gid2gname'] = {chr(ttFont.getGlyphID(gname)):gname for gname in glyphSet}
-            info['gname2width'] = {gname:glyphSet[gname].width * z for gname in glyphSet}
+        if glyphSet is not None:
+            try: info['gid2gname'] = {chr(ttFont.getGlyphID(gname)):gname for gname in glyphSet}
+            except: warn(f'failed to get gid2gname from font: {fontName}')
+            try: info['gname2width'] = {gname:glyphSet[gname].width * z for gname in glyphSet}
+            except: warn(f'failed to get from gname2width font: {fontName}')
         
         if cmap := ttFont.get('cmap'):
             info['isSymbolic'] = any(table.platformID == 3 and table.platEncID == 0 for table in cmap.tables)
diff --git a/pdfimage.py b/pdfimage.py
index e585aad..d600793 100755
--- a/pdfimage.py
+++ b/pdfimage.py
@@ -209,10 +209,11 @@ def reduce(cs:CS_TYPE, array:np.ndarray, Decode:list[float] = None, bpc:int = 8,
         
         * decodes it using the specified Decode array if it's not None or otherwise
         the default decode array, which is determined based on the specified colorspace and bpc;
+        see PDF Ref. 1.7 sec. 4.8.4, Table 4.40
         * un-multiplies alpha if mask.Matte is not None;
         * if the colorspace is one of `/Separation`, `/DeviceN`, `/NChannel`, reduces the colorspace
         by remapping the image array to the corresponding alternate colorspace;
-        * encodes the image array using the default Decode array based on the ending colorspace
+        * encodes the image array using the default Decode array based on the target colorspace
         (the original or the alternate one, depending on whether the colorspace has been reduced
         in the previous step) and the value of bpc == 8.
         
@@ -282,18 +283,17 @@ def reduce(cs:CS_TYPE, array:np.ndarray, Decode:list[float] = None, bpc:int = 8,
 
         return cs, array
 
-
-    @staticmethod
-    def apply_default_page_colorspace_icc_profile(page:PdfDict, cs:CS_TYPE):
-        '''
-        If image.inf['icc_profile'] == None, apply the ICC profile from the page's default color space
-        (an entry in page.Resources.Colorspace, if present) to the image (in-place).
-        '''
-        if page == None or image.info.get('icc_profile'): return
-        try:
-            default_cs = page.Resources.ColorSpace[cs]
-        except:
-            pass
+    # @staticmethod
+    # def apply_default_page_colorspace_icc_profile(page:PdfDict, cs:CS_TYPE):
+    #     '''
+    #     If image.inf['icc_profile'] == None, apply the ICC profile from the page's default color space
+    #     (an entry in page.Resources.Colorspace, if present) to the image (in-place).
+    #     '''
+    #     if page == None or image.info.get('icc_profile'): return
+    #     try:
+    #         default_cs = page.Resources.ColorSpace[cs]
+    #     except:
+    #         pass
 
     # @staticmethod
     # def make_indexed_colorspace(palette:bytes, baseColorspace):
@@ -475,20 +475,27 @@ def toBytes(s:str): return s.encode('Latin-1')
                 assert parm
 
                 K = int(parm.K or '0')
-                encodedByteAlign = parm.EncodedByteAlign == 'true'
+                EncodedByteAlign = parm.EncodedByteAlign == PdfObject('true')
 
-                if K == -1 and encodedByteAlign:
+                if K == -1 and EncodedByteAlign:
 
                     from pdfrwx.ccitt import Group4Decoder
                     decoder = Group4Decoder()
-                    columns = int(parm.Columns)
-                    result = decoder.decode(stream, columns, encodedByteAlign)
+                    Columns = int(parm.Columns)
+
+                    result = decoder.decode(data = stream,
+                                            Columns = Columns,
+                                            EncodedByteAlign = EncodedByteAlign)
+
                     width, height = int(obj.Width), int(obj.Height)
-                    self.set_pil(ImageChops.invert(Image.frombytes('1',(width,height),result)))
+                    pil = Image.frombytes('1',(width,height),result)
+                    if parm.BlackIs1 != PdfObject('true'):
+                        pil = ImageChops.invert(pil)
+                    self.set_pil(pil)
 
                 else:
 
-                    if encodedByteAlign:
+                    if EncodedByteAlign:
                         warn(f'*** /CCITTFaxDecode Group3 (T4) decompression with /EncodedByteAlign is in beta-testing, check results ***')
 
                     header = PdfImage._tiff_make_header(obj)
@@ -1666,11 +1673,15 @@ def intToPrefix(i:int):
             pprint(obj)
             image = PdfImage(obj = obj)
             array = image.get_array()
+
+            # For JBIG2 compression to work effectively, the image has to be mostly white.
+            # If it is not, invert the image and adjust the /Decode array accordingly.
             if np.mean(array) < 0.5:
                 image.set_array(np.logical_not(array))
                 decode = [float(x) for x in obj.Decode] if obj.Decode != None else None
-                obj.Decode = None if decode == [1,0] else PdfArray(['1','0'])
+                obj.Decode = None if decode == [1,0] else PdfArray([1,0])
                 image.Decode = obj.Decode
+
             tif_stream, _ = image.saveAs('TIFF')
             tif_path = T(f'in-{n:04d}.tif')
             open(tif_path, 'wb').write(tif_stream)
@@ -1895,7 +1906,7 @@ def getPageRange(s:str):
             sys.exit()
 
         # Iterate over pages
-        # cache = set()
+        cache = set()
 
         PROCESSING_REQUESTED = False
 
@@ -1907,8 +1918,8 @@ def getPageRange(s:str):
 
             page = pdf.pages[pageNo-1]
 
-            # objects = PdfObjects(page, cache=cache)
-            objects = PdfObjects(page)
+            objects = PdfObjects(page, cache=cache)
+            # objects = PdfObjects(page)
 
             images = {name+f'_{id(obj)}':obj for name, obj in objects 
                         if isinstance(obj, PdfDict) and obj.Subtype == PdfName.Image