pdfminer · pietermarsman · Oct 22, 2019 · Apr 20, 2015 · Apr 22, 2015 · Apr 23, 2015
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
-Nothing yet
+### Added
+- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))
 
 ## [20191020] - 2019-10-20
 
@@ -27,7 +28,7 @@ Nothing yet
 - Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))
 
 ### Changed
-- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
+- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
 
 ## [20181108] - 2018-11-08
 

diff --git a/README.md b/README.md
@@ -23,9 +23,10 @@ Features
 
  * Written entirely in Python.
  * Parse, analyze, and convert PDF documents.
- * PDF-1.7 specification support. (well, almost)
+ * PDF-1.7 specification support. (well, almost).
  * CJK languages and vertical writing scripts support.
  * Various font types (Type1, TrueType, Type3, and CID) support.
+ * Support for extracting images (JPG, JBIG2 and Bitmaps).
  * Basic encryption (RC4) support.
  * Outline (TOC) extraction.
  * Tagged contents extraction.

diff --git a/pdfminer/image.py b/pdfminer/image.py
@@ -1,12 +1,14 @@
 
-import struct
 import os
 import os.path
+import struct
 from io import BytesIO
-from .pdftypes import LITERALS_DCT_DECODE
+
+from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
+from .pdfcolor import LITERAL_DEVICE_CMYK
 from .pdfcolor import LITERAL_DEVICE_GRAY
 from .pdfcolor import LITERAL_DEVICE_RGB
-from .pdfcolor import LITERAL_DEVICE_CMYK
+from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE
 
 
 def align32(x):
@@ -57,9 +59,11 @@ def write_line(self, y, data):
         return
 
 
-##  ImageWriter
-##
 class ImageWriter(object):
+    """Write image to a file
+
+    Supports various image types: JPEG, JBIG2 and bitmaps
+    """
 
     def __init__(self, outdir):
         self.outdir = outdir
@@ -68,21 +72,15 @@ def __init__(self, outdir):
         return
 
     def export_image(self, image):
-        stream = image.stream
-        filters = stream.get_filters()
         (width, height) = image.srcsize
-        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
-            ext = '.jpg'
-        elif (image.bits == 1 or
-              image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
-            ext = '.%dx%d.bmp' % (width, height)
-        else:
-            ext = '.%d.%dx%d.img' % (image.bits, width, height)
-        name = image.name+ext
-        path = os.path.join(self.outdir, name)
-        fp=open(path, 'wb')
+
+        is_jbig2 = self.is_jbig2_image(image)
+        ext = self._get_image_extension(image, width, height, is_jbig2)
+        name, path = self._create_unique_image_name(self.outdir, image.name, ext)
+
+        fp = open(path, 'wb')
         if ext == '.jpg':
-            raw_data = stream.get_rawdata()
+            raw_data = image.stream.get_rawdata()
             if LITERAL_DEVICE_CMYK in image.colorspace:
                 from PIL import Image
                 from PIL import ImageChops
@@ -93,30 +91,74 @@ def export_image(self, image):
                 i.save(fp, 'JPEG')
             else:
                 fp.write(raw_data)
+        elif is_jbig2:
+            input_stream = BytesIO()
+            input_stream.write(image.stream.get_data())
+            input_stream.seek(0)
+            reader = JBIG2StreamReader(input_stream)
+            segments = reader.get_segments()
+
+            writer = JBIG2StreamWriter(fp)
+            writer.write_file(segments)
         elif image.bits == 1:
             bmp = BMPWriter(fp, 1, width, height)
-            data = stream.get_data()
+            data = image.stream.get_data()
             i = 0
             width = (width+7)//8
             for y in range(height):
                 bmp.write_line(y, data[i:i+width])
                 i += width
         elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
             bmp = BMPWriter(fp, 24, width, height)
-            data = stream.get_data()
+            data = image.stream.get_data()
             i = 0
             width = width*3
             for y in range(height):
                 bmp.write_line(y, data[i:i+width])
                 i += width
         elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
             bmp = BMPWriter(fp, 8, width, height)
-            data = stream.get_data()
+            data = image.stream.get_data()
             i = 0
             for y in range(height):
                 bmp.write_line(y, data[i:i+width])
                 i += width
         else:
-            fp.write(stream.get_data())
+            fp.write(image.stream.get_data())
         fp.close()
         return name
+
+    @staticmethod
+    def is_jbig2_image(image):
+        filters = image.stream.get_filters()
+        is_jbig2 = False
+        for filter_name, params in filters:
+            if filter_name in LITERALS_JBIG2_DECODE:
+                is_jbig2 = True
+                break
+        return is_jbig2
+
+    @staticmethod
+    def _get_image_extension(image, width, height, is_jbig2):
+        filters = image.stream.get_filters()
+        if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
+            ext = '.jpg'
+        elif is_jbig2:
+            ext = '.jb2'
+        elif (image.bits == 1 or
+              image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
+            ext = '.%dx%d.bmp' % (width, height)
+        else:
+            ext = '.%d.%dx%d.img' % (image.bits, width, height)
+        return ext
+
+    @staticmethod
+    def _create_unique_image_name(dirname, image_name, ext):
+        name = image_name + ext
+        path = os.path.join(dirname, name)
+        img_index = 0
+        while os.path.exists(path):
+            name = '%s.%d%s' % (image_name, img_index, ext)
+            path = os.path.join(dirname, name)
+            img_index += 1
+        return name, path