Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pr jbig2 #311

Merged
merged 24 commits into from
Oct 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5f09ad2
saving JBIG2 streams as .jb2 images
Apr 20, 2015
2f33f9a
JBIG2 stream decoding basic routines
Apr 22, 2015
0f70748
JBIG2 stream writer basic routines(incomplete)
Apr 23, 2015
dd45b37
finished with basic JBIG2 format encoding routines
Apr 23, 2015
aa27c2a
JBIG2 file writing routines
Apr 23, 2015
47f6940
exporting jb2 images from ImageWriter
Apr 23, 2015
2c996e8
prevent ImageWriter from overwriting images with duplicate names
Apr 23, 2015
5fc5616
added read_file() method to JBIG2StreamReader; fixed a bug in segment…
Apr 24, 2015
0a055d4
fixed bug in JBIG2StreamReader.encode_retention_flags() causing fiel…
Apr 24, 2015
c32eb89
fix rebase issue
Jul 14, 2019
03a777c
replace cStringIO with BytesIO
Jul 14, 2019
e2a2946
Added test for pdf with jbig2 image
pietermarsman Oct 22, 2019
e0de980
Added test pdf with jbig2 image
pietermarsman Oct 22, 2019
11404c4
Use explicit binary strings in JBIG2StreamReader/Writer
pietermarsman Oct 22, 2019
29e032e
Merge branch 'develop' into pr-jbig2
pietermarsman Oct 22, 2019
24449bf
Fix unresolved name ceil
pietermarsman Oct 22, 2019
6088111
Merge branch 'pr-jbig2' of github.com:pietermarsman/pdfminer.six into…
pietermarsman Oct 22, 2019
1c8196f
Reformat jbig2.py
pietermarsman Oct 22, 2019
841060d
Added image extraction to feature list in README.md
pietermarsman Oct 22, 2019
2469159
Remove code that is never used
pietermarsman Oct 22, 2019
e303bba
Add minimal docstring documentation to jbig2 classes
pietermarsman Oct 22, 2019
912903f
Reformat ImageWriter
pietermarsman Oct 22, 2019
6307d74
Merge branch 'pr-jbig2' of github.com:pietermarsman/pdfminer.six into…
pietermarsman Oct 22, 2019
80ec153
Added line to changelog.md
pietermarsman Oct 22, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

Nothing yet
### Added
- Support for extracting JBIG2 encoded images ([#311](https://github.com/pdfminer/pdfminer.six/pull/311) and [#46](https://github.com/pdfminer/pdfminer.six/pull/46))

## [20191020] - 2019-10-20

Expand All @@ -27,7 +28,7 @@ Nothing yet
- Allow for bounding boxes with zero height or width by removing assertion ([#246](https://github.com/pdfminer/pdfminer.six/pull/246))

### Changed
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306), [#219](https://github.com/pdfminer/pdfminer.six/pull/219))
- All dependencies are managed in `setup.py` ([#306](https://github.com/pdfminer/pdfminer.six/pull/306) and [#219](https://github.com/pdfminer/pdfminer.six/pull/219))

## [20181108] - 2018-11-08

Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@ Features

* Written entirely in Python.
* Parse, analyze, and convert PDF documents.
* PDF-1.7 specification support. (well, almost)
* PDF-1.7 specification support. (well, almost).
* CJK languages and vertical writing scripts support.
* Various font types (Type1, TrueType, Type3, and CID) support.
* Support for extracting images (JPG, JBIG2 and Bitmaps).
* Basic encryption (RC4) support.
* Outline (TOC) extraction.
* Tagged contents extraction.
Expand Down
86 changes: 64 additions & 22 deletions pdfminer/image.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@

import struct
import os
import os.path
import struct
from io import BytesIO
from .pdftypes import LITERALS_DCT_DECODE

from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter
from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdfcolor import LITERAL_DEVICE_GRAY
from .pdfcolor import LITERAL_DEVICE_RGB
from .pdfcolor import LITERAL_DEVICE_CMYK
from .pdftypes import LITERALS_DCT_DECODE, LITERALS_JBIG2_DECODE


def align32(x):
Expand Down Expand Up @@ -57,9 +59,11 @@ def write_line(self, y, data):
return


## ImageWriter
##
class ImageWriter(object):
"""Write image to a file

Supports various image types: JPEG, JBIG2 and bitmaps
"""

def __init__(self, outdir):
self.outdir = outdir
Expand All @@ -68,21 +72,15 @@ def __init__(self, outdir):
return

def export_image(self, image):
stream = image.stream
filters = stream.get_filters()
(width, height) = image.srcsize
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg'
elif (image.bits == 1 or
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
ext = '.%dx%d.bmp' % (width, height)
else:
ext = '.%d.%dx%d.img' % (image.bits, width, height)
name = image.name+ext
path = os.path.join(self.outdir, name)
fp=open(path, 'wb')

is_jbig2 = self.is_jbig2_image(image)
ext = self._get_image_extension(image, width, height, is_jbig2)
name, path = self._create_unique_image_name(self.outdir, image.name, ext)

fp = open(path, 'wb')
if ext == '.jpg':
raw_data = stream.get_rawdata()
raw_data = image.stream.get_rawdata()
if LITERAL_DEVICE_CMYK in image.colorspace:
from PIL import Image
from PIL import ImageChops
Expand All @@ -93,30 +91,74 @@ def export_image(self, image):
i.save(fp, 'JPEG')
else:
fp.write(raw_data)
elif is_jbig2:
input_stream = BytesIO()
input_stream.write(image.stream.get_data())
input_stream.seek(0)
reader = JBIG2StreamReader(input_stream)
segments = reader.get_segments()

writer = JBIG2StreamWriter(fp)
writer.write_file(segments)
elif image.bits == 1:
bmp = BMPWriter(fp, 1, width, height)
data = stream.get_data()
data = image.stream.get_data()
i = 0
width = (width+7)//8
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
bmp = BMPWriter(fp, 24, width, height)
data = stream.get_data()
data = image.stream.get_data()
i = 0
width = width*3
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
bmp = BMPWriter(fp, 8, width, height)
data = stream.get_data()
data = image.stream.get_data()
i = 0
for y in range(height):
bmp.write_line(y, data[i:i+width])
i += width
else:
fp.write(stream.get_data())
fp.write(image.stream.get_data())
fp.close()
return name

@staticmethod
def is_jbig2_image(image):
filters = image.stream.get_filters()
is_jbig2 = False
for filter_name, params in filters:
if filter_name in LITERALS_JBIG2_DECODE:
is_jbig2 = True
break
return is_jbig2

@staticmethod
def _get_image_extension(image, width, height, is_jbig2):
filters = image.stream.get_filters()
if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
ext = '.jpg'
elif is_jbig2:
ext = '.jb2'
elif (image.bits == 1 or
image.bits == 8 and (LITERAL_DEVICE_RGB in image.colorspace or LITERAL_DEVICE_GRAY in image.colorspace)):
ext = '.%dx%d.bmp' % (width, height)
else:
ext = '.%d.%dx%d.img' % (image.bits, width, height)
return ext

@staticmethod
def _create_unique_image_name(dirname, image_name, ext):
name = image_name + ext
path = os.path.join(dirname, name)
img_index = 0
while os.path.exists(path):
name = '%s.%d%s' % (image_name, img_index, ext)
path = os.path.join(dirname, name)
img_index += 1
return name, path
Loading