Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
 into peterhorsley-stream-api
  • Loading branch information
konstantint committed Dec 23, 2018
2 parents b4d26dc + d6f7a7b commit 1439cd7
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 36 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@
/.cache
/_docs
/venv*
/_tmp
/_tmp
Thumbs.db
8 changes: 5 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,24 @@ has not been tested extensively.
In order to use the recognition function in Python code, simply do::

>> from passporteye import read_mrz
>> mrz = read_mrz(image_filename)
>> mrz = read_mrz(image_fileuri)

Here ``image_fileuri`` can be either a path to a file on disk or a byte stream containing image data.

The returned object (unless it is None, which means no ROI was detected) contains the fields extracted from the MRZ along
with some metainformation. For the description of the available fields, see the docstring for the `passporteye.mrz.text.MRZ` class.
Note that you can convert the object to a dictionary using the ``to_dict()`` method.

If you want to have the ROI reported alongside the MRZ, call the ``read_mrz`` function as follows::

>> mrz = read_mrz(image_filename, save_roi=True)
>> mrz = read_mrz(image_fileuri, save_roi=True)

The ROI can then be accessed as ``mrz.aux['roi']`` -- it is a numpy ndarray, representing the (grayscale) image region where the OCR was applied.

For more flexibility, you may instead use a ``MRZPipeline`` object, which will provide you access to all intermediate computations as follows::

>> from passporteye.mrz.image import MRZPipeline
>> p = MRZPipeline(filename)
>> p = MRZPipeline(fileuri)
>> mrz = p.result

The "pipeline" object stores the intermediate computations in its ``data`` dictionary. Although you need to understand the underlying algorithm
Expand Down
60 changes: 30 additions & 30 deletions passporteye/mrz/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
Author: Konstantin Tretyakov
License: MIT
'''
from skimage import transform, io, morphology, filters, measure
from skimage import transform, morphology, filters, measure
from skimage import io as skimage_io # So as not to clash with builtin io
import io
import numpy as np
import tempfile, os
from ..util.pdf import extract_first_jpeg_in_pdf
Expand All @@ -16,44 +18,42 @@


class Loader(object):
"""Loads `filename` to `img`."""
"""Loads `fileuri` to `img`."""

__depends__ = []
__provides__ = ['img']

def __init__(self, filename, as_gray=True, pdf_aware=True):
self.filename = filename
def __init__(self, fileuri, as_gray=True, pdf_aware=True):
self.fileuri = fileuri
self.as_gray = as_gray
self.pdf_aware = pdf_aware

def _imread(self, filename):
def _imread(self, fileuri):
"""Proxy to skimage.io.imread with some fixes."""
img = io.imread(filename, as_gray=self.as_gray)
# For now, we have to select the imageio plugin to read image from byte stream
# When ski-image v0.15 is released, imageio will be the default plugin, so this
# code can be simplified at that time. See issue report and pull request:
# https://github.com/scikit-image/scikit-image/issues/2889
# https://github.com/scikit-image/scikit-image/pull/3126
img = skimage_io.imread(fileuri, as_gray=self.as_gray, plugin='imageio')
if img is not None and len(img.shape) != 2:
# The PIL plugin somewhy fails to load some images
img = io.imread(filename, as_gray=self.as_gray, plugin='matplotlib')
img = skimage_io.imread(fileuri, as_gray=self.as_gray, plugin='matplotlib')
return img

def __call__(self):
if self.pdf_aware and self.filename.lower().endswith('.pdf'):
with open(self.filename, 'rb') as f:
img_data = extract_first_jpeg_in_pdf(f)
if img_data is None:
return None
else:
fd, fname = tempfile.mkstemp(prefix='pythoneye_', suffix='.jpg')
try:
with open(fname, 'wb') as f:
f.write(img_data)
return self._imread(fname)
except:
if isinstance(self.fileuri, str):
if self.pdf_aware and self.fileuri.lower().endswith('.pdf'):
with open(self.fileuri, 'rb') as f:
img_data = extract_first_jpeg_in_pdf(f)
if img_data is None:
return None
finally:
os.close(fd)
os.remove(fname)
else:
return self._imread(self.filename)

return self._imread(img_data)
else:
return self._imread(self.fileuri)
elif isinstance(self.fileuri, (bytes, io.IOBase)):
return self._imread(self.fileuri)
return None

class Scaler(object):
"""Scales `image` down to `img_scaled` so that its width is at most 250."""
Expand Down Expand Up @@ -308,11 +308,11 @@ def __call__(self, mrz, __pipeline__):
class MRZPipeline(Pipeline):
"""This is the "currently best-performing" pipeline for parsing MRZ from a given image file."""

def __init__(self, filename, extra_cmdline_params=''):
def __init__(self, fileuri, extra_cmdline_params=''):
super(MRZPipeline, self).__init__()
self.version = '1.0' # In principle we might have different pipelines in use, so possible backward compatibility is an issue
self.filename = filename
self.add_component('loader', Loader(filename))
self.fileuri = fileuri
self.add_component('loader', Loader(fileuri))
self.add_component('scaler', Scaler())
self.add_component('boone', BooneTransform())
self.add_component('box_locator', MRZBoxLocator())
Expand All @@ -324,14 +324,14 @@ def result(self):
return self['mrz_final']


def read_mrz(filename, save_roi=False, extra_cmdline_params=''):
def read_mrz(fileuri, save_roi=False, extra_cmdline_params=''):
"""The main interface function to this module, encapsulating the recognition pipeline.
Given an image filename, runs MRZPipeline on it, returning the parsed MRZ object.
:param save_roi: when this is True, the .aux['roi'] field will contain the Region of Interest where the MRZ was parsed from.
:param extra_cmdline_params:extra parameters to the ocr.py
"""
p = MRZPipeline(filename, extra_cmdline_params)
p = MRZPipeline(fileuri, extra_cmdline_params)
mrz = p.result

if mrz is not None:
Expand Down
2 changes: 1 addition & 1 deletion passporteye/util/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def extract_first_jpeg_in_pdf(fstream):
for PDFMiner.
:param fstream: Readable binary stream of the PDF
:return: String, containing the whole contents of the JPEG image or None if extraction failed.
:return: binary stream, containing the whole contents of the JPEG image or None if extraction failed.
"""
parser = PDFParser(fstream)
if PY2:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def run_tests(self):
packages=find_packages(exclude=['examples', 'tests']),
include_package_data=True,
zip_safe=False,
install_requires=['numpy', 'scipy', 'scikit-image >= 0.14.1', 'scikit-learn', 'matplotlib', 'pytesseract >= 0.2.0',
install_requires=['numpy', 'scipy', 'scikit-image >= 0.14.1', 'imageio', 'scikit-learn', 'matplotlib', 'pytesseract >= 0.2.0',
'pdfminer' if sys.version_info.major == 2 else 'pdfminer3k'],
extras_require={
"test": ["pytest"],
Expand Down
Binary file added tests/data/passport-td2.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/passport-td2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/passport-td3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/passport-td3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
164 changes: 164 additions & 0 deletions tests/mrz_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
'''
Test module for use with py.test.
Write each test as a function named test_<something>.
Read more here: http://pytest.org/
Author: Peter Horsley
License: MIT
'''
from passporteye import read_mrz
import io

def test_read_mrz_td3_jpg_file():
    """Read the TD3 JPEG sample by its on-disk path and verify the parsed fields."""
    sample_path = './tests/data/passport-td3.jpg'
    assert_td3_jpg(read_mrz(sample_path))

def test_read_mrz_td3_jpg_stream():
    """Read the TD3 JPEG sample from an open binary stream and verify the parsed fields."""
    # io.open(..., buffering=0) in binary mode gives a raw stream object from
    # the io module (rather than a path string), which is the input kind this
    # test is meant to exercise. The context manager guarantees the file is
    # closed even if read_mrz raises.
    with io.open('./tests/data/passport-td3.jpg', 'rb', buffering=0) as byte_stream:
        mrz = read_mrz(byte_stream)
    assert_td3_jpg(mrz)

def test_read_mrz_td3_png_file():
    """Read the TD3 PNG sample by its on-disk path and verify the parsed fields."""
    sample_path = './tests/data/passport-td3.png'
    assert_td3_png(read_mrz(sample_path))

def test_read_mrz_td3_png_stream():
    """Read the TD3 PNG sample from an open binary stream and verify the parsed fields."""
    # io.open(..., buffering=0) in binary mode gives a raw stream object from
    # the io module (rather than a path string), which is the input kind this
    # test is meant to exercise. The context manager guarantees the file is
    # closed even if read_mrz raises.
    with io.open('./tests/data/passport-td3.png', 'rb', buffering=0) as byte_stream:
        mrz = read_mrz(byte_stream)
    assert_td3_png(mrz)

def test_read_mrz_td2_jpg_file():
    """Read the TD2 JPEG sample by its on-disk path and verify the parsed fields."""
    sample_path = './tests/data/passport-td2.jpg'
    assert_td2_jpg(read_mrz(sample_path))

def test_read_mrz_td2_jpg_stream():
    """Read the TD2 JPEG sample from an open binary stream and verify the parsed fields."""
    # io.open(..., buffering=0) in binary mode gives a raw stream object from
    # the io module (rather than a path string), which is the input kind this
    # test is meant to exercise. The context manager guarantees the file is
    # closed even if read_mrz raises.
    with io.open('./tests/data/passport-td2.jpg', 'rb', buffering=0) as byte_stream:
        mrz = read_mrz(byte_stream)
    assert_td2_jpg(mrz)

def test_read_mrz_td2_png_file():
    """Read the TD2 PNG sample by its on-disk path and verify the parsed fields."""
    sample_path = './tests/data/passport-td2.png'
    assert_td2_png(read_mrz(sample_path))

def test_read_mrz_td2_png_stream():
    """Read the TD2 PNG sample from an open binary stream and verify the parsed fields."""
    # io.open(..., buffering=0) in binary mode gives a raw stream object from
    # the io module (rather than a path string), which is the input kind this
    # test is meant to exercise. The context manager guarantees the file is
    # closed even if read_mrz raises.
    with io.open('./tests/data/passport-td2.png', 'rb', buffering=0) as byte_stream:
        mrz = read_mrz(byte_stream)
    assert_td2_png(mrz)

def assert_td3_jpg(mrz):
    """Assert that `mrz` matches the values expected for ``passport-td3.jpg``.

    The expected values are a snapshot for the JPEG sample: the personal
    number is expected as '2E184226B<<<<<' with failing composite and
    personal-number checks and a valid_score of 62 (presumably an OCR misread
    of the leading character -- confirm against the sample image).

    :param mrz: the object returned by ``read_mrz``; must not be None.
    :raises AssertionError: if `mrz` is None or any field differs.
    """
    # Identity check: `!= None` can be fooled by a custom __ne__.
    assert mrz is not None, 'mrz is None'
    expected = (
        ('mrz_type', 'TD3'),
        ('valid_score', 62),
        ('type', 'P<'),
        ('country', 'UTO'),
        ('number', 'L898902C3'),
        ('date_of_birth', '740812'),
        ('expiration_date', '120415'),
        ('nationality', 'UTO'),
        ('sex', 'F'),
        ('names', 'ANNA MARIA'),
        ('surname', 'ERIKSSON'),
        ('personal_number', '2E184226B<<<<<'),
        ('check_number', '6'),
        ('check_date_of_birth', '2'),
        ('check_expiration_date', '9'),
        ('check_composite', '0'),
        ('check_personal_number', '1'),
        ('valid_number', True),
        ('valid_date_of_birth', True),
        ('valid_expiration_date', True),
        ('valid_composite', False),
        ('valid_personal_number', False),
    )
    # Fields are checked in the listed order; the message names the first one
    # that differs so a failure is readable without a debugger.
    for field, want in expected:
        got = getattr(mrz, field)
        assert got == want, '%s: expected %r, got %r' % (field, want, got)

def assert_td3_png(mrz):
    """Assert that `mrz` matches the values expected for ``passport-td3.png``.

    The expected values are a snapshot for the PNG sample: every check digit
    validates and valid_score is 100.

    :param mrz: the object returned by ``read_mrz``; must not be None.
    :raises AssertionError: if `mrz` is None or any field differs.
    """
    # Identity check: `!= None` can be fooled by a custom __ne__.
    assert mrz is not None, 'mrz is None'
    expected = (
        ('mrz_type', 'TD3'),
        ('valid_score', 100),
        ('type', 'P<'),
        ('country', 'UTO'),
        ('number', 'L898902C3'),
        ('date_of_birth', '740812'),
        ('expiration_date', '120415'),
        ('nationality', 'UTO'),
        ('sex', 'F'),
        ('names', 'ANNA MARIA'),
        ('surname', 'ERIKSSON'),
        ('personal_number', 'ZE184226B<<<<<'),
        ('check_number', '6'),
        ('check_date_of_birth', '2'),
        ('check_expiration_date', '9'),
        ('check_composite', '0'),
        ('check_personal_number', '1'),
        ('valid_number', True),
        ('valid_date_of_birth', True),
        ('valid_expiration_date', True),
        ('valid_composite', True),
        ('valid_personal_number', True),
    )
    # Fields are checked in the listed order; the message names the first one
    # that differs so a failure is readable without a debugger.
    for field, want in expected:
        got = getattr(mrz, field)
        assert got == want, '%s: expected %r, got %r' % (field, want, got)

def assert_td2_jpg(mrz):
    """Assert that `mrz` matches the values expected for ``passport-td2.jpg``.

    The expected values are a snapshot for the JPEG sample: every check digit
    validates and valid_score is 100.

    :param mrz: the object returned by ``read_mrz``; must not be None.
    :raises AssertionError: if `mrz` is None or any field differs.
    """
    # Fail with a clear assertion (not an AttributeError) when nothing was
    # parsed, matching the guard in the TD3 helpers.
    assert mrz is not None, 'mrz is None'
    expected = (
        ('mrz_type', 'TD2'),
        ('valid_score', 100),
        ('type', 'I<'),
        ('country', 'UTO'),
        ('number', 'D23145890'),
        ('date_of_birth', '740812'),
        ('expiration_date', '120415'),
        ('nationality', 'UTO'),
        ('sex', 'F'),
        ('names', 'ANNA MARIA'),
        ('surname', 'ERIKSSON'),
        ('optional1', '<<<<<<<'),
        ('check_number', '7'),
        ('check_date_of_birth', '2'),
        ('check_expiration_date', '9'),
        ('check_composite', '6'),
        ('valid_number', True),
        ('valid_date_of_birth', True),
        ('valid_expiration_date', True),
        ('valid_composite', True),
    )
    # Fields are checked in the listed order; the message names the first one
    # that differs so a failure is readable without a debugger.
    for field, want in expected:
        got = getattr(mrz, field)
        assert got == want, '%s: expected %r, got %r' % (field, want, got)

def assert_td2_png(mrz):
    """Assert that `mrz` matches the values expected for ``passport-td2.png``.

    The expected values are a snapshot for the PNG sample: every check digit
    validates and valid_score is 100.

    :param mrz: the object returned by ``read_mrz``; must not be None.
    :raises AssertionError: if `mrz` is None or any field differs.
    """
    # Fail with a clear assertion (not an AttributeError) when nothing was
    # parsed, matching the guard in the TD3 helpers.
    assert mrz is not None, 'mrz is None'
    expected = (
        ('mrz_type', 'TD2'),
        ('valid_score', 100),
        ('type', 'I<'),
        ('country', 'UTO'),
        ('number', 'D23145890'),
        ('date_of_birth', '740812'),
        ('expiration_date', '120415'),
        ('nationality', 'UTO'),
        ('sex', 'F'),
        ('names', 'ANNA MARIA'),
        ('surname', 'ERIKSSON'),
        ('optional1', '<<<<<<<'),
        ('check_number', '7'),
        ('check_date_of_birth', '2'),
        ('check_expiration_date', '9'),
        ('check_composite', '6'),
        ('valid_number', True),
        ('valid_date_of_birth', True),
        ('valid_expiration_date', True),
        ('valid_composite', True),
    )
    # Fields are checked in the listed order; the message names the first one
    # that differs so a failure is readable without a debugger.
    for field, want in expected:
        got = getattr(mrz, field)
        assert got == want, '%s: expected %r, got %r' % (field, want, got)

0 comments on commit 1439cd7

Please sign in to comment.