Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Swap pycryptodome for the faster, smaller, and industry-standard cryptography package #456

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Changed
- Hiding fallback xref by default from dumppdf.py output ([#431](https://github.com/pdfminer/pdfminer.six/pull/431))
- Raise a warning instead of an error when extracting text from a non-extractable PDF ([#350](https://github.com/pdfminer/pdfminer.six/issues/350))
- Switched from pycryptodome to the cryptography package for AES decryption ([#456](https://github.com/pdfminer/pdfminer.six/pull/456))

## [20200517]

Expand Down
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,4 +56,3 @@ $(CMAPDST)/to-unicode-Adobe-Korea1.pickle.gz: $(CMAPDST)

test: cmap
nosetests
cd samples && $(MAKE) test
3 changes: 0 additions & 3 deletions pdfminer/arcfour.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,3 @@ def process(self, data):
return r

encrypt = decrypt = process


new = Arcfour
83 changes: 46 additions & 37 deletions pdfminer/pdfdocument.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
import hashlib as md5
import logging
import re
import struct
from hashlib import sha256, md5

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

try:
from Crypto.Cipher import ARC4, AES
from Crypto.Hash import SHA256
except ImportError:
AES = SHA256 = None
from . import arcfour as ARC4
from .psparser import PSEOF, literal_name, LIT, KWD
from . import settings
from .arcfour import Arcfour
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .pdftypes import PDFException, uint_value, PDFTypeError, PDFStream, \
PDFObjectNotFound, decipher_all, int_value, str_value, list_value, \
dict_value, stream_value
from .pdfparser import PDFSyntaxError, PDFStreamParser
from .psparser import PSEOF, literal_name, LIT, KWD
from .utils import choplist, nunpack, decode_text

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -325,22 +323,22 @@ def is_extractable(self):
def compute_u(self, key):
if self.r == 2:
# Algorithm 3.4
return ARC4.new(key).encrypt(self.PASSWORD_PADDING) # 2
return Arcfour(key).encrypt(self.PASSWORD_PADDING) # 2
else:
# Algorithm 3.5
hash = md5.md5(self.PASSWORD_PADDING) # 2
hash = md5(self.PASSWORD_PADDING) # 2
hash.update(self.docid[0]) # 3
result = ARC4.new(key).encrypt(hash.digest()) # 4
result = Arcfour(key).encrypt(hash.digest()) # 4
for i in range(1, 20): # 5
k = b''.join(bytes((c ^ i,)) for c in iter(key))
result = ARC4.new(k).encrypt(result)
result = Arcfour(k).encrypt(result)
result += result # 6
return result

def compute_encryption_key(self, password):
# Algorithm 3.2
password = (password + self.PASSWORD_PADDING)[:32] # 1
hash = md5.md5(password) # 2
hash = md5(password) # 2
hash.update(self.o) # 3
# See https://github.com/pdfminer/pdfminer.six/issues/186
hash.update(struct.pack('<L', self.p)) # 4
Expand All @@ -353,7 +351,7 @@ def compute_encryption_key(self, password):
if self.r >= 3:
n = self.length // 8
for _ in range(50):
result = md5.md5(result[:n]).digest()
result = md5(result[:n]).digest()
return result[:n]

def authenticate(self, password):
Expand All @@ -380,21 +378,21 @@ def verify_encryption_key(self, key):
def authenticate_owner_password(self, password):
# Algorithm 3.7
password = (password + self.PASSWORD_PADDING)[:32]
hash = md5.md5(password)
hash = md5(password)
if self.r >= 3:
for _ in range(50):
hash = md5.md5(hash.digest())
hash = md5(hash.digest())
n = 5
if self.r >= 3:
n = self.length // 8
key = hash.digest()[:n]
if self.r == 2:
user_password = ARC4.new(key).decrypt(self.o)
user_password = Arcfour(key).decrypt(self.o)
else:
user_password = self.o
for i in range(19, -1, -1):
k = b''.join(bytes((c ^ i,)) for c in iter(key))
user_password = ARC4.new(k).decrypt(user_password)
user_password = Arcfour(k).decrypt(user_password)
return self.authenticate_user_password(user_password)

def decrypt(self, objid, genno, data, attrs=None):
Expand All @@ -403,9 +401,9 @@ def decrypt(self, objid, genno, data, attrs=None):
def decrypt_rc4(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2]
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return ARC4.new(key).decrypt(data)
return Arcfour(key).decrypt(data)


class PDFStandardSecurityHandlerV4(PDFStandardSecurityHandler):
Expand Down Expand Up @@ -459,9 +457,14 @@ def decrypt_identity(self, objid, genno, data):
def decrypt_aes128(self, objid, genno, data):
key = self.key + struct.pack('<L', objid)[:3] \
+ struct.pack('<L', genno)[:2] + b'sAlT'
hash = md5.md5(key)
hash = md5(key)
key = hash.digest()[:min(len(key), 16)]
return AES.new(key, mode=AES.MODE_CBC, IV=data[:16]).decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)


class PDFStandardSecurityHandlerV5(PDFStandardSecurityHandlerV4):
Expand Down Expand Up @@ -489,27 +492,35 @@ def get_cfm(self, name):

def authenticate(self, password):
password = password.encode('utf-8')[:127]
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_validation_salt)
hash.update(self.u)
if hash.digest() == self.o_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.o_key_salt)
hash.update(self.u)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.oe)
hash = SHA256.new(password)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.oe)
hash = sha256(password)
hash.update(self.u_validation_salt)
if hash.digest() == self.u_hash:
hash = SHA256.new(password)
hash = sha256(password)
hash.update(self.u_key_salt)
return AES.new(hash.digest(), mode=AES.MODE_CBC, IV=b'\x00' * 16)\
.decrypt(self.ue)
cipher = Cipher(algorithms.AES(hash.digest()),
modes.CBC(b'\0' * 16),
backend=default_backend())
return cipher.decryptor().update(self.ue)
return None

def decrypt_aes256(self, objid, genno, data):
return AES.new(self.key, mode=AES.MODE_CBC, IV=data[:16])\
.decrypt(data[16:])
initialization_vector = data[:16]
ciphertext = data[16:]
cipher = Cipher(algorithms.AES(self.key),
modes.CBC(initialization_vector),
backend=default_backend())
return cipher.decryptor().update(ciphertext)


class PDFDocument:
Expand All @@ -528,11 +539,9 @@ class PDFDocument:
security_handler_registry = {
1: PDFStandardSecurityHandler,
2: PDFStandardSecurityHandler,
4: PDFStandardSecurityHandlerV4,
5: PDFStandardSecurityHandlerV5,
}
if AES is not None:
security_handler_registry[4] = PDFStandardSecurityHandlerV4
if SHA256 is not None:
security_handler_registry[5] = PDFStandardSecurityHandlerV5

def __init__(self, parser, password='', caching=True, fallback=True):
"Set the document to use a given PDFParser object."
Expand Down
80 changes: 0 additions & 80 deletions samples/Makefile

This file was deleted.

32 changes: 0 additions & 32 deletions samples/encryption/Makefile

This file was deleted.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
package_data={'pdfminer': ['cmap/*.pickle.gz']},
install_requires=[
'chardet ; python_version > "3.0"',
'pycryptodome',
'cryptography',
pietermarsman marked this conversation as resolved.
Show resolved Hide resolved
'sortedcontainers',
],
extras_require={
Expand Down
21 changes: 21 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,27 @@ def test_contrib_excel(self):
"""
run('contrib/issue-00369-excel.pdf', '-t html')

def test_encryption_aes128(self):
run('encryption/aes-128.pdf', '-P foo')

def test_encryption_aes128m(self):
run('encryption/aes-128-m.pdf', '-P foo')

def test_encryption_aes256(self):
run('encryption/aes-256.pdf', '-P foo')

def test_encryption_aes256m(self):
run('encryption/aes-256-m.pdf', '-P foo')

def test_encryption_base(self):
run('encryption/base.pdf', '-P foo')

def test_encryption_rc4_40(self):
run('encryption/rc4-40.pdf', '-P foo')

def test_encryption_rc4_128(self):
run('encryption/rc4-128.pdf', '-P foo')


class TestDumpImages:

Expand Down