Critical: Fixes ZipFileHeader Struct and Zip64 Header creation - Implements switch to enable creation of large sized ZipSegments and ZIP_STORED containers #36

Open · wants to merge 13 commits into master

16 changes: 14 additions & 2 deletions pyaff4/aff4_image.py
@@ -15,6 +15,7 @@
"""This module implements the standard AFF4 Image."""
from __future__ import division
from __future__ import unicode_literals

from builtins import range
from builtins import str
from past.utils import old_div
@@ -23,6 +24,7 @@
import logging
import lz4.block
import struct
import urllib

from expiringdict import ExpiringDict

@@ -491,8 +493,18 @@ def _parse_bevy_index(self, bevy):
return result

def reloadBevy(self, bevy_id):
bevy_urn = self.urn.Append("%08d" % bevy_id)
bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn)
if "AXIOMProcess" in self.version.tool:
# Axiom does strange things with paths and URNs; we need to fix the URN before reading bevies
volume_urn = '/'.join(self.urn.SerializeToString().split('/')[0:3])
original_filename = self.resolver.Get(volume_urn, self.urn, rdfvalue.URN(lexicon.standard11.pathName))[0]
original_filename_escaped = urllib.parse.quote(str(original_filename).encode(), safe='/\\')
corrected_urn = f"{volume_urn}/{original_filename_escaped}\\{'%08d' % bevy_id}".encode()
print(corrected_urn)
bevy_urn = rdfvalue.URN().UnSerializeFromString(corrected_urn)
# bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn) # This is unused anyway apparently
else:
bevy_urn = self.urn.Append("%08d" % bevy_id)
bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn)
if LOGGER.isEnabledFor(logging.INFO):
LOGGER.info("Reload Bevy %s", bevy_urn)
chunks = []
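To illustrate the path escaping this AXIOM workaround relies on, here is a small stand-alone sketch (the Windows-style path and bevy id are made up for illustration; only the `urllib.parse.quote(..., safe='/\\')` call mirrors the diff):

```python
import urllib.parse

# Hypothetical pathName that AXIOM stored for the source evidence file.
original_filename = "C:\\Users\\examiner\\image.E01"

# Percent-encode the path the same way the patched reloadBevy does:
# '/' and '\' stay verbatim, everything else (e.g. ':') gets escaped.
escaped = urllib.parse.quote(original_filename.encode(), safe='/\\')
print(escaped)                                   # C%3A\Users\examiner\image.E01

# The corrected bevy URN is "<volume_urn>/<escaped path>\<8-digit bevy id>".
bevy_id = 0
print("%s\\%s" % (escaped, "%08d" % bevy_id))    # C%3A\Users\examiner\image.E01\00000000
```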
29 changes: 20 additions & 9 deletions pyaff4/container.py
@@ -141,7 +141,7 @@ def open(filename):
return Container.openURN(rdfvalue.URN.FromFileName(filename))

@staticmethod
def createURN(resolver, container_urn, encryption=False):
def createURN(resolver, container_urn, encryption=False, zip_based=False, compression_method=zip.ZIP_DEFLATE):
"""Public method to create a new writable locical AFF4 container."""

resolver.Set(lexicon.transient_graph, container_urn, lexicon.AFF4_STREAM_WRITE_MODE, rdfvalue.XSDString("truncate"))
@@ -151,7 +151,11 @@ def createURN(resolver, container_urn, encryption=False):
with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file:
volume_urn = zip_file.urn
with resolver.AFF4FactoryOpen(zip_file.backing_store_urn) as backing_store:
return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard)
if not zip_based:
return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard)
else:
return WritableLogicalImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard, compression_method=compression_method)

else:
version = Version(1, 2, "pyaff4")
with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file:
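A minimal usage sketch for the new `zip_based`/`compression_method` switch, assuming a `MemoryDataStore` resolver as in the existing pyaff4 examples (the output path is illustrative):

```python
from pyaff4 import container, data_store, lexicon, rdfvalue

# Illustrative output path; any writable location works.
container_urn = rdfvalue.URN.FromFileName("/tmp/evidence.aff4")

with data_store.MemoryDataStore() as resolver:
    # zip_based=True selects WritableLogicalImageContainer instead of the
    # hash-based container; AFF4_IMAGE_COMPRESSION_STORED is the value
    # writeZipStream() checks for to emit ZIP_STORED segments.
    with container.Container.createURN(
            resolver, container_urn, zip_based=True,
            compression_method=lexicon.AFF4_IMAGE_COMPRESSION_STORED) as volume:
        pass  # add logical images here, e.g. via volume.writeLogicalStream()
```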
@@ -326,9 +330,13 @@ class WritableLogicalImageContainer(Container):
maxSegmentResidentSize = 1 * 1024 * 1024
#maxSegmentResidentSize = 1

def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex):
compression_method = None

def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex, compression_method=zip.ZIP_DEFLATE):
super(WritableLogicalImageContainer, self).__init__(backing_store, zip_file, version, volumeURN, resolver, lex)

self.compression_method = compression_method

with self.resolver.AFF4FactoryOpen(self.urn) as volume:
container_description_urn = self.urn.Append("container.description")
volume.version = self.version
@@ -354,11 +362,14 @@ def writeCompressedBlockStream(self, image_urn, filename, readstream):
stream.WriteStream(readstream)

# write the logical stream as a zip segment using the Stream API
def writeZipStream(self, image_urn, filename, readstream):
def writeZipStream(self, image_urn, filename, readstream, progress=None):
with self.resolver.AFF4FactoryOpen(self.urn) as volume:
with volume.CreateMember(image_urn) as streamed:
streamed.compression_method = zip.ZIP_DEFLATE
streamed.WriteStream(readstream)
if self.compression_method is not None and self.compression_method == lexicon.AFF4_IMAGE_COMPRESSION_STORED:
streamed.compression_method = zip.ZIP_STORED
else:
streamed.compression_method = zip.ZIP_DEFLATE
streamed.WriteStream(readstream, progress=progress)

# create a file like object for writing a logical image as a new compressed block stream
def newCompressedBlockStream(self, image_urn, filename):
@@ -393,19 +404,19 @@ def newLogicalStream(self, filename, length):
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.standard11.pathName), rdfvalue.XSDString(filename))
return writer

def writeLogicalStream(self, filename, readstream, length):
def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegments=False, progress=None):
image_urn = None
if self.isAFF4Collision(filename):
image_urn = rdfvalue.URN("aff4://%s" % uuid.uuid4())
else:
image_urn = self.urn.Append(escaping.arnPathFragment_from_path(filename), quote=False)

if length > self.maxSegmentResidentSize:
if length > self.maxSegmentResidentSize and not allow_large_zipsegments:
self.writeCompressedBlockStream(image_urn, filename, readstream)
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE),
rdfvalue.URN(lexicon.AFF4_IMAGE_TYPE))
else:
self.writeZipStream(image_urn, filename, readstream)
self.writeZipStream(image_urn, filename, readstream, progress=progress)
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.AFF4_ZIP_SEGMENT_IMAGE_TYPE))

self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.standard11.FileImage))
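A sketch of how the new `allow_large_zipsegments` flag might be used (the file name is illustrative; `volume` is the WritableLogicalImageContainer created in the sketch above):

```python
import os

src_path = "/tmp/big_file.bin"  # illustrative input larger than 1 MiB
with open(src_path, "rb") as src:
    # With allow_large_zipsegments=True the data is written as a plain zip
    # segment even though it exceeds maxSegmentResidentSize (1 MiB), instead
    # of being converted to a compressed block stream.
    volume.writeLogicalStream(os.path.basename(src_path), src,
                              length=os.path.getsize(src_path),
                              allow_large_zipsegments=True)
```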
5 changes: 5 additions & 0 deletions pyaff4/data_store.py
@@ -535,6 +535,11 @@ def _DumpToTurtle(self, volumeurn, verbose=False):

return result

def loadZipURN(self, zip):
with zip.OpenZipSegment("container.description") as fd:
urn = streams.ReadAll(fd).strip(b'\n')
return urn

def loadMetadata(self, zip):
# Load the turtle metadata.
#if zip.urn not in self.loadedVolumes:
12 changes: 9 additions & 3 deletions pyaff4/linear_hasher.py
@@ -22,6 +22,7 @@
from pyaff4 import hashes
from pyaff4 import lexicon
from pyaff4 import zip
from pyaff4 import aff4


class LinearHasher(object):
@@ -144,13 +145,13 @@ def __init__(self, resolver, listener=None):
self.delegate = None
self.resolver = resolver

def hash(self, image):
def hash(self, image, progress=None):

storedHashes = list(self.resolver.QuerySubjectPredicate(image.container.urn, image.urn, lexicon.standard.hash))
with self.resolver.AFF4FactoryOpen(image.urn, version=image.container.version) as stream:
datatypes = [h.datatype for h in storedHashes]
stream2 = StreamHasher(stream, datatypes)
self.readall2(stream2)
self.readall2(stream2, progress=progress)
for storedHash in storedHashes:
dt = storedHash.datatype
shortHashAlgoName = storedHash.shortName()
@@ -162,10 +163,15 @@ def hash(self, image):
self.listener.onInvalidHash(shortHashAlgoName, storedHashHexDigest, calculatedHashHexDigest, image.urn)


def readall2(self, stream):
def readall2(self, stream, progress=None):
total_read = 0
if progress is None:
progress = aff4.EMPTY_PROGRESS
while True:
toRead = 32 * 1024
data = stream.read(toRead)
total_read += len(data)
progress.Report(total_read)
if data == None or len(data) == 0:
# EOF
return
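A minimal sketch of a progress object for the new `progress` parameter; only the `Report(total_read)` shape that `readall2` calls is assumed here:

```python
class ConsoleProgress(object):
    """Progress callback matching the Report(total_read) calls in readall2()."""
    def Report(self, total_read):
        print("hashed %d bytes" % total_read)

# Illustrative call on a hasher instance and logical image:
# hasher.hash(image, progress=ConsoleProgress())
```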
50 changes: 33 additions & 17 deletions pyaff4/zip.py
@@ -25,6 +25,7 @@
import zlib
import struct
import traceback
import os

from pyaff4 import aff4
from pyaff4 import aff4_file
@@ -132,8 +133,8 @@ class ZipFileHeader(struct_parser.CreateStruct(
uint16_t lastmodtime;
uint16_t lastmoddate;
uint32_t crc32;
int32_t compress_size;
int32_t file_size;
uint32_t compress_size;
uint32_t file_size;
uint16_t file_name_length;
uint16_t extra_field_len = 0;
""")):
@@ -169,7 +170,9 @@ def empty(self):

def Pack(self):
# Size of extra less the header.
#self.Set("data_size", self.sizeof() - 4)
# data_size needs to be set for a zip64 extra field to be compliant with the ZIP specification.
self.Set("data_size", self.sizeof() - 4)
# The value set below does not appear to be used anywhere and might be removable.
self.data_size = self.sizeof()
return struct.pack(self.format_string(),
*[v for t, _, v in self.fields if v is not None])
@@ -310,25 +313,28 @@ def WriteFileHeader(self, backing_store):
if USE_UNICODE:
header.flags = header.flags | (1 << 11)

# For the local header, force usage of ZIP64 even when it is not needed, because we do not know
# the file size, nor what it will compress to, before writing the header the first time
# (similar to how command-line zip works when compressing from stdin).
# Always calculate and reserve the ZIP64 header size.
# Otherwise, since the file size is not passed on first header creation, a file larger than 4GB
# would trigger creation of the ZIP64 header only after the file has been written, and the file's
# first bytes would be overwritten, creating a corrupted container.
extra_header_64 = Zip64FileHeaderExtensibleField()
if self.file_size > ZIP32_MAX_SIZE:
header.file_size = 0xFFFFFFFF
extra_header_64.Set("file_size", self.file_size)

if self.compress_size > ZIP32_MAX_SIZE:
header.compress_size = 0xFFFFFFFF
extra_header_64.Set("compress_size", self.compress_size)
header.file_size = 0xFFFFFFFF
extra_header_64.Set("file_size", self.file_size)

# Only write the extra header if we have to.
if not extra_header_64.empty():
header.extra_field_len = extra_header_64.sizeof()
header.compress_size = 0xFFFFFFFF
extra_header_64.Set("compress_size", self.compress_size)

header.extra_field_len = extra_header_64.sizeof()

backing_store.SeekWrite(self.file_header_offset)
backing_store.Write(header.Pack())
backing_store.write(encodedFilename)

if not extra_header_64.empty():
backing_store.Write(extra_header_64.Pack())
backing_store.Write(extra_header_64.Pack())

def WriteCDFileHeader(self, backing_store):
encodedFilename = self.filename
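For reference, a stand-alone sketch of the ZIP64 layout this rewrite targets: the 32-bit size fields in the local header carry the 0xFFFFFFFF sentinel, and the real 64-bit sizes live in the ZIP64 extended information extra field (header ID 0x0001), which is now always reserved so it can be rewritten in place once the final sizes are known. The sizes below are illustrative and the sketch does not use the pyaff4 structures themselves:

```python
import struct

def zip64_extra_field(file_size, compress_size):
    # ZIP64 extended information extra field: header ID 0x0001, data size,
    # then the 64-bit original and compressed sizes (APPNOTE section 4.5.3).
    return struct.pack("<HHQQ", 0x0001, 16, file_size, compress_size)

sentinel = 0xFFFFFFFF            # value written into the 32-bit header fields
extra = zip64_extra_field(file_size=6 * 1024**3, compress_size=5 * 1024**3)
print(len(extra))                # 20 bytes for this minimal two-field variant
```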
@@ -569,7 +575,8 @@ def __init__(self, *args, **kwargs):
except:
self.version = Version(0,0, "pyaff4")

def parse_cd(self, backing_store_urn):
def parse_cd(self, backing_store_urn, urn: str = None):
# The urn can be passed as a parameter; this allows correct opening of images that lack the URN in the CD comment
with self.resolver.AFF4FactoryOpen(backing_store_urn) as backing_store:
# Find the End of Central Directory Record - We read about 4k of
# data and scan for the header from the end, just in case there is
@@ -612,6 +619,8 @@ def parse_cd(self, backing_store_urn):
# URN and then create a new ZipFile volume. After parsing the
# central directory we discover our URN and therefore we can delete
# the old, randomly selected URN.
if not urn_string and urn:
urn_string = urn
if urn_string and self.urn != urn_string and self.version != basic_zip :
self.resolver.DeleteSubject(self.urn)
self.urn.Set(utils.SmartUnicode(urn_string))
@@ -860,8 +869,6 @@ def OpenMember(self, segment_urn):

return self.resolver.CachePut(result)



def LoadFromURN(self):
self.backing_store_urn = self.resolver.GetUnique(lexicon.transient_graph,
self.urn, lexicon.AFF4_STORED)
@@ -874,7 +881,16 @@ def LoadFromURN(self):
raise IOError("Unable to load backing urn.")

try:
# Possibly inefficient, but easiest to implement:
# create a copy of the transient store, parse the zip and read container.description to discover the URN,
# then re-read the ZIP with the urn parameter so the transient store holds objects with correct URNs.
# This is necessary for containers missing the URN in the CD comment.
## Backup Transient Store
transient_store = copy.deepcopy(self.resolver.transient_store)
self.parse_cd(self.backing_store_urn)
# Restore Transient Store
self.resolver.transient_store = transient_store
self.parse_cd(self.backing_store_urn, urn=self.resolver.loadZipURN(self))
self.resolver.loadMetadata(self)
except IOError:
# If we can not parse a CD from the zip file, this is fine, we just
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,11 +1,12 @@
future == 0.17.1
aff4-snappy == 0.5.1
rdflib[sparql] == 4.2.2
intervaltree == 2.1.0
pyyaml == 5.1
intervaltree
pyyaml
tzlocal == 2.1
html5lib == 1.0.1
python-dateutil == 2.8.0
pybindgen
fastchunking == 0.0.3
hexdump
pynacl