Critical: Fixes ZipFileHeader Struct and Zip64 Header creation - Implements switch to enable creation of large sized ZipSegments and ZIP_STORED containers #36

Open
wants to merge 13 commits into base: master
16 changes: 14 additions & 2 deletions pyaff4/aff4_image.py
@@ -15,6 +15,7 @@
"""This module implements the standard AFF4 Image."""
from __future__ import division
from __future__ import unicode_literals

from builtins import range
from builtins import str
from past.utils import old_div
@@ -23,6 +24,7 @@
import logging
import lz4.block
import struct
import urllib

from expiringdict import ExpiringDict

@@ -491,8 +493,18 @@ def _parse_bevy_index(self, bevy):
return result

def reloadBevy(self, bevy_id):
bevy_urn = self.urn.Append("%08d" % bevy_id)
bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn)
if "AXIOMProcess" in self.version.tool:
# Axiom handles paths and URNs in a non-standard way; the URN must be corrected before reading bevies
volume_urn = '/'.join(self.urn.SerializeToString().split('/')[0:3])
original_filename = self.resolver.Get(volume_urn, self.urn, rdfvalue.URN(lexicon.standard11.pathName))[0]
original_filename_escaped = urllib.parse.quote(str(original_filename).encode(), safe='/\\')
corrected_urn = f"{volume_urn}/{original_filename_escaped}\\{'%08d' % bevy_id}".encode()
LOGGER.debug("Corrected bevy URN: %s", corrected_urn)
bevy_urn = rdfvalue.URN().UnSerializeFromString(corrected_urn)
# bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn)  # apparently unused here
else:
bevy_urn = self.urn.Append("%08d" % bevy_id)
bevy_index_urn = rdfvalue.URN("%s.index" % bevy_urn)
if LOGGER.isEnabledFor(logging.INFO):
LOGGER.info("Reload Bevy %s", bevy_urn)
chunks = []
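For illustration only, a minimal sketch of how the corrected bevy URN is composed for an AXIOM image. The volume URN, pathName and bevy id below are invented example values; only the escaping and string layout mirror the change above.

from urllib.parse import quote

volume_urn = "aff4://c2d5f4a0-1111-4222-8333-444455556666"   # assumed example value
original_filename = "C:\\evidence\\disk.dd"                  # assumed standard11.pathName value
bevy_id = 0

# Percent-escape the stored path while keeping '/' and '\' literal, then append the bevy id.
escaped = quote(original_filename.encode(), safe='/\\')
corrected = f"{volume_urn}/{escaped}\\{'%08d' % bevy_id}"
print(corrected)  # aff4://c2d5f4a0-.../C%3A\evidence\disk.dd\00000000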
29 changes: 20 additions & 9 deletions pyaff4/container.py
@@ -141,7 +141,7 @@ def open(filename):
return Container.openURN(rdfvalue.URN.FromFileName(filename))

@staticmethod
def createURN(resolver, container_urn, encryption=False):
def createURN(resolver, container_urn, encryption=False, zip_based=False, compression_method=zip.ZIP_DEFLATE):
"""Public method to create a new writable locical AFF4 container."""

resolver.Set(lexicon.transient_graph, container_urn, lexicon.AFF4_STREAM_WRITE_MODE, rdfvalue.XSDString("truncate"))
@@ -151,7 +151,11 @@ def createURN(resolver, container_urn, encryption=False):
with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file:
volume_urn = zip_file.urn
with resolver.AFF4FactoryOpen(zip_file.backing_store_urn) as backing_store:
return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard)
if not zip_based:
return WritableHashBasedImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard)
else:
return WritableLogicalImageContainer(backing_store, zip_file, version, volume_urn, resolver, lexicon.standard, compression_method=compression_method)

else:
version = Version(1, 2, "pyaff4")
with zip.ZipFile.NewZipFile(resolver, version, container_urn) as zip_file:
@@ -326,9 +330,13 @@ class WritableLogicalImageContainer(Container):
maxSegmentResidentSize = 1 * 1024 * 1024
#maxSegmentResidentSize = 1

def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex):
compression_method = None

def __init__(self, backing_store, zip_file, version, volumeURN, resolver, lex, compression_method=zip.ZIP_DEFLATE):
super(WritableLogicalImageContainer, self).__init__(backing_store, zip_file, version, volumeURN, resolver, lex)

self.compression_method = compression_method

with self.resolver.AFF4FactoryOpen(self.urn) as volume:
container_description_urn = self.urn.Append("container.description")
volume.version = self.version
@@ -354,11 +362,14 @@ def writeCompressedBlockStream(self, image_urn, filename, readstream):
stream.WriteStream(readstream)

# write the logical stream as a zip segment using the Stream API
def writeZipStream(self, image_urn, filename, readstream):
def writeZipStream(self, image_urn, filename, readstream, progress=None):
with self.resolver.AFF4FactoryOpen(self.urn) as volume:
with volume.CreateMember(image_urn) as streamed:
streamed.compression_method = zip.ZIP_DEFLATE
streamed.WriteStream(readstream)
if self.compression_method is not None and self.compression_method == lexicon.AFF4_IMAGE_COMPRESSION_STORED:
streamed.compression_method = zip.ZIP_STORED
else:
streamed.compression_method = zip.ZIP_DEFLATE
streamed.WriteStream(readstream, progress=progress)

# create a file like object for writing a logical image as a new compressed block stream
def newCompressedBlockStream(self, image_urn, filename):
@@ -393,19 +404,19 @@ def newLogicalStream(self, filename, length):
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.standard11.pathName), rdfvalue.XSDString(filename))
return writer

def writeLogicalStream(self, filename, readstream, length):
def writeLogicalStream(self, filename, readstream, length, allow_large_zipsegments=False, progress=None):
image_urn = None
if self.isAFF4Collision(filename):
image_urn = rdfvalue.URN("aff4://%s" % uuid.uuid4())
else:
image_urn = self.urn.Append(escaping.arnPathFragment_from_path(filename), quote=False)

if length > self.maxSegmentResidentSize:
if length > self.maxSegmentResidentSize and not allow_large_zipsegments:
self.writeCompressedBlockStream(image_urn, filename, readstream)
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE),
rdfvalue.URN(lexicon.AFF4_IMAGE_TYPE))
else:
self.writeZipStream(image_urn, filename, readstream)
self.writeZipStream(image_urn, filename, readstream, progress=progress)
self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.AFF4_ZIP_SEGMENT_IMAGE_TYPE))

self.resolver.Add(self.urn, image_urn, rdfvalue.URN(lexicon.AFF4_TYPE), rdfvalue.URN(lexicon.standard11.FileImage))
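To show how the new createURN and writeLogicalStream parameters fit together, here is a minimal usage sketch. The paths are invented, and the surrounding setup (MemoryDataStore, the with-blocks) follows the usual pyaff4 pattern rather than anything added by this PR.

import os
from pyaff4 import container, data_store, lexicon, rdfvalue

container_path = "/tmp/stored.aff4"   # assumed output container
source_path = "/tmp/bigfile.bin"      # assumed input file

with data_store.MemoryDataStore() as resolver:
    container_urn = rdfvalue.URN.FromFileName(container_path)
    # zip_based=True selects WritableLogicalImageContainer; ZIP_STORED disables compression.
    with container.Container.createURN(resolver, container_urn, zip_based=True,
                                       compression_method=lexicon.AFF4_IMAGE_COMPRESSION_STORED) as volume:
        with open(source_path, "rb") as src:
            length = os.path.getsize(source_path)
            # allow_large_zipsegments=True keeps the file as a single zip segment
            # even when it exceeds maxSegmentResidentSize.
            volume.writeLogicalStream(source_path, src, length, allow_large_zipsegments=True)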
5 changes: 5 additions & 0 deletions pyaff4/data_store.py
@@ -535,6 +535,11 @@ def _DumpToTurtle(self, volumeurn, verbose=False):

return result

def loadZipURN(self, zip):
with zip.OpenZipSegment("container.description") as fd:
urn = streams.ReadAll(fd).strip(b'\n')
return urn

def loadMetadata(self, zip):
# Load the turtle metadata.
#if zip.urn not in self.loadedVolumes:
12 changes: 9 additions & 3 deletions pyaff4/linear_hasher.py
@@ -22,6 +22,7 @@
from pyaff4 import hashes
from pyaff4 import lexicon
from pyaff4 import zip
from pyaff4 import aff4


class LinearHasher(object):
@@ -144,13 +145,13 @@ def __init__(self, resolver, listener=None):
self.delegate = None
self.resolver = resolver

def hash(self, image):
def hash(self, image, progress=None):

storedHashes = list(self.resolver.QuerySubjectPredicate(image.container.urn, image.urn, lexicon.standard.hash))
with self.resolver.AFF4FactoryOpen(image.urn, version=image.container.version) as stream:
datatypes = [h.datatype for h in storedHashes]
stream2 = StreamHasher(stream, datatypes)
self.readall2(stream2)
self.readall2(stream2, progress=progress)
for storedHash in storedHashes:
dt = storedHash.datatype
shortHashAlgoName = storedHash.shortName()
@@ -162,10 +163,15 @@ def readall2(self, stream):
self.listener.onInvalidHash(shortHashAlgoName, storedHashHexDigest, calculatedHashHexDigest, image.urn)


def readall2(self, stream):
def readall2(self, stream, progress=None):
total_read = 0
if progress is None:
progress = aff4.EMPTY_PROGRESS
while True:
toRead = 32 * 1024
data = stream.read(toRead)
total_read += len(data)
progress.Report(total_read)
if data == None or len(data) == 0:
# EOF
return
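The only interface the new progress argument relies on is a Report() call with the running byte count, so any object providing that method will work. A minimal sketch; the hasher and image objects are assumed to already exist as in normal verification code.

class PrintProgress(object):
    # Minimal progress sink; Report() is the only method the patched code calls.
    def Report(self, total_read):
        print("hashed %d bytes so far" % total_read)

# Hypothetical call site, assuming `hasher` is the hashing object patched above
# and `image` is a logical image obtained from an opened container:
# hasher.hash(image, progress=PrintProgress())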
50 changes: 33 additions & 17 deletions pyaff4/zip.py
@@ -25,6 +25,7 @@
import zlib
import struct
import traceback
import os

from pyaff4 import aff4
from pyaff4 import aff4_file
@@ -132,8 +133,8 @@ class ZipFileHeader(struct_parser.CreateStruct(
uint16_t lastmodtime;
uint16_t lastmoddate;
uint32_t crc32;
int32_t compress_size;
int32_t file_size;
uint32_t compress_size;
uint32_t file_size;
uint16_t file_name_length;
uint16_t extra_field_len = 0;
""")):
@@ -169,7 +170,9 @@ def empty(self):

def Pack(self):
# Size of extra less the header.
#self.Set("data_size", self.sizeof() - 4)
# data_size must be set for the zip64 extra field to comply with the ZIP specification.
self.Set("data_size", self.sizeof() - 4)
# The value set below does not appear to be used anywhere and may be removable.
self.data_size = self.sizeof()
return struct.pack(self.format_string(),
*[v for t, _, v in self.fields if v is not None])
@@ -310,25 +313,28 @@ def WriteFileHeader(self, backing_store):
if USE_UNICODE:
header.flags = header.flags | (1 << 11)

# For the local header, force ZIP64 even when it is not needed, because we do not know the file size,
# nor what it will compress to, before writing the header the first time
# (similar to how the zip command line tool behaves when compressing from stdin).
# Always calculate and reserve the zip64 extra field size.
# Otherwise, since the file size is not known when the header is first created, a file larger than 4 GB
# would only trigger creation of the zip64 field after the data has been written, and the rewritten,
# now larger, header would overwrite the first bytes of the data, corrupting the container.
extra_header_64 = Zip64FileHeaderExtensibleField()
if self.file_size > ZIP32_MAX_SIZE:
header.file_size = 0xFFFFFFFF
extra_header_64.Set("file_size", self.file_size)

if self.compress_size > ZIP32_MAX_SIZE:
header.compress_size = 0xFFFFFFFF
extra_header_64.Set("compress_size", self.compress_size)
header.file_size = 0xFFFFFFFF
extra_header_64.Set("file_size", self.file_size)

# Only write the extra header if we have to.
if not extra_header_64.empty():
header.extra_field_len = extra_header_64.sizeof()
header.compress_size = 0xFFFFFFFF
extra_header_64.Set("compress_size", self.compress_size)

header.extra_field_len = extra_header_64.sizeof()

backing_store.SeekWrite(self.file_header_offset)
backing_store.Write(header.Pack())
backing_store.write(encodedFilename)

if not extra_header_64.empty():
backing_store.Write(extra_header_64.Pack())
backing_store.Write(extra_header_64.Pack())

def WriteCDFileHeader(self, backing_store):
encodedFilename = self.filename
@@ -569,7 +575,8 @@ def __init__(self, *args, **kwargs):
except:
self.version = Version(0,0, "pyaff4")

def parse_cd(self, backing_store_urn):
def parse_cd(self, backing_store_urn, urn: str = None):
# The urn can be passed as a parameter; this allows correct opening of images that do not have the URN in the CD comment.
with self.resolver.AFF4FactoryOpen(backing_store_urn) as backing_store:
# Find the End of Central Directory Record - We read about 4k of
# data and scan for the header from the end, just in case there is
@@ -612,6 +619,8 @@ def parse_cd(self, backing_store_urn):
# URN and then create a new ZipFile volume. After parsing the
# central directory we discover our URN and therefore we can delete
# the old, randomly selected URN.
if not urn_string and urn:
urn_string = urn
if urn_string and self.urn != urn_string and self.version != basic_zip :
self.resolver.DeleteSubject(self.urn)
self.urn.Set(utils.SmartUnicode(urn_string))
@@ -860,8 +869,6 @@ def OpenMember(self, segment_urn):

return self.resolver.CachePut(result)



def LoadFromURN(self):
self.backing_store_urn = self.resolver.GetUnique(lexicon.transient_graph,
self.urn, lexicon.AFF4_STORED)
@@ -874,7 +881,16 @@ def LoadFromURN(self):
raise IOError("Unable to load backing urn.")

try:
# Possibly inefficient, but the easiest method to implement:
# create a copy of the transient store, parse the zip and read container.description to discover the urn,
# then re-read the ZIP with that urn as a parameter so the transient store holds objects with correct URNs.
# This is necessary for containers missing the URN in the CD comment.
## Back up the transient store
transient_store = copy.deepcopy(self.resolver.transient_store)
self.parse_cd(self.backing_store_urn)
# Restore Transient Store
self.resolver.transient_store = transient_store
self.parse_cd(self.backing_store_urn, urn=self.resolver.loadZipURN(self))
self.resolver.loadMetadata(self)
except IOError:
# If we can not parse a CD from the zip file, this is fine, we just
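For context, a standalone sketch of the on-disk layout this change reserves. In a local file header, the zip64 extended-information extra field is a 4-byte header (id 0x0001 plus data_size) followed by the 8-byte uncompressed and compressed sizes, 20 bytes in total, so the space can be reserved before the real sizes are known. The sizes below are invented.

import struct

file_size = 5 * 1024 ** 3        # assumed example value, larger than 4 GiB
compress_size = 5 * 1024 ** 3

zip64_extra = struct.pack(
    "<HHQQ",
    0x0001,          # zip64 extended information extra field id
    16,              # data_size: length of the two 8-byte fields that follow
    file_size,       # original (uncompressed) size
    compress_size)   # compressed size
assert len(zip64_extra) == 20    # the size reserved via header.extra_field_len

# Both 32-bit size fields in the local header are then written as 0xFFFFFFFF to signal
# that the real values live in the zip64 extra field, which is also why compress_size
# and file_size in ZipFileHeader must be unsigned (uint32_t), as fixed above.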
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,11 +1,12 @@
future == 0.17.1
aff4-snappy == 0.5.1
rdflib[sparql] == 4.2.2
intervaltree == 2.1.0
pyyaml == 5.1
intervaltree
pyyaml
tzlocal == 2.1
html5lib == 1.0.1
python-dateutil == 2.8.0
pybindgen
fastchunking == 0.0.3
hexdump
pynacl