Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce memory usage when updating ZipArchives #102704

Merged
merged 13 commits into from
Jan 24, 2025
Merged
25 changes: 25 additions & 0 deletions src/libraries/Common/tests/System/IO/Compression/ZipTestHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,31 @@ internal static void AddEntry(ZipArchive archive, string name, string contents,
}
}

/// <summary>
/// Builds a zip archive in memory containing <paramref name="entryCount"/> entries and returns its raw bytes.
/// </summary>
/// <param name="entryCount">Number of entries to create. Entry <c>i</c> is named <c>dummydata/{i}.bin</c>.</param>
/// <param name="entryContents">Bytes written at the start of every entry.</param>
/// <returns>A copy of the bytes that make up the finished archive.</returns>
/// <remarks>
/// Each entry's payload is <paramref name="entryContents"/> followed by a single trailing byte equal to
/// <c>i % byte.MaxValue</c>. Every entry's last-write time is derived from <see cref="DateTimeOffset.Now"/>,
/// so the returned bytes are not reproducible from one call to the next.
/// </remarks>
public static byte[] CreateZipFile(int entryCount, byte[] entryContents)
{
    using MemoryStream archiveBuffer = new();

    // The archive is disposed before the buffer is read back; leaveOpen keeps the
    // MemoryStream usable after the archive has gone away.
    using (ZipArchive archive = new(archiveBuffer, ZipArchiveMode.Create, leaveOpen: true))
    {
        int index = 0;
        while (index < entryCount)
        {
            ZipArchiveEntry entry = archive.CreateEntry($"dummydata/{index}.bin");

            // The timestamp is assigned before the entry stream is opened.
            entry.LastWriteTime = DateTimeOffset.Now.AddHours(-1.0);

            using (Stream payload = entry.Open())
            {
                payload.Write(entryContents);
                // Trailing marker byte derived from the entry index.
                payload.WriteByte((byte)(index % byte.MaxValue));
            }

            index++;
        }
    }

    archiveBuffer.Flush();
    return archiveBuffer.ToArray();
}

protected const string Utf8SmileyEmoji = "\ud83d\ude04";
protected const string Utf8LowerCaseOUmlautChar = "\u00F6";
protected const string Utf8CopyrightChar = "\u00A9";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ public class ZipArchive : IDisposable
private byte[] _archiveComment;
private Encoding? _entryNameAndCommentEncoding;

private long _firstDeletedEntryOffset;

#if DEBUG_FORCE_ZIP64
public bool _forceZip64;
#endif
Expand Down Expand Up @@ -168,12 +170,14 @@ public ZipArchive(Stream stream, ZipArchiveMode mode, bool leaveOpen, Encoding?
_entries = new List<ZipArchiveEntry>();
_entriesCollection = new ReadOnlyCollection<ZipArchiveEntry>(_entries);
_entriesDictionary = new Dictionary<string, ZipArchiveEntry>();
Changed = ChangeState.Unchanged;
_readEntries = false;
_leaveOpen = leaveOpen;
_centralDirectoryStart = 0; // invalid until ReadCentralDirectory
_isDisposed = false;
_numberOfThisDisk = 0; // invalid until ReadCentralDirectory
_archiveComment = Array.Empty<byte>();
_firstDeletedEntryOffset = long.MaxValue;

switch (mode)
{
Expand Down Expand Up @@ -221,7 +225,11 @@ public ZipArchive(Stream stream, ZipArchiveMode mode, bool leaveOpen, Encoding?
public string Comment
{
get => (EntryNameAndCommentEncoding ?? Encoding.UTF8).GetString(_archiveComment);
set => _archiveComment = ZipHelper.GetEncodedTruncatedBytesFromString(value, EntryNameAndCommentEncoding, ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength, out _);
set
{
_archiveComment = ZipHelper.GetEncodedTruncatedBytesFromString(value, EntryNameAndCommentEncoding, ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength, out _);
Changed |= ChangeState.DynamicLengthMetadata;
}
}

/// <summary>
Expand Down Expand Up @@ -389,6 +397,10 @@ private set
}
}

// Tracks pending changes to the archive's own top-level fields (such as the archive comment).
// It starts as ChangeState.Unchanged in the constructor, and the Comment setter ORs in
// ChangeState.DynamicLengthMetadata. Adding new entries to the archive does not change this value.
// WriteArchiveEpilogue reads it: a value other than Unchanged causes the end of central directory
// block to be written even when the central directory itself did not change.
internal ChangeState Changed { get; private set; }

private ZipArchiveEntry DoCreateEntry(string entryName, CompressionLevel? compressionLevel)
{
ArgumentException.ThrowIfNullOrEmpty(entryName);
Expand All @@ -415,7 +427,7 @@ internal void AcquireArchiveStream(ZipArchiveEntry entry)
{
if (!_archiveStreamOwner.EverOpenedForWrite)
{
_archiveStreamOwner.WriteAndFinishLocalEntry();
_archiveStreamOwner.WriteAndFinishLocalEntry(true);
}
else
{
Expand Down Expand Up @@ -447,6 +459,11 @@ internal void RemoveEntry(ZipArchiveEntry entry)
_entries.Remove(entry);

_entriesDictionary.Remove(entry.FullName);
// Keep track of the offset of the earliest deleted entry in the archive
if (entry.OriginallyInArchive && entry.OffsetOfLocalHeader < _firstDeletedEntryOffset)
{
_firstDeletedEntryOffset = entry.OffsetOfLocalHeader;
}
}

internal void ThrowIfDisposed()
Expand Down Expand Up @@ -502,6 +519,9 @@ private void ReadCentralDirectory()
numberOfEntries++;
}

// Sort _entries by each archive entry's position
_entries.Sort(ZipArchiveEntry.LocalHeaderOffsetComparer.Instance);

if (numberOfEntries != _expectedNumberOfEntries)
throw new InvalidDataException(SR.NumEntriesWrong);
}
Expand Down Expand Up @@ -633,41 +653,108 @@ private void WriteFile()
// if we are in update mode, we call EnsureCentralDirectoryRead, which sets readEntries to true
Debug.Assert(_readEntries);

// Entries starting at or after this offset have had a dynamically-sized change. Everything at or after this point must be rewritten.
long dynamicDirtyStartingOffset = 0;
List<ZipArchiveEntry> entriesToWrite = _entries;

if (_mode == ZipArchiveMode.Update)
{
List<ZipArchiveEntry> markedForDelete = new List<ZipArchiveEntry>();
// Entries starting after this offset have some kind of change made to them. It might just be a fixed-length field though, in which case
// that single entry's metadata can be rewritten without impacting anything else.
long startingOffset = _firstDeletedEntryOffset;
long nextFileOffset = 0;
dynamicDirtyStartingOffset = startingOffset;

entriesToWrite = new(_entries.Count);
foreach (ZipArchiveEntry entry in _entries)
{
if (!entry.LoadLocalHeaderExtraFieldAndCompressedBytesIfNeeded())
markedForDelete.Add(entry);
if (entry.OriginallyInArchive)
{
if (entry.Changed == ChangeState.Unchanged)
{
// Keep track of the expected position of the file entry after the final untouched file entry so that when the loop completes,
// we'll know which position to start writing new entries from.
nextFileOffset = Math.Max(nextFileOffset, entry.OffsetOfCompressedData + entry.CompressedLength);
}
// When calculating the starting offset to load the files from, only look at dirty entries which are already in the archive.
else
{
startingOffset = Math.Min(startingOffset, entry.OffsetOfLocalHeader);
}

// We want to re-write entries which are after the starting offset of the first entry which has pending data to write.
// NB: the existing ZipArchiveEntries are sorted in _entries by their position ascending.
if (entry.OffsetOfLocalHeader >= startingOffset)
{
// If the pending data to write is fixed-length metadata in the header, there's no need to load the full file for
// inflation and deflation.
if ((entry.Changed & (ChangeState.DynamicLengthMetadata | ChangeState.StoredData)) != 0)
{
dynamicDirtyStartingOffset = Math.Min(dynamicDirtyStartingOffset, entry.OffsetOfLocalHeader);
}
if (entry.OffsetOfLocalHeader >= dynamicDirtyStartingOffset)
{
entry.LoadLocalHeaderExtraFieldAndCompressedBytesIfNeeded();
}

entriesToWrite.Add(entry);
}
}
else
{
entriesToWrite.Add(entry);
}
}

// If the offset of entries to write from is still at long.MaxValue, then we know that nothing has been deleted,
// nothing has been modified - so we just want to move to the end of all remaining files in the archive.
if (startingOffset == long.MaxValue)
{
startingOffset = nextFileOffset;
}
foreach (ZipArchiveEntry entry in markedForDelete)
entry.Delete();

_archiveStream.Seek(0, SeekOrigin.Begin);
_archiveStream.SetLength(0);
_archiveStream.Seek(startingOffset, SeekOrigin.Begin);
}

foreach (ZipArchiveEntry entry in _entries)
foreach (ZipArchiveEntry entry in entriesToWrite)
{
entry.WriteAndFinishLocalEntry();
// We don't always need to write the local header entry, ZipArchiveEntry is usually able to work out when it doesn't need to.
// We want to force this header entry to be written (even for completely untouched entries) if the entry comes after one
// which had a pending dynamically-sized write.
bool forceWriteLocalEntry = !entry.OriginallyInArchive || (entry.OriginallyInArchive && entry.OffsetOfLocalHeader >= dynamicDirtyStartingOffset);

entry.WriteAndFinishLocalEntry(forceWriteLocalEntry);
}

long startOfCentralDirectory = _archiveStream.Position;
// If there are no entries in the archive, we still want to create the archive epilogue.
bool archiveEpilogueRequiresUpdate = _entries.Count == 0;

foreach (ZipArchiveEntry entry in _entries)
{
entry.WriteCentralDirectoryFileHeader();
// The central directory needs to be rewritten if its position has moved, if there's a new entry in the archive, or if the entry might be different.
bool centralDirectoryEntryRequiresUpdate = startOfCentralDirectory != _centralDirectoryStart
| (!entry.OriginallyInArchive || (entry.OriginallyInArchive && entry.OffsetOfLocalHeader >= dynamicDirtyStartingOffset));

entry.WriteCentralDirectoryFileHeader(centralDirectoryEntryRequiresUpdate);
archiveEpilogueRequiresUpdate |= centralDirectoryEntryRequiresUpdate;
}

long sizeOfCentralDirectory = _archiveStream.Position - startOfCentralDirectory;

WriteArchiveEpilogue(startOfCentralDirectory, sizeOfCentralDirectory);
WriteArchiveEpilogue(startOfCentralDirectory, sizeOfCentralDirectory, archiveEpilogueRequiresUpdate);

// If entries have been removed and new (smaller) ones added, there could be empty space at the end of the file.
// Shrink the file to reclaim this space.
if (_mode == ZipArchiveMode.Update && _archiveStream.Position != _archiveStream.Length)
{
_archiveStream.SetLength(_archiveStream.Position);
}
}

// writes eocd, and if needed, zip 64 eocd, zip64 eocd locator
// should only throw an exception in extremely exceptional cases because it is called from dispose
private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentralDirectory)
private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentralDirectory, bool centralDirectoryChanged)
{
// determine if we need Zip 64
if (startOfCentralDirectory >= uint.MaxValue
Expand All @@ -680,12 +767,37 @@ private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentr
{
// if we need zip 64, write zip 64 eocd and locator
long zip64EOCDRecordStart = _archiveStream.Position;
Zip64EndOfCentralDirectoryRecord.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory);
Zip64EndOfCentralDirectoryLocator.WriteBlock(_archiveStream, zip64EOCDRecordStart);

if (centralDirectoryChanged)
{
Zip64EndOfCentralDirectoryRecord.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory);
Zip64EndOfCentralDirectoryLocator.WriteBlock(_archiveStream, zip64EOCDRecordStart);
}
else
{
_archiveStream.Seek(Zip64EndOfCentralDirectoryRecord.TotalSize, SeekOrigin.Current);
_archiveStream.Seek(Zip64EndOfCentralDirectoryLocator.TotalSize, SeekOrigin.Current);
}
}

// write normal eocd
ZipEndOfCentralDirectoryBlock.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory, _archiveComment);
if (centralDirectoryChanged | (Changed != ChangeState.Unchanged))
{
ZipEndOfCentralDirectoryBlock.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory, _archiveComment);
}
else
{
_archiveStream.Seek(ZipEndOfCentralDirectoryBlock.TotalSize + _archiveComment.Length, SeekOrigin.Current);
}
}

/// <summary>
/// Bit flags describing which kinds of pending change have been made to an archive or an archive entry.
/// Flags can be combined; <see cref="Unchanged"/> is the absence of every flag.
/// </summary>
[Flags]
internal enum ChangeState
{
    /// <summary>No pending changes.</summary>
    Unchanged = 0,

    /// <summary>
    /// A fixed-length metadata field has changed. NOTE(review): presumably such a field can be
    /// rewritten in place without moving any other data - confirm against ZipArchiveEntry.
    /// </summary>
    FixedLengthMetadata = 1 << 0,

    /// <summary>
    /// Metadata whose encoded length can vary has changed. The archive comment setter raises this flag.
    /// </summary>
    DynamicLengthMetadata = 1 << 1,

    /// <summary>
    /// Stored data has changed. NOTE(review): presumably this refers to the entry's file contents -
    /// confirm against ZipArchiveEntry.
    /// </summary>
    StoredData = 1 << 2
}
}
}
Loading
Loading