Skip to content

Commit 0a477a8

Browse files
Reduce memory usage when updating ZipArchives (#102704)
* Changed behaviour of WriteFile WriteFile now takes a more granular approach to writing the ZipArchive to its stream. Adding a new file to the end of the archive will no longer require every file in the archive to be loaded into memory. Changing fixed-length metadata inside a file will no longer require the entire archive to be re-written. * Added tests for changed WriteFile behaviour * Changes following code review Renamed Dirty and DirtyState to Changed and ChangeState. Explicitly assigned Unchanged as a zero-value ChangeState. * Changed field protection levels Reset _originallyInArchive and _offsetOfLocalHeader to private members, exposed as internal properties. Also changed _versionToExtract to be private - this isn't ever used in System.IO.Compression outside of ZipArchiveEntry. * Following code review feedback * Used named parameter when passing a constant to forceWrite. * Replaced two magic numbers with constants. * Small cleanup of ZipArchive.WriteFile for clarity. * Renamed ZipArchiveEntry.Changed to Changes. * Further code review: initial response * Further code review comments The list of entries in a ZipArchive is now only sorted when opened in Update mode. Added/modified a test to verify that these entries appear in the correct order: offset order when opened in Update mode, central directory entry order when opened in Read mode. * Correct the write counts in tests This accounts for the removal of BinaryReader in an earlier PR * Extra test: Update_PerformMinimalWritesWhenEntriesModifiedAndAdded This test modifies an entry at a specific index, adds N entries after it, then verifies that they've been written in the expected order: existing entries first, new entries afterwards --------- Co-authored-by: Carlos Sánchez López <1175054+carlossanlop@users.noreply.github.com>
1 parent d917005 commit 0a477a8

File tree

8 files changed

+883
-118
lines changed

8 files changed

+883
-118
lines changed

src/libraries/Common/tests/System/IO/Compression/ZipTestHelper.cs

+25
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,31 @@ internal static void AddEntry(ZipArchive archive, string name, string contents,
436436
}
437437
}
438438

439+
public static byte[] CreateZipFile(int entryCount, byte[] entryContents)
440+
{
441+
using (MemoryStream ms = new())
442+
{
443+
using (ZipArchive createdArchive = new(ms, ZipArchiveMode.Create, true))
444+
{
445+
for (int i = 0; i < entryCount; i++)
446+
{
447+
string fileName = $"dummydata/{i}.bin";
448+
ZipArchiveEntry newEntry = createdArchive.CreateEntry(fileName);
449+
450+
newEntry.LastWriteTime = DateTimeOffset.Now.AddHours(-1.0);
451+
using (Stream entryWriteStream = newEntry.Open())
452+
{
453+
entryWriteStream.Write(entryContents);
454+
entryWriteStream.WriteByte((byte)(i % byte.MaxValue));
455+
}
456+
}
457+
}
458+
ms.Flush();
459+
460+
return ms.ToArray();
461+
}
462+
}
463+
439464
protected const string Utf8SmileyEmoji = "\ud83d\ude04";
440465
protected const string Utf8LowerCaseOUmlautChar = "\u00F6";
441466
protected const string Utf8CopyrightChar = "\u00A9";

src/libraries/System.IO.Compression/src/System/IO/Compression/ZipArchive.cs

+132-20
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ public class ZipArchive : IDisposable
3030
private readonly Stream? _backingStream;
3131
private byte[] _archiveComment;
3232
private Encoding? _entryNameAndCommentEncoding;
33+
private long _firstDeletedEntryOffset;
3334

3435
#if DEBUG_FORCE_ZIP64
3536
public bool _forceZip64;
@@ -164,12 +165,14 @@ public ZipArchive(Stream stream, ZipArchiveMode mode, bool leaveOpen, Encoding?
164165
_entries = new List<ZipArchiveEntry>();
165166
_entriesCollection = new ReadOnlyCollection<ZipArchiveEntry>(_entries);
166167
_entriesDictionary = new Dictionary<string, ZipArchiveEntry>();
168+
Changed = ChangeState.Unchanged;
167169
_readEntries = false;
168170
_leaveOpen = leaveOpen;
169171
_centralDirectoryStart = 0; // invalid until ReadCentralDirectory
170172
_isDisposed = false;
171173
_numberOfThisDisk = 0; // invalid until ReadCentralDirectory
172174
_archiveComment = Array.Empty<byte>();
175+
_firstDeletedEntryOffset = long.MaxValue;
173176

174177
switch (mode)
175178
{
@@ -217,7 +220,11 @@ public ZipArchive(Stream stream, ZipArchiveMode mode, bool leaveOpen, Encoding?
217220
public string Comment
218221
{
219222
get => (EntryNameAndCommentEncoding ?? Encoding.UTF8).GetString(_archiveComment);
220-
set => _archiveComment = ZipHelper.GetEncodedTruncatedBytesFromString(value, EntryNameAndCommentEncoding, ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength, out _);
223+
set
224+
{
225+
_archiveComment = ZipHelper.GetEncodedTruncatedBytesFromString(value, EntryNameAndCommentEncoding, ZipEndOfCentralDirectoryBlock.ZipFileCommentMaxLength, out _);
226+
Changed |= ChangeState.DynamicLengthMetadata;
227+
}
221228
}
222229

223230
/// <summary>
@@ -383,6 +390,10 @@ private set
383390
}
384391
}
385392

393+
// This property's value only relates to the top-level fields of the archive (such as the archive comment.)
394+
// New entries in the archive won't change its state.
395+
internal ChangeState Changed { get; private set; }
396+
386397
private ZipArchiveEntry DoCreateEntry(string entryName, CompressionLevel? compressionLevel)
387398
{
388399
ArgumentException.ThrowIfNullOrEmpty(entryName);
@@ -409,7 +420,7 @@ internal void AcquireArchiveStream(ZipArchiveEntry entry)
409420
{
410421
if (!_archiveStreamOwner.EverOpenedForWrite)
411422
{
412-
_archiveStreamOwner.WriteAndFinishLocalEntry();
423+
_archiveStreamOwner.WriteAndFinishLocalEntry(forceWrite: true);
413424
}
414425
else
415426
{
@@ -441,6 +452,11 @@ internal void RemoveEntry(ZipArchiveEntry entry)
441452
_entries.Remove(entry);
442453

443454
_entriesDictionary.Remove(entry.FullName);
455+
// Keep track of the offset of the earliest deleted entry in the archive
456+
if (entry.OriginallyInArchive && entry.OffsetOfLocalHeader < _firstDeletedEntryOffset)
457+
{
458+
_firstDeletedEntryOffset = entry.OffsetOfLocalHeader;
459+
}
444460
}
445461

446462
internal void ThrowIfDisposed()
@@ -550,7 +566,12 @@ private void ReadCentralDirectory()
550566
throw new InvalidDataException(SR.NumEntriesWrong);
551567
}
552568

553-
_archiveStream.Seek(_centralDirectoryStart + bytesRead, SeekOrigin.Begin);
569+
// Sort _entries by each archive entry's position. This supports the algorithm in WriteFile, so is only
570+
// necessary when the ZipArchive has been opened in Update mode.
571+
if (Mode == ZipArchiveMode.Update)
572+
{
573+
_entries.Sort(ZipArchiveEntry.LocalHeaderOffsetComparer.Instance);
574+
}
554575
}
555576
catch (EndOfStreamException ex)
556577
{
@@ -681,41 +702,107 @@ private void WriteFile()
681702
// if we are in update mode, we call EnsureCentralDirectoryRead, which sets readEntries to true
682703
Debug.Assert(_readEntries);
683704

705+
// Entries starting after this offset have had a dynamically-sized change. Everything on or after this point must be rewritten.
706+
long completeRewriteStartingOffset = 0;
707+
List<ZipArchiveEntry> entriesToWrite = _entries;
708+
684709
if (_mode == ZipArchiveMode.Update)
685710
{
686-
List<ZipArchiveEntry> markedForDelete = new List<ZipArchiveEntry>();
711+
// Entries starting after this offset have some kind of change made to them. It might just be a fixed-length field though, in which case
712+
// that single entry's metadata can be rewritten without impacting anything else.
713+
long startingOffset = _firstDeletedEntryOffset;
714+
long nextFileOffset = 0;
715+
completeRewriteStartingOffset = startingOffset;
716+
717+
entriesToWrite = new(_entries.Count);
687718
foreach (ZipArchiveEntry entry in _entries)
688719
{
689-
if (!entry.LoadLocalHeaderExtraFieldAndCompressedBytesIfNeeded())
690-
markedForDelete.Add(entry);
720+
if (!entry.OriginallyInArchive)
721+
{
722+
entriesToWrite.Add(entry);
723+
}
724+
else
725+
{
726+
if (entry.Changes == ChangeState.Unchanged)
727+
{
728+
// Keep track of the expected position of the file entry after the final untouched file entry so that when the loop completes,
729+
// we'll know which position to start writing new entries from.
730+
nextFileOffset = Math.Max(nextFileOffset, entry.OffsetOfCompressedData + entry.CompressedLength);
731+
}
732+
// When calculating the starting offset to load the files from, only look at changed entries which are already in the archive.
733+
else
734+
{
735+
startingOffset = Math.Min(startingOffset, entry.OffsetOfLocalHeader);
736+
}
737+
738+
// We want to re-write entries which are after the starting offset of the first entry which has pending data to write.
739+
// NB: the existing ZipArchiveEntries are sorted in _entries by their position ascending.
740+
if (entry.OffsetOfLocalHeader >= startingOffset)
741+
{
742+
// If the pending data to write is fixed-length metadata in the header, there's no need to load the compressed file bits.
743+
if ((entry.Changes & (ChangeState.DynamicLengthMetadata | ChangeState.StoredData)) != 0)
744+
{
745+
completeRewriteStartingOffset = Math.Min(completeRewriteStartingOffset, entry.OffsetOfLocalHeader);
746+
}
747+
if (entry.OffsetOfLocalHeader >= completeRewriteStartingOffset)
748+
{
749+
entry.LoadLocalHeaderExtraFieldAndCompressedBytesIfNeeded();
750+
}
751+
752+
entriesToWrite.Add(entry);
753+
}
754+
}
755+
}
756+
757+
// If the offset of entries to write from is still at long.MaxValue, then we know that nothing has been deleted,
758+
// nothing has been modified - so we just want to move to the end of all remaining files in the archive.
759+
if (startingOffset == long.MaxValue)
760+
{
761+
startingOffset = nextFileOffset;
691762
}
692-
foreach (ZipArchiveEntry entry in markedForDelete)
693-
entry.Delete();
694763

695-
_archiveStream.Seek(0, SeekOrigin.Begin);
696-
_archiveStream.SetLength(0);
764+
_archiveStream.Seek(startingOffset, SeekOrigin.Begin);
697765
}
698766

699-
foreach (ZipArchiveEntry entry in _entries)
767+
foreach (ZipArchiveEntry entry in entriesToWrite)
700768
{
701-
entry.WriteAndFinishLocalEntry();
769+
// We don't always need to write the local header entry, ZipArchiveEntry is usually able to work out when it doesn't need to.
770+
// We want to force this header entry to be written (even for completely untouched entries) if the entry comes after one
771+
// which had a pending dynamically-sized write.
772+
bool forceWriteLocalEntry = !entry.OriginallyInArchive || (entry.OriginallyInArchive && entry.OffsetOfLocalHeader >= completeRewriteStartingOffset);
773+
774+
entry.WriteAndFinishLocalEntry(forceWriteLocalEntry);
702775
}
703776

704-
long startOfCentralDirectory = _archiveStream.Position;
777+
long plannedCentralDirectoryPosition = _archiveStream.Position;
778+
// If there are no entries in the archive, we still want to create the archive epilogue.
779+
bool archiveEpilogueRequiresUpdate = _entries.Count == 0;
705780

706781
foreach (ZipArchiveEntry entry in _entries)
707782
{
708-
entry.WriteCentralDirectoryFileHeader();
783+
// The central directory needs to be rewritten if its position has moved, if there's a new entry in the archive, or if the entry might be different.
784+
bool centralDirectoryEntryRequiresUpdate = plannedCentralDirectoryPosition != _centralDirectoryStart
785+
|| !entry.OriginallyInArchive || entry.OffsetOfLocalHeader >= completeRewriteStartingOffset;
786+
787+
entry.WriteCentralDirectoryFileHeader(centralDirectoryEntryRequiresUpdate);
788+
archiveEpilogueRequiresUpdate |= centralDirectoryEntryRequiresUpdate;
709789
}
710790

711-
long sizeOfCentralDirectory = _archiveStream.Position - startOfCentralDirectory;
791+
long sizeOfCentralDirectory = _archiveStream.Position - plannedCentralDirectoryPosition;
792+
793+
WriteArchiveEpilogue(plannedCentralDirectoryPosition, sizeOfCentralDirectory, archiveEpilogueRequiresUpdate);
712794

713-
WriteArchiveEpilogue(startOfCentralDirectory, sizeOfCentralDirectory);
795+
// If entries have been removed and new (smaller) ones added, there could be empty space at the end of the file.
796+
// Shrink the file to reclaim this space.
797+
if (_mode == ZipArchiveMode.Update && _archiveStream.Position != _archiveStream.Length)
798+
{
799+
_archiveStream.SetLength(_archiveStream.Position);
800+
}
714801
}
715802

716803
// writes eocd, and if needed, zip 64 eocd, zip64 eocd locator
717804
// should only throw an exception in extremely exceptional cases because it is called from dispose
718-
private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentralDirectory)
805+
private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentralDirectory, bool centralDirectoryChanged)
719806
{
720807
// determine if we need Zip 64
721808
if (startOfCentralDirectory >= uint.MaxValue
@@ -728,12 +815,37 @@ private void WriteArchiveEpilogue(long startOfCentralDirectory, long sizeOfCentr
728815
{
729816
// if we need zip 64, write zip 64 eocd and locator
730817
long zip64EOCDRecordStart = _archiveStream.Position;
731-
Zip64EndOfCentralDirectoryRecord.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory);
732-
Zip64EndOfCentralDirectoryLocator.WriteBlock(_archiveStream, zip64EOCDRecordStart);
818+
819+
if (centralDirectoryChanged)
820+
{
821+
Zip64EndOfCentralDirectoryRecord.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory);
822+
Zip64EndOfCentralDirectoryLocator.WriteBlock(_archiveStream, zip64EOCDRecordStart);
823+
}
824+
else
825+
{
826+
_archiveStream.Seek(Zip64EndOfCentralDirectoryRecord.TotalSize, SeekOrigin.Current);
827+
_archiveStream.Seek(Zip64EndOfCentralDirectoryLocator.TotalSize, SeekOrigin.Current);
828+
}
733829
}
734830

735831
// write normal eocd
736-
ZipEndOfCentralDirectoryBlock.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory, _archiveComment);
832+
if (centralDirectoryChanged || (Changed != ChangeState.Unchanged))
833+
{
834+
ZipEndOfCentralDirectoryBlock.WriteBlock(_archiveStream, _entries.Count, startOfCentralDirectory, sizeOfCentralDirectory, _archiveComment);
835+
}
836+
else
837+
{
838+
_archiveStream.Seek(ZipEndOfCentralDirectoryBlock.TotalSize + _archiveComment.Length, SeekOrigin.Current);
839+
}
840+
}
841+
842+
[Flags]
843+
internal enum ChangeState
844+
{
845+
Unchanged = 0x0,
846+
FixedLengthMetadata = 0x1,
847+
DynamicLengthMetadata = 0x2,
848+
StoredData = 0x4
737849
}
738850
}
739851
}

0 commit comments

Comments
 (0)