Skip to content

Commit

Permalink
Avoid name conflicts when adding WARCs to collection
Browse files Browse the repository at this point in the history
Append -index to end of files until there is no conflict
  • Loading branch information
tw4l committed Apr 24, 2024
1 parent e89924b commit 8d5b2be
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 9 deletions.
29 changes: 20 additions & 9 deletions pywb/manager/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,18 +147,29 @@ def add_archives(self, archives, unpack_wacz=False):
if invalid_archives:
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')

def _rename_warc(self, source_dir, warc_basename):
    """Return a file name based on ``warc_basename`` that is unused in the archive dir.

    A ``-<index>`` suffix is inserted *before* the file extension, so the
    renamed file keeps its extension (``example.warc`` -> ``example-1.warc``,
    ``example.warc.gz`` -> ``example-1.warc.gz``). The index starts at 1 and
    is incremented until no file with the candidate name exists in
    ``self.archive_dir``.

    :param source_dir: directory of the source WARC; not used here, kept so
        existing callers keep working.
    :param warc_basename: file name (without directory) of the WARC to add.
    :return: the first candidate file name that does not exist in
        ``self.archive_dir``.
    """
    stem, ext = os.path.splitext(warc_basename)
    # Keep a compression suffix together with the extension in front of it,
    # so "name.warc.gz" splits into "name" + ".warc.gz".
    if ext.lower() == '.gz':
        stem, inner_ext = os.path.splitext(stem)
        ext = inner_ext + ext

    dupe_idx = 1
    while True:
        new_basename = f'{stem}-{dupe_idx}{ext}'
        if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
            return new_basename
        dupe_idx += 1

# Copy a WARC file into self.archive_dir and return a path for it.
# NOTE(review): this span appears to be a flattened diff -- lines from the
# pre-commit and post-commit versions of the method are interleaved and the
# +/- markers and indentation are gone. The "[old]" / "[new]" tags below are
# inferred from the "20 additions & 9 deletions" summary; confirm against the
# repository before relying on them.
def _add_warc(self, warc):
# [old] absolute path of the source WARC
filename = os.path.abspath(warc)
# [new] absolute path of the source WARC, split into directory and file name
warc_source = os.path.abspath(warc)
source_dir, warc_basename = os.path.split(warc_source)

# don't overwrite existing warcs with duplicate names
# [old] on a name clash: log a warning and return None without copying
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
return None

# [old] copy into archive_dir; because `filename` is absolute, os.path.join()
# discards archive_dir, so full_path is the source path, not the copy
shutil.copy2(filename, self.archive_dir)
full_path = os.path.join(self.archive_dir, filename)
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
return full_path
# [new] on a name clash: obtain a replacement name from _rename_warc and log it
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
warc_basename = self._rename_warc(source_dir, warc_basename)
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')

# [new] copy to an explicit destination path inside archive_dir and return it
warc_dest = os.path.join(self.archive_dir, warc_basename)
shutil.copy2(warc_source, warc_dest)
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
return warc_dest

def _add_wacz_unpacked(self, wacz):
wacz = os.path.abspath(wacz)
Expand Down
14 changes: 14 additions & 0 deletions tests/test_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path):
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
assert '"filename": "valid_example_1-0.warc"' in f.read()

def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
    """Adding the same WACZ twice keeps the first WARC and stores the second under a ``-1`` name."""
    manager = self.get_test_collections_manager(tmp_path)

    # Unpack the same WACZ twice so the second WARC collides with the first.
    for _ in range(2):
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

    # Both the original and the renamed WARC must be present in the archive dir.
    archived = os.listdir(manager.archive_dir)
    assert 'valid_example_1-0.warc' in archived
    assert 'valid_example_1-0-1.warc' in archived

    # The index file must exist and reference both WARC file names.
    index_name = manager.DEF_INDEX_FILE
    assert index_name in os.listdir(manager.indexes_dir)
    index_path = os.path.join(manager.indexes_dir, index_name)
    with open(index_path, 'r') as index_file:
        index_text = index_file.read()
    assert '"filename": "valid_example_1-0.warc"' in index_text
    assert '"filename": "valid_example_1-0-1.warc"' in index_text

def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
"""Test if adding an invalid wacz file to a collection fails"""
manager = self.get_test_collections_manager(tmp_path)
Expand Down

0 comments on commit 8d5b2be

Please sign in to comment.