Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create generic archiver for all valid youtube-dl URLs, add truthsocial extractor, unit tests for twitter_api extractor, utility methods for cleaning HTML and traversing objects #175

Merged
merged 22 commits into main from youtubedlp-rewrite
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c3dd19f
Sniff filetype of downloaded media and add extension
pjrobertson Jan 15, 2025
4f2b9ba
refactor youtubedlp archiver to work for all valid websites
pjrobertson Jan 15, 2025
74cf1f5
Merge branch 'main' into youtubedlp-rewrite
pjrobertson Jan 15, 2025
3ff7a94
Update yt-dlp to latest version (2025.1.12) to add bsky support
pjrobertson Jan 15, 2025
5626bba
Add test on bluesky and note on why it doesn't work
pjrobertson Jan 15, 2025
3168bed
Add (skipped) test for twitter extraction with youtubedlp
pjrobertson Jan 15, 2025
394bcd8
Further refactoring of youtubedl_archiver->base_archiver
pjrobertson Jan 17, 2025
17c1c9c
Fix up core unit tests when a twitter api key isn't provided
pjrobertson Jan 17, 2025
59eb8f7
Add TWITTER_BEARER_TOKEN to env for running download tests
pjrobertson Jan 17, 2025
5b20288
Add a 'version' arg to get the current running version
pjrobertson Jan 17, 2025
5aa7174
Quick test that the app actually runs in core tests
pjrobertson Jan 17, 2025
9c5a9e1
Rename BaseArchiver to GenericArchiver + some other tidyups
pjrobertson Jan 17, 2025
d4893ee
Fix unit tests for base_archiver->generic_archiver rename
pjrobertson Jan 17, 2025
befc92d
Further unit test tidy ups
pjrobertson Jan 17, 2025
fd2e7f9
Further tidy-ups, also adds some ytdlp utils to 'utils'
pjrobertson Jan 20, 2025
dff0105
Small fixups + implement Truth code for posts with multiple media
pjrobertson Jan 20, 2025
4bb4ebd
Further cleanup, abstracts 'dropins' out into generic files
pjrobertson Jan 21, 2025
6388983
Merge branch 'main' into youtubedlp-rewrite
pjrobertson Jan 21, 2025
7c0dcbf
Re-add doc string to generic_archiver
pjrobertson Jan 21, 2025
9dde9b2
Patch in upstream changes to ytdlp for now
pjrobertson Jan 21, 2025
d3e3eb7
unit tests for loading dropins
pjrobertson Jan 21, 2025
cd2ae37
Minor adjustments
pjrobertson Jan 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/tests-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,6 @@ jobs:
run: poetry install --no-interaction --with dev

- name: Run Download Tests
run: poetry run pytest -ra -v -m "download"
run: poetry run pytest -ra -v -x -m "download"
env:
TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN }}
73 changes: 11 additions & 62 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ dependencies = [
"cryptography (>=41.0.0,<42.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (==2024.09.27)",
"yt-dlp (==2025.1.12)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
Expand Down Expand Up @@ -74,4 +74,5 @@ documentation = "https://github.com/bellingcat/auto-archiver"
[tool.pytest.ini_options]
markers = [
"download: marks tests that download content from the network",
"incremental: marks a class to run tests incrementally. If a test fails in the class, the remaining tests will be skipped",
]
7 changes: 2 additions & 5 deletions src/auto_archiver/archivers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver
from .youtubedl_archiver import YoutubeDLArchiver
from .instagram_api_archiver import InstagramAPIArchiver
from .bluesky_archiver import BlueskyArchiver
from .base_archiver.base_archiver import BaseArchiver as YoutubeDLArchiver
from .instagram_api_archiver import InstagramAPIArchiver
37 changes: 29 additions & 8 deletions src/auto_archiver/archivers/archiver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations
from pathlib import Path
from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes, requests
from loguru import logger
Expand Down Expand Up @@ -32,6 +34,14 @@ def cleanup(self) -> None:
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url

def suitable(self, url: str) -> bool:
"""
Returns True if this archiver can handle the given URL

Should be overridden by subclasses
"""
return True

def _guess_file_type(self, path: str) -> str:
"""
Expand All @@ -46,10 +56,8 @@ def _guess_file_type(self, path: str) -> str:
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
"""
downloads a URL to provided filename, or inferred from URL, returns local filename
downloads a URL to provided filename, or inferred from URL, returns local filename
"""
# TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
# TODO: should we guess the extension?
if not to_filename:
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
Expand All @@ -59,11 +67,24 @@ def download_from_url(self, url: str, to_filename: str = None, verbose=True) ->
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
d = requests.get(url, headers=headers)
assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
with open(to_filename, 'wb') as f:
f.write(d.content)
return to_filename
try:
d = requests.get(url, stream=True, headers=headers, timeout=30)
d.raise_for_status()

# get mimetype from the response headers
if not Path(to_filename).suffix:
content_type = d.headers.get('Content-Type')
extension = mimetypes.guess_extension(content_type)
if extension:
to_filename += extension

with open(to_filename, 'wb') as f:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename

except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")

@abstractmethod
def download(self, item: Metadata) -> Metadata: pass
1 change: 1 addition & 0 deletions src/auto_archiver/archivers/base_archiver/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .base_archiver import BaseArchiver
Loading
Loading