Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create generic archiver for all valid youtube-dl URLs, add truthsocial extractor, unit tests for twitter_api extractor, utility methods for cleaning HTML and traversing objects #175

Merged
merged 22 commits into main from youtubedlp-rewrite
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c3dd19f
Sniff filetype of downloaded media and add extension
pjrobertson Jan 15, 2025
4f2b9ba
refactor youtubedlp archiver to work for all valid websites
pjrobertson Jan 15, 2025
74cf1f5
Merge branch 'main' into youtubedlp-rewrite
pjrobertson Jan 15, 2025
3ff7a94
Update yt-dlp to latest version (2025.1.12) to add bsky support
pjrobertson Jan 15, 2025
5626bba
Add test on bluesky and note on why it doesn't work
pjrobertson Jan 15, 2025
3168bed
Add (skipped) test for twitter extraction with youtubedlp
pjrobertson Jan 15, 2025
394bcd8
Further refactoring of youtubedl_archiver->base_archiver
pjrobertson Jan 17, 2025
17c1c9c
Fix up core unit tests when a twitter api key isn't provided
pjrobertson Jan 17, 2025
59eb8f7
Add TWITTER_BEARER_TOKEN to env for running download tests
pjrobertson Jan 17, 2025
5b20288
Add a 'version' arg to get the current running version
pjrobertson Jan 17, 2025
5aa7174
Quick test that the app actually runs in core tests
pjrobertson Jan 17, 2025
9c5a9e1
Rename BaseArchiver to GenericArchiver + some other tidyups
pjrobertson Jan 17, 2025
d4893ee
Fix unit tests for base_archiver->generic_archiver rename
pjrobertson Jan 17, 2025
befc92d
Further unit test tidy ups
pjrobertson Jan 17, 2025
fd2e7f9
Further tidy-ups, also adds some ytdlp utils to 'utils'
pjrobertson Jan 20, 2025
dff0105
Small fixups + implement Truth code for posts with multiple media
pjrobertson Jan 20, 2025
4bb4ebd
Further cleanup, abstracts 'dropins' out into generic files
pjrobertson Jan 21, 2025
6388983
Merge branch 'main' into youtubedlp-rewrite
pjrobertson Jan 21, 2025
7c0dcbf
Re-add doc string to generic_archiver
pjrobertson Jan 21, 2025
9dde9b2
Patch in upstream changes to ytdlp for now
pjrobertson Jan 21, 2025
d3e3eb7
unit tests for loading dropins
pjrobertson Jan 21, 2025
cd2ae37
Minor adjustments
pjrobertson Jan 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/tests-download.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,6 @@ jobs:
run: poetry install --no-interaction --with dev

- name: Run Download Tests
run: poetry run pytest -ra -v -m "download"
run: poetry run pytest -ra -v -x -m "download"
env:
TWITTER_BEARER_TOKEN: ${{ secrets.TWITTER_BEARER_TOKEN }}
73 changes: 11 additions & 62 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ dependencies = [
"cryptography (>=41.0.0,<42.0.0)",
"boto3 (>=1.28.0,<2.0.0)",
"dataclasses-json (>=0.0.0)",
"yt-dlp (==2024.09.27)",
"yt-dlp (==2025.1.12)",
"numpy (==2.1.3)",
"vk-url-scraper (>=0.0.0)",
"requests[socks] (>=0.0.0)",
Expand Down Expand Up @@ -74,4 +74,5 @@ documentation = "https://github.com/bellingcat/auto-archiver"
[tool.pytest.ini_options]
markers = [
"download: marks tests that download content from the network",
"incremental: marks a class to run tests incrementally. If a test fails in the class, the remaining tests will be skipped",
]
7 changes: 2 additions & 5 deletions src/auto_archiver/archivers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from .archiver import Archiver
from .telethon_archiver import TelethonArchiver
from .twitter_archiver import TwitterArchiver
from .twitter_api_archiver import TwitterApiArchiver
from .instagram_archiver import InstagramArchiver
from .instagram_tbot_archiver import InstagramTbotArchiver
from .tiktok_archiver import TiktokArchiver
from .telegram_archiver import TelegramArchiver
from .vk_archiver import VkArchiver
from .youtubedl_archiver import YoutubeDLArchiver
from .instagram_api_archiver import InstagramAPIArchiver
from .bluesky_archiver import BlueskyArchiver
from .base_archiver.base_archiver import BaseArchiver as YoutubeDLArchiver
from .instagram_api_archiver import InstagramAPIArchiver
37 changes: 29 additions & 8 deletions src/auto_archiver/archivers/archiver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations
from pathlib import Path
from abc import abstractmethod
from dataclasses import dataclass
import mimetypes
import os
import mimetypes, requests
from loguru import logger
Expand Down Expand Up @@ -32,6 +34,14 @@ def cleanup(self) -> None:
def sanitize_url(self, url: str) -> str:
# used to clean unnecessary URL parameters OR unfurl redirect links
return url

def suitable(self, url: str) -> bool:
"""
Returns True if this archiver can handle the given URL

Should be overridden by subclasses
"""
return True

def _guess_file_type(self, path: str) -> str:
"""
Expand All @@ -46,10 +56,8 @@ def _guess_file_type(self, path: str) -> str:
@retry(wait_random_min=500, wait_random_max=3500, stop_max_attempt_number=5)
def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> str:
"""
downloads a URL to provided filename, or inferred from URL, returns local filename
downloads a URL to provided filename, or inferred from URL, returns local filename
"""
# TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches
# TODO: should we guess the extension?
if not to_filename:
to_filename = url.split('/')[-1].split('?')[0]
if len(to_filename) > 64:
Expand All @@ -59,11 +67,24 @@ def download_from_url(self, url: str, to_filename: str = None, verbose=True) ->
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
d = requests.get(url, headers=headers)
assert d.status_code == 200, f"got response code {d.status_code} for {url=}"
with open(to_filename, 'wb') as f:
f.write(d.content)
return to_filename
try:
d = requests.get(url, stream=True, headers=headers, timeout=30)
d.raise_for_status()

# get mimetype from the response headers
if not Path(to_filename).suffix:
content_type = d.headers.get('Content-Type')
extension = mimetypes.guess_extension(content_type)
if extension:
to_filename += extension

with open(to_filename, 'wb') as f:
for chunk in d.iter_content(chunk_size=8192):
f.write(chunk)
return to_filename

except requests.RequestException as e:
logger.warning(f"Failed to fetch the Media URL: {e}")

@abstractmethod
def download(self, item: Metadata) -> Metadata: pass
1 change: 1 addition & 0 deletions src/auto_archiver/archivers/base_archiver/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .base_archiver import BaseArchiver
Loading
Loading