Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add yt-dlp based archiving for TwitterArchiver #138

Merged
merged 4 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 42 additions & 6 deletions src/auto_archiver/archivers/twitter_archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datetime import datetime
from loguru import logger
from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
from yt_dlp import YoutubeDL
from yt_dlp.extractor.twitter import TwitterIE
from slugify import slugify

from . import Archiver
Expand Down Expand Up @@ -98,7 +100,9 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad

hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url)
if r.status_code != 200: return False
if r.status_code != 200 or r.json()=={}:
logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
return self.download_ytdl(item, url, tweet_id)
tweet = r.json()

urls = []
Expand All @@ -108,7 +112,7 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad
# 1 tweet has 1 video max
if "video" in tweet:
v = tweet["video"]
urls.append(self.choose_variant(v.get("variants", [])))
urls.append(self.choose_variant(v.get("variants", []))['url'])

logger.debug(f"Twitter hack got {urls=}")

Expand All @@ -125,6 +129,38 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad

result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
return result.success("twitter-hack")

def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
downloader = YoutubeDL()
tie = TwitterIE(downloader)
tweet = tie._extract_status(tweet_id)
result = Metadata()
result\
.set_title(tweet.get('full_text', ''))\
.set_content(json.dumps(tweet, ensure_ascii=False))\
.set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
if not tweet.get("entities", {}).get("media"):
logger.debug('No media found, archiving tweet text only')
return result
for i, tw_media in enumerate(tweet["entities"]["media"]):
media = Media(filename="")
mimetype = ""
if tw_media["type"] == "photo":
media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
mimetype = "image/jpeg"
elif tw_media["type"] == "video":
variant = self.choose_variant(tw_media['video_info']['variants'])
media.set("src", variant['url'])
mimetype = variant['content_type']
elif tw_media["type"] == "animated_gif":
variant = tw_media['video_info']['variants'][0]
media.set("src", variant['url'])
mimetype = variant['content_type']
ext = mimetypes.guess_extension(mimetype)
media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
result.add_media(media)
return result.success("twitter-ytdl")


def get_username_tweet_id(self, url):
# detect URLs that we definitely cannot handle
Expand All @@ -140,13 +176,13 @@ def choose_variant(self, variants):
# choosing the highest quality possible
variant, width, height = None, 0, 0
for var in variants:
if var.get("type", "") == "video/mp4":
width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
if var.get("content_type", "") == "video/mp4":
width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
if width_height:
w, h = int(width_height[1]), int(width_height[2])
if w > width or h > height:
width, height = w, h
variant = var.get("src", variant)
variant = var
else:
variant = var.get("src") if not variant else variant
variant = var if not variant else variant
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these key changes in variant_choosing may break the other scrapers, but as they are well... broken ... it's just fine for now.

return variant
2 changes: 1 addition & 1 deletion src/auto_archiver/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
_MINOR = "11"
# On main and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "1"
_PATCH = "2"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""
Expand Down