bellingcat · msramalho · Apr 15, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 15, 2024
diff --git a/src/auto_archiver/archivers/twitter_archiver.py b/src/auto_archiver/archivers/twitter_archiver.py
@@ -2,6 +2,8 @@
 from datetime import datetime
 from loguru import logger
 from snscrape.modules.twitter import TwitterTweetScraper, Video, Gif, Photo
+from yt_dlp import YoutubeDL
+from yt_dlp.extractor.twitter import TwitterIE
 from slugify import slugify
 
 from . import Archiver
@@ -98,7 +100,9 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad
 
         hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
         r = requests.get(hack_url)
-        if r.status_code != 200: return False
+        if r.status_code != 200 or r.json()=={}: 
+            logger.warning(f"Failed to get tweet information from {hack_url}, trying ytdl")
+            return self.download_ytdl(item, url, tweet_id)
         tweet = r.json()
 
         urls = []
@@ -108,7 +112,7 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad
         # 1 tweet has 1 video max
         if "video" in tweet:
             v = tweet["video"]
-            urls.append(self.choose_variant(v.get("variants", [])))
+            urls.append(self.choose_variant(v.get("variants", []))['url'])
 
         logger.debug(f"Twitter hack got {urls=}")
 
@@ -125,6 +129,38 @@ def download_alternative(self, item: Metadata, url: str, tweet_id: str) -> Metad
 
         result.set_title(tweet.get("text")).set_content(json.dumps(tweet, ensure_ascii=False)).set_timestamp(datetime.strptime(tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ"))
         return result.success("twitter-hack")
+
+    def download_ytdl(self, item: Metadata, url:str, tweet_id:str) -> Metadata:
+        downloader = YoutubeDL()
+        tie = TwitterIE(downloader)
+        tweet = tie._extract_status(tweet_id)
+        result = Metadata()
+        result\
+            .set_title(tweet.get('full_text', ''))\
+            .set_content(json.dumps(tweet, ensure_ascii=False))\
+            .set_timestamp(datetime.strptime(tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"))
+        if not tweet.get("entities", {}).get("media"):
+            logger.debug('No media found, archiving tweet text only')
+            return result
+        for i, tw_media in enumerate(tweet["entities"]["media"]):
+            media = Media(filename="")
+            mimetype = ""
+            if tw_media["type"] == "photo":
+                media.set("src", UrlUtil.twitter_best_quality_url(tw_media['media_url_https']))
+                mimetype = "image/jpeg"
+            elif tw_media["type"] == "video":
+                variant = self.choose_variant(tw_media['video_info']['variants'])
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            elif tw_media["type"] == "animated_gif":
+                variant = tw_media['video_info']['variants'][0]
+                media.set("src", variant['url'])
+                mimetype = variant['content_type']
+            ext = mimetypes.guess_extension(mimetype)
+            media.filename = self.download_from_url(media.get("src"), f'{slugify(url)}_{i}{ext}', item)
+            result.add_media(media)
+        return result.success("twitter-ytdl")
+
 
     def get_username_tweet_id(self, url):
         # detect URLs that we definitely cannot handle
@@ -140,13 +176,13 @@ def choose_variant(self, variants):
         # choosing the highest quality possible
         variant, width, height = None, 0, 0
         for var in variants:
-            if var.get("type", "") == "video/mp4":
-                width_height = re.search(r"\/(\d+)x(\d+)\/", var["src"])
+            if var.get("content_type", "") == "video/mp4":
+                width_height = re.search(r"\/(\d+)x(\d+)\/", var["url"])
                 if width_height:
                     w, h = int(width_height[1]), int(width_height[2])
                     if w > width or h > height:
                         width, height = w, h
-                        variant = var.get("src", variant)
+                        variant = var
             else:
-                variant = var.get("src") if not variant else variant
+                variant = var if not variant else variant
         return variant
diff --git a/src/auto_archiver/version.py b/src/auto_archiver/version.py
@@ -3,7 +3,7 @@
 _MINOR = "11"
 # On main and in a nightly release the patch should be one ahead of the last
 # released build.
-_PATCH = "1"
+_PATCH = "2"
 # This is mainly for nightly builds which have the suffix ".dev$DATE". See
 # https://semver.org/#is-v123-a-semantic-version for the semantics.
 _SUFFIX = ""