Skip to content

Commit

Permalink
Fix 'download_syndication' method for tweet archiving (now requires a token)
Browse files Browse the repository at this point in the history

Plus add in unit tests for token generation + download syndication
  • Loading branch information
pjrobertson committed Jan 12, 2025
1 parent c932fb7 commit 66834e9
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 6 deletions.
42 changes: 36 additions & 6 deletions src/auto_archiver/archivers/twitter_archiver.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import re, requests, mimetypes, json
import re, requests, mimetypes, json, math
from typing import Union
from datetime import datetime
from loguru import logger
Expand Down Expand Up @@ -59,17 +59,47 @@ def download(self, item: Metadata) -> Metadata:

logger.warning(f"No free strategy worked for {url}")
return False


def generate_token(self, tweet_id: str) -> str:
    """
    Compute the `token` query parameter sent alongside a tweet id to the
    syndication endpoint.

    The id is divided by 1e15 and multiplied by pi; the result is written in
    base 36 (integer digits followed by fractional digits, at most 11 digits
    in total) and every '0' digit is then removed.
    Background: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215

    :param tweet_id: the tweet id as a decimal string
    :return: the lowercase base-36 token (empty if every digit was '0')
    :raises ValueError: if `tweet_id` cannot be parsed by `int()`
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
    max_digits = 11  # bounds the fractional expansion so the loop always ends

    value = (int(tweet_id) / 1e15) * math.pi
    fraction = value % 1

    # Integer part: peel off the least significant base-36 digit each round
    # and prepend it, so the most significant digit ends up first.
    digits = ""
    while value >= 1:
        digits = alphabet[int(value % 36)] + digits
        value = math.floor(value / 36)

    # Fractional part: each multiplication by 36 shifts the next digit into
    # the integer position. The cap counts integer digits too.
    while fraction > 0 and len(digits) < max_digits:
        fraction *= 36
        digit = int(fraction)
        digits += alphabet[digit]
        fraction -= digit

    # All zeros are dropped, not only leading ones. No '.' separator is ever
    # emitted above, so there is nothing else to strip.
    return digits.replace("0", "")



def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union[Metadata|bool]:
"""
Hack alternative working again.
https://stackoverflow.com/a/71867055/6196010 (OUTDATED URL)
Hack alternative working again:
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-1615937362
# as of 2024, requires a token:
https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
next to test: https://cdn.embedly.com/widgets/media.html?&schema=twitter&url=https://twitter.com/bellingcat/status/1674700676612386816
"""

hack_url = f"https://cdn.syndication.twimg.com/tweet-result?id={tweet_id}"
r = requests.get(hack_url)
hack_url = "https://cdn.syndication.twimg.com/tweet-result"
params = {
'id': tweet_id,
'token': self.generate_token(tweet_id)
}

r = requests.get(hack_url, params=params, timeout=10)
if r.status_code != 200 or r.json()=={}:
logger.warning(f"SyndicationHack: Failed to get tweet information from {hack_url}.")
return False
Expand All @@ -86,7 +116,7 @@ def download_syndication(self, item: Metadata, url: str, tweet_id: str) -> Union
v = tweet["video"]
urls.append(self.choose_variant(v.get("variants", []))['url'])

logger.debug(f"Twitter hack got {urls=}")
logger.debug(f"Twitter hack got media {urls=}")

for i, u in enumerate(urls):
media = Media(filename="")
Expand Down
24 changes: 24 additions & 0 deletions tests/archivers/test_twitter_archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,30 @@ def test_youtube_dlp_archiver(self):
datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
)

def test_reverse_engineer_token(self):
    """generate_token must reproduce the real token for each sample tweet id."""
    # Vercel's implementation: https://github.com/vercel/react-tweet/blob/main/packages/react-tweet/src/api/fetch-tweet.ts#L27C1-L31C2
    # Discussion: https://github.com/JustAnotherArchivist/snscrape/issues/996#issuecomment-2211358215
    real_tokens_by_id = {
        "1874097816571961839": "4jjngwkifa",
        "1674700676612386816": "42586mwa3uv",
        "1877747914073620506": "4jv4aahw36n",
        "1876710769913450647": "4jruzjz5lux",
        "1346554693649113090": "39ibqxei7mo",
    }

    # dicts iterate in insertion order, so the cases run in the order listed
    for tweet_id, expected in real_tokens_by_id.items():
        actual = self.archiver.generate_token(tweet_id)
        self.assertEqual(expected, actual)

def test_syndication_archiver(self):
    """Run download_syndication on one tweet and check the metadata it returns."""
    tweet_id = "1874097816571961839"
    url = "https://x.com/bellingcat/status/1874097816571961839"

    item = self.create_item(url)
    post = self.archiver.download_syndication(item, url, tweet_id)

    # download_syndication returns False on failure, so a truthy result is required first
    self.assertTrue(post)

    expected_text = "As 2024 comes to a close, here’s some examples of what Bellingcat investigated per month in our 10th year! 🧵"
    expected_timestamp = datetime.datetime(2024, 12, 31, 14, 18, 33, tzinfo=datetime.timezone.utc)
    self.assertValidResponseMetadata(post, expected_text, expected_timestamp)

def test_download_nonexistend_tweet(self):
# this tweet does not exist
url = "https://x.com/Bellingcat/status/17197025860711058"
Expand Down

0 comments on commit 66834e9

Please sign in to comment.