diff --git a/example.orchestration.yaml b/example.orchestration.yaml index a36b125c..ebfb8644 100644 --- a/example.orchestration.yaml +++ b/example.orchestration.yaml @@ -2,6 +2,7 @@ steps: # only 1 feeder allowed feeder: gsheet_feeder # defaults to cli_feeder archivers: # order matters, uncomment to activate + - bluesky_archiver # - vk_archiver # - telethon_archiver # - telegram_archiver diff --git a/src/auto_archiver/archivers/__init__.py b/src/auto_archiver/archivers/__init__.py index ac92fdef..996ca3b2 100644 --- a/src/auto_archiver/archivers/__init__.py +++ b/src/auto_archiver/archivers/__init__.py @@ -8,4 +8,5 @@ from .telegram_archiver import TelegramArchiver from .vk_archiver import VkArchiver from .youtubedl_archiver import YoutubeDLArchiver -from .instagram_api_archiver import InstagramAPIArchiver \ No newline at end of file +from .instagram_api_archiver import InstagramAPIArchiver +from .bluesky_archiver import BlueskyArchiver \ No newline at end of file diff --git a/src/auto_archiver/archivers/archiver.py b/src/auto_archiver/archivers/archiver.py index c44ab0a7..25e08c34 100644 --- a/src/auto_archiver/archivers/archiver.py +++ b/src/auto_archiver/archivers/archiver.py @@ -48,6 +48,8 @@ def download_from_url(self, url: str, to_filename: str = None, verbose=True) -> """ downloads a URL to provided filename, or inferred from URL, returns local filename """ + # TODO: should we refactor to use requests.get(url, stream=True) and write to file in chunks? compare approaches + # TODO: should we guess the extension? if not to_filename: to_filename = url.split('/')[-1].split('?')[0] if len(to_filename) > 64: diff --git a/src/auto_archiver/archivers/bluesky_archiver.py b/src/auto_archiver/archivers/bluesky_archiver.py new file mode 100644 index 00000000..534fba29 --- /dev/null +++ b/src/auto_archiver/archivers/bluesky_archiver.py @@ -0,0 +1,119 @@ +import os +import re, requests, mimetypes +from loguru import logger + + +from . 
from . import Archiver
from ..core import Metadata, Media, ArchivingContext


class BlueskyArchiver(Archiver):
    """
    Uses an unauthenticated Bluesky API to archive posts including metadata, images and videos. Relies on `public.api.bsky.app/xrpc` and `bsky.social/xrpc`. Avoids ATProto to avoid auth.

    Some inspiration from https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/extractor/bluesky.py
    """
    name = "bluesky_archiver"
    # captures (handle-or-did, post id) from a https://bsky.app/profile/<handle>/post/<id> URL
    BSKY_POST = re.compile(r"/profile/([^/]+)/post/([a-zA-Z0-9]+)")
    # seconds; passed to every requests call so a stalled connection cannot block the archiver forever
    REQUEST_TIMEOUT = 30

    def __init__(self, config: dict) -> None:
        super().__init__(config)

    @staticmethod
    def configs() -> dict:
        """This archiver exposes no configuration options."""
        return {}

    def download(self, item: Metadata) -> Metadata:
        """
        Archives a single Bluesky post: text, timestamp, author/facet metadata and any embedded images or video.

        Returns `False` when the item's URL is not a Bluesky post URL, otherwise a new
        `Metadata` marked as successful with the "bluesky" context.
        Raises whatever `_get_post_from_uri` / `_download_bsky_file_as_media` raise
        (HTTP errors, unexpected API payloads).
        """
        url = item.get_url()
        if not self.BSKY_POST.search(url):
            return False

        logger.debug(f"Identified a Bluesky post: {url}, archiving...")
        result = Metadata()

        # fetch post info and update result
        post = self._get_post_from_uri(url)
        logger.debug(f"Extracted post info: {post['record']['text']}")
        result.set_title(post["record"]["text"])
        result.set_timestamp(post["record"]["createdAt"])
        for key, value in self._get_post_data(post).items():
            if value:
                result.set(key, value)

        # download if embeds present (1 video XOR >=1 images)
        for media in self._download_bsky_embeds(post):
            result.add_media(media)
        logger.debug(f"Downloaded {len(result.media)} media files")

        return result.success("bluesky")

    def _get_post_from_uri(self, post_uri: str) -> dict:
        """
        Calls a public (no auth needed) Bluesky API to get a post from its uri, uses .getPostThread as it brings author info as well (unlike .getPost).

        `post_uri` must match `BSKY_POST`. Returns the `post` object of the thread view.
        Raises `requests.HTTPError` on a non-2xx response and `AssertionError` when the
        API does not return a `threadViewPost`.
        """
        post_match = self.BSKY_POST.search(post_uri)
        username = post_match.group(1)
        post_id = post_match.group(2)
        at_uri = f'at://{username}/app.bsky.feed.post/{post_id}'
        # let requests build the query string: the handle comes straight from the URL
        # and may contain characters that need escaping
        r = requests.get(
            "https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread",
            params={"uri": at_uri, "depth": 0, "parent_height": 0},
            timeout=self.REQUEST_TIMEOUT,
        )
        r.raise_for_status()
        thread = r.json()
        thread_type = thread.get("thread", {}).get("$type")
        # explicit raise instead of `assert` so the check still runs under `python -O`;
        # AssertionError is kept so existing callers see the same exception type
        if thread_type != "app.bsky.feed.defs#threadViewPost":
            raise AssertionError(f"Unexpected Bluesky thread type {thread_type!r} for {at_uri}")
        return thread["thread"]["post"]

    def _download_bsky_embeds(self, post: dict) -> list[Media]:
        """
        Iterates over image(s) or video in a Bluesky post and downloads them.

        Images and video are looked up both directly on `record.embed` and nested under
        `record.embed.media`. Returns the downloaded files, images first, then videos.
        """
        embed = post.get("record", {}).get("embed", {})
        nested = embed.get("media", {})
        image_medias = embed.get("images", []) + nested.get("images", [])
        video_medias = [video for video in (embed.get("video"), nested.get("video")) if video]

        did = post["author"]["did"]
        # every blob is addressed by its content id ($link) plus the author's did
        cids = [image["image"]["ref"]["$link"] for image in image_medias]
        cids += [video["ref"]["$link"] for video in video_medias]
        return [self._download_bsky_file_as_media(cid, did) for cid in cids]

    def _download_bsky_file_as_media(self, cid: str, did: str) -> Media:
        """
        Uses the Bluesky API to download a file by its `cid` and `did`.

        The file is streamed into the archiving tmp dir as `<cid><ext>`, where the
        extension is guessed from the response Content-Type (no extension when it is
        missing or unknown). Returns a `Media` whose `src` is the blob URL.
        Raises `requests.HTTPError` on a non-2xx response.
        """
        # TODO: replace with self.download_from_url once that function has been cleaned-up
        file_url = f"https://bsky.social/xrpc/com.atproto.sync.getBlob?cid={cid}&did={did}"
        # context manager releases the streamed connection even if writing fails
        with requests.get(file_url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            # drop parameters such as "; charset=..." which guess_extension does not understand
            content_type = response.headers.get("Content-Type", "").split(";")[0].strip()
            ext = mimetypes.guess_extension(content_type) or ""
            filename = os.path.join(ArchivingContext.get_tmp_dir(), f"{cid}{ext}")
            with open(filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        media = Media(filename=filename)
        media.set("src", file_url)
        return media

    def _get_post_data(self, post: dict) -> dict:
        """
        Extracts relevant information returned by the .getPostThread api call (excluding text/created_at): author, mentions, tags, links.

        Returns a dict that always has "author" and has "mentions", "tags" and "links"
        only when the post contains such facets. `post` is left unmodified.
        """
        # shallow copy so trimming keys does not mutate the caller's post
        author = dict(post["author"])
        if not author.get("labels"):
            author.pop("labels", None)
        author.pop("associated", None)

        # facet feature type -> (result key, field holding the value)
        feature_fields = {
            "app.bsky.richtext.facet#mention": ("mentions", "did"),
            "app.bsky.richtext.facet#tag": ("tags", "tag"),
            "app.bsky.richtext.facet#link": ("links", "uri"),
        }
        collected = {"mentions": [], "tags": [], "links": []}
        for facet in post.get("record", {}).get("facets", []):
            for feature in facet.get("features", []):
                target = feature_fields.get(feature.get("$type"))
                if target:
                    key, field = target
                    collected[key].append(feature[field])

        res = {"author": author}
        res.update({key: values for key, values in collected.items() if values})
        return res
from auto_archiver.archivers.bluesky_archiver import BlueskyArchiver
import unittest


class TestBlueskyArchiver(unittest.TestCase):
    """Tests Bluesky Archiver

    Note that these tests will download API responses from the bluesky API, so they may be slow.
    This is an intended feature, as we want to test to ensure the bluesky API format hasn't changed,
    and also test the archiver's ability to download media.
    """

    def setUp(self):
        self.bsky = BlueskyArchiver({})
        return super().setUp()

    def test_download_media_with_images(self):
        post = self.bsky._get_post_from_uri("https://bsky.app/profile/colborne.bsky.social/post/3lec2bqjc5s2y")

        # just make sure bsky haven't changed their format, images should be under "record/embed/media/images"
        # there should be 2 images
        self.assertIn("record", post)
        self.assertIn("embed", post["record"])
        self.assertIn("media", post["record"]["embed"])
        self.assertIn("images", post["record"]["embed"]["media"])
        self.assertEqual(len(post["record"]["embed"]["media"]["images"]), 2)

        # try downloading the media files
        media = self.bsky._download_bsky_embeds(post)
        self.assertEqual(len(media), 2)

        # check the IDs
        self.assertIn("bafkreiflrkfihcvwlhka5tb2opw2qog6gfvywsdzdlibveys2acozh75tq", media[0].get('src'))
        self.assertIn("bafkreibsprmwchf7r6xcstqkdvvuj3ijw7efciw7l3y4crxr4cmynseo7u", media[1].get('src'))

    def test_download_post_with_single_image(self):
        post = self.bsky._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3lcxcpgt6j42l")

        # just make sure bsky haven't changed their format, images should be under "record/embed/images"
        # there should be 1 image
        self.assertIn("record", post)
        self.assertIn("embed", post["record"])
        self.assertIn("images", post["record"]["embed"])
        self.assertEqual(len(post["record"]["embed"]["images"]), 1)

        media = self.bsky._download_bsky_embeds(post)
        self.assertEqual(len(media), 1)

        # check the ID
        self.assertIn("bafkreihljdtomy4yulx4nfxuqdatlgvdg45vxdmjzzhclsd4ludk7zfma4", media[0].get('src'))

    def test_download_post_with_video(self):
        post = self.bsky._get_post_from_uri("https://bsky.app/profile/bellingcat.com/post/3le2l4gsxlk2i")

        # just make sure bsky haven't changed their format, video should be under "record/embed/video"
        self.assertIn("record", post)
        self.assertIn("embed", post["record"])
        self.assertIn("video", post["record"]["embed"])

        media = self.bsky._download_bsky_embeds(post)
        self.assertEqual(len(media), 1)

        # check the ID
        self.assertIn("bafkreiaiskn2nt5cxjnxbgcqqcrnurvkr2ni3unekn6zvhvgr5nrqg6u2q", media[0].get('src'))