Skip to content

Commit

Permalink
[twitter] metadata cleanup #2
Browse files Browse the repository at this point in the history
- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'
  • Loading branch information
mikf committed Jun 7, 2020
1 parent 1fcf938 commit 5bc1097
Showing 1 changed file with 98 additions and 33 deletions.
131 changes: 98 additions & 33 deletions gallery_dl/extractor/twitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
class TwitterExtractor(Extractor):
"""Base class for twitter extractors"""
category = "twitter"
directory_fmt = ("{category}", "{user[screen_name]}")
filename_fmt = "{id_str}_{num}.{extension}"
archive_fmt = "{id_str}_{num}"
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
Expand All @@ -32,6 +32,7 @@ def __init__(self, match):
self.replies = self.config("replies", True)
self.twitpic = self.config("twitpic", False)
self.videos = self.config("videos", True)
self._user_cache = {}

def items(self):
self.login()
Expand All @@ -49,51 +50,48 @@ def items(self):
if "extended_entities" not in tweet:
continue

tweet.update(metadata)
tweet["date"] = text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
entities = tweet["extended_entities"]
del tweet["extended_entities"]
del tweet["entities"]
tdata = self._transform_tweet(tweet)
tdata.update(metadata)

yield Message.Directory, tweet
for tweet["num"], media in enumerate(entities["media"], 1):
yield Message.Directory, tdata
for tdata["num"], media in enumerate(
tweet["extended_entities"]["media"], 1):

tweet["width"] = media["original_info"].get("width", 0)
tweet["height"] = media["original_info"].get("height", 0)
tdata["width"] = media["original_info"].get("width", 0)
tdata["height"] = media["original_info"].get("height", 0)

if "video_info" in media and self.videos:

if self.videos == "ytdl":
url = "ytdl:{}/i/web/status/{}".format(
self.root, tweet["id_str"])
tweet["extension"] = None
yield Message.Url, url, tweet
tdata["extension"] = None
yield Message.Url, url, tdata

else:
video_info = media["video_info"]
variant = max(
video_info["variants"],
key=lambda v: v.get("bitrate", 0),
)
tweet["duration"] = video_info.get(
tdata["duration"] = video_info.get(
"duration_millis", 0) / 1000
tweet["bitrate"] = variant.get("bitrate", 0)
tdata["bitrate"] = variant.get("bitrate", 0)

url = variant["url"]
text.nameext_from_url(url, tweet)
yield Message.Url, url, tweet
text.nameext_from_url(url, tdata)
yield Message.Url, url, tdata

elif "media_url_https" in media:
url = media["media_url_https"]
urls = [url + size for size in self.sizes]
text.nameext_from_url(url, tweet)
yield Message.Urllist, urls, tweet
text.nameext_from_url(url, tdata)
yield Message.Urllist, urls, tdata

else:
url = media["media_url"]
text.nameext_from_url(url, tweet)
yield Message.Url, url, tweet
text.nameext_from_url(url, tdata)
yield Message.Url, url, tdata

def _extract_twitpic(self, tweet):
twitpics = []
Expand All @@ -115,6 +113,73 @@ def _extract_twitpic(self, tweet):
else:
tweet["extended_entities"] = {"media": twitpics}

def _transform_tweet(self, tweet):
entities = tweet["entities"]
tdata = {
"tweet_id" : text.parse_int(tweet["id_str"]),
"retweet_id" : text.parse_int(
tweet.get("retweeted_status_id_str")),
"quote_id" : text.parse_int(
tweet.get("quoted_status_id_str")),
"reply_id" : text.parse_int(
tweet.get("in_reply_to_status_id_str")),
"date" : text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"user" : self._transform_user(tweet["user"]),
"lang" : tweet["lang"],
"content" : tweet["full_text"],
"favorite_count": tweet["favorite_count"],
"quote_count" : tweet["quote_count"],
"reply_count" : tweet["reply_count"],
"retweet_count" : tweet["retweet_count"],
}

hashtags = entities.get("hashtags")
if hashtags:
tdata["hashtags"] = [t["text"] for t in hashtags]

mentions = entities.get("user_mentions")
if mentions:
tdata["mentions"] = [{
"id": text.parse_int(u["id_str"]),
"name": u["screen_name"],
"nick": u["name"],
} for u in mentions]

if "full_text_quoted" in tweet:
tdata["content_quoted"] = tweet["full_text_quoted"]

if "author" in tweet:
tdata["author"] = self._transform_user(tweet["author"])

return tdata

def _transform_user(self, user):
uid = user["id_str"]
cache = self._user_cache

if uid not in cache:
cache[uid] = {
"id" : text.parse_int(uid),
"name" : user["screen_name"],
"nick" : user["name"],
"description" : user["description"],
"location" : user["location"],
"date" : text.parse_datetime(
user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
"verified" : user.get("verified", False),
"profile_banner" : user.get("profile_banner_url", ""),
"profile_image" : user.get(
"profile_image_url_https", "").replace("_normal.", "."),
"favourites_count": user["favourites_count"],
"followers_count" : user["followers_count"],
"friends_count" : user["friends_count"],
"listed_count" : user["listed_count"],
"media_count" : user["media_count"],
"statuses_count" : user["statuses_count"],
}
return cache[uid]

def metadata(self):
"""Return general metadata"""
return {}
Expand Down Expand Up @@ -235,7 +300,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/playpokemon/status/1263832915173048321", {
"keyword": {"full_text": (
"keyword": {"content": (
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
"Gifts! \n\nYou’ll be able to receive four Galarian form "
"Pokémon with Hidden Abilities, plus some very useful items. "
Expand Down Expand Up @@ -418,16 +483,16 @@ def _pagination(self, endpoint, params=None,
tweet["user"] = users[tweet["user_id_str"]]

if "quoted_status_id_str" in tweet:
quoted = tweets[tweet["quoted_status_id_str"]]
tweet["author"] = tweet["user"]
if "extended_entities" in quoted:
tweet["extended_entities"] = \
quoted["extended_entities"]
quoted = tweets.get(tweet["quoted_status_id_str"])
if quoted:
tweet["full_text_quoted"] = quoted["full_text"]
if "extended_entities" in quoted:
tweet["extended_entities"] = \
quoted["extended_entities"]
elif "retweeted_status_id_str" in tweet:
retweet = tweets[tweet["retweeted_status_id_str"]]
tweet["author"] = users[retweet["user_id_str"]]
else:
tweet["author"] = tweet["user"]
retweet = tweets.get(tweet["retweeted_status_id_str"])
if retweet:
tweet["author"] = users[retweet["user_id_str"]]

yield tweet

Expand Down

0 comments on commit 5bc1097

Please sign in to comment.