From 2cff2c294112dbd1a06ee03fd620f67bbbfb1b8a Mon Sep 17 00:00:00 2001
From: Ed Summers
Date: Tue, 26 May 2020 14:15:55 -0400
Subject: [PATCH] Retweet text

The json2csv.text function was not extracting tweet text from the
retweeted_status object. If the text is more than 140 characters we need
to look in there for the complete text.

Fixes #329
---
Revised in review: for retweets, text() now applies the same
extended_tweet / full_text / text fallback to the original tweet instead
of requiring a full_text key there (which raised KeyError when absent).
The blob ids on the index lines predate this revision.

 test_twarc.py     | 14 ++++++++++++++
 twarc/json2csv.py | 18 +++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/test_twarc.py b/test_twarc.py
index dee78170..c271ec28 100644
--- a/test_twarc.py
+++ b/test_twarc.py
@@ -14,6 +14,7 @@ import requests
 
 
 import twarc
+from twarc import json2csv
 
 """
 
@@ -616,6 +617,19 @@ def test_extended_compat():
 
     assert 'full_text' in next(T.timeline(screen_name="BarackObama"))
     assert 'text' in next(t_compat.timeline(screen_name="BarackObama"))
+def test_csv_retweet():
+    for tweet in T.search('obama'):
+        if 'retweeted_status' in tweet:
+            break
+    text = json2csv.text(tweet)
+    assert not text.startswith('RT @')
+
+def test_truncated_text():
+    for tweet in T.filter('tweet'):
+        if tweet['truncated']:
+            break
+    assert tweet['text'] != tweet['extended_tweet']['full_text']
+    assert json2csv.text(tweet) == tweet['extended_tweet']['full_text']
 
 def test_invalid_credentials():
     old_consumer_key = T.consumer_key
diff --git a/twarc/json2csv.py b/twarc/json2csv.py
index c62f9757..ea118dd7 100755
--- a/twarc/json2csv.py
+++ b/twarc/json2csv.py
@@ -107,7 +107,23 @@ def clean_str(string):
 
 
 def text(t):
-    return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']
+    """Return the complete text of tweet t.
+
+    Retweets have the full text in the original tweet, so when
+    retweeted_status is present the text is read from it instead of
+    from the retweet itself.
+    """
+    if t.get('retweeted_status'):
+        t = t['retweeted_status']
+
+    # the original tweet may use any of the three text fields, so don't
+    # assume full_text is there
+    if 'extended_tweet' in t:
+        return t['extended_tweet']['full_text']
+    elif 'full_text' in t:
+        return t['full_text']
+    else:
+        return t['text']
 
 
 def coordinates(t):