Adds Excel mode to remove newlines from some fields.
Justin Littman committed Mar 4, 2018
1 parent 4b3d0e3 commit 367f7c5
Showing 5 changed files with 96 additions and 50 deletions.
8 changes: 5 additions & 3 deletions twarc/command.py
@@ -209,10 +209,10 @@ def main():

     # optionally create a csv writer
     csv_writer = None
-    if args.format == "csv" and command not in ["filter", "hydrate", "replies",
+    if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
             "retweets", "sample", "search", "timeline", "tweet"]:
         parser.error("csv output not available for %s" % command)
-    elif args.format == "csv":
+    elif args.format in ("csv", "csv-excel"):
         csv_writer = csv.writer(fh)
         csv_writer.writerow(get_headings())

@@ -243,6 +243,8 @@ def main():
             print(json.dumps(thing), file=fh)
         elif (args.format == "csv"):
             csv_writer.writerow(get_row(thing))
+        elif (args.format == "csv-excel"):
+            csv_writer.writerow(get_row(thing, excel=True))
         logging.info("archived %s", thing['id_str'])
     elif 'woeid' in thing:
         # places
@@ -317,7 +319,7 @@ def get_argparser():
     parser.add_argument("--output", action="store", default=None,
                         dest="output", help="write output to file path")
     parser.add_argument("--format", action="store", default="json",
-                        dest="format", choices=["json", "csv"],
+                        dest="format", choices=["json", "csv", "csv-excel"],
                         help="set output format")
     parser.add_argument("--split", action="store", type=int, default=0,
                         help="used with --output to split into numbered files")
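With this change, the new format can be selected on the command line; a hypothetical invocation (the query and output path are illustrative, not from the commit) would be:

    twarc search ferguson --format csv-excel --output tweets.csv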
18 changes: 12 additions & 6 deletions twarc/json2csv.py
@@ -55,7 +55,7 @@ def get_headings():
     ]


-def get_row(t):
+def get_row(t, excel=False):
     get = t.get
     user = t.get('user').get
     return [
@@ -64,7 +64,7 @@ def get_row(t):
         get('created_at'),
         date_parse(get('created_at')),
         user('screen_name'),
-        text(t),
+        text(t) if not excel else clean_str(text(t)),
         tweet_type(t),
         coordinates(t),
         hashtags(t),
@@ -85,22 +85,28 @@ def get_row(t):
         user('id_str'),
         user('created_at'),
         user('default_profile_image'),
-        user('description'),
+        user('description') if not excel else clean_str(user('description')),
         user('favourites_count'),
         user('followers_count'),
         user('friends_count'),
         user('listed_count'),
-        user('location'),
-        user('name'),
+        user('location') if not excel else clean_str(user('location')),
+        user('name') if not excel else clean_str(user('name')),
         user('statuses_count'),
         user('time_zone'),
         user_urls(t),
         user('verified'),
     ]


+def clean_str(string):
+    if isinstance(string, str):
+        return string.replace('\n', ' ').replace('\r', '')
+    return None
+
+
 def text(t):
-    return (t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']).replace('\n', ' ')
+    return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']


 def coordinates(t):
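A quick illustrative check of the new clean_str helper (a sketch, not part of the commit):

    >>> from twarc.json2csv import clean_str
    >>> clean_str('a\nb\r\nc')      # newlines become spaces, carriage returns are dropped
    'a b c'
    >>> clean_str(None) is None     # non-string input returns None
    True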
7 changes: 4 additions & 3 deletions utils/json2csv.py
@@ -31,6 +31,7 @@ def main():
     parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
                                                     'the field. Example: -e verified user.verified',
                         nargs=2, action='append')
+    parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
     parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
     args = parser.parse_args()

@@ -64,7 +65,7 @@ def main():
             sheet.writerow(get_headings(extra_headings=extra_headings))
             file_count += 1
         tweet = json.loads(line)
-        sheet.writerow(get_row(tweet, extra_fields=extra_fields))
+        sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))


 def numbered_filepath(filepath, num):
@@ -79,8 +80,8 @@ def get_headings(extra_headings=None):
     return fields


-def get_row(t, extra_fields=None):
-    row = json2csv.get_row(t)
+def get_row(t, extra_fields=None, excel=False):
+    row = json2csv.get_row(t, excel=excel)
     if extra_fields:
         for field in extra_fields:
             row.append(extra_field(t, field))
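With the new flag, a hypothetical run of the updated utility (the input file name is illustrative; output goes wherever the script's existing options direct it):

    python utils/json2csv.py --excel tweets.jsonl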
81 changes: 49 additions & 32 deletions utils/unshrtn.py
@@ -18,7 +18,8 @@
 import re
 import json
 import time
-import urllib
+import urllib.parse
+import urllib.request
 import logging
 import argparse
 import fileinput
@@ -32,6 +33,46 @@

 logging.basicConfig(filename="unshorten.log", level=logging.INFO)

+def unshorten_entities(entities, url_cache):
+    for url_dict in entities['urls']:
+        if not url_dict.get('unshortened_url'):
+            if "expanded_url" in url_dict:
+                url = url_dict["expanded_url"]
+            else:
+                url = url_dict['url']
+
+            if url:
+                if re.match(r'^https?://twitter.com/', url):
+                    # don't hammer on twitter.com urls that we know are not short
+                    url_dict['unshortened_url'] = url
+                else:
+                    if url not in url_cache:
+                        # otherwise we've got work to do
+                        url = url.encode('utf8')
+                        u = '{}/?{}'.format(unshrtn_url, urllib.parse.urlencode({'url': url}))
+
+                        resp = None
+                        for retry in range(1, retries + 1):
+                            try:
+                                resp = json.loads(urllib.request.urlopen(u).read())
+                                break
+                            except Exception as e:
+                                logging.error("http error: %s when looking up %s. Try %s of %s", e, url, retry, retries)
+                                time.sleep(wait)
+
+                        # finally assign the long url, giving preference to a
+                        # canonical url if one was found
+                        if resp and resp.get('long'):
+                            url_cache[url] = resp['canonical'] or resp['long']
+                    if url in url_cache:
+                        url_dict['unshortened_url'] = url_cache[url]
+
+
+def unshorten_tweet(tweet, url_cache):
+    unshorten_entities(tweet['entities'], url_cache)
+    if 'extended_tweet' in tweet:
+        unshorten_entities(tweet['extended_tweet']['entities'], url_cache)
+
+
 def rewrite_line(line):
@@ -41,37 +82,12 @@ def rewrite_line(line):
     try:
         tweet = json.loads(line)
     except Exception as e:
         logging.error(e)
         return line

-    # don't do the same work again
-    if 'unshortened_url' in tweet and tweet['unshortened_url']:
-        return line
-
-    for url_dict in tweet["entities"]["urls"]:
-        if "expanded_url" in url_dict:
-            url = url_dict["expanded_url"]
-        else:
-            url = url_dict['url']
-
-        if url and re.match(r'^https?://twitter.com/', url):
-            # don't hammer on twitter.com urls that we know are not short
-            url_dict['unshortened_url'] = url
-        elif url:
-            # otherwise we've got work to do
-            url = url.encode('utf8')
-            u = '{}/?{}'.format(unshrtn_url, urllib.urlencode({'url': url}))
-
-            resp = None
-            for retry in range(1, retries + 1):
-                try:
-                    resp = json.loads(urllib.urlopen(u).read())
-                    break
-                except Exception as e:
-                    logging.error("http error: %s when looking up %s. Try %s of %s", e, url, retry, retries)
-                    time.sleep(wait)
-
-            # finally assign the long url, giving preference to a
-            # canonical url if one was found
-            if resp and 'long' in resp:
-                url_dict['unshortened_url'] = resp['canonical'] or resp['long']
+    url_cache = dict()
+    unshorten_tweet(tweet, url_cache)
+    if 'quoted_status' in tweet:
+        unshorten_tweet(tweet['quoted_status'], url_cache)
+    elif 'retweeted_status' in tweet:
+        unshorten_tweet(tweet['retweeted_status'], url_cache)

     return json.dumps(tweet)

@@ -95,5 +111,6 @@ def main():
     for line in pool.imap_unordered(rewrite_line, fileinput.input(files=args.files if len(args.files) > 0 else ('-',))):
         if line != "\n": print(line)

+
 if __name__ == "__main__":
     main()
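For reference, the lookup URL that unshorten_entities builds with urllib.parse.urlencode looks like this; a minimal sketch assuming a local unshrtn service (the address is an assumption, not from the commit):

    >>> import urllib.parse
    >>> unshrtn_url = 'http://localhost:3000'  # assumed local unshrtn instance
    >>> '{}/?{}'.format(unshrtn_url, urllib.parse.urlencode({'url': 'https://t.co/abc'}))
    'http://localhost:3000/?url=https%3A%2F%2Ft.co%2Fabc'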
32 changes: 26 additions & 6 deletions utils/urls.py
@@ -8,12 +8,32 @@
 import json
 import fileinput

-for line in fileinput.input():
-    tweet = json.loads(line)
-    for url in tweet["entities"]["urls"]:
+def get_urls(tweet):
+    urls = set()
+    for url in (tweet.get('extended_tweet', {}).get('entities') or tweet['entities'])['urls']:
         if 'unshortened_url' in url:
-            print(url['unshortened_url'].encode('utf8'))
+            urls.add(url['unshortened_url'])
         elif url.get('expanded_url'):
-            print(url['expanded_url'].encode('utf8'))
+            urls.add(url['expanded_url'])
         elif url.get('url'):
-            print(url['url'].encode('utf8'))
+            urls.add(url['url'])
+    return urls
+
+for line in fileinput.input():
+    tweet = json.loads(line)
+
+    # urlslist = []
+    # entities = item.get('extended_tweet', {}).get('entities') or item['entities']
+    # for url in entities['urls'][:2]:
+    #     urlslist += [url['url'], url['expanded_url']]
+    # # Padding the row if URLs do not take up all 4 columns
+    # row += urlslist + [''] * (4 - len(urlslist))
+
+    urls = get_urls(tweet)
+    if 'quoted_status' in tweet:
+        urls.update(get_urls(tweet['quoted_status']))
+    elif 'retweeted_status' in tweet:
+        urls.update(get_urls(tweet['retweeted_status']))
+
+    for url in urls:
+        print(url)

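To see the new get_urls preference order (unshortened_url over expanded_url over url), here is a stub tweet dict (illustrative, not real API output):

    >>> tweet = {'entities': {'urls': [{'url': 'https://t.co/x',
    ...                                 'expanded_url': 'https://example.com/article'}]}}
    >>> get_urls(tweet)
    {'https://example.com/article'}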