Commit 112832b
Added excel mode.
Justin Littman committed Mar 4, 2018
1 parent dbd3c5a commit 112832b
Showing 5 changed files with 90 additions and 50 deletions.
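Adds a csv-excel output format to twarc/command.py and an --excel/-x flag to utils/json2csv.py. In excel mode, json2csv substitutes the tweet URL for the tweet text and passes user-supplied strings through clean_str; plain csv output now leaves those strings raw. utils/unshrtn.py is ported to Python 3's urllib.parse/urllib.request and refactored around a per-line URL cache, and both unshrtn.py and urls.py now also process extended, quoted, and retweeted tweets.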
8 changes: 5 additions & 3 deletions twarc/command.py
@@ -209,10 +209,10 @@ def main():

     # optionally create a csv writer
     csv_writer = None
-    if args.format == "csv" and command not in ["filter", "hydrate", "replies",
+    if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
             "retweets", "sample", "search", "timeline", "tweet"]:
         parser.error("csv output not available for %s" % command)
-    elif args.format == "csv":
+    elif args.format in ("csv", "csv-excel"):
         csv_writer = csv.writer(fh)
         csv_writer.writerow(get_headings())

@@ -243,6 +243,8 @@ def main():
                 print(json.dumps(thing), file=fh)
             elif (args.format == "csv"):
                 csv_writer.writerow(get_row(thing))
+            elif (args.format == "csv-excel"):
+                csv_writer.writerow(get_row(thing, excel=True))
             logging.info("archived %s", thing['id_str'])
         elif 'woeid' in thing:
             # places
@@ -317,7 +319,7 @@ def get_argparser():
parser.add_argument("--output", action="store", default=None,
dest="output", help="write output to file path")
parser.add_argument("--format", action="store", default="json",
dest="format", choices=["json", "csv"],
dest="format", choices=["json", "csv", "csv-excel"],
help="set output format")
parser.add_argument("--split", action="store", type=int, default=0,
help="used with --output to split into numbered files")
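With the parser change above, csv-excel joins json and csv as a --format choice for the commands that support CSV output. A hypothetical invocation, assuming a configured install where the console script is named twarc (the exact entry-point name depends on how twarc is installed):

    twarc search ferguson --format csv-excel --output ferguson.csv

Each returned tweet is then written through get_row(thing, excel=True), per the dispatch above.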
12 changes: 6 additions & 6 deletions twarc/json2csv.py
@@ -55,7 +55,7 @@ def get_headings():
     ]
 
 
-def get_row(t):
+def get_row(t, excel=False):
     get = t.get
     user = t.get('user').get
     return [
@@ -64,7 +64,7 @@ def get_row(t):
         get('created_at'),
         date_parse(get('created_at')),
         user('screen_name'),
-        text(t),
+        text(t) if not excel else tweet_url(t),
         tweet_type(t),
         coordinates(t),
         hashtags(t),
@@ -85,13 +85,13 @@ def get_row(t):
         user('id_str'),
         user('created_at'),
         user('default_profile_image'),
-        clean_str(user('description')),
+        user('description') if not excel else clean_str(user('description')),
         user('favourites_count'),
         user('followers_count'),
         user('friends_count'),
         user('listed_count'),
-        clean_str(user('location')),
-        clean_str(user('name')),
+        user('location') if not excel else clean_str(user('location')),
+        user('name') if not excel else clean_str(user('name')),
         user('statuses_count'),
         user('time_zone'),
         user_urls(t),
@@ -106,7 +106,7 @@ def clean_str(string):


 def text(t):
-    return clean_str(t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text'])
+    return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']
 
 
 def coordinates(t):
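The division of labor after this change: get_row emits raw strings by default (clean_str no longer runs inside text()), and excel=True opts into the cleaned variants, with tweet_url in place of the text column. A minimal sketch, assuming twarc is importable and tweets.jsonl holds one tweet JSON object per line:

    import json

    from twarc import json2csv

    # read one tweet from a line-oriented JSON file
    with open('tweets.jsonl') as fh:
        tweet = json.loads(next(fh))

    plain_row = json2csv.get_row(tweet)              # text column holds the tweet text, strings unmodified
    excel_row = json2csv.get_row(tweet, excel=True)  # text column holds tweet_url(t); strings pass through clean_str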
7 changes: 4 additions & 3 deletions utils/json2csv.py
@@ -31,6 +31,7 @@ def main():
     parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
                                                     'the field. Example: -e verified user.verified',
                         nargs=2, action='append')
+    parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
     parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
     args = parser.parse_args()

@@ -64,7 +65,7 @@ def main():
                 sheet.writerow(get_headings(extra_headings=extra_headings))
                 file_count += 1
             tweet = json.loads(line)
-            sheet.writerow(get_row(tweet, extra_fields=extra_fields))
+            sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))
 
 
 def numbered_filepath(filepath, num):
@@ -79,8 +80,8 @@ def get_headings(extra_headings=None):
     return fields
 
 
-def get_row(t, extra_fields=None):
-    row = json2csv.get_row(t)
+def get_row(t, extra_fields=None, excel=False):
+    row = json2csv.get_row(t, excel=excel)
     if extra_fields:
         for field in extra_fields:
             row.append(extra_field(t, field))
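On the utility side the same switch is exposed as a flag; assuming line-oriented JSON input, a hypothetical run looks like:

    python utils/json2csv.py --excel tweets.jsonl

with -x as the short form, and any --extra-field columns appended after the excel-aware base row as before.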
81 changes: 49 additions & 32 deletions utils/unshrtn.py
@@ -18,7 +18,8 @@
 import re
 import json
 import time
-import urllib
+import urllib.parse
+import urllib.request
 import logging
 import argparse
 import fileinput
@@ -32,6 +33,46 @@

 logging.basicConfig(filename="unshorten.log", level=logging.INFO)
 
+def unshorten_entities(entities, url_cache):
+    for url_dict in entities['urls']:
+        if not url_dict.get('unshortened_url'):
+            if "expanded_url" in url_dict:
+                url = url_dict["expanded_url"]
+            else:
+                url = url_dict['url']
+
+            if url:
+                if re.match(r'^https?://twitter.com/', url):
+                    # don't hammer on twitter.com urls that we know are not short
+                    url_dict['unshortened_url'] = url
+                else:
+                    if url not in url_cache:
+                        # otherwise we've got work to do
+                        url = url.encode('utf8')
+                        u = '{}/?{}'.format(unshrtn_url, urllib.parse.urlencode({'url': url}))
+
+                        resp = None
+                        for retry in range(1, retries + 1):
+                            try:
+                                resp = json.loads(urllib.request.urlopen(u).read())
+                                break
+                            except Exception as e:
+                                logging.error("http error: %s when looking up %s. Try %s of %s", e, url, retry, retries)
+                                time.sleep(wait)
+
+                        # finally assign the long url, giving preference to a
+                        # canonical url if one was found
+                        if resp and resp.get('long'):
+                            url_cache[url] = resp['canonical'] or resp['long']
+                    if url in url_cache:
+                        url_dict['unshortened_url'] = url_cache[url]
+
+
+def unshorten_tweet(tweet, url_cache):
+    unshorten_entities(tweet['entities'], url_cache)
+    if 'extended_tweet' in tweet:
+        unshorten_entities(tweet['extended_tweet']['entities'], url_cache)
+
+
 def rewrite_line(line):
     try:
@@ -41,37 +82,12 @@ def rewrite_line(line):
         logging.error(e)
         return line
 
-    # don't do the same work again
-    if 'unshortened_url' in tweet and tweet['unshortened_url']:
-        return line
-
-    for url_dict in tweet["entities"]["urls"]:
-        if "expanded_url" in url_dict:
-            url = url_dict["expanded_url"]
-        else:
-            url = url_dict['url']
-
-        if url and re.match(r'^https?://twitter.com/', url):
-            # don't hammer on twitter.com urls that we know are not short
-            url_dict['unshortened_url'] = url
-        elif url:
-            # otherwise we've got work to do
-            url = url.encode('utf8')
-            u = '{}/?{}'.format(unshrtn_url, urllib.urlencode({'url': url}))
-
-            resp = None
-            for retry in range(1, retries+1):
-                try:
-                    resp = json.loads(urllib.urlopen(u).read())
-                    break
-                except Exception as e:
-                    logging.error("http error: %s when looking up %s. Try %s of %s", e, url, retry, retries)
-                    time.sleep(wait)
-
-            # finally assign the long url, giving preference to a
-            # canonical url if one was found
-            if resp and 'long' in resp:
-                url_dict['unshortened_url'] = resp['canonical'] or resp['long']
+    url_cache = dict()
+    unshorten_tweet(tweet, url_cache)
+    if 'quoted_status' in tweet:
+        unshorten_tweet(tweet['quoted_status'], url_cache)
+    elif 'retweeted_status' in tweet:
+        unshorten_tweet(tweet['retweeted_status'], url_cache)
 
     return json.dumps(tweet)

@@ -95,5 +111,6 @@ def main():
     for line in pool.imap_unordered(rewrite_line, fileinput.input(files=args.files if len(args.files) > 0 else ('-',))):
         if line != "\n": print(line)
 
+
 if __name__ == "__main__":
     main()
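One consequence of this refactor: rewrite_line now builds a fresh url_cache for each input line and shares it across the tweet and its quoted or retweeted status, so a short URL repeated between them costs only one request to the unshrtn service. A hypothetical illustration (the helper names come from the code above; tweet stands for any decoded tweet dict):

    url_cache = dict()
    unshorten_tweet(tweet, url_cache)  # resolves and caches each short URL it finds
    if 'retweeted_status' in tweet:
        # repeated URLs are served from the cache rather than re-fetched
        unshorten_tweet(tweet['retweeted_status'], url_cache)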
32 changes: 26 additions & 6 deletions utils/urls.py
@@ -8,12 +8,32 @@
 import json
 import fileinput
 
-for line in fileinput.input():
-    tweet = json.loads(line)
-    for url in tweet["entities"]["urls"]:
+def get_urls(tweet):
+    urls = set()
+    for url in (tweet.get('extended_tweet', {}).get('entities') or tweet['entities'])['urls']:
         if 'unshortened_url' in url:
-            print(url['unshortened_url'].encode('utf8'))
+            urls.add(url['unshortened_url'])
         elif url.get('expanded_url'):
-            print(url['expanded_url'].encode('utf8'))
+            urls.add(url['expanded_url'])
         elif url.get('url'):
-            print(url['url'].encode('utf8'))
+            urls.add(url['url'])
+    return urls
+
+for line in fileinput.input():
+    tweet = json.loads(line)
+
+    # urlslist = []
+    # entities = item.get('extended_tweet', {}).get('entities') or item['entities']
+    # for url in entities['urls'][:2]:
+    #     urlslist += [url['url'], url['expanded_url']]
+    #     # Padding the row if URLs do not take up all 4 columns
+    #     row += urlslist + [''] * (4 - len(urlslist))
+
+    urls = get_urls(tweet)
+    if 'quoted_status' in tweet:
+        urls.update(get_urls(tweet['quoted_status']))
+    elif 'retweeted_status' in tweet:
+        urls.update(get_urls(tweet['retweeted_status']))
+
+    for url in urls:
+        print(url)
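Because get_urls returns a set and per-status results are merged with update, each distinct URL now prints once even when a quote or retweet repeats it, preferring unshortened_url over expanded_url over the bare short url. A hypothetical check (the tweet dict is invented for illustration):

    tweet = {
        'entities': {'urls': [{'url': 'https://t.co/abc', 'expanded_url': 'https://example.com/a'}]},
        'quoted_status': {'entities': {'urls': [{'url': 'https://t.co/xyz', 'expanded_url': 'https://example.com/a'}]}},
    }
    urls = get_urls(tweet)
    urls.update(get_urls(tweet['quoted_status']))
    print(urls)  # prints {'https://example.com/a'}: the duplicate expanded URL collapses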
