From 90b33ebbbbfefe8c197dd07b057f3417eb406bb9 Mon Sep 17 00:00:00 2001
From: Justin Littman
Date: Wed, 21 Feb 2018 12:08:53 -0500
Subject: [PATCH 1/2] Adds Excel mode to remove newlines from some fields.

---
Reviewer notes (below the `---` marker, so not part of the commit message):
- Adds a "csv-excel" choice for --format in twarc/command.py and an
  --excel/-x flag in utils/json2csv.py; both end up calling
  json2csv.get_row() with excel=True.
- In excel mode the user description, location and name columns go through
  clean_str(), which replaces "\n" with a space and removes "\r". It
  returns None for any value that is not a `str`.
- text() no longer strips newlines itself, so plain "csv" output now keeps
  embedded newlines in the text column.
- NOTE(review): in this patch the excel-mode text column is tweet_url(t)
  rather than the tweet text; PATCH 2/2 changes it to clean_str(text(t)).

 twarc/command.py  |  8 +++++---
 twarc/json2csv.py | 18 ++++++++++++------
 utils/json2csv.py |  7 ++++---
 3 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/twarc/command.py b/twarc/command.py
index e8a075fe..33c145f8 100644
--- a/twarc/command.py
+++ b/twarc/command.py
@@ -209,10 +209,10 @@ def main():
 
     # optionally create a csv writer
     csv_writer = None
-    if args.format == "csv" and command not in ["filter", "hydrate", "replies",
+    if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
             "retweets", "sample", "search", "timeline", "tweet"]:
         parser.error("csv output not available for %s" % command)
-    elif args.format == "csv":
+    elif args.format in ("csv", "csv-excel"):
         csv_writer = csv.writer(fh)
         csv_writer.writerow(get_headings())
 
@@ -243,6 +243,8 @@ def main():
                 print(json.dumps(thing), file=fh)
             elif (args.format == "csv"):
                 csv_writer.writerow(get_row(thing))
+            elif (args.format == "csv-excel"):
+                csv_writer.writerow(get_row(thing, excel=True))
             logging.info("archived %s", thing['id_str'])
         elif 'woeid' in thing:
             # places
@@ -317,7 +319,7 @@ def get_argparser():
     parser.add_argument("--output", action="store", default=None,
                         dest="output", help="write output to file path")
     parser.add_argument("--format", action="store", default="json",
-                        dest="format", choices=["json", "csv"],
+                        dest="format", choices=["json", "csv", "csv-excel"],
                         help="set output format")
     parser.add_argument("--split", action="store", type=int, default=0,
                         help="used with --output to split into numbered files")
diff --git a/twarc/json2csv.py b/twarc/json2csv.py
index 2b8bb35d..db997229 100755
--- a/twarc/json2csv.py
+++ b/twarc/json2csv.py
@@ -55,7 +55,7 @@ def get_headings():
     ]
 
 
-def get_row(t):
+def get_row(t, excel=False):
     get = t.get
     user = t.get('user').get
     return [
@@ -64,7 +64,7 @@ def get_row(t):
         get('created_at'),
         date_parse(get('created_at')),
         user('screen_name'),
-        text(t),
+        text(t) if not excel else tweet_url(t),
         tweet_type(t),
         coordinates(t),
         hashtags(t),
@@ -85,13 +85,13 @@ def get_row(t):
         user('id_str'),
         user('created_at'),
         user('default_profile_image'),
-        user('description'),
+        user('description') if not excel else clean_str(user('description')),
         user('favourites_count'),
         user('followers_count'),
         user('friends_count'),
         user('listed_count'),
-        user('location'),
-        user('name'),
+        user('location') if not excel else clean_str(user('location')),
+        user('name') if not excel else clean_str(user('name')),
         user('statuses_count'),
         user('time_zone'),
         user_urls(t),
@@ -99,8 +99,14 @@ def get_row(t):
     ]
 
 
+def clean_str(string):
+    if isinstance(string, str):
+        return string.replace('\n', ' ').replace('\r', '')
+    return None
+
+
 def text(t):
-    return (t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']).replace('\n', ' ')
+    return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']
 
 
 def coordinates(t):
diff --git a/utils/json2csv.py b/utils/json2csv.py
index 767ac651..434764f7 100755
--- a/utils/json2csv.py
+++ b/utils/json2csv.py
@@ -31,6 +31,7 @@ def main():
     parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
                                                     'the field. Example: -e verified user.verified',
                         nargs=2, action='append')
+    parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
     parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
     args = parser.parse_args()
 
@@ -64,7 +65,7 @@ def main():
             sheet.writerow(get_headings(extra_headings=extra_headings))
             file_count += 1
         tweet = json.loads(line)
-        sheet.writerow(get_row(tweet, extra_fields=extra_fields))
+        sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))
 
 
 def numbered_filepath(filepath, num):
@@ -79,8 +80,8 @@ def get_headings(extra_headings=None):
     return fields
 
 
-def get_row(t, extra_fields=None):
-    row = json2csv.get_row(t)
+def get_row(t, extra_fields=None, excel=False):
+    row = json2csv.get_row(t, excel=excel)
     if extra_fields:
         for field in extra_fields:
             row.append(extra_field(t, field))

From 5cd51d883dc17ab9cec568e7a4339c694da3cf97 Mon Sep 17 00:00:00 2001
From: Justin Littman
Date: Sun, 4 Mar 2018 09:19:26 -0500
Subject: [PATCH 2/2] Fix to text field.

---
Reviewer notes (below the `---` marker, so not part of the commit message):
- In excel mode the text column is now clean_str(text(t)) instead of
  tweet_url(t), so the column holds the tweet text with newlines removed.

 twarc/json2csv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/twarc/json2csv.py b/twarc/json2csv.py
index db997229..9f912741 100755
--- a/twarc/json2csv.py
+++ b/twarc/json2csv.py
@@ -64,7 +64,7 @@ def get_row(t, excel=False):
         get('created_at'),
         date_parse(get('created_at')),
         user('screen_name'),
-        text(t) if not excel else tweet_url(t),
+        text(t) if not excel else clean_str(text(t)),
         tweet_type(t),
         coordinates(t),
         hashtags(t),