Skip to content

Commit

Permalink
Adds Excel mode to remove newlines from some fields.
Browse files Browse the repository at this point in the history
  • Loading branch information
Justin Littman committed Mar 4, 2018
1 parent 4b3d0e3 commit 90b33eb
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
8 changes: 5 additions & 3 deletions twarc/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,10 @@ def main():

# optionally create a csv writer
csv_writer = None
if args.format == "csv" and command not in ["filter", "hydrate", "replies",
if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
"retweets", "sample", "search", "timeline", "tweet"]:
parser.error("csv output not available for %s" % command)
elif args.format == "csv":
elif args.format in ("csv", "csv-excel"):
csv_writer = csv.writer(fh)
csv_writer.writerow(get_headings())

Expand Down Expand Up @@ -243,6 +243,8 @@ def main():
print(json.dumps(thing), file=fh)
elif (args.format == "csv"):
csv_writer.writerow(get_row(thing))
elif (args.format == "csv-excel"):
csv_writer.writerow(get_row(thing, excel=True))
logging.info("archived %s", thing['id_str'])
elif 'woeid' in thing:
# places
Expand Down Expand Up @@ -317,7 +319,7 @@ def get_argparser():
parser.add_argument("--output", action="store", default=None,
dest="output", help="write output to file path")
parser.add_argument("--format", action="store", default="json",
dest="format", choices=["json", "csv"],
dest="format", choices=["json", "csv", "csv-excel"],
help="set output format")
parser.add_argument("--split", action="store", type=int, default=0,
help="used with --output to split into numbered files")
Expand Down
18 changes: 12 additions & 6 deletions twarc/json2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_headings():
]


def get_row(t):
def get_row(t, excel=False):
get = t.get
user = t.get('user').get
return [
Expand All @@ -64,7 +64,7 @@ def get_row(t):
get('created_at'),
date_parse(get('created_at')),
user('screen_name'),
text(t),
text(t) if not excel else tweet_url(t),
tweet_type(t),
coordinates(t),
hashtags(t),
Expand All @@ -85,22 +85,28 @@ def get_row(t):
user('id_str'),
user('created_at'),
user('default_profile_image'),
user('description'),
user('description') if not excel else clean_str(user('description')),
user('favourites_count'),
user('followers_count'),
user('friends_count'),
user('listed_count'),
user('location'),
user('name'),
user('location') if not excel else clean_str(user('location')),
user('name') if not excel else clean_str(user('name')),
user('statuses_count'),
user('time_zone'),
user_urls(t),
user('verified'),
]


def clean_str(string):
    """Collapse a string onto a single line.

    Every newline is replaced by a single space and every carriage
    return is removed. Anything that is not a ``str`` (including
    ``None``) yields ``None``.
    """
    if not isinstance(string, str):
        return None
    # Newlines become spaces first; carriage returns are then dropped, so a
    # "\r\n" pair ends up as exactly one space.
    without_newlines = string.replace('\n', ' ')
    return without_newlines.replace('\r', '')


def text(t):
return (t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']).replace('\n', ' ')
return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']


def coordinates(t):
Expand Down
7 changes: 4 additions & 3 deletions utils/json2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def main():
parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
'the field. Example: -e verified user.verified',
nargs=2, action='append')
parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
args = parser.parse_args()

Expand Down Expand Up @@ -64,7 +65,7 @@ def main():
sheet.writerow(get_headings(extra_headings=extra_headings))
file_count += 1
tweet = json.loads(line)
sheet.writerow(get_row(tweet, extra_fields=extra_fields))
sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))


def numbered_filepath(filepath, num):
Expand All @@ -79,8 +80,8 @@ def get_headings(extra_headings=None):
return fields


def get_row(t, extra_fields=None):
row = json2csv.get_row(t)
def get_row(t, extra_fields=None, excel=False):
row = json2csv.get_row(t, excel=excel)
if extra_fields:
for field in extra_fields:
row.append(extra_field(t, field))
Expand Down

0 comments on commit 90b33eb

Please sign in to comment.