Skip to content

Commit

Permalink
Merge pull request #208 from justinlittman/clean_str
Browse files Browse the repository at this point in the history
Handle multiline user descriptions.
  • Loading branch information
edsu authored Mar 4, 2018
2 parents bcc820e + 5cd51d8 commit 4df784c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 12 deletions.
8 changes: 5 additions & 3 deletions twarc/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,10 +213,10 @@ def main():

# optionally create a csv writer
csv_writer = None
if args.format == "csv" and command not in ["filter", "hydrate", "replies",
if args.format in ("csv", "csv-excel") and command not in ["filter", "hydrate", "replies",
"retweets", "sample", "search", "timeline", "tweet"]:
parser.error("csv output not available for %s" % command)
elif args.format == "csv":
elif args.format in ("csv", "csv-excel"):
csv_writer = csv.writer(fh)
csv_writer.writerow(get_headings())

Expand Down Expand Up @@ -247,6 +247,8 @@ def main():
print(json.dumps(thing), file=fh)
elif (args.format == "csv"):
csv_writer.writerow(get_row(thing))
elif (args.format == "csv-excel"):
csv_writer.writerow(get_row(thing, excel=True))
logging.info("archived %s", thing['id_str'])
elif 'woeid' in thing:
# places
Expand Down Expand Up @@ -321,7 +323,7 @@ def get_argparser():
parser.add_argument("--output", action="store", default=None,
dest="output", help="write output to file path")
parser.add_argument("--format", action="store", default="json",
dest="format", choices=["json", "csv"],
dest="format", choices=["json", "csv", "csv-excel"],
help="set output format")
parser.add_argument("--split", action="store", type=int, default=0,
help="used with --output to split into numbered files")
Expand Down
18 changes: 12 additions & 6 deletions twarc/json2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_headings():
]


def get_row(t):
def get_row(t, excel=False):
get = t.get
user = t.get('user').get
return [
Expand All @@ -64,7 +64,7 @@ def get_row(t):
get('created_at'),
date_parse(get('created_at')),
user('screen_name'),
text(t),
text(t) if not excel else clean_str(text(t)),
tweet_type(t),
coordinates(t),
hashtags(t),
Expand All @@ -85,22 +85,28 @@ def get_row(t):
user('id_str'),
user('created_at'),
user('default_profile_image'),
user('description'),
user('description') if not excel else clean_str(user('description')),
user('favourites_count'),
user('followers_count'),
user('friends_count'),
user('listed_count'),
user('location'),
user('name'),
user('location') if not excel else clean_str(user('location')),
user('name') if not excel else clean_str(user('name')),
user('statuses_count'),
user('time_zone'),
user_urls(t),
user('verified'),
]


def clean_str(string):
    """Flatten a string onto a single line for spreadsheet-friendly CSV output.

    Every line break -- Windows ``\\r\\n``, Unix ``\\n`` or a lone ``\\r`` --
    is replaced by exactly one space so that the words on either side of the
    break stay separated.

    :param string: the value to clean; expected to be a ``str`` or ``None``.
    :return: the single-line string, or ``None`` when ``string`` is not a
        ``str`` (for example a missing field that came through as ``None``).
    """
    if not isinstance(string, str):
        return None
    # Collapse CRLF first so a Windows line ending yields one space, not two.
    # A lone '\r' becomes a space too; simply deleting it would glue the
    # neighbouring words together ("a\rb" -> "ab").
    return string.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')


def text(t):
return (t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']).replace('\n', ' ')
return t.get('full_text') or t.get('extended_tweet', {}).get('full_text') or t['text']


def coordinates(t):
Expand Down
7 changes: 4 additions & 3 deletions utils/json2csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def main():
parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
'the field. Example: -e verified user.verified',
nargs=2, action='append')
parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
args = parser.parse_args()

Expand Down Expand Up @@ -64,7 +65,7 @@ def main():
sheet.writerow(get_headings(extra_headings=extra_headings))
file_count += 1
tweet = json.loads(line)
sheet.writerow(get_row(tweet, extra_fields=extra_fields))
sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))


def numbered_filepath(filepath, num):
Expand All @@ -79,8 +80,8 @@ def get_headings(extra_headings=None):
return fields


def get_row(t, extra_fields=None):
row = json2csv.get_row(t)
def get_row(t, extra_fields=None, excel=False):
row = json2csv.get_row(t, excel=excel)
if extra_fields:
for field in extra_fields:
row.append(extra_field(t, field))
Expand Down

0 comments on commit 4df784c

Please sign in to comment.