From 95e5106addf05564a7b3f67ec4d4e5a12e9e7fe8 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 20 Jun 2021 00:02:20 +0100 Subject: [PATCH 01/43] add tqdm --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 2abcd235..832808fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ python-dateutil requests_oauthlib +tqdm click click-plugins click-config-file From a33408a6a4ad278314570a9fe125d6df250ed729 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 20 Jun 2021 00:03:24 +0100 Subject: [PATCH 02/43] test tqdm --- twarc/command2.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/twarc/command2.py b/twarc/command2.py index fdb58c50..8c84decc 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -14,6 +14,7 @@ import configobj import threading +from tqdm import tqdm from click_plugins import with_plugins from pkg_resources import iter_entry_points @@ -189,9 +190,11 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, max_results = 100 search_method = T.search_recent + pbar = _id_progress_bar(since_id, until_id, start_time, end_time) for result in search_method(query, since_id, until_id, start_time, end_time, max_results): _write(result, outfile) + pbar.update(int(result["meta"]["newest_id"]) - pbar.n) count += len(result['data']) if limit != 0 and count >= limit: break @@ -659,6 +662,17 @@ def delete_all(T): click.echo(f"šŸ—‘ Deleted {len(rule_ids)} rules.") +def _id_progress_bar(): + """ + Snowflake ID based progress bar. 
+ """ + return tqdm( + total=1, + ) + +def _date_to_snowflake(date): + return 1 + def _rule_str(rule): s = f"id={rule['id']} value={rule['value']}" if 'tag' in rule: From 00797ef0856e31cda6d685eeaa6da47d8bf36bdd Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sun, 20 Jun 2021 02:46:39 +0100 Subject: [PATCH 03/43] tqdm test --- twarc/command2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/command2.py b/twarc/command2.py index 8c84decc..800d7110 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -662,7 +662,7 @@ def delete_all(T): click.echo(f"šŸ—‘ Deleted {len(rule_ids)} rules.") -def _id_progress_bar(): +def _id_progress_bar(since_id, until_id, start_time, end_time): """ Snowflake ID based progress bar. """ From 423ab5aa810f3e111a860d9ed00c433cde218df1 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 21 Jun 2021 13:21:15 +0100 Subject: [PATCH 04/43] stash changes --- twarc/command2.py | 578 ++++++++++++++++++++++++++------------------ twarc/decorators.py | 168 +++++++++++-- 2 files changed, 491 insertions(+), 255 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 800d7110..26d763b9 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -14,45 +14,82 @@ import configobj import threading -from tqdm import tqdm +from tqdm.auto import tqdm from click_plugins import with_plugins from pkg_resources import iter_entry_points from twarc.version import version from twarc.handshake import handshake -from twarc.config import ConfigProvider -from twarc.decorators import cli_api_error +from twarc.config import ConfigProvider +from twarc.decorators import cli_api_error, prbar from twarc.expansions import ensure_flattened from click_config_file import configuration_option config_provider = ConfigProvider() -@with_plugins(iter_entry_points('twarc.plugins')) + +@with_plugins(iter_entry_points("twarc.plugins")) @click.group() -@click.option('--consumer-key', type=str, envvar='CONSUMER_KEY', - help='Twitter app 
consumer key (aka "App Key")') -@click.option('--consumer-secret', type=str, envvar='CONSUMER_SECRET', - help='Twitter app consumer secret (aka "App Secret")') -@click.option('--access-token', type=str, envvar='ACCESS_TOKEN', - help='Twitter app access token for user authentication.') -@click.option('--access-token-secret', type=str, envvar='ACCESS_TOKEN_SECRET', - help='Twitter app access token secret for user authentication.') -@click.option('--bearer-token', type=str, envvar='BEARER_TOKEN', - help='Twitter app access bearer token.') -@click.option('--app-auth/--user-auth', default=True, +@click.option( + "--consumer-key", + type=str, + envvar="CONSUMER_KEY", + help='Twitter app consumer key (aka "App Key")', +) +@click.option( + "--consumer-secret", + type=str, + envvar="CONSUMER_SECRET", + help='Twitter app consumer secret (aka "App Secret")', +) +@click.option( + "--access-token", + type=str, + envvar="ACCESS_TOKEN", + help="Twitter app access token for user authentication.", +) +@click.option( + "--access-token-secret", + type=str, + envvar="ACCESS_TOKEN_SECRET", + help="Twitter app access token secret for user authentication.", +) +@click.option( + "--bearer-token", + type=str, + envvar="BEARER_TOKEN", + help="Twitter app access bearer token.", +) +@click.option( + "--app-auth/--user-auth", + default=True, help="Use application authentication or user authentication. 
Some rate limits are " "higher with user authentication, but not all endpoints are supported.", show_default=True, ) -@click.option('--log', default='twarc.log') -@click.option('--verbose', is_flag=True, default=False) -@click.option('--metadata/--no-metadata', default=True, show_default=True, - help="Include/don't include metadata about when and how data was collected.") -@configuration_option(cmd_name='twarc', config_file_name='config', provider=config_provider) +@click.option("--log", default="twarc.log") +@click.option("--verbose", is_flag=True, default=False) +@click.option( + "--metadata/--no-metadata", + default=True, + show_default=True, + help="Include/don't include metadata about when and how data was collected.", +) +@configuration_option( + cmd_name="twarc", config_file_name="config", provider=config_provider +) @click.pass_context def twarc2( - ctx, consumer_key, consumer_secret, access_token, access_token_secret, bearer_token, - log, metadata, app_auth, verbose + ctx, + consumer_key, + consumer_secret, + access_token, + access_token_secret, + bearer_token, + log, + metadata, + app_auth, + verbose, ): """ Collect data from the Twitter V2 API. @@ -60,7 +97,7 @@ def twarc2( logging.basicConfig( filename=log, level=logging.DEBUG if verbose else logging.INFO, - format="%(asctime)s %(levelname)s %(message)s" + format="%(asctime)s %(levelname)s %(message)s", ) logging.info("using config %s", config_provider.file_path) @@ -68,26 +105,31 @@ def twarc2( if bearer_token or (consumer_key and consumer_secret): if app_auth and (bearer_token or (consumer_key and consumer_secret)): ctx.obj = twarc.Twarc2( - consumer_key=consumer_key, consumer_secret=consumer_secret, - bearer_token=bearer_token, metadata=metadata + consumer_key=consumer_key, + consumer_secret=consumer_secret, + bearer_token=bearer_token, + metadata=metadata, ) # Check everything is present for user auth. 
- elif (consumer_key and consumer_secret and access_token and access_token_secret): + elif consumer_key and consumer_secret and access_token and access_token_secret: ctx.obj = twarc.Twarc2( - consumer_key=consumer_key, consumer_secret=consumer_secret, - access_token=access_token, access_token_secret=access_token_secret, - metadata=metadata + consumer_key=consumer_key, + consumer_secret=consumer_secret, + access_token=access_token, + access_token_secret=access_token_secret, + metadata=metadata, ) else: click.echo( click.style( - 'šŸ™ƒ To use user authentication, you need all of the following:\n' - '- consumer_key\n', - '- consumer_secret\n', - '- access_token\n', - '- access_token_secret\n', - fg='red'), - err=True + "šŸ™ƒ To use user authentication, you need all of the following:\n" + "- consumer_key\n", + "- consumer_secret\n", + "- access_token\n", + "- access_token_secret\n", + fg="red", + ), + err=True, ) click.echo("You can configure twarc2 using the `twarc2 configure` command.") else: @@ -103,7 +145,7 @@ def twarc2( ctx.invoke(configure) -@twarc2.command('configure') +@twarc2.command("configure") @click.pass_context def configure(ctx): """ @@ -111,13 +153,13 @@ def configure(ctx): """ config_file = config_provider.file_path - logging.info('creating config file: %s', config_file) + logging.info("creating config file: %s", config_file) config_dir = pathlib.Path(config_file).parent if not config_dir.is_dir(): - logging.info('creating config directory: %s', config_dir) + logging.info("creating config directory: %s", config_dir) config_dir.mkdir(parents=True) - + keys = handshake() if keys is None: raise click.ClickException("Unable to authenticate") @@ -131,49 +173,71 @@ def configure(ctx): "consumer_secret", "access_token", "access_token_secret", - "bearer_token" + "bearer_token", ]: if keys.get(key, None): config[key] = keys[key] config.write() - click.echo(click.style(f'\nYour keys have been written to {config_file}', fg='green')) + click.echo( + 
click.style(f"\nYour keys have been written to {config_file}", fg="green") + ) click.echo() - click.echo('\nāœØ āœØ āœØ Happy twarcing! āœØ āœØ āœØ\n') + click.echo("\nāœØ āœØ āœØ Happy twarcing! āœØ āœØ āœØ\n") ctx.exit() -@twarc2.command('version') +@twarc2.command("version") def get_version(): """ Return the version of twarc that is installed. """ - click.echo(f'twarc v{version}') - - -@twarc2.command('search') -@click.option('--since-id', type=int, - help='Match tweets sent after tweet id') -@click.option('--until-id', type=int, - help='Match tweets sent prior to tweet id') -@click.option('--start-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04') -@click.option('--end-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets sent before UTC time (ISO 8601/RFC 3339)') -@click.option('--archive', is_flag=True, default=False, - help='Search the full archive (requires Academic Research track)') -@click.option('--limit', default=0, help='Maximum number of tweets to save') -@click.option('--max-results', default=0, help='Maximum number of tweets per API response') -@click.argument('query', type=str) -@click.argument('outfile', type=click.File('w'), default='-') + click.echo(f"twarc v{version}") + + +@twarc2.command("search") +@click.option("--since-id", type=int, help="Match tweets sent after tweet id") +@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") +@click.option( + "--start-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 
2021-01-01T12:31:04", +) +@click.option( + "--end-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets sent before UTC time (ISO 8601/RFC 3339)", +) +@click.option( + "--archive", + is_flag=True, + default=False, + help="Search the full archive (requires Academic Research track)", +) +@click.option("--limit", default=0, help="Maximum number of tweets to save") +@click.option( + "--max-results", default=0, help="Maximum number of tweets per API response" +) +@click.argument("query", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, - max_results, archive): +@prbar +def search( + T, + query, + outfile, + since_id, + until_id, + start_time, + end_time, + limit, + max_results, + archive, + **kwargs, +): """ Search for tweets. """ @@ -190,38 +254,50 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit, max_results = 100 search_method = T.search_recent - pbar = _id_progress_bar(since_id, until_id, start_time, end_time) - for result in search_method(query, since_id, until_id, start_time, end_time, - max_results): + pbar = kwargs["progress_bar"] + + for result in search_method( + query, since_id, until_id, start_time, end_time, max_results + ): _write(result, outfile) - pbar.update(int(result["meta"]["newest_id"]) - pbar.n) - count += len(result['data']) + + if kwargs.get("progress_bar"): + # print("Progress bar exists!") + pbar.update_ids(result["meta"]) + + count += len(result["data"]) if limit != 0 and count >= limit: + pbar.desc = f"--limit {limit} reached" break + else: + print("Finishing") + pbar.update(pbar.total - pbar.n) + + pbar.close() -@twarc2.command('tweet') -@click.option('--pretty', is_flag=True, default=False, - help='Pretty print the JSON') -@click.argument('tweet_id', type=str) -@click.argument('outfile', type=click.File('w'), default='-') + 
+@twarc2.command("tweet") +@click.option("--pretty", is_flag=True, default=False, help="Pretty print the JSON") +@click.argument("tweet_id", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def tweet(T, tweet_id, outfile, pretty): """ Look up a tweet using its tweet id or URL. """ - if 'https' in tweet_id: - tweet_id = url_or_id.split('/')[-1] - if not re.match('^\d+$', tweet_id): + if "https" in tweet_id: + tweet_id = url_or_id.split("/")[-1] + if not re.match("^\d+$", tweet_id): click.echo(click.style("Please enter a tweet URL or ID", fg="red"), err=True) result = next(T.tweet_lookup([tweet_id])) _write(result, outfile, pretty=pretty) -@twarc2.command('followers') -@click.option('--limit', default=0, help='Maximum number of followers to save') -@click.argument('user', type=str) -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("followers") +@click.option("--limit", default=0, help="Maximum number of followers to save") +@click.argument("user", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def followers(T, user, outfile, limit): @@ -232,15 +308,15 @@ def followers(T, user, outfile, limit): for result in T.followers(user): _write(result, outfile) - count += len(result['data']) + count += len(result["data"]) if limit != 0 and count >= limit: break -@twarc2.command('following') -@click.option('--limit', default=0, help='Maximum number of friends to save') -@click.argument('userd', type=str) -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("following") +@click.option("--limit", default=0, help="Maximum number of friends to save") +@click.argument("userd", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def following(T, user, outfile, limit): @@ -251,14 +327,14 @@ def following(T, user, outfile, limit): for result in T.following(user): 
_write(result, outfile) - count += len(result['data']) + count += len(result["data"]) if limit != 0 and count >= limit: break -@twarc2.command('sample') -@click.option('--limit', default=0, help='Maximum number of tweets to save') -@click.argument('outfile', type=click.File('a+'), default='-') +@twarc2.command("sample") +@click.option("--limit", default=0, help="Maximum number of tweets to save") +@click.argument("outfile", type=click.File("a+"), default="-") @click.pass_obj @cli_api_error def sample(T, outfile, limit): @@ -267,7 +343,12 @@ def sample(T, outfile, limit): """ count = 0 event = threading.Event() - click.echo(click.style(f'Started a random sample stream, writing to {outfile.name}\nCTRL+C to stop...', fg='green')) + click.echo( + click.style( + f"Started a random sample stream, writing to {outfile.name}\nCTRL+C to stop...", + fg="green", + ) + ) for result in T.sample(event=event): count += 1 if limit != 0 and count >= limit: @@ -275,9 +356,9 @@ def sample(T, outfile, limit): _write(result, outfile) -@twarc2.command('hydrate') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("hydrate") +@click.argument("infile", type=click.File("r"), default="-") +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def hydrate(T, infile, outfile): @@ -288,10 +369,10 @@ def hydrate(T, infile, outfile): _write(result, outfile) -@twarc2.command('users') -@click.option('--usernames', is_flag=True, default=False) -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("users") +@click.option("--usernames", is_flag=True, default=False) +@click.argument("infile", type=click.File("r"), default="-") +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def users(T, infile, outfile, usernames): @@ -302,19 +383,21 @@ def 
users(T, infile, outfile, usernames): _write(result, outfile) -@twarc2.command('mentions') -@click.option('--since-id', type=int, - help='Match tweets sent after tweet id') -@click.option('--until-id', type=int, - help='Match tweets sent prior to tweet id') -@click.option('--start-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets created after time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04') -@click.option('--end-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets sent before time (ISO 8601/RFC 3339)') -@click.argument('user_id', type=str) -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("mentions") +@click.option("--since-id", type=int, help="Match tweets sent after tweet id") +@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") +@click.option( + "--start-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets created after time (ISO 8601/RFC 3339), e.g. 
2021-01-01T12:31:04", +) +@click.option( + "--end-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets sent before time (ISO 8601/RFC 3339)", +) +@click.argument("user_id", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): @@ -325,32 +408,39 @@ def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): _write(result, outfile) -@twarc2.command('timeline') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.option('--since-id', type=int, - help='Match tweets sent after tweet id') -@click.option('--until-id', type=int, - help='Match tweets sent prior to tweet id') -@click.option('--start-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets created after time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04') -@click.option('--end-time', - type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')), - help='Match tweets sent before time (ISO 8601/RFC 3339)') -@click.option('--use-search', is_flag=True, default=False, - help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.') -@click.argument('user_id', type=str) -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("timeline") +@click.option("--limit", default=0, help="Maximum number of tweets to return") +@click.option("--since-id", type=int, help="Match tweets sent after tweet id") +@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") +@click.option( + "--start-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets created after time (ISO 8601/RFC 3339), e.g. 
2021-01-01T12:31:04", +) +@click.option( + "--end-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets sent before time (ISO 8601/RFC 3339)", +) +@click.option( + "--use-search", + is_flag=True, + default=False, + help="Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.", +) +@click.argument("user_id", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time, - use_search, limit): +def timeline( + T, user_id, outfile, since_id, until_id, start_time, end_time, use_search, limit +): """ Retrieve recent tweets for the given user. """ if use_search: - q = f'from:{user_id}' + q = f"from:{user_id}" tweets = T.search_all(q, since_id, until_id, start_time, end_time) else: tweets = T.timeline(user_id, since_id, until_id, start_time, end_time) @@ -359,24 +449,31 @@ def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time, for result in tweets: _write(result, outfile) - count += len(result['data']) + count += len(result["data"]) if limit != 0 and count >= limit: break -@twarc2.command('timelines') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.option('--timeline-limit', default=0, - help='Maximum number of tweets to return per-timeline') -@click.option('--use-search', is_flag=True, default=False, - help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("timelines") +@click.option("--limit", default=0, help="Maximum number of tweets to return") +@click.option( + "--timeline-limit", + default=0, + help="Maximum number of tweets to return per-timeline", +) +@click.option( + 
"--use-search", + is_flag=True, + default=False, + help="Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.", +) +@click.argument("infile", type=click.File("r"), default="-") +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj def timelines(T, infile, outfile, limit, timeline_limit, use_search): """ Fetch the timelines of every user in an input source of tweets. If - the input is a line oriented text file of user ids or usernames that will + the input is a line oriented text file of user ids or usernames that will be used instead. """ total_count = 0 @@ -389,7 +486,7 @@ def timelines(T, infile, outfile, limit, timeline_limit, use_search): users = [] try: data = ensure_flattened(json.loads(line)) - users = set([t['author']['id'] for t in ensure_flattened(data)]) + users = set([t["author"]["id"] for t in ensure_flattened(data)]) except json.JSONDecodeError: users = set([line]) except ValueError: @@ -404,9 +501,9 @@ def timelines(T, infile, outfile, limit, timeline_limit, use_search): # which api endpoint to use if use_search and since_id: - tweets = T.search_all(f'from:{user}', since_id=since_id) + tweets = T.search_all(f"from:{user}", since_id=since_id) elif use_search: - tweets = T.search_all(f'from:{user}') + tweets = T.search_all(f"from:{user}") else: tweets = T.timeline(user) @@ -414,27 +511,31 @@ def timelines(T, infile, outfile, limit, timeline_limit, use_search): for response in tweets: _write(response, outfile) - timeline_count += len(response['data']) + timeline_count += len(response["data"]) if timeline_limit != 0 and timeline_count >= timeline_limit: break - total_count += len(response['data']) + total_count += len(response["data"]) if limit != 0 and total_count >= limit: return -@twarc2.command('conversation') -@click.option('--archive', is_flag=True, default=False, - help='Search the full archive (requires Academic Research track)') 
-@click.argument('tweet_id', type=str) -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("conversation") +@click.option( + "--archive", + is_flag=True, + default=False, + help="Search the full archive (requires Academic Research track)", +) +@click.argument("tweet_id", type=str) +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def conversation(T, tweet_id, archive, outfile): """ Retrieve a conversation thread using the tweet id. """ - q = f'conversation_id:{tweet_id}' + q = f"conversation_id:{tweet_id}" if archive: search = T.search_all(q) else: @@ -443,14 +544,21 @@ def conversation(T, tweet_id, archive, outfile): _write(resp, outfile) -@twarc2.command('conversations') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.option('--conversation-limit', default=0, - help='Maximum number of tweets to return per-conversation') -@click.option('--archive', is_flag=True, default=False, - help='Use the Academic Research project track access to the full archive') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("conversations") +@click.option("--limit", default=0, help="Maximum number of tweets to return") +@click.option( + "--conversation-limit", + default=0, + help="Maximum number of tweets to return per-conversation", +) +@click.option( + "--archive", + is_flag=True, + default=False, + help="Use the Academic Research project track access to the full archive", +) +@click.argument("infile", type=click.File("r"), default="-") +@click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error def conversations(T, infile, outfile, archive, limit, conversation_limit): @@ -477,16 +585,18 @@ def conversations(T, infile, outfile, archive, limit, conversation_limit): # get a specific conversation id line = line.strip() - if re.match(r'^\d+$', line): + if 
re.match(r"^\d+$", line): if line in seen: continue conv_ids = [line] # generate all conversation_ids that are referenced in tweets input else: + def f(): for tweet in ensure_flattened(json.loads(line)): - yield tweet.get('conversation_id') + yield tweet.get("conversation_id") + conv_ids = f() # output results while paying attention to the set limits @@ -495,38 +605,44 @@ def f(): for conv_id in conv_ids: if conv_id in seen: - logging.info(f'already fetched conversation_id {conv_id}') + logging.info(f"already fetched conversation_id {conv_id}") seen.add(conv_id) conv_count = 0 - logging.info(f'fetching conversation {conv_id}') - for result in search(f'conversation_id:{conv_id}'): + logging.info(f"fetching conversation {conv_id}") + for result in search(f"conversation_id:{conv_id}"): _write(result, outfile, False) - count += len(result['data']) + count += len(result["data"]) if limit != 0 and count >= limit: - logging.info(f'reached tweet limit of {limit}') + logging.info(f"reached tweet limit of {limit}") stop = True break - conv_count += len(result['data']) - if conversation_limit !=0 and conv_count >= conversation_limit: - logging.info(f'reached conversation limit {conversation_limit}') + conv_count += len(result["data"]) + if conversation_limit != 0 and conv_count >= conversation_limit: + logging.info(f"reached conversation limit {conversation_limit}") break -@twarc2.command('flatten') -@click.argument('infile', type=click.File('r'), default='-') -@click.argument('outfile', type=click.File('w'), default='-') +@twarc2.command("flatten") +@click.argument("infile", type=click.File("r"), default="-") +@click.argument("outfile", type=click.File("w"), default="-") @cli_api_error def flatten(infile, outfile): """ "Flatten" tweets, or move expansions inline with tweet objects and ensure that each line of output is a single tweet. 
""" - if (infile.name == outfile.name): - click.echo(click.style(f"šŸ’” Cannot flatten files in-place, specify a different output file!", fg='red'), err=True) + if infile.name == outfile.name: + click.echo( + click.style( + f"šŸ’” Cannot flatten files in-place, specify a different output file!", + fg="red", + ), + err=True, + ) return for line in infile: @@ -534,9 +650,9 @@ def flatten(infile, outfile): _write(tweet, outfile, False) -@twarc2.command('stream') -@click.option('--limit', default=0, help='Maximum number of tweets to return') -@click.argument('outfile', type=click.File('a+'), default='-') +@twarc2.command("stream") +@click.option("--limit", default=0, help="Maximum number of tweets to return") +@click.argument("outfile", type=click.File("a+"), default="-") @click.pass_obj @cli_api_error def stream(T, outfile, limit): @@ -545,15 +661,16 @@ def stream(T, outfile, limit): """ event = threading.Event() count = 0 - click.echo(click.style(f'Started a stream with rules:', fg='green'), - err=True) + click.echo(click.style(f"Started a stream with rules:", fg="green"), err=True) _print_stream_rules(T) - click.echo(click.style(f'Writing to {outfile.name}\nCTRL+C to stop...', - fg='green'), err=True) + click.echo( + click.style(f"Writing to {outfile.name}\nCTRL+C to stop...", fg="green"), + err=True, + ) for result in T.stream(event=event): count += 1 if limit != 0 and count == limit: - logging.info(f'reached limit {limit}') + logging.info(f"reached limit {limit}") event.set() _write(result, outfile) @@ -567,7 +684,7 @@ def stream_rules(T): pass -@stream_rules.command('list') +@stream_rules.command("list") @click.pass_obj @cli_api_error def list_stream_rules(T): @@ -576,29 +693,34 @@ def list_stream_rules(T): """ _print_stream_rules(T) + def _print_stream_rules(T): """ Output all the active stream rules """ result = T.get_stream_rules() - if 'data' not in result or len(result['data']) == 0: - click.echo('No rules yet. 
Add them with ' + click.style('twarc2 stream-rules add', bold=True), err=True) + if "data" not in result or len(result["data"]) == 0: + click.echo( + "No rules yet. Add them with " + + click.style("twarc2 stream-rules add", bold=True), + err=True, + ) else: count = 0 - for rule in result['data']: + for rule in result["data"]: if count > 5: count = 0 - s = rule['value'] - if 'tag' in rule: + s = rule["value"] + if "tag" in rule: s += f" (tag: {rule['tag']})" - click.echo(click.style(f'ā˜‘ {s}'), err=True) + click.echo(click.style(f"ā˜‘ {s}"), err=True) count += 1 -@stream_rules.command('add') +@stream_rules.command("add") @click.pass_obj -@click.option('--tag', type=str, help='a tag to help identify the rule') -@click.argument('value', type=str) +@click.option("--tag", type=str, help="a tag to help identify the rule") +@click.argument("value", type=str) @cli_api_error def add_stream_rule(T, value, tag): """ @@ -611,14 +733,14 @@ def add_stream_rule(T, value, tag): rules = [{"value": value}] results = T.add_stream_rules(rules) - if 'errors' in results: - click.echo(_error_str(results['errors']), err=True) + if "errors" in results: + click.echo(_error_str(results["errors"]), err=True) else: - click.echo(click.style(f'šŸš€ Added rule for ', fg='green') + f'"{value}"') + click.echo(click.style(f"šŸš€ Added rule for ", fg="green") + f'"{value}"') -@stream_rules.command('delete') -@click.argument('value') +@stream_rules.command("delete") +@click.argument("value") @click.pass_obj @cli_api_error def delete_stream_rule(T, value): @@ -627,26 +749,28 @@ def delete_stream_rule(T, value): """ # find the rule id result = T.get_stream_rules() - if 'data' not in result: - click.echo(click.style('šŸ’” There are no rules to delete!', fg='red'), err=True) + if "data" not in result: + click.echo(click.style("šŸ’” There are no rules to delete!", fg="red"), err=True) else: rule_id = None - for rule in result['data']: - if rule['value'] == value: - rule_id = rule['id'] + for rule in 
result["data"]: + if rule["value"] == value: + rule_id = rule["id"] break if not rule_id: - click.echo(click.style(f'šŸ™ƒ No rule could be found for "{value}"', - fg='red'), err=True) + click.echo( + click.style(f'šŸ™ƒ No rule could be found for "{value}"', fg="red"), + err=True, + ) else: results = T.delete_stream_rule_ids([rule_id]) - if 'errors' in results: - click.echo(_error_str(results['errors']), err=True) + if "errors" in results: + click.echo(_error_str(results["errors"]), err=True) else: - click.echo(f"šŸ—‘ Deleted stream rule for {value}", color='green') + click.echo(f"šŸ—‘ Deleted stream rule for {value}", color="green") -@stream_rules.command('delete-all') +@stream_rules.command("delete-all") @click.pass_obj @cli_api_error def delete_all(T): @@ -654,28 +778,17 @@ def delete_all(T): Delete all stream rules! """ result = T.get_stream_rules() - if 'data' not in result: - click.echo(click.style('šŸ’” There are no rules to delete!', fg='red'), err=True) + if "data" not in result: + click.echo(click.style("šŸ’” There are no rules to delete!", fg="red"), err=True) else: - rule_ids = [r['id'] for r in result['data']] + rule_ids = [r["id"] for r in result["data"]] results = T.delete_stream_rule_ids(rule_ids) click.echo(f"šŸ—‘ Deleted {len(rule_ids)} rules.") -def _id_progress_bar(since_id, until_id, start_time, end_time): - """ - Snowflake ID based progress bar. 
- """ - return tqdm( - total=1, - ) - -def _date_to_snowflake(date): - return 1 - def _rule_str(rule): s = f"id={rule['id']} value={rule['value']}" - if 'tag' in rule: + if "tag" in rule: s += f" tag={rule['tag']}" return s @@ -690,20 +803,21 @@ def _error_str(errors): parts = [] for error in errors: - for part in error['errors']: + for part in error["errors"]: s = "šŸ’£ " - if 'message' in part: - s += click.style(part['message'], fg='red') - elif 'title' in part: - s += click.style(part['title'], fg='red') + if "message" in part: + s += click.style(part["message"], fg="red") + elif "title" in part: + s += click.style(part["title"], fg="red") else: - s = click.style('Unknown error', fg='red') - if 'type' in part: + s = click.style("Unknown error", fg="red") + if "type" in part: s += f" see: {part['type']}" parts.append(s) return click.style("\n".join(parts), fg="red") + def _write(results, outfile, pretty=False): indent = 2 if pretty else None click.echo(json.dumps(results, indent=indent), file=outfile) diff --git a/twarc/decorators.py b/twarc/decorators.py index 3d9003cb..5e6e803c 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -5,11 +5,11 @@ from requests import HTTPError from requests.packages.urllib3.exceptions import ReadTimeoutError -from requests.exceptions import ChunkedEncodingError, ReadTimeout, \ - ContentDecodingError +from requests.exceptions import ChunkedEncodingError, ReadTimeout, ContentDecodingError +import datetime +from tqdm.auto import tqdm - -log = logging.getLogger('twarc') +log = logging.getLogger("twarc") class InvalidAuthType(Exception): @@ -18,12 +18,118 @@ class InvalidAuthType(Exception): """ +def prbar(f): + """ + A Decorator to add aprogress bar that uses timestamp ranges + """ + + @wraps(f) + def new_f(*args, **kwargs): + # print("f", args, kwargs) + pbar = _id_progress_bar( + kwargs["since_id"], + kwargs["until_id"], + kwargs["start_time"], + kwargs["end_time"], + kwargs["outfile"], + ) + kwargs["progress_bar"] = pbar 
+ return f(*args, **kwargs) + + return new_f + + +class TimestampProgressBar(tqdm): + def __init__(self, start_time, end_time, **kwargs): + total = _date2millis(end_time) - _date2millis(start_time) + kwargs["total"] = total + super().__init__(**kwargs) + + def update_ids(self, meta): + """ + identical to update, except `n` should be current value and not delta. + """ + n = _snowflake2millis(int(meta["newest_id"])) - _snowflake2millis( + int(meta["oldest_id"]) + ) + # self.update(n - self.n) + self.update(n) + + """Provides a `total_time` format parameter""" + + @property + def format_dict(self): + d = super(TimestampProgressBar, self).format_dict + total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) + d.update(total_time=self.format_interval(total_time) + " in total") + return d + + # def close(self): + # if not super.it.hasnext(): + # self.update(self.total) + # super().close() + + +def _id_progress_bar(since_id, until_id, start_time, end_time, outfile): + """ + Snowflake ID based progress bar. 
+ """ + if start_time is None and since_id is None: + start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( + seconds=90 + ) + if end_time is None and until_id is None: + end_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( + seconds=30 + ) + # total = _date2millis(end_time) - _date2millis(start_time) if (start_time and end_time) else _snowflake2millis(until_id) - _snowflake2millis(since_id) + return TimestampProgressBar( + # total=total, + start_time=start_time, + end_time=end_time, + disable=(outfile.name == ""), + ) + + +def _date2millis(dt): + return int(dt.timestamp() * 1000) + + +def _snowflake2millis(snowflake_id): + return (snowflake_id >> 22) + 1288834974657 + + +def _millis2snowflake(ms): + return (int(ms) - 1288834974657) << 22 + + +def _millis2date(ms): + return datetime.datetime.utcfromtimestamp(ms // 1000).replace( + microsecond=ms % 1000 * 1000 + ) + + +def _date2snowflake(dt): + ms = int(dt.timestamp() * 1000) + snowflake_id = (int(ms) - 1288834974657) << 22 + return snowflake_id + + +def _snowflake2date(snowflake_id): + ms = (snowflake_id >> 22) + 1288834974657 + dt = datetime.datetime.utcfromtimestamp(ms // 1000).replace( + microsecond=ms % 1000 * 1000 + ) + return dt + + def rate_limit(f): """ A decorator to handle rate limiting from the Twitter API. If a rate limit error is encountered we will sleep until we can issue the API call again. """ + @wraps(f) def new_f(*args, **kwargs): errors = 0 @@ -40,14 +146,16 @@ def new_f(*args, **kwargs): try: resp.raise_for_status() except HTTPError as e: - message = "\nThis is a protected or locked account, or" +\ - " the credentials provided are no longer valid." + message = ( + "\nThis is a protected or locked account, or" + + " the credentials provided are no longer valid." 
+ ) e.args = (e.args[0] + message,) + e.args[1:] log.warning("401 Authentication required for %s", resp.url) raise elif resp.status_code == 429: try: - reset = int(resp.headers['x-rate-limit-reset']) + reset = int(resp.headers["x-rate-limit-reset"]) now = time.time() seconds = reset - now + 10 except KeyError: @@ -63,14 +171,18 @@ def new_f(*args, **kwargs): log.warning("too many errors from Twitter, giving up") resp.raise_for_status() seconds = 60 * errors - log.warning("%s from Twitter API, sleeping %s", - resp.status_code, seconds) + log.warning( + "%s from Twitter API, sleeping %s", resp.status_code, seconds + ) time.sleep(seconds) - elif resp.status_code== 422: - log.error("Recieved HTTP 422 response from Twitter API. Are you using the Premium API and forgot to use --sandbox or sandbox parameter?") + elif resp.status_code == 422: + log.error( + "Recieved HTTP 422 response from Twitter API. Are you using the Premium API and forgot to use --sandbox or sandbox parameter?" + ) return resp else: resp.raise_for_status() + return new_f @@ -82,6 +194,7 @@ def catch_conn_reset(f): """ try: import OpenSSL + ConnectionError = OpenSSL.SSL.SysCallError except: ConnectionError = None @@ -98,6 +211,7 @@ def new_f(self, *args, **kwargs): return f(self, *args, **kwargs) else: return f(self, *args, **kwargs) + return new_f @@ -105,6 +219,7 @@ def catch_timeout(f): """ A decorator to handle read timeouts from Twitter. """ + @wraps(f) def new_f(self, *args, **kwargs): try: @@ -113,6 +228,7 @@ def new_f(self, *args, **kwargs): log.warning("caught read timeout: %s", e) self.connect() return f(self, *args, **kwargs) + return new_f @@ -121,6 +237,7 @@ def catch_gzip_errors(f): A decorator to handle gzip encoding errors which have been known to happen during hydration. 
""" + @wraps(f) def new_f(self, *args, **kwargs): try: @@ -129,6 +246,7 @@ def new_f(self, *args, **kwargs): log.warning("caught gzip error: %s", e) self.connect() return f(self, *args, **kwargs) + return new_f @@ -146,27 +264,31 @@ def interruptible_sleep(t, event=None): else: return not event.wait(t) + def filter_protected(f): """ filter_protected will filter out protected tweets and users unless explicitly requested not to. """ + @wraps(f) def new_f(self, *args, **kwargs): for obj in f(self, *args, **kwargs): if self.protected == False: - if 'user' in obj and obj['user']['protected']: + if "user" in obj and obj["user"]["protected"]: continue - elif 'protected' in obj and obj['protected']: + elif "protected" in obj and obj["protected"]: continue yield obj return new_f -class cli_api_error(): + +class cli_api_error: """ A decorator to catch HTTP errors for the command line. """ + def __init__(self, f): self.f = f # this is needed for click help docs to work properly @@ -178,22 +300,21 @@ def __call__(self, *args, **kwargs): except HTTPError as e: try: result = e.response.json() - if 'errors' in result: - for error in result['errors']: - msg = error.get('message', 'Unknown error') - elif 'title' in result: - msg = result['title'] + if "errors" in result: + for error in result["errors"]: + msg = error.get("message", "Unknown error") + elif "title" in result: + msg = result["title"] else: - msg = 'Unknown error' + msg = "Unknown error" except ValueError: - msg = f'Unable to parse {e.response.status_code} error as JSON: {e.response.text}' + msg = f"Unable to parse {e.response.status_code} error as JSON: {e.response.text}" except InvalidAuthType as e: msg = "This command requires application authentication, try passing --app-auth" except ValueError as e: msg = str(e) click.echo( - click.style("āš” ", fg="yellow") + click.style(msg, fg="red"), - err=True + click.style("āš” ", fg="yellow") + click.style(msg, fg="red"), err=True ) @@ -202,6 +323,7 @@ def 
requires_app_auth(f): Ensure that application authentication is set for calls that only work in that mode. """ + @wraps(f) def new_f(self, *args, **kwargs): if self.auth_type != "application": From fa42b74a67261060ef082a2d03632fa8746e22ab Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 21 Jun 2021 14:01:24 +0100 Subject: [PATCH 05/43] set default progress bar properties --- twarc/decorators.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index 4e57cb88..d1c46927 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -27,18 +27,22 @@ class TimestampProgressBar(tqdm): def __init__(self, outfile, since_id, until_id, start_time, end_time, **kwargs): if start_time is None and since_id is None: - start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( - seconds=90 - ) + start_time = datetime.datetime.now( + datetime.timezone.utc + ) - datetime.timedelta(seconds=90) if end_time is None and until_id is None: - end_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( - seconds=30 + end_time = datetime.datetime.now( + datetime.timezone.utc + ) - datetime.timedelta(seconds=30) + + total = ( + _snowflake2millis(until_id) - _snowflake2millis(since_id) + if (since_id and until_id) + else _date2millis(end_time) - _date2millis(start_time) ) - total = _snowflake2millis(until_id) - _snowflake2millis(since_id) if (since_id and until_id) else _date2millis(end_time) - _date2millis(start_time) - kwargs["total"] = total - kwargs["disable"] = (outfile.name == "") + kwargs["disable"] = outfile.name == "" super().__init__(**kwargs) def update_with_snowflake(self, newest_id, oldest_id): From 32ce73534f68ca6a9e801c3126b00c645cfa5e7e Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 21 Jun 2021 14:41:01 +0100 Subject: [PATCH 06/43] addhide progress bar option --- twarc/command2.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 
deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 4aa9b9b0..62384b76 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -220,6 +220,11 @@ def get_version(): @click.option( "--max-results", default=0, help="Maximum number of tweets per API response" ) +@click.option( + "--progress/--no-progress", + default=True, + help="Show Progress bar. Default: yes", +) @click.argument("query", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @@ -235,7 +240,7 @@ def search( limit, max_results, archive, - **kwargs, + progress, ): """ Search for tweets. @@ -253,26 +258,28 @@ def search( max_results = 100 search_method = T.search_recent - pbar = kwargs["progress_bar"] + hide_progressbar = (outfile.name == "") or ( + not progress + ) # hide bar when piping or if option set with TimestampProgressBar( - outfile, since_id, until_id, start_time, end_time - ) as pbar: + hide_progressbar, since_id, until_id, start_time, end_time + ) as progress: for result in search_method( query, since_id, until_id, start_time, end_time, max_results ): _write(result, outfile) - pbar.update_with_snowflake( + progress.update_with_snowflake( result["meta"]["newest_id"], result["meta"]["oldest_id"] ) count += len(result["data"]) if limit != 0 and count >= limit: - pbar.desc = f"--limit {limit} reached" + # Display message when stopped early + progress.desc = f"Set --limit of {limit} reached" + progress.early_stop = True break - else: - pbar.update(pbar.total - pbar.n) @twarc2.command("tweet") From af6c40c96f53d8e9c824d5d03a359426f511aa10 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 21 Jun 2021 14:41:35 +0100 Subject: [PATCH 07/43] improve progress bar close behaviour --- twarc/decorators.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index d1c46927..992d64ad 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -24,7 +24,8 @@ class 
TimestampProgressBar(tqdm): This can be used to display a progress bar for tweet ids and time ranges. """ - def __init__(self, outfile, since_id, until_id, start_time, end_time, **kwargs): + def __init__(self, disable, since_id, until_id, start_time, end_time, **kwargs): + self.early_stop = False if start_time is None and since_id is None: start_time = datetime.datetime.now( @@ -41,30 +42,31 @@ def __init__(self, outfile, since_id, until_id, start_time, end_time, **kwargs): else _date2millis(end_time) - _date2millis(start_time) ) + kwargs["miniters"] = 1 kwargs["total"] = total - kwargs["disable"] = outfile.name == "" + kwargs["disable"] = disable super().__init__(**kwargs) def update_with_snowflake(self, newest_id, oldest_id): """ Update progress bar based on snowflake ids. """ - n = _snowflake2millis(int(newest_id)) - _snowflake2millis( - int(meta["oldest_id"]) - ) + n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) self.update(n) @property def format_dict(self): + # Todo: Better Custom display d = super(TimestampProgressBar, self).format_dict total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) d.update(total_time=self.format_interval(total_time) + " in total") return d - # def close(self): - # if not super.it.hasnext(): - # self.update(self.total) - # super().close() + def close(self): + if not self.early_stop: + # Finish the bar to 100% even if the last tweet ids do not cover the full time range + self.update(self.total - self.n) + super().close() def _date2millis(dt): From aa89d0736bd9f721b748940deb58b716daa8e5fe Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 21 Jun 2021 19:21:49 +0100 Subject: [PATCH 08/43] fix following command --- twarc/command2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/command2.py b/twarc/command2.py index 62384b76..0e84e1aa 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -321,7 +321,7 @@ def followers(T, user, outfile, limit): 
@twarc2.command("following") @click.option("--limit", default=0, help="Maximum number of friends to save") -@click.argument("userd", type=str) +@click.argument("user", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error From 7a3350d95c91a6f6dc6bb0b46fda404b8ce3c1f8 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 22 Jun 2021 20:13:58 +0100 Subject: [PATCH 09/43] use simple bar format for now --- twarc/decorators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index 992d64ad..9c5dbca6 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -45,6 +45,7 @@ def __init__(self, disable, since_id, until_id, start_time, end_time, **kwargs): kwargs["miniters"] = 1 kwargs["total"] = total kwargs["disable"] = disable + kwargs["bar_format"] = "{l_bar}{bar}| {total_time} [{elapsed}<{remaining}{postfix}]" super().__init__(**kwargs) def update_with_snowflake(self, newest_id, oldest_id): @@ -56,10 +57,10 @@ def update_with_snowflake(self, newest_id, oldest_id): @property def format_dict(self): - # Todo: Better Custom display + # Todo: Better Custom display, tweets / requests per second / output file size? 
d = super(TimestampProgressBar, self).format_dict total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) - d.update(total_time=self.format_interval(total_time) + " in total") + d.update(total_time=self.format_interval(total_time) + " elapsed") return d def close(self): From 30098ac39b4b9ba8531a8445aace528c03ed48bf Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Tue, 22 Jun 2021 20:59:58 +0100 Subject: [PATCH 10/43] file based progress for flatten command --- twarc/command2.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 0e84e1aa..99cd43a5 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -719,8 +719,13 @@ def f(): @twarc2.command("flatten") @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") +@click.option( + "--progress/--no-progress", + default=True, + help="Show Progress bar. Default: yes", +) @cli_api_error -def flatten(infile, outfile): +def flatten(infile, outfile, progress): """ "Flatten" tweets, or move expansions inline with tweet objects and ensure that each line of output is a single tweet. 
@@ -735,9 +740,20 @@ def flatten(infile, outfile): ) return - for line in infile: - for tweet in ensure_flattened(json.loads(line)): - _write(tweet, outfile, False) + disable_progress = (infile.name == "" or outfile.name == "") or (progress == False) + with tqdm( + unit="B", + unit_scale=True, + unit_divisor=1024, + total=os.stat(infile.name).st_size if not disable_progress else 1, + disable=disable_progress, + ) as progress: + offset = 0 + for line in infile: + offset += len(line) + for tweet in ensure_flattened(json.loads(line)): + _write(tweet, outfile, False) + progress.update(offset - progress.n) @twarc2.command("stream") From 89047be856a2b4c9752aa09e46aab076e385cd9c Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Wed, 23 Jun 2021 02:31:34 +0100 Subject: [PATCH 11/43] add stubs and make FileSizeProgressBar aware of errors --- twarc/command2.py | 144 ++++++++++++++++++++++++++++++-------------- twarc/decorators.py | 71 +++++++++++++++++++--- 2 files changed, 163 insertions(+), 52 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 99cd43a5..f2bf96be 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -9,19 +9,16 @@ import click import logging import pathlib -import datetime -import requests import configobj import threading -from tqdm.auto import tqdm from click_plugins import with_plugins from pkg_resources import iter_entry_points from twarc.version import version from twarc.handshake import handshake from twarc.config import ConfigProvider -from twarc.decorators import cli_api_error, TimestampProgressBar +from twarc.decorators import cli_api_error, TimestampProgressBar, FileSizeProgressBar from twarc.expansions import ensure_flattened from click_config_file import configuration_option @@ -221,9 +218,10 @@ def get_version(): "--max-results", default=0, help="Maximum number of tweets per API response" ) @click.option( - "--progress/--no-progress", - default=True, - help="Show Progress bar. 
Default: yes", + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", ) @click.argument("query", type=str) @click.argument("outfile", type=click.File("w"), default="-") @@ -240,7 +238,7 @@ def search( limit, max_results, archive, - progress, + hide_progress, ): """ Search for tweets. @@ -258,22 +256,16 @@ def search( max_results = 100 search_method = T.search_recent - hide_progressbar = (outfile.name == "") or ( - not progress - ) # hide bar when piping or if option set - + hide_progress = True if (outfile.name == "") else hide_progress with TimestampProgressBar( - hide_progressbar, since_id, until_id, start_time, end_time + since_id, until_id, start_time, end_time, disable=hide_progress ) as progress: for result in search_method( query, since_id, until_id, start_time, end_time, max_results ): _write(result, outfile) - - progress.update_with_snowflake( - result["meta"]["newest_id"], result["meta"]["oldest_id"] - ) count += len(result["data"]) + progress.update_with_result(result) if limit != 0 and count >= limit: # Display message when stopped early @@ -302,11 +294,17 @@ def tweet(T, tweet_id, outfile, pretty): @twarc2.command("followers") @click.option("--limit", default=0, help="Maximum number of followers to save") +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("user", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def followers(T, user, outfile, limit): +def followers(T, user, outfile, limit, hide_progress): """ Get the followers for a given user. 
""" @@ -321,11 +319,17 @@ def followers(T, user, outfile, limit): @twarc2.command("following") @click.option("--limit", default=0, help="Maximum number of friends to save") +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("user", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def following(T, user, outfile, limit): +def following(T, user, outfile, limit, hide_progress): """ Get the users who are following a given user. """ @@ -365,28 +369,52 @@ def sample(T, outfile, limit): @twarc2.command("hydrate") @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.pass_obj @cli_api_error -def hydrate(T, infile, outfile): +def hydrate(T, infile, outfile, hide_progress): """ Hydrate tweet ids. """ - for result in T.tweet_lookup(infile): - _write(result, outfile) + with FileSizeProgressBar(infile, disable=hide_progress) as progress: + for result in T.tweet_lookup(infile): + _write(result, outfile) + progress.update_with_result(result, error_resource_type="tweet") @twarc2.command("users") -@click.option("--usernames", is_flag=True, default=False) @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") +@click.option("--usernames", is_flag=True, default=False) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.pass_obj @cli_api_error -def users(T, infile, outfile, usernames): +def users(T, infile, outfile, usernames, hide_progress): """ Get data for user ids or usernames. 
""" - for result in T.user_lookup(infile, usernames): - _write(result, outfile) + with FileSizeProgressBar(infile, disable=hide_progress) as progress: + for result in T.user_lookup(infile, usernames): + _write(result, outfile) + if usernames: + progress.update_with_result( + result, + field="username", + error_resource_type="user", + error_parameter="usernames", + ) + else: + progress.update_with_result(result, error_resource_type="user") @twarc2.command("mentions") @@ -402,11 +430,19 @@ def users(T, infile, outfile, usernames): type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), help="Match tweets sent before time (ISO 8601/RFC 3339)", ) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("user_id", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): +def mentions( + T, user_id, outfile, since_id, until_id, start_time, end_time, hide_progress +): """ Retrieve the most recent tweets mentioning the given user. """ @@ -446,6 +482,12 @@ def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time): default=False, help="Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.", ) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("user_id", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @@ -462,6 +504,7 @@ def timeline( limit, exclude_retweets, exclude_replies, + hide_progress, ): """ Retrieve recent tweets for the given user. 
@@ -513,6 +556,12 @@ def timeline( default=False, help="Exclude replies from timeline", ) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @@ -525,6 +574,7 @@ def timelines( use_search, exclude_retweets, exclude_replies, + hide_progress, ): """ Fetch the timelines of every user in an input source of tweets. If @@ -617,11 +667,17 @@ def _timeline_tweets( default=False, help="Search the full archive (requires Academic Research track)", ) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("tweet_id", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def conversation(T, tweet_id, archive, outfile): +def conversation(T, tweet_id, archive, outfile, hide_progress): """ Retrieve a conversation thread using the tweet id. """ @@ -647,11 +703,19 @@ def conversation(T, tweet_id, archive, outfile): default=False, help="Use the Academic Research project track access to the full archive", ) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", +) @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def conversations(T, infile, outfile, archive, limit, conversation_limit): +def conversations( + T, infile, outfile, archive, limit, conversation_limit, hide_progress +): """ Fetch the full conversation threads that the input tweets are a part of. Alternatively the input can be a line oriented file of conversation ids. 
@@ -720,12 +784,13 @@ def f(): @click.argument("infile", type=click.File("r"), default="-") @click.argument("outfile", type=click.File("w"), default="-") @click.option( - "--progress/--no-progress", - default=True, - help="Show Progress bar. Default: yes", + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. Default: show progress, unless using pipes.", ) @cli_api_error -def flatten(infile, outfile, progress): +def flatten(infile, outfile, hide_progress): """ "Flatten" tweets, or move expansions inline with tweet objects and ensure that each line of output is a single tweet. @@ -740,20 +805,11 @@ def flatten(infile, outfile, progress): ) return - disable_progress = (infile.name == "" or outfile.name == "") or (progress == False) - with tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - total=os.stat(infile.name).st_size if not disable_progress else 1, - disable=disable_progress, - ) as progress: - offset = 0 + with FileSizeProgressBar(infile, disable=hide_progress) as progress: for line in infile: - offset += len(line) for tweet in ensure_flattened(json.loads(line)): _write(tweet, outfile, False) - progress.update(offset - progress.n) + progress.update(len(line)) @twarc2.command("stream") diff --git a/twarc/decorators.py b/twarc/decorators.py index 9c5dbca6..26e661f0 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -1,12 +1,14 @@ +import os import time import click import logging +import datetime from functools import wraps +from collections import defaultdict from requests import HTTPError from requests.packages.urllib3.exceptions import ReadTimeoutError from requests.exceptions import ChunkedEncodingError, ReadTimeout, ContentDecodingError -import datetime from tqdm.auto import tqdm log = logging.getLogger("twarc") @@ -18,15 +20,62 @@ class InvalidAuthType(Exception): """ +class FileSizeProgressBar(tqdm): + """ + A file size based progress bar. Counts an input file in bytes. 
+ Overrides `disable` parameter if file is a pipe. + """ + + def __init__(self, infile, **kwargs): + disable = False if "disable" not in kwargs else kwargs["disable"] + if infile is not None and (infile.name == ""): + disable = True + kwargs["disable"] = disable + kwargs["unit"] = "B" + kwargs["unit_scale"] = True + kwargs["unit_divisor"] = 1024 + kwargs["miniters"] = 1 + kwargs["total"] = os.stat(infile.name).st_size if not disable else 1 + super().__init__(**kwargs) + + def update_with_result( + self, result, field="id", error_resource_type=None, error_parameter="ids" + ): + # try: + for item in result["data"]: + # Use the length of the id / name and a newline to match original file + self.update(len(item[field]) + len("\n")) + if error_resource_type and "errors" in result: + for error in result["errors"]: + # Account for deleted data + # Errors have very inconsistent format, missing fields for different types of errors... + if ( + "resource_type" in error + and error["resource_type"] == error_resource_type + ): + if "parameter" in error and error["parameter"] == error_parameter: + self.update(len(error["value"]) + len("\n")) + # todo: hide or show this? + # self.set_description( + # "Errors encountered, results may be incomplete" + # ) + # print(error["value"], error["resource_type"], error["parameter"]) + # except Exception as e: + # log.error(f"Failed to update progress bar: {e}") + + class TimestampProgressBar(tqdm): """ A Timestamp based progress bar. Counts timestamp ranges in milliseconds. This can be used to display a progress bar for tweet ids and time ranges. 
""" - def __init__(self, disable, since_id, until_id, start_time, end_time, **kwargs): + def __init__(self, since_id, until_id, start_time, end_time, **kwargs): self.early_stop = False + disable = False if "disable" not in kwargs else kwargs["disable"] + kwargs["disable"] = disable + if start_time is None and since_id is None: start_time = datetime.datetime.now( datetime.timezone.utc @@ -44,20 +93,26 @@ def __init__(self, disable, since_id, until_id, start_time, end_time, **kwargs): kwargs["miniters"] = 1 kwargs["total"] = total - kwargs["disable"] = disable - kwargs["bar_format"] = "{l_bar}{bar}| {total_time} [{elapsed}<{remaining}{postfix}]" + kwargs[ + "bar_format" + ] = "{l_bar}{bar}| {total_time} [{elapsed}<{remaining}{postfix}]" super().__init__(**kwargs) - def update_with_snowflake(self, newest_id, oldest_id): + def update_with_result(self, result): """ Update progress bar based on snowflake ids. """ - n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) - self.update(n) + try: + newest_id = result["meta"]["newest_id"] + oldest_id = result["meta"]["oldest_id"] + n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) + self.update(n) + except Exception as e: + log.error(f"Failed to update progress bar: {e}") @property def format_dict(self): - # Todo: Better Custom display, tweets / requests per second / output file size? + # Todo: Better Custom display, tweets / requests per second / output file size? 
d = super(TimestampProgressBar, self).format_dict total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) d.update(total_time=self.format_interval(total_time) + " elapsed") From c17d16ce8a7c15d091bbc977b02a8ac44d3df4ea Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 00:54:16 +0100 Subject: [PATCH 12/43] allow friends and followers commands to skip a user lookup if id is known --- twarc/client2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 817d0f25..50e2dbd4 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -747,7 +747,7 @@ def mentions( exclude_replies, ) - def following(self, user): + def following(self, user, user_id=None): """ Retrieve the user profiles of accounts followed by the given user. @@ -759,13 +759,13 @@ def following(self, user): Returns: generator[dict]: A generator, dict for each page of results. """ - user_id = self._ensure_user_id(user) + user_id = self._ensure_user_id(user) if not user_id else user_id params = expansions.USER_EVERYTHING.copy() params["max_results"] = 1000 url = f"https://api.twitter.com/2/users/{user_id}/following" return self.get_paginated(url, params=params) - def followers(self, user): + def followers(self, user, user_id=None): """ Retrieve the user profiles of accounts following the given user. @@ -777,7 +777,7 @@ def followers(self, user): Returns: generator[dict]: A generator, dict for each page of results. 
""" - user_id = self._ensure_user_id(user) + user_id = self._ensure_user_id(user) if not user_id else user_id params = expansions.USER_EVERYTHING.copy() params["max_results"] = 1000 url = f"https://api.twitter.com/2/users/{user_id}/followers" From 23dddbb22f098a8cac029eaff5ce3afc17d2d860 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 01:40:35 +0100 Subject: [PATCH 13/43] add _ensure_user convenience function --- twarc/client2.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 50e2dbd4..c6a76c20 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -935,18 +935,24 @@ def connect(self): resource_owner_secret=self.access_token_secret, ) + def _id_exists(user): + """ + Returns True if the user id exists + """ + try: + error_name = next(self.user_lookup([user]))["errors"][0]["title"] + return error_name != "Not Found Error" + except KeyError: + return True + def _ensure_user_id(self, user): + """ + Always return a valid user id, look up if not numeric. + """ user = str(user) is_numeric = re.match(r"^\d+$", user) - def id_exists(user): - try: - error_name = next(self.user_lookup([user]))["errors"][0]["title"] - return error_name != "Not Found Error" - except KeyError: - return True - - if len(user) > 15 or (is_numeric and id_exists(user)): + if len(user) > 15 or (is_numeric and _id_exists(user)): return user else: results = next(self.user_lookup([user], usernames=True)) @@ -957,6 +963,25 @@ def id_exists(user): else: raise ValueError(f"No such user {user}") + def _ensure_user(self, user): + """ + Always return a valid user object. 
+ """ + user = str(user) + is_numeric = re.match(r"^\d+$", user) + + lookup = [] + if len(user) > 15 or (is_numeric and _id_exists(user)): + lookup = expansions.ensure_flattened(list(self.user_lookup([user]))) + else: + lookup = expansions.ensure_flattened( + list(self.user_lookup([user], usernames=True)) + ) + if lookup: + return lookup[-1] + else: + raise ValueError(f"No such user {user}") + def _ts(dt): """ From f8935b80b0ad9ad673cd338d3302579898c30543 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 01:43:42 +0100 Subject: [PATCH 14/43] friends and followers docs string for limit note on min. 1000 results --- twarc/command2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index e4b60b7b..6063558e 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -401,7 +401,7 @@ def tweet(T, tweet_id, outfile, pretty): @twarc2.command("followers") -@click.option("--limit", default=0, help="Maximum number of followers to save") +@click.option("--limit", default=0, help="Maximum number of followers to save. Increments of 1000.") @click.option( "--hide-progress", is_flag=True, @@ -426,7 +426,7 @@ def followers(T, user, outfile, limit, hide_progress): @twarc2.command("following") -@click.option("--limit", default=0, help="Maximum number of friends to save") +@click.option("--limit", default=0, help="Maximum number of friends to save. 
Increments of 1000.") @click.option( "--hide-progress", is_flag=True, From 1a2c2eb1ce38a539de5816f91b6a725e0fe8e2ff Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 01:45:15 +0100 Subject: [PATCH 15/43] friends and followers progress bars --- twarc/command2.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 6063558e..89edb6ac 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -417,12 +417,21 @@ def followers(T, user, outfile, limit, hide_progress): Get the followers for a given user. """ count = 0 + user_id = None + lookup_total = 0 - for result in T.followers(user): - _write(result, outfile) - count += len(result["data"]) - if limit != 0 and count >= limit: - break + if not hide_progress: + target_user = T._ensure_user(user) + user_id = target_user['id'] + lookup_total = target_user["public_metrics"]["followers_count"] + + with tqdm(disable=hide_progress, total=lookup_total) as progress: + for result in T.followers(user, user_id=user_id): + _write(result, outfile) + count += len(result["data"]) + progress.update(len(result["data"])) + if limit != 0 and count >= limit: + break @twarc2.command("following") @@ -442,12 +451,21 @@ def following(T, user, outfile, limit, hide_progress): Get the users who are following a given user. 
""" count = 0 + user_id = None + lookup_total = 0 - for result in T.following(user): - _write(result, outfile) - count += len(result["data"]) - if limit != 0 and count >= limit: - break + if not hide_progress: + target_user = T._ensure_user(user) + user_id = target_user['id'] + lookup_total = target_user["public_metrics"]["following_count"] + + with tqdm(disable=hide_progress, total=lookup_total) as progress: + for result in T.following(user, user_id=user_id): + _write(result, outfile) + count += len(result["data"]) + progress.update(len(result["data"])) + if limit != 0 and count >= limit: + break @twarc2.command("sample") From 920e79cd4da522dfb207e81187b94ba8c092720c Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 01:58:17 +0100 Subject: [PATCH 16/43] black format --- twarc/command2.py | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 89edb6ac..fdd0b791 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -12,6 +12,7 @@ import configobj import threading +from tqdm.auto import tqdm from click_plugins import with_plugins from pkg_resources import iter_entry_points @@ -308,16 +309,9 @@ def search( "--text", is_flag=True, default=False, - help="Output the counts as human readable text" + help="Output the counts as human readable text", ) -@click.option( - "--csv", - is_flag=True, - default=False, - help="Output counts as CSV" -) - - +@click.option("--csv", is_flag=True, default=False, help="Output counts as CSV") @click.argument("query", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @@ -334,7 +328,7 @@ def counts( granularity, limit, text, - csv + csv, ): """ Return counts of tweets matching a query. 
@@ -347,7 +341,7 @@ def counts( count_method = T.counts_recent if csv: - click.echo(f'start,end,{granularity}_count', file=outfile) + click.echo(f"start,end,{granularity}_count", file=outfile) total_tweets = 0 @@ -360,11 +354,11 @@ def counts( granularity, ): if text: - for r in result['data']: - total_tweets += r['tweet_count'] - click.echo('{start} - {end}: {tweet_count:,}'.format(**r), file=outfile) + for r in result["data"]: + total_tweets += r["tweet_count"] + click.echo("{start} - {end}: {tweet_count:,}".format(**r), file=outfile) elif csv: - for r in result['data']: + for r in result["data"]: click.echo(f'{r["start"]},{r["end"]},{r["tweet_count"]}', file=outfile) else: _write(result, outfile) @@ -374,11 +368,8 @@ def counts( if text: click.echo( - click.style( - '\nTotal Tweets: {:,}\n'.format(total_tweets), - fg='green' - ), - file=outfile + click.style("\nTotal Tweets: {:,}\n".format(total_tweets), fg="green"), + file=outfile, ) @@ -401,7 +392,11 @@ def tweet(T, tweet_id, outfile, pretty): @twarc2.command("followers") -@click.option("--limit", default=0, help="Maximum number of followers to save. Increments of 1000.") +@click.option( + "--limit", + default=0, + help="Maximum number of followers to save. Increments of 1000.", +) @click.option( "--hide-progress", is_flag=True, @@ -422,7 +417,7 @@ def followers(T, user, outfile, limit, hide_progress): if not hide_progress: target_user = T._ensure_user(user) - user_id = target_user['id'] + user_id = target_user["id"] lookup_total = target_user["public_metrics"]["followers_count"] with tqdm(disable=hide_progress, total=lookup_total) as progress: @@ -435,7 +430,9 @@ def followers(T, user, outfile, limit, hide_progress): @twarc2.command("following") -@click.option("--limit", default=0, help="Maximum number of friends to save. Increments of 1000.") +@click.option( + "--limit", default=0, help="Maximum number of friends to save. Increments of 1000." 
+) @click.option( "--hide-progress", is_flag=True, @@ -456,7 +453,7 @@ def following(T, user, outfile, limit, hide_progress): if not hide_progress: target_user = T._ensure_user(user) - user_id = target_user['id'] + user_id = target_user["id"] lookup_total = target_user["public_metrics"]["following_count"] with tqdm(disable=hide_progress, total=lookup_total) as progress: From 98a1d3e98586d5a3efcad2f8c06be7a4fc768a2f Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 02:29:01 +0100 Subject: [PATCH 17/43] help strings --- twarc/command2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index fdd0b791..65a0784f 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -401,7 +401,7 @@ def tweet(T, tweet_id, outfile, pretty): "--hide-progress", is_flag=True, default=False, - help="Hide the Progress bar. Default: show progress, unless using pipes.", + help="Hide the Progress bar. Default: show progress", ) @click.argument("user", type=str) @click.argument("outfile", type=click.File("w"), default="-") @@ -437,7 +437,7 @@ def followers(T, user, outfile, limit, hide_progress): "--hide-progress", is_flag=True, default=False, - help="Hide the Progress bar. Default: show progress, unless using pipes.", + help="Hide the Progress bar. Default: show progress", ) @click.argument("user", type=str) @click.argument("outfile", type=click.File("w"), default="-") From e70f5fb9dd714cd3b5d591a88782c0c543709165 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 02:41:16 +0100 Subject: [PATCH 18/43] use counts endpoint for search progress bar --- twarc/command2.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 65a0784f..88a1f44f 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -245,9 +245,11 @@ def search( Search for tweets. 
""" count = 0 + lookup_total = 0 if archive: search_method = T.search_all + count_method = T.counts_all # default number of tweets per response 500 when not set otherwise if max_results == 0: @@ -256,22 +258,38 @@ def search( if max_results == 0: max_results = 100 search_method = T.search_recent + count_method = T.counts_recent hide_progress = True if (outfile.name == "") else hide_progress - with TimestampProgressBar( - since_id, until_id, start_time, end_time, disable=hide_progress - ) as progress: + + if not hide_progress: + try: + # Single request just for the total + count_lookup = next( + count_method(query, since_id, until_id, start_time, end_time, "day") + ) + lookup_total = count_lookup["meta"]["total_tweet_count"] + except Exception as e: + logging.error("Failed getting counts:", e) + click.echo( + click.style( + f"Failed to get counts, progress bar will not work, but continuing to search. Check twarc.log for errors.", + fg="red", + ), + err=True, + ) + + with tqdm(disable=hide_progress, total=lookup_total) as progress: for result in search_method( query, since_id, until_id, start_time, end_time, max_results ): _write(result, outfile) count += len(result["data"]) - progress.update_with_result(result) + progress.update(len(result["data"])) if limit != 0 and count >= limit: # Display message when stopped early progress.desc = f"Set --limit of {limit} reached" - progress.early_stop = True break From be9afa487f01917e40c618b8e4bc355b2bd368d3 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 03:18:45 +0100 Subject: [PATCH 19/43] change default timestamp progress bar start time --- twarc/decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index 26e661f0..2315f150 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -79,7 +79,7 @@ def __init__(self, since_id, until_id, start_time, end_time, **kwargs): if start_time is None and since_id is None: start_time = 
datetime.datetime.now( datetime.timezone.utc - ) - datetime.timedelta(seconds=90) + ) - datetime.timedelta(days=7) if end_time is None and until_id is None: end_time = datetime.datetime.now( datetime.timezone.utc From bc4c2ab672c790e69176ddd180ab44f52f5ff184 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 03:19:03 +0100 Subject: [PATCH 20/43] mentions progressbar --- twarc/command2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 88a1f44f..972e98c5 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -575,7 +575,7 @@ def users(T, infile, outfile, usernames, hide_progress): "--hide-progress", is_flag=True, default=False, - help="Hide the Progress bar. Default: show progress, unless using pipes.", + help="Hide the Progress bar. Default: show progress", ) @click.argument("user_id", type=str) @click.argument("outfile", type=click.File("w"), default="-") @@ -585,10 +585,13 @@ def mentions( T, user_id, outfile, since_id, until_id, start_time, end_time, hide_progress ): """ - Retrieve the most recent tweets mentioning the given user. + Retrieve max of 800 of the most recent tweets mentioning the given user. 
""" - for result in T.mentions(user_id, since_id, until_id, start_time, end_time): - _write(result, outfile) + + with tqdm(disable=hide_progress, total=800) as progress: + for result in T.mentions(user_id, since_id, until_id, start_time, end_time): + _write(result, outfile) + progress.update(len(result["data"])) @twarc2.command("timeline") From e76592ada71e76ad378931b39a063bdbefef58a5 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 03:27:02 +0100 Subject: [PATCH 21/43] handle errors in file size progress bar --- twarc/decorators.py | 48 ++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index 2315f150..06ecc00a 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -22,7 +22,8 @@ class InvalidAuthType(Exception): class FileSizeProgressBar(tqdm): """ - A file size based progress bar. Counts an input file in bytes. + An input file size based progress bar. Counts an input file in bytes. + This will also dig into the responses and add up the outputs to match the file size. Overrides `disable` parameter if file is a pipe. """ @@ -41,27 +42,30 @@ def __init__(self, infile, **kwargs): def update_with_result( self, result, field="id", error_resource_type=None, error_parameter="ids" ): - # try: - for item in result["data"]: - # Use the length of the id / name and a newline to match original file - self.update(len(item[field]) + len("\n")) - if error_resource_type and "errors" in result: - for error in result["errors"]: - # Account for deleted data - # Errors have very inconsistent format, missing fields for different types of errors... - if ( - "resource_type" in error - and error["resource_type"] == error_resource_type - ): - if "parameter" in error and error["parameter"] == error_parameter: - self.update(len(error["value"]) + len("\n")) - # todo: hide or show this? 
- # self.set_description( - # "Errors encountered, results may be incomplete" - # ) - # print(error["value"], error["resource_type"], error["parameter"]) - # except Exception as e: - # log.error(f"Failed to update progress bar: {e}") + try: + for item in result["data"]: + # Use the length of the id / name and a newline to match original file + self.update(len(item[field]) + len("\n")) + if error_resource_type and "errors" in result: + for error in result["errors"]: + # Account for deleted data + # Errors have very inconsistent format, missing fields for different types of errors... + if ( + "resource_type" in error + and error["resource_type"] == error_resource_type + ): + if ( + "parameter" in error + and error["parameter"] == error_parameter + ): + self.update(len(error["value"]) + len("\n")) + # todo: hide or show this? + # self.set_description( + # "Errors encountered, results may be incomplete" + # ) + # print(error["value"], error["resource_type"], error["parameter"]) + except Exception as e: + log.error(f"Failed to update progress bar: {e}") class TimestampProgressBar(tqdm): From 293bafcbfc86627fbfe047783756af1b6908c777 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 03:49:48 +0100 Subject: [PATCH 22/43] rework search & conversation to use the same method --- twarc/command2.py | 149 +++++++++++++++++++++++++++++++++------------- 1 file changed, 108 insertions(+), 41 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 972e98c5..d34339e7 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -5,6 +5,7 @@ import os import re import json +from urllib.parse import quote import twarc import click import logging @@ -195,40 +196,7 @@ def get_version(): click.echo(f"twarc v{version}") -@twarc2.command("search") -@click.option("--since-id", type=int, help="Match tweets sent after tweet id") -@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") -@click.option( - "--start-time", - 
type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), - help="Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04", -) -@click.option( - "--end-time", - type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), - help="Match tweets sent before UTC time (ISO 8601/RFC 3339)", -) -@click.option( - "--archive", - is_flag=True, - default=False, - help="Search the full archive (requires Academic Research track)", -) -@click.option("--limit", default=0, help="Maximum number of tweets to save") -@click.option( - "--max-results", default=0, help="Maximum number of tweets per API response" -) -@click.option( - "--hide-progress", - is_flag=True, - default=False, - help="Hide the Progress bar. Default: show progress, unless using pipes.", -) -@click.argument("query", type=str) -@click.argument("outfile", type=click.File("w"), default="-") -@click.pass_obj -@cli_api_error -def search( +def _search( T, query, outfile, @@ -293,6 +261,70 @@ def search( break +@twarc2.command("search") +@click.option("--since-id", type=int, help="Match tweets sent after tweet id") +@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") +@click.option( + "--start-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04", +) +@click.option( + "--end-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets sent before UTC time (ISO 8601/RFC 3339)", +) +@click.option( + "--archive", + is_flag=True, + default=False, + help="Search the full archive (requires Academic Research track)", +) +@click.option("--limit", default=0, help="Maximum number of tweets to save") +@click.option( + "--max-results", default=0, help="Maximum number of tweets per API response" +) +@click.option( + "--hide-progress", + is_flag=True, + default=False, + help="Hide the Progress bar. 
Default: show progress, unless using pipes.", +) +@click.argument("query", type=str) +@click.argument("outfile", type=click.File("w"), default="-") +@click.pass_obj +@cli_api_error +def search( + T, + query, + outfile, + since_id, + until_id, + start_time, + end_time, + limit, + max_results, + archive, + hide_progress, +): + """ + Search for tweets. + """ + return _search( + T, + query, + outfile, + since_id, + until_id, + start_time, + end_time, + limit, + max_results, + archive, + hide_progress, + ) + + @twarc2.command("counts") @click.option("--since-id", type=int, help="Count tweets sent after tweet id") @click.option("--until-id", type=int, help="Count tweets sent prior to tweet id") @@ -836,12 +868,28 @@ def _timeline_tweets( @twarc2.command("conversation") +@click.option("--since-id", type=int, help="Match tweets sent after tweet id") +@click.option("--until-id", type=int, help="Match tweets sent prior to tweet id") +@click.option( + "--start-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 
2021-01-01T12:31:04", +) +@click.option( + "--end-time", + type=click.DateTime(formats=("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S")), + help="Match tweets sent before UTC time (ISO 8601/RFC 3339)", +) @click.option( "--archive", is_flag=True, default=False, help="Search the full archive (requires Academic Research track)", ) +@click.option("--limit", default=0, help="Maximum number of tweets to save") +@click.option( + "--max-results", default=0, help="Maximum number of tweets per API response" +) @click.option( "--hide-progress", is_flag=True, @@ -852,17 +900,36 @@ def _timeline_tweets( @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @cli_api_error -def conversation(T, tweet_id, archive, outfile, hide_progress): +def conversation( + T, + tweet_id, + outfile, + since_id, + until_id, + start_time, + end_time, + limit, + max_results, + archive, + hide_progress, +): """ Retrieve a conversation thread using the tweet id. """ q = f"conversation_id:{tweet_id}" - if archive: - search = T.search_all(q) - else: - search = T.search_recent(q) - for resp in search: - _write(resp, outfile) + return _search( + T, + q, + outfile, + since_id, + until_id, + start_time, + end_time, + limit, + max_results, + archive, + hide_progress, + ) @twarc2.command("conversations") From c3d6526f25b3d0e15cf4831384a9da45022c5a01 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Mon, 28 Jun 2021 04:20:45 +0100 Subject: [PATCH 23/43] timeline progressbars --- twarc/command2.py | 228 +++++++++++++++++++++++++--------------------- 1 file changed, 126 insertions(+), 102 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index d34339e7..5a0a1a46 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -699,12 +699,28 @@ def timeline( ) count = 0 - for result in tweets: - _write(result, outfile) - count += len(result["data"]) - if limit != 0 and count >= limit: - break + pbar = tqdm(disable=hide_progress, total=3200) + if use_search: + pbar = TimestampProgressBar( + 
since_id, until_id, start_time, end_time, disable=hide_progress + ) + + with pbar as progress: + for result in tweets: + _write(result, outfile) + + count += len(result["data"]) + if use_search and isinstance(pbar, TimestampProgressBar): + progress.update_with_result(result) + else: + progress.update(len(result["data"])) + + if limit != 0 and count >= limit: + # Display message when stopped early + progress.desc = f"Set --limit of {limit} reached" + progress.early_stop = True + break @twarc2.command("timelines") @@ -766,74 +782,79 @@ def timelines( total_count = 0 line_count = 0 seen = set() - for line in infile: - line_count += 1 - line = line.strip() - if line == "": - logging.warn("skipping blank line on line %s", line_count) - continue - - users = None - try: - # first try to get user ids from a flattened Twitter response - json_data = json.loads(line) - try: - users = set([t["author"]["id"] for t in ensure_flattened(json_data)]) - except (KeyError, ValueError): - # if it's not tweet JSON but it parsed as a string use that as a user - if isinstance(json_data, str) and json_data: - users = set([json_data]) - else: - logging.warn( - "ignored line %s which didn't contain users", line_count - ) - continue - except json.JSONDecodeError: - # assume it's a single user - users = set([line]) + with FileSizeProgressBar(infile, disable=hide_progress) as progress: + for line in infile: + progress.update(len(line)) + line_count += 1 + line = line.strip() + if line == "": + logging.warn("skipping blank line on line %s", line_count) + continue - if users is None: - click.echo( - click.style( - f"unable to find user or users on line {line_count}", - fg="red", - ), - err=True, - ) - break + users = None + try: + # first try to get user ids from a flattened Twitter response + json_data = json.loads(line) + try: + users = set( + [t["author"]["id"] for t in ensure_flattened(json_data)] + ) + except (KeyError, ValueError): + # if it's not tweet JSON but it parsed as a string use 
that as a user + if isinstance(json_data, str) and json_data: + users = set([json_data]) + else: + logging.warn( + "ignored line %s which didn't contain users", line_count + ) + continue + + except json.JSONDecodeError: + # assume it's a single user + users = set([line]) + + if users is None: + click.echo( + click.style( + f"unable to find user or users on line {line_count}", + fg="red", + ), + err=True, + ) + break - for user in users: + for user in users: - # only process a given user once - if user in seen: - logging.info("already processed %s, skipping", user) - continue - seen.add(user) - - tweets = _timeline_tweets( - T, - use_search, - user, - None, - None, - None, - None, - exclude_retweets, - exclude_replies, - ) + # only process a given user once + if user in seen: + logging.info("already processed %s, skipping", user) + continue + seen.add(user) + + tweets = _timeline_tweets( + T, + use_search, + user, + None, + None, + None, + None, + exclude_retweets, + exclude_replies, + ) - timeline_count = 0 - for response in tweets: - _write(response, outfile) + timeline_count = 0 + for response in tweets: + _write(response, outfile) - timeline_count += len(response["data"]) - if timeline_limit != 0 and timeline_count >= timeline_limit: - break + timeline_count += len(response["data"]) + if timeline_limit != 0 and timeline_count >= timeline_limit: + break - total_count += len(response["data"]) - if limit != 0 and total_count >= limit: - return + total_count += len(response["data"]) + if limit != 0 and total_count >= limit: + return def _timeline_tweets( @@ -972,54 +993,57 @@ def conversations( count = 0 stop = False - for line in infile: - conv_ids = [] - # stop will get set when the total tweet limit has been met - if stop: - break + with FileSizeProgressBar(infile, disable=hide_progress) as progress: + for line in infile: + progress.update(len(line)) + conv_ids = [] - # get a specific conversation id - line = line.strip() - if re.match(r"^\d+$", line): - if line 
in seen: - continue - conv_ids = [line] + # stop will get set when the total tweet limit has been met + if stop: + break - # generate all conversation_ids that are referenced in tweets input - else: + # get a specific conversation id + line = line.strip() + if re.match(r"^\d+$", line): + if line in seen: + continue + conv_ids = [line] + + # generate all conversation_ids that are referenced in tweets input + else: - def f(): - for tweet in ensure_flattened(json.loads(line)): - yield tweet.get("conversation_id") + def f(): + for tweet in ensure_flattened(json.loads(line)): + yield tweet.get("conversation_id") - conv_ids = f() + conv_ids = f() - # output results while paying attention to the set limits - conv_count = 0 + # output results while paying attention to the set limits + conv_count = 0 - for conv_id in conv_ids: + for conv_id in conv_ids: - if conv_id in seen: - logging.info(f"already fetched conversation_id {conv_id}") - seen.add(conv_id) + if conv_id in seen: + logging.info(f"already fetched conversation_id {conv_id}") + seen.add(conv_id) - conv_count = 0 + conv_count = 0 - logging.info(f"fetching conversation {conv_id}") - for result in search(f"conversation_id:{conv_id}"): - _write(result, outfile, False) + logging.info(f"fetching conversation {conv_id}") + for result in search(f"conversation_id:{conv_id}"): + _write(result, outfile, False) - count += len(result["data"]) - if limit != 0 and count >= limit: - logging.info(f"reached tweet limit of {limit}") - stop = True - break + count += len(result["data"]) + if limit != 0 and count >= limit: + logging.info(f"reached tweet limit of {limit}") + stop = True + break - conv_count += len(result["data"]) - if conversation_limit != 0 and conv_count >= conversation_limit: - logging.info(f"reached conversation limit {conversation_limit}") - break + conv_count += len(result["data"]) + if conversation_limit != 0 and conv_count >= conversation_limit: + logging.info(f"reached conversation limit {conversation_limit}") 
+ break @twarc2.command("flatten") From 54e71c282a7758410e95923128ba34f12c0a42e1 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 2 Jul 2021 13:26:43 +0100 Subject: [PATCH 24/43] move os import --- twarc/decorators.py | 1 - twarc/decorators2.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/decorators.py b/twarc/decorators.py index 23d6e7fc..ff667e45 100644 --- a/twarc/decorators.py +++ b/twarc/decorators.py @@ -1,4 +1,3 @@ -import os import time import logging diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 61355dc0..914e9da2 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -1,3 +1,4 @@ +import os import time import click import types From 40aca41969ffd2cac784cd83d351d1a4f57d5061 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 2 Jul 2021 13:57:38 +0100 Subject: [PATCH 25/43] fix exception handling when counts fail --- twarc/command2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/twarc/command2.py b/twarc/command2.py index a6ac0319..e25bb6cf 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -232,6 +232,7 @@ def _search( hide_progress = True if (outfile.name == "") else hide_progress if not hide_progress: + try: # Single request just for the total count_lookup = next( @@ -239,7 +240,7 @@ def _search( ) lookup_total = count_lookup["meta"]["total_tweet_count"] except Exception as e: - log.error("Failed getting counts:", e) + log.error(f"Failed getting counts: {str(e)}") click.echo( click.style( f"Failed to get counts, progress bar will not work, but continuing to search. 
Check twarc.log for errors.", From 568d300b51ccf15a34c0617df3ceb42823df0ed4 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 2 Jul 2021 14:45:27 +0100 Subject: [PATCH 26/43] move default start time to command2.py --- twarc/client2.py | 14 -------------- twarc/decorators2.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 28fa4342..e9ea23dc 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -23,8 +23,6 @@ log = logging.getLogger("twarc") -TWITTER_EPOCH = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc) - class Twarc2: """ @@ -227,12 +225,6 @@ def search_all( """ url = "https://api.twitter.com/2/tweets/search/all" - # start time defaults to the beginning of Twitter to override the - # default of the last month. Only do this if start_time is not already - # specified and since_id isn't being used - if start_time is None and since_id is None: - start_time = TWITTER_EPOCH - return self._search( url, query, @@ -319,12 +311,6 @@ def counts_all( """ url = "https://api.twitter.com/2/tweets/counts/all" - # start time defaults to the beginning of Twitter to override the - # default of the last month. 
Only do this if start_time is not already - # specified and since_id isn't being used - if start_time is None and since_id is None: - start_time = TWITTER_EPOCH - return self._search( url, query, diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 914e9da2..bfa345d9 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -305,3 +305,20 @@ def _date2snowflake(dt): def _snowflake2date(snowflake_id): return _millis2date(_snowflake2millis(snowflake_id)) + + +def _time_delta(since_id, until_id, start_time, end_time): + if start_time is None and since_id is None: + start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( + days=7 + ) + if end_time is None and until_id is None: + end_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( + seconds=30 + ) + + if since_id and until_id: + start_time = _millis2date(_snowflake2millis(since_id)) + end_time = _millis2date(_snowflake2millis(until_id)) + + return start_time - end_time From e2d4156cb636ed49c05e8831d2880ca9cad18975 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 2 Jul 2021 14:45:52 +0100 Subject: [PATCH 27/43] fix non utx dates for time calculations and imporove search progress bar --- twarc/command2.py | 46 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index e25bb6cf..39f49e50 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -5,24 +5,32 @@ import os import re import json -from urllib.parse import quote import twarc import click import logging import pathlib +import datetime import configobj import threading from tqdm.auto import tqdm +from datetime import timezone +from urllib.parse import quote from click_plugins import with_plugins from pkg_resources import iter_entry_points from twarc.version import version from twarc.handshake import handshake from twarc.config import ConfigProvider -from twarc.decorators2 import cli_api_error, 
TimestampProgressBar, FileSizeProgressBar from twarc.expansions import ensure_flattened from click_config_file import configuration_option +from twarc.decorators2 import ( + cli_api_error, + TimestampProgressBar, + FileSizeProgressBar, + _time_delta, +) + config_provider = ConfigProvider() log = logging.getLogger("twarc") @@ -216,6 +224,12 @@ def _search( count = 0 lookup_total = 0 + # Make sure times are always in UTC, click sometimes doesn't add timezone: + if start_time is not None and start_time.tzinfo is None: + start_time = start_time.replace(tzinfo=timezone.utc) + if end_time is not None and end_time.tzinfo is None: + end_time = end_time.replace(tzinfo=timezone.utc) + if archive: search_method = T.search_all count_method = T.counts_all @@ -223,6 +237,12 @@ def _search( # default number of tweets per response 500 when not set otherwise if max_results == 0: max_results = 500 + + # start time defaults to the beginning of Twitter to override the + # default of the last month. Only do this if start_time is not already + # specified and since_id isn't being used + if start_time is None and since_id is None: + start_time = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc) else: if max_results == 0: max_results = 100 @@ -230,34 +250,42 @@ def _search( count_method = T.counts_recent hide_progress = True if (outfile.name == "") else hide_progress + short_timespan = abs(_time_delta(since_id, until_id, start_time, end_time).days) < 30 + pbar = TimestampProgressBar( + since_id, until_id, start_time, end_time, disable=hide_progress + ) - if not hide_progress: - + if not hide_progress and short_timespan: try: # Single request just for the total count_lookup = next( count_method(query, since_id, until_id, start_time, end_time, "day") ) lookup_total = count_lookup["meta"]["total_tweet_count"] + pbar = tqdm(disable=hide_progress, total=lookup_total) except Exception as e: log.error(f"Failed getting counts: {str(e)}") click.echo( click.style( - f"Failed to get 
counts, progress bar will not work, but continuing to search. Check twarc.log for errors.", + f"šŸ’” Failed to get tweet counts, but continuing to search. Check twarc.log for errors.", fg="red", ), err=True, ) - with tqdm(disable=hide_progress, total=lookup_total) as progress: + with pbar as progress: for result in search_method( query, since_id, until_id, start_time, end_time, max_results ): _write(result, outfile) - + tweet_ids = [t["id"] for t in result.get("data", [])] log.info("archived %s", ",".join(tweet_ids)) - progress.update(len(result["data"])) + + if isinstance(pbar, TimestampProgressBar): + progress.update_with_result(result) + else: + progress.update(len(result["data"])) count += len(result["data"]) if limit != 0 and count >= limit: @@ -283,7 +311,7 @@ def _search( "--archive", is_flag=True, default=False, - help="Search the full archive (requires Academic Research track)", + help="Search the full archive (requires Academic Research track). Defaults to searching the entire twitter archive if --start-time is not specified.", ) @click.option("--limit", default=0, help="Maximum number of tweets to save") @click.option( From 8ff8b6eb48e4cdebffb639e4e9dab6725dabc62b Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Fri, 2 Jul 2021 14:46:58 +0100 Subject: [PATCH 28/43] formatting --- twarc/command2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/twarc/command2.py b/twarc/command2.py index 39f49e50..5161fc74 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -250,7 +250,9 @@ def _search( count_method = T.counts_recent hide_progress = True if (outfile.name == "") else hide_progress - short_timespan = abs(_time_delta(since_id, until_id, start_time, end_time).days) < 30 + short_timespan = ( + abs(_time_delta(since_id, until_id, start_time, end_time).days) < 30 + ) pbar = TimestampProgressBar( since_id, until_id, start_time, end_time, disable=hide_progress ) From 1317f90eb83f7c992bde7a948d60e881560d4206 Mon Sep 17 00:00:00 2001 
From: Igor Brigadir Date: Fri, 2 Jul 2021 14:51:34 +0100 Subject: [PATCH 29/43] better early stop behaviour for timestamp progress bar --- twarc/command2.py | 2 ++ twarc/decorators2.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/twarc/command2.py b/twarc/command2.py index 5161fc74..38bdba21 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -293,6 +293,8 @@ def _search( if limit != 0 and count >= limit: # Display message when stopped early progress.desc = f"Set --limit of {limit} reached" + if isinstance(pbar, TimestampProgressBar): + progress.early_stop = True break diff --git a/twarc/decorators2.py b/twarc/decorators2.py index bfa345d9..ad549814 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -227,7 +227,7 @@ class TimestampProgressBar(tqdm): """ def __init__(self, since_id, until_id, start_time, end_time, **kwargs): - self.early_stop = False + self.early_stop = True disable = False if "disable" not in kwargs else kwargs["disable"] kwargs["disable"] = disable @@ -263,6 +263,7 @@ def update_with_result(self, result): oldest_id = result["meta"]["oldest_id"] n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) self.update(n) + early_stop = False except Exception as e: log.error(f"Failed to update progress bar: {e}") From f144bb270ae280a0b76556e1d0ded8382cb1f88a Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 00:50:18 +0100 Subject: [PATCH 30/43] add fancy progress bar output with humanize library --- requirements.txt | 1 + twarc/decorators2.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 832808fb..6f52c0ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ python-dateutil requests_oauthlib tqdm +humanize click click-plugins click-config-file diff --git a/twarc/decorators2.py b/twarc/decorators2.py index ad549814..c3bcecfc 100644 --- a/twarc/decorators2.py +++ 
b/twarc/decorators2.py @@ -6,6 +6,7 @@ import requests import datetime +import humanize from tqdm.auto import tqdm from functools import wraps from collections import defaultdict @@ -228,6 +229,7 @@ class TimestampProgressBar(tqdm): def __init__(self, since_id, until_id, start_time, end_time, **kwargs): self.early_stop = True + self.tweet_count = 0 disable = False if "disable" not in kwargs else kwargs["disable"] kwargs["disable"] = disable @@ -251,7 +253,7 @@ def __init__(self, since_id, until_id, start_time, end_time, **kwargs): kwargs["total"] = total kwargs[ "bar_format" - ] = "{l_bar}{bar}| {total_time} [{elapsed}<{remaining}{postfix}]" + ] = "{l_bar}{bar}| Processed {n_time}/{total_time} [{elapsed}<{remaining}, {tweet_count} tweets total {postfix}]" super().__init__(**kwargs) def update_with_result(self, result): @@ -263,16 +265,23 @@ def update_with_result(self, result): oldest_id = result["meta"]["oldest_id"] n = _snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) self.update(n) - early_stop = False + self.tweet_count += len(result["data"]) + self.early_stop = False except Exception as e: log.error(f"Failed to update progress bar: {e}") @property def format_dict(self): - # Todo: Better Custom display, tweets / requests per second / output file size? 
- d = super(TimestampProgressBar, self).format_dict - total_time = d["elapsed"] * (d["total"] or 0) / max(d["n"], 1) - d.update(total_time=self.format_interval(total_time) + " elapsed") + d = super(TimestampProgressBar, self).format_dict # original format dict + tweets_per_second = int(self.tweet_count / d["elapsed"] if d["elapsed"] else 0) + n_time = humanize.naturaldelta(datetime.timedelta(seconds=int(d["n"]) // 1000)) + total_time = humanize.naturaldelta( + datetime.timedelta(seconds=int(d["total"]) // 1000) + ) + d.update(n_time=n_time) + d.update(total_time=total_time) + d.update(tweet_count=self.tweet_count) + d.update(tweets_per_second=tweets_per_second) return d def close(self): From 37f0242fd8104344881707c172d91b328cfea43d Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 01:43:14 +0100 Subject: [PATCH 31/43] fix ensure_user_id after refactor --- twarc/client2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/twarc/client2.py b/twarc/client2.py index 7e2c8bdc..686bdcea 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -861,7 +861,7 @@ def connect(self): resource_owner_secret=self.access_token_secret, ) - def _id_exists(user): + def _id_exists(self, user): """ Returns True if the user id exists """ From ff2f2907d616063194d4fe32ed16515a19ac9cb4 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 01:51:27 +0100 Subject: [PATCH 32/43] black format --- twarc/client2.py | 2 +- twarc/command2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 093c3a8c..2b5097fc 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -198,7 +198,7 @@ def search_all( until_id=None, start_time=None, end_time=None, - max_results=100, # temp fix for #504 + max_results=100, # temp fix for #504 ): """ Search Twitter for the given query in the full archive, diff --git a/twarc/command2.py b/twarc/command2.py index 9fa70d11..ef59b3dc 100644 --- a/twarc/command2.py +++ 
b/twarc/command2.py @@ -236,7 +236,7 @@ def _search( # default number of tweets per response 500 when not set otherwise if max_results == 0: - max_results = 100, # temp fix for #504 + max_results = (100,) # temp fix for #504 # start time defaults to the beginning of Twitter to override the # default of the last month. Only do this if start_time is not already From 7cc16adce2fa222758266d18262d8ba34604d0a4 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 02:45:37 +0100 Subject: [PATCH 33/43] pbar for search early stopping and timelines and id_exists fix --- twarc/client2.py | 4 +-- twarc/command2.py | 66 +++++++++++++++----------------------------- twarc/decorators2.py | 1 - 3 files changed, 24 insertions(+), 47 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 2b5097fc..e63c7405 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -878,7 +878,7 @@ def _ensure_user_id(self, user): user = str(user) is_numeric = re.match(r"^\d+$", user) - if len(user) > 15 or (is_numeric and _id_exists(user)): + if len(user) > 15 or (is_numeric and self._id_exists(user)): return user else: results = next(self.user_lookup([user], usernames=True)) @@ -897,7 +897,7 @@ def _ensure_user(self, user): is_numeric = re.match(r"^\d+$", user) lookup = [] - if len(user) > 15 or (is_numeric and _id_exists(user)): + if len(user) > 15 or (is_numeric and self._id_exists(user)): lookup = expansions.ensure_flattened(list(self.user_lookup([user]))) else: lookup = expansions.ensure_flattened( diff --git a/twarc/command2.py b/twarc/command2.py index 183fbcd6..2cad4f35 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -222,7 +222,6 @@ def _search( Search for tweets. 
""" count = 0 - lookup_total = 0 # Make sure times are always in UTC, click sometimes doesn't add timezone: if start_time is not None and start_time.tzinfo is None: @@ -232,7 +231,6 @@ def _search( if archive: search_method = T.search_all - count_method = T.counts_all # default number of tweets per response 500 when not set otherwise if max_results == 0: @@ -247,55 +245,26 @@ def _search( if max_results == 0: max_results = 100 search_method = T.search_recent - count_method = T.counts_recent hide_progress = True if (outfile.name == "") else hide_progress - short_timespan = ( - abs(_time_delta(since_id, until_id, start_time, end_time).days) < 30 - ) - pbar = TimestampProgressBar( - since_id, until_id, start_time, end_time, disable=hide_progress - ) - - if not hide_progress and short_timespan: - try: - # Single request just for the total - count_lookup = next( - count_method(query, since_id, until_id, start_time, end_time, "day") - ) - lookup_total = count_lookup["meta"]["total_tweet_count"] - pbar = tqdm(disable=hide_progress, total=lookup_total) - except Exception as e: - log.error(f"Failed getting counts: {str(e)}") - click.echo( - click.style( - f"šŸ’” Failed to get tweet counts, but continuing to search. 
Check twarc.log for errors.", - fg="red", - ), - err=True, - ) - with pbar as progress: + with TimestampProgressBar( + since_id, until_id, start_time, end_time, disable=hide_progress + ) as progress: for result in search_method( query, since_id, until_id, start_time, end_time, max_results ): _write(result, outfile) - tweet_ids = [t["id"] for t in result.get("data", [])] log.info("archived %s", ",".join(tweet_ids)) - - if isinstance(pbar, TimestampProgressBar): - progress.update_with_result(result) - else: - progress.update(len(result["data"])) - + progress.update_with_result(result) count += len(result["data"]) if limit != 0 and count >= limit: # Display message when stopped early progress.desc = f"Set --limit of {limit} reached" - if isinstance(pbar, TimestampProgressBar): - progress.early_stop = True break + else: + progress.early_stop = False @twarc2.command("search") @@ -741,13 +710,20 @@ def timeline( count = 0 - pbar = tqdm(disable=hide_progress, total=3200) + pbar = tqdm + pbar_params = {"disable": hide_progress, "total": 3200} if use_search: - pbar = TimestampProgressBar( - since_id, until_id, start_time, end_time, disable=hide_progress - ) - - with pbar as progress: + pbar = TimestampProgressBar + pbar_params = { + "since_id": since_id, + "until_id": until_id, + "start_time": start_time, + "end_time": end_time, + "disable": hide_progress, + } + + with pbar(**pbar_params) as progress: + for result in tweets: _write(result, outfile) @@ -760,8 +736,10 @@ def timeline( if limit != 0 and count >= limit: # Display message when stopped early progress.desc = f"Set --limit of {limit} reached" - progress.early_stop = True break + else: + if isinstance(pbar, TimestampProgressBar): + progress.early_stop = False @twarc2.command("timelines") diff --git a/twarc/decorators2.py b/twarc/decorators2.py index c3bcecfc..687c82fd 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -266,7 +266,6 @@ def update_with_result(self, result): n = 
_snowflake2millis(int(newest_id)) - _snowflake2millis(int(oldest_id)) self.update(n) self.tweet_count += len(result["data"]) - self.early_stop = False except Exception as e: log.error(f"Failed to update progress bar: {e}") From 79f98b07177b617c71ef17d3e88118114ec74969 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 03:43:34 +0100 Subject: [PATCH 34/43] timeline_tweets progress bar based on user counts --- twarc/command2.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 2cad4f35..7ff310a5 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -711,8 +711,12 @@ def timeline( count = 0 pbar = tqdm - pbar_params = {"disable": hide_progress, "total": 3200} + user = T._ensure_user(user_id) + pbar_params = {"disable": hide_progress, "total": user["public_metrics"]["tweet_count"]} + if use_search: + if start_time is None and since_id is None: + start_time = datetime.datetime.strptime(user["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ') pbar = TimestampProgressBar pbar_params = { "since_id": since_id, @@ -723,12 +727,11 @@ def timeline( } with pbar(**pbar_params) as progress: - for result in tweets: _write(result, outfile) count += len(result["data"]) - if use_search and isinstance(pbar, TimestampProgressBar): + if use_search and isinstance(progress, TimestampProgressBar): progress.update_with_result(result) else: progress.update(len(result["data"])) @@ -738,7 +741,7 @@ def timeline( progress.desc = f"Set --limit of {limit} reached" break else: - if isinstance(pbar, TimestampProgressBar): + if isinstance(progress, TimestampProgressBar): progress.early_stop = False From ea576e7c53d29c61e47b6588e6bb8a406e8f48ea Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 03:59:26 +0100 Subject: [PATCH 35/43] fix _timeline_tweets search not working when start_time was not set --- twarc/command2.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff 
--git a/twarc/command2.py b/twarc/command2.py index 7ff310a5..5759b885 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -696,18 +696,6 @@ def timeline( Retrieve recent tweets for the given user. """ - tweets = _timeline_tweets( - T, - use_search, - user_id, - since_id, - until_id, - start_time, - end_time, - exclude_retweets, - exclude_replies, - ) - count = 0 pbar = tqdm @@ -726,6 +714,18 @@ def timeline( "disable": hide_progress, } + tweets = _timeline_tweets( + T, + use_search, + user_id, + since_id, + until_id, + start_time, + end_time, + exclude_retweets, + exclude_replies, + ) + with pbar(**pbar_params) as progress: for result in tweets: _write(result, outfile) @@ -896,7 +896,7 @@ def _timeline_tweets( q += " -is:retweet" if exclude_replies and "-is:reply" not in q: q += " -is:reply" - tweets = T.search_all(q, since_id, until_id, start_time, end_time) + tweets = T.search_all(q, since_id, until_id, start_time, end_time, 100) else: tweets = T.timeline( user_id, From a31b162440633c92c9a54afb9cd6135c5cd61b57 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 04:10:31 +0100 Subject: [PATCH 36/43] black format --- twarc/command2.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 5759b885..54147d1d 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -700,11 +700,16 @@ def timeline( pbar = tqdm user = T._ensure_user(user_id) - pbar_params = {"disable": hide_progress, "total": user["public_metrics"]["tweet_count"]} + pbar_params = { + "disable": hide_progress, + "total": user["public_metrics"]["tweet_count"], + } if use_search: if start_time is None and since_id is None: - start_time = datetime.datetime.strptime(user["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ') + start_time = datetime.datetime.strptime( + user["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ" + ) pbar = TimestampProgressBar pbar_params = { "since_id": since_id, From 3059283a79b6dea49621cbd4d8b288b75e8b0527 Mon Sep 17 
00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 04:28:01 +0100 Subject: [PATCH 37/43] remove unused code --- twarc/command2.py | 1 - twarc/decorators2.py | 25 ------------------------- 2 files changed, 26 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 54147d1d..a757b3c1 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -28,7 +28,6 @@ cli_api_error, TimestampProgressBar, FileSizeProgressBar, - _time_delta, ) diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 687c82fd..7492b525 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -306,28 +306,3 @@ def _snowflake2millis(snowflake_id): def _millis2snowflake(ms): return (int(ms) - 1288834974657) << 22 - - -def _date2snowflake(dt): - return _millis2snowflake(_date2millis(dt)) - - -def _snowflake2date(snowflake_id): - return _millis2date(_snowflake2millis(snowflake_id)) - - -def _time_delta(since_id, until_id, start_time, end_time): - if start_time is None and since_id is None: - start_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( - days=7 - ) - if end_time is None and until_id is None: - end_time = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta( - seconds=30 - ) - - if since_id and until_id: - start_time = _millis2date(_snowflake2millis(since_id)) - end_time = _millis2date(_snowflake2millis(until_id)) - - return start_time - end_time From e2641958d44ecf550cfa30d0e5505b6df7a94c52 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 04:34:34 +0100 Subject: [PATCH 38/43] remove unused imports --- twarc/client2.py | 1 - twarc/command2.py | 2 -- twarc/decorators2.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index e63c7405..5bcbc5d3 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -5,7 +5,6 @@ """ import re -import ssl import json import time import logging diff --git a/twarc/command2.py b/twarc/command2.py index a757b3c1..cde6486e 100644 --- 
a/twarc/command2.py +++ b/twarc/command2.py @@ -2,7 +2,6 @@ The command line interfact to the Twitter v2 API. """ -import os import re import json import twarc @@ -15,7 +14,6 @@ from tqdm.auto import tqdm from datetime import timezone -from urllib.parse import quote from click_plugins import with_plugins from pkg_resources import iter_entry_points diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 7492b525..e7b30335 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -1,7 +1,6 @@ import os import time import click -import types import logging import requests @@ -9,7 +8,6 @@ import humanize from tqdm.auto import tqdm from functools import wraps -from collections import defaultdict log = logging.getLogger("twarc") From 84260b1c5c156b109c77054cd121d1a189fcf205 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 18:12:41 +0100 Subject: [PATCH 39/43] fix twarc2 timeline progress bar --- twarc/command2.py | 39 +++++++++++++++++++++++++++++---------- twarc/decorators2.py | 10 ++++++++-- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index cde6486e..2ebcdd77 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -26,6 +26,8 @@ cli_api_error, TimestampProgressBar, FileSizeProgressBar, + _millis2snowflake, + _date2millis, ) @@ -694,20 +696,28 @@ def timeline( """ count = 0 + user = T._ensure_user(user_id) # It's possible to skip this to optimize more - pbar = tqdm - user = T._ensure_user(user_id) - pbar_params = { - "disable": hide_progress, - "total": user["public_metrics"]["tweet_count"], - } + if use_search or (start_time or end_time) or (since_id or until_id): + pbar = TimestampProgressBar - if use_search: - if start_time is None and since_id is None: + # Infer start time as the user created time if not using ids + if start_time is None and (since_id is None and until_id is None): start_time = datetime.datetime.strptime( user["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ" ) - pbar = 
TimestampProgressBar + # Infer since_id as user created time if using ids + if start_time is None and since_id is None: + infer_id = _millis2snowflake( + _date2millis( + datetime.datetime.strptime( + user["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ" + ) + ) + ) + # Snowflake epoch is 1288834974657 so if older, just set it to "1" + since_id = infer_id if infer_id > 0 else 1 + pbar_params = { "since_id": since_id, "until_id": until_id, @@ -716,6 +726,13 @@ def timeline( "disable": hide_progress, } + else: + pbar = tqdm + pbar_params = { + "disable": hide_progress, + "total": user["public_metrics"]["tweet_count"], + } + tweets = _timeline_tweets( T, use_search, @@ -733,7 +750,7 @@ def timeline( _write(result, outfile) count += len(result["data"]) - if use_search and isinstance(progress, TimestampProgressBar): + if isinstance(progress, TimestampProgressBar): progress.update_with_result(result) else: progress.update(len(result["data"])) @@ -745,6 +762,8 @@ def timeline( else: if isinstance(progress, TimestampProgressBar): progress.early_stop = False + if not use_search and user["public_metrics"]["tweet_count"] > 3200: + progress.desc = f"API limit of 3200 reached" @twarc2.command("timelines") diff --git a/twarc/decorators2.py b/twarc/decorators2.py index e7b30335..917d6a64 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -232,15 +232,21 @@ def __init__(self, since_id, until_id, start_time, end_time, **kwargs): disable = False if "disable" not in kwargs else kwargs["disable"] kwargs["disable"] = disable - if start_time is None and since_id is None: + if start_time is None and (since_id is None and until_id is None): start_time = datetime.datetime.now( datetime.timezone.utc ) - datetime.timedelta(days=7) - if end_time is None and until_id is None: + if end_time is None and (since_id is None and until_id is None): end_time = datetime.datetime.now( datetime.timezone.utc ) - datetime.timedelta(seconds=30) + if since_id and not until_id: + until_id = 
_millis2snowflake(_date2millis(datetime.datetime.now(datetime.timezone.utc))) + + if until_id and not since_id: + since_id = 1 + total = ( _snowflake2millis(until_id) - _snowflake2millis(since_id) if (since_id and until_id) From 78c1034ca2cf97969cf8c00a4e9b635506a30f43 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 18:32:01 +0100 Subject: [PATCH 40/43] fix filesize progress bar - turn off for pipes --- twarc/command2.py | 12 ++++++------ twarc/decorators2.py | 10 +++++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/twarc/command2.py b/twarc/command2.py index 2ebcdd77..768cb28d 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -561,7 +561,7 @@ def hydrate(T, infile, outfile, hide_progress): """ Hydrate tweet ids. """ - with FileSizeProgressBar(infile, disable=hide_progress) as progress: + with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress: for result in T.tweet_lookup(infile): _write(result, outfile) tweet_ids = [t["id"] for t in result.get("data", [])] @@ -585,7 +585,7 @@ def users(T, infile, outfile, usernames, hide_progress): """ Get data for user ids or usernames. 
""" - with FileSizeProgressBar(infile, disable=hide_progress) as progress: + with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress: for result in T.user_lookup(infile, usernames): _write(result, outfile) if usernames: @@ -696,7 +696,7 @@ def timeline( """ count = 0 - user = T._ensure_user(user_id) # It's possible to skip this to optimize more + user = T._ensure_user(user_id) # It's possible to skip this to optimize more if use_search or (start_time or end_time) or (since_id or until_id): pbar = TimestampProgressBar @@ -826,7 +826,7 @@ def timelines( line_count = 0 seen = set() - with FileSizeProgressBar(infile, disable=hide_progress) as progress: + with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress: for line in infile: progress.update(len(line)) line_count += 1 @@ -1037,7 +1037,7 @@ def conversations( count = 0 stop = False - with FileSizeProgressBar(infile, disable=hide_progress) as progress: + with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress: for line in infile: progress.update(len(line)) conv_ids = [] @@ -1114,7 +1114,7 @@ def flatten(infile, outfile, hide_progress): ) return - with FileSizeProgressBar(infile, disable=hide_progress) as progress: + with FileSizeProgressBar(infile, outfile, disable=hide_progress) as progress: for line in infile: for tweet in ensure_flattened(json.loads(line)): _write(tweet, outfile, False) diff --git a/twarc/decorators2.py b/twarc/decorators2.py index 917d6a64..c454712c 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -178,10 +178,12 @@ class FileSizeProgressBar(tqdm): Overrides `disable` parameter if file is a pipe. 
""" - def __init__(self, infile, **kwargs): + def __init__(self, infile, outfile, **kwargs): disable = False if "disable" not in kwargs else kwargs["disable"] if infile is not None and (infile.name == ""): disable = True + if outfile is not None and (outfile.name == ""): + disable = True kwargs["disable"] = disable kwargs["unit"] = "B" kwargs["unit_scale"] = True @@ -242,8 +244,10 @@ def __init__(self, since_id, until_id, start_time, end_time, **kwargs): ) - datetime.timedelta(seconds=30) if since_id and not until_id: - until_id = _millis2snowflake(_date2millis(datetime.datetime.now(datetime.timezone.utc))) - + until_id = _millis2snowflake( + _date2millis(datetime.datetime.now(datetime.timezone.utc)) + ) + if until_id and not since_id: since_id = 1 From 2949aee68d3312cf6f503fd29a5e470482df1bd6 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 19:21:35 +0100 Subject: [PATCH 41/43] better mentions progress bar help messages --- twarc/command2.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/twarc/command2.py b/twarc/command2.py index 768cb28d..d2377a24 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -633,6 +633,12 @@ def mentions( for result in T.mentions(user_id, since_id, until_id, start_time, end_time): _write(result, outfile) progress.update(len(result["data"])) + else: + if progress.n > 800: + progress.desc = f"API limit reached with {progress.n} tweets" + progress.n = 800 + else: + progress.desc = f"Set limit reached with {progress.n} tweets" @twarc2.command("timeline") From 2d39b4519c60c4673d2971ff66daa1923d3c1e40 Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 19:22:03 +0100 Subject: [PATCH 42/43] better filesize progress message --- twarc/decorators2.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/twarc/decorators2.py b/twarc/decorators2.py index c454712c..edf41956 100644 --- a/twarc/decorators2.py +++ b/twarc/decorators2.py @@ -189,6 +189,9 @@ def __init__(self, infile, outfile, **kwargs): 
kwargs["unit_scale"] = True kwargs["unit_divisor"] = 1024 kwargs["miniters"] = 1 + kwargs[ + "bar_format" + ] = "{l_bar}{bar}| Processed {n_fmt}/{total_fmt} of input file [{elapsed}<{remaining}, {rate_fmt}{postfix}]" kwargs["total"] = os.stat(infile.name).st_size if not disable else 1 super().__init__(**kwargs) From 58198c883f78ad144c8ad3c185af1d2de41d18ad Mon Sep 17 00:00:00 2001 From: Igor Brigadir Date: Sat, 3 Jul 2021 19:27:03 +0100 Subject: [PATCH 43/43] following and follower progress tweak --- twarc/command2.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/twarc/command2.py b/twarc/command2.py index d2377a24..c4f31132 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -470,6 +470,9 @@ def followers(T, user, outfile, limit, hide_progress): user_id = None lookup_total = 0 + if outfile is not None and (outfile.name == ""): + hide_progress = True + if not hide_progress: target_user = T._ensure_user(user) user_id = target_user["id"] @@ -481,6 +484,7 @@ def followers(T, user, outfile, limit, hide_progress): count += len(result["data"]) progress.update(len(result["data"])) if limit != 0 and count >= limit: + progress.desc = f"Set --limit of {limit} reached" break @@ -506,6 +510,9 @@ def following(T, user, outfile, limit, hide_progress): user_id = None lookup_total = 0 + if outfile is not None and (outfile.name == ""): + hide_progress = True + if not hide_progress: target_user = T._ensure_user(user) user_id = target_user["id"] @@ -517,6 +524,7 @@ def following(T, user, outfile, limit, hide_progress): count += len(result["data"]) progress.update(len(result["data"])) if limit != 0 and count >= limit: + progress.desc = f"Set --limit of {limit} reached" break