diff --git a/README.md b/README.md
index c0551a68..51fc74b8 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ If you prefer you can create a page on the [wiki](https://github.com/docnow/twar
If you are interested in adding functionality to twarc or fixing something that's broken, here are the steps to set up your development environment:
- git clone https://github.com/docnow/twarc
+ git clone https://github.com/docnow/twarc
cd twarc
pip install -r requirements.txt
diff --git a/docs/README.md b/docs/README.md
index 4961bc65..6cec7483 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,7 +1,7 @@
twarc
=====
-***For information about working with the Twitter V2 API please see the [twarc2](https://twarc-project.readthedocs.io/en/latest/twarc2/) page.***
+***For information about working with the Twitter V2 API please see the [twarc2](twarc2) page.***
---
diff --git a/docs/api/client.md b/docs/api/client.md
index dacadbfb..4ccaac08 100644
--- a/docs/api/client.md
+++ b/docs/api/client.md
@@ -2,3 +2,7 @@
::: twarc.client
handler: python
+
+
+
+
diff --git a/docs/api/client2.md b/docs/api/client2.md
index f9dcbb2d..21506c7a 100644
--- a/docs/api/client2.md
+++ b/docs/api/client2.md
@@ -2,3 +2,4 @@
::: twarc.client2
handler: python
+
diff --git a/docs/api/expansions.md b/docs/api/expansions.md
deleted file mode 100644
index 1e6c763d..00000000
--- a/docs/api/expansions.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# twarc.expansions
-
-::: twarc.expansions
- handler: python
diff --git a/docs/plugins.md b/docs/plugins.md
index e7905d6f..1d037504 100644
--- a/docs/plugins.md
+++ b/docs/plugins.md
@@ -16,7 +16,7 @@ add it to this list):
* [twarc-ids](https://pypi.org/project/twarc-ids/): extract tweet ids from tweets
* [twarc-videos](https://pypi.org/project/twarc-videos): extract videos from tweets
* [twarc-csv](https://pypi.org/project/twarc-csv/): export tweets to CSV
-* [twarc-timeline-archive](https://pypi.org/project/twarc-timeline-archive): routinely download tweet timelines for a list of users
+* [twarc-timelines](https://pypi.org/project/twarc-timelines): download tweet timelines for a list of users
## Writing a Plugin
diff --git a/docs/twitter-developer-access.md b/docs/twitter-developer-access.md
index 3f0e53fd..64a440dd 100644
--- a/docs/twitter-developer-access.md
+++ b/docs/twitter-developer-access.md
@@ -61,8 +61,6 @@ Now that you have your keys and tokens, you can start using the API. You may be
Be careful not to commit your keys to a public repository or otherwise make them visible to the public (for example, do not include them in a client-side JS script). Most apps will ask for an API Key and Secret, but note that "Consumer Key" is "API Key" and "Consumer Secret" is "API Secret".
-For Academic Access, there is only one endpoint that takes Bearer (App Only) authentication, so in most cases, the Bearer Token is all you need to share.
-
## Step 5: Next Steps
Install `twarc`, and run `twarc2 configure` to set it up.
diff --git a/mkdocs.yml b/mkdocs.yml
index 08662270..b5d0de77 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -28,7 +28,6 @@ nav:
- Library API:
- api/client.md
- api/client2.md
- - api/expansions.md
plugins:
- search
diff --git a/setup.py b/setup.py
index 64e0fb32..bc95d3f0 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
url='https://github.com/docnow/twarc',
author='Ed Summers',
author_email='ehs@pobox.com',
- packages=['twarc'],
+ packages=['twarc', ],
description='Archive tweets from the command line',
long_description=long_description,
long_description_content_type="text/markdown",
diff --git a/test_twarc2.py b/test_twarc2.py
index ca639cc4..9015b2a3 100644
--- a/test_twarc2.py
+++ b/test_twarc2.py
@@ -5,7 +5,6 @@
import dotenv
import pytest
import logging
-import pathlib
import datetime
import threading
@@ -16,7 +15,6 @@
access_token = os.environ.get('ACCESS_TOKEN')
access_token_secret = os.environ.get('ACCESS_TOKEN_SECRET')
-test_data = pathlib.Path('test-data')
logging.basicConfig(filename="test.log", level=logging.INFO)
# Implicitly test the constructor in application auth mode. This ensures that
@@ -294,7 +292,6 @@ def test_follows():
break
assert found >= 1000
-
def test_follows_username():
"""
Test followers and following by username.
@@ -333,20 +330,16 @@ def test_flattened():
found_referenced_tweets = False
event = threading.Event()
- for count, response in enumerate(T.sample(event=event)):
-
- # streaming api always returns a tweet at a time but flatten
- # will put these in a list so they can be treated uniformly
- tweets = twarc.expansions.flatten(response)
- assert len(tweets) == 1
- tweet = tweets[0]
+ for count, result in enumerate(T.sample(event=event)):
+ result = twarc.expansions.flatten(result)
+ tweet = result["data"]
assert "id" in tweet
logging.info("got sample tweet #%s %s", count, tweet["id"])
author_id = tweet["author_id"]
assert "author" in tweet
- assert tweet["author"]["id"] == author_id
+ assert result["data"]["author"]["id"] == author_id
if "in_reply_to_user_id" in tweet:
assert "in_reply_to_user" in tweet
@@ -369,11 +362,8 @@ def test_flattened():
assert tweet["entities"]["mentions"][0]["username"]
found_entities_mentions = True
- # need to ensure there are no errors because a referenced tweet
- # might be protected or deleted in which case it would not have been
- # included in the response and would not have been flattened
- if "errors" not in response and "referenced_tweets" in tweet:
- assert tweet["referenced_tweets"][0]["text"]
+ if "referenced_tweets" in tweet:
+ assert tweet["referenced_tweets"][0]["id"]
found_referenced_tweets = True
if found_geo and found_in_reply_to_user and found_attachments_media \
@@ -393,33 +383,18 @@ def test_flattened():
assert found_referenced_tweets, "found referenced tweets"
-def test_ensure_flattened():
- resp = next(T.search_recent('twitter'))
-
- # flatten a response
- flat1 = twarc.expansions.ensure_flattened(resp)
- assert isinstance(flat1, list)
- assert len(flat1) > 1
- assert 'author' in flat1[0]
-
- # flatten the flattened list
- flat2 = twarc.expansions.ensure_flattened(flat1)
- assert isinstance(flat2, list)
- assert len(flat2) == len(flat1)
- assert 'author' in flat2[0]
+def test_flatten_noop():
+ """
+ Flattening twice should be a no-op.
+ """
+ resp = next(T.tweet_lookup(range(1000, 2000)))
- # flatten a tweet object which will force it into a list
- flat3 = twarc.expansions.ensure_flattened(flat2[0])
- assert isinstance(flat3, list)
- assert len(flat3) == 1
+ flat1 = twarc.expansions.flatten(resp)
+ assert len(flat1) > 0
- with pytest.raises(ValueError):
- twarc.expansions.ensure_flattened({'fake': 'tweet'})
- with pytest.raises(ValueError):
- twarc.expansions.ensure_flattened([{'fake': 'tweet'}])
- with pytest.raises(ValueError):
- flat1[0].pop('author')
- twarc.expansions.ensure_flattened(flat1)
+ flat2 = twarc.expansions.flatten(flat1)
+ assert len(flat2) > 0
+ assert len(flat1) == len(flat2)
def test_twarc_metadata():
@@ -433,7 +408,7 @@ def test_twarc_metadata():
for response in T.tweet_lookup(range(1000, 2000)):
assert "__twarc" in response
- assert "__twarc" in twarc.expansions.flatten(response)[0]
+ assert "__twarc" in twarc.expansions.flatten(response)
# Without metadata
T.metadata = False
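A note on what this test exercises: twarc attaches a "__twarc" stanza recording how and when data was collected, and the client's metadata attribute toggles it. A small usage sketch, assuming a configured client with a placeholder token:

    from twarc import Twarc2

    T = Twarc2(bearer_token='XXXX')  # placeholder token
    response = next(T.tweet_lookup(range(1000, 2000)))
    assert '__twarc' in response     # attached by default

    T.metadata = False
    response = next(T.tweet_lookup(range(1000, 2000)))
    assert '__twarc' not in response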
diff --git a/twarc/__init__.py b/twarc/__init__.py
index e3773bc6..38e546e8 100644
--- a/twarc/__init__.py
+++ b/twarc/__init__.py
@@ -1,4 +1,3 @@
from .client import Twarc
from .client2 import Twarc2
from .version import version
-from .expansions import ensure_flattened
diff --git a/twarc/__main__.py b/twarc/__main__.py
deleted file mode 100644
index 85497b8c..00000000
--- a/twarc/__main__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from twarc.command2 import twarc2
-
-if __name__ == "__main__":
- twarc2(prog_name="python -m twarc2")
-
diff --git a/twarc/client2.py b/twarc/client2.py
index 563a28f3..2ee2f29f 100644
--- a/twarc/client2.py
+++ b/twarc/client2.py
@@ -7,11 +7,10 @@
import re
import ssl
import json
-import time
import logging
-import datetime
import requests
import datetime
+import time
from oauthlib.oauth2 import BackendApplicationClient
from requests.exceptions import ConnectionError
@@ -25,8 +24,6 @@
log = logging.getLogger("twarc")
-TWITTER_EPOCH = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)
-
class Twarc2:
"""
@@ -136,19 +133,17 @@ def _search(
count += len(response['data'])
yield response
- else:
- log.info(f'Retrieved an empty page of results.')
-
- # Calculate the amount of time to sleep, accounting for any
- # processing time used by the rest of the application.
- # This is to satisfy the 1 request / 1 second rate limit
- # on the search/all endpoint.
- time.sleep(
- max(0, sleep_between - (time.monotonic() - made_call))
- )
- made_call = time.monotonic()
+ # Calculate the amount of time to sleep, accounting for any
+ # processing time used by the rest of the application.
+ # This is to satisfy the 1 request / 1 second rate limit
+ # on the search/all endpoint.
- log.info(f'No more results for search {query}.')
+ time.sleep(
+ max(0, sleep_between - (time.monotonic() - made_call))
+ )
+ made_call = time.monotonic()
+ else:
+ log.info('no more results for search')
def search_recent(
self, query, since_id=None, until_id=None, start_time=None,
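The sleep arithmetic in the block above is the whole rate-limit strategy for search/all: subtract the time spent processing the last page from the 1.05 second interval and sleep only for the remainder. A standalone sketch of that pacing (not twarc's actual helper):

    import time

    sleep_between = 1.05  # search/all allows roughly one request per second
    made_call = time.monotonic()
    for _ in range(3):
        ...  # stand-in for fetching and processing a page of results
        # sleep only for whatever part of the interval is still unused
        time.sleep(max(0, sleep_between - (time.monotonic() - made_call)))
        made_call = time.monotonic()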
@@ -211,13 +206,6 @@ def search_all(
generator[dict]: a generator, dict for each paginated response.
"""
url = "https://api.twitter.com/2/tweets/search/all"
-
- # start time defaults to the beginning of Twitter to override the
- # default of the last month. Only do this if start_time is not already
- # specified and since_id isn't being used
- if start_time is None and since_id is None:
- start_time = TWITTER_EPOCH
-
return self._search(
url, query, since_id, until_id, start_time, end_time, max_results,
sleep_between=1.05
@@ -365,21 +353,6 @@ def sample(self, event=None, record_keepalive=False):
data = _append_metadata(data, resp.url)
yield data
- # Check for an operational disconnect error in the response
- if data.get("errors", []):
- for error in data["errors"]:
- if error.get("disconnect_type") == "OperationalDisconnect":
- log.info(
- "Received operational disconnect message: "
- "This stream has fallen too far behind in "
- "processing tweets. Some data may have been "
- "lost."
- )
- # Sleep briefly, then break this get call and
- # attempt to reconnect.
- time.sleep(5)
- break
-
except requests.exceptions.HTTPError as e:
errors += 1
log.error("caught http error %s on %s try", e, errors)
@@ -524,9 +497,7 @@ def _timeline(
count += len(response['data'])
yield response
else:
- log.info(f'Retrieved an empty page of results for timeline {user_id}')
-
- log.info(f'No more results for timeline {user_id}.')
+ log.info('no more results for timeline')
def timeline(
self, user, since_id=None, until_id=None, start_time=None,
@@ -735,15 +706,13 @@ def connect(self):
self.client.close()
if self.auth_type == "application" and self.bearer_token:
- log.info('creating HTTP session headers for app auth.')
- auth = f"Bearer {self.bearer_token}"
- log.debug('authorization: %s', auth)
+ log.info('Creating HTTP session headers for app auth.')
self.client = requests.Session()
- self.client.headers.update({"Authorization": auth})
+ self.client.headers.update(
+ {"Authorization": f"Bearer {self.bearer_token}"}
+ )
elif self.auth_type == "application":
- log.info('creating app auth client via OAuth2')
- log.debug('client_id: %s', self.consumer_key)
- log.debug('client_secret: %s', self.consumer_secret)
+ log.info('Creating app auth client via OAuth2')
client = BackendApplicationClient(client_id=self.consumer_key)
self.client = OAuth2Session(client=client)
self.client.fetch_token(
@@ -753,10 +722,6 @@ def connect(self):
)
else:
log.info('creating user auth client')
- log.debug('client_id: %s', self.consumer_key)
- log.debug('client_secret: %s', self.consumer_secret)
- log.debug('resource_owner_key: %s', self.access_token)
- log.debug('resource_owner_secret: %s', self.access_token_secret)
self.client = OAuth1Session(
client_key=self.consumer_key,
client_secret=self.consumer_secret,
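For app auth with a pre-supplied bearer token, connect() above reduces to a plain requests session carrying an Authorization header. A minimal sketch of just that piece, with bearer_token standing in for a real token:

    import requests

    def bearer_session(bearer_token):
        # every request made on this session carries the Bearer token
        session = requests.Session()
        session.headers.update({'Authorization': f'Bearer {bearer_token}'})
        return session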
diff --git a/twarc/command2.py b/twarc/command2.py
index fdb58c50..c0df34bf 100644
--- a/twarc/command2.py
+++ b/twarc/command2.py
@@ -19,12 +19,10 @@
from twarc.version import version
from twarc.handshake import handshake
-from twarc.config import ConfigProvider
from twarc.decorators import cli_api_error
-from twarc.expansions import ensure_flattened
+from twarc.expansions import flatten as flat
from click_config_file import configuration_option
-config_provider = ConfigProvider()
@with_plugins(iter_entry_points('twarc.plugins'))
@click.group()
@@ -44,26 +42,23 @@
show_default=True,
)
@click.option('--log', default='twarc.log')
-@click.option('--verbose', is_flag=True, default=False)
@click.option('--metadata/--no-metadata', default=True, show_default=True,
help="Include/don't include metadata about when and how data was collected.")
-@configuration_option(cmd_name='twarc', config_file_name='config', provider=config_provider)
+@configuration_option(cmd_name='twarc')
@click.pass_context
def twarc2(
ctx, consumer_key, consumer_secret, access_token, access_token_secret, bearer_token,
- log, metadata, app_auth, verbose
+ log, metadata, app_auth
):
"""
Collect data from the Twitter V2 API.
"""
logging.basicConfig(
filename=log,
- level=logging.DEBUG if verbose else logging.INFO,
+ level=logging.INFO,
format="%(asctime)s %(levelname)s %(message)s"
)
- logging.info("using config %s", config_provider.file_path)
-
if bearer_token or (consumer_key and consumer_secret):
if app_auth and (bearer_token or (consumer_key and consumer_secret)):
ctx.obj = twarc.Twarc2(
@@ -108,19 +103,15 @@ def configure(ctx):
"""
Set up your Twitter app keys.
"""
-
- config_file = config_provider.file_path
- logging.info('creating config file: %s', config_file)
-
- config_dir = pathlib.Path(config_file).parent
- if not config_dir.is_dir():
- logging.info('creating config directory: %s', config_dir)
- config_dir.mkdir(parents=True)
-
keys = handshake()
if keys is None:
raise click.ClickException("Unable to authenticate")
+ config_dir = pathlib.Path(click.get_app_dir('twarc'))
+ if not config_dir.is_dir():
+ config_dir.mkdir(parents=True)
+ config_file = config_dir / 'config'
+
config = configobj.ConfigObj(unrepr=True)
config.filename = config_file
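configure() persists the keys with configobj; unrepr=True means values are written and read back as Python literals. A round-trip sketch using a hypothetical path and key:

    import configobj

    config = configobj.ConfigObj(unrepr=True)
    config.filename = '/tmp/twarc-config'  # hypothetical location
    config['bearer_token'] = 'XXXX'        # placeholder value
    config.write()

    reloaded = configobj.ConfigObj('/tmp/twarc-config', unrepr=True)
    assert reloaded['bearer_token'] == 'XXXX'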
@@ -159,20 +150,22 @@ def get_version():
help='Match tweets sent prior to tweet id')
@click.option('--start-time',
type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')),
- help='Match tweets created after UTC time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04')
+ help='Match tweets created after time (ISO 8601/RFC 3339), e.g. 2021-01-01T12:31:04')
@click.option('--end-time',
type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')),
- help='Match tweets sent before UTC time (ISO 8601/RFC 3339)')
+ help='Match tweets sent before time (ISO 8601/RFC 3339)')
@click.option('--archive', is_flag=True, default=False,
help='Search the full archive (requires Academic Research track)')
@click.option('--limit', default=0, help='Maximum number of tweets to save')
@click.option('--max-results', default=0, help='Maximum number of tweets per API response')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet')
@click.argument('query', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
def search(T, query, outfile, since_id, until_id, start_time, end_time, limit,
- max_results, archive):
+ max_results, archive, flatten):
"""
Search for tweets.
"""
@@ -184,6 +177,12 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit,
# default number of tweets per response 500 when not set otherwise
if max_results == 0:
max_results = 500
+
+ # if the user is searching the historical archive, the assumption is
+ # that they want to search everything, not just the previous month
+ # (the API default): https://github.com/DocNow/twarc/issues/434
+ if start_time is None and since_id is None:
+ start_time = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)
else:
if max_results == 0:
max_results = 100
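The default above is easiest to see in isolation: an archive search with no explicit start_time or since_id is widened to cover everything since Twitter's founding, because the API would otherwise return only the last month. A sketch of that rule:

    import datetime

    TWITTER_EPOCH = datetime.datetime(2006, 3, 21, tzinfo=datetime.timezone.utc)

    def effective_start_time(start_time=None, since_id=None):
        # widen the window only when the caller has not constrained it
        if start_time is None and since_id is None:
            return TWITTER_EPOCH
        return start_time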
@@ -191,19 +190,21 @@ def search(T, query, outfile, since_id, until_id, start_time, end_time, limit,
for result in search_method(query, since_id, until_id, start_time, end_time,
max_results):
- _write(result, outfile)
+ _write(result, outfile, flatten)
count += len(result['data'])
if limit != 0 and count >= limit:
break
@twarc2.command('tweet')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet')
@click.option('--pretty', is_flag=True, default=False,
help='Pretty print the JSON')
@click.argument('tweet_id', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def tweet(T, tweet_id, outfile, pretty):
+def tweet(T, tweet_id, outfile, flatten, pretty):
"""
Look up a tweet using its tweet id or URL.
"""
@@ -212,23 +213,25 @@ def tweet(T, tweet_id, outfile, pretty):
if not re.match(r'^\d+$', tweet_id):
click.echo(click.style("Please enter a tweet URL or ID", fg="red"), err=True)
result = next(T.tweet_lookup([tweet_id]))
- _write(result, outfile, pretty=pretty)
+ _write(result, outfile, flatten, pretty=pretty)
@twarc2.command('followers')
@click.option('--limit', default=0, help='Maximum number of followers to save')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with users, and one line per user')
@click.argument('user', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def followers(T, user, outfile, limit):
+def followers(T, user, outfile, limit, flatten):
"""
Get the followers for a given user.
"""
count = 0
for result in T.followers(user):
- _write(result, outfile)
+ _write(result, outfile, flatten)
count += len(result['data'])
if limit != 0 and count >= limit:
break
@@ -236,18 +239,20 @@ def followers(T, user, outfile, limit):
@twarc2.command('following')
@click.option('--limit', default=0, help='Maximum number of friends to save')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with users, and one line per user')
@click.argument('user', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def following(T, user, outfile, limit):
+def following(T, user, outfile, limit, flatten):
"""
Get the users who are following a given user.
"""
count = 0
for result in T.following(user):
- _write(result, outfile)
+ _write(result, outfile, flatten)
count += len(result['data'])
if limit != 0 and count >= limit:
break
@@ -255,10 +260,12 @@ def following(T, user, outfile, limit):
@twarc2.command('sample')
@click.option('--limit', default=0, help='Maximum number of tweets to save')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet.')
@click.argument('outfile', type=click.File('a+'), default='-')
@click.pass_obj
@cli_api_error
-def sample(T, outfile, limit):
+def sample(T, flatten, outfile, limit):
"""
Fetch tweets from the sample stream.
"""
@@ -269,35 +276,38 @@ def sample(T, outfile, limit):
count += 1
if limit != 0 and count >= limit:
event.set()
- _write(result, outfile)
+ _write(result, outfile, flatten)
@twarc2.command('hydrate')
@click.argument('infile', type=click.File('r'), default='-')
@click.argument('outfile', type=click.File('w'), default='-')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet.')
@click.pass_obj
@cli_api_error
-def hydrate(T, infile, outfile):
+def hydrate(T, infile, outfile, flatten):
"""
Hydrate tweet ids.
"""
for result in T.tweet_lookup(infile):
- _write(result, outfile)
+ _write(result, outfile, flatten)
@twarc2.command('users')
@click.option('--usernames', is_flag=True, default=False)
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet.')
@click.argument('infile', type=click.File('r'), default='-')
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def users(T, infile, outfile, usernames):
+def users(T, infile, outfile, usernames, flatten):
"""
Get data for user ids or usernames.
"""
for result in T.user_lookup(infile, usernames):
- _write(result, outfile)
-
+ _write(result, outfile, flatten)
@twarc2.command('mentions')
@click.option('--since-id', type=int,
@@ -310,20 +320,20 @@ def users(T, infile, outfile, usernames):
@click.option('--end-time',
type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')),
help='Match tweets sent before time (ISO 8601/RFC 3339)')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet')
@click.argument('user_id', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time):
+def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time, flatten):
"""
Retrieve the most recent tweets mentioning the given user.
"""
for result in T.mentions(user_id, since_id, until_id, start_time, end_time):
- _write(result, outfile)
-
+ _write(result, outfile, flatten)
@twarc2.command('timeline')
-@click.option('--limit', default=0, help='Maximum number of tweets to return')
@click.option('--since-id', type=int,
help='Match tweets sent after tweet id')
@click.option('--until-id', type=int,
@@ -334,183 +344,18 @@ def mentions(T, user_id, outfile, since_id, until_id, start_time, end_time):
@click.option('--end-time',
type=click.DateTime(formats=('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S')),
help='Match tweets sent before time (ISO 8601/RFC 3339)')
-@click.option('--use-search', is_flag=True, default=False,
- help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet')
@click.argument('user_id', type=str)
@click.argument('outfile', type=click.File('w'), default='-')
@click.pass_obj
@cli_api_error
-def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time,
- use_search, limit):
+def timeline(T, user_id, outfile, since_id, until_id, start_time, end_time, flatten):
"""
- Retrieve recent tweets for the given user.
+ Retrieve the 3200 most recent tweets for the given user.
"""
-
- if use_search:
- q = f'from:{user_id}'
- tweets = T.search_all(q, since_id, until_id, start_time, end_time)
- else:
- tweets = T.timeline(user_id, since_id, until_id, start_time, end_time)
-
- count = 0
- for result in tweets:
- _write(result, outfile)
-
- count += len(result['data'])
- if limit != 0 and count >= limit:
- break
-
-
-@twarc2.command('timelines')
-@click.option('--limit', default=0, help='Maximum number of tweets to return')
-@click.option('--timeline-limit', default=0,
- help='Maximum number of tweets to return per-timeline')
-@click.option('--use-search', is_flag=True, default=False,
- help='Use the search/all API endpoint which is not limited to the last 3200 tweets, but requires Academic Product Track access.')
-@click.argument('infile', type=click.File('r'), default='-')
-@click.argument('outfile', type=click.File('w'), default='-')
-@click.pass_obj
-def timelines(T, infile, outfile, limit, timeline_limit, use_search):
- """
- Fetch the timelines of every user in an input source of tweets. If
- the input is a line oriented text file of user ids or usernames that will
- be used instead.
- """
- total_count = 0
- seen = set()
- for line in infile:
- line = line.strip()
- if line == "":
- continue
-
- users = []
- try:
- data = ensure_flattened(json.loads(line))
- users = set([t['author']['id'] for t in ensure_flattened(data)])
- except json.JSONDecodeError:
- users = set([line])
- except ValueError:
- users = set([line])
-
- for user in users:
-
- # only process a given user once
- if user in seen:
- continue
- seen.add(user)
-
- # which api endpoint to use
- if use_search and since_id:
- tweets = T.search_all(f'from:{user}', since_id=since_id)
- elif use_search:
- tweets = T.search_all(f'from:{user}')
- else:
- tweets = T.timeline(user)
-
- timeline_count = 0
- for response in tweets:
- _write(response, outfile)
-
- timeline_count += len(response['data'])
- if timeline_limit != 0 and timeline_count >= timeline_limit:
- break
-
- total_count += len(response['data'])
- if limit != 0 and total_count >= limit:
- return
-
-
-@twarc2.command('conversation')
-@click.option('--archive', is_flag=True, default=False,
- help='Search the full archive (requires Academic Research track)')
-@click.argument('tweet_id', type=str)
-@click.argument('outfile', type=click.File('w'), default='-')
-@click.pass_obj
-@cli_api_error
-def conversation(T, tweet_id, archive, outfile):
- """
- Retrieve a conversation thread using the tweet id.
- """
- q = f'conversation_id:{tweet_id}'
- if archive:
- search = T.search_all(q)
- else:
- search = T.search_recent(q)
- for resp in search:
- _write(resp, outfile)
-
-
-@twarc2.command('conversations')
-@click.option('--limit', default=0, help='Maximum number of tweets to return')
-@click.option('--conversation-limit', default=0,
- help='Maximum number of tweets to return per-conversation')
-@click.option('--archive', is_flag=True, default=False,
- help='Use the Academic Research project track access to the full archive')
-@click.argument('infile', type=click.File('r'), default='-')
-@click.argument('outfile', type=click.File('w'), default='-')
-@click.pass_obj
-@cli_api_error
-def conversations(T, infile, outfile, archive, limit, conversation_limit):
- """
- Fetch the full conversation threads that the input tweets are a part of.
- Alternatively the input can be a line oriented file of conversation ids.
- """
-
- # keep track of converstation ids that have been fetched so that they
- # aren't fetched twice
- seen = set()
-
- # use the archive or recent search?
- search = T.search_all if archive else T.search_recent
-
- count = 0
- stop = False
- for line in infile:
- conv_ids = []
-
- # stop will get set when the total tweet limit has been met
- if stop:
- break
-
- # get a specific conversation id
- line = line.strip()
- if re.match(r'^\d+$', line):
- if line in seen:
- continue
- conv_ids = [line]
-
- # generate all conversation_ids that are referenced in tweets input
- else:
- def f():
- for tweet in ensure_flattened(json.loads(line)):
- yield tweet.get('conversation_id')
- conv_ids = f()
-
- # output results while paying attention to the set limits
- conv_count = 0
-
- for conv_id in conv_ids:
-
- if conv_id in seen:
- logging.info(f'already fetched conversation_id {conv_id}')
- seen.add(conv_id)
-
- conv_count = 0
-
- logging.info(f'fetching conversation {conv_id}')
- for result in search(f'conversation_id:{conv_id}'):
- _write(result, outfile, False)
-
- count += len(result['data'])
- if limit != 0 and count >= limit:
- logging.info(f'reached tweet limit of {limit}')
- stop = True
- break
-
- conv_count += len(result['data'])
- if conversation_limit !=0 and conv_count >= conversation_limit:
- logging.info(f'reached conversation limit {conversation_limit}')
- break
+ for result in T.timeline(user_id, since_id, until_id, start_time, end_time):
+ _write(result, outfile, flatten)
@twarc2.command('flatten')
@@ -519,40 +364,39 @@ def f():
@cli_api_error
def flatten(infile, outfile):
"""
- "Flatten" tweets, or move expansions inline with tweet objects and ensure
- that each line of output is a single tweet.
+ "Flatten" tweets, or move expansions inline with tweet objects.
"""
if (infile.name == outfile.name):
click.echo(click.style(f"💔 Cannot flatten files in-place, specify a different output file!", fg='red'), err=True)
return
for line in infile:
- for tweet in ensure_flattened(json.loads(line)):
- _write(tweet, outfile, False)
+ result = json.loads(line)
+ _write(result, outfile, True)
@twarc2.command('stream')
@click.option('--limit', default=0, help='Maximum number of tweets to return')
+@click.option('--flatten', is_flag=True, default=False,
+ help='Include expansions inline with tweets, and one line per tweet')
@click.argument('outfile', type=click.File('a+'), default='-')
@click.pass_obj
@cli_api_error
-def stream(T, outfile, limit):
+def stream(T, flatten, outfile, limit):
"""
Fetch tweets from the live stream.
"""
event = threading.Event()
count = 0
- click.echo(click.style(f'Started a stream with rules:', fg='green'),
- err=True)
+ click.echo(click.style('Started a stream with rules:', fg='green'))
_print_stream_rules(T)
- click.echo(click.style(f'Writing to {outfile.name}\nCTRL+C to stop...',
- fg='green'), err=True)
+ click.echo(click.style(f'Writing to {outfile.name}\nCTRL+C to stop...', fg='green'))
for result in T.stream(event=event):
count += 1
if limit != 0 and count == limit:
logging.info(f'reached limit {limit}')
event.set()
- _write(result, outfile)
+ _write(result, outfile, flatten)
@twarc2.group()
@@ -579,7 +423,7 @@ def _print_stream_rules(T):
"""
result = T.get_stream_rules()
if 'data' not in result or len(result['data']) == 0:
- click.echo('No rules yet. Add them with ' + click.style('twarc2 stream-rules add', bold=True), err=True)
+ click.echo('No rules yet. Add them with ' + click.style('twarc2 stream-rules add', bold=True))
else:
count = 0
for rule in result['data']:
@@ -588,7 +432,7 @@ def _print_stream_rules(T):
s = rule['value']
if 'tag' in rule:
s += f" (tag: {rule['tag']})"
- click.echo(click.style(f'☑ {s}'), err=True)
+ click.echo(click.style(f'☑ {s}'))
count += 1
@@ -690,6 +534,17 @@ def _error_str(errors):
return click.style("\n".join(parts), fg="red")
-def _write(results, outfile, pretty=False):
+def _write(results, outfile, flatten, pretty=False):
indent = 2 if pretty else None
- click.echo(json.dumps(results, indent=indent), file=outfile)
+ if flatten and 'data' in results:
+ data = flat(results)['data']
+ if not isinstance(data, list):
+ data = [data]
+ for r in data:
+ click.echo(json.dumps(r, indent=indent), file=outfile)
+ else:
+ click.echo(json.dumps(results, indent=indent), file=outfile)
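In effect the new flatten argument to _write() decides the shape of each output line: a full API response without --flatten, a single tweet (or user) with expansions inlined with it. A simplified, self-contained sketch of that behavior, not the actual _write:

    import json
    from twarc.expansions import flatten

    def write_lines(responses, outfile, flatten_output=False):
        for response in responses:
            if flatten_output and 'data' in response:
                data = flatten(response)['data']
                for item in (data if isinstance(data, list) else [data]):
                    outfile.write(json.dumps(item) + '\n')
            else:
                outfile.write(json.dumps(response) + '\n')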
diff --git a/twarc/config.py b/twarc/config.py
deleted file mode 100644
index 3fe2096d..00000000
--- a/twarc/config.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import logging
-import configobj
-
-# Adapted from click_config_file.configobj_provider so that we can store the
-# file path that the config was loaded from in order to log it later.
-
-log = logging
-
-class ConfigProvider():
-
- def __init__(self):
- self.file_path = None
-
- def __call__(self, file_path, cmd_name):
- self.file_path = file_path
- return configobj.ConfigObj(file_path, unrepr=True)
diff --git a/twarc/expansions.py b/twarc/expansions.py
index 22a57c24..64ec6fcc 100644
--- a/twarc/expansions.py
+++ b/twarc/expansions.py
@@ -1,10 +1,8 @@
"""
This module contains a list of the known Twitter V2+ API expansions and fields
-for each expansion, and a function flatten() for "flattening" a result set,
-including all expansions inline.
+for each expansion, and a function for "flattening" a result set, including all
+expansions inline.
-ensure_flattened() can be used in tweet processing programs that need to make
-sure that data is flattened.
"""
from collections import defaultdict
@@ -117,13 +115,9 @@ def extract_includes(response, expansion, _id="id"):
def flatten(response):
"""
- Flatten an API response by moving all "included" entities inline with the
- tweets they are referenced from. flatten expects an entire page response
- from the API (data, includes, meta) and will raise a ValueError if what is
- passed in does not appear to be an API response. It will return a list of
- dictionaries where each dictionary represents a tweet. Empty objects will
- be returned for things that are missing in includes, which can happen when
- protected or delete users or tweets are referenced.
+ Flatten the response. Expects an entire page response from the API
+ (data, includes, meta). Returns empty objects for entities that are
+ missing from includes, which can happen when protected or deleted users
+ or tweets are referenced. Doesn't modify tweets, only adds extra data.
"""
# Users extracted both by id and by username for expanding mentions
@@ -197,60 +191,17 @@ def expand_payload(payload):
return payload
- # First expand the included tweets, before processing actual result tweets:
+ # First, expand the included tweets, before processing actual result tweets:
for included_id, included_tweet in extract_includes(response, "tweets").items():
includes_tweets[included_id] = expand_payload(included_tweet)
# Now flatten the list of tweets or an individual tweet
- tweets = []
if "data" in response:
- data = response['data']
-
- if isinstance(data, list):
- tweets = expand_payload(response["data"])
- elif isinstance(data, dict):
- tweets = [expand_payload(response["data"])]
+ response["data"] = expand_payload(response["data"])
# Add the __twarc metadata to each tweet if it's a result set
- if "__twarc" in response:
- for tweet in tweets:
+ if "__twarc" in response and isinstance(response["data"], list):
+ for tweet in response["data"]:
tweet["__twarc"] = response["__twarc"]
- else:
- raise ValueError(f'missing data stanza in response: {response}')
-
- return tweets
-
-
-def ensure_flattened(data):
- """
- Will ensure that the supplied data is "flattened". The input data can be a
- response from the Twitter API, a list of tweet dictionaries, or a single tweet
- dictionary. It will always return a list of tweet dictionaries. A ValueError
- will be thrown if the supplied data is not recognizable or it cannot be
- flattened.
-
- ensure_flattened is designed for use in twarc plugins and other tweet
- processing applications that want to operate on a stream of tweets, and
- examine included entities like users and tweets without hunting and
- pecking in the response data.
- """
- if isinstance(data, dict) and 'data' in data:
- return flatten(data)
-
- elif isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
- # if author is present it is already flattened
- if 'author' in data[0]:
- return data
- else:
- raise ValueError('unable to flatten list of tweets without original response data: {data}')
-
- elif isinstance(data, dict) and 'author' in data:
- # if author is present it is already flattened
- if 'author' in data:
- return [data]
- else:
- raise ValueError(f'unable to flatten tweet dictionary without original response data: {data}')
-
- else:
- raise ValueError(f'cannot flatten unrecognized data: {data}')
+ return response
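With this change flatten() returns the whole response dict with expansions moved inline, rather than a list of tweets. A hedged usage sketch, assuming a configured Twarc2 client and a placeholder token:

    from twarc import Twarc2
    from twarc.expansions import flatten

    T = Twarc2(bearer_token='XXXX')  # placeholder
    for response in T.search_recent('snow'):
        flat = flatten(response)
        for tweet in flat['data']:
            # the author is now inline instead of in response['includes']
            print(tweet['id'], tweet['author']['username'])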
diff --git a/twarc/handshake.py b/twarc/handshake.py
index cafacd0d..dce86c0a 100644
--- a/twarc/handshake.py
+++ b/twarc/handshake.py
@@ -6,6 +6,7 @@
from requests_oauthlib import OAuth1
from urllib.parse import parse_qs
+from getpass import getpass
def handshake():
@@ -15,7 +16,7 @@ def handshake():
access_token = ""
access_token_secret = ""
- bearer_token = input(
+ bearer_token = getpass(
"Please enter your Bearer Token (leave blank to skip to API key configuration): "
)
@@ -31,7 +32,7 @@ def handshake():
"Configure API keys and secrets."
consumer_key = input("Please enter your API key: ")
- consumer_secret = input("Please enter your API secret: ")
+ consumer_secret = getpass("Please enter your API secret: ")
# verify that the keys work to get the bearer token
url = "https://api.twitter.com/oauth2/token"
@@ -95,7 +96,7 @@ def handshake():
screen_name = credentials.get('screen_name')[0]
else:
access_token = input("Enter your Access Token: ")
- access_token_secret = input("Enter your Access Token Secret: ")
+ access_token_secret = getpass("Enter your Access Token Secret: ")
screen_name = "default"
return {
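The switch from input() to getpass() means secrets typed at the prompt are not echoed to the terminal or left in scrollback. The pattern in isolation:

    from getpass import getpass

    api_key = input('Please enter your API key: ')          # fine to echo
    api_secret = getpass('Please enter your API secret: ')  # typed without echo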
diff --git a/twarc/version.py b/twarc/version.py
index 4f58e967..9febec5a 100644
--- a/twarc/version.py
+++ b/twarc/version.py
@@ -1 +1 @@
-version = '2.1.8'
+version = '2.0.12'
diff --git a/utils/source.py b/utils/source.py
index 18d87c05..4f9d4284 100755
--- a/utils/source.py
+++ b/utils/source.py
@@ -5,6 +5,7 @@
Example usage:
utils/source.py tweets.jsonl > sources.html
"""
+from __future__ import print_function
import json
import fileinput
from collections import defaultdict
@@ -54,14 +55,14 @@
Twitter client sources
- created on the command line with twarc
+ created on the command line with twarc
{} | {} |
{} | {} |