Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sort_order parameter for search api #645

Merged
merged 4 commits into from
Jun 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/twarc2_en_us.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ leave off the `--start-time`:

twarc2 search --end-time 2014-07-24 '"eric garner"' tweets.jsonl

### Sort Order

By default, Twitter returns results ordered by publication date, with the newest tweets first.
To change this behavior, specify the `--sort-order` option.
It currently accepts `recency` (the default) or `relevancy`.
With `relevancy`, tweets are ordered by what Twitter determines to be the best results for your query.

## Searches

The `searches` command works like the [search](#search) command, but instead of taking a single query, it reads from a file containing many queries. You can use the same limit and time options as with a single search command, but they will be applied to every query.
Expand Down
5 changes: 3 additions & 2 deletions test_twarc2.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,13 @@ def test_sample():
assert count == 11


def test_search_recent():
@pytest.mark.parametrize("sort_order", ["recency", "relevancy"])
def test_search_recent(sort_order):

found_tweets = 0
pages = 0

for response_page in T.search_recent("#auspol"):
for response_page in T.search_recent("#auspol", sort_order=sort_order):
pages += 1
tweets = response_page["data"]
found_tweets += len(tweets)
Expand Down
14 changes: 13 additions & 1 deletion twarc/client2.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def _prepare_params(self, **kwargs):
)

# Any other parameters passed as is,
# these include backfill_minutes, next_token, pagination_token
# these include backfill_minutes, next_token, pagination_token, sort_order
params = {**params, **{k: v for k, v in kwargs.items() if v is not None}}

return params
Expand All @@ -201,6 +201,7 @@ def _search(
media_fields,
poll_fields,
place_fields,
sort_order,
next_token=None,
granularity=None,
sleep_between=0,
Expand All @@ -217,6 +218,7 @@ def _search(
start_time=start_time,
end_time=end_time,
next_token=next_token,
sort_order=sort_order
)

if granularity:
Expand Down Expand Up @@ -657,6 +659,7 @@ def search_recent(
poll_fields=None,
place_fields=None,
next_token=None,
sort_order=None,
):
"""
Search Twitter for the given query in the last seven days,
Expand All @@ -677,6 +680,8 @@ def search_recent(
Return all tweets before this time (UTC datetime).
max_results (int):
The maximum number of results per request. Max is 100.
sort_order (str):
Order tweets based on relevancy or recency.

Returns:
generator[dict]: a generator, dict for each paginated response.
Expand All @@ -696,6 +701,7 @@ def search_recent(
poll_fields=poll_fields,
place_fields=place_fields,
next_token=next_token,
sort_order=sort_order,
)

@requires_app_auth
Expand All @@ -714,6 +720,7 @@ def search_all(
poll_fields=None,
place_fields=None,
next_token=None,
sort_order=None,
):
"""
Search Twitter for the given query in the full archive,
Expand All @@ -735,6 +742,8 @@ def search_all(
Return all tweets before this time (UTC datetime).
max_results (int):
The maximum number of results per request. Max is 500.
sort_order (str):
Order tweets based on relevancy or recency.

Returns:
generator[dict]: a generator, dict for each paginated response.
Expand Down Expand Up @@ -762,6 +771,7 @@ def search_all(
place_fields=place_fields,
next_token=next_token,
sleep_between=1.05,
sort_order=sort_order,
)

@requires_app_auth
Expand Down Expand Up @@ -813,6 +823,7 @@ def counts_recent(
poll_fields=None,
place_fields=None,
granularity=granularity,
sort_order=None,
)

@requires_app_auth
Expand Down Expand Up @@ -867,6 +878,7 @@ def counts_all(
next_token=next_token,
granularity=granularity,
sleep_between=1.05,
sort_order=None,
)

def tweet_lookup(
Expand Down
41 changes: 41 additions & 0 deletions twarc/command2.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@ def _search(
media_fields,
poll_fields,
place_fields,
sort_order,
):
"""
Common function to Search for tweets.
Expand Down Expand Up @@ -281,6 +282,7 @@ def _search(
media_fields=media_fields,
poll_fields=poll_fields,
place_fields=place_fields,
sort_order=sort_order,
):
_write(result, outfile)
tweet_ids = [t["id"] for t in result.get("data", [])]
Expand Down Expand Up @@ -617,6 +619,11 @@ def command_line_verbose_options(f):


@twarc2.command("search")
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@command_line_search_options
@command_line_search_archive_options
@command_line_expansions_shortcuts
Expand Down Expand Up @@ -1290,6 +1297,11 @@ def mentions(T, user_id, outfile, hide_progress, **kwargs):
@command_line_expansions_options
@command_line_progressbar_option
@click.option("--limit", default=0, help="Maximum number of tweets to return")
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@click.argument("user_id", type=str)
@click.argument("outfile", type=click.File("w"), default="-")
@click.pass_obj
Expand All @@ -1307,6 +1319,7 @@ def timeline(
exclude_retweets,
exclude_replies,
hide_progress,
sort_order,
**kwargs,
):
"""
Expand Down Expand Up @@ -1363,6 +1376,7 @@ def timeline(
end_time=end_time,
exclude_retweets=exclude_retweets,
exclude_replies=exclude_replies,
sort_order=sort_order,
**kwargs,
)

Expand Down Expand Up @@ -1394,6 +1408,11 @@ def timeline(
default=0,
help="Maximum number of tweets to return per-timeline",
)
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@command_line_search_options
@command_line_timelines_options
@command_line_expansions_shortcuts
Expand All @@ -1408,6 +1427,7 @@ def timelines(
limit,
timeline_limit,
use_search,
sort_order,
hide_progress,
**kwargs,
):
Expand Down Expand Up @@ -1489,6 +1509,7 @@ def timelines(
tweets = _timeline_tweets(
T,
use_search=use_search,
sort_order=sort_order,
user_id=user,
**kwargs,
)
Expand Down Expand Up @@ -1516,6 +1537,7 @@ def _timeline_tweets(
end_time,
exclude_retweets,
exclude_replies,
sort_order,
**kwargs,
):
if use_search:
Expand All @@ -1530,6 +1552,7 @@ def _timeline_tweets(
until_id=until_id,
start_time=start_time,
end_time=end_time,
sort_order=sort_order,
**kwargs,
)
else:
Expand All @@ -1549,6 +1572,11 @@ def _timeline_tweets(
@twarc2.command("searches")
@command_line_search_options
@command_line_search_archive_options
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@click.option(
"--counts-only",
is_flag=True,
Expand Down Expand Up @@ -1591,6 +1619,7 @@ def searches(
granularity,
combine_queries,
hide_progress,
sort_order,
**kwargs,
):
"""
Expand Down Expand Up @@ -1641,6 +1670,7 @@ def searches(
kwargs.pop("media_fields", None)
kwargs.pop("poll_fields", None)
kwargs.pop("place_fields", None)
kwargs.pop("sort_order", None)
kwargs = {
**kwargs,
**{
Expand All @@ -1665,6 +1695,7 @@ def searches(
"start_time": start_time,
"end_time": end_time,
"max_results": max_results,
"sort_order": sort_order,
},
}

Expand Down Expand Up @@ -1768,6 +1799,11 @@ def searches(


@twarc2.command("conversation")
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@command_line_search_options
@command_line_search_archive_options
@command_line_expansions_shortcuts
Expand Down Expand Up @@ -1804,6 +1840,11 @@ def conversation(
default=0,
help="Maximum number of tweets to return per-conversation",
)
@click.option(
"--sort-order",
type=click.Choice(["recency", "relevancy"]),
help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")'
)
@command_line_search_options
@command_line_search_archive_options
@command_line_expansions_shortcuts
Expand Down