diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dc71098..02c67b1 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -12,7 +12,6 @@ jobs: runs-on: ubuntu-latest strategy: fail-fast: false - max-parallel: 1 matrix: python-version: ["3.7", "3.10", "3.11"] steps: diff --git a/README.md b/README.md index b54a1d6..5662ce2 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ A `Search` specifies a search of arXiv's database. arxiv.Search( query: str = "", id_list: List[str] = [], - max_results: float = float('inf'), + max_results: int | None = None, sort_by: SortCriterion = SortCriterion.Relevance, sort_order: SortOrder = SortOrder.Descending ) @@ -44,11 +44,11 @@ arxiv.Search( + `query`: an arXiv query string. Advanced query formats are documented in the [arXiv API User Manual](https://arxiv.org/help/api/user-manual#query_details). + `id_list`: list of arXiv record IDs (typically of the format `"0710.5765v1"`). See [the arXiv API User's Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) for documentation of the interaction between `query` and `id_list`. -+ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=float('inf')` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results. ++ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=None` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results. + `sort_by`: The sort criterion for results: `relevance`, `lastUpdatedDate`, or `submittedDate`. + `sort_order`: The sort order for results: `'descending'` or `'ascending'`. -To fetch arXiv records matching a `Search`, use `search.results()` or `(Client).results(search)` to get a generator yielding `Result`s. 
+To fetch arXiv records matching a `Search`, use `(Client).results(search)` to get a generator yielding `Result`s. #### Example: fetching results @@ -63,7 +63,7 @@ search = arxiv.Search( sort_by = arxiv.SortCriterion.SubmittedDate ) -for result in search.results(): +for result in arxiv.Client().results(search): print(result.title) ``` @@ -72,8 +72,10 @@ Fetch and print the title of the paper with ID "1605.08386v1:" ```python import arxiv +client = arxiv.Client() search = arxiv.Search(id_list=["1605.08386v1"]) -paper = next(search.results()) + +paper = next(client.results(search)) print(paper.title) ``` @@ -81,7 +83,7 @@ print(paper.title) -The `Result` objects yielded by `(Search).results()` include metadata about each paper and some helper functions for downloading their content. +The `Result` objects yielded by `(Client).results()` include metadata about each paper and some helper functions for downloading their content. The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). @@ -108,7 +110,7 @@ To download a PDF of the paper with ID "1605.08386v1," run a `Search` and then u ```python import arxiv -paper = next(arxiv.Search(id_list=["1605.08386v1"]).results()) +paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"]))) # Download the PDF to the PWD with a default filename. paper.download_pdf() # Download the PDF to the PWD with a custom filename. @@ -122,7 +124,7 @@ The same interface is available for downloading .tar.gz files of the paper sourc ```python import arxiv -paper = next(arxiv.Search(id_list=["1605.08386v1"]).results()) +paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"]))) # Download the archive to the PWD with a default filename. paper.download_source() # Download the archive to the PWD with a custom filename. 
@@ -133,14 +135,13 @@ paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz") ### Client -A `Client` specifies a strategy for fetching results from arXiv's API; it obscures pagination and retry logic. - -For most use cases the default client should suffice. You can construct it explicitly with `arxiv.Client()`, or use it via the `(Search).results()` method. +A `Client` specifies a strategy for fetching results from arXiv's API; it encapsulates pagination and retry logic. For most use cases the default client should suffice. ```python +# Default client properties. arxiv.Client( page_size: int = 100, - delay_seconds: int = 3, + delay_seconds: float = 3.0, num_retries: int = 3 ) ``` @@ -151,14 +152,12 @@ arxiv.Client( #### Example: fetching results with a custom client -`(Search).results()` uses the default client settings. If you want to use a client you've defined instead of the defaults, use `(Client).results(...)`: - ```python import arxiv big_slow_client = arxiv.Client( page_size = 1000, - delay_seconds = 10, + delay_seconds = 10.0, num_retries = 5 ) @@ -173,9 +172,11 @@ To inspect this package's network behavior and API logic, configure an `INFO`-le ```pycon >>> import logging, arxiv ->>> logging.basicConfig(level=logging.INFO) ->>> paper = next(arxiv.Search(id_list=["1605.08386v1"]).results()) +>>> logging.basicConfig(level=logging.DEBUG) +>>> client = arxiv.Client() +>>> paper = next(client.results(arxiv.Search(id_list=["1605.08386v1"]))) INFO:arxiv.arxiv:Requesting 100 results at offset 0 -INFO:arxiv.arxiv:Requesting page of results -INFO:arxiv.arxiv:Got first page; 1 of inf results available +INFO:arxiv.arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100 +DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443 +DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET 
/api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979 ``` diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py index f255aff..9985c69 100644 --- a/arxiv/arxiv.py +++ b/arxiv/arxiv.py @@ -3,8 +3,10 @@ import logging import time +import itertools import feedparser import os +import math import re import requests import warnings @@ -422,12 +424,12 @@ class Search(object): Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) for documentation of the interaction between `query` and `id_list`. """ - max_results: float + max_results: int | None """ The maximum number of results to be returned in an execution of this search. - To fetch every result available, set `max_results=float('inf')`. + To fetch every result available, set `max_results=None`. """ sort_by: SortCriterion """The sort criterion for results.""" @@ -438,7 +440,7 @@ def __init__( self, query: str = "", id_list: List[str] = [], - max_results: float = float("inf"), + max_results: int | None = None, sort_by: SortCriterion = SortCriterion.Relevance, sort_order: SortOrder = SortOrder.Descending, ): @@ -447,7 +449,8 @@ def __init__( """ self.query = query self.id_list = id_list - self.max_results = max_results + # Handle deprecated v1 default behavior. + self.max_results = None if max_results == math.inf else max_results self.sort_by = sort_by self.sort_order = sort_order @@ -479,23 +482,19 @@ def _url_args(self) -> Dict[str, str]: "sortOrder": self.sort_order.value, } - def get(self) -> Generator[Result, None, None]: - """ - **Deprecated** after 1.2.0; use `Search.results`. - """ - warnings.warn( - "The 'get' method is deprecated, use 'results' instead", - DeprecationWarning, - stacklevel=2, - ) - return self.results() - def results(self, offset: int = 0) -> Generator[Result, None, None]: """ Executes the specified search using a default arXiv API client. 
For info on default behavior, see `Client.__init__` and `Client.results`. + + **Deprecated** after 2.0.0; use `Client.results`. """ + warnings.warn( + "The '(Search).results' method is deprecated, use 'Client.results' instead", + DeprecationWarning, + stacklevel=2, + ) return Client().results(self, offset=offset) @@ -511,7 +510,7 @@ class Client(object): """The arXiv query API endpoint format.""" page_size: int """Maximum number of results fetched in a single API request.""" - delay_seconds: int + delay_seconds: float """Number of seconds to wait between API requests.""" num_retries: int """Number of times to retry a failing API request.""" @@ -520,7 +519,7 @@ class Client(object): _session: requests.Session def __init__( - self, page_size: int = 100, delay_seconds: int = 3, num_retries: int = 3 + self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3 ): """ Constructs an arXiv API client with the specified options. @@ -548,17 +547,6 @@ def __repr__(self) -> str: repr(self.num_retries), ) - def get(self, search: Search) -> Generator[Result, None, None]: - """ - **Deprecated** after 1.2.0; use `Client.results`. - """ - warnings.warn( - "The 'get' method is deprecated, use 'results' instead", - DeprecationWarning, - stacklevel=2, - ) - return self.results(search) - def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]: """ Uses this client configuration to fetch one page of the search results @@ -574,46 +562,37 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No For more on using generators, see [Generators](https://wiki.python.org/moin/Generators). 
""" + # NOTE: 0 is a valid limit; only None means "no limit". + unbounded = search.max_results is None + limit = None if unbounded else max(0, search.max_results - offset) + return itertools.islice(self._results(search, offset), limit) + + def _results( + self, search: Search, offset: int = 0 + ) -> Generator[Result, None, None]: + page_url = self._format_url(search, offset, self.page_size) + feed = self._parse_feed(page_url, first_page=True) + if not feed.entries: + logger.info("Got empty first page; stopping generation") + return + total_results = int(feed.feed.opensearch_totalresults) + logger.info( + "Got first page: %d of %d total results", + len(feed.entries), + total_results, + ) - # total_results may be reduced according to the feed's - # opensearch:totalResults value. - total_results = search.max_results - first_page = True - while offset < total_results: - page_size = min(self.page_size, search.max_results - offset) - logger.info("Requesting %d results at offset %d", page_size, offset) - page_url = self._format_url(search, offset, page_size) - feed = self._parse_feed(page_url, first_page=first_page) - if first_page: - # NOTE: this is an ugly fix for a known bug. The totalresults - # value is set to 1 for results with zero entries. If that API - # bug is fixed, we can remove this conditional and always set - # `total_results = min(...)`. - if len(feed.entries) == 0: - logger.info("Got empty first page; stopping generation") - total_results = 0 - else: - total_results = min( - total_results, int(feed.feed.opensearch_totalresults) - ) - logger.info( - "Got first page: %d of %d total results", - total_results, - search.max_results - if search.max_results != float("inf") - else -1, - ) - # Subsequent pages are not the first page. - first_page = False - # Update offset for next request: account for received results. - offset += len(feed.entries) - # Yield query results until page is exhausted. 
+ while feed.entries: for entry in feed.entries: try: yield Result._from_feed_entry(entry) except Result.MissingFieldError as e: logger.warning("Skipping partial result: %s", e) - continue + offset += len(feed.entries) + if offset >= total_results: + break + page_url = self._format_url(search, offset, self.page_size) + feed = self._parse_feed(page_url, first_page=False) def _format_url(self, search: Search, start: int, page_size: int) -> str: """ @@ -679,7 +658,7 @@ def __try_parse_feed( "Requesting page (first: %r, try: %d): %s", first_page, try_index, url ) - resp = self._session.get(url, headers={"user-agent": "arxiv.py/1.4.8"}) + resp = self._session.get(url, headers={"user-agent": "arxiv.py/2.0.0"}) self._last_request_dt = datetime.now() if resp.status_code != requests.codes.OK: raise HTTPError(url, try_index, resp.status_code) diff --git a/setup.py b/setup.py index 76e6688..53edd36 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from setuptools import setup -version = "1.4.8" +version = "2.0.0" with open("README.md", "r") as fh: long_description = fh.read() diff --git a/tests/test_api_bugs.py b/tests/test_api_bugs.py index cf8c441..0ee0c50 100644 --- a/tests/test_api_bugs.py +++ b/tests/test_api_bugs.py @@ -5,7 +5,7 @@ import unittest -class TestClient(unittest.TestCase): +class TestAPIBugs(unittest.TestCase): def test_missing_title(self): """ Papers with the title "0" do not have a title element in the Atom feed. diff --git a/tests/test_client.py b/tests/test_client.py index b398707..2614aba 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -3,15 +3,9 @@ import arxiv from datetime import datetime, timedelta from pytest import approx -import time class TestClient(unittest.TestCase): - def tearDown(self) -> None: - # Bodge: sleep three seconds between tests to simulate a shared rate limit. 
- time.sleep(3) - return super().tearDown() - def test_invalid_format_id(self): with self.assertRaises(arxiv.HTTPError): list(arxiv.Client(num_retries=0).results(arxiv.Search(id_list=["abc"]))) @@ -58,7 +52,7 @@ def test_query_page_count(self): "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=20&max_results=10", "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=30&max_results=10", "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=40&max_results=10", - "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=5", + "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=10", }, ) @@ -79,14 +73,12 @@ def test_offset(self): self.assertListEqual(offset_above_max_results, []) def test_search_results_offset(self): + # NOTE: page size is irrelevant here. 
+ client = arxiv.Client(page_size=15) search = arxiv.Search(query="testing", max_results=10) - client = arxiv.Client() - - all_results = list(client.results(search, 0)) + all_results = list(client.results(search, offset=0)) self.assertEqual(len(all_results), 10) - client.page_size = 5 - for offset in [0, 5, 9, 10, 11]: client_results = list(client.results(search, offset=offset)) self.assertEqual(len(client_results), max(0, search.max_results - offset)) @@ -191,12 +183,12 @@ def test_sleep_between_errors(self, patched_time_sleep): self.assertEqual(patched_time_sleep.call_count, client.num_retries) patched_time_sleep.assert_has_calls( [ - call(approx(client.delay_seconds, rel=1e-3)), + call(approx(client.delay_seconds, abs=1e-2)), ] * client.num_retries ) - def get_code_client(code: int, delay_seconds=3, num_retries=3) -> arxiv.Client: + def get_code_client(code: int, delay_seconds=0.1, num_retries=3) -> arxiv.Client: """ get_code_client returns an arxiv.Cient with HTTP requests routed to httpstat.us. diff --git a/tests/test_result.py b/tests/test_result.py index d2d5f32..739e91e 100644 --- a/tests/test_result.py +++ b/tests/test_result.py @@ -6,10 +6,7 @@ class TestResult(unittest.TestCase): - def tearDown(self) -> None: - # Bodge: sleep three seconds between tests to simulate a shared rate limit. 
- time.sleep(3) - return super().tearDown() + client = arxiv.Client() def assert_nonempty(self, s): self.assertIsNotNone(s) @@ -41,13 +38,13 @@ def assert_valid_result(self, result: arxiv.Result): def test_result_shape(self): max_results = 100 search = arxiv.Search("testing", max_results=max_results) - results = [r for r in search.results()] + results = [r for r in self.client.results(search)] self.assertEqual(len(results), max_results) for result in results: self.assert_valid_result(result) def test_from_feed_entry(self): - feed = arxiv.Client()._parse_feed( + feed = self.client._parse_feed( "https://export.arxiv.org/api/query?search_query=testing" ) feed_entry = feed.entries[0] @@ -56,7 +53,7 @@ def test_from_feed_entry(self): def test_get_short_id(self): result_id = "1707.08567" - result = next(arxiv.Search(id_list=[result_id]).results()) + result = next(self.client.results(arxiv.Search(id_list=[result_id]))) got = result.get_short_id() self.assertTrue(got.startswith(result_id)) # Should be of form `1707.08567v1`. @@ -104,5 +101,5 @@ def test_eq(self): def test_legacy_ids(self): full_legacy_id = "quant-ph/0201082v1" - result = next(arxiv.Search(id_list=[full_legacy_id]).results()) + result = next(self.client.results(arxiv.Search(id_list=[full_legacy_id]))) self.assertEqual(result.get_short_id(), full_legacy_id)