lukasschwab · lukasschwab · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 17, 2023
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -12,7 +12,6 @@ jobs:
  runs-on: ubuntu-latest
  strategy:
  fail-fast: false
- max-parallel: 1
  matrix:
  python-version: ["3.7", "3.10", "3.11"]
  steps:

diff --git a/README.md b/README.md
@@ -36,19 +36,19 @@ A `Search` specifies a search of arXiv's database.
 arxiv.Search(
  query: str = "",
  id_list: List[str] = [],
- max_results: float = float('inf'),
+ max_results: int | None = None,
  sort_by: SortCriterion = SortCriterion.Relevance,
  sort_order: SortOrder = SortOrder.Descending
 )
 ```
 
 + `query`: an arXiv query string. Advanced query formats are documented in the [arXiv API User Manual](https://arxiv.org/help/api/user-manual#query_details).
 + `id_list`: list of arXiv record IDs (typically of the format `"0710.5765v1"`). See [the arXiv API User's Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list) for documentation of the interaction between `query` and `id_list`.
-+ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=float('inf')` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results.
++ `max_results`: The maximum number of results to be returned in an execution of this search. To fetch every result available, set `max_results=None` (default); to fetch up to 10 results, set `max_results=10`. The API's limit is 300,000 results.
 + `sort_by`: The sort criterion for results: `relevance`, `lastUpdatedDate`, or `submittedDate`.
 + `sort_order`: The sort order for results: `'descending'` or `'ascending'`.
 
-To fetch arXiv records matching a `Search`, use `search.results()` or `(Client).results(search)` to get a generator yielding `Result`s.
+To fetch arXiv records matching a `Search`, use `(Client).results(search)` to get a generator yielding `Result`s.
 
 #### Example: fetching results
 
@@ -63,7 +63,7 @@ search = arxiv.Search(
  sort_by = arxiv.SortCriterion.SubmittedDate
 )
 
-for result in search.results():
+for result in arxiv.Client().results(search):
  print(result.title)
 ```
 
@@ -72,16 +72,18 @@ Fetch and print the title of the paper with ID "1605.08386v1:"
 ```python
 import arxiv
 
+client = arxiv.Client()
 search = arxiv.Search(id_list=["1605.08386v1"])
-paper = next(search.results())
+
+paper = next(client.results(search))
 print(paper.title)
 ```
 
 ### Result
 
 <!-- TODO: improve this section. -->
 
-The `Result` objects yielded by `(Search).results()` include metadata about each paper and some helper functions for downloading their content.
+The `Result` objects yielded by `(Client).results()` include metadata about each paper and some helper functions for downloading their content.
 
 The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).
 
@@ -108,7 +110,7 @@ To download a PDF of the paper with ID "1605.08386v1," run a `Search` and then u
 ```python
 import arxiv
 
-paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
 # Download the PDF to the PWD with a default filename.
 paper.download_pdf()
 # Download the PDF to the PWD with a custom filename.
@@ -122,7 +124,7 @@ The same interface is available for downloading .tar.gz files of the paper sourc
 ```python
 import arxiv
 
-paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
 # Download the archive to the PWD with a default filename.
 paper.download_source()
 # Download the archive to the PWD with a custom filename.
@@ -133,14 +135,13 @@ paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")
 
 ### Client
 
-A `Client` specifies a strategy for fetching results from arXiv's API; it obscures pagination and retry logic.
-
-For most use cases the default client should suffice. You can construct it explicitly with `arxiv.Client()`, or use it via the `(Search).results()` method.
+A `Client` specifies a strategy for fetching results from arXiv's API; it encapsulates pagination and retry logic. For most use cases the default client should suffice.
 
 ```python
+# Default client properties.
 arxiv.Client(
  page_size: int = 100,
- delay_seconds: int = 3,
+ delay_seconds: float = 3.0,
  num_retries: int = 3
 )
 ```
@@ -151,14 +152,12 @@ arxiv.Client(
 
 #### Example: fetching results with a custom client
 
-`(Search).results()` uses the default client settings. If you want to use a client you've defined instead of the defaults, use `(Client).results(...)`:
-
 ```python
 import arxiv
 
 big_slow_client = arxiv.Client(
  page_size = 1000,
- delay_seconds = 10,
+ delay_seconds = 10.0,
  num_retries = 5
 )
 
@@ -173,9 +172,11 @@ To inspect this package's network behavior and API logic, configure an `INFO`-le
 
 ```pycon
 >>> import logging, arxiv
->>> logging.basicConfig(level=logging.INFO)
->>> paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
+>>> logging.basicConfig(level=logging.DEBUG)
+>>> client = arxiv.Client()
+>>> paper = next(client.results(arxiv.Search(id_list=["1605.08386v1"])))
 INFO:arxiv.arxiv:Requesting 100 results at offset 0
-INFO:arxiv.arxiv:Requesting page of results
-INFO:arxiv.arxiv:Got first page; 1 of inf results available
+INFO:arxiv.arxiv:Requesting page (first: True, try: 0): https://export.arxiv.org/api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100
+DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org:443
+DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979
 ```
diff --git a/arxiv/arxiv.py b/arxiv/arxiv.py
@@ -3,8 +3,10 @@
 
 import logging
 import time
+import itertools
 import feedparser
 import os
+import math
 import re
 import requests
 import warnings
@@ -422,12 +424,12 @@ class Search(object):
  Manual](https://arxiv.org/help/api/user-manual#search_query_and_id_list)
  for documentation of the interaction between `query` and `id_list`.
  """
- max_results: float
+ max_results: int | None
  """
  The maximum number of results to be returned in an execution of this
  search.
 
- To fetch every result available, set `max_results=float('inf')`.
+ To fetch every result available, set `max_results=None`.
  """
  sort_by: SortCriterion
  """The sort criterion for results."""
@@ -438,7 +440,7 @@ def __init__(
  self,
  query: str = "",
  id_list: List[str] = [],
- max_results: float = float("inf"),
+ max_results: int | None = None,
  sort_by: SortCriterion = SortCriterion.Relevance,
  sort_order: SortOrder = SortOrder.Descending,
  ):
@@ -447,7 +449,8 @@ def __init__(
  """
  self.query = query
  self.id_list = id_list
- self.max_results = max_results
+ # Handle deprecated v1 default behavior.
+ self.max_results = None if max_results == math.inf else max_results
  self.sort_by = sort_by
  self.sort_order = sort_order
 
@@ -479,23 +482,19 @@ def _url_args(self) -> Dict[str, str]:
  "sortOrder": self.sort_order.value,
  }
 
- def get(self) -> Generator[Result, None, None]:
- """
- **Deprecated** after 1.2.0; use `Search.results`.
- """
- warnings.warn(
- "The 'get' method is deprecated, use 'results' instead",
- DeprecationWarning,
- stacklevel=2,
- )
- return self.results()
-
  def results(self, offset: int = 0) -> Generator[Result, None, None]:
  """
  Executes the specified search using a default arXiv API client.
 
  For info on default behavior, see `Client.__init__` and `Client.results`.
+
+ **Deprecated** after 2.0.0; use `Client.results`.
  """
+ warnings.warn(
+ "The '(Search).results' method is deprecated, use 'Client.results' instead",
+ DeprecationWarning,
+ stacklevel=2,
+ )
  return Client().results(self, offset=offset)
 
 
@@ -511,7 +510,7 @@ class Client(object):
  """The arXiv query API endpoint format."""
  page_size: int
  """Maximum number of results fetched in a single API request."""
- delay_seconds: int
+ delay_seconds: float
  """Number of seconds to wait between API requests."""
  num_retries: int
  """Number of times to retry a failing API request."""
@@ -520,7 +519,7 @@ class Client(object):
  _session: requests.Session
 
  def __init__(
- self, page_size: int = 100, delay_seconds: int = 3, num_retries: int = 3
+ self, page_size: int = 100, delay_seconds: float = 3.0, num_retries: int = 3
  ):
  """
  Constructs an arXiv API client with the specified options.
@@ -548,17 +547,6 @@ def __repr__(self) -> str:
  repr(self.num_retries),
  )
 
- def get(self, search: Search) -> Generator[Result, None, None]:
- """
- **Deprecated** after 1.2.0; use `Client.results`.
- """
- warnings.warn(
- "The 'get' method is deprecated, use 'results' instead",
- DeprecationWarning,
- stacklevel=2,
- )
- return self.results(search)
-
  def results(self, search: Search, offset: int = 0) -> Generator[Result, None, None]:
  """
  Uses this client configuration to fetch one page of the search results
@@ -574,46 +562,37 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No
  For more on using generators, see
  [Generators](https://wiki.python.org/moin/Generators).
  """
+ limit = search.max_results - offset if search.max_results else None
+ if limit and limit < 0:
+ return iter(())
+ return itertools.islice(self._results(search, offset), limit)
+
+ def _results(
+ self, search: Search, offset: int = 0
+ ) -> Generator[Result, None, None]:
+ page_url = self._format_url(search, offset, self.page_size)
+ feed = self._parse_feed(page_url, first_page=True)
+ if not feed.entries:
+ logger.info("Got empty first page; stopping generation")
+ return
+ total_results = int(feed.feed.opensearch_totalresults)
+ logger.info(
+ "Got first page: %d of %d total results",
+ len(feed.entries),
+ total_results,
+ )
 
- # total_results may be reduced according to the feed's
- # opensearch:totalResults value.
- total_results = search.max_results
- first_page = True
- while offset < total_results:
- page_size = min(self.page_size, search.max_results - offset)
- logger.info("Requesting %d results at offset %d", page_size, offset)
- page_url = self._format_url(search, offset, page_size)
- feed = self._parse_feed(page_url, first_page=first_page)
- if first_page:
- # NOTE: this is an ugly fix for a known bug. The totalresults
- # value is set to 1 for results with zero entries. If that API
- # bug is fixed, we can remove this conditional and always set
- # `total_results = min(...)`.
- if len(feed.entries) == 0:
- logger.info("Got empty first page; stopping generation")
- total_results = 0
- else:
- total_results = min(
- total_results, int(feed.feed.opensearch_totalresults)
- )
- logger.info(
- "Got first page: %d of %d total results",
- total_results,
- search.max_results
- if search.max_results != float("inf")
- else -1,
- )
- # Subsequent pages are not the first page.
- first_page = False
- # Update offset for next request: account for received results.
- offset += len(feed.entries)
- # Yield query results until page is exhausted.
+ while feed.entries:
  for entry in feed.entries:
  try:
  yield Result._from_feed_entry(entry)
  except Result.MissingFieldError as e:
  logger.warning("Skipping partial result: %s", e)
- continue
+ offset += len(feed.entries)
+ if offset >= total_results:
+ break
+ page_url = self._format_url(search, offset, self.page_size)
+ feed = self._parse_feed(page_url, first_page=False)
 
  def _format_url(self, search: Search, start: int, page_size: int) -> str:
  """
@@ -679,7 +658,7 @@ def __try_parse_feed(
  "Requesting page (first: %r, try: %d): %s", first_page, try_index, url
  )
 
- resp = self._session.get(url, headers={"user-agent": "arxiv.py/1.4.8"})
+ resp = self._session.get(url, headers={"user-agent": "arxiv.py/2.0.0"})
  self._last_request_dt = datetime.now()
  if resp.status_code != requests.codes.OK:
  raise HTTPError(url, try_index, resp.status_code)

diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup
 
-version = "1.4.8"
+version = "2.0.0"
 
 with open("README.md", "r") as fh:
  long_description = fh.read()

diff --git a/tests/test_api_bugs.py b/tests/test_api_bugs.py
@@ -5,7 +5,7 @@
 import unittest
 
 
-class TestClient(unittest.TestCase):
+class TestAPIBugs(unittest.TestCase):
  def test_missing_title(self):
  """
  Papers with the title "0" do not have a title element in the Atom feed.

diff --git a/tests/test_client.py b/tests/test_client.py
@@ -3,15 +3,9 @@
 import arxiv
 from datetime import datetime, timedelta
 from pytest import approx
-import time
 
 
 class TestClient(unittest.TestCase):
- def tearDown(self) -> None:
- # Bodge: sleep three seconds between tests to simulate a shared rate limit.
- time.sleep(3)
- return super().tearDown()
-
  def test_invalid_format_id(self):
  with self.assertRaises(arxiv.HTTPError):
  list(arxiv.Client(num_retries=0).results(arxiv.Search(id_list=["abc"])))
@@ -58,7 +52,7 @@ def test_query_page_count(self):
  "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=20&max_results=10",
  "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=30&max_results=10",
  "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=40&max_results=10",
- "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=5",
+ "https://export.arxiv.org/api/query?search_query=testing&id_list=&sortBy=relevance&sortOrder=descending&start=50&max_results=10",
  },
  )
 
@@ -79,14 +73,12 @@ def test_offset(self):
  self.assertListEqual(offset_above_max_results, [])
 
  def test_search_results_offset(self):
+ # NOTE: page size is irrelevant here.
+ client = arxiv.Client(page_size=15)
  search = arxiv.Search(query="testing", max_results=10)
- client = arxiv.Client()
-
- all_results = list(client.results(search, 0))
+ all_results = list(client.results(search, offset=0))
  self.assertEqual(len(all_results), 10)
 
- client.page_size = 5
-
  for offset in [0, 5, 9, 10, 11]:
  client_results = list(client.results(search, offset=offset))
  self.assertEqual(len(client_results), max(0, search.max_results - offset))
@@ -191,12 +183,12 @@ def test_sleep_between_errors(self, patched_time_sleep):
  self.assertEqual(patched_time_sleep.call_count, client.num_retries)
  patched_time_sleep.assert_has_calls(
  [
- call(approx(client.delay_seconds, rel=1e-3)),
+ call(approx(client.delay_seconds, abs=1e-2)),
  ]
  * client.num_retries
  )
 
- def get_code_client(code: int, delay_seconds=3, num_retries=3) -> arxiv.Client:
+ def get_code_client(code: int, delay_seconds=0.1, num_retries=3) -> arxiv.Client:
  """
  get_code_client returns an arxiv.Cient with HTTP requests routed to
  httpstat.us.