reduced internal _limit to increase test speed #87

Merged — 14 commits — Apr 15, 2024
2 changes: 1 addition & 1 deletion .github/workflows/lint-test.yaml
@@ -21,7 +21,7 @@ jobs:
- run: python -m pip install .[dev]
- uses: pre-commit/action@v3.0.1
- name: test
run: pytest
run: pytest --verbose
env:
SERPAPI_API_KEY: ${{ secrets.SERPAPI_API_KEY }}
SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -18,11 +18,11 @@ repos:
- id: trailing-whitespace
exclude: .gitignore
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.3.0
rev: 24.4.0
hooks:
- id: black-jupyter
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.6
rev: v0.3.7
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
1 change: 1 addition & 0 deletions dev-requirements.txt
@@ -1,3 +1,4 @@
pytest
pytest-timeout
pytest-timer
pre-commit
18 changes: 10 additions & 8 deletions paperscraper/lib.py
@@ -523,6 +523,7 @@ class RateLimits(float, Enum):

SEMANTIC_SCHOLAR = 90.0
GOOGLE_SCHOLAR = 30.0
# SEE: https://www.crossref.org/documentation/metadata-plus/#00343
CROSSREF = 30.0 # noqa: PIE796
SCRAPER = 30 / 60
FALLBACK_SLOW = 15 / 60
@@ -598,6 +599,9 @@ def make_url_params( # noqa: PLR0911
raise NotImplementedError


GOOGLE_SEARCH_PAGE_SIZE = 20


async def a_search_papers( # noqa: C901, PLR0912, PLR0915
query: str,
limit: int = 10,
@@ -666,7 +670,7 @@ async def a_search_papers( # noqa: C901, PLR0912, PLR0915
"q": query,
"api_key": os.environ["SERPAPI_API_KEY"],
"engine": "google_scholar",
"num": 20,
"num": GOOGLE_SEARCH_PAGE_SIZE,
"start": _offset,
# TODO - add offset and limit here # noqa: TD004
}
@@ -858,7 +862,8 @@ async def google2s2(
pdir=pdir,
_paths=paths, # type: ignore[arg-type]
_limit=_limit,
_offset=_offset + (20 if search_type == "google" else _limit),
_offset=_offset
+ (GOOGLE_SEARCH_PAGE_SIZE if search_type == "google" else _limit),
logger=logger,
year=year,
verbose=verbose,
@@ -878,7 +883,7 @@ async def a_gsearch_papers( # noqa: C901
pdir: str | os.PathLike = os.curdir,
_paths: dict[str | os.PathLike, dict[str, Any]] | None = None,
_offset: int = 0,
_limit: int = 20,
_limit: int = GOOGLE_SEARCH_PAGE_SIZE,
logger: logging.Logger | None = None,
year: str | None = None,
verbose: bool = False,
@@ -928,13 +933,10 @@ async def a_gsearch_papers( # noqa: C901
{str(k): v for k, v in _paths.items()} if _paths is not None else {}
)
scraper = scraper or default_scraper()
ssheader = get_header()
# add key to headers

# Shared rate limits here between gs/crossref
async with ThrottledClientSession(
headers=ssheader,
rate_limit=RateLimits.GOOGLE_SCHOLAR.value,
headers=get_header(),
rate_limit=RateLimits.GOOGLE_SCHOLAR.value, # Share rate limits between gs/crossref
) as session:
async with session.get(
url=endpoint,
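A minimal sketch of the paging arithmetic that the new GOOGLE_SEARCH_PAGE_SIZE constant centralizes, assuming Google Scholar pages stay fixed at 20 results; next_offset is a hypothetical helper for illustration, not a function in paperscraper:

```python
GOOGLE_SEARCH_PAGE_SIZE = 20  # results per Google Scholar (SerpAPI) page


def next_offset(offset: int, search_type: str, limit: int) -> int:
    # Google results arrive in fixed pages, so the next recursive call starts
    # one full page later; other engines advance by the number of results
    # requested on this pass (_limit).
    return offset + (GOOGLE_SEARCH_PAGE_SIZE if search_type == "google" else limit)


# Walking through three Google pages: offsets 0 -> 20 -> 40.
offsets = [0]
for _ in range(2):
    offsets.append(next_offset(offsets[-1], "google", limit=10))
assert offsets == [0, 20, 40]
```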
5 changes: 4 additions & 1 deletion paperscraper/scraper.py
@@ -93,7 +93,10 @@ async def scrape(
return True
except Exception:
if logger is not None:
logger.exception(f"\tScraper {scraper.name} failed.")
logger.exception(
f"\tScraper {scraper.name} failed on paper titled"
f" {paper.get('title')!r}."
)
scrape_result[scraper.name] = "failed"
if self.callback is not None:
await self.callback(paper["title"], scrape_result)
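A standalone sketch of what the enriched failure log buys, using a hypothetical scraper name and paper dict; logger.exception keeps the traceback while the title pins down which paper failed when many are scraped concurrently:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("paperscraper")

scraper_name = "example-scraper"  # hypothetical scraper name
paper = {"title": "Attention Is All You Need"}  # hypothetical paper dict

try:
    raise TimeoutError("simulated download failure")
except Exception:
    # Logs the message at ERROR level plus the active traceback.
    logger.exception(
        f"\tScraper {scraper_name} failed on paper titled {paper.get('title')!r}."
    )
```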
2 changes: 1 addition & 1 deletion paperscraper/utils.py
@@ -114,7 +114,7 @@ async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
if response.status not in self.SERVICE_LIMIT_REACHED_STATUS_CODES:
break
if retry_num < self._retry_count:
exp_backoff_with_jitter = 2**retry_num + random.random()
exp_backoff_with_jitter = 0.1 * (2**retry_num + random.random())
logger.warning(
f"Hit a service limit per status {response.status} with message"
f" {await response.text()}, sleeping"
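As a worked example of the rescaled backoff, assuming retry_num counts 1, 2, 3, ... (the actual starting index lives in ThrottledClientSession._request), the 0.1 factor turns multi-second waits into sub-second ones while keeping the exponential shape and jitter:

```python
import random


def backoff_seconds(retry_num: int) -> float:
    # Exponential backoff with jitter, scaled down 10x so retried requests in
    # tests sleep fractions of a second instead of several seconds.
    return 0.1 * (2**retry_num + random.random())


# Roughly 0.2-0.3 s, 0.4-0.5 s, and 0.8-0.9 s for retry_num = 1, 2, 3,
# versus roughly 2-3 s, 4-5 s, and 8-9 s before the 0.1 factor.
for retry_num in (1, 2, 3):
    print(f"retry {retry_num}: sleep ~{backoff_seconds(retry_num):.2f} s")
```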
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -109,7 +109,7 @@ filterwarnings = [
]
# Timeout in seconds for entire session. Default is None which means no timeout.
# Timeout is checked between tests, and will not interrupt a test in progress.
session_timeout = 1200
session_timeout = 2400
# List of directories that should be searched for tests when no specific directories,
# files or test ids are given in the command line when executing pytest from the rootdir
# directory. File system paths may use shell-style wildcards, including the recursive **
18 changes: 11 additions & 7 deletions tests/test_paperscraper.py
@@ -16,6 +16,7 @@
from paperscraper.exceptions import CitationConversionError, DOINotFoundError
from paperscraper.headers import get_header
from paperscraper.lib import (
GOOGLE_SEARCH_PAGE_SIZE,
RateLimits,
clean_upbibtex,
doi_to_bibtex,
@@ -197,14 +198,17 @@ async def test_google_search_papers(self) -> None:
)
assert len(papers) >= 3

async def test_high_limit(self) -> None:
async def test_with_multiple_google_search_pages(self) -> None:
papers = await paperscraper.a_search_papers(
"molecular dynamics", search_type="google", year="2019-2023", limit=25
"molecular dynamics",
search_type="google",
year="2019-2023",
limit=int(2.1 * GOOGLE_SEARCH_PAGE_SIZE),
)
assert len(papers) > 20
assert len(papers) > GOOGLE_SEARCH_PAGE_SIZE


class TestGS(IsolatedAsyncioTestCase):
class TestGSearch(IsolatedAsyncioTestCase):
async def test_gsearch(self):
query = "molecular dynamics"
papers = await paperscraper.a_gsearch_papers(query, year="2019-2023", limit=3)
@@ -220,11 +224,11 @@ async def test_gsearch(self):
assert paper["citationCount"]
assert paper["title"]

async def test_gsearch_high_limit(self) -> None:
async def test_with_multiple_google_search_pages(self) -> None:
papers = await paperscraper.a_gsearch_papers(
"molecular dynamics", year="2019-2023", limit=45
"molecular dynamics", year="2019-2023", limit=5, _limit=2
)
assert len(papers) > 20
assert len(papers) >= 5

async def test_no_link_doesnt_crash_us(self) -> None:
await paperscraper.a_gsearch_papers(