diff --git a/.github/workflows/lint-test.yaml b/.github/workflows/lint-test.yaml
index 1bca64f..4e9490f 100644
--- a/.github/workflows/lint-test.yaml
+++ b/.github/workflows/lint-test.yaml
@@ -21,7 +21,7 @@ jobs:
       - run: python -m pip install .[dev]
       - uses: pre-commit/action@v3.0.1
       - name: test
-        run: pytest
+        run: pytest --verbose
         env:
           SERPAPI_API_KEY: ${{ secrets.SERPAPI_API_KEY }}
           SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ad42f85..47c55e4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,11 +18,11 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.3.0
+    rev: 24.4.0
    hooks:
      - id: black-jupyter
  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.6
+    rev: v0.3.7
    hooks:
      - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
diff --git a/dev-requirements.txt b/dev-requirements.txt
index a31854d..632b84b 100644
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -1,3 +1,4 @@
 pytest
 pytest-timeout
+pytest-timer
 pre-commit
diff --git a/paperscraper/lib.py b/paperscraper/lib.py
index d11f25a..c252a93 100644
--- a/paperscraper/lib.py
+++ b/paperscraper/lib.py
@@ -523,6 +523,7 @@ class RateLimits(float, Enum):

     SEMANTIC_SCHOLAR = 90.0
     GOOGLE_SCHOLAR = 30.0
+    # SEE: https://www.crossref.org/documentation/metadata-plus/#00343
     CROSSREF = 30.0  # noqa: PIE796
     SCRAPER = 30 / 60
     FALLBACK_SLOW = 15 / 60
@@ -598,6 +599,9 @@ def make_url_params(  # noqa: PLR0911
     raise NotImplementedError


+GOOGLE_SEARCH_PAGE_SIZE = 20
+
+
 async def a_search_papers(  # noqa: C901, PLR0912, PLR0915
     query: str,
     limit: int = 10,
@@ -666,7 +670,7 @@
         "q": query,
         "api_key": os.environ["SERPAPI_API_KEY"],
         "engine": "google_scholar",
-        "num": 20,
+        "num": GOOGLE_SEARCH_PAGE_SIZE,
         "start": _offset,
         # TODO - add offset and limit here  # noqa: TD004
     }
@@ -858,7 +862,8 @@ async def google2s2(
             pdir=pdir,
             _paths=paths,  # type: ignore[arg-type]
             _limit=_limit,
-            _offset=_offset + (20 if search_type == "google" else _limit),
+            _offset=_offset
+            + (GOOGLE_SEARCH_PAGE_SIZE if search_type == "google" else _limit),
             logger=logger,
             year=year,
             verbose=verbose,
@@ -878,7 +883,7 @@ async def a_gsearch_papers(  # noqa: C901
     pdir: str | os.PathLike = os.curdir,
     _paths: dict[str | os.PathLike, dict[str, Any]] | None = None,
     _offset: int = 0,
-    _limit: int = 20,
+    _limit: int = GOOGLE_SEARCH_PAGE_SIZE,
     logger: logging.Logger | None = None,
     year: str | None = None,
     verbose: bool = False,
@@ -928,13 +933,10 @@ async def a_gsearch_papers(  # noqa: C901
         {str(k): v for k, v in _paths.items()} if _paths is not None else {}
     )
     scraper = scraper or default_scraper()
-    ssheader = get_header()
-    # add key to headers
-    # Shared rate limits here between gs/crossref

     async with ThrottledClientSession(
-        headers=ssheader,
-        rate_limit=RateLimits.GOOGLE_SCHOLAR.value,
+        headers=get_header(),
+        rate_limit=RateLimits.GOOGLE_SCHOLAR.value,  # Share rate limits between gs/crossref
     ) as session:
         async with session.get(
             url=endpoint,
diff --git a/paperscraper/scraper.py b/paperscraper/scraper.py
index d6a04b2..77279ca 100644
--- a/paperscraper/scraper.py
+++ b/paperscraper/scraper.py
@@ -93,7 +93,10 @@ async def scrape(
                     return True
                 except Exception:
                     if logger is not None:
-                        logger.exception(f"\tScraper {scraper.name} failed.")
+                        logger.exception(
+                            f"\tScraper {scraper.name} failed on paper titled"
+                            f" {paper.get('title')!r}."
+                        )
                 scrape_result[scraper.name] = "failed"
             if self.callback is not None:
                 await self.callback(paper["title"], scrape_result)
diff --git a/paperscraper/utils.py b/paperscraper/utils.py
index 91e2c4c..df996c7 100644
--- a/paperscraper/utils.py
+++ b/paperscraper/utils.py
@@ -114,7 +114,7 @@ async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
             if response.status not in self.SERVICE_LIMIT_REACHED_STATUS_CODES:
                 break
             if retry_num < self._retry_count:
-                exp_backoff_with_jitter = 2**retry_num + random.random()
+                exp_backoff_with_jitter = 0.1 * (2**retry_num + random.random())
                 logger.warning(
                     f"Hit a service limit per status {response.status} with message"
                     f" {await response.text()}, sleeping"
diff --git a/pyproject.toml b/pyproject.toml
index f4c946a..d951a48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -109,7 +109,7 @@ filterwarnings = [
 ]
 # Timeout in seconds for entire session. Default is None which means no timeout.
 # Timeout is checked between tests, and will not interrupt a test in progress.
-session_timeout = 1200
+session_timeout = 2400
 # List of directories that should be searched for tests when no specific directories,
 # files or test ids are given in the command line when executing pytest from the rootdir
 # directory. File system paths may use shell-style wildcards, including the recursive **
diff --git a/tests/test_paperscraper.py b/tests/test_paperscraper.py
index 897408d..23b4091 100644
--- a/tests/test_paperscraper.py
+++ b/tests/test_paperscraper.py
@@ -16,6 +16,7 @@
 from paperscraper.exceptions import CitationConversionError, DOINotFoundError
 from paperscraper.headers import get_header
 from paperscraper.lib import (
+    GOOGLE_SEARCH_PAGE_SIZE,
     RateLimits,
     clean_upbibtex,
     doi_to_bibtex,
@@ -197,14 +198,17 @@ async def test_google_search_papers(self) -> None:
         )
         assert len(papers) >= 3

-    async def test_high_limit(self) -> None:
+    async def test_with_multiple_google_search_pages(self) -> None:
         papers = await paperscraper.a_search_papers(
-            "molecular dynamics", search_type="google", year="2019-2023", limit=25
+            "molecular dynamics",
+            search_type="google",
+            year="2019-2023",
+            limit=int(2.1 * GOOGLE_SEARCH_PAGE_SIZE),
         )
-        assert len(papers) > 20
+        assert len(papers) > GOOGLE_SEARCH_PAGE_SIZE


-class TestGS(IsolatedAsyncioTestCase):
+class TestGSearch(IsolatedAsyncioTestCase):
     async def test_gsearch(self):
         query = "molecular dynamics"
         papers = await paperscraper.a_gsearch_papers(query, year="2019-2023", limit=3)
@@ -220,11 +224,11 @@ async def test_gsearch(self):
             assert paper["citationCount"]
             assert paper["title"]

-    async def test_gsearch_high_limit(self) -> None:
+    async def test_with_multiple_google_search_pages(self) -> None:
         papers = await paperscraper.a_gsearch_papers(
-            "molecular dynamics", year="2019-2023", limit=45
+            "molecular dynamics", year="2019-2023", limit=5, _limit=2
         )
-        assert len(papers) > 20
+        assert len(papers) >= 5

     async def test_no_link_doesnt_crash_us(self) -> None:
         await paperscraper.a_gsearch_papers(