reduced internal _limit to increase test speed #87

Merged — 14 commits — Apr 15, 2024
2 changes: 1 addition & 1 deletion .github/workflows/lint-test.yaml
@@ -21,7 +21,7 @@ jobs:
- run: python -m pip install .[dev]
- uses: pre-commit/action@v3.0.1
- name: test
run: pytest
run: pytest --verbose
env:
SERPAPI_API_KEY: ${{ secrets.SERPAPI_API_KEY }}
SEMANTIC_SCHOLAR_API_KEY: ${{ secrets.SEMANTIC_SCHOLAR_API_KEY }}
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -18,11 +18,11 @@ repos:
- id: trailing-whitespace
exclude: .gitignore
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.3.0
rev: 24.4.0
hooks:
- id: black-jupyter
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.6
rev: v0.3.7
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
1 change: 1 addition & 0 deletions dev-requirements.txt
@@ -1,3 +1,4 @@
pytest
pytest-timeout
pytest-timer
pre-commit
18 changes: 10 additions & 8 deletions paperscraper/lib.py
@@ -523,6 +523,7 @@ class RateLimits(float, Enum):

SEMANTIC_SCHOLAR = 90.0
GOOGLE_SCHOLAR = 30.0
# SEE: https://www.crossref.org/documentation/metadata-plus/#00343
CROSSREF = 30.0 # noqa: PIE796
SCRAPER = 30 / 60
FALLBACK_SLOW = 15 / 60
@@ -598,6 +599,9 @@ def make_url_params( # noqa: PLR0911
raise NotImplementedError


GOOGLE_SEARCH_PAGE_SIZE = 20


async def a_search_papers( # noqa: C901, PLR0912, PLR0915
query: str,
limit: int = 10,
@@ -666,7 +670,7 @@ async def a_search_papers( # noqa: C901, PLR0912, PLR0915
"q": query,
"api_key": os.environ["SERPAPI_API_KEY"],
"engine": "google_scholar",
"num": 20,
"num": GOOGLE_SEARCH_PAGE_SIZE,
"start": _offset,
# TODO - add offset and limit here # noqa: TD004
}
@@ -858,7 +862,8 @@ async def google2s2(
pdir=pdir,
_paths=paths, # type: ignore[arg-type]
_limit=_limit,
_offset=_offset + (20 if search_type == "google" else _limit),
_offset=_offset
+ (GOOGLE_SEARCH_PAGE_SIZE if search_type == "google" else _limit),
logger=logger,
year=year,
verbose=verbose,
@@ -878,7 +883,7 @@ async def a_gsearch_papers( # noqa: C901
pdir: str | os.PathLike = os.curdir,
_paths: dict[str | os.PathLike, dict[str, Any]] | None = None,
_offset: int = 0,
_limit: int = 20,
_limit: int = GOOGLE_SEARCH_PAGE_SIZE,
logger: logging.Logger | None = None,
year: str | None = None,
verbose: bool = False,
@@ -928,13 +933,10 @@ async def a_gsearch_papers( # noqa: C901
{str(k): v for k, v in _paths.items()} if _paths is not None else {}
)
scraper = scraper or default_scraper()
ssheader = get_header()
# add key to headers

# Shared rate limits here between gs/crossref
async with ThrottledClientSession(
headers=ssheader,
rate_limit=RateLimits.GOOGLE_SCHOLAR.value,
headers=get_header(),
rate_limit=RateLimits.GOOGLE_SCHOLAR.value, # Share rate limits between gs/crossref
) as session:
async with session.get(
url=endpoint,
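A minimal sketch of the paging arithmetic that the new GOOGLE_SEARCH_PAGE_SIZE constant centralizes, assuming Google Scholar pages stay fixed at 20 results; next_offset is a hypothetical helper for illustration, not a function in paperscraper:

```python
GOOGLE_SEARCH_PAGE_SIZE = 20  # results per Google Scholar (SerpAPI) page


def next_offset(offset: int, search_type: str, limit: int) -> int:
    # Google results arrive in fixed pages, so the next recursive call starts
    # one full page later; other engines advance by the number of results
    # requested on this pass (_limit).
    return offset + (GOOGLE_SEARCH_PAGE_SIZE if search_type == "google" else limit)


# Walking through three Google pages: offsets 0 -> 20 -> 40.
offsets = [0]
for _ in range(2):
    offsets.append(next_offset(offsets[-1], "google", limit=10))
assert offsets == [0, 20, 40]
```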
5 changes: 4 additions & 1 deletion paperscraper/scraper.py
@@ -93,7 +93,10 @@ async def scrape(
return True
except Exception:
if logger is not None:
logger.exception(f"\tScraper {scraper.name} failed.")
logger.exception(
f"\tScraper {scraper.name} failed on paper titled"
f" {paper.get('title')!r}."
)
scrape_result[scraper.name] = "failed"
if self.callback is not None:
await self.callback(paper["title"], scrape_result)
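A standalone sketch of what the enriched failure log buys, using a hypothetical scraper name and paper dict; logger.exception keeps the traceback while the title pins down which paper failed when many are scraped concurrently:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("paperscraper")

scraper_name = "example-scraper"  # hypothetical scraper name
paper = {"title": "Attention Is All You Need"}  # hypothetical paper dict

try:
    raise TimeoutError("simulated download failure")
except Exception:
    # Logs the message at ERROR level plus the active traceback.
    logger.exception(
        f"\tScraper {scraper_name} failed on paper titled {paper.get('title')!r}."
    )
```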
2 changes: 1 addition & 1 deletion paperscraper/utils.py
@@ -114,7 +114,7 @@ async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
if response.status not in self.SERVICE_LIMIT_REACHED_STATUS_CODES:
break
if retry_num < self._retry_count:
exp_backoff_with_jitter = 2**retry_num + random.random()
exp_backoff_with_jitter = 0.1 * (2**retry_num + random.random())
logger.warning(
f"Hit a service limit per status {response.status} with message"
f" {await response.text()}, sleeping"
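As a worked example of the rescaled backoff, assuming retry_num counts 1, 2, 3, ... (the actual starting index lives in ThrottledClientSession._request), the 0.1 factor turns multi-second waits into sub-second ones while keeping the exponential shape and jitter:

```python
import random


def backoff_seconds(retry_num: int) -> float:
    # Exponential backoff with jitter, scaled down 10x so retried requests in
    # tests sleep fractions of a second instead of several seconds.
    return 0.1 * (2**retry_num + random.random())


# Roughly 0.2-0.3 s, 0.4-0.5 s, and 0.8-0.9 s for retry_num = 1, 2, 3,
# versus roughly 2-3 s, 4-5 s, and 8-9 s before the 0.1 factor.
for retry_num in (1, 2, 3):
    print(f"retry {retry_num}: sleep ~{backoff_seconds(retry_num):.2f} s")
```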
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -109,7 +109,7 @@ filterwarnings = [
]
# Timeout in seconds for entire session. Default is None which means no timeout.
# Timeout is checked between tests, and will not interrupt a test in progress.
session_timeout = 1200
session_timeout = 2400
# List of directories that should be searched for tests when no specific directories,
# files or test ids are given in the command line when executing pytest from the rootdir
# directory. File system paths may use shell-style wildcards, including the recursive **
18 changes: 11 additions & 7 deletions tests/test_paperscraper.py
@@ -16,6 +16,7 @@
from paperscraper.exceptions import CitationConversionError, DOINotFoundError
from paperscraper.headers import get_header
from paperscraper.lib import (
GOOGLE_SEARCH_PAGE_SIZE,
RateLimits,
clean_upbibtex,
doi_to_bibtex,
@@ -197,14 +198,17 @@ async def test_google_search_papers(self) -> None:
)
assert len(papers) >= 3

async def test_high_limit(self) -> None:
async def test_with_multiple_google_search_pages(self) -> None:
papers = await paperscraper.a_search_papers(
"molecular dynamics", search_type="google", year="2019-2023", limit=25
"molecular dynamics",
search_type="google",
year="2019-2023",
limit=int(2.1 * GOOGLE_SEARCH_PAGE_SIZE),
)
assert len(papers) > 20
assert len(papers) > GOOGLE_SEARCH_PAGE_SIZE


class TestGS(IsolatedAsyncioTestCase):
class TestGSearch(IsolatedAsyncioTestCase):
async def test_gsearch(self):
query = "molecular dynamics"
papers = await paperscraper.a_gsearch_papers(query, year="2019-2023", limit=3)
@@ -220,11 +224,11 @@ async def test_gsearch(self):
assert paper["citationCount"]
assert paper["title"]

async def test_gsearch_high_limit(self) -> None:
async def test_with_multiple_google_search_pages(self) -> None:
papers = await paperscraper.a_gsearch_papers(
"molecular dynamics", year="2019-2023", limit=45
"molecular dynamics", year="2019-2023", limit=5, _limit=2
)
assert len(papers) > 20
assert len(papers) >= 5

async def test_no_link_doesnt_crash_us(self) -> None:
await paperscraper.a_gsearch_papers(