diff --git a/.gitignore b/.gitignore
index 9d304d8..7c91b8b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,6 @@
 dmypy.json
 
 *.pdf
+
+# Matching pyproject.toml
+paperscraper/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 47c55e4..ea0212b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -18,25 +18,37 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.0
+    rev: 24.4.2
     hooks:
-      - id: black-jupyter
+      - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.7
+    rev: v0.4.5
     hooks:
       - id: ruff
        args: [--fix, --exit-non-zero-on-fix]
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v3.1.0
+    hooks:
+      - id: prettier
+        additional_dependencies:
+          - prettier@3.2.5 # SEE: https://github.com/pre-commit/pre-commit/issues/3133
   - repo: https://github.com/pappasam/toml-sort
     rev: v0.23.1
     hooks:
       - id: toml-sort-fix
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.18
+    hooks:
+      - id: validate-pyproject
+        additional_dependencies:
+          - "validate-pyproject-schema-store[all]"
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
       - id: mypy
         additional_dependencies: # Versions here match pyproject.toml
diff --git a/paperscraper/lib.py b/paperscraper/lib.py
index 8ca1aea..13649d6 100644
--- a/paperscraper/lib.py
+++ b/paperscraper/lib.py
@@ -221,7 +221,8 @@ async def pmc_to_pdf(
             cause_exc = ValueError("Not a PDF.")
         if cause_exc:
             raise RuntimeError(
-                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL {pdf_url}."
+                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL"
+                f" {pdf_url}."
             ) from cause_exc
         with open(path, "wb") as f:  # noqa: ASYNC101
             f.write(await r.read())
@@ -306,7 +307,7 @@ async def local_scraper(paper, path) -> bool:  # noqa: ARG001
 
 
 def default_scraper(
-    callback: Callable[[str, dict[str, str]], Awaitable] | None = None
+    callback: Callable[[str, dict[str, str]], Awaitable] | None = None,
 ) -> Scraper:
     scraper = Scraper(callback=callback)
     scraper.register_scraper(local_scraper, priority=12)
@@ -523,19 +524,17 @@ class RateLimits(float, Enum):
     FALLBACK_SLOW = 15 / 60
 
 
-SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join(
-    [
-        "citationStyles",
-        "externalIds",
-        "url",
-        "openAccessPdf",
-        "year",
-        "isOpenAccess",
-        "influentialCitationCount",
-        "citationCount",
-        "title",
-    ]
-)
+SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join([
+    "citationStyles",
+    "externalIds",
+    "url",
+    "openAccessPdf",
+    "year",
+    "isOpenAccess",
+    "influentialCitationCount",
+    "citationCount",
+    "title",
+])
 
 SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org"
 
@@ -674,7 +673,7 @@ async def a_search_papers(  # noqa: C901, PLR0912, PLR0915
     elif search_type == "paper":
         raise NotImplementedError(
             f"Only added 'paper' search type to {SematicScholarSearchType.__name__},"
-            f" but not yet to this function in general."
+            " but not yet to this function in general."
         )
 
     if year is not None and search_type == "default":
@@ -776,9 +775,9 @@ async def google2s2(
             ) as response:
                 if not response.ok:
                     logger.warning(
-                        "Error correlating papers from google to semantic scholar:"
-                        f" status {response.status}, reason {response.reason!r},"
-                        f" text {await response.text()!r}."
+ "Error correlating papers from google to semantic" + f" scholar: status {response.status}, reason" + f" {response.reason!r}, text {await response.text()!r}." ) return None response_data = await response.json() @@ -811,12 +810,10 @@ async def google2s2( return response_data["data"][0] return None - responses = await asyncio.gather( - *( - google2s2(t, y, p) - for t, y, p in zip(titles, years, google_pdf_links) - ) - ) + responses = await asyncio.gather(*( + google2s2(t, y, p) + for t, y, p in zip(titles, years, google_pdf_links) + )) data = {"data": [r for r in responses if r is not None]} data["total"] = len(data["data"]) field = "data" @@ -836,7 +833,8 @@ async def google2s2( papers.sort(key=lambda x: x["influentialCitationCount"], reverse=True) if search_type in ["default", "google"]: logger.info( - f"Found {data['total']} papers, analyzing {_offset} to {_offset + len(papers)}" + f"Found {data['total']} papers, analyzing {_offset} to" + f" {_offset + len(papers)}" ) # batch them, since we may reach desired limit before all done @@ -954,7 +952,8 @@ async def a_gsearch_papers( # noqa: C901 ) total_papers = data["search_information"].get("total_results", 1) logger.info( - f"Found {total_papers} papers, analyzing {_offset} to {_offset + len(papers)}" + f"Found {total_papers} papers, analyzing {_offset} to" + f" {_offset + len(papers)}" ) # batch them, since we may reach desired limit before all done diff --git a/paperscraper/scraper.py b/paperscraper/scraper.py index 77279ca..465439f 100644 --- a/paperscraper/scraper.py +++ b/paperscraper/scraper.py @@ -86,7 +86,8 @@ async def scrape( scrape_result[scraper.name] = "success" if logger is not None: logger.debug( - f"\tsucceeded - key: {paper['paperId']} scraper: {scraper.name}" + f"\tsucceeded - key: {paper['paperId']} scraper:" + f" {scraper.name}" ) if self.callback is not None: await self.callback(paper["title"], scrape_result) @@ -147,12 +148,10 @@ async def scrape_parse( for i in range(0, len(papers), batch_size): aggregated |= { r[0]: r[1] - for r in await asyncio.gather( - *( - scrape_parse(paper=p, i=i + j) - for j, p in enumerate(papers[i : i + batch_size]) - ) - ) + for r in await asyncio.gather(*( + scrape_parse(paper=p, i=i + j) + for j, p in enumerate(papers[i : i + batch_size]) + )) if r is not False } if limit is not None and len(aggregated) >= limit: diff --git a/paperscraper/version.py b/paperscraper/version.py deleted file mode 100644 index 2b12553..0000000 --- a/paperscraper/version.py +++ /dev/null @@ -1,3 +0,0 @@ -from importlib.metadata import version - -__version__ = version("paper-scraper") diff --git a/pyproject.toml b/pyproject.toml index d951a48..436c3d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,6 @@ [build-system] build-backend = "setuptools.build_meta" -# Pin to 62.6 for support from reading requirements from requirements.txt -requires = ["setuptools >= 62.6.0"] +requires = ["setuptools>=64", "setuptools_scm>=8"] [project] authors = [ @@ -25,7 +24,7 @@ dependencies = [ "pybtex", ] description = "LLM Chain for answering questions from docs" -dynamic = ["optional-dependencies"] +dynamic = ["optional-dependencies", "version"] keywords = ["question answering"] license = {file = "LICENSE"} maintainers = [ @@ -36,13 +35,14 @@ name = "paper-scraper" readme = "README.md" requires-python = ">=3.8" urls = {repository = "https://github.com/blackadad/paper-scraper"} -version = "1.8.0" + +[tool.black] +enable-unstable-feature = ["hug_parens_with_braces_and_square_brackets"] +preview = true [tool.codespell] 
 check-filenames = true
 check-hidden = true
-# SEE: https://github.com/codespell-project/codespell/issues/1212#issuecomment-1744768533
-ignore-regex = ".{1024}|.*codespell-ignore.*"
 ignore-words-list = "cros,ser"
 
 [tool.mypy]
@@ -203,6 +203,9 @@ file = ["dev-requirements.txt"]
 [tool.setuptools.packages.find]
 include = ["paperscraper*"]
 
+[tool.setuptools_scm]
+version_file = "paperscraper/version.py"
+
 [tool.tomlsort]
 all = true
 in_place = true
diff --git a/tests/test_paperscraper.py b/tests/test_paperscraper.py
index 507924c..cf0c8a1 100644
--- a/tests/test_paperscraper.py
+++ b/tests/test_paperscraper.py
@@ -93,16 +93,25 @@ async def test_reconcile_dois(self) -> None:
     async def test_hard_reconciles(self):
         test_parameters: list[dict] = [
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "authors": ["garbage", "authors", "that"],
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High throughput screening of human genetic variants by pooled prime editing",  # noqa: E501
+                "title": (
+                    "High throughput screening of human genetic variants by pooled"
+                    " prime editing"
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
         ]
@@ -235,7 +244,8 @@ async def test_with_multiple_google_search_pages(self) -> None:
 
     async def test_no_link_doesnt_crash_us(self) -> None:
         await paperscraper.a_gsearch_papers(
-            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language Models",
+            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language"
+            " Models",
             year="2021",
         )
 
@@ -336,7 +346,9 @@ async def mock_session_get(*_, **__):
             await openaccess_scraper(
                 {
                     "openAccessPdf": {
-                        "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                        "url": (
+                            "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                        )
                     }
                 },
                 os.path.join(tmpdir, "test.pdf"),
@@ -534,7 +546,8 @@ async def test_scraper_doi_search(self):
 class Test15(IsolatedAsyncioTestCase):
     async def test_pdf_link_from_google(self):
         papers = await paperscraper.a_search_papers(
-            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications for Immune and Gene Therapy",  # noqa: E501
+            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications"
+            " for Immune and Gene Therapy",
             limit=1,
             search_type="google",
         )
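
Not part of the patch itself, but a minimal sketch of how the new setuptools_scm-based versioning can be sanity-checked, assuming the package has been reinstalled (for example with "pip install -e .") so that setuptools_scm has regenerated the now-gitignored paperscraper/version.py:

    # Sketch only: paperscraper/version.py is now written by setuptools_scm at
    # build time (replacing the hand-rolled module deleted above), and it still
    # exposes __version__ for any callers that imported it before this change.
    from importlib.metadata import version as dist_version

    from paperscraper.version import __version__

    # The Git tag is now the single source of truth, so right after an install
    # the distribution metadata and the generated module should agree.
    assert __version__ == dist_version("paper-scraper")
    print(__version__)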