Tooling update #93

Merged 5 commits on May 30, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -130,3 +130,6 @@ dmypy.json
 
 
 *.pdf
+
+# Matching pyproject.toml
+paperscraper/version.py
22 changes: 17 additions & 5 deletions .pre-commit-config.yaml
@@ -18,25 +18,37 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.0
+    rev: 24.4.2
     hooks:
-      - id: black-jupyter
+      - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.7
+    rev: v0.4.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v3.1.0
     hooks:
       - id: prettier
+        additional_dependencies:
+          - prettier@3.2.5 # SEE: https://github.com/pre-commit/pre-commit/issues/3133
+  - repo: https://github.com/pappasam/toml-sort
+    rev: v0.23.1
+    hooks:
+      - id: toml-sort-fix
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.18
+    hooks:
+      - id: validate-pyproject
+        additional_dependencies:
+          - "validate-pyproject-schema-store[all]"
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
       - id: mypy
         additional_dependencies: # Versions here match pyproject.toml
53 changes: 26 additions & 27 deletions paperscraper/lib.py
@@ -221,7 +221,8 @@ async def pmc_to_pdf(
             cause_exc = ValueError("Not a PDF.")
         if cause_exc:
             raise RuntimeError(
-                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL {pdf_url}."
+                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL"
+                f" {pdf_url}."
             ) from cause_exc
         with open(path, "wb") as f:  # noqa: ASYNC101
             f.write(await r.read())
@@ -306,7 +307,7 @@ async def local_scraper(paper, path) -> bool:  # noqa: ARG001


 def default_scraper(
-    callback: Callable[[str, dict[str, str]], Awaitable] | None = None
+    callback: Callable[[str, dict[str, str]], Awaitable] | None = None,
 ) -> Scraper:
     scraper = Scraper(callback=callback)
     scraper.register_scraper(local_scraper, priority=12)
@@ -523,19 +524,17 @@ class RateLimits(float, Enum):
     FALLBACK_SLOW = 15 / 60
 
 
-SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join(
-    [
-        "citationStyles",
-        "externalIds",
-        "url",
-        "openAccessPdf",
-        "year",
-        "isOpenAccess",
-        "influentialCitationCount",
-        "citationCount",
-        "title",
-    ]
-)
+SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join([
+    "citationStyles",
+    "externalIds",
+    "url",
+    "openAccessPdf",
+    "year",
+    "isOpenAccess",
+    "influentialCitationCount",
+    "citationCount",
+    "title",
+])
 SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org"


@@ -674,7 +673,7 @@ async def a_search_papers(  # noqa: C901, PLR0912, PLR0915
     elif search_type == "paper":
         raise NotImplementedError(
             f"Only added 'paper' search type to {SematicScholarSearchType.__name__},"
-            f" but not yet to this function in general."
+            " but not yet to this function in general."
         )
 
     if year is not None and search_type == "default":
@@ -776,9 +775,9 @@ async def google2s2(
         ) as response:
             if not response.ok:
                 logger.warning(
-                    "Error correlating papers from google to semantic scholar:"
-                    f" status {response.status}, reason {response.reason!r},"
-                    f" text {await response.text()!r}."
+                    "Error correlating papers from google to semantic"
+                    f" scholar: status {response.status}, reason"
+                    f" {response.reason!r}, text {await response.text()!r}."
                 )
                 return None
             response_data = await response.json()
@@ -811,12 +810,10 @@ async def google2s2(
                 return response_data["data"][0]
             return None
 
-        responses = await asyncio.gather(
-            *(
-                google2s2(t, y, p)
-                for t, y, p in zip(titles, years, google_pdf_links)
-            )
-        )
+        responses = await asyncio.gather(*(
+            google2s2(t, y, p)
+            for t, y, p in zip(titles, years, google_pdf_links)
+        ))
         data = {"data": [r for r in responses if r is not None]}
         data["total"] = len(data["data"])
         field = "data"
@@ -836,7 +833,8 @@ async def google2s2(
     papers.sort(key=lambda x: x["influentialCitationCount"], reverse=True)
     if search_type in ["default", "google"]:
         logger.info(
-            f"Found {data['total']} papers, analyzing {_offset} to {_offset + len(papers)}"
+            f"Found {data['total']} papers, analyzing {_offset} to"
+            f" {_offset + len(papers)}"
         )
 
     # batch them, since we may reach desired limit before all done
@@ -954,7 +952,8 @@ async def a_gsearch_papers(  # noqa: C901
     )
     total_papers = data["search_information"].get("total_results", 1)
     logger.info(
-        f"Found {total_papers} papers, analyzing {_offset} to {_offset + len(papers)}"
+        f"Found {total_papers} papers, analyzing {_offset} to"
+        f" {_offset + len(papers)}"
     )
 
     # batch them, since we may reach desired limit before all done
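A note on the mechanical-looking rewrites in this file: the bracket "hugging" in the SEMANTIC_SCHOLAR_API_FIELDS and asyncio.gather hunks comes from black's hug_parens_with_braces_and_square_brackets preview feature, which this PR enables in pyproject.toml below. A minimal illustration of the style change (hypothetical code, not from the diff):

# Hypothetical example, not code from this PR.
# Stock black gives a sole bracketed argument its own indentation level:
fields = ",".join(
    [
        "title",
        "year",
    ]
)

# With preview = true and the hug_parens_with_braces_and_square_brackets
# unstable feature enabled, the brackets hug the call's parentheses:
fields = ",".join([
    "title",
    "year",
])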
13 changes: 6 additions & 7 deletions paperscraper/scraper.py
@@ -86,7 +86,8 @@ async def scrape(
                 scrape_result[scraper.name] = "success"
                 if logger is not None:
                     logger.debug(
-                        f"\tsucceeded - key: {paper['paperId']} scraper: {scraper.name}"
+                        f"\tsucceeded - key: {paper['paperId']} scraper:"
+                        f" {scraper.name}"
                     )
         if self.callback is not None:
             await self.callback(paper["title"], scrape_result)
@@ -147,12 +148,10 @@ async def scrape_parse(
         for i in range(0, len(papers), batch_size):
             aggregated |= {
                 r[0]: r[1]
-                for r in await asyncio.gather(
-                    *(
-                        scrape_parse(paper=p, i=i + j)
-                        for j, p in enumerate(papers[i : i + batch_size])
-                    )
-                )
+                for r in await asyncio.gather(*(
+                    scrape_parse(paper=p, i=i + j)
+                    for j, p in enumerate(papers[i : i + batch_size])
+                ))
                 if r is not False
             }
             if limit is not None and len(aggregated) >= limit:
3 changes: 0 additions & 3 deletions paperscraper/version.py

This file was deleted.

15 changes: 9 additions & 6 deletions pyproject.toml
@@ -1,7 +1,6 @@
 [build-system]
 build-backend = "setuptools.build_meta"
-# Pin to 62.6 for support from reading requirements from requirements.txt
-requires = ["setuptools >= 62.6.0"]
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 
 [project]
 authors = [
@@ -25,7 +24,7 @@ dependencies = [
"pybtex",
]
description = "LLM Chain for answering questions from docs"
dynamic = ["optional-dependencies"]
dynamic = ["optional-dependencies", "version"]
keywords = ["question answering"]
license = {file = "LICENSE"}
maintainers = [
@@ -36,13 +35,14 @@ name = "paper-scraper"
 readme = "README.md"
 requires-python = ">=3.8"
 urls = {repository = "https://github.com/blackadad/paper-scraper"}
-version = "1.8.0"
 
+[tool.black]
+enable-unstable-feature = ["hug_parens_with_braces_and_square_brackets"]
+preview = true
+
 [tool.codespell]
 check-filenames = true
 check-hidden = true
 # SEE: https://github.com/codespell-project/codespell/issues/1212#issuecomment-1744768533
 ignore-regex = ".{1024}|.*codespell-ignore.*"
 ignore-words-list = "cros,ser"
 
 [tool.mypy]
@@ -203,6 +203,9 @@ file = ["dev-requirements.txt"]
 [tool.setuptools.packages.find]
 include = ["paperscraper*"]
 
+[tool.setuptools_scm]
+version_file = "paperscraper/version.py"
+
 [tool.tomlsort]
 all = true
 in_place = true
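Together with the dynamic "version" entry above, [tool.setuptools_scm] switches the project from the hand-maintained version = "1.8.0" to git-tag-derived versioning, writing paperscraper/version.py at build time (which is why that file is deleted here and newly git-ignored). A sketch of how the version can still be read at runtime, assuming setuptools-scm's standard generated version file:

# Sketch, assuming setuptools-scm's default version_file template,
# which exposes __version__ in the generated paperscraper/version.py.
from importlib.metadata import version

from paperscraper.version import __version__

print(__version__)  # value baked in at build time
print(version("paper-scraper"))  # same value, from installed metadata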
25 changes: 19 additions & 6 deletions tests/test_paperscraper.py
@@ -93,16 +93,25 @@ async def test_reconcile_dois(self) -> None:
     async def test_hard_reconciles(self):
         test_parameters: list[dict] = [
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "authors": ["garbage", "authors", "that"],
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High throughput screening of human genetic variants by pooled prime editing",  # noqa: E501
+                "title": (
+                    "High throughput screening of human genetic variants by pooled"
+                    " prime editing"
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
         ]
@@ -235,7 +244,8 @@ async def test_with_multiple_google_search_pages(self) -> None:

     async def test_no_link_doesnt_crash_us(self) -> None:
         await paperscraper.a_gsearch_papers(
-            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language Models",
+            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language"
+            " Models",
             year="2021",
         )
 
@@ -336,7 +346,9 @@ async def mock_session_get(*_, **__):
         await openaccess_scraper(
             {
                 "openAccessPdf": {
-                    "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    "url": (
+                        "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    )
                 }
             },
             os.path.join(tmpdir, "test.pdf"),
@@ -534,7 +546,8 @@ async def test_scraper_doi_search(self):
 class Test15(IsolatedAsyncioTestCase):
     async def test_pdf_link_from_google(self):
         papers = await paperscraper.a_search_papers(
-            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications for Immune and Gene Therapy",  # noqa: E501
+            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications"
+            " for Immune and Gene Therapy",
             limit=1,
             search_type="google",
         )
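The long-string splits throughout these tests are behavior-neutral: Python concatenates adjacent string literals at compile time, so each parenthesized pair is the same string as the original one-liner, and the # noqa: E501 suppressions can be dropped. A quick illustration (hypothetical snippet, not from the diff):

# Illustration: adjacent literals are one string to the compiler.
title = (
    "High-throughput screening of human genetic variants by pooled"
    " prime editing."
)
assert title == (
    "High-throughput screening of human genetic variants by pooled prime editing."
)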