Tooling update #93

Merged 5 commits on May 30, 2024
3 changes: 3 additions & 0 deletions .gitignore
@@ -130,3 +130,6 @@ dmypy.json
 
 
 *.pdf
+
+# Matching pyproject.toml
+paperscraper/version.py
22 changes: 17 additions & 5 deletions .pre-commit-config.yaml
@@ -18,25 +18,37 @@ repos:
       - id: trailing-whitespace
         exclude: .gitignore
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.0
+    rev: 24.4.2
     hooks:
-      - id: black-jupyter
+      - id: black-jupyter
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.7
+    rev: v0.4.5
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v3.1.0
     hooks:
       - id: prettier
+        additional_dependencies:
+          - prettier@3.2.5 # SEE: https://github.com/pre-commit/pre-commit/issues/3133
+  - repo: https://github.com/pappasam/toml-sort
+    rev: v0.23.1
+    hooks:
+      - id: toml-sort-fix
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.6
+    rev: v2.3.0
     hooks:
       - id: codespell
         additional_dependencies: [".[toml]"]
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: v0.18
+    hooks:
+      - id: validate-pyproject
+        additional_dependencies:
+          - "validate-pyproject-schema-store[all]"
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.9.0
+    rev: v1.10.0
     hooks:
       - id: mypy
         additional_dependencies: # Versions here match pyproject.toml
53 changes: 26 additions & 27 deletions paperscraper/lib.py
@@ -221,7 +221,8 @@ async def pmc_to_pdf(
             cause_exc = ValueError("Not a PDF.")
         if cause_exc:
             raise RuntimeError(
-                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL {pdf_url}."
+                f"Failed to convert PubMed Central ID {pmc_id} to PDF given URL"
+                f" {pdf_url}."
             ) from cause_exc
         with open(path, "wb") as f:  # noqa: ASYNC101
             f.write(await r.read())
@@ -306,7 +307,7 @@ async def local_scraper(paper, path) -> bool:  # noqa: ARG001


 def default_scraper(
-    callback: Callable[[str, dict[str, str]], Awaitable] | None = None
+    callback: Callable[[str, dict[str, str]], Awaitable] | None = None,
 ) -> Scraper:
     scraper = Scraper(callback=callback)
     scraper.register_scraper(local_scraper, priority=12)
@@ -523,19 +524,17 @@ class RateLimits(float, Enum):
     FALLBACK_SLOW = 15 / 60
 
 
-SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join(
-    [
-        "citationStyles",
-        "externalIds",
-        "url",
-        "openAccessPdf",
-        "year",
-        "isOpenAccess",
-        "influentialCitationCount",
-        "citationCount",
-        "title",
-    ]
-)
+SEMANTIC_SCHOLAR_API_FIELDS: str = ",".join([
+    "citationStyles",
+    "externalIds",
+    "url",
+    "openAccessPdf",
+    "year",
+    "isOpenAccess",
+    "influentialCitationCount",
+    "citationCount",
+    "title",
+])
 SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org"


@@ -674,7 +673,7 @@ async def a_search_papers(  # noqa: C901, PLR0912, PLR0915
     elif search_type == "paper":
         raise NotImplementedError(
             f"Only added 'paper' search type to {SematicScholarSearchType.__name__},"
-            f" but not yet to this function in general."
+            " but not yet to this function in general."
         )
 
     if year is not None and search_type == "default":
@@ -776,9 +775,9 @@ async def google2s2(
         ) as response:
             if not response.ok:
                 logger.warning(
-                    "Error correlating papers from google to semantic scholar:"
-                    f" status {response.status}, reason {response.reason!r},"
-                    f" text {await response.text()!r}."
+                    "Error correlating papers from google to semantic"
+                    f" scholar: status {response.status}, reason"
+                    f" {response.reason!r}, text {await response.text()!r}."
                 )
                 return None
             response_data = await response.json()
@@ -811,12 +810,10 @@ async def google2s2(
                 return response_data["data"][0]
             return None
 
-        responses = await asyncio.gather(
-            *(
-                google2s2(t, y, p)
-                for t, y, p in zip(titles, years, google_pdf_links)
-            )
-        )
+        responses = await asyncio.gather(*(
+            google2s2(t, y, p)
+            for t, y, p in zip(titles, years, google_pdf_links)
+        ))
         data = {"data": [r for r in responses if r is not None]}
         data["total"] = len(data["data"])
         field = "data"
@@ -836,7 +833,8 @@ async def google2s2(
     papers.sort(key=lambda x: x["influentialCitationCount"], reverse=True)
     if search_type in ["default", "google"]:
         logger.info(
-            f"Found {data['total']} papers, analyzing {_offset} to {_offset + len(papers)}"
+            f"Found {data['total']} papers, analyzing {_offset} to"
+            f" {_offset + len(papers)}"
         )
 
     # batch them, since we may reach desired limit before all done
@@ -954,7 +952,8 @@ async def a_gsearch_papers(  # noqa: C901
     )
     total_papers = data["search_information"].get("total_results", 1)
     logger.info(
-        f"Found {total_papers} papers, analyzing {_offset} to {_offset + len(papers)}"
+        f"Found {total_papers} papers, analyzing {_offset} to"
+        f" {_offset + len(papers)}"
     )
 
     # batch them, since we may reach desired limit before all done
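A note on the mechanical-looking rewrites in this file: the bracket "hugging" in the SEMANTIC_SCHOLAR_API_FIELDS and asyncio.gather hunks comes from black's hug_parens_with_braces_and_square_brackets preview feature, which this PR enables in pyproject.toml below. A minimal illustration of the style change (hypothetical code, not from the diff):

# Hypothetical example, not code from this PR.
# Stock black gives a sole bracketed argument its own indentation level:
fields = ",".join(
    [
        "title",
        "year",
    ]
)

# With preview = true and the hug_parens_with_braces_and_square_brackets
# unstable feature enabled, the brackets hug the call's parentheses:
fields = ",".join([
    "title",
    "year",
])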
13 changes: 6 additions & 7 deletions paperscraper/scraper.py
@@ -86,7 +86,8 @@ async def scrape(
                 scrape_result[scraper.name] = "success"
                 if logger is not None:
                     logger.debug(
-                        f"\tsucceeded - key: {paper['paperId']} scraper: {scraper.name}"
+                        f"\tsucceeded - key: {paper['paperId']} scraper:"
+                        f" {scraper.name}"
                     )
         if self.callback is not None:
             await self.callback(paper["title"], scrape_result)
@@ -147,12 +148,10 @@ async def scrape_parse(
         for i in range(0, len(papers), batch_size):
             aggregated |= {
                 r[0]: r[1]
-                for r in await asyncio.gather(
-                    *(
-                        scrape_parse(paper=p, i=i + j)
-                        for j, p in enumerate(papers[i : i + batch_size])
-                    )
-                )
+                for r in await asyncio.gather(*(
+                    scrape_parse(paper=p, i=i + j)
+                    for j, p in enumerate(papers[i : i + batch_size])
+                ))
                 if r is not False
             }
             if limit is not None and len(aggregated) >= limit:
3 changes: 0 additions & 3 deletions paperscraper/version.py

This file was deleted.

15 changes: 9 additions & 6 deletions pyproject.toml
@@ -1,7 +1,6 @@
 [build-system]
 build-backend = "setuptools.build_meta"
-# Pin to 62.6 for support from reading requirements from requirements.txt
-requires = ["setuptools >= 62.6.0"]
+requires = ["setuptools>=64", "setuptools_scm>=8"]
 
 [project]
 authors = [
@@ -25,7 +24,7 @@ dependencies = [
"pybtex",
]
description = "LLM Chain for answering questions from docs"
dynamic = ["optional-dependencies"]
dynamic = ["optional-dependencies", "version"]
keywords = ["question answering"]
license = {file = "LICENSE"}
maintainers = [
@@ -36,13 +35,14 @@ name = "paper-scraper"
 readme = "README.md"
 requires-python = ">=3.8"
 urls = {repository = "https://github.com/blackadad/paper-scraper"}
-version = "1.8.0"
 
+[tool.black]
+enable-unstable-feature = ["hug_parens_with_braces_and_square_brackets"]
+preview = true
+
 [tool.codespell]
 check-filenames = true
 check-hidden = true
 # SEE: https://github.com/codespell-project/codespell/issues/1212#issuecomment-1744768533
 ignore-regex = ".{1024}|.*codespell-ignore.*"
 ignore-words-list = "cros,ser"
 
 [tool.mypy]
@@ -203,6 +203,9 @@ file = ["dev-requirements.txt"]
 [tool.setuptools.packages.find]
 include = ["paperscraper*"]
 
+[tool.setuptools_scm]
+version_file = "paperscraper/version.py"
+
 [tool.tomlsort]
 all = true
 in_place = true
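Together with the dynamic "version" entry above, [tool.setuptools_scm] switches the project from the hand-maintained version = "1.8.0" to git-tag-derived versioning, writing paperscraper/version.py at build time (which is why that file is deleted here and newly git-ignored). A sketch of how the version can still be read at runtime, assuming setuptools-scm's standard generated version file:

# Sketch, assuming setuptools-scm's default version_file template,
# which exposes __version__ in the generated paperscraper/version.py.
from importlib.metadata import version

from paperscraper.version import __version__

print(__version__)  # value baked in at build time
print(version("paper-scraper"))  # same value, from installed metadata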
25 changes: 19 additions & 6 deletions tests/test_paperscraper.py
@@ -93,16 +93,25 @@ async def test_reconcile_dois(self) -> None:
     async def test_hard_reconciles(self):
         test_parameters: list[dict] = [
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High-throughput screening of human genetic variants by pooled prime editing.",  # noqa: E501
+                "title": (
+                    "High-throughput screening of human genetic variants by pooled"
+                    " prime editing."
+                ),
                 "authors": ["garbage", "authors", "that"],
                 "doi": "10.1101/2024.04.01.587366",
             },
             {
-                "title": "High throughput screening of human genetic variants by pooled prime editing",  # noqa: E501
+                "title": (
+                    "High throughput screening of human genetic variants by pooled"
+                    " prime editing"
+                ),
                 "doi": "10.1101/2024.04.01.587366",
             },
         ]
@@ -235,7 +244,8 @@ async def test_with_multiple_google_search_pages(self) -> None:

     async def test_no_link_doesnt_crash_us(self) -> None:
         await paperscraper.a_gsearch_papers(
-            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language Models",
+            "OAG-BERT: Pre-train Heterogeneous Entity-augmented Academic Language"
+            " Models",
             year="2021",
         )
 
@@ -336,7 +346,9 @@ async def mock_session_get(*_, **__):
         await openaccess_scraper(
             {
                 "openAccessPdf": {
-                    "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    "url": (
+                        "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                    )
                 }
             },
             os.path.join(tmpdir, "test.pdf"),
@@ -534,7 +546,8 @@ async def test_scraper_doi_search(self):
 class Test15(IsolatedAsyncioTestCase):
     async def test_pdf_link_from_google(self):
         papers = await paperscraper.a_search_papers(
-            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications for Immune and Gene Therapy",  # noqa: E501
+            "Multiplex Base Editing to Protect from CD33-Directed Therapy: Implications"
+            " for Immune and Gene Therapy",
             limit=1,
             search_type="google",
         )
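The long-string splits throughout these tests are behavior-neutral: Python concatenates adjacent string literals at compile time, so each parenthesized pair is the same string as the original one-liner, and the # noqa: E501 suppressions can be dropped. A quick illustration (hypothetical snippet, not from the diff):

# Illustration: adjacent literals are one string to the compiler.
title = (
    "High-throughput screening of human genetic variants by pooled"
    " prime editing."
)
assert title == (
    "High-throughput screening of human genetic variants by pooled prime editing."
)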