Excluding spaces from regex parsing of href (#90)
* Moved from .* to \S+ when regex parsing URLs (illustrated below)

* Added test for malformed URL
jamesbraza authored Apr 15, 2024
1 parent 1cfea31 commit ebd525a
Showing 2 changed files with 28 additions and 15 deletions.
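
For context on why the greedy pattern was a problem (the snippet and sample HTML below are illustrative only, not taken from the repository or from the pages used in the tests): with .*, the capture can run across spaces, quotes, and neighboring attributes on the same line, so the "URL" that comes back may contain whitespace and markup; \S+ stops at the first whitespace, keeping the capture to one contiguous URL.

import re

# Illustrative HTML only: one line with two anchors, as real pages often have.
html_text = '<a href="/start" class="x">Start</a> <a href="/doi/pdf/10.1000/xyz.pdf">PDF</a>'

# Greedy .* runs across quotes, spaces, and other attributes, so the capture
# spans from the first href all the way to the last ".pdf" on the line.
print(re.search(r'href="(.*\.pdf)"', html_text).group(1))
# -> /start" class="x">Start</a> <a href="/doi/pdf/10.1000/xyz.pdf

# \S+ cannot cross whitespace, so only a contiguous, space-free URL is captured.
print(re.search(r'href="(\S+\.pdf)"', html_text).group(1))
# -> /doi/pdf/10.1000/xyz.pdf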
6 changes: 3 additions & 3 deletions paperscraper/lib.py
@@ -146,12 +146,12 @@ def get_pdf():
         return pdf_link.group(1)
     # maybe epdf
     # should have pdf somewhere (could not be at end)
-    epdf_link = re.search(r'href="(.*\.epdf)"', html_text)
+    epdf_link = re.search(r'href="(\S+\.epdf)"', html_text)
     if epdf_link:
         return epdf_link.group(1).replace("epdf", "pdf")

     # obvious thing
-    pdf_link = re.search(r'href="(.*pdf)"', html_text)
+    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
     if pdf_link:
         return pdf_link.group(1)

@@ -190,7 +190,7 @@ async def find_pmc_pdf_link(pmc_id, session: ClientSession) -> str:
             f"Failed to download PubMed Central ID {pmc_id} from URL {url}."
         ) from exc
     html_text = await r.text()
-    pdf_link = re.search(r'href="(.*\.pdf)"', html_text)
+    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
     if pdf_link is None:
         raise RuntimeError(
             f"No PDF link matched for PubMed Central ID {pmc_id} from URL {url}."
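
A side note on the second pattern above (again an illustrative snippet, not repository code): the "obvious thing" regex was tightened twice, .* to \S+ and pdf to \.pdf, so an href that merely ends in the letters "pdf" no longer counts as a PDF link.

import re

# Illustrative href that ends in "pdf" but is not a .pdf file.
html_text = '<a href="/topics/grab-a-pdf">About PDFs</a>'

old = re.search(r'href="(.*pdf)"', html_text)
new = re.search(r'href="(\S+\.pdf)"', html_text)
print(old.group(1))  # "/topics/grab-a-pdf" -- a false positive under the old pattern
print(new)           # None -- the new pattern requires a literal ".pdf" suffix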
37 changes: 25 additions & 12 deletions tests/test_paperscraper.py
@@ -9,6 +9,7 @@
 from unittest.mock import MagicMock

 import aiohttp
+import pytest
 from pybtex.database import parse_string

 import paperscraper
@@ -278,18 +279,30 @@ async def test_openaccess_scraper(self) -> None:
                 {"openAccessPdf": None}, MagicMock(), MagicMock()
             )

-        with tempfile.TemporaryDirectory() as tmpdir:
-            await openaccess_scraper(
-                {
-                    "openAccessPdf": {
-                        "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6506413/"
-                    }
-                },
-                os.path.join(tmpdir, "test.pdf"),
-                ThrottledClientSession(
-                    rate_limit=RateLimits.SCRAPER.value, headers=get_header()
-                ),
-            )
+        async with ThrottledClientSession(
+            rate_limit=RateLimits.SCRAPER.value, headers=get_header()
+        ) as session:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                await openaccess_scraper(
+                    {
+                        "openAccessPdf": {
+                            "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                        }
+                    },
+                    os.path.join(tmpdir, "test1.pdf"),
+                    session,
+                )
+                with pytest.raises(RuntimeError, match="No PDF link"):
+                    # Confirm we can regex parse without a malformed URL error
+                    await openaccess_scraper(
+                        {
+                            "openAccessPdf": {
+                                "url": "https://www.annualreviews.org/doi/full/10.1146/annurev-physchem-042018-052331"
+                            }
+                        },
+                        os.path.join(tmpdir, "test2.pdf"),
+                        session,
+                    )

     async def test_pubmed_to_pdf(self):
         path = "test.pdf"
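
One note on the new test (the snippet below is a toy example, not repository code): pytest.raises(..., match=...) treats its argument as a regular expression and re.search-es it against the string form of the raised exception, so the short "No PDF link" pattern is enough to match the longer RuntimeError message raised when no PDF href is found.

import pytest

def test_match_uses_re_search() -> None:
    # match= is re.search-ed against str(exc), so a substring is sufficient.
    with pytest.raises(RuntimeError, match="No PDF link"):
        raise RuntimeError(
            "No PDF link matched for PubMed Central ID PMC123 from URL https://example.org."
        )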
