Excluding spaces from regex parsing of href (#90)
* Moved from .* to \S+ when regex parsing URLs (illustrated below)

* Added test for malformed URL
jamesbraza authored Apr 15, 2024
1 parent 1cfea31 commit ebd525a
Showing 2 changed files with 28 additions and 15 deletions.
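
For context on why the greedy pattern was a problem (the snippet and sample HTML below are illustrative only, not taken from the repository or from the pages used in the tests): with .*, the capture can run across spaces, quotes, and neighboring attributes on the same line, so the "URL" that comes back may contain whitespace and markup; \S+ stops at the first whitespace, keeping the capture to one contiguous URL.

import re

# Illustrative HTML only: one line with two anchors, as real pages often have.
html_text = '<a href="/start" class="x">Start</a> <a href="/doi/pdf/10.1000/xyz.pdf">PDF</a>'

# Greedy .* runs across quotes, spaces, and other attributes, so the capture
# spans from the first href all the way to the last ".pdf" on the line.
print(re.search(r'href="(.*\.pdf)"', html_text).group(1))
# -> /start" class="x">Start</a> <a href="/doi/pdf/10.1000/xyz.pdf

# \S+ cannot cross whitespace, so only a contiguous, space-free URL is captured.
print(re.search(r'href="(\S+\.pdf)"', html_text).group(1))
# -> /doi/pdf/10.1000/xyz.pdf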
6 changes: 3 additions & 3 deletions paperscraper/lib.py
@@ -146,12 +146,12 @@ def get_pdf():
         return pdf_link.group(1)
     # maybe epdf
     # should have pdf somewhere (could not be at end)
-    epdf_link = re.search(r'href="(.*\.epdf)"', html_text)
+    epdf_link = re.search(r'href="(\S+\.epdf)"', html_text)
     if epdf_link:
         return epdf_link.group(1).replace("epdf", "pdf")

     # obvious thing
-    pdf_link = re.search(r'href="(.*pdf)"', html_text)
+    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
     if pdf_link:
         return pdf_link.group(1)

@@ -190,7 +190,7 @@ async def find_pmc_pdf_link(pmc_id, session: ClientSession) -> str:
             f"Failed to download PubMed Central ID {pmc_id} from URL {url}."
         ) from exc
     html_text = await r.text()
-    pdf_link = re.search(r'href="(.*\.pdf)"', html_text)
+    pdf_link = re.search(r'href="(\S+\.pdf)"', html_text)
     if pdf_link is None:
         raise RuntimeError(
             f"No PDF link matched for PubMed Central ID {pmc_id} from URL {url}."
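
A side note on the second pattern above (again an illustrative snippet, not repository code): the "obvious thing" regex was tightened twice, .* to \S+ and pdf to \.pdf, so an href that merely ends in the letters "pdf" no longer counts as a PDF link.

import re

# Illustrative href that ends in "pdf" but is not a .pdf file.
html_text = '<a href="/topics/grab-a-pdf">About PDFs</a>'

old = re.search(r'href="(.*pdf)"', html_text)
new = re.search(r'href="(\S+\.pdf)"', html_text)
print(old.group(1))  # "/topics/grab-a-pdf" -- a false positive under the old pattern
print(new)           # None -- the new pattern requires a literal ".pdf" suffix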
37 changes: 25 additions & 12 deletions tests/test_paperscraper.py
@@ -9,6 +9,7 @@
 from unittest.mock import MagicMock

 import aiohttp
+import pytest
 from pybtex.database import parse_string

 import paperscraper
@@ -278,18 +279,30 @@ async def test_openaccess_scraper(self) -> None:
                 {"openAccessPdf": None}, MagicMock(), MagicMock()
             )

-        with tempfile.TemporaryDirectory() as tmpdir:
-            await openaccess_scraper(
-                {
-                    "openAccessPdf": {
-                        "url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6506413/"
-                    }
-                },
-                os.path.join(tmpdir, "test.pdf"),
-                ThrottledClientSession(
-                    rate_limit=RateLimits.SCRAPER.value, headers=get_header()
-                ),
-            )
+        async with ThrottledClientSession(
+            rate_limit=RateLimits.SCRAPER.value, headers=get_header()
+        ) as session:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                await openaccess_scraper(
+                    {
+                        "openAccessPdf": {
+                            "url": "https://pubs.acs.org/doi/abs/10.1021/acs.nanolett.0c00513"
+                        }
+                    },
+                    os.path.join(tmpdir, "test1.pdf"),
+                    session,
+                )
+                with pytest.raises(RuntimeError, match="No PDF link"):
+                    # Confirm we can regex parse without a malformed URL error
+                    await openaccess_scraper(
+                        {
+                            "openAccessPdf": {
+                                "url": "https://www.annualreviews.org/doi/full/10.1146/annurev-physchem-042018-052331"
+                            }
+                        },
+                        os.path.join(tmpdir, "test2.pdf"),
+                        session,
+                    )

     async def test_pubmed_to_pdf(self):
         path = "test.pdf"
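
One note on the new test (the snippet below is a toy example, not repository code): pytest.raises(..., match=...) treats its argument as a regular expression and re.search-es it against the string form of the raised exception, so the short "No PDF link" pattern is enough to match the longer RuntimeError message raised when no PDF href is found.

import pytest

def test_match_uses_re_search() -> None:
    # match= is re.search-ed against str(exc), so a substring is sufficient.
    with pytest.raises(RuntimeError, match="No PDF link"):
        raise RuntimeError(
            "No PDF link matched for PubMed Central ID PMC123 from URL https://example.org."
        )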
