Skip to content

Commit

Permalink
feat(backscrapers): refactor make_backscrape_iterable; make days inte…
Browse files Browse the repository at this point in the history
…rval dynamic

Solves freelawproject#1095

- Update sample_caller to catch `--days-interval` optional keyword argument
- Refactor the make_backscrape_iterable that used days_interval into an AbstractSite default; all scrapers that used the same pattern are affected
- Changed default behaviour of make_backscrape_iterable to assume dates are passed in %Y/%m/%d, a more sensible format than %m/%d/%Y
- Also, add logger.info calls for the start and end date of download_backwards to all the scrapers that did not have it
  • Loading branch information
grossir committed Aug 6, 2024
1 parent 333e1dd commit d335289
Show file tree
Hide file tree
Showing 18 changed files with 69 additions and 404 deletions.
56 changes: 55 additions & 1 deletion juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import hashlib
import json
from datetime import date, datetime
from typing import Dict, List, Tuple

import certifi
import requests

from juriscraper.lib.date_utils import fix_future_year_typo, json_date_handler
from juriscraper.lib.date_utils import (
fix_future_year_typo,
json_date_handler,
make_date_range_tuples,
)
from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.html_utils import (
clean_html,
Expand Down Expand Up @@ -421,6 +426,55 @@ def _download_backwards(self, d):
# methods for downloading the entire Site
pass

def make_backscrape_iterable(self, kwargs: Dict) -> None:
    """Creates back_scrape_iterable in the most common variation:
    a list of (start, end) date tuples, each of `days_interval` size.

    Uses default attributes of the scraper as a fallback when the
    expected keyword arguments are not present in the kwargs input.

    :param kwargs: if the following keys are present, use them
        backscrape_start: str in "%Y/%m/%d" format;
            Default: self.first_opinion_date
        backscrape_end: str in "%Y/%m/%d" format;
            Default: today's date
        days_interval: int; Default: self.days_interval
    :return: None; sets self.back_scrape_iterable in place
    """
    start = kwargs.get("backscrape_start")
    end = kwargs.get("backscrape_end")
    days_interval = kwargs.get("days_interval")

    if start:
        start = datetime.strptime(start, "%Y/%m/%d")
    elif hasattr(self, "first_opinion_date"):
        start = self.first_opinion_date
    else:
        # Best effort: warn and continue; make_date_range_tuples will
        # fail loudly downstream if `start` is actually required
        logger.warning(
            "No `backscrape_start` argument passed; and scraper has no `first_opinion_date` default"
        )

    if end:
        end = datetime.strptime(end, "%Y/%m/%d")
    else:
        end = datetime.now().date()

    if not days_interval:
        if hasattr(self, "days_interval"):
            days_interval = self.days_interval
        else:
            logger.warning(
                "No `days_interval` argument passed; and scraper has no default"
            )

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, days_interval
    )

@staticmethod
def cleanup_content(content):
"""
Expand Down
25 changes: 0 additions & 25 deletions juriscraper/opinions/united_states/administrative_agency/olc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -65,27 +64,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.url = f"{self.base_url}?{urlencode(params)}"
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
26 changes: 1 addition & 25 deletions juriscraper/opinions/united_states/federal_appellate/ca1.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -63,6 +62,7 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
:return None
"""
start, end = dates
logger.info("Backscraping for range %s %s", *dates)
params = {
"field_opn_csno_value_op": "starts",
"field_opn_issdate_value[min][date]": start.strftime("%m/%d/%Y"),
Expand All @@ -71,27 +71,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.url = f"{self.base_url}?{urlencode(params)}"
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
24 changes: 0 additions & 24 deletions juriscraper/opinions/united_states/federal_bankruptcy/bap10.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -77,26 +76,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.set_url(*dates)
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
24 changes: 0 additions & 24 deletions juriscraper/opinions/united_states/federal_special/nmcca.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -74,26 +73,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.url = f"{self.base_url}?{urlencode(params)}"
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
25 changes: 0 additions & 25 deletions juriscraper/opinions/united_states/federal_special/tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from typing import Tuple

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

Expand Down Expand Up @@ -109,30 +108,6 @@ def _get_url(self, docket_number: str, docketEntryId: str) -> str:
pdf_url = super()._download()["url"]
return pdf_url

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )

def _download_backwards(self, dates: Tuple[date]) -> None:
"""Make custom date range request to the API
Expand Down
25 changes: 0 additions & 25 deletions juriscraper/opinions/united_states/state/ark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.lib.string_utils import normalize_dashes, titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

Expand Down Expand Up @@ -121,27 +120,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.set_url(*dates)
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
26 changes: 1 addition & 25 deletions juriscraper/opinions/united_states/state/colo.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from urllib.parse import urlencode

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -119,6 +118,7 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
:param dates: (start_date, end_date) tuple
:return None
"""
logger.info("Backscraping for range %s %s", *dates)
start = dates[0].strftime("%Y-%m-%d")
end = dates[1].strftime("%Y-%m-%d")
timestamp = str(datetime.now().timestamp())[:10]
Expand All @@ -135,27 +135,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
self.url = f"{self.base_url}?{urlencode(params)}"
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
28 changes: 1 addition & 27 deletions juriscraper/opinions/united_states/state/fla.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
# Court Short Name: fla

from datetime import date, datetime, timedelta
from typing import Dict, Optional, Tuple
from typing import Optional, Tuple

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import make_date_range_tuples
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


Expand Down Expand Up @@ -75,28 +74,3 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
"""
self.set_url(*dates)
logger.info("Backscraping for range %s %s", *dates)

def make_backscrape_iterable(self, kwargs: Dict) -> None:
    """Parse backscrape date-range arguments if the caller supplied
    them; otherwise fall back to the scraper's defaults.

    :param kwargs: passed when initializing the scraper, may or
    may not contain backscrape controlling arguments
    :return None
    """
    raw_start = kwargs.get("backscrape_start")
    raw_end = kwargs.get("backscrape_end")

    if raw_start:
        start = datetime.strptime(raw_start, "%m/%d/%Y")
    else:
        start = self.first_opinion_date

    if raw_end:
        end = datetime.strptime(raw_end, "%m/%d/%Y")
    else:
        end = datetime.now()

    self.back_scrape_iterable = make_date_range_tuples(
        start, end, self.days_interval
    )
Loading

0 comments on commit d335289

Please sign in to comment.