Skip to content

Commit

Permalink
adding new nyag scraper for New York Attorney General. A backscraper …
Browse files Browse the repository at this point in the history
…is included and should be run after deployment. It will capture 533 cases and take only about 1 second to run. Relates to #168
  • Loading branch information
arderyp committed Feb 4, 2017
1 parent c2f0710 commit c253180
Show file tree
Hide file tree
Showing 3 changed files with 6,568 additions and 0 deletions.
1 change: 1 addition & 0 deletions juriscraper/opinions/united_states/state/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
'nj',
'njsuperctappdiv',
'ny',
'nyag',
'nyappdiv_1st',
'nyappdiv_2nd',
'nyappdiv_3rd',
Expand Down
62 changes: 62 additions & 0 deletions juriscraper/opinions/united_states/state/nyag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""Scraper for the California Attorney General
CourtID: nyag
Court Short Name: New York Attorney General
"""

import datetime

from juriscraper.OpinionSite import OpinionSite
from juriscraper.lib.string_utils import convert_date_string


class Site(OpinionSite):
    """Scraper for the New York Attorney General numerical opinion index.

    Rows are read from the table on the index page whose caption contains
    ``self.year``. Only the year is known for each opinion, so every date is
    an estimate and is flagged as approximate.
    """

    def __init__(self, *args, **kwargs):
        super(Site, self).__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.year = datetime.date.today().year
        self.url = "https://ag.ny.gov/appeals-and-opinions/numerical-index"
        # One back-scrape step per year, from 1995 through the current year.
        self.back_scrape_iterable = range(1995, self.year + 1)
        # Placeholders; set_paths() fills in the real XPaths just below.
        self.row_path = False
        self.cell_path = False
        self.set_paths()

    def _download(self, request_dict=None):
        """Download the page and, for local example files, sync the year.

        :param request_dict: options forwarded to the parent ``_download``;
            ``None`` (the default) is replaced by a fresh empty dict so no
            mutable default is shared between calls.
        :return: whatever the parent ``_download`` returns.
        """
        if request_dict is None:
            request_dict = {}
        html = super(Site, self)._download(request_dict)
        if self.method == 'LOCAL':
            # Make sure the year-table you want to test is first in example file
            self.year = int(html.xpath('//table[1]/caption')[0].text_content())
            self.set_paths()
        return html

    def _get_case_dates(self):
        """All we have are years, so estimate middle most day of year"""
        row_count = len(self.html.xpath(self.row_path))
        return [convert_date_string('July 2, %d' % self.year)] * row_count

    def _get_case_names(self):
        """No case names available"""
        return ["Untitled New York Attorney General Opinion"] * len(self.case_dates)

    def _get_download_urls(self):
        """Return every link href found in the fourth cell of each row."""
        path = '%s//a/@href' % (self.cell_path % 4)
        return list(self.html.xpath(path))

    def _get_docket_numbers(self):
        """Return the stripped text of the first cell of each row."""
        return [cell.text_content().strip() for cell in self.html.xpath(self.cell_path % 1)]

    def _get_precedential_statuses(self):
        """Mark every opinion as 'Published'."""
        return ['Published'] * len(self.case_dates)

    def _get_summaries(self):
        """Use Abstract column value"""
        return [cell.text_content().strip() for cell in self.html.xpath(self.cell_path % 2)]

    def _get_date_filed_is_approximate(self):
        """Every date is an estimate (see _get_case_dates), so flag them all."""
        return [True] * len(self.case_dates)

    def _download_backwards(self, year):
        """Point the scraper at ``year`` and rebuild the XPaths to match."""
        self.year = year
        self.set_paths()

    def set_paths(self):
        """Rebuild the row and cell XPaths for the current ``self.year``.

        ``cell_path`` keeps a ``%d`` placeholder for the column index that
        the ``_get_*`` methods fill in.
        """
        self.row_path = '//table[contains(caption, "%d")]/tbody/tr' % self.year
        self.cell_path = self.row_path + '/td[%d]'
Loading

0 comments on commit c253180

Please sign in to comment.