adding calag scraper for California Attorney General. This scraper in…

…cludes a backscraper that should be run after deployment. Related to #168
freelawproject · Feb 4, 2017 · c2f0710 · c2f0710
1 parent 73b6fef
commit c2f0710
Show file tree

Hide file tree

Showing 3 changed files with 984 additions and 0 deletions.
diff --git a/juriscraper/opinions/united_states/state/__init__.py b/juriscraper/opinions/united_states/state/__init__.py
@@ -8,6 +8,7 @@
     'ark',
     'arkctapp',
     'cal',
+    'calag',
     'calctapp_1st',
     'calctapp_2nd',
     'calctapp_3rd',

diff --git a/juriscraper/opinions/united_states/state/calag.py b/juriscraper/opinions/united_states/state/calag.py
@@ -0,0 +1,52 @@
+"""Scraper for the California Attorney General
+CourtID: calag
+Court Short Name: California Attorney General
+"""
+
+import datetime
+
+from juriscraper.OpinionSite import OpinionSite
+from juriscraper.lib.string_utils import convert_date_string
+
+
+class Site(OpinionSite):
+    def __init__(self, *args, **kwargs):
+        super(Site, self).__init__(*args, **kwargs)
+        self.court_id = self.__module__
+        self.year = datetime.date.today().year
+        self.url_base = "https://oag.ca.gov/opinions/yearly-index?conclusion-year[value][year]="
+        self.url = self.url_base + str(self.year)
+        self.back_scrape_iterable = range(1985, self.year + 1)
+        self.rows_path = '//tbody/tr'
+        self.cell_path = self.rows_path + '/td[%d]'
+
+    def _get_case_names(self):
+        """No case names available"""
+        return ["Untitled California Attorney General Opinion"] * len(self.html.xpath(self.rows_path))
+
+    def _get_download_urls(self):
+        path = '%s//a/@href' % (self.cell_path % 1)
+        return [href for href in self.html.xpath(path)]
+
+    def _get_case_dates(self):
+        cells = self.html.xpath(self.cell_path % 4)
+        return [convert_date_string(cell.text_content().strip()) for cell in cells]
+
+    def _get_docket_numbers(self):
+        return [cell.text_content().strip() for cell in self.html.xpath(self.cell_path % 1)]
+
+    def _get_precedential_statuses(self):
+        return ['Published'] * len(self.case_names)
+
+    def _get_summaries(self):
+        """Combine the Questions and Conclusions Columns"""
+        summaries = []
+        for row in self.html.xpath(self.rows_path):
+            questions = row.xpath('./td[2]')[0].text_content()
+            conclusions = row.xpath('./td[3]')[0].text_content()
+            summaries.append('QUESTIONS: %s CONCLUSIONS: %s' % (questions, conclusions))
+        return summaries
+
+    def _download_backwards(self, year):
+        self.url = self.url_base + str(year)
+        self.html = self._download()