Make elasticsearch full-text search functional #190

Closed · wants to merge 15 commits
1 change: 1 addition & 0 deletions pupa/cli/__main__.py
@@ -15,6 +15,7 @@
     'pupa.cli.commands.dbinit',
     'pupa.cli.commands.update',
     'pupa.cli.commands.dump',
+    'pupa.cli.commands.elasticsearch_push'
 )


32 changes: 32 additions & 0 deletions pupa/cli/commands/elasticsearch_push.py
@@ -0,0 +1,32 @@
from opencivicdata.models.bill import Bill

from .base import BaseCommand
from pupa.utils.fulltext import bill_to_elasticsearch
from pupa.core import elasticsearch


class Command(BaseCommand):
    name = 'elasticsearch-push'
    help = 'Push information about `Bill` objects to the elasticsearch instance'

    def add_args(self):
        self.add_argument('jurisdictions', type=str, nargs='+',
                          help="Full OCD jurisdiction IDs to push; pass `all` to push all")

    def handle(self, args, other):
        if elasticsearch is None:
            raise AssertionError(
                "No elasticsearch instance found; make sure it's configured")

        if args.jurisdictions == ['all', ]:
            for bill in Bill.objects.all():
                elasticsearch.index(
                    index='ocd', doc_type='bill', id=bill.id,
                    doc=bill_to_elasticsearch(bill))

        else:
            for jurisdiction in args.jurisdictions:
                for bill in Bill.objects.filter(from_organization__jurisdiction=jurisdiction):
                    elasticsearch.index(
                        index='ocd', doc_type='bill', id=bill.id,
                        doc=bill_to_elasticsearch(bill))
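For reference, assuming pupa's usual console entry point, a push would be invoked roughly like this (the jurisdiction ID below is a made-up example):

pupa elasticsearch-push ocd-jurisdiction/country:us/state:nc/government
pupa elasticsearch-push all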
1 change: 1 addition & 0 deletions pupa/core/__init__.py
@@ -0,0 +1 @@
from .elasticsearch import elasticsearch
10 changes: 10 additions & 0 deletions pupa/core/elasticsearch.py
@@ -0,0 +1,10 @@
import pyelasticsearch

from pupa import settings


elasticsearch = None
if settings.ENABLE_ELASTICSEARCH:
    elasticsearch = pyelasticsearch.ElasticSearch(
        urls='http://{}'.format(settings.ELASTICSEARCH_HOST),
        timeout=settings.ELASTICSEARCH_TIMEOUT)
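With ENABLE_ELASTICSEARCH turned on, other code can reuse this module-level client. A minimal full-text query sketch against the documents this PR pushes (the match query and search term are illustrative, not part of the diff):

from pupa.core import elasticsearch

# Search the bill text pushed by `elasticsearch-push`
results = elasticsearch.search(
    {'query': {'match': {'text': 'education funding'}}},
    index='ocd', doc_type='bill')
for hit in results['hits']['hits']:
    print(hit['_source']['identifier'], hit['_source']['jurisdiction'])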
4 changes: 2 additions & 2 deletions pupa/settings.py
@@ -17,8 +17,8 @@
 CACHE_DIR = os.path.join(os.getcwd(), '_cache')
 SCRAPED_DATA_DIR = os.path.join(os.getcwd(), '_data')

-ENABLE_ELASTICSEARCH = False
-ELASTICSEARCH_HOST = 'localhost'
+ENABLE_ELASTICSEARCH = os.environ.get('ENABLE_ELASTICSEARCH', False)
+ELASTICSEARCH_HOST = os.environ.get('ELASTICSEARCH_HOST', 'localhost:9200')
 ELASTICSEARCH_TIMEOUT = 2

 # dump settings
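One caveat with the new settings: os.environ.get returns the raw string whenever the variable is set, so even ENABLE_ELASTICSEARCH=false would evaluate truthy. A stricter parse, as a sketch rather than part of the diff:

# Treat only explicit opt-in values as true
ENABLE_ELASTICSEARCH = os.environ.get('ENABLE_ELASTICSEARCH', '').lower() in ('1', 'true', 'yes')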
119 changes: 119 additions & 0 deletions pupa/utils/fulltext.py
@@ -0,0 +1,119 @@
import os
import subprocess

import lxml.html
import requests
import scrapelib


s = scrapelib.Scraper()


def html_to_text(response):
    doc = lxml.html.fromstring(response.text)
    text = doc.text_content()
    return text


def pdf_to_text(response):
    # Save the PDF locally so that `pdftotext` can read it
    cache_dir = os.path.join(os.getcwd(), '_cache')
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    local_filename = os.path.join(cache_dir, response.url.split('/')[-1])
    with open(local_filename, 'wb') as pdf_file:
        for block in response.iter_content(1024):
            if block:
                pdf_file.write(block)

    text = ''
    try:
        # `pdftotext` ships with poppler-utils and must be on the PATH
        pipe = subprocess.Popen(['pdftotext', '-layout', local_filename, '-'],
                                stdout=subprocess.PIPE,
                                close_fds=True).stdout
    except OSError as e:
        # Couldn't launch `pdftotext`; fall through and return empty text
        print('Unable to parse the bill PDF\n{}'.format(e))
    else:
        text = pipe.read().decode('utf-8')
        pipe.close()

    os.remove(local_filename)
    return text


def clean_text(text):
    text = ' '.join(text.split())

    return text


def version_to_text(version):
    # Fetch the text of a bill version, preferring HTML links over PDF ones
    text = ''

    r = None
    filetype = None
    preferred_mimetypes = ['text/html', 'application/pdf', ]
    for mimetype in preferred_mimetypes:
        for link_obj in version.links.all():
            if link_obj.media_type == mimetype:
                try:
                    r = s.get(link_obj.url)
                except (scrapelib.HTTPError, requests.exceptions.ReadTimeout):
                    pass
                else:
                    filetype = mimetype.split('/')[-1]
                    # Stop at the first link that downloads successfully
                    break
        if filetype:
            break

    if filetype == 'html':
        text = html_to_text(r)
    elif filetype == 'pdf':
        text = pdf_to_text(r)

    text = clean_text(text)
    return text


def bill_to_elasticsearch(bill):
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

    es_bill = {
        'jurisdiction': bill.get_jurisdiction_name(),
        'session': bill.get_session_name(),
        'identifier': bill.identifier,
        'subjects': bill.subject,
        'classifications': bill.classification,
        'updated_at': bill.updated_at.strftime(DATETIME_FORMAT),
        'created_at': bill.created_at.strftime(DATETIME_FORMAT),
    }

    es_bill['titles'] = [bill.title, ]
    for other_title in bill.other_titles.all():
        es_bill['titles'].append(other_title.title)

    organization = bill.from_organization
    es_bill['organizations'] = [organization.name, ]
    for ancestor in organization.get_parents():
        es_bill['organizations'].append(ancestor.name)

    es_bill['sponsors'] = []
    for sponsor in bill.sponsorships.all():
        es_bill['sponsors'].append(sponsor.name)

    es_bill['action_dates'] = []
    for action in bill.actions.all():
        es_bill['action_dates'].append(action.date)
    es_bill['action_dates'] = sorted(set(es_bill['action_dates']))

    # Gather the text of the most recent bill version (latest `date`);
    # ties on date fall back to the database's ordering
    es_bill['text'] = ''
    latest_version = bill.versions.order_by('date').last()
    if latest_version:
        es_bill['text'] = version_to_text(latest_version)

    return es_bill
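To illustrate the resulting shape, here is roughly what bill_to_elasticsearch would emit for a hypothetical bill (all values invented; the keys match the code above):

{
    'jurisdiction': 'North Carolina',
    'session': '2013-2014 Session',
    'identifier': 'HB 123',
    'subjects': ['education'],
    'classifications': ['bill'],
    'updated_at': '2014-06-01T12:00:00',
    'created_at': '2014-05-01T09:30:00',
    'titles': ['An Act to Fund Public Schools'],
    'organizations': ['House', 'General Assembly'],
    'sponsors': ['Jane Doe'],
    'action_dates': ['2014-05-01', '2014-05-15'],
    'text': 'A BILL TO BE ENTITLED ...',
}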
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -8,6 +8,7 @@ psycopg2
 scrapelib
 validictory
 pytz
+pyelasticsearch

 mock
 pytest