Make elasticsearch full-text search functional #190

Closed · wants to merge 15 commits
1 change: 1 addition & 0 deletions pupa/cli/__main__.py
@@ -15,6 +15,7 @@
     'pupa.cli.commands.dbinit',
     'pupa.cli.commands.update',
     'pupa.cli.commands.dump',
+    'pupa.cli.commands.elasticsearch_push'
 )


32 changes: 32 additions & 0 deletions pupa/cli/commands/elasticsearch_push.py
@@ -0,0 +1,32 @@
from opencivicdata.models.bill import Bill

from .base import BaseCommand
from pupa.utils.fulltext import bill_to_elasticsearch
from pupa.core import elasticsearch


class Command(BaseCommand):
    name = 'elasticsearch-push'
    help = 'Push information about `Bill` objects to the elasticsearch instance'

    def add_args(self):
        self.add_argument('jurisdictions', type=str, nargs='+',
                          help="Full OCD jurisdiction IDs to push; pass `all` to push all")

    def handle(self, args, other):
        if elasticsearch is None:
            raise AssertionError(
                "No elasticsearch instance found; make sure it's configured")

        if args.jurisdictions == ['all', ]:
            for bill in Bill.objects.all():
                elasticsearch.index(
                    index='ocd', doc_type='bill', id=bill.id,
                    doc=bill_to_elasticsearch(bill))

        else:
            for jurisdiction in args.jurisdictions:
                for bill in Bill.objects.filter(from_organization__jurisdiction=jurisdiction):
                    elasticsearch.index(
                        index='ocd', doc_type='bill', id=bill.id,
                        doc=bill_to_elasticsearch(bill))
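For reference, assuming pupa's usual console entry point, a push would be invoked roughly like this (the jurisdiction ID below is a made-up example):

pupa elasticsearch-push ocd-jurisdiction/country:us/state:nc/government
pupa elasticsearch-push all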
1 change: 1 addition & 0 deletions pupa/core/__init__.py
@@ -0,0 +1 @@
from .elasticsearch import elasticsearch
10 changes: 10 additions & 0 deletions pupa/core/elasticsearch.py
@@ -0,0 +1,10 @@
import pyelasticsearch

from pupa import settings


elasticsearch = None
if settings.ENABLE_ELASTICSEARCH:
    elasticsearch = pyelasticsearch.ElasticSearch(
        urls='http://{}'.format(settings.ELASTICSEARCH_HOST),
        timeout=settings.ELASTICSEARCH_TIMEOUT)
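With ENABLE_ELASTICSEARCH turned on, other code can reuse this module-level client. A minimal full-text query sketch against the documents this PR pushes (the match query and search term are illustrative, not part of the diff):

from pupa.core import elasticsearch

# Search the bill text pushed by `elasticsearch-push`
results = elasticsearch.search(
    {'query': {'match': {'text': 'education funding'}}},
    index='ocd', doc_type='bill')
for hit in results['hits']['hits']:
    print(hit['_source']['identifier'], hit['_source']['jurisdiction'])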
4 changes: 2 additions & 2 deletions pupa/settings.py
@@ -17,8 +17,8 @@
 CACHE_DIR = os.path.join(os.getcwd(), '_cache')
 SCRAPED_DATA_DIR = os.path.join(os.getcwd(), '_data')

-ENABLE_ELASTICSEARCH = False
-ELASTICSEARCH_HOST = 'localhost'
+ENABLE_ELASTICSEARCH = os.environ.get('ENABLE_ELASTICSEARCH', False)
+ELASTICSEARCH_HOST = os.environ.get('ELASTICSEARCH_HOST', 'localhost:9200')
 ELASTICSEARCH_TIMEOUT = 2

 # dump settings
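One caveat with the new settings: os.environ.get returns the raw string whenever the variable is set, so even ENABLE_ELASTICSEARCH=false would evaluate truthy. A stricter parse, as a sketch rather than part of the diff:

# Treat only explicit opt-in values as true
ENABLE_ELASTICSEARCH = os.environ.get('ENABLE_ELASTICSEARCH', '').lower() in ('1', 'true', 'yes')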
119 changes: 119 additions & 0 deletions pupa/utils/fulltext.py
@@ -0,0 +1,119 @@
import os
import subprocess

import lxml.html
import requests
import scrapelib


s = scrapelib.Scraper()


def html_to_text(response):
    doc = lxml.html.fromstring(response.text)
    text = doc.text_content()
    return text


def pdf_to_text(response):
    # Save the PDF locally so that `pdftotext` can read it
    cache_dir = os.path.join(os.getcwd(), '_cache')
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    local_filename = os.path.join(cache_dir, response.url.split('/')[-1])
    with open(local_filename, 'wb') as pdf_file:
        for block in response.iter_content(1024):
            if block:
                pdf_file.write(block)

    text = ''
    try:
        # `pdftotext` ships with poppler-utils and must be on the PATH
        pipe = subprocess.Popen(['pdftotext', '-layout', local_filename, '-'],
                                stdout=subprocess.PIPE,
                                close_fds=True).stdout
    except OSError as e:
        # Couldn't launch `pdftotext`; fall through and return empty text
        print('Unable to parse the bill PDF\n{}'.format(e))
    else:
        text = pipe.read().decode('utf-8')
        pipe.close()

    os.remove(local_filename)
    return text


def clean_text(text):
    text = ' '.join(text.split())

    return text


def version_to_text(version):
    # Fetch the text of a bill version, preferring HTML links over PDF ones
    text = ''

    r = None
    filetype = None
    preferred_mimetypes = ['text/html', 'application/pdf', ]
    for mimetype in preferred_mimetypes:
        for link_obj in version.links.all():
            if link_obj.media_type == mimetype:
                try:
                    r = s.get(link_obj.url)
                except (scrapelib.HTTPError, requests.exceptions.ReadTimeout):
                    pass
                else:
                    filetype = mimetype.split('/')[-1]
                    # Stop at the first link that downloads successfully
                    break
        if filetype:
            break

    if filetype == 'html':
        text = html_to_text(r)
    elif filetype == 'pdf':
        text = pdf_to_text(r)

    text = clean_text(text)
    return text


def bill_to_elasticsearch(bill):
    DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

    es_bill = {
        'jurisdiction': bill.get_jurisdiction_name(),
        'session': bill.get_session_name(),
        'identifier': bill.identifier,
        'subjects': bill.subject,
        'classifications': bill.classification,
        'updated_at': bill.updated_at.strftime(DATETIME_FORMAT),
        'created_at': bill.created_at.strftime(DATETIME_FORMAT),
    }

    es_bill['titles'] = [bill.title, ]
    for other_title in bill.other_titles.all():
        es_bill['titles'].append(other_title.title)

    organization = bill.from_organization
    es_bill['organizations'] = [organization.name, ]
    for ancestor in organization.get_parents():
        es_bill['organizations'].append(ancestor.name)

    es_bill['sponsors'] = []
    for sponsor in bill.sponsorships.all():
        es_bill['sponsors'].append(sponsor.name)

    es_bill['action_dates'] = []
    for action in bill.actions.all():
        es_bill['action_dates'].append(action.date)
    es_bill['action_dates'] = sorted(set(es_bill['action_dates']))

    # Gather the text of the most recent bill version (latest `date`);
    # ties on date fall back to the database's ordering
    es_bill['text'] = ''
    latest_version = bill.versions.order_by('date').last()
    if latest_version:
        es_bill['text'] = version_to_text(latest_version)

    return es_bill
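To illustrate the resulting shape, here is roughly what bill_to_elasticsearch would emit for a hypothetical bill (all values invented; the keys match the code above):

{
    'jurisdiction': 'North Carolina',
    'session': '2013-2014 Session',
    'identifier': 'HB 123',
    'subjects': ['education'],
    'classifications': ['bill'],
    'updated_at': '2014-06-01T12:00:00',
    'created_at': '2014-05-01T09:30:00',
    'titles': ['An Act to Fund Public Schools'],
    'organizations': ['House', 'General Assembly'],
    'sponsors': ['Jane Doe'],
    'action_dates': ['2014-05-01', '2014-05-15'],
    'text': 'A BILL TO BE ENTITLED ...',
}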
1 change: 1 addition & 0 deletions requirements-test.txt
@@ -8,6 +8,7 @@ psycopg2
 scrapelib
 validictory
 pytz
+pyelasticsearch

 mock
 pytest