Merge pull request #4192 from freelawproject/4087-feat-link-sessions-to-proxy

feat(pacer): Link pacer sessions to proxies
mlissner authored Jul 30, 2024
2 parents ea0e036 + 6f31ed8 commit 723b7ec
Showing 26 changed files with 471 additions and 194 deletions.
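
Across these files, the commit replaces bare session.cookies arguments with a SessionData value imported from cl.lib.pacer_session, pairing the cookies with the proxy the session logged in through. Judging only from the call sites below, which construct it positionally as SessionData(session.cookies, session.proxy_address), a minimal sketch of the container could look like this (an inference, not the actual definition):

    # Hypothetical reconstruction of cl/lib/pacer_session.SessionData,
    # inferred from the positional calls SessionData(cookies, proxy_address)
    # in this diff; the real definition may differ.
    from dataclasses import dataclass

    from requests.cookies import RequestsCookieJar


    @dataclass
    class SessionData:
        cookies: RequestsCookieJar  # cookie jar from the logged-in PACER session
        proxy_address: str  # egress proxy the login went through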
3 changes: 2 additions & 1 deletion cl/api/webhooks.py
@@ -1,4 +1,5 @@
import json
+import random

import requests
from django.conf import settings
@@ -38,7 +39,7 @@ def send_webhook_event(
the webhook is sent.
"""
proxy_server = {
"http": settings.EGRESS_PROXY_HOST, # type: ignore
"http": random.choice(settings.EGRESS_PROXY_HOSTS), # type: ignore
}
headers = {
"Content-type": "application/json",
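
The webhook sender now picks one proxy per request from a pool, so EGRESS_PROXY_HOST (a single URL) presumably becomes EGRESS_PROXY_HOSTS (a list) in Django settings. A sketch of that assumed settings change, with placeholder hostnames:

    # Assumed shape of the settings change backing the hunk above; the
    # hostnames are illustrative placeholders, not real infrastructure.
    EGRESS_PROXY_HOSTS = [
        "http://proxy1.internal:9090",
        "http://proxy2.internal:9090",
    ]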
15 changes: 8 additions & 7 deletions cl/corpus_importer/bulk_utils.py
@@ -6,7 +6,7 @@
from cl.corpus_importer.tasks import get_pacer_doc_by_rd
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.scrapers.tasks import extract_recap_pdf
@@ -75,10 +75,10 @@ def get_petitions(
)
q = options["queue"]
throttle = CeleryThrottle(queue_name=q)
-    pacer_session = ProxyPacerSession(
+    session = ProxyPacerSession(
username=pacer_username, password=pacer_password
)
-    pacer_session.login()
+    session.login()
for i, rd_pk in enumerate(rds):
if i < options["offset"]:
i += 1
@@ -87,17 +87,18 @@
break

if i % 1000 == 0:
-            pacer_session = ProxyPacerSession(
+            session = ProxyPacerSession(
username=pacer_username, password=pacer_password
)
-            pacer_session.login()
+            session.login()
logger.info(f"Sent {i} tasks to celery so far.")
logger.info("Doing row %s", i)
throttle.maybe_wait()

chain(
get_pacer_doc_by_rd.s(
-                rd_pk, pacer_session.cookies, tag=tag_petitions
+                rd_pk,
+                SessionData(session.cookies, session.proxy_address),
+                tag=tag_petitions,
).set(queue=q),
extract_recap_pdf.si(rd_pk).set(queue=q),
add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
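
On the worker side, tasks such as get_pacer_doc_by_rd used to receive a cookie jar and now receive the SessionData pair, presumably so they can rebuild a session that reuses both the cookies and the proxy those cookies were issued through. A hedged sketch of the receiving end (the real task body is not shown in this diff, and the constructor kwargs are assumptions):

    # Hypothetical worker-side pattern; the real task lives in
    # cl/corpus_importer/tasks.py and may differ.
    @app.task(bind=True)
    def get_pacer_doc_by_rd(self, rd_pk, session_data, tag=None):
        # Re-attach to the same egress proxy the login used, so the
        # PACER cookies remain valid (assumed constructor signature).
        session = ProxyPacerSession(
            cookies=session_data.cookies,
            proxy_address=session_data.proxy_address,
        )
        ...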
9 changes: 5 additions & 4 deletions cl/corpus_importer/management/commands/760_project.py
@@ -13,7 +13,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Court, RECAPDocument
from cl.search.tasks import add_or_update_recap_docket

@@ -36,6 +36,7 @@ def get_dockets(options):
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()
+    session_data = SessionData(session.cookies, session.proxy_address)
for i, row in enumerate(reader):
if i < options["offset"]:
continue
@@ -55,7 +56,7 @@
get_appellate_docket_by_docket_number.s(
docket_number=row["Cleaned case_No"],
court_id=row["fjc_court_id"],
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=[TAG],
**{
"show_docket_entries": True,
@@ -75,12 +76,12 @@
pass_through=None,
docket_number=row["Cleaned case_No"],
court_id=row["fjc_court_id"],
-                cookies=session.cookies,
+                session_data=session_data,
case_name=row["Title"],
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row["fjc_court_id"],
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=[TAG],
**{
"show_parties_and_counsel": True,
11 changes: 6 additions & 5 deletions cl/corpus_importer/management/commands/adelman_david.py
@@ -12,7 +12,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import CommandUtils, VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.tasks import add_or_update_recap_docket

PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME)
@@ -33,6 +33,7 @@ def download_dockets(options):
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()
+    session_data = SessionData(session.cookies, session.proxy_address)
for i, row in enumerate(reader):
if i < options["offset"]:
continue
@@ -48,7 +49,7 @@
get_appellate_docket_by_docket_number.s(
docket_number=row["docket_no1"],
court_id=row["cl_court"],
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
# Do not get the docket entries for now. We're only
# interested in the date terminated. If it's an open case,
@@ -71,17 +72,17 @@
pass_through=None,
docket_number=row["docket_no1"],
court_id=row["cl_court"],
-                cookies=session.cookies,
+                session_data=session_data,
case_name=row["name"],
).set(queue=q),
do_case_query_by_pacer_case_id.s(
court_id=row["cl_court"],
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row["cl_court"],
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
**{
# No docket entries
8 changes: 5 additions & 3 deletions cl/corpus_importer/management/commands/buchwald_project.py
@@ -13,7 +13,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Docket
from cl.search.tasks import add_or_update_recap_docket

@@ -59,7 +59,7 @@ def add_all_nysd_to_cl(options):
throttle.maybe_wait()
logger.info("Doing pacer_case_id: %s", pacer_case_id)
make_docket_by_iquery.apply_async(
args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
args=("nysd", pacer_case_id, "default", [NYSD_TAG]),
queue=q,
)

@@ -104,7 +104,9 @@ def get_dockets(options):
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
-                cookies=session.cookies,
+                session_data=SessionData(
+                    session.cookies, session.proxy_address
+                ),
docket_pk=d.pk,
tag_names=[BUCKWALD_TAG],
**{
@@ -7,7 +7,7 @@
from cl.corpus_importer.tasks import get_docket_by_pacer_case_id
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.search.models import Docket
@@ -64,7 +64,10 @@ def get_pacer_dockets(options, docket_pks, tags):
get_docket_by_pacer_case_id.s(
{"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
d.court_id,
-            cookies=pacer_session.cookies,
+            session_data=SessionData(
+                pacer_session.cookies,
+                pacer_session.proxy_address,
+            ),
tag_names=tags,
**{
"show_parties_and_counsel": True,
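
Every command here reads session.proxy_address immediately after login, which implies ProxyPacerSession now pins one egress proxy at construction time instead of letting each request pick its own. A hedged sketch of that idea (the real class lives in cl/lib/pacer_session; the base-class import and kwargs are assumptions):

    import random

    from django.conf import settings
    from juriscraper.pacer.http import PacerSession  # assumed base class


    class ProxyPacerSession(PacerSession):
        """A PACER session whose traffic all exits through one proxy."""

        def __init__(self, *args, proxy_address=None, **kwargs):
            super().__init__(*args, **kwargs)
            # Choose one proxy and stick with it for the session's
            # lifetime, so cookies stay tied to a stable egress IP.
            self.proxy_address = proxy_address or random.choice(
                settings.EGRESS_PROXY_HOSTS
            )
            self.proxies = {"http": self.proxy_address}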
7 changes: 4 additions & 3 deletions cl/corpus_importer/management/commands/everything_project.py
@@ -11,7 +11,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.recap.constants import (
CIVIL_RIGHTS_ACCOMMODATIONS,
CIVIL_RIGHTS_ADA_EMPLOYMENT,
@@ -136,18 +136,19 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):

throttle.maybe_wait()
params = make_fjc_idb_lookup_params(row)
+        session_data = SessionData(session.cookies, session.proxy_address)
chain(
get_pacer_case_id_and_title.s(
pass_through=None,
docket_number=row.docket_number,
court_id=row.district_id,
-                cookies=session.cookies,
+                session_data=session_data,
**params,
).set(queue=q),
filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row.district_id,
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=tags,
**{
"show_parties_and_counsel": True,
4 changes: 2 additions & 2 deletions cl/corpus_importer/management/commands/export_control.py
@@ -8,7 +8,7 @@
from cl.corpus_importer.tasks import save_ia_docket_to_disk
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Court, Docket

PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME)
@@ -85,7 +85,7 @@ def get_data(options, row_transform, tags):
row["docket_number"],
row["court"],
row["case_name"],
-                session.cookies,
+                SessionData(session.cookies, session.proxy_address),
tags,
q,
)
10 changes: 5 additions & 5 deletions cl/corpus_importer/management/commands/import_patent.py
@@ -11,7 +11,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.recap.constants import PATENT, PATENT_ANDA
from cl.recap.models import FjcIntegratedDatabase
from cl.search.models import Docket
@@ -44,7 +44,7 @@ def get_dockets(options: dict) -> None:
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()
-
+    session_data = SessionData(session.cookies, session.proxy_address)
NOS_CODES = [PATENT, PATENT_ANDA]
DISTRICTS = ["ded", "txwd"]
START_DATE = "2012-01-01"
@@ -78,12 +78,12 @@ def get_dockets(options: dict) -> None:
pass_through=None,
docket_number=item.docket_number,
court_id=item.district_id,
-                cookies=session.cookies,
+                session_data=session_data,
**params,
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=item.district_id,
-                cookies=session.cookies,
+                session_data=session_data,
tag_names=PATENT_TAGS,
**{
"show_parties_and_counsel": True,
@@ -101,7 +101,7 @@ def get_dockets(options: dict) -> None:
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
-                cookies=session.cookies,
+                session_data=session_data,
docket_pk=d.pk,
tag_names=PATENT_TAGS,
**{
17 changes: 10 additions & 7 deletions cl/corpus_importer/management/commands/invoice_project.py
@@ -14,7 +14,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.recap.tasks import process_recap_attachment
@@ -83,9 +83,10 @@ def get_attachment_pages(options):
throttle.maybe_wait()
chain(
# Query the attachment page and process it
get_attachment_page_by_rd.s(result["id"], session.cookies).set(
queue=q
),
get_attachment_page_by_rd.s(
result["id"],
SessionData(session.cookies, session.proxy_address),
).set(queue=q),
# Take that in a new task and make a PQ object
make_attachment_pq_object.s(result["id"], recap_user.pk).set(
queue=q
@@ -150,9 +151,11 @@ def get_documents(options):
continue

chain(
-            get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(
-                queue=q
-            ),
+            get_pacer_doc_by_rd.s(
+                rd.pk,
+                SessionData(session.cookies, session.proxy_address),
+                tag=TAG_PHASE_2,
+            ).set(queue=q),
extract_recap_pdf.si(rd.pk).set(queue=q),
add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
).apply_async()
6 changes: 4 additions & 2 deletions cl/corpus_importer/management/commands/jackson_project.py
@@ -6,7 +6,7 @@
from cl.corpus_importer.tasks import get_docket_by_pacer_case_id
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Docket
from cl.search.tasks import add_or_update_recap_docket

@@ -41,7 +41,9 @@ def get_dockets(options):
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
-                cookies=session.cookies,
+                session_data=SessionData(
+                    session.cookies, session.proxy_address
+                ),
docket_pk=d.pk,
tag_names=[JACKSON_TAG],
**{
15 changes: 11 additions & 4 deletions cl/corpus_importer/management/commands/kessler_ilnb.py
@@ -16,7 +16,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.pacer_session import ProxyPacerSession
+from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.scrapers.tasks import extract_recap_pdf
from cl.search.models import DocketEntry, RECAPDocument
from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket
@@ -53,20 +53,23 @@ def get_dockets(options):
logger.info(f"Sent {i} tasks to celery so far.")
logger.info("Doing row %s", i)
throttle.maybe_wait()
+        session_data = SessionData(
+            pacer_session.cookies, pacer_session.proxy_address
+        )
chain(
get_pacer_case_id_and_title.s(
pass_through=None,
docket_number=make_bankr_docket_number(
row["docket"], row["office"]
),
court_id="ilnb",
-                cookies=pacer_session.cookies,
+                session_data=session_data,
office_number=row["office"],
docket_number_letters="bk",
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id="ilnb",
-                cookies=pacer_session.cookies,
+                session_data=session_data,
tag_names=[TAG],
**{
"show_parties_and_counsel": True,
@@ -118,7 +121,11 @@ def get_final_docs(options):
throttle.maybe_wait()
chain(
get_pacer_doc_by_rd.s(
-                rd_pk, pacer_session.cookies, tag=TAG_FINALS
+                rd_pk,
+                SessionData(
+                    pacer_session.cookies, pacer_session.proxy_address
+                ),
+                tag=TAG_FINALS,
).set(queue=q),
extract_recap_pdf.si(rd_pk).set(queue=q),
add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(
(The remaining changed files in this commit are not shown here.)
