feat(pacer): Link pacer sessions to proxies #4192

Merged · 20 commits · Jul 30, 2024

Changes from all commits

Commits
ce3bba1
feat(setting): Adds new setting for managing a list of proxies
ERosendo Jul 10, 2024
f31ca58
feat(lib): Adds proxy selection logic for the ProxyPacerSession class
ERosendo Jul 10, 2024
db2c3ca
feat(lib): Tweaks log_into_pacer to return cookie jar and proxy address
ERosendo Jul 10, 2024
a99ac37
feat(lib): Updates get_or_cache_pacer_cookies to return tuples
ERosendo Jul 10, 2024
8b3417e
feat(corpus_importer): Updates tasks logic and signature for improved…
ERosendo Jul 10, 2024
588818d
feat(recap): Tweaks tasks to handle the new format for user cookies.
ERosendo Jul 10, 2024
8489419
feat(scrapers): Updates update_docket_info_iquery task
ERosendo Jul 10, 2024
b93fd90
feat(corpus_importer): Updates commands to use the new cookie format
ERosendo Jul 10, 2024
693552d
feat(lib): Adds tests for pacer session utils
ERosendo Jul 10, 2024
5a47599
feat(pacer_session): Updates logic to pick a proxy connection str
ERosendo Jul 12, 2024
b0b6eb3
feat(corpus_importer): Adds an exception to the get_pacer_case_id_and…
ERosendo Jul 12, 2024
02141d9
feat(test): Override PROXY_HOSTS setting for tests
ERosendo Jul 12, 2024
3d2af40
Merge branch 'main' into 4087-feat-link-sessions-to-proxy
ERosendo Jul 12, 2024
f997bcc
fix(tests): Override PROXY_HOSTS setting for the ScrapeIqueryPagesTes…
ERosendo Jul 12, 2024
ccfcbd2
Merge branch 'main' into 4087-feat-link-sessions-to-proxy
ERosendo Jul 25, 2024
d1058d0
feat(settings): Removes EGRESS_PROXY_HOST env variable
ERosendo Jul 25, 2024
e34f3d5
feat(pacer): Adds dataclass for storing PACER session data
ERosendo Jul 30, 2024
78dcb23
Merge branch 'main' into 4087-feat-link-sessions-to-proxy
ERosendo Jul 30, 2024
ea91f42
fix(corpus importer): Use correct parameter name for session data
ERosendo Jul 30, 2024
6f31ed8
Merge branch 'main' into 4087-feat-link-sessions-to-proxy
ERosendo Jul 30, 2024
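
The diff below passes a new SessionData value everywhere a bare cookie jar used to be passed, but the definition of SessionData itself is not part of this excerpt. As a rough sketch only, the commit messages and the call sites (SessionData(session.cookies, session.proxy_address), random.choice(settings.EGRESS_PROXY_HOSTS)) suggest something along these lines; the field types and the helper name here are assumptions, not the actual code in cl/lib/pacer_session.py:

    # Illustrative sketch inferred from the commits and call sites in this PR;
    # not the actual implementation.
    import random
    from dataclasses import dataclass

    from django.conf import settings
    from requests.cookies import RequestsCookieJar


    @dataclass
    class SessionData:
        """Bundle a PACER cookie jar with the proxy it was created through."""

        cookies: RequestsCookieJar
        proxy_address: str = ""


    def pick_proxy_connection_str() -> str:
        # Pick one proxy from the configured pool for a new PACER login.
        return random.choice(settings.EGRESS_PROXY_HOSTS)

Carrying the proxy address alongside the cookies lets downstream Celery tasks reuse the same egress proxy that established the PACER session, rather than each task choosing a proxy independently.
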
3 changes: 2 additions & 1 deletion cl/api/webhooks.py
@@ -1,4 +1,5 @@
import json
import random

import requests
from django.conf import settings
@@ -38,7 +39,7 @@ def send_webhook_event(
the webhook is sent.
"""
proxy_server = {
"http": settings.EGRESS_PROXY_HOST, # type: ignore
"http": random.choice(settings.EGRESS_PROXY_HOSTS), # type: ignore
}
headers = {
"Content-type": "application/json",
15 changes: 8 additions & 7 deletions cl/corpus_importer/bulk_utils.py
@@ -6,7 +6,7 @@
from cl.corpus_importer.tasks import get_pacer_doc_by_rd
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.scrapers.tasks import extract_recap_pdf
@@ -75,10 +75,10 @@ def get_petitions(
)
q = options["queue"]
throttle = CeleryThrottle(queue_name=q)
pacer_session = ProxyPacerSession(
session = ProxyPacerSession(
username=pacer_username, password=pacer_password
)
pacer_session.login()
session.login()
for i, rd_pk in enumerate(rds):
if i < options["offset"]:
i += 1
@@ -87,17 +87,18 @@
break

if i % 1000 == 0:
pacer_session = ProxyPacerSession(
session = ProxyPacerSession(
username=pacer_username, password=pacer_password
)
pacer_session.login()
session.login()
logger.info(f"Sent {i} tasks to celery so far.")
logger.info("Doing row %s", i)
throttle.maybe_wait()

chain(
get_pacer_doc_by_rd.s(
rd_pk, pacer_session.cookies, tag=tag_petitions
rd_pk,
SessionData(session.cookies, session.proxy_address),
tag=tag_petitions,
).set(queue=q),
extract_recap_pdf.si(rd_pk).set(queue=q),
add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q),
9 changes: 5 additions & 4 deletions cl/corpus_importer/management/commands/760_project.py
@@ -13,7 +13,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Court, RECAPDocument
from cl.search.tasks import add_or_update_recap_docket

@@ -36,6 +36,7 @@ def get_dockets(options):
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()
session_data = SessionData(session.cookies, session.proxy_address)
for i, row in enumerate(reader):
if i < options["offset"]:
continue
@@ -55,7 +56,7 @@
get_appellate_docket_by_docket_number.s(
docket_number=row["Cleaned case_No"],
court_id=row["fjc_court_id"],
cookies=session.cookies,
session_data=session_data,
tag_names=[TAG],
**{
"show_docket_entries": True,
@@ -75,12 +76,12 @@
pass_through=None,
docket_number=row["Cleaned case_No"],
court_id=row["fjc_court_id"],
cookies=session.cookies,
session_data=session_data,
case_name=row["Title"],
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row["fjc_court_id"],
cookies=session.cookies,
session_data=session_data,
tag_names=[TAG],
**{
"show_parties_and_counsel": True,
11 changes: 6 additions & 5 deletions cl/corpus_importer/management/commands/adelman_david.py
@@ -12,7 +12,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import CommandUtils, VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.tasks import add_or_update_recap_docket

PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME)
@@ -33,6 +33,7 @@ def download_dockets(options):
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()
session_data = SessionData(session.cookies, session.proxy_address)
for i, row in enumerate(reader):
if i < options["offset"]:
continue
@@ -48,7 +49,7 @@
get_appellate_docket_by_docket_number.s(
docket_number=row["docket_no1"],
court_id=row["cl_court"],
cookies=session.cookies,
session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
# Do not get the docket entries for now. We're only
# interested in the date terminated. If it's an open case,
@@ -71,17 +72,17 @@
pass_through=None,
docket_number=row["docket_no1"],
court_id=row["cl_court"],
cookies=session.cookies,
session_data=session_data,
case_name=row["name"],
).set(queue=q),
do_case_query_by_pacer_case_id.s(
court_id=row["cl_court"],
cookies=session.cookies,
session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row["cl_court"],
cookies=session.cookies,
session_data=session_data,
tag_names=[PROJECT_TAG_NAME, row_tag],
**{
# No docket entries
8 changes: 5 additions & 3 deletions cl/corpus_importer/management/commands/buchwald_project.py
@@ -13,7 +13,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Docket
from cl.search.tasks import add_or_update_recap_docket

@@ -59,7 +59,7 @@ def add_all_nysd_to_cl(options):
throttle.maybe_wait()
logger.info("Doing pacer_case_id: %s", pacer_case_id)
make_docket_by_iquery.apply_async(
args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]),
args=("nysd", pacer_case_id, "default", [NYSD_TAG]),
queue=q,
)

@@ -104,7 +104,9 @@ def get_dockets(options):
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
cookies=session.cookies,
session_data=SessionData(
session.cookies, session.proxy_address
),
docket_pk=d.pk,
tag_names=[BUCKWALD_TAG],
**{
@@ -7,7 +7,7 @@
from cl.corpus_importer.tasks import get_docket_by_pacer_case_id
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.search.models import Docket
@@ -64,7 +64,10 @@ def get_pacer_dockets(options, docket_pks, tags):
get_docket_by_pacer_case_id.s(
{"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk},
d.court_id,
cookies=pacer_session.cookies,
session_data=SessionData(
pacer_session.cookies,
pacer_session.proxy_address,
),
tag_names=tags,
**{
"show_parties_and_counsel": True,
7 changes: 4 additions & 3 deletions cl/corpus_importer/management/commands/everything_project.py
@@ -11,7 +11,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.recap.constants import (
CIVIL_RIGHTS_ACCOMMODATIONS,
CIVIL_RIGHTS_ADA_EMPLOYMENT,
@@ -136,18 +136,19 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""):

throttle.maybe_wait()
params = make_fjc_idb_lookup_params(row)
session_data = SessionData(session.cookies, session.proxy_address)
chain(
get_pacer_case_id_and_title.s(
pass_through=None,
docket_number=row.docket_number,
court_id=row.district_id,
cookies=session.cookies,
session_data=session_data,
**params,
).set(queue=q),
filter_docket_by_tags.s(tags, row.district_id).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=row.district_id,
cookies=session.cookies,
session_data=session_data,
tag_names=tags,
**{
"show_parties_and_counsel": True,
4 changes: 2 additions & 2 deletions cl/corpus_importer/management/commands/export_control.py
@@ -8,7 +8,7 @@
from cl.corpus_importer.tasks import save_ia_docket_to_disk
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Court, Docket

PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME)
@@ -85,7 +85,7 @@ def get_data(options, row_transform, tags):
row["docket_number"],
row["court"],
row["case_name"],
session.cookies,
SessionData(session.cookies, session.proxy_address),
tags,
q,
)
10 changes: 5 additions & 5 deletions cl/corpus_importer/management/commands/import_patent.py
@@ -11,7 +11,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.recap.constants import PATENT, PATENT_ANDA
from cl.recap.models import FjcIntegratedDatabase
from cl.search.models import Docket
@@ -44,7 +44,7 @@ def get_dockets(options: dict) -> None:
username=PACER_USERNAME, password=PACER_PASSWORD
)
session.login()

session_data = SessionData(session.cookies, session.proxy_address)
NOS_CODES = [PATENT, PATENT_ANDA]
DISTRICTS = ["ded", "txwd"]
START_DATE = "2012-01-01"
@@ -78,12 +78,12 @@ def get_dockets(options: dict) -> None:
pass_through=None,
docket_number=item.docket_number,
court_id=item.district_id,
cookies=session.cookies,
session_data=session_data,
**params,
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id=item.district_id,
cookies=session.cookies,
session_data=session_data,
tag_names=PATENT_TAGS,
**{
"show_parties_and_counsel": True,
@@ -101,7 +101,7 @@ def get_dockets(options: dict) -> None:
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
cookies=session.cookies,
session_data=session_data,
docket_pk=d.pk,
tag_names=PATENT_TAGS,
**{
17 changes: 10 additions & 7 deletions cl/corpus_importer/management/commands/invoice_project.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.lib.scorched_utils import ExtraSolrInterface
from cl.lib.search_utils import build_main_query_from_query_string
from cl.recap.tasks import process_recap_attachment
You are using environment variables inside a Django app. Use django-environ, as it is a better alternative for deployment.

Ignore this finding from use-django-environ.
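
For reference, the django-environ pattern the bot suggests would look roughly like the sketch below; this is illustrative only (the PR keeps the existing os.environ.get calls, and the variable names are just examples):

    # Illustrative sketch of the bot's suggestion; not part of this PR.
    import environ

    env = environ.Env(
        # Declare expected types and defaults up front.
        PACER_USERNAME=(str, ""),
        PACER_PASSWORD=(str, ""),
    )
    environ.Env.read_env()  # optionally load values from a .env file

    PACER_USERNAME = env("PACER_USERNAME")
    PACER_PASSWORD = env("PACER_PASSWORD")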

@@ -83,9 +84,10 @@ def get_attachment_pages(options):
throttle.maybe_wait()
chain(
# Query the attachment page and process it
get_attachment_page_by_rd.s(result["id"], session.cookies).set(
queue=q
),
get_attachment_page_by_rd.s(
result["id"],
SessionData(session.cookies, session.proxy_address),
).set(queue=q),
# Take that in a new task and make a PQ object
make_attachment_pq_object.s(result["id"], recap_user.pk).set(
queue=q
@@ -150,9 +151,11 @@ def get_documents(options):
continue

chain(
get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set(
queue=q
),
get_pacer_doc_by_rd.s(
rd.pk,
SessionData(session.cookies, session.proxy_address),
tag=TAG_PHASE_2,
).set(queue=q),
extract_recap_pdf.si(rd.pk).set(queue=q),
add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q),
).apply_async()
6 changes: 4 additions & 2 deletions cl/corpus_importer/management/commands/jackson_project.py
@@ -6,7 +6,7 @@
from cl.corpus_importer.tasks import get_docket_by_pacer_case_id
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.search.models import Docket
from cl.search.tasks import add_or_update_recap_docket

@@ -41,7 +41,9 @@ def get_dockets(options):
get_docket_by_pacer_case_id.s(
data={"pacer_case_id": d.pacer_case_id},
court_id=d.court_id,
cookies=session.cookies,
session_data=SessionData(
session.cookies, session.proxy_address
),
docket_pk=d.pk,
tag_names=[JACKSON_TAG],
**{
15 changes: 11 additions & 4 deletions cl/corpus_importer/management/commands/kessler_ilnb.py
@@ -16,7 +16,7 @@
)
from cl.lib.celery_utils import CeleryThrottle
from cl.lib.command_utils import VerboseCommand, logger
from cl.lib.pacer_session import ProxyPacerSession
from cl.lib.pacer_session import ProxyPacerSession, SessionData
from cl.scrapers.tasks import extract_recap_pdf
from cl.search.models import DocketEntry, RECAPDocument
from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket
@@ -53,20 +53,23 @@ def get_dockets(options):
logger.info(f"Sent {i} tasks to celery so far.")
logger.info("Doing row %s", i)
throttle.maybe_wait()
session_data = SessionData(
pacer_session.cookies, pacer_session.proxy_address
)
chain(
get_pacer_case_id_and_title.s(
pass_through=None,
docket_number=make_bankr_docket_number(
row["docket"], row["office"]
),
court_id="ilnb",
cookies=pacer_session.cookies,
session_data=session_data,
office_number=row["office"],
docket_number_letters="bk",
).set(queue=q),
get_docket_by_pacer_case_id.s(
court_id="ilnb",
cookies=pacer_session.cookies,
cookies_data=session_data,
tag_names=[TAG],
**{
"show_parties_and_counsel": True,
@@ -118,7 +121,11 @@ def get_final_docs(options):
throttle.maybe_wait()
chain(
get_pacer_doc_by_rd.s(
rd_pk, pacer_session.cookies, tag=TAG_FINALS
rd_pk,
SessionData(
pacer_session.cookies, pacer_session.proxy_address
),
tag=TAG_FINALS,
).set(queue=q),
extract_recap_pdf.si(rd_pk).set(queue=q),
add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(