diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index cf6e3f6cab..f6ca97d9e3 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -1,4 +1,5 @@ import json +import random import requests from django.conf import settings @@ -38,7 +39,7 @@ def send_webhook_event( the webhook is sent. """ proxy_server = { - "http": settings.EGRESS_PROXY_HOST, # type: ignore + "http": random.choice(settings.EGRESS_PROXY_HOSTS), # type: ignore } headers = { "Content-type": "application/json", diff --git a/cl/corpus_importer/bulk_utils.py b/cl/corpus_importer/bulk_utils.py index 730a98b61d..66e45fdc86 100644 --- a/cl/corpus_importer/bulk_utils.py +++ b/cl/corpus_importer/bulk_utils.py @@ -6,7 +6,7 @@ from cl.corpus_importer.tasks import get_pacer_doc_by_rd from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import build_main_query_from_query_string from cl.scrapers.tasks import extract_recap_pdf @@ -75,10 +75,10 @@ def get_petitions( ) q = options["queue"] throttle = CeleryThrottle(queue_name=q) - pacer_session = ProxyPacerSession( + session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - pacer_session.login() + session.login() for i, rd_pk in enumerate(rds): if i < options["offset"]: i += 1 @@ -87,17 +87,18 @@ def get_petitions( break if i % 1000 == 0: - pacer_session = ProxyPacerSession( + session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - pacer_session.login() + session.login() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() - chain( get_pacer_doc_by_rd.s( - rd_pk, pacer_session.cookies, tag=tag_petitions + rd_pk, + SessionData(session.cookies, session.proxy_address), + tag=tag_petitions, ).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), add_items_to_solr.si([rd_pk], "search.RECAPDocument").set(queue=q), diff --git a/cl/corpus_importer/management/commands/760_project.py b/cl/corpus_importer/management/commands/760_project.py index b4a227f0aa..b31c3a810c 100644 --- a/cl/corpus_importer/management/commands/760_project.py +++ b/cl/corpus_importer/management/commands/760_project.py @@ -13,7 +13,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Court, RECAPDocument from cl.search.tasks import add_or_update_recap_docket @@ -36,6 +36,7 @@ def get_dockets(options): username=PACER_USERNAME, password=PACER_PASSWORD ) session.login() + session_data = SessionData(session.cookies, session.proxy_address) for i, row in enumerate(reader): if i < options["offset"]: continue @@ -55,7 +56,7 @@ def get_dockets(options): get_appellate_docket_by_docket_number.s( docket_number=row["Cleaned case_No"], court_id=row["fjc_court_id"], - cookies=session.cookies, + session_data=session_data, tag_names=[TAG], **{ "show_docket_entries": True, @@ -75,12 +76,12 @@ def get_dockets(options): pass_through=None, docket_number=row["Cleaned case_No"], court_id=row["fjc_court_id"], - cookies=session.cookies, + session_data=session_data, case_name=row["Title"], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row["fjc_court_id"], - cookies=session.cookies, + session_data=session_data, 
tag_names=[TAG], **{ "show_parties_and_counsel": True, diff --git a/cl/corpus_importer/management/commands/adelman_david.py b/cl/corpus_importer/management/commands/adelman_david.py index f24f58cae3..25aa72db2f 100644 --- a/cl/corpus_importer/management/commands/adelman_david.py +++ b/cl/corpus_importer/management/commands/adelman_david.py @@ -12,7 +12,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import CommandUtils, VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) @@ -33,6 +33,7 @@ def download_dockets(options): username=PACER_USERNAME, password=PACER_PASSWORD ) session.login() + session_data = SessionData(session.cookies, session.proxy_address) for i, row in enumerate(reader): if i < options["offset"]: continue @@ -48,7 +49,7 @@ def download_dockets(options): get_appellate_docket_by_docket_number.s( docket_number=row["docket_no1"], court_id=row["cl_court"], - cookies=session.cookies, + session_data=session_data, tag_names=[PROJECT_TAG_NAME, row_tag], # Do not get the docket entries for now. We're only # interested in the date terminated. If it's an open case, @@ -71,17 +72,17 @@ def download_dockets(options): pass_through=None, docket_number=row["docket_no1"], court_id=row["cl_court"], - cookies=session.cookies, + session_data=session_data, case_name=row["name"], ).set(queue=q), do_case_query_by_pacer_case_id.s( court_id=row["cl_court"], - cookies=session.cookies, + session_data=session_data, tag_names=[PROJECT_TAG_NAME, row_tag], ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row["cl_court"], - cookies=session.cookies, + session_data=session_data, tag_names=[PROJECT_TAG_NAME, row_tag], **{ # No docket entries diff --git a/cl/corpus_importer/management/commands/buchwald_project.py b/cl/corpus_importer/management/commands/buchwald_project.py index 7beb4865af..ba10538152 100644 --- a/cl/corpus_importer/management/commands/buchwald_project.py +++ b/cl/corpus_importer/management/commands/buchwald_project.py @@ -13,7 +13,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Docket from cl.search.tasks import add_or_update_recap_docket @@ -59,7 +59,7 @@ def add_all_nysd_to_cl(options): throttle.maybe_wait() logger.info("Doing pacer_case_id: %s", pacer_case_id) make_docket_by_iquery.apply_async( - args=("nysd", pacer_case_id, session.cookies, [NYSD_TAG]), + args=("nysd", pacer_case_id, "default", [NYSD_TAG]), queue=q, ) @@ -104,7 +104,9 @@ def get_dockets(options): get_docket_by_pacer_case_id.s( data={"pacer_case_id": d.pacer_case_id}, court_id=d.court_id, - cookies=session.cookies, + session_data=SessionData( + session.cookies, session.proxy_address + ), docket_pk=d.pk, tag_names=[BUCKWALD_TAG], **{ diff --git a/cl/corpus_importer/management/commands/buried_alive_project.py b/cl/corpus_importer/management/commands/buried_alive_project.py index 880176072e..d81a4d2185 100644 --- a/cl/corpus_importer/management/commands/buried_alive_project.py +++ b/cl/corpus_importer/management/commands/buried_alive_project.py @@ -7,7 +7,7 @@ from cl.corpus_importer.tasks import get_docket_by_pacer_case_id from cl.lib.celery_utils import CeleryThrottle 
from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import build_main_query_from_query_string from cl.search.models import Docket @@ -64,7 +64,10 @@ def get_pacer_dockets(options, docket_pks, tags): get_docket_by_pacer_case_id.s( {"pacer_case_id": d.pacer_case_id, "docket_pk": d.pk}, d.court_id, - cookies=pacer_session.cookies, + session_data=SessionData( + pacer_session.cookies, + pacer_session.proxy_address, + ), tag_names=tags, **{ "show_parties_and_counsel": True, diff --git a/cl/corpus_importer/management/commands/everything_project.py b/cl/corpus_importer/management/commands/everything_project.py index 3ea7d27eb2..b48dd4a008 100644 --- a/cl/corpus_importer/management/commands/everything_project.py +++ b/cl/corpus_importer/management/commands/everything_project.py @@ -11,7 +11,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.recap.constants import ( CIVIL_RIGHTS_ACCOMMODATIONS, CIVIL_RIGHTS_ADA_EMPLOYMENT, @@ -136,18 +136,19 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): throttle.maybe_wait() params = make_fjc_idb_lookup_params(row) + session_data = SessionData(session.cookies, session.proxy_address) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, - cookies=session.cookies, + session_data=session_data, **params, ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, - cookies=session.cookies, + session_data=session_data, tag_names=tags, **{ "show_parties_and_counsel": True, diff --git a/cl/corpus_importer/management/commands/export_control.py b/cl/corpus_importer/management/commands/export_control.py index da434bd83f..4c24adff94 100644 --- a/cl/corpus_importer/management/commands/export_control.py +++ b/cl/corpus_importer/management/commands/export_control.py @@ -8,7 +8,7 @@ from cl.corpus_importer.tasks import save_ia_docket_to_disk from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Court, Docket PACER_USERNAME = os.environ.get("PACER_USERNAME", settings.PACER_USERNAME) @@ -85,7 +85,7 @@ def get_data(options, row_transform, tags): row["docket_number"], row["court"], row["case_name"], - session.cookies, + SessionData(session.cookies, session.proxy_address), tags, q, ) diff --git a/cl/corpus_importer/management/commands/import_patent.py b/cl/corpus_importer/management/commands/import_patent.py index f207f649ab..b6956f0406 100644 --- a/cl/corpus_importer/management/commands/import_patent.py +++ b/cl/corpus_importer/management/commands/import_patent.py @@ -11,7 +11,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.recap.constants import PATENT, PATENT_ANDA from cl.recap.models import FjcIntegratedDatabase from cl.search.models import Docket @@ -44,7 +44,7 @@ def 
get_dockets(options: dict) -> None: username=PACER_USERNAME, password=PACER_PASSWORD ) session.login() - + session_data = SessionData(session.cookies, session.proxy_address) NOS_CODES = [PATENT, PATENT_ANDA] DISTRICTS = ["ded", "txwd"] START_DATE = "2012-01-01" @@ -78,12 +78,12 @@ def get_dockets(options: dict) -> None: pass_through=None, docket_number=item.docket_number, court_id=item.district_id, - cookies=session.cookies, + session_data=session_data, **params, ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=item.district_id, - cookies=session.cookies, + session_data=session_data, tag_names=PATENT_TAGS, **{ "show_parties_and_counsel": True, @@ -101,7 +101,7 @@ def get_dockets(options: dict) -> None: get_docket_by_pacer_case_id.s( data={"pacer_case_id": d.pacer_case_id}, court_id=d.court_id, - cookies=session.cookies, + session_data=session_data, docket_pk=d.pk, tag_names=PATENT_TAGS, **{ diff --git a/cl/corpus_importer/management/commands/invoice_project.py b/cl/corpus_importer/management/commands/invoice_project.py index 8f3f889c34..1a48d80f25 100644 --- a/cl/corpus_importer/management/commands/invoice_project.py +++ b/cl/corpus_importer/management/commands/invoice_project.py @@ -14,7 +14,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import build_main_query_from_query_string from cl.recap.tasks import process_recap_attachment @@ -83,9 +83,10 @@ def get_attachment_pages(options): throttle.maybe_wait() chain( # Query the attachment page and process it - get_attachment_page_by_rd.s(result["id"], session.cookies).set( - queue=q - ), + get_attachment_page_by_rd.s( + result["id"], + SessionData(session.cookies, session.proxy_address), + ).set(queue=q), # Take that in a new task and make a PQ object make_attachment_pq_object.s(result["id"], recap_user.pk).set( queue=q @@ -150,9 +151,11 @@ def get_documents(options): continue chain( - get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG_PHASE_2).set( - queue=q - ), + get_pacer_doc_by_rd.s( + rd.pk, + SessionData(session.cookies, session.proxy_address), + tag=TAG_PHASE_2, + ).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/jackson_project.py b/cl/corpus_importer/management/commands/jackson_project.py index 1e7fd98e3b..d5afc22f02 100644 --- a/cl/corpus_importer/management/commands/jackson_project.py +++ b/cl/corpus_importer/management/commands/jackson_project.py @@ -6,7 +6,7 @@ from cl.corpus_importer.tasks import get_docket_by_pacer_case_id from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.models import Docket from cl.search.tasks import add_or_update_recap_docket @@ -41,7 +41,9 @@ def get_dockets(options): get_docket_by_pacer_case_id.s( data={"pacer_case_id": d.pacer_case_id}, court_id=d.court_id, - cookies=session.cookies, + session_data=SessionData( + session.cookies, session.proxy_address + ), docket_pk=d.pk, tag_names=[JACKSON_TAG], **{ diff --git a/cl/corpus_importer/management/commands/kessler_ilnb.py 
b/cl/corpus_importer/management/commands/kessler_ilnb.py index a3ad701b23..2c16d3c5d2 100644 --- a/cl/corpus_importer/management/commands/kessler_ilnb.py +++ b/cl/corpus_importer/management/commands/kessler_ilnb.py @@ -16,7 +16,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.scrapers.tasks import extract_recap_pdf from cl.search.models import DocketEntry, RECAPDocument from cl.search.tasks import add_items_to_solr, add_or_update_recap_docket @@ -53,6 +53,9 @@ def get_dockets(options): logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() + session_data = SessionData( + pacer_session.cookies, pacer_session.proxy_address + ) chain( get_pacer_case_id_and_title.s( pass_through=None, @@ -60,13 +63,13 @@ docket_number=make_bankr_docket_number( row["docket"], row["office"] ), court_id="ilnb", - cookies=pacer_session.cookies, + session_data=session_data, office_number=row["office"], docket_number_letters="bk", ).set(queue=q), get_docket_by_pacer_case_id.s( court_id="ilnb", - cookies=pacer_session.cookies, + session_data=session_data, tag_names=[TAG], **{ "show_parties_and_counsel": True, @@ -118,7 +121,11 @@ def get_final_docs(options): throttle.maybe_wait() chain( get_pacer_doc_by_rd.s( - rd_pk, pacer_session.cookies, tag=TAG_FINALS + rd_pk, + SessionData( + pacer_session.cookies, pacer_session.proxy_address + ), + tag=TAG_FINALS, ).set(queue=q), extract_recap_pdf.si(rd_pk).set(queue=q), add_items_to_solr.si([rd_pk], "search.RECAPDocument").set( diff --git a/cl/corpus_importer/management/commands/legal_robot.py b/cl/corpus_importer/management/commands/legal_robot.py index d6bc38244f..c435e5780b 100644 --- a/cl/corpus_importer/management/commands/legal_robot.py +++ b/cl/corpus_importer/management/commands/legal_robot.py @@ -7,7 +7,7 @@ from cl.corpus_importer.tasks import add_tags, get_pacer_doc_by_rd from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.lib.scorched_utils import ExtraSolrInterface from cl.lib.search_utils import build_main_query_from_query_string from cl.scrapers.tasks import extract_recap_pdf @@ -79,9 +79,11 @@ def get_documents(options): continue chain( - get_pacer_doc_by_rd.s(rd.pk, session.cookies, tag=TAG).set( - queue=q - ), + get_pacer_doc_by_rd.s( + rd.pk, + SessionData(session.cookies, session.proxy_address), + tag=TAG, + ).set(queue=q), extract_recap_pdf.si(rd.pk).set(queue=q), add_items_to_solr.si([rd.pk], "search.RECAPDocument").set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/management/commands/list_of_creditors_project.py b/cl/corpus_importer/management/commands/list_of_creditors_project.py index 9783903212..e3a18d56dd 100644 --- a/cl/corpus_importer/management/commands/list_of_creditors_project.py +++ b/cl/corpus_importer/management/commands/list_of_creditors_project.py @@ -16,7 +16,7 @@ from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger from cl.lib.pacer import map_cl_to_pacer_id -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.lib.redis_utils import create_redis_semaphore CLIENT_PACER_USERNAME = os.environ.get("CLIENT_PACER_USERNAME",
"") @@ -139,7 +139,7 @@ def query_and_save_creditors_data(options: OptionsType) -> None: ) throttle.maybe_wait() query_and_save_list_of_creditors.si( - session.cookies, + SessionData(session.cookies, session.proxy_address), court_id, d_number_file_name, docket_number, diff --git a/cl/corpus_importer/management/commands/nos_700.py b/cl/corpus_importer/management/commands/nos_700.py index 915c030eef..b95c663891 100644 --- a/cl/corpus_importer/management/commands/nos_700.py +++ b/cl/corpus_importer/management/commands/nos_700.py @@ -12,7 +12,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.recap.constants import ( AIRPLANE_PERSONAL_INJURY, AIRPLANE_PRODUCT_LIABILITY, @@ -251,19 +251,20 @@ def get_dockets(options, items, tags, sample_size=0): logger.info("Doing row %s: %s", i, row) throttle.maybe_wait() + session_data = SessionData(session.cookies, session.proxy_address) params = make_fjc_idb_lookup_params(row) chain( get_pacer_case_id_and_title.s( pass_through=None, docket_number=row.docket_number, court_id=row.district_id, - cookies=session.cookies, + session_data=session_data, **params, ).set(queue=q), filter_docket_by_tags.s(tags, row.district_id).set(queue=q), get_docket_by_pacer_case_id.s( court_id=row.district_id, - cookies=session.cookies, + session_data=session_data, tag_names=tags, **{ "show_parties_and_counsel": True, diff --git a/cl/corpus_importer/management/commands/nywb_chapter_7.py b/cl/corpus_importer/management/commands/nywb_chapter_7.py index 7efa9888fa..72aaa914c7 100644 --- a/cl/corpus_importer/management/commands/nywb_chapter_7.py +++ b/cl/corpus_importer/management/commands/nywb_chapter_7.py @@ -14,7 +14,7 @@ ) from cl.lib.celery_utils import CeleryThrottle from cl.lib.command_utils import VerboseCommand, logger -from cl.lib.pacer_session import ProxyPacerSession +from cl.lib.pacer_session import ProxyPacerSession, SessionData from cl.search.tasks import add_or_update_recap_docket PACER_USERNAME = os.environ.get("PACER_USERNAME", "UNKNOWN!") @@ -48,6 +48,10 @@ def get_dockets(options): logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() + session_data = SessionData( + pacer_session.cookies, + pacer_session.proxy_address, + ) chain( get_pacer_case_id_and_title.s( pass_through=None, @@ -55,13 +59,13 @@ def get_dockets(options): row["DOCKET"], row["OFFICE"] ), court_id="nywb", - cookies=pacer_session.cookies, + session_data=session_data, office_number=row["OFFICE"], docket_number_letters="bk", ).set(queue=q), get_docket_by_pacer_case_id.s( court_id="nywb", - cookies=pacer_session.cookies, + session_data=session_data, tag_names=[TAG], **{ "doc_num_start": 1, diff --git a/cl/corpus_importer/task_canvases.py b/cl/corpus_importer/task_canvases.py index 143c061417..01ace71b32 100644 --- a/cl/corpus_importer/task_canvases.py +++ b/cl/corpus_importer/task_canvases.py @@ -18,7 +18,9 @@ from cl.search.tasks import add_or_update_recap_docket -def get_docket_and_claims(docket_number, court, case_name, cookies, tags, q): +def get_docket_and_claims( + docket_number, court, case_name, cookies_data, tags, q +): """Get the docket report, claims history report, and save it all to the DB and Solr """ @@ -27,13 +29,13 @@ def get_docket_and_claims(docket_number, court, case_name, cookies, tags, q): pass_through=None, docket_number=docket_number, court_id=court, 
- cookies=cookies, + session_data=cookies_data, case_name=case_name, docket_number_letters="bk", ).set(queue=q), get_docket_by_pacer_case_id.s( court_id=court, - cookies=cookies, + session_data=cookies_data, tag_names=tags, **{ "show_parties_and_counsel": True, @@ -41,9 +43,9 @@ def get_docket_and_claims(docket_number, court, case_name, cookies, tags, q): "show_list_of_member_cases": False, } ).set(queue=q), - get_bankr_claims_registry.s(cookies=cookies, tag_names=tags).set( - queue=q - ), + get_bankr_claims_registry.s( + session_data=cookies_data, tag_names=tags + ).set(queue=q), add_or_update_recap_docket.s().set(queue=q), ).apply_async() @@ -72,7 +74,7 @@ def get_district_attachment_pages(options, rd_pks, tag_names, session): break throttle.maybe_wait() chain( - get_attachment_page_by_rd.s(rd_pk, session.cookies).set(queue=q), + get_attachment_page_by_rd.s(rd_pk, session).set(queue=q), make_attachment_pq_object.s(rd_pk, recap_user.pk).set(queue=q), process_recap_attachment.s(tag_names=tag_names).set(queue=q), ).apply_async() diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 57834f4cc5..5a667f79a7 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -46,7 +46,6 @@ from pyexpat import ExpatError from redis import ConnectionError as RedisConnectionError from requests import Response -from requests.cookies import RequestsCookieJar from requests.exceptions import ( ConnectionError, HTTPError, @@ -83,6 +82,7 @@ ) from cl.lib.pacer_session import ( ProxyPacerSession, + SessionData, get_or_cache_pacer_cookies, get_pacer_cookie_from_cache, ) @@ -336,15 +336,16 @@ def get_and_save_free_document_report( :param end: a date object representing the last day to get results. :return: The status code of the scrape """ - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) s = ProxyPacerSession( - cookies=cookies, + cookies=session_data.cookies, username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, + proxy=session_data.proxy_address, ) report = FreeOpinionReport(court_id, s) msg = "" @@ -606,14 +607,14 @@ def get_and_process_free_pdf( return None raise self.retry() - cookies = get_or_cache_pacer_cookies( + cookies_data = get_or_cache_pacer_cookies( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) try: r, r_msg = download_pacer_pdf_by_rd( - rd.pk, result.pacer_case_id, result.pacer_doc_id, cookies + rd.pk, result.pacer_case_id, result.pacer_doc_id, cookies_data ) except HTTPError as exc: if exc.response and exc.response.status_code in [ @@ -939,12 +940,12 @@ def get_pacer_case_id_and_title( pass_through: Any, docket_number: str, court_id: str, - cookies: Optional[RequestsCookieJar] = None, - user_pk: Optional[int] = None, - case_name: Optional[str] = None, - office_number: Optional[str] = None, - docket_number_letters: Optional[str] = None, -) -> Optional[TaskData]: + session_data: SessionData | None = None, + user_pk: int | None = None, + case_name: str | None = None, + office_number: str | None = None, + docket_number_letters: str | None = None, +) -> TaskData | None: """Get the pacer_case_id and title values for a district court docket. Use heuristics to disambiguate the results. @@ -960,8 +961,8 @@ def get_pacer_case_id_and_title( :param docket_number: The docket number to look up. This is a flexible field that accepts a variety of docket number styles. 
:param court_id: The CourtListener court ID for the docket number - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param user_pk: The PK of a user making the request. This can be provided - instead of the cookies parameter. If so, this will get the user's + instead of the session_data parameter. If so, this will get the user's cookies from redis instead of passing them in as an argument. @@ -989,10 +990,22 @@ docket_number, court_id, ) - if not cookies: - # Get cookies from Redis if not provided - cookies = get_pacer_cookie_from_cache(user_pk) # type: ignore - s = ProxyPacerSession(cookies=cookies) + + if not session_data: + if not user_pk: + raise Exception( + "user_pk is unavailable, cookies cannot be retrieved from cache" + ) + cookies_from_cache = get_pacer_cookie_from_cache(user_pk) + session_data = ( + cookies_from_cache + if isinstance(cookies_from_cache, SessionData) + else SessionData(cookies_from_cache) + ) + + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s) msg = "" try: @@ -1041,9 +1054,9 @@ def do_case_query_by_pacer_case_id( self: Task, data: TaskData, court_id: str, - cookies: RequestsCookieJar, + session_data: SessionData, tag_names: List[str] | None = None, -) -> Optional[TaskData]: +) -> TaskData | None: """Run a case query (iquery.pl) query on a case and save the data :param self: The celery task :param data: A dict containing at least the following: { 'pacer_case_id': The internal pacer case ID for the item. } :param court_id: A courtlistener court ID - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param tag_names: A list of tag names to associate with the docket when saving it in the DB. :return: A dict with the pacer_case_id and docket_pk values. """ - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) if data is None: logger.info("Empty data argument. Terminating chains and exiting.") self.request.chain = None @@ -1166,15 +1181,16 @@ def query_case_query_report( :return: A two tuple, the report data and the report HTML text. """ - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) s = ProxyPacerSession( - cookies=cookies, + cookies=session_data.cookies, username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, + proxy=session_data.proxy_address, ) report = CaseQuery(map_cl_to_pacer_id(court_id), s) report.query(pacer_case_id) @@ -1543,11 +1559,11 @@ def get_docket_by_pacer_case_id( self: Task, data: TaskData, court_id: str, - cookies: Optional[RequestsCookieJar] = None, + session_data: SessionData, docket_pk: Optional[int] = None, tag_names: Optional[str] = None, **kwargs, -) -> Optional[TaskData]: +) -> TaskData | None: """Get a docket by PACER case id, CL court ID, and a collection of kwargs that can be passed to the DocketReport query. @@ -1559,8 +1575,8 @@ Optional: 'docket_pk': The ID of the docket to work on to avoid lookups if it's known in advance. :param court_id: A courtlistener court ID.
- :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param docket_pk: The PK of the docket to update. Can also be provided in the data param, above. :param tag_names: A list of tag names that should be stored with the item @@ -1594,7 +1610,9 @@ def get_docket_by_pacer_case_id( logging_id = f"{court_id}.{pacer_case_id}" logger.info("Querying docket report %s", logging_id) - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = DocketReport(map_cl_to_pacer_id(court_id), s) try: report.query(pacer_case_id, **kwargs) @@ -1645,7 +1663,7 @@ def get_appellate_docket_by_docket_number( self: Task, docket_number: str, court_id: str, - cookies: RequestsCookieJar, + session_data: SessionData, tag_names: Optional[List[str]] = None, **kwargs, ) -> Optional[TaskData]: @@ -1657,13 +1675,16 @@ def get_appellate_docket_by_docket_number( :param self: The celery task :param docket_number: The docket number of the case. :param court_id: A courtlistener/PACER appellate court ID. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param tag_names: The tag name that should be stored with the item in the DB, if desired. :param kwargs: A variety of keyword args to pass to DocketReport.query(). """ - s = ProxyPacerSession(cookies=cookies) + + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = AppellateDocketReport(court_id, s) logging_id = f"{court_id} - {docket_number}" logger.info("Querying docket report %s", logging_id) @@ -1713,20 +1734,21 @@ def get_appellate_docket_by_docket_number( def get_att_report_by_rd( rd: RECAPDocument, - cookies: RequestsCookieJar, + session_data: SessionData, ) -> Optional[AttachmentPage]: """Method to get the attachment report for the item in PACER. :param rd: The RECAPDocument object to use as a source. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-on PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :return: The attachment report populated with the results """ - if not rd.pacer_doc_id: return None - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) att_report = AttachmentPage(pacer_court_id, s) att_report.query(rd.pacer_doc_id) @@ -1744,14 +1766,14 @@ def get_att_report_by_rd( def get_attachment_page_by_rd( self: Task, rd_pk: int, - cookies: RequestsCookieJar, + session_data: SessionData, ) -> Optional[AttachmentPage]: """Get the attachment page for the item in PACER. :param self: The celery task :param rd_pk: The PK of a RECAPDocument object to use as a source. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-on PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. 
:return: The attachment report populated with the results """ rd = RECAPDocument.objects.get(pk=rd_pk) @@ -1760,7 +1782,7 @@ def get_attachment_page_by_rd( self.request.chain = None return None try: - att_report = get_att_report_by_rd(rd, cookies) + att_report = get_att_report_by_rd(rd, session_data) except HTTPError as exc: if exc.response and exc.response.status_code in [ HTTPStatus.INTERNAL_SERVER_ERROR, @@ -1798,21 +1820,24 @@ def get_attachment_page_by_rd( def get_bankr_claims_registry( self: Task, data: TaskData, - cookies: RequestsCookieJar, - tag_names: Optional[List[str]] = None, -) -> Optional[TaskData]: + session_data: SessionData, + tag_names: List[str] | None = None, +) -> TaskData | None: """Get the bankruptcy claims registry for a docket :param self: The celery task :param data: A dict of data containing, primarily, a key to 'docket_pk' for the docket for which we want to get the registry. Other keys will be ignored. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param tag_names: A list of tag names that should be stored with the claims registry information in the DB. """ - s = ProxyPacerSession(cookies=cookies) + + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) if data is None or data.get("docket_pk") is None: logger.warning( "Empty data argument or parameter. Terminating chains " @@ -1910,8 +1935,8 @@ def download_pacer_pdf_by_rd( rd_pk: int, pacer_case_id: str, pacer_doc_id: int, - cookies: RequestsCookieJar, - magic_number: Optional[str] = None, + session_data: SessionData, + magic_number: str | None = None, ) -> tuple[Response | None, str]: """Using a RECAPDocument object ID, download the PDF if it doesn't already exist. @@ -1919,18 +1944,19 @@ def download_pacer_pdf_by_rd( :param rd_pk: The PK of the RECAPDocument to download :param pacer_case_id: The internal PACER case ID number :param pacer_doc_id: The internal PACER document ID to download - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param magic_number: The magic number to fetch PACER documents for free this is an optional field, only used by RECAP Email documents :return: A two-tuple of requests.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ - rd = RECAPDocument.objects.get(pk=rd_pk) pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = FreeOpinionReport(pacer_court_id, s) r, r_msg = report.download_pdf(pacer_case_id, pacer_doc_id, magic_number) @@ -1942,7 +1968,7 @@ def download_pdf_by_magic_number( court_id: str, pacer_doc_id: str, pacer_case_id: str, - cookies: RequestsCookieJar, + session_data: SessionData, magic_number: str, appellate: bool = False, ) -> tuple[Response | None, str]: @@ -1951,15 +1977,17 @@ def download_pdf_by_magic_number( :param court_id: A CourtListener court ID to query the free document. :param pacer_doc_id: The pacer_doc_id to query the free document. :param pacer_case_id: The pacer_case_id to query the free document. 
- :param cookies: The cookies of a logged in PACER session + :param session_data: A SessionData object containing the session's cookies + and proxy. :param magic_number: The magic number to fetch PACER documents for free. :param appellate: Whether the download belongs to an appellate court. :return: A two-tuple of requests.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ - - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = FreeOpinionReport(court_id, s) r, r_msg = report.download_pdf( pacer_case_id, pacer_doc_id, magic_number, appellate @@ -1978,10 +2006,12 @@ def get_document_number_from_confirmation_page( """ recap_email_user = User.objects.get(username="recap-email") - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) doc_num_report = DownloadConfirmationPage(court_id, s) doc_num_report.query(pacer_doc_id) data = doc_num_report.data @@ -2052,11 +2082,12 @@ def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: """ recap_email_user = User.objects.get(username="recap-email") - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) - - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) receipt_report = DownloadConfirmationPage(court_id, s) receipt_report.query(pacer_doc_id) data = receipt_report.data @@ -2083,11 +2114,13 @@ def is_docket_entry_sealed( return False recap_email_user = User.objects.get(username="recap-email") - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) report = BaseReport(court_id, s) return report.is_entry_sealed(case_id, doc_id) @@ -2190,14 +2223,15 @@ def add_tags(rd: RECAPDocument, tag_name: Optional[str]) -> None: def get_pacer_doc_by_rd( self: Task, rd_pk: int, - cookies: RequestsCookieJar, + session_data: SessionData, tag: Optional[str] = None, ) -> Optional[int]: """A simple method for getting the PDF associated with a RECAPDocument. :param self: The bound celery task :param rd_pk: The PK for the RECAPDocument object - :param cookies: The cookies of a logged in PACER session + :param session_data: A SessionData object containing the session's cookies + and proxy. 
:param tag: The name of a tag to apply to any modified items :return: The RECAPDocument PK """ @@ -2210,7 +2244,7 @@ def get_pacer_doc_by_rd( pacer_case_id = rd.docket_entry.docket.pacer_case_id r, r_msg = download_pacer_pdf_by_rd( - rd.pk, pacer_case_id, rd.pacer_doc_id, cookies + rd.pk, pacer_case_id, rd.pacer_doc_id, session_data ) court_id = rd.docket_entry.docket.court_id @@ -2248,7 +2282,7 @@ def get_pacer_doc_by_rd_and_description( self: Task, rd_pk: int, description_re: Pattern, - cookies: RequestsCookieJar, + session_data: SessionData, fallback_to_main_doc: bool = False, tag_name: Optional[List[str]] = None, ) -> None: @@ -2262,15 +2296,15 @@ def get_pacer_doc_by_rd_and_description( :param rd_pk: The PK of a RECAPDocument object to use as a source. :param description_re: A compiled regular expression to search against the description provided by the attachment page. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param fallback_to_main_doc: Should we grab the main doc if none of the attachments match the regex? :param tag_name: A tag name to apply to any downloaded content. :return: None """ rd = RECAPDocument.objects.get(pk=rd_pk) - att_report = get_attachment_page_by_rd(self, rd_pk, cookies) + att_report = get_attachment_page_by_rd(self, rd_pk, session_data) att_found = None for attachment in att_report.data.get("attachments", []): @@ -2319,7 +2353,7 @@ def get_pacer_doc_by_rd_and_description( pacer_case_id = rd.docket_entry.docket.pacer_case_id r, r_msg = download_pacer_pdf_by_rd( - rd.pk, pacer_case_id, att_found["pacer_doc_id"], cookies + rd.pk, pacer_case_id, att_found["pacer_doc_id"], session_data ) court_id = rd.docket_entry.docket.court_id @@ -2357,18 +2391,20 @@ def get_pacer_doc_by_rd_and_description( def get_pacer_doc_id_with_show_case_doc_url( self: Task, rd_pk: int, - cookies: RequestsCookieJar, + session_data: SessionData, ) -> None: """use the show_case_doc URL to get pacer_doc_id values. :param self: The celery task :param rd_pk: The pk of the RECAPDocument you want to get. - :param cookies: A requests.cookies.RequestsCookieJar with the cookies of a - logged-in PACER user. + :param session_data: A SessionData object containing the session's cookies + and proxy. """ rd = RECAPDocument.objects.get(pk=rd_pk) d = rd.docket_entry.docket - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) pacer_court_id = map_cl_to_pacer_id(d.court_id) report = ShowCaseDocApi(pacer_court_id, s) last_try = self.request.retries == self.max_retries @@ -2458,7 +2494,7 @@ def make_list_of_creditors_key(court_id: str, d_number_file_name: str) -> str: @throttle_task("1/s", key="court_id") def query_and_save_list_of_creditors( self: Task, - cookies: RequestsCookieJar, + session_data: SessionData, court_id: str, d_number_file_name: str, docket_number: str, @@ -2470,7 +2506,8 @@ def query_and_save_list_of_creditors( HTML and pipe-limited text files and convert them to CSVs. :param self: The celery task - :param cookies: The cookies for the current PACER session. + :param session_data: A SessionData object containing the session's cookies + and proxy. :param court_id: The court_id for the bankruptcy court. :param d_number_file_name: The docket number to use as file name. :param docket_number: The docket number of the case. 
@@ -2480,8 +2517,9 @@ :return: None """ - - s = ProxyPacerSession(cookies=cookies) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) try: report = ListOfCreditors(court_id, s) except AssertionError: diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index a66dd4b2d6..7a76435ded 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -402,6 +402,9 @@ def test_get_appellate_court_object_from_string(self) -> None: self.assertEqual(test["a"], got) +@override_settings( + EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"] +) @pytest.mark.django_db class PacerDocketParserTest(TestCase): """Can we parse RECAP dockets successfully?""" @@ -496,10 +499,7 @@ def test_party_parsing(self) -> None: self.assertEqual(godfrey_llp.city, "Seattle") self.assertEqual(godfrey_llp.state, "WA") - @patch( - "cl.corpus_importer.tasks.get_or_cache_pacer_cookies", - return_value=None, - ) + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") def test_get_and_save_free_document_report(self, mock_cookies) -> None: """Test the retrieval and storage of free document report data.""" @@ -3341,13 +3341,11 @@ def test_merger(self): ) -@patch( - "cl.corpus_importer.tasks.get_or_cache_pacer_cookies", - return_value=None, -) +@patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( IQUERY_PROBE_DAEMON_ENABLED=True, IQUERY_SWEEP_UPLOADS_SIGNAL_ENABLED=True, + EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"], ) class ScrapeIqueryPagesTest(TestCase): """Tests related to probe_iquery_pages_daemon command.""" diff --git a/cl/lib/pacer_session.py b/cl/lib/pacer_session.py index 7c993556cd..e1d3000837 100644 --- a/cl/lib/pacer_session.py +++ b/cl/lib/pacer_session.py @@ -1,4 +1,6 @@ import pickle +import random +from dataclasses import dataclass from typing import Union from urllib.parse import urlparse @@ -12,6 +14,27 @@ session_key = "session:pacer:cookies:user.%s" +@dataclass +class SessionData: + """ + The goal of this class is to encapsulate data required for PACER requests. + + This class serves as a lightweight container for PACER session data, + excluding authentication details for efficient caching. + + Supplies a default for the `proxy_address` attribute when it is not + explicitly provided, which indicates the session data was not generated + by the `ProxyPacerSession` class. + """ + + cookies: RequestsCookieJar + proxy_address: str = "" + + def __post_init__(self): + if not self.proxy_address: + self.proxy_address = settings.EGRESS_PROXY_HOSTS[0] + + class ProxyPacerSession(PacerSession): """ This class overrides the _prepare_login_request and post methods of the @@ -28,14 +51,32 @@ class ProxyPacerSession(PacerSession): """ def __init__( - self, cookies=None, username=None, password=None, client_code=None + self, + cookies=None, + username=None, + password=None, + client_code=None, + proxy=None, ): super().__init__(cookies, username, password, client_code) + self.proxy_address = proxy if proxy else self._pick_proxy_connection() self.proxies = { - "http": settings.EGRESS_PROXY_HOST, + "http": self.proxy_address, } self.headers["X-WhSentry-TLS"] = "true" + def _pick_proxy_connection(self) -> str: + """ + Picks a proxy connection string from the available options. + + This function randomly chooses a string from the + `settings.EGRESS_PROXY_HOSTS` list and returns it. + + Returns: + str: The chosen proxy connection string.
+ """ + return random.choice(settings.EGRESS_PROXY_HOSTS) + def _change_protocol(self, url: str) -> str: """Converts a URL from HTTPS to HTTP protocol. @@ -75,13 +116,14 @@ def log_into_pacer( username: str, password: str, client_code: str | None = None, -) -> RequestsCookieJar: - """Log into PACER and return the cookie jar +) -> SessionData: + """Log into PACER and returns a SessionData object containing the session's + cookies and proxy information. :param username: A PACER username :param password: A PACER password :param client_code: A PACER client_code - :return: Request.CookieJar + :return: A SessionData object containing the session's cookies and proxy. """ s = ProxyPacerSession( username=username, @@ -89,7 +131,7 @@ def log_into_pacer( client_code=client_code, ) s.login() - return s.cookies + return SessionData(s.cookies, s.proxy_address) def get_or_cache_pacer_cookies( @@ -98,7 +140,7 @@ def get_or_cache_pacer_cookies( password: str, client_code: str | None = None, refresh: bool = False, -) -> RequestsCookieJar: +) -> SessionData: """Get PACER cookies for a user or create and cache fresh ones For the PACER Fetch API, we store users' PACER cookies in Redis with a @@ -107,7 +149,7 @@ def get_or_cache_pacer_cookies( This function attempts to get cookies for a user from Redis. If it finds them, it returns them. If not, it attempts to log the user in and then - returns the fresh cookies (after caching them). + returns the fresh cookies and the proxy used to login(after caching them). :param user_pk: The PK of the user attempting to store their credentials. Needed to create the key in Redis. @@ -115,21 +157,27 @@ def get_or_cache_pacer_cookies( :param password: The PACER password of the user :param client_code: The PACER client code of the user :param refresh: If True, refresh the cookies even if they're already cached - :return: Cookies for the PACER user + :return: A SessionData object containing the session's cookies and proxy. """ r = get_redis_interface("CACHE", decode_responses=False) - cookies = get_pacer_cookie_from_cache(user_pk, r=r) + cookies_data = get_pacer_cookie_from_cache(user_pk, r=r) ttl_seconds = r.ttl(session_key % user_pk) - if cookies and ttl_seconds >= 300 and not refresh: + if cookies_data and ttl_seconds >= 300 and not refresh: # cookies were found in cache and ttl >= 5 minutes, return them - return cookies + if isinstance(cookies_data, SessionData): + return cookies_data + return SessionData(cookies_data) # Unable to find cookies in cache, are about to expire or refresh needed # Login and cache new values. 
- cookies = log_into_pacer(username, password, client_code) + session_data = log_into_pacer(username, password, client_code) cookie_expiration = 60 * 60 - r.set(session_key % user_pk, pickle.dumps(cookies), ex=cookie_expiration) - return cookies + r.set( + session_key % user_pk, + pickle.dumps(session_data), + ex=cookie_expiration, + ) + return session_data def get_pacer_cookie_from_cache( diff --git a/cl/lib/tests.py b/cl/lib/tests.py index 6507fab826..ec1c9bcf7e 100644 --- a/cl/lib/tests.py +++ b/cl/lib/tests.py @@ -1,8 +1,13 @@ import datetime +import pickle from typing import Tuple, TypedDict, cast +from unittest.mock import patch from asgiref.sync import async_to_sync +from django.conf import settings from django.core.files.base import ContentFile +from django.test import override_settings +from requests.cookies import RequestsCookieJar from cl.lib.date_time import midnight_pt from cl.lib.elasticsearch_utils import append_query_conjunctions @@ -21,6 +26,12 @@ normalize_attorney_role, normalize_us_state, ) +from cl.lib.pacer_session import ( + ProxyPacerSession, + SessionData, + get_or_cache_pacer_cookies, + session_key, +) from cl.lib.privacy_tools import anonymize from cl.lib.ratelimiter import parse_rate from cl.lib.redis_utils import ( @@ -80,6 +91,120 @@ def test_auto_blocking_small_bankr_docket(self) -> None: ) +@override_settings( + EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"] +) +class TestPacerSessionUtils(TestCase): + + def setUp(self) -> None: + r = get_redis_interface("CACHE", decode_responses=False) + # Clear cached session keys to prevent data inconsistencies. + key = r.keys(session_key % "test_user_new_cookie") + if key: + r.delete(*key) + self.test_cookies = RequestsCookieJar() + self.test_cookies.set("PacerSession", "this-is-a-test") + r.set( + session_key % "test_user_old_format", + pickle.dumps(self.test_cookies), + ex=60 * 60, + ) + r.set( + session_key % "test_user_new_format", + pickle.dumps( + SessionData(self.test_cookies, "http://proxy_1:9090") + ), + ex=60 * 60, + ) + r.set( + session_key % "test_old_format_almost_expired", + pickle.dumps(self.test_cookies), + ex=60, + ) + r.set( + session_key % "test_new_format_almost_expired", + pickle.dumps( + SessionData(self.test_cookies, "http://proxy_1:9090") + ), + ex=60, + ) + + def test_pick_random_proxy_when_list_is_available(self): + """Does ProxyPacerSession choose a random proxy from the available list?""" + session = ProxyPacerSession(username="test", password="password") + self.assertIn( + session.proxy_address, + ["http://proxy_1:9090", "http://proxy_2:9090"], + ) + + def test_use_default_proxy_host_for_old_cookie_format(self): + """Can we handle the old cookie format properly?""" + session_data = get_or_cache_pacer_cookies( + "test_user_old_format", username="test", password="password" + ) + self.assertIsInstance(session_data, SessionData) + self.assertEqual(session_data.proxy_address, "http://proxy_1:9090") + + @patch("cl.lib.pacer_session.log_into_pacer") + def test_compute_new_cookies_with_new_format(self, mock_log_into_pacer): + """Are we using the dataclass for new cookies?""" + mock_log_into_pacer.return_value = SessionData( + self.test_cookies, + "http://proxy_1:9090", + ) + session_data = get_or_cache_pacer_cookies( + "test_user_new_cookie", username="test", password="password" + ) + self.assertEqual(mock_log_into_pacer.call_count, 1) + self.assertIsInstance(session_data, SessionData) + self.assertEqual(session_data.proxy_address, "http://proxy_1:9090") + + 
@patch("cl.lib.pacer_session.log_into_pacer") + def test_parse_cookie_proxy_pair_properly(self, mock_log_into_pacer): + """Can we parse the dataclass from cache properly?""" + session_data = get_or_cache_pacer_cookies( + "test_user_new_format", username="test", password="password" + ) + self.assertEqual(mock_log_into_pacer.call_count, 0) + self.assertIsInstance(session_data, SessionData) + self.assertEqual(session_data.proxy_address, "http://proxy_1:9090") + + @patch("cl.lib.pacer_session.log_into_pacer") + def test_compute_cookies_for_almost_expired_data( + self, mock_log_into_pacer + ): + """Are we using the dataclass when re-computing session?""" + mock_log_into_pacer.return_value = SessionData( + self.test_cookies, "http://proxy_1:9090" + ) + + # Attempts to get almost expired cookies with the old format from cache + # Expects refresh. + session_data = get_or_cache_pacer_cookies( + "test_old_format_almost_expired", + username="test", + password="password", + ) + self.assertEqual(mock_log_into_pacer.call_count, 1) + self.assertIsInstance(session_data, SessionData) + self.assertEqual(session_data.proxy_address, "http://proxy_1:9090") + + mock_log_into_pacer.return_value = SessionData( + self.test_cookies, "http://proxy_2:9090" + ) + + # Attempts to get almost expired cookies with the new format from cache + # Expects refresh. + session_data = get_or_cache_pacer_cookies( + "test_new_format_almost_expired", + username="test", + password="password", + ) + self.assertIsInstance(session_data, SessionData) + self.assertEqual(mock_log_into_pacer.call_count, 2) + self.assertEqual(session_data.proxy_address, "http://proxy_2:9090") + + class TestStringUtils(SimpleTestCase): def test_trunc(self) -> None: """Does trunc give us the results we expect?""" diff --git a/cl/recap/management/commands/merge_idb_into_dockets.py b/cl/recap/management/commands/merge_idb_into_dockets.py index 0fe62e0c85..ba90c71071 100644 --- a/cl/recap/management/commands/merge_idb_into_dockets.py +++ b/cl/recap/management/commands/merge_idb_into_dockets.py @@ -142,7 +142,9 @@ def update_any_missing_pacer_case_ids(options): pass_through=d.pk, docket_number=d.idb_data.docket_number, court_id=d.idb_data.district_id, - cookies=session.cookies, + cookies_data=SessionData( + session.cookies, session.proxy_address + ), **params, ).set(queue=q), update_docket_from_hidden_api.s().set(queue=q), diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 2b218f3248..ca78940de6 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -38,7 +38,6 @@ from juriscraper.pacer.email import DocketType from redis import ConnectionError as RedisConnectionError from requests import HTTPError -from requests.cookies import RequestsCookieJar from requests.packages.urllib3.exceptions import ReadTimeoutError from cl.alerts.tasks import enqueue_docket_alert, send_alert_and_webhook @@ -61,6 +60,7 @@ from cl.lib.pacer import is_pacer_court_accessible, map_cl_to_pacer_id from cl.lib.pacer_session import ( ProxyPacerSession, + SessionData, delete_pacer_cookie_from_cache, get_or_cache_pacer_cookies, get_pacer_cookie_from_cache, @@ -1647,13 +1647,24 @@ def fetch_pacer_doc_by_rd( self.request.chain = None return + # Ensures session data is a `SessionData` instance for consistent handling. + # + # Currently, handles potential legacy data by converting them to + # `SessionData`. This defensive check can be removed in future versions + # once all data is guaranteed to be in the expected format. 
+ # + # This approach prevents disruptions during processing of enqueued data + # after deployment. + session_data = ( + cookies if isinstance(cookies, SessionData) else SessionData(cookies) + ) pacer_case_id = rd.docket_entry.docket.pacer_case_id try: r, r_msg = download_pacer_pdf_by_rd( rd.pk, pacer_case_id, rd.pacer_doc_id, - cookies, + session_data, magic_number, ) except (requests.RequestException, HTTPError): @@ -1745,8 +1756,14 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> None: mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) return + # Ensures session data is a `SessionData` instance for consistent handling. + # This approach prevents disruptions during processing of enqueued data + # after deployment. + session_data = ( + cookies if isinstance(cookies, SessionData) else SessionData(cookies) + ) try: - r = get_att_report_by_rd(rd, cookies) + r = get_att_report_by_rd(rd, session_data) except HTTPError as exc: msg = "Failed to get attachment page from network." if exc.response.status_code in [ @@ -1918,14 +1935,21 @@ def fetch_docket(self, fq_pk): async_to_sync(mark_pq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) - cookies = get_pacer_cookie_from_cache(fq.user_id) - if cookies is None: + cookies_data = get_pacer_cookie_from_cache(fq.user_id) + if cookies_data is None: msg = f"Cookie cache expired before task could run for user: {fq.user_id}" mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - s = ProxyPacerSession(cookies=cookies) + session_data = ( + cookies_data + if isinstance(cookies_data, SessionData) + else SessionData(cookies_data) + ) + s = ProxyPacerSession( + cookies=session_data.cookies, proxy=session_data.proxy_address + ) try: result = fetch_pacer_case_id_and_title(s, fq, court_id) except (requests.RequestException, ReadTimeoutError) as exc: @@ -2164,7 +2188,7 @@ def save_pacer_doc_from_pq( def download_pacer_pdf_and_save_to_pq( court_id: str, - cookies: RequestsCookieJar, + session_data: SessionData, cutoff_date: datetime, magic_number: str | None, pacer_case_id: str, @@ -2180,7 +2204,8 @@ def download_pacer_pdf_and_save_to_pq( PQ object. Increasing the reliability of saving PACER documents. :param court_id: A CourtListener court ID to query the free document. - :param cookies: The cookies of a logged in PACER session + :param session_data: A SessionData object containing the session's cookies + and proxy. :param cutoff_date: The datetime from which we should query ProcessingQueue objects. For the main RECAPDocument the datetime the EmailProcessingQueue was created. For attachments the datetime the @@ -2217,7 +2242,7 @@ def download_pacer_pdf_and_save_to_pq( court_id, pacer_doc_id, pacer_case_id, - cookies, + session_data, magic_number, appellate, ) @@ -2264,13 +2289,16 @@ def get_and_copy_recap_attachment_docs( """ cookies = get_pacer_cookie_from_cache(user_pk) + session_data = ( + cookies if isinstance(cookies, SessionData) else SessionData(cookies) + ) appellate = False unique_pqs = [] for rd_att in att_rds: cutoff_date = rd_att.date_created pq = download_pacer_pdf_and_save_to_pq( court_id, - cookies, + session_data, cutoff_date, magic_number, pacer_case_id, @@ -2375,6 +2403,12 @@ def get_and_merge_rd_attachments( all_attachment_rds = [] cookies = get_pacer_cookie_from_cache(user_pk) + # Ensures session data is a `SessionData` instance for consistent handling. + # This approach prevents disruptions during processing of enqueued data + # after deployment. 
+ session_data = ( + cookies if isinstance(cookies, SessionData) else SessionData(cookies) + ) # Try to get the attachment page without being logged into PACER att_report_text = get_attachment_page_by_url(document_url, court_id) if att_report_text: @@ -2386,7 +2420,7 @@ def get_and_merge_rd_attachments( .recap_documents.earliest("date_created") ) # Get the attachment page being logged into PACER - att_report = get_att_report_by_rd(main_rd, cookies) + att_report = get_att_report_by_rd(main_rd, session_data) for docket_entry in dockets_updated: # Merge the attachments for each docket/recap document @@ -2472,7 +2506,7 @@ def process_recap_email( start_time = now() # Ensures we have PACER cookies ready to go. - cookies = get_or_cache_pacer_cookies( + cookies_data = get_or_cache_pacer_cookies( user_pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) appellate = data["appellate"] @@ -2480,7 +2514,7 @@ def process_recap_email( # its future processing. pq = download_pacer_pdf_and_save_to_pq( epq.court_id, - cookies, + cookies_data, epq.date_created, magic_number, pacer_case_id, diff --git a/cl/recap/tests.py b/cl/recap/tests.py index 82bbc62e16..a9f1d31bee 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -16,7 +16,7 @@ from django.core import mail from django.core.files.base import ContentFile from django.core.files.uploadedfile import SimpleUploadedFile -from django.test import RequestFactory +from django.test import RequestFactory, override_settings from django.urls import reverse from django.utils.timezone import now from juriscraper.pacer import PacerRssFeed @@ -1234,6 +1234,9 @@ def mock_bucket_open(message_id, r, read_file=False): return recap_mail_example +@override_settings( + EGRESS_PROXY_HOSTS=["http://proxy_1:9090", "http://proxy_2:9090"] +) class RecapEmailToEmailProcessingQueueTest(TestCase): """Test the rest endpoint, but exclude the processing tasks.""" @@ -1292,10 +1295,7 @@ async def test_missing_receipt_properties_fails(self): "cl.recap.tasks.RecapEmailSESStorage.open", side_effect=mock_bucket_open, ) - @mock.patch( - "cl.recap.tasks.get_or_cache_pacer_cookies", - side_effect=lambda x, y, z: None, - ) + @mock.patch("cl.recap.tasks.get_or_cache_pacer_cookies") @mock.patch( "cl.recap.tasks.is_docket_entry_sealed", return_value=False, @@ -2946,7 +2946,7 @@ def test_create_from_idb_chunk(self) -> None: ) @mock.patch( "cl.recap.tasks.get_or_cache_pacer_cookies", - side_effect=lambda x, y, z: None, + side_effect=lambda x, y, z: (None, None), ) @mock.patch( "cl.recap.tasks.is_pacer_court_accessible", @@ -5404,7 +5404,7 @@ def test_clean_up_recap_document_file(self, mock_open): ) @mock.patch( "cl.recap.tasks.get_or_cache_pacer_cookies", - side_effect=lambda x, y, z: "Cookie", + side_effect=lambda x, y, z: ("Cookie", settings.EGRESS_PROXY_HOSTS[0]), ) @mock.patch( "cl.recap.tasks.get_pacer_cookie_from_cache", @@ -5778,7 +5778,7 @@ def test_is_pacer_court_accessible_fails( ) @mock.patch( "cl.recap.tasks.get_or_cache_pacer_cookies", - side_effect=lambda x, y, z: None, + side_effect=lambda x, y, z: (None, None), ) @mock.patch( "cl.recap.tasks.is_pacer_court_accessible", diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index d5609d55da..c60971c572 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -410,15 +410,16 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: :param court_id: The court of the docket. Needed for throttling by court. 
:return: None """ - cookies = get_or_cache_pacer_cookies( + session_data = get_or_cache_pacer_cookies( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) s = ProxyPacerSession( - cookies=cookies, + cookies=session_data.cookies, username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, + proxy=session_data.proxy_address, ) d = Docket.objects.get(pk=d_pk, court_id=court_id) report = CaseQuery(map_cl_to_pacer_id(d.court_id), s) diff --git a/cl/settings/project/security.py b/cl/settings/project/security.py index 57a0ef19f6..e13eb87bcc 100644 --- a/cl/settings/project/security.py +++ b/cl/settings/project/security.py @@ -13,8 +13,8 @@ "ALLOWED_HOSTS", default=["www.courtlistener.com"] ) -EGRESS_PROXY_HOST = env( - "EGRESS_PROXY_HOST", default="http://cl-webhook-sentry:9090" +EGRESS_PROXY_HOSTS: list[str] = env.list( + "EGRESS_PROXY_HOSTS", default=["http://cl-webhook-sentry:9090"] ) SECURE_HSTS_SECONDS = 63_072_000
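The heart of this change is the SessionData dataclass and its backward-compatible trip through the pickle-based Redis cache: entries written before this patch hold a bare RequestsCookieJar, while new entries hold a SessionData that also records which egress proxy the session was created through. Below is a minimal, self-contained sketch of that round-trip; the module-level EGRESS_PROXY_HOSTS list and the load_session_data helper are illustrative stand-ins, not part of the patch.

import pickle
from dataclasses import dataclass

from requests.cookies import RequestsCookieJar

# Stand-in for settings.EGRESS_PROXY_HOSTS (assumed values).
EGRESS_PROXY_HOSTS = ["http://proxy_1:9090", "http://proxy_2:9090"]


@dataclass
class SessionData:
    cookies: RequestsCookieJar
    proxy_address: str = ""

    def __post_init__(self):
        # Legacy cache entries carry no proxy; fall back to the first host,
        # matching the patched __post_init__.
        if not self.proxy_address:
            self.proxy_address = EGRESS_PROXY_HOSTS[0]


def load_session_data(cached: bytes) -> SessionData:
    # Hypothetical helper: normalize a cached value that may be either a bare
    # cookie jar (old format) or a SessionData (new format), mirroring the
    # isinstance checks added in get_or_cache_pacer_cookies and cl/recap/tasks.py.
    value = pickle.loads(cached)
    return value if isinstance(value, SessionData) else SessionData(value)


jar = RequestsCookieJar()
jar.set("PacerSession", "this-is-a-test")
old_entry = pickle.dumps(jar)  # written before this patch was deployed
new_entry = pickle.dumps(SessionData(jar, "http://proxy_2:9090"))
assert load_session_data(old_entry).proxy_address == "http://proxy_1:9090"
assert load_session_data(new_entry).proxy_address == "http://proxy_2:9090"

Because old-format entries are upgraded on read rather than invalidated, tasks already enqueued at deploy time keep working; the defensive isinstance branches can be dropped once every cached entry has cycled through the one-hour expiration set in get_or_cache_pacer_cookies.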
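On the session side, each ProxyPacerSession now binds itself to a single egress proxy for its lifetime: the proxy recorded in a cached SessionData when one is passed in, or a random pick from settings.EGRESS_PROXY_HOSTS for a fresh login. A simplified sketch of that wiring, using a plain requests.Session in place of the real PacerSession base class (ProxySession here is a hypothetical stand-in, minus the PACER login and protocol handling):

import random

import requests

EGRESS_PROXY_HOSTS = ["http://proxy_1:9090", "http://proxy_2:9090"]  # assumed values


class ProxySession(requests.Session):
    def __init__(self, proxy: str | None = None):
        super().__init__()
        # Reuse the proxy the cached session was created through; otherwise
        # pick one at random so logins spread across the egress hosts.
        self.proxy_address = proxy if proxy else random.choice(EGRESS_PROXY_HOSTS)
        self.proxies = {"http": self.proxy_address}
        # Ask the webhook-sentry egress proxy to upgrade the outbound
        # connection to TLS, as in the existing ProxyPacerSession.
        self.headers["X-WhSentry-TLS"] = "true"


fresh = ProxySession()  # new login: random host
resumed = ProxySession(proxy=fresh.proxy_address)  # cached session: same host
assert resumed.proxy_address == fresh.proxy_address

Caching the proxy next to the cookies is what lets a resumed session keep a consistent egress host, which is why log_into_pacer now returns SessionData(s.cookies, s.proxy_address) rather than the bare cookie jar.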