|
243 | 243 | }, |
244 | 244 | { |
245 | 245 | "cell_type": "code", |
246 | | - "execution_count": 7, |
| 246 | + "execution_count": 84, |
247 | 247 | "metadata": {}, |
248 | 248 | "outputs": [ |
249 | 249 | { |
250 | 250 | "name": "stdout", |
251 | 251 | "output_type": "stream", |
252 | 252 | "text": [ |
253 | | - "[WARN] No se pudo leer https://www.ilga.gov/Senate/List: 404 Client Error: Not Found for url: https://www.ilga.gov/Senate/List\n", |
254 | 253 | "Perfiles encontrados en la lista: 60\n", |
255 | 254 | "Total miembros parseados: 60\n", |
256 | 255 | "('Member', 8505, 'D', 'https://www.ilga.gov/Senate/Members/Details/3264')\n", |
|
272 | 271 | "3 Member 5966 D https://www.ilga.gov/Senate/Members/Details/3269\n", |
273 | 272 | "4 Member 422 D https://www.ilga.gov/Senate/Members/Details/3270\n", |
274 | 273 | "\n", |
275 | | - "CSV generado: senado_ilga_moderno.csv\n" |
| 274 | + "CSV generado: senado_ilga_members.csv\n" |
276 | 275 | ] |
277 | 276 | } |
278 | 277 | ], |
|
284 | 283 | "\n", |
285 | 284 | "LIST_URLS = [\n", |
286 | 285 | " \"https://www.ilga.gov/Senate/Members/List\",\n", |
287 | | - " \"https://www.ilga.gov/Senate/List\",\n", |
| 286 | + " \"https://www.ilga.gov/Senate/Members\",\n", |
288 | 287 | "]\n", |
289 | 288 | "BASE = \"https://www.ilga.gov\"\n", |
290 | 289 | "\n", |
|
415 | 414 | " print(\"\\nPrimeras 5 filas:\")\n", |
416 | 415 | " print(df.head())\n", |
417 | 416 | " df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n", |
418 | | - " print(\"\\nCSV generado: senado_ilga_moderno.csv\")\n", |
| 417 | + " print(\"\\nCSV generado: senado_ilga_members.csv\")\n", |
419 | 418 | " except ImportError:\n", |
420 | 419 | " print(\"Pandas no está instalado; omitiendo CSV. Instala con: pip install pandas openpyxl\")\n", |
421 | 420 | "\n", |
|
547 | 546 | }, |
548 | 547 | { |
549 | 548 | "cell_type": "code", |
550 | | - "execution_count": 37, |
| 549 | + "execution_count": 96, |
551 | 550 | "metadata": {}, |
552 | 551 | "outputs": [], |
553 | 552 | "source": [ |
554 | | - "import re\n", |
| 553 | + "import re, time\n", |
555 | 554 | "from urllib.parse import urljoin\n", |
556 | 555 | "import requests\n", |
557 | 556 | "from bs4 import BeautifulSoup\n", |
558 | 557 | "\n", |
559 | | - "def get_members(url: str = \"https://www.ilga.gov/Senate/Members/List\"):\n", |
| 558 | + "HEADERS = {\"User-Agent\": \"Mozilla/5.0\"}\n", |
| 559 | + "\n", |
| 560 | + "# --- util: extraer distrito/partido desde texto plano ---\n", |
| 561 | + "def _extract_district_party(text: str):\n", |
| 562 | + " # 1) \"47 R\"\n", |
| 563 | + " m = re.search(r'\\b(\\d+)\\s+([DRI])\\b', text)\n", |
| 564 | + " if m:\n", |
| 565 | + " return int(m.group(1)), m.group(2).upper()\n", |
| 566 | + " # 2) \"District 47 (R)\" u otras variantes\n", |
| 567 | + " m = re.search(r'(?:District\\s*)?(\\d+).*?\\b([DRI])\\b', text, flags=re.I)\n", |
| 568 | + " if m:\n", |
| 569 | + " return int(m.group(1)), m.group(2).upper()\n", |
| 570 | + " # 3) Si solo viene el nombre del partido completo\n", |
| 571 | + " party = \"\"\n", |
| 572 | + " if re.search(r'\\bDemocrat(ic)?\\b', text, flags=re.I):\n", |
| 573 | + " party = \"D\"\n", |
| 574 | + " elif re.search(r'\\bRepublican\\b', text, flags=re.I):\n", |
| 575 | + " party = \"R\"\n", |
| 576 | + " elif re.search(r'\\bIndependent\\b', text, flags=re.I):\n", |
| 577 | + " party = \"I\"\n", |
| 578 | + " return None, party # distrito desconocido, partido si se detectó\n", |
| 579 | + "\n", |
| 580 | + "# --- leer distrito/partido desde el perfil individual ---\n", |
| 581 | + "def _parse_profile(profile_url: str, session: requests.Session):\n", |
| 582 | + " r = session.get(profile_url, timeout=30)\n", |
| 583 | + " r.raise_for_status()\n", |
| 584 | + " psoup = BeautifulSoup(r.text, \"lxml\")\n", |
| 585 | + "\n", |
| 586 | + " # nombre (por si quieres validar)\n", |
| 587 | + " name = \"\"\n", |
| 588 | + " h1 = psoup.select_one(\"h1\")\n", |
| 589 | + " if h1:\n", |
| 590 | + " name = h1.get_text(strip=True)\n", |
| 591 | + " elif psoup.title:\n", |
| 592 | + " name = psoup.title.get_text(strip=True)\n", |
| 593 | + "\n", |
| 594 | + " text = psoup.get_text(\" \", strip=True)\n", |
| 595 | + " district, party = _extract_district_party(text)\n", |
| 596 | + " return name, district, party\n", |
| 597 | + "\n", |
| 598 | + "def get_members(url: str = \"https://www.ilga.gov/Senate/Members\"):\n", |
560 | 599 | " \"\"\"\n", |
561 | | - " Devuelve una lista de tuplas (Nombre, Distrito:int, Partido:str, Perfil:str)\n", |
562 | | - " extraídas desde la página moderna del Senado de Illinois.\n", |
| 600 | + " Devuelve lista de tuplas (Nombre, Distrito:int|None, Partido:str, Perfil:str)\n", |
| 601 | + " Tomando enlaces a /Senate/Members/Details/... desde /Senate/Members o /Senate/Members/List.\n", |
| 602 | + " Si el distrito/partido no está cerca del enlace, se visita el perfil para extraerlos.\n", |
563 | 603 | " \"\"\"\n", |
564 | | - " headers = {\"User-Agent\": \"Mozilla/5.0\"}\n", |
565 | | - " resp = requests.get(url, headers=headers, timeout=30)\n", |
566 | | - " resp.raise_for_status()\n", |
| 604 | + " s = requests.Session()\n", |
| 605 | + " s.headers.update(HEADERS)\n", |
| 606 | + "\n", |
| 607 | + " r = s.get(url, timeout=30)\n", |
| 608 | + " r.raise_for_status()\n", |
| 609 | + " soup = BeautifulSoup(r.text, \"lxml\")\n", |
567 | 610 | "\n", |
568 | | - " soup = BeautifulSoup(resp.text, \"lxml\")\n", |
569 | 611 | " members = []\n", |
| 612 | + " seen = set()\n", |
570 | 613 | "\n", |
571 | | - " # Enlaces a perfiles /Senate/Members/Details/<id>\n", |
| 614 | + " # Enlaces a perfiles\n", |
572 | 615 | " for a in soup.select('a[href*=\"/Senate/Members/Details/\"], a[href*=\"/senate/members/details/\"]'):\n", |
573 | 616 | " name = a.get_text(strip=True)\n", |
574 | 617 | " if not name:\n", |
575 | 618 | " continue\n", |
576 | 619 | "\n", |
577 | | - " # En el contenedor suele venir \"Nombre 47 R\" (número + partido)\n", |
578 | | - " full_text = a.parent.get_text(\" \", strip=True)\n", |
579 | | - " tail = full_text.replace(name, \"\").strip()\n", |
580 | | - "\n", |
581 | | - " # 1) patrón directo: \"47 R\"\n", |
582 | | - " m = re.search(r'(\\d+)\\s+([DRI])\\b', tail)\n", |
583 | | - " # 2) fallback: \"District 47 (R)\" u otras variantes\n", |
584 | | - " if not m:\n", |
585 | | - " m = re.search(r'(?:District\\s*)?(\\d+).*?([DRI])\\b', tail, re.I)\n", |
586 | | - " if not m:\n", |
587 | | - " # si no se detecta distrito/partido, igual guarda el nombre y el perfil\n", |
588 | | - " district, party = None, \"\"\n", |
589 | | - " else:\n", |
590 | | - " district = int(m.group(1))\n", |
591 | | - " party = m.group(2).upper()\n", |
592 | | - "\n", |
593 | | - " profile = urljoin(url, a.get(\"href\"))\n", |
| 620 | + " profile = urljoin(url, a.get(\"href\") or \"\")\n", |
| 621 | + " if profile in seen:\n", |
| 622 | + " continue\n", |
| 623 | + " seen.add(profile)\n", |
| 624 | + "\n", |
| 625 | + " # 1) Intentar extraer distrito/partido del contenedor más cercano\n", |
| 626 | + " container = a.parent\n", |
| 627 | + " # sube hasta 3 niveles si es necesario (algunas páginas usan divs anidados)\n", |
| 628 | + " hops = 0\n", |
| 629 | + " while container and hops < 3 and len(container.get_text(strip=True)) < 10:\n", |
| 630 | + " container = container.parent\n", |
| 631 | + " hops += 1\n", |
| 632 | + "\n", |
| 633 | + " tail_text = \"\"\n", |
| 634 | + " if container:\n", |
| 635 | + " # texto del contenedor sin el nombre, para evitar falsos positivos\n", |
| 636 | + " ctext = container.get_text(\" \", strip=True)\n", |
| 637 | + " tail_text = ctext.replace(name, \"\").strip()\n", |
| 638 | + "\n", |
| 639 | + " district, party = _extract_district_party(tail_text)\n", |
| 640 | + "\n", |
| 641 | + " # 2) Si no encontramos, entramos al perfil\n", |
| 642 | + " if district is None and not party:\n", |
| 643 | + " _, district, party = _parse_profile(profile, s)\n", |
| 644 | + " time.sleep(0.2) # cortesía con el servidor\n", |
| 645 | + "\n", |
594 | 646 | " members.append((name, district, party, profile))\n", |
595 | 647 | "\n", |
596 | | - " return members\n" |
| 648 | + " return members\n", |
| 649 | + "\n" |
597 | 650 | ] |
598 | 651 | }, |
599 | 652 | { |
|
644 | 697 | }, |
645 | 698 | { |
646 | 699 | "cell_type": "code", |
647 | | - "execution_count": 38, |
| 700 | + "execution_count": 97, |
648 | 701 | "metadata": {}, |
649 | 702 | "outputs": [ |
650 | 703 | { |
651 | 704 | "name": "stdout", |
652 | 705 | "output_type": "stream", |
653 | 706 | "text": [ |
654 | 707 | "Total miembros: 60\n", |
655 | | - "('Neil Anderson', None, '', 'https://www.ilga.gov/Senate/Members/Details/3312')\n", |
656 | | - "('Omar Aquino', None, '', 'https://www.ilga.gov/Senate/Members/Details/3316')\n", |
657 | | - "('Li Arellano, Jr.', None, '', 'https://www.ilga.gov/Senate/Members/Details/3383')\n", |
658 | | - "('Chris Balkema', None, '', 'https://www.ilga.gov/Senate/Members/Details/3413')\n", |
659 | | - "('Christopher Belt', None, '', 'https://www.ilga.gov/Senate/Members/Details/3337')\n" |
| 708 | + "('Neil Anderson', 2006, 'R', 'https://www.ilga.gov/Senate/Members/Details/3312')\n", |
| 709 | + "('Omar Aquino', 2016, 'D', 'https://www.ilga.gov/Senate/Members/Details/3316')\n", |
| 710 | + "('Li Arellano, Jr.', 2025, 'D', 'https://www.ilga.gov/Senate/Members/Details/3383')\n", |
| 711 | + "('Chris Balkema', 2025, 'R', 'https://www.ilga.gov/Senate/Members/Details/3413')\n", |
| 712 | + "('Christopher Belt', 2019, 'R', 'https://www.ilga.gov/Senate/Members/Details/3337')\n" |
660 | 713 | ] |
661 | 714 | } |
662 | 715 | ], |
663 | 716 | "source": [ |
664 | 717 | "senate_members = get_members() # o get_members(\"https://www.ilga.gov/Senate/Members/List\")\n", |
665 | 718 | "print(\"Total miembros:\", len(senate_members))\n", |
666 | 719 | "for m in senate_members[:5]:\n", |
667 | | - " print(m)\n" |
| 720 | + " print(m)\n", |
| 721 | + "\n" |
668 | 722 | ] |
669 | 723 | }, |
670 | 724 | { |
|
679 | 733 | }, |
680 | 734 | { |
681 | 735 | "cell_type": "code", |
682 | | - "execution_count": 39, |
| 736 | + "execution_count": 98, |
683 | 737 | "metadata": {}, |
684 | 738 | "outputs": [ |
685 | 739 | { |
686 | 740 | "name": "stdout", |
687 | 741 | "output_type": "stream", |
688 | 742 | "text": [ |
689 | | - " Nombre Distrito Partido \\\n", |
690 | | - "0 Neil Anderson None \n", |
691 | | - "1 Omar Aquino None \n", |
692 | | - "2 Li Arellano, Jr. None \n", |
693 | | - "3 Chris Balkema None \n", |
694 | | - "4 Christopher Belt None \n", |
| 743 | + " Nombre Distrito Partido \\\n", |
| 744 | + "0 Neil Anderson 2006.0 R \n", |
| 745 | + "1 Omar Aquino 2016.0 D \n", |
| 746 | + "2 Li Arellano, Jr. 2025.0 D \n", |
| 747 | + "3 Chris Balkema 2025.0 R \n", |
| 748 | + "4 Christopher Belt 2019.0 R \n", |
695 | 749 | "\n", |
696 | 750 | " Perfil \n", |
697 | 751 | "0 https://www.ilga.gov/Senate/Members/Details/3312 \n", |
|
707 | 761 | "\n", |
708 | 762 | "df = pd.DataFrame(senate_members, columns=[\"Nombre\", \"Distrito\", \"Partido\", \"Perfil\"])\n", |
709 | 763 | "print(df.head())\n", |
710 | | - "# df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n" |
| 764 | + "\n", |
| 765 | + "df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n" |
711 | 766 | ] |
712 | 767 | }, |
713 | 768 | { |
|
742 | 797 | }, |
743 | 798 | { |
744 | 799 | "cell_type": "code", |
745 | | - "execution_count": 55, |
| 800 | + "execution_count": 72, |
746 | 801 | "metadata": {}, |
747 | 802 | "outputs": [], |
748 | 803 | "source": [ |
|
867 | 922 | }, |
868 | 923 | { |
869 | 924 | "cell_type": "code", |
870 | | - "execution_count": 60, |
| 925 | + "execution_count": 101, |
871 | 926 | "metadata": {}, |
872 | 927 | "outputs": [], |
873 | 928 | "source": [ |
|
879 | 934 | }, |
880 | 935 | { |
881 | 936 | "cell_type": "code", |
882 | | - "execution_count": 61, |
| 937 | + "execution_count": 104, |
883 | 938 | "metadata": {}, |
884 | 939 | "outputs": [ |
885 | 940 | { |
886 | 941 | "name": "stdout", |
887 | 942 | "output_type": "stream", |
888 | 943 | "text": [ |
889 | | - "Clave encontrada: None\n", |
890 | | - "No hay datos para distrito 52.\n" |
| 944 | + "Clave encontrada: 2006\n", |
| 945 | + "Clave encontrada: 2016\n", |
| 946 | + "Clave encontrada: 2025\n", |
| 947 | + "Clave encontrada: 2019\n", |
| 948 | + "Número de proyectos para distrito 2006: 0\n" |
891 | 949 | ] |
892 | 950 | } |
893 | 951 | ], |
|
897 | 955 | " print(\"Clave encontrada:\", key)\n", |
898 | 956 | "\n", |
899 | 957 | "# Acceder de forma segura\n", |
900 | | - "if 52 in bills_dict:\n", |
901 | | - " print(\"Número de proyectos para distrito 52:\", len(bills_dict[52]))\n", |
| 958 | + "if 2006 in bills_dict:\n", |
| 959 | + " print(\"Número de proyectos para distrito 2006:\", len(bills_dict[2006]))\n", |
902 | 960 | "else:\n", |
903 | | - " print(\"No hay datos para distrito 52.\")\n" |
| 961 | + " print(\"No hay datos para distrito 2006.\")\n" |
904 | 962 | ] |
905 | 963 | }, |
906 | 964 | { |
907 | 965 | "cell_type": "code", |
908 | | - "execution_count": 63, |
| 966 | + "execution_count": 107, |
909 | 967 | "metadata": {}, |
910 | 968 | "outputs": [ |
911 | 969 | { |
912 | 970 | "name": "stdout", |
913 | 971 | "output_type": "stream", |
914 | 972 | "text": [ |
915 | | - "Claves disponibles en bills_dict: [None]\n", |
916 | | - "Total claves: 1\n", |
917 | | - "No hay datos para el distrito 52.\n" |
| 973 | + "Claves disponibles en bills_dict: [2006, 2016, 2025, 2019]\n", |
| 974 | + "Total claves: 4\n", |
| 975 | + "No hay datos para el distrito 2019.\n" |
918 | 976 | ] |
919 | 977 | } |
920 | 978 | ], |
921 | 979 | "source": [ |
922 | 980 | "print(\"Claves disponibles en bills_dict:\", list(bills_dict.keys()))\n", |
923 | 981 | "print(\"Total claves:\", len(bills_dict))\n", |
924 | 982 | "\n", |
925 | | - "bills_52 = bills_dict.get(52) or bills_dict.get(\"52\")\n", |
926 | | - "if bills_52 is None:\n", |
927 | | - " print(\"No hay datos para el distrito 52.\")\n", |
| 983 | + "bills_2019 = bills_dict.get(2019) or bills_dict.get(\"2019\")\n", |
| 984 | + "if bills_2019 is None:\n", |
| 985 | + " print(\"No hay datos para el distrito 2019.\")\n", |
928 | 986 | "else:\n", |
929 | | - " print(\"Número de proyectos:\", len(bills_52))\n" |
| 987 | + " print(\"Número de proyectos:\", len(bills_2019))\n" |
930 | 988 | ] |
931 | 989 | } |
932 | 990 | ], |
|
0 commit comments