Skip to content

Commit d9351f7

Browse files
authored
Merge pull request #8 from anmerinoto/main
Upgrade rutina
2 parents aed5f90 + 4d764f6 commit d9351f7

File tree

3 files changed

+243
-124
lines changed

3 files changed

+243
-124
lines changed

solutions/02_web_scraping_solutions.ipynb

Lines changed: 122 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -243,14 +243,13 @@
243243
},
244244
{
245245
"cell_type": "code",
246-
"execution_count": 7,
246+
"execution_count": 84,
247247
"metadata": {},
248248
"outputs": [
249249
{
250250
"name": "stdout",
251251
"output_type": "stream",
252252
"text": [
253-
"[WARN] No se pudo leer https://www.ilga.gov/Senate/List: 404 Client Error: Not Found for url: https://www.ilga.gov/Senate/List\n",
254253
"Perfiles encontrados en la lista: 60\n",
255254
"Total miembros parseados: 60\n",
256255
"('Member', 8505, 'D', 'https://www.ilga.gov/Senate/Members/Details/3264')\n",
@@ -272,7 +271,7 @@
272271
"3 Member 5966 D https://www.ilga.gov/Senate/Members/Details/3269\n",
273272
"4 Member 422 D https://www.ilga.gov/Senate/Members/Details/3270\n",
274273
"\n",
275-
"CSV generado: senado_ilga_moderno.csv\n"
274+
"CSV generado: senado_ilga_members.csv\n"
276275
]
277276
}
278277
],
@@ -284,7 +283,7 @@
284283
"\n",
285284
"LIST_URLS = [\n",
286285
" \"https://www.ilga.gov/Senate/Members/List\",\n",
287-
" \"https://www.ilga.gov/Senate/List\",\n",
286+
" \"https://www.ilga.gov/Senate/Members\",\n",
288287
"]\n",
289288
"BASE = \"https://www.ilga.gov\"\n",
290289
"\n",
@@ -415,7 +414,7 @@
415414
" print(\"\\nPrimeras 5 filas:\")\n",
416415
" print(df.head())\n",
417416
" df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n",
418-
" print(\"\\nCSV generado: senado_ilga_moderno.csv\")\n",
417+
" print(\"\\nCSV generado: senado_ilga_members.csv\")\n",
419418
" except ImportError:\n",
420419
" print(\"Pandas no está instalado; omitiendo CSV. Instala con: pip install pandas openpyxl\")\n",
421420
"\n",
@@ -547,53 +546,107 @@
547546
},
548547
{
549548
"cell_type": "code",
550-
"execution_count": 37,
549+
"execution_count": 96,
551550
"metadata": {},
552551
"outputs": [],
553552
"source": [
554-
"import re\n",
553+
"import re, time\n",
555554
"from urllib.parse import urljoin\n",
556555
"import requests\n",
557556
"from bs4 import BeautifulSoup\n",
558557
"\n",
559-
"def get_members(url: str = \"https://www.ilga.gov/Senate/Members/List\"):\n",
558+
"HEADERS = {\"User-Agent\": \"Mozilla/5.0\"}\n",
559+
"\n",
560+
"# --- util: extraer distrito/partido desde texto plano ---\n",
561+
"def _extract_district_party(text: str):\n",
562+
" # 1) \"47 R\"\n",
563+
" m = re.search(r'\\b(\\d+)\\s+([DRI])\\b', text)\n",
564+
" if m:\n",
565+
" return int(m.group(1)), m.group(2).upper()\n",
566+
" # 2) \"District 47 (R)\" u otras variantes\n",
567+
" m = re.search(r'(?:District\\s*)?(\\d+).*?\\b([DRI])\\b', text, flags=re.I)\n",
568+
" if m:\n",
569+
" return int(m.group(1)), m.group(2).upper()\n",
570+
" # 3) Si solo viene el nombre del partido completo\n",
571+
" party = \"\"\n",
572+
" if re.search(r'\\bDemocrat(ic)?\\b', text, flags=re.I):\n",
573+
" party = \"D\"\n",
574+
" elif re.search(r'\\bRepublican\\b', text, flags=re.I):\n",
575+
" party = \"R\"\n",
576+
" elif re.search(r'\\bIndependent\\b', text, flags=re.I):\n",
577+
" party = \"I\"\n",
578+
" return None, party # distrito desconocido, partido si se detectó\n",
579+
"\n",
580+
"# --- leer distrito/partido desde el perfil individual ---\n",
581+
"def _parse_profile(profile_url: str, session: requests.Session):\n",
582+
" r = session.get(profile_url, timeout=30)\n",
583+
" r.raise_for_status()\n",
584+
" psoup = BeautifulSoup(r.text, \"lxml\")\n",
585+
"\n",
586+
" # nombre (por si quieres validar)\n",
587+
" name = \"\"\n",
588+
" h1 = psoup.select_one(\"h1\")\n",
589+
" if h1:\n",
590+
" name = h1.get_text(strip=True)\n",
591+
" elif psoup.title:\n",
592+
" name = psoup.title.get_text(strip=True)\n",
593+
"\n",
594+
" text = psoup.get_text(\" \", strip=True)\n",
595+
" district, party = _extract_district_party(text)\n",
596+
" return name, district, party\n",
597+
"\n",
598+
"def get_members(url: str = \"https://www.ilga.gov/Senate/Members\"):\n",
560599
" \"\"\"\n",
561-
" Devuelve una lista de tuplas (Nombre, Distrito:int, Partido:str, Perfil:str)\n",
562-
" extraídas desde la página moderna del Senado de Illinois.\n",
600+
" Devuelve lista de tuplas (Nombre, Distrito:int|None, Partido:str, Perfil:str)\n",
601+
" Tomando enlaces a /Senate/Members/Details/... desde /Senate/Members o /Senate/Members/List.\n",
602+
" Si el distrito/partido no está cerca del enlace, se visita el perfil para extraerlos.\n",
563603
" \"\"\"\n",
564-
" headers = {\"User-Agent\": \"Mozilla/5.0\"}\n",
565-
" resp = requests.get(url, headers=headers, timeout=30)\n",
566-
" resp.raise_for_status()\n",
604+
" s = requests.Session()\n",
605+
" s.headers.update(HEADERS)\n",
606+
"\n",
607+
" r = s.get(url, timeout=30)\n",
608+
" r.raise_for_status()\n",
609+
" soup = BeautifulSoup(r.text, \"lxml\")\n",
567610
"\n",
568-
" soup = BeautifulSoup(resp.text, \"lxml\")\n",
569611
" members = []\n",
612+
" seen = set()\n",
570613
"\n",
571-
" # Enlaces a perfiles /Senate/Members/Details/<id>\n",
614+
" # Enlaces a perfiles\n",
572615
" for a in soup.select('a[href*=\"/Senate/Members/Details/\"], a[href*=\"/senate/members/details/\"]'):\n",
573616
" name = a.get_text(strip=True)\n",
574617
" if not name:\n",
575618
" continue\n",
576619
"\n",
577-
" # En el contenedor suele venir \"Nombre 47 R\" (número + partido)\n",
578-
" full_text = a.parent.get_text(\" \", strip=True)\n",
579-
" tail = full_text.replace(name, \"\").strip()\n",
580-
"\n",
581-
" # 1) patrón directo: \"47 R\"\n",
582-
" m = re.search(r'(\\d+)\\s+([DRI])\\b', tail)\n",
583-
" # 2) fallback: \"District 47 (R)\" u otras variantes\n",
584-
" if not m:\n",
585-
" m = re.search(r'(?:District\\s*)?(\\d+).*?([DRI])\\b', tail, re.I)\n",
586-
" if not m:\n",
587-
" # si no se detecta distrito/partido, igual guarda el nombre y el perfil\n",
588-
" district, party = None, \"\"\n",
589-
" else:\n",
590-
" district = int(m.group(1))\n",
591-
" party = m.group(2).upper()\n",
592-
"\n",
593-
" profile = urljoin(url, a.get(\"href\"))\n",
620+
" profile = urljoin(url, a.get(\"href\") or \"\")\n",
621+
" if profile in seen:\n",
622+
" continue\n",
623+
" seen.add(profile)\n",
624+
"\n",
625+
" # 1) Intentar extraer distrito/partido del contenedor más cercano\n",
626+
" container = a.parent\n",
627+
" # sube hasta 3 niveles si es necesario (algunas páginas usan divs anidados)\n",
628+
" hops = 0\n",
629+
" while container and hops < 3 and len(container.get_text(strip=True)) < 10:\n",
630+
" container = container.parent\n",
631+
" hops += 1\n",
632+
"\n",
633+
" tail_text = \"\"\n",
634+
" if container:\n",
635+
" # texto del contenedor sin el nombre, para evitar falsos positivos\n",
636+
" ctext = container.get_text(\" \", strip=True)\n",
637+
" tail_text = ctext.replace(name, \"\").strip()\n",
638+
"\n",
639+
" district, party = _extract_district_party(tail_text)\n",
640+
"\n",
641+
" # 2) Si no encontramos, entramos al perfil\n",
642+
" if district is None and not party:\n",
643+
" _, district, party = _parse_profile(profile, s)\n",
644+
" time.sleep(0.2) # cortesía con el servidor\n",
645+
"\n",
594646
" members.append((name, district, party, profile))\n",
595647
"\n",
596-
" return members\n"
648+
" return members\n",
649+
"\n"
597650
]
598651
},
599652
{
@@ -644,27 +697,28 @@
644697
},
645698
{
646699
"cell_type": "code",
647-
"execution_count": 38,
700+
"execution_count": 97,
648701
"metadata": {},
649702
"outputs": [
650703
{
651704
"name": "stdout",
652705
"output_type": "stream",
653706
"text": [
654707
"Total miembros: 60\n",
655-
"('Neil Anderson', None, '', 'https://www.ilga.gov/Senate/Members/Details/3312')\n",
656-
"('Omar Aquino', None, '', 'https://www.ilga.gov/Senate/Members/Details/3316')\n",
657-
"('Li Arellano, Jr.', None, '', 'https://www.ilga.gov/Senate/Members/Details/3383')\n",
658-
"('Chris Balkema', None, '', 'https://www.ilga.gov/Senate/Members/Details/3413')\n",
659-
"('Christopher Belt', None, '', 'https://www.ilga.gov/Senate/Members/Details/3337')\n"
708+
"('Neil Anderson', 2006, 'R', 'https://www.ilga.gov/Senate/Members/Details/3312')\n",
709+
"('Omar Aquino', 2016, 'D', 'https://www.ilga.gov/Senate/Members/Details/3316')\n",
710+
"('Li Arellano, Jr.', 2025, 'D', 'https://www.ilga.gov/Senate/Members/Details/3383')\n",
711+
"('Chris Balkema', 2025, 'R', 'https://www.ilga.gov/Senate/Members/Details/3413')\n",
712+
"('Christopher Belt', 2019, 'R', 'https://www.ilga.gov/Senate/Members/Details/3337')\n"
660713
]
661714
}
662715
],
663716
"source": [
664717
"senate_members = get_members() # o get_members(\"https://www.ilga.gov/Senate/Members/List\")\n",
665718
"print(\"Total miembros:\", len(senate_members))\n",
666719
"for m in senate_members[:5]:\n",
667-
" print(m)\n"
720+
" print(m)\n",
721+
"\n"
668722
]
669723
},
670724
{
@@ -679,19 +733,19 @@
679733
},
680734
{
681735
"cell_type": "code",
682-
"execution_count": 39,
736+
"execution_count": 98,
683737
"metadata": {},
684738
"outputs": [
685739
{
686740
"name": "stdout",
687741
"output_type": "stream",
688742
"text": [
689-
" Nombre Distrito Partido \\\n",
690-
"0 Neil Anderson None \n",
691-
"1 Omar Aquino None \n",
692-
"2 Li Arellano, Jr. None \n",
693-
"3 Chris Balkema None \n",
694-
"4 Christopher Belt None \n",
743+
" Nombre Distrito Partido \\\n",
744+
"0 Neil Anderson 2006.0 R \n",
745+
"1 Omar Aquino 2016.0 D \n",
746+
"2 Li Arellano, Jr. 2025.0 D \n",
747+
"3 Chris Balkema 2025.0 R \n",
748+
"4 Christopher Belt 2019.0 R \n",
695749
"\n",
696750
" Perfil \n",
697751
"0 https://www.ilga.gov/Senate/Members/Details/3312 \n",
@@ -707,7 +761,8 @@
707761
"\n",
708762
"df = pd.DataFrame(senate_members, columns=[\"Nombre\", \"Distrito\", \"Partido\", \"Perfil\"])\n",
709763
"print(df.head())\n",
710-
"# df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n"
764+
"\n",
765+
"df.to_csv(\"senado_ilga_moderno.csv\", index=False, encoding=\"utf-8\")\n"
711766
]
712767
},
713768
{
@@ -742,7 +797,7 @@
742797
},
743798
{
744799
"cell_type": "code",
745-
"execution_count": 55,
800+
"execution_count": 72,
746801
"metadata": {},
747802
"outputs": [],
748803
"source": [
@@ -867,7 +922,7 @@
867922
},
868923
{
869924
"cell_type": "code",
870-
"execution_count": 60,
925+
"execution_count": 101,
871926
"metadata": {},
872927
"outputs": [],
873928
"source": [
@@ -879,15 +934,18 @@
879934
},
880935
{
881936
"cell_type": "code",
882-
"execution_count": 61,
937+
"execution_count": 104,
883938
"metadata": {},
884939
"outputs": [
885940
{
886941
"name": "stdout",
887942
"output_type": "stream",
888943
"text": [
889-
"Clave encontrada: None\n",
890-
"No hay datos para distrito 52.\n"
944+
"Clave encontrada: 2006\n",
945+
"Clave encontrada: 2016\n",
946+
"Clave encontrada: 2025\n",
947+
"Clave encontrada: 2019\n",
948+
"Número de proyectos para distrito 2006: 0\n"
891949
]
892950
}
893951
],
@@ -897,36 +955,36 @@
897955
" print(\"Clave encontrada:\", key)\n",
898956
"\n",
899957
"# Acceder de forma segura\n",
900-
"if 52 in bills_dict:\n",
901-
" print(\"Número de proyectos para distrito 52:\", len(bills_dict[52]))\n",
958+
"if 2006 in bills_dict:\n",
959+
" print(\"Número de proyectos para distrito 2006:\", len(bills_dict[2006]))\n",
902960
"else:\n",
903-
" print(\"No hay datos para distrito 52.\")\n"
961+
" print(\"No hay datos para distrito 2006.\")\n"
904962
]
905963
},
906964
{
907965
"cell_type": "code",
908-
"execution_count": 63,
966+
"execution_count": 107,
909967
"metadata": {},
910968
"outputs": [
911969
{
912970
"name": "stdout",
913971
"output_type": "stream",
914972
"text": [
915-
"Claves disponibles en bills_dict: [None]\n",
916-
"Total claves: 1\n",
917-
"No hay datos para el distrito 52.\n"
973+
"Claves disponibles en bills_dict: [2006, 2016, 2025, 2019]\n",
974+
"Total claves: 4\n",
975+
"No hay datos para el distrito 2019.\n"
918976
]
919977
}
920978
],
921979
"source": [
922980
"print(\"Claves disponibles en bills_dict:\", list(bills_dict.keys()))\n",
923981
"print(\"Total claves:\", len(bills_dict))\n",
924982
"\n",
925-
"bills_52 = bills_dict.get(52) or bills_dict.get(\"52\")\n",
926-
"if bills_52 is None:\n",
927-
" print(\"No hay datos para el distrito 52.\")\n",
983+
"bills_2019 = bills_dict.get(2019) or bills_dict.get(\"2019\")\n",
984+
"if bills_2019 is None:\n",
985+
" print(\"No hay datos para el distrito 2019.\")\n",
928986
"else:\n",
929-
" print(\"Número de proyectos:\", len(bills_52))\n"
987+
" print(\"Número de proyectos:\", len(bills_2019))\n"
930988
]
931989
}
932990
],

0 commit comments

Comments
 (0)