|
510 | 510 | " resources = limits[\"resources\"]\n",
|
511 | 511 | " # print(\"{:3d} keys: \".format(len(resources.keys())), resources.keys())\n",
|
512 | 512 | " # print(resources)\n",
|
| 513 | + " from pprint import pprint as pp\n", |
| 514 | + " pp(f\"{limits=}\")\n", |
513 | 515 | " for reset in list(resources.keys()):\n",
|
514 | 516 | " reset_at = resources[reset][\"reset\"]\n",
|
515 | 517 | " reset_max = max(reset_at, reset_max)\n",
|
|
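The loop above walks every resource bucket in the rate-limit payload and keeps the latest reset time. A minimal standalone sketch of that scan, assuming `limits` has the shape of GitHub's `/rate_limit` response (the sample values here are invented):

```python
# Standalone sketch of the reset-time scan; the dict shape mirrors GitHub's
# /rate_limit response, but these sample numbers are invented.
limits = {
    "resources": {
        "core":   {"limit": 5000, "remaining": 4990, "reset": 1700000060},
        "search": {"limit": 30,   "remaining": 0,    "reset": 1700000120},
    }
}

reset_max = 0
for resource in limits["resources"].values():
    reset_max = max(resource["reset"], reset_max)

print(reset_max)  # latest reset epoch across all buckets -> 1700000120
```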
601 | 603 | " \"\"\"\n",
|
602 | 604 | " mozilla-services\n",
|
603 | 605 | " mozilla\n",
|
| 606 | + " pocket\n", |
604 | 607 | " \"\"\".split()\n",
|
605 | 608 | " )\n",
|
606 | 609 | " elif use_canned_org_list: # old school\n",
|
|
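The canned org list above is built from a triple-quoted string: `str.split()` with no arguments splits on any whitespace, so the block reads as one org per line and the indentation is ignored. A small sketch of the idiom (the `set(...)` wrapper is an assumption for illustration; the notebook's own enclosing call sits outside this hunk):

```python
# One org per line; split() on bare whitespace drops the surrounding
# newlines and indentation. The set() wrapper is an assumption here.
orgs = set(
    """
    mozilla-services
    mozilla
    pocket
    """.split()
)
print(sorted(orgs))  # ['mozilla', 'mozilla-services', 'pocket']
```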
1328 | 1331 | "lines_to_next_cell": 1
|
1329 | 1332 | },
|
1330 | 1333 | "outputs": [],
|
| 1334 | + "source": [ |
| 1335 | + "def prune_hits_to_ignore(full_list, id_to_find):\n", |
| 1336 | + " # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n", |
| 1337 | + "    # requires refresh of repository object)\n", |
| 1338 | + "    hit_list_1 = [r for r in full_list if (\"-ghsa-\" not in r.repository.name)\n", |
| 1339 | + " and (not r.repository.refresh().archived)\n", |
| 1340 | + " ]\n", |
| 1341 | + " # now eliminate any hits where the search term was not found \"as a word\"\n", |
| 1342 | + " id_re = re.compile(fr\"\\b{id_to_find}\\b\", re.IGNORECASE)\n", |
| 1343 | + " hit_list_2 = []\n", |
| 1344 | + "# print(f\"Checking {len(hit_list_1)} hits\")\n", |
| 1345 | + " for index, hit in enumerate(hit_list_1):\n", |
| 1346 | + "# print(f\" Hit {index} has {len(hit.text_matches)} contexts\")\n", |
| 1347 | + " for ctxt, context in enumerate(hit.text_matches):\n", |
| 1348 | + " if id_re.search(context[\"fragment\"]):\n", |
| 1349 | + " hit_list_2.append(hit)\n", |
| 1350 | + "# print(f\"Adding hit {index}; context {ctxt} ({len(hit_list_2)=}): {context['fragment']}\")\n", |
| 1351 | + " break\n", |
| 1352 | + " else:\n", |
| 1353 | + "# print(f\"ignoring context {context['fragment']}\")\n", |
| 1354 | + " ...\n", |
| 1355 | + "# print(f\"returning {len(hit_list_2)} hits\")\n", |
| 1356 | + " return hit_list_2" |
| 1357 | + ] |
| 1358 | + }, |
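`prune_hits_to_ignore` keeps a hit as soon as any of its text-match fragments contains the id as a whole word, using `\b` anchors and a `break` out of the fragment loop. A self-contained sketch of that filter, with plain dicts standing in for the `github3` result objects (the `re.escape` call is an addition in this sketch to guard against regex metacharacters; the notebook interpolates the id directly):

```python
import re

def keep_whole_word_hits(hits, id_to_find):
    # \b makes "ops" match "ops" but not "devops"
    id_re = re.compile(rf"\b{re.escape(id_to_find)}\b", re.IGNORECASE)
    kept = []
    for hit in hits:
        for context in hit["text_matches"]:
            if id_re.search(context["fragment"]):
                kept.append(hit)
                break  # first matching fragment is enough, as in the notebook
    return kept

# Plain dicts stand in for github3 search results.
hits = [
    {"text_matches": [{"fragment": "grant ops team access"}]},
    {"text_matches": [{"fragment": "devops pipeline config"}]},
]
print(len(keep_whole_word_hits(hits, "ops")))  # -> 1
```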
| 1359 | + { |
| 1360 | + "cell_type": "code", |
| 1361 | + "execution_count": null, |
| 1362 | + "id": "74a23434", |
| 1363 | + "metadata": { |
| 1364 | + "hidden": true, |
| 1365 | + "init_cell": true, |
| 1366 | + "lines_to_next_cell": 1 |
| 1367 | + }, |
| 1368 | + "outputs": [], |
1331 | 1369 | "source": [
|
1332 | 1370 | "import csv, io\n",
|
1333 | 1371 | "def check_for_acls(logins):\n",
|
|
1341 | 1379 | "    # we're now outputting in CSV format, so put in a header line\n",
|
1342 | 1380 | " csvfile = io.StringIO()\n",
|
1343 | 1381 | " writer = csv.writer(csvfile)\n",
|
1344 | - "    writer.writerow([\"Action Taken\", \"Comment\", \"\", \"File\", \"Search URL\"])\n", |
| 1382 | + " writer.writerow([\"Action Taken\", \"Comment\", \"\", \"Context\", \"File\", \"Search URL\", \"Raw Context\"])\n", |
1345 | 1383 | " # add formula to use for copy down in R2C3 - still requires manual intervention\n",
|
1346 | 1384 | " # 1. in cell C3 select, edit, and enter to make real formula\n",
|
1347 | 1385 | " # 2. fill down for all rows in sheet\n",
|
1348 | - "    writer.writerow([\"\", \"\", '=if(ISBLANK(E2),\"\", HYPERLINK(E2,\"?\"))', \"\", \"\"])\n", |
| 1386 | + " writer.writerow([\"\", \"\", '=if(ISBLANK(F2),\"\", HYPERLINK(F2,\"?\"))', '=if(isblank(G2),,SUBSTITUTE(G2,\"\\\\n\",char(10)))', \"\", \"\"])\n", |
1349 | 1387 | " writer.writerow([\"\"] * 4)\n",
|
1350 | 1388 | " writer.writerow([f\"Checking for possible ACLs for: {', '.join(possibles)}\", \"\", \"\",])\n",
|
1351 | 1389 | " writer.writerow([\"\"] * 4)\n",
|
|
1360 | 1398 | "# print(f\" {org}..\", end='')\n",
|
1361 | 1399 | " for l in possibles:\n",
|
1362 | 1400 | " full_list = []\n",
|
1363 | - "            try:\n", |
1364 | - "                full_list = list(gh.search_code(query=f\"org:{org} {l}\"))\n", |
1365 | - "            except Exception as e:\n", |
1366 | - "                if isinstance(e, http.client.RemoteDisconnected):\n", |
1367 | - "                    # This is \"fun\" to run into - doesn't happen very often\n", |
1368 | - "                    # so this recovery is an educated guess (the time I\n", |
1369 | - "                    # did see it, it was after a 'resumed' message from\n", |
1370 | - "                    # the clause below)\n", |
1371 | - "                    for i in range(3):\n", |
1372 | - "                        try_login()\n", |
1373 | - "                        if gh:\n", |
1374 | - "                            # re-established connection\n", |
1375 | - "                            print(f\"re-established connection on try {i+1}\")\n", |
1376 | - "                            break\n", |
| 1401 | + " assume_time_out = True\n", |
| 1402 | + " while assume_time_out:\n", |
| 1403 | + " try:\n", |
| 1404 | + " # 2023-05-25 can't use regex in code search, so return context for further processing\n", |
| 1405 | + " full_list = list(gh.search_code(query=f\"org:{org} {l}\", text_match=True))\n", |
| 1406 | + " assume_time_out = False\n", |
| 1407 | + " except Exception as e:\n", |
| 1408 | + " if isinstance(e, http.client.RemoteDisconnected):\n", |
| 1409 | + " # This is \"fun\" to run into - doesn't happen very often\n", |
| 1410 | + " # so this recovery is an educated guess (the time I\n", |
| 1411 | + " # did see it, it was after a 'resumed' message from\n", |
| 1412 | + " # the clause below)\n", |
| 1413 | + " for i in range(3):\n", |
| 1414 | + " try_login()\n", |
| 1415 | + " if gh:\n", |
| 1416 | + " # re-established connection\n", |
| 1417 | + " print(f\"re-established connection on try {i+1}\")\n", |
| 1418 | + " break\n", |
| 1419 | + " else:\n", |
| 1420 | + " time.sleep(60)\n", |
1377 | 1421 | " else:\n",
|
1378 | - "                            time.sleep(60)\n", |
1379 | - "                    else:\n", |
1380 | - "                        print(f\"failed to re-establish connection after {i+1} tries\")\n", |
1381 | - "                        raise SystemExit\n", |
1382 | - "                elif e.code not in [403, 422]:\n", |
1383 | - "                    print(f\"org={org} l={l} exception={str(e)}\")\n", |
1384 | - "                elif e.code in [403]:\n", |
1385 | - "                    print(\"\\n\\nOut of API calls, waiting a minute ..\", end='')\n", |
1386 | - "                    print_limits(verbose=False)\n", |
1387 | - "                    # we can hit this a lot, so just wait a minute\n", |
1388 | - "                    time.sleep(60)\n", |
1389 | - "                    print(\"... resumed.\")\n", |
1390 | - "                # we've reported on everything of interest, no need for else clause\n", |
1391 | - "#             else:\n", |
1392 | - "#                 print(f\"Got code {e.code} for org {org}, search {l}\")\n", |
1393 | - "            # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n", |
1394 | - "            # requires refresh of repository object\n", |
1395 | - "            hit_list = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n", |
1396 | - "                                               and (not r.repository.refresh().archived)]\n", |
1397 | - "            num_search_results = len(hit_list)\n",
| 1422 | + " print(f\"failed to re-establish connection after {i+1} tries\")\n", |
| 1423 | + " raise SystemExit\n", |
| 1424 | + " elif not hasattr(e, 'code'):\n", |
| 1425 | + " print(f\"org={org} l={l} exception={str(e)} (exception type {type(e)})\")\n", |
| 1426 | + " elif e.code not in [403, 422]:\n", |
| 1427 | + " print(f\"org={org} l={l} exception={str(e)}\")\n", |
| 1428 | + " elif e.code in [403]:\n", |
| 1429 | + " seconds_to_wait = 7\n", |
| 1430 | + " print(f\"\\nOut of Code Search API calls, waiting {seconds_to_wait} seconds ({org=}, {l=}) ..\", end='')\n", |
| 1431 | + "                        # we can hit this a lot, so just wait a few seconds - only 10 req/min\n", |
| 1432 | + " # per https://docs.github.com/en/enterprise-cloud@latest/rest/search?apiVersion=2022-11-28#rate-limit\n", |
| 1433 | + " time.sleep(seconds_to_wait)\n", |
| 1434 | + " print(\"... resumed.\")\n", |
| 1435 | + " # we've reported on everything of interest, no need for else clause\n", |
| 1436 | + " # else:\n", |
| 1437 | + " # print(f\"Got code {e.code} for org {org}, search {l}\")\n", |
| 1438 | + "\n", |
| 1439 | + " hit_list = prune_hits_to_ignore(full_list, l)\n", |
1398 | 1440 | "\n",
|
1399 | 1441 | " search_urls = []\n",
|
1400 | 1442 | " for search_hit in hit_list:\n",
|
1401 | - "                new_url = search_hit_to_url(search_hit.html_url, l)\n", |
1402 | - "#                 print(f\"before: {search_hit.html_url}\\n after: {new_url}\")\n", |
| 1443 | + " new_url = search_hit_to_url(search_hit.html_url, l, debug=False)\n", |
1403 | 1444 | " if new_url:\n",
|
1404 | - "                    search_urls.append(new_url)\n", |
| 1445 | + "                    # add the matching fragments as the last item of the tuple\n", |
| 1446 | + " context = \"\\n----\\n\".join([m['fragment'] for m in search_hit.text_matches])\n", |
| 1447 | + " search_urls.append((*new_url, context.replace(\"\\n\", \"\\\\n\")))\n", |
1405 | 1448 | " num_raw_search_urls = len(search_urls)\n",
|
1406 | 1449 | " search_urls = set(search_urls)\n",
|
1407 | 1450 | " num_search_urls = len(search_urls)\n",
|
1408 | - "#             print(f\"search results: {num_search_results}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
| 1451 | + "# print(f\"search results: {len(hit_list)}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n", |
1409 | 1452 | " if num_search_urls > 0:\n",
|
1410 | 1453 | " writer.writerow(['', f\"{num_search_urls} files with possible ACLs in {org} for {l}:\", \"\", \"\"])\n",
|
1411 | - "                for url, repo, path, filename in sorted(search_urls):\n", |
| 1454 | + " for url, repo, path, filename, context in sorted(search_urls):\n", |
1412 | 1455 | " # output in csv format\n",
|
1413 | - "                    writer.writerow([\"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\"])\n", |
| 1456 | + " writer.writerow([\"\", \"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\", context])\n", |
1414 | 1457 | " # import pdb ; pdb.set_trace()\n",
|
1415 | 1458 | " csvfile.seek(0)\n",
|
1416 | 1459 | " hits = [l.strip() for l in csvfile.readlines()]\n",
|
|
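The rewrite in the hunk above turns the one-shot search into a `while assume_time_out` loop, so a 403 (the code-search rate limit) retries the same `org`/login query instead of silently dropping it. A stripped-down sketch of that retry shape; `search_once` and its `RuntimeError` signaling are hypothetical stand-ins, not the notebook's API:

```python
import time

def fetch_with_retry(search_once, pause=7, max_rate_limit_waits=20):
    """Retry a search callable until it succeeds or fails for a real reason.

    search_once is a hypothetical zero-argument callable that raises
    RuntimeError("rate limited") when the API returns a 403-style refusal.
    """
    waits = 0
    while True:
        try:
            return search_once()
        except RuntimeError as e:
            if "rate limited" not in str(e) or waits >= max_rate_limit_waits:
                raise  # not a rate limit, or we've waited long enough
            waits += 1
            # code search allows roughly 10 requests/min, so a short fixed
            # pause (like the notebook's 7 seconds) is usually enough
            time.sleep(pause)
```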
2115 | 2158 | "execution_count": null,
|
2116 | 2159 | "id": "9112d45e",
|
2117 | 2160 | "metadata": {
|
2118 | - "hidden": true |
| 2161 | + "hidden": true, |
| 2162 | + "scrolled": false |
2119 | 2163 | },
|
2120 | 2164 | "outputs": [],
|
2121 | 2165 | "source": [
|
|
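Back in the main hunk, the CSV header change is worth a note: the writer embeds Google Sheets formulas as plain strings, `HYPERLINK` to linkify column F and `SUBSTITUTE(..., "\n", char(10))` to re-expand the escaped newlines stored in the raw-context column. A minimal sketch of that round trip, mirroring the diff's own rows:

```python
import csv, io

# The formula strings travel through CSV as inert text; they only become
# live formulas after the manual edit-and-fill-down pass the notebook's
# comments describe.
csvfile = io.StringIO()
writer = csv.writer(csvfile)
writer.writerow(["Action Taken", "Comment", "", "Context",
                 "File", "Search URL", "Raw Context"])
writer.writerow(["", "",
                 '=if(ISBLANK(F2),"", HYPERLINK(F2,"?"))',
                 # SUBSTITUTE turns the literal "\n" escapes written by
                 # context.replace("\n", "\\n") back into real line breaks
                 '=if(isblank(G2),,SUBSTITUTE(G2,"\\n",char(10)))',
                 "", ""])
print(csvfile.getvalue())
```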