
Commit 4e82073

Improve ACL results (#107)

Two major changes:
- reduce false positives by requiring the match to be a whole token (word) - short strings often matched base64-encoded text
- remove false negatives when patterns were skipped after encountering a search limit
1 parent a32d751 commit 4e82073
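
The first fix is implemented in the new prune_hits_to_ignore cell in the notebook diff below: a hit only counts when the search term appears as a standalone token, not merely as a substring. A minimal sketch of the idea, with an illustrative helper name (the notebook builds the same \b-anchored regex):

import re

def found_as_token(term, fragment):
    """True only when `term` appears as a standalone word in `fragment`."""
    # \b demands a word boundary on each side, so a term buried inside a
    # longer alphanumeric run (e.g. base64-encoded text) no longer matches
    return re.search(rf"\b{re.escape(term)}\b", fragment, re.IGNORECASE) is not None

# substring hit inside base64-style noise: a false positive before this change
print(found_as_token("acme", "c2VjcmV0YWNtZXRva2VuacmeZm9v"))  # False
# genuine ACL-style reference: still reported
print(found_as_token("acme", "allow-group: ACME, admins"))      # True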

File tree: 3 files changed, +91 −47 lines

.secrets.baseline

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@
         "filename": "notebooks/UserSearchPy3.ipynb",
         "hashed_secret": "13b897fb3181b06360814e15a2535df2624de13a",
         "is_verified": false,
-        "line_number": 2181,
+        "line_number": 2225,
         "is_secret": false
       }
     ],
@@ -130,5 +130,5 @@
       }
     ]
   },
-  "generated_at": "2023-05-04T20:09:16Z"
+  "generated_at": "2023-05-24T18:00:29Z"
 }

Makefile

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ SHELL := /bin/bash
 .PHONY: build debug_build
 # New build
 build: Dockerfile .dockerignore Makefile notebooks/*ipynb requirements*.txt
-	docker build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
+	docker buildx build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
 # debug the build by not using buildkit - we also assume last one failed, so no need to tag prior
 debug-build:
 	DOCKER_BUILDKIT=0 docker build --tag $(image_to_use):debug .
@@ -77,7 +77,7 @@ run-edit:
 	& \
 	job_pid=$$! ; \
 	sleep 10 ; \
-	docker ps --filter "ancestor=$(image_to_use):$(github3_version)" ; \
+	docker ps --filter "ancestor=$(image_to_use)" ; \
 	wait $$job_pid ; \
 	) '

notebooks/UserSearchPy3.ipynb

Lines changed: 87 additions & 43 deletions
@@ -510,6 +510,8 @@
     "    resources = limits[\"resources\"]\n",
     "    # print(\"{:3d} keys: \".format(len(resources.keys())), resources.keys())\n",
     "    # print(resources)\n",
+    "    from pprint import pprint as pp\n",
+    "    pp(f\"{limits=}\")\n",
     "    for reset in list(resources.keys()):\n",
     "        reset_at = resources[reset][\"reset\"]\n",
     "        reset_max = max(reset_at, reset_max)\n",
@@ -601,6 +603,7 @@
     "    \"\"\"\n",
     "    mozilla-services\n",
     "    mozilla\n",
+    "    pocket\n",
     "    \"\"\".split()\n",
     "    )\n",
     "    elif use_canned_org_list:  # old school\n",
@@ -1328,6 +1331,41 @@
     "lines_to_next_cell": 1
    },
    "outputs": [],
+   "source": [
+    "def prune_hits_to_ignore(full_list, id_to_find):\n",
+    "    # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
+    "    # requires refresh of repository object\n",
+    "    hit_list_1 = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
+    "                  and (not r.repository.refresh().archived)\n",
+    "                 ]\n",
+    "    # now eliminate any hits where the search term was not found \"as a word\"\n",
+    "    id_re = re.compile(fr\"\\b{id_to_find}\\b\", re.IGNORECASE)\n",
+    "    hit_list_2 = []\n",
+    "#    print(f\"Checking {len(hit_list_1)} hits\")\n",
+    "    for index, hit in enumerate(hit_list_1):\n",
+    "#        print(f\"  Hit {index} has {len(hit.text_matches)} contexts\")\n",
+    "        for ctxt, context in enumerate(hit.text_matches):\n",
+    "            if id_re.search(context[\"fragment\"]):\n",
+    "                hit_list_2.append(hit)\n",
+    "#                print(f\"Adding hit {index}; context {ctxt} ({len(hit_list_2)=}): {context['fragment']}\")\n",
+    "                break\n",
+    "            else:\n",
+    "#                print(f\"ignoring context {context['fragment']}\")\n",
+    "                ...\n",
+    "#    print(f\"returning {len(hit_list_2)} hits\")\n",
+    "    return hit_list_2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74a23434",
+   "metadata": {
+    "hidden": true,
+    "init_cell": true,
+    "lines_to_next_cell": 1
+   },
+   "outputs": [],
    "source": [
     "import csv, io\n",
     "def check_for_acls(logins):\n",
@@ -1341,11 +1379,11 @@
     "    # we're now outputing in CSV format, so put in a header line\n",
     "    csvfile = io.StringIO()\n",
     "    writer = csv.writer(csvfile)\n",
-    "    writer.writerow([\"Action Taken\", \"Comment\", \"\", \"File\", \"Search URL\"])\n",
+    "    writer.writerow([\"Action Taken\", \"Comment\", \"\", \"Context\", \"File\", \"Search URL\", \"Raw Context\"])\n",
     "    # add formula to use for copy down in R2C3 - still requires manual intervention\n",
     "    #  1. in cell C3 select, edit, and enter to make real formula\n",
     "    #  2. fill down for all rows in sheet\n",
-    "    writer.writerow([\"\", \"\", '=if(ISBLANK(E2),\"\", HYPERLINK(E2,\"?\"))', \"\", \"\"])\n",
+    "    writer.writerow([\"\", \"\", '=if(ISBLANK(F2),\"\", HYPERLINK(F2,\"?\"))', '=if(isblank(G2),,SUBSTITUTE(G2,\"\\\\n\",char(10)))', \"\", \"\"])\n",
     "    writer.writerow([\"\"] * 4)\n",
     "    writer.writerow([f\"Checking for possible ACLs for: {', '.join(possibles)}\", \"\", \"\",])\n",
     "    writer.writerow([\"\"] * 4)\n",
@@ -1360,57 +1398,62 @@
     "#         print(f\"  {org}..\", end='')\n",
     "        for l in possibles:\n",
     "            full_list = []\n",
-    "            try:\n",
-    "                full_list = list(gh.search_code(query=f\"org:{org} {l}\"))\n",
-    "            except Exception as e:\n",
-    "                if isinstance(e, http.client.RemoteDisconnected):\n",
-    "                    # This is \"fun\" to run into - doesn't happen very often\n",
-    "                    # so this recovery is an educated guess (the time I\n",
-    "                    # did see it, it was after a 'resumed' message from\n",
-    "                    # the clause below)\n",
-    "                    for i in range(3):\n",
-    "                        try_login()\n",
-    "                        if gh:\n",
-    "                            # re-established connection\n",
-    "                            print(f\"re-established connection on try {i+1}\")\n",
-    "                            break\n",
+    "            assume_time_out = True\n",
+    "            while assume_time_out:\n",
+    "                try:\n",
+    "                    # 2023-05-25 can't use regex in code search, so return context for further processing\n",
+    "                    full_list = list(gh.search_code(query=f\"org:{org} {l}\", text_match=True))\n",
+    "                    assume_time_out = False\n",
+    "                except Exception as e:\n",
+    "                    if isinstance(e, http.client.RemoteDisconnected):\n",
+    "                        # This is \"fun\" to run into - doesn't happen very often\n",
+    "                        # so this recovery is an educated guess (the time I\n",
+    "                        # did see it, it was after a 'resumed' message from\n",
+    "                        # the clause below)\n",
+    "                        for i in range(3):\n",
+    "                            try_login()\n",
+    "                            if gh:\n",
+    "                                # re-established connection\n",
+    "                                print(f\"re-established connection on try {i+1}\")\n",
+    "                                break\n",
+    "                            else:\n",
+    "                                time.sleep(60)\n",
     "                        else:\n",
-    "                            time.sleep(60)\n",
-    "                    else:\n",
-    "                        print(f\"failed to re-establish connection after {i+1} tries\")\n",
-    "                        raise SystemExit\n",
-    "                elif e.code not in [403, 422]:\n",
-    "                    print(f\"org={org} l={l} exception={str(e)}\")\n",
-    "                elif e.code in [403]:\n",
-    "                    print(\"\\n\\nOut of API calls, waiting a minute ..\", end='')\n",
-    "                    print_limits(verbose=False)\n",
-    "                    # we can hit this a lot, so just wait a minute\n",
-    "                    time.sleep(60)\n",
-    "                    print(\"... resumed.\")\n",
-    "                # we've reported on everything of interest, no need for else clause\n",
-    "#                 else:\n",
-    "#                     print(f\"Got code {e.code} for org {org}, search {l}\")\n",
-    "            # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
-    "            # requires refresh of repository object\n",
-    "            hit_list = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
-    "                        and (not r.repository.refresh().archived)]\n",
-    "            num_search_results = len(hit_list)\n",
+    "                            print(f\"failed to re-establish connection after {i+1} tries\")\n",
+    "                            raise SystemExit\n",
+    "                    elif not hasattr(e, 'code'):\n",
+    "                        print(f\"org={org} l={l} exception={str(e)} (exception type {type(e)})\")\n",
+    "                    elif e.code not in [403, 422]:\n",
+    "                        print(f\"org={org} l={l} exception={str(e)}\")\n",
+    "                    elif e.code in [403]:\n",
+    "                        seconds_to_wait = 7\n",
+    "                        print(f\"\\nOut of Code Search API calls, waiting {seconds_to_wait} seconds ({org=}, {l=}) ..\", end='')\n",
+    "                        # we can hit this a lot, so just wait a minute - only 10 req/min\n",
+    "                        # per https://docs.github.com/en/enterprise-cloud@latest/rest/search?apiVersion=2022-11-28#rate-limit\n",
+    "                        time.sleep(seconds_to_wait)\n",
+    "                        print(\"... resumed.\")\n",
+    "                    # we've reported on everything of interest, no need for else clause\n",
+    "                    # else:\n",
+    "                    #     print(f\"Got code {e.code} for org {org}, search {l}\")\n",
+    "\n",
+    "            hit_list = prune_hits_to_ignore(full_list, l)\n",
     "\n",
     "            search_urls = []\n",
     "            for search_hit in hit_list:\n",
-    "                new_url = search_hit_to_url(search_hit.html_url, l)\n",
-    "#                 print(f\"before: {search_hit.html_url}\\n after: {new_url}\")\n",
+    "                new_url = search_hit_to_url(search_hit.html_url, l, debug=False)\n",
     "                if new_url:\n",
-    "                    search_urls.append(new_url)\n",
+    "                    # add the matching fragments as the 2nd item of a tupple\n",
+    "                    context = \"\\n----\\n\".join([m['fragment'] for m in search_hit.text_matches])\n",
+    "                    search_urls.append((*new_url, context.replace(\"\\n\", \"\\\\n\")))\n",
     "            num_raw_search_urls = len(search_urls)\n",
     "            search_urls = set(search_urls)\n",
     "            num_search_urls = len(search_urls)\n",
-    "#             print(f\"search results: {num_search_results}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
+    "#             print(f\"search results: {len(hit_list)}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
     "            if num_search_urls > 0:\n",
     "                writer.writerow(['', f\"{num_search_urls} files with possible ACLs in {org} for {l}:\", \"\", \"\"])\n",
-    "                for url, repo, path, filename in sorted(search_urls):\n",
+    "                for url, repo, path, filename, context in sorted(search_urls):\n",
     "                    # output in csv format\n",
-    "                    writer.writerow([\"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\"])\n",
+    "                    writer.writerow([\"\", \"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\", context])\n",
     "    # import pdb ; pdb.set_trace()\n",
     "    csvfile.seek(0)\n",
     "    hits = [l.strip() for l in csvfile.readlines()]\n",
@@ -2115,7 +2158,8 @@
    "execution_count": null,
    "id": "9112d45e",
    "metadata": {
-    "hidden": true
+    "hidden": true,
+    "scrolled": false
    },
    "outputs": [],
    "source": [
