Improve ACL results (#107)

Two major changes:
- reduce false positives by requiring the match to occur as a token (whole word)
  - short strings often matched base64-encoded text (see the sketch below)
- remove false negatives when patterns were skipped after encountering a
  search limit (see the retry sketch after the diff)
hwine authored May 24, 2023
1 parent a32d751 commit 4e82073
Showing 3 changed files with 91 additions and 47 deletions.
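
The first change shows up below as a new `prune_hits_to_ignore` helper, which keeps a search hit only when the ID occurs as a whole word inside a matched fragment. A minimal sketch of that filter, with illustrative names (`matches_as_token` is not in the commit, and `re.escape` is a defensive extra here; the notebook compiles `fr"\b{id_to_find}\b"` inline):

```python
import re

def matches_as_token(needle: str, haystack: str) -> bool:
    """True only when needle appears as a whole word (token) in haystack."""
    # \b word boundaries reject matches embedded in longer runs of word
    # characters, e.g. a short login buried inside a base64 blob.
    return re.search(rf"\b{re.escape(needle)}\b", haystack, re.IGNORECASE) is not None

print("abc" in "QWJjabcRlc3Q=")                     # True  - substring match, false positive
print(matches_as_token("abc", "QWJjabcRlc3Q="))     # False - pruned by the token filter
print(matches_as_token("abc", "grant abc: admin"))  # True  - a legitimate hit is kept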
.secrets.baseline (4 changes: 2 additions & 2 deletions)

@@ -115,7 +115,7 @@
 "filename": "notebooks/UserSearchPy3.ipynb",
 "hashed_secret": "13b897fb3181b06360814e15a2535df2624de13a",
 "is_verified": false,
-"line_number": 2181,
+"line_number": 2225,
 "is_secret": false
 }
 ],
@@ -130,5 +130,5 @@
 }
 ]
 },
-"generated_at": "2023-05-04T20:09:16Z"
+"generated_at": "2023-05-24T18:00:29Z"
 }
Makefile (4 changes: 2 additions & 2 deletions)

@@ -31,7 +31,7 @@ SHELL := /bin/bash
 .PHONY: build debug_build
 # New build
 build: Dockerfile .dockerignore Makefile notebooks/*ipynb requirements*.txt
-	docker build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
+	docker buildx build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
 # debug the build by not using buildkit - we also assume last one failed, so no need to tag prior
 debug-build:
 	DOCKER_BUILDKIT=0 docker build --tag $(image_to_use):debug .
@@ -77,7 +77,7 @@ run-edit:
 	& \
 	job_pid=$$! ; \
 	sleep 10 ; \
-	docker ps --filter "ancestor=$(image_to_use):$(github3_version)" ; \
+	docker ps --filter "ancestor=$(image_to_use)" ; \
 	wait $$job_pid ; \
 	) '
notebooks/UserSearchPy3.ipynb (130 changes: 87 additions & 43 deletions)

@@ -510,6 +510,8 @@
 " resources = limits[\"resources\"]\n",
 " # print(\"{:3d} keys: \".format(len(resources.keys())), resources.keys())\n",
 " # print(resources)\n",
+" from pprint import pprint as pp\n",
+" pp(f\"{limits=}\")\n",
 " for reset in list(resources.keys()):\n",
 " reset_at = resources[reset][\"reset\"]\n",
 " reset_max = max(reset_at, reset_max)\n",
@@ -601,6 +603,7 @@
 " \"\"\"\n",
 " mozilla-services\n",
 " mozilla\n",
+" pocket\n",
 " \"\"\".split()\n",
 " )\n",
 " elif use_canned_org_list: # old school\n",
@@ -1328,6 +1331,41 @@
 "lines_to_next_cell": 1
 },
 "outputs": [],
+"source": [
+"def prune_hits_to_ignore(full_list, id_to_find):\n",
+" # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
+" # requires refresh of repository object\n",
+" hit_list_1 = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
+" and (not r.repository.refresh().archived)\n",
+" ]\n",
+" # now eliminate any hits where the search term was not found \"as a word\"\n",
+" id_re = re.compile(fr\"\\b{id_to_find}\\b\", re.IGNORECASE)\n",
+" hit_list_2 = []\n",
+"# print(f\"Checking {len(hit_list_1)} hits\")\n",
+" for index, hit in enumerate(hit_list_1):\n",
+"# print(f\" Hit {index} has {len(hit.text_matches)} contexts\")\n",
+" for ctxt, context in enumerate(hit.text_matches):\n",
+" if id_re.search(context[\"fragment\"]):\n",
+" hit_list_2.append(hit)\n",
+"# print(f\"Adding hit {index}; context {ctxt} ({len(hit_list_2)=}): {context['fragment']}\")\n",
+" break\n",
+" else:\n",
+"# print(f\"ignoring context {context['fragment']}\")\n",
+" ...\n",
+"# print(f\"returning {len(hit_list_2)} hits\")\n",
+" return hit_list_2"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "74a23434",
+"metadata": {
+"hidden": true,
+"init_cell": true,
+"lines_to_next_cell": 1
+},
+"outputs": [],
 "source": [
 "import csv, io\n",
 "def check_for_acls(logins):\n",
@@ -1341,11 +1379,11 @@
 " # we're now outputing in CSV format, so put in a header line\n",
 " csvfile = io.StringIO()\n",
 " writer = csv.writer(csvfile)\n",
-" writer.writerow([\"Action Taken\", \"Comment\", \"\", \"File\", \"Search URL\"])\n",
+" writer.writerow([\"Action Taken\", \"Comment\", \"\", \"Context\", \"File\", \"Search URL\", \"Raw Context\"])\n",
 " # add formula to use for copy down in R2C3 - still requires manual intervention\n",
 " # 1. in cell C3 select, edit, and enter to make real formula\n",
 " # 2. fill down for all rows in sheet\n",
-" writer.writerow([\"\", \"\", '=if(ISBLANK(E2),\"\", HYPERLINK(E2,\"?\"))', \"\", \"\"])\n",
+" writer.writerow([\"\", \"\", '=if(ISBLANK(F2),\"\", HYPERLINK(F2,\"?\"))', '=if(isblank(G2),,SUBSTITUTE(G2,\"\\\\n\",char(10)))', \"\", \"\"])\n",
 " writer.writerow([\"\"] * 4)\n",
 " writer.writerow([f\"Checking for possible ACLs for: {', '.join(possibles)}\", \"\", \"\",])\n",
 " writer.writerow([\"\"] * 4)\n",
@@ -1360,57 +1398,62 @@
 "# print(f\" {org}..\", end='')\n",
 " for l in possibles:\n",
 " full_list = []\n",
-" try:\n",
-" full_list = list(gh.search_code(query=f\"org:{org} {l}\"))\n",
-" except Exception as e:\n",
-" if isinstance(e, http.client.RemoteDisconnected):\n",
-" # This is \"fun\" to run into - doesn't happen very often\n",
-" # so this recovery is an educated guess (the time I\n",
-" # did see it, it was after a 'resumed' message from\n",
-" # the clause below)\n",
-" for i in range(3):\n",
-" try_login()\n",
-" if gh:\n",
-" # re-established connection\n",
-" print(f\"re-established connection on try {i+1}\")\n",
-" break\n",
+" assume_time_out = True\n",
+" while assume_time_out:\n",
+" try:\n",
+" # 2023-05-25 can't use regex in code search, so return context for further processing\n",
+" full_list = list(gh.search_code(query=f\"org:{org} {l}\", text_match=True))\n",
+" assume_time_out = False\n",
+" except Exception as e:\n",
+" if isinstance(e, http.client.RemoteDisconnected):\n",
+" # This is \"fun\" to run into - doesn't happen very often\n",
+" # so this recovery is an educated guess (the time I\n",
+" # did see it, it was after a 'resumed' message from\n",
+" # the clause below)\n",
+" for i in range(3):\n",
+" try_login()\n",
+" if gh:\n",
+" # re-established connection\n",
+" print(f\"re-established connection on try {i+1}\")\n",
+" break\n",
+" else:\n",
+" time.sleep(60)\n",
 " else:\n",
-" time.sleep(60)\n",
-" else:\n",
-" print(f\"failed to re-establish connection after {i+1} tries\")\n",
-" raise SystemExit\n",
-" elif e.code not in [403, 422]:\n",
-" print(f\"org={org} l={l} exception={str(e)}\")\n",
-" elif e.code in [403]:\n",
-" print(\"\\n\\nOut of API calls, waiting a minute ..\", end='')\n",
-" print_limits(verbose=False)\n",
-" # we can hit this a lot, so just wait a minute\n",
-" time.sleep(60)\n",
-" print(\"... resumed.\")\n",
-" # we've reported on everything of interest, no need for else clause\n",
-"# else:\n",
-"# print(f\"Got code {e.code} for org {org}, search {l}\")\n",
-" # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
-" # requires refresh of repository object\n",
-" hit_list = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
-" and (not r.repository.refresh().archived)]\n",
-" num_search_results = len(hit_list)\n",
+" print(f\"failed to re-establish connection after {i+1} tries\")\n",
+" raise SystemExit\n",
+" elif not hasattr(e, 'code'):\n",
+" print(f\"org={org} l={l} exception={str(e)} (exception type {type(e)})\")\n",
+" elif e.code not in [403, 422]:\n",
+" print(f\"org={org} l={l} exception={str(e)}\")\n",
+" elif e.code in [403]:\n",
+" seconds_to_wait = 7\n",
+" print(f\"\\nOut of Code Search API calls, waiting {seconds_to_wait} seconds ({org=}, {l=}) ..\", end='')\n",
+" # we can hit this a lot, so just wait a minute - only 10 req/min\n",
+" # per https://docs.github.com/en/enterprise-cloud@latest/rest/search?apiVersion=2022-11-28#rate-limit\n",
+" time.sleep(seconds_to_wait)\n",
+" print(\"... resumed.\")\n",
+" # we've reported on everything of interest, no need for else clause\n",
+" # else:\n",
+" # print(f\"Got code {e.code} for org {org}, search {l}\")\n",
+"\n",
+" hit_list = prune_hits_to_ignore(full_list, l)\n",
 "\n",
 " search_urls = []\n",
 " for search_hit in hit_list:\n",
-" new_url = search_hit_to_url(search_hit.html_url, l)\n",
-"# print(f\"before: {search_hit.html_url}\\n after: {new_url}\")\n",
+" new_url = search_hit_to_url(search_hit.html_url, l, debug=False)\n",
 " if new_url:\n",
-" search_urls.append(new_url)\n",
+" # add the matching fragments as the 2nd item of a tupple\n",
+" context = \"\\n----\\n\".join([m['fragment'] for m in search_hit.text_matches])\n",
+" search_urls.append((*new_url, context.replace(\"\\n\", \"\\\\n\")))\n",
 " num_raw_search_urls = len(search_urls)\n",
 " search_urls = set(search_urls)\n",
 " num_search_urls = len(search_urls)\n",
-"# print(f\"search results: {num_search_results}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
+"# print(f\"search results: {len(hit_list)}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
 " if num_search_urls > 0:\n",
 " writer.writerow(['', f\"{num_search_urls} files with possible ACLs in {org} for {l}:\", \"\", \"\"])\n",
-" for url, repo, path, filename in sorted(search_urls):\n",
+" for url, repo, path, filename, context in sorted(search_urls):\n",
 " # output in csv format\n",
-" writer.writerow([\"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\"])\n",
+" writer.writerow([\"\", \"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\", context])\n",
 " # import pdb ; pdb.set_trace()\n",
 " csvfile.seek(0)\n",
 " hits = [l.strip() for l in csvfile.readlines()]\n",
@@ -2115,7 +2158,8 @@
 "execution_count": null,
 "id": "9112d45e",
 "metadata": {
-"hidden": true
+"hidden": true,
+"scrolled": false
 },
 "outputs": [],
 "source": [
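
The second fix replaces a single `try`/`except` around `gh.search_code()` with the `assume_time_out` loop above, so a rate-limited query is retried instead of silently yielding an empty result. A minimal sketch of that shape under assumptions: `run_search` is a placeholder callable, the sleep mirrors the 7-second wait chosen above (code search allows only ~10 requests/minute), and this version is bounded where the notebook's loop retries indefinitely:

```python
import time

def search_with_retry(run_search, seconds_to_wait=7, max_tries=30):
    """Retry a rate-limited search instead of skipping the pattern.

    The old cell gave up after one failed attempt, so hitting a 403 meant
    the search term was silently dropped - a false negative.
    """
    for attempt in range(1, max_tries + 1):
        try:
            return run_search()
        except Exception as e:
            print(f"try {attempt} failed ({e}); waiting {seconds_to_wait}s")
            time.sleep(seconds_to_wait)
    raise RuntimeError(f"search still failing after {max_tries} tries")

# usage sketch with a hypothetical github3.py session `gh`:
# hits = search_with_retry(
#     lambda: list(gh.search_code(query="org:mozilla some-id", text_match=True)))
```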
