
Commit 4e82073

Improve ACL results (#107)

Two major changes:
- reduce false positives by requiring the match to be a whole token (word) - short strings often matched base64-encoded text
- remove false negatives when patterns were skipped after encountering a search limit
1 parent a32d751 commit 4e82073
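
The first fix is implemented in the new prune_hits_to_ignore cell in the notebook diff below: a hit only counts when the search term appears as a standalone token, not merely as a substring. A minimal sketch of the idea, with an illustrative helper name (the notebook builds the same \b-anchored regex):

import re

def found_as_token(term, fragment):
    """True only when `term` appears as a standalone word in `fragment`."""
    # \b demands a word boundary on each side, so a term buried inside a
    # longer alphanumeric run (e.g. base64-encoded text) no longer matches
    return re.search(rf"\b{re.escape(term)}\b", fragment, re.IGNORECASE) is not None

# substring hit inside base64-style noise: a false positive before this change
print(found_as_token("acme", "c2VjcmV0YWNtZXRva2VuacmeZm9v"))  # False
# genuine ACL-style reference: still reported
print(found_as_token("acme", "allow-group: ACME, admins"))      # True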

File tree: 3 files changed, +91 −47 lines

.secrets.baseline

Lines changed: 2 additions & 2 deletions
@@ -115,7 +115,7 @@
         "filename": "notebooks/UserSearchPy3.ipynb",
         "hashed_secret": "13b897fb3181b06360814e15a2535df2624de13a",
         "is_verified": false,
-        "line_number": 2181,
+        "line_number": 2225,
         "is_secret": false
       }
     ],
@@ -130,5 +130,5 @@
       }
     ]
   },
-  "generated_at": "2023-05-04T20:09:16Z"
+  "generated_at": "2023-05-24T18:00:29Z"
 }

Makefile

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ SHELL := /bin/bash
 .PHONY: build debug_build
 # New build
 build: Dockerfile .dockerignore Makefile notebooks/*ipynb requirements*.txt
-	docker build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
+	docker buildx build --tag $(image_to_use):$(github3_version) --tag $(image_to_use):latest .
 # debug the build by not using buildkit - we also assume last one failed, so no need to tag prior
 debug-build:
 	DOCKER_BUILDKIT=0 docker build --tag $(image_to_use):debug .
@@ -77,7 +77,7 @@ run-edit:
 	& \
 	job_pid=$$! ; \
 	sleep 10 ; \
-	docker ps --filter "ancestor=$(image_to_use):$(github3_version)" ; \
+	docker ps --filter "ancestor=$(image_to_use)" ; \
 	wait $$job_pid ; \
 	) '

notebooks/UserSearchPy3.ipynb

Lines changed: 87 additions & 43 deletions
@@ -510,6 +510,8 @@
     "    resources = limits[\"resources\"]\n",
     "    # print(\"{:3d} keys: \".format(len(resources.keys())), resources.keys())\n",
     "    # print(resources)\n",
+    "    from pprint import pprint as pp\n",
+    "    pp(f\"{limits=}\")\n",
     "    for reset in list(resources.keys()):\n",
     "        reset_at = resources[reset][\"reset\"]\n",
     "        reset_max = max(reset_at, reset_max)\n",
@@ -601,6 +603,7 @@
     "    \"\"\"\n",
     "    mozilla-services\n",
     "    mozilla\n",
+    "    pocket\n",
     "    \"\"\".split()\n",
     "    )\n",
     "    elif use_canned_org_list:  # old school\n",
@@ -1328,6 +1331,41 @@
     "lines_to_next_cell": 1
    },
    "outputs": [],
+   "source": [
+    "def prune_hits_to_ignore(full_list, id_to_find):\n",
+    "    # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
+    "    # requires refresh of repository object\n",
+    "    hit_list_1 = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
+    "                  and (not r.repository.refresh().archived)\n",
+    "                 ]\n",
+    "    # now eliminate any hits where the search term was not found \"as a word\"\n",
+    "    id_re = re.compile(fr\"\\b{id_to_find}\\b\", re.IGNORECASE)\n",
+    "    hit_list_2 = []\n",
+    "#    print(f\"Checking {len(hit_list_1)} hits\")\n",
+    "    for index, hit in enumerate(hit_list_1):\n",
+    "#        print(f\"  Hit {index} has {len(hit.text_matches)} contexts\")\n",
+    "        for ctxt, context in enumerate(hit.text_matches):\n",
+    "            if id_re.search(context[\"fragment\"]):\n",
+    "                hit_list_2.append(hit)\n",
+    "#                print(f\"Adding hit {index}; context {ctxt} ({len(hit_list_2)=}): {context['fragment']}\")\n",
+    "                break\n",
+    "            else:\n",
+    "#                print(f\"ignoring context {context['fragment']}\")\n",
+    "                ...\n",
+    "#    print(f\"returning {len(hit_list_2)} hits\")\n",
+    "    return hit_list_2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74a23434",
+   "metadata": {
+    "hidden": true,
+    "init_cell": true,
+    "lines_to_next_cell": 1
+   },
+   "outputs": [],
    "source": [
     "import csv, io\n",
     "def check_for_acls(logins):\n",
@@ -1341,11 +1379,11 @@
     "    # we're now outputing in CSV format, so put in a header line\n",
     "    csvfile = io.StringIO()\n",
     "    writer = csv.writer(csvfile)\n",
-    "    writer.writerow([\"Action Taken\", \"Comment\", \"\", \"File\", \"Search URL\"])\n",
+    "    writer.writerow([\"Action Taken\", \"Comment\", \"\", \"Context\", \"File\", \"Search URL\", \"Raw Context\"])\n",
     "    # add formula to use for copy down in R2C3 - still requires manual intervention\n",
     "    #  1. in cell C3 select, edit, and enter to make real formula\n",
     "    #  2. fill down for all rows in sheet\n",
-    "    writer.writerow([\"\", \"\", '=if(ISBLANK(E2),\"\", HYPERLINK(E2,\"?\"))', \"\", \"\"])\n",
+    "    writer.writerow([\"\", \"\", '=if(ISBLANK(F2),\"\", HYPERLINK(F2,\"?\"))', '=if(isblank(G2),,SUBSTITUTE(G2,\"\\\\n\",char(10)))', \"\", \"\"])\n",
     "    writer.writerow([\"\"] * 4)\n",
     "    writer.writerow([f\"Checking for possible ACLs for: {', '.join(possibles)}\", \"\", \"\",])\n",
     "    writer.writerow([\"\"] * 4)\n",
@@ -1360,57 +1398,62 @@
     "#         print(f\"  {org}..\", end='')\n",
     "        for l in possibles:\n",
     "            full_list = []\n",
-    "            try:\n",
-    "                full_list = list(gh.search_code(query=f\"org:{org} {l}\"))\n",
-    "            except Exception as e:\n",
-    "                if isinstance(e, http.client.RemoteDisconnected):\n",
-    "                    # This is \"fun\" to run into - doesn't happen very often\n",
-    "                    # so this recovery is an educated guess (the time I\n",
-    "                    # did see it, it was after a 'resumed' message from\n",
-    "                    # the clause below)\n",
-    "                    for i in range(3):\n",
-    "                        try_login()\n",
-    "                        if gh:\n",
-    "                            # re-established connection\n",
-    "                            print(f\"re-established connection on try {i+1}\")\n",
-    "                            break\n",
+    "            assume_time_out = True\n",
+    "            while assume_time_out:\n",
+    "                try:\n",
+    "                    # 2023-05-25 can't use regex in code search, so return context for further processing\n",
+    "                    full_list = list(gh.search_code(query=f\"org:{org} {l}\", text_match=True))\n",
+    "                    assume_time_out = False\n",
+    "                except Exception as e:\n",
+    "                    if isinstance(e, http.client.RemoteDisconnected):\n",
+    "                        # This is \"fun\" to run into - doesn't happen very often\n",
+    "                        # so this recovery is an educated guess (the time I\n",
+    "                        # did see it, it was after a 'resumed' message from\n",
+    "                        # the clause below)\n",
+    "                        for i in range(3):\n",
+    "                            try_login()\n",
+    "                            if gh:\n",
+    "                                # re-established connection\n",
+    "                                print(f\"re-established connection on try {i+1}\")\n",
+    "                                break\n",
+    "                            else:\n",
+    "                                time.sleep(60)\n",
     "                        else:\n",
-    "                            time.sleep(60)\n",
-    "                    else:\n",
-    "                        print(f\"failed to re-establish connection after {i+1} tries\")\n",
-    "                        raise SystemExit\n",
-    "                elif e.code not in [403, 422]:\n",
-    "                    print(f\"org={org} l={l} exception={str(e)}\")\n",
-    "                elif e.code in [403]:\n",
-    "                    print(\"\\n\\nOut of API calls, waiting a minute ..\", end='')\n",
-    "                    print_limits(verbose=False)\n",
-    "                    # we can hit this a lot, so just wait a minute\n",
-    "                    time.sleep(60)\n",
-    "                    print(\"... resumed.\")\n",
-    "                # we've reported on everything of interest, no need for else clause\n",
-    "#                 else:\n",
-    "#                     print(f\"Got code {e.code} for org {org}, search {l}\")\n",
-    "            # remove vulnerability repos (*-ghsa-*) and archived repos (archive status \n",
-    "            # requires refresh of repository object\n",
-    "            hit_list = [r for r in full_list if (not \"-ghsa-\" in r.repository.name)\n",
-    "                        and (not r.repository.refresh().archived)]\n",
-    "            num_search_results = len(hit_list)\n",
+    "                            print(f\"failed to re-establish connection after {i+1} tries\")\n",
+    "                            raise SystemExit\n",
+    "                    elif not hasattr(e, 'code'):\n",
+    "                        print(f\"org={org} l={l} exception={str(e)} (exception type {type(e)})\")\n",
+    "                    elif e.code not in [403, 422]:\n",
+    "                        print(f\"org={org} l={l} exception={str(e)}\")\n",
+    "                    elif e.code in [403]:\n",
+    "                        seconds_to_wait = 7\n",
+    "                        print(f\"\\nOut of Code Search API calls, waiting {seconds_to_wait} seconds ({org=}, {l=}) ..\", end='')\n",
+    "                        # we can hit this a lot, so just wait a minute - only 10 req/min\n",
+    "                        # per https://docs.github.com/en/enterprise-cloud@latest/rest/search?apiVersion=2022-11-28#rate-limit\n",
+    "                        time.sleep(seconds_to_wait)\n",
+    "                        print(\"... resumed.\")\n",
+    "                    # we've reported on everything of interest, no need for else clause\n",
+    "                    # else:\n",
+    "                    #     print(f\"Got code {e.code} for org {org}, search {l}\")\n",
+    "\n",
+    "            hit_list = prune_hits_to_ignore(full_list, l)\n",
     "\n",
     "            search_urls = []\n",
     "            for search_hit in hit_list:\n",
-    "                new_url = search_hit_to_url(search_hit.html_url, l)\n",
-    "#                 print(f\"before: {search_hit.html_url}\\n after: {new_url}\")\n",
+    "                new_url = search_hit_to_url(search_hit.html_url, l, debug=False)\n",
     "                if new_url:\n",
-    "                    search_urls.append(new_url)\n",
+    "                    # add the matching fragments as the 2nd item of a tupple\n",
+    "                    context = \"\\n----\\n\".join([m['fragment'] for m in search_hit.text_matches])\n",
+    "                    search_urls.append((*new_url, context.replace(\"\\n\", \"\\\\n\")))\n",
     "            num_raw_search_urls = len(search_urls)\n",
     "            search_urls = set(search_urls)\n",
     "            num_search_urls = len(search_urls)\n",
-    "#             print(f\"search results: {num_search_results}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
+    "#             print(f\"search results: {len(hit_list)}; after translation: {num_raw_search_urls}; after dedupe: {num_search_urls}\")\n",
     "            if num_search_urls > 0:\n",
     "                writer.writerow(['', f\"{num_search_urls} files with possible ACLs in {org} for {l}:\", \"\", \"\"])\n",
-    "                for url, repo, path, filename in sorted(search_urls):\n",
+    "                for url, repo, path, filename, context in sorted(search_urls):\n",
     "                    # output in csv format\n",
-    "                    writer.writerow([\"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\"])\n",
+    "                    writer.writerow([\"\", \"\", \"\", \"\", f\"{repo}/{path}/{filename}\", f\"{url}\", context])\n",
     "    # import pdb ; pdb.set_trace()\n",
     "    csvfile.seek(0)\n",
     "    hits = [l.strip() for l in csvfile.readlines()]\n",
@@ -2115,7 +2158,8 @@
    "execution_count": null,
    "id": "9112d45e",
    "metadata": {
-    "hidden": true
+    "hidden": true,
+    "scrolled": false
    },
    "outputs": [],
    "source": [
