Merge pull request #328 from crocs-muni/issue/324-Switch-from-NVD-data-feeds-to-API

Switch from NVD json feeds to API
J08nY authored Apr 24, 2023
2 parents 9b7535a + d4825d1 commit 8b0600e
Showing 52 changed files with 3,914 additions and 2,402 deletions.
1 change: 1 addition & 0 deletions docs/index.md
@@ -32,6 +32,7 @@ Seccerts PyPi <https://pypi.org/project/sec-certs/>
installation.md
quickstart.md
configuration.md
user_guide.md
```

```{toctree}
23 changes: 23 additions & 0 deletions docs/user_guide.md
@@ -0,0 +1,23 @@
# User's guide

```{important}
This guide is a work in progress.
```

## NVD datasets

Our tool matches certificates to their possible CVEs using datasets downloaded from the [National Vulnerability Database (NVD)](https://nvd.nist.gov). If you're fully processing the `CCDataset` or `FIPSDataset` yourself, you must also obtain the NVD datasets.

Our tool can seamlessly download the required NVD datasets when needed. We support two download mechanisms:

1. Fetching datasets with the [NVD API](https://nvd.nist.gov/developers/start-here) (the preferred way; a sketch of such a request is shown after this list).
1. Fetching snapshots from seccerts.org.
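
For a sense of what the API route involves, here is a minimal, purely illustrative sketch of a single request against the public NVD CVE API 2.0. This is not our internal implementation; the endpoint, parameters, and `apiKey` header follow NVD's public documentation, and the key value is a placeholder:

```python
import requests

# Illustrative only -- the tool performs the paginated download for you.
NVD_CVE_API = "https://services.nvd.nist.gov/rest/json/cves/2.0"

response = requests.get(
    NVD_CVE_API,
    params={"startIndex": 0, "resultsPerPage": 2000},  # 2000 is the API's page-size limit
    headers={"apiKey": "<your-nvd-api-key>"},  # optional; without it, rate limits are much stricter
    timeout=60,
)
response.raise_for_status()
page = response.json()
print(f"{page['totalResults']} CVEs in total, {len(page['vulnerabilities'])} on this page")
```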

The following two keys control the behaviour:

```yaml
preferred_source_nvd_datasets: "api" # set to "sec-certs" to fetch them from seccerts.org
nvd_api_key: null # or the actual key value
```
If you aim to fetch the data directly from the NVD, we advise you to get an [NVD API key](https://nvd.nist.gov/developers/request-an-api-key) and set the `nvd_api_key` setting accordingly. The download from the NVD works even without an API key; it is just slower. No API key is needed when `preferred_source_nvd_datasets: "sec-certs"`.
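
To illustrate when the download is actually triggered, below is a rough sketch of a full processing run. The pipeline method names are assumptions (they follow the project's documented workflow as far as we recall) and may differ between versions, so treat this as an outline rather than a fixed API:

```python
from sec_certs.dataset import CCDataset

# Assumes the settings above are in effect: either "api" (ideally with nvd_api_key set)
# or "sec-certs" to pull the pre-built snapshots from seccerts.org.
# Method names below are assumptions and may differ between tool versions.
dset = CCDataset()                 # start with an empty dataset
dset.get_certs_from_web()          # scrape certificate metadata from the CC portal
dset.download_all_artifacts()      # download certification reports and security targets
dset.convert_all_pdfs()            # convert the PDFs to text
dset.analyze_certificates()        # extract features from the artifacts
dset.compute_cpe_heuristics()      # CPE matching -- fetches the CPE data if missing
dset.compute_related_cves()        # CVE matching -- fetches the CVE data if missing
```

With `preferred_source_nvd_datasets: "api"`, the last two steps fetch the data from the NVD API; with `"sec-certs"`, they download the snapshots from seccerts.org instead.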
20 changes: 11 additions & 9 deletions notebooks/cc/cpe_eval.ipynb
@@ -11,13 +11,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sec_certs.dataset import CCDataset\n",
"import pandas as pd\n",
"import json"
"import json\n",
"import tempfile"
]
},
{
@@ -29,14 +30,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"Max CPE matches: 37\n"
"Downloading CC Dataset: 100%|██████████| 139M/139M [00:15<00:00, 9.61MB/s] \n"
]
}
],
@@ -54,9 +55,10 @@
"\n",
"# Now you may want to adjust the key `cpe_n_max_matches` config in sec_certs/config/settings.yml according to max_n_cpes\n",
"# This helps to avoid clutter in label studio interface\n",
"\n",
"dset.certs = {x.dgst: x for x in dset if x.dgst in eval_certs.index.tolist()}\n",
"dset.to_label_studio_json(\"./label_studio_input_data.json\")"
"with tempfile.TemporaryDirectory() as tmp_dir:\n",
" dset.root_dir = tmp_dir\n",
" dset.certs = {x.dgst: x for x in dset if x.dgst in eval_certs.index.tolist()}\n",
" dset.to_label_studio_json(\"./label_studio_input_data.json\", update_json=False)"
]
},
{
@@ -75,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
106 changes: 20 additions & 86 deletions notebooks/cc/references.ipynb
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -73,29 +73,21 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading CC Dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 135M/135M [00:26<00:00, 5.34MB/s]\n"
]
}
],
"outputs": [],
"source": [
"# Initialize\n",
"dset = CCDataset.from_web_latest()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -126,25 +118,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCcAllDirectReferencing}{1500}\n",
"\\newcommand{\\numCcAllNotDirectReferencing}{3641}\n",
"\\newcommand{\\numCcWithIdDirectReferencing}{1500}\n",
"\\newcommand{\\numCcWithIdNotDirectReferencing}{3565}\n"
]
}
],
"outputs": [],
"source": [
"df[\"has_outgoing_direct_references\"] = df.directly_referencing.notnull()\n",
"df[\"has_incoming_direct_references\"] = df.directly_referenced_by.notnull()\n",
@@ -169,26 +150,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCActiveDirectReferencing}{545}\n",
"\\newcommand{\\numCCActiveDirectReferencingArchived}{169}\n"
]
}
],
"outputs": [],
"source": [
"print(f\"\\\\newcommand{{\\\\numCCActiveDirectReferencing}}{{{df_id_rich.loc[df_id_rich.status == 'active'].has_outgoing_direct_references.sum()}}}\")\n",
"\n",
"archived_cert_id_list = set(df_id_rich[df_id_rich.status == \"archived\"].cert_id)\n",
"def contains_archived_cert_reference(referencing):\n",
" if referencing is np.nan:\n",
" if pd.isnull(referencing):\n",
" return False\n",
" \n",
" return bool(archived_cert_id_list.intersection(referencing))\n",
"print(f\"\\\\newcommand{{\\\\numCCActiveDirectReferencingArchived}}{{{df_id_rich[df_id_rich.status == 'active'].directly_referencing.apply(contains_archived_cert_reference).sum()}}}\")"
]
@@ -202,7 +173,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -229,31 +200,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCDirectRefsSameCategory}{2133}\n",
"\\newcommand{\\numCCDirectRefsOtherCategory}{192}\n",
"\\newcommand{\\numCCDirectRefs}{2325}\n",
"\\newcommand{\\numCCDirectRefsFromSmartcards}{1896}\n"
]
}
],
"outputs": [],
"source": [
"cert_id_to_category_mapping = dict(zip(df.cert_id, df.category))\n",
"cert_id_to_category_mapping[np.NaN] = \"No references\"\n",
"\n",
"exploded = df_id_rich.loc[:, [\"category\", \"directly_referencing\"]].explode(\"directly_referencing\")\n",
"\n",
"exploded[\"ref_category\"] = exploded.directly_referencing.map(cert_id_to_category_mapping)\n",
"exploded[\"ref_category\"] = exploded.directly_referencing.map(lambda x: cert_id_to_category_mapping[x] if pd.notnull(x) else np.nan)\n",
"exploded = exploded.loc[exploded.ref_category.notnull()]\n",
"\n",
"exploded_with_refs = exploded.loc[exploded.ref_category != \"No references\"]\n",
@@ -286,7 +245,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -320,21 +279,13 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of certificates that reference some archived certificate: 933\n"
]
}
],
"outputs": [],
"source": [
"def references_archived_cert(references):\n",
" if pd.isnull(references):\n",
@@ -373,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -406,18 +357,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCUSReferencing}{4}\n",
"\\newcommand{\\numCCUS}{959}\n"
]
}
],
"outputs": [],
"source": [
"print(f\"\\\\newcommand{{\\\\numCCUSReferencing}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US') & (df_id_rich.directly_referencing.notnull())])}}}\")\n",
"print(f\"\\\\newcommand{{\\\\numCCUS}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US')])}}}\")"
@@ -434,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -1194,7 +1136,6 @@
" for i, label in enumerate(labels):\n",
" label_widths = {}\n",
" label_widths[side] = df[df[side] == label][side + \"Weight\"].sum()\n",
" print(\"a\")\n",
" if i == 0:\n",
" label_widths[\"bottom\"] = 0\n",
" label_widths[\"top\"] = label_widths[side]\n",
@@ -1208,13 +1149,6 @@
" LOGGER.debug(\"%s position of '%s' : %s\", side, label, label_widths)\n",
" return widths, topEdge\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {