Merge pull request #328 from crocs-muni/issue/324-Switch-from-NVD-data-feeds-to-API

Switch from NVD json feeds to API
J08nY authored Apr 24, 2023
2 parents 9b7535a + d4825d1 commit 8b0600e
Showing 52 changed files with 3,914 additions and 2,402 deletions.
1 change: 1 addition & 0 deletions docs/index.md
@@ -32,6 +32,7 @@ Seccerts PyPi <https://pypi.org/project/sec-certs/>
installation.md
quickstart.md
configuration.md
user_guide.md
```

```{toctree}
23 changes: 23 additions & 0 deletions docs/user_guide.md
@@ -0,0 +1,23 @@
# User's guide

```{important}
This guide is a work in progress.
```

## NVD datasets

Our tool matches certificates to their possible CVEs using datasets downloaded from the [National Vulnerability Database (NVD)](https://nvd.nist.gov). If you're fully processing the `CCDataset` or `FIPSDataset` yourself, you must also obtain the NVD datasets.

Our tool can seamlessly download the required NVD datasets when needed. We support two download mechanisms:

1. Fetching datasets with the [NVD API](https://nvd.nist.gov/developers/start-here) (the preferred way; a sketch of such a request is shown after this list).
1. Fetching snapshots from seccerts.org.
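
For a sense of what the API route involves, here is a minimal, purely illustrative sketch of a single request against the public NVD CVE API 2.0. This is not our internal implementation; the endpoint, parameters, and `apiKey` header follow NVD's public documentation, and the key value is a placeholder:

```python
import requests

# Illustrative only -- the tool performs the paginated download for you.
NVD_CVE_API = "https://services.nvd.nist.gov/rest/json/cves/2.0"

response = requests.get(
    NVD_CVE_API,
    params={"startIndex": 0, "resultsPerPage": 2000},  # 2000 is the API's page-size limit
    headers={"apiKey": "<your-nvd-api-key>"},  # optional; without it, rate limits are much stricter
    timeout=60,
)
response.raise_for_status()
page = response.json()
print(f"{page['totalResults']} CVEs in total, {len(page['vulnerabilities'])} on this page")
```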

The following two keys control the behaviour:

```yaml
preferred_source_nvd_datasets: "api" # set to "sec-certs" to fetch them from seccerts.org
nvd_api_key: null # or the actual key value
```
If you aim to fetch the data directly from the NVD, we advise you to get an [NVD API key](https://nvd.nist.gov/developers/request-an-api-key) and set the `nvd_api_key` setting accordingly. The download from the NVD works even without an API key; it is just slower. No API key is needed when `preferred_source_nvd_datasets: "sec-certs"`.
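
To illustrate when the download is actually triggered, below is a rough sketch of a full processing run. The pipeline method names are assumptions (they follow the project's documented workflow as far as we recall) and may differ between versions, so treat this as an outline rather than a fixed API:

```python
from sec_certs.dataset import CCDataset

# Assumes the settings above are in effect: either "api" (ideally with nvd_api_key set)
# or "sec-certs" to pull the pre-built snapshots from seccerts.org.
# Method names below are assumptions and may differ between tool versions.
dset = CCDataset()                 # start with an empty dataset
dset.get_certs_from_web()          # scrape certificate metadata from the CC portal
dset.download_all_artifacts()      # download certification reports and security targets
dset.convert_all_pdfs()            # convert the PDFs to text
dset.analyze_certificates()        # extract features from the artifacts
dset.compute_cpe_heuristics()      # CPE matching -- fetches the CPE data if missing
dset.compute_related_cves()        # CVE matching -- fetches the CVE data if missing
```

With `preferred_source_nvd_datasets: "api"`, the last two steps fetch the data from the NVD API; with `"sec-certs"`, they download the snapshots from seccerts.org instead.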
20 changes: 11 additions & 9 deletions notebooks/cc/cpe_eval.ipynb
@@ -11,13 +11,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from sec_certs.dataset import CCDataset\n",
"import pandas as pd\n",
"import json"
"import json\n",
"import tempfile"
]
},
{
@@ -29,14 +30,14 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"Max CPE matches: 37\n"
"Downloading CC Dataset: 100%|██████████| 139M/139M [00:15<00:00, 9.61MB/s] \n"
]
}
],
@@ -54,9 +55,10 @@
"\n",
"# Now you may want to adjust the key `cpe_n_max_matches` config in sec_certs/config/settings.yml according to max_n_cpes\n",
"# This helps to avoid clutter in label studio interface\n",
"\n",
"dset.certs = {x.dgst: x for x in dset if x.dgst in eval_certs.index.tolist()}\n",
"dset.to_label_studio_json(\"./label_studio_input_data.json\")"
"with tempfile.TemporaryDirectory() as tmp_dir:\n",
" dset.root_dir = tmp_dir\n",
" dset.certs = {x.dgst: x for x in dset if x.dgst in eval_certs.index.tolist()}\n",
" dset.to_label_studio_json(\"./label_studio_input_data.json\", update_json=False)"
]
},
{
@@ -75,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [
{
106 changes: 20 additions & 86 deletions notebooks/cc/references.ipynb
@@ -19,7 +19,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -73,29 +73,21 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading CC Dataset: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 135M/135M [00:26<00:00, 5.34MB/s]\n"
]
}
],
"outputs": [],
"source": [
"# Initialize\n",
"dset = CCDataset.from_web_latest()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -126,25 +118,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCcAllDirectReferencing}{1500}\n",
"\\newcommand{\\numCcAllNotDirectReferencing}{3641}\n",
"\\newcommand{\\numCcWithIdDirectReferencing}{1500}\n",
"\\newcommand{\\numCcWithIdNotDirectReferencing}{3565}\n"
]
}
],
"outputs": [],
"source": [
"df[\"has_outgoing_direct_references\"] = df.directly_referencing.notnull()\n",
"df[\"has_incoming_direct_references\"] = df.directly_referenced_by.notnull()\n",
@@ -169,26 +150,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCActiveDirectReferencing}{545}\n",
"\\newcommand{\\numCCActiveDirectReferencingArchived}{169}\n"
]
}
],
"outputs": [],
"source": [
"print(f\"\\\\newcommand{{\\\\numCCActiveDirectReferencing}}{{{df_id_rich.loc[df_id_rich.status == 'active'].has_outgoing_direct_references.sum()}}}\")\n",
"\n",
"archived_cert_id_list = set(df_id_rich[df_id_rich.status == \"archived\"].cert_id)\n",
"def contains_archived_cert_reference(referencing):\n",
" if referencing is np.nan:\n",
" if pd.isnull(referencing):\n",
" return False\n",
" \n",
" return bool(archived_cert_id_list.intersection(referencing))\n",
"print(f\"\\\\newcommand{{\\\\numCCActiveDirectReferencingArchived}}{{{df_id_rich[df_id_rich.status == 'active'].directly_referencing.apply(contains_archived_cert_reference).sum()}}}\")"
]
@@ -202,7 +173,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -229,31 +200,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCDirectRefsSameCategory}{2133}\n",
"\\newcommand{\\numCCDirectRefsOtherCategory}{192}\n",
"\\newcommand{\\numCCDirectRefs}{2325}\n",
"\\newcommand{\\numCCDirectRefsFromSmartcards}{1896}\n"
]
}
],
"outputs": [],
"source": [
"cert_id_to_category_mapping = dict(zip(df.cert_id, df.category))\n",
"cert_id_to_category_mapping[np.NaN] = \"No references\"\n",
"\n",
"exploded = df_id_rich.loc[:, [\"category\", \"directly_referencing\"]].explode(\"directly_referencing\")\n",
"\n",
"exploded[\"ref_category\"] = exploded.directly_referencing.map(cert_id_to_category_mapping)\n",
"exploded[\"ref_category\"] = exploded.directly_referencing.map(lambda x: cert_id_to_category_mapping[x] if pd.notnull(x) else np.nan)\n",
"exploded = exploded.loc[exploded.ref_category.notnull()]\n",
"\n",
"exploded_with_refs = exploded.loc[exploded.ref_category != \"No references\"]\n",
@@ -286,7 +245,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -320,21 +279,13 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of certificates that reference some archived certificate: 933\n"
]
}
],
"outputs": [],
"source": [
"def references_archived_cert(references):\n",
" if pd.isnull(references):\n",
@@ -373,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -406,18 +357,9 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\\newcommand{\\numCCUSReferencing}{4}\n",
"\\newcommand{\\numCCUS}{959}\n"
]
}
],
"outputs": [],
"source": [
"print(f\"\\\\newcommand{{\\\\numCCUSReferencing}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US') & (df_id_rich.directly_referencing.notnull())])}}}\")\n",
"print(f\"\\\\newcommand{{\\\\numCCUS}}{{{len(df_id_rich.loc[(df_id_rich.scheme == 'US')])}}}\")"
@@ -434,7 +376,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": null,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -1194,7 +1136,6 @@
" for i, label in enumerate(labels):\n",
" label_widths = {}\n",
" label_widths[side] = df[df[side] == label][side + \"Weight\"].sum()\n",
" print(\"a\")\n",
" if i == 0:\n",
" label_widths[\"bottom\"] = 0\n",
" label_widths[\"top\"] = label_widths[side]\n",
@@ -1208,13 +1149,6 @@
" LOGGER.debug(\"%s position of '%s' : %s\", side, label, label_widths)\n",
" return widths, topEdge\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {