parse.py
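"""Extract CVE IDs for functions in the DiverseVul dataset.

CVE IDs are first pulled out of commit messages with a regex; the remaining
project/commit pairs are then matched against the tenet CSV. The annotated
rows are cached as diverse_vul_ids*.csv next to this script, and the final
merge is written to diverse_vul_tenet.csv.
"""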
import json
import re
import ast
from pathlib import Path
from typing import List, Union

import pandas as pd

root_path = Path(__file__).parent
dataset_path = root_path.parent / 'diversevul_20230702.json'

# regex to match CVE IDs
cve_id_pattern = r'CVE-\d{4}-\d{4,7}'


def read_data() -> List[dict]:
    with dataset_path.open(mode='r') as f:
        return [json.loads(line) for line in f.readlines()]


def read_tenet_data() -> pd.DataFrame:
    df = pd.read_csv(str(root_path / 'tenet.csv'), sep=',')
    # drop unnecessary index column 'Unnamed: 0'
    return df.drop(columns=['Unnamed: 0'])


def to_csv(data: List[dict]):
    pd.DataFrame(data).to_csv(str(root_path / 'diverse_vul.csv'), index=False)


def find_cve_id(message: str) -> Union[str, None]:
    # re.search scans the whole message; re.match would only hit CVE IDs
    # at the very start of the commit message
    match = re.search(cve_id_pattern, message)
    if match:
        return match.group(0)
    return None
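
# Illustrative behaviour (hypothetical messages):
#   find_cve_id("Fix heap overflow reported in CVE-2019-11111") -> "CVE-2019-11111"
#   find_cve_id("Refactor build scripts")                       -> None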


def get_df_with_cve_ids() -> pd.DataFrame:
    output_path = root_path / 'diverse_vul_ids.csv'
    if output_path.exists():
        return pd.read_csv(str(output_path))
    # check all files that were created by splitting the dataframe
    chunks = []
    for f in root_path.iterdir():
        if f.is_file() and f.suffix == '.csv' and f.stem.startswith('diverse_vul_ids_'):
            print(f"Reading {f}")
            chunks.append(pd.read_csv(str(f)))
    if len(chunks) > 0:
        return pd.concat(chunks)
    diverse_vul_df = pd.DataFrame(read_data())
    print(f"Number of unique project-commit-id pairs in diversevul dataset: {len(diverse_vul_df.groupby(['project', 'commit_id']))}")
    print("Finding CVE IDs in diversevul dataset by matching messages")
    diverse_vul_df['cve_id'] = diverse_vul_df.message.apply(find_cve_id)
    # keep only the rows where a CVE ID was found
    diverse_vul_ids_df = diverse_vul_df[diverse_vul_df['cve_id'].notnull()]
    print("Finding remaining CVE IDs in diversevul dataset by matching project/commit with tenet dataset")
    tenet_df = read_tenet_data()
    tenet_df.rename(columns={'project': 'project_href'}, inplace=True)
    tenet_df['project'] = tenet_df['project_href'].apply(lambda x: x.split('/')[-1])
    # 'chain' is stored as the string form of a list, so parse it back into a real list
    tenet_df['chain'] = tenet_df['chain'].apply(lambda x: ast.literal_eval(x))
    rows_matches = []
    for project, rows in diverse_vul_df[diverse_vul_df['cve_id'].isnull()].groupby('project'):
        for sha, rows2 in rows.groupby('commit_id'):
            project_search = tenet_df[tenet_df['project'] == project]
            if len(project_search) == 0:
                continue
            # work on a copy so the assignments below do not hit a groupby view
            rows2 = rows2.copy()
            sha_search = project_search[project_search['last_fix_commit'] == sha]
            if len(sha_search) > 0:
                # check if the CVE ID is unique
                if len(sha_search['vuln_id'].unique()) > 1:
                    print(f"Multiple CVE-IDs for {sha}")
                    continue
                rows2['cve_id'] = sha_search['vuln_id'].values[0]
                rows_matches.append(rows2)
                continue
            chain_search = project_search[project_search['chain'].apply(lambda x: sha in x)]
            if len(chain_search) > 0:
                # check if the CVE ID is unique
                if len(chain_search['vuln_id'].unique()) > 1:
                    print(f"Multiple CVE-IDs for {sha}")
                    continue
                rows2['cve_id'] = chain_search['vuln_id'].values[0]
                rows_matches.append(rows2)
    res_df = pd.concat(rows_matches)
    res_df = pd.concat([res_df, diverse_vul_ids_df])
    print(f"Number of collected CVE IDs: {len(res_df['cve_id'].unique())}")
    size_mbs = res_df.memory_usage(deep=True).sum() / 1024 ** 2
    print(f"Size of the dataframe in MBs: {size_mbs}")
    if size_mbs > 50:
        # split the dataframe into chunks of at most ~49 MB
        chunks = int(size_mbs / 49) + 1
        print(f"Splitting dataframe in {chunks} chunks")
        res_df = res_df.reset_index(drop=True)
        chunk_size = int(len(res_df) / chunks)
        print(f"Chunk size: {chunk_size}")
        for i in range(chunks):
            print(f"Saving chunk {i}")
            # let the last chunk run to the end so integer division drops no rows
            end = (i + 1) * chunk_size if i < chunks - 1 else len(res_df)
            res_df[i * chunk_size:end].to_csv(str(root_path / f'diverse_vul_ids_{i}.csv'), index=False)
    else:
        res_df.to_csv(str(root_path / 'diverse_vul_ids.csv'), index=False)
    return res_df


def get_single_short_vulnerable_function_ids(diverse_vul_df: pd.DataFrame) -> pd.DataFrame:
    # get cve_ids with only one vulnerable function
    vul_df = diverse_vul_df[diverse_vul_df['target'] == 1]
    counts = vul_df['cve_id'].value_counts()
    ids = counts[counts == 1].index
    print(f"Number of CVE IDs with only one vulnerable function: {len(ids)}")
    single_vul_df = vul_df[vul_df['cve_id'].isin(ids)]
    # keep only the functions shorter than the average size of the
    # single vulnerable functions
    mean = single_vul_df['size'].mean()
    print(f"Average size of single vulnerable functions: {mean}")
    short_single_vul_df = single_vul_df[single_vul_df['size'] < mean]
    print(f"Number of CVE IDs with only one short vulnerable function: {len(short_single_vul_df)}")
    return short_single_vul_df


if __name__ == '__main__':
    diverse_vul_df = get_df_with_cve_ids()
    unique_ids = diverse_vul_df['cve_id'].unique()
    print(f"Number of unique CVE IDs: {len(unique_ids)}")
    short_single_vul_df = get_single_short_vulnerable_function_ids(diverse_vul_df)
    # merge with the tenet dataset
    tenet_df = read_tenet_data()
    merged = short_single_vul_df.merge(tenet_df, left_on='cve_id', right_on='vuln_id')
    print(f"Number of CVE IDs with only one vulnerable function and in tenet dataset: {len(merged)}")
    # include only single-patch CVE IDs
    merged = merged[merged['patch'] == 'SINGLE']
    print(f"Differing commit ids in diversevul/tenet: {len(merged[merged['commit_id'] != merged['last_fix_commit']])}")
    print(f"Number of CVE IDs with only one vulnerable function and in tenet dataset and single patch: {len(merged)}")
    merged.to_csv(str(root_path / 'diverse_vul_tenet.csv'), index=False)
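
# Expected layout (inferred from the paths above): diversevul_20230702.json one
# directory above this script, tenet.csv next to it. Run with:
#   python parse.py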