from typing import List, Optional
from pathlib import Path

import pandas as pd

from sdnist.report.dataset import Dataset
import sdnist.load as load
import sdnist.utils as utils


def compute_marginal_densities(data: pd.DataFrame, marginals: List[str]) -> pd.Series:
    """Return the density of each cell of the given marginal: the group
    counts normalized by the total number of records in `data`."""
    counts = data.groupby(marginals).size()
    return counts / data.shape[0]
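
# Illustrative usage (hypothetical column names, not shipped with this module):
#
#   df = pd.DataFrame({'SEX': [1, 1, 2, 2], 'RAC1P': [1, 2, 1, 1]})
#   compute_marginal_densities(df, ['SEX', 'RAC1P'])
#
# returns a Series indexed by the (SEX, RAC1P) cells with densities
# (1, 1) -> 0.25, (1, 2) -> 0.25, (2, 1) -> 0.5; the values always sum to 1.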


class KMarginal:
    """
    Let [t1, t2, t3, t4] be the target densities of a marginal's cells,
    ti = count_i / N, and [s1, s2, s3, s4] the synthetic densities,
    si = count_i / N_syn. Since sum(ti) = 1 and sum(si) = 1,

        |t1 - s1| + |t2 - s2| + |t3 - s3| + |t4 - s4|  lies in [0, 2].

    Suppose PUMA A covers cells (1, 2) and PUMA B covers cells (3, 4).
    The total population of PUMA A is (t1 + t2) * N, and of PUMA B is
    (t3 + t4) * N. Scaling each PUMA's density differences by
    N / pop(PUMA) gives:

        scaled PUMA A score = (|t1 - s1| + |t2 - s2|) * N / ((t1 + t2) * N)
                            = (|t1 - s1| + |t2 - s2|) / (t1 + t2)
                            = |t1 - s1| / (t1 + t2) + |t2 - s2| / (t1 + t2)

        scaled PUMA B score = (|t3 - s3| + |t4 - s4|) * N / ((t3 + t4) * N)
                            = (|t3 - s3| + |t4 - s4|) / (t3 + t4)
                            = |t3 - s3| / (t3 + t4) + |t4 - s4| / (t3 + t4)

    This is a problem if s4 is giant while (t3 + t4) is tiny, so replace
    the numerator sum (|t3 - s3| + |t4 - s4|) with:

        min((t3 + t4), |t3 - s3| + |t4 - s4|)

    Then change the conversion to the 0-1000 score to:

        (1 - avg-density-differences(PUMA A)) * 1000

    instead of what it used to be:

        (2 - avg-density-differences(PUMA A)) * 1000

    because with the cap, a PUMA's summed density difference can be at
    most the size of its whole target population, so the scaled score
    maxes out at 1. (Note: we should only do this for PUMA, because the
    target population size of every PUMA is reasonable.)
    """

    NAME = 'K-Marginal'

    def __init__(self,
                 target_data: pd.DataFrame,
                 deidentified_data: pd.DataFrame,
                 group_features: Optional[List[str]] = None):
        self.td = target_data
        self.deid = deidentified_data
        self.group_features = group_features or []
        self.features = self.td.columns.tolist()

        # Build all 2-marginals over the features, excluding PUMA and INDP.
        marg_cols = sorted(set(self.features).difference(['PUMA', 'INDP']))
        if len(marg_cols) == 1:
            # Degenerate case: only one usable feature, so pair it with itself.
            self.marginals = [(marg_cols[0], marg_cols[0])]
        else:
            self.marginals = [(f1, f2)
                              for i, f1 in enumerate(marg_cols)
                              for j, f2 in enumerate(marg_cols)
                              if i < j]
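
    # For example (hypothetical feature list), marg_cols = ['AGEP', 'RAC1P', 'SEX']
    # yields self.marginals = [('AGEP', 'RAC1P'), ('AGEP', 'SEX'), ('RAC1P', 'SEX')]:
    # every unordered pair of non-excluded features, each appearing once.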

    def marginal_pairs(self):
        for pair in self.marginals:
            yield list(pair)

    def compute_score(self):
        if self.group_features:
            return self._compute_score_grouped()
        else:
            return self._compute_score()

    def marginal_densities(self, marginal: List[str]):
        # target data marginal densities
        t_den = compute_marginal_densities(self.td, marginal)
        # deidentified data marginal densities
        s_den = compute_marginal_densities(self.deid, marginal)
        # absolute differences between target and deidentified densities
        abs_den_diff = t_den.subtract(s_den, fill_value=0).abs()
        return t_den, s_den, abs_den_diff

    def _compute_score(self):
        # running total of density absolute differences over all marginals
        tdds = 0
        # for each 2-marginal, sum the absolute density differences
        for marg in self.marginal_pairs():
            # t_den: target data marginal densities
            # s_den: deidentified data marginal densities
            # abs_den_diff: absolute differences between the two
            t_den, s_den, abs_den_diff = self.marginal_densities(marg)
            tdds += abs_den_diff.sum()
        # average the density differences over all marginals
        mean_tdds = tdds / len(self.marginals)
        # convert to the NIST 0-1000 score range
        self.score = (2 - mean_tdds) * 500
        return self.score

    def _compute_score_grouped(self):
        # running total of density absolute differences over all marginals
        tdds = 0
        gf = self.group_features
        # target record count for each value of the group features (e.g. per PUMA)
        group_N = self.td.groupby(gf).size()
        # per-group running totals, initialized to zero
        group_tdds = group_N * 0
        # For each 2-marginal, sum the absolute density differences both
        # overall and per value of the group features.
        for marg in self.marginal_pairs():
            marg = gf + marg
            # t_den: target data marginal densities
            # s_den: deidentified data marginal densities
            # abs_den_diff: absolute differences between the two
            t_den, s_den, abs_den_diff = self.marginal_densities(marg)
            # total target density within each group
            group_t_den_sum = t_den.groupby(gf).sum()
            # sum of density differences within each group
            group_den_sum = abs_den_diff.groupby(gf).sum()
            # cap each group's difference sum at its target density sum
            group_den_sum = group_t_den_sum.where(group_t_den_sum <= group_den_sum).fillna(group_den_sum)
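            # The where/fillna idiom above is an elementwise minimum: it keeps
            # group_t_den_sum wherever it is the smaller value and falls back
            # to group_den_sum elsewhere, i.e. for each group g,
            #   result[g] = min(group_t_den_sum[g], group_den_sum[g])
            # which is the min((t3 + t4), ...) cap from the class docstring.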
            # rescale the per-group sums by group population share
            group_den_scaled = (group_den_sum * len(self.td)) / group_N
            # add this marginal's scaled differences to the per-group totals
            group_tdds = group_tdds + group_den_scaled
            # and its unscaled differences to the overall total
            tdds += abs_den_diff.sum()

        # average the density differences over all marginals, overall and per group
        mean_tdds = tdds / len(self.marginals)
        mean_group_tdds = group_tdds / len(self.marginals)
        # convert to the NIST 0-1000 score range; per-group scores use
        # (1 - x) * 1000 because the capped per-group sums max out at 1
        self.scores = (1 - mean_group_tdds) * 1000
        self.score = (2 - mean_tdds) * 500
        return self.score


if __name__ == "__main__":
    THIS_DIR = Path(__file__).parent
    SCH_P = Path(THIS_DIR, '../../diverse_community_excerpts_data/national/na2019.csv')
    S_P = Path(THIS_DIR,
               '../../toy_synthetic_data/syn/teams/LostInTheNoise/national/MWEM_PGM-GirishKumar.csv')

    log = utils.SimpleLogger()
    dataset_name = load.TestDatasetName.national2019
    d = Dataset(S_P, log, dataset_name)
    km = KMarginal(d.d_target_data, d.d_synthetic_data, ['PUMA'])
    print(km.compute_score())
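
# Minimal self-contained sketch (toy frames with made-up columns, no sdnist
# data files needed), for when the CSV paths above are unavailable:
#
#   target = pd.DataFrame({'PUMA': [0, 0, 1, 1], 'A': [1, 2, 1, 2], 'B': [3, 3, 4, 4]})
#   deid = pd.DataFrame({'PUMA': [0, 0, 1, 1], 'A': [1, 1, 2, 2], 'B': [3, 4, 3, 4]})
#   km = KMarginal(target, deid, ['PUMA'])
#   km.compute_score()  # overall 0-1000 score, also stored in km.score
#   km.scores           # per-PUMA 0-1000 scores from the grouped path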