-
-
Notifications
You must be signed in to change notification settings - Fork 12
/
sensitivity.R
300 lines (231 loc) · 8.04 KB
/
sensitivity.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
##########################################################################
# Simple script to load survival data and show
# survival functionality
#
# Authors: Soumya Banerjee, Tom Bishop, Demetris Avraam, Paul Burton
##########################################################################
####################
# Load libraries
####################
library(survival)
library(RANN)
library(fANCOVA)
##############
# Load data
##############
file <- read.csv(file = "expand_no_missing_study1.csv", header = TRUE, stringsAsFactors = FALSE)
# file <- read.csv(file = "https://raw.githubusercontent.com/neelsoumya/survival_curve_privacy_prototype/main/expand_no_missing_study1.csv", header = TRUE, stringsAsFactors = FALSE)
lung
bladder
veteran
cgd
colon
diabetic
gbsg
heart
jasa
mgus
myeloid
nafld1
##################
# Plotting
##################
survObj <- Surv(time=lung$time, event=lung$status==2, type='right')
survObj <- with(veteran,Surv(time, status))
# subsample
# TODO: parametrize 50 and vary that and figuire out a
survObj <- survObj[1:10]
no_noise <- survfit(survObj ~ 1)
# make a copy
with_noise <- no_noise
# original survival curve
plot(no_noise, main= "raw data")
#noise = 0.0003 # 0.03 0.26
noise = 0.0001
##########################################
# Approach 1: probabilistic anonymization
# add noise before plotting
#
# add noise to:
# surv (i.e. proportion surviving)
# time (times at which events occur, ie when the proportion changes)
# this is for the y axis
# and for time on x axis
##########################################
sd = var(with_noise$time)^0.5
for ( i_temp_counter_inner in c(2:length(with_noise$surv)) )
{
delta_noise_time <- stats::rnorm(n = 1, mean = 0, sd = noise*sd)
# with_noise$time[i_temp_counter_inner] <- with_noise$time[i_temp_counter_inner] + delta_noise
# previous value time
prev_value_temp_time <- with_noise$time[i_temp_counter_inner - 1]
# current value temp time
curr_value_temp_time <- with_noise$time[i_temp_counter_inner]
# proposed value of time with noise subtracted
value_noise_time_proposed = curr_value_temp_time + (curr_value_temp_time*delta_noise_time)
if (prev_value_temp_time <= value_noise_time_proposed)
{
# if previous value time is less then proposed value then ok
# since time is supposed to be increasing
# set this value to current time
with_noise$time[i_temp_counter_inner] = value_noise_time_proposed
}
else
{
# set to previous time value
with_noise$time[i_temp_counter_inner] = prev_value_temp_time
}
}
# modified survival curve
plot(with_noise, main = "probabilistic anonymisation", xlab = 'Time', ylab = 'Survival fraction')
##################################################
# Approach 3: deterministic anonymization take 2
#
##################################################
##################
# Anonymise survival times using the deterministic anonymisation
##################
knn <- 20
with_knn <- no_noise
# Step 1: Standardise the variable
time.standardised <- (with_knn$time-mean(with_knn$time))/stats::sd(with_knn$time)
# Step 2: Find the k-1 nearest neighbours of each data point
nearest <- RANN::nn2(time.standardised, k = knn)
# Step 3: Calculate the centroid of each n nearest data points
time.centroid <- matrix()
for (i in 1:length(with_knn$time))
{
time.centroid[i] <- mean(time.standardised[nearest$nn.idx[i,1:knn]])
}
# Step 4: Calculate the scaling factor
time.scalingFactor <- stats::sd(time.standardised)/stats::sd(time.centroid)
# Step 5: Apply the scaling factor to the centroids
time.masked <- time.centroid * time.scalingFactor
# Step 6: Shift the centroids back to the actual position and scale of the original data
SURVTIME_anon <- (time.masked * stats::sd(with_knn$time)) + mean(with_knn$time)
# modofy time in survfit object (instead of original time)
with_knn$time <- SURVTIME_anon
# TODO: commenting out these to have no survfit call
#with_noise.anon <- survfit(Surv(SURVTIME_anon, EVENT) ~ 1)
#lines(with_noise.anon, col='red', add=TRUE)
# with_noise_determ <- with_noise.anon
#plot(with_noise_determ)
# if time needs adjustment, if min is less than 0 for example
# then adjust and translate everything
if (min(with_knn$time) < 0)
{
with_knn$time <- with_knn$time - min(with_knn$time)
}
plot(with_knn, main = "deterministic anonymisation", xlab = 'Time', ylab = 'Survival fraction')
##################
# LOESS smoothing
##################
# TODO:
span = a /number of points
loess10 = loess(no_noise$surv ~ no_noise$time, span=0.10)
smoothed10 <- predict(loess10)
loess25 = loess(no_noise$surv ~ no_noise$time, span=0.25)
smoothed25 <- predict(loess25)
plot(no_noise)
plot(no_noise$surv, x=no_noise$time, type="l", main="Loess Smoothing and Prediction", xlab="time", ylab="surv")
lines(smoothed10, x=no_noise$time, col="red")
lines(smoothed25, x=no_noise$time, col="green")
################################
# automated LOESS smoothing
################################
# on data fit model
# no_noise is survfit object
loess_as_fit <- fANCOVA::loess.as(no_noise$time, no_noise$surv, plot = TRUE)
# predict
smoothed_loess_as = stats::predict(loess_as_fit)
# assign to survfit object and modify it
# create temp variable
no_noise_temp_loess_as <- no_noise
no_noise_temp_loess_as$surv <- smoothed_loess_as
# plot
plot(no_noise_temp_loess_as$surv, x = no_noise_temp_loess_as$time, main = "Ablation study automatic LOESS", xlab = 'Time', ylab = 'Fraction survived')
##################################################
# differential privacy
# bonomi et al
##################################################
N = no_noise$n
T = 1000
epsilon = 2
partition = ceiling(log(min(N,T),2)/epsilon)
partition = 3
acc = 0
label = 1
no_noise$label = c(1:length(no_noise$time))
# function to define partitioning groups
for (i in 1:length(no_noise$time)){
# need to add partition noise to take half of half the budget
acc = acc + no_noise$n.event[i] + no_noise$n.censor[i]
if (acc > partition){
label = label + 1
acc = 0
}
no_noise$label[i] = label
}
my_df = data.frame(groups = no_noise$label, time = no_noise$time, n.event = no_noise$n.event,
n.censor = no_noise$n.censor, n.risk = no_noise$n.risk)
library(dplyr)
# section to sum up events within a partition - no noise yet
my_df2 <- my_df %>%
group_by(groups) %>%
summarise(
across(.cols = time, .fns = max),
across(.cols = c(n.event,n.censor), .fns = sum)
) %>% arrange(time)
my_df2$cum_sum_censor = cumsum(my_df2$n.censor)
my_df2$cum_sum_event = cumsum(my_df2$n.event)
# add half of half the budget of noise to these
# function to generate binary representation of the partition number
number2binary = function(number, noBits) {
binary_vector = rev(as.numeric(intToBits(number)))
if(missing(noBits)) {
return(binary_vector)
} else {
binary_vector[-(1:(length(binary_vector) - noBits))]
}
}
# recursive function to create a binary tree like structure
binarise2 <- function(N, the_list)
{
tree_level = c()
for (i in seq(2,length(N),2)){
part_sum = N[i]+N[i-1]
# trick for end of odd length N, keep it until the end
if ((length(N) - i)==1) {
tree_level = c(tree_level,part_sum,N[i+1])
}
else {tree_level = c(tree_level,part_sum)}
}
the_list = append(the_list, list(tree_level))
if (length(tree_level) == 1)
return(the_list)
else
binarise2(tree_level, the_list)
}
#then go through and apply remaining budget as noise to the intermediate counts
test = c(1,2,0,3,1,1,0,1,3)
test= my_df2$n.event
#put the first tree level in a list and then pass this to the tree generating function
my_list = list(test)
final_list = binarise2(test, my_list)
final_list = rev(final_list)
#this for loop uses the binary representation of the partition index to locate the required
# nodes in the binary tree structure and add them together
depth = length(final_list)
duration = length(test)
results = c()
for (i in 1:duration){
num = number2binary(i, depth)
total = 0
for (j in depth:1){
if (num[j] ==1) {
loc = floor(i/2^(depth-j))
total = total + final_list[[j]][loc]
}
}
results = c(results, total)
}