# ***************************************************************************
# Author: Yash Gondhalekar   Last updated: March, 2023
# Description: Contains utility functions used in other R scripts.
# Notes:
#   Limitation of the function getLightCurve:
#     Both the duration (in hours) and the period (in days) must be
#     either an integer or a half-integer, because the code requires
#     two times the duration and the period to be an integer.
#   IMPORTANT: The arguments ar, ma, and order are obsolete. The
#     getLightCurve function, to which these were intended to be passed,
#     has these values hardcoded inside it, so passing different values
#     has no effect. Future versions will address this caveat.
# ***************************************************************************
library(forecast)
library(reticulate)
library(microbenchmark)
library(kernlab)
library(moments)
library(caTools)
library(cobs) # Provides cobs(), used for periodogram detrending below.
source_python("python_utils.py") # Provides rand_parts(), used in freqdivideFreqGrid.
# This function simulates a light curve given some parameters of the light curve.
getLightCurve <- function(
period, # Period (in days) of the simulated light curve; a single value, e.g. 1, 3, 5, 7, or 9.
depth, # Transit depth (in % of the star's presumed constant level, which is 1); a single value, e.g. 0.01, 0.05, 0.1, 0.15, or 0.2.
duration, # Transit duration (as a fraction of the period) of the simulated light curve, e.g. 1/24.
res=2, # Resolution of the light curve. If res = 1, the light curve cannot resolve anything happening on scales shorter than an hour, so a 1 hr and a 0.75 hr transit duration would be treated the same. Set a higher resolution via this parameter to avoid that.
# Note: The definition of transit duration used in this code is the number of points at the in-transit level, whereas in astronomy it is the number of points before the flux returns to the constant value, including the points in the descent from 1 --> inTransitValue.
noiseType=0, # 0 --> no noise, 1 --> Gaussian noise, 2 --> autoregressive noise. For autoregressive noise, an ARMA(3, 3) model is used (see below); to change it, edit the source code.
ntransits=10, # No. of transits in the whole time series. Note: it must be >= 3, otherwise the BLS/TCF matched-filter periodograms might not work.
### VIMP note: Never pass a period like 1, because the duration would then be a fraction of 1 (a float), and since the code rounds the result, the output might not be correct.
### To prevent such issues, if your `duration` is, say, 1/24 times the period (e.g. a 1 hr duration for a 1 day period), pass period = 24 and duration = 1/24 instead of period = 1 and duration = 1/24.
gaussStd=1e-4,
# ar=0.2,
# ma=0.2,
# order=c(1, 0, 1), # ARMA, no differencing.
checkConditions=TRUE, # Whether to perform small unit tests within the code to ensure the output is as expected. Recommended: TRUE for almost all cases; this option exists so that `uniroot` does not throw an error when finding the limiting depth - see the functions below.
seedValue=1
) {
# Ensure the user specifies legitimate values.
if (checkConditions) {
stopifnot(exprs = {
period > 0
depth >= 0
depth <= 100
duration >= 0
ntransits >= 0
})
}
# Create a simulated planet transit time series based on period, depth, and transit duration.
inTransitValue = 1 - (depth / 100) * 1
inTransitTime = duration # inTransitTime is the actual absolute in-transit time (in hours).
constTime = (period * 24 - 2 / res - inTransitTime)
if (checkConditions) {
stopifnot(exprs = {
inTransitValue < 1
inTransitValue >= 0
inTransitTime > 0 # NOTE: 0 transit duration is not allowed by the code as of now.
constTime > 0
})
}
# The constant (out-of-transit) level is 1 and the transits are scaled with respect to 1, e.g. 1 --> 0.998 --> 1 --> 0.998 ...
# One caveat: if constTime * res is not an integer, this light curve simulation might fail.
y <- rep(1.0, each = as.integer(constTime*res)) # Start with some constant level.
for (n in 1:ntransits) {
y <- append(y, seq(from = 1, to = inTransitValue, length.out = res)) # Decrement from 1 to depth.
endt <- as.integer(inTransitTime*res) - 1
if (endt != 0) { # Special case when 0 < duration < 1 (assuming res=2).
for (j in 1:endt) {
y <- append(y, inTransitValue)
}
}
y <- append(y, seq(from = inTransitValue, to = 1, length.out = res)) # Increment from depth to 1.
constt <- as.integer(constTime*res) + 1
for (j in 1:constt) {
y <- append(y, 1)
}
}
y <- append(y, 1)
# Generate the time epochs.
tIncrement <- 1 / res
t <- seq(from = 0, by = tIncrement, length.out = length(y))
# Add noise to the simulated transits.
if (noiseType == 1) {
set.seed(seedValue)
noise <- rnorm(length(y), mean = 0, sd = gaussStd)
y <- y + noise # 0.01% Gaussian noise.
noiseStd <- sd(noise)
noiseIQR <- IQR(noise)
}
else if (noiseType == 2) {
set.seed(seedValue)
# Note that the autoregressive noise is scaled by `gaussStd` so that the range of values in the Gaussian and autoregressive cases looks similar.
# That scaling could be removed, but keeping it makes the Gaussian and autoregressive cases easier to compare: the only remaining difference is that Gaussian noise is uncorrelated while autoregressive noise is correlated, and we need not worry about the two being on different scales.
autoRegNoise <- arima.sim(list(order = c(3, 0, 3), ar = c(0.2, 0.3, 0.2), ma = c(0.2, 0.2, 0.3)), n = length(y)) * gaussStd # Only AR and MA components, no differencing, so this is ARMA rather than ARIMA. Note: keep the ar and ma coefficients < 0.5.
# Note: Use the model above (or modifications of it) to generate stronger autoregressive noise.
# autoRegNoise <- arima.sim(list(order=c(1, 0, 1), ar=0.2, ma=0.2), n = length(y)) * gaussStd
y <- y + autoRegNoise
noiseStd <- sd(autoRegNoise)
noiseIQR <- IQR(autoRegNoise)
}
else if (noiseType == 0) {
noiseStd <- 0
noiseIQR <- 0
}
if (checkConditions) {
stopifnot(exprs={
length(y) == length(t)
})
}
# Print information about the light curve created.
print(sprintf('Length of time series = %d', length(y)))
print("==== Simulated light curve parameters ====")
print(noquote(paste("Period (hours) = ", sprintf("%.3f", period * 24))))
print(noquote(paste("Depth = ", sprintf("%.6f", depth))))
print(noquote(paste("Transit duration (hours) = ", sprintf("%.3f", inTransitTime))))
print("==========================================")
return (list(y, t, noiseStd, noiseIQR))
}
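# Illustrative usage sketch (the parameter values below are arbitrary assumptions, not values from this project):
# simulate a 2-day period, 0.1%-depth, 2-hour-duration light curve with Gaussian noise, then unpack the returned list.
# lcSim <- getLightCurve(period = 2, depth = 0.1, duration = 2, noiseType = 1, ntransits = 10, res = 2, gaussStd = 1e-4, seedValue = 1)
# ySim <- unlist(lcSim[1]) # flux values
# tSim <- unlist(lcSim[2]) # time epochs (hours)
# noiseSdSim <- unlist(lcSim[3]) # standard deviation of the injected noise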
# Function to get ARIMA residuals ready for use with TCF.
getResidForTCF <- function(
y # Time series (must not be differenced because it is done internally).
) {
max.p = 10
max.q = 10
max.d = 0
ARIMA.fit = auto.arima(diff(y), stepwise=FALSE, approximation=FALSE, seasonal=FALSE, max.p=max.p, max.q=max.q, max.d=max.d) # leave d as 0.
ARIMA.resid = residuals(ARIMA.fit)
return (ARIMA.resid)
}
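# Illustrative usage sketch (assumes `ySim` from the getLightCurve example above):
# residTCFExample <- getResidForTCF(ySim) # ARIMA residuals of the differenced series, ready to be passed to tcf().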
# Function to get ARMA residuals.
getARMAresid <- function(
y
) {
max.p = 10
max.q = 10
max.d = 0
ARMA.fit = auto.arima(y, stepwise=FALSE, approximation=FALSE, seasonal=FALSE, max.p=max.p, max.q=max.q, max.d=max.d, d=0) #leave d as 0.
ARMA.resid = residuals(ARMA.fit)
return (ARMA.resid)
}
# Function to get Gaussian Processes Regression residuals.
getGPRResid <- function(
t, y
) {
gp <- gausspr(t, y)
predicted_y <- predict(gp, t)
y <- y - predicted_y
return (y)
}
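# Illustrative usage sketch (assumes `tSim` and `ySim` from the getLightCurve example above):
# yWhitened <- getGPRResid(tSim, ySim) # flux residuals after removing the Gaussian Processes Regression fit.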
# THIS IS A TRIAL FUNCTION, NOT USED ANYWHERE: it can be used to undifference a light curve.
# **Warning**: It is not guaranteed to work as expected.
undifferenceATimeSeries <- function(
differenced, # The differenced time series.
orig_first_observation # The first observation value in the original (undifferenced) time series.
# The original time series is the one we want to recover. This function assumes the user
# differenced the original light curve themselves, in which case they have access to its first value.
) {
undifferenced <- c()
undifferenced <- c(undifferenced, orig_first_observation)
undiff_value <- undifferenced[1]
for (i in 1:length(differenced)) {
undiff_value <- undiff_value + differenced[i]
undifferenced <- c(undifferenced, undiff_value)
}
return (undifferenced)
}
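# Illustrative sanity check (arbitrary assumed values): undifferencing should invert diff().
# xOrig <- c(1.0, 1.2, 0.9, 1.1)
# all.equal(undifferenceATimeSeries(diff(xOrig), xOrig[1]), xOrig) # expected to return TRUE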
# Function for folding a light curve at a given period.
plot_folded_lc <- function(lc, bestper) # bestper is the period at which you want to fold the light curve.
{
Phase = (lc[,1] - min(lc[,1], na.rm=TRUE)) %% bestper / bestper
Range_lc = 1.3*diff(range(lc[,2], na.rm=TRUE)) + min(lc[,2], na.rm=TRUE)
Text0.pos = 1.35*diff(range(lc[,2], na.rm=TRUE)) + min(lc[,2], na.rm=TRUE)
plot(
Phase, lc[,2], xlab = "Phase", ylab ="Normalized Flux", cex.axis=1.0, cex.lab=1.2, cex = 0.5,
pch = 20, col="#00000080", ylim=c(min(lc[,2], na.rm=TRUE), Range_lc), xaxs='i'
)
axis(1,at=seq(from=0, to=1, by=.1), labels = F, tcl=0.25)
axis(1,at=seq(from=0, to=1, by=.2), labels = F, tcl=0.5)
text(0.5, Text0.pos, "Phase-folded light curve", pos=1, cex=1.2)
}
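# Illustrative usage sketch (assumes `tSim` and `ySim` from the getLightCurve example above; the light curve is passed
# as a two-column matrix of time and flux, and the folding period is in the same time units as `tSim`, i.e. hours here):
# plot_folded_lc(cbind(tSim, ySim), bestper = 2 * 24) # fold at a 48-hour (2-day) period.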
# Function to create the frequency grid used for computing the BLS and TCF periodograms.
getFreqGridToTest <- function(
t, # Observation epochs (or the time values).
period=NULL, duration=NULL,
res=2, # This is the resolution of the time series, which controls the cadence: res=2 means a cadence of 30 min and res=1 means 1 hr, for example.
ofac=1, # Oversampling factor for frequency selection.
useOptimalFreqSampling=FALSE,
algo="BLS", # Either BLS or TCF.
lctype="sim" # Either real or sim.
) {
### Create a frequency grid.
## Ofir, 2014 - optimal frequency sampling - notes ##
# (1) "It is now easy to see that the frequency resolution ∆f is no longer constant - it depends on f itself due to the physics of the problem."
# (2) Section 3.2 also says that using a very fine frequency grid (suitable for long-period signals) (a) increases computation time a lot, and (b) makes the periodogram too sensitive to noise and less sensitive to actual real signals.
# In this code, there is also an option to use Ofir, 2014's suggestion to use the optimal frequency sampling rather than the default uniform frequency sampling.
# Note that the fact that we uniformly sample in "frequency" rather than "period" is itself a good choice: see last para in sec 7.1 in https://iopscience.iop.org/article/10.3847/1538-4365/aab766/pdf
# Note that while using a minimum frequency of zero is often not a problem (it does not add spurious peaks - as described in sec 7.1 in https://iopscience.iop.org/article/10.3847/1538-4365/aab766/pdf), here we start with min_freq = 1 / (duration of time series).
# One motivation for oversampling (from https://iopscience.iop.org/article/10.3847/1538-4365/aab766/pdf): "...it is important to choose grid spacings smaller than the expected widths of the periodogram peaks...To ensure that our grid sufficiently samples each peak, it is prudent to oversample by some factor—say, n0 samples per peak--and use a grid of size 1 / (n0 * T)"
# The above paper also says that n0 = 5 to 10 is common.
# Observation: TCF peak changes as res changes. In fact, the estimated period scales by `res`. This is expected as per our conversation. So we need to scale the estimated period with res.
# I think the reason for that could be that bls takes `t` (apart from `y`, the observation values), so it knows the time-spacing internally whereas TCF does not take t.
perMin <- t[2*res+1] - t[1] # `res` is needed because t[3] - t[1] = 2 time units for res=1 but 1 for res=2. So t[3] - t[1] would become smaller and smaller for larger res, which should not happen since it will give errors at some point.
perMax <- t[length(t)] - t[1]
freqMin <- 1 / perMax
freqMax <- 1 / perMin
nfreq <- length(t) * 10 # This particular value is taken from BLS - see bls.R. Here, it is used for both BLS and TCF.
if (useOptimalFreqSampling & (lctype == "sim")) {
stop("Optimal frequency sampling does not work! Please pass useOptimalFreqSampling=FALSE")
if (algo == "BLS") {
q = duration / (period * 24) # single transit duration / period, both in same units.
}
else if (algo == "TCF") { # TODO: Confirm duty cycle scaling by res.
# Duty cycle for TCF taken from Caceres, 2019 methodology paper: https://iopscience.iop.org/article/10.3847/1538-3881/ab26b8
# Again, scaling by `res` is performed because TCF calculates periodogram in terms of cadence rather than absolute times.
q = res * 1 / (period * 24)
}
s = length(t)
freqStep = q / (s * ofac)
}
else {
# Note: When we oversample, we are essentially trying to impose no constraints on the frequencies to be tested - that is helpful in general: see https://arxiv.org/pdf/1712.00734.pdf
# Note that too much oversampling can lead to artifacts. These artifacts can be wrongly interpreted as a true periodic component in the periodogram.
freqStep <- (freqMax - freqMin) / (nfreq * ofac) # Oversampled by a factor, `ofac`.
}
freqGrid <- seq(from = freqMin, to = freqMax, by = freqStep) # Goes from ~0.001 to the max freq set by the time spacing (NOTE: fmax must be <= Nyquist frequency = 1/(2*delta_t) -- from Suveges, 2014), where delta_t here is 1/res.
freqGrid <- freqGrid[-length(freqGrid)] # Remove the last frequency. This is done to prevent periodogram returning nan power at frequency = Nyquist frequency.
print(sprintf("No. of frequencies in grid [algo = %s]: %f", algo, length(freqGrid)))
return (freqGrid);
}
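# Illustrative usage sketch (assumes `tSim` from the getLightCurve example above; the period/duration/ofac values are arbitrary assumptions):
# freqGridExample <- getFreqGridToTest(tSim, period = 2, duration = 2, res = 2, ofac = 2, useOptimalFreqSampling = FALSE, algo = "BLS")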
# Function to calculate the SNR of the periodogram peak. This automatically selects the highest peak and calculates SNR
# in a region around the peak. That region is defined by the argument oneSideWindowLength, which is set to 1500 by default.
# This means a window of 2*1500 = 3000 periods will be used as the region around the peak.
calculateSNR <- function( # TODO: For making this more efficient, compute trend fit only for region around the periodogram peak - will save some time.
periods, # The periods at which the periodogram is computed.
periodogramPower, # This power must not be detrended since it is done internally.
lambdaTrend=1, # Parameter passed to cobs.
oneSideWindowLength=1500 # NOTE: This is one side, the actual window length will be 2*oneSideWindowLength.
) {
cobsTrend <- cobs(log10(periods), periodogramPower, ic='BIC', tau=0.5, lambda=lambdaTrend)
detrended <- cobsTrend$resid
# Uncomment the below five lines to use standardization while SNR calculation.
# scatterWindowLength <- 100
# Scatter <- computeScatter(detrended, windowLength=scatterWindowLength)
# lambdaScatter <- 1
# cobsScatter <- cobs(periods, Scatter, ic='BIC', tau=0.5, lambda=lambdaScatter)
# detrended <- detrended / cobsScatter$fitted
# return (list(periods, detrended, cobsTrend$fitted, periodogramPower))
lowerInd <- which.max(detrended) - oneSideWindowLength
upperInd <- which.max(detrended) + oneSideWindowLength
if (which.max(detrended) - oneSideWindowLength < 1) {
lowerInd <- 1
}
if (which.max(detrended) + oneSideWindowLength > length(detrended)) {
upperInd <- length(detrended)
}
consider <- detrended[lowerInd:upperInd]
snr <- max(consider) / mad(consider)
return (snr)
}
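# Illustrative usage sketch (assumes `blsOut` is a BLS periodogram result exposing $periodsTested and $spec, as used in standardizeAPeriodogram below):
# snrExample <- calculateSNR(blsOut$periodsTested, blsOut$spec, oneSideWindowLength = 1500)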
# Using the microbenchmark CRAN package, this function times a periodogram run - BLS or TCF.
timeAnalysis <- function(
period, depth=0.01, duration=2, noiseType=1, ntransits=10,
gaussStd=1e-4, res=2, algo="BLS",
ofac=2, useOptimalFreqSampling=TRUE, times=1, lctype="sim", applyGPRforBLS=FALSE, applyARMAforBLS=FALSE
) {
yt <- getLightCurve(period, depth, duration, noiseType=noiseType, ntransits=ntransits, gaussStd=gaussStd, res=res, checkConditions=TRUE, seedValue=42)
y <- unlist(yt[1])
t <- unlist(yt[2])
freqGrid <- getFreqGridToTest(t, period, duration, res=res, ofac=ofac, useOptimalFreqSampling=useOptimalFreqSampling, algo=algo, lctype=lctype)
stopifnot(exprs={
all(freqGrid <= res / 2) # No frequency must be greater than the Nyquist frequency.
})
print(sprintf("Max frequency: %f, Min frequency: %f", max(freqGrid), min(freqGrid)))
if (algo == "BLS") {
print("BLS periodogram time benchmark...")
if (isTRUE(noiseType == 2) | applyGPRforBLS) {
y <- getGPRResid(t, y) # Run Gaussian Processes Regression on light curve if autoregressive noise is present.
}
if (applyARMAforBLS) {
y <- getARMAresid(y)
}
microbenchmark(
bls(y, t, bls.plot = FALSE, per.min=min(1/freqGrid), per.max=max(1/freqGrid), nper=length(freqGrid)),
times=times
)
}
else if (algo == "TCF") {
fstep <- (max(freqGrid) - min(freqGrid)) / length(freqGrid)
freqs <- seq(from = min(freqGrid), by = fstep, length.out = length(freqGrid))
periodsToTry <- 1 / freqs
residTCF <- getResidForTCF(y)
print("TCF periodogram time benchmark...")
microbenchmark(
tcf(residTCF, p.try = periodsToTry * res, print.output = FALSE),
times=times
)
}
}
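# Illustrative usage sketch (arbitrary assumed values): benchmark a single BLS run on a simulated light curve.
# Note that useOptimalFreqSampling = FALSE is required for simulated light curves (see getFreqGridToTest).
# timeAnalysis(period = 2, depth = 0.1, duration = 2, noiseType = 1, ntransits = 10, algo = "BLS", ofac = 2, useOptimalFreqSampling = FALSE, times = 1)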
# Function to divide the frequency grid (freqGrid) into K*L frequencies used in the extreme value application.
freqdivideFreqGrid <- function(freqGrid, L, K, seedValue=1) {
set.seed(seedValue) # Set seed for reproducibility.
if ((K %% 2) == 0) {
safeDist <- 1 + K/2 # 1 is added to be safer at the edges of the frequency grid. This is just a hack.
}
else {
safeDist <- 1 + (K-1)/2 # 1 is added to be safer at the edges of the frequency grid. This is just a hack.
}
endIndex <- length(freqGrid) - safeDist
freqConsider <- freqGrid[safeDist:endIndex]
KLfreqs <- rand_parts(freqConsider, n=L, l=K)
return (unlist(KLfreqs))
}
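# Illustrative usage sketch (assumes `freqGridExample` from the getFreqGridToTest example above; L and K are arbitrary assumptions):
# klFreqs <- freqdivideFreqGrid(freqGridExample, L = 5, K = 100) # 5 random blocks of 100 consecutive frequencies each.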
computeScatter <- function(
data, # The data for which the scatter needs to be computed. This is generally the detrended periodogram.
windowLength=1000, # Window width passed directly as `k` to caTools::runmad. (In the commented-out manual implementation below, this was treated as a one-sided length, giving an effective window of 2*windowLength.)
algo="BLS" # Periodogram algorithm to use. BLS or TCF.
){
runmad(data, k=windowLength, endrule='mad')
# ** LIMITATIONS of this function **
# 1. windowLength must be <= length(data) / 2
# scatterVals = c()
# for (i in 1:length(data)) {
# if (i <= windowLength) {
# u = 2 * windowLength
# l = i + 1
# scatterVal = mad(c(data[1:i], data[l:u]))
# stopifnot(exprs={
# length(c(data[1:i], data[l:u])) == 2 * windowLength
# })
# # print(length(c(data[1:i], data[l:u])))
# scatterVals <- append(scatterVals, scatterVal)
# }
# else if (i + windowLength >= length(data)) {
# ll = i - 2 * windowLength - i + length(data)
# ul = i - 1
# ls = i
# us = length(data)-1
# # stopifnot(exprs={
# # length(c(data[ll:ul], data[ls:us])) == 2 * windowLength
# # })
# # print(length(c(data[ll:ul], data[ls:us])))
# scatterVal = mad(c(data[ll:ul], data[ls:us]))
# scatterVals <- append(scatterVals, scatterVal)
# }
# else {
# l_ = i-windowLength
# u_ = i+windowLength-1
# stopifnot(exprs={
# length(data[l_:u_]) == 2 * windowLength
# })
# scatterVal <- mad(data[l_:u_])
# scatterVals <- append(scatterVals, scatterVal)
# }
# }
# stopifnot(exprs={
# length(scatterVals) == length(data)
# })
# return (scatterVals)
}
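# Illustrative usage sketch (assumes `detrendedExample` is a detrended periodogram vector, e.g. the cobs residuals computed in standardizeAPeriodogram below):
# scatterExample <- computeScatter(detrendedExample, windowLength = 1001)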
standardizeAPeriodogram <- function(
output,
periodsToTry=NULL, # This argument is only needed when algo="TCF" and not needed for algo="BLS".
algo="BLS",
mode='detrend', # Default 'detrend' performs only detrending; the other option, 'detrend_normalize', additionally normalizes by the local scatter (see below).
scatterWindowLength=1001
) {
lambdaTrend <- 1
lambdaScatter <- 1
# (1) Remove trend.
if (algo == "BLS") {
cobsxy50 <- cobs(log10(output$periodsTested), output$spec, ic='BIC', tau=0.5, lambda=lambdaTrend) # If tau = 0.5 and lambda = 0 => Median regression fit.
}
else if (algo == "TCF") {
cobsxy50 <- cobs(log10(periodsToTry), output$outpow, ic='BIC', tau=0.5, lambda=lambdaTrend)
}
periodogramTrendRemoved <- cobsxy50$resid
# TLS removes the mean from the detrended periodogram so that mean is zero.
# See https://github.com/hippke/tls/blob/71da590d3e199264822db425ab9f9f633253986e/transitleastsquares/stats.py
# But it is not clear why that would be needed, so we do not remove the mean here.
# periodogramTrendRemoved <- periodogramTrendRemoved - mean(periodogramTrendRemoved)
if (mode == 'detrend_normalize') {
# (2) Remove local scatter in periodogram.
Scatter <- computeScatter(periodogramTrendRemoved, windowLength=scatterWindowLength)
if (algo == "BLS") {
cobsScatter <- cobs(log10(output$periodsTested), Scatter, ic='BIC', tau=0.5, lambda=lambdaScatter)
}
else if (algo == "TCF") {
cobsScatter <- cobs(log10(periodsToTry), Scatter, ic='BIC', tau=0.5, lambda=lambdaScatter)
}
# We do not simply divide by the "global" standard deviation.
# cobsScatter <- sd(periodogramTrendRemoved)
normalizedPeriodogram <- periodogramTrendRemoved / cobsScatter$fitted
return (list(normalizedPeriodogram, cobsxy50))
}
else { # mode == 'detrend'
return (list(periodogramTrendRemoved, cobsxy50))
}
}
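# Illustrative usage sketch (assumes `blsOut` is a BLS periodogram result exposing $periodsTested and $spec):
# stdOut <- standardizeAPeriodogram(blsOut, algo = "BLS", mode = "detrend")
# detrendedPower <- stdOut[[1]] # trend-removed periodogram power
# trendFit <- stdOut[[2]] # the cobs trend fit object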