-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFitFun.R
707 lines (677 loc) · 53.5 KB
/
FitFun.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
# FitFun: Fitting Empirical Fundamental Diagrams Of Road Traffic
#
# Description: This script is an implementation of the FitFun modelling framework for empirical fundamental diagrams (EFDs; see Bramich, Menendez
# & Ambuhl, 2023, Transportation Research Part C, 151, 104068). It can be used to fit EFD data in either flow-density or speed-density
# form with any model from an extensive list of fundamental diagram models. The input EFD data, which are typically generated by a single
# suitable detector unit at a specific location on a road, should consist of measurements pairs of traffic flow (veh/h) or traffic speed
# (km/h) with traffic density (veh/km), and a time stamp associated with each measurement pair (see below for the input data file format).
# If the detector measures the traffic flow q(x,t) as the average number of vehicles passing location x per unit time, the traffic speed
# v(x,t) as the arithmetic mean of the speeds of the vehicles over some distance at time t, and the traffic density k(x,t) as the average
# number of vehicles per unit length at time t, then, by definition, these three quantities are related to each other by the fundamental
# equation of traffic flow:
#
# q(x,t) = k(x,t)*v(x,t) (1)
#
# Loop detectors (LDs) are commonly used to generate EFD data by aggregation of their raw signals over short time intervals (typically
# ~1-10 mins). While LDs can perform direct measurements of traffic flow, they are unable to perform direct measurements of traffic
# density. LDs instead provide measurements of occupancy. The occupancy is computed as the fraction of time in a given time interval
# that a vehicle was within the detection zone of the LD (i.e. it is a number in the range 0 to 1). The occupancy measured by an LD is
# approximately proportional to the traffic density, and therefore it is standard practice to model traffic flow, or speed, as a function
# of occupancy as a proxy for modelling the calibrated flow-density, or speed-density, fundamental diagram, respectively. This script can
# also be used to fit flow-occupancy and speed-occupancy EFD data.
# The modelling of EFD data within the FitFun framework is based on the assumption that there is an underlying speed-density
# relationship such that speed is a function of density only, in which case equation (1) becomes:
#
# q(k) = k*v(k) (2)
#
# Models for EFD data specified within the FitFun framework are Generalised Additive Models for Location, Scale, and Shape (GAMLSS;
# Rigby & Stasinopoulos, 2005, Applied Statistics Series C, 54, 507; Stasinopoulos et al., 2017, CRC Press LLC, Boca Raton; Rigby et al.,
# 2019, CRC Press LLC, New York), and they consist of two components; namely, a functional form model component, and a noise model
# component, where the former is assumed to be the same as the underlying functional form q(k) or v(k) of the fundamental diagram
# relationship.
# This script implements a wide range of fundamental diagram models specified within the FitFun framework. The available models employ
# all of the 50 previously proposed model components for the functional form of the fundamental diagram relationship that have been
# comprehensively collated and reviewed by Bramich, Menendez & Ambuhl (2022, IEEE Transactions On Intelligent Transportation Systems, 23,
# 14104). They also employ the four noise model components described in Bramich, Menendez & Ambuhl (2023, Transportation Research Part C,
# 151, 104068). The models are fit to the EFD data using maximum likelihood (ML) estimation, or maximum penalised likelihood (MPL)
# estimation, as appropriate (for more details see Bramich, Menendez & Ambuhl, 2023, Transportation Research Part C, 151, 104068). The
# script also computes the values of two different information criteria for any model fit, which enables model comparisons based on the
# Principle of Parsimony to be performed (see below).
# While the values of the information criteria provided by this script can be used to select the most parsimonious model for an EFD
# from a set of candidate models, there is no guarantee that the selected model actually provides an acceptable fit to the data (e.g.
# this situation typically occurs when none of the candidate models considered are capable of achieving acceptable fit quality for the
# data in question). Hence it is essential to assess the quality of any model fit, and this can be done through the analysis of the fit
# residuals. For this purpose, this script computes the fit residuals as normalised quantile residuals (NQR). For more information on
# what NQR are, please see Chapter 12 of Stasinopoulos et al. (2017, CRC Press LLC, Boca Raton) and Bramich, Menendez & Ambuhl (2023,
# Transportation Research Part C, 151, 104068). However, this script does not provide any analysis of the NQR with regards to fit quality,
# and this important task is deferred to the user. Since the definition of the NQR ensures that they follow a standard Normal distribution
# whenever the fitted model is the true model, it is standard procedure to check the distribution of the NQR against the standard Normal
# distribution. For example, one way that this can be done is by testing whether or not the null hypothesis that the NQR follow the
# standard Normal distribution can be rejected at a specific significance level such as 1% or 5% (e.g. by employing the Anderson-Darling
# test). We stress that, wherever possible, decisions on which fundamental diagram model to use should be informed through a combination
# of appropriate model comparisons (e.g. via information criteria) and analysis of fit residuals (e.g. via hypothesis testing on the NQR).
# The specific procedures for fitting each individual model that is available in this script are separated out into a single R module
# per model. These modules have names of the form "fit_flow_density_with_<functional_form_model>_<noise_model>.R" or
# "fit_speed_density_with_<functional_form_model>_<noise_model>.R", and they each contain a single function of the same name (see below
# under "Command-Line Arguments" for the definitions of "functional_form_model" and "noise_model"). Any parameters that are specific to
# a procedure for fitting a particular model are defined within the relevant module itself, and their default values can be modified if
# necessary. For example, in the modules that fit models with a fixed jam density, the jam density is set by default to "1.0", which may
# be appropriate for fits to flow-occupancy or speed-occupancy data, but will surely not be appropriate for fits to flow-density or
# speed-density data.
#
# Important Notes On Model Selection Criteria:
#
# Very often in scientific modelling, one faces the task of selecting an optimal (or best) model for a data sample from a set of candidate models
# (multiple working hypotheses). In this context, "optimality" refers both to the Principle of Parsimony, in that the best model should constitute
# the simplest model that provides a good fit to the data without under- or over-fitting, and to appropriate/relevant model performance measure(s).
# Model estimation via ML assumes a uniform prior probability density function on the model parameters. Consequently, as parameters are added to a
# model, the ML always increases, rendering it useless for the purpose of model selection between models with different dimensionalities.
# Information criteria are used as an alternative for evaluating models with different numbers of parameters. Various information criteria have
# been developed from distinct statistical view-points as implementations of the Principle of Parsimony and each one may be used to automatically
# select a parsimonious model from a set of candidate models. They may be applied regardless of whether the models under consideration are nested
# or non-nested.
# The "gamlss" software, on which this script is based, computes both the Akaike information criterion (AIC; Akaike, 1974, IEEE Transactions on
# Automatic Control, 19, 716) and the Bayesian information criterion (BIC; Schwarz, 1978, The Annals of Statistics, 6, 461) for a GAMLSS model fit.
# Both of these information criteria apply to linear and non-linear models estimated via ML. The formulae for these information criteria are given
# by:
#
# AIC = -2*ln(Lmax) + 2*Npar (3)
#
# BIC = -2*ln(Lmax) + Npar*ln(Ndat) (4)
#
# where Lmax is the value of the likelihood function L at the vector of ML estimators for the model parameters, Npar is the total number of free
# parameters in the model, and Ndat is the number of data values. Model selection with the AIC or BIC is performed by minimising -2*ln(L) for each
# model, and then minimising the AIC or BIC, respectively, over the full set of models under consideration.
# The AIC is derived as an asymptotic approximation to the Kullback-Leibler divergence (Kullback & Leibler, 1951, The Annals of Mathematical
# Statistics, 22, 79) which measures the distance of a candidate model from the true underlying model under the (reasonable) assumption that the
# true model is of infinite dimension. In this case the AIC provides an asymptotically efficient selection of a finite dimensional approximating
# model. However, if the true model is finite dimensional, then the AIC does not provide a consistent model selection. A model selection criterion
# is said to be consistent if it selects with high probability the true model from the set of candidate models whenever the true model is
# represented in the set of candidate models. The aim of the AIC is to evaluate models based on their prediction accuracy.
# It is important to be aware that the AIC suffers from a large negative bias when the number of model parameters is a non-negligible fraction
# of the number of data values (e.g. for Ndat/Npar < 40; Hurvich & Tsai, 1989, Biometrika, 76, 297). This bias can be corrected for, but at the
# cost of general applicability, since the formula for the bias correction depends on the statistical model that is being fitted. For example,
# Sugiura (1978, Communications in Statistics - Theory and Methods, 7, 13) derived a bias-corrected version of the AIC for Gaussian linear
# regression problems that is asymptotically the same as the AIC for Ndat >> Npar. Hence, the AIC in equation (3) should only be used for model
# selection for sufficiently large data samples.
# Takeuchi (1976, Mathematical Sciences, 153, 12) generalised the AIC with a more complicated formula to create the Takeuchi information
# criterion (TIC). Subsequently, Konishi & Kitagawa (1996, Biometrika, 83, 875) derived a further generalisation of the AIC and TIC, called the
# generalised information criterion (GIC), that can also be applied to model selection for models with parameters estimated by MPL. The "gamlss"
# software only implements the AIC from the available AIC-like information criteria. Consequently, for GAMLSS models that employ non-parametric
# smoothing terms estimated by MPL, the AIC is computed by using the vector of MPL estimators for the model parameters in place of the vector of
# ML estimators, and the trace of the smoother matrix as an estimate of the (effective) number of free parameters used in the fit.
# An alternative approach to model selection is a Bayesian approach where the model with the largest Bayesian posterior probability is chosen.
# The BIC is derived by approximating the posterior probability of each model. The BIC generally includes a heavier penalty than the AIC (and the
# bias-corrected AIC) for more complicated models (e.g. in the regime Npar < 20 for Ndat > 50), therefore favouring models with fewer parameters
# than those favoured by the AIC. Konishi, Ando & Imoto (2004, Biometrika, 91, 27) performed a deeper Bayesian analysis to derive an improved BIC,
# along with a version that applies to model selection for models with parameters estimated by MPL. The BIC and the improved BIC are consistent
# model selection criteria. Again, the "gamlss" software only implements the BIC from the available BIC-like information criteria. Consequently,
# for GAMLSS models that employ non-parametric smoothing terms estimated by MPL, the BIC is computed by using the vector of MPL estimators for the
# model parameters in place of the vector of ML estimators, and the trace of the smoother matrix as an estimate of the (effective) number of free
# parameters used in the fit.
#
# Usage:
#
# To run this script, issue the following command:
#
# <path_to_binary>/Rscript --vanilla <path_to_script>/FitFun.R <path_to_modules> <data_file> <output_dir> <overwrite> <fd_type>
# <functional_form_model> <noise_model> <ngrid> <upper_density> <plot_format>
#
# where "path_to_binary" is the full directory path to the "Rscript" binary (usually "/usr/bin") and "path_to_script" is the full directory path
# to where the script "FitFun.R" is stored. The definition of each of the command-line arguments "path_to_modules", "data_file", "output_dir",
# "overwrite", "fd_type", "functional_form_model", "noise_model", "ngrid", "upper_density", and "plot_format" can be found below. If
# "path_to_binary" is in the user's path, then it can be dropped. Also, if the script is being run from the directory where it resides, then
# "path_to_script" can also be dropped. In this case, the command reduces to:
#
# Rscript --vanilla FitFun.R <path_to_modules> <data_file> <output_dir> <overwrite> <fd_type> <functional_form_model> <noise_model> <ngrid>
# <upper_density> <plot_format>
#
# Command-Line Arguments:
#
# path_to_modules - STRING - The full directory path to where the R modules required by this script are stored.
# data_file - STRING - File name of the input data file. This argument can be supplied with or without a full directory path. If this argument
# is supplied without a full directory path, then the script will look for the input data file in the directory from where
# the script is run. See below for the required format of the data file.
# output_dir - STRING - The output directory where the output files are to be written. This argument can be supplied with or without a full
# directory path. If this argument is supplied without a full directory path, then the script will assume that the output
# directory is inside the directory from where the script is run. The following output files will be written to the output
# directory:
#
# Fit.Summary.<fd_type>.<functional_form_model>.<noise_model>.txt
# Fit.Curves.<fd_type>.<functional_form_model>.<noise_model>.txt
# Fit.Predictions.<fd_type>.<functional_form_model>.<noise_model>.txt
# Plot.Of.Fitted.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Residuals.From.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Percentiles.And.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Normalised.Quantile.Residuals.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Fitted.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Residuals.From.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Percentiles.And.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Normalised.Quantile.Residuals.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Normalised.Quantile.Residuals.Versus.Mu.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Normalised.Quantile.Residuals.Versus.Time.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Detrended.Normal.QQ.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# Plot.Of.Slotted.ACF.For.Normalised.Quantile.Residuals.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
#
# See below for a description of each of the output files.
# overwrite - STRING - If this argument is set to 'yes', then the script will overwrite any of the output files that already exist in the output
# directory "output_dir". If this argument is set to 'no', then the script will stop without doing anything if it finds that
# any of the output files already exist in the output directory "output_dir". If this argument is set to any other string,
# then the script will fail.
# fd_type - STRING - The form of the EFD data as a string in the format "<dependent_variable>.<independent_variable>". The acceptable values for
# this argument are 'Flow.Density' and 'Speed.Density'. If this argument is set to any other string, then the script will fail.
# functional_form_model - STRING - The abbreviated name of the functional form component of the model that is to be fitted to the EFD data. The
# acceptable values for this argument are (Bramich, Menendez & Ambuhl, 2022, IEEE Transactions On Intelligent
# Transportation Systems, 23, 14104):
#
# 'FF' - Free-flow model
# 'GS1935' - Greenshields model
# 'GS1935kjf' - Greenshields model (fixed jam density)
# 'GB1959' - Greenberg model
# 'GB1959kjf' - Greenberg model (fixed jam density)
# 'ED1961' - Edie multi-regime model
# 'ED1961kjf' - Edie multi-regime model (fixed jam density)
# 'UW1961A' - Underwood model A
# 'UW1961B' - Underwood model B
# 'UW1961Bkjf' - Underwood model B (fixed jam density)
# 'FN1961' - Franklin-Newell model
# 'FN1961kjf' - Franklin-Newell model (fixed jam density)
# 'GZ1961A' - Gazis model A
# 'GZ1961Akjf' - Gazis model A (fixed jam density)
# 'GZ1961B' - Gazis model B
# 'GZ1961Bkjf' - Gazis model B (fixed jam density)
# 'GZ1961C' - Gazis model C
# 'GZ1961Ckjf' - Gazis model C (fixed jam density)
# 'GZ1961D' - Gazis model D
# 'GZ1961Dkjf' - Gazis model D (fixed jam density)
# 'GZ1961E' - Gazis model E
# 'GZ1961Ekjf' - Gazis model E (fixed jam density)
# 'GZ1961F' - Gazis model F
# 'GZ1961G' - Gazis model G
# 'GZ1961Gkjf' - Gazis model G (fixed jam density)
# 'GZ1961H' - Gazis model H
# 'GZ1961Hkjf' - Gazis model H (fixed jam density)
# 'DK1966A' - Drake multi-regime model A
# 'DK1966Akjf' - Drake multi-regime model A (fixed jam density)
# 'DK1966B' - Drake multi-regime model B
# 'DK1966Bkjf' - Drake multi-regime model B (fixed jam density)
# 'MJ1971' - Munjal multi-regime model
# 'MJ1971kjf' - Munjal multi-regime model (fixed jam density)
# 'BM1977' - Boardman model
# 'VA1995' - van Aerde model
# 'VA1995kjf' - van Aerde model (fixed jam density)
# 'BD1995' - Bando model
# 'DC1995A' - Del Castillo model A
# 'DC1995Akjf' - Del Castillo model A (fixed jam density)
# 'DC2012B' - Del Castillo model B
# 'DC2012Bkjf' - Del Castillo model B (fixed jam density)
# 'GD2008' - Ghandehari model
# 'GD2008kjf' - Ghandehari model (fixed jam density)
# 'MN2008' - MacNicholas model
# 'MN2008kjf' - MacNicholas model (fixed jam density)
# 'WG2011A' - Wang model A
# 'WG2011B' - Wang model B
# 'WG2011C' - Wang model C
# 'SN2014' - Sun model
# 'SN2014kjf' - Sun model (fixed jam density)
#
# If this argument is set to any other string, then the script will fail.
# noise_model - STRING - The abbreviated name of the noise component of the model that is to be fitted to the EFD data. The acceptable values for
# this argument are (Bramich, Menendez & Ambuhl, 2023, Transportation Research Part C, 151, 104068):
#
# 'GaussSigCon' - Gaussian noise model with constant variance.
# 'GaussSigNS5p' - Gaussian noise model with density-dependent variance. Specifically, the density dependence of the log of
# the standard deviation is modelled using natural cubic splines with five effective free parameters.
# 'SN2SigNS5pNuNS3p' - Skew Normal Type II noise model with density-dependent scale and skewness parameters. Specifically,
# the density dependence of the log of the scale parameter and the log of the skewness parameter is
# modelled using natural cubic splines with five and three effective free parameters, respectively.
# 'SEP3SigNS5pNuNS3pTauL2p' - Skew Exponential Power Type III noise model with density-dependent scale, skewness, and
# kurtosis parameters. Specifically, the density dependence of the log of the scale parameter
# and the log of the skewness parameter is modelled using natural cubic splines with five and
# three effective free parameters, respectively. The density dependence of the log of the
# kurtosis parameter is modelled using a straight line function (i.e. an intercept and a
# gradient as the two free parameters).
#
# If this argument is set to any other string, then the script will fail.
# ngrid - INTEGER - For the purpose of reconstructing the fitted model, the script will define an equally spaced grid of "ngrid" density values.
# This argument must be a positive number greater than or equal to "11". It is recommended to set this argument to at least
# "10001".
# upper_density - FLOAT - For the purpose of reconstructing the fitted model, the script will define a grid of equally spaced density values
# covering the range from zero to "upper_density". This argument must be a positive number, and it must also be greater
# than or equal to the maximum value of the independent variable (i.e. density or occupancy) in the data.
# plot_format - STRING - The file format for the plot files that the script produces. The acceptable values for this argument are 'ps', 'pdf',
# 'png', and 'none'. If this argument is set to any other string, then the script will fail. In the case that this argument
# is set to 'none', then the script will not produce any plot files.
#
# Input Data File:
#
# The input data file "data_file" should be a non-empty ASCII text file with exactly three columns separated by white space of any length. The
# data file should not contain any header lines, or any lines that are not data lines, and it should have at least 5 data lines. The required
# columns are:
#
# Column 1 - INTEGER/FLOAT - The time stamp corresponding to each traffic measurement. For traffic measurements made over time intervals, this
# could be the start/midpoint/end of the time interval. The units of time, and, if relevant, the point within a time
# interval to which the time stamp corresponds, are not important, so long as they are consistent over all of the time
# stamps since this allows reliable time differences to be computed. Currently, the data in this column are only used
# for the computation of the slotted auto-correlation function for the NQR.
# Column 2 - INTEGER/FLOAT - The measured value of the independent variable (i.e. density or occupancy). All values in this column must be
# positive.
# Column 3 - INTEGER/FLOAT - The measured value of the dependent variable (i.e. flow or speed). All values in this column must be non-negative,
# and there must be at least one value that is non-zero.
#
# Output Files:
#
# Fit.Summary.<fd_type>.<functional_form_model>.<noise_model>.txt - This output text file provides a set of summary information for the fitted
# model. The contents of the file are fully documented within the file itself.
# Fit.Curves.<fd_type>.<functional_form_model>.<noise_model>.txt - This output text file provides the reconstructed fitted model, percentile
# curves, and curves of distributional measures (e.g. mean, mode, standard
# deviation, moment skewness, etc.), for the equally spaced grid of "ngrid"
# density values covering the range from zero to "upper_density". A header line
# provides the column descriptions.
# Fit.Predictions.<fd_type>.<functional_form_model>.<noise_model>.txt - This output text file provides the predicted values for the fitted model
# at the density values in the data and the NQR. It also provides the values
# of the percentiles and a set of distributional measures (e.g. mean, mode,
# standard deviation, moment skewness, etc.) for the fitted model at the
# density values in the data. A header line provides the column descriptions.
# Plot.Of.Fitted.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the flow or speed data versus density (red points). The fitted model
# component for "mu" over the density range from zero to the maximum observed
# density is plotted as a black curve.
# Plot.Of.Residuals.From.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the residuals of the flow or speed data from the fitted model component
# for "mu" versus density (red points). The plot density range is from zero to
# the maximum observed density.
# Plot.Of.Percentiles.And.Mu.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the flow or speed data versus density (red points). The fitted model
# component for "mu" over the density range from zero to the maximum observed
# density is plotted as a black curve. The percentile ranges corresponding to
# 0.135 to 2.28, 2.28 to 15.87, and 15.87 to 50.0 are plotted as lighter to
# darker grey regions, respectively, below the median curve (dashed blue curve).
# The median curve may not be visible if it is coincident with the "mu" curve.
# The percentile ranges corresponding to 50.0 to 84.13, 84.13 to 97.72, and 97.72
# to 99.865 are plotted as darker to lighter grey regions, respectively, above
# the median curve. The percentile boundaries correspond to -3*sigma, -2*sigma,
# -1*sigma, the median, 1*sigma, 2*sigma, and 3*sigma in a Normal distribution.
# Plot.Of.Normalised.Quantile.Residuals.For.Data.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the NQR for the flow or speed data versus density (red points). The plot
# density range is from zero to the maximum observed density. The grey regions are
# percentile ranges as described for previous plots while the median line is
# coincident with the horizontal dotted line.
# Plot.Of.Fitted.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the flow or speed data versus density (red points). The fitted model
# component for "mu" over the density range from zero to "upper_density" is
# plotted as a black curve.
# Plot.Of.Residuals.From.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the residuals of the flow or speed data from the fitted model component
# for "mu" versus density (red points). The plot density range is from zero to
# "upper_density".
# Plot.Of.Percentiles.And.Mu.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the flow or speed data versus density (red points). The fitted model
# component for "mu" over the density range from zero to "upper_density" is
# plotted as a black curve. The percentile ranges corresponding to 0.135 to 2.28,
# 2.28 to 15.87, and 15.87 to 50.0 are plotted as lighter to darker grey regions,
# respectively, below the median curve (dashed blue curve). The median curve may
# not be visible if it is coincident with the "mu" curve. The percentile ranges
# corresponding to 50.0 to 84.13, 84.13 to 97.72, and 97.72 to 99.865 are plotted
# as darker to lighter grey regions, respectively, above the median curve. The
# percentile boundaries correspond to -3*sigma, -2*sigma, -1*sigma, the median,
# 1*sigma, 2*sigma, and 3*sigma in a Normal distribution.
# Plot.Of.Normalised.Quantile.Residuals.For.Full.Density.Range.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the NQR for the flow or speed data versus density (red points). The plot
# density range is from zero to "upper_density". The grey regions are percentile
# ranges as described for previous plots while the median line is coincident with
# the horizontal dotted line.
# Plot.Of.Normalised.Quantile.Residuals.Versus.Mu.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the NQR for the flow or speed data versus the fitted values for "mu"
# (red points). The grey regions are percentile ranges as described for previous
# plots while the median line is coincident with the horizontal dotted line.
# Plot.Of.Normalised.Quantile.Residuals.Versus.Time.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the NQR for the flow or speed data versus time (red points). The grey
# regions are percentile ranges as described for previous plots while the median
# line is coincident with the horizontal dotted line.
# Plot.Of.Detrended.Normal.QQ.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Given a relevant set of theoretical quantiles from a standard Normal
# distribution, this output file displays a plot of the difference between the
# empirical NQR quantiles and the theoretical quantiles (units of sigma) versus
# the theoretical quantiles (units of sigma; red points). The approximate
# point-wise 95% confidence interval is plotted as a pair of dashed curves. This
# sort of plot is referred to as a detrended Normal quantile-quantile plot (or
# worm plot). For more details, see van Buuren & Fredriks (2001, Statistics In
# Medicine, 20, 1259).
# Plot.Of.Slotted.ACF.For.Normalised.Quantile.Residuals.<fd_type>.<functional_form_model>.<noise_model>.<plot_format>
# - Plot of the slotted auto-correlation function (slotted ACF) versus time lag
# (light grey bars) for the NQR. Uncertainties on the slotted ACF values are
# plotted as error bars. For more details, see Edelson & Krolik (1988,
# Astrophysical Journal, 333, 646). In the implementation in this script, the
# time lag bin size dTbinsize is set to the median time difference between
# consecutive traffic measurements (see the parameter "time_lag_bin_size" in
# the function "plotH" in "fitfun_functions.R"), and the number of time lag
# bins Nbin is hard-coded (see the parameter "time_lag_nbins" in the function
# "plotH" in "fitfun_functions.R"). Taken together, these two parameters define
# the maximum time difference that is considered in the computation of the
# slotted ACF as dTbinsize*(Nbin - 0.5). Since these parameters are not
# available as command-line arguments in this script, they must be changed
# directly in the code in "fitfun_functions.R" if so required.
#
# Requirements:
#
# R (Version >= 3.6.1)
# R Package: data.table (Version >= 1.12.8)
# R Package: gamlss (Version >= 5.1-6)
# R Package: ggplot2 (Version >= 3.2.1) [Not required if "plot_format" is set to 'none']
# R Package: ggpubr (Version >= 0.2.5) [Not required if "plot_format" is set to 'none']
#
# Authors:
#
# Dan Bramich (dan.bramich@hotmail.co.uk)
# Lukas Ambuhl (lukas.ambuehl@ivt.baug.ethz.ch)
# Ingest the command-line arguments
cat('\n')
cat('>-----------------------------------------------------------------------------<\n')
cat('\n')
cat('FitFun: Fitting Empirical Fundamental Diagrams Of Road Traffic\n')
cat('\n')
cat('Dan Bramich & Lukas Ambuhl\n')
cat('\n')
cat('>-----------------------------------------------------------------------------<\n')
cat('\n')
cat('Ingesting the command-line arguments...\n')
args = commandArgs(trailingOnly = TRUE)
nargs = length(args)
if (nargs < 10) {
cat('ERROR - Too few command-line arguments specified...\n')
q(save = 'no', status = 1)
} else if (nargs > 10) {
cat('ERROR - Too many command-line arguments specified...\n')
q(save = 'no', status = 1)
} else {
path_to_modules = args[1]
data_file = args[2]
output_dir = args[3]
overwrite = args[4]
fd_type = args[5]
functional_form_model = args[6]
noise_model = args[7]
ngrid = as.integer(args[8])
upper_density = as.double(args[9])
plot_format = args[10]
}
# Perform checks on the command-line arguments and report their values
cat('\n')
cat('Path to R modules: ', path_to_modules, '\n')
if (!dir.exists(path_to_modules)) {
cat('ERROR - The directory path to the required R modules does not exist...\n')
q(save = 'no', status = 1)
}
cat('Input data file: ', data_file, '\n')
if (!file.exists(data_file)) {
cat('ERROR - The input data file does not exist...\n')
q(save = 'no', status = 1)
}
cat('Output directory: ', output_dir, '\n')
if ((overwrite != 'yes') && (overwrite != 'no')) {
cat('ERROR - The command-line argument "overwrite" does not have an acceptable value...\n')
q(save = 'no', status = 1)
}
cat('Overwrite previous files: ', overwrite, '\n')
acceptable_values = c('Flow.Density', 'Speed.Density')
if (!is.element(fd_type, acceptable_values)) {
cat('ERROR - The command-line argument "fd_type" does not have an acceptable value...\n')
q(save = 'no', status = 1)
}
cat('Fundamental diagram type: ', fd_type, '\n')
acceptable_values = c('FF', 'GS1935', 'GS1935kjf', 'GB1959', 'GB1959kjf', 'ED1961', 'ED1961kjf', 'UW1961A', 'UW1961B',
'UW1961Bkjf', 'FN1961', 'FN1961kjf', 'GZ1961A', 'GZ1961Akjf', 'GZ1961B', 'GZ1961Bkjf', 'GZ1961C', 'GZ1961Ckjf',
'GZ1961D', 'GZ1961Dkjf', 'GZ1961E', 'GZ1961Ekjf', 'GZ1961F', 'GZ1961G', 'GZ1961Gkjf', 'GZ1961H', 'GZ1961Hkjf',
'DK1966A', 'DK1966Akjf', 'DK1966B', 'DK1966Bkjf', 'MJ1971', 'MJ1971kjf', 'BM1977', 'VA1995', 'VA1995kjf',
'BD1995', 'DC1995A', 'DC1995Akjf', 'DC2012B', 'DC2012Bkjf', 'GD2008', 'GD2008kjf', 'MN2008', 'MN2008kjf',
'WG2011A', 'WG2011B', 'WG2011C', 'SN2014', 'SN2014kjf')
if (!is.element(functional_form_model, acceptable_values)) {
cat('ERROR - The command-line argument "functional_form_model" does not have an acceptable value...\n')
q(save = 'no', status = 1)
}
cat('Functional form model: ', functional_form_model, '\n')
acceptable_values = c('GaussSigCon', 'GaussSigNS5p', 'SN2SigNS5pNuNS3p', 'SEP3SigNS5pNuNS3pTauL2p')
if (!is.element(noise_model, acceptable_values)) {
cat('ERROR - The command-line argument "noise_model" does not have an acceptable value...\n')
q(save = 'no', status = 1)
}
cat('Noise model: ', noise_model, '\n')
if (is.na(ngrid)) {
cat('ERROR - The command-line argument "ngrid" is not an integer...\n')
q(save = 'no', status = 1)
}
if (ngrid < 11) {
cat('ERROR - The command-line argument "ngrid" is an integer with a value that is less than "11"...\n')
q(save = 'no', status = 1)
}
cat('No. of density grid points for model reconstruction:', ngrid, '\n')
if (is.na(upper_density)) {
cat('ERROR - The command-line argument "upper_density" is not a number...\n')
q(save = 'no', status = 1)
}
if (upper_density <= 0.0) {
cat('ERROR - The command-line argument "upper_density" is a number that is zero or negative...\n')
q(save = 'no', status = 1)
}
cat('Upper density for model reconstruction: ', sprintf('%.8g', upper_density), '\n')
acceptable_values = c('ps', 'pdf', 'png', 'none')
if (!is.element(plot_format, acceptable_values)) {
cat('ERROR - The command-line argument "plot_format" does not have an acceptable value...\n')
q(save = 'no', status = 1)
}
cat('Plot file format: ', plot_format, '\n')
# Load the required R libraries and functions
cat('\n')
cat('Loading the "data.table" R library...\n')
tryCatch(
{ library(data.table) },
error = function(cond) { cat('ERROR - Failed to load the "data.table" R library...\n')
q(save = 'no', status = 1) }
)
cat('Loading the "gamlss" R library...\n')
cat('----------------------------------------------------------------\n')
tryCatch(
{ library(gamlss) },
error = function(cond) { cat('ERROR - Failed to load the "gamlss" R library...\n')
cat('----------------------------------------------------------------\n')
q(save = 'no', status = 1) }
)
cat('----------------------------------------------------------------\n')
if (plot_format != 'none') {
cat('Loading the "ggplot2" R library...\n')
tryCatch(
{ library(ggplot2) },
error = function(cond) { cat('ERROR - Failed to load the "ggplot2" R library...\n')
q(save = 'no', status = 1) }
)
cat('Loading the "ggpubr" R library...\n')
cat('----------------------------------------------------------------\n')
tryCatch(
{ library(ggpubr) },
error = function(cond) { cat('ERROR - Failed to load the "ggpubr" R library...\n')
cat('----------------------------------------------------------------\n')
q(save = 'no', status = 1) }
)
cat('----------------------------------------------------------------\n')
}
cat('Loading some R functions specific to this script...\n')
tryCatch(
{ source(file.path(path_to_modules, 'fitfun_functions.R')) },
error = function(cond) { cat('ERROR - Failed to load the R functions...\n')
q(save = 'no', status = 1) }
)
# Define the names of the output files
tmpstr1 = paste0(fd_type, '.', functional_form_model, '.', noise_model)
output_files = character(length = 15)
output_files[1] = file.path(output_dir, paste0('Fit.Summary.', tmpstr1, '.txt'))
output_files[2] = file.path(output_dir, paste0('Fit.Curves.', tmpstr1, '.txt'))
output_files[3] = file.path(output_dir, paste0('Fit.Predictions.', tmpstr1, '.txt'))
if (plot_format == 'none') {
output_files = output_files[1:3]
} else {
tmpstr2 = paste0(tmpstr1, '.', plot_format)
output_files[4] = file.path(output_dir, paste0('Plot.Of.Fitted.Mu.For.Data.Density.Range.', tmpstr2))
output_files[5] = file.path(output_dir, paste0('Plot.Of.Residuals.From.Mu.For.Data.Density.Range.', tmpstr2))
output_files[6] = file.path(output_dir, paste0('Plot.Of.Percentiles.And.Mu.For.Data.Density.Range.', tmpstr2))
output_files[7] = file.path(output_dir, paste0('Plot.Of.Normalised.Quantile.Residuals.For.Data.Density.Range.', tmpstr2))
output_files[8] = file.path(output_dir, paste0('Plot.Of.Fitted.Mu.For.Full.Density.Range.', tmpstr2))
output_files[9] = file.path(output_dir, paste0('Plot.Of.Residuals.From.Mu.For.Full.Density.Range.', tmpstr2))
output_files[10] = file.path(output_dir, paste0('Plot.Of.Percentiles.And.Mu.For.Full.Density.Range.', tmpstr2))
output_files[11] = file.path(output_dir, paste0('Plot.Of.Normalised.Quantile.Residuals.For.Full.Density.Range.', tmpstr2))
output_files[12] = file.path(output_dir, paste0('Plot.Of.Normalised.Quantile.Residuals.Versus.Mu.', tmpstr2))
output_files[13] = file.path(output_dir, paste0('Plot.Of.Normalised.Quantile.Residuals.Versus.Time.', tmpstr2))
output_files[14] = file.path(output_dir, paste0('Plot.Of.Detrended.Normal.QQ.', tmpstr2))
output_files[15] = file.path(output_dir, paste0('Plot.Of.Slotted.ACF.For.Normalised.Quantile.Residuals.', tmpstr2))
}
# If the output directory "output_dir" already exists
if (dir.exists(output_dir)) {
# Report
cat('\n')
cat('The output directory already exists:', output_dir, '\n')
cat('Checking if any of the output files already exist...\n')
# For each output file
for (outfile in output_files) {
# If the current output file already exists
if (file.exists(outfile)) {
# If the command-line argument "overwrite" is set to 'yes'
if (overwrite == 'yes') {
# Remove the current output file
cat('Removing the output file:', outfile, '\n')
tryCatch(
{ file.remove(outfile) },
error = function(cond) { cat('ERROR - Failed to remove the output file...\n')
q(save = 'no', status = 1) }
)
# If the command-line argument "overwrite" is set to 'no'
} else {
# Stop the script without doing anything
cat('ERROR - The following output file already exists:', outfile, '\n')
q(save = 'no', status = 1)
}
}
}
# If the output directory "output_dir" does not already exist
} else {
# Create the output directory "output_dir"
cat('\n')
cat('Creating the output directory:', output_dir, '\n')
tryCatch(
{ dir.create(output_dir) },
error = function(cond) { cat('ERROR - Failed to create the output directory...\n')
q(save = 'no', status = 1) }
)
}
# Read in the data file "data_file"
cat('\n')
cat('Reading in the data file:', data_file, '\n')
tryCatch(
{ traffic_data = fread(data_file, header = FALSE) },
error = function(cond) { cat('ERROR - Failed to read in the data file...\n')
q(save = 'no', status = 1) }
)
# Perform basic checks on the data that have been read in
ntraffic_data = nrow(traffic_data)
if (ntraffic_data == 0) {
cat('ERROR - The data file is empty...\n')
q(save = 'no', status = 1)
}
if (ncol(traffic_data) != 3) {
cat('ERROR - The data file does not have exactly three columns...\n')
q(save = 'no', status = 1)
}
if (!is.numeric(traffic_data$V1)) {
cat('ERROR - The data in the first column of the data file are not numeric...\n')
q(save = 'no', status = 1)
}
if (!is.numeric(traffic_data$V2)) {
cat('ERROR - The data in the second column of the data file are not numeric...\n')
q(save = 'no', status = 1)
}
if (!is.numeric(traffic_data$V3)) {
cat('ERROR - The data in the third column of the data file are not numeric...\n')
q(save = 'no', status = 1)
}
if (ntraffic_data < 5) {
cat('ERROR - The data file has less than 5 data lines...\n')
q(save = 'no', status = 1)
}
if (!all(is.finite(traffic_data$V1))) {
cat('ERROR - At least one data value in the first column is infinite...\n')
q(save = 'no', status = 1)
}
if (!all(is.finite(traffic_data$V2))) {
cat('ERROR - At least one data value in the second column is infinite...\n')
q(save = 'no', status = 1)
}
if (!all(is.finite(traffic_data$V3))) {
cat('ERROR - At least one data value in the third column is infinite...\n')
q(save = 'no', status = 1)
}
if (any(traffic_data$V2 <= 0.0)) {
cat('ERROR - At least one data value in the second column is zero or negative...\n')
q(save = 'no', status = 1)
}
if (any(traffic_data$V3 < 0.0)) {
cat('ERROR - At least one data value in the third column is negative...\n')
q(save = 'no', status = 1)
}
if (all(traffic_data$V3 == 0.0)) {
cat('ERROR - All of the data values in the third column are zero...\n')
q(save = 'no', status = 1)
}
cat('No. of lines read in: ', ntraffic_data, '\n')
# Sort the data into increasing time order
cat('Sorting the data by time...\n')
tryCatch(
{ setorder(traffic_data, V1) },
error = function(cond) { cat('ERROR - Failed to sort the data...\n')
q(save = 'no', status = 1) }
)
# Make a further check on "upper_density"
if (upper_density < max(traffic_data$V2)) {
cat('ERROR - The command-line argument "upper_density" is less than the maximum value of the density data...\n')
q(save = 'no', status = 1)
}
################################################################################################################################################
# Flow-density empirical fundamental diagrams
# If the EFD data are flow-density
cat('\n')
cat('Calling the required R module for performing the fit...\n')
if (fd_type == 'Flow.Density') {
# Load the required R module for performing the fit
tryCatch(
{ source(file.path(path_to_modules, paste0('fit_flow_density_with_', functional_form_model, '_', noise_model, '.R'))) },
error = function(cond) { cat('ERROR - Failed to load the required R module...\n')
q(save = 'no', status = 1) }
)
# Fit the chosen GAMLSS model to the flow-density EFD data
tryCatch(
{ fit_function = get(paste0('fit_flow_density_with_', functional_form_model, '_', noise_model))
model_obj = fit_function(traffic_data, ngrid, upper_density, output_files) },
error = function(cond) { cat('ERROR - Failed to fit the GAMLSS model for unknown reasons...\n')
remove_file_list(output_files)
q(save = 'no', status = 1) }
)
################################################################################################################################################
# Speed-density empirical fundamental diagrams
# If the EFD data are speed-density
} else if (fd_type == 'Speed.Density') {
# Load the required R module for performing the fit
tryCatch(
{ source(file.path(path_to_modules, paste0('fit_speed_density_with_', functional_form_model, '_', noise_model, '.R'))) },
error = function(cond) { cat('ERROR - Failed to load the required R module...\n')
q(save = 'no', status = 1) }
)
# Fit the chosen GAMLSS model to the speed-density EFD data
tryCatch(
{ fit_function = get(paste0('fit_speed_density_with_', functional_form_model, '_', noise_model))
model_obj = fit_function(traffic_data, ngrid, upper_density, output_files) },
error = function(cond) { cat('ERROR - Failed to fit the GAMLSS model for unknown reasons...\n')
remove_file_list(output_files)
q(save = 'no', status = 1) }
)
}
# Finish
cat('\n')
cat('Finished!\n')