% bibliography.bib (forked from TikhonJelvis/RL-book)
@book{Brooks1975,
added-at = {2011-02-22T17:47:19.000+0100},
author = {Brooks, Frederick P.},
biburl = {https://www.bibsonomy.org/bibtex/2368f8c335ac4bc492572b7d29efe7753/fsteeg},
interhash = {7f0faa2359c30fc99c4bfc2c203d3ecd},
intrahash = {368f8c335ac4bc492572b7d29efe7753},
keywords = {diss programming},
publisher = {Addison-Wesley},
timestamp = {2011-02-22T17:47:19.000+0100},
title = {The Mythical Man-Month: Essays on Software Engineering},
topics = {Programming/Software Engineering},
year = 1975
}
@book{Sutton1998,
added-at = {2019-07-13T10:11:53.000+0200},
author = {Sutton, Richard S. and Barto, Andrew G.},
biburl = {https://www.bibsonomy.org/bibtex/2f46601cf8b13d39d1378af0d79438b12/lanteunis},
edition = {Second},
interhash = {ac6b144aaec1819919a2fba9f705c852},
intrahash = {f46601cf8b13d39d1378af0d79438b12},
keywords = {},
publisher = {The MIT Press},
timestamp = {2019-07-13T10:11:53.000+0200},
title = {Reinforcement Learning: An Introduction},
url = {http://incompleteideas.net/book/the-book-2nd.html},
year = {2018}
}
@book{puterman2014markov,
added-at = {2017-04-07T12:13:11.000+0200},
author = {Puterman, Martin L.},
biburl = {https://www.bibsonomy.org/bibtex/22e7ac99cd30c4892171e5a7cef1bc7a7/becker},
interhash = {6cec8f775a265d8741171d17e4a4e7d0},
intrahash = {2e7ac99cd30c4892171e5a7cef1bc7a7},
keywords = {inthesis diss markov chain decision process citedby:scholar:count:9594 citedby:scholar:timestamp:2017-4-7},
publisher = {John Wiley \& Sons},
timestamp = {2017-04-07T12:13:11.000+0200},
title = {Markov decision processes: discrete stochastic dynamic programming},
year = 2014
}
@article{journals/jmlr/LagoudakisP03,
added-at = {2019-07-10T00:00:00.000+0200},
author = {Lagoudakis, Michail G. and Parr, Ronald},
biburl = {https://www.bibsonomy.org/bibtex/279090819f413e277dafd7e99c36dc22f/dblp},
ee = {http://jmlr.org/papers/v4/lagoudakis03a.html},
interhash = {80ac6d85410840025987dfdccc3511c9},
intrahash = {79090819f413e277dafd7e99c36dc22f},
journal = {J. Mach. Learn. Res.},
keywords = {dblp},
pages = {1107--1149},
timestamp = {2019-07-11T11:42:01.000+0200},
title = {Least-Squares Policy Iteration.},
url = {http://dblp.uni-trier.de/db/journals/jmlr/jmlr4.html#LagoudakisP03},
volume = 4,
year = 2003
}
@article{BlackScholes1973,
abstract = {If options are correctly priced in the market, it
should not be possible to make sure profits by creating
portfolios of long and short positions in options and
their underlying stocks. Using this principle, a
theoretical valuation formula for options is derived.
Since almost all corporate liabilities can be viewed as
combinations of options, the formula and the analysis
that led to it are also applicable to corporate
liabilities such as common stock, corporate bonds, and
warrants. In particular, the formula can be used to
derive the discount that should be applied to a
corporate bond because of the possibility of default.},
added-at = {2007-06-26T15:08:05.000+0200},
author = {Black, Fischer and Scholes, Myron S.},
biburl = {https://www.bibsonomy.org/bibtex/21b438aef4ace91b31c5b3864af3925ac/gilles.daniel},
comment = {First paper on Black-Scholes options pricing formula
Robert C. Merton and Myron S. Scholes, Nobel Prize
1997, for a new method to determine the value of
derivatives.},
interhash = {29c4e539e1156910620d127dac78c286},
intrahash = {1b438aef4ace91b31c5b3864af3925ac},
journal = {Journal of Political Economy},
keywords = {imported},
number = 3,
pages = {637--654},
timestamp = {2007-06-26T15:08:07.000+0200},
title = {The Pricing of Options and Corporate Liabilities},
volume = 81,
year = 1973
}
@article{Merton1969Portfolio,
author = {Merton, Robert C.},
comment = {Breakthrough paper on Dynamic Control of Portfolio
Allocation and Consumption in continuous time},
journal = {The Review of Economics and Statistics},
keywords = {imported},
number = 3,
pages = {247--257},
title = {Lifetime Portfolio Selection under Uncertainty: The Continuous-Time Case},
publisher = {JSTOR},
url = {https://doi.org/10.2307/1926560},
volume = 51,
year = 1969
}
@book{GVK505893878,
added-at = {2009-08-21T12:21:08.000+0200},
address = {Oxford},
author = {Björk, Tomas},
biburl = {https://www.bibsonomy.org/bibtex/2d3c057251ab3451711800984033d975a/fbw_hannover},
edition = {Second, reprint},
interhash = {d18519febe7a31feb7f81d176833d523},
intrahash = {d3c057251ab3451711800984033d975a},
isbn = {0199271267},
keywords = {Arbitrage Arbitrage-Pricing-Theorie Derivat_<Wertpapier> Derivative_securities Finanzierung Investition Mathematical_models Wahrscheinlichkeitsrechnung},
pagetotal = {XVIII, 466},
ppn_gvk = {505893878},
publisher = {Oxford Univ. Press},
timestamp = {2009-08-21T12:21:19.000+0200},
title = {Arbitrage theory in continuous time},
url = {http://gso.gbv.de/DB=2.1/CMD?ACT=SRCHA&SRT=YOP&IKT=1016&TRM=ppn+505893878&sourceid=fbw_bibsonomy},
year = 2005
}
@book{Hull_10,
added-at = {2020-11-16T00:29:59.000+0100},
author = {Hull, John C.},
biburl = {https://www.bibsonomy.org/bibtex/273a9017b321b013d960b8796363168c2/derek-jones},
edition = {Seventh},
interhash = {e3d5119b2a59ad21c9412869e07dba3e},
intrahash = {73a9017b321b013d960b8796363168c2},
isbn = {978-0-13-260460-4},
keywords = {imported},
month = oct,
publisher = {Pearson},
timestamp = {2020-11-16T00:29:59.000+0100},
title = {Options, Futures, and Other Derivatives},
year = 2010
}
@article{bertsimas1998optimal,
title = {Optimal control of execution costs},
author = {Bertsimas, Dimitris and Lo, Andrew W.},
journal = {Journal of Financial Markets},
volume = {1},
number = {1},
pages = {1--50},
year = {1998},
publisher = {Elsevier}
}
@article{almgren2000optimal,
added-at = {2015-07-21T13:20:40.000+0200},
author = {Almgren, Robert and Chriss, Neil},
biburl = {https://www.bibsonomy.org/bibtex/2d58e08e2359cb1473f9103da68ac5a10/krassi},
interhash = {06ceb6c211f0f1a241e3867d6abce744},
intrahash = {d58e08e2359cb1473f9103da68ac5a10},
journal = {Journal of Risk},
keywords = {impact market},
pages = {5--39},
timestamp = {2015-07-21T13:20:40.000+0200},
title = {Optimal execution of portfolio transactions},
year = 2000
}
@inproceedings{conf/icml/NevmyvakaFK06,
added-at = {2018-11-06T00:00:00.000+0100},
author = {Nevmyvaka, Yuriy and Feng, Yi and Kearns, Michael J.},
biburl = {https://www.bibsonomy.org/bibtex/26a1cd69167152c4ef00ab33ccd5eab6a/dblp},
booktitle = {ICML},
crossref = {conf/icml/2006},
editor = {Cohen, William W. and Moore, Andrew W.},
ee = {https://doi.org/10.1145/1143844.1143929},
interhash = {1959e6c9644f29e09724aff0b667be6c},
intrahash = {6a1cd69167152c4ef00ab33ccd5eab6a},
isbn = {1-59593-383-2},
keywords = {dblp},
pages = {673--680},
publisher = {ACM},
series = {ACM International Conference Proceeding Series},
timestamp = {2019-11-20T11:43:16.000+0100},
title = {Reinforcement learning for optimized trade execution.},
url = {http://dblp.uni-trier.de/db/conf/icml/icml2006.html#NevmyvakaFK06},
volume = 148,
year = 2006
}
@article{journals/corr/abs-1906-02312,
added-at = {2019-06-14T00:00:00.000+0200},
author = {Vyetrenko, Svitlana and Xu, Shaojie},
biburl = {https://www.bibsonomy.org/bibtex/2523e1e8959fabad6139b868682bbf352/dblp},
ee = {http://arxiv.org/abs/1906.02312},
interhash = {a97e6b4031a5a79cfee71e4b670877a7},
intrahash = {523e1e8959fabad6139b868682bbf352},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-06-15T11:38:52.000+0200},
title = {Risk-Sensitive Compact Decision Trees for Autonomous Execution in Presence of Simulated Market Response.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1906.html#abs-1906-02312},
volume = {abs/1906.02312},
year = 2019
}
@article{Avellaneda2008,
added-at = {2008-04-23T19:09:19.000+0200},
author = {Avellaneda, Marco and Stoikov, Sasha},
biburl = {https://www.bibsonomy.org/bibtex/23a2d90e453d27af63c6544d3bc721664/smicha},
interhash = {c64f1b610a80b7d4042eadb05ee5c5cd},
intrahash = {3a2d90e453d27af63c6544d3bc721664},
issn = {1469-7688},
journal = {Quantitative Finance},
keywords = {imported},
number = 3,
pages = {217--224},
publisher = {Routledge},
timestamp = {2008-04-23T19:09:20.000+0200},
title = {High-frequency trading in a limit order book},
url = {http://www.informaworld.com/10.1080/14697680701381228},
volume = 8,
year = 2008
}
@article{journals/corr/abs-1804-04216,
added-at = {2018-08-13T00:00:00.000+0200},
author = {Spooner, Thomas and Fearnley, John and Savani, Rahul and Koukorinis, Andreas},
biburl = {https://www.bibsonomy.org/bibtex/23e8b6878e6a2246539d5a5ac61fb3ea0/dblp},
ee = {http://arxiv.org/abs/1804.04216},
interhash = {e6bdd859aad86a9441a2c486e472d8c1},
intrahash = {3e8b6878e6a2246539d5a5ac61fb3ea0},
journal = {CoRR},
keywords = {dblp},
timestamp = {2018-08-14T13:54:22.000+0200},
title = {Market Making via Reinforcement Learning.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1804.html#abs-1804-04216},
volume = {abs/1804.04216},
year = 2018
}
@article{journals/corr/abs-1911-05892,
added-at = {2019-12-04T00:00:00.000+0100},
author = {Ganesh, Sumitra and Vadori, Nelson and Xu, Mengda and Zheng, Hua and Reddy, Prashant P. and Veloso, Manuela},
biburl = {https://www.bibsonomy.org/bibtex/236bc4e65153fef4b2ec91572bfc6761c/dblp},
ee = {http://arxiv.org/abs/1911.05892},
interhash = {ec668c190bf7e2e060b7e937112d67e2},
intrahash = {36bc4e65153fef4b2ec91572bfc6761c},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-12-05T11:38:37.000+0100},
title = {Reinforcement Learning for Market Making in a Multi-agent Dealer Market.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1911.html#abs-1911-05892},
volume = {abs/1911.05892},
year = 2019
}
@article{LongstaffSchwartz2001,
author = {Longstaff, Francis A. and Schwartz, Eduardo S.},
title = {Valuing American Options by Simulation: A Simple Least-Squares Approach},
journal = {Review of Financial Studies},
year = {2001},
volume = {14},
number = {1},
pages = {113--147},
doi = {10.1093/rfs/14.1.113}
}
@inproceedings{li2009,
abstract = {Options are important instruments in modern finance. In this paper, we investigate reinforcement learning (RL) methods---in particular, least-squares policy iteration (LSPI)---for the problem of learning exercise policies for American options. We develop finite-time bounds on the performance of the policy obtained with LSPI and compare LSPI and the fitted Q-iteration algorithm (FQI) with the Longstaff-Schwartz method (LSM), the standard least-squares Monte Carlo algorithm from the finance community. Our empirical results show that the exercise policies discovered by LSPI and FQI gain larger payoffs than those discovered by LSM, on both real and synthetic data. Furthermore, we find that for all methods the policies learned from real data generally gain similar payoffs to the policies learned from simulated data. Our work shows that solution methods developed in machine learning can advance the state-of-the-art in an important and challenging application area, while demonstrating that computational finance remains a promising area for future applications of machine learning methods.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Li, Y. and Szepesv{\'a}ri, {Cs}. and Schuurmans, D.},
bdsk-url-1 = {http://www.ics.uci.edu/~aistats/},
biburl = {https://www.bibsonomy.org/bibtex/274d9725222761918a15eba8ae22a5a7d/csaba},
booktitle = {AISTATS},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2015-08-02 01:02:54 +0000},
interhash = {a1207f07eec4066764db498ab11ce1e8},
intrahash = {74d9725222761918a15eba8ae22a5a7d},
keywords = {application finance, learning, reinforcement theory,},
pages = {352--359},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v5/li09d/li09d.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Learning Exercise Policies for {A}merican Options},
url = {http://www.ics.uci.edu/~aistats/},
volume = 5,
year = 2009
}
@inproceedings{sutton2001policy,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Sutton, R. and McAllester, D. and Singh, S. and Mansour, Y.},
biburl = {https://www.bibsonomy.org/bibtex/20b06d9bf0e170dd47a2d380ee8563426/schaul},
booktitle = {Advances in Neural Information Processing Systems 12 (Proceedings of the 1999 conference)},
citeulike-article-id = {2374752},
description = {idsia},
interhash = {879988d59ea02a1f3c5ec1ba5f545ba8},
intrahash = {0b06d9bf0e170dd47a2d380ee8563426},
keywords = {daanbib},
pages = {1057--1063},
priority = {2},
publisher = {MIT Press},
timestamp = {2008-02-26T12:07:03.000+0100},
title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
year = 2000
}
@inproceedings{conf/nips/Kakade01,
added-at = {2020-03-12T00:00:00.000+0100},
author = {Kakade, Sham M.},
biburl = {https://www.bibsonomy.org/bibtex/29afe616246c2e9648d0367bfbfd507d5/dblp},
booktitle = {NIPS},
crossref = {conf/nips/2001},
editor = {Dietterich, Thomas G. and Becker, Suzanna and Ghahramani, Zoubin},
ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/CN11.ps.gz},
interhash = {10e8de9f8d2c747e392750b8164c7489},
intrahash = {9afe616246c2e9648d0367bfbfd507d5},
keywords = {dblp},
pages = {1531--1538},
publisher = {MIT Press},
timestamp = {2020-03-13T12:44:48.000+0100},
title = {A Natural Policy Gradient.},
url = {http://dblp.uni-trier.de/db/conf/nips/nips2001.html#Kakade01},
year = 2001
}
@article{amari_natural_1998,
added-at = {2014-04-15T13:06:00.000+0200},
author = {Amari, S.},
biburl = {https://www.bibsonomy.org/bibtex/204cab5fc779db34f662a2ae0f25e96ad/wittawatj},
interhash = {a5cad2a0bad7028a732ae79e9fa6a4b2},
intrahash = {04cab5fc779db34f662a2ae0f25e96ad},
journal = {Neural Computation},
keywords = {daanbib},
number = 2,
pages = {251--276},
timestamp = {2014-04-15T13:06:00.000+0200},
title = {Natural Gradient Works Efficiently in Learning},
volume = 10,
year = 1998
}
@inproceedings{conf/icml/SilverLHDWR14,
added-at = {2019-05-29T00:00:00.000+0200},
author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin A.},
biburl = {https://www.bibsonomy.org/bibtex/2e2fb52847293919f2e6c88fc8c9eee9b/dblp},
booktitle = {ICML},
crossref = {conf/icml/2014},
ee = {http://proceedings.mlr.press/v32/silver14.html},
interhash = {938059a9d0f391a7c3763bcf9788afb1},
intrahash = {e2fb52847293919f2e6c88fc8c9eee9b},
keywords = {dblp},
pages = {387--395},
publisher = {JMLR.org},
series = {JMLR Workshop and Conference Proceedings},
timestamp = {2019-05-30T11:54:07.000+0200},
title = {Deterministic Policy Gradient Algorithms.},
url = {http://dblp.uni-trier.de/db/conf/icml/icml2014.html#SilverLHDWR14},
volume = 32,
year = 2014
}
@article{salimans2017evolution,
title={Evolution strategies as a scalable alternative to reinforcement learning},
author={Salimans, Tim and Ho, Jonathan and Chen, Xi and Sidor, Szymon and Sutskever, Ilya},
journal={arXiv preprint arXiv:1703.03864},
year={2017}
}
@article{lai-allocation,
added-at = {2007-07-05T16:17:35.000+0200},
author = {Lai, T.L. and Robbins, H.},
biburl = {https://www.bibsonomy.org/bibtex/243d5e28aa6ae3446e548319c7f964b7f/jleny},
description = {bandit problems},
interhash = {c33edf59c35ee99dbaa6f1ce8835b782},
intrahash = {43d5e28aa6ae3446e548319c7f964b7f},
journal = {Advances in Applied Mathematics},
keywords = {imported},
pages = {4--22},
timestamp = {2007-07-05T16:17:37.000+0200},
title = {Asymptotically Efficient Adaptive Allocation Rules},
volume = 6,
year = 1985
}
@article{Russo_2018,
added-at = {2018-07-27T09:01:12.000+0200},
author = {Russo, Daniel J. and Van Roy, Benjamin and Kazerouni, Abbas and Osband, Ian and Wen, Zheng},
biburl = {https://www.bibsonomy.org/bibtex/299abe90b21844b84f5ad6c6b7eb08f72/analyst},
description = {now publishers - A Tutorial on Thompson Sampling},
doi = {10.1561/2200000070},
interhash = {8e531ba267f78a2053e3adea43f938f6},
intrahash = {99abe90b21844b84f5ad6c6b7eb08f72},
journal = {Foundations and Trends{\textregistered} in Machine Learning},
keywords = {2018 book probability reinforcement-learning tutorial},
number = 1,
pages = {1--96},
publisher = {Now Publishers},
timestamp = {2018-07-27T09:01:12.000+0200},
title = {A Tutorial on Thompson Sampling},
url = {https://doi.org/10.1561/2200000070},
volume = 11,
year = 2018
}
@inproceedings{conf/nips/GuezHSD14,
added-at = {2020-03-06T00:00:00.000+0100},
author = {Guez, Arthur and Heess, Nicolas and Silver, David and Dayan, Peter},
biburl = {https://www.bibsonomy.org/bibtex/25ad8c1dda953f92e25cebd2be90910e9/dblp},
booktitle = {NIPS},
crossref = {conf/nips/2014},
editor = {Ghahramani, Zoubin and Welling, Max and Cortes, Corinna and Lawrence, Neil D. and Weinberger, Kilian Q.},
ee = {http://papers.nips.cc/paper/5501-bayes-adaptive-simulation-based-search-with-value-function-approximation},
interhash = {50f820f1e229e0219ac03fd626067f6a},
intrahash = {5ad8c1dda953f92e25cebd2be90910e9},
keywords = {dblp},
pages = {451--459},
timestamp = {2020-03-07T11:48:07.000+0100},
title = {Bayes-Adaptive Simulation-based Search with Value Function Approximation.},
url = {http://dblp.uni-trier.de/db/conf/nips/nips2014.html#GuezHSD14},
year = 2014
}
@inproceedings{conf/ijcai/BrafmanT01,
added-at = {2003-05-23T00:00:00.000+0200},
author = {Brafman, Ronen I. and Tennenholtz, Moshe},
biburl = {https://www.bibsonomy.org/bibtex/25aa5f438073e0f0dbb85194f5d714f7d/dblp},
booktitle = {IJCAI},
crossref = {conf/ijcai/2001},
editor = {Nebel, Bernhard},
ee = {http://ijcai.org/proceedings/2001-2},
interhash = {5997357756eb4e1585e12d737a39852d},
intrahash = {5aa5f438073e0f0dbb85194f5d714f7d},
isbn = {1-55860-777-3},
keywords = {dblp},
pages = {953--958},
publisher = {Morgan Kaufmann},
timestamp = {2019-08-21T11:49:29.000+0200},
title = {R-MAX - A General Polynomial Time Algorithm for Near-Optimal Reinforcement Learning.},
url = {http://dblp.uni-trier.de/db/conf/ijcai/ijcai2001.html#BrafmanT01},
year = 2001
}
@article{journals/sigart/Sutton91,
added-at = {2020-05-19T00:00:00.000+0200},
author = {Sutton, Richard S.},
biburl = {https://www.bibsonomy.org/bibtex/2281eb5d631b9b8c16d66e976055f89a6/dblp},
ee = {https://doi.org/10.1145/122344.122377},
interhash = {51bf73485ee40e297f5c9d5bc5dad04f},
intrahash = {281eb5d631b9b8c16d66e976055f89a6},
journal = {SIGART Bull.},
keywords = {dblp},
number = 4,
pages = {160--163},
timestamp = {2020-05-20T11:40:24.000+0200},
title = {Dyna, an Integrated Architecture for Learning, Planning, and Reacting.},
url = {http://dblp.uni-trier.de/db/journals/sigart/sigart2.html#Sutton91},
volume = 2,
year = 1991
}
@article{mnih2013atari,
abstract = {We present the first deep learning model to successfully learn control
policies directly from high-dimensional sensory input using reinforcement
learning. The model is a convolutional neural network, trained with a variant
of Q-learning, whose input is raw pixels and whose output is a value function
estimating future rewards. We apply our method to seven Atari 2600 games from
the Arcade Learning Environment, with no adjustment of the architecture or
learning algorithm. We find that it outperforms all previous approaches on six
of the games and surpasses a human expert on three of them.},
added-at = {2019-07-11T17:41:01.000+0200},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
biburl = {https://www.bibsonomy.org/bibtex/2a00ec4c09f5dc9b3f8a1836f4e02bb5d/lanteunis},
description = {[1312.5602] Playing Atari with Deep Reinforcement Learning},
interhash = {78966703f649bae69a08a6a23a4e8879},
intrahash = {a00ec4c09f5dc9b3f8a1836f4e02bb5d},
keywords = {DRLAlgoComparison dqn final reinforcement thema:reinforcement_learning_recommender},
note = {arXiv:1312.5602. Comment: NIPS Deep Learning Workshop 2013},
timestamp = {2019-12-16T21:10:09.000+0100},
title = {Playing Atari with Deep Reinforcement Learning},
url = {http://arxiv.org/abs/1312.5602},
year = 2013
}
@article{mnih2015humanlevel,
abstract = {An artificial agent is developed that learns to play a diverse range of classic Atari 2600 computer games directly from sensory experience, achieving a performance comparable to that of an expert human player; this work paves the way to building general-purpose learning algorithms that bridge the divide between perception and action.},
added-at = {2020-03-25T21:22:39.000+0100},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
biburl = {https://www.bibsonomy.org/bibtex/2fb15f4471c81dc2b9edf2304cb2f7083/cpankow},
description = {Human-level control through deep reinforcement learning | Nature},
doi = {10.1038/nature14236},
interhash = {eac59980357d99db87b341b61ef6645f},
intrahash = {fb15f4471c81dc2b9edf2304cb2f7083},
issn = {14764687},
journal = {Nature},
keywords = {machinelearning neuralnetwork reinforcementlearning},
number = 7540,
pages = {529--533},
refid = {Mnih2015},
timestamp = {2020-03-25T21:22:39.000+0100},
title = {Human-level control through deep reinforcement learning},
url = {https://doi.org/10.1038/nature14236},
volume = 518,
year = 2015
}
@article{silver2016mastering,
abstract = {The game of Go has long been viewed as the most challenging of classic games for artificial intelligence owing to its enormous search space and the difficulty of evaluating board positions and moves. Here we introduce a new approach to computer Go that uses ‘value networks’ to evaluate board positions and ‘policy networks’ to select moves. These deep neural networks are trained by a novel combination of supervised learning from human expert games, and reinforcement learning from games of self-play. Without any lookahead search, the neural networks play Go at the level of state-of-the-art Monte Carlo tree search programs that simulate thousands of random games of self-play. We also introduce a new search algorithm that combines Monte Carlo simulation with value and policy networks. Using this search algorithm, our program AlphaGo achieved a 99.8% winning rate against other Go programs, and defeated the human European Go champion by 5 games to 0. This is the first time that a computer program has defeated a human professional player in the full-sized game of Go, a feat previously thought to be at least a decade away.},
added-at = {2021-03-08T10:50:11.000+0100},
author = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and Dieleman, Sander and Grewe, Dominik and Nham, John and Kalchbrenner, Nal and Sutskever, Ilya and Lillicrap, Timothy and Leach, Madeleine and Kavukcuoglu, Koray and Graepel, Thore and Hassabis, Demis},
biburl = {https://www.bibsonomy.org/bibtex/2e3bd772f62209ea8283e242f993d3edf/analyst},
description = {Mastering the game of Go with deep neural networks and tree search | Nature},
doi = {10.1038/nature16961},
interhash = {48430c7891aaf9fe2582faa8f5d076c1},
intrahash = {e3bd772f62209ea8283e242f993d3edf},
issn = {14764687},
journal = {Nature},
keywords = {2016 deep-learning go nature paper reinforcement-learning},
number = 7587,
pages = {484--489},
refid = {Silver2016},
timestamp = {2021-03-08T10:50:11.000+0100},
title = {Mastering the game of Go with deep neural networks and tree search},
url = {https://doi.org/10.1038/nature16961},
volume = 529,
year = 2016
}
@inproceedings{conf/cg/Coulom06,
added-at = {2017-05-17T00:00:00.000+0200},
author = {Coulom, Rémi},
biburl = {https://www.bibsonomy.org/bibtex/27b82a435e6fb9cd3dc12367366ef0641/dblp},
booktitle = {Computers and Games},
crossref = {conf/cg/2006},
editor = {van den Herik, H. Jaap and Ciancarini, Paolo and Donkers, H. H. L. M.},
ee = {https://doi.org/10.1007/978-3-540-75538-8_7},
interhash = {33f87d0cbbc96923fb95cee26daaee4a},
intrahash = {7b82a435e6fb9cd3dc12367366ef0641},
isbn = {978-3-540-75537-1},
keywords = {dblp},
pages = {72-83},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
timestamp = {2019-05-15T13:26:49.000+0200},
title = {Efficient Selectivity and Backup Operators in Monte-Carlo Tree Search.},
url = {http://dblp.uni-trier.de/db/conf/cg/cg2006.html#Coulom06},
volume = 4630,
year = 2006
}
@inproceedings{kocsis2006a,
added-at = {2020-03-17T03:03:01.000+0100},
author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}.},
biburl = {https://www.bibsonomy.org/bibtex/286adc7a187cd2dbefe68d09d23a948e5/csaba},
booktitle = {ECML},
crossref = {ECML06},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:57:10 -0700},
interhash = {e9621505f16a5c5b13a8b23955c39fd5},
intrahash = {86adc7a187cd2dbefe68d09d23a948e5},
keywords = {Monte-Carlo UCT, bandits games, in learning learning, methods, reinforcement search, tree},
pages = {282--293},
pdf = {papers/ecml06.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Bandit based {M}onte-{C}arlo Planning},
year = 2006
}
@article{Auer2002,
abstract = {Reinforcement learning policies face the exploration versus exploitation dilemma, i.e. the search for a balance between exploring the environment to find profitable actions while taking the empirically best action as often as possible. A popular measure of a policy's success in addressing this dilemma is the regret, that is the loss due to the fact that the globally optimal policy is not followed all the times. One of the simplest examples of the exploration/exploitation dilemma is the multi-armed bandit problem. Lai and Robbins were the first ones to show that the regret for this problem has to grow at least logarithmically in the number of plays. Since then, policies which asymptotically achieve this regret have been devised by Lai and Robbins and many others. In this work we show that the optimal logarithmic regret is also achievable uniformly over time, with simple and efficient policies, and for all reward distributions with bounded support.},
added-at = {2021-03-13T09:12:07.000+0100},
author = {Auer, Peter and Cesa-Bianchi, Nicol{\`o} and Fischer, Paul},
biburl = {https://www.bibsonomy.org/bibtex/2c1b9ac34e95a12dcaca2791593d442f7/analyst},
day = 01,
description = {Finite-time Analysis of the Multiarmed Bandit Problem | SpringerLink},
doi = {10.1023/A:1013689704352},
interhash = {d9ed352509aceb102cbb43c4127a7b30},
intrahash = {c1b9ac34e95a12dcaca2791593d442f7},
issn = {1573-0565},
journal = {Machine Learning},
keywords = {2002 reinforcement-learning},
month = may,
number = 2,
pages = {235--256},
timestamp = {2021-03-13T09:12:07.000+0100},
title = {Finite-time Analysis of the Multiarmed Bandit Problem},
url = {https://doi.org/10.1023/A:1013689704352},
volume = 47,
year = 2002
}
@article{journals/ior/ChangFHM05,
added-at = {2018-11-14T00:00:00.000+0100},
author = {Chang, Hyeong Soo and Fu, Michael C. and Hu, Jiaqiao and Marcus, Steven I.},
biburl = {https://www.bibsonomy.org/bibtex/2644483341bcebfc39ea4aa952a4660ea/dblp},
ee = {https://www.wikidata.org/entity/Q57382677},
interhash = {164ee491ebc54ca22e40986ab8bdf89d},
intrahash = {644483341bcebfc39ea4aa952a4660ea},
journal = {Operations Research},
keywords = {dblp},
number = 1,
pages = {126--139},
timestamp = {2018-11-15T14:33:31.000+0100},
title = {An Adaptive Sampling Algorithm for Solving Markov Decision Processes.},
url = {http://dblp.uni-trier.de/db/journals/ior/ior53.html#ChangFHM05},
volume = 53,
year = 2005
}
@article{journals/corr/abs-1812-02648,
added-at = {2019-01-01T00:00:00.000+0100},
author = {van Hasselt, Hado and Doron, Yotam and Strub, Florian and Hessel, Matteo and Sonnerat, Nicolas and Modayil, Joseph},
biburl = {https://www.bibsonomy.org/bibtex/23cbd476a62eb476f3aa231be93236ce3/dblp},
ee = {http://arxiv.org/abs/1812.02648},
interhash = {14287660ba0291fb10822fd2fc8b7dcd},
intrahash = {3cbd476a62eb476f3aa231be93236ce3},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-01-02T11:37:27.000+0100},
title = {Deep Reinforcement Learning and the Deadly Triad.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1812.html#abs-1812-02648},
volume = {abs/1812.02648},
year = 2018
}
@book{Bellman1957,
added-at = {2021-02-01T10:51:23.000+0100},
address = {Princeton, NJ, USA},
author = {Bellman, Richard},
bib2html_rescat = {General RL},
biburl = {https://www.bibsonomy.org/bibtex/29cdd821222218ded252c8ba5cd712666/m-toman},
edition = 1,
interhash = {acf948462171ca060064a7ded257a792},
intrahash = {9cdd821222218ded252c8ba5cd712666},
keywords = {imported},
publisher = {Princeton University Press},
timestamp = {2021-02-01T10:51:23.000+0100},
title = {Dynamic Programming},
year = 1957
}
@article{bellman1957markovian,
added-at = {2017-04-07T12:00:35.000+0200},
author = {Bellman, Richard},
biburl = {https://www.bibsonomy.org/bibtex/2c04c5f89b4e8445651eded5b56c67342/becker},
interhash = {d7aa065c075b248c9980b4c45d635b66},
intrahash = {c04c5f89b4e8445651eded5b56c67342},
journal = {Journal of Mathematics and Mechanics},
keywords = {chain citedby:scholar:count:987 citedby:scholar:timestamp:2017-4-7 decision diss inthesis markov process},
number = 5,
pages = {679--684},
timestamp = {2017-12-20T14:47:54.000+0100},
title = {A Markovian decision process},
url = {http://www.jstor.org/stable/24900506},
volume = 6,
year = 1957
}
@book{howard:dp,
added-at = {2008-02-26T11:58:58.000+0100},
address = {Cambridge, MA},
author = {Howard, R. A.},
biburl = {https://www.bibsonomy.org/bibtex/28b55f737ee6dd7800ffc7952a33bb6bd/schaul},
citeulike-article-id = {2380352},
description = {idsia},
interhash = {7eed9f4f6bd1f9ee063d80d0f732e48f},
intrahash = {8b55f737ee6dd7800ffc7952a33bb6bd},
keywords = {inaki},
priority = {2},
publisher = {MIT Press},
timestamp = {2008-02-26T12:01:06.000+0100},
title = {Dynamic Programming and Markov Processes},
year = 1960
}
@book{Gagniuc2017MarkovCF,
title={Markov Chains: From Theory to Implementation and Experimentation},
author={Gagniuc, Paul A.},
year={2017},
publisher = {John Wiley \& Sons}
}
@article{ASTROM1965174,
title = {Optimal control of Markov processes with incomplete state information},
journal = {Journal of Mathematical Analysis and Applications},
volume = {10},
number = {1},
pages = {174--205},
year = {1965},
issn = {0022-247X},
doi = {10.1016/0022-247X(65)90154-X},
url = {https://www.sciencedirect.com/science/article/pii/0022247X6590154X},
author = {Åström, K. J.}
}
@book{krishnamurthy_2016,
address={Cambridge},
title={Partially Observed Markov Decision Processes: From Filtering to Controlled Sensing},
doi={10.1017/CBO9781316471104},
publisher={Cambridge University Press},
author={Krishnamurthy, Vikram},
year={2016}
}
@inproceedings{4047044,
author={Bertsekas, Dimitri P.},
booktitle={1981 20th IEEE Conference on Decision and Control including the Symposium on Adaptive Processes},
title={Distributed dynamic programming},
year={1981},
pages={774--779},
doi={10.1109/CDC.1981.269319}
}
@article{Bertsekas1983DistributedAC,
title={Distributed asynchronous computation of fixed points},
author={Bertsekas, Dimitri P.},
journal={Mathematical Programming},
year={1983},
volume={27},
pages={107--120}
}
@book{books/lib/Bertsekas05,
added-at = {2020-07-17T00:00:00.000+0200},
author = {Bertsekas, Dimitri P.},
biburl = {https://www.bibsonomy.org/bibtex/287e129b4ba44f19590da9ec79f2c46c3/dblp},
ee = {https://www.worldcat.org/oclc/314894080},
interhash = {95914d535bf600768af9ce7c4dd8ab76},
intrahash = {87e129b4ba44f19590da9ec79f2c46c3},
isbn = {1886529264},
keywords = {dblp},
pages = {I-XV, 1-543},
publisher = {Athena Scientific},
timestamp = {2020-07-24T00:45:12.000+0200},
title = {Dynamic Programming and Optimal Control, Volume 1, 3rd Edition.},
year = 2005
}
@book{books/lib/Bertsekas12,
author = {Bertsekas, Dimitri P.},
keywords = {dblp},
publisher = {Athena Scientific},
title = {Dynamic Programming and Optimal Control, Volume 2: Approximate Dynamic Programming},
year = 2012
}
@book{BertsekasTsitsiklis96,
added-at = {2008-09-16T23:39:07.000+0200},
address = {Belmont, MA},
author = {Bertsekas, D. P. and Tsitsiklis, J. N.},
biburl = {https://www.bibsonomy.org/bibtex/219e84aaacaf689cde6190fe14ba5a337/brian.mingus},
description = {CCNLab BibTeX},
interhash = {2fbe138b7b864bc58d95999e69b5d45b},
intrahash = {19e84aaacaf689cde6190fe14ba5a337},
keywords = {nnets},
publisher = {Athena Scientific},
timestamp = {2008-09-16T23:39:21.000+0200},
title = {Neuro-Dynamic Programming},
year = 1996
}
@article{KinBa17,
added-at = {2018-02-28T16:10:01.000+0100},
author = {Kingma, Diederik P. and Ba, Jimmy},
biburl = {https://www.bibsonomy.org/bibtex/23b0328784dbfce338ba0dd2618a7a059/loroch},
ee = {http://arxiv.org/abs/1412.6980},
interhash = {57d2ac873f398f21bb94790081e80394},
intrahash = {3b0328784dbfce338ba0dd2618a7a059},
journal = {CoRR},
keywords = {deep_learning gradient_descend methods momentum optimization training},
timestamp = {2018-02-28T16:10:01.000+0100},
title = {Adam: A Method for Stochastic Optimization.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1412.html#KingmaB14},
volume = {abs/1412.6980},
year = 2014
}
@book{Goodfellow-et-al-2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\url{http://www.deeplearningbook.org}},
year={2016}
}
@book{GVK266386229,
added-at = {2009-08-21T15:23:22.000+0200},
address = {Dordrecht},
biburl = {https://www.bibsonomy.org/bibtex/297c6ee9c85f27ecf0a8f296f488da9f9/fbw_hannover},
editor = {Barberà, Salvador and Seidl, Christian and Hammond, Peter J.},
interhash = {a9beb3cbe91a091036d161d99081ef75},
intrahash = {97c6ee9c85f27ecf0a8f296f488da9f9},
isbn = {0792381742},
keywords = {Mikroökonomie Nutzentheorie Theorie Theorie_der_Wirtschaftspolitik Utility-Theorie Wohlfahrtstheorie},
pagetotal = {VII, 684},
ppn_gvk = {266386229},
publisher = {Kluwer},
volume = {1},
timestamp = {2009-08-21T15:23:23.000+0200},
title = {Handbook of utility theory},
url = {http://gso.gbv.de/DB=2.1/CMD?ACT=SRCHA&SRT=YOP&IKT=1016&TRM=ppn+266386229&sourceid=fbw_bibsonomy},
year = 1998
}
@misc{buhler2018hedging,
abstract = {We present a framework for hedging a portfolio of derivatives in the presence
of market frictions such as transaction costs, market impact, liquidity
constraints or risk limits using modern deep reinforcement machine learning
methods.
We discuss how standard reinforcement learning methods can be applied to
non-linear reward structures, i.e. in our case convex risk measures. As a
general contribution to the use of deep learning for stochastic processes, we
also show that the set of constrained trading strategies used by our algorithm
is large enough to $\epsilon$-approximate any optimal solution.
Our algorithm can be implemented efficiently even in high-dimensional
situations using modern machine learning tools. Its structure does not depend
on specific market dynamics, and generalizes across hedging instruments
including the use of liquid derivatives. Its computational performance is
largely invariant in the size of the portfolio as it depends mainly on the
number of hedging instruments available.
We illustrate our approach by showing the effect on hedging under transaction
costs in a synthetic market driven by the Heston model, where we outperform the
standard "complete market" solution.},
added-at = {2019-05-30T18:29:52.000+0200},
author = {Bühler, Hans and Gonon, Lukas and Teichmann, Josef and Wood, Ben},
biburl = {https://www.bibsonomy.org/bibtex/2849f2e8b1df10751e8304143ea692fdb/nonancourt},
description = {Deep Hedging},
interhash = {fb505f924756bb0ca0b033b313848147},
intrahash = {849f2e8b1df10751e8304143ea692fdb},
keywords = {deep-learning finance},
note = {arXiv:1802.03042},
timestamp = {2019-05-30T18:29:52.000+0200},
title = {Deep Hedging},
url = {http://arxiv.org/abs/1802.03042},
year = 2018
}
@book{Gueant2016,
title={The Financial Mathematics of Market Liquidity: From Optimal Execution to Market Making},
author={Guéant, Olivier},
year={2016},
publisher = {Chapman and Hall/CRC},
series = {Chapman and Hall/CRC Financial Mathematics Series}
}
@phdthesis{Watkins:89,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Watkins, C. J. C. H.},
biburl = {https://www.bibsonomy.org/bibtex/21ffd549077ea1da7675431a17fa2af03/schaul},
citeulike-article-id = {2381652},
description = {idsia},
interhash = {ca824d64b71939208358edb4a26f8351},
intrahash = {1ffd549077ea1da7675431a17fa2af03},
keywords = {juergen},
priority = {2},
school = {King's College, Cambridge},
timestamp = {2008-02-26T11:59:46.000+0100},
title = {Learning from Delayed Rewards},
year = 1989
}
@book{klopf1972brain,
title={Brain Function and Adaptive Systems--a Heterostatic Theory},
author={Klopf, A.H. and Air Force Cambridge Research Laboratories (U.S.). Data Sciences Laboratory},
series={Special reports},
url={https://books.google.com/books?id=C2hztwEACAAJ},
year={1972},
publisher={Data Sciences Laboratory, Air Force Cambridge Research Laboratories, Air Force Systems Command, United States Air Force}
}
@techreport{rummery:tech94,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Rummery, G. A. and Niranjan, M.},
biburl = {https://www.bibsonomy.org/bibtex/2dbfba0b20bace9085789f4f479f6111f/idsia},
citeulike-article-id = {2380290},
institution = {Engineering Department, Cambridge University},
interhash = {0c7cd3821ad0fe1b39a6ce1b35ec4bc0},
intrahash = {dbfba0b20bace9085789f4f479f6111f},
keywords = {inaki},
number = {CUED/F-INFENG/TR-166},
priority = {2},
timestamp = {2008-03-11T14:56:18.000+0100},
title = {On-line {Q}-learning using connectionist systems},
year = 1994
}
@article{10.1214/aoms/1177729893,
author = {Sherman, Jack and Morrison, Winifred J.},
title = {{Adjustment of an Inverse Matrix Corresponding to a Change in One Element of a Given Matrix}},
volume = {21},
journal = {The Annals of Mathematical Statistics},
number = {1},
publisher = {Institute of Mathematical Statistics},
pages = {124--127},
year = {1950},
doi = {10.1214/aoms/1177729893},
URL = {https://doi.org/10.1214/aoms/1177729893}
}
@article{journals/ml/BradtkeB96,
added-at = {2020-03-02T00:00:00.000+0100},
author = {Bradtke, Steven J. and Barto, Andrew G.},
biburl = {https://www.bibsonomy.org/bibtex/2d6c05c943a95b78845e2765a69cb4cc9/dblp},
ee = {https://www.wikidata.org/entity/Q56095426},
interhash = {d49c55128e85ec3a2882ba148c8db33f},
intrahash = {d6c05c943a95b78845e2765a69cb4cc9},
journal = {Mach. Learn.},
keywords = {dblp},
number = {1-3},
pages = {33--57},
timestamp = {2020-03-03T11:49:59.000+0100},
title = {Linear Least-Squares Algorithms for Temporal Difference Learning.},
url = {http://dblp.uni-trier.de/db/journals/ml/ml22.html#BradtkeB96},
volume = 22,
year = 1996
}
@phdthesis{lin:phd,
added-at = {2008-03-11T14:52:34.000+0100},
address = {Pittsburgh},
author = {Lin, Long-Ji},
biburl = {https://www.bibsonomy.org/bibtex/215f83604aa0fe71e484b319a4bf434a4/idsia},
citeulike-article-id = {2380251},
interhash = {b312cef919452127612baf1fe7ac3382},
intrahash = {15f83604aa0fe71e484b319a4bf434a4},
keywords = {inaki},
priority = {2},
school = {Carnegie Mellon University},
timestamp = {2008-03-11T14:56:22.000+0100},
title = {Reinforcement Learning for Robots Using Neural Networks},
year = 1993
}
@inproceedings{Baird:95,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Baird, Leemon C.},
biburl = {https://www.bibsonomy.org/bibtex/2f421da3046f64fb46524e23fa82bc9e6/idsia},
booktitle = {International Conference on Machine Learning},
citeulike-article-id = {2374989},
interhash = {cab67e1db86772844efd1e5d94731806},
intrahash = {f421da3046f64fb46524e23fa82bc9e6},
keywords = {imported},
pages = {30--37},
priority = {2},
timestamp = {2008-03-11T15:05:44.000+0100},
title = {Residual Algorithms: Reinforcement Learning with Function Approximation},
url = {http://citeseer.ist.psu.edu/baird95residual.html},
year = 1995
}
@inproceedings{sutton2008,
abstract = {We introduce the first temporal-difference learning algorithm that is stable with linear function approximation and off-policy training, for any finite Markov decision process, behavior policy, and target policy, and whose complexity scales linearly in the number of parameters. We consider an i.i.d. policy-evaluation setting in which the data need not come from on-policy experience. The gradient temporal-difference (GTD) algorithm estimates the expected update vector of the TD(0) algorithm and performs stochastic gradient descent on its L2 norm. We prove that this algorithm is stable and convergent under the usual stochastic approximation conditions to the same least-squares solution as found by the LSTD, but without LSTD's quadratic computational complexity. GTD is online and incremental, and does not involve multiplying by products of likelihood ratios as in importance-sampling methods.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Sutton, R.S. and Szepesv{\'a}ri, {Cs}. and Maei, H.R.},
bibsource = {DBLP, http://dblp.uni-trier.de},
biburl = {https://www.bibsonomy.org/bibtex/2ae2eb05437d68f9027da6658faaba91e/csaba},
booktitle = {NIPS},
crossref = {NIPS21},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:50:58 -0700},
ee = {http://books.nips.cc/papers/files/nips21/NIPS2008_0421.pdf},
interhash = {2cd17553b0213961edd86f4ea585fb67},
intrahash = {ae2eb05437d68f9027da6658faaba91e},
keywords = {GTD algorithm, approximation, function gradient learning, online prediction, reinforcement stochastic theory,},
pages = {1609--1616},
pdf = {papers/gtdnips08.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {A Convergent {O}(n) Algorithm for Off-policy Temporal-difference Learning with Linear Function Approximation},
year = 2008
}
@inproceedings{sutton2009,
abstract = {Sutton, Szepesvari and Maei (2009) recently introduced the first temporal-difference learning algorithm compatible with both linear function approximation and off-policy training, and whose complexity scales only linearly in the size of the function approximator. Although their gradient temporal difference (GTD) algorithm converges reliably, it can be very slow compared to conventional linear TD (on on-policy problems where TD is convergent), calling into question its practical utility. In this paper we introduce two new related algorithms with better convergence rates. The first algorithm, GTD2, is derived and proved convergent just as GTD was, but uses a different objective function and converges significantly faster (but still not as fast as conventional TD). The second new algorithm, linear TD with gradient correction, or TDC, uses the same update rule as conventional TD except for an additional term which is initially zero. In our experiments on small test problems and in a Computer Go application with a million features, the learning rate of this algorithm was comparable to that of conventional TD. This algorithm appears to extend linear TD to off-policy learning with no penalty in performance while only doubling computational requirements.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, {Cs}. and Wiewiora, E.},
biburl = {https://www.bibsonomy.org/bibtex/22160b897b778769d8e85de83c78cdf82/csaba},
booktitle = {ICML},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:50:04 -0700},
interhash = {8b02feb5f5e92775fec53a849ed924ac},
intrahash = {2160b897b778769d8e85de83c78cdf82},
keywords = {GTD2, TDC algorithm, approximation, function gradient learning, online prediction, reinforcement stochastic theory,},
pages = {993--1000},
pdf = {papers/GTD-ICML09.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation},
year = 2009
}
@article{Williams:92,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Williams, R. J.},
biburl = {https://www.bibsonomy.org/bibtex/294224c3e53bfe80ade7218b3a0283465/schaul},
citeulike-article-id = {2374762},
description = {idsia},
interhash = {b90d65a735ae02a940f5075b0fd7ebe7},
intrahash = {94224c3e53bfe80ade7218b3a0283465},
journal = {Machine Learning},
keywords = {daanbib},
pages = {229--256},
priority = {2},
timestamp = {2008-02-26T12:07:02.000+0100},
title = {Simple statistical gradient-following algorithms for connectionist reinforcement learning},
volume = 8,
year = 1992
}
@article{journals/corr/abs-1205-4839,
added-at = {2018-08-13T00:00:00.000+0200},
author = {Degris, Thomas and White, Martha and Sutton, Richard S.},
biburl = {https://www.bibsonomy.org/bibtex/2df898713eb27437f8cff1b3f1a617b0b/dblp},
ee = {http://arxiv.org/abs/1205.4839},
interhash = {69c9fdc4b9ee04a525f86765ecc7e6c9},
intrahash = {df898713eb27437f8cff1b3f1a617b0b},
journal = {CoRR},
keywords = {dblp},
timestamp = {2018-08-14T13:25:20.000+0200},
title = {Off-Policy Actor-Critic},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1205.html#abs-1205-4839},
volume = {abs/1205.4839},
year = 2012
}
@article{gittins1979bandit,
title = {Bandit processes and dynamic allocation indices},
author = {Gittins, John C.},
journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
pages = {148--177},
year = {1979},
publisher = {JSTOR}
}
@book{shreve03,
added-at = {2016-10-24T06:15:22.000+0200},
address = {New York, NY},
author = {Shreve, Steven E.},
biburl = {https://www.bibsonomy.org/bibtex/2944e8f60998f1040c77a69b14859f82c/ytyoun},
interhash = {319dab566840b3c2ecb14597455de385},
intrahash = {944e8f60998f1040c77a69b14859f82c},
isbn = {0387401008 9780387401003},
keywords = {finance textbook},
publisher = {Springer-Verlag},
refid = {874753793},
timestamp = {2016-10-24T06:15:22.000+0200},
title = {Stochastic Calculus for Finance {I}: The Binomial Asset Pricing Model},
year = 2003
}
@book{shreve04,
added-at = {2016-10-24T06:18:32.000+0200},
address = {New York},
author = {Shreve, Steven E.},
biburl = {https://www.bibsonomy.org/bibtex/2d3151ac3eeae9b3c69ab17da9857c044/ytyoun},
interhash = {51bca5ed5a433fbb7b106adc500b0699},
intrahash = {d3151ac3eeae9b3c69ab17da9857c044},
isbn = {9780387401003 0387401008 9780387401010 0387401016 9780387249681 0387249680},
keywords = {finance textbook},
publisher = {Springer},
refid = {884516378},
timestamp = {2016-10-24T06:18:32.000+0200},
title = {Stochastic Calculus for Finance {II}: Continuous-Time Models},