% bibliography.bib (forked from TikhonJelvis/RL-book)
@book{Brooks1975,
added-at = {2011-02-22T17:47:19.000+0100},
author = {Brooks, Frederick P.},
biburl = {https://www.bibsonomy.org/bibtex/2368f8c335ac4bc492572b7d29efe7753/fsteeg},
interhash = {7f0faa2359c30fc99c4bfc2c203d3ecd},
intrahash = {368f8c335ac4bc492572b7d29efe7753},
keywords = {diss programming},
publisher = {Addison-Wesley},
timestamp = {2011-02-22T17:47:19.000+0100},
title = {The Mythical Man-Month: Essays on Software Engineering},
topics = {Programming/Software Engineering},
year = 1975
}
@book{Sutton1998,
added-at = {2019-07-13T10:11:53.000+0200},
author = {Sutton, Richard S. and Barto, Andrew G.},
biburl = {https://www.bibsonomy.org/bibtex/2f46601cf8b13d39d1378af0d79438b12/lanteunis},
edition = {Second},
interhash = {ac6b144aaec1819919a2fba9f705c852},
intrahash = {f46601cf8b13d39d1378af0d79438b12},
keywords = {},
publisher = {The MIT Press},
timestamp = {2019-07-13T10:11:53.000+0200},
title = {Reinforcement Learning: An Introduction},
url = {http://incompleteideas.net/book/the-book-2nd.html},
year = {2018}
}
@book{puterman2014markov,
added-at = {2017-04-07T12:13:11.000+0200},
author = {Puterman, Martin L.},
biburl = {https://www.bibsonomy.org/bibtex/22e7ac99cd30c4892171e5a7cef1bc7a7/becker},
interhash = {6cec8f775a265d8741171d17e4a4e7d0},
intrahash = {2e7ac99cd30c4892171e5a7cef1bc7a7},
keywords = {inthesis diss markov chain decision process citedby:scholar:count:9594 citedby:scholar:timestamp:2017-4-7},
publisher = {John Wiley \& Sons},
timestamp = {2017-04-07T12:13:11.000+0200},
title = {Markov decision processes: discrete stochastic dynamic programming},
year = 2014
}
@article{journals/jmlr/LagoudakisP03,
added-at = {2019-07-10T00:00:00.000+0200},
author = {Lagoudakis, Michail G. and Parr, Ronald},
biburl = {https://www.bibsonomy.org/bibtex/279090819f413e277dafd7e99c36dc22f/dblp},
ee = {http://jmlr.org/papers/v4/lagoudakis03a.html},
interhash = {80ac6d85410840025987dfdccc3511c9},
intrahash = {79090819f413e277dafd7e99c36dc22f},
journal = {J. Mach. Learn. Res.},
keywords = {dblp},
pages = {1107--1149},
timestamp = {2019-07-11T11:42:01.000+0200},
title = {Least-Squares Policy Iteration.},
url = {http://dblp.uni-trier.de/db/journals/jmlr/jmlr4.html#LagoudakisP03},
volume = 4,
year = 2003
}
@article{BlackScholes1973,
abstract = {If options are correctly priced in the market, it
should not be possible to make sure profits by creating
portfolios of long and short positions in options and
their underlying stocks. Using this principle, a
theoretical valuation formula for options is derived.
Since almost all corporate liabilities can be viewed as
combinations of options, the formula and the analysis
that led to it are also applicable to corporate
liabilities such as common stock, corporate bonds, and
warrants. In particular, the formula can be used to
derive the discount that should be applied to a
corporate bond because of the possibility of default.},
added-at = {2007-06-26T15:08:05.000+0200},
author = {Black, Fischer and Scholes, Myron S.},
biburl = {https://www.bibsonomy.org/bibtex/21b438aef4ace91b31c5b3864af3925ac/gilles.daniel},
comment = {First paper on Black-Scholes options pricing formula
Robert C. Merton and Myron S. Scholes, Nobel Prize
1997, for a new method to determine the value of
derivatives.},
interhash = {29c4e539e1156910620d127dac78c286},
intrahash = {1b438aef4ace91b31c5b3864af3925ac},
journal = {Journal of Political Economy},
keywords = {imported},
number = 3,
pages = {637--654},
timestamp = {2007-06-26T15:08:07.000+0200},
title = {The Pricing of Options and Corporate Liabilities},
volume = 81,
year = 1973
}
@article{Merton1969Portfolio,
author = {Merton, Robert C.},
comment = {Breakthrough paper on Dynamic Control of Portfolio
Allocation and Consumption in continuous time},
journal = {The Review of Economics and Statistics},
keywords = {imported},
number = 3,
pages = {247--257},
title = {Lifetime Portfolio Selection under Uncertainty: The Continuous-Time Case},
publisher = {JSTOR},
url = {https://doi.org/10.2307/1926560},
volume = 51,
year = 1969
}
@book{GVK505893878,
added-at = {2009-08-21T12:21:08.000+0200},
address = {Oxford},
author = {Björk, Tomas},
biburl = {https://www.bibsonomy.org/bibtex/2d3c057251ab3451711800984033d975a/fbw_hannover},
edition = {Second, reprint},
interhash = {d18519febe7a31feb7f81d176833d523},
intrahash = {d3c057251ab3451711800984033d975a},
isbn = {0199271267},
keywords = {Arbitrage Arbitrage-Pricing-Theorie Derivat_<Wertpapier> Derivative_securities Finanzierung Investition Mathematical_models Wahrscheinlichkeitsrechnung},
pagetotal = {XVIII, 466},
ppn_gvk = {505893878},
publisher = {Oxford Univ. Press},
timestamp = {2009-08-21T12:21:19.000+0200},
title = {Arbitrage theory in continuous time},
url = {http://gso.gbv.de/DB=2.1/CMD?ACT=SRCHA&SRT=YOP&IKT=1016&TRM=ppn+505893878&sourceid=fbw_bibsonomy},
year = 2005
}
@book{Hull_10,
added-at = {2020-11-16T00:29:59.000+0100},
author = {Hull, John C.},
biburl = {https://www.bibsonomy.org/bibtex/273a9017b321b013d960b8796363168c2/derek-jones},
edition = {Seventh},
interhash = {e3d5119b2a59ad21c9412869e07dba3e},
intrahash = {73a9017b321b013d960b8796363168c2},
isbn = {978-0-13-260460-4},
keywords = {imported},
month = oct,
publisher = {Pearson},
timestamp = {2020-11-16T00:29:59.000+0100},
title = {Options, Futures, and Other Derivatives},
year = 2010
}
@article{bertsimas1998optimal,
title = {Optimal control of execution costs},
author = {Bertsimas, Dimitris and Lo, Andrew W.},
journal = {Journal of Financial Markets},
volume = {1},
number = {1},
pages = {1--50},
year = {1998},
publisher = {Elsevier}
}
@article{almgren2000optimal,
added-at = {2015-07-21T13:20:40.000+0200},
author = {Almgren, Robert and Chriss, Neil},
biburl = {https://www.bibsonomy.org/bibtex/2d58e08e2359cb1473f9103da68ac5a10/krassi},
interhash = {06ceb6c211f0f1a241e3867d6abce744},
intrahash = {d58e08e2359cb1473f9103da68ac5a10},
journal = {Journal of Risk},
keywords = {impact market},
pages = {5--39},
timestamp = {2015-07-21T13:20:40.000+0200},
title = {Optimal execution of portfolio transactions},
year = 2000
}
@inproceedings{conf/icml/NevmyvakaFK06,
added-at = {2018-11-06T00:00:00.000+0100},
author = {Nevmyvaka, Yuriy and Feng, Yi and Kearns, Michael J.},
biburl = {https://www.bibsonomy.org/bibtex/26a1cd69167152c4ef00ab33ccd5eab6a/dblp},
booktitle = {ICML},
crossref = {conf/icml/2006},
editor = {Cohen, William W. and Moore, Andrew W.},
ee = {https://doi.org/10.1145/1143844.1143929},
interhash = {1959e6c9644f29e09724aff0b667be6c},
intrahash = {6a1cd69167152c4ef00ab33ccd5eab6a},
isbn = {1-59593-383-2},
keywords = {dblp},
pages = {673--680},
publisher = {ACM},
series = {ACM International Conference Proceeding Series},
timestamp = {2019-11-20T11:43:16.000+0100},
title = {Reinforcement learning for optimized trade execution.},
url = {http://dblp.uni-trier.de/db/conf/icml/icml2006.html#NevmyvakaFK06},
volume = 148,
year = 2006
}
@article{journals/corr/abs-1906-02312,
added-at = {2019-06-14T00:00:00.000+0200},
author = {Vyetrenko, Svitlana and Xu, Shaojie},
biburl = {https://www.bibsonomy.org/bibtex/2523e1e8959fabad6139b868682bbf352/dblp},
ee = {http://arxiv.org/abs/1906.02312},
interhash = {a97e6b4031a5a79cfee71e4b670877a7},
intrahash = {523e1e8959fabad6139b868682bbf352},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-06-15T11:38:52.000+0200},
title = {Risk-Sensitive Compact Decision Trees for Autonomous Execution in Presence of Simulated Market Response.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1906.html#abs-1906-02312},
volume = {abs/1906.02312},
year = 2019
}
@article{Avellaneda2008,
added-at = {2008-04-23T19:09:19.000+0200},
author = {Avellaneda, Marco and Stoikov, Sasha},
biburl = {https://www.bibsonomy.org/bibtex/23a2d90e453d27af63c6544d3bc721664/smicha},
interhash = {c64f1b610a80b7d4042eadb05ee5c5cd},
intrahash = {3a2d90e453d27af63c6544d3bc721664},
issn = {1469-7688},
journal = {Quantitative Finance},
keywords = {imported},
number = 3,
pages = {217--224},
publisher = {Routledge},
timestamp = {2008-04-23T19:09:20.000+0200},
title = {High-frequency trading in a limit order book},
url = {http://www.informaworld.com/10.1080/14697680701381228},
volume = 8,
year = 2008
}
@article{journals/corr/abs-1804-04216,
added-at = {2018-08-13T00:00:00.000+0200},
author = {Spooner, Thomas and Fearnley, John and Savani, Rahul and Koukorinis, Andreas},
biburl = {https://www.bibsonomy.org/bibtex/23e8b6878e6a2246539d5a5ac61fb3ea0/dblp},
ee = {http://arxiv.org/abs/1804.04216},
interhash = {e6bdd859aad86a9441a2c486e472d8c1},
intrahash = {3e8b6878e6a2246539d5a5ac61fb3ea0},
journal = {CoRR},
keywords = {dblp},
timestamp = {2018-08-14T13:54:22.000+0200},
title = {Market Making via Reinforcement Learning.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1804.html#abs-1804-04216},
volume = {abs/1804.04216},
year = 2018
}
@article{journals/corr/abs-1911-05892,
added-at = {2019-12-04T00:00:00.000+0100},
author = {Ganesh, Sumitra and Vadori, Nelson and Xu, Mengda and Zheng, Hua and Reddy, Prashant P. and Veloso, Manuela},
biburl = {https://www.bibsonomy.org/bibtex/236bc4e65153fef4b2ec91572bfc6761c/dblp},
ee = {http://arxiv.org/abs/1911.05892},
interhash = {ec668c190bf7e2e060b7e937112d67e2},
intrahash = {36bc4e65153fef4b2ec91572bfc6761c},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-12-05T11:38:37.000+0100},
title = {Reinforcement Learning for Market Making in a Multi-agent Dealer Market.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1911.html#abs-1911-05892},
volume = {abs/1911.05892},
year = 2019
}
@article{LongstaffSchwartz2001,
author = {Longstaff, Francis A. and Schwartz, Eduardo S.},
title = {Valuing American Options by Simulation: A Simple Least-Squares Approach},
journal = {Review of Financial Studies},
year = {2001},
volume = {14},
number = {1},
pages = {113--147},
doi = {10.1093/rfs/14.1.113}
}
@inproceedings{li2009,
abstract = {Options are important instruments in modern finance. In this paper, we investigate reinforcement learning (RL) methods---in particular, least-squares policy iteration (LSPI)---for the problem of learning exercise policies for American options. We develop finite-time bounds on the performance of the policy obtained with LSPI and compare LSPI and the fitted Q-iteration algorithm (FQI) with the Longstaff-Schwartz method (LSM), the standard least-squares Monte Carlo algorithm from the finance community. Our empirical results show that the exercise policies discovered by LSPI and FQI gain larger payoffs than those discovered by LSM, on both real and synthetic data. Furthermore, we find that for all methods the policies learned from real data generally gain similar payoffs to the policies learned from simulated data. Our work shows that solution methods developed in machine learning can advance the state-of-the-art in an important and challenging application area, while demonstrating that computational finance remains a promising area for future applications of machine learning methods.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Li, Y. and Szepesv{\'a}ri, {Cs}. and Schuurmans, D.},
bdsk-url-1 = {http://www.ics.uci.edu/~aistats/},
biburl = {https://www.bibsonomy.org/bibtex/274d9725222761918a15eba8ae22a5a7d/csaba},
booktitle = {AISTATS},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2015-08-02 01:02:54 +0000},
interhash = {a1207f07eec4066764db498ab11ce1e8},
intrahash = {74d9725222761918a15eba8ae22a5a7d},
keywords = {application finance, learning, reinforcement theory,},
pages = {352--359},
pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v5/li09d/li09d.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Learning Exercise Policies for {A}merican Options},
url = {http://www.ics.uci.edu/~aistats/},
volume = 5,
year = 2009
}
@inproceedings{sutton2001policy,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Sutton, R. and McAllester, D. and Singh, S. and Mansour, Y.},
biburl = {https://www.bibsonomy.org/bibtex/20b06d9bf0e170dd47a2d380ee8563426/schaul},
booktitle = {Advances in Neural Information Processing Systems 12 (Proceedings of the 1999 conference)},
citeulike-article-id = {2374752},
description = {idsia},
interhash = {879988d59ea02a1f3c5ec1ba5f545ba8},
intrahash = {0b06d9bf0e170dd47a2d380ee8563426},
keywords = {daanbib},
pages = {1057--1063},
priority = {2},
publisher = {MIT Press},
timestamp = {2008-02-26T12:07:03.000+0100},
title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
year = 2000
}
@inproceedings{conf/nips/Kakade01,
added-at = {2020-03-12T00:00:00.000+0100},
author = {Kakade, Sham M.},
biburl = {https://www.bibsonomy.org/bibtex/29afe616246c2e9648d0367bfbfd507d5/dblp},
booktitle = {NIPS},
crossref = {conf/nips/2001},
editor = {Dietterich, Thomas G. and Becker, Suzanna and Ghahramani, Zoubin},
ee = {http://www-2.cs.cmu.edu/Groups/NIPS/NIPS2001/papers/psgz/CN11.ps.gz},
interhash = {10e8de9f8d2c747e392750b8164c7489},
intrahash = {9afe616246c2e9648d0367bfbfd507d5},
keywords = {dblp},
pages = {1531--1538},
publisher = {MIT Press},
timestamp = {2020-03-13T12:44:48.000+0100},
title = {A Natural Policy Gradient.},
url = {http://dblp.uni-trier.de/db/conf/nips/nips2001.html#Kakade01},
year = 2001
}
@article{amari_natural_1998,
added-at = {2014-04-15T13:06:00.000+0200},
author = {Amari, S.},
biburl = {https://www.bibsonomy.org/bibtex/204cab5fc779db34f662a2ae0f25e96ad/wittawatj},
interhash = {a5cad2a0bad7028a732ae79e9fa6a4b2},
intrahash = {04cab5fc779db34f662a2ae0f25e96ad},
journal = {Neural Computation},
keywords = {daanbib},
number = 2,
pages = {251--276},
timestamp = {2014-04-15T13:06:00.000+0200},
title = {Natural Gradient Works Efficiently in Learning},
volume = 10,
year = 1998
}
@inproceedings{conf/icml/SilverLHDWR14,
added-at = {2019-05-29T00:00:00.000+0200},
author = {Silver, David and Lever, Guy and Heess, Nicolas and Degris, Thomas and Wierstra, Daan and Riedmiller, Martin A.},
biburl = {https://www.bibsonomy.org/bibtex/2e2fb52847293919f2e6c88fc8c9eee9b/dblp},
booktitle = {ICML},
crossref = {conf/icml/2014},
ee = {http://proceedings.mlr.press/v32/silver14.html},
interhash = {938059a9d0f391a7c3763bcf9788afb1},
intrahash = {e2fb52847293919f2e6c88fc8c9eee9b},
keywords = {dblp},
pages = {387--395},
publisher = {JMLR.org},
series = {JMLR Workshop and Conference Proceedings},
timestamp = {2019-05-30T11:54:07.000+0200},
title = {Deterministic Policy Gradient Algorithms.},
url = {http://dblp.uni-trier.de/db/conf/icml/icml2014.html#SilverLHDWR14},
volume = 32,
year = 2014
}
@article{salimans2017evolution,
title={Evolution strategies as a scalable alternative to reinforcement learning},
author={Salimans, Tim and Ho, Jonathan and Chen, Xi and Sidor, Szymon and Sutskever, Ilya},
journal={arXiv preprint arXiv:1703.03864},
year={2017}
}
@article{lai-allocation,
added-at = {2007-07-05T16:17:35.000+0200},
author = {Lai, T.L. and Robbins, H.},
biburl = {https://www.bibsonomy.org/bibtex/243d5e28aa6ae3446e548319c7f964b7f/jleny},
description = {bandit problems},
interhash = {c33edf59c35ee99dbaa6f1ce8835b782},
intrahash = {43d5e28aa6ae3446e548319c7f964b7f},
journal = {Advances in Applied Mathematics},
keywords = {imported},
pages = {4--22},
timestamp = {2007-07-05T16:17:37.000+0200},
title = {Asymptotically Efficient Adaptive Allocation Rules},
volume = 6,
year = 1985
}
@article{Russo_2018,
added-at = {2018-07-27T09:01:12.000+0200},
author = {Russo, Daniel J. and Van Roy, Benjamin and Kazerouni, Abbas and Osband, Ian and Wen, Zheng},
biburl = {https://www.bibsonomy.org/bibtex/299abe90b21844b84f5ad6c6b7eb08f72/analyst},
description = {now publishers - A Tutorial on Thompson Sampling},
doi = {10.1561/2200000070},
interhash = {8e531ba267f78a2053e3adea43f938f6},
intrahash = {99abe90b21844b84f5ad6c6b7eb08f72},
journal = {Foundations and Trends{\textregistered} in Machine Learning},
keywords = {2018 book probability reinforcement-learning tutorial},
number = 1,
pages = {1--96},
publisher = {Now Publishers},
timestamp = {2018-07-27T09:01:12.000+0200},
title = {A Tutorial on Thompson Sampling},
url = {https://doi.org/10.1561/2200000070},
volume = 11,
year = 2018
}
@inproceedings{conf/nips/GuezHSD14,
added-at = {2020-03-06T00:00:00.000+0100},
author = {Guez, Arthur and Heess, Nicolas and Silver, David and Dayan, Peter},
biburl = {https://www.bibsonomy.org/bibtex/25ad8c1dda953f92e25cebd2be90910e9/dblp},
booktitle = {NIPS},
crossref = {conf/nips/2014},
editor = {Ghahramani, Zoubin and Welling, Max and Cortes, Corinna and Lawrence, Neil D. and Weinberger, Kilian Q.},
ee = {http://papers.nips.cc/paper/5501-bayes-adaptive-simulation-based-search-with-value-function-approximation},
interhash = {50f820f1e229e0219ac03fd626067f6a},
intrahash = {5ad8c1dda953f92e25cebd2be90910e9},
keywords = {dblp},
pages = {451--459},
timestamp = {2020-03-07T11:48:07.000+0100},
title = {Bayes-Adaptive Simulation-based Search with Value Function Approximation.},
url = {http://dblp.uni-trier.de/db/conf/nips/nips2014.html#GuezHSD14},
year = 2014
}
@inproceedings{conf/ijcai/BrafmanT01,
added-at = {2003-05-23T00:00:00.000+0200},
author = {Brafman, Ronen I. and Tennenholtz, Moshe},
biburl = {https://www.bibsonomy.org/bibtex/25aa5f438073e0f0dbb85194f5d714f7d/dblp},
booktitle = {IJCAI},
crossref = {conf/ijcai/2001},
editor = {Nebel, Bernhard},
ee = {http://ijcai.org/proceedings/2001-2},
interhash = {5997357756eb4e1585e12d737a39852d},
intrahash = {5aa5f438073e0f0dbb85194f5d714f7d},
isbn = {1-55860-777-3},
keywords = {dblp},
pages = {953--958},
publisher = {Morgan Kaufmann},
timestamp = {2019-08-21T11:49:29.000+0200},
title = {R-MAX - A General Polynomial Time Algorithm for Near-Optimal Reinforcement Learning.},
url = {http://dblp.uni-trier.de/db/conf/ijcai/ijcai2001.html#BrafmanT01},
year = 2001
}
@article{journals/sigart/Sutton91,
added-at = {2020-05-19T00:00:00.000+0200},
author = {Sutton, Richard S.},
biburl = {https://www.bibsonomy.org/bibtex/2281eb5d631b9b8c16d66e976055f89a6/dblp},
ee = {https://doi.org/10.1145/122344.122377},
interhash = {51bf73485ee40e297f5c9d5bc5dad04f},
intrahash = {281eb5d631b9b8c16d66e976055f89a6},
journal = {SIGART Bull.},
keywords = {dblp},
number = 4,
pages = {160--163},
timestamp = {2020-05-20T11:40:24.000+0200},
title = {Dyna, an Integrated Architecture for Learning, Planning, and Reacting.},
url = {http://dblp.uni-trier.de/db/journals/sigart/sigart2.html#Sutton91},
volume = 2,
year = 1991
}
@article{mnih2013atari,
abstract = {We present the first deep learning model to successfully learn control
policies directly from high-dimensional sensory input using reinforcement
learning. The model is a convolutional neural network, trained with a variant
of Q-learning, whose input is raw pixels and whose output is a value function
estimating future rewards. We apply our method to seven Atari 2600 games from
the Arcade Learning Environment, with no adjustment of the architecture or
learning algorithm. We find that it outperforms all previous approaches on six
of the games and surpasses a human expert on three of them.},
added-at = {2019-07-11T17:41:01.000+0200},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
biburl = {https://www.bibsonomy.org/bibtex/2a00ec4c09f5dc9b3f8a1836f4e02bb5d/lanteunis},
description = {[1312.5602] Playing Atari with Deep Reinforcement Learning},
interhash = {78966703f649bae69a08a6a23a4e8879},
intrahash = {a00ec4c09f5dc9b3f8a1836f4e02bb5d},
keywords = {DRLAlgoComparison dqn final reinforcement thema:reinforcement_learning_recommender},
note = {arXiv:1312.5602. Comment: NIPS Deep Learning Workshop 2013},
timestamp = {2019-12-16T21:10:09.000+0100},
title = {Playing Atari with Deep Reinforcement Learning},
url = {http://arxiv.org/abs/1312.5602},
year = 2013
}
@article{mnih2015humanlevel,
abstract = {An artificial agent is developed that learns to play a diverse range of classic Atari 2600 computer games directly from sensory experience, achieving a performance comparable to that of an expert human player; this work paves the way to building general-purpose learning algorithms that bridge the divide between perception and action.},
added-at = {2020-03-25T21:22:39.000+0100},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
biburl = {https://www.bibsonomy.org/bibtex/2fb15f4471c81dc2b9edf2304cb2f7083/cpankow},
description = {Human-level control through deep reinforcement learning | Nature},
doi = {10.1038/nature14236},
interhash = {eac59980357d99db87b341b61ef6645f},
intrahash = {fb15f4471c81dc2b9edf2304cb2f7083},
issn = {14764687},
journal = {Nature},
keywords = {machinelearning neuralnetwork reinforcementlearning},
number = 7540,
pages = {529--533},
refid = {Mnih2015},
timestamp = {2020-03-25T21:22:39.000+0100},
title = {Human-level control through deep reinforcement learning},
url = {https://doi.org/10.1038/nature14236},
volume = 518,
year = 2015
}
@article{silver2016mastering,
abstract = {The game of Go has long been viewed as the most challenging of classic games for artificial intelligence owing to its enormous search space and the difficulty of evaluating board positions and moves. Here we introduce a new approach to computer Go that uses ‘value networks’ to evaluate board positions and ‘policy networks’ to select moves. These deep neural networks are trained by a novel combination of supervised learning from human expert games, and reinforcement learning from games of self-play. Without any lookahead search, the neural networks play Go at the level of state-of-the-art Monte Carlo tree search programs that simulate thousands of random games of self-play. We also introduce a new search algorithm that combines Monte Carlo simulation with value and policy networks. Using this search algorithm, our program AlphaGo achieved a 99.8% winning rate against other Go programs, and defeated the human European Go champion by 5 games to 0. This is the first time that a computer program has defeated a human professional player in the full-sized game of Go, a feat previously thought to be at least a decade away.},
added-at = {2021-03-08T10:50:11.000+0100},
author = {Silver, David and Huang, Aja and Maddison, Chris J. and Guez, Arthur and Sifre, Laurent and van den Driessche, George and Schrittwieser, Julian and Antonoglou, Ioannis and Panneershelvam, Veda and Lanctot, Marc and Dieleman, Sander and Grewe, Dominik and Nham, John and Kalchbrenner, Nal and Sutskever, Ilya and Lillicrap, Timothy and Leach, Madeleine and Kavukcuoglu, Koray and Graepel, Thore and Hassabis, Demis},
biburl = {https://www.bibsonomy.org/bibtex/2e3bd772f62209ea8283e242f993d3edf/analyst},
description = {Mastering the game of Go with deep neural networks and tree search | Nature},
doi = {10.1038/nature16961},
interhash = {48430c7891aaf9fe2582faa8f5d076c1},
intrahash = {e3bd772f62209ea8283e242f993d3edf},
issn = {14764687},
journal = {Nature},
keywords = {2016 deep-learning go nature paper reinforcement-learning},
number = 7587,
pages = {484--489},
refid = {Silver2016},
timestamp = {2021-03-08T10:50:11.000+0100},
title = {Mastering the game of Go with deep neural networks and tree search},
url = {https://doi.org/10.1038/nature16961},
volume = 529,
year = 2016
}
@inproceedings{conf/cg/Coulom06,
added-at = {2017-05-17T00:00:00.000+0200},
author = {Coulom, Rémi},
biburl = {https://www.bibsonomy.org/bibtex/27b82a435e6fb9cd3dc12367366ef0641/dblp},
booktitle = {Computers and Games},
crossref = {conf/cg/2006},
editor = {van den Herik, H. Jaap and Ciancarini, Paolo and Donkers, H. H. L. M.},
ee = {https://doi.org/10.1007/978-3-540-75538-8_7},
interhash = {33f87d0cbbc96923fb95cee26daaee4a},
intrahash = {7b82a435e6fb9cd3dc12367366ef0641},
isbn = {978-3-540-75537-1},
keywords = {dblp},
pages = {72-83},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
timestamp = {2019-05-15T13:26:49.000+0200},
title = {Efficient Selectivity and Backup Operators in Monte-Carlo Tree Search.},
url = {http://dblp.uni-trier.de/db/conf/cg/cg2006.html#Coulom06},
volume = 4630,
year = 2006
}
@inproceedings{kocsis2006a,
added-at = {2020-03-17T03:03:01.000+0100},
author = {Kocsis, L. and Szepesv{\'a}ri, {Cs}.},
biburl = {https://www.bibsonomy.org/bibtex/286adc7a187cd2dbefe68d09d23a948e5/csaba},
booktitle = {ECML},
crossref = {ECML06},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:57:10 -0700},
interhash = {e9621505f16a5c5b13a8b23955c39fd5},
intrahash = {86adc7a187cd2dbefe68d09d23a948e5},
keywords = {Monte-Carlo UCT, bandits games, in learning learning, methods, reinforcement search, tree},
pages = {282--293},
pdf = {papers/ecml06.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Bandit based {M}onte-{C}arlo Planning},
year = 2006
}
@article{Auer2002,
abstract = {Reinforcement learning policies face the exploration versus exploitation dilemma, i.e. the search for a balance between exploring the environment to find profitable actions while taking the empirically best action as often as possible. A popular measure of a policy's success in addressing this dilemma is the regret, that is the loss due to the fact that the globally optimal policy is not followed all the times. One of the simplest examples of the exploration/exploitation dilemma is the multi-armed bandit problem. Lai and Robbins were the first ones to show that the regret for this problem has to grow at least logarithmically in the number of plays. Since then, policies which asymptotically achieve this regret have been devised by Lai and Robbins and many others. In this work we show that the optimal logarithmic regret is also achievable uniformly over time, with simple and efficient policies, and for all reward distributions with bounded support.},
added-at = {2021-03-13T09:12:07.000+0100},
author = {Auer, Peter and Cesa-Bianchi, Nicol{\`o} and Fischer, Paul},
biburl = {https://www.bibsonomy.org/bibtex/2c1b9ac34e95a12dcaca2791593d442f7/analyst},
day = 01,
description = {Finite-time Analysis of the Multiarmed Bandit Problem | SpringerLink},
doi = {10.1023/A:1013689704352},
interhash = {d9ed352509aceb102cbb43c4127a7b30},
intrahash = {c1b9ac34e95a12dcaca2791593d442f7},
issn = {1573-0565},
journal = {Machine Learning},
keywords = {2002 reinforcement-learning},
month = may,
number = 2,
pages = {235--256},
timestamp = {2021-03-13T09:12:07.000+0100},
title = {Finite-time Analysis of the Multiarmed Bandit Problem},
url = {https://doi.org/10.1023/A:1013689704352},
volume = 47,
year = 2002
}
@article{journals/ior/ChangFHM05,
added-at = {2018-11-14T00:00:00.000+0100},
author = {Chang, Hyeong Soo and Fu, Michael C. and Hu, Jiaqiao and Marcus, Steven I.},
biburl = {https://www.bibsonomy.org/bibtex/2644483341bcebfc39ea4aa952a4660ea/dblp},
ee = {https://www.wikidata.org/entity/Q57382677},
interhash = {164ee491ebc54ca22e40986ab8bdf89d},
intrahash = {644483341bcebfc39ea4aa952a4660ea},
journal = {Operations Research},
keywords = {dblp},
number = 1,
pages = {126--139},
timestamp = {2018-11-15T14:33:31.000+0100},
title = {An Adaptive Sampling Algorithm for Solving Markov Decision Processes.},
url = {http://dblp.uni-trier.de/db/journals/ior/ior53.html#ChangFHM05},
volume = 53,
year = 2005
}
@article{journals/corr/abs-1812-02648,
added-at = {2019-01-01T00:00:00.000+0100},
author = {van Hasselt, Hado and Doron, Yotam and Strub, Florian and Hessel, Matteo and Sonnerat, Nicolas and Modayil, Joseph},
biburl = {https://www.bibsonomy.org/bibtex/23cbd476a62eb476f3aa231be93236ce3/dblp},
ee = {http://arxiv.org/abs/1812.02648},
interhash = {14287660ba0291fb10822fd2fc8b7dcd},
intrahash = {3cbd476a62eb476f3aa231be93236ce3},
journal = {CoRR},
keywords = {dblp},
timestamp = {2019-01-02T11:37:27.000+0100},
title = {Deep Reinforcement Learning and the Deadly Triad.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1812.html#abs-1812-02648},
volume = {abs/1812.02648},
year = 2018
}
@book{Bellman1957,
added-at = {2021-02-01T10:51:23.000+0100},
address = {Princeton, NJ, USA},
author = {Bellman, Richard},
bib2html_rescat = {General RL},
biburl = {https://www.bibsonomy.org/bibtex/29cdd821222218ded252c8ba5cd712666/m-toman},
edition = 1,
interhash = {acf948462171ca060064a7ded257a792},
intrahash = {9cdd821222218ded252c8ba5cd712666},
keywords = {imported},
publisher = {Princeton University Press},
timestamp = {2021-02-01T10:51:23.000+0100},
title = {Dynamic Programming},
year = 1957
}
@article{bellman1957markovian,
added-at = {2017-04-07T12:00:35.000+0200},
author = {Bellman, Richard},
biburl = {https://www.bibsonomy.org/bibtex/2c04c5f89b4e8445651eded5b56c67342/becker},
interhash = {d7aa065c075b248c9980b4c45d635b66},
intrahash = {c04c5f89b4e8445651eded5b56c67342},
journal = {Journal of Mathematics and Mechanics},
keywords = {chain citedby:scholar:count:987 citedby:scholar:timestamp:2017-4-7 decision diss inthesis markov process},
number = 5,
pages = {679--684},
timestamp = {2017-12-20T14:47:54.000+0100},
title = {A Markovian decision process},
url = {http://www.jstor.org/stable/24900506},
volume = 6,
year = 1957
}
@book{howard:dp,
added-at = {2008-02-26T11:58:58.000+0100},
address = {Cambridge, MA},
author = {Howard, R. A.},
biburl = {https://www.bibsonomy.org/bibtex/28b55f737ee6dd7800ffc7952a33bb6bd/schaul},
citeulike-article-id = {2380352},
description = {idsia},
interhash = {7eed9f4f6bd1f9ee063d80d0f732e48f},
intrahash = {8b55f737ee6dd7800ffc7952a33bb6bd},
keywords = {inaki},
priority = {2},
publisher = {MIT Press},
timestamp = {2008-02-26T12:01:06.000+0100},
title = {Dynamic Programming and Markov Processes},
year = 1960
}
@book{Gagniuc2017MarkovCF,
title={Markov Chains: From Theory to Implementation and Experimentation},
author={Gagniuc, Paul A.},
year={2017},
publisher = {John Wiley \& Sons}
}
@article{ASTROM1965174,
title = {Optimal control of Markov processes with incomplete state information},
journal = {Journal of Mathematical Analysis and Applications},
volume = {10},
number = {1},
pages = {174--205},
year = {1965},
issn = {0022-247X},
doi = {10.1016/0022-247X(65)90154-X},
url = {https://www.sciencedirect.com/science/article/pii/0022247X6590154X},
author = {Åström, K. J.}
}
@book{krishnamurthy_2016,
address={Cambridge},
title={Partially Observed Markov Decision Processes: From Filtering to Controlled Sensing},
doi={10.1017/CBO9781316471104},
publisher={Cambridge University Press},
author={Krishnamurthy, Vikram},
year={2016}
}
@inproceedings{4047044,
author={Bertsekas, Dimitri P.},
booktitle={1981 20th IEEE Conference on Decision and Control including the Symposium on Adaptive Processes},
title={Distributed dynamic programming},
year={1981},
pages={774--779},
doi={10.1109/CDC.1981.269319}
}
@article{Bertsekas1983DistributedAC,
title={Distributed asynchronous computation of fixed points},
author={Bertsekas, Dimitri P.},
journal={Mathematical Programming},
year={1983},
volume={27},
pages={107--120}
}
@book{books/lib/Bertsekas05,
added-at = {2020-07-17T00:00:00.000+0200},
author = {Bertsekas, Dimitri P.},
biburl = {https://www.bibsonomy.org/bibtex/287e129b4ba44f19590da9ec79f2c46c3/dblp},
ee = {https://www.worldcat.org/oclc/314894080},
interhash = {95914d535bf600768af9ce7c4dd8ab76},
intrahash = {87e129b4ba44f19590da9ec79f2c46c3},
isbn = {1886529264},
keywords = {dblp},
pages = {I-XV, 1-543},
publisher = {Athena Scientific},
timestamp = {2020-07-24T00:45:12.000+0200},
title = {Dynamic Programming and Optimal Control, Volume 1, 3rd Edition.},
year = 2005
}
@book{books/lib/Bertsekas12,
author = {Bertsekas, Dimitri P.},
keywords = {dblp},
publisher = {Athena Scientific},
title = {Dynamic Programming and Optimal Control, Volume 2: Approximate Dynamic Programming},
year = 2012
}
@book{BertsekasTsitsiklis96,
added-at = {2008-09-16T23:39:07.000+0200},
address = {Belmont, MA},
author = {Bertsekas, D. P. and Tsitsiklis, J. N.},
biburl = {https://www.bibsonomy.org/bibtex/219e84aaacaf689cde6190fe14ba5a337/brian.mingus},
description = {CCNLab BibTeX},
interhash = {2fbe138b7b864bc58d95999e69b5d45b},
intrahash = {19e84aaacaf689cde6190fe14ba5a337},
keywords = {nnets},
publisher = {Athena Scientific},
timestamp = {2008-09-16T23:39:21.000+0200},
title = {Neuro-Dynamic Programming},
year = 1996
}
@article{KinBa17,
added-at = {2018-02-28T16:10:01.000+0100},
author = {Kingma, Diederik P. and Ba, Jimmy},
biburl = {https://www.bibsonomy.org/bibtex/23b0328784dbfce338ba0dd2618a7a059/loroch},
ee = {http://arxiv.org/abs/1412.6980},
interhash = {57d2ac873f398f21bb94790081e80394},
intrahash = {3b0328784dbfce338ba0dd2618a7a059},
journal = {CoRR},
keywords = {deep_learning gradient_descend methods momentum optimization training},
timestamp = {2018-02-28T16:10:01.000+0100},
title = {Adam: A Method for Stochastic Optimization.},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1412.html#KingmaB14},
volume = {abs/1412.6980},
year = 2014
}
@book{Goodfellow-et-al-2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\url{http://www.deeplearningbook.org}},
year={2016}
}
@book{GVK266386229,
added-at = {2009-08-21T15:23:22.000+0200},
address = {Dordrecht},
biburl = {https://www.bibsonomy.org/bibtex/297c6ee9c85f27ecf0a8f296f488da9f9/fbw_hannover},
editor = {Barberà, Salvador and Seidl, Christian and Hammond, Peter J.},
interhash = {a9beb3cbe91a091036d161d99081ef75},
intrahash = {97c6ee9c85f27ecf0a8f296f488da9f9},
isbn = {0792381742},
keywords = {Mikroökonomie Nutzentheorie Theorie Theorie_der_Wirtschaftspolitik Utility-Theorie Wohlfahrtstheorie},
pagetotal = {VII, 684},
ppn_gvk = {266386229},
publisher = {Kluwer},
volume = {1},
timestamp = {2009-08-21T15:23:23.000+0200},
title = {Handbook of utility theory},
url = {http://gso.gbv.de/DB=2.1/CMD?ACT=SRCHA&SRT=YOP&IKT=1016&TRM=ppn+266386229&sourceid=fbw_bibsonomy},
year = 1998
}
@misc{buhler2018hedging,
abstract = {We present a framework for hedging a portfolio of derivatives in the presence
of market frictions such as transaction costs, market impact, liquidity
constraints or risk limits using modern deep reinforcement machine learning
methods.
We discuss how standard reinforcement learning methods can be applied to
non-linear reward structures, i.e. in our case convex risk measures. As a
general contribution to the use of deep learning for stochastic processes, we
also show that the set of constrained trading strategies used by our algorithm
is large enough to $\epsilon$-approximate any optimal solution.
Our algorithm can be implemented efficiently even in high-dimensional
situations using modern machine learning tools. Its structure does not depend
on specific market dynamics, and generalizes across hedging instruments
including the use of liquid derivatives. Its computational performance is
largely invariant in the size of the portfolio as it depends mainly on the
number of hedging instruments available.
We illustrate our approach by showing the effect on hedging under transaction
costs in a synthetic market driven by the Heston model, where we outperform the
standard "complete market" solution.},
added-at = {2019-05-30T18:29:52.000+0200},
author = {Bühler, Hans and Gonon, Lukas and Teichmann, Josef and Wood, Ben},
biburl = {https://www.bibsonomy.org/bibtex/2849f2e8b1df10751e8304143ea692fdb/nonancourt},
description = {Deep Hedging},
interhash = {fb505f924756bb0ca0b033b313848147},
intrahash = {849f2e8b1df10751e8304143ea692fdb},
keywords = {deep-learning finance},
note = {arXiv:1802.03042},
timestamp = {2019-05-30T18:29:52.000+0200},
title = {Deep Hedging},
url = {http://arxiv.org/abs/1802.03042},
year = 2018
}
@book{Gueant2016,
title={The Financial Mathematics of Market Liquidity: From Optimal Execution to Market Making},
author={Guéant, Olivier},
year={2016},
publisher = {Chapman and Hall/CRC},
series = {Chapman and Hall/CRC Financial Mathematics Series}
}
@phdthesis{Watkins:89,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Watkins, C. J. C. H.},
biburl = {https://www.bibsonomy.org/bibtex/21ffd549077ea1da7675431a17fa2af03/schaul},
citeulike-article-id = {2381652},
description = {idsia},
interhash = {ca824d64b71939208358edb4a26f8351},
intrahash = {1ffd549077ea1da7675431a17fa2af03},
keywords = {juergen},
priority = {2},
school = {King's College, Cambridge},
timestamp = {2008-02-26T11:59:46.000+0100},
title = {Learning from Delayed Rewards},
year = 1989
}
@book{klopf1972brain,
title={Brain Function and Adaptive Systems--a Heterostatic Theory},
author={Klopf, A.H. and Air Force Cambridge Research Laboratories (U.S.). Data Sciences Laboratory},
series={Special reports},
url={https://books.google.com/books?id=C2hztwEACAAJ},
year={1972},
publisher={Data Sciences Laboratory, Air Force Cambridge Research Laboratories, Air Force Systems Command, United States Air Force}
}
@techreport{rummery:tech94,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Rummery, G. A. and Niranjan, M.},
biburl = {https://www.bibsonomy.org/bibtex/2dbfba0b20bace9085789f4f479f6111f/idsia},
citeulike-article-id = {2380290},
institution = {Engineering Department, Cambridge University},
interhash = {0c7cd3821ad0fe1b39a6ce1b35ec4bc0},
intrahash = {dbfba0b20bace9085789f4f479f6111f},
keywords = {inaki},
number = {CUED/F-INFENG/TR-166},
priority = {2},
timestamp = {2008-03-11T14:56:18.000+0100},
title = {On-line {Q}-learning using connectionist systems},
year = 1994
}
@article{10.1214/aoms/1177729893,
author = {Sherman, Jack and Morrison, Winifred J.},
title = {{Adjustment of an Inverse Matrix Corresponding to a Change in One Element of a Given Matrix}},
volume = {21},
journal = {The Annals of Mathematical Statistics},
number = {1},
publisher = {Institute of Mathematical Statistics},
pages = {124--127},
year = {1950},
doi = {10.1214/aoms/1177729893},
URL = {https://doi.org/10.1214/aoms/1177729893}
}
@article{journals/ml/BradtkeB96,
added-at = {2020-03-02T00:00:00.000+0100},
author = {Bradtke, Steven J. and Barto, Andrew G.},
biburl = {https://www.bibsonomy.org/bibtex/2d6c05c943a95b78845e2765a69cb4cc9/dblp},
ee = {https://www.wikidata.org/entity/Q56095426},
interhash = {d49c55128e85ec3a2882ba148c8db33f},
intrahash = {d6c05c943a95b78845e2765a69cb4cc9},
journal = {Mach. Learn.},
keywords = {dblp},
number = {1-3},
pages = {33--57},
timestamp = {2020-03-03T11:49:59.000+0100},
title = {Linear Least-Squares Algorithms for Temporal Difference Learning.},
url = {http://dblp.uni-trier.de/db/journals/ml/ml22.html#BradtkeB96},
volume = 22,
year = 1996
}
@phdthesis{lin:phd,
added-at = {2008-03-11T14:52:34.000+0100},
address = {Pittsburgh},
author = {Lin, Long-Ji},
biburl = {https://www.bibsonomy.org/bibtex/215f83604aa0fe71e484b319a4bf434a4/idsia},
citeulike-article-id = {2380251},
interhash = {b312cef919452127612baf1fe7ac3382},
intrahash = {15f83604aa0fe71e484b319a4bf434a4},
keywords = {inaki},
priority = {2},
school = {Carnegie Mellon University},
timestamp = {2008-03-11T14:56:22.000+0100},
title = {Reinforcement Learning for Robots Using Neural Networks},
year = 1993
}
@inproceedings{Baird:95,
added-at = {2008-03-11T14:52:34.000+0100},
author = {Baird, Leemon C.},
biburl = {https://www.bibsonomy.org/bibtex/2f421da3046f64fb46524e23fa82bc9e6/idsia},
booktitle = {International Conference on Machine Learning},
citeulike-article-id = {2374989},
interhash = {cab67e1db86772844efd1e5d94731806},
intrahash = {f421da3046f64fb46524e23fa82bc9e6},
keywords = {imported},
pages = {30--37},
priority = {2},
timestamp = {2008-03-11T15:05:44.000+0100},
title = {Residual Algorithms: Reinforcement Learning with Function Approximation},
url = {http://citeseer.ist.psu.edu/baird95residual.html},
year = 1995
}
@inproceedings{sutton2008,
abstract = {We introduce the first temporal-difference learning algorithm that is stable with linear function approximation and off-policy training, for any finite Markov decision process, behavior policy, and target policy, and whose complexity scales linearly in the number of parameters. We consider an i.i.d. policy-evaluation setting in which the data need not come from on-policy experience. The gradient temporal-difference (GTD) algorithm estimates the expected update vector of the TD(0) algorithm and performs stochastic gradient descent on its L2 norm. We prove that this algorithm is stable and convergent under the usual stochastic approximation conditions to the same least-squares solution as found by the LSTD, but without LSTD's quadratic computational complexity. GTD is online and incremental, and does not involve multiplying by products of likelihood ratios as in importance-sampling methods.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Sutton, R.S. and Szepesv{\'a}ri, {Cs}. and Maei, H.R.},
bibsource = {DBLP, http://dblp.uni-trier.de},
biburl = {https://www.bibsonomy.org/bibtex/2ae2eb05437d68f9027da6658faaba91e/csaba},
booktitle = {NIPS},
crossref = {NIPS21},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:50:58 -0700},
ee = {http://books.nips.cc/papers/files/nips21/NIPS2008_0421.pdf},
interhash = {2cd17553b0213961edd86f4ea585fb67},
intrahash = {ae2eb05437d68f9027da6658faaba91e},
keywords = {GTD algorithm, approximation, function gradient learning, online prediction, reinforcement stochastic theory,},
pages = {1609--1616},
pdf = {papers/gtdnips08.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {A Convergent {O}(n) Algorithm for Off-policy Temporal-difference Learning with Linear Function Approximation},
year = 2008
}
@inproceedings{sutton2009,
abstract = {Sutton, Szepesvari and Maei (2009) recently introduced the first temporal-difference learning algorithm compatible with both linear function approximation and off-policy training, and whose complexity scales only linearly in the size of the function approximator. Although their gradient temporal difference (GTD) algorithm converges reliably, it can be very slow compared to conventional linear TD (on on-policy problems where TD is convergent), calling into question its practical utility. In this paper we introduce two new related algorithms with better convergence rates. The first algorithm, GTD2, is derived and proved convergent just as GTD was, but uses a different objective function and converges significantly faster (but still not as fast as conventional TD). The second new algorithm, linear TD with gradient correction, or TDC, uses the same update rule as conventional TD except for an additional term which is initially zero. In our experiments on small test problems and in a Computer Go application with a million features, the learning rate of this algorithm was comparable to that of conventional TD. This algorithm appears to extend linear TD to off-policy learning with no penalty in performance while only doubling computational requirements.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\'a}ri, {Cs}. and Wiewiora, E.},
biburl = {https://www.bibsonomy.org/bibtex/22160b897b778769d8e85de83c78cdf82/csaba},
booktitle = {ICML},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:50:04 -0700},
interhash = {8b02feb5f5e92775fec53a849ed924ac},
intrahash = {2160b897b778769d8e85de83c78cdf82},
keywords = {GTD2, TDC algorithm, approximation, function gradient learning, online prediction, reinforcement stochastic theory,},
pages = {993--1000},
pdf = {papers/GTD-ICML09.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation},
year = 2009
}
@article{Williams:92,
added-at = {2008-02-26T11:58:58.000+0100},
author = {Williams, R. J.},
biburl = {https://www.bibsonomy.org/bibtex/294224c3e53bfe80ade7218b3a0283465/schaul},
citeulike-article-id = {2374762},
description = {idsia},
interhash = {b90d65a735ae02a940f5075b0fd7ebe7},
intrahash = {94224c3e53bfe80ade7218b3a0283465},
journal = {Machine Learning},
keywords = {daanbib},
pages = {229--256},
priority = {2},
timestamp = {2008-02-26T12:07:02.000+0100},
title = {Simple statistical gradient-following algorithms for connectionist reinforcement learning},
volume = 8,
year = 1992
}
@article{journals/corr/abs-1205-4839,
added-at = {2018-08-13T00:00:00.000+0200},
author = {Degris, Thomas and White, Martha and Sutton, Richard S.},
biburl = {https://www.bibsonomy.org/bibtex/2df898713eb27437f8cff1b3f1a617b0b/dblp},
ee = {http://arxiv.org/abs/1205.4839},
interhash = {69c9fdc4b9ee04a525f86765ecc7e6c9},
intrahash = {df898713eb27437f8cff1b3f1a617b0b},
journal = {CoRR},
keywords = {dblp},
timestamp = {2018-08-14T13:25:20.000+0200},
title = {Off-Policy Actor-Critic},
url = {http://dblp.uni-trier.de/db/journals/corr/corr1205.html#abs-1205-4839},
volume = {abs/1205.4839},
year = 2012
}
@article{gittins1979bandit,
title = {Bandit processes and dynamic allocation indices},
author = {Gittins, John C.},
journal = {Journal of the Royal Statistical Society. Series B (Methodological)},
pages = {148--177},
year = {1979},
publisher = {JSTOR}
}
@book{shreve03,
added-at = {2016-10-24T06:15:22.000+0200},
address = {New York, NY},
author = {Shreve, Steven E.},
biburl = {https://www.bibsonomy.org/bibtex/2944e8f60998f1040c77a69b14859f82c/ytyoun},
interhash = {319dab566840b3c2ecb14597455de385},
intrahash = {944e8f60998f1040c77a69b14859f82c},
isbn = {0387401008 9780387401003},
keywords = {finance textbook},
publisher = {Springer-Verlag},
refid = {874753793},
timestamp = {2016-10-24T06:15:22.000+0200},
title = {Stochastic Calculus for Finance {I}: The Binomial Asset Pricing Model},
year = 2003
}
@book{shreve04,
added-at = {2016-10-24T06:18:32.000+0200},
address = {New York},
author = {Shreve, Steven E.},
biburl = {https://www.bibsonomy.org/bibtex/2d3151ac3eeae9b3c69ab17da9857c044/ytyoun},
interhash = {51bca5ed5a433fbb7b106adc500b0699},
intrahash = {d3151ac3eeae9b3c69ab17da9857c044},
isbn = {9780387401003 0387401008 9780387401010 0387401016 9780387249681 0387249680},
keywords = {finance textbook},
publisher = {Springer},
refid = {884516378},
timestamp = {2016-10-24T06:18:32.000+0200},
title = {Stochastic Calculus for Finance {II}: Continuous-Time Models},