
Commit

Deploying to gh-pages from @ 90bed58 🚀
jekyllstein committed Oct 17, 2024
0 parents commit 76afbd8
Showing 78 changed files with 60,917 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
settings.json
*Manifest.toml
.DS_Store
*nvlib_julia.conf
*.ptx
*backup [0-9].jl
*.bin
40 changes: 40 additions & 0 deletions ApproximationUtils.jl/Project.toml
@@ -0,0 +1,40 @@
name = "ApproximationUtils"
uuid = "d30c7613-7be3-47bd-9ebc-e60dbcd590b6"
authors = ["Jason Eckstein <jekyllstein@gmail.com>"]
version = "0.1.0"

[deps]
FCANN = "8c01af00-eda8-11e9-3d27-59b8c7aa8fdd"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
NVIDIALibraries = "455bb3cc-869f-42f8-aad9-fcdd684af481"
PlutoDevMacros = "a0499f29-c39b-4c5c-807c-88074221b949"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
TabularRL = "70984187-0053-49a1-8db7-ddab96a995ff"
TailRec = "f6209947-f0a9-4a6b-8802-d6ceac74b6f9"
Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"

[sources]
TabularRL = {path = "../TabularRL.jl"}
TailRec = {url = "https://github.com/TakekazuKATO/TailRec.jl"}
NVIDIALibraries = {url = "https://github.com/Blackbody-Research/NVIDIALibraries.jl"}
FCANN = {url = "https://github.com/Blackbody-Research/FCANN.jl"}

[compat]
FCANN = "1.11.0"
InteractiveUtils = "1.11.0"
LinearAlgebra = "1.11.0"
Markdown = "1.11.0"
NVIDIALibraries = "1.0.0"
PlutoDevMacros = "0.9.0"
PrecompileTools = "1.2.1"
Reexport = "1.2.2"
SparseArrays = "1.11.0"
Statistics = "1.11.1"
TailRec = "0.2.0"
Transducers = "0.4.84"
julia = "1.11"
7 changes: 7 additions & 0 deletions ApproximationUtils.jl/src/ApproximationUtils.jl
@@ -0,0 +1,7 @@
module ApproximationUtils

using Reexport, PrecompileTools

@reexport using TabularRL, FCANN, NVIDIALibraries, SparseArrays, LinearAlgebra, TailRec, Transducers, Statistics

end # module ApproximationUtils
20 changes: 20 additions & 0 deletions Chapter-1/Chapter_1_Introduction_notebook.html
@@ -0,0 +1,20 @@
<!DOCTYPE html><html lang="en"><head><meta name="viewport" content="width=device-width"><meta charset="utf-8"><meta property='og:type' content='article'>

<meta name="pluto-insertion-spot-meta">
<meta name="theme-color" media="(prefers-color-scheme: light)" content="white"><meta name="theme-color" media="(prefers-color-scheme: dark)" content="#2a2928"><meta name="color-scheme" content="light dark"><link rel="icon" type="image/png" sizes="16x16" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/favicon-16x16.347d2855.png" integrity="sha384-3qsGeVLdddzV9oIkj3PhXXQX2CZCjOD/CiyrPQOX6InOWw3HAHClrsQhPfX9uRAj" crossorigin="anonymous"><link rel="icon" type="image/png" sizes="32x32" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/favicon-32x32.8789add4.png" integrity="sha384-cOe5vSoBIgKNgkUL27p9RpsGVY0uBg9PejLccDy+fR8ZD1Iv5dF1MGHjIZAIZwm6" crossorigin="anonymous"><link rel="icon" type="image/png" sizes="96x96" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/favicon-96x96.48689391.png" integrity="sha384-TN49cYb8GyNmrZT14bsYXXo4l1x1NJeJ/EHuVAauAKsNPopPHLojijs9jFT4Vs8c" crossorigin="anonymous"><link rel="pluto-logo-big" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/logo.004c1d7c.svg" integrity="sha384-GkQkODcGxsrSRJCkeakBXihum0GUM44cwBgKyutDimectXCbCgj6Vu3jlrueqEcN" crossorigin="anonymous"><link rel="pluto-logo-small" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/favicon_unsaturated.d1387b25.svg" integrity="sha384-omwjH+Qy3hpAVf5FYd/pkaDBuVAfsEDRN7eBxEA8Ek00OAWP+aiV+GpEYk3I7lyo" crossorigin="anonymous"><script type="module" src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.7330d793.js" integrity="sha384-+mLMSKQxWEYKJeUt5VTdKTDfzHvui0mdMSd+iIQKYybm+6crs+6FeCr73c8yxir6" crossorigin="anonymous"></script><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.c9b6b472.css" integrity="sha384-/r++eFqY+MX24zOPLVQ1SEXsNKaMgaiC42LUbooLnc1+zar5i0Ih+sKH5dM93WL4" crossorigin="anonymous"><script defer="">console.log("Pluto.jl, by Fons van der Plas (https://github.com/fonsp), Mikołaj Bochenski (https://github.com/malyvsen), Michiel Dral (https://github.com/dralletje) and friends 🌈");</script><script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.b8733d72.js" defer="" integrity="sha384-84yPd6AGZ/1IUiaBlssipmMKMFz9WGFQ+u8vYZ9cWicH6bZm7ZOej+kLDXnIIAQJ" crossorigin="anonymous"></script><script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.9f9dc874.js" defer="" integrity="sha384-tkFo1EK72I9JvoTmHFa199dfRzW8mkXPUkHb/N7UhYI+bxKzX3Kh8LNCZz1ltsFF" crossorigin="anonymous"></script><script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.90ede145.js" defer="" integrity="sha384-CuNU9gQg6fa/yynNqNWjHWzPm4nj+d7O6+HXsNGSqClhs/bYQIbBC3Lw/kh8Ukui" crossorigin="anonymous"></script><script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.dbeed08a.js" defer="" integrity="sha384-1BEdQwXfZi4ZpsNV8w1X8pQcVK1/DS/+/M8OTo3gol7mdEspSN7nT6llX57NQCSt" crossorigin="anonymous"></script><script id="iframe-resizer-content-window-script" src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.6386bd9d.js" crossorigin="anonymous" defer="" integrity="sha384-tgN2a0VDi/lCYwZuDqT7L+A/Y/9kpxf3HV7zv2BJ5Fu7zW0EClq0nM4crfK3TRPs"></script><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.1a986d5f.css" type="text/css" integrity="sha384-biEV7R+dtBt8r/kVXCVPv0QFmmMMFBF9n6MxBxScN5PULdIdz+5W/YaKFE7GFyJn" crossorigin="anonymous"><link 
rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.317d32de.css" type="text/css" media="all" data-pluto-file="hide-ui" integrity="sha384-rM7rRGvRYP65Tiqkdta+WSApQBfZCqeSEF7JwMX/lSAQUubDKjBejLjGlQBVyphe" crossorigin="anonymous"><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.d0a5b1f0.css" type="text/css" integrity="sha384-oUdA9RJhs9IlGgJOs6m3tNmyOqOLTPOfpCXeXLUex2W5KOLfSAdyT5HoVuwUEFDQ" crossorigin="anonymous"><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.e2e3dd3d.css" type="text/css" integrity="sha384-rFNNfBgG448S4mC8A/rtDd6eRIjB04OhJ640kkIF/t55EWPrv2ZT42x9lamXEFpR" crossorigin="anonymous"><link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.09b09a3f.css" type="text/css" integrity="sha384-dHB2VzrvTc7+CLgp62sndIQSbzeitJhO8vZnxV2zNlO4GHz83BZPqsY+0nTAF7WO" crossorigin="anonymous"><script data-pluto-file="launch-parameters">
window.pluto_notebook_id = undefined;
window.pluto_isolated_cell_ids = undefined;
window.pluto_notebookfile = "Chapter_1_Introduction_notebook.jl";
window.pluto_disable_ui = true;
window.pluto_slider_server_url = undefined;
window.pluto_binder_url = "https://mybinder.org/v2/gh/fonsp/pluto-on-binder/v0.19.47";
window.pluto_statefile = "Chapter_1_Introduction_notebook.plutostate";
window.pluto_preamble_html = undefined;
</script>

<meta name="pluto-insertion-spot-parameters">
<script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.e5e13b39.js" type="module" defer="" integrity="sha384-aSQciUMYA0alIWQ4WkNgRf/hEKn/leIeBB/mSeGjvKDSc2DFz+jKgaIDKLhAMPtc" crossorigin="anonymous"></script><script src="https://cdn.jsdelivr.net/gh/fonsp/Pluto.jl@0.19.47/frontend-dist/editor.8a3292da.js" integrity="sha384-itp4oE2PRbSrrTHVpWh8sqAuVUsz7ja6L2Dgp/JRfMCD2AwVdTk56K96POF3oLmu" crossorigin="anonymous"></script><script type="text/javascript" id="MathJax-script" integrity="sha384-4kE/rQ11E8xT9QgrCBTyvenkuPfQo8rXYQvJZuMgxyPOoUfpatjQPlgdv6V5yhUK" crossorigin="" not-the-src-yet="https://cdn.jsdelivr.net/npm/mathjax@3.2.2/es5/tex-svg-full.js" async=""></script>
<link rel="preload" as="fetch" href="Chapter_1_Introduction_notebook.plutostate" crossorigin>

<meta name="pluto-insertion-spot-preload">
</head><body class="loading no-MαθJax"> <div style="display:flex;min-height:100vh;"> <pluto-editor class="fullscreen"> <progress style="filter:grayscale(1)" class="delete-me-when-live statefile-fetch-progress" max="100"></progress> </pluto-editor> </div> </body></html>
47 changes: 47 additions & 0 deletions Chapter-1/Chapter_1_Introduction_notebook.jl
@@ -0,0 +1,47 @@
### A Pluto.jl notebook ###
# v0.19.36

using Markdown
using InteractiveUtils

# ╔═╡ f1fb61f6-4066-11ee-3c81-790e7afab21b
md"""
> *Exercise 1.1: Self-Play* Suppose, instead of playing against a random opponent, the reinforcement learning algorithm described above played against itself, with both sides learning. What do you think would happen in this case? Would it learn a different policy for selecting moves?
With self-play learning, the value function would include a value for every game state rather than just the states reached after X's moves. Let us assume that the value represents the value for the X player. In that case the policy that the O player should follow would be to select the move that leads to the lowest-valued state transition rather than the highest for the X player. If we use temporal difference learning, sampling moves from this policy, then eventually it will converge to the value function for the minimax solution in which both sides exhibit optimal play. The exploration parameter in the training process must be large enough that the entire state space is explored; otherwise there will not be accurate value estimates for certain states. This policy would never lose to a suboptimal opponent, but there might be cases where it considers a number of candidate moves identical because they all should lead to a draw. A policy trained against an imperfect opponent might instead favor a particular move that is no better under perfect play but has a chance of leading to a win only against that opponent. For example, in perfect play no starting move is preferable over another because any starting move results in a draw. But against an opponent that plays randomly, certain moves, such as those in the corners, lead to more game states with winning paths if the opponent makes mistakes. A minimal code sketch of such a self-play update follows below.
"""

# ╔═╡ dcbd0ceb-4d6c-4045-bbd1-ac51ffe232f9
md"""
> *Exercise 1.2: Symmetries* Many tic-tac-toe positions appear different but are really the same because of symmetries. How might we amend the learning process described above to take advantage of this? In what ways would this change improve the learning process? Now think again. Suppose the opponent did not take advantage of symmetries. In that case, should we? Is it true, then, that symmetrically equivalent positions should necessarily have the same value?
The value function table can be modified so that symmetrically equivalent positions are mapped to the same value rather than storing a separate value for each position. When evaluating candidate moves that result in symmetrically equivalent positions, we might pick randomly among them if they all share the highest value estimate. If we play an opponent that does not treat equivalent positions the same, then we should not exploit the symmetries in our value function, because the opponent's behavior makes those positions genuinely inequivalent with respect to our chances of winning. A minimal sketch of the symmetry mapping follows below.
"""

# ╔═╡ 937eaf38-b2fa-459f-a8ee-8857c294a823
md"""
> *Exercise 1.3: Greedy Play* Suppose the reinforcement learning player was greedy, that is, it always played the move that brought it to the position that it rated the best. Might it learn to play better, or worse, than a nongreedy player? What problems might occur?
When we begin the learning process, the value function is an inaccurate estimate that is continually updated. The first move or set of moves that leads to a winning outcome will be followed by the greedy policy unless future sampling reveals that it in fact leads to losing outcomes. In either case the greedy policy will converge to playing the first set of moves that is, on average, marginally better than the average outcome. This policy is not necessarily the optimal one, and it will not have accurate value estimates for states that are never explored. A non-greedy player will explore some additional moves that may not have the highest value estimate at the moment; however, after the value function is updated there is a greater chance of finding a move with a higher value than what was previously considered optimal. The greedy player can also get stuck in a bad policy because it has inaccurate value estimates for the other states. Unless a player is allowed to explore the entire state space, it is not guaranteed to find the optimal policy. A minimal sketch contrasting the two selection rules follows below.
"""

# ╔═╡ 29760a2e-d509-4d0e-8331-cd5de8723321
md"""
> *Exercise 1.4: Learning from Exploration* Suppose learning updates occurred after all moves, including exploratory moves. If the step-size parameter is appropriately reduced over time (but not the tendency to explore), then the state values would converge to a different set of probabilities. What (conceptually) are the two sets of probabilities computed when we do, and when we do not, learn from exploratory moves? Assuming that we do continue to make exploratory moves, which set of probabilities might be better to learn? Which would result in more wins?
The two sets of probabilities represent two different value functions. The original method, which does not update after exploratory moves, estimates the values of the greedy policy, that is, the policy that only considers moves that maximize the value estimate of the state transition. If we also update from exploratory moves, then we are calculating the value estimate for a policy that takes exploratory moves with some probability. The policy optimization is then occurring under the constraint that random moves are sometimes made, so the algorithm will find the best policy under that restriction. If we call the probability of making a random exploratory move $\epsilon$, then this type of optimal policy can be called the $\epsilon$-greedy policy, since it makes greedy moves a $1 - \epsilon$ fraction of the time. The formal definition of this type of policy appears in subsequent chapters. Since this policy is restricted, it will result in a worse final policy with fewer wins than the policy that ignores exploratory moves for learning updates. Also, if we plan to eventually follow the greedy policy with respect to the value function when we deploy the agent, the value function will no longer accurately represent the policy being followed, as it will have failed to converge to the optimal greedy policy (converging instead to the optimal $\epsilon$-greedy policy). If, however, we consider an agent that continues to make exploratory moves after being deployed, then updating after exploratory moves will lead to a more accurate value function for that agent's policy. It should lead to more wins than the original method because it accounts for a certain degree of random moves in the future. A minimal sketch of the two update regimes follows below.
"""

# ╔═╡ 20d261de-4121-4b81-8ead-f3022f8efedc
md"""
> *Exercise 1.5: Other Improvements* Can you think of other ways to improve the reinforcement learning player? Can you think of any better way to solve the tic-tac-toe problem as posed?
Since the game is very simple, rather than estimating the value function and updating it with one forward step, perhaps we could see what happens to the value function multiple steps into the future, considering every possible response from our opponent. That way we could explore more of the action space even if the opponent does not actually play those moves. Even if we ignore all the value function estimation techniques, we can do this type of exhaustive search to evaluate every starting position and its subsequent positions to see which states lead to wins, losses, and draws. This approach would simply simulate all the game states and derive a policy from every possible outcome of each state. For a game this simple that approach is feasible, but for games with many more states such an exhaustive search is intractable. Also, assigning different rewards to draws and losses would help an agent distinguish between the two outcomes. It may not increase the win rate, but it could avoid losses in cases where a draw is possible. A minimal sketch of such an exhaustive search follows below.
"""

# ╔═╡ Cell order:
# ╟─f1fb61f6-4066-11ee-3c81-790e7afab21b
# ╟─dcbd0ceb-4d6c-4045-bbd1-ac51ffe232f9
# ╟─937eaf38-b2fa-459f-a8ee-8857c294a823
# ╟─29760a2e-d509-4d0e-8331-cd5de8723321
# ╟─20d261de-4121-4b81-8ead-f3022f8efedc
Binary file added Chapter-1/Chapter_1_Introduction_notebook.pdf
Binary file not shown.
