Fix typo in policy improvement section
jekyllstein committed Jan 23, 2024
1 parent d020f1b commit a73f093
Showing 2 changed files with 10 additions and 21 deletions.
10 changes: 5 additions & 5 deletions Chapter-4/Chapter_4_Dynamic_Programming_notebook.jl
@@ -1,5 +1,5 @@
### A Pluto.jl notebook ###
-# v0.19.29
+# v0.19.36

using Markdown
using InteractiveUtils
@@ -832,9 +832,9 @@ $$q_\pi(s, \pi ^\prime(s)) \geq v_\pi(s) \tag{4.7}$$
If this is true for all $s \in \mathcal{S}$ then the policy $\pi^\prime$ must be as good as or better than $\pi$, meaning it has a greater or equal expected return at every state:
-$v_{\pi^\prime} \geq v_\pi(s) \tag{4.8}$
+$v_{\pi^\prime}(s) \geq v_\pi(s) \tag{4.8}$
-Starting wiht $\pi$ consider a new policy that chooses action $a$ at state $s$ instead of the usual action: $\pi^\prime(s) = a \neq \pi(s)$. If $q_\pi(s, a) > v_\pi(s)$, then this new policy is better than $\pi$ since $v_{\pi^\prime}(s) \geq q_\pi(s, a) > v_\pi(s)$. This relationship is shown in the proof of the policy improvement theorem which relies upon expanding out the expression for $q_\pi$ and repeatedly applying the inequality (4.7).
+Starting with $\pi$ consider a new policy that chooses action $a$ at state $s$ instead of the usual action: $\pi^\prime(s) = a \neq \pi(s)$. If $q_\pi(s, a) > v_\pi(s)$, then this new policy is better than $\pi$ since $v_{\pi^\prime}(s) \geq q_\pi(s, a) > v_\pi(s)$. This relationship is shown in the proof of the policy improvement theorem which relies upon expanding out the expression for $q_\pi$ and repeatedly applying the inequality (4.7).
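For reference, the expansion alluded to here follows the standard argument; the sketch below is an editorial reconstruction for convenience, not text from the notebook:
$$\begin{aligned}
v_\pi(s) &\leq q_\pi(s, \pi^\prime(s)) \\
&= \mathbb{E}\left[R_{t+1} + \gamma v_\pi(S_{t+1}) \mid S_t = s, A_t = \pi^\prime(s)\right] \\
&\leq \mathbb{E}_{\pi^\prime}\left[R_{t+1} + \gamma q_\pi(S_{t+1}, \pi^\prime(S_{t+1})) \mid S_t = s\right] \\
&= \mathbb{E}_{\pi^\prime}\left[R_{t+1} + \gamma R_{t+2} + \gamma^2 v_\pi(S_{t+2}) \mid S_t = s\right] \\
&\;\;\vdots \\
&\leq \mathbb{E}_{\pi^\prime}\left[R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots \mid S_t = s\right] = v_{\pi^\prime}(s)
\end{aligned}$$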
One way of creating such a policy is using the action-value function of the original policy:
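The concrete formula that follows this sentence in the notebook is not shown in this hunk; purely as an illustration of the idea, a minimal Julia sketch of greedy improvement from a tabular action-value estimate might look like the following (the array `q` and the function name are hypothetical, not the notebook's own API):

```julia
# Greedy policy improvement from a tabular action-value estimate.
# `q` is assumed to be a matrix with one row per state and one column per
# action; the improved policy takes the highest-valued action in each
# state, i.e. π′(s) = argmax_a q(s, a).
function greedy_policy(q::AbstractMatrix)
    [argmax(view(q, s, :)) for s in axes(q, 1)]
end

q = [0.0 1.0; 2.0 0.5]   # two states, two actions
greedy_policy(q)         # -> [2, 1]
```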
@@ -1863,7 +1863,7 @@ PlutoUI = "~0.7.52"
PLUTO_MANIFEST_TOML_CONTENTS = """
# This file is machine-generated - editing it directly is not advised
-julia_version = "1.10.0-rc1"
+julia_version = "1.10.0"
manifest_format = "2.0"
project_hash = "f98b95d413b2ec16c9a6d0f700b23b16b273630e"
@@ -2190,7 +2190,7 @@ uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.10.0"
[[deps.SuiteSparse_jll]]
-deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
+deps = ["Artifacts", "Libdl", "libblastrampoline_jll"]
uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
version = "7.2.1+1"
21 changes: 5 additions & 16 deletions Chapter-5/Chapter_5_Monte_Carlo_Methods.jl
@@ -910,18 +910,6 @@ estimate_blackjack_state(blackjack_state1, π_rand_blackjack, 5_000_000) |> v ->
# ╔═╡ 43f9b824-fd56-4d56-84c3-23e2a3a37178
monte_carlo_pred_V(π_blackjack1, π_rand_blackjack, blackjack_mdp, 1.0f0; num_episodes = 100000, historystateindex = blackjack_stateindex1, vinit = 0.0f0, override_state_init = true, samplemethod=Ordinary())[3] |> mean #mean weight is 2 and it stays this way as the number of episodes increases so the returns that are seen by the behavior are roughly doubled and counted with the 0 returns
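The observation in the comment above about the mean weight being 2 follows directly from the importance-sampling ratio: with a deterministic target policy and a behavior policy that picks each of blackjack's two actions with probability 0.5, every step contributes a ratio of either 1/0.5 = 2 or 0. A small self-contained sketch (the names and the one-step simplification are assumptions, not the notebook's code):

```julia
using Statistics

# Per-step importance-sampling ratio π(a|s)/b(a|s) for a deterministic target
# policy against a uniform-random two-action behavior policy: 2 when the
# behavior happens to pick the target action (probability 0.5), else 0.
step_ratio() = rand(Bool) ? 2.0 : 0.0

# For a one-step episode the episode weight equals the single step ratio,
# so the surviving (nonzero) weights are all 2 while about half are 0.
weights = [step_ratio() for _ in 1:100_000]
mean(filter(!iszero, weights))   # ≈ 2.0
mean(weights .== 0)              # ≈ 0.5
```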

-# ╔═╡ 496d4ae7-10b2-466d-a391-7cd56635691b
-0.277^2
-
-# ╔═╡ a091e411-64b4-4899-9ff5-fba56228d6ec
-(1+.277)^2
-
-# ╔═╡ 5a89cbff-07d2-4fe6-91f9-352b817130b5
-0.077*.72 + .28*.52
-
-# ╔═╡ 46952a9f-60ba-4c7c-be2c-8d1e27d96439
-(-2+.277)^2
-
# ╔═╡ 00cd2194-af13-415a-b725-bb34832e5d9a
function figure5_3(;n = 100, vinit = 0.0f0, targetstart = true, episodes = 10_000, title = "", caption = "Figure 5.3")
#note that with targetstart = true, the episode will always begin with the action selected by the target policy. Since we only care about the value estimate for this state, we only need the q value for actions taken by the target policy. This way, every episode will produce relevant sample updates
@@ -1230,6 +1218,7 @@ monte_carlo_pred_Q(π_blackjack1, π_rand_blackjack, blackjack_mdp, 1.0f0; num_e
[monte_carlo_pred_Q(π_blackjack1, π_rand_blackjack, blackjack_mdp, 1.0f0; num_episodes = 1, historystateindex = blackjack_stateindex1, qinit = 0.0f0, override_state_init = true, samplemethod=Ordinary(), use_target_initial_action = true)[3][1:1] |> sum for i in 1:1000] |> v -> mean(filter(x -> x != 0, v))

# ╔═╡ b6eac49e-6742-4594-87a5-821437846b0d
+#using ordinary importance sampling calculates statistics on weights observed after n episodes of training including what percentage of weights are 0, 1, 2 etc...
function test(n)
[monte_carlo_pred_Q(π_blackjack1, π_rand_blackjack, blackjack_mdp, 1.0f0; num_episodes = n, historystateindex = blackjack_stateindex1, qinit = 0.0f0, override_state_init = true, samplemethod=Ordinary(), use_target_initial_action = true)[3][1:n] |> v -> n - sum(v .== 0) for _ in 1:10000] |> v -> (mean(v), median(v), mode(v), mean(v .== 0), mean(v .== 1), mean(v .== 2))
end
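As a hypothetical usage note (assuming the notebook's monte_carlo_pred_Q, the blackjack policies, and blackjack_mdp are already defined in the session), a call such as `test(10)` would return the mean, median, and mode of the nonzero-weight episode counts over 10,000 repetitions, together with the fractions of repetitions in which 0, 1, or 2 episode weights survived.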
@@ -1878,6 +1867,9 @@ end
# ╔═╡ aa647f3e-a4b2-4825-bb2a-1c2469d2c0eb
sampleracepolicy(create_policy_function(πstar2_racetrack2, track2_mdp); track = track2, trackname = "Track 2", policyname = "Monte Carlo Exploring Starts")

+# ╔═╡ 780f2fd9-49c3-48bc-b790-dde5be1dc81b
+track1_mdp.states
+
# ╔═╡ ffd1e39d-e66b-40a3-8ad1-c9480d371fe8
md"""
##### Results Summary
@@ -2562,10 +2554,6 @@ version = "17.4.0+2"
# ╠═5b32bbc7-dc34-4fc3-bacc-92e55f26a98c
# ╠═b6eac49e-6742-4594-87a5-821437846b0d
# ╠═847a074c-1d23-4de3-a039-322aeb9f7613
-# ╠═496d4ae7-10b2-466d-a391-7cd56635691b
-# ╠═a091e411-64b4-4899-9ff5-fba56228d6ec
-# ╠═5a89cbff-07d2-4fe6-91f9-352b817130b5
-# ╠═46952a9f-60ba-4c7c-be2c-8d1e27d96439
# ╠═00cd2194-af13-415a-b725-bb34832e5d9a
# ╟─9ca72278-fff6-4b0f-b72c-e0d3768aff73
# ╟─e10378eb-12b3-4468-9c22-1838107da450
@@ -2661,6 +2649,7 @@ version = "17.4.0+2"
# ╟─edb4ad06-5e6e-48c4-9ee4-ae0b92927a91
# ╟─4481fa33-1ff3-4778-8486-d4d8a15775cd
# ╟─aa647f3e-a4b2-4825-bb2a-1c2469d2c0eb
+# ╠═780f2fd9-49c3-48bc-b790-dde5be1dc81b
# ╟─ffd1e39d-e66b-40a3-8ad1-c9480d371fe8
# ╟─f79d97bb-341a-46ad-bdfc-d080af13e2df
# ╟─6d6e8916-9b36-4af1-b77d-86cb6a416f88
