diff --git a/examples/xrl/README.md b/examples/xrl/README.md
deleted file mode 100644
index 987a90e8..00000000
--- a/examples/xrl/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-XRL (work in progress)
-
-XRL contains a set of algorithms aiming to solve extremely large reinforcement learning
-problems. The problems addressed include Markov decision problems in both the model-based and
-model-free settings. The algorithms are compactly grouped in a Python class which has various
-functions, including training functions, plotting functions, and some auxiliary functions. The
-implementation includes policy gradient methods as well as gradient-based methods for the
-primal-dual formulation. Some functionalities are still under testing.
-
-If you encounter any problems, please contact lihaoya@stanford.edu
\ No newline at end of file
diff --git a/examples/xrl/src/check.py b/examples/xrl/src/check.py
deleted file mode 100644
index aa57f408..00000000
--- a/examples/xrl/src/check.py
+++ /dev/null
@@ -1,466 +0,0 @@
-import numpy as np
-import random
-import scipy.sparse.linalg as sla
-import scipy.optimize
-import matplotlib.pyplot as plt
-from scipy.special import softmax
-from scipy.special import xlogy
-from scipy.special import logsumexp
-
-# import pickle
-
-plt.rcParams.update({"font.size": 16})
-
-
-class PolicyGradientCheck:
-    def __init__(
-        self, tp=0, his=False, k=None, kernel_type="toy1", Na=50, Ns=200, gam=0.99, eps=1e-12
-    ):
-        if kernel_type == "toy3":
-            Na = 2
-        if kernel_type == "toy4":
-            Na = 2
-        if kernel_type == "toy5":
-            Na = 2
-        etp = ("KL", "rKL", "alpha")
-        self.kernel_type = kernel_type
-        self.tau = 1
-        self.Na = Na
-        self.Ns = Ns
-        self.gam = gam
-        self.eps = eps
-        self.k = k if k else Ns // 10 + 1
-        self.P = np.zeros([self.Na, self.Ns, self.Ns])
-        self.alpha = None
-        self.result = None
-        self.ref_result = None
-        self.resultZ = np.zeros([self.Ns, self.Na])
-        self.train_from_history = his
-        self.err = [[], [], {}]
-        self.err_exact = []
-        self.V_exact = None
-        self.pi_exact = None
-
-        if self.kernel_type == "toy1":
-            idr = list(range(self.Ns))
-            for i in range(self.Na):
-                for j in range(self.Ns):
-                    idx = random.sample(idr, self.k)
-                    self.P[i, j, idx] = 1 / self.k
-            Us = np.random.rand(Ns)
-            Usa = np.random.rand(Ns, Na)
-            self.r = np.zeros([Ns, Na])
-            for i in range(Ns):
-                for j in range(Na):
-                    self.r[i, j] = Us[i] * Usa[i, j]
-
-        if self.kernel_type == "toy2":
-            P = np.random.uniform(0, 1, (self.Na, self.Ns, self.Ns))
-            sparse = min(self.Ns - 1, int(self.Ns * 0.9))
-            for i in range(self.Na):
-                for j in range(self.Ns):
-                    indices = np.random.choice(np.arange(self.Ns), replace=False, size=sparse)
-                    P[i, j, indices] = 0
-            P = P / P.sum(axis=2, keepdims=True)
-            self.P[:] = P[:]
-            Us = np.random.rand(Ns)
-            Usa = np.random.rand(Ns, Na)
-            self.r = np.zeros([Ns, Na])
-            for i in range(Ns):
-                for j in range(Na):
-                    self.r[i, j] = Us[i] * Usa[i, j]
-
-        if self.kernel_type == "toy3":
-            self.P = np.zeros([self.Na, self.Ns, self.Ns])
-            self.r = np.zeros([Ns, Na])
-            for s in range(self.Ns):
-                self.P[0, s, s] = 1 / 2
-                self.P[0, s, (s + 1) % self.Ns] = 1 / 2
-                self.P[1, s, s] = 1
-                self.r[s, 0] = np.sin(4 * np.pi * s / self.Ns) + 1
-                self.r[s, 1] = np.cos(4 * np.pi * s / self.Ns) + 1
-
-        if self.kernel_type == "toy4":
-            self.P = np.zeros([self.Na, self.Ns, self.Ns])
-            self.r = np.zeros([Ns, Na])
-            self.P[0, 0, 0] = 1
-            self.P[1, 0, 1] = 1
-            for s in range(1, self.Ns - 1):
-                self.P[0, s, s - 1] = 1  # go left
-                self.P[1, s, s + 1] = 1  # go right
-            self.P[0, self.Ns - 1, self.Ns - 1] = 1
-            self.P[1, self.Ns
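The "toy1" branch above builds a random k-sparse row-stochastic kernel P[a, s, :] and a reward that factorizes across states and actions. A minimal self-contained sketch of the same construction (the helper name make_toy1, the fixed seed, and the use of numpy's Generator in place of the random module are assumptions of this sketch, not part of the original code):

import numpy as np

def make_toy1(Ns=200, Na=50, k=21, seed=0):
    """Random k-sparse row-stochastic kernel with a factorized reward."""
    rng = np.random.default_rng(seed)
    P = np.zeros((Na, Ns, Ns))
    for a in range(Na):
        for s in range(Ns):
            idx = rng.choice(Ns, size=k, replace=False)
            P[a, s, idx] = 1.0 / k  # uniform mass on k sampled successors
    # r[s, a] = Us[s] * Usa[s, a], as in the constructor above
    r = rng.random(Ns)[:, None] * rng.random((Ns, Na))
    return P, r

P, r = make_toy1()
assert np.allclose(P.sum(axis=2), 1.0)  # each (a, s) row is a distribution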
- 1, self.Ns - 1] = 1 - self.r[self.Ns - 2, 1] = 1 - - if self.kernel_type == "toy5": - self.P = np.zeros([self.Na, self.Ns, self.Ns]) - self.r = np.zeros([Ns, Na]) - self.P[0, 0, 0] = 1 - self.P[1, 0, 1] = 1 - for s in range(1, self.Ns - 1): - self.P[0, s, s - 1] = 1 # go left - self.P[1, s, s + 1] = 1 # go right - self.P[0, self.Ns - 1, self.Ns - 2] = 1 - self.P[1, self.Ns - 1, self.Ns - 1] = 0 - self.r[self.Ns - 1, 1] = 1 - - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - - def train(self, eta=1, T=20): - if self.etp == "KL": - self.err[0] = [] - if self.etp == "rKL": - self.err[1] = [] - if self.etp == "alpha": - self.err[2][self.alpha] = [] - pi = ( - np.ones([self.Ns, self.Na]) / self.Na - if not self.train_from_history - else np.copy(self.result) - ) - pi_new = np.zeros([self.Ns, self.Na]) - Idt = np.identity(self.Ns) - if self.ref_result is not None: - self.err_exact = [] - for itr in range(T): - if self.ref_result is not None: - self.err_exact.append(np.linalg.norm(self.ref_result - pi, "fro")) - self.err_exact[-1] /= np.linalg.norm(self.ref_result, "fro") - - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = 4 / (1 - self.alpha ** 2) * self.Na ** ((self.alpha - 1) / 2) - H_pi = tmp_h1 - tmp_h2 * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - # print(np.mean(v_pi)) - # print(np.linalg.norm((Idt-gam * P_pi) @ v_pi - r_pi)) - if self.etp == "KL": - pi_new = pi ** (1 - eta) - for a in range(self.Na): - pi_new[:, a] = pi_new[:, a] * np.exp( - eta * (self.r[:, a] - (Idt - self.gam * self.P[a]) @ v_pi) / self.tau - ) - - if self.etp == "rKL": - for a in range(self.Na): - tmp = self.P[a] @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) / pi[s, a] - pi_new[s, a] -= ( - eta * self.Na * (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - ) - - for s in range(self.Ns): - - def equation(x): - return np.sum(1 / (x + pi_new[s, :] + 1e-300)) - 1 - - low = max(self.Na - np.max(pi_new[s, :]), -np.min(pi_new[s, :])) - high = self.Na - np.min(pi_new[s, :]) - cs = low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - # cs = scipy.optimize.newton_krylov(equation, np.array([1]), f_tol=1e-12) - # print(np.abs(equation(cs)) < 1*10**-10) - for a in range(self.Na): - pi_new[s, a] = 1 / (pi_new[s, a] + cs) - # print(s) - # print(np.abs(np.sum(pi_new[s, :])-1) < 1*10**-10) - # print(np.min(pi_new[s, :])) - - if self.etp == "alpha": - for a in range(self.Na): - tmp = self.P[a] @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) * (pi[s, a] ** ((self.alpha - 1) / 2)) - tmp_pi1 = (1 - self.alpha) / 2 * eta * (self.Na ** ((1 - self.alpha) / 2)) - tmp_pi2 = (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - pi_new[s, a] -= tmp_pi1 * tmp_pi2 - - for s in range(self.Ns): - alpha = self.alpha - - def equation(x): - return np.sum((x + pi_new[s, :] + 1e-300) ** (2 / (alpha - 1))) - 1 - - low = max( - self.Na ** ((1 - self.alpha) / 2) - np.max(pi_new[s, :]), - -np.min(pi_new[s, 
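Each iteration of train() above evaluates the current policy exactly: it assembles P_pi and r_pi, adds the regularization term H_pi, and solves the regularized Bellman linear system. A vectorized sketch of that evaluation step for the KL case (evaluate_policy is a hypothetical name; the einsum replaces the explicit double loops in the original):

import numpy as np
from scipy.special import xlogy

def evaluate_policy(P, r, pi, gam, tau):
    """Solve (I - gam * P_pi) v = r_pi - tau * H_pi, KL (negative entropy) case."""
    # P_pi[s, t] = sum_a pi[s, a] * P[a, s, t]
    P_pi = np.einsum("sa,ast->st", pi, P)
    r_pi = np.sum(pi * r, axis=1)
    H_pi = np.sum(xlogy(pi, pi), axis=1)  # sum_a pi log(pi)
    Ns = P.shape[1]
    return np.linalg.solve(np.eye(Ns) - gam * P_pi, r_pi - tau * H_pi)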
:]), - ) - high = self.Na ** ((1 - self.alpha) / 2) - np.min(pi_new[s, :]) - cs = low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - for a in range(self.Na): - pi_new[s, a] = (pi_new[s, a] + cs) ** (2 / (self.alpha - 1)) - - for s in range(self.Ns): - pi_new[s, :] /= sum(pi_new[s, :]) - err = np.linalg.norm(pi_new - pi, "fro") / np.linalg.norm(pi, "fro") - print(err) - if self.etp == "KL": - self.err[0].append(err) - if self.etp == "rKL": - self.err[1].append(err) - if self.etp == "alpha": - self.err[2][self.alpha].append(err) - if err < self.eps: - print(itr) - break - pi[:] = pi_new[:] - self.result = np.copy(pi) - self.ref_result = np.copy(pi) - - def set_tau(self, tau): - self.tau = tau - self.ref_result = None - - def set_alpha(self, alpha): - self.alpha = alpha - - def set_history(self, his): - self.train_from_history = his - if his and self.result is None: - self.result = np.ones([self.Ns, self.Na]) / self.Na - - def clear_history(self): - self.result = self.result = np.ones([self.Ns, self.Na]) / self.Na - - def set_etp(self, tp): - etp = ("KL", "rKL", "alpha") - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - self.ref_result = None - - def get_value(self, pi=None, reg=False): - if self.result is None: - return - if pi is None: - pi = self.result - Idt = np.identity(self.Ns) - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if reg: - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = 4 / (1 - self.alpha ** 2) * self.Na ** ((self.alpha - 1) / 2) - H_pi = tmp_h1 - tmp_h2 * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - return v_pi - - def get_expect_value(self, pi=None): - if self.result is None: - return - if pi is None: - pi = self.result - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - _, evec = sla.eigs(P_pi.T, k=1, which="LM") - d_pi = (evec / evec.sum()).real - d_pi = d_pi.reshape(len(d_pi)) - return np.dot(d_pi, r_pi) - - def make_fig(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - # err0 = self.err[0] - err1 = self.err[1] - err2 = self.err[2][alpha1] - err3 = self.err[2][alpha2] - ax.tick_params(axis="both", labelsize=14) - # ax.plot(err0, label="KL", linewidth=linewidth) - ax.plot(err1, label="reversed KL", linewidth=linewidth) - ax.plot(err2, label="Hellinger", linewidth=linewidth) - ax.plot(err3, label="alpha divergence (alpha=" + str(alpha2) + ")", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact(self, label="KL", linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.log(np.abs(np.log(self.err_exact[:-1]))) - if 
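After the rKL and alpha updates above, each row of the unnormalized policy is renormalized by a scalar shift cs chosen so that the row sums to one; since the left-hand side is monotone in the shift, a bracketing bisection on [low, high] works, with the endpoint checks guarding the degenerate cases. A sketch of the rKL case (normalize_rkl_row is a hypothetical name):

import numpy as np
from scipy.optimize import bisect

def normalize_rkl_row(row, Na):
    """Find shift c with sum_a 1 / (row[a] + c) = 1, as in the rKL branch above."""
    def equation(x):
        return np.sum(1.0 / (x + row + 1e-300)) - 1.0
    low = max(Na - np.max(row), -np.min(row))  # equation(low) >= 0 on this bracket
    high = Na - np.min(row)                    # equation(high) <= 0
    if abs(equation(high)) < 1e-15:
        c = high
    elif abs(equation(low)) < 1e-15:
        c = low
    else:
        c = bisect(equation, low, high)
    return 1.0 / (row + c)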
self.err_exact[-1] == 0 - else np.log(np.abs(np.log(self.err_exact))) - ) - ax.plot(err0, label=label, linewidth=linewidth, marker="+") - ax.plot( - [0, len(err0) - 1], - [0, np.log(2) * (len(err0) - 1)], - label="slope=log2", - color="g", - linestyle="dashed", - linewidth=0.5, - ) - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("loglog error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def train_all(self, alpha1=0, alpha2=-3): - self.set_etp(0) - self.train() - self.set_etp(1) - self.train() - self.set_etp(2) - self.set_alpha(alpha1) - self.train() - self.set_alpha(alpha2) - self.train() - - def value_iteration(self): - V_pre = np.ones(self.Ns) - V = np.zeros(self.Ns) - Q = np.zeros([self.Ns, self.Na]) - cnt = 0 - while np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre) > 10 ** -15: - cnt += 1 - V_pre = V - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - - if self.etp == "KL": - V = logsumexp(Q / self.tau, axis=1) * self.tau - if self.etp == "rKL": - return - if self.etp == "alpha": - return - if cnt % 100 == 0: - print(np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre)) - - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - V - self.V_exact = V - self.pi_exact = softmax(Q / self.tau, axis=1) - print(cnt) - - def train_primal_dual_old(self, CV=0.1, CW=0.1, tol=1e-6, step=0.01): - V = np.zeros(self.Ns) - W = np.ones(self.Ns) - Z0 = np.zeros([self.Ns, self.Na]) - Z = np.ones([self.Ns, self.Na]) / self.Na - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - - while np.max(np.abs(Z - Z0)) > tol: - cnt += 1 - PZ = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - PZ[s, t] = np.dot(Z[s, :], self.P[:, s, t]) - RZ = np.zeros(self.Ns) - for s in range(self.Ns): - RZ[s] = np.dot(Z[s, :], self.r[s, :]) - HZ = np.sum(xlogy(Z, Z), axis=1) - - EV = V - (Idt - self.gam * PZ.T) @ W / CV - EW = (RZ - (Idt - self.gam * PZ) @ V - self.tau * HZ) / CW - W - EZ = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EZ[:, a] = ( - self.r[:, a] - (Idt - self.gam * self.P[a, :, :]) @ V - ) / self.tau - np.log(Z[:, a]) - Z0 = Z - V = V - dt * EV - W = W + dt * EW - Z = Z * np.exp(dt * EZ) - # print(np.min(W)) - for s in range(self.Ns): - Z[s, :] /= sum(Z[s, :]) - if cnt % 10 == 0: - print(cnt) - print(np.max(np.abs(Z - Z0))) - self.resultZ[:] = Z[:] - print(cnt) - print(np.max(np.abs(Z - Z0))) - - def train_primal_dual(self, CV=0.01, CM=0.01, tol=1e-8, step=0.01): - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (V + dt * EV) / (1 + dt) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a]) + 1 - H = (H + dt * EM) / (1 + dt) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - - if cnt % 50 == 0: - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - for s in range(self.Ns): - Z[s, :] = 
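value_iteration above applies the entropy-regularized (soft) Bellman operator for the KL case, V <- tau * logsumexp(Q / tau), with a relative-change stopping rule, and recovers the optimal policy as a softmax of the regularized advantage. A compact sketch (soft_value_iteration is a hypothetical name; tolerance and iteration cap are placeholders):

import numpy as np
from scipy.special import logsumexp, softmax

def soft_value_iteration(P, r, gam, tau, tol=1e-12, max_iter=100_000):
    """Soft value iteration with Q[s, a] = r[s, a] + gam * (P[a] @ V)[s]."""
    Na, Ns, _ = P.shape
    V = np.zeros(Ns)
    for _ in range(max_iter):
        Q = r + gam * np.einsum("ast,t->sa", P, V)
        V_new = tau * logsumexp(Q / tau, axis=1)
        done = np.linalg.norm(V_new - V) <= tol * np.linalg.norm(V_new)
        V = V_new
        if done:
            break
    # soft-greedy policy from the regularized advantage
    Q = r + gam * np.einsum("ast,t->sa", P, V) - V[:, None]
    return V, softmax(Q / tau, axis=1)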
M[s, :] / sum(M[s, :]) - self.resultZ[:] = Z[:] - print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - -p = PolicyGradientCheck(Ns=1000, kernel_type="toy5") -p.set_tau(0.001) - -# with open("model.pkl", "wb") as outp: -# pickle.dump(p, outp, pickle.HIGHEST_PROTOCOL) -# -# p = pickle.load(open("model1.pkl", "rb")) diff --git a/examples/xrl/src/concentration_plot.txt b/examples/xrl/src/concentration_plot.txt deleted file mode 100644 index a4512b7e..00000000 --- a/examples/xrl/src/concentration_plot.txt +++ /dev/null @@ -1,18 +0,0 @@ -fig, (ax1, ax2, ax3) = plt.subplots(3) -im = ax1.imshow(Z_hyb.T) -fig.colorbar(im, ax=ax1) - -im = ax2.imshow(p.resultZ.T) -fig.colorbar(im, ax=ax2) - -im = ax3.imshow(p.resultZ.T-Z_hyb.T) -fig.colorbar(im, ax=ax3) - -ax1.set_title("hybrid regularization") - -ax2.set_title("mixed preconditioner") - -ax3.set_title("difference") - - -fig.tight_layout() \ No newline at end of file diff --git a/examples/xrl/src/main.py b/examples/xrl/src/main.py deleted file mode 100644 index 707f9dd1..00000000 --- a/examples/xrl/src/main.py +++ /dev/null @@ -1,1775 +0,0 @@ -import numpy as np -import random -import scipy.sparse.linalg as sla -import scipy.optimize -import matplotlib.pyplot as plt -from scipy.special import softmax -from scipy.special import xlogy -from scipy.special import logsumexp -from collections import defaultdict - -plt.rcParams.update({"font.size": 16}) - - -class PolicyGradientNew: - def __init__( - self, tp=0, his=False, k=None, kernel_type="toy1", Na=50, Ns=200, gam=0.99, eps=1e-12 - ): - if kernel_type == "toy3": - Na = 2 - etp = ("KL", "rKL", "alpha") - self.kernel_type = kernel_type - self.tau = 1 - self.Na = Na - self.Ns = Ns - self.gam = gam - self.eps = eps - self.k = k if k else Ns // 10 + 1 - self.P = np.zeros([self.Na, self.Ns, self.Ns]) - self.alpha = None - self.result = None - self.ref_result = None - self.resultZ = None - self.resultM = None - self.resultV = None - self.train_from_history = his - self.err = [[], [], {}] - self.err_exact = [] - self.err_pd = [] - self.err_pd_exact = [] - self.err_pd_val_pi = [] - self.err_pd_val_V = [] - self.lyap = [] - self.V_exact = None - self.pi_exact = None - if self.kernel_type == "toy1": - idr = list(range(self.Ns)) - for i in range(self.Na): - for j in range(self.Ns): - idx = random.sample(idr, self.k) - self.P[i, j, idx] = 1 / self.k - Us = np.random.rand(Ns) - Usa = np.random.rand(Ns, Na) - self.r = np.zeros([Ns, Na]) - for i in range(Ns): - for j in range(Na): - self.r[i, j] = Us[i] * Usa[i, j] - - if self.kernel_type == "toy2": - P = np.random.uniform(0, 1, (self.Na, self.Ns, self.Ns)) - sparse = min(self.Ns - 1, int(self.Ns * 0.9)) - for i in range(self.Na): - for j in range(self.Ns): - indices = np.random.choice(np.arange(self.Ns), replace=False, size=sparse) - P[i, j, indices] = 0 - P = P / P.sum(axis=2, keepdims=True) - self.P[:] = P[:] - Us = np.random.rand(Ns) - Usa = np.random.rand(Ns, Na) - self.r = np.zeros([Ns, Na]) - for i in range(Ns): - for j in range(Na): - self.r[i, j] = Us[i] * Usa[i, j] - - if self.kernel_type == "toy3": - self.P = np.zeros([self.Na, self.Ns, self.Ns]) - self.r = np.zeros([Ns, Na]) - for s in range(self.Ns): - self.P[0, s, s] = 1 / 2 - self.P[0, s, (s + 1) % self.Ns] = 1 / 2 - self.P[1, s, s] = 1 - self.r[s, 0] = np.sin(4 * np.pi * s / self.Ns) + 1 - self.r[s, 1] = np.cos(4 * np.pi * s / self.Ns) + 1 - - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - - def train(self, eta=1, T=20): - if self.etp == "KL": - self.err[0] = [] 
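concentration_plot.txt above compares two trained occupancy measures panel by panel, but it is not runnable on its own (it assumes Z_hyb and a trained p in scope). A self-contained version, with random placeholder arrays standing in for the hybrid-regularization result Z_hyb and the mixed-preconditioner result p.resultZ:

import numpy as np
import matplotlib.pyplot as plt

Z_hyb = np.random.rand(200, 50)  # placeholder for the hybrid-regularization run
Z_mix = np.random.rand(200, 50)  # placeholder for p.resultZ

fig, (ax1, ax2, ax3) = plt.subplots(3)
for ax, Z, title in [
    (ax1, Z_hyb, "hybrid regularization"),
    (ax2, Z_mix, "mixed preconditioner"),
    (ax3, Z_mix - Z_hyb, "difference"),
]:
    im = ax.imshow(Z.T)
    fig.colorbar(im, ax=ax)
    ax.set_title(title)
fig.tight_layout()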
- if self.etp == "rKL": - self.err[1] = [] - if self.etp == "alpha": - self.err[2][self.alpha] = [] - pi = ( - np.ones([self.Ns, self.Na]) / self.Na - if not self.train_from_history - else np.copy(self.result) - ) - pi_new = np.zeros([self.Ns, self.Na]) - Idt = np.identity(self.Ns) - if self.ref_result is not None: - self.err_exact = [] - for itr in range(T): - if self.ref_result is not None: - self.err_exact.append(np.linalg.norm(self.ref_result - pi, "fro")) - self.err_exact[-1] /= np.linalg.norm(self.ref_result, "fro") - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - H_pi = 4 / (1 - self.alpha ** 2) - 4 / (1 - self.alpha ** 2) * self.Na ** ( - (self.alpha - 1) / 2 - ) * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - if self.etp == "KL": - pi_new = pi ** (1 - eta) - for a in range(self.Na): - pi_new[:, a] = pi_new[:, a] * np.exp( - eta * (self.r[:, a] - (Idt - self.gam * self.P[a]) @ v_pi) / self.tau - ) - - if self.etp == "rKL": - for a in range(self.Na): - tmp = self.P[a] @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) / pi[s, a] - pi_new[s, a] -= ( - eta * self.Na * (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - ) - - for s in range(self.Ns): - - def equation(x): - return np.sum(1 / (x + pi_new[s, :] + 1e-300)) - 1 - - low = max(self.Na - np.max(pi_new[s, :]), -np.min(pi_new[s, :])) - high = self.Na - np.min(pi_new[s, :]) - cs = scipy.optimize.bisect(equation, low, high) - # cs = scipy.optimize.newton_krylov(equation, np.array([1]), f_tol=1e-12) - # print(np.abs(equation(cs)) < 1*10**-10) - for a in range(self.Na): - pi_new[s, a] = 1 / (pi_new[s, a] + cs) - # print(s) - # print(np.abs(np.sum(pi_new[s, :])-1) < 1*10**-10) - # print(np.min(pi_new[s, :])) - - if self.etp == "alpha": - for a in range(self.Na): - tmp = self.P[a] @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) * (pi[s, a] ** ((self.alpha - 1) / 2)) - tmp_sa = (1 - self.alpha) / 2 * eta * (self.Na ** ((1 - self.alpha) / 2)) - pi_new[s, a] -= ( - tmp_sa * (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - ) - for s in range(self.Ns): - alpha = self.alpha - - def equation(x): - return np.sum((x + pi_new[s, :] + 1e-300) ** (2 / (alpha - 1))) - 1 - - low = max( - self.Na ** ((1 - self.alpha) / 2) - np.max(pi_new[s, :]), - -np.min(pi_new[s, :]), - ) - high = self.Na ** ((1 - self.alpha) / 2) - np.min(pi_new[s, :]) - cs = scipy.optimize.bisect(equation, low, high) - for a in range(self.Na): - pi_new[s, a] = (pi_new[s, a] + cs) ** (2 / (self.alpha - 1)) - - for s in range(self.Ns): - pi_new[s, :] /= sum(pi_new[s, :]) - err = np.linalg.norm(pi_new - pi, "fro") / np.linalg.norm(pi, "fro") - print(err) - if self.etp == "KL": - self.err[0].append(err) - if self.etp == "rKL": - self.err[1].append(err) - if self.etp == "alpha": - self.err[2][self.alpha].append(err) - if err < self.eps: - print(itr) - break - pi[:] = pi_new[:] - self.result = np.copy(pi) - self.ref_result = np.copy(pi) - - def set_tau(self, tau): - self.tau = tau - self.ref_result = None - - def set_alpha(self, alpha): - self.alpha = alpha - - def set_history(self, his): - self.train_from_history = his - 
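The "KL" branch of train() is a mirror-descent step in the exponential family: the new policy is proportional to pi^(1 - eta) times exp of eta/tau applied to the regularized advantage r[:, a] - (I - gam * P[a]) @ v_pi, followed by row normalization. A sketch (kl_policy_update is a hypothetical name; the max-subtraction before exponentiating is a stabilization added here, not present in the original):

import numpy as np

def kl_policy_update(pi, v, P, r, gam, tau, eta=1.0):
    """One KL mirror-descent step: pi_new propto pi^(1-eta) * exp(eta * A / tau)."""
    A = r - v[:, None] + gam * np.einsum("ast,t->sa", P, v)  # regularized advantage
    log_new = (1.0 - eta) * np.log(pi + 1e-300) + eta * A / tau
    log_new -= log_new.max(axis=1, keepdims=True)  # stabilize before exp
    pi_new = np.exp(log_new)
    return pi_new / pi_new.sum(axis=1, keepdims=True)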
if his and self.result is None: - self.result = np.ones([self.Ns, self.Na]) / self.Na - - def clear_history(self): - self.result = self.result = np.ones([self.Ns, self.Na]) / self.Na - - def set_etp(self, tp): - etp = ("KL", "rKL", "alpha") - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - self.ref_result = None - - def get_value(self, pi=None, reg=False): - # if self.result is None: - # return - if pi is None: - pi = self.result - Idt = np.identity(self.Ns) - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if reg: - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - H_pi = 4 / (1 - self.alpha ** 2) - 4 / (1 - self.alpha ** 2) * self.Na ** ( - (self.alpha - 1) / 2 - ) * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - return v_pi - - def get_expect_value(self, pi=None): - # if self.result is None: - # return - if pi is None: - pi = self.result - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - _, evec = sla.eigs(P_pi.T, k=1, which="LM") - d_pi = (evec / evec.sum()).real - d_pi = d_pi.reshape(len(d_pi)) - return np.dot(d_pi, r_pi) - - def make_fig(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.err[0] - err1 = self.err[1] - err2 = self.err[2][alpha1] - err3 = self.err[2][alpha2] - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="KL", linewidth=linewidth) - ax.plot(err1, label="reversed KL", linewidth=linewidth) - ax.plot(err2, label="Hellinger", linewidth=linewidth) - ax.plot(err3, label="alpha divergence (alpha=" + str(alpha2) + ")", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact(self, label="KL", linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.log(np.abs(np.log(self.err_exact[:-1]))) - if self.err_exact[-1] == 0 - else np.log(np.abs(np.log(self.err_exact))) - ) - ax.plot(err0, label=label, linewidth=linewidth, marker="+") - ax.plot( - [0, len(err0) - 1], - [0, np.log(2) * (len(err0) - 1)], - label="slope=log2", - color="g", - linestyle="dashed", - linewidth=0.5, - ) - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("loglog error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def train_all(self, alpha1=0, alpha2=-3): - self.set_etp(0) - self.train() - self.set_etp(1) - self.train() - self.set_etp(2) - self.set_alpha(alpha1) - self.train() - self.set_alpha(alpha2) - self.train() - - def value_iteration(self): - V_pre = np.ones(self.Ns) - V = np.zeros(self.Ns) - Q = np.zeros([self.Ns, self.Na]) - cnt = 0 - while np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre) > 10 ** -15: - cnt += 1 - V_pre = V - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - - if self.etp == "KL": - 
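get_expect_value above estimates the long-run average reward: the stationary distribution d_pi is the leading left eigenvector of P_pi (eigenvalue 1 for a stochastic matrix), normalized to sum to one, and the result is the inner product with r_pi. A sketch (average_reward is a hypothetical name):

import numpy as np
import scipy.sparse.linalg as sla

def average_reward(P, r, pi):
    """Average reward <d_pi, r_pi> from the leading left eigenvector of P_pi."""
    P_pi = np.einsum("sa,ast->st", pi, P)
    r_pi = np.sum(pi * r, axis=1)
    _, evec = sla.eigs(P_pi.T, k=1, which="LM")  # eigenvalue of largest magnitude is 1
    d_pi = (evec[:, 0] / evec.sum()).real
    return float(d_pi @ r_pi)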
V = logsumexp(Q / self.tau, axis=1) * self.tau - if self.etp == "rKL": - return - if self.etp == "alpha": - return - if cnt % 100 == 0: - print(np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre)) - - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - V - self.V_exact = V - self.pi_exact = softmax(Q / self.tau, axis=1) - print(cnt) - - def train_primal_dual_old(self, CV=0.1, CW=0.1, tol=1e-6, step=0.01): - V = np.zeros(self.Ns) - W = np.ones(self.Ns) - Z0 = np.zeros([self.Ns, self.Na]) - Z = np.ones([self.Ns, self.Na]) / self.Na - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - - while np.max(np.abs(Z - Z0)) > tol: - cnt += 1 - PZ = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - PZ[s, t] = np.dot(Z[s, :], self.P[:, s, t]) - RZ = np.zeros(self.Ns) - for s in range(self.Ns): - RZ[s] = np.dot(Z[s, :], self.r[s, :]) - HZ = np.sum(xlogy(Z, Z), axis=1) - - EV = V - (Idt - self.gam * PZ.T) @ W / CV - EW = (RZ - (Idt - self.gam * PZ) @ V - self.tau * HZ) / CW - W - EZ = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EZ[:, a] = ( - self.r[:, a] - (Idt - self.gam * self.P[a, :, :]) @ V - ) / self.tau - np.log(Z[:, a]) - Z0 = Z - V = V - dt * EV - W = W + dt * EW - Z = Z * np.exp(dt * EZ) - # print(np.min(W)) - for s in range(self.Ns): - Z[s, :] /= sum(Z[s, :]) - if cnt % 10 == 0: - print(cnt) - print(np.max(np.abs(Z - Z0))) - self.resultZ = np.copy(Z) - print(cnt) - print(np.max(np.abs(Z - Z0))) - - def train_primal_dual(self, CV=0.01, tol=1e-6, step=0.01): - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (V + dt * EV) / (1 + dt) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a] + 1e-20) + 1 - H = (H + dt * EM) / (1 + dt) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def sample_gen(self, s, a, nb): - samples = np.random.choice(self.Ns, size=nb, replace=True, p=self.P[a, s]) - return samples - - def train_primal_dual_batch( - self, CV=0.01, tol=1e-5, step=1e-4, nb1=1, nb2=1 - ): # primal dual method from samples - CM = self.tau - V = 
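train_primal_dual above keeps M positive by iterating in a transformed variable H: the map is the identity above 1 and a shifted logarithm below 1, with inverse exp(H - 1), and the damped update (x + dt * g) / (1 + dt) is a semi-implicit Euler step for the flow x' = g - x. A sketch of the transform pair (function names are mine; the clipping inside where() only guards the branch that gets discarded):

import numpy as np

def to_dual(M):
    # Matches H[s, a] = M if M >= 1 else log(M) + 1 above.
    return np.where(M >= 1.0, M, np.log(np.maximum(M, 1e-300)) + 1.0)

def from_dual(H):
    # Matches M[s, a] = H if H >= 1 else exp(H - 1) above.
    return np.where(H >= 1.0, H, np.exp(np.minimum(H, 1.0) - 1.0))

M = np.array([0.01, 0.5, 1.0, 3.0])
assert np.allclose(from_dual(to_dual(M)), M)  # the pair is a bijection on (0, inf)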
np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - # Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - s_idx = np.random.choice(self.Ns, replace=False, size=nb1) - EV = np.sum(M, axis=1) - for s in s_idx: - for a in range(self.Na): - samples = self.sample_gen(s, a, nb2) - for sample in samples: - EV[sample] -= self.gam * M[s, a] / nb2 - EV /= CV - V = (V + dt * EV) / (1 + dt) - - s_idx = np.random.choice(self.Ns, replace=False, size=nb1) - # EM = np.zeros([self.Ns, self.Na]) - # for a in range(self.Na): - # EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in s_idx: - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a] + 1e-300) + 1 - samples = self.sample_gen(s, a, nb2) - tmp_v = 0 - for sample in samples: - tmp_v += V[sample] - tmp_v /= nb2 - H[s, a] = (H[s, a] + dt * ((self.r[s, a] - V[s] + self.gam * tmp_v) / CM)) / ( - 1 + dt - ) - # H = (H + dt * EM) / (1 + dt) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_batch_acc( - self, CV=0.01, tol=1e-5, step=1e-4, nb1=1, nb2=1, lam=0.01, m=5 - ): # "acc" = anderson acceleration - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - V_h = [] - GV_h = [] - GH_h = [] - H_h = [] - EV = 1 - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol or np.linalg.norm(EV) > tol: - M0[:] = M[:] - cnt += 1 - s_idx = None - if cnt == 1: - s_idx = np.random.choice(self.Ns, replace=False, size=nb1) - EV = np.sum(M, axis=1) - for s in s_idx: - for a in range(self.Na): - samples = self.sample_gen(s, a, nb2) - for sample in samples: - EV[sample] -= self.gam * M[s, a] / nb2 - EV /= CV - # print(np.linalg.norm(EV)) - GV_h.append((V + dt * EV) / (1 + dt) - V) - V = (V + dt * EV) / (1 + dt) - V_h.append(V) - - if 1 < cnt: - s_idx = np.random.choice(self.Ns, replace=False, size=nb1) - EV = np.sum(M, axis=1) - for s in s_idx: - for a in range(self.Na): - samples = self.sample_gen(s, a, nb2) - for sample in samples: - EV[sample] -= self.gam * M[s, a] / nb2 - EV /= CV - # print(np.linalg.norm(EV)) - GV_h.append((V + dt * EV) / (1 + dt) - V) - V = (V + dt * EV) / (1 + dt) - V_h.append(V) - U = np.array(GV_h).T - c = np.linalg.solve( - U.T @ U + lam * np.identity(np.size(U, 1)), np.ones(np.size(U, 1)) - ) - c = c / np.sum(c) 
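train_primal_dual_batch replaces the exact dual residual sum_a (I - gam * P[a]^T) @ M[:, a] with a sampled one: for a mini-batch of nb1 states it draws nb2 successors per action via sample_gen and spreads -gam * M[s, a] / nb2 over them, so only the sampled rows contribute the transition term. A sketch (sampled_EV is a hypothetical name; rng is a numpy Generator):

import numpy as np

def sampled_EV(P, M, gam, rng, nb1=1, nb2=1):
    """Mini-batch estimate of the dual residual, as in train_primal_dual_batch."""
    Na, Ns, _ = P.shape
    EV = M.sum(axis=1)
    for s in rng.choice(Ns, size=nb1, replace=False):
        for a in range(Na):
            successors = rng.choice(Ns, size=nb2, p=P[a, s])
            for t in successors:
                EV[t] -= gam * M[s, a] / nb2  # unbiased for the sampled rows
    return EV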
- V = c @ V_h - if cnt > m: - GV_h = GV_h[1:] - V_h = V_h[1:] - - # s_idx = np.random.choice(self.Ns, replace=False, size=nb1) - H = None - gH = None - if cnt == 1: - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a] + 1e-300) + 1 - - gH = np.zeros([self.Ns, self.Na]) - for s in s_idx: - for a in range(self.Na): - samples = self.sample_gen(s, a, nb2) - tmp_v = 0 - for sample in samples: - tmp_v += V[sample] - tmp_v /= nb2 - gH[s, a] = ( - H[s, a] + dt * ((self.r[s, a] - V[s] + self.gam * tmp_v) / CM) - ) / (1 + dt) - H[s, a] - GH_h.append(gH.reshape(-1)) - H = H + gH - H_h.append(H.reshape(-1)) - - # H = (H + dt * EM) / (1 + dt) - - if 1 < cnt: - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a] + 1e-300) + 1 - - gH = np.zeros([self.Ns, self.Na]) - for s in s_idx: - for a in range(self.Na): - samples = self.sample_gen(s, a, nb2) - tmp_v = 0 - for sample in samples: - tmp_v += V[sample] - tmp_v /= nb2 - gH[s, a] = ( - H[s, a] + dt * ((self.r[s, a] - V[s] + self.gam * tmp_v) / CM) - ) / (1 + dt) - H[s, a] - GH_h.append(gH.reshape(-1)) - H = H + gH - H_h.append(H.reshape(-1)) - - U = np.array(GH_h).T - c = np.linalg.solve( - U.T @ U + lam * np.identity(np.size(U, 1)), np.ones(np.size(U, 1)) - ) - c = c / np.sum(c) - H = c @ H_h - if cnt > m: - GH_h = GH_h[1:] - H_h = H_h[1:] - - H = H.reshape([self.Ns, -1]) - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.linalg.norm(EV)) - print(np.linalg.norm(gH)) - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_new( - self, CV=0.01, tol=1e-6, step=0.01 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - H[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + 
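The *_acc variants apply Anderson acceleration: they keep a window of the last m iterates and residuals, solve the Tikhonov-regularized normal equations (U^T U + lam * I) c = 1, normalize c to sum to one, and mix the stored iterates with those coefficients. A sketch of the mixing step (anderson_mix is a hypothetical name):

import numpy as np

def anderson_mix(residual_history, iterate_history, lam=0.01):
    """Regularized Anderson mixing, as in the *_acc methods above."""
    U = np.array(residual_history).T                     # columns: recent residuals
    G = U.T @ U + lam * np.eye(U.shape[1])
    c = np.linalg.solve(G, np.ones(U.shape[1]))
    c /= c.sum()                                         # affine combination
    return c @ np.array(iterate_history)                 # sum_i c_i * x_i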
tmp_w) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.sum(M > 0.05)) - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # self.M = M - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_new_continue(self, CV=0.01, tol=1e-6, step=3e-4): - CM = self.tau if self.tau >= 0.01 else 2 ** (int(np.log2(0.01 / self.tau)) + 1) * self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - # err_tmp = np.linalg.norm(M-M0) / np.linalg.norm(M) - - while CM > self.tau or np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - H[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.sum(M > 0.05)) - - if np.linalg.norm(M - M0) / np.linalg.norm(M) <= tol: - CM /= 2 - dt /= 2 - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # self.M = M - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_new_continue_acc(self, CV=0.01, tol=1e-6, step=3e-4, lam=0.001, m=5): - CM = self.tau if self.tau >= 0.01 else 2 ** (int(np.log2(0.01 / self.tau)) + 1) * self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - V_h = [] 
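train_primal_dual_new_continue anneals the entropy weight: it starts from the smallest power-of-two multiple of tau at or above roughly 0.01 and halves CM (together with the step dt) each time the inner loop converges, so the schedule lands exactly on tau. A sketch of the starting value (continuation_schedule is a hypothetical name):

import numpy as np

def continuation_schedule(tau, start=0.01):
    """Initial CM used above: smallest power-of-two multiple of tau >= start."""
    if tau >= start:
        return tau
    return 2 ** (int(np.log2(start / tau)) + 1) * tau

# The outer loop then repeats: if inner error <= tol, set CM /= 2 and dt /= 2,
# until CM == tau. Power-of-two multiples guarantee the halving hits tau exactly.
assert continuation_schedule(0.001) >= 0.01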
- GV_h = [] - H_h = [] - GH_h = [] - - # err_tmp = np.linalg.norm(M-M0) / np.linalg.norm(M) - cc = 100 - - while CM > self.tau or np.linalg.norm(M - M0) / np.linalg.norm(M) > tol * cc: - if CM == self.tau: - cc = 1 - M0[:] = M[:] - cnt += 1 - - if cnt == 1: - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - GV_h.append((1 - dt) * V + dt * EV - V) - V = (1 - dt) * V + dt * EV - V_h.append(V) - - if cnt > 1: - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - GV_h.append((1 - dt) * V + dt * EV - V) - V = (1 - dt) * V + dt * EV - V_h.append(V) - U = np.array(GV_h).T - c = np.linalg.solve( - U.T @ U + lam * np.identity(np.size(U, 1)), np.ones(np.size(U, 1)) - ) - c = c / np.sum(c) - V = c @ V_h - # if np.min(c) < 0: - # self.U = U - # print(cnt) - # print("!") - # print(c) - # return - if cnt > m: - GV_h = GV_h[1:] - V_h = V_h[1:] - - H = None - if cnt == 1: - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - gH = np.zeros([self.Ns, self.Na]) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - gH[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - H[s, a] - H[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - GH_h.append(gH.reshape(-1)) - H_h.append(H.reshape(-1)) - - if cnt > 1: - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - gH = np.zeros([self.Ns, self.Na]) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - gH[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - H[s, a] - H[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - GH_h.append(gH.reshape(-1)) - H_h.append(H.reshape(-1)) - U = np.array(GH_h).T - c = np.linalg.solve( - U.T @ U + lam * np.identity(np.size(U, 1)), np.ones(np.size(U, 1)) - ) - c = c / np.sum(c) - - H = c @ H_h - if cnt > m: - GH_h = GH_h[1:] - H_h = H_h[1:] - - H = H.reshape([self.Ns, -1]) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(cnt) - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.sum(M > 0.05)) - - if np.linalg.norm(M - M0) / np.linalg.norm(M) <= tol: - CM /= 2 - dt /= 2 - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # self.M = M - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_epart( - 
self, tol=1e-6, step=0.01 - ): # gradient descent for v, natural gradient descent for mu - CM = self.tau - V = np.ones(self.Ns) - vec_e = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - V = V - dt * (vec_e - EV) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - H[s, a] = (1 - dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.sum(M > 0.05)) - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # self.M = M - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def train_primal_dual_e(self, tol=1e-6, step=0.01): # vanilla gradient descent - CM = self.tau - V = np.ones(self.Ns) - vec_e = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - V = V - dt * (vec_e - EV) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = self.r[:, a] - (Idt - self.gam * self.P[a]) @ V - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - for a in range(self.Na): - # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - M[s, a] = M[s, a] + dt * (EM[s, a] + CM * tmp_w - CM * np.log(M[s, a])) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf or np.min(M) < 0: - # self.M = M - print(M) - print(np.min(M)) - return - - err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - self.err_pd.append(err) - if self.resultZ is not None: - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(err) - print(np.sum(M > 0.05)) - for s 
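train_primal_dual_e above is plain gradient descent-ascent on the entropic Lagrangian: V moves against e - sum_a (I - gam * P[a])^T M[:, a], and M moves along the regularized advantage plus tau times (log of the row mass minus log M). A sketch of the two directions (pd_gradients is a hypothetical name):

import numpy as np

def pd_gradients(P, r, V, M, gam, tau):
    """Directions for train_primal_dual_e: V <- V - dt*gV, M <- M + dt*gM."""
    Na, Ns, _ = P.shape
    EV = np.zeros(Ns)
    for a in range(Na):
        EV += M[:, a] - gam * (P[a].T @ M[:, a])  # (I - gam*P[a])^T M[:, a]
    gV = np.ones(Ns) - EV
    adv = r - V[:, None] + gam * np.einsum("ast,t->sa", P, V)
    gM = adv + tau * (np.log(M.sum(axis=1) + 1e-300)[:, None] - np.log(M + 1e-300))
    return gV, gM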
in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - # self.M = M - # print("M") - # print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) # step needs to be tuned - - def _lyap(self, CV, V, M): - L1 = CV / 2 * (np.linalg.norm(V - self.resultV)) ** 2 - L21 = self.tau * np.sum(xlogy(self.resultM, self.resultM)) - L22 = self.tau * np.sum(xlogy(self.resultM, M)) + self.tau * np.sum(M - self.resultM) - return L1 + L21 - L22 - - def _lyap_mixed(self, CV, V, M, c): - tmp_b = np.sum(self.resultM, axis=1) - tmp_M = np.sum(M, axis=1) - L1 = CV / 2 * (np.linalg.norm(V - self.resultV)) ** 2 - L21 = self.tau * np.sum(xlogy(self.resultM, self.resultM)) - L22 = self.tau * np.sum(xlogy(self.resultM, M)) - L22 -= self.tau * c / (1 - c) * np.sum(xlogy(tmp_b, tmp_b)) - L23 = self.tau * c / (1 - c) * np.sum(xlogy(tmp_b, tmp_M)) - L23 -= self.tau / (1 - c) * np.sum(M - self.resultM) - return L1 + L21 - L22 - L23 - - def train_primal_dual_pseudo_mix( - self, CV=0.01, tol=1e-6, step=0.01, c=0.99 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - CM = self.tau - V = np.ones(self.Ns) - V0 = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - Idt = np.identity(self.Ns) - cnt = 0 - dt = step - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - self.err_pd_val_V = [] - self.err_pd_val_pi = [] - self.lyap = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - tmp_err = 1 - while tmp_err > tol: - M0[:] = M[:] - V0[:] = V[:] - cnt += 1 - # if tmp_err < 2 * tol: - # c = 0.98 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - tmp_pi = softmax(H[s, :]) - # for a in range(self.Na): - # # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - # H[s, a] = (1-dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - tmp_h1 = (1 - dt) * H[s, :] + dt * (EM[s, :] + tmp_w) - tmp_h2 = c * dt * np.ones(self.Na) * np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - - H[s, :] = tmp_h1 - tmp_h2 - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - # err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - # tmp_err = err - err1 = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err2 = np.linalg.norm(V - V0) / np.linalg.norm(V) - tmp_err = max(err1, err2) - self.err_pd.append(err1) - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - if self.resultZ is not None: - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - # print(CM) - print(np.sum(M)) - print(err1) - print(err2) - print(np.sum(Z > 0.05)) - print(np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact)) - print(np.max(np.abs(Z - self.pi_exact))) - print(np.mean(np.abs(Z - self.pi_exact))) - - if self.resultM is not None and self.resultV is not None: - 
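_lyap above monitors convergence with a candidate Lyapunov function: a CV-weighted quadratic distance in V plus an entropy-based divergence between M and the stored reference pair (resultV, resultM). A direct sketch of that quantity (lyapunov is a hypothetical name):

import numpy as np
from scipy.special import xlogy

def lyapunov(CV, tau, V, M, V_star, M_star):
    """Candidate Lyapunov function from _lyap above."""
    quad = 0.5 * CV * np.linalg.norm(V - V_star) ** 2
    ent = tau * (np.sum(xlogy(M_star, M_star)) - np.sum(xlogy(M_star, M))
                 - np.sum(M - M_star))
    return quad + ent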
self.lyap.append(self._lyap_mixed(CV, V, M, c)) - if self.V_exact is not None: - self.err_pd_val_V.append( - np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact) - ) - if self.pi_exact is not None: - self.err_pd_val_pi.append( - np.linalg.norm(self.pi_exact - Z, "fro") / np.linalg.norm(self.pi_exact, "fro") - ) - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultM = np.copy(M) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def train_primal_dual_pseudo_mix_random_grad( - self, CV=0.01, tol=1e-4, step=0.01, c=0.99, sigma=0.01 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - V0 = np.ones(self.Ns) - Idt = np.identity(self.Ns) - cnt = 0 - dt = step - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - self.lyap = [] - self.err_pd_val_V = [] - self.err_pd_val_pi = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - tmp_err = 1 - while tmp_err > tol: - M0[:] = M[:] - V0[:] = V[:] - cnt += 1 - # if tmp_err < 2 * tol: - # c = 0.98 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] + np.random.normal( - loc=0.0, scale=sigma, size=self.Ns - ) - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = ( - self.r[:, a] - (Idt - self.gam * self.P[a]) @ V - ) / CM + np.random.normal(loc=0.0, scale=sigma, size=self.Ns) - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - tmp_pi = softmax(H[s, :]) - # for a in range(self.Na): - # # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - # H[s, a] = (1-dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - tmp_h1 = (1 - dt) * H[s, :] + dt * (EM[s, :] + tmp_w) - tmp_h2 = c * dt * np.ones(self.Na) * np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - H[s, :] = tmp_h1 - tmp_h2 - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err1 = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err2 = np.linalg.norm(V - V0) / np.linalg.norm(V) - tmp_err = max(err1, err2) - self.err_pd.append(err1) - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - if self.resultZ is not None: - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if self.V_exact is not None: - self.err_pd_val_V.append( - np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact) - ) - if self.pi_exact is not None: - self.err_pd_val_pi.append( - np.linalg.norm(self.pi_exact - Z, "fro") / np.linalg.norm(self.pi_exact, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - # print(CM) - print(np.sum(M)) - print(err1) - print(err2) - print(np.sum(Z > 0.05)) - print(np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact)) - print(np.max(np.abs(Z - self.pi_exact))) - print(np.mean(np.abs(Z - self.pi_exact))) - - if self.resultM is not None and self.resultV is not None: - self.lyap.append(self._lyap_mixed(CV, V, M, c)) - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - 
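The pseudo_mix family updates H = log M with a damped step toward EM + log(row mass of M), then subtracts a scalar, softmax-weighted average of the same increment from every action in the row; this rank-one correction, scaled by c, is the "mixed preconditioner" referenced in concentration_plot.txt. A per-row sketch (pseudo_mix_row is a hypothetical name):

import numpy as np
from scipy.special import softmax

def pseudo_mix_row(H_row, EM_row, log_mass, dt, c):
    """Log-space row update from train_primal_dual_pseudo_mix above."""
    pi_row = softmax(H_row)
    target = EM_row + log_mass             # log_mass = log(sum_a M[s, a])
    step = (1.0 - dt) * H_row + dt * target
    correction = c * dt * np.dot(pi_row, target - H_row)
    return step - correction               # same scalar subtracted from every action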
self.resultM = np.copy(M) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def train_primal_dual_pseudo_mix_sample( - self, CV=0.01, tol=1e-4, step=0.01, c=0.99, n_sam=100, n_batch=1000 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - V0 = np.ones(self.Ns) - # Idt = np.identity(self.Ns) - cnt = 0 - dt = step - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - self.lyap = [] - if self.resultZ is not None: - self.err_pd_exact = [] - if self.V_exact is not None: - self.err_pd_val_V = [] - if self.pi_exact is not None: - self.err_pd_val_pi = [] - - samples_buffer = [] - for s in range(self.Ns): - for a in range(self.Na): - # samples_buffer[(s, a)] = [] - samples = self.sample_gen(s, a, n_sam) - for sample in samples: - samples_buffer.append((s, a, sample)) - print("sampling finished") - print("buffer size: " + str(len(samples_buffer))) - - tmp_err = 1 - while tmp_err > tol: - M0[:] = M[:] - V0[:] = V[:] - cnt += 1 - # if tmp_err < 2 * tol: - # c = 0.98 - samples = random.sample(samples_buffer, n_batch) - - ctr = defaultdict(lambda: 0) - for (s, a, _) in samples: - ctr[(s, a)] += 1 - - # s_idx = np.random.choice(len(samples_buffer), replace=False, size=n_batch) - EV = np.sum(M, axis=1) - for s, a, sample in samples: - EV[sample] -= self.gam * M[s, a] / ctr[(s, a)] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - # EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - EM[:, a] = (self.r[:, a] - V) / CM - - for s, a, sample in samples: - EM[s, a] += self.gam / CM * V[sample] / ctr[(s, a)] - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - tmp_pi = softmax(H[s, :]) - # for a in range(self.Na): - # # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - # H[s, a] = (1-dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - tmp_h1 = (1 - dt) * H[s, :] + dt * (EM[s, :] + tmp_w) - tmp_h2 = c * dt * np.ones(self.Na) * np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - - H[s, :] = tmp_h1 - tmp_h2 - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - # err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err1 = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err2 = np.linalg.norm(V - V0) / np.linalg.norm(V) - tmp_err = max(err1, err2) - # tmp_err = err - self.err_pd.append(err1) - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - if self.resultZ is not None: - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - if self.V_exact is not None: - self.err_pd_val_V.append( - np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact) - ) - if self.pi_exact is not None: - self.err_pd_val_pi.append( - np.linalg.norm(self.pi_exact - Z, "fro") / np.linalg.norm(self.pi_exact, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - # print(CM) - print(np.sum(M)) - print(err1) - print(err2) - print(np.sum(Z > 0.05)) - print(np.linalg.norm(V - self.V_exact) / 
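train_primal_dual_pseudo_mix_sample pre-generates n_sam transitions per (s, a) into a flat buffer and, each iteration, draws n_batch of them and counts the multiplicity of each (s, a) so the sampled terms can be averaged per pair. A sketch of the batching step (draw_batch is a hypothetical name; the tiny buffer is a placeholder):

import random
from collections import defaultdict

def draw_batch(samples_buffer, n_batch):
    """Draw a mini-batch of (s, a, s') triples and count each (s, a)'s multiplicity."""
    batch = random.sample(samples_buffer, n_batch)
    counts = defaultdict(int)
    for s, a, _ in batch:
        counts[(s, a)] += 1
    return batch, counts

buffer = [(0, 0, 1), (0, 1, 0), (1, 0, 1), (1, 1, 1)]
batch, counts = draw_batch(buffer, 2)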
np.linalg.norm(self.V_exact)) - print(np.max(np.abs(Z - self.pi_exact))) - print(np.mean(np.abs(Z - self.pi_exact))) - - if self.resultM is not None and self.resultV is not None: - self.lyap.append(self._lyap_mixed(CV, V, M, c)) - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultM = np.copy(M) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def train_primal_dual_pseudo_mix_continue( - self, CV=0.1, tol=1e-6, step=0.01, c=0.99 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - # CM = self.tau - CM_start = 0.01 - CM = ( - self.tau - if self.tau >= CM_start - else 2 ** (int(np.log2(CM_start / self.tau)) + 1) * self.tau - ) - V = np.ones(self.Ns) - V0 = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - Idt = np.identity(self.Ns) - cnt = 0 - dt = step - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - self.lyap = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - tmp_err = 1 - # mode_cnt = 0 - while tmp_err > tol or CM > self.tau: - M0[:] = M[:] - V0[:] = V[:] - cnt += 1 - # if tmp_err < 2 * tol: - # c = 0.98 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = np.log(M[s, a] + 1e-300) - - for s in range(self.Ns): - tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - tmp_pi = softmax(H[s, :]) - # for a in range(self.Na): - # # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - # H[s, a] = (1-dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - - tmp_h1 = (1 - dt) * H[s, :] + dt * (EM[s, :] + tmp_w) - tmp_h2 = c * dt * np.ones(self.Na) * np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - - H[s, :] = tmp_h1 - tmp_h2 - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - err1 = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err2 = np.linalg.norm(V - V0) / np.linalg.norm(V) - tmp_err = max(err1, err2) - self.err_pd.append(err1) - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - if self.resultZ is not None: - self.err_pd_exact.append( - np.linalg.norm(self.resultZ - Z, "fro") / np.linalg.norm(self.resultZ, "fro") - ) - - if cnt % 50 == 0: - print(cnt) - print(CM) - print(np.sum(M)) - print(err1) - print(err2) - print(np.sum(Z > 0.05)) - - if self.resultM is not None and self.resultV is not None: - self.lyap.append(self._lyap_mixed(CV, V, M, c)) - - if CM > self.tau and tmp_err <= 400 * tol: - CM /= 2 - dt /= 2 - - # mode_cnt = np.sum(Z > 0.05) - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultM = np.copy(M) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def _f(self, CV, CM, V, H): - Idt = np.identity(self.Ns) - M = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - - V_r = np.zeros(self.Ns) - for a in range(self.Na): - V_r += (Idt - self.gam * self.P[a].T) @ 
M[:, a] - V_r /= CV - - H_r = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - H_r[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - return V_r, H_r, V_r - V, H_r - H - - def _f_step(self, CV, CM, V, H, step): - Idt = np.identity(self.Ns) - M = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - - dt = step - self.err_pd = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V_r = (V + dt * EV) / (1 + dt) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H_r = (H + dt * EM) / (1 + dt) - - return V_r, H_r, V_r - V, H_r - H - - def make_fig_pd(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.err_pd - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="primal dual", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of mu", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_pd_ref(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.err_pd_val_V - err1 = self.err_pd_val_pi - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="error of value", linewidth=linewidth) - ax.plot(err1, label="error of policy", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - # ax.set_ylabel("relative change of mu", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_pd_lyap(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.lyap - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="primal dual", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("Lyapunov", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact_pd(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.abs(np.log(self.err_pd_exact[:-1])) - if self.err_pd_exact[-1] == 0 - else np.abs(np.log(self.err_pd_exact)) - ) - ax.plot(err0, label="primal dual", linewidth=linewidth, marker="+") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("log error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - -p = PolicyGradientNew(Ns=200, Na=50, k=20, gam=0.9) -p.set_tau(0.01) -p.train_primal_dual(CV=0.01, step=2e-3) -# p.train_primal_dual_pseudo(CV=0.01, step=2e-2) -p.train_primal_dual_pseudo_mix(CV=0.1, step=1e-2, c=0.9) -p.train_primal_dual_pseudo_mix_random_grad(CV=0.1, step=1e-2, c=0.9, sigma=0.01) -p.train_primal_dual_new(CV=0.01, step=4e-4) - -p = PolicyGradientNew(Ns=20, Na=5, k=3, gam=0.9) -p.set_tau(0.01) -# p.train_primal_dual_pseudo_mix_sample(CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=10000, n_batch=2000) -p.train_primal_dual_pseudo_mix_sample(CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=4000, n_batch=4000) - -# p = PolicyGradientNew(Ns=50, Na=5, k=3, gam=0.9) -# p.set_tau(0.01) -# p.train_primal_dual_pseudo_mix_sample(CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=10000, n_batch=2000) - -p = 
PolicyGradientNew(Ns=100, Na=20, k=8, gam=0.9) -p.set_tau(0.1) -# p.train_primal_dual_pseudo_mix_sample(CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=10000, n_batch=20000) -p.train_primal_dual_pseudo_mix_sample(CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=10000, n_batch=8000) - -p = PolicyGradientNew(Ns=200, Na=50, k=20, gam=0.9) -p.set_tau(0.1) -p.value_iteration() -p.train_primal_dual_pseudo_mix(CV=0.1, step=8e-3, tol=1e-5, c=0.9) -p.train_primal_dual_pseudo_mix_sample( - CV=0.1, step=1e-3, tol=1e-3, c=0.9, n_sam=10000, n_batch=100000 -) - -p = PolicyGradientNew(Ns=200, Na=50, k=20, gam=0.9) -p.set_tau(0.1) -p.value_iteration() -p.train_primal_dual_pseudo_mix(CV=0.1, step=8e-3, tol=1e-5, c=0.9) -p.train_primal_dual_pseudo_mix_random_grad(CV=0.1, tol=1e-3, step=0.01, c=0.9, sigma=0.05) - -p.set_tau(0.001) -# p.train_primal_dual_pseudo(CV=0.01, step=2e-3) -p.train_primal_dual_pseudo_mix(CV=0.1, step=2e-3, c=0.993) -p.train_primal_dual_new_continue() -p.train_primal_dual_new_continue_acc(lam=0) -V_old = np.copy(p.resultV) -# Pi_old = np.copy(p.resultZ) -p.train_primal_dual_new(CV=0.01, step=3e-4) -p.train_primal_dual_e(step=3e-4) -# V = np.copy(p.resultV) -# Pi = np.copy(p.resultZ) -p.make_fig_pd() -# np.savetxt("Pi_old.txt", Pi_old, fmt="%s") -# np.savetxt("V_old.txt", V_old, fmt="%s") -plt.savefig("conv.png") -Pi = np.loadtxt("Pi_new_compare.txt") -Pi_old = np.loadtxt("Pi_old_compare.txt") -# with open("model.pkl", "wb") as outp: -# pickle.dump(p, outp, pickle.HIGHEST_PROTOCOL) -# -# p = pickle.load(open("model1.pkl", "rb")) diff --git a/examples/xrl/src/realdata.py b/examples/xrl/src/realdata.py deleted file mode 100644 index 128750fd..00000000 --- a/examples/xrl/src/realdata.py +++ /dev/null @@ -1,1361 +0,0 @@ -import numpy as np -import sparse -import pickle -import scipy.sparse.linalg as sla -import scipy.optimize -import matplotlib.pyplot as plt -from scipy.special import softmax -from scipy.special import xlogy -from scipy.special import logsumexp -import sys -import time - -# import pylab as p -# from scikits.umfpack import spsolve -# from scipy.sparse import csr_matrix -# import os -# import numba as nb -# from numba.experimental import jitclass -# from numba import int32, float32 # import the types -# sys.path.append("../") - -plt.rcParams.update({"font.size": 16}) - - -# def get_reward(nmP0, nmP1, nmR0, nmR1): -# P0 = sparse.COO(pickle.load(open(nmP0, "rb"))) -# P0 = P0.tocsr() -# cP0 = P0.tocoo() -# P1 = sparse.COO(pickle.load(open(nmP1, "rb"))) -# P1 = P1.tocsr() -# cP1 = P1.tocoo() -# cr0 = sparse.COO(pickle.load(open(nmR0, "rb"))) -# cr1 = sparse.COO(pickle.load(open(nmR1, "rb"))) -# val = [] -# for s, t, v in zip(cP0.row, cP0.col, cP0.data): -# val.append(v * cr0[s, t]) -# r_tmp0 = scipy.sparse.coo_matrix((val, (cP0.row, cP0.col)), shape=cP0.shape) -# r0 = np.sum(r_tmp0, axis=1) -# del cr0, r_tmp0 -# val = [] -# for s, t, v in zip(cP1.row, cP1.col, cP1.data): -# val.append(v * cr1[s, t]) -# r_tmp1 = scipy.sparse.coo_matrix((val, (cP1.row, cP1.col)), shape=cP1.shape) -# r1 = np.sum(r_tmp1, axis=1) -# del cr1, r_tmp1 -# r = np.asarray(np.concatenate([r0, r1], axis=1)) -# np.savetxt("r.txt", r) - - -# get_reward() - - -def solve_sparse(A, b, tol=1e-15): - num_iters = 0 - - def callback(xk): - nonlocal num_iters - num_iters += 1 - - # call the solver on your data - x, status = scipy.sparse.linalg.bicgstab(A, b, tol=tol, callback=callback) - return x, status, num_iters - - -class PolicyGradient: - def __init__(self, nmP0, nmP1, tp=0, his=False, gam=0.99, eps=1e-12): - self.P0 = 
sparse.COO(pickle.load(open(nmP0, "rb"))) - self.P1 = sparse.COO(pickle.load(open(nmP1, "rb"))) - self.P = sparse.stack([self.P0, self.P1], axis=0) - self.P0 = self.P0.tocsr() - self.P1 = self.P1.tocsr() - self.Na = 2 - self.Ns = self.P0.shape[0] - self.r = np.loadtxt("r.txt") - - etp = ("KL", "rKL", "alpha") - self.tau = 0.001 - self.gam = gam - self.eps = eps - self.alpha = None - self.result = None - self.ref_result = None - - self.resultZ = np.zeros([self.Ns, self.Na]) - self.train_from_history = his - self.err = [[], [], {}] - self.err_exact = [] - self.V_exact = None - self.pi_exact = None - - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - - def train(self, eta=1, T=20, count_spsolver=True): - if self.etp == "KL": - self.err[0] = [] - if self.etp == "rKL": - self.err[1] = [] - if self.etp == "alpha": - self.err[2][self.alpha] = [] - - pi = ( - np.ones([self.Ns, self.Na]) / self.Na - if not self.train_from_history - else np.copy(self.result) - ) - pi_new = np.zeros([self.Ns, self.Na]) - - # Idt = np.identity(self.Ns) - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - - if self.ref_result is not None: - self.err_exact = [] - - for itr in range(T): - if self.ref_result is not None: - self.err_exact.append(np.linalg.norm(self.ref_result - pi, "fro")) - self.err_exact[-1] /= np.linalg.norm(self.ref_result, "fro") - - # Construct P_pi - cP0 = self.P0.tocoo() - val = [] - for s, v in zip(cP0.row, cP0.data): - val.append(v * pi[s, 0]) - P_pi_tmp0 = scipy.sparse.coo_matrix((val, (cP0.row, cP0.col)), shape=cP0.shape) - - cP1 = self.P1.tocoo() - val = [] - for s, v in zip(cP1.row, cP1.data): - val.append(v * pi[s, 1]) - P_pi_tmp1 = scipy.sparse.coo_matrix((val, (cP1.row, cP1.col)), shape=cP1.shape) - - P_pi = P_pi_tmp0 + P_pi_tmp1 - - # Construct r_pi - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = pi[s, 0] * self.r[s, 0] + pi[s, 1] * self.r[s, 1] - H_pi = 0 - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = 4 / (1 - self.alpha ** 2) * self.Na ** ((self.alpha - 1) / 2) - H_pi = tmp_h1 - tmp_h2 * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - if count_spsolver: - v_pi, _, num_iters = solve_sparse( - Idt - self.gam * P_pi, r_pi - self.tau * H_pi, tol=1e-12 - ) - print(num_iters) - else: - v_pi, _ = sla.bicgstab(Idt - self.gam * P_pi, r_pi - self.tau * H_pi, tol=1e-12) - # print(np.mean(v_pi)) - # print(np.linalg.norm((Idt-gam * P_pi) @ v_pi - r_pi)) - if self.etp == "KL": - pi_new = pi ** (1 - eta) - for a in range(self.Na): - tmp_pi = self.r[:, a] - (Idt - self.gam * self.P[a].tocsr()) @ v_pi - tmp_pi *= eta / self.tau - pi_new[:, a] = pi_new[:, a] * np.exp(tmp_pi) - - if self.etp == "rKL": - for a in range(self.Na): - tmp = self.P[a].tocsr() @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) / pi[s, a] - pi_new[s, a] -= ( - eta * self.Na * (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - ) - - for s in range(self.Ns): - - def equation(x): - return 1 / (x + pi_new[s, 0] + 1e-300) + 1 / (x + pi_new[s, 1] + 1e-300) - 1 - - low = max(self.Na - np.max(pi_new[s, :]), -np.min(pi_new[s, :])) - high = self.Na - np.min(pi_new[s, :]) - cs = low - # if equation(low) * equation(high) >= 0: - # print(pi_new[s, :]) - # print(equation(low)) - # print(equation(high)) - # if np.abs(equation(low)) < 1e-15: - # cs = 
low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - # cs = scipy.optimize.newton_krylov(equation, np.array([1]), f_tol=1e-12) - # print(np.abs(equation(cs)) < 1*10**-10) - for a in range(self.Na): - pi_new[s, a] = 1 / (pi_new[s, a] + cs) - # print(s) - # print(np.abs(np.sum(pi_new[s, :])-1) < 1*10**-10) - # print(np.min(pi_new[s, :])) - - if self.etp == "alpha": - for a in range(self.Na): - tmp = self.P[a].tocsr() @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) * (pi[s, a] ** ((self.alpha - 1) / 2)) - tmp_pi1 = (1 - self.alpha) / 2 * eta * (self.Na ** ((1 - self.alpha) / 2)) - tmp_pi2 = (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - pi_new[s, a] -= tmp_pi1 * tmp_pi2 - - for s in range(self.Ns): - alpha = self.alpha - - def equation(x): - tmp1 = (x + pi_new[s, 0] + 1e-300) ** (2 / (alpha - 1)) - tmp2 = (x + pi_new[s, 1] + 1e-300) ** (2 / (alpha - 1)) - return tmp1 + tmp2 - 1 - - low = max( - self.Na ** ((1 - self.alpha) / 2) - np.max(pi_new[s, :]), - -np.min(pi_new[s, :]), - ) - high = self.Na ** ((1 - self.alpha) / 2) - np.min(pi_new[s, :]) - cs = low - # if np.abs(equation(low)) < 1e-15: - # cs = low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - for a in range(self.Na): - pi_new[s, a] = (pi_new[s, a] + cs) ** (2 / (self.alpha - 1)) - - for s in range(self.Ns): - pi_new[s, :] /= sum(pi_new[s, :]) - err = np.linalg.norm(pi_new - pi, "fro") / np.linalg.norm(pi, "fro") - print(err) - if self.etp == "KL": - self.err[0].append(err) - if self.etp == "rKL": - self.err[1].append(err) - if self.etp == "alpha": - self.err[2][self.alpha].append(err) - if err < self.eps: - print(itr) - break - pi[:] = pi_new[:] - self.result = np.copy(pi) - self.ref_result = np.copy(pi) - - def set_tau(self, tau): - self.tau = tau - self.ref_result = None - - def set_alpha(self, alpha): - self.alpha = alpha - - def set_history(self, his): - self.train_from_history = his - if his and self.result is None: - self.result = np.ones([self.Ns, self.Na]) / self.Na - - def clear_history(self): - self.result = self.result = np.ones([self.Ns, self.Na]) / self.Na - - def set_etp(self, tp): - etp = ("KL", "rKL", "alpha") - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - self.ref_result = None - - def get_value(self, pi=None, reg=False): - if self.result is None: - return - if pi is None: - pi = self.result - Idt = np.identity(self.Ns) - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if reg: - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = 4 / (1 - self.alpha ** 2) * self.Na ** ((self.alpha - 1) / 2) - H_pi = tmp_h1 - tmp_h2 * np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - return v_pi - - def get_expect_value(self, pi=None): - if self.result is None: - return - if pi is None: - pi = self.result - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], 
self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - _, evec = sla.eigs(P_pi.T, k=1, which="LM") - d_pi = (evec / evec.sum()).real - d_pi = d_pi.reshape(len(d_pi)) - return np.dot(d_pi, r_pi) - - def make_fig(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = self.err[0] - err1 = self.err[1] - err2 = self.err[2][alpha1] - err3 = self.err[2][alpha2] - ax.plot(err0, label="KL", linewidth=linewidth) - ax.plot(err1, label="reversed KL", linewidth=linewidth) - ax.plot(err2, label="Hellinger", linewidth=linewidth) - ax.plot(err3, label="alpha divergence (alpha=" + str(alpha2) + "-3", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig2(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = np.log(np.abs(np.log(self.err[0]))) - err1 = np.log(np.abs(np.log(self.err[1]))) - err2 = np.log(np.abs(np.log(self.err[2][alpha1]))) - err3 = np.log(np.abs(np.log(self.err[2][alpha2]))) - ax.plot(err0, label="KL", linewidth=linewidth, marker="+") - ax.plot(err1, label="reversed KL", linewidth=linewidth, marker="+") - ax.plot( - err2, - label="alpha divergence with alpha = " + str(alpha1), - linewidth=linewidth, - marker="+", - ) - ax.plot( - err3, - label="alpha divergence with alpha = " + str(alpha2), - linewidth=linewidth, - marker="+", - ) - # ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="lower right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact(self, label="KL", linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.log(np.abs(np.log(self.err_exact[:-1]))) - if self.err_exact[-1] == 0 - else np.log(np.abs(np.log(self.err_exact))) - ) - ax.plot(err0, label=label, linewidth=linewidth, marker="+") - ax.plot( - [0, len(err0) - 1], - [0, np.log(2) * (len(err0) - 1)], - label="slope=log2", - color="g", - linestyle="dashed", - linewidth=0.5, - ) - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("loglog error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def train_all(self, alpha1=0, alpha2=-3): - self.set_etp(0) - self.train() - self.set_etp(1) - self.train() - self.set_etp(2) - self.set_alpha(alpha1) - self.train() - self.set_alpha(alpha2) - self.train() - - def value_iteration(self): - V_pre = np.ones(self.Ns) - V = np.zeros(self.Ns) - Q = np.zeros([self.Ns, self.Na]) - cnt = 0 - while np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre) > 10 ** -15: - cnt += 1 - V_pre = V - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - - if self.etp == "KL": - V = logsumexp(Q / self.tau, axis=1) * self.tau - if self.etp == "rKL": - return - if self.etp == "alpha": - return - if cnt % 100 == 0: - print(np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre)) - - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - V - self.V_exact = V - self.pi_exact = softmax(Q / self.tau, axis=1) - print(cnt) - - def train_primal_dual_old(self, CV=0.1, CW=0.1, tol=1e-6, step=0.01): - V = 
np.zeros(self.Ns) - W = np.ones(self.Ns) - Z0 = np.zeros([self.Ns, self.Na]) - Z = np.ones([self.Ns, self.Na]) / self.Na - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - - while np.max(np.abs(Z - Z0)) > tol: - cnt += 1 - PZ = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - PZ[s, t] = np.dot(Z[s, :], self.P[:, s, t]) - RZ = np.zeros(self.Ns) - for s in range(self.Ns): - RZ[s] = np.dot(Z[s, :], self.r[s, :]) - HZ = np.sum(xlogy(Z, Z), axis=1) - - EV = V - (Idt - self.gam * PZ.T) @ W / CV - EW = (RZ - (Idt - self.gam * PZ) @ V - self.tau * HZ) / CW - W - EZ = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EZ[:, a] = ( - self.r[:, a] - (Idt - self.gam * self.P[a, :, :]) @ V - ) / self.tau - np.log(Z[:, a]) - Z0 = Z - V = V - dt * EV - W = W + dt * EW - Z = Z * np.exp(dt * EZ) - # print(np.min(W)) - for s in range(self.Ns): - Z[s, :] /= sum(Z[s, :]) - if cnt % 10 == 0: - print(cnt) - print(np.max(np.abs(Z - Z0))) - self.resultZ[:] = Z[:] - print(cnt) - print(np.max(np.abs(Z - Z0))) - - def train_primal_dual(self, CV=0.01, CM=0.01, tol=1e-8, step=0.01): - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - # Idt = np.identity(self.Ns) - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].T) @ M[:, a] - EV /= CV - V = (V + dt * EV) / (1 + dt) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a]) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a]) + 1 - H = (H + dt * EM) / (1 + dt) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - - if cnt % 50 == 0: - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - for s in range(self.Ns): - Z[s, :] = M[s, :] / sum(M[s, :]) - self.resultZ[:] = Z[:] - print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - -# spec = [ -# ("value", int32), # a simple scalar field -# ("array", float32[:]), # an array field -# ] - - -# @jitclass(spec) -class PolicyGradientNew: - def __init__(self, nmP0, nmP1, tp=0, his=False, gam=0.99, eps=1e-12): - self.P0 = sparse.COO(pickle.load(open(nmP0, "rb"))) - self.P1 = sparse.COO(pickle.load(open(nmP1, "rb"))) - self.Na = 2 - self.Ns = self.P0.shape[0] - self.P0 = self.P0.tocsr() - self.P1 = self.P1.tocsr() - self.P0 = self.P0.tolil() - self.P1 = self.P1.tolil() - for s in range(self.Ns): - if self.P0[s].nnz == 0: - self.P0[s, -1] = 1 - if self.P1[s].nnz == 0: - self.P1[s, -1] = 1 - self.P0 = self.P0.tocsr() - self.P1 = self.P1.tocsr() - self.P = sparse.stack([sparse.COO(self.P0), sparse.COO(self.P1.tocoo())], axis=0) - - self.r = np.loadtxt("r.txt") - self.mu = np.ones(self.Na) / self.Na - self.init_type = "prior" - self.init_dis = None - - etp = ("KL", "rKL", "alpha") - self.tau = 0.001 - self.gam = gam - self.eps = eps - self.alpha = None - self.result = None - self.ref_result = None - self.resultZ = None - self.resultM = None - self.resultV = None - - self.resultZ = np.zeros([self.Ns, self.Na]) - self.train_from_history = his - self.err = [[], 
[], {}] - self.err_exact = [] - self.err_pd = [] - self.err_pd_exact = [] - self.err_pd_val_pi = [] - self.err_pd_val_V = [] - self.lyap = [] - self.V_exact = None - self.pi_exact = None - self.mu_exact = None - - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - - def set_prior(self, mu): - for a in range(self.Na): - self.mu[a] = mu[a] - - def set_init_dis(self, tmp): - self.init_dis = np.copy(tmp) - - def set_init_type(self, tp): - self.init_type = tp - - def set_tau(self, tau): - self.tau = tau - self.ref_result = None - - def set_alpha(self, alpha): - self.alpha = alpha - - def set_history(self, his): - self.train_from_history = his - if his and self.result is None: - self.result = np.ones([self.Ns, self.Na]) / self.Na - - def clear_history(self): - self.result = self.result = np.ones([self.Ns, self.Na]) / self.Na - - def set_etp(self, tp): - etp = ("KL", "rKL", "alpha") - self.etp = etp[tp] - if self.etp == "alpha": - self.alpha = 0 - self.ref_result = None - - def train(self, eta=1, T=20, count_spsolver=True): - if self.etp == "KL": - self.err[0] = [] - if self.etp == "rKL": - self.err[1] = [] - if self.etp == "alpha": - self.err[2][self.alpha] = [] - - pi = None - if self.init_type == "prior": - pi = ( - np.ones([self.Ns, 1]) * self.mu - if not self.train_from_history - else np.copy(self.result) - ) - if self.init_type == "uniform": - pi = ( - np.ones([self.Ns, self.Na]) / self.Na - if not self.train_from_history - else np.copy(self.result) - ) - if self.init_type == "random": - pi = np.random.exponential(size=[self.Ns, self.Na]) - pi = pi / np.sum(pi, axis=1, keepdims=True) - if self.init_type == "assigned" and self.init_dis is not None: - pi = np.copy(self.init_dis) - - pi_new = np.zeros([self.Ns, self.Na]) - - # Idt = np.identity(self.Ns) - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - - if self.ref_result is not None: - self.err_exact = [] - - for itr in range(T): - if self.ref_result is not None: - self.err_exact.append(np.linalg.norm(self.ref_result - pi, "fro")) - self.err_exact[-1] /= np.linalg.norm(self.ref_result, "fro") - - # Construct P_pi - cP0 = self.P0.tocoo() - val = [] - for s, v in zip(cP0.row, cP0.data): - val.append(v * pi[s, 0]) - P_pi_tmp0 = scipy.sparse.coo_matrix((val, (cP0.row, cP0.col)), shape=cP0.shape) - - cP1 = self.P1.tocoo() - val = [] - for s, v in zip(cP1.row, cP1.data): - val.append(v * pi[s, 1]) - P_pi_tmp1 = scipy.sparse.coo_matrix((val, (cP1.row, cP1.col)), shape=cP1.shape) - - P_pi = P_pi_tmp0 + P_pi_tmp1 - - # Construct r_pi - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = pi[s, 0] * self.r[s, 0] + pi[s, 1] * self.r[s, 1] - H_pi = 0 - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi / self.mu), axis=1) - if self.etp == "rKL": - H_pi = -np.sum(xlogy(self.mu, pi / self.mu), axis=1) - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = np.sum(self.mu * (pi / self.mu) ** ((self.alpha + 1) / 2), axis=1) - H_pi = tmp_h1 - tmp_h1 * tmp_h2 - - if count_spsolver: - v_pi, _, num_iters = solve_sparse( - Idt - self.gam * P_pi, r_pi - self.tau * H_pi, tol=1e-12 - ) - print(num_iters) - else: - v_pi, _ = sla.bicgstab(Idt - self.gam * P_pi, r_pi - self.tau * H_pi, tol=1e-12) - # print((Idt - self.gam * P_pi).shape) - # print((r_pi - self.tau * H_pi).shape) - # print(sla.bicgstab(Idt - self.gam * P_pi, r_pi - self.tau * H_pi, tol=1e-12)) - # print(np.mean(v_pi)) - # print(np.linalg.norm((Idt-gam * P_pi) @ v_pi - 
r_pi)) - if self.etp == "KL": - pi_new = pi ** (1 - eta) - for a in range(self.Na): - pi_new[:, a] *= self.mu[a] ** eta - tmp_pi = self.r[:, a] - (Idt - self.gam * self.P[a].tocsr()) @ v_pi - pi_new[:, a] = pi_new[:, a] * np.exp(eta * tmp_pi / self.tau) - - if self.etp == "rKL": - for a in range(self.Na): - tmp = self.P[a].tocsr() @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) / pi[s, a] - tmp_pi = self.r[s, a] - v_pi[s] + self.gam * tmp[s] - pi_new[s, a] -= eta / self.mu[a] * tmp_pi / self.tau - - for s in range(self.Ns): - - def equation(x): - if x == -np.min(pi_new[s, :] * self.mu): - return 1e300 - tmp_x0 = x / self.mu[0] + pi_new[s, 0] + 1e-300 - tmp_x1 = x / self.mu[1] + pi_new[s, 1] + 1e-300 - return 1 / tmp_x0 + 1 / tmp_x1 - 1 - - low = max( - np.min((self.Na - pi_new[s, :]) * self.mu), -np.min(pi_new[s, :] * self.mu) - ) - high = np.max((self.Na - pi_new[s, :]) * self.mu) - cs = low - # if equation(low) * equation(high) >= 0: - # print(pi_new[s, :]) - # print(low) - # print(high) - # print(equation(low)) - # print(equation(high)) - # if np.abs(equation(low)) < 1e-15: - # cs = low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - # cs = scipy.optimize.newton_krylov(equation, np.array([1]), f_tol=1e-12) - # print(np.abs(equation(cs)) < 1*10**-10) - for a in range(self.Na): - pi_new[s, a] = 1 / (pi_new[s, a] + cs / self.mu[a]) - # print(s) - # print(np.abs(np.sum(pi_new[s, :])-1) < 1*10**-10) - # print(np.min(pi_new[s, :])) - - if self.etp == "alpha": - for a in range(self.Na): - tmp = self.P[a].tocsr() @ v_pi - for s in range(self.Ns): - pi_new[s, a] = (1 - eta) * (pi[s, a] ** ((self.alpha - 1) / 2)) - tmp_pi1 = ( - (1 - self.alpha) / 2 * eta * (self.mu[a] ** ((self.alpha - 1) / 2)) - ) - tmp_pi2 = (self.r[s, a] - v_pi[s] + self.gam * tmp[s]) / self.tau - pi_new[s, a] -= tmp_pi1 * tmp_pi2 - - for s in range(self.Ns): - alpha = self.alpha - - def equation(x): - if x == -np.min(pi_new[s, :] * self.mu): - return 1e30 - tmp_x1 = x * self.mu[0] ** ((self.alpha - 1) / 2) + pi_new[s, 0] + 1e-30 - tmp_x2 = x * self.mu[1] ** ((self.alpha - 1) / 2) + pi_new[s, 1] + 1e-30 - return tmp_x1 ** (2 / (alpha - 1)) + tmp_x2 ** (2 / (alpha - 1)) - 1 - - tmp_low1 = self.Na ** ((1 - self.alpha) / 2) - pi_new[s, :] - tmp_low2 = self.mu ** ((1 - self.alpha) / 2) - tmp_low3 = -np.min(pi_new[s, :] * self.mu ** ((1 - self.alpha) / 2)) - low = max(np.min(tmp_low1 * tmp_low2), tmp_low3) - tmp_high1 = self.Na ** ((1 - self.alpha) / 2) - pi_new[s, :] - tmp_high2 = self.mu ** ((1 - self.alpha) / 2) - high = np.max(tmp_high1 * tmp_high2) - # low = max(self.Na ** ((1 - self.alpha) / 2) - np.max(pi_new[s, :]), -np.min(pi_new[s, :])) - # high = self.Na ** ((1 - self.alpha) / 2) - np.min(pi_new[s, :]) - cs = low - if np.abs(equation(high)) < 1e-15: - cs = high - if np.abs(equation(low)) >= 1e-15 and np.abs(equation(high)) >= 1e-15: - cs = scipy.optimize.bisect(equation, low, high) - for a in range(self.Na): - pi_new[s, a] = ( - pi_new[s, a] + cs * self.mu[a] ** ((self.alpha - 1) / 2) - ) ** (2 / (self.alpha - 1)) - - for s in range(self.Ns): - pi_new[s, :] /= sum(pi_new[s, :]) - err = np.linalg.norm(pi_new - pi, "fro") / np.linalg.norm(pi, "fro") - print(err) - if self.etp == "KL": - self.err[0].append(err) - if self.etp == "rKL": - self.err[1].append(err) - if self.etp == "alpha": - self.err[2][self.alpha].append(err) - if err < self.eps: - print(itr) - break - pi[:] = pi_new[:] - 
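# End of the policy iteration loop: store the converged policy as both the warm start (self.result) and the reference used by err_exact (self.ref_result). -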
self.result = np.copy(pi) - self.ref_result = np.copy(pi) - - def get_value(self, pi=None, reg=False): - if self.result is None: - return - if pi is None: - pi = self.result - Idt = np.identity(self.Ns) - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - H_pi = 0 - if reg: - if self.etp == "KL": - H_pi = np.sum(xlogy(pi, pi), axis=1) - if self.etp == "rKL": - H_pi = np.sum(-np.log(pi), axis=1) / self.Na - if self.etp == "alpha": - tmp_h1 = 4 / (1 - self.alpha ** 2) - tmp_h2 = self.Na ** ((self.alpha - 1) / 2) - tmp_h3 = np.sum(pi ** ((self.alpha + 1) / 2), axis=1) - H_pi = tmp_h1 - tmp_h1 * tmp_h2 * tmp_h3 - v_pi = np.linalg.solve(Idt - self.gam * P_pi, r_pi - self.tau * H_pi) - return v_pi - - def get_expect_value(self, pi=None): - if self.result is None: - return - if pi is None: - pi = self.result - P_pi = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - P_pi[s, t] = np.dot(pi[s, :], self.P[:, s, t]) - r_pi = np.zeros(self.Ns) - for s in range(self.Ns): - r_pi[s] = np.dot(pi[s, :], self.r[s, :]) - _, evec = sla.eigs(P_pi.T, k=1, which="LM") - d_pi = (evec / evec.sum()).real - d_pi = d_pi.reshape(len(d_pi)) - return np.dot(d_pi, r_pi) - - def make_fig(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = self.err[0] - err1 = self.err[1] - err2 = self.err[2][alpha1] - err3 = self.err[2][alpha2] - ax.plot(err0, label="KL", linewidth=linewidth) - ax.plot(err1, label="reversed KL", linewidth=linewidth) - ax.plot(err2, label="Hellinger", linewidth=linewidth) - ax.plot(err3, label="alpha divergence (alpha=" + str(alpha2) + "-3", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_init(self, linewidth=0.5): - err0, err1, err2, err3 = None, None, None, None - if self.etp == "KL": - self.set_init_type("uniform") - self.train() - err0 = np.copy(self.err[0]) - self.set_init_type("prior") - self.train() - err1 = np.copy(self.err[0]) - self.set_init_type("random") - self.train() - err2 = np.copy(self.err[0]) - self.set_init_type("random") - self.train() - err3 = np.copy(self.err[0]) - if self.etp == "rKL": - self.set_init_type("uniform") - self.train() - err0 = np.copy(self.err[1]) - self.set_init_type("prior") - self.train() - err1 = np.copy(self.err[1]) - self.set_init_type("random") - self.train() - err2 = np.copy(self.err[1]) - self.set_init_type("random") - self.train() - err3 = np.copy(self.err[1]) - if self.etp == "alpha": - self.set_init_type("uniform") - self.train() - err0 = np.copy(self.err[2][self.alpha]) - self.set_init_type("prior") - self.train() - err1 = np.copy(self.err[2][self.alpha]) - self.set_init_type("random") - self.train() - err2 = np.copy(self.err[2][self.alpha]) - self.set_init_type("random") - self.train() - err3 = np.copy(self.err[2][self.alpha]) - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="uniform", linewidth=linewidth) - ax.plot(err1, label="prior", linewidth=linewidth) - ax.plot(err2, label="random 1", linewidth=linewidth) - ax.plot(err3, label="random 2", linewidth=linewidth) - ax.set_yscale("log") - 
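# Log-scale y-axis so geometric convergence appears as straight lines, making the four initializations directly comparable. -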
ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig2(self, alpha1, alpha2, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = np.log(np.abs(np.log(self.err[0]))) - err1 = np.log(np.abs(np.log(self.err[1]))) - err2 = np.log(np.abs(np.log(self.err[2][alpha1]))) - err3 = np.log(np.abs(np.log(self.err[2][alpha2]))) - ax.plot(err0, label="KL", linewidth=linewidth, marker="+") - ax.plot(err1, label="reversed KL", linewidth=linewidth, marker="+") - ax.plot( - err2, - label="alpha divergence with alpha = " + str(alpha1), - linewidth=linewidth, - marker="+", - ) - ax.plot( - err3, - label="alpha divergence with alpha = " + str(alpha2), - linewidth=linewidth, - marker="+", - ) - # ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of policy", fontsize=22) - ax.legend(loc="lower right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact(self, label="KL", linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.log(np.abs(np.log(self.err_exact[:-1]))) - if self.err_exact[-1] == 0 - else np.log(np.abs(np.log(self.err_exact))) - ) - ax.plot(err0, label=label, linewidth=linewidth, marker="+") - ax.plot( - [0, len(err0) - 1], - [0, np.log(2) * (len(err0) - 1)], - label="slope=log2", - color="g", - linestyle="dashed", - linewidth=0.5, - ) - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("loglog error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def train_all(self, alpha1=0, alpha2=-3): - self.set_etp(0) - self.train() - self.set_etp(1) - self.train() - self.set_etp(2) - self.set_alpha(alpha1) - self.train() - self.set_alpha(alpha2) - self.train() - - def value_iteration(self): - V_pre = np.ones(self.Ns) - V = np.zeros(self.Ns) - Q = np.zeros([self.Ns, self.Na]) - cnt = 0 - while np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre) > 10 ** -15: - cnt += 1 - V_pre = V - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - - if self.etp == "KL": - V = logsumexp(Q / self.tau, axis=1) * self.tau - if self.etp == "rKL": - return - if self.etp == "alpha": - return - if cnt % 100 == 0: - print(np.linalg.norm(V - V_pre) / np.linalg.norm(V_pre)) - - for a in range(self.Na): - Q[:, a] = self.r[:, a] + self.gam * (self.P[a, :, :] @ V) - V - self.V_exact = V - self.pi_exact = softmax(Q / self.tau, axis=1) - print(cnt) - - def train_primal_dual_old(self, CV=0.1, CW=0.1, tol=1e-6, step=0.01): - V = np.zeros(self.Ns) - W = np.ones(self.Ns) - Z0 = np.zeros([self.Ns, self.Na]) - Z = np.ones([self.Ns, self.Na]) / self.Na - dt = step - Idt = np.identity(self.Ns) - cnt = 0 - - while np.max(np.abs(Z - Z0)) > tol: - cnt += 1 - PZ = np.zeros([self.Ns, self.Ns]) - for s in range(self.Ns): - for t in range(self.Ns): - PZ[s, t] = np.dot(Z[s, :], self.P[:, s, t]) - RZ = np.zeros(self.Ns) - for s in range(self.Ns): - RZ[s] = np.dot(Z[s, :], self.r[s, :]) - HZ = np.sum(xlogy(Z, Z), axis=1) - - EV = V - (Idt - self.gam * PZ.T) @ W / CV - EW = (RZ - (Idt - self.gam * PZ) @ V - self.tau * HZ) / CW - W - EZ = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EZ[:, a] = ( - self.r[:, a] - (Idt - self.gam * self.P[a, :, :]) @ V - ) / self.tau - np.log(Z[:, a]) - Z0 = Z - V = V 
- dt * EV - W = W + dt * EW - Z = Z * np.exp(dt * EZ) - for s in range(self.Ns): - Z[s, :] /= sum(Z[s, :]) - if cnt % 10 == 0: - print(cnt) - print(np.max(np.abs(Z - Z0))) - self.resultZ[:] = Z[:] - print(cnt) - print(np.max(np.abs(Z - Z0))) - - def train_primal_dual(self, CV=0.01, CM=0.01, tol=1e-8, step=0.01): - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) / self.Na - M0 = np.zeros([self.Ns, self.Na]) - dt = step - # Idt = np.identity(self.Ns) - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - cnt = 0 - Z = np.zeros([self.Ns, self.Na]) - - while np.linalg.norm(M - M0) / np.linalg.norm(M) > tol: - M0[:] = M[:] - cnt += 1 - - EV = np.zeros(self.Ns) - for a in range(self.Na): - EV += (Idt - self.gam * self.P[a].tocsr().T) @ M[:, a] - EV /= CV - V = (V + dt * EV) / (1 + dt) - - EM = np.zeros([self.Ns, self.Na]) - for a in range(self.Na): - EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a].tocsr()) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - for a in range(self.Na): - H[s, a] = M[s, a] if M[s, a] >= 1 else np.log(M[s, a]) + 1 - H = (H + dt * EM) / (1 + dt) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = H[s, a] if H[s, a] >= 1 else np.exp(H[s, a] - 1) - - if cnt % 50 == 0: - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - for s in range(self.Ns): - Z[s, :] = M[s, :] / sum(M[s, :]) - self.resultZ[:] = Z[:] - print(M) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def _lyap(self, CV, V, M): - L1 = CV / 2 * (np.linalg.norm(V - self.resultV)) ** 2 - L21 = self.tau * np.sum(xlogy(self.resultM, self.resultM)) - L22 = self.tau * np.sum(xlogy(self.resultM, M)) - L23 = self.tau * np.sum(M - self.resultM) - return L1 + L21 - L22 + L23 - - def _lyap_mixed(self, CV, V, M, c): - tmp_b = np.sum(self.resultM, axis=1) - tmp_M = np.sum(M, axis=1) - L1 = CV / 2 * (np.linalg.norm(V - self.resultV)) ** 2 - L21 = self.tau * np.sum(xlogy(self.resultM, self.resultM)) - L22 = self.tau * np.sum(xlogy(self.resultM, M)) - L22 -= self.tau * c / (1 - c) * np.sum(xlogy(tmp_b, tmp_b)) - L23 = self.tau * c / (1 - c) * np.sum(xlogy(tmp_b, tmp_M)) - L23 -= self.tau / (1 - c) * np.sum(M - self.resultM) - return L1 + L21 - L22 - L23 - - # @nb.jit(nopython=True) - def train_primal_dual_pseudo_mix( - self, CV=0.01, tol=1e-5, step=0.01, c=0.99 - ): # with \mu\log\hat{\mu} regularization and a different approximate Hessian - CM = self.tau - V = np.ones(self.Ns) - M = np.ones([self.Ns, self.Na]) * CV - # M = np.ones([self.Ns, self.Na]) * 199629 - M0 = np.zeros([self.Ns, self.Na]) - V0 = np.ones(self.Ns) - # Idt = np.identity(self.Ns) - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - cnt = 0 - dt = step - Z = np.zeros([self.Ns, self.Na]) - self.err_pd = [] - self.err_pd_val_V = [] - self.err_pd_val_pi = [] - self.lyap = [] - if self.resultZ is not None: - self.err_pd_exact = [] - - if self.resultM is None or self.resultV is None: - self.get_mu(CV) - self.resultM = np.copy(self.mu_exact) - self.resultV = np.copy(self.V_exact) - print("Exact mu obtained") - - tmp_err = 1 - while tmp_err > tol: - M0[:] = M[:] - V0[:] = V[:] - cnt += 1 - - EV = np.zeros(self.Ns) - # for a in range(self.Na): - # EV += (Idt - self.gam * self.P[a].tocsr().T) @ M[:, a] - EV += (Idt - self.gam * self.P[0].tocsr().T) @ M[:, 0] - EV += (Idt - self.gam * self.P[1].tocsr().T) @ 
M[:, 1] - EV /= CV - V = (1 - dt) * V + dt * EV - - EM = np.zeros([self.Ns, self.Na]) - # for a in range(self.Na): - # EM[:, a] = (self.r[:, a] - (Idt - self.gam * self.P[a].tocsr()) @ V) / CM - EM[:, 0] = (self.r[:, 0] - (Idt - self.gam * self.P[0].tocsr()) @ V) / CM - EM[:, 1] = (self.r[:, 1] - (Idt - self.gam * self.P[1].tocsr()) @ V) / CM - - H = np.zeros([self.Ns, self.Na]) - for s in range(self.Ns): - H[s, 0] = np.log(M[s, 0] + 1e-300) - H[s, 1] = np.log(M[s, 1] + 1e-300) - # for a in range(self.Na): - # H[s, a] = np.log(M[s, a] + 1e-300) - - # print(1) - - for s in range(self.Ns): - # tmp_w = np.log(np.sum(M[s, :]) + 1e-300) - tmp_w = np.log(M[s, 0] + M[s, 1] + 1e-300) - # tmp_pi = softmax(H[s, :]) - tmp_pi0 = np.exp(H[s, 0]) - tmp_pi1 = np.exp(H[s, 1]) - tmp_s = tmp_pi0 + tmp_pi1 - tmp_pi0 /= tmp_s - tmp_pi1 /= tmp_s - # for a in range(self.Na): - # # H[s, a] = (H[s, a] + dt * EM[s, a] + dt * tmp_w) / (1 + dt) - # H[s, a] = (1-dt) * H[s, a] + dt * (EM[s, a] + tmp_w) - # tmp_add = np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - tmp_add = tmp_pi0 * (EM[s, 0] + tmp_w - H[s, 0]) + tmp_pi1 * ( - EM[s, 1] + tmp_w - H[s, 1] - ) - tmp_add *= c * dt - H[s, 0] = (1 - dt) * H[s, 0] + dt * (EM[s, 0] + tmp_w) - tmp_add - H[s, 1] = (1 - dt) * H[s, 1] + dt * (EM[s, 1] + tmp_w) - tmp_add - - # H[s, :] = (1 - dt) * H[s, :] + dt * (EM[s, :] + tmp_w) \ - # - c * dt * np.ones(self.Na) * np.dot(tmp_pi, EM[s, :] + tmp_w - H[s, :]) - - # print(2) - - for s in range(self.Ns): - for a in range(self.Na): - M[s, a] = np.exp(H[s, a]) - - if np.linalg.norm(M, "fro") == 0 or np.linalg.norm(M, "fro") == np.inf: - # self.M = M - print(M) - return - - # err = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - # tmp_err = err - # self.err_pd.append(err) - err1 = np.linalg.norm(M - M0, "fro") / np.linalg.norm(M, "fro") - err2 = np.linalg.norm(V - V0) / np.linalg.norm(V) - tmp_err = max(err1, err2) - self.err_pd.append(err1) - for s in range(self.Ns): - # if np.sum(M[s, :]) == 0: - # self.M = M - # print(M) - # return - Z[s, :] = M[s, :] / np.sum(M[s, :]) - if self.resultZ is not None: - self.err_pd_exact.append(np.linalg.norm(self.resultZ - Z, "fro")) - self.err_pd_exact[-1] /= np.linalg.norm(self.resultZ, "fro") - if self.V_exact is not None: - self.err_pd_val_V.append(np.linalg.norm(V - self.V_exact)) - self.err_pd_val_V[-1] /= np.linalg.norm(self.V_exact) - if self.pi_exact is not None: - self.err_pd_val_pi.append(np.linalg.norm(self.pi_exact - Z, "fro")) - self.err_pd_val_pi[-1] /= np.linalg.norm(self.pi_exact, "fro") - - if cnt % 5 == 0: - print(cnt) - # print(CM) - print(np.sum(M)) - print(err1) - print(err2) - # print(np.sum(Z > 0.05)) - print(np.linalg.norm(V - self.V_exact) / np.linalg.norm(self.V_exact)) - print(np.max(np.abs(Z - self.pi_exact))) - print(np.mean(np.abs(Z - self.pi_exact))) - print(np.sum(np.abs(Z - self.pi_exact) > 0.1)) - - if self.resultM is not None and self.resultV is not None: - self.lyap.append(self._lyap_mixed(CV, V, M, c)) - - for s in range(self.Ns): - Z[s, :] = M[s, :] / np.sum(M[s, :]) - self.resultM = np.copy(M) - self.resultZ = np.copy(Z) - self.resultV = np.copy(V) - print(cnt) - print(np.linalg.norm(M - M0) / np.linalg.norm(M)) - - def get_mu(self, CV): - if self.V_exact is None: - self.value_iteration() - pi = np.copy(self.pi_exact) - # Construct P_pi - cP0 = self.P0.tocoo() - val = [] - for s, v in zip(cP0.row, cP0.data): - val.append(v * pi[s, 0]) - P_pi_tmp0 = scipy.sparse.coo_matrix((val, (cP0.row, cP0.col)), shape=cP0.shape) - - cP1 = self.P1.tocoo() - val = [] - 
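# As for P0 above, scale each nonzero of P1 by pi(s, 1); together the two pieces give the policy-averaged kernel P_pi = diag(pi[:, 0]) @ P0 + diag(pi[:, 1]) @ P1. -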
for s, v in zip(cP1.row, cP1.data): - val.append(v * pi[s, 1]) - P_pi_tmp1 = scipy.sparse.coo_matrix((val, (cP1.row, cP1.col)), shape=cP1.shape) - - P_pi = P_pi_tmp0 + P_pi_tmp1 - - crds = [list(range(self.Ns)), list(range(self.Ns))] - Idt = sparse.COO(crds, [1] * self.Ns, shape=(self.Ns, self.Ns)) - Idt = Idt.tocsr() - - w, _ = sla.bicgstab(Idt - self.gam * P_pi.T, self.V_exact, tol=1e-12) - w *= CV - # print((Idt - self.gam * P_pi.T).shape) - # print(self.V_exact.shape) - # print(w) - - mu = np.copy(self.pi_exact) - for s in range(self.Ns): - mu[s, :] *= w[s] - self.mu_exact = np.copy(mu) - - def make_fig_pd(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.err_pd - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="primal dual", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("relative change of mu", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_pd_ref(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.err_pd_val_V - err1 = self.err_pd_val_pi - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="error of value", linewidth=linewidth) - ax.plot(err1, label="error of policy", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - # ax.set_ylabel("relative change of mu", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_pd_lyap(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - err0 = self.lyap - ax.tick_params(axis="both", labelsize=14) - ax.plot(err0, label="primal dual", linewidth=linewidth) - ax.set_yscale("log") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("Lyapunov", fontsize=22) - ax.legend(loc="upper right") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - def make_fig_exact_pd(self, linewidth=0.5): - fig, ax = plt.subplots(figsize=(12, 8)) - ax.tick_params(axis="both", labelsize=14) - err0 = ( - np.abs(np.log(self.err_pd_exact[:-1])) - if self.err_pd_exact[-1] == 0 - else np.abs(np.log(self.err_pd_exact)) - ) - ax.plot(err0, label="primal dual", linewidth=linewidth, marker="+") - ax.set_xlabel("iterations", fontsize=22) - ax.set_ylabel("log error", fontsize=22) - ax.legend(loc="upper left") - ax.set_title("Tau = " + str(self.tau)) - fig.tight_layout() - - -# def load_var(var_name): -# fid = open(os.path.join(var_name, ".pkl")) -# data = pickle.load(fid) -# fid.close() -# return data - - -def get_size(obj, seen=None): - """Recursively finds size of objects""" - size = sys.getsizeof(obj) - if seen is None: - seen = set() - obj_id = id(obj) - if obj_id in seen: - return 0 - seen.add(obj_id) - if isinstance(obj, dict): - size += sum([get_size(v, seen) for v in obj.values()]) - size += sum([get_size(k, seen) for k in obj.keys()]) - elif hasattr(obj, "__dict__"): - size += get_size(obj.__dict__, seen) - elif hasattr(obj, "__iter__") and not isinstance(obj, (str, bytes, bytearray)): - size += sum([get_size(i, seen) for i in obj]) - return size - - -p = PolicyGradientNew(nmP0="nmP0", nmP1="nmP1") -# p.set_prior([1e-3, 1-1e-3]) -p.set_tau(0.1) -p.gam = 0.9 -p.value_iteration() -p.get_mu(CV=50000) -p.train_primal_dual_pseudo_mix(CV=50000, step=1e-3, c=0.9) -start_time = time.time() -p.train() -elapsed_time = time.time() - start_time -print(elapsed_time)
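For reference, the damped primal-dual iteration that train_primal_dual_pseudo_mix implements in both deleted files can be stated compactly. The sketch below is a dense-NumPy restatement under stated assumptions: a dense transition tensor P of shape (Na, Ns, Ns), a dense reward r of shape (Ns, Na), and the hypothetical helper name primal_dual_pseudo_mix_step. It is a sketch of the update rule, not the sparse implementation deleted above.

import numpy as np


def primal_dual_pseudo_mix_step(P, r, V, M, gam, tau, CV, dt, c):
    """One damped step on the dual value V and the unnormalized occupancy M."""
    # Dual direction: EV[t] = sum_a ((I - gam * P_a^T) @ M[:, a])[t] / CV
    EV = (M.sum(axis=1) - gam * np.einsum("ast,sa->t", P, M)) / CV
    V = (1 - dt) * V + dt * EV
    # Primal direction with CM = tau: EM[:, a] = (r[:, a] - (I - gam * P_a) @ V) / CM
    EM = (r - V[:, None] + gam * np.einsum("ast,t->sa", P, V)) / tau
    # Mirror-descent update on H = log M, plus the rank-one correction of weight c
    H = np.log(M + 1e-300)
    w = np.log(M.sum(axis=1) + 1e-300)   # log of the state marginal of M
    pi = np.exp(H - H.max(axis=1, keepdims=True))
    pi /= pi.sum(axis=1, keepdims=True)  # row-wise softmax(H)
    G = EM + w[:, None] - H
    H = (1 - dt) * H + dt * (EM + w[:, None]) - c * dt * (pi * G).sum(axis=1, keepdims=True)
    return V, np.exp(H)

Iterating this step until max(||M - M0||_F / ||M||_F, ||V - V0|| / ||V||) falls below tol reproduces the loop structure of the deleted methods; with c = 0 the rank-one "approximate Hessian" correction mentioned in the method comments drops out and the update reduces to plain damped mirror descent on H = log M.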