From c8eda449b2c6b39e9d57d1b5b2c39e43f2925892 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-99.eduroam.wireless.ubc.ca>
Date: Fri, 15 Jun 2018 18:53:54 -0700
Subject: [PATCH 01/15] add problems solved in doc

---
 examples/plot_stochastic.py | 135 ++++++++++++
 ot/__init__.py              |   2 +
 ot/stochastic.py            | 415 ++++++++++++++++++++++++++++++++++++
 test/test_stochastic.py     | 115 ++++++++++
 4 files changed, 667 insertions(+)
 create mode 100644 examples/plot_stochastic.py
 create mode 100644 ot/stochastic.py
 create mode 100644 test/test_stochastic.py

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
new file mode 100644
index 000000000..9071ddb4d
--- /dev/null
+++ b/examples/plot_stochastic.py
@@ -0,0 +1,135 @@
+"""
+==========================
+Stochastic examples
+==========================
+
+This example is designed to show how to use the stochatic optimization
+algorithms for descrete and semicontinous measures from the POT library.
+
+"""
+
+# Author: Kilian Fatras <kilian.fatras@gmail.com>
+#
+# License: MIT License
+
+import matplotlib.pylab as pl
+import numpy as np
+import ot
+
+
+#############################################################################
+# COMPUTE TRANSPORTATION MATRIX
+#############################################################################
+
+#############################################################################
+# DISCRETE CASE
+# Sample two discrete measures for the discrete case
+# ---------------------------------------------
+#
+# Define 2 discrete measures a and b, the points where are defined the source
+# and the target measures and finally the cost matrix c.
+
+n_source = 7
+n_target = 4
+reg = 1
+numItermax = 10000
+lr = 0.1
+
+a = ot.utils.unif(n_source)
+b = ot.utils.unif(n_target)
+
+rng = np.random.RandomState(0)
+X_source = rng.randn(n_source, 2)
+Y_target = rng.randn(n_target, 2)
+M = ot.dist(X_source, Y_target)
+
+#############################################################################
+#
+# Call the "SAG" method to find the transportation matrix in the discrete case
+# ---------------------------------------------
+#
+# Define the method "SAG", call ot.transportation_matrix_entropic and plot the
+# results.
+
+method = "SAG"
+sag_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
+                                                      numItermax, lr)
+print(sag_pi)
+
+#############################################################################
+# SEMICONTINOUS CASE
+# Sample one general measure a, one discrete measures b for the semicontinous
+# case
+# ---------------------------------------------
+#
+# Define one general measure a, one discrete measures b, the points where
+# are defined the source and the target measures and finally the cost matrix c.
+
+n_source = 7
+n_target = 4
+reg = 1
+numItermax = 500000
+lr = 1
+
+a = ot.utils.unif(n_source)
+b = ot.utils.unif(n_target)
+
+rng = np.random.RandomState(0)
+X_source = rng.randn(n_source, 2)
+Y_target = rng.randn(n_target, 2)
+M = ot.dist(X_source, Y_target)
+
+#############################################################################
+#
+# Call the "ASGD" method to find the transportation matrix in the semicontinous
+# case
+# ---------------------------------------------
+#
+# Define the method "ASGD", call ot.transportation_matrix_entropic and plot the
+# results.
+
+method = "ASGD"
+asgd_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
+                                                       numItermax, lr)
+print(asgd_pi)
+
+#############################################################################
+#
+# Compare the results with the Sinkhorn algorithm
+# ---------------------------------------------
+#
+# Call the Sinkhorn algorithm from POT
+
+sinkhorn_pi = ot.sinkhorn(a, b, M, 1)
+print(sinkhorn_pi)
+
+
+##############################################################################
+# PLOT TRANSPORTATION MATRIX
+##############################################################################
+
+##############################################################################
+# Plot SAG results
+# ----------------
+
+pl.figure(4, figsize=(5, 5))
+ot.plot.plot1D_mat(a, b, sag_pi, 'OT matrix SAG')
+pl.show()
+
+
+##############################################################################
+# Plot ASGD results
+# -----------------
+
+pl.figure(4, figsize=(5, 5))
+ot.plot.plot1D_mat(a, b, asgd_pi, 'OT matrix ASGD')
+pl.show()
+
+
+##############################################################################
+# Plot Sinkhorn results
+# ---------------------
+
+pl.figure(4, figsize=(5, 5))
+ot.plot.plot1D_mat(a, b, sinkhorn_pi, 'OT matrix Sinkhorn')
+pl.show()
diff --git a/ot/__init__.py b/ot/__init__.py
index 1500e594f..1dde39017 100644
--- a/ot/__init__.py
+++ b/ot/__init__.py
@@ -18,6 +18,8 @@
 from . import datasets
 from . import da
 from . import gromov
+from . import smooth
+from . import stochastic
 
 # OT functions
 from .lp import emd, emd2
diff --git a/ot/stochastic.py b/ot/stochastic.py
new file mode 100644
index 000000000..541f4568f
--- /dev/null
+++ b/ot/stochastic.py
@@ -0,0 +1,415 @@
+# Author: Kilian Fatras <kilian.fatras@gmail.com>
+#
+# License: MIT License
+
+import numpy as np
+
+
+def coordinate_gradient(b, M, reg, v, i):
+    '''
+    Compute the coordinate gradient update for regularized discrete
+        distributions for (i, :)
+
+    The function computes the gradient of the semi dual problem:
+    .. math::
+        \W_varepsilon(a, b) = \max_\v \sum_i (\sum_j v_j * b_j
+            - \reg log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i
+
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - v is a dual variable in R^J
+    - reg is the regularization term
+    - a and b are source and target weights (sum to 1)
+
+    The algorithm used for solving the problem is the ASGD & SAG algorithms
+    as proposed in [18]_ [alg.1 & alg.2]
+
+
+    Parameters
+    ----------
+
+    b : np.ndarray(nt,),
+        target measure
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float nu,
+        Regularization term > 0
+    v : np.ndarray(nt,),
+        optimization vector
+    i : number int,
+        picked number i
+
+    Returns
+    -------
+
+    coordinate gradient : np.ndarray(nt,)
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> lr = 1
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+                                                            method, numItermax,
+                                                            lr)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+                    Stochastic Optimization for Large-scale Optimal Transport,
+                     Advances in Neural Information Processing Systems (2016),
+                      arXiv preprint arxiv:1605.08527.
+
+    '''
+
+    r = M[i, :] - v
+    exp_v = np.exp(-r / reg) * b
+    khi = exp_v / (np.sum(exp_v))
+    return b - khi
+
+
+def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
+    '''
+    Compute the SAG algorithm to solve the regularized discrete measures
+        optimal transport max problem
+
+    The function solves the following optimization problem:
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        s.t. \gamma 1 = a
+             \gamma^T 1= b
+             \gamma \geq 0
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term
+        :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+    The algorithm used for solving the problem is the SAG algorithm
+    as proposed in [18]_ [alg.1]
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,),
+        source measure
+    b : np.ndarray(nt,),
+        target measure
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    numItermax : int number
+        number of iteration
+    lr : float number
+        learning rate
+
+    Returns
+    -------
+
+    v : np.ndarray(nt,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> lr = 1
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "SAG"
+    >>> sag_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+                                                            method, numItermax,
+                                                            lr)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+                    Stochastic Optimization for Large-scale Optimal Transport,
+                     Advances in Neural Information Processing Systems (2016),
+                      arXiv preprint arxiv:1605.08527.
+    '''
+
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    v = np.zeros(n_target)
+    stored_gradient = np.zeros((n_source, n_target))
+    sum_stored_gradient = np.zeros(n_target)
+    for _ in range(numItermax):
+        i = np.random.randint(n_source)
+        cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, v, i)
+        sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
+        stored_gradient[i] = cur_coord_grad
+        v += lr * (1. / n_source) * sum_stored_gradient
+    return v
+
+
+def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
+    '''
+    Compute the ASGD algorithm to solve the regularized semi contibous measures
+        optimal transport max problem
+
+        The function solves the following optimization problem:
+        .. math::
+            \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+            s.t. \gamma 1 = a
+                 \gamma^T 1= b
+                 \gamma \geq 0
+        where :
+        - M is the (ns,nt) metric cost matrix
+        - :math:`\Omega` is the entropic regularization term
+            :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+        - a and b are source and target weights (sum to 1)
+        The algorithm used for solving the problem is the ASGD algorithm
+        as proposed in [18]_ [alg.2]
+
+
+    Parameters
+    ----------
+
+    b : np.ndarray(nt,),
+        target measure
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    numItermax : int number
+        number of iteration
+    lr : float number
+        learning rate
+
+
+    Returns
+    -------
+
+    ave_v : np.ndarray(nt,)
+        optimization vector
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> lr = 1
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+                                                            method, numItermax,
+                                                            lr)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+                    Stochastic Optimization for Large-scale Optimal Transport,
+                     Advances in Neural Information Processing Systems (2016),
+                      arXiv preprint arxiv:1605.08527.
+    '''
+
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    cur_v = np.zeros(n_target)
+    ave_v = np.zeros(n_target)
+    for cur_iter in range(numItermax):
+        k = cur_iter + 1
+        i = np.random.randint(n_source)
+        cur_coord_grad = coordinate_gradient(b, M, reg, cur_v, i)
+        cur_v += (lr / np.sqrt(k)) * cur_coord_grad
+        ave_v = (1. / k) * cur_v + (1 - 1. / k) * ave_v
+    return ave_v
+
+
+def c_transform_entropic(b, M, reg, v):
+    '''
+    The goal is to recover u from the c-transform.
+
+    The function computes the c_transform of a dual variable from the other
+    dual variable:
+    .. math::
+        u = v^{c,reg} = -reg \sum_j exp((v - M)/reg) b_j
+
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - u, v are dual variables in R^IxR^J
+    - reg is the regularization term
+
+    It is used to recover an optimal u from optimal v solving the semi dual
+    problem, see Proposition 2.1 of [18]_
+
+
+    Parameters
+    ----------
+
+    b : np.ndarray(nt,)
+        target measure
+    M : np.ndarray(ns, nt)
+        cost matrix
+    reg : float
+        regularization term > 0
+    v : np.ndarray(nt,)
+        dual variable
+
+    Returns
+    -------
+
+    u : np.ndarray(ns,)
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> lr = 1
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+                                                            method, numItermax,
+                                                            lr)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+                    Stochastic Optimization for Large-scale Optimal Transport,
+                     Advances in Neural Information Processing Systems (2016),
+                      arXiv preprint arxiv:1605.08527.
+    '''
+
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    u = np.zeros(n_source)
+    for i in range(n_source):
+        r = M[i, :] - v
+        exp_v = np.exp(-r / reg) * b
+        u[i] = - reg * np.log(np.sum(exp_v))
+    return u
+
+
+def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
+                                   lr=0.1):
+    '''
+    Compute the transportation matrix to solve the regularized discrete
+        measures optimal transport max problem
+
+    The function solves the following optimization problem:
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        s.t. \gamma 1 = a
+             \gamma^T 1= b
+             \gamma \geq 0
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term
+        :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+    The algorithm used for solving the problem is the SAG or ASGD algorithms
+    as proposed in [18]_
+
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,),
+        source measure
+    b : np.ndarray(nt,),
+        target measure
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    methode : str,
+        used method (SAG or ASGD)
+    numItermax : int number
+        number of iteration
+    lr : float number
+        learning rate
+    n_source : int number
+        size of the source measure
+    n_target : int number
+        size of the target measure
+
+    Returns
+    -------
+
+    pi : np.ndarray(ns, nt)
+        transportation matrix
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 300000
+    >>> lr = 1
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+                                                            method, numItermax,
+                                                            lr)
+    >>> print(asgd_pi)
+
+    References
+    ----------
+
+    [Genevay et al., 2016] :
+                    Stochastic Optimization for Large-scale Optimal Transport,
+                     Advances in Neural Information Processing Systems (2016),
+                      arXiv preprint arxiv:1605.08527.
+    '''
+    n_source = 7
+    n_target = 4
+    if method.lower() == "sag":
+        opt_v = sag_entropic_transport(a, b, M, reg, numItermax, lr)
+    elif method.lower() == "asgd":
+        opt_v = averaged_sgd_entropic_transport(b, M, reg, numItermax, lr)
+    else:
+        print("Please, select your method between SAG and ASGD")
+        return None
+
+    opt_u = c_transform_entropic(b, M, reg, opt_v)
+    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg)
+          * a[:, None] * b[None, :])
+    return pi
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
new file mode 100644
index 000000000..829f781ae
--- /dev/null
+++ b/test/test_stochastic.py
@@ -0,0 +1,115 @@
+"""
+==========================
+Stochastic test
+==========================
+
+This example is designed to test the stochatic optimization algorithms module
+for descrete and semicontinous measures from the POT library.
+
+"""
+
+# Author: Kilian Fatras <kilian.fatras@gmail.com>
+#
+# License: MIT License
+
+import numpy as np
+import ot
+
+#############################################################################
+#
+# TEST SAG algorithm
+# ---------------------------------------------
+# 2 identical discrete measures u defined on the same space with a
+# regularization term, a learning rate and a number of iteration
+
+
+def test_stochastic_sag():
+    # test sag
+    n = 15
+    reg = 1
+    numItermax = 300000
+    rng = np.random.RandomState(0)
+
+    x = rng.randn(n, 2)
+    u = ot.utils.unif(n)
+
+    M = ot.dist(x, x)
+
+    G = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "sag",
+                                                     numItermax=numItermax)
+
+    # check constratints
+    np.testing.assert_allclose(
+        u, G.sum(1), atol=1e-04)  # cf convergence sag
+    np.testing.assert_allclose(
+        u, G.sum(0), atol=1e-04)  # cf convergence sag
+
+
+#############################################################################
+#
+# TEST ASGD algorithm
+# ---------------------------------------------
+# 2 identical discrete measures u defined on the same space with a
+# regularization term, a learning rate and a number of iteration
+
+def test_stochastic_asgd():
+    # test asgd
+    n = 15
+    reg = 1
+    numItermax = 300000
+    lr = 1
+    rng = np.random.RandomState(0)
+
+    x = rng.randn(n, 2)
+    u = ot.utils.unif(n)
+
+    M = ot.dist(x, x)
+
+    G = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "asgd",
+                                                     numItermax=numItermax,
+                                                     lr=lr)
+
+    # check constratints
+    np.testing.assert_allclose(
+        u, G.sum(1), atol=1e-03)  # cf convergence asgd
+    np.testing.assert_allclose(
+        u, G.sum(0), atol=1e-03)  # cf convergence asgd
+
+
+#############################################################################
+#
+# TEST Convergence SAG and ASGD toward Sinkhorn's solution
+# --------------------------------------------------------
+# 2 identical discrete measures u defined on the same space with a
+# regularization term, a learning rate and a number of iteration
+
+
+def test_sag_asgd_sinkhorn():
+    # test all algorithms
+    n = 15
+    reg = 1
+    nb_iter = 300000
+    lr = 1
+    rng = np.random.RandomState(0)
+
+    x = rng.randn(n, 2)
+    u = ot.utils.unif(n)
+    zero = np.zeros(n)
+    M = ot.dist(x, x)
+
+    G_asgd = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "asgd",
+                                                          numItermax=nb_iter,
+                                                          lr=1)
+    G_sag = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "sag",
+                                                         numItermax=nb_iter)
+    G_sinkhorn = ot.sinkhorn(u, u, M, reg)
+
+    # check constratints
+    np.testing.assert_allclose(
+        zero, (G_sag - G_sinkhorn).sum(1), atol=1e-03)  # cf convergence sag
+    np.testing.assert_allclose(
+        zero, (G_sag - G_sinkhorn).sum(0), atol=1e-03)  # cf convergence sag
+    np.testing.assert_allclose(
+        zero, (G_asgd - G_sinkhorn).sum(1), atol=1e-03)  # cf convergence asgd
+    np.testing.assert_allclose(
+        zero, (G_asgd - G_sinkhorn).sum(0), atol=1e-03)  # cf convergence asgd

From e2d06ef4e901e0c1e9119cdbd79b9c453c1af452 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-99.eduroam.wireless.ubc.ca>
Date: Fri, 15 Jun 2018 19:04:38 -0700
Subject: [PATCH 02/15] add problems solved in doc

---
 examples/plot_stochastic.py |  4 ++--
 ot/stochastic.py            | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 9071ddb4d..0afceeb14 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -53,7 +53,7 @@
 
 method = "SAG"
 sag_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                      numItermax, lr)
+                                                   numItermax, lr)
 print(sag_pi)
 
 #############################################################################
@@ -90,7 +90,7 @@
 
 method = "ASGD"
 asgd_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                       numItermax, lr)
+                                                    numItermax, lr)
 print(asgd_pi)
 
 #############################################################################
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 541f4568f..91231507e 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -75,8 +75,8 @@ def coordinate_gradient(b, M, reg, v, i):
     '''
 
     r = M[i, :] - v
-    exp_v = np.exp(-r / reg) * b
-    khi = exp_v / (np.sum(exp_v))
+    exp_v = np.exp(-r/reg) * b
+    khi = exp_v/(np.sum(exp_v))
     return b - khi
 
 
@@ -161,7 +161,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
         cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, v, i)
         sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
         stored_gradient[i] = cur_coord_grad
-        v += lr * (1. / n_source) * sum_stored_gradient
+        v += lr * (1./n_source) * sum_stored_gradient
     return v
 
 
@@ -243,8 +243,8 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
         k = cur_iter + 1
         i = np.random.randint(n_source)
         cur_coord_grad = coordinate_gradient(b, M, reg, cur_v, i)
-        cur_v += (lr / np.sqrt(k)) * cur_coord_grad
-        ave_v = (1. / k) * cur_v + (1 - 1. / k) * ave_v
+        cur_v += (lr/np.sqrt(k)) * cur_coord_grad
+        ave_v = (1./k) * cur_v + (1 - 1./k) * ave_v
     return ave_v
 
 
@@ -317,7 +317,7 @@ def c_transform_entropic(b, M, reg, v):
     u = np.zeros(n_source)
     for i in range(n_source):
         r = M[i, :] - v
-        exp_v = np.exp(-r / reg) * b
+        exp_v = np.exp(-r/reg) * b
         u[i] = - reg * np.log(np.sum(exp_v))
     return u
 
@@ -410,6 +410,6 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
         return None
 
     opt_u = c_transform_entropic(b, M, reg, opt_v)
-    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg)
+    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :])/reg)
           * a[:, None] * b[None, :])
     return pi

From 18f9242b9575e7551d5e30b66cfbaaac85e0022a Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-99.eduroam.wireless.ubc.ca>
Date: Fri, 15 Jun 2018 19:15:48 -0700
Subject: [PATCH 03/15] PEP8

---
 examples/plot_stochastic.py |  4 ++--
 ot/stochastic.py            | 14 +++++++-------
 test/test_stochastic.py     |  1 -
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 0afceeb14..9071ddb4d 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -53,7 +53,7 @@
 
 method = "SAG"
 sag_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                   numItermax, lr)
+                                                      numItermax, lr)
 print(sag_pi)
 
 #############################################################################
@@ -90,7 +90,7 @@
 
 method = "ASGD"
 asgd_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                    numItermax, lr)
+                                                       numItermax, lr)
 print(asgd_pi)
 
 #############################################################################
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 91231507e..541f4568f 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -75,8 +75,8 @@ def coordinate_gradient(b, M, reg, v, i):
     '''
 
     r = M[i, :] - v
-    exp_v = np.exp(-r/reg) * b
-    khi = exp_v/(np.sum(exp_v))
+    exp_v = np.exp(-r / reg) * b
+    khi = exp_v / (np.sum(exp_v))
     return b - khi
 
 
@@ -161,7 +161,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
         cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, v, i)
         sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
         stored_gradient[i] = cur_coord_grad
-        v += lr * (1./n_source) * sum_stored_gradient
+        v += lr * (1. / n_source) * sum_stored_gradient
     return v
 
 
@@ -243,8 +243,8 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
         k = cur_iter + 1
         i = np.random.randint(n_source)
         cur_coord_grad = coordinate_gradient(b, M, reg, cur_v, i)
-        cur_v += (lr/np.sqrt(k)) * cur_coord_grad
-        ave_v = (1./k) * cur_v + (1 - 1./k) * ave_v
+        cur_v += (lr / np.sqrt(k)) * cur_coord_grad
+        ave_v = (1. / k) * cur_v + (1 - 1. / k) * ave_v
     return ave_v
 
 
@@ -317,7 +317,7 @@ def c_transform_entropic(b, M, reg, v):
     u = np.zeros(n_source)
     for i in range(n_source):
         r = M[i, :] - v
-        exp_v = np.exp(-r/reg) * b
+        exp_v = np.exp(-r / reg) * b
         u[i] = - reg * np.log(np.sum(exp_v))
     return u
 
@@ -410,6 +410,6 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
         return None
 
     opt_u = c_transform_entropic(b, M, reg, opt_v)
-    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :])/reg)
+    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg)
           * a[:, None] * b[None, :])
     return pi
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
index 829f781ae..5fe5ea966 100644
--- a/test/test_stochastic.py
+++ b/test/test_stochastic.py
@@ -89,7 +89,6 @@ def test_sag_asgd_sinkhorn():
     n = 15
     reg = 1
     nb_iter = 300000
-    lr = 1
     rng = np.random.RandomState(0)
 
     x = rng.randn(n, 2)

From 3617b42ec05fe88eb616b32b2c3827d63022e287 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-99.eduroam.wireless.ubc.ca>
Date: Fri, 15 Jun 2018 19:18:20 -0700
Subject: [PATCH 04/15] PEP8

---
 ot/stochastic.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/ot/stochastic.py b/ot/stochastic.py
index 541f4568f..f34154bdb 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -313,7 +313,6 @@ def c_transform_entropic(b, M, reg, v):
     '''
 
     n_source = np.shape(M)[0]
-    n_target = np.shape(M)[1]
     u = np.zeros(n_source)
     for i in range(n_source):
         r = M[i, :] - v
@@ -399,8 +398,6 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
                      Advances in Neural Information Processing Systems (2016),
                       arXiv preprint arxiv:1605.08527.
     '''
-    n_source = 7
-    n_target = 4
     if method.lower() == "sag":
         opt_v = sag_entropic_transport(a, b, M, reg, numItermax, lr)
     elif method.lower() == "asgd":

From 055417ee06917ff8bac5d07b2d2a17d80e5da4b6 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-99.eduroam.wireless.ubc.ca>
Date: Fri, 15 Jun 2018 19:25:29 -0700
Subject: [PATCH 05/15] pep8

---
 ot/stochastic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ot/stochastic.py b/ot/stochastic.py
index f34154bdb..9912223b6 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -407,6 +407,6 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
         return None
 
     opt_u = c_transform_entropic(b, M, reg, opt_v)
-    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg)
-          * a[:, None] * b[None, :])
+    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg) *
+          a[:, None] * b[None, :])
     return pi

From 74cfe5ac77c3e964a85ef90c11d8ebffa16ddcfe Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-210.eduroam.wireless.ubc.ca>
Date: Mon, 18 Jun 2018 17:56:28 -0700
Subject: [PATCH 06/15] add sgd

---
 README.md                   |   8 +
 examples/plot_stochastic.py |  95 +++++++-
 ot/stochastic.py            | 468 ++++++++++++++++++++++++++++++++----
 test/test_stochastic.py     | 104 +++++++-
 4 files changed, 608 insertions(+), 67 deletions(-)

diff --git a/README.md b/README.md
index 466c09ca8..8e8dcd4aa 100644
--- a/README.md
+++ b/README.md
@@ -22,6 +22,7 @@ It provides the following solvers:
 * Linear OT [14] and Joint OT matrix and mapping estimation [8].
 * Wasserstein Discriminant Analysis [11] (requires autograd + pymanopt).
 * Gromov-Wasserstein distances and barycenters ([13] and regularized [12])
+* Stochastic Optimization for Large-scale Optimal Transport (semi-dual problem [18] and dual problem [19])
 
 Some demonstrations (both in Python and Jupyter Notebook format) are available in the examples folder.
 
@@ -149,6 +150,7 @@ The contributors to this library are:
 * [Stanislas Chambon](https://slasnista.github.io/)
 * [Antoine Rolet](https://arolet.github.io/)
 * Erwan Vautier (Gromov-Wasserstein)
+* [Kilian Fatras](https://kilianfatras.github.io/) (Stochastic optimization)
 
 This toolbox benefit a lot from open source research and we would like to thank the following persons for providing some code (in various languages):
 
@@ -213,3 +215,9 @@ You can also post bug reports and feature requests in Github issues. Make sure t
 [15] Peyré, G., & Cuturi, M. (2018). [Computational Optimal Transport](https://arxiv.org/pdf/1803.00567.pdf) .
 
 [16] Agueh, M., & Carlier, G. (2011). [Barycenters in the Wasserstein space](https://hal.archives-ouvertes.fr/hal-00637399/document). SIAM Journal on Mathematical Analysis, 43(2), 904-924.
+
+[17] Blondel, M., Seguy, V., & Rolet, A. (2018). [Smooth and Sparse Optimal Transport](https://arxiv.org/pdf/1710.06276.pdf). Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AISTATS).
+
+[18] Genevay, A., Cuturi, M., Peyré, G. & Bach, F. (2016) [Stochastic Optimization for Large-scale Optimal Transport](arXiv preprint arxiv:1605.08527). Advances in Neural Information Processing Systems (2016).
+
+[19] Seguy, V., Bhushan Damodaran, B., Flamary, R., Courty, N., Rolet, A.& Blondel, M. [Large-scale Optimal Transport and Mapping Estimation](https://arxiv.org/pdf/1711.02283.pdf). International Conference on Learning Representation (2018)
diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 9071ddb4d..3fc1955c7 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -18,9 +18,9 @@
 
 
 #############################################################################
-# COMPUTE TRANSPORTATION MATRIX
+# COMPUTE TRANSPORTATION MATRIX FOR SEMI-DUAL PROBLEM
 #############################################################################
-
+print("------------SEMI-DUAL PROBLEM------------")
 #############################################################################
 # DISCRETE CASE
 # Sample two discrete measures for the discrete case
@@ -48,12 +48,12 @@
 # Call the "SAG" method to find the transportation matrix in the discrete case
 # ---------------------------------------------
 #
-# Define the method "SAG", call ot.transportation_matrix_entropic and plot the
+# Define the method "SAG", call ot.solve_semi_dual_entropic and plot the
 # results.
 
 method = "SAG"
-sag_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                      numItermax, lr)
+sag_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
+                                             numItermax, lr)
 print(sag_pi)
 
 #############################################################################
@@ -68,8 +68,9 @@
 n_source = 7
 n_target = 4
 reg = 1
-numItermax = 500000
+numItermax = 100000
 lr = 1
+log = True
 
 a = ot.utils.unif(n_source)
 b = ot.utils.unif(n_target)
@@ -85,12 +86,13 @@
 # case
 # ---------------------------------------------
 #
-# Define the method "ASGD", call ot.transportation_matrix_entropic and plot the
+# Define the method "ASGD", call ot.solve_semi_dual_entropic and plot the
 # results.
 
 method = "ASGD"
-asgd_pi = ot.stochastic.transportation_matrix_entropic(a, b, M, reg, method,
-                                                       numItermax, lr)
+asgd_pi, log = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
+                                                   numItermax, lr, log)
+print(log['alpha'], log['beta'])
 print(asgd_pi)
 
 #############################################################################
@@ -100,7 +102,7 @@
 #
 # Call the Sinkhorn algorithm from POT
 
-sinkhorn_pi = ot.sinkhorn(a, b, M, 1)
+sinkhorn_pi = ot.sinkhorn(a, b, M, reg)
 print(sinkhorn_pi)
 
 
@@ -113,7 +115,7 @@
 # ----------------
 
 pl.figure(4, figsize=(5, 5))
-ot.plot.plot1D_mat(a, b, sag_pi, 'OT matrix SAG')
+ot.plot.plot1D_mat(a, b, sag_pi, 'semi-dual : OT matrix SAG')
 pl.show()
 
 
@@ -122,7 +124,76 @@
 # -----------------
 
 pl.figure(4, figsize=(5, 5))
-ot.plot.plot1D_mat(a, b, asgd_pi, 'OT matrix ASGD')
+ot.plot.plot1D_mat(a, b, asgd_pi, 'semi-dual : OT matrix ASGD')
+pl.show()
+
+
+##############################################################################
+# Plot Sinkhorn results
+# ---------------------
+
+pl.figure(4, figsize=(5, 5))
+ot.plot.plot1D_mat(a, b, sinkhorn_pi, 'OT matrix Sinkhorn')
+pl.show()
+
+#############################################################################
+# COMPUTE TRANSPORTATION MATRIX FOR DUAL PROBLEM
+#############################################################################
+print("------------DUAL PROBLEM------------")
+#############################################################################
+# SEMICONTINOUS CASE
+# Sample one general measure a, one discrete measures b for the semicontinous
+# case
+# ---------------------------------------------
+#
+# Define one general measure a, one discrete measures b, the points where
+# are defined the source and the target measures and finally the cost matrix c.
+
+n_source = 7
+n_target = 4
+reg = 1
+numItermax = 100000
+lr = 0.1
+batch_size = 3
+log = True
+
+a = ot.utils.unif(n_source)
+b = ot.utils.unif(n_target)
+
+rng = np.random.RandomState(0)
+X_source = rng.randn(n_source, 2)
+Y_target = rng.randn(n_target, 2)
+M = ot.dist(X_source, Y_target)
+
+#############################################################################
+#
+# Call the "SGD" dual method to find the transportation matrix in the semicontinous
+# case
+# ---------------------------------------------
+#
+# Call ot.solve_dual_entropic and plot the results.
+
+sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size,
+                                                  numItermax, lr, log)
+print(log['alpha'], log['beta'])
+print(sgd_dual_pi)
+
+#############################################################################
+#
+# Compare the results with the Sinkhorn algorithm
+# ---------------------------------------------
+#
+# Call the Sinkhorn algorithm from POT
+
+sinkhorn_pi = ot.sinkhorn(a, b, M, reg)
+print(sinkhorn_pi)
+
+##############################################################################
+# Plot  SGD results
+# -----------------
+
+pl.figure(4, figsize=(5, 5))
+ot.plot.plot1D_mat(a, b, sgd_dual_pi, 'dual : OT matrix SGD')
 pl.show()
 
 
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 9912223b6..31c99bebf 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -5,7 +5,10 @@
 import numpy as np
 
 
-def coordinate_gradient(b, M, reg, v, i):
+##############################################################################
+# Optimization toolbox for SEMI - DUAL problem
+##############################################################################
+def coordinate_gradient(b, M, reg, beta, i):
     '''
     Compute the coordinate gradient update for regularized discrete
         distributions for (i, :)
@@ -59,7 +62,7 @@ def coordinate_gradient(b, M, reg, v, i):
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
-    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
                                                             method, numItermax,
                                                             lr)
     >>> print(asgd_pi)
@@ -74,9 +77,9 @@ def coordinate_gradient(b, M, reg, v, i):
 
     '''
 
-    r = M[i, :] - v
-    exp_v = np.exp(-r / reg) * b
-    khi = exp_v / (np.sum(exp_v))
+    r = M[i, :] - beta
+    exp_beta = np.exp(-r/reg) * b
+    khi = exp_beta/(np.sum(exp_beta))
     return b - khi
 
 
@@ -137,7 +140,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "SAG"
-    >>> sag_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+    >>> sag_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
                                                             method, numItermax,
                                                             lr)
     >>> print(asgd_pi)
@@ -153,16 +156,16 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
 
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
-    v = np.zeros(n_target)
+    cur_beta = np.zeros(n_target)
     stored_gradient = np.zeros((n_source, n_target))
     sum_stored_gradient = np.zeros(n_target)
     for _ in range(numItermax):
         i = np.random.randint(n_source)
-        cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, v, i)
+        cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, cur_beta, i)
         sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
         stored_gradient[i] = cur_coord_grad
-        v += lr * (1. / n_source) * sum_stored_gradient
-    return v
+        cur_beta += lr * (1./n_source) * sum_stored_gradient
+    return cur_beta
 
 
 def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
@@ -170,19 +173,19 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
     Compute the ASGD algorithm to solve the regularized semi contibous measures
         optimal transport max problem
 
-        The function solves the following optimization problem:
-        .. math::
-            \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
-            s.t. \gamma 1 = a
-                 \gamma^T 1= b
-                 \gamma \geq 0
-        where :
-        - M is the (ns,nt) metric cost matrix
-        - :math:`\Omega` is the entropic regularization term
-            :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
-        - a and b are source and target weights (sum to 1)
-        The algorithm used for solving the problem is the ASGD algorithm
-        as proposed in [18]_ [alg.2]
+    The function solves the following optimization problem:
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        s.t. \gamma 1 = a
+             \gamma^T 1= b
+             \gamma \geq 0
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term
+        :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+    The algorithm used for solving the problem is the ASGD algorithm
+    as proposed in [18]_ [alg.2]
 
 
     Parameters
@@ -221,7 +224,7 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
-    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
                                                             method, numItermax,
                                                             lr)
     >>> print(asgd_pi)
@@ -237,18 +240,18 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
 
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
-    cur_v = np.zeros(n_target)
-    ave_v = np.zeros(n_target)
+    cur_beta = np.zeros(n_target)
+    ave_beta = np.zeros(n_target)
     for cur_iter in range(numItermax):
         k = cur_iter + 1
         i = np.random.randint(n_source)
-        cur_coord_grad = coordinate_gradient(b, M, reg, cur_v, i)
-        cur_v += (lr / np.sqrt(k)) * cur_coord_grad
-        ave_v = (1. / k) * cur_v + (1 - 1. / k) * ave_v
-    return ave_v
+        cur_coord_grad = coordinate_gradient(b, M, reg, cur_beta, i)
+        cur_beta += (lr/np.sqrt(k)) * cur_coord_grad
+        ave_beta = (1./k) * cur_beta + (1 - 1./k) * ave_beta
+    return ave_beta
 
 
-def c_transform_entropic(b, M, reg, v):
+def c_transform_entropic(b, M, reg, beta):
     '''
     The goal is to recover u from the c-transform.
 
@@ -298,7 +301,7 @@ def c_transform_entropic(b, M, reg, v):
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
-    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
                                                             method, numItermax,
                                                             lr)
     >>> print(asgd_pi)
@@ -313,16 +316,18 @@ def c_transform_entropic(b, M, reg, v):
     '''
 
     n_source = np.shape(M)[0]
-    u = np.zeros(n_source)
+    n_target = np.shape(M)[1]
+    alpha = np.zeros(n_source)
     for i in range(n_source):
-        r = M[i, :] - v
-        exp_v = np.exp(-r / reg) * b
-        u[i] = - reg * np.log(np.sum(exp_v))
-    return u
+        r = M[i, :] - beta
+        min_r = np.min(r)
+        exp_beta = np.exp(-(r - min_r)/reg) * b
+        alpha[i] = min_r - reg * np.log(np.sum(exp_beta))
+    return alpha
 
 
-def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
-                                   lr=0.1):
+def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
+                                log=False):
     '''
     Compute the transportation matrix to solve the regularized discrete
         measures optimal transport max problem
@@ -363,12 +368,16 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
         size of the source measure
     n_target : int number
         size of the target measure
+    log : bool, optional
+        record log if True
 
     Returns
     -------
 
     pi : np.ndarray(ns, nt)
         transportation matrix
+    log : dict
+        log dictionary return only if log==True in parameters
 
     Examples
     --------
@@ -385,7 +394,7 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
-    >>> asgd_pi = stochastic.transportation_matrix_entropic(a, b, M, reg,
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
                                                             method, numItermax,
                                                             lr)
     >>> print(asgd_pi)
@@ -398,15 +407,384 @@ def transportation_matrix_entropic(a, b, M, reg, method, numItermax=10000,
                      Advances in Neural Information Processing Systems (2016),
                       arXiv preprint arxiv:1605.08527.
     '''
+    n_source = 7
+    n_target = 4
     if method.lower() == "sag":
-        opt_v = sag_entropic_transport(a, b, M, reg, numItermax, lr)
+        opt_beta = sag_entropic_transport(a, b, M, reg, numItermax, lr)
     elif method.lower() == "asgd":
-        opt_v = averaged_sgd_entropic_transport(b, M, reg, numItermax, lr)
+        opt_beta = averaged_sgd_entropic_transport(b, M, reg, numItermax, lr)
     else:
         print("Please, select your method between SAG and ASGD")
         return None
 
-    opt_u = c_transform_entropic(b, M, reg, opt_v)
-    pi = (np.exp((opt_u[:, None] + opt_v[None, :] - M[:, :]) / reg) *
+    opt_alpha = c_transform_entropic(b, M, reg, opt_beta)
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :])/reg) *
           a[:, None] * b[None, :])
-    return pi
+
+    if log:
+        log = {}
+        log['alpha'] = opt_alpha
+        log['beta'] = opt_beta
+        return pi, log
+    else:
+        return pi
+
+
+##############################################################################
+# Optimization toolbox for DUAL problem
+##############################################################################
+
+
+def grad_dF_dalpha(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
+    '''
+    Computes the partial gradient of F_\W_varepsilon
+
+    Compute the partial gradient of the dual problem:
+    ..Math:
+        \forall i in batch_alpha,
+            grad_alpha_i = 1 * batch_size -
+                    sum_{j in batch_beta} exp((alpha_i + beta_j - M_{i,j})/reg)
+
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - alpha, beta are dual variables in R^ixR^J
+    - reg is the regularization term
+    - batch_alpha and batch_beta are list of index
+
+    The algorithm used for solving the dual problem is the SGD algorithm
+    as proposed in [19]_ [alg.1]
+
+    Parameters
+    ----------
+
+    reg : float number,
+        Regularization term > 0
+    M : np.ndarray(ns, nt),
+        cost matrix
+    alpha : np.ndarray(ns,)
+        dual variable
+    beta : np.ndarray(nt,)
+        dual variable
+    batch_size : int number
+        size of the batch
+    batch_alpha : np.ndarray(bs,)
+        batch of index of alpha
+    batch_beta : np.ndarray(bs,)
+        batch of index of beta
+
+    Returns
+    -------
+
+    grad : np.ndarray(ns,)
+        partial grad F in alpha
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+                                                            batch_size,
+                                                            numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+                    International Conference on Learning Representation (2018),
+                      arXiv preprint arxiv:1711.02283.
+    '''
+
+    grad_alpha = np.zeros(batch_size)
+    grad_alpha[:] = batch_size
+    for j in batch_beta:
+        grad_alpha -= np.exp((alpha[batch_alpha] + beta[j]
+                              - M[batch_alpha, j])/reg)
+    return grad_alpha
+
+
+def grad_dF_dbeta(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
+    '''
+    Computes the partial gradient of F_\W_varepsilon
+
+    Compute the partial gradient of the dual problem:
+    ..Math:
+        \forall j in batch_beta,
+            grad_beta_j = 1 * batch_size -
+                sum_{i in batch_alpha} exp((alpha_i + beta_j - M_{i,j})/reg)
+
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - alpha, beta are dual variables in R^ixR^J
+    - reg is the regularization term
+    - batch_alpha and batch_beta are list of index
+
+    The algorithm used for solving the dual problem is the SGD algorithm
+    as proposed in [19]_ [alg.1]
+
+    Parameters
+    ----------
+
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    alpha : np.ndarray(ns,)
+        dual variable
+    beta : np.ndarray(nt,)
+        dual variable
+    batch_size : int number
+        size of the batch
+    batch_alpha : np.ndarray(bs,)
+        batch of index of alpha
+    batch_beta : np.ndarray(bs,)
+        batch of index of beta
+
+    Returns
+    -------
+
+    grad : np.ndarray(ns,)
+        partial grad F in beta
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+                                                            batch_size,
+                                                            numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+                    International Conference on Learning Representation (2018),
+                      arXiv preprint arxiv:1711.02283.
+
+    '''
+
+    grad_beta = np.zeros(batch_size)
+    grad_beta[:] = batch_size
+    for i in batch_alpha:
+        grad_beta -= np.exp((alpha[i] +
+                             beta[batch_beta] - M[i, batch_beta])/reg)
+    return grad_beta
+
+
+def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
+                                alternate=True):
+    '''
+    Compute the sgd algorithm to solve the regularized discrete measures
+        optimal transport dual problem
+
+    The function solves the following optimization problem:
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        s.t. \gamma 1 = a
+             \gamma^T 1= b
+             \gamma \geq 0
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term
+        :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    Parameters
+    ----------
+
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    batch_size : int number
+        size of the batch
+    numItermax : int number
+        number of iteration
+    lr : float number
+        learning rate
+    alternate : bool, optional
+        alternating algorithm
+
+    Returns
+    -------
+
+    alpha : np.ndarray(ns,)
+        dual variable
+    beta : np.ndarray(nt,)
+        dual variable
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+                                                            batch_size,
+                                                            numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+                    International Conference on Learning Representation (2018),
+                      arXiv preprint arxiv:1711.02283.
+    '''
+
+    n_source = np.shape(M)[0]
+    n_target = np.shape(M)[1]
+    cur_alpha = np.random.randn(n_source)
+    cur_beta = np.random.randn(n_target)
+    if alternate:
+        for cur_iter in range(numItermax):
+            k = np.sqrt(cur_iter + 1)
+            batch_alpha = np.random.choice(n_source, batch_size, replace=False)
+            batch_beta = np.random.choice(n_target, batch_size, replace=False)
+            grad_F_alpha = grad_dF_dalpha(M, reg, cur_alpha, cur_beta,
+                                          batch_size, batch_alpha, batch_beta)
+            cur_alpha[batch_alpha] += (lr/k) * grad_F_alpha
+            grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
+                                        batch_size, batch_alpha, batch_beta)
+            cur_beta[batch_beta] += (lr/k) * grad_F_beta
+
+    else:
+        for cur_iter in range(numItermax):
+            k = np.sqrt(cur_iter + 1)
+            batch_alpha = np.random.choice(n_source, batch_size, replace=False)
+            batch_beta = np.random.choice(n_target, batch_size, replace=False)
+            grad_F_alpha = grad_dF_dalpha(M, reg, cur_alpha, cur_beta,
+                                          batch_size, batch_alpha, batch_beta)
+            grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
+                                        batch_size, batch_alpha, batch_beta)
+            cur_alpha[batch_alpha] += (lr/k) * grad_F_alpha
+            cur_beta[batch_beta] += (lr/k) * grad_F_beta
+
+    return cur_alpha, cur_beta
+
+
+def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
+                        log=False):
+    '''
+    Compute the transportation matrix to solve the regularized discrete measures
+        optimal transport dual problem
+
+    The function solves the following optimization problem:
+    .. math::
+        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
+        s.t. \gamma 1 = a
+             \gamma^T 1= b
+             \gamma \geq 0
+    where :
+    - M is the (ns,nt) metric cost matrix
+    - :math:`\Omega` is the entropic regularization term
+        :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
+    - a and b are source and target weights (sum to 1)
+
+    Parameters
+    ----------
+
+    a : np.ndarray(ns,),
+        source measure
+    b : np.ndarray(nt,),
+        target measure
+    M : np.ndarray(ns, nt),
+        cost matrix
+    reg : float number,
+        Regularization term > 0
+    batch_size : int number
+        size of the batch
+    numItermax : int number
+        number of iteration
+    lr : float number
+        learning rate
+    log : bool, optional
+        record log if True
+
+    Returns
+    -------
+
+    pi : np.ndarray(ns, nt)
+        transportation matrix
+    log : dict
+        log dictionary return only if log==True in parameters
+
+    Examples
+    --------
+
+    >>> n_source = 7
+    >>> n_target = 4
+    >>> reg = 1
+    >>> numItermax = 20000
+    >>> lr = 0.1
+    >>> batch_size = 3
+    >>> log = True
+    >>> a = ot.utils.unif(n_source)
+    >>> b = ot.utils.unif(n_target)
+    >>> rng = np.random.RandomState(0)
+    >>> X_source = rng.randn(n_source, 2)
+    >>> Y_target = rng.randn(n_target, 2)
+    >>> M = ot.dist(X_source, Y_target)
+    >>> sgd_dual_pi, log = stochastic.solve_dual_entropic(a, b, M, reg,
+                                                            batch_size,
+                                                            numItermax, lr, log)
+    >>> print(log['alpha'], log['beta'])
+    >>> print(sgd_dual_pi)
+
+    References
+    ----------
+
+    [Seguy et al., 2018] :
+                    International Conference on Learning Representation (2018),
+                      arXiv preprint arxiv:1711.02283.
+    '''
+
+    opt_alpha, opt_beta = sgd_entropic_regularization(M, reg, batch_size,
+                                                      numItermax, lr)
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :])/reg) *
+          a[:, None] * b[None, :])
+    if log:
+        log = {}
+        log['alpha'] = opt_alpha
+        log['beta'] = opt_beta
+        return pi, log
+    else:
+        return pi
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
index 5fe5ea966..bc0cebbc4 100644
--- a/test/test_stochastic.py
+++ b/test/test_stochastic.py
@@ -15,6 +15,11 @@
 import numpy as np
 import ot
 
+
+#############################################################################
+# COMPUTE TEST FOR SEMI-DUAL PROBLEM
+#############################################################################
+
 #############################################################################
 #
 # TEST SAG algorithm
@@ -35,8 +40,8 @@ def test_stochastic_sag():
 
     M = ot.dist(x, x)
 
-    G = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "sag",
-                                                     numItermax=numItermax)
+    G = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "sag",
+                                               numItermax=numItermax)
 
     # check constratints
     np.testing.assert_allclose(
@@ -65,9 +70,9 @@ def test_stochastic_asgd():
 
     M = ot.dist(x, x)
 
-    G = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "asgd",
-                                                     numItermax=numItermax,
-                                                     lr=lr)
+    G = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "asgd",
+                                               numItermax=numItermax,
+                                               lr=lr)
 
     # check constratints
     np.testing.assert_allclose(
@@ -89,6 +94,7 @@ def test_sag_asgd_sinkhorn():
     n = 15
     reg = 1
     nb_iter = 300000
+    lr = 1
     rng = np.random.RandomState(0)
 
     x = rng.randn(n, 2)
@@ -96,11 +102,10 @@ def test_sag_asgd_sinkhorn():
     zero = np.zeros(n)
     M = ot.dist(x, x)
 
-    G_asgd = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "asgd",
-                                                          numItermax=nb_iter,
-                                                          lr=1)
-    G_sag = ot.stochastic.transportation_matrix_entropic(u, u, M, reg, "sag",
-                                                         numItermax=nb_iter)
+    G_asgd = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "asgd",
+                                                    numItermax=nb_iter, lr=lr)
+    G_sag = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "sag",
+                                                   numItermax=nb_iter)
     G_sinkhorn = ot.sinkhorn(u, u, M, reg)
 
     # check constratints
@@ -112,3 +117,82 @@ def test_sag_asgd_sinkhorn():
         zero, (G_asgd - G_sinkhorn).sum(1), atol=1e-03)  # cf convergence asgd
     np.testing.assert_allclose(
         zero, (G_asgd - G_sinkhorn).sum(0), atol=1e-03)  # cf convergence asgd
+    np.testing.assert_allclose(
+        G_sag, G_sinkhorn, atol=1e-03)  # cf convergence sag
+    np.testing.assert_allclose(
+        G_asgd, G_sinkhorn, atol=1e-03)  # cf convergence asgd
+
+
+#############################################################################
+# COMPUTE TEST FOR DUAL PROBLEM
+#############################################################################
+
+#############################################################################
+#
+# TEST SGD algorithm
+# ---------------------------------------------
+# 2 identical discrete measures u defined on the same space with a
+# regularization term, a batch_size and a number of iteration
+
+def test_stochastic_dual_sgd():
+    # test sgd
+    print("SGD")
+    n = 10
+    reg = 1
+    numItermax = 300000
+    batch_size = 8
+    rng = np.random.RandomState(0)
+
+    x = rng.randn(n, 2)
+    u = ot.utils.unif(n)
+
+    M = ot.dist(x, x)
+
+    G = ot.stochastic.solve_dual_entropic(u, u, M, reg, batch_size,
+                                       numItermax=numItermax)
+
+    # check constratints
+    np.testing.assert_allclose(
+        u, G.sum(1), atol=1e-02)  # cf convergence sgd
+    np.testing.assert_allclose(
+        u, G.sum(0), atol=1e-02)  # cf convergence sgd
+
+#############################################################################
+#
+# TEST Convergence SGD toward Sinkhorn's solution
+# --------------------------------------------------------
+# 2 identical discrete measures u defined on the same space with a
+# regularization term, a batch_size and a number of iteration
+
+
+def test_dual_sgd_sinkhorn():
+    # test all dual algorithms
+    print("SGD vs Sinkhorn")
+    n = 10
+    reg = 1
+    nb_iter = 300000
+    batch_size = 8
+    rng = np.random.RandomState(0)
+
+    x = rng.randn(n, 2)
+    u = ot.utils.unif(n)
+    zero = np.zeros(n)
+    M = ot.dist(x, x)
+
+    G_sgd = ot.stochastic.solve_dual_entropic(u, u, M, reg, batch_size,
+                                           numItermax=nb_iter)
+
+    G_sinkhorn = ot.sinkhorn(u, u, M, reg)
+
+    # check constratints
+    np.testing.assert_allclose(
+        zero, (G_sgd - G_sinkhorn).sum(1), atol=1e-02)  # cf convergence sgd
+    np.testing.assert_allclose(
+        zero, (G_sgd - G_sinkhorn).sum(0), atol=1e-02)  # cf convergence sgd
+    np.testing.assert_allclose(
+        G_sgd, G_sinkhorn, atol=1e-02)  # cf convergence sgd
+
+
+if __name__ == '__main__':
+    test_stochastic_dual_sgd()
+    test_dual_sgd_sinkhorn()

From e068b58ba4234792d96287afd34c3cddef544dd4 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-210.eduroam.wireless.ubc.ca>
Date: Mon, 18 Jun 2018 18:05:48 -0700
Subject: [PATCH 07/15] pep8

---
 examples/plot_stochastic.py |  6 +++---
 ot/stochastic.py            | 33 +++++++++++++++------------------
 test/test_stochastic.py     |  4 ++--
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 3fc1955c7..e1394737e 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -53,7 +53,7 @@
 
 method = "SAG"
 sag_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
-                                             numItermax, lr)
+                                                numItermax, lr)
 print(sag_pi)
 
 #############################################################################
@@ -91,7 +91,7 @@
 
 method = "ASGD"
 asgd_pi, log = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
-                                                   numItermax, lr, log)
+                                                      numItermax, lr, log)
 print(log['alpha'], log['beta'])
 print(asgd_pi)
 
@@ -174,7 +174,7 @@
 # Call ot.solve_dual_entropic and plot the results.
 
 sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size,
-                                                  numItermax, lr, log)
+                                                     numItermax, lr, log)
 print(log['alpha'], log['beta'])
 print(sgd_dual_pi)
 
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 31c99bebf..98537d9d2 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -78,8 +78,8 @@ def coordinate_gradient(b, M, reg, beta, i):
     '''
 
     r = M[i, :] - beta
-    exp_beta = np.exp(-r/reg) * b
-    khi = exp_beta/(np.sum(exp_beta))
+    exp_beta = np.exp(-r / reg) * b
+    khi = exp_beta / (np.sum(exp_beta))
     return b - khi
 
 
@@ -164,7 +164,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
         cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, cur_beta, i)
         sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
         stored_gradient[i] = cur_coord_grad
-        cur_beta += lr * (1./n_source) * sum_stored_gradient
+        cur_beta += lr * (1. / n_source) * sum_stored_gradient
     return cur_beta
 
 
@@ -246,8 +246,8 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
         k = cur_iter + 1
         i = np.random.randint(n_source)
         cur_coord_grad = coordinate_gradient(b, M, reg, cur_beta, i)
-        cur_beta += (lr/np.sqrt(k)) * cur_coord_grad
-        ave_beta = (1./k) * cur_beta + (1 - 1./k) * ave_beta
+        cur_beta += (lr / np.sqrt(k)) * cur_coord_grad
+        ave_beta = (1. / k) * cur_beta + (1 - 1. / k) * ave_beta
     return ave_beta
 
 
@@ -316,12 +316,11 @@ def c_transform_entropic(b, M, reg, beta):
     '''
 
     n_source = np.shape(M)[0]
-    n_target = np.shape(M)[1]
     alpha = np.zeros(n_source)
     for i in range(n_source):
         r = M[i, :] - beta
         min_r = np.min(r)
-        exp_beta = np.exp(-(r - min_r)/reg) * b
+        exp_beta = np.exp(-(r - min_r) / reg) * b
         alpha[i] = min_r - reg * np.log(np.sum(exp_beta))
     return alpha
 
@@ -407,8 +406,6 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
                      Advances in Neural Information Processing Systems (2016),
                       arXiv preprint arxiv:1605.08527.
     '''
-    n_source = 7
-    n_target = 4
     if method.lower() == "sag":
         opt_beta = sag_entropic_transport(a, b, M, reg, numItermax, lr)
     elif method.lower() == "asgd":
@@ -418,7 +415,7 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
         return None
 
     opt_alpha = c_transform_entropic(b, M, reg, opt_beta)
-    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :])/reg) *
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
           a[:, None] * b[None, :])
 
     if log:
@@ -511,8 +508,8 @@ def grad_dF_dalpha(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
     grad_alpha = np.zeros(batch_size)
     grad_alpha[:] = batch_size
     for j in batch_beta:
-        grad_alpha -= np.exp((alpha[batch_alpha] + beta[j]
-                              - M[batch_alpha, j])/reg)
+        grad_alpha -= np.exp((alpha[batch_alpha] + beta[j] -
+                              M[batch_alpha, j]) / reg)
     return grad_alpha
 
 
@@ -594,7 +591,7 @@ def grad_dF_dbeta(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
     grad_beta[:] = batch_size
     for i in batch_alpha:
         grad_beta -= np.exp((alpha[i] +
-                             beta[batch_beta] - M[i, batch_beta])/reg)
+                             beta[batch_beta] - M[i, batch_beta]) / reg)
     return grad_beta
 
 
@@ -681,10 +678,10 @@ def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
             batch_beta = np.random.choice(n_target, batch_size, replace=False)
             grad_F_alpha = grad_dF_dalpha(M, reg, cur_alpha, cur_beta,
                                           batch_size, batch_alpha, batch_beta)
-            cur_alpha[batch_alpha] += (lr/k) * grad_F_alpha
+            cur_alpha[batch_alpha] += (lr / k) * grad_F_alpha
             grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
                                         batch_size, batch_alpha, batch_beta)
-            cur_beta[batch_beta] += (lr/k) * grad_F_beta
+            cur_beta[batch_beta] += (lr / k) * grad_F_beta
 
     else:
         for cur_iter in range(numItermax):
@@ -695,8 +692,8 @@ def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
                                           batch_size, batch_alpha, batch_beta)
             grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
                                         batch_size, batch_alpha, batch_beta)
-            cur_alpha[batch_alpha] += (lr/k) * grad_F_alpha
-            cur_beta[batch_beta] += (lr/k) * grad_F_beta
+            cur_alpha[batch_alpha] += (lr / k) * grad_F_alpha
+            cur_beta[batch_beta] += (lr / k) * grad_F_beta
 
     return cur_alpha, cur_beta
 
@@ -779,7 +776,7 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
 
     opt_alpha, opt_beta = sgd_entropic_regularization(M, reg, batch_size,
                                                       numItermax, lr)
-    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :])/reg) *
+    pi = (np.exp((opt_alpha[:, None] + opt_beta[None, :] - M[:, :]) / reg) *
           a[:, None] * b[None, :])
     if log:
         log = {}
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
index bc0cebbc4..5824df124 100644
--- a/test/test_stochastic.py
+++ b/test/test_stochastic.py
@@ -149,7 +149,7 @@ def test_stochastic_dual_sgd():
     M = ot.dist(x, x)
 
     G = ot.stochastic.solve_dual_entropic(u, u, M, reg, batch_size,
-                                       numItermax=numItermax)
+                                          numItermax=numItermax)
 
     # check constratints
     np.testing.assert_allclose(
@@ -180,7 +180,7 @@ def test_dual_sgd_sinkhorn():
     M = ot.dist(x, x)
 
     G_sgd = ot.stochastic.solve_dual_entropic(u, u, M, reg, batch_size,
-                                           numItermax=nb_iter)
+                                              numItermax=nb_iter)
 
     G_sinkhorn = ot.sinkhorn(u, u, M, reg)
 

From 52134e92e427c825b4aa0a24ac5e000232ebd707 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-52-205.eduroam.wireless.ubc.ca>
Date: Tue, 19 Jun 2018 11:31:22 -0700
Subject: [PATCH 08/15] change grad function names

---
 docs/source/all.rst |  8 ++++++++
 ot/stochastic.py    | 37 ++++++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/docs/source/all.rst b/docs/source/all.rst
index c84d968ed..29d47c24b 100644
--- a/docs/source/all.rst
+++ b/docs/source/all.rst
@@ -33,6 +33,14 @@ ot.optim
 .. automodule:: ot.optim
    :members:
 
+
+ ot.stochastic
+ --------
+
+ .. automodule:: ot.stochastic
+    :members:
+
+
 ot.da
 --------
 
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 98537d9d2..d3830d6bf 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -6,9 +6,9 @@
 
 
 ##############################################################################
-# Optimization toolbox for SEMI - DUAL problem
+# Optimization toolbox for SEMI - DUAL problems
 ##############################################################################
-def coordinate_gradient(b, M, reg, beta, i):
+def coordinate_grad_semi_dual(b, M, reg, beta, i):
     '''
     Compute the coordinate gradient update for regularized discrete
         distributions for (i, :)
@@ -161,7 +161,8 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
     sum_stored_gradient = np.zeros(n_target)
     for _ in range(numItermax):
         i = np.random.randint(n_source)
-        cur_coord_grad = a[i] * coordinate_gradient(b, M, reg, cur_beta, i)
+        cur_coord_grad = a[i] * coordinate_grad_semi_dual(b, M, reg,
+                                                          cur_beta, i)
         sum_stored_gradient += (cur_coord_grad - stored_gradient[i])
         stored_gradient[i] = cur_coord_grad
         cur_beta += lr * (1. / n_source) * sum_stored_gradient
@@ -245,7 +246,7 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
     for cur_iter in range(numItermax):
         k = cur_iter + 1
         i = np.random.randint(n_source)
-        cur_coord_grad = coordinate_gradient(b, M, reg, cur_beta, i)
+        cur_coord_grad = coordinate_grad_semi_dual(b, M, reg, cur_beta, i)
         cur_beta += (lr / np.sqrt(k)) * cur_coord_grad
         ave_beta = (1. / k) * cur_beta + (1 - 1. / k) * ave_beta
     return ave_beta
@@ -428,11 +429,12 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
 
 
 ##############################################################################
-# Optimization toolbox for DUAL problem
+# Optimization toolbox for DUAL problems
 ##############################################################################
 
 
-def grad_dF_dalpha(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
+def batch_grad_dual_alpha(M, reg, alpha, beta, batch_size, batch_alpha,
+                          batch_beta):
     '''
     Computes the partial gradient of F_\W_varepsilon
 
@@ -513,7 +515,8 @@ def grad_dF_dalpha(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
     return grad_alpha
 
 
-def grad_dF_dbeta(M, reg, alpha, beta, batch_size, batch_alpha, batch_beta):
+def batch_grad_dual_beta(M, reg, alpha, beta, batch_size, batch_alpha,
+                         batch_beta):
     '''
     Computes the partial gradient of F_\W_varepsilon
 
@@ -676,11 +679,13 @@ def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
             k = np.sqrt(cur_iter + 1)
             batch_alpha = np.random.choice(n_source, batch_size, replace=False)
             batch_beta = np.random.choice(n_target, batch_size, replace=False)
-            grad_F_alpha = grad_dF_dalpha(M, reg, cur_alpha, cur_beta,
-                                          batch_size, batch_alpha, batch_beta)
+            grad_F_alpha = batch_grad_dual_alpha(M, reg, cur_alpha, cur_beta,
+                                                 batch_size, batch_alpha,
+                                                 batch_beta)
             cur_alpha[batch_alpha] += (lr / k) * grad_F_alpha
-            grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
-                                        batch_size, batch_alpha, batch_beta)
+            grad_F_beta = batch_grad_dual_beta(M, reg, cur_alpha, cur_beta,
+                                               batch_size, batch_alpha,
+                                               batch_beta)
             cur_beta[batch_beta] += (lr / k) * grad_F_beta
 
     else:
@@ -688,10 +693,12 @@ def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
             k = np.sqrt(cur_iter + 1)
             batch_alpha = np.random.choice(n_source, batch_size, replace=False)
             batch_beta = np.random.choice(n_target, batch_size, replace=False)
-            grad_F_alpha = grad_dF_dalpha(M, reg, cur_alpha, cur_beta,
-                                          batch_size, batch_alpha, batch_beta)
-            grad_F_beta = grad_dF_dbeta(M, reg, cur_alpha, cur_beta,
-                                        batch_size, batch_alpha, batch_beta)
+            grad_F_alpha = batch_grad_dual_alpha(M, reg, cur_alpha, cur_beta,
+                                                 batch_size, batch_alpha,
+                                                 batch_beta)
+            grad_F_beta = batch_grad_dual_beta(M, reg, cur_alpha, cur_beta,
+                                               batch_size, batch_alpha,
+                                               batch_beta)
             cur_alpha[batch_alpha] += (lr / k) * grad_F_alpha
             cur_beta[batch_beta] += (lr / k) * grad_F_beta
 

From 7073e417bed151976c62fe20d1ba69abd30e7758 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-52-205.eduroam.wireless.ubc.ca>
Date: Tue, 19 Jun 2018 11:52:10 -0700
Subject: [PATCH 09/15] remove if in test and cleaned code

---
 examples/plot_stochastic.py |  1 +
 ot/stochastic.py            |  2 ++
 test/test_stochastic.py     | 10 +++-------
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index e1394737e..09b95d041 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -136,6 +136,7 @@
 ot.plot.plot1D_mat(a, b, sinkhorn_pi, 'OT matrix Sinkhorn')
 pl.show()
 
+
 #############################################################################
 # COMPUTE TRANSPORTATION MATRIX FOR DUAL PROBLEM
 #############################################################################
diff --git a/ot/stochastic.py b/ot/stochastic.py
index d3830d6bf..57b96b7a1 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -8,6 +8,8 @@
 ##############################################################################
 # Optimization toolbox for SEMI - DUAL problems
 ##############################################################################
+
+
 def coordinate_grad_semi_dual(b, M, reg, beta, i):
     '''
     Compute the coordinate gradient update for regularized discrete
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
index 5824df124..3cb51ff14 100644
--- a/test/test_stochastic.py
+++ b/test/test_stochastic.py
@@ -57,6 +57,7 @@ def test_stochastic_sag():
 # 2 identical discrete measures u defined on the same space with a
 # regularization term, a learning rate and a number of iteration
 
+
 def test_stochastic_asgd():
     # test asgd
     n = 15
@@ -134,9 +135,9 @@ def test_sag_asgd_sinkhorn():
 # 2 identical discrete measures u defined on the same space with a
 # regularization term, a batch_size and a number of iteration
 
+
 def test_stochastic_dual_sgd():
     # test sgd
-    print("SGD")
     n = 10
     reg = 1
     numItermax = 300000
@@ -157,6 +158,7 @@ def test_stochastic_dual_sgd():
     np.testing.assert_allclose(
         u, G.sum(0), atol=1e-02)  # cf convergence sgd
 
+
 #############################################################################
 #
 # TEST Convergence SGD toward Sinkhorn's solution
@@ -167,7 +169,6 @@ def test_stochastic_dual_sgd():
 
 def test_dual_sgd_sinkhorn():
     # test all dual algorithms
-    print("SGD vs Sinkhorn")
     n = 10
     reg = 1
     nb_iter = 300000
@@ -191,8 +192,3 @@ def test_dual_sgd_sinkhorn():
         zero, (G_sgd - G_sinkhorn).sum(0), atol=1e-02)  # cf convergence sgd
     np.testing.assert_allclose(
         G_sgd, G_sinkhorn, atol=1e-02)  # cf convergence sgd
-
-
-if __name__ == '__main__':
-    test_stochastic_dual_sgd()
-    test_dual_sgd_sinkhorn()

From 6777ffd5c8457faac4467e58ba9edbcf2f86961b Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-92.eduroam.wireless.ubc.ca>
Date: Thu, 21 Jun 2018 17:18:26 -0700
Subject: [PATCH 10/15] gave better step size ASGD & SAG

---
 examples/plot_stochastic.py | 10 ++++------
 ot/stochastic.py            | 37 ++++++++++++++++---------------------
 test/test_stochastic.py     |  7 ++-----
 3 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 09b95d041..6274b4c73 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -32,8 +32,7 @@
 n_source = 7
 n_target = 4
 reg = 1
-numItermax = 10000
-lr = 0.1
+numItermax = 1000
 
 a = ot.utils.unif(n_source)
 b = ot.utils.unif(n_target)
@@ -53,7 +52,7 @@
 
 method = "SAG"
 sag_pi = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
-                                                numItermax, lr)
+                                                numItermax)
 print(sag_pi)
 
 #############################################################################
@@ -68,8 +67,7 @@
 n_source = 7
 n_target = 4
 reg = 1
-numItermax = 100000
-lr = 1
+numItermax = 1000
 log = True
 
 a = ot.utils.unif(n_source)
@@ -91,7 +89,7 @@
 
 method = "ASGD"
 asgd_pi, log = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
-                                                      numItermax, lr, log)
+                                                      numItermax, log)
 print(log['alpha'], log['beta'])
 print(asgd_pi)
 
diff --git a/ot/stochastic.py b/ot/stochastic.py
index 57b96b7a1..ab88cd054 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -56,7 +56,6 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
     >>> n_target = 4
     >>> reg = 1
     >>> numItermax = 300000
-    >>> lr = 1
     >>> a = ot.utils.unif(n_source)
     >>> b = ot.utils.unif(n_target)
     >>> rng = np.random.RandomState(0)
@@ -65,8 +64,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
     >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
-                                                            method, numItermax,
-                                                            lr)
+                                                      method, numItermax)
     >>> print(asgd_pi)
 
     References
@@ -85,7 +83,7 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
     return b - khi
 
 
-def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
+def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
     '''
     Compute the SAG algorithm to solve the regularized discrete measures
         optimal transport max problem
@@ -134,17 +132,15 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
     >>> n_target = 4
     >>> reg = 1
     >>> numItermax = 300000
-    >>> lr = 1
     >>> a = ot.utils.unif(n_source)
     >>> b = ot.utils.unif(n_target)
     >>> rng = np.random.RandomState(0)
     >>> X_source = rng.randn(n_source, 2)
     >>> Y_target = rng.randn(n_target, 2)
     >>> M = ot.dist(X_source, Y_target)
-    >>> method = "SAG"
-    >>> sag_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
-                                                            method, numItermax,
-                                                            lr)
+    >>> method = "ASGD"
+    >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
+                                                      method, numItermax)
     >>> print(asgd_pi)
 
     References
@@ -156,6 +152,8 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
                       arXiv preprint arxiv:1605.08527.
     '''
 
+    if lr is None:
+        lr = 1. / max(a)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)
@@ -171,7 +169,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=0.1):
     return cur_beta
 
 
-def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
+def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
     '''
     Compute the ASGD algorithm to solve the regularized semi contibous measures
         optimal transport max problem
@@ -219,7 +217,6 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
     >>> n_target = 4
     >>> reg = 1
     >>> numItermax = 300000
-    >>> lr = 1
     >>> a = ot.utils.unif(n_source)
     >>> b = ot.utils.unif(n_target)
     >>> rng = np.random.RandomState(0)
@@ -228,8 +225,7 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
     >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
-                                                            method, numItermax,
-                                                            lr)
+                                                      method, numItermax)
     >>> print(asgd_pi)
 
     References
@@ -241,6 +237,8 @@ def averaged_sgd_entropic_transport(b, M, reg, numItermax=300000, lr=1):
                       arXiv preprint arxiv:1605.08527.
     '''
 
+    if lr is None:
+        lr = 1. / max(a)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)
@@ -296,7 +294,6 @@ def c_transform_entropic(b, M, reg, beta):
     >>> n_target = 4
     >>> reg = 1
     >>> numItermax = 300000
-    >>> lr = 1
     >>> a = ot.utils.unif(n_source)
     >>> b = ot.utils.unif(n_target)
     >>> rng = np.random.RandomState(0)
@@ -305,8 +302,7 @@ def c_transform_entropic(b, M, reg, beta):
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
     >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
-                                                            method, numItermax,
-                                                            lr)
+                                                      method, numItermax)
     >>> print(asgd_pi)
 
     References
@@ -328,7 +324,7 @@ def c_transform_entropic(b, M, reg, beta):
     return alpha
 
 
-def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
+def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
                                 log=False):
     '''
     Compute the transportation matrix to solve the regularized discrete
@@ -388,7 +384,6 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
     >>> n_target = 4
     >>> reg = 1
     >>> numItermax = 300000
-    >>> lr = 1
     >>> a = ot.utils.unif(n_source)
     >>> b = ot.utils.unif(n_target)
     >>> rng = np.random.RandomState(0)
@@ -397,8 +392,7 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
     >>> M = ot.dist(X_source, Y_target)
     >>> method = "ASGD"
     >>> asgd_pi = stochastic.solve_semi_dual_entropic(a, b, M, reg,
-                                                            method, numItermax,
-                                                            lr)
+                                                      method, numItermax)
     >>> print(asgd_pi)
 
     References
@@ -409,10 +403,11 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=0.1,
                      Advances in Neural Information Processing Systems (2016),
                       arXiv preprint arxiv:1605.08527.
     '''
+
     if method.lower() == "sag":
         opt_beta = sag_entropic_transport(a, b, M, reg, numItermax, lr)
     elif method.lower() == "asgd":
-        opt_beta = averaged_sgd_entropic_transport(b, M, reg, numItermax, lr)
+        opt_beta = averaged_sgd_entropic_transport(a, b, M, reg, numItermax, lr)
     else:
         print("Please, select your method between SAG and ASGD")
         return None
diff --git a/test/test_stochastic.py b/test/test_stochastic.py
index 3cb51ff14..f315c8810 100644
--- a/test/test_stochastic.py
+++ b/test/test_stochastic.py
@@ -63,7 +63,6 @@ def test_stochastic_asgd():
     n = 15
     reg = 1
     numItermax = 300000
-    lr = 1
     rng = np.random.RandomState(0)
 
     x = rng.randn(n, 2)
@@ -72,8 +71,7 @@ def test_stochastic_asgd():
     M = ot.dist(x, x)
 
     G = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "asgd",
-                                               numItermax=numItermax,
-                                               lr=lr)
+                                               numItermax=numItermax)
 
     # check constratints
     np.testing.assert_allclose(
@@ -95,7 +93,6 @@ def test_sag_asgd_sinkhorn():
     n = 15
     reg = 1
     nb_iter = 300000
-    lr = 1
     rng = np.random.RandomState(0)
 
     x = rng.randn(n, 2)
@@ -104,7 +101,7 @@ def test_sag_asgd_sinkhorn():
     M = ot.dist(x, x)
 
     G_asgd = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "asgd",
-                                                    numItermax=nb_iter, lr=lr)
+                                                    numItermax=nb_iter)
     G_sag = ot.stochastic.solve_semi_dual_entropic(u, u, M, reg, "sag",
                                                    numItermax=nb_iter)
     G_sinkhorn = ot.sinkhorn(u, u, M, reg)

From af5f726f6adb52457ca6730ffb85f2ab486b2ada Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-92.eduroam.wireless.ubc.ca>
Date: Thu, 21 Jun 2018 17:41:25 -0700
Subject: [PATCH 11/15] fixed bug

---
 ot/stochastic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ot/stochastic.py b/ot/stochastic.py
index ab88cd054..374d1a5b9 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -153,7 +153,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
     '''
 
     if lr is None:
-        lr = 1. / max(a)
+        lr = 1. / max(a/reg)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)
@@ -238,7 +238,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
     '''
 
     if lr is None:
-        lr = 1. / max(a)
+        lr = 1. / max(a/reg)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)

From e8cf3cc343c934b7c49d303186cdf226204813b3 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-92.eduroam.wireless.ubc.ca>
Date: Thu, 21 Jun 2018 17:52:05 -0700
Subject: [PATCH 12/15] pep8

---
 ot/stochastic.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ot/stochastic.py b/ot/stochastic.py
index 374d1a5b9..f4d4c7d07 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -153,7 +153,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
     '''
 
     if lr is None:
-        lr = 1. / max(a/reg)
+        lr = 1. / max(a / reg)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)
@@ -238,7 +238,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
     '''
 
     if lr is None:
-        lr = 1. / max(a/reg)
+        lr = 1. / max(a / reg)
     n_source = np.shape(M)[0]
     n_target = np.shape(M)[1]
     cur_beta = np.zeros(n_target)

From 9fecd51c583704e02d3faaef722f0dee52f42359 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-52-229.eduroam.wireless.ubc.ca>
Date: Mon, 25 Jun 2018 11:03:44 -0700
Subject: [PATCH 13/15] fix math operator and log bugs

---
 examples/plot_stochastic.py | 18 ++++++++++--------
 ot/stochastic.py            | 15 ++++++++++++---
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/examples/plot_stochastic.py b/examples/plot_stochastic.py
index 6274b4c73..b9375d423 100644
--- a/examples/plot_stochastic.py
+++ b/examples/plot_stochastic.py
@@ -15,6 +15,7 @@
 import matplotlib.pylab as pl
 import numpy as np
 import ot
+import ot.plot
 
 
 #############################################################################
@@ -88,9 +89,9 @@
 # results.
 
 method = "ASGD"
-asgd_pi, log = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
-                                                      numItermax, log)
-print(log['alpha'], log['beta'])
+asgd_pi, log_asgd = ot.stochastic.solve_semi_dual_entropic(a, b, M, reg, method,
+                                                           numItermax, log=log)
+print(log_asgd['alpha'], log_asgd['beta'])
 print(asgd_pi)
 
 #############################################################################
@@ -166,15 +167,16 @@
 
 #############################################################################
 #
-# Call the "SGD" dual method to find the transportation matrix in the semicontinous
-# case
+# Call the "SGD" dual method to find the transportation matrix in the
+# semicontinous case
 # ---------------------------------------------
 #
 # Call ot.solve_dual_entropic and plot the results.
 
-sgd_dual_pi, log = ot.stochastic.solve_dual_entropic(a, b, M, reg, batch_size,
-                                                     numItermax, lr, log)
-print(log['alpha'], log['beta'])
+sgd_dual_pi, log_sgd = ot.stochastic.solve_dual_entropic(a, b, M, reg,
+                                                         batch_size, numItermax,
+                                                         lr, log=log)
+print(log_sgd['alpha'], log_sgd['beta'])
 print(sgd_dual_pi)
 
 #############################################################################
diff --git a/ot/stochastic.py b/ot/stochastic.py
index f4d4c7d07..5e8206ee6 100644
--- a/ot/stochastic.py
+++ b/ot/stochastic.py
@@ -16,8 +16,9 @@ def coordinate_grad_semi_dual(b, M, reg, beta, i):
         distributions for (i, :)
 
     The function computes the gradient of the semi dual problem:
+
     .. math::
-        \W_varepsilon(a, b) = \max_\v \sum_i (\sum_j v_j * b_j
+        \W_\varepsilon(a, b) = \max_\v \sum_i (\sum_j v_j * b_j
             - \reg log(\sum_j exp((v_j - M_{i,j})/reg) * b_j)) * a_i
 
     where :
@@ -89,6 +90,7 @@ def sag_entropic_transport(a, b, M, reg, numItermax=10000, lr=None):
         optimal transport max problem
 
     The function solves the following optimization problem:
+
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
         s.t. \gamma 1 = a
@@ -175,6 +177,7 @@ def averaged_sgd_entropic_transport(a, b, M, reg, numItermax=300000, lr=None):
         optimal transport max problem
 
     The function solves the following optimization problem:
+
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
         s.t. \gamma 1 = a
@@ -258,6 +261,7 @@ def c_transform_entropic(b, M, reg, beta):
 
     The function computes the c_transform of a dual variable from the other
     dual variable:
+
     .. math::
         u = v^{c,reg} = -reg \sum_j exp((v - M)/reg) b_j
 
@@ -331,6 +335,7 @@ def solve_semi_dual_entropic(a, b, M, reg, method, numItermax=10000, lr=None,
         measures optimal transport max problem
 
     The function solves the following optimization problem:
+
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
         s.t. \gamma 1 = a
@@ -436,7 +441,8 @@ def batch_grad_dual_alpha(M, reg, alpha, beta, batch_size, batch_alpha,
     Computes the partial gradient of F_\W_varepsilon
 
     Compute the partial gradient of the dual problem:
-    ..Math:
+
+    ..math:
         \forall i in batch_alpha,
             grad_alpha_i = 1 * batch_size -
                     sum_{j in batch_beta} exp((alpha_i + beta_j - M_{i,j})/reg)
@@ -518,7 +524,8 @@ def batch_grad_dual_beta(M, reg, alpha, beta, batch_size, batch_alpha,
     Computes the partial gradient of F_\W_varepsilon
 
     Compute the partial gradient of the dual problem:
-    ..Math:
+
+    ..math:
         \forall j in batch_beta,
             grad_beta_j = 1 * batch_size -
                 sum_{i in batch_alpha} exp((alpha_i + beta_j - M_{i,j})/reg)
@@ -602,6 +609,7 @@ def sgd_entropic_regularization(M, reg, batch_size, numItermax, lr,
         optimal transport dual problem
 
     The function solves the following optimization problem:
+
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
         s.t. \gamma 1 = a
@@ -709,6 +717,7 @@ def solve_dual_entropic(a, b, M, reg, batch_size, numItermax=10000, lr=1,
         optimal transport dual problem
 
     The function solves the following optimization problem:
+
     .. math::
         \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)
         s.t. \gamma 1 = a

From 968ad58ecae5f26d5f3d6cfc410ccc93de96f2c0 Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-98.eduroam.wireless.ubc.ca>
Date: Tue, 26 Jun 2018 11:00:51 -0700
Subject: [PATCH 14/15] add stochastic to docs

---
 docs/source/all.rst | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/docs/source/all.rst b/docs/source/all.rst
index 29d47c24b..019409891 100644
--- a/docs/source/all.rst
+++ b/docs/source/all.rst
@@ -20,6 +20,11 @@ ot.bregman
 .. automodule:: ot.bregman
    :members:
 
+ot.smooth
+-----
+.. automodule:: ot.smooth
+   :members:
+
 ot.gromov
 ----------
 
@@ -33,14 +38,6 @@ ot.optim
 .. automodule:: ot.optim
    :members:
 
-
- ot.stochastic
- --------
-
- .. automodule:: ot.stochastic
-    :members:
-
-
 ot.da
 --------
 
@@ -71,3 +68,9 @@ ot.plot
 
 .. automodule:: ot.plot
    :members:
+
+ ot.stochastic
+ -------
+
+ .. automodule:: ot.stochastic
+    :members:

From 208ff4627ba28aa3f35328c5928324019c23ddac Mon Sep 17 00:00:00 2001
From: Kilian Fatras <kilianfatras@dhcp-206-12-53-98.eduroam.wireless.ubc.ca>
Date: Tue, 26 Jun 2018 11:09:41 -0700
Subject: [PATCH 15/15] fix stochastic to doc

---
 docs/source/all.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/all.rst b/docs/source/all.rst
index 019409891..94da2ed21 100644
--- a/docs/source/all.rst
+++ b/docs/source/all.rst
@@ -69,8 +69,8 @@ ot.plot
 .. automodule:: ot.plot
    :members:
 
- ot.stochastic
- -------
+ot.stochastic
+-------------
 
- .. automodule:: ot.stochastic
-    :members:
+.. automodule:: ot.stochastic
+   :members: