
Commit

pehf committed Apr 23, 2021
2 parents 9273d0f + 65d7967 commit 5bd79c6
Showing 5 changed files with 290 additions and 177 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -14,6 +14,7 @@ examples/data/cifar*
data/plenoptic-test-files
data/ssim_images
data/ssim_analysis.mat
data/msssim_images
data/cat7*
data/elep*
docs/_build
295 changes: 149 additions & 146 deletions examples/09_Original_MAD.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion plenoptic/metric/__init__.py
@@ -1,4 +1,4 @@
from .perceptual_distance import ssim, nlpd, nspd, ssim_map
from .perceptual_distance import ssim, ms_ssim, nlpd, nspd, ssim_map
from .model_metric import model_metric
from .naive import mse
from .classes import NLP
131 changes: 110 additions & 21 deletions plenoptic/metric/perceptual_distance.py
@@ -90,21 +90,16 @@ def _ssim_parts(img1, img2, dynamic_range):
C1 = (0.01 * dynamic_range) ** 2
C2 = (0.03 * dynamic_range) ** 2

v1 = 2.0 * sigma12 + C2
v2 = sigma1_sq + sigma2_sq + C2

# SSIM consists of a luminance component, a contrast component, and a
# structure component. This is the contrast component, which is used to
# compute MS-SSIM.
contrast_map = v1 / v2
# SSIM is the product of a luminance component, a contrast component, and a
# structure component. The contrast-structure component has to be separated
# when computing MS-SSIM.
luminance_map = (2 * mu1_mu2 + C1) / (mu1_sq + mu2_sq + C1)
contrast_structure_map = (2.0 * sigma12 + C2) / (sigma1_sq + sigma2_sq + C2)
map_ssim = luminance_map * contrast_structure_map

# the weight used for stability
weight = torch.log(torch.matmul((1+(sigma1_sq/C2)), (1+(sigma2_sq/C2))))

ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
ssim_map = ((2 * mu1_mu2 + C1) / (mu1_sq + mu2_sq + C1)) * contrast_map
return ssim_map, contrast_map, weight
weight = torch.log((1 + sigma1_sq/C2) * (1 + sigma2_sq/C2))
return map_ssim, contrast_structure_map, weight
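For readers following the refactor in this hunk: the returned SSIM map is now the elementwise product of a luminance map and a contrast-structure map, and the contrast-structure term is exactly what MS-SSIM reuses at the finer scales. Here is a minimal, self-contained sketch of that decomposition; it is illustrative only, uses a uniform window instead of the library's Gaussian window, and the helper name is hypothetical.

```python
import torch
import torch.nn.functional as F

def toy_ssim_components(img1, img2, dynamic_range=1, window_size=11):
    """Illustrative split of SSIM into luminance and contrast-structure maps.

    Assumes 4d, single-channel tensors (batch, channel, height, width) and a
    uniform averaging window, unlike the Gaussian window used by plenoptic.
    """
    C1 = (0.01 * dynamic_range) ** 2
    C2 = (0.03 * dynamic_range) ** 2
    win = torch.ones(1, 1, window_size, window_size) / window_size ** 2
    mu1, mu2 = F.conv2d(img1, win), F.conv2d(img2, win)
    sigma1_sq = F.conv2d(img1 * img1, win) - mu1 ** 2
    sigma2_sq = F.conv2d(img2 * img2, win) - mu2 ** 2
    sigma12 = F.conv2d(img1 * img2, win) - mu1 * mu2
    luminance_map = (2 * mu1 * mu2 + C1) / (mu1 ** 2 + mu2 ** 2 + C1)
    contrast_structure_map = (2 * sigma12 + C2) / (sigma1_sq + sigma2_sq + C2)
    return luminance_map * contrast_structure_map, contrast_structure_map
```

MS-SSIM keeps only the second return value at all but the coarsest scale, which is why the two maps are now computed separately.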


def ssim(img1, img2, weighted=False, dynamic_range=1):
@@ -113,7 +108,7 @@ def ssim(img1, img2, weighted=False, dynamic_range=1):
As described in [1]_, the structural similarity index (SSIM) is a
perceptual distance metric, giving the distance between two images. SSIM is
based on three comparison measurements between the two images: luminance,
contrast, and structure. All of these are computed in windows across the
contrast, and structure. All of these are computed convolutionally across the
images. See the references for more information.
This implementation follows the original implementation, as found at [2]_,
@@ -151,8 +146,8 @@ def ssim(img1, img2, weighted=False, dynamic_range=1):
Returns
------
mssim : torch.Tensor
2d tensor containing the mean SSIM for each image, averaged over the
whole image
2d tensor of shape (batch, channel) containing the mean SSIM for each
image, averaged over the whole image
Notes
-----
@@ -169,7 +164,7 @@ def ssim(img1, img2, weighted=False, dynamic_range=1):
----------
.. [1] Z. Wang, A. C. Bovik, H. R. Sheikh, and E. P. Simoncelli, "Image
quality assessment: From error measurement to structural similarity"
IEEE Transactios on Image Processing, vol. 13, no. 1, Jan. 2004.
IEEE Transactions on Image Processing, vol. 13, no. 1, Jan. 2004.
.. [2] [matlab code](https://www.cns.nyu.edu/~lcv/ssim/ssim_index.m)
.. [3] [project page](https://www.cns.nyu.edu/~lcv/ssim/)
.. [4] Wang, Z., & Simoncelli, E. P. (2008). Maximum differentiation (MAD)
@@ -186,6 +181,10 @@ def ssim(img1, img2, weighted=False, dynamic_range=1):
else:
mssim = (map_ssim*weight).sum((-1, -2)) / weight.sum((-1, -2))

if min(img1.shape[2], img1.shape[3]) < 11:
warnings.warn("SSIM uses 11x11 convolutional kernel, but the height and/or "
"the width of the input image is smaller than 11, so the "
"kernel size is set to be the minimum of these two numbers.")
return mssim
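The warning added here (and mirrored in `ssim_map` and `ms_ssim` below) fires whenever either spatial dimension of the input is smaller than the 11-pixel window. A quick usage sketch with assumed image sizes; `po` is the `plenoptic` package, as in the tests further down.

```python
import torch
import plenoptic as po

img1 = torch.rand(1, 1, 8, 64)   # height 8 < 11, so the kernel-size warning is emitted
img2 = torch.rand(1, 1, 8, 64)
small = po.metric.ssim(img1, img2)   # still returns a (batch, channel) tensor

big = po.metric.ssim(torch.rand(1, 1, 64, 64),
                     torch.rand(1, 1, 64, 64))  # both dimensions >= 11: no warning
```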


@@ -195,7 +194,7 @@ def ssim_map(img1, img2, dynamic_range=1):
As described in [1]_, the structural similarity index (SSIM) is a
perceptual distance metric, giving the distance between two images. SSIM is
based on three comparison measurements between the two images: luminance,
contrast, and structure. All of these are computed in windows across the
contrast, and structure. All of these are computed convolutionally across the
images. See the references for more information.
This implementation follows the original implementation, as found at [2]_,
@@ -238,7 +237,7 @@ def ssim_map(img1, img2, dynamic_range=1):
----------
.. [1] Z. Wang, A. C. Bovik, H. R. Sheikh, and E. P. Simoncelli, "Image
quality assessment: From error measurement to structural similarity"
IEEE Transactios on Image Processing, vol. 13, no. 1, Jan. 2004.
IEEE Transactions on Image Processing, vol. 13, no. 1, Jan. 2004.
.. [2] [matlab code](https://www.cns.nyu.edu/~lcv/ssim/ssim_index.m)
.. [3] [project page](https://www.cns.nyu.edu/~lcv/ssim/)
.. [4] Wang, Z., & Simoncelli, E. P. (2008). Maximum differentiation (MAD)
@@ -247,9 +246,97 @@ def ssim_map(img1, img2, dynamic_range=1):
http://dx.doi.org/10.1167/8.12.8
"""
if min(img1.shape[2], img1.shape[3]) < 11:
warnings.warn("SSIM uses 11x11 convolutional kernel, but the height and/or "
"the width of the input image is smaller than 11, so the "
"kernel size is set to be the minimum of these two numbers.")
return _ssim_parts(img1, img2, dynamic_range)[0]


def ms_ssim(img1, img2, dynamic_range=1, power_factors=None):
r"""Multiscale structural similarity index (MS-SSIM)
As described in [1]_, the multiscale structural similarity index (MS-SSIM) is
an improvement upon the structural similarity index (SSIM) that takes into
account the perceptual distance between two images at different scales.
SSIM is based on three comparison measurements between the two images:
luminance, contrast, and structure. All of these are computed convolutionally
across the images, producing three maps instead of scalars. The SSIM map is
the elementwise product of these three maps. See `metric.ssim` and
`metric.ssim_map` for a full description of SSIM.
To get images of different scales, average pooling operations with kernel
size 2 are performed recursively on the input images. The product of
contrast map and structure map (the "contrast-structure map") is computed
for all but the coarsest scale, and the overall SSIM map is only computed
for the coarsest scale. Their mean values are raised to exponents and
multiplied to produce MS-SSIM:
.. math::
MSSSIM = {SSIM}_M^{a_M} \prod_{i=1}^{M-1} ({CS}_i)^{a_i}
Here :math:`M` is the number of scales, :math:`{CS}_i` is the mean value
of the contrast-structure map for the i'th finest scale, and :math:`{SSIM}_M`
is the mean value of the SSIM map for the coarsest scale. If at least one
of these terms is negative, the value of MS-SSIM is zero. The values of
:math:`a_i, i=1,...,M` are taken from the argument `power_factors`.
Parameters
----------
img1 : torch.Tensor
4d tensor with first image to compare
img2 : torch.Tensor
4d tensor with second image to compare. Must have the same height and
width (last two dimensions) as `img1`
dynamic_range : int, optional.
dynamic range of the images. Note we assume that both images have the
same dynamic range. 1, the default, is appropriate for float images
between 0 and 1, as is common in synthesis. 2 is appropriate for float
images between -1 and 1, and 255 is appropriate for standard 8-bit
integer images. We'll raise a warning if it looks like your value is
not appropriate for `img1` or `img2`, but will calculate it anyway.
power_factors : 1D array, optional.
power exponents for the mean values of maps, for different scales (from
fine to coarse). The length of this array determines the number of scales.
By default, this is set to [0.0448, 0.2856, 0.3001, 0.2363, 0.1333],
which is what psychophysical experiments in [1]_ found.
Returns
------
msssim : torch.Tensor
2d tensor of shape (batch, channel) containing the MS-SSIM for each image
References
----------
.. [1] Wang, Zhou, Eero P. Simoncelli, and Alan C. Bovik. "Multiscale
structural similarity for image quality assessment." The Thirty-Seventh
Asilomar Conference on Signals, Systems & Computers, 2003. Vol. 2. IEEE, 2003.
"""
if power_factors is None:
power_factors = [0.0448, 0.2856, 0.3001, 0.2363, 0.1333]

def downsample(img):
img = F.pad(img, (0, img.shape[3] % 2, 0, img.shape[2] % 2), mode="replicate")
img = F.avg_pool2d(img, kernel_size=2)
return img

msssim = 1
for i in range(len(power_factors) - 1):
_, contrast_structure_map, _ = _ssim_parts(img1, img2, dynamic_range)
msssim *= F.relu(contrast_structure_map.mean((-1, -2))).pow(power_factors[i])
img1 = downsample(img1)
img2 = downsample(img2)
map_ssim, _, _ = _ssim_parts(img1, img2, dynamic_range)
msssim *= F.relu(map_ssim.mean((-1, -2))).pow(power_factors[-1])

if min(img1.shape[2], img1.shape[3]) < 11:
warnings.warn("SSIM uses 11x11 convolutional kernel, but for some scales "
"of the input image, the height and/or the width is smaller "
"than 11, so the kernel size in SSIM is set to be the "
"minimum of these two numbers for these scales.")
return msssim
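A usage sketch for the new metric, with illustrative shapes and parameter choices; only `po.metric.ms_ssim` and its arguments come from the diff above, everything else is assumed for the example.

```python
import torch
import plenoptic as po

img1 = torch.rand(1, 1, 256, 256)                          # 4d (batch, channel, height, width)
img2 = (img1 + 0.05 * torch.randn_like(img1)).clamp(0, 1)

score = po.metric.ms_ssim(img1, img2)   # default 5-scale power factors from Wang et al. (2003)
same = po.metric.ms_ssim(img1, img1)    # identical images: every scale contributes 1, so MS-SSIM is 1

# a shorter power_factors list means fewer scales (here two); the values act as exponents
two_scale = po.metric.ms_ssim(img1, img2, power_factors=[0.5, 0.5])
```

Because each per-scale mean is clipped by the ReLU and never exceeds 1, the product stays in [0, 1], consistent with the reference values checked in `test_msssim_analysis` below.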


def normalized_laplacian_pyramid(im):
"""computes the normalized Laplacian Pyramid using pre-optimized parameters
@@ -273,7 +360,8 @@ def normalized_laplacian_pyramid(im):
padd = 2
normalized_laplacian_activations = []
for N_b in range(0, N_scales):
filt = torch.tensor(spatialpooling_filters[N_b], dtype=torch.float32, device=im.device).unsqueeze(0).unsqueeze(0)
filt = torch.tensor(spatialpooling_filters[N_b], dtype=torch.float32,
device=im.device).unsqueeze(0).unsqueeze(0)
filtered_activations = F.conv2d(torch.abs(laplacian_activations[N_b]), filt, padding=padd, groups=channel)
normalized_laplacian_activations.append(laplacian_activations[N_b] / (sigmas[N_b] + filtered_activations))

@@ -314,7 +402,8 @@ def nlpd(IM_1, IM_2):
References
----------
.. [1] Laparra, V., Ballé, J., Berardino, A. and Simoncelli, E.P., 2016. Perceptual image quality assessment using a normalized Laplacian pyramid. Electronic Imaging, 2016(16), pp.1-6.
.. [1] Laparra, V., Ballé, J., Berardino, A. and Simoncelli, E.P., 2016. Perceptual image quality
assessment using a normalized Laplacian pyramid. Electronic Imaging, 2016(16), pp.1-6.
"""

y = normalized_laplacian_pyramid(torch.cat((IM_1, IM_2), 0))
38 changes: 29 additions & 9 deletions tests/test_plenoptic.py
@@ -14,7 +14,7 @@

# If you add anything here, remember to update the docstring in osf_download!
OSF_URL = {'plenoptic-test-files.tar.gz': 'q9kn8', 'ssim_images.tar.gz': 'j65tw',
'ssim_analysis.mat': 'ndtc7', 'MAD_results.tar.gz': 'jwcsr'}
'ssim_analysis.mat': 'ndtc7', 'msssim_images.tar.gz': '5fuba', 'MAD_results.tar.gz': 'jwcsr'}


def osf_download(filename):
@@ -28,7 +28,7 @@ def osf_download(filename):
Parameters
----------
filename : {'plenoptic-test-files.tar.gz', 'ssim_images.tar.gz',
'ssim_analysis.mat', 'MAD_results.tar.gz'}
'ssim_analysis.mat', 'msssim_images.tar.gz', 'MAD_results.tar.gz'}
Which file to download.
Returns
@@ -74,6 +74,11 @@ def ssim_images():
return osf_download('ssim_images.tar.gz')


@pytest.fixture()
def msssim_images():
return osf_download('msssim_images.tar.gz')


@pytest.fixture()
def ssim_analysis():
ssim_analysis = osf_download('ssim_analysis.mat')
@@ -168,20 +173,25 @@ def test_ssim(self, einstein_img, curie_img, weighted):
curie_img.requires_grad_()
assert po.metric.ssim(einstein_img, curie_img, weighted=weighted).requires_grad

@pytest.mark.parametrize('func_name', ['noise', 'mse', 'ssim'])
def test_msssim(self, einstein_img, curie_img):
curie_img.requires_grad_()
assert po.metric.ms_ssim(einstein_img, curie_img).requires_grad

@pytest.mark.parametrize('func_name', ['noise', 'mse', 'ssim', 'ms-ssim'])
@pytest.mark.parametrize('size_A', [1, 3])
@pytest.mark.parametrize('size_B', [1, 2, 3])
def test_batch_handling(self, einstein_img, curie_img, func_name, size_A, size_B):
if func_name == 'noise':
func = po.add_noise
A = einstein_img.repeat(size_A, 1, 1, 1)
B = size_B * [4]
elif func_name == 'mse':
func = po.metric.mse
A = einstein_img.repeat(size_A, 1, 1, 1)
B = curie_img.repeat(size_B, 1, 1, 1)
elif func_name == 'ssim':
func = po.metric.ssim
else:
if func_name == 'mse':
func = po.metric.mse
elif func_name == 'ssim':
func = po.metric.ssim
elif func_name == 'ms-ssim':
func = po.metric.ms_ssim
A = einstein_img.repeat(size_A, 1, 1, 1)
B = curie_img.repeat(size_B, 1, 1, 1)
if size_A != size_B and size_A != 1 and size_B != 1:
@@ -238,6 +248,16 @@ def test_ssim_analysis(self, weighted, other_img, ssim_images,
print(plen_val-mat_val, plen_val, mat_val)
assert torch.allclose(plen_val, mat_val.view_as(plen_val), atol=1e-5)

def test_msssim_analysis(self, msssim_images):
# True values are defined by https://ece.uwaterloo.ca/~z70wang/research/iwssim/msssim.zip
true_values = torch.tensor([1.0000000, 0.9112161, 0.7699084, 0.8785111, 0.9488805], device=DEVICE)
computed_values = torch.zeros_like(true_values)
base_img = po.load_images(op.join(msssim_images, "samp0.tiff")).to(DEVICE)
for i in range(len(true_values)):
other_img = po.load_images(op.join(msssim_images, f"samp{i}.tiff")).to(DEVICE)
computed_values[i] = po.metric.ms_ssim(base_img, other_img)
assert torch.allclose(true_values, computed_values)

def test_nlpd(self, einstein_img, curie_img):
curie_img.requires_grad_()
assert po.metric.nlpd(einstein_img, curie_img).requires_grad
