arviz-devs · ahartikainen · Aug 16, 2020 · Aug 16, 2020 · Aug 16, 2020 · Aug 16, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,7 +14,7 @@
 * improve handling of circular variables in `az.summary` (#1313)
 * Removed change of default warning in `ELPDData` string representation (#1321)
 * update `radon` example dataset to current InferenceData schema specification (#1320)
-* update `from_cmdstan` functionality and add warmup groups (#1330)
+* update `from_cmdstan` functionality and add warmup groups (#1330 and #1351)
 * restructure plotting code to be compatible with mpl>=3.3 (#1312)
 * Replaced `_fast_kde()` with `kde()` which now also supports circular variables
 via the argument `circular` (#1284).

diff --git a/arviz/data/io_cmdstan.py b/arviz/data/io_cmdstan.py
@@ -222,8 +222,8 @@ def sample_stats_to_xarray(self):
         data = _unpack_dataframes(sampler_params)
         data_warmup = _unpack_dataframes(sampler_params_warmup)
         return (
-            dict_to_dataset(data, coords=self.coords, dims=self.dims),
-            dict_to_dataset(data_warmup, coords=self.coords, dims=self.dims),
+            dict_to_dataset(data, coords=self.coords, dims=self.dims, attrs=self.attrs),
+            dict_to_dataset(data_warmup, coords=self.coords, dims=self.dims, attrs=self.attrs),
         )
 
     @requires("posterior")
@@ -375,8 +375,10 @@ def sample_stats_prior_to_xarray(self):
         data = _unpack_dataframes(sampler_params)
         data_warmup = _unpack_dataframes(sampler_params_warmup)
         return (
-            dict_to_dataset(data, coords=self.coords, dims=self.dims),
-            dict_to_dataset(data_warmup, coords=self.coords, dims=self.dims),
+            dict_to_dataset(data, coords=self.coords, dims=self.dims, attrs=self.attrs_prior),
+            dict_to_dataset(
+                data_warmup, coords=self.coords, dims=self.dims, attrs=self.attrs_prior
+            ),
         )
 
     @requires("prior")
@@ -554,39 +556,35 @@ def to_inference_data(self):
 def _process_configuration(comments):
     """Extract sampling information."""
     results = {
-        "extra": [],
+        "comments": "\n".join(comments),
         "stan_version": {},
     }
 
     comments_gen = iter(comments)
 
     for comment in comments_gen:
-        comment = comment.strip("#").strip()
-        if comment.startswith("num_samples"):
-            results["num_samples"] = int(comment.strip("num_samples = ").strip("(Default)"))
-        elif comment.startswith("num_warmup"):
-            results["num_warmup"] = int(comment.strip("num_warmup = ").strip("(Default)"))
-        elif comment.startswith("save_warmup"):
-            results["save_warmup"] = bool(int(comment.strip("save_warmup = ").strip("(Default)")))
-        elif comment.startswith("thin"):
-            results["thin"] = int(comment.strip("thin = ").strip("(Default)"))
-        elif comment.startswith("stan_version_"):
-            key, val = comment.strip("stan_version_").split("=")
+        comment = re.sub(r"^\s*#\s*|\s*\(Default\)\s*$", "", comment).strip()
+        if comment.startswith("stan_version_"):
+            key, val = re.sub(r"^\s*stan_version_", "", comment).split("=")
             results["stan_version"][key.strip()] = val.strip()
         elif comment.startswith("Step size"):
             _, val = comment.split("=")
             results["step_size"] = float(val.strip())
         elif "inverse mass matrix" in comment:
-            comment = next(comments_gen).strip("#").strip()
+            comment = re.sub(r"^\s*#\s*", "", next(comments_gen)).strip()
             results["inverse_mass_matrix"] = np.array(comment.split(","), dtype=float)
         elif ("seconds" in comment) and any(
             item in comment for item in ("(Warm-up)", "(Sampling)", "(Total)")
         ):
-            value = (
-                comment.strip("Elapsed Time:")
-                .strip("seconds (Warm-up)")
-                .strip("seconds (Sampling)")
-                .strip("seconds (Total)")
+            value = re.sub(
+                (
+                    r"^Elapsed\s*Time:\s*|"
+                    r"\s*seconds\s*\(Warm-up\)\s*|"
+                    r"\s*seconds\s*\(Sampling\)\s*|"
+                    r"\s*seconds\s*\(Total\)\s*"
+                ),
+                "",
+                comment,
             )
             key = (
                 "warmup_time_seconds"
@@ -596,9 +594,25 @@ def _process_configuration(comments):
                 else "total_time_seconds"
             )
             results[key] = float(value)
-        else:
-            results["extra"].append(comment)
-
+        elif "=" in comment:
+            match_int = re.search(r"^(\S+)\s*=\s*([-+]?[0-9]+)$", comment)
+            match_float = re.search(r"^(\S+)\s*=\s*([-+]?[0-9]+\.[0-9]+)$", comment)
+            match_str = re.search(r"^(\S+)\s*=\s*(\S+)$", comment)
+            match_empty = re.search(r"^(\S+)\s*=\s*$", comment)
+            if match_int:
+                key, value = match_int.group(1), match_int.group(2)
+                results[key] = int(value)
+            elif match_float:
+                key, value = match_float.group(1), match_float.group(2)
+                results[key] = float(value)
+            elif match_str:
+                key, value = match_str.group(1), match_str.group(2)
+                results[key] = value
+            elif match_empty:
+                key = match_empty.group(1)
+                results[key] = None
+
+    results = {key: results[key] for key in sorted(results)}
     return results
 
 

diff --git a/arviz/tests/external_tests/test_data_cmdstan.py b/arviz/tests/external_tests/test_data_cmdstan.py
@@ -64,6 +64,7 @@ def test_sample_stats(self, paths):
                 continue
             inference_data = self.get_inference_data(path)
             assert hasattr(inference_data, "sample_stats")
+            assert "comments" in inference_data.sample_stats.attrs
 
     def test_inference_data_shapes(self, paths):
         """Assert that shapes are transformed correctly"""
@@ -87,6 +88,7 @@ def test_inference_data_shapes(self, paths):
             Z_mean_true = np.array([1, 2, 3, 4])
             Z_mean = inference_data.posterior["Z"].mean(dim=dims).mean(axis=1)
             assert np.isclose(Z_mean, Z_mean_true, atol=7e-1).all()
+            assert "comments" in inference_data.posterior.attrs
 
     def test_inference_data_input_types1(self, paths, observed_data_paths):
         """Check input types