From f47870fdf88ac4aeb11c89ce9dd41d2086690558 Mon Sep 17 00:00:00 2001
From: Philipp Schmelter <philipp@stressnet.de>
Date: Sat, 28 Jun 2025 02:07:29 +0200
Subject: [PATCH 1/5] current gmm state

---
 CHANGELOG.md      |   1 +
 src/main.py       | 134 ++++++++++++++++++++++++++++++++++++++++------
 src/markov/gmm.py | 129 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 248 insertions(+), 16 deletions(-)
 create mode 100644 src/markov/gmm.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04964c4..5eca4ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7)
 - Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10)
+- Added GMM feature to project [#12](https://github.com/ie3-institute/simonaMarkovLoad/issues/12)
 
 ### Changed
 - Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1)
diff --git a/src/main.py b/src/main.py
index 099795a..7173343 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,31 +1,133 @@
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
+from matplotlib.colors import Normalize
 
+from src.markov.buckets import assign_buckets, bucket_id
+from src.markov.gmm import fit_gmms, sample_value
 from src.markov.transition_counts import build_transition_counts
 from src.markov.transitions import build_transition_matrices
 from src.preprocessing.loader import load_timeseries
 
-df = load_timeseries(normalize=True, discretize=True)
+SIM_DAYS = 3
 
 
-counts = build_transition_counts(df)
-probs = build_transition_matrices(df, alpha=1.0)
+def _detect_value_col(df: pd.DataFrame) -> str:
+    candidate_cols = ["x", "value", "load", "power", "p_norm", "load_norm"]
+    for c in candidate_cols:
+        if c in df.columns and np.issubdtype(df[c].dtype, np.number):
+            return c
+    raise KeyError("No numeric load column found – please inspect the dataframe.")
 
-print("counts shape :", counts.shape)
-print("probs  shape :", probs.shape)
 
+def _simulate_series(
+    probs: np.ndarray,
+    gmms,
+    start_ts: pd.Timestamp,
+    start_state: int,
+    periods: int,
+    rng: np.random.Generator | None = None,
+) -> pd.DataFrame:
+    """Generate synthetic 15‑min series (timestamp, state, x)."""
+    rng = np.random.default_rng() if rng is None else rng
+    timestamps = pd.date_range(start_ts, periods=periods, freq="15min")
+    states = np.empty(periods, dtype=int)
+    xs = np.empty(periods, dtype=float)
 
-active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0]
-bucket = int(active_buckets[0]) if active_buckets.size else 0
-print(f"\nUsing bucket {bucket}")
+    s = start_state
+    for i, ts in enumerate(timestamps):
+        b = bucket_id(ts)
+        s = rng.choice(probs.shape[1], p=probs[b, s])
+        states[i] = s
+        xs[i] = sample_value(gmms, b, s, rng=rng)
+    return pd.DataFrame({"timestamp": timestamps, "state": states, "x_sim": xs})
 
 
-print("row sums :", probs[bucket].sum(axis=1))
+def main() -> None:
+    df = load_timeseries(normalize=True, discretize=True)
+    if "bucket" not in df.columns:
+        df = assign_buckets(df)
 
-plt.imshow(probs[bucket], aspect="auto")
-plt.title(f"Bucket {bucket} – transition probabilities")
-plt.xlabel("state t+1")
-plt.ylabel("state t")
-plt.colorbar()
-plt.tight_layout()
-plt.show()
+    value_col = _detect_value_col(df)
+    print("Using load column:", value_col)
+
+    counts = build_transition_counts(df)
+    probs = build_transition_matrices(df)
+
+    _plot_first_25_buckets(counts, probs)
+
+    print("Fitting GMMs … (this may take a moment)")
+    gmms = fit_gmms(df, value_col=value_col)
+
+    periods = SIM_DAYS * 96
+    sim_df = _simulate_series(
+        probs,
+        gmms,
+        start_ts=df["timestamp"].min().normalize(),
+        start_state=int(df["state"].iloc[0]),
+        periods=periods,
+    )
+
+    _plot_simulation_diagnostics(df, sim_df, value_col)
+
+
+def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None:
+    """Heat‑map grid for buckets 0‑24."""
+    buckets = list(range(25))
+    fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
+    vmax = probs[buckets].max()
+    norm = Normalize(vmin=0, vmax=vmax)
+
+    for idx, b in enumerate(buckets):
+        ax = axes.flat[idx]
+        if counts[b].sum() == 0:
+            ax.axis("off")
+            continue
+        im = ax.imshow(probs[b], aspect="auto", origin="lower", norm=norm)
+        ax.set_title(f"Bucket {b}", fontsize=8)
+        ax.set_xticks([])
+        ax.set_yticks([])
+
+    for ax in axes.flat[len(buckets) :]:
+        ax.axis("off")
+
+    fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p")
+    fig.suptitle("Transition probabilities – buckets 0‑24", fontsize=14)
+    fig.tight_layout(rect=[0, 0, 0.97, 0.96])
+    plt.show()
+
+
+def _plot_simulation_diagnostics(
+    df: pd.DataFrame, sim: pd.DataFrame, value_col: str
+) -> None:
+    first_day = sim.iloc[:96]
+    plt.figure(figsize=(10, 3))
+    plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".")
+    plt.title("Simulated power – first day")
+    plt.ylabel("normalised load x")
+    plt.tight_layout()
+    plt.show()
+
+    plt.figure(figsize=(6, 4))
+    plt.hist(df[value_col], bins=50, alpha=0.6, density=True, label="original")
+    plt.hist(sim["x_sim"], bins=50, alpha=0.6, density=True, label="simulated")
+    plt.title("Original vs simulated load distribution")
+    plt.xlabel("normalised load x")
+    plt.ylabel("density")
+    plt.legend()
+    plt.tight_layout()
+    plt.show()
+
+    sim["hour"] = sim["timestamp"].dt.hour
+    plt.figure(figsize=(10, 4))
+    sim.boxplot(column="x_sim", by="hour", grid=False)
+    plt.suptitle("")
+    plt.title("Simulated power by hour of day")
+    plt.xlabel("hour of day")
+    plt.ylabel("normalised load x")
+    plt.tight_layout()
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/markov/gmm.py b/src/markov/gmm.py
new file mode 100644
index 0000000..daf8663
--- /dev/null
+++ b/src/markov/gmm.py
@@ -0,0 +1,129 @@
+import math
+from typing import Dict, List, Tuple
+
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.mixture import GaussianMixture
+
+try:
+    # rich progress bar for terminals / notebooks; optional dependency
+    from tqdm.auto import tqdm  # type: ignore
+except ImportError:
+    tqdm = None  # type: ignore
+
+from .buckets import NUM_BUCKETS
+from .transition_counts import N_STATES
+
+__all__ = [
+    "GaussianBucketModels",
+    "fit_gmms",
+    "sample_value",
+]
+
+
+GmmTuple = Tuple[np.ndarray, np.ndarray, np.ndarray]
+GaussianBucketModels = List[List[GmmTuple]]
+
+
+def _fit_single(
+    x: np.ndarray,
+    *,
+    min_samples: int = 30,
+    k_candidates: Tuple[int, ...] = (1, 2, 3),
+    random_state: int | None = None,
+) -> GmmTuple:
+    """Fit 1‑3 component spherical GMM; fallback to Normal if too few samples."""
+    if len(x) < min_samples:
+        mean = float(np.mean(x))
+        var = float(np.var(x) + 1e-6)
+        return (np.array([1.0]), np.array([mean]), np.array([var]))
+
+    best_bic = math.inf
+    best_gmm: GaussianMixture | None = None
+    for k in k_candidates:
+        gmm = GaussianMixture(
+            n_components=k,
+            covariance_type="spherical",
+            n_init="auto",
+            random_state=random_state,
+        ).fit(x.reshape(-1, 1))
+        bic = gmm.bic(x.reshape(-1, 1))
+        if bic < best_bic:
+            best_bic = bic
+            best_gmm = gmm
+
+    weights = best_gmm.weights_
+    means = best_gmm.means_.ravel()
+    variances = best_gmm.covariances_.ravel()
+    return (weights, means, variances)
+
+
+def fit_gmms(
+    df: pd.DataFrame,
+    *,
+    value_col: str = "x",
+    bucket_col: str = "bucket",
+    state_col: str = "state",
+    min_samples: int = 30,
+    k_candidates: Tuple[int, ...] = (1, 2, 3),
+    n_jobs: int = -1,
+    random_state: int | None = None,
+    verbose: int = 0,
+) -> GaussianBucketModels:
+    """Return list [bucket][state] -> (weights, means, variances)."""
+
+    samples: Dict[Tuple[int, int], List[float]] = {}
+    for _, row in df[[bucket_col, state_col, value_col]].iterrows():
+        samples.setdefault((row[bucket_col], row[state_col]), []).append(row[value_col])
+
+    tasks = [
+        ((b, s), samples.get((b, s), []))
+        for b in range(NUM_BUCKETS)
+        for s in range(N_STATES)
+    ]
+
+    iterable = tasks
+    if verbose > 0 and tqdm is not None:
+        iterable = tqdm(tasks, desc="Fitting GMMs", unit="model")
+
+    def _worker(item: Tuple[Tuple[int, int], List[float]]):
+        (b, s), x_list = item
+        x = np.asarray(x_list, dtype=float)
+        return (
+            b,
+            s,
+            _fit_single(
+                x,
+                min_samples=min_samples,
+                k_candidates=k_candidates,
+                random_state=random_state,
+            ),
+        )
+
+    results = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)(
+        joblib.delayed(_worker)(task) for task in iterable
+    )
+
+    gmms: GaussianBucketModels = [
+        [None for _ in range(N_STATES)] for _ in range(NUM_BUCKETS)
+    ]
+    for b, s, gmm_tuple in results:
+        gmms[b][s] = gmm_tuple
+    return gmms
+
+
+_rng = np.random.default_rng()
+
+
+def sample_value(
+    gmms: GaussianBucketModels,
+    bucket: int,
+    state: int,
+    rng: np.random.Generator | None = None,
+) -> float:
+    """Draw a normalised load value from the GMM for (bucket, state)."""
+    weights, means, vars_ = gmms[bucket][state]
+    rng = _rng if rng is None else rng
+    comp = rng.choice(len(weights), p=weights)
+    return float(rng.normal(means[comp], math.sqrt(vars_[comp])))

From 444899f4e4d7fdc305581d4a49cd7bf3bbb25ddd Mon Sep 17 00:00:00 2001
From: Philipp Schmelter <philipp@stressnet.de>
Date: Tue, 8 Jul 2025 18:40:15 +0200
Subject: [PATCH 2/5] update

---
 src/main.py                     | 87 +++++++++++++++++----------------
 src/markov/gmm.py               | 81 ++++++++++++++++--------------
 src/markov/transition_counts.py |  5 --
 3 files changed, 88 insertions(+), 85 deletions(-)

diff --git a/src/main.py b/src/main.py
index 7173343..3598572 100644
--- a/src/main.py
+++ b/src/main.py
@@ -9,15 +9,15 @@
 from src.markov.transitions import build_transition_matrices
 from src.preprocessing.loader import load_timeseries
 
-SIM_DAYS = 3
+SIM_DAYS = 10
+PER_DAY = 96
 
 
 def _detect_value_col(df: pd.DataFrame) -> str:
-    candidate_cols = ["x", "value", "load", "power", "p_norm", "load_norm"]
-    for c in candidate_cols:
+    for c in ["x", "value", "load", "power", "p_norm", "load_norm"]:
         if c in df.columns and np.issubdtype(df[c].dtype, np.number):
             return c
-    raise KeyError("No numeric load column found – please inspect the dataframe.")
+    raise KeyError("numeric load column missing")
 
 
 def _simulate_series(
@@ -28,54 +28,25 @@ def _simulate_series(
     periods: int,
     rng: np.random.Generator | None = None,
 ) -> pd.DataFrame:
-    """Generate synthetic 15‑min series (timestamp, state, x)."""
     rng = np.random.default_rng() if rng is None else rng
-    timestamps = pd.date_range(start_ts, periods=periods, freq="15min")
+    ts = pd.date_range(start_ts, periods=periods, freq="15min")
     states = np.empty(periods, dtype=int)
     xs = np.empty(periods, dtype=float)
 
     s = start_state
-    for i, ts in enumerate(timestamps):
-        b = bucket_id(ts)
+    for i, t in enumerate(ts):
+        b = bucket_id(t)
         s = rng.choice(probs.shape[1], p=probs[b, s])
         states[i] = s
         xs[i] = sample_value(gmms, b, s, rng=rng)
-    return pd.DataFrame({"timestamp": timestamps, "state": states, "x_sim": xs})
 
-
-def main() -> None:
-    df = load_timeseries(normalize=True, discretize=True)
-    if "bucket" not in df.columns:
-        df = assign_buckets(df)
-
-    value_col = _detect_value_col(df)
-    print("Using load column:", value_col)
-
-    counts = build_transition_counts(df)
-    probs = build_transition_matrices(df)
-
-    _plot_first_25_buckets(counts, probs)
-
-    print("Fitting GMMs … (this may take a moment)")
-    gmms = fit_gmms(df, value_col=value_col)
-
-    periods = SIM_DAYS * 96
-    sim_df = _simulate_series(
-        probs,
-        gmms,
-        start_ts=df["timestamp"].min().normalize(),
-        start_state=int(df["state"].iloc[0]),
-        periods=periods,
-    )
-
-    _plot_simulation_diagnostics(df, sim_df, value_col)
+    return pd.DataFrame({"timestamp": ts, "state": states, "x_sim": xs})
 
 
 def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None:
-    """Heat‑map grid for buckets 0‑24."""
-    buckets = list(range(25))
+    buckets = range(25)
     fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
-    vmax = probs[buckets].max()
+    vmax = probs[list(buckets)].max()
     norm = Normalize(vmin=0, vmax=vmax)
 
     for idx, b in enumerate(buckets):
@@ -92,15 +63,15 @@ def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None:
         ax.axis("off")
 
     fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p")
-    fig.suptitle("Transition probabilities – buckets 0‑24", fontsize=14)
-    fig.tight_layout(rect=[0, 0, 0.97, 0.96])
+    fig.suptitle("Transition probabilities – buckets 0–24", fontsize=14)
+    plt.tight_layout(rect=[0, 0, 0.97, 0.96])
     plt.show()
 
 
 def _plot_simulation_diagnostics(
     df: pd.DataFrame, sim: pd.DataFrame, value_col: str
 ) -> None:
-    first_day = sim.iloc[:96]
+    first_day = sim.iloc[:PER_DAY]
     plt.figure(figsize=(10, 3))
     plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".")
     plt.title("Simulated power – first day")
@@ -121,7 +92,6 @@ def _plot_simulation_diagnostics(
     sim["hour"] = sim["timestamp"].dt.hour
     plt.figure(figsize=(10, 4))
     sim.boxplot(column="x_sim", by="hour", grid=False)
-    plt.suptitle("")
     plt.title("Simulated power by hour of day")
     plt.xlabel("hour of day")
     plt.ylabel("normalised load x")
@@ -129,5 +99,36 @@ def _plot_simulation_diagnostics(
     plt.show()
 
 
+def main() -> None:
+    df = load_timeseries(normalize=True, discretize=True)
+    if "bucket" not in df.columns:
+        df = assign_buckets(df)
+
+    val_col = _detect_value_col(df)
+
+    counts = build_transition_counts(df)
+    probs = build_transition_matrices(df)
+
+    _plot_first_25_buckets(counts, probs)
+
+    gmms = fit_gmms(
+        df,
+        value_col=val_col,
+        verbose=1,
+        heartbeat_seconds=60,
+    )
+
+    periods = SIM_DAYS * PER_DAY
+    sim = _simulate_series(
+        probs,
+        gmms,
+        start_ts=df["timestamp"].min().normalize(),
+        start_state=int(df["state"].iloc[0]),
+        periods=periods,
+    )
+
+    _plot_simulation_diagnostics(df, sim, val_col)
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/markov/gmm.py b/src/markov/gmm.py
index daf8663..061197d 100644
--- a/src/markov/gmm.py
+++ b/src/markov/gmm.py
@@ -1,5 +1,5 @@
 import math
-from typing import Dict, List, Tuple
+from typing import List, Optional, Tuple
 
 import joblib
 import numpy as np
@@ -7,23 +7,19 @@
 from sklearn.mixture import GaussianMixture
 
 try:
-    # rich progress bar for terminals / notebooks; optional dependency
-    from tqdm.auto import tqdm  # type: ignore
+    from tqdm.auto import tqdm
 except ImportError:
-    tqdm = None  # type: ignore
+    tqdm = None
 
 from .buckets import NUM_BUCKETS
 from .transition_counts import N_STATES
 
-__all__ = [
-    "GaussianBucketModels",
-    "fit_gmms",
-    "sample_value",
-]
-
+__all__ = ["GaussianBucketModels", "fit_gmms", "sample_value"]
 
 GmmTuple = Tuple[np.ndarray, np.ndarray, np.ndarray]
-GaussianBucketModels = List[List[GmmTuple]]
+GaussianBucketModels = List[List[Optional[GmmTuple]]]
+
+_rng = np.random.default_rng()
 
 
 def _fit_single(
@@ -32,31 +28,35 @@ def _fit_single(
     min_samples: int = 30,
     k_candidates: Tuple[int, ...] = (1, 2, 3),
     random_state: int | None = None,
-) -> GmmTuple:
-    """Fit 1‑3 component spherical GMM; fallback to Normal if too few samples."""
+) -> Optional[GmmTuple]:
+    if x.size == 0:
+        return None
     if len(x) < min_samples:
         mean = float(np.mean(x))
-        var = float(np.var(x) + 1e-6)
-        return (np.array([1.0]), np.array([mean]), np.array([var]))
-
+        var = float(max(np.var(x), 1e-5))
+        return (
+            np.array([1.0], dtype=float),
+            np.array([mean], dtype=float),
+            np.array([var], dtype=float),
+        )
     best_bic = math.inf
     best_gmm: GaussianMixture | None = None
     for k in k_candidates:
         gmm = GaussianMixture(
             n_components=k,
             covariance_type="spherical",
-            n_init="auto",
+            n_init=1,
             random_state=random_state,
         ).fit(x.reshape(-1, 1))
         bic = gmm.bic(x.reshape(-1, 1))
         if bic < best_bic:
             best_bic = bic
             best_gmm = gmm
-
-    weights = best_gmm.weights_
-    means = best_gmm.means_.ravel()
-    variances = best_gmm.covariances_.ravel()
-    return (weights, means, variances)
+    return (
+        best_gmm.weights_,
+        best_gmm.means_.ravel(),
+        best_gmm.covariances_.ravel(),
+    )
 
 
 def fit_gmms(
@@ -65,29 +65,37 @@ def fit_gmms(
     value_col: str = "x",
     bucket_col: str = "bucket",
     state_col: str = "state",
-    min_samples: int = 30,
+    min_samples: int = 5,
     k_candidates: Tuple[int, ...] = (1, 2, 3),
     n_jobs: int = -1,
     random_state: int | None = None,
     verbose: int = 0,
+    heartbeat_seconds: int | None = None,
 ) -> GaussianBucketModels:
-    """Return list [bucket][state] -> (weights, means, variances)."""
+    if heartbeat_seconds:
+        import faulthandler
+
+        faulthandler.enable()
+        faulthandler.dump_traceback_later(heartbeat_seconds, repeat=True)
 
-    samples: Dict[Tuple[int, int], List[float]] = {}
-    for _, row in df[[bucket_col, state_col, value_col]].iterrows():
-        samples.setdefault((row[bucket_col], row[state_col]), []).append(row[value_col])
+    grouped = (
+        df[[bucket_col, state_col, value_col]]
+        .groupby([bucket_col, state_col])[value_col]
+        .apply(list)
+        .to_dict()
+    )
 
     tasks = [
-        ((b, s), samples.get((b, s), []))
+        ((b, s), grouped.get((b, s), []))
         for b in range(NUM_BUCKETS)
         for s in range(N_STATES)
     ]
 
-    iterable = tasks
-    if verbose > 0 and tqdm is not None:
-        iterable = tqdm(tasks, desc="Fitting GMMs", unit="model")
+    iterable = (
+        tqdm(tasks, desc="Fitting GMMs", unit="model") if verbose and tqdm else tasks
+    )
 
-    def _worker(item: Tuple[Tuple[int, int], List[float]]):
+    def _worker(item):
         (b, s), x_list = item
         x = np.asarray(x_list, dtype=float)
         return (
@@ -113,17 +121,16 @@ def _worker(item: Tuple[Tuple[int, int], List[float]]):
     return gmms
 
 
-_rng = np.random.default_rng()
-
-
 def sample_value(
     gmms: GaussianBucketModels,
     bucket: int,
     state: int,
     rng: np.random.Generator | None = None,
 ) -> float:
-    """Draw a normalised load value from the GMM for (bucket, state)."""
-    weights, means, vars_ = gmms[bucket][state]
+    gmm = gmms[bucket][state]
+    if gmm is None:
+        raise ValueError(f"No GMM trained for bucket {bucket}, state {state}")
+    weights, means, vars_ = gmm
     rng = _rng if rng is None else rng
     comp = rng.choice(len(weights), p=weights)
     return float(rng.normal(means[comp], math.sqrt(vars_[comp])))
diff --git a/src/markov/transition_counts.py b/src/markov/transition_counts.py
index 5e9d756..59a12b1 100644
--- a/src/markov/transition_counts.py
+++ b/src/markov/transition_counts.py
@@ -15,11 +15,6 @@ def build_transition_counts(
     bucket_col: str = "bucket",
     dtype=np.uint32,
 ) -> np.ndarray:
-    """
-    Absolute transition counts:
-        C[b, i, j] = # of times state_t=i  → state_{t+1}=j in bucket b
-    Shape = (2 304, 10, 10).
-    """
     df = df.sort_values("timestamp")
 
     s_t = df[state_col].to_numpy(dtype=int)[:-1]

From d8be38ec630b62cfde03b229d964468d3cea3617 Mon Sep 17 00:00:00 2001
From: Philipp Schmelter <philipp@stressnet.de>
Date: Tue, 5 Aug 2025 00:20:11 +0200
Subject: [PATCH 3/5] removed lp

---
 src/markov/transitions.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/markov/transitions.py b/src/markov/transitions.py
index 08095c0..37e1aae 100644
--- a/src/markov/transitions.py
+++ b/src/markov/transitions.py
@@ -1,15 +1,19 @@
 import numpy as np
 import pandas as pd
 
-from src.config import CONFIG
-
 from ._core import _transition_counts
 
-alpha = float(CONFIG["model"]["laplace_alpha"])
-
 
 def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray:
     counts = _transition_counts(df, dtype=dtype)
-    counts += alpha
-    counts /= counts.sum(axis=2, keepdims=True)
-    return counts.astype(dtype)
+    row_sum = counts.sum(axis=2, keepdims=True)
+    empty = row_sum == 0
+    if np.any(empty):
+        n_states = counts.shape[2]
+        idx_b, idx_i, _ = np.where(empty)
+        counts[idx_b, idx_i, :] = 0
+        counts[idx_b, idx_i, idx_i] = 1
+    row_sum[empty] = 1
+    probs = counts / row_sum
+
+    return probs.astype(dtype)

From 6f4ce3dc8b1617a32dd4c7b8ad6243d04f2645fc Mon Sep 17 00:00:00 2001
From: Philipp Schmelter <philipp@stressnet.de>
Date: Tue, 5 Aug 2025 00:28:00 +0200
Subject: [PATCH 4/5] removed lp

---
 src/markov/transitions.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/markov/transitions.py b/src/markov/transitions.py
index 37e1aae..a8726d4 100644
--- a/src/markov/transitions.py
+++ b/src/markov/transitions.py
@@ -9,7 +9,6 @@ def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarr
     row_sum = counts.sum(axis=2, keepdims=True)
     empty = row_sum == 0
     if np.any(empty):
-        n_states = counts.shape[2]
         idx_b, idx_i, _ = np.where(empty)
         counts[idx_b, idx_i, :] = 0
         counts[idx_b, idx_i, idx_i] = 1

From 3f99611ab9ae123a0142e31830a4856338223db1 Mon Sep 17 00:00:00 2001
From: Philipp Schmelter <philipp@stressnet.de>
Date: Thu, 7 Aug 2025 10:50:53 +0200
Subject: [PATCH 5/5] new version

---
 src/main.py               | 112 +++++++++++++++++---------------------
 src/markov/postprocess.py |  21 +++++++
 src/markov/transitions.py |  38 +++++++++----
 3 files changed, 100 insertions(+), 71 deletions(-)
 create mode 100644 src/markov/postprocess.py

diff --git a/src/main.py b/src/main.py
index 3598572..cb09bee 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,17 +1,17 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from matplotlib.colors import Normalize
 
-from src.markov.buckets import assign_buckets, bucket_id
-from src.markov.gmm import fit_gmms, sample_value
+
+from src.markov.buckets       import assign_buckets, bucket_id
+from src.markov.gmm           import fit_gmms, sample_value
 from src.markov.transition_counts import build_transition_counts
-from src.markov.transitions import build_transition_matrices
+from src.markov.transitions   import build_transition_matrices
+from src.markov.postprocess   import apply_dwell_time, two_point_smooth
 from src.preprocessing.loader import load_timeseries
 
 SIM_DAYS = 10
-PER_DAY = 96
-
+PER_DAY  = 96
 
 def _detect_value_col(df: pd.DataFrame) -> str:
     for c in ["x", "value", "load", "power", "p_norm", "load_norm"]:
@@ -20,60 +20,46 @@ def _detect_value_col(df: pd.DataFrame) -> str:
     raise KeyError("numeric load column missing")
 
 
-def _simulate_series(
+def _simulate_states(
     probs: np.ndarray,
-    gmms,
     start_ts: pd.Timestamp,
     start_state: int,
     periods: int,
     rng: np.random.Generator | None = None,
-) -> pd.DataFrame:
+) -> np.ndarray:
     rng = np.random.default_rng() if rng is None else rng
-    ts = pd.date_range(start_ts, periods=periods, freq="15min")
-    states = np.empty(periods, dtype=int)
-    xs = np.empty(periods, dtype=float)
 
+    states = np.empty(periods, dtype=int)
     s = start_state
-    for i, t in enumerate(ts):
-        b = bucket_id(t)
-        s = rng.choice(probs.shape[1], p=probs[b, s])
+    for i, t in enumerate(pd.date_range(start_ts, periods=periods, freq="15min")):
+        b  = bucket_id(t)
+        s  = rng.choice(probs.shape[1], p=probs[b, s])
         states[i] = s
-        xs[i] = sample_value(gmms, b, s, rng=rng)
-
-    return pd.DataFrame({"timestamp": ts, "state": states, "x_sim": xs})
-
+    return states
 
-def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None:
-    buckets = range(25)
-    fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)
-    vmax = probs[list(buckets)].max()
-    norm = Normalize(vmin=0, vmax=vmax)
 
-    for idx, b in enumerate(buckets):
-        ax = axes.flat[idx]
-        if counts[b].sum() == 0:
-            ax.axis("off")
-            continue
-        im = ax.imshow(probs[b], aspect="auto", origin="lower", norm=norm)
-        ax.set_title(f"Bucket {b}", fontsize=8)
-        ax.set_xticks([])
-        ax.set_yticks([])
-
-    for ax in axes.flat[len(buckets) :]:
-        ax.axis("off")
+def _states_to_series(
+    states: np.ndarray,
+    gmms,
+    start_ts: pd.Timestamp,
+    rng: np.random.Generator | None = None,
+) -> pd.Series:
+    rng = np.random.default_rng() if rng is None else rng
+    ts  = pd.date_range(start_ts, periods=len(states), freq="15min")
+    xs  = np.empty_like(states, dtype=float)
+    for i, (t, s) in enumerate(zip(ts, states)):
+        b   = bucket_id(t)
+        xs[i] = sample_value(gmms, b, s, rng=rng)
+    return pd.Series(xs, index=ts, name="x_sim")
 
-    fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p")
-    fig.suptitle("Transition probabilities – buckets 0–24", fontsize=14)
-    plt.tight_layout(rect=[0, 0, 0.97, 0.96])
-    plt.show()
 
 
-def _plot_simulation_diagnostics(
-    df: pd.DataFrame, sim: pd.DataFrame, value_col: str
-) -> None:
+def _plot_simulation_diagnostics(df: pd.DataFrame,
+                                 sim: pd.Series,
+                                 value_col: str) -> None:
     first_day = sim.iloc[:PER_DAY]
     plt.figure(figsize=(10, 3))
-    plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".")
+    plt.plot(first_day.index, first_day, marker=".")
     plt.title("Simulated power – first day")
     plt.ylabel("normalised load x")
     plt.tight_layout()
@@ -81,7 +67,7 @@ def _plot_simulation_diagnostics(
 
     plt.figure(figsize=(6, 4))
     plt.hist(df[value_col], bins=50, alpha=0.6, density=True, label="original")
-    plt.hist(sim["x_sim"], bins=50, alpha=0.6, density=True, label="simulated")
+    plt.hist(sim,           bins=50, alpha=0.6, density=True, label="simulated")
     plt.title("Original vs simulated load distribution")
     plt.xlabel("normalised load x")
     plt.ylabel("density")
@@ -89,9 +75,9 @@ def _plot_simulation_diagnostics(
     plt.tight_layout()
     plt.show()
 
-    sim["hour"] = sim["timestamp"].dt.hour
+    sim_hr = pd.DataFrame({"x_sim": sim, "hour": sim.index.hour})
     plt.figure(figsize=(10, 4))
-    sim.boxplot(column="x_sim", by="hour", grid=False)
+    sim_hr.boxplot(column="x_sim", by="hour", grid=False)
     plt.title("Simulated power by hour of day")
     plt.xlabel("hour of day")
     plt.ylabel("normalised load x")
@@ -107,27 +93,31 @@ def main() -> None:
     val_col = _detect_value_col(df)
 
     counts = build_transition_counts(df)
-    probs = build_transition_matrices(df)
+    probs  = build_transition_matrices(df)
 
-    _plot_first_25_buckets(counts, probs)
+    gmms   = fit_gmms(df, value_col=val_col, verbose=1,
+                      heartbeat_seconds=60)
 
-    gmms = fit_gmms(
-        df,
-        value_col=val_col,
-        verbose=1,
-        heartbeat_seconds=60,
-    )
-
-    periods = SIM_DAYS * PER_DAY
-    sim = _simulate_series(
+    periods  = SIM_DAYS * PER_DAY
+    start_ts = df["timestamp"].min().normalize()
+    raw_states = _simulate_states(
         probs,
-        gmms,
-        start_ts=df["timestamp"].min().normalize(),
+        start_ts=start_ts,
         start_state=int(df["state"].iloc[0]),
         periods=periods,
     )
 
-    _plot_simulation_diagnostics(df, sim, val_col)
+    states = apply_dwell_time(raw_states, p_extend=0.7)
+
+    sim_series = _states_to_series(states, gmms, start_ts)
+    sim_series = two_point_smooth(sim_series)
+
+    real_kwh = df.set_index("timestamp")[val_col].resample("D").sum() / 4
+    sim_kwh  = sim_series.resample("D").sum() / 4
+    scale = df[val_col].mean() / sim_series.mean()
+    sim_series *= scale
+
+    _plot_simulation_diagnostics(df, sim_series, val_col)
 
 
 if __name__ == "__main__":
diff --git a/src/markov/postprocess.py b/src/markov/postprocess.py
new file mode 100644
index 0000000..81d296e
--- /dev/null
+++ b/src/markov/postprocess.py
@@ -0,0 +1,21 @@
+import numpy as np
+import pandas as pd
+
+def apply_dwell_time(states: np.ndarray, p_extend: float = 0.7) -> np.ndarray:
+    if states.size < 2:
+        return states
+
+    out = states.copy()
+    hold_next = False
+    for t in range(1, len(states)):
+        if hold_next:
+            out[t] = out[t-1]
+            hold_next = False
+        elif states[t] != states[t-1] and np.random.rand() < p_extend:
+            out[t] = out[t-1]
+            hold_next = True
+    return out
+
+
+def two_point_smooth(series: pd.Series) -> pd.Series:
+    return 0.5 * (series.shift(1, fill_value=series.iloc[0]) + series)
diff --git a/src/markov/transitions.py b/src/markov/transitions.py
index a8726d4..86cc775 100644
--- a/src/markov/transitions.py
+++ b/src/markov/transitions.py
@@ -1,18 +1,36 @@
 import numpy as np
 import pandas as pd
-
 from ._core import _transition_counts
 
 
-def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray:
-    counts = _transition_counts(df, dtype=dtype)
+
+DEFAULT_MIN_COUNT   = 25
+DEFAULT_UNIFORM_EPS = 0.25
+
+
+def build_transition_matrices(
+    df: pd.DataFrame,
+    *,
+    dtype: np.dtype = np.float32,
+    min_count: int = DEFAULT_MIN_COUNT,
+    uniform_eps: float = DEFAULT_UNIFORM_EPS,
+) -> np.ndarray:
+
+
+    counts: np.ndarray = _transition_counts(df, dtype=np.float64)
+    _, n_states, _ = counts.shape
+
+    row_sum = counts.sum(axis=2, keepdims=True)
+    mask_empty  = (row_sum == 0)
+    mask_sparse = (row_sum < min_count) & ~mask_empty
+
+    if mask_empty.any():
+        counts[mask_empty.repeat(n_states, axis=2)] = 1.0
+
+    if mask_sparse.any():
+        counts[mask_sparse.repeat(n_states, axis=2)] += uniform_eps / n_states
+
     row_sum = counts.sum(axis=2, keepdims=True)
-    empty = row_sum == 0
-    if np.any(empty):
-        idx_b, idx_i, _ = np.where(empty)
-        counts[idx_b, idx_i, :] = 0
-        counts[idx_b, idx_i, idx_i] = 1
-    row_sum[empty] = 1
-    probs = counts / row_sum
+    probs   = counts / row_sum
 
     return probs.astype(dtype)