From f47870fdf88ac4aeb11c89ce9dd41d2086690558 Mon Sep 17 00:00:00 2001 From: Philipp Schmelter Date: Sat, 28 Jun 2025 02:07:29 +0200 Subject: [PATCH 1/5] current gmm state --- CHANGELOG.md | 1 + src/main.py | 134 ++++++++++++++++++++++++++++++++++++++++------ src/markov/gmm.py | 129 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 16 deletions(-) create mode 100644 src/markov/gmm.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 04964c4..5eca4ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - `CODEOWNERS` file and dependabot automation [#7](https://github.com/ie3-institute/simonaMarkovLoad/issues/7) - Added full Markov-model pipeline [#10](https://github.com/ie3-institute/simonaMarkovLoad/issues/10) +- Added GMM feature to project [#12](https://github.com/ie3-institute/simonaMarkovLoad/issues/12) ### Changed - Compute instantaneous kW from cumulative kWh via 15-minute differencing [#1](https://github.com/ie3-institute/simonaMarkovLoad/issues/1) diff --git a/src/main.py b/src/main.py index 099795a..7173343 100644 --- a/src/main.py +++ b/src/main.py @@ -1,31 +1,133 @@ import matplotlib.pyplot as plt import numpy as np +import pandas as pd +from matplotlib.colors import Normalize +from src.markov.buckets import assign_buckets, bucket_id +from src.markov.gmm import fit_gmms, sample_value from src.markov.transition_counts import build_transition_counts from src.markov.transitions import build_transition_matrices from src.preprocessing.loader import load_timeseries -df = load_timeseries(normalize=True, discretize=True) +SIM_DAYS = 3 -counts = build_transition_counts(df) -probs = build_transition_matrices(df, alpha=1.0) +def _detect_value_col(df: pd.DataFrame) -> str: + candidate_cols = ["x", "value", "load", "power", "p_norm", "load_norm"] + for c in candidate_cols: + if c in df.columns and np.issubdtype(df[c].dtype, np.number): + return c + raise KeyError("No numeric load column found – please inspect the dataframe.") -print("counts shape :", counts.shape) -print("probs shape :", probs.shape) +def _simulate_series( + probs: np.ndarray, + gmms, + start_ts: pd.Timestamp, + start_state: int, + periods: int, + rng: np.random.Generator | None = None, +) -> pd.DataFrame: + """Generate synthetic 15‑min series (timestamp, state, x).""" + rng = np.random.default_rng() if rng is None else rng + timestamps = pd.date_range(start_ts, periods=periods, freq="15min") + states = np.empty(periods, dtype=int) + xs = np.empty(periods, dtype=float) -active_buckets = np.where(counts.sum(axis=(1, 2)) > 0)[0] -bucket = int(active_buckets[0]) if active_buckets.size else 0 -print(f"\nUsing bucket {bucket}") + s = start_state + for i, ts in enumerate(timestamps): + b = bucket_id(ts) + s = rng.choice(probs.shape[1], p=probs[b, s]) + states[i] = s + xs[i] = sample_value(gmms, b, s, rng=rng) + return pd.DataFrame({"timestamp": timestamps, "state": states, "x_sim": xs}) -print("row sums :", probs[bucket].sum(axis=1)) +def main() -> None: + df = load_timeseries(normalize=True, discretize=True) + if "bucket" not in df.columns: + df = assign_buckets(df) -plt.imshow(probs[bucket], aspect="auto") -plt.title(f"Bucket {bucket} – transition probabilities") -plt.xlabel("state t+1") -plt.ylabel("state t") -plt.colorbar() -plt.tight_layout() -plt.show() + value_col = _detect_value_col(df) + print("Using load column:", value_col) + + counts = build_transition_counts(df) + probs = build_transition_matrices(df) + + _plot_first_25_buckets(counts, probs) + + print("Fitting GMMs … (this may take a moment)") + gmms = fit_gmms(df, value_col=value_col) + + periods = SIM_DAYS * 96 + sim_df = _simulate_series( + probs, + gmms, + start_ts=df["timestamp"].min().normalize(), + start_state=int(df["state"].iloc[0]), + periods=periods, + ) + + _plot_simulation_diagnostics(df, sim_df, value_col) + + +def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None: + """Heat‑map grid for buckets 0‑24.""" + buckets = list(range(25)) + fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True) + vmax = probs[buckets].max() + norm = Normalize(vmin=0, vmax=vmax) + + for idx, b in enumerate(buckets): + ax = axes.flat[idx] + if counts[b].sum() == 0: + ax.axis("off") + continue + im = ax.imshow(probs[b], aspect="auto", origin="lower", norm=norm) + ax.set_title(f"Bucket {b}", fontsize=8) + ax.set_xticks([]) + ax.set_yticks([]) + + for ax in axes.flat[len(buckets) :]: + ax.axis("off") + + fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p") + fig.suptitle("Transition probabilities – buckets 0‑24", fontsize=14) + fig.tight_layout(rect=[0, 0, 0.97, 0.96]) + plt.show() + + +def _plot_simulation_diagnostics( + df: pd.DataFrame, sim: pd.DataFrame, value_col: str +) -> None: + first_day = sim.iloc[:96] + plt.figure(figsize=(10, 3)) + plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".") + plt.title("Simulated power – first day") + plt.ylabel("normalised load x") + plt.tight_layout() + plt.show() + + plt.figure(figsize=(6, 4)) + plt.hist(df[value_col], bins=50, alpha=0.6, density=True, label="original") + plt.hist(sim["x_sim"], bins=50, alpha=0.6, density=True, label="simulated") + plt.title("Original vs simulated load distribution") + plt.xlabel("normalised load x") + plt.ylabel("density") + plt.legend() + plt.tight_layout() + plt.show() + + sim["hour"] = sim["timestamp"].dt.hour + plt.figure(figsize=(10, 4)) + sim.boxplot(column="x_sim", by="hour", grid=False) + plt.suptitle("") + plt.title("Simulated power by hour of day") + plt.xlabel("hour of day") + plt.ylabel("normalised load x") + plt.tight_layout() + plt.show() + + +if __name__ == "__main__": + main() diff --git a/src/markov/gmm.py b/src/markov/gmm.py new file mode 100644 index 0000000..daf8663 --- /dev/null +++ b/src/markov/gmm.py @@ -0,0 +1,129 @@ +import math +from typing import Dict, List, Tuple + +import joblib +import numpy as np +import pandas as pd +from sklearn.mixture import GaussianMixture + +try: + # rich progress bar for terminals / notebooks; optional dependency + from tqdm.auto import tqdm # type: ignore +except ImportError: + tqdm = None # type: ignore + +from .buckets import NUM_BUCKETS +from .transition_counts import N_STATES + +__all__ = [ + "GaussianBucketModels", + "fit_gmms", + "sample_value", +] + + +GmmTuple = Tuple[np.ndarray, np.ndarray, np.ndarray] +GaussianBucketModels = List[List[GmmTuple]] + + +def _fit_single( + x: np.ndarray, + *, + min_samples: int = 30, + k_candidates: Tuple[int, ...] = (1, 2, 3), + random_state: int | None = None, +) -> GmmTuple: + """Fit 1‑3 component spherical GMM; fallback to Normal if too few samples.""" + if len(x) < min_samples: + mean = float(np.mean(x)) + var = float(np.var(x) + 1e-6) + return (np.array([1.0]), np.array([mean]), np.array([var])) + + best_bic = math.inf + best_gmm: GaussianMixture | None = None + for k in k_candidates: + gmm = GaussianMixture( + n_components=k, + covariance_type="spherical", + n_init="auto", + random_state=random_state, + ).fit(x.reshape(-1, 1)) + bic = gmm.bic(x.reshape(-1, 1)) + if bic < best_bic: + best_bic = bic + best_gmm = gmm + + weights = best_gmm.weights_ + means = best_gmm.means_.ravel() + variances = best_gmm.covariances_.ravel() + return (weights, means, variances) + + +def fit_gmms( + df: pd.DataFrame, + *, + value_col: str = "x", + bucket_col: str = "bucket", + state_col: str = "state", + min_samples: int = 30, + k_candidates: Tuple[int, ...] = (1, 2, 3), + n_jobs: int = -1, + random_state: int | None = None, + verbose: int = 0, +) -> GaussianBucketModels: + """Return list [bucket][state] -> (weights, means, variances).""" + + samples: Dict[Tuple[int, int], List[float]] = {} + for _, row in df[[bucket_col, state_col, value_col]].iterrows(): + samples.setdefault((row[bucket_col], row[state_col]), []).append(row[value_col]) + + tasks = [ + ((b, s), samples.get((b, s), [])) + for b in range(NUM_BUCKETS) + for s in range(N_STATES) + ] + + iterable = tasks + if verbose > 0 and tqdm is not None: + iterable = tqdm(tasks, desc="Fitting GMMs", unit="model") + + def _worker(item: Tuple[Tuple[int, int], List[float]]): + (b, s), x_list = item + x = np.asarray(x_list, dtype=float) + return ( + b, + s, + _fit_single( + x, + min_samples=min_samples, + k_candidates=k_candidates, + random_state=random_state, + ), + ) + + results = joblib.Parallel(n_jobs=n_jobs, verbose=verbose)( + joblib.delayed(_worker)(task) for task in iterable + ) + + gmms: GaussianBucketModels = [ + [None for _ in range(N_STATES)] for _ in range(NUM_BUCKETS) + ] + for b, s, gmm_tuple in results: + gmms[b][s] = gmm_tuple + return gmms + + +_rng = np.random.default_rng() + + +def sample_value( + gmms: GaussianBucketModels, + bucket: int, + state: int, + rng: np.random.Generator | None = None, +) -> float: + """Draw a normalised load value from the GMM for (bucket, state).""" + weights, means, vars_ = gmms[bucket][state] + rng = _rng if rng is None else rng + comp = rng.choice(len(weights), p=weights) + return float(rng.normal(means[comp], math.sqrt(vars_[comp]))) From 444899f4e4d7fdc305581d4a49cd7bf3bbb25ddd Mon Sep 17 00:00:00 2001 From: Philipp Schmelter Date: Tue, 8 Jul 2025 18:40:15 +0200 Subject: [PATCH 2/5] update --- src/main.py | 87 +++++++++++++++++---------------- src/markov/gmm.py | 81 ++++++++++++++++-------------- src/markov/transition_counts.py | 5 -- 3 files changed, 88 insertions(+), 85 deletions(-) diff --git a/src/main.py b/src/main.py index 7173343..3598572 100644 --- a/src/main.py +++ b/src/main.py @@ -9,15 +9,15 @@ from src.markov.transitions import build_transition_matrices from src.preprocessing.loader import load_timeseries -SIM_DAYS = 3 +SIM_DAYS = 10 +PER_DAY = 96 def _detect_value_col(df: pd.DataFrame) -> str: - candidate_cols = ["x", "value", "load", "power", "p_norm", "load_norm"] - for c in candidate_cols: + for c in ["x", "value", "load", "power", "p_norm", "load_norm"]: if c in df.columns and np.issubdtype(df[c].dtype, np.number): return c - raise KeyError("No numeric load column found – please inspect the dataframe.") + raise KeyError("numeric load column missing") def _simulate_series( @@ -28,54 +28,25 @@ def _simulate_series( periods: int, rng: np.random.Generator | None = None, ) -> pd.DataFrame: - """Generate synthetic 15‑min series (timestamp, state, x).""" rng = np.random.default_rng() if rng is None else rng - timestamps = pd.date_range(start_ts, periods=periods, freq="15min") + ts = pd.date_range(start_ts, periods=periods, freq="15min") states = np.empty(periods, dtype=int) xs = np.empty(periods, dtype=float) s = start_state - for i, ts in enumerate(timestamps): - b = bucket_id(ts) + for i, t in enumerate(ts): + b = bucket_id(t) s = rng.choice(probs.shape[1], p=probs[b, s]) states[i] = s xs[i] = sample_value(gmms, b, s, rng=rng) - return pd.DataFrame({"timestamp": timestamps, "state": states, "x_sim": xs}) - -def main() -> None: - df = load_timeseries(normalize=True, discretize=True) - if "bucket" not in df.columns: - df = assign_buckets(df) - - value_col = _detect_value_col(df) - print("Using load column:", value_col) - - counts = build_transition_counts(df) - probs = build_transition_matrices(df) - - _plot_first_25_buckets(counts, probs) - - print("Fitting GMMs … (this may take a moment)") - gmms = fit_gmms(df, value_col=value_col) - - periods = SIM_DAYS * 96 - sim_df = _simulate_series( - probs, - gmms, - start_ts=df["timestamp"].min().normalize(), - start_state=int(df["state"].iloc[0]), - periods=periods, - ) - - _plot_simulation_diagnostics(df, sim_df, value_col) + return pd.DataFrame({"timestamp": ts, "state": states, "x_sim": xs}) def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None: - """Heat‑map grid for buckets 0‑24.""" - buckets = list(range(25)) + buckets = range(25) fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True) - vmax = probs[buckets].max() + vmax = probs[list(buckets)].max() norm = Normalize(vmin=0, vmax=vmax) for idx, b in enumerate(buckets): @@ -92,15 +63,15 @@ def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None: ax.axis("off") fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p") - fig.suptitle("Transition probabilities – buckets 0‑24", fontsize=14) - fig.tight_layout(rect=[0, 0, 0.97, 0.96]) + fig.suptitle("Transition probabilities – buckets 0–24", fontsize=14) + plt.tight_layout(rect=[0, 0, 0.97, 0.96]) plt.show() def _plot_simulation_diagnostics( df: pd.DataFrame, sim: pd.DataFrame, value_col: str ) -> None: - first_day = sim.iloc[:96] + first_day = sim.iloc[:PER_DAY] plt.figure(figsize=(10, 3)) plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".") plt.title("Simulated power – first day") @@ -121,7 +92,6 @@ def _plot_simulation_diagnostics( sim["hour"] = sim["timestamp"].dt.hour plt.figure(figsize=(10, 4)) sim.boxplot(column="x_sim", by="hour", grid=False) - plt.suptitle("") plt.title("Simulated power by hour of day") plt.xlabel("hour of day") plt.ylabel("normalised load x") @@ -129,5 +99,36 @@ def _plot_simulation_diagnostics( plt.show() +def main() -> None: + df = load_timeseries(normalize=True, discretize=True) + if "bucket" not in df.columns: + df = assign_buckets(df) + + val_col = _detect_value_col(df) + + counts = build_transition_counts(df) + probs = build_transition_matrices(df) + + _plot_first_25_buckets(counts, probs) + + gmms = fit_gmms( + df, + value_col=val_col, + verbose=1, + heartbeat_seconds=60, + ) + + periods = SIM_DAYS * PER_DAY + sim = _simulate_series( + probs, + gmms, + start_ts=df["timestamp"].min().normalize(), + start_state=int(df["state"].iloc[0]), + periods=periods, + ) + + _plot_simulation_diagnostics(df, sim, val_col) + + if __name__ == "__main__": main() diff --git a/src/markov/gmm.py b/src/markov/gmm.py index daf8663..061197d 100644 --- a/src/markov/gmm.py +++ b/src/markov/gmm.py @@ -1,5 +1,5 @@ import math -from typing import Dict, List, Tuple +from typing import List, Optional, Tuple import joblib import numpy as np @@ -7,23 +7,19 @@ from sklearn.mixture import GaussianMixture try: - # rich progress bar for terminals / notebooks; optional dependency - from tqdm.auto import tqdm # type: ignore + from tqdm.auto import tqdm except ImportError: - tqdm = None # type: ignore + tqdm = None from .buckets import NUM_BUCKETS from .transition_counts import N_STATES -__all__ = [ - "GaussianBucketModels", - "fit_gmms", - "sample_value", -] - +__all__ = ["GaussianBucketModels", "fit_gmms", "sample_value"] GmmTuple = Tuple[np.ndarray, np.ndarray, np.ndarray] -GaussianBucketModels = List[List[GmmTuple]] +GaussianBucketModels = List[List[Optional[GmmTuple]]] + +_rng = np.random.default_rng() def _fit_single( @@ -32,31 +28,35 @@ def _fit_single( min_samples: int = 30, k_candidates: Tuple[int, ...] = (1, 2, 3), random_state: int | None = None, -) -> GmmTuple: - """Fit 1‑3 component spherical GMM; fallback to Normal if too few samples.""" +) -> Optional[GmmTuple]: + if x.size == 0: + return None if len(x) < min_samples: mean = float(np.mean(x)) - var = float(np.var(x) + 1e-6) - return (np.array([1.0]), np.array([mean]), np.array([var])) - + var = float(max(np.var(x), 1e-5)) + return ( + np.array([1.0], dtype=float), + np.array([mean], dtype=float), + np.array([var], dtype=float), + ) best_bic = math.inf best_gmm: GaussianMixture | None = None for k in k_candidates: gmm = GaussianMixture( n_components=k, covariance_type="spherical", - n_init="auto", + n_init=1, random_state=random_state, ).fit(x.reshape(-1, 1)) bic = gmm.bic(x.reshape(-1, 1)) if bic < best_bic: best_bic = bic best_gmm = gmm - - weights = best_gmm.weights_ - means = best_gmm.means_.ravel() - variances = best_gmm.covariances_.ravel() - return (weights, means, variances) + return ( + best_gmm.weights_, + best_gmm.means_.ravel(), + best_gmm.covariances_.ravel(), + ) def fit_gmms( @@ -65,29 +65,37 @@ def fit_gmms( value_col: str = "x", bucket_col: str = "bucket", state_col: str = "state", - min_samples: int = 30, + min_samples: int = 5, k_candidates: Tuple[int, ...] = (1, 2, 3), n_jobs: int = -1, random_state: int | None = None, verbose: int = 0, + heartbeat_seconds: int | None = None, ) -> GaussianBucketModels: - """Return list [bucket][state] -> (weights, means, variances).""" + if heartbeat_seconds: + import faulthandler + + faulthandler.enable() + faulthandler.dump_traceback_later(heartbeat_seconds, repeat=True) - samples: Dict[Tuple[int, int], List[float]] = {} - for _, row in df[[bucket_col, state_col, value_col]].iterrows(): - samples.setdefault((row[bucket_col], row[state_col]), []).append(row[value_col]) + grouped = ( + df[[bucket_col, state_col, value_col]] + .groupby([bucket_col, state_col])[value_col] + .apply(list) + .to_dict() + ) tasks = [ - ((b, s), samples.get((b, s), [])) + ((b, s), grouped.get((b, s), [])) for b in range(NUM_BUCKETS) for s in range(N_STATES) ] - iterable = tasks - if verbose > 0 and tqdm is not None: - iterable = tqdm(tasks, desc="Fitting GMMs", unit="model") + iterable = ( + tqdm(tasks, desc="Fitting GMMs", unit="model") if verbose and tqdm else tasks + ) - def _worker(item: Tuple[Tuple[int, int], List[float]]): + def _worker(item): (b, s), x_list = item x = np.asarray(x_list, dtype=float) return ( @@ -113,17 +121,16 @@ def _worker(item: Tuple[Tuple[int, int], List[float]]): return gmms -_rng = np.random.default_rng() - - def sample_value( gmms: GaussianBucketModels, bucket: int, state: int, rng: np.random.Generator | None = None, ) -> float: - """Draw a normalised load value from the GMM for (bucket, state).""" - weights, means, vars_ = gmms[bucket][state] + gmm = gmms[bucket][state] + if gmm is None: + raise ValueError(f"No GMM trained for bucket {bucket}, state {state}") + weights, means, vars_ = gmm rng = _rng if rng is None else rng comp = rng.choice(len(weights), p=weights) return float(rng.normal(means[comp], math.sqrt(vars_[comp]))) diff --git a/src/markov/transition_counts.py b/src/markov/transition_counts.py index 5e9d756..59a12b1 100644 --- a/src/markov/transition_counts.py +++ b/src/markov/transition_counts.py @@ -15,11 +15,6 @@ def build_transition_counts( bucket_col: str = "bucket", dtype=np.uint32, ) -> np.ndarray: - """ - Absolute transition counts: - C[b, i, j] = # of times state_t=i → state_{t+1}=j in bucket b - Shape = (2 304, 10, 10). - """ df = df.sort_values("timestamp") s_t = df[state_col].to_numpy(dtype=int)[:-1] From d8be38ec630b62cfde03b229d964468d3cea3617 Mon Sep 17 00:00:00 2001 From: Philipp Schmelter Date: Tue, 5 Aug 2025 00:20:11 +0200 Subject: [PATCH 3/5] removed lp --- src/markov/transitions.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/markov/transitions.py b/src/markov/transitions.py index 08095c0..37e1aae 100644 --- a/src/markov/transitions.py +++ b/src/markov/transitions.py @@ -1,15 +1,19 @@ import numpy as np import pandas as pd -from src.config import CONFIG - from ._core import _transition_counts -alpha = float(CONFIG["model"]["laplace_alpha"]) - def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray: counts = _transition_counts(df, dtype=dtype) - counts += alpha - counts /= counts.sum(axis=2, keepdims=True) - return counts.astype(dtype) + row_sum = counts.sum(axis=2, keepdims=True) + empty = row_sum == 0 + if np.any(empty): + n_states = counts.shape[2] + idx_b, idx_i, _ = np.where(empty) + counts[idx_b, idx_i, :] = 0 + counts[idx_b, idx_i, idx_i] = 1 + row_sum[empty] = 1 + probs = counts / row_sum + + return probs.astype(dtype) From 6f4ce3dc8b1617a32dd4c7b8ad6243d04f2645fc Mon Sep 17 00:00:00 2001 From: Philipp Schmelter Date: Tue, 5 Aug 2025 00:28:00 +0200 Subject: [PATCH 4/5] removed lp --- src/markov/transitions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/markov/transitions.py b/src/markov/transitions.py index 37e1aae..a8726d4 100644 --- a/src/markov/transitions.py +++ b/src/markov/transitions.py @@ -9,7 +9,6 @@ def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarr row_sum = counts.sum(axis=2, keepdims=True) empty = row_sum == 0 if np.any(empty): - n_states = counts.shape[2] idx_b, idx_i, _ = np.where(empty) counts[idx_b, idx_i, :] = 0 counts[idx_b, idx_i, idx_i] = 1 From 3f99611ab9ae123a0142e31830a4856338223db1 Mon Sep 17 00:00:00 2001 From: Philipp Schmelter Date: Thu, 7 Aug 2025 10:50:53 +0200 Subject: [PATCH 5/5] new version --- src/main.py | 112 +++++++++++++++++--------------------- src/markov/postprocess.py | 21 +++++++ src/markov/transitions.py | 38 +++++++++---- 3 files changed, 100 insertions(+), 71 deletions(-) create mode 100644 src/markov/postprocess.py diff --git a/src/main.py b/src/main.py index 3598572..cb09bee 100644 --- a/src/main.py +++ b/src/main.py @@ -1,17 +1,17 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from matplotlib.colors import Normalize -from src.markov.buckets import assign_buckets, bucket_id -from src.markov.gmm import fit_gmms, sample_value + +from src.markov.buckets import assign_buckets, bucket_id +from src.markov.gmm import fit_gmms, sample_value from src.markov.transition_counts import build_transition_counts -from src.markov.transitions import build_transition_matrices +from src.markov.transitions import build_transition_matrices +from src.markov.postprocess import apply_dwell_time, two_point_smooth from src.preprocessing.loader import load_timeseries SIM_DAYS = 10 -PER_DAY = 96 - +PER_DAY = 96 def _detect_value_col(df: pd.DataFrame) -> str: for c in ["x", "value", "load", "power", "p_norm", "load_norm"]: @@ -20,60 +20,46 @@ def _detect_value_col(df: pd.DataFrame) -> str: raise KeyError("numeric load column missing") -def _simulate_series( +def _simulate_states( probs: np.ndarray, - gmms, start_ts: pd.Timestamp, start_state: int, periods: int, rng: np.random.Generator | None = None, -) -> pd.DataFrame: +) -> np.ndarray: rng = np.random.default_rng() if rng is None else rng - ts = pd.date_range(start_ts, periods=periods, freq="15min") - states = np.empty(periods, dtype=int) - xs = np.empty(periods, dtype=float) + states = np.empty(periods, dtype=int) s = start_state - for i, t in enumerate(ts): - b = bucket_id(t) - s = rng.choice(probs.shape[1], p=probs[b, s]) + for i, t in enumerate(pd.date_range(start_ts, periods=periods, freq="15min")): + b = bucket_id(t) + s = rng.choice(probs.shape[1], p=probs[b, s]) states[i] = s - xs[i] = sample_value(gmms, b, s, rng=rng) - - return pd.DataFrame({"timestamp": ts, "state": states, "x_sim": xs}) - + return states -def _plot_first_25_buckets(counts: np.ndarray, probs: np.ndarray) -> None: - buckets = range(25) - fig, axes = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True) - vmax = probs[list(buckets)].max() - norm = Normalize(vmin=0, vmax=vmax) - for idx, b in enumerate(buckets): - ax = axes.flat[idx] - if counts[b].sum() == 0: - ax.axis("off") - continue - im = ax.imshow(probs[b], aspect="auto", origin="lower", norm=norm) - ax.set_title(f"Bucket {b}", fontsize=8) - ax.set_xticks([]) - ax.set_yticks([]) - - for ax in axes.flat[len(buckets) :]: - ax.axis("off") +def _states_to_series( + states: np.ndarray, + gmms, + start_ts: pd.Timestamp, + rng: np.random.Generator | None = None, +) -> pd.Series: + rng = np.random.default_rng() if rng is None else rng + ts = pd.date_range(start_ts, periods=len(states), freq="15min") + xs = np.empty_like(states, dtype=float) + for i, (t, s) in enumerate(zip(ts, states)): + b = bucket_id(t) + xs[i] = sample_value(gmms, b, s, rng=rng) + return pd.Series(xs, index=ts, name="x_sim") - fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, label="p") - fig.suptitle("Transition probabilities – buckets 0–24", fontsize=14) - plt.tight_layout(rect=[0, 0, 0.97, 0.96]) - plt.show() -def _plot_simulation_diagnostics( - df: pd.DataFrame, sim: pd.DataFrame, value_col: str -) -> None: +def _plot_simulation_diagnostics(df: pd.DataFrame, + sim: pd.Series, + value_col: str) -> None: first_day = sim.iloc[:PER_DAY] plt.figure(figsize=(10, 3)) - plt.plot(first_day["timestamp"], first_day["x_sim"], marker=".") + plt.plot(first_day.index, first_day, marker=".") plt.title("Simulated power – first day") plt.ylabel("normalised load x") plt.tight_layout() @@ -81,7 +67,7 @@ def _plot_simulation_diagnostics( plt.figure(figsize=(6, 4)) plt.hist(df[value_col], bins=50, alpha=0.6, density=True, label="original") - plt.hist(sim["x_sim"], bins=50, alpha=0.6, density=True, label="simulated") + plt.hist(sim, bins=50, alpha=0.6, density=True, label="simulated") plt.title("Original vs simulated load distribution") plt.xlabel("normalised load x") plt.ylabel("density") @@ -89,9 +75,9 @@ def _plot_simulation_diagnostics( plt.tight_layout() plt.show() - sim["hour"] = sim["timestamp"].dt.hour + sim_hr = pd.DataFrame({"x_sim": sim, "hour": sim.index.hour}) plt.figure(figsize=(10, 4)) - sim.boxplot(column="x_sim", by="hour", grid=False) + sim_hr.boxplot(column="x_sim", by="hour", grid=False) plt.title("Simulated power by hour of day") plt.xlabel("hour of day") plt.ylabel("normalised load x") @@ -107,27 +93,31 @@ def main() -> None: val_col = _detect_value_col(df) counts = build_transition_counts(df) - probs = build_transition_matrices(df) + probs = build_transition_matrices(df) - _plot_first_25_buckets(counts, probs) + gmms = fit_gmms(df, value_col=val_col, verbose=1, + heartbeat_seconds=60) - gmms = fit_gmms( - df, - value_col=val_col, - verbose=1, - heartbeat_seconds=60, - ) - - periods = SIM_DAYS * PER_DAY - sim = _simulate_series( + periods = SIM_DAYS * PER_DAY + start_ts = df["timestamp"].min().normalize() + raw_states = _simulate_states( probs, - gmms, - start_ts=df["timestamp"].min().normalize(), + start_ts=start_ts, start_state=int(df["state"].iloc[0]), periods=periods, ) - _plot_simulation_diagnostics(df, sim, val_col) + states = apply_dwell_time(raw_states, p_extend=0.7) + + sim_series = _states_to_series(states, gmms, start_ts) + sim_series = two_point_smooth(sim_series) + + real_kwh = df.set_index("timestamp")[val_col].resample("D").sum() / 4 + sim_kwh = sim_series.resample("D").sum() / 4 + scale = df[val_col].mean() / sim_series.mean() + sim_series *= scale + + _plot_simulation_diagnostics(df, sim_series, val_col) if __name__ == "__main__": diff --git a/src/markov/postprocess.py b/src/markov/postprocess.py new file mode 100644 index 0000000..81d296e --- /dev/null +++ b/src/markov/postprocess.py @@ -0,0 +1,21 @@ +import numpy as np +import pandas as pd + +def apply_dwell_time(states: np.ndarray, p_extend: float = 0.7) -> np.ndarray: + if states.size < 2: + return states + + out = states.copy() + hold_next = False + for t in range(1, len(states)): + if hold_next: + out[t] = out[t-1] + hold_next = False + elif states[t] != states[t-1] and np.random.rand() < p_extend: + out[t] = out[t-1] + hold_next = True + return out + + +def two_point_smooth(series: pd.Series) -> pd.Series: + return 0.5 * (series.shift(1, fill_value=series.iloc[0]) + series) diff --git a/src/markov/transitions.py b/src/markov/transitions.py index a8726d4..86cc775 100644 --- a/src/markov/transitions.py +++ b/src/markov/transitions.py @@ -1,18 +1,36 @@ import numpy as np import pandas as pd - from ._core import _transition_counts -def build_transition_matrices(df: pd.DataFrame, *, dtype=np.float32) -> np.ndarray: - counts = _transition_counts(df, dtype=dtype) + +DEFAULT_MIN_COUNT = 25 +DEFAULT_UNIFORM_EPS = 0.25 + + +def build_transition_matrices( + df: pd.DataFrame, + *, + dtype: np.dtype = np.float32, + min_count: int = DEFAULT_MIN_COUNT, + uniform_eps: float = DEFAULT_UNIFORM_EPS, +) -> np.ndarray: + + + counts: np.ndarray = _transition_counts(df, dtype=np.float64) + _, n_states, _ = counts.shape + + row_sum = counts.sum(axis=2, keepdims=True) + mask_empty = (row_sum == 0) + mask_sparse = (row_sum < min_count) & ~mask_empty + + if mask_empty.any(): + counts[mask_empty.repeat(n_states, axis=2)] = 1.0 + + if mask_sparse.any(): + counts[mask_sparse.repeat(n_states, axis=2)] += uniform_eps / n_states + row_sum = counts.sum(axis=2, keepdims=True) - empty = row_sum == 0 - if np.any(empty): - idx_b, idx_i, _ = np.where(empty) - counts[idx_b, idx_i, :] = 0 - counts[idx_b, idx_i, idx_i] = 1 - row_sum[empty] = 1 - probs = counts / row_sum + probs = counts / row_sum return probs.astype(dtype)