diff --git a/README.md b/README.md index 74741b2..d7fd36a 100644 --- a/README.md +++ b/README.md @@ -119,6 +119,33 @@ Submit your results to the leaderboard by creating a pull request that adds your The final `all_results.csv` file should contain `98` lines (one for each dataset configuration) and `15` columns: `4` for dataset, model, domain and num_variates and `11` for the evaluation metrics. +## Time Series Features Analysis + +Add `NUM_CPUS` to your `.env` file to run the analysis in parallel. + +``` +echo "NUM_CPUS={N}" >> .env +``` + +To replicate the time series feature analysis in the paper, run the following command: + +``` +python -m cli.analysis datasets=all_datasets +``` +This will run the analysis for all the datasets in the benchmark and generate two folders under `outputs/analysis/test`: +1. `datasets`: This folder contains the individual features for each dataset along with some plots visualizing those features. +2. `all_datasets`: This folder contains the aggregated features for all the datasets along with some plots visualizing those features. + +Note: Expect the analysis to take a long time; we recommend running it on a large CPU cluster and setting the `NUM_CPUS` environment variable to the number of cores you have access to. + +If you just want to try the analysis out, you can run it with a few datasets by creating a new config file in the `cli/conf/analysis/datasets` folder. Follow the provided [`sample`](cli/conf/analysis/datasets/sample.yaml) file. + +``` +python -m cli.analysis datasets=sample +``` + + + ## Citation If you find this benchmark useful, please consider citing: diff --git a/cli/__init__.py b/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/cli/analysis.py b/cli/analysis.py new file mode 100644 index 0000000..e4472a2 --- /dev/null +++ b/cli/analysis.py @@ -0,0 +1,32 @@ + +import hydra +from hydra.utils import instantiate +from omegaconf import DictConfig +from hydra.core.hydra_config import HydraConfig +from gift_eval.analysis.utils import plot_histogram + + +@hydra.main(version_base="1.3", config_path="conf/analysis", config_name="default") +def main(cfg: DictConfig): + output_dir = HydraConfig.get().runtime.output_dir + analyzer = instantiate(cfg.analyzer, _convert_="all") + analyzer.print_datasets() + + print(analyzer.freq_distribution_by_dataset) + print(analyzer.freq_distribution_by_ts) + print(analyzer.freq_distribution_by_ts_length) + print(analyzer.freq_distribution_by_window) + + # plot a histogram of all four frequency distributions and save them to output_dir + plot_histogram(analyzer.freq_distribution_by_dataset, + "dataset", output_dir) + plot_histogram(analyzer.freq_distribution_by_ts, "time series", output_dir) + plot_histogram(analyzer.freq_distribution_by_ts_length, + "ts length", output_dir) + plot_histogram(analyzer.freq_distribution_by_window, "window", output_dir) + + analyzer.features_by_window(output_dir) + + +if __name__ == "__main__": + main() diff --git a/cli/conf/analysis/datasets/all_datasets.yaml b/cli/conf/analysis/datasets/all_datasets.yaml new file mode 100644 index 0000000..0ee008c --- /dev/null +++ b/cli/conf/analysis/datasets/all_datasets.yaml @@ -0,0 +1,222 @@ +name : all_datasets +datasets: + - _target_: gift_eval.data.Dataset + name: m4_yearly + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: m4_quarterly + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: m4_monthly + term: short + to_univariate: false + - _target_: 
gift_eval.data.Dataset + name: m4_weekly + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: m4_daily + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: m4_hourly + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: electricity/15T + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: electricity/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: electricity/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: electricity/W + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: solar/10T + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: solar/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: solar/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: solar/W + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: hospital + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: covid_deaths + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: us_births/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: us_births/M + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: us_births/W + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: saugeenday/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: saugeenday/M + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: saugeenday/W + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: temperature_rain_with_missing + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: kdd_cup_2018_with_missing/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: kdd_cup_2018_with_missing/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: car_parts_with_missing + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: restaurant + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: hierarchical_sales/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: hierarchical_sales/W + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: LOOP_SEATTLE/5T + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: LOOP_SEATTLE/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: LOOP_SEATTLE/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: SZ_TAXI/15T + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: SZ_TAXI/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: M_DENSE/H + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: M_DENSE/D + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: ett1/15T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett1/H + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett1/D + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett1/W + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + 
name: ett2/15T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett2/H + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett2/D + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: ett2/W + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: jena_weather/10T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: jena_weather/H + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: jena_weather/D + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bitbrains_fast_storage/5T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bitbrains_fast_storage/H + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bitbrains_rnd/5T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bitbrains_rnd/H + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bizitobs_application + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bizitobs_service + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bizitobs_l2c/5T + term: short + to_univariate: true + - _target_: gift_eval.data.Dataset + name: bizitobs_l2c/H + term: short + to_univariate: true diff --git a/cli/conf/analysis/datasets/default.yaml b/cli/conf/analysis/datasets/default.yaml new file mode 100644 index 0000000..aa1be84 --- /dev/null +++ b/cli/conf/analysis/datasets/default.yaml @@ -0,0 +1,3 @@ +name : default +datasets: + \ No newline at end of file diff --git a/cli/conf/analysis/datasets/sample.yaml b/cli/conf/analysis/datasets/sample.yaml new file mode 100644 index 0000000..252debf --- /dev/null +++ b/cli/conf/analysis/datasets/sample.yaml @@ -0,0 +1,10 @@ +name : sample +datasets: + - _target_: gift_eval.data.Dataset + name: m4_weekly + term: short + to_univariate: false + - _target_: gift_eval.data.Dataset + name: m4_hourly + term: short + to_univariate: false \ No newline at end of file diff --git a/cli/conf/analysis/default.yaml b/cli/conf/analysis/default.yaml new file mode 100644 index 0000000..1b9ae18 --- /dev/null +++ b/cli/conf/analysis/default.yaml @@ -0,0 +1,11 @@ +defaults: + - datasets: all_datasets + - _self_ + +hydra: + run: + dir: outputs/${hydra:job.name}/${name}/${datasets.name} +analyzer: + _target_: gift_eval.analysis.Analyzer + datasets: ${datasets.datasets} +name: "test" diff --git a/pyproject.toml b/pyproject.toml index f36c408..9ad56a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,10 @@ dependencies = [ "hydra-core==1.3", "datasets~=2.17.1", "orjson", - "matplotlib~=3.9.2" + "matplotlib~=3.9.2", + "tsfeatures", + "ray", + "scipy~=1.11.3", ] requires-python = ">=3.10" authors = [ diff --git a/src/gift_eval/analysis/__init__.py b/src/gift_eval/analysis/__init__.py new file mode 100644 index 0000000..5048d4e --- /dev/null +++ b/src/gift_eval/analysis/__init__.py @@ -0,0 +1,3 @@ +from .analyzer import Analyzer + +__all__ = ["Analyzer"] \ No newline at end of file diff --git a/src/gift_eval/analysis/analyzer.py b/src/gift_eval/analysis/analyzer.py new file mode 100644 index 0000000..b3eae5b --- /dev/null +++ b/src/gift_eval/analysis/analyzer.py @@ -0,0 +1,242 @@ +import os +import gc +import numpy as np + +import ray +from ray.experimental import tqdm_ray + +import pandas as pd +from pandas.tseries.frequencies import to_offset + +from pathlib import 
Path +from tqdm import tqdm +from collections import defaultdict +from functools import cached_property +from gluonts.time_feature import norm_freq_str + +from gift_eval.data import Dataset +from .features import get_ts_features +from .utils import persist_analysis + +from dotenv import load_dotenv +load_dotenv() + +MAX_CONTEXT_LEN = 500 +runtime_env = { + 'env_vars': { + "RAY_memory_usage_threshold": "0.85" + } +} + +if not os.getenv("NUM_CPUS"): + print("NUM_CPUS environment variable not found. Setting to 1. Set NUM_CPUS to speed up processing.") +NUM_CPUS = int(os.getenv("NUM_CPUS", "1")) + + +@ray.remote(num_cpus=NUM_CPUS) +def process_instance(self, test_input, test_label, dataset_freq): + """ + Process a single time series instance to compute features. + + Parameters: + - self: Reference to the calling object. + - test_input: Dictionary containing the input time series data. + - test_label: Dictionary containing the label time series data. + - dataset_freq: Frequency of the dataset. + + Returns: + - DataFrame containing the computed features for the time series instance. + """ + np_inp = np.array(test_input["target"]) + np_label = np.array(test_label["target"]) + + # Check if the input is 2D and trim to MAX_CONTEXT_LEN if necessary + if len(np_inp.shape) == 2: + if np_inp.shape[1] > MAX_CONTEXT_LEN: + np_inp = np_inp[:, -MAX_CONTEXT_LEN:] + np_instance = np.concatenate((np_inp, np_label), axis=1) + else: + if len(np_inp) > MAX_CONTEXT_LEN: + np_inp = np_inp[-MAX_CONTEXT_LEN:] + np_instance = np.concatenate((np_inp, np_label)) + + # Compute time series features + window_features_df = get_ts_features( + np_instance, norm_freq_str(to_offset(dataset_freq).name)) + self.pbar.update.remote(1) + return window_features_df + + +@ray.remote +def process_dataset(self, dataset, output_dir): + """ + Process an entire dataset to compute features for each time series instance. + + Parameters: + - self: Reference to the calling object. + - dataset: The dataset to be processed. + - output_dir: Directory where the processed data will be saved. + + Returns: + - None, but updates the progress bar and persists the analysis results. 
+ """ + # Determine the directory for the dataset based on its term and name + if str(dataset.term) == "Term.SHORT": + dataset_dir = Path(os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset.name}")) + else: + if "/" in dataset.name: + dataset_name, dataset_freq = dataset.name.split("/") + dataset_name = f"{dataset_name}:{dataset.term}/{dataset_freq}" + dataset_dir = Path(os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset_name}")) + else: + dataset_dir = Path(os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset.name}:{dataset.term}")) + + # Create the directory if it doesn't exist + if not dataset_dir.exists(): + dataset_dir.mkdir(parents=True, exist_ok=True) + print("Directory created:", dataset_dir) + else: + print("Directory already exists:", dataset_dir) + # Assume dataset has already been processed + self.pbar.update.remote(dataset.hf_dataset.num_rows * dataset.windows) + return None + + all_features_list = [] + test_data = dataset.test_data + + # Process each instance in the dataset + features = [process_instance.remote( + self, test_input, test_label, dataset.freq) for test_input, test_label in test_data] + + for feature in features: + try: + # Retrieve the result with a timeout + result = ray.get(feature, timeout=300) # 300 seconds timeout + all_features_list.append(result) + except ray.exceptions.GetTimeoutError: + print("A task timed out and will be skipped.") + continue # Skip this particular instance + except Exception as e: + print(f"An error occurred while processing: {e}") + continue + + gc.collect() + + # Concatenate all features and persist the analysis + all_features_df = pd.concat(all_features_list) + persist_analysis(all_features_df, dataset_dir) + + +class Analyzer(): + """ + Analyzer class to manage the analysis of multiple datasets, including feature computation and frequency distribution analysis. + """ + + def __init__(self, datasets: list[Dataset]): + """ + Initialize the Analyzer with a list of datasets. + + Parameters: + - datasets: List of Dataset objects to be analyzed. 
+ """ + self.datasets = datasets + ray.init(runtime_env=runtime_env) + remote_tqdm = ray.remote(tqdm_ray.tqdm) + self.pbar = remote_tqdm.remote(total=self._sum_windows_count) + + def print_datasets(self): + """Print the names of all datasets.""" + for dataset in self.datasets: + print(dataset.name) + + @cached_property + def _sum_series_count(self) -> int: + """Calculate the total number of series across all datasets.""" + total_count = 0 + for dataset in self.datasets: + total_count += dataset.hf_dataset.num_rows + return total_count + + @cached_property + def _sum_windows_count(self) -> int: + """Calculate the total number of windows across all datasets.""" + total_count = 0 + for dataset in self.datasets: + total_count += dataset.hf_dataset.num_rows * dataset.windows + return total_count + + @property + def freq_distribution_by_dataset(self): + """Compute the frequency distribution by dataset.""" + freqs = [norm_freq_str(to_offset(dataset.freq).name) + for dataset in self.datasets] + freq_counts = {freq: freqs.count(freq) for freq in set(freqs)} + return freq_counts + + @property + def freq_distribution_by_ts(self): + """Compute the frequency distribution by time series count.""" + freq_ts_counts = defaultdict(lambda: 0) + for dataset in self.datasets: + freq_ts_counts[norm_freq_str( + to_offset(dataset.freq).name)] += dataset.hf_dataset.num_rows + return freq_ts_counts + + @property + def freq_distribution_by_ts_length(self): + """Compute the frequency distribution by time series length.""" + freq_dp_counts = defaultdict(lambda: 0) + for dataset in self.datasets: + freq_dp_counts[norm_freq_str( + to_offset(dataset.freq).name)] += dataset.sum_series_length + return freq_dp_counts + + @property + def freq_distribution_by_window(self): + """Compute the frequency distribution by window count.""" + freq_window_counts = defaultdict(lambda: 0) + for dataset in self.datasets: + freq_window_counts[norm_freq_str( + to_offset(dataset.freq).name)] += dataset.hf_dataset.num_rows * dataset.windows + return freq_window_counts + + def features_by_window(self, output_dir): + """ + Create and persist features of each window for each dataset. + + Parameters: + - output_dir: Directory where the features will be saved. 
+ """ + ray.get([process_dataset.remote(self, dataset, output_dir) + for dataset in self.datasets]) + self.pbar.close.remote() + + all_datasets_df = [] + # Aggregate the characteristics for each dataset + with tqdm(total=len(self.datasets), desc="Computing ts features for whole benchmark") as pbar: + for dataset in self.datasets: + pbar.set_description(f"Loading ts features | {dataset.name}") + if str(dataset.term) == "Term.SHORT": + dataset_df_path = os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset.name}/features.csv") + else: + if "/" in dataset.name: + dataset_name, dataset_freq = dataset.name.split("/") + dataset_name = f"{dataset_name}:{dataset.term}/{dataset_freq}" + dataset_df_path = Path(os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset_name}/features.csv")) + else: + dataset_df_path = Path(os.path.join(os.path.dirname( + output_dir), f"datasets/{dataset.name}:{dataset.term}/features.csv")) + df = pd.read_csv(dataset_df_path) + all_datasets_df.append(df) + pbar.update(1) + + # Concatenate all dataset features and persist the analysis + all_features_df = pd.concat(all_datasets_df, ignore_index=True) + persist_analysis(all_features_df, output_dir) + + return None diff --git a/src/gift_eval/analysis/features.py b/src/gift_eval/analysis/features.py new file mode 100644 index 0000000..33f995b --- /dev/null +++ b/src/gift_eval/analysis/features.py @@ -0,0 +1,99 @@ +import numpy as np +import re +from tsfeatures import tsfeatures, stl_features, entropy, hurst, lumpiness, stability + +import pandas as pd + +# Define periods for different frequency strings +PERIODS = {'H': 24, 'D': 7, # hourly, daily + 'M': 12, 'Q': 4, # monthly, quarterly + 'W': 4, 'A': 1, # weekly, annual + 'T': 60, 'S': 60, # minute, second + 'L': 1000, 'U': 1000, 'N': 1000} # millisecond, microsecond, nanosecond + + +def infer_period(freq): + """ + Infer the period of a time series based on its frequency string. + + Parameters: + - freq: Frequency string (e.g., 'H', 'D', '2A-DEC'). + + Returns: + - The period as an integer. + + Raises: + - ValueError: If the frequency is not recognized. + """ + if '-' in freq: + freq = freq.split('-')[0] + + if freq in PERIODS: + return PERIODS[freq] + elif freq.isalnum(): + pattern = r"(\d+)([a-zA-Z]+)" + match = re.match(pattern, freq) + repeat_count, freq_str = match.groups() + return max(PERIODS[freq_str]//int(repeat_count), 1) + else: + raise ValueError(f"Frequency {freq} not recognized") + + +def get_ts_features(timeseries: np.ndarray, freq) -> pd.DataFrame: + """ + Extract time series features using the tsfeatures package. + + Parameters: + - timeseries: A numpy array representing the time series data. + - freq: Frequency string of the time series. + + Returns: + - A DataFrame containing selected features: trend, seasonal_strength, entropy, hurst, lumpiness, stability. 
+ """ + # Create a DataFrame with a date range and the time series data + panel = pd.DataFrame({'ds': pd.date_range( + start='1900-01-01', periods=len(timeseries), freq=freq), 'y': timeseries}) + panel['unique_id'] = 1 + + # Compute features using tsfeatures + features_df = tsfeatures(panel, features=[ + stl_features, entropy, hurst, lumpiness, stability], freq=infer_period(freq)) + + # Ensure all required columns are present, filling missing ones with NaN + for column in ['trend', 'seasonal_strength', 'entropy', 'hurst', 'lumpiness', 'stability']: + if column not in features_df.columns: + features_df[column] = np.nan + return features_df[['trend', 'seasonal_strength', 'entropy', 'hurst', 'lumpiness', 'stability']] + + +if __name__ == "__main__": + # Test the infer_period function with various frequency strings + print(infer_period('30T')) + print(infer_period('2A-DEC')) + print(infer_period('H')) + print(infer_period('A')) + print(infer_period('2A')) + print(infer_period('A-DEC')) + print(infer_period('A-JAN')) + print(infer_period('5S')) + + # Generate random time series data and test feature extraction + timeseries = np.random.randn(100) + print(get_ts_features(timeseries, '30T')) + + # Generate a time series with all zeros and test feature extraction + timeseries = np.zeros(100) + print(get_ts_features(timeseries, 'D')) + + # Generate a time series with a trend and test feature extraction + timeseries = np.arange(100) + print(get_ts_features(timeseries, '30T')) + + # Generate a time series with seasonality and test feature extraction + timeseries = np.array( + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]) + print(get_ts_features(timeseries, 'M')) + + # Generate a time series with a trend in the first half and random in the second half + timeseries = np.concatenate([np.arange(50), np.random.randn(50)]) + print(get_ts_features(timeseries, 'H')) diff --git a/src/gift_eval/analysis/utils.py b/src/gift_eval/analysis/utils.py new file mode 100644 index 0000000..edf6f5e --- /dev/null +++ b/src/gift_eval/analysis/utils.py @@ -0,0 +1,139 @@ +import matplotlib.pyplot as plt +import numpy as np +import os + + +def normalize_data(mean, std): + """ + Normalize the data using Min-Max Scaling to bring the mean and standard deviation between 0 and 1. + + Parameters: + - mean: The mean values of the data. + - std: The standard deviation values of the data. + + Returns: + - Tuple of normalized mean and standard deviation. + """ + # Apply Min-Max Scaling to normalize the data between 0 and 1 + mean_normalized = (mean - mean.min()) / (mean.max() - mean.min()) + # Normalize std by the same range for consistency + std_normalized = std / (mean.max() - mean.min()) + return mean_normalized, std_normalized + + +def plot_radar_chart(df, output_dir): + """ + Plot a radar chart for the given DataFrame and save it to the specified output directory. + + Parameters: + - df: DataFrame containing the data to plot. + - output_dir: Directory where the radar chart will be saved. + """ + # Calculate mean and std across rows for each column + mean = df.mean() + std = df.std() + + # Normalize data + mean_normalized, std_normalized = normalize_data(mean, std) + + # Number of variables we're plotting. + num_vars = len(df.columns) + + # Split the circle into even parts and save angles so we know where to put each axis. 
+ angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist() + # Complete the loop for the plot + angles += angles[:1] + mean_normalized = mean_normalized.tolist() + mean_normalized.tolist()[:1] + std_normalized = std_normalized.tolist() + std_normalized.tolist()[:1] + + # Draw the plot + fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True)) + ax.fill(angles, mean_normalized, color='red', alpha=0.25) + ax.plot(angles, mean_normalized, color='red', label='Mean') + + # Draw error bars + for angle, mean, error in zip(angles, mean_normalized, std_normalized): + ax.errorbar(angle, mean, yerr=error, color='black', capsize=3) + + ax.set_ylim(0, 1) + + # Labels for each feature + labels = df.columns.tolist() + ax.set_xticks(angles[:-1]) + ax.set_xticklabels(labels, size=12) + + # Title and legend + plt.title('Time Series Benchmark Features (Normalized)', + size=15, color='red', y=1.1) + ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1)) + + # Save the figure + plt.savefig(os.path.join(output_dir, 'character_radar.png')) + plt.close() + + +def persist_analysis(features_df, output_dir): + """ + Persist the analysis results by saving the features DataFrame and its description to CSV files. + + Parameters: + - features_df: DataFrame containing the features to be saved. + - output_dir: Directory where the CSV files will be saved. + """ + features_df.to_csv(f"{output_dir}/features.csv") + features_df.describe().to_csv(f"{output_dir}/features_description.csv") + + # Create a histogram for each characteristic + for column in features_df.columns: + plot_feature_histogram(features_df, column, output_dir) + # Create a radar chart showing the mean and std of the key features, including trend and seasonal strength + plot_radar_chart(features_df, output_dir) + + +def plot_histogram(freq_distribution_dict, name, output_dir): + """ + Plot the histogram for the given frequency distribution dictionary and save it to the output directory. + + Parameters: + - freq_distribution_dict: Dictionary with frequency strings as keys and counts as values. + - name: Name of the frequency distribution. + - output_dir: Directory where the histogram will be saved. + """ + fig, ax = plt.subplots() + ax.bar(freq_distribution_dict.keys(), freq_distribution_dict.values()) + ax.set_xlabel(f"Frequency of {name}") + ax.set_ylabel("Count") + ax.set_title(f"Distribution of {name} frequencies") + plt.savefig(f"{output_dir}/{name}_frequency_distribution.png") + plt.close() + + +def plot_feature_histogram(dataframe, column_name, output_directory): + """ + Plot a histogram for the given column in the DataFrame and save it to the specified output directory. + + Parameters: + - dataframe: The DataFrame containing the data. + - column_name: The column name for which to plot the histogram. + - output_directory: The directory where the plot will be saved. + """ + # Check if the column exists in the dataframe + if column_name not in dataframe.columns: + raise ValueError( + f"Column '{column_name}' does not exist in the dataframe.") + + # Drop rows with NA values in the specified column + clean_data = dataframe[column_name].dropna() + + # Plot the histogram + plt.figure(figsize=(10, 6)) + plt.hist(clean_data, bins=10, edgecolor='black') + plt.title(f'Histogram of {column_name}') + plt.xlabel(column_name) + plt.ylabel('Frequency') + + # Save the plot + plot_filename = os.path.join( + output_directory, f"{column_name}_histogram.png") + plt.savefig(plot_filename) + plt.close()
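
For quick experiments, the same pipeline the Hydra entry point (`cli/analysis.py`) drives can also be run directly from Python. The following is a minimal sketch, not part of this PR: it assumes `gift_eval.data.Dataset` accepts the same `name`/`term`/`to_univariate` keyword arguments used in the Hydra configs above, and that your `.env` (dataset location, `NUM_CPUS`) is already set up as described in the README. The output directory name is hypothetical.

```python
# Minimal sketch (not part of this PR): drive the analysis without Hydra.
# Assumes gift_eval.data.Dataset accepts the keyword arguments used in the
# Hydra configs above and that .env (dataset path, NUM_CPUS) is configured.
import os

from gift_eval.data import Dataset
from gift_eval.analysis import Analyzer
from gift_eval.analysis.utils import plot_histogram

output_dir = "outputs/analysis/test/manual"  # hypothetical run directory
os.makedirs(output_dir, exist_ok=True)

datasets = [
    Dataset(name="m4_weekly", term="short", to_univariate=False),
    Dataset(name="m4_hourly", term="short", to_univariate=False),
]

analyzer = Analyzer(datasets=datasets)
analyzer.print_datasets()

# Frequency-distribution histograms, as produced by cli/analysis.py
plot_histogram(analyzer.freq_distribution_by_dataset, "dataset", output_dir)
plot_histogram(analyzer.freq_distribution_by_window, "window", output_dir)

# Per-window tsfeatures for every dataset; per-dataset results are persisted
# under <parent of output_dir>/datasets/<name>/ and aggregated in output_dir.
analyzer.features_by_window(output_dir)
```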
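After a run, `persist_analysis` leaves a `features.csv` (one row per evaluation window), a `features_description.csv`, and the plots in each dataset folder, with aggregated copies in the run directory itself. A small sketch for inspecting those tables; the paths assume the default `outputs/analysis/test` layout from the README and an `all_datasets` run, so adjust them to your own configuration.

```python
# Sketch: inspect the persisted feature tables after a run.
# Paths assume the default outputs/analysis/test layout described in the README.
import pandas as pd

# Per-window features for a single dataset (written by persist_analysis)
m4_hourly = pd.read_csv("outputs/analysis/test/datasets/m4_hourly/features.csv")
print(m4_hourly[["trend", "seasonal_strength", "entropy"]].describe())

# Aggregated features across every dataset in the run
benchmark = pd.read_csv("outputs/analysis/test/all_datasets/features.csv")
print(benchmark.mean(numeric_only=True))
```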