enhanced features like error handling and user guide

phillip1029 · Sep 16, 2024 · 6bc45a3 · 6bc45a3
1 parent c904214
commit 6bc45a3
Show file tree

Hide file tree

Showing 3 changed files with 190 additions and 4 deletions.
diff --git a/SyntheticDataAlchemy/synthetic_data.py b/SyntheticDataAlchemy/synthetic_data.py
@@ -4,8 +4,37 @@
 from sklearn.impute import SimpleImputer
 from scipy.stats import gaussian_kde
 from category_encoders import TargetEncoder
+import matplotlib.pyplot as plt
+import seaborn as sns
 
-def generate_synthetic_data(df, n_samples):
+def generate_synthetic_data(df, n_samples, random_state=None, kde_bandwidth=None, correlation_threshold=0.1):
+    """
+    Generate synthetic data based on the input DataFrame.
+
+    Args:
+        df (pd.DataFrame): Input DataFrame containing the original data.
+        n_samples (int): Number of synthetic samples to generate.
+        random_state (int, optional): Random seed for reproducibility. Defaults to None.
+        kde_bandwidth (float, optional): Bandwidth for KDE. Defaults to None (automatic selection).
+        correlation_threshold (float, optional): Threshold for preserving correlations. Defaults to 0.1.
+
+    Returns:
+        pd.DataFrame: Synthetic data DataFrame.
+
+    Raises:
+        ValueError: If input DataFrame is empty or contains unsupported data types.
+    """
+    # Input validation
+    if df.empty:
+        raise ValueError("Input DataFrame is empty.")
+
+    if not set(df.dtypes).issubset([np.int64, np.float64, object, 'category']):
+        raise ValueError("Input DataFrame contains unsupported data types.")
+
+    # Set random seed if provided
+    if random_state is not None:
+        np.random.seed(random_state)
+
     # Separate numerical and categorical columns
     num_cols = df.select_dtypes(include=['int64', 'float64']).columns
     cat_cols = df.select_dtypes(include=['object', 'category']).columns
@@ -18,7 +47,7 @@ def generate_synthetic_data(df, n_samples):
     num_data_scaled = scaler.fit_transform(num_data_imputed)
 
     # Estimate the joint distribution of numerical features
-    kde = gaussian_kde(num_data_scaled.T)
+    kde = gaussian_kde(num_data_scaled.T, bw_method=kde_bandwidth)
 
     # Generate synthetic numerical data
     synthetic_num_data = kde.resample(n_samples).T
@@ -42,7 +71,63 @@ def generate_synthetic_data(df, n_samples):
         for num_col in num_cols:
             encoded_col = encoder.fit_transform(df[cat_col], df[num_col]).squeeze()
             correlation = df[num_col].corr(encoded_col)
-            if abs(correlation) > 0.1:  # Adjust this threshold as needed
+            if abs(correlation) > correlation_threshold:
                 synthetic_df[num_col] += (encoder.transform(synthetic_df[cat_col]).squeeze() - encoded_col.mean()) * correlation
 
-    return synthetic_df
+    return synthetic_df
+
+def visualize_comparison(original_df, synthetic_df, num_cols=None, cat_cols=None):
+    """
+    Show one comparison figure per column (via plt.show()); nothing is returned.
+
+    Args:
+        original_df (pd.DataFrame): Original DataFrame; also used to pick the default columns.
+        synthetic_df (pd.DataFrame): Synthetic DataFrame; it is indexed by every plotted column name.
+        num_cols (list, optional): Numerical columns to plot. Defaults to the int64/float64 columns of original_df.
+        cat_cols (list, optional): Categorical columns to plot. Defaults to the object/category columns of original_df.
+    """
+    if num_cols is None:
+        num_cols = original_df.select_dtypes(include=['int64', 'float64']).columns
+
+    if cat_cols is None:
+        cat_cols = original_df.select_dtypes(include=['object', 'category']).columns
+
+    # Numerical columns: one figure each, with original (blue) and synthetic (red) histograms plus KDE curves
+    for col in num_cols:
+        plt.figure(figsize=(10, 5))
+        sns.histplot(original_df[col], kde=True, label='Original', color='blue', alpha=0.5)
+        sns.histplot(synthetic_df[col], kde=True, label='Synthetic', color='red', alpha=0.5)
+        plt.title(f'Distribution Comparison: {col}')
+        plt.legend()
+        plt.show()
+
+    # Categorical columns: one figure each, with a count plot for the original and one for the synthetic data
+    for col in cat_cols:
+        plt.figure(figsize=(10, 5))
+        sns.countplot(x=col, data=original_df, label='Original', alpha=0.5)
+        sns.countplot(x=col, data=synthetic_df, label='Synthetic', alpha=0.5)
+        plt.title(f'Distribution Comparison: {col}')
+        plt.legend()
+        plt.xticks(rotation=45)  # rotate category labels on the x-axis
+        plt.show()
+
+# Example usage: runs only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    # Small sample DataFrame: two numerical columns (age, income) and two categorical ones (gender, education)
+    df = pd.DataFrame({
+        'age': [25, 30, 35, 40, 45],
+        'income': [50000, 60000, 70000, 80000, 90000],
+        'gender': ['M', 'F', 'M', 'F', 'M'],
+        'education': ['Bachelor', 'Master', 'PhD', 'Bachelor', 'Master']
+    })
+
+    # Generate 100 synthetic rows; random_state=42 makes generate_synthetic_data seed NumPy's global RNG
+    synthetic_df = generate_synthetic_data(df, n_samples=100, random_state=42)
+
+    print("Original Data:")
+    print(df)
+    print("\nSynthetic Data (first 5 rows):")
+    print(synthetic_df.head())
+
+    # Plot the comparison with num_cols/cat_cols left at their defaults (selected from df by dtype)
+    visualize_comparison(df, synthetic_df)
diff --git a/USER_GUIDE.md b/USER_GUIDE.md
@@ -0,0 +1,94 @@
+# DataAlchemy User Guide
+
+## Introduction
+
+DataAlchemy is a powerful tool for generating synthetic data that closely mimics your original dataset. This guide will walk you through the features and usage of DataAlchemy from a business user's perspective.
+
+## Why Use DataAlchemy?
+
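+1. **Data Privacy**: Generate synthetic data that aims to preserve the statistical properties of your original data while reducing direct exposure of sensitive records. Note that this is not a formal privacy guarantee; review the synthetic output before sharing it.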
+2. **Augment Limited Datasets**: Expand small datasets for more robust analysis and machine learning model training.
+3. **Test Data Generation**: Create realistic test data for software development and quality assurance processes.
+4. **Scenario Analysis**: Generate data for various "what-if" scenarios to support decision-making.
+
+## Features
+
+### 1. Synthetic Data Generation
+
+DataAlchemy can create synthetic versions of your datasets that maintain the statistical properties and relationships of the original data.
+
+### 2. Configurability
+
+- **Sample Size Control**: Generate any number of synthetic samples.
+- **Reproducibility**: Set a random seed for consistent results across multiple runs.
+- **Fine-tuning**: Adjust parameters like KDE bandwidth and correlation threshold for more precise control over the synthetic data generation process.
+
+### 3. Visualization
+
+Compare your original data with the generated synthetic data using built-in visualization tools. This helps ensure that the synthetic data accurately represents the characteristics of your original dataset.
+
+## How to Use DataAlchemy
+
+### Step 1: Installation
+
+1. Ensure you have Python installed on your system.
+2. Download the DataAlchemy package.
+3. Open a terminal or command prompt in the DataAlchemy directory.
+4. Run the following command to install the required dependencies:
+
+   ```
+   pip install -r requirements.txt
+   ```
+
+### Step 2: Prepare Your Data
+
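+Organize your data into a pandas DataFrame. DataAlchemy supports numerical columns (`int64`, `float64`) and categorical columns (`object`, `category`); an empty DataFrame or one containing any other data type is rejected with a `ValueError`.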
+
+### Step 3: Generate Synthetic Data
+
+Here's a basic example of how to use DataAlchemy to generate synthetic data:
+
+```python
+from SyntheticDataAlchemy.synthetic_data import generate_synthetic_data, visualize_comparison
+import pandas as pd
+
+# Load your original data
+original_data = pd.read_csv('your_data.csv')
+
+# Generate synthetic data
+synthetic_data = generate_synthetic_data(original_data, n_samples=1000)
+
+# Compare original and synthetic data
+visualize_comparison(original_data, synthetic_data)
+```
+
+### Step 4: Customize the Process (Optional)
+
+You can fine-tune the synthetic data generation process using additional parameters:
+
+```python
+synthetic_data = generate_synthetic_data(
+    original_data,
+    n_samples=1000,
+    random_state=42,  # Set for reproducibility
+    kde_bandwidth=0.1,  # KDE bandwidth factor: larger values give smoother distributions, smaller values more detail
+    correlation_threshold=0.2  # Only categorical-numerical relationships with |correlation| above this value are preserved
+)
+```
+
+### Step 5: Analyze and Validate
+
+Use the `visualize_comparison` function to compare the distributions of your original and synthetic data. This will help you ensure that the synthetic data accurately represents your original dataset.
+
+## Best Practices
+
+1. **Start Small**: Begin with a subset of your data to quickly test and understand the synthetic data generation process.
+2. **Iterate**: Adjust parameters and regenerate data as needed to achieve the desired balance between privacy and utility.
+3. **Validate Thoroughly**: Always compare the synthetic data with your original data to ensure it meets your requirements.
+4. **Document Your Process**: Keep track of the parameters used to generate your synthetic datasets for reproducibility and audit purposes.
+
+## Conclusion
+
+DataAlchemy provides a user-friendly way to generate high-quality synthetic data for various business needs. By following this guide, you can leverage DataAlchemy to create valuable synthetic datasets while preserving the privacy and integrity of your original data.
+
+For more detailed information on the functions and parameters, please refer to the docstrings in `SyntheticDataAlchemy/synthetic_data.py`.
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,7 @@
+pandas
+numpy
+scikit-learn
+scipy
+category_encoders
+matplotlib
+seaborn