plugged format_master
Plugged new internal tool:
cr_format_master  #[maintenance]

Use: detects the category of a dataset.
Right now its imports are commented out in engine.core_rec.

Check it out in: engine/format_master/

Maintainer: @vishesh9131
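A minimal usage sketch of the new entry point (not part of this commit message); the file path and config value below are illustrative:

from engine.format_master.cr_formatMaster import detect

# Searches ., ./data and ./datasets for the file, then detects its format,
# preprocesses it, and validates the result.
clean_df = detect('data/books.csv', config={'chunk_size': '64MB'})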
vishesh9131 committed Oct 26, 2024
1 parent 89db26a commit ac2170a
Showing 11 changed files with 945 additions and 5 deletions.
6 changes: 3 additions & 3 deletions engine/Tmodel.py
@@ -71,8 +71,7 @@ def compute_neighborhood_similarity(self, adjacency_matrix, x):




# # Example usage
# # Example usage (do not use in production -vishesh)
# input_dim = 4
# d_model = 8
# num_layers = 2
@@ -93,6 +92,7 @@ def compute_neighborhood_similarity(self, adjacency_matrix, x):

########################## TestingGraphTransformersAugAttention ########################
class TestingGraphTransformersAugAttention(Module):
    # This method is defined in the paper. [vishesh will provide link to his paper here]
    def __init__(self, num_layers, d_model, num_heads, d_feedforward, input_dim, num_weights=10, use_weights=True, dropout=0.1):
        super(TestingGraphTransformersAugAttention, self).__init__()
        self.num_weights = num_weights
@@ -104,7 +104,7 @@ def __init__(self, num_layers, d_model, num_heads, d_feedforward, input_dim, num
        if self.use_weights:
            self.weight_linears = ModuleList([Linear(input_dim, d_model) for _ in range(num_weights)])

        # Attention components
        # Attention components (augmenting the Q/K/V (query/key/value) projection matrices)
        self.query_linear = Linear(d_model, d_model)
        self.key_linear = Linear(d_model, d_model)
        self.value_linear = Linear(d_model, d_model)
Binary file modified engine/__pycache__/Tmodel.cpython-311.pyc
Binary file not shown.
Binary file modified engine/__pycache__/core_rec.cpython-311.pyc
Binary file not shown.
8 changes: 7 additions & 1 deletion engine/core_rec.py
@@ -44,4 +44,10 @@


#Promoted this script to engine.cr_utility.dataset
from engine.datasets import GraphDataset
from engine.datasets import GraphDataset


# FormatMaster is the plug for CoreRec preprocessing: it detects dataset format and category
# from engine.format_master.ds_format_loader import *
# from engine.format_master.cr_formatMaster import *
# from engine.format_master.format_library import *
2 changes: 1 addition & 1 deletion engine/cr_boosters/test_adam.py
@@ -17,7 +17,7 @@ def test_adam_function(self):
        exp_avg_sqs = [torch.zeros_like(params[0])]
        max_exp_avg_sqs = [torch.zeros_like(params[0])]
        state_steps = [torch.tensor(0)]

        adam(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, amsgrad=False, beta1=0.9, beta2=0.999, lr=0.001, weight_decay=0, eps=1e-8, maximize=False)

        # Calculate expected values based on the Adam update rule
197 changes: 197 additions & 0 deletions engine/format_master/cr_formatMaster.py
@@ -0,0 +1,197 @@
"""
This format detection pipeline will later be plugged into corerec's utilities.
This script provides a complete pipeline for data processing, including loading,
format detection, preprocessing, and validation of data.
Functions:
- load_data(file_path): Loads data from the specified file path into a DataFrame.
- detect_format(df): Detects the format of the given DataFrame.
- preprocess_data(df, data_format): Preprocesses the DataFrame based on the detected format.
- validate_data(preprocessed_df): Validates the preprocessed DataFrame to ensure it meets required standards.
- format_detection_pipeline(file_path): Orchestrates the entire process of loading, detecting,
  preprocessing, and validating data from a given file path.
config dictionary:
- parallel_processing: bool, whether to use parallel processing.
- log_level: str, the level of logging to use.
- chunk_size: int, the size of chunks to use for parallel processing.
- missing_value_strategy: str, the strategy to use for handling missing values.
- scaling_method: str, the method to use for scaling the data.
- validation_rules: dict, the rules to use for validating the data.
- report_format: str, the format to use for the report.
- log_file: str, the file to use for logging.
- monitoring: bool, whether to use monitoring.
- num_workers: int, the number of workers to use for parallel processing.
- distributed_backend: str, the distributed backend to use.
- custom_steps: list, the custom steps to use.
Example config:
config = {
    'parallel_processing': True,
    'log_level': 'INFO',
    'chunk_size': 10000,
    'missing_value_strategy': 'fill_mean',
    'scaling_method': 'standard',
    'validation_rules': {'max_null_percentage': 0.1},
    'report_format': 'json',
    'log_file': 'pipeline.log',
    'monitoring': True,
    'num_workers': 4,
    'distributed_backend': 'dask',
    'custom_steps': ['step1', 'step2']
}
Author: Vishesh Yadav
"""

import pandas as pd
import numpy as np
import json
import os
import warnings
import logging
import dask.dataframe as dd # Dask for handling large datasets
from dask.distributed import Client
import yaml
import glob
from fuzzywuzzy import fuzz, process

from engine.format_master.format_library import *
from engine.format_master.ds_format_loader import load_data, detect_format, preprocess_data, validate_data

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_config(config_path):
    """Load configuration from a JSON or YAML file."""
    with open(config_path, 'r') as file:
        if config_path.endswith('.json'):
            return json.load(file)
        elif config_path.endswith('.yaml') or config_path.endswith('.yml'):
            return yaml.safe_load(file)
        else:
            raise ValueError("Unsupported configuration file format.")

def setup_dask_client(config):
    """Setup Dask client for distributed processing."""
    if config.get('distributed_backend') == 'dask':
        client = Client(n_workers=config.get('num_workers', 4))
        logging.info("Dask client set up with {} workers.".format(config.get('num_workers', 4)))
        return client
    return None

def find_file(file_name, search_paths, extensions=None, threshold=80):
    """Search for a file in multiple directories with different possible names and extensions."""
    if extensions is None:
        extensions = ['.csv', '.txt', '.json']  # Add more extensions as needed

    possible_names = [file_name, file_name.lower(), file_name.upper(), file_name.replace(' ', '_')]
    all_files = []

    # Collect all files in the search paths
    for path in search_paths:
        for root, dirs, files in os.walk(path):
            for file in files:
                all_files.append(os.path.join(root, file))

    # Try to find the file using exact and fuzzy matching
    for name in possible_names:
        for ext in extensions:
            target_name = f"{name}{ext}"
            # Exact match
            for file_path in all_files:
                if os.path.basename(file_path) == target_name:
                    logging.info(f"Exact match found: {file_path}")
                    return file_path

            # Fuzzy match
            matches = process.extractBests(target_name, all_files, scorer=fuzz.ratio, score_cutoff=threshold)
            if matches:
                best_match = matches[0][0]
                logging.info(f"Fuzzy match found: {best_match}")
                return best_match

    logging.warning(f"File {file_name} not found in specified directories.")
    return None

def detect(data, config=None, custom_preprocess=None):
    """
    Advanced pipeline to load, detect, preprocess, and validate data.

    Parameters:
    - data: DataFrame or str, the data or path to the data file.
    - config: dict, optional configuration for the pipeline.
    - custom_preprocess: function, optional custom preprocessing function.

    Returns:
    - preprocessed_df: DataFrame, the preprocessed data.
    """
    config = config or {}  # Fall back to an empty config so .get() calls work when no config is passed
    try:
        if isinstance(data, str):
            logging.info(f"Searching for data file: {data}")
            search_paths = ['.', './data', './datasets']  # Add more directories as needed
            data_path = find_file(data, search_paths)

            if data_path:
                logging.info(f"Loading data from {data_path}")
                if data_path.endswith('.csv'):
                    # Specify dtypes to avoid dtype inference issues
                    dtype_spec = {'isbn': 'object'}  # Specify other columns as needed
                    df = dd.read_csv(data_path, blocksize=config.get('chunk_size', '64MB'), dtype=dtype_spec)
                    is_dask = True
                else:
                    df = load_data(data_path)
                    is_dask = False

                if is_dask:
                    df = df.compute()
            else:
                logging.error("Data file not found. Exiting the pipeline.")
                return pd.DataFrame()
        else:
            df = data
            logging.info("Data provided directly as DataFrame")

        if df.empty:
            warnings.warn("The loaded DataFrame is empty.")
            return df

        logging.info(f"Data loaded with {df.shape[0]} rows and {df.shape[1]} columns")

        data_format = detect_format(df)
        if data_format is None:
            raise ValueError("Data format detection failed")
        logging.info(f"Detected data format: {data_format}")

        if custom_preprocess:
            logging.info("Applying custom preprocessing function")
            preprocessed_df = custom_preprocess(df, data_format)
        else:
            logging.info("Applying standard preprocessing")
            preprocessed_df = preprocess_data(df, data_format)

        if preprocessed_df is None:
            raise ValueError("Preprocessing failed")

        logging.info("Validating preprocessed data")
        validate_data(preprocessed_df)

        logging.info(f"Preprocessed data has {preprocessed_df.shape[0]} rows and {preprocessed_df.shape[1]} columns")

        return preprocessed_df

    except Exception as e:
        logging.error(f"An error occurred during the pipeline execution: {e}")
        return pd.DataFrame()

# if __name__ == "__main__":
#     config_path = 'config.yaml'  # Path to your configuration file
#     config = load_config(config_path)

#     dask_client = setup_dask_client(config)

#     preprocessed_data = detect('data/books.csv', config=config)

#     if dask_client:
#         dask_client.close()
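For reference, a minimal sketch (not part of the file above) of calling detect() on an in-memory DataFrame, which skips the file-search and Dask code paths; the column names are purely illustrative, and the detected category depends on what ds_format_loader recognizes:

import pandas as pd
from engine.format_master.cr_formatMaster import detect

# Illustrative interaction-style data; detect() also accepts a DataFrame directly.
ratings = pd.DataFrame({
    'user_id': [1, 1, 2],
    'item_id': [10, 20, 10],
    'rating': [4.0, 5.0, 3.5],
})

clean = detect(ratings)  # format detection -> preprocessing -> validation
print(clean.shape)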

0 comments on commit ac2170a
