feat: piloting of the framework (#227)

* Initial template * Updated a bit on the framework * Revised to enable feat training * Fixed Accuracy * fixed mcc * change some files for feature * update some logics in feat eng * update some codes * update some codes * Revised further to successfully completed a cycle * Revised further * Delete two CSVs * fix some ci errors --------- Co-authored-by: Young <afe.young@gmail.com> Co-authored-by: Xisen-Wang <xisen_application@163.com>
microsoft · Aug 27, 2024 · e9b103e · e9b103e
1 parent bf2684c
commit e9b103e
Show file tree

Hide file tree

Showing 6 changed files with 228 additions and 1 deletion.
diff --git a/rdagent/scenarios/kaggle/experiment/README.md b/rdagent/scenarios/kaggle/experiment/README.md
@@ -0,0 +1,6 @@
+
+# Meta template
+It is an example of how we organize the workspace of a competition.
+We expect all the competitions to align with it so the knowledge in modules (model, feature) can transfer.
+
+The initial template is meant to be generated by an LLM eventually; for now it is written by hand.
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -0,0 +1,47 @@
+import pandas as pd
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+
def preprocess(X: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values and one-hot encode the categorical columns of ``X``.

    We want the X_train & X_test & X_valid to contain the same number of columns & maintain
    feature consistency. NOTE: the transformers are fitted on the frame that is passed in, so
    frames with different category levels produce different one-hot columns; callers must
    align the resulting frames themselves.

    Parameters
    ----------
    X : pd.DataFrame
        Raw features. ``int64``/``float64`` columns are mean-imputed; ``object`` columns are
        imputed with the most frequent value and then one-hot encoded. Columns of any other
        dtype are not listed in a transformer and are therefore left out of the result
        (``ColumnTransformer`` drops the remainder by default).

    Returns
    -------
    pd.DataFrame
        One-hot encoded columns followed by the numerical columns, indexed like ``X``.
    """
    # Identify numerical and categorical features
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Define preprocessors for numerical and categorical features
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )
    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_cols),
            ("num", numerical_transformer, numerical_cols),
        ]
    )

    # Fit the preprocessor on the data and transform it
    # TODO depend on its input shape
    X_array = preprocessor.fit_transform(X)

    # ColumnTransformer only returns a sparse matrix when the stacked result is sparse enough;
    # a dense ndarray has no ``toarray`` method, so convert only when it is needed.
    if hasattr(X_array, "toarray"):
        X_array = X_array.toarray()

    # Get feature names for the columns in the transformed data. Without categorical columns
    # the one-hot encoder is never fitted, so it must not be asked for feature names.
    if categorical_cols:
        onehot = preprocessor.named_transformers_["cat"]["onehot"]
        onehot_names = onehot.get_feature_names_out(categorical_cols).tolist()
    else:
        onehot_names = []
    feature_names = onehot_names + numerical_cols

    # Convert arrays back to DataFrames
    return pd.DataFrame(X_array, columns=feature_names, index=X.index)
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/feat.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/feat.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task, with the function name specified as feat_eng. 
+The file name should start with feat_, followed by the specific task name.
+"""
+
+
def feat_eng(X: pd.DataFrame):
    """
    Template feature-engineering step: the input frame is handed back untouched.
    """
    selected = X
    return selected
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/model.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/model.py
@@ -0,0 +1,36 @@
+"""
+motivation of the model
+"""
+import pandas as pd
+import xgboost as xgb
+
+
def select(X):
    """
    Feature-selection hook meant to be shared by the fit & predict functions.

    The template keeps every feature, so the input is returned as is.
    """
    chosen = X
    return chosen
+
+
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Define and train the model.

    The feature frames are passed through ``select`` before they are wrapped in a ``DMatrix``,
    as the ``select`` docstring prescribes, so a customised selection is applied to training.

    Parameters
    ----------
    X_train, X_valid : pd.DataFrame
        Training / validation features.
    y_train, y_valid
        Training / validation labels, forwarded to ``xgb.DMatrix`` as ``label``.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    dtrain = xgb.DMatrix(select(X_train), label=y_train)
    dvalid = xgb.DMatrix(select(X_valid), label=y_valid)

    # TODO: for quick running....
    params = {}
    num_round = 2

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    # Keyword arguments: recent xgboost releases accept these parameters by keyword only.
    bst = xgb.train(params, dtrain, num_boost_round=num_round, evals=evallist)

    return bst
+
+
def predict(model, X):
    """
    Predict boolean labels for ``X`` with a trained booster.

    ``X`` goes through ``select`` first to keep feature select's consistency.

    Parameters
    ----------
    model
        Object exposing ``predict(DMatrix)``, e.g. the booster returned by ``fit``.
    X
        Features to score.

    Returns
    -------
    Boolean array: ``True`` where the model output exceeds 0.5.
    """
    dtest = xgb.DMatrix(select(X))
    # NOTE(review): ``fit`` trains with empty params, so this output is only a probability if
    # the objective in use produces one - confirm before relying on the 0.5 threshold.
    y_pred_prob = model.predict(dtest)
    return y_pred_prob > 0.5  # Apply threshold to get boolean predictions
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
@@ -0,0 +1,125 @@
+import os
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from fea_share_preprocess import preprocess
+from sklearn.compose import ColumnTransformer
+from sklearn.impute import SimpleImputer
+from sklearn.metrics import accuracy_score, matthews_corrcoef
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+
+from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess
+
# Pin the seeds of both random number generators so that repeated runs are reproducible
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Absolute, symlink-resolved directory that contains this script
DIRNAME = (
    Path(__file__)
    .absolute()
    .resolve()
    .parent
)
+
+
# support various method for metrics calculation
def compute_metrics_for_classification(y_true, y_pred, metric: str = "mcc"):
    """Compute a classification metric.

    This used to be two functions with the same name, so the accuracy variant was silently
    overwritten by the MCC one. Both are now reachable; MCC stays the default, so existing
    two-argument calls return the same value as before.

    Parameters
    ----------
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels.
    metric : str
        ``"mcc"`` (Matthews correlation coefficient, default) or ``"accuracy"``.

    Returns
    -------
    The score computed by the selected scikit-learn metric function.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    metric_funcs = {
        "accuracy": accuracy_score,
        "mcc": matthews_corrcoef,
    }
    try:
        metric_func = metric_funcs[metric]
    except KeyError:
        raise ValueError(f"Unsupported metric {metric!r}; expected one of {sorted(metric_funcs)}") from None
    return metric_func(y_true, y_pred)
+
+
# Load and preprocess the data
# The dataset directory defaults to the original development location and can be overridden
# through the KG_DATA_DIR environment variable so the script runs on other machines.
DATA_DIR = Path(os.environ.get("KG_DATA_DIR", "/home/v-xisenwang/git_ignore_folder/data/playground-series-s4e8"))

data_df = pd.read_csv(DATA_DIR / "train.csv")
data_df = data_df.drop(["id"], axis=1)

X = data_df.drop(["class"], axis=1)
# A 1-D Series (not a one-column frame) is what LabelEncoder expects
y = data_df["class"]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Convert the class labels to integers
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

# 1) Preprocess the data
# NOTE(review): every split is preprocessed independently, so the splits can end up with
# different one-hot columns; the later column alignment step compensates for that.
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)

submission_df = pd.read_csv(DATA_DIR / "test.csv")
passenger_ids = submission_df["id"]
submission_df = submission_df.drop(["id"], axis=1)
X_test = preprocess(submission_df)
+
# 2) Auto feature engineering
# Every feat*.py module next to this script contributes one block of features; the blocks are
# concatenated column-wise.
X_train_l, X_valid_l, X_test_l = [], [], []
for f in sorted(DIRNAME.glob("feat*.py")):  # sorted: deterministic column order
    # ``f.stem`` removes only the ".py" suffix. ``str.strip(".py")`` strips *characters*, so a
    # file such as "feat_entropy.py" would have been imported as "feat_entro".
    m = __import__(f.stem)

    # Each module works on the preprocessed frames themselves, not on the output of the
    # previous module; chaining the modules and then concatenating would duplicate columns.
    X_train_l.append(m.feat_eng(X_train))
    X_valid_l.append(m.feat_eng(X_valid))
    X_test_l.append(m.feat_eng(X_test))

# Without any feature module the preprocessed frames are used unchanged
# (pd.concat raises on an empty list).
if X_train_l:
    X_train = pd.concat(X_train_l, axis=1)
    X_valid = pd.concat(X_valid_l, axis=1)
    X_test = pd.concat(X_test_l, axis=1)
+
+
def align_features(train_df, valid_df):
    """
    Give ``valid_df`` exactly the columns of ``train_df``, in the same order.

    Columns that only exist in ``train_df`` are added and filled with 0; columns that only
    exist in ``valid_df`` are discarded.
    """
    reference_columns = train_df.columns
    return valid_df.reindex(columns=reference_columns, fill_value=0)
+
+
# Make the validation and test frames carry exactly the training columns
X_valid = align_features(X_train, X_valid)
X_test = align_features(X_train, X_test)

# 3) Train the model
model_l = []  # list[tuple[model, predict_func]]
for f in sorted(DIRNAME.glob("model*.py")):
    # TODO put select() in model.py: fit(X_train, y_train, X_valid, y_valid)
    # ``f.stem`` removes only the ".py" suffix; ``str.strip(".py")`` strips characters and
    # mangles module names that end in '.', 'p' or 'y'.
    m = __import__(f.stem)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

if not model_l:
    # Fail early with a clear message instead of averaging an empty list of predictions
    raise RuntimeError(f"No model*.py file found in {DIRNAME}")

# Evaluate the model on the validation set
y_valid_pred_l = [predict_func(model, X_valid) for model, predict_func in model_l]

# Ensemble
# TODO: ensemble method in a script
# Average the predictions and apply a threshold to determine class labels
y_valid_pred = np.mean(y_valid_pred_l, axis=0)
y_valid_pred = (y_valid_pred > 0.5).astype(int)

mcc = compute_metrics_for_classification(y_valid, y_valid_pred)
print("Final on validation set: ", mcc)

# Save the validation MCC next to this script (was an absolute path on the author's machine)
pd.Series(data=[mcc], index=["MCC"]).to_csv(DIRNAME / "submission_score.csv")

# Make predictions on the test set with every model
y_test_pred_bool_l = [m_pred(model, X_test).astype(int) for model, m_pred in model_l]

# Majority vote over the models
y_test_pred = np.mean(y_test_pred_bool_l, axis=0)
y_test_pred = (y_test_pred > 0.5).astype(int)  # TODO Make it a module. Ensemble prediction

y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)  # Map the integers back to 'e' or 'p'
submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})

# submit predictions for the test set
submission_result.to_csv("./submission.csv", index=False)
diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -4,7 +4,7 @@ kg_description_template:
 
  user: |-
  Based on the following competition description, please extract the following details:
- 1. Competition Type
+ 1. Competition Type 
  2. Competition Description
  3. Target Description
  4. Competition Features