-
-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: piloting of the framework (#227)
* Initial template * Updated a bit on the framework * Revised to enable feat training * Fixed Accuracy * fixed mcc * change some files for feature * update some logics in feat eng * update some codes * update some codes * Revised further to successfully completed a cycle * Revised further * Delete two CSVs * fix some ci errors --------- Co-authored-by: Young <afe.young@gmail.com> Co-authored-by: Xisen-Wang <xisen_application@163.com>
- Loading branch information
1 parent
bf2684c
commit e9b103e
Showing
6 changed files
with
228 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
|
||
# Meta template | ||
This is an example of how we organize the workspace for a competition. | ||
We expect all competitions to follow this layout so that the knowledge in the modules (model, feature) can transfer between them. | ||
|
||
Ideally, the initial template will be generated by an LLM; for now, it is written by hand. |
47 changes: 47 additions & 0 deletions
47
rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
import pandas as pd | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import OneHotEncoder | ||
|
||
|
||
def preprocess(X: pd.DataFrame):
    """
    Impute missing values and one-hot encode the categorical columns of ``X``.

    We want the X_train & X_test & X_valid to contain the same number of columns & maintain
    feature consistency. Note that the preprocessor is fitted on whatever frame is passed in,
    so frames with different category values produce different one-hot columns; callers that
    preprocess several frames separately still have to align their columns afterwards.

    Column handling:
      * dtype ``int64`` / ``float64`` -> numerical, missing values replaced by the column mean.
      * dtype ``object`` -> categorical, missing values replaced by the most frequent value,
        then one-hot encoded (``handle_unknown="ignore"``).
      * columns of any other dtype are not listed in either transformer.

    Parameters
    ----------
    X : pd.DataFrame
        Raw feature frame.

    Returns
    -------
    pd.DataFrame
        Dense frame with the same index as ``X``; one-hot columns come first, followed by the
        numerical columns.
    """
    # Identify numerical and categorical features
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Define preprocessors for numerical and categorical features
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Combine preprocessing steps.
    # sparse_threshold=0 forces a dense ndarray result. With the default threshold the output is
    # sparse only when its density is low enough, so an unconditional ``.toarray()`` call fails
    # with AttributeError whenever a dense array comes back.
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_cols),
            ("num", numerical_transformer, numerical_cols),
        ],
        sparse_threshold=0,
    )

    # Fit the preprocessor on the data and transform it
    X_array = preprocessor.fit_transform(X)  # TODO depend on its input shape

    # Get feature names for the columns in the transformed data.
    # The one-hot encoder is only fitted when there is at least one categorical column, so do
    # not query it otherwise.
    if categorical_cols:
        onehot = preprocessor.named_transformers_["cat"]["onehot"]
        cat_feature_names = onehot.get_feature_names_out(categorical_cols).tolist()
    else:
        cat_feature_names = []
    feature_names = cat_feature_names + numerical_cols

    # Convert arrays back to DataFrames
    X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index)

    return X_transformed
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import pandas as pd | ||
|
||
""" | ||
Here is the feature engineering code for each task, with the function name specified as feat_eng. | ||
The file name should start with feat_, followed by the specific task name. | ||
""" | ||
|
||
|
||
def feat_eng(X: pd.DataFrame):
    """
    Feature-engineering hook for this task.

    The template implementation is a pass-through: the frame it receives is handed back
    unchanged, i.e. every feature is selected.
    """
    selected = X
    return selected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
""" | ||
motivation of the model | ||
""" | ||
import pandas as pd | ||
import xgboost as xgb | ||
|
||
|
||
def select(X):
    """
    Select relevant features. To be used in fit & predict function.

    The template keeps every column, so ``X`` is returned unchanged.
    """
    return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Define and train the model.

    Both feature frames go through ``select`` first, so training, validation and later
    prediction all see the same feature subset.

    Parameters
    ----------
    X_train, X_valid : feature frames for training and validation.
    y_train, y_valid : labels matching ``X_train`` / ``X_valid``.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    dtrain = xgb.DMatrix(select(X_train), label=y_train)
    dvalid = xgb.DMatrix(select(X_valid), label=y_valid)

    # TODO: for quick running....
    # NOTE(review): empty params means xgboost's defaults are used; predict() thresholds the raw
    # output at 0.5, which presumes scores on a 0..1 scale -- confirm the intended objective.
    params = {}
    num_round = 2

    # Report the metric on both sets after every boosting round
    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst


def predict(model, X):
    """
    Keep feature select's consistency.

    Applies the same ``select`` used in ``fit`` and returns a boolean array that is True where
    the model output is greater than 0.5.
    """
    dtest = xgb.DMatrix(select(X))
    y_pred_prob = model.predict(dtest)
    return y_pred_prob > 0.5  # Apply threshold to get boolean predictions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import os | ||
import random | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import xgboost as xgb | ||
from fea_share_preprocess import preprocess | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.metrics import accuracy_score, matthews_corrcoef | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import LabelEncoder, OneHotEncoder | ||
|
||
from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess | ||
|
||
# Pin the Python and NumPy random number generators to one seed so runs are reproducible
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
# Directory that contains this script
DIRNAME = Path(__file__).absolute().resolve().parent
|
||
|
||
# support various method for metrics calculation
def compute_metrics_for_classification(y_true, y_pred, metric="mcc"):
    """
    Compute a classification metric.

    This function used to be defined twice under the same name (accuracy first, then MCC), so
    the accuracy version was silently shadowed. Both are now reachable through ``metric``; the
    default keeps the behavior callers actually got (MCC).

    Parameters
    ----------
    y_true : ground-truth labels.
    y_pred : predicted labels.
    metric : ``"mcc"`` (default) for the Matthews correlation coefficient or ``"accuracy"``
        for the accuracy score.

    Returns
    -------
    The requested score as computed by scikit-learn.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    if metric == "mcc":
        return matthews_corrcoef(y_true, y_pred)
    if metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    raise ValueError(f"Unsupported metric {metric!r}; expected 'mcc' or 'accuracy'.")
|
||
|
||
# Load and preprocess the data
# NOTE(review): the dataset paths below are absolute and specific to one machine; they are kept
# unchanged here -- consider making the data directory configurable.
data_df = pd.read_csv("/home/v-xisenwang/git_ignore_folder/data/playground-series-s4e8/train.csv")
data_df = data_df.drop(["id"], axis=1)

X = data_df.drop(["class"], axis=1)
# Take the target as a 1-D Series; LabelEncoder works on 1-D label arrays
y = data_df["class"]

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Convert the class labels to integers
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

# 1) Preprocess the data
# Each split is preprocessed (and therefore fitted) on its own, so the resulting columns can differ
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)

submission_df = pd.read_csv("/home/v-xisenwang/git_ignore_folder/data/playground-series-s4e8/test.csv")
passenger_ids = submission_df["id"]
submission_df = submission_df.drop(["id"], axis=1)
X_test = preprocess(submission_df)

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []
# sorted() makes the column order of the concatenation independent of directory listing order.
for f in sorted(DIRNAME.glob("feat*.py")):
    # Path.stem removes only the ".py" suffix. str.strip(".py") would strip any leading/trailing
    # '.', 'p' or 'y' characters and corrupt names such as "feat_happy.py".
    m = __import__(f.stem)
    # Every feature module receives the preprocessed frames, not the output of the previous
    # module, so the modules stay independent of each other.
    X_train_l.append(m.feat_eng(X_train))
    X_valid_l.append(m.feat_eng(X_valid))
    X_test_l.append(m.feat_eng(X_test))

# pd.concat raises on an empty list; keep the preprocessed frames when no feat*.py exists
if X_train_l:
    X_train = pd.concat(X_train_l, axis=1)
    X_valid = pd.concat(X_valid_l, axis=1)
    X_test = pd.concat(X_test_l, axis=1)
|
||
|
||
def align_features(train_df, valid_df):
    """
    Give ``valid_df`` exactly the columns of ``train_df``, in the same order.

    Columns missing from ``valid_df`` are created and filled with 0; columns that do not exist
    in ``train_df`` are dropped.
    """
    reference_columns = train_df.columns
    return valid_df.reindex(columns=reference_columns, fill_value=0)
|
||
|
||
# Make the validation and test frames carry exactly the training columns
X_valid = align_features(X_train, X_valid)
X_test = align_features(X_train, X_test)

# 3) Train the model
model_l = []  # list[tuple[model, predict_func,]]
# sorted() keeps the model order deterministic across runs.
for f in sorted(DIRNAME.glob("model*.py")):
    # TODO put select() in model.py: fit(X_train, y_train, X_valid, y_valid)
    # Path.stem removes only the ".py" suffix. str.strip(".py") would strip any leading/trailing
    # '.', 'p' or 'y' characters and corrupt some module names.
    m = __import__(f.stem)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# Fail early with a clear message instead of averaging an empty prediction list below
if not model_l:
    raise RuntimeError(f"No model*.py file found in {DIRNAME}; nothing to train.")

# Evaluate the model on the validation set
y_valid_pred_l = [predict_func(model, X_valid) for model, predict_func in model_l]

# Ensemble
# TODO: ensemble method in a script
# Average the predictions and apply a threshold to determine class labels
y_valid_pred = np.mean(y_valid_pred_l, axis=0)
y_valid_pred = (y_valid_pred > 0.5).astype(int)

mcc = compute_metrics_for_classification(y_valid, y_valid_pred)
print("Final on validation set: ", mcc)

# Save the validation score
# NOTE(review): absolute, machine-specific output path kept unchanged -- consider writing next to
# the script or into the working directory instead.
pd.Series(data=[mcc], index=["MCC"]).to_csv(
    "/home/v-xisenwang/RD-Agent/rdagent/scenarios/kaggle/experiment/meta_tpl/submission_score.csv"
)

# Make predictions on the test set and save them
# Every model votes with its 0/1 prediction; the votes are averaged and thresholded below
y_test_pred_bool_l = [m_pred(m, X_test).astype(int) for m, m_pred in model_l]

y_test_pred = np.mean(y_test_pred_bool_l, axis=0)
y_test_pred = (y_test_pred > 0.5).astype(int)  # TODO Make it a module. Ensemble prediction

y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)  # Map the integers back to 'e' or 'p'
submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})

# submit predictions for the test set
submission_result.to_csv("./submission.csv", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters