Skip to content

Commit

Permalink
feat: piloting of the framework (#227)
Browse files Browse the repository at this point in the history
* Initial template

* Updated a bit on the framework

* Revised to enable feat training

* Fixed Accuracy

* fixed mcc

* change some files for feature

* update some logics in feat eng

* update some codes

* update some codes

* Revised further to successfully complete a cycle

* Revised further

* Delete two CSVs

* fix some ci errors

---------

Co-authored-by: Young <afe.young@gmail.com>
Co-authored-by: Xisen-Wang <xisen_application@163.com>
  • Loading branch information
3 people authored Aug 27, 2024
1 parent bf2684c commit e9b103e
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 1 deletion.
6 changes: 6 additions & 0 deletions rdagent/scenarios/kaggle/experiment/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@

# Meta template
It is an example of how we organize the workspace of a competition.
We expect all the competitions to align with it so the knowledge in modules (model, feature) can transfer.

Ideally, the initial template would be generated by an LLM; currently, however, it is written by hand.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


def preprocess(X: pd.DataFrame):
    """
    Impute missing values and one-hot encode the categorical columns of X.

    We want the X_train & X_test & X_valid to contain the same number of columns & maintain feature consistency.

    Only columns with dtype int64/float64 (numerical) or object (categorical) are handed to the
    transformers. The preprocessor is fitted on the very frame it transforms, so the one-hot
    columns depend on the categories present in ``X``; frames preprocessed separately may end
    up with different columns.

    Parameters
    ----------
    X : pd.DataFrame
        Raw feature frame.

    Returns
    -------
    pd.DataFrame
        Transformed features indexed like ``X``: one-hot columns first, then the numerical
        columns in their original order.
    """
    # Identify numerical and categorical features
    numerical_cols = [cname for cname in X.columns if X[cname].dtype in ["int64", "float64"]]
    categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

    # Define preprocessors for numerical and categorical features
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_cols),
            ("num", numerical_transformer, numerical_cols),
        ]
    )

    # Fit the preprocessor on the data and transform it
    preprocessor.fit(X)  # TODO depend on its input shape
    X_array = preprocessor.transform(X)
    # The transformer only yields a sparse matrix when the stacked output is sparse enough;
    # otherwise it is already a dense ndarray, which has no ``toarray`` method.
    if hasattr(X_array, "toarray"):
        X_array = X_array.toarray()

    # Get feature names for the columns in the transformed data
    if categorical_cols:
        onehot_names = (
            preprocessor.named_transformers_["cat"]["onehot"].get_feature_names_out(categorical_cols).tolist()
        )
    else:
        # Without categorical columns the "cat" pipeline is never fitted, so it has no names to report.
        onehot_names = []
    feature_names = onehot_names + numerical_cols

    # Convert arrays back to DataFrames
    X_transformed = pd.DataFrame(X_array, columns=feature_names, index=X.index)

    return X_transformed
13 changes: 13 additions & 0 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/feat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with the function name specified as feat_eng.
The file name should start with feat_, followed by the specific task name.
"""


def feat_eng(X: pd.DataFrame):
    """
    Template feature-engineering hook: hand back the input features untouched.
    """
    selected_features = X
    return selected_features
36 changes: 36 additions & 0 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
motivation of the model
"""
import pandas as pd
import xgboost as xgb


def select(X):
    """
    Feature-selection hook meant to be shared by fit & predict.

    The template keeps every feature, so the input is returned as-is.
    """
    chosen = X
    return chosen


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train an XGBoost booster on the training split and return it.

    Both splits are passed to the trainer as evaluation sets, named "train" and "eval".
    """
    # TODO: for quick running....
    booster_params = {}
    boosting_rounds = 2

    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(train_matrix, "train"), (valid_matrix, "eval")]

    return xgb.train(booster_params, train_matrix, num_boost_round=boosting_rounds, evals=watchlist)


def predict(model, X):
    """
    Score X with the trained model and return boolean predictions.

    Keep feature select's consistency with fit.
    """
    scores = model.predict(xgb.DMatrix(X))
    # A score strictly above 0.5 counts as the positive class.
    return scores > 0.5
125 changes: 125 additions & 0 deletions rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from fea_share_preprocess import preprocess
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from rdagent.scenarios.kaggle.experiment.meta_tpl.fea_share_preprocess import preprocess

# Set random seed for reproducibility
# SEED seeds both the stdlib and NumPy generators and is reused as random_state for the train/valid split.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Directory containing this script; the feat*.py and model*.py modules are discovered by globbing it.
DIRNAME = Path(__file__).absolute().resolve().parent


# support various method for metrics calculation
# The accuracy helper used to share its name with the MCC helper below and was silently
# overwritten by it; it now has its own name so both metrics stay callable.
def compute_accuracy_for_classification(y_true, y_pred):
    """Compute accuracy metric for classification.

    Parameters: ``y_true`` are the ground-truth labels, ``y_pred`` the predicted labels.
    Returns the value produced by ``accuracy_score``.
    """
    return accuracy_score(y_true, y_pred)


def compute_metrics_for_classification(y_true, y_pred):
    """Compute MCC (Matthews correlation coefficient) for classification.

    Parameters: ``y_true`` are the ground-truth labels, ``y_pred`` the predicted labels.
    Returns the value produced by ``matthews_corrcoef``.
    """
    return matthews_corrcoef(y_true, y_pred)


# Load and preprocess the data
# NOTE(review): machine-specific location kept from the original script; point DATA_DIR at
# your local copy of the competition data before running elsewhere.
DATA_DIR = Path("/home/v-xisenwang/git_ignore_folder/data/playground-series-s4e8")

data_df = pd.read_csv(DATA_DIR / "train.csv")
data_df = data_df.drop(["id"], axis=1)

X = data_df.drop(["class"], axis=1)
y = data_df["class"]  # 1-D labels: LabelEncoder expects a vector, not a single-column frame

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)  # Convert the class labels to integers
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, random_state=SEED)

# 1) Preprocess the data
# Each split is preprocessed on its own, so the resulting column sets can differ between splits.
X_train = preprocess(X_train)
X_valid = preprocess(X_valid)

submission_df = pd.read_csv(DATA_DIR / "test.csv")
passenger_ids = submission_df["id"]
submission_df = submission_df.drop(["id"], axis=1)
X_test = preprocess(submission_df)

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []
# Sorted so the column order of the concatenated features is the same on every run.
for f in sorted(DIRNAME.glob("feat*.py")):
    # Path.stem drops only the ".py" suffix; str.strip(".py") would also eat leading/trailing
    # '.', 'p' and 'y' characters of the module name (e.g. "feat_happy.py" -> "feat_ha").
    m = __import__(f.stem)
    # Every feature module works on the same preprocessed frames; its output is collected
    # separately instead of being fed into the next module.
    X_train_l.append(m.feat_eng(X_train))
    X_valid_l.append(m.feat_eng(X_valid))
    X_test_l.append(m.feat_eng(X_test))

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)


def align_features(train_df, valid_df):
    """Return valid_df reshaped to carry exactly the columns of train_df, in the same order.

    Columns present only in valid_df are dropped; columns it lacks are created and filled with 0.
    """
    reference_columns = train_df.columns
    aligned = valid_df.reindex(columns=reference_columns, fill_value=0)
    return aligned


# Give the validation and test frames exactly the training columns before modelling.
X_valid = align_features(X_train, X_valid)
X_test = align_features(X_train, X_test)

# 3) Train the model
model_l = []  # list[tuple[model, predict_func,]]
# Sorted so the models are trained in the same order on every run.
for f in sorted(DIRNAME.glob("model*.py")):
    # TODO put select() in model.py: fit(X_train, y_train, X_valid, y_valid)
    # Path.stem drops only the ".py" suffix; str.strip(".py") would also eat leading/trailing
    # '.', 'p' and 'y' characters of the module name.
    m = __import__(f.stem)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# Evaluate the model on the validation set
y_valid_pred_l = [predict_func(model, X_valid) for model, predict_func in model_l]

# Ensemble
# TODO: ensemble method in a script
# Average the predictions and apply a threshold to determine class labels
y_valid_pred = np.mean(y_valid_pred_l, axis=0)
y_valid_pred = (y_valid_pred > 0.5).astype(int)

mcc = compute_metrics_for_classification(y_valid, y_valid_pred)
print("Final on validation set: ", mcc)

# Save the validation MCC next to this script rather than under a hard-coded home directory
pd.Series(data=[mcc], index=["MCC"]).to_csv(DIRNAME / "submission_score.csv")

# Make predictions on the test set and save them
y_test_pred_bool_l = [predict_func(model, X_test).astype(int) for model, predict_func in model_l]

# Ensemble: average the per-model 0/1 predictions and threshold the mean
y_test_pred = np.mean(y_test_pred_bool_l, axis=0)
y_test_pred = (y_test_pred > 0.5).astype(int)  # TODO Make it a module. Ensemble prediction

y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)  # Map the integers back to 'e' or 'p'
submission_result = pd.DataFrame({"id": passenger_ids, "class": y_test_pred_labels})

# submit predictions for the test set
submission_result.to_csv("./submission.csv", index=False)
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/experiment/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ kg_description_template:
user: |-
Based on the following competition description, please extract the following details:
1. Competition Type
1. Competition Type
2. Competition Description
3. Target Description
4. Competition Features
Expand Down

0 comments on commit e9b103e

Please sign in to comment.