From ce540c7d20e58d42d51019d516fb40771ece498f Mon Sep 17 00:00:00 2001 From: Sanket Nikam Date: Wed, 25 Oct 2023 18:06:58 +0530 Subject: [PATCH 1/7] Added Gradient Boosting Classifier --- .../gradient_boosting_classifier.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 machine_learning/gradient_boosting_classifier.py diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py new file mode 100644 index 000000000000..2776f717d0a4 --- /dev/null +++ b/machine_learning/gradient_boosting_classifier.py @@ -0,0 +1,95 @@ + +import numpy as np +from sklearn.datasets import load_iris +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split +from sklearn.tree import DecisionTreeRegressor + + +class GradientBoostingClassifier: + def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: + """ + Initialize a GradientBoostingClassifier. + + Parameters: + - n_estimators (int): The number of weak learners to train. + - learning_rate (float): The learning rate for updating the model. + + Attributes: + - n_estimators (int): The number of weak learners. + - learning_rate (float): The learning rate. + - models (list): A list to store the trained weak learners. + """ + self.n_estimators = n_estimators + self.learning_rate = learning_rate + self.models: list[tuple[DecisionTreeRegressor, float]] = [] + + def fit(self, x: np.ndarray, y: np.ndarray) -> None: + """ + Fit the GradientBoostingClassifier to the training data. + + Parameters: + - x (np.ndarray): The training features. + - y (np.ndarray): The target values. + + Returns: + None + """ + for _ in range(self.n_estimators): + # Calculate the pseudo-residuals + residuals = -self.gradient(y, self.predict(x)) + # Fit a weak learner (e.g., decision tree) to the residuals + model = DecisionTreeRegressor(max_depth=1) + model.fit(x, residuals) + # Update the model by adding the weak learner with a learning rate + self.models.append((model, self.learning_rate)) + + def predict(self, x: np.ndarray) -> np.ndarray: + """ + Make predictions on input data. + + Parameters: + - x (np.ndarray): The input data for making predictions. + + Returns: + - np.ndarray: An array of binary predictions (-1 or 1). + """ + # Initialize predictions with zeros + predictions = np.zeros(x.shape[0]) + for model, learning_rate in self.models: + predictions += learning_rate * model.predict(x) + return np.sign(predictions) # Convert to binary predictions (-1 or 1) + + def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.ndarray: + """ + Calculate the negative gradient (pseudo-residuals) for logistic loss. + + Parameters: + - y (np.ndarray): The target values. + - y_pred (np.ndarray): The predicted values. + + Returns: + - np.ndarray: An array of pseudo-residuals. + """ + return -y / (1 + np.exp(y * y_pred)) + + +if __name__ == "__main__": + iris = load_iris() + X, y = iris.data, iris.target + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) + clf.fit(X_train, y_train) + + y_pred = clf.predict(X_test) + accuracy = accuracy_score(y_test, y_pred) + print(f"Accuracy: {accuracy:.2f}") + +# Perform some calculations in doctests +if __name__ == "__main__": + import doctest + + doctest.testmod() From 171a42c780a766ec8d14c30fa18021fad7e31d5d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 12:41:16 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/gradient_boosting_classifier.py | 1 - 1 file changed, 1 deletion(-) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index 2776f717d0a4..92da6b159b30 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -1,4 +1,3 @@ - import numpy as np from sklearn.datasets import load_iris from sklearn.metrics import accuracy_score From ec6ed8cb4c097f423fdad8e0e78473536de85341 Mon Sep 17 00:00:00 2001 From: Sanket Nikam <77570082+SannketNikam@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:31:27 +0530 Subject: [PATCH 3/7] Update gradient_boosting_classifier.py --- machine_learning/gradient_boosting_classifier.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index 92da6b159b30..92652d5fdc27 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -75,15 +75,15 @@ def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.ndarray: if __name__ == "__main__": iris = load_iris() - X, y = iris.data, iris.target - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, random_state=42 + x, y = iris.data, iris.target + x_train, x_test, y_train, y_test = train_test_split( + x, y, test_size=0.2, random_state=42 ) clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) - clf.fit(X_train, y_train) + clf.fit(x_train, y_train) - y_pred = clf.predict(X_test) + y_pred = clf.predict(x_test) accuracy = accuracy_score(y_test, y_pred) print(f"Accuracy: {accuracy:.2f}") From 0cbc5a51d968888e1f025673f35530b93dcd2e1c Mon Sep 17 00:00:00 2001 From: Sanket Nikam <77570082+SannketNikam@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:40:33 +0530 Subject: [PATCH 4/7] Update gradient_boosting_classifier.py --- .../gradient_boosting_classifier.py | 72 ++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index 92652d5fdc27..55e3943be7d4 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -21,74 +21,98 @@ def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: """ self.n_estimators = n_estimators self.learning_rate = learning_rate - self.models: list[tuple[DecisionTreeRegressor, float]] = [] + self.models = [] - def fit(self, x: np.ndarray, y: np.ndarray) -> None: + def fit(self, features: np.ndarray, target: np.ndarray) -> None: """ Fit the GradientBoostingClassifier to the training data. Parameters: - - x (np.ndarray): The training features. - - y (np.ndarray): The target values. + - features (np.ndarray): The training features. + - target (np.ndarray): The target values. Returns: None + + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) + >>> iris = load_iris() + >>> X, y = iris.data, iris.target + >>> clf.fit(X, y) + >>> # Check if the model is trained + >>> len(clf.models) == 100 + True """ for _ in range(self.n_estimators): # Calculate the pseudo-residuals - residuals = -self.gradient(y, self.predict(x)) + residuals = -self.gradient(target, self.predict(features)) # Fit a weak learner (e.g., decision tree) to the residuals model = DecisionTreeRegressor(max_depth=1) - model.fit(x, residuals) + model.fit(features, residuals) # Update the model by adding the weak learner with a learning rate self.models.append((model, self.learning_rate)) - def predict(self, x: np.ndarray) -> np.ndarray: + def predict(self, features: np.ndarray) -> np.ndarray: """ Make predictions on input data. Parameters: - - x (np.ndarray): The input data for making predictions. + - features (np.ndarray): The input data for making predictions. Returns: - np.ndarray: An array of binary predictions (-1 or 1). + + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) + >>> iris = load_iris() + >>> X, y = iris.data, iris.target + >>> clf.fit(X, y) + >>> y_pred = clf.predict(X) + >>> # Check if the predictions have the correct shape + >>> y_pred.shape == y.shape + True """ # Initialize predictions with zeros - predictions = np.zeros(x.shape[0]) + predictions = np.zeros(features.shape[0]) for model, learning_rate in self.models: - predictions += learning_rate * model.predict(x) + predictions += learning_rate * model.predict(features) return np.sign(predictions) # Convert to binary predictions (-1 or 1) - def gradient(self, y: np.ndarray, y_pred: np.ndarray) -> np.ndarray: + def gradient(self, target: np.ndarray, y_pred: np.ndarray) -> np.ndarray: """ Calculate the negative gradient (pseudo-residuals) for logistic loss. Parameters: - - y (np.ndarray): The target values. + - target (np.ndarray): The target values. - y_pred (np.ndarray): The predicted values. Returns: - np.ndarray: An array of pseudo-residuals. + + >>> import numpy as np + >>> clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) + >>> target = np.array([0, 1, 0, 1]) + >>> y_pred = np.array([0.2, 0.8, 0.3, 0.7]) + >>> residuals = clf.gradient(target, y_pred) + >>> # Check if residuals have the correct shape + >>> residuals.shape == target.shape + True """ - return -y / (1 + np.exp(y * y_pred)) + return -target / (1 + np.exp(target * y_pred)) if __name__ == "__main__": iris = load_iris() - x, y = iris.data, iris.target - x_train, x_test, y_train, y_test = train_test_split( - x, y, test_size=0.2, random_state=42 + X, y = iris.data, iris.target + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 ) clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1) - clf.fit(x_train, y_train) + clf.fit(X_train, y_train) - y_pred = clf.predict(x_test) + y_pred = clf.predict(X_test) accuracy = accuracy_score(y_test, y_pred) print(f"Accuracy: {accuracy:.2f}") - -# Perform some calculations in doctests -if __name__ == "__main__": - import doctest - - doctest.testmod() From 6661e12931335508402bc762ae3814adf5a117fd Mon Sep 17 00:00:00 2001 From: Sanket Nikam <77570082+SannketNikam@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:55:27 +0530 Subject: [PATCH 5/7] Update gradient_boosting_classifier.py --- machine_learning/gradient_boosting_classifier.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index 55e3943be7d4..c54ceb3eef0a 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -3,6 +3,7 @@ from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor +from typing import List, Tuple class GradientBoostingClassifier: @@ -21,7 +22,7 @@ def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: """ self.n_estimators = n_estimators self.learning_rate = learning_rate - self.models = [] + self.models: List[Tuple[DecisionTreeRegressor, float]] = [] def fit(self, features: np.ndarray, target: np.ndarray) -> None: """ From 63bcd54f4d4d5dfb3a382abae478cf39c73d5094 Mon Sep 17 00:00:00 2001 From: Sanket Nikam <77570082+SannketNikam@users.noreply.github.com> Date: Wed, 25 Oct 2023 19:01:59 +0530 Subject: [PATCH 6/7] Update gradient_boosting_classifier.py --- machine_learning/gradient_boosting_classifier.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index c54ceb3eef0a..70967b541002 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -3,8 +3,6 @@ from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor -from typing import List, Tuple - class GradientBoostingClassifier: def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: @@ -22,7 +20,7 @@ def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: """ self.n_estimators = n_estimators self.learning_rate = learning_rate - self.models: List[Tuple[DecisionTreeRegressor, float]] = [] + self.models: list[tuple[DecisionTreeRegressor, float]] = [] def fit(self, features: np.ndarray, target: np.ndarray) -> None: """ From 3af39b5ee75df7439dd936212d9077bde9d9cf96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 25 Oct 2023 13:32:43 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/gradient_boosting_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/gradient_boosting_classifier.py b/machine_learning/gradient_boosting_classifier.py index 70967b541002..2902394d8226 100644 --- a/machine_learning/gradient_boosting_classifier.py +++ b/machine_learning/gradient_boosting_classifier.py @@ -4,6 +4,7 @@ from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor + class GradientBoostingClassifier: def __init__(self, n_estimators: int = 100, learning_rate: float = 0.1) -> None: """