Commit

Merge branch 'main' into dependabot/pip/pygments-2.15.0
koaning authored Sep 19, 2023
2 parents 39fd742 + 3ed073d commit 23cf390
Showing 10 changed files with 30 additions and 31 deletions.
2 changes: 1 addition & 1 deletion doc/crossvalidation.ipynb
@@ -593,7 +593,7 @@
"source": [
"## GroupTimeSeriesSplit\n",
"\n",
"In a time series problem it is possible that not every time unit (e.g. years) has the same amount of rows/observations. This makes a normal kfold split inpractical as you cannot specify a certain timeframe per fold (e.g. 5 years), because this can cause the folds' sizes to be very different. With `GroupTimeSeriesSplit` you can specify the amount of folds you want (e.g. `n_splits=3`) and `GroupTimeSeriesSplit` will calculate itself folds in such a way that the amount of observations per fold are as similar as possible. <br>\n",
"In a time series problem it is possible that not every time unit (e.g. years) has the same amount of rows/observations. This makes a normal kfold split impractical as you cannot specify a certain timeframe per fold (e.g. 5 years), because this can cause the folds' sizes to be very different. With `GroupTimeSeriesSplit` you can specify the amount of folds you want (e.g. `n_splits=3`) and `GroupTimeSeriesSplit` will calculate itself folds in such a way that the amount of observations per fold are as similar as possible. <br>\n",
"\n",
"The folds are created with a smartly modified brute forced method. This still means that for higher `n_splits` values in combination with many different unique time periods (e.g. 100 different years, thus 100 groups) the generation of the optimal split points can take minutes to hours. `UserWarnings` are raised when `GroupTimeSeriesSplit` expects to be running over a minute. Of course, this actual runtime depends on your machine's specifications.\n",
"\n",
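For readers skimming this diff, a minimal usage sketch of `GroupTimeSeriesSplit` may help. It assumes the splitter follows the standard scikit-learn CV interface (`n_splits` in the constructor, groups passed to `split`); the data and group sizes below are made up for illustration.

```python
import numpy as np
from sklego.model_selection import GroupTimeSeriesSplit

# hypothetical data: 30 observations spread unevenly over 6 years
X = np.random.randn(30, 2)
y = np.random.randn(30)
groups = np.repeat([2015, 2016, 2017, 2018, 2019, 2020], [3, 8, 4, 6, 5, 4])

cv = GroupTimeSeriesSplit(n_splits=3)
for train_idx, test_idx in cv.split(X, y, groups=groups):
    # fold boundaries are placed so observation counts per fold stay as even as possible
    print(f"train={len(train_idx)} test={len(test_idx)}")
```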
12 changes: 6 additions & 6 deletions doc/linear-models.ipynb

Large diffs are not rendered by default.

15 changes: 7 additions & 8 deletions doc/meta.ipynb

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions doc/outliers.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions doc/rstudio.md
@@ -6,10 +6,10 @@ on how to build a proper scikit-learn gridsearch using reticulate so
we figured we might add a resource to our documentation here.

It should be said that we feel that the best developer experience
-is definately going to be in python but we figured it be helpful
+is definitely going to be in python but we figured it be helpful
to put a small example in our documentation.

-## Demo
+## Demo

You'll first need to install a dependency and set up a link to a
python virtualenv that has scikit-lego already installed.
@@ -127,7 +127,7 @@ ggplot(data=cv_df) +

![](_static/Rplot2.png)

-## Important
+## Important

Note that we're mainly trying to demonstrate the R api here. In terms of fairness you
would want to explore the dataset further before you say anything conclusive.
2 changes: 1 addition & 1 deletion readme.md
@@ -142,7 +142,7 @@ Here's a list of features that this library currently offers:
We want to be rather open here in what we accept but we do demand three
things before they become added to the project:

-1. any new feature contributes towards a demonstratable real-world usecase
+1. any new feature contributes towards a demonstrable real-world usecase
2. any new feature passes standard unit tests (we use the ones from scikit-learn)
3. the feature has been discussed in the issue list beforehand

2 changes: 1 addition & 1 deletion sklego/linear_model.py
@@ -491,7 +491,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs):

class BaseScipyMinimizeRegressor(BaseEstimator, RegressorMixin, ABC):
"""
-Base class for regressors relying on scipy's minimze method. Derive a class from this one and give it the function to be minimized.
+Base class for regressors relying on scipy's minimize method. Derive a class from this one and give it the function to be minimized.
Parameters
----------
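As a sketch of the pattern this docstring describes (not the class's actual internals): a derived regressor hands an objective to `scipy.optimize.minimize`, e.g. the least-absolute-deviation loss behind a subclass such as `LADRegression`. The data here is synthetic and for illustration only.

```python
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

def lad_loss(coef):
    # least absolute deviation: mean absolute residual for given coefficients
    return np.mean(np.abs(y - X @ coef))

result = minimize(lad_loss, x0=np.zeros(3), method="Nelder-Mead")
print(result.x)  # should land near [1.0, -2.0, 0.5]
```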
2 changes: 1 addition & 1 deletion sklego/meta/confusion_balancer.py
@@ -48,7 +48,7 @@ def fit(self, X, y):
X, y = check_X_y(X, y, estimator=self.estimator, dtype=FLOAT_DTYPES)
if not isinstance(self.estimator, ProbabilisticClassifier):
raise ValueError(
"The ConfusionBalancer meta model only works on classifcation models with .predict_proba."
"The ConfusionBalancer meta model only works on classification models with .predict_proba."
)
self.estimator.fit(X, y)
self.classes_ = unique_labels(y)
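To see the check above in action, a minimal fit sketch (assuming `ConfusionBalancer` takes the wrapped estimator as its first argument and an `alpha` mixing parameter; both are assumptions, not confirmed by this diff):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklego.meta import ConfusionBalancer

X, y = make_classification(n_samples=200, random_state=1)

# LogisticRegression exposes predict_proba, so the ProbabilisticClassifier
# check passes; wrapping a model without it raises the ValueError above
model = ConfusionBalancer(LogisticRegression(), alpha=0.5)
model.fit(X, y)
print(model.predict_proba(X[:3]))
```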
2 changes: 1 addition & 1 deletion sklego/meta/thresholder.py
@@ -18,7 +18,7 @@ class Thresholder(BaseEstimator, ClassifierMixin):
design the algorithm to only accept a certain class if the probability
for it is larger than, say, 90% instead of 50%.
-:param model: the moddel to threshold
+:param model: the model to threshold
:param threshold: the actual threshold to use
:param refit: if True, we will always retrain the model even if it is already fitted.
If False we only refit if the original model isn't fitted.
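The `:param` docs above suggest usage along these lines; a minimal sketch on a binary problem, with made-up data:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklego.meta import Thresholder

X, y = make_classification(n_samples=200, random_state=42)

# only assign the positive class when its probability exceeds 0.9;
# per the :param docs above, refit=False reuses an already fitted model
clf = Thresholder(LogisticRegression(), threshold=0.9)
clf.fit(X, y)
print(clf.predict(X[:10]))
```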
6 changes: 3 additions & 3 deletions sklego/model_selection.py
@@ -482,7 +482,7 @@ def _calc_first_and_last_split_index(self, X=None, y=None, groups=None):
)
init_ideal_group_size = self._ideal_group_size * 0.9

-# initalize the index of the first split, to reduce the amount of possible index split options
+# initialize the index of the first split, to reduce the amount of possible index split options
first_split_index = (
self._grouped_df.assign(
cumsum_obs=lambda df: df["observations"].cumsum()
@@ -496,7 +496,7 @@
.iloc[0]
.name
)
-# initalize the index of the last split point, to reduce the amount of possible index split options
+# initialize the index of the last split point, to reduce the amount of possible index split options
last_split_index = len(self._grouped_df) - (
self._grouped_df.assign(
observations=lambda df: df["observations"].values[::-1],
@@ -634,7 +634,7 @@ def _regroup(self, groups):
"""
Specifies in which group every observation belongs
-:param groups: orginal groups in array
+:param groups: original groups in array
:type: groups: np.array
:return: indices for the train and test splits of each fold
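The cumulative-sum bounding used in the first hunk can be illustrated on its own. A simplified sketch with made-up group sizes (not the real `_grouped_df`):

```python
import pandas as pd

# hypothetical per-group observation counts (one row per time unit, e.g. per year)
grouped_df = pd.DataFrame({"observations": [10, 12, 9, 11, 8]})
ideal_group_size = grouped_df["observations"].sum() / 3  # with, say, n_splits=3
init_ideal_group_size = ideal_group_size * 0.9

# earliest group whose cumulative count reaches ~90% of the ideal fold size;
# everything before it is ruled out as a first split point
first_split_index = (
    grouped_df.assign(cumsum_obs=lambda df: df["observations"].cumsum())
    .loc[lambda df: df["cumsum_obs"] >= init_ideal_group_size]
    .iloc[0]
    .name
)
print(first_split_index)  # -> 1 for these counts
```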
