Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add random_seed parameter #28

Merged
Merged 3 commits (source and target branch names not shown)
May 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 50 additions & 7 deletions smogn/over_sampling.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,12 +12,12 @@
def over_sampling(

## arguments / inputs
data, ## training set
index, ## index of input data
perc, ## over / under sampling
pert, ## perturbation / noise percentage
k ## num of neighs for over-sampling

data, ## training set
index, ## index of input data
perc, ## over / under sampling
pert, ## perturbation / noise percentage
k, ## num of neighs for over-sampling
seed = None ## random seed for sampling (pos int or None)
):

"""
Expand Down Expand Up @@ -221,6 +221,10 @@ def over_sampling(
## total number of new synthetic observations to generate
n_synth = int(n * (perc - 1 - x_synth))

## set random seed
if seed:
np.random.seed(seed = seed)

## randomly index data by the number of new synthetic observations
r_index = np.random.choice(
a = tuple(range(0, n)),
Expand All @@ -241,6 +245,10 @@ def over_sampling(

for j in range(x_synth):

## set random seed
if seed:
np.random.seed(seed = seed)

## randomly select a k nearest neighbor
neigh = int(np.random.choice(
a = tuple(range(k)),
Expand All @@ -249,6 +257,10 @@ def over_sampling(
## conduct synthetic minority over-sampling
## technique for regression (smoter)
if neigh in safe_list:
## set random seed
if seed:
rd.seed(a = seed)

diffs = data.iloc[
knn_matrix[i, neigh], 0:(d - 1)] - data.iloc[
i, 0:(d - 1)]
Expand All @@ -258,6 +270,10 @@ def over_sampling(
## randomly assign nominal / categorical features from
## observed cases and selected neighbors
for x in feat_list_nom:
## set random seed
if seed:
rd.seed(a = seed)

synth_matrix[i * x_synth + j, x] = [data.iloc[
knn_matrix[i, neigh], x], data.iloc[
i, x]][round(rd.random())]
Expand Down Expand Up @@ -304,6 +320,10 @@ def over_sampling(
if pd.isna(data.iloc[i, x]):
synth_matrix[index_gaus, x] = None
else:
## set random seed
if seed:
np.random.seed(seed = seed)

synth_matrix[index_gaus, x] = data.iloc[
i, x] + float(np.random.normal(
loc = 0,
Expand All @@ -323,7 +343,10 @@ def over_sampling(
probs[z] = len(
np.where(data.iloc[
:, x] == data.iloc[:, x][z]))

## set random seed
if seed:
rd.seed(a = seed)

synth_matrix[index_gaus, x] = rd.choices(
population = data.iloc[:, x].unique(),
weights = probs,
Expand All @@ -338,6 +361,10 @@ def over_sampling(
safe_list = np.where(
dist_matrix[i, knn_matrix[i]] < max_dist[i])[0]

## set random seed
if seed:
np.random.seed(seed = seed)

## randomly select a k nearest neighbor
neigh = int(np.random.choice(
a = tuple(range(0, k)),
Expand All @@ -346,6 +373,10 @@ def over_sampling(
## conduct synthetic minority over-sampling
## technique for regression (smoter)
if neigh in safe_list:
## set random seed
if seed:
rd.seed(a = seed)

diffs = data.iloc[
knn_matrix[i, neigh], 0:(d - 1)] - data.iloc[i, 0:(d - 1)]
synth_matrix[x_synth * n + count, 0:(d - 1)] = data.iloc[
Expand All @@ -354,6 +385,10 @@ def over_sampling(
## randomly assign nominal / categorical features from
## observed cases and selected neighbors
for x in feat_list_nom:
## set random seed
if seed:
rd.seed(a = seed)

synth_matrix[x_synth * n + count, x] = [data.iloc[
knn_matrix[i, neigh], x], data.iloc[
i, x]][round(rd.random())]
Expand Down Expand Up @@ -395,6 +430,10 @@ def over_sampling(
if pd.isna(data.iloc[i, x]):
synth_matrix[x_synth * n + count, x] = None
else:
## set random seed
if seed:
np.random.seed(seed = seed)

synth_matrix[x_synth * n + count, x] = data.iloc[
i, x] + float(np.random.normal(
loc = 0,
Expand All @@ -413,6 +452,10 @@ def over_sampling(
data.iloc[:, x] == data.iloc[:, x][z])
)

## set random seed
if seed:
rd.seed(a = seed)

synth_matrix[
x_synth * n + count, x] = rd.choices(
population = data.iloc[:, x].unique(),
Expand Down
5 changes: 5 additions & 0 deletions smogn/smoter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,6 +20,7 @@ def smoter(
drop_na_col = True, ## auto drop columns with nan's (bool)
drop_na_row = True, ## auto drop rows with nan's (bool)
replace = False, ## sampling replacement (bool)
seed = None, ## seed for random sampling (pos int or None)

## phi relevance function arguments / inputs
rel_thres = 0.5, ## relevance threshold considered rare (pos real)
Expand Down Expand Up @@ -253,6 +254,10 @@ def smoter(
if under_samp is True:
if s_perc[i] < 1:

## set random seed
if seed:
np.random.seed(seed = seed)

## drop observations in training set
## considered 'normal' (not 'rare')
omit_index = np.random.choice(
Expand Down