Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalizing columns which have higher range values compared to other … #3

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions scripts/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
import sys

from scripts.regression import sum_squared_error, y_estimate
from scripts.local_ancillary import gather_local_stats, add_site_covariates, get_cost
from scripts.utils import list_recursive, log
import scripts.local_ancillary as lc

def local_0(args):
input_list = args["input"]
Expand All @@ -34,6 +34,8 @@ def local_0(args):
X = cf[x_headers]
y = df[y_headers]

columns_to_normalize = lc.check_cols_to_normalize(X)

tol = input_list["tol"]
eta = input_list["eta"]

Expand All @@ -49,7 +51,8 @@ def local_0(args):
"x_headers": x_headers,
"y_headers": y_headers,
"tol": tol,
"eta": eta
"eta": eta,
"columns_to_normalize": columns_to_normalize,
},
"cache": cache_dict
}
Expand All @@ -66,14 +69,18 @@ def local_1(args):
local statistics to the remote site"""

X = pd.read_json(args["cache"]["covariates"], orient='records')
X = lc.normalize_columns(X, input_list["columns_to_normalize"])

log(f'\n\nNormalizing the following column values to their z-scores: {input_list["columns_to_normalize"]} \n ', args['state'])

y = pd.read_json(args["cache"]["dependents"], orient='records')
y_labels = list(y.columns)

site = args['state']['clientId']

beta_vector, local_stats_list, meanY_vector, lenY_vector, site = gather_local_stats(X, y, site)
beta_vector, local_stats_list, meanY_vector, lenY_vector, site = lc.gather_local_stats(X, y, site)

augmented_X = add_site_covariates(args, X)
augmented_X = lc.add_site_covariates(args, X)

beta_vec_size = augmented_X.shape[1]

Expand Down Expand Up @@ -136,7 +143,7 @@ def local_2(args):
if not mask_flag[i]:
gradient[i, :] = (
1 / len(X)) * np.dot(biased_X.T, np.dot(biased_X, w_) - y_)
cost[i] = get_cost(y_actual=y[i], y_predicted=np.dot(biased_X, w_))
cost[i] = lc.get_cost(y_actual=y[i], y_predicted=np.dot(biased_X, w_))

output_dict = {
"local_grad": gradient.tolist(),
Expand Down
18 changes: 18 additions & 0 deletions scripts/local_ancillary.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,21 @@ def add_site_covariates(args, X):

def get_cost(y_actual, y_predicted):
    """Return the mean squared error between actual and predicted values."""
    residual = y_actual - y_predicted
    return np.mean(residual ** 2)


def check_cols_to_normalize(X):
    """Identify columns of *X* whose scale dwarfs the other columns.

    A column is flagged when the ratio of its maximum value to the smallest
    nonzero column maximum exceeds 10,000; flagged columns are candidates
    for z-score normalization before regression.

    Args:
        X: pandas DataFrame of covariates (assumed numeric — columns must
           support ``max``; TODO confirm callers never pass object dtype).

    Returns:
        List of column names (possibly empty) to normalize.
    """
    max_vals = X.max(axis=0).to_numpy()
    nonzero_maxes = max_vals[np.nonzero(max_vals)]
    # Guard: with no nonzero column maximum there is no scale reference and
    # np.min over an empty selection would raise ValueError — nothing to flag.
    if nonzero_maxes.size == 0:
        return []
    minval = np.min(nonzero_maxes)
    ranges = max_vals / minval
    flagged_indices = np.where(ranges > 10000)[0]
    headers = list(X.columns)
    return [headers[i] for i in flagged_indices]

def normalize_columns(data_df, cols):
    """Z-score the listed columns of *data_df* in place.

    Each listed column is replaced by (value - mean) / std, using the
    pandas sample standard deviation (ddof=1). The DataFrame is mutated
    and also returned for convenience.

    NOTE(review): a constant column has std 0 and would yield NaN/inf
    values — presumably callers only pass varying columns; verify.
    """
    for column in cols:
        series = data_df[column]
        center = series.mean()
        spread = series.std()
        data_df[column] = (series - center) / spread
    return data_df
16 changes: 10 additions & 6 deletions scripts/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,22 @@


def remote_0(args):
#log(args, args['state'])
# log(args, args['state'])

input_list = args["input"]

site_ids = list(input_list.keys())
userID = list(input_list)[0]
site_ids = sorted(list(input_list.keys()))
userID = list(site_ids)[0]

site_covar_list = [
'{}_{}'.format('site', label) for index, label in enumerate(site_ids)
if index
]

columns_to_normalize = set()
for userID in site_ids:
columns_to_normalize.update(columns_to_normalize.union(input_list[userID]["columns_to_normalize"]))

X_labels = input_list[userID]["x_headers"]
y_labels = input_list[userID]["y_headers"]

Expand All @@ -37,6 +41,7 @@ def remote_0(args):

output_dict = {
"site_covar_list": site_covar_list,
"columns_to_normalize": list(columns_to_normalize),
"computation_phase": "remote_0"
}

Expand All @@ -52,11 +57,10 @@ def remote_0(args):
"cache": cache_dict,
}

#log(args, args['state'])
# log(args, args['state'])

return computation_output_dict


def remote_1(args):
#log(args, args['state'])
"""Need this function for performing multi-shot regression"""
Expand Down