From bf2c7ee2022a575a90f3a90cf7e63500ca77e670 Mon Sep 17 00:00:00 2001
From: Sunitha Basodi
Date: Wed, 4 Sep 2024 12:04:58 +0000
Subject: [PATCH] Normalizing columns which have higher range values compared
 to other columns

---
Reviewer notes (everything between the "---" line and the first "diff --git"
line is discarded by `git am`):

* This patch had lost its line breaks. Line structure, indentation and the
  position of blank context lines were rebuilt from the hunk line counts and
  may not match the target tree exactly; run `git apply --check` first.
* check_cols_to_normalize() no longer raises when every column maximum is 0,
  and takes the ratio threshold as a parameter (default 10000, as before).
* normalize_columns() only centres a column whose std is 0 or NaN instead of
  dividing by it.
* remote_0() no longer reuses `userID` as the loop variable, so the headers
  are still read from the first (sorted) site; the merged column list is
  emitted in sorted order.
* The local_ancillary.py and remote.py hunks were edited for the points
  above, so the post-image blob ids on their "index" lines are stale.

 scripts/local.py           | 17 ++++++++++++-----
 scripts/local_ancillary.py | 28 ++++++++++++++++++++++++++++
 scripts/remote.py          | 16 ++++++++++------
 3 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/scripts/local.py b/scripts/local.py
index 17a8023..9feb96f 100644
--- a/scripts/local.py
+++ b/scripts/local.py
@@ -12,8 +12,8 @@ import sys
 
 
 from scripts.regression import sum_squared_error, y_estimate
-from scripts.local_ancillary import gather_local_stats, add_site_covariates, get_cost
 from scripts.utils import list_recursive, log
+import scripts.local_ancillary as lc
 
 def local_0(args):
     input_list = args["input"]
@@ -34,6 +34,8 @@ def local_0(args):
     X = cf[x_headers]
     y = df[y_headers]
 
+    columns_to_normalize = lc.check_cols_to_normalize(X)
+
     tol = input_list["tol"]
     eta = input_list["eta"]
 
@@ -49,7 +51,8 @@ def local_0(args):
             "x_headers": x_headers,
             "y_headers": y_headers,
             "tol": tol,
-            "eta": eta
+            "eta": eta,
+            "columns_to_normalize": columns_to_normalize,
         },
         "cache": cache_dict
     }
@@ -66,14 +69,18 @@ def local_1(args):
     local statistics to the remote site"""
 
     X = pd.read_json(args["cache"]["covariates"], orient='records')
+    X = lc.normalize_columns(X, input_list["columns_to_normalize"])
+
+    log(f'\n\nNormalizing the following column values to their z-scores: {input_list["columns_to_normalize"]} \n ', args['state'])
+
     y = pd.read_json(args["cache"]["dependents"], orient='records')
     y_labels = list(y.columns)
 
     site = args['state']['clientId']
 
-    beta_vector, local_stats_list, meanY_vector, lenY_vector, site = gather_local_stats(X, y, site)
+    beta_vector, local_stats_list, meanY_vector, lenY_vector, site = lc.gather_local_stats(X, y, site)
 
-    augmented_X = add_site_covariates(args, X)
+    augmented_X = lc.add_site_covariates(args, X)
 
     beta_vec_size = augmented_X.shape[1]
 
@@ -136,7 +143,7 @@ def local_2(args):
         if not mask_flag[i]:
             gradient[i, :] = (
                 1 / len(X)) * np.dot(biased_X.T, np.dot(biased_X, w_) - y_)
-            cost[i] = get_cost(y_actual=y[i], y_predicted=np.dot(biased_X, w_))
+            cost[i] = lc.get_cost(y_actual=y[i], y_predicted=np.dot(biased_X, w_))
 
     output_dict = {
         "local_grad": gradient.tolist(),
diff --git a/scripts/local_ancillary.py b/scripts/local_ancillary.py
index e0e4849..38223f5 100644
--- a/scripts/local_ancillary.py
+++ b/scripts/local_ancillary.py
@@ -110,3 +110,31 @@ def add_site_covariates(args, X):
 
 def get_cost(y_actual, y_predicted):
     return np.average((y_actual-y_predicted)**2)
+
+
+def check_cols_to_normalize(X, threshold=10000):
+    """Return the headers of columns whose maximum dwarfs the smallest one.
+
+    A column is selected when its maximum is more than ``threshold`` times
+    the smallest non-zero column maximum found in ``X``.
+    """
+    max_vals = X.max(axis=0).to_numpy()
+    nonzero_max_vals = max_vals[np.nonzero(max_vals)]
+    # Every column maximum is zero (or X has no columns): nothing to scale.
+    if nonzero_max_vals.size == 0:
+        return []
+
+    ranges = max_vals / np.min(nonzero_max_vals)
+    return [col for col, ratio in zip(X.columns, ranges) if ratio > threshold]
+
+
+def normalize_columns(data_df, cols):
+    """Replace the given columns of ``data_df`` with their z-scores."""
+    for col in cols:
+        std = data_df[col].std()
+        # A constant or single-row column has a zero/NaN std; only centre it
+        # so it does not turn into NaN/inf.
+        if not std > 0:
+            std = 1.0
+        data_df[col] = (data_df[col] - data_df[col].mean()) / std
+    return data_df
diff --git a/scripts/remote.py b/scripts/remote.py
index b40a8c4..c9c3005 100644
--- a/scripts/remote.py
+++ b/scripts/remote.py
@@ -17,18 +17,22 @@
 
 
 def remote_0(args):
-    #log(args, args['state'])
+    # log(args, args['state'])
 
     input_list = args["input"]
 
-    site_ids = list(input_list.keys())
-    userID = list(input_list)[0]
-
+    site_ids = sorted(input_list)
+    userID = site_ids[0]
+
     site_covar_list = [
         '{}_{}'.format('site', label)
         for index, label in enumerate(site_ids) if index
     ]
 
+    columns_to_normalize = set()
+    for site_id in site_ids:
+        columns_to_normalize.update(input_list[site_id]["columns_to_normalize"])
+
     X_labels = input_list[userID]["x_headers"]
     y_labels = input_list[userID]["y_headers"]
 
@@ -37,6 +41,7 @@ def remote_0(args):
 
     output_dict = {
         "site_covar_list": site_covar_list,
+        "columns_to_normalize": sorted(columns_to_normalize),
         "computation_phase": "remote_0"
     }
 
@@ -52,11 +57,10 @@ def remote_0(args):
         "cache": cache_dict,
     }
 
-    #log(args, args['state'])
+    # log(args, args['state'])
 
     return computation_output_dict
 
-
 def remote_1(args):
     #log(args, args['state'])
     """Need this function for performing multi-shot regression"""