From fc68c6cea692f3cb71c4d77615b52c77c26c5113 Mon Sep 17 00:00:00 2001 From: hyunjuna Date: Wed, 1 Feb 2023 10:30:00 -0800 Subject: [PATCH 1/2] Update validateDataset.py to make errors on uploads more user-friendly --- lab/pyutils/validateDataset copy.py | 286 ++++++++++++++++++ lab/pyutils/validateDataset.py | 132 +++++++- lab/webapp/src/components/FileUpload/index.js | 1 + 3 files changed, 416 insertions(+), 3 deletions(-) create mode 100644 lab/pyutils/validateDataset copy.py diff --git a/lab/pyutils/validateDataset copy.py b/lab/pyutils/validateDataset copy.py new file mode 100644 index 000000000..af089624b --- /dev/null +++ b/lab/pyutils/validateDataset copy.py @@ -0,0 +1,286 @@ +"""~This file is part of the Aliro library~ + +Copyright (C) 2023 Epistasis Lab, +Center for Artificial Intelligence Research and Education (CAIRE), +Department of Computational Biomedicine (CBM), +Cedars-Sinai Medical Center. + +Aliro is maintained by: + - Hyunjun Choi (hyunjun.choi@cshs.org) + - Miguel Hernandez (miguel.e.hernandez@cshs.org) + - Nick Matsumoto (nicholas.matsumoto@cshs.org) + - Jay Moran (jay.moran@cshs.org) + - and many other generous open source contributors + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +(Autogenerated header, do not modify) + +""" +import argparse +import sys +import simplejson +from sklearn.utils import check_X_y, check_array +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.compose import ColumnTransformer +import os +import os.path +import pandas as pd +import numpy as np +import logging +import requests +import time +import traceback +from io import StringIO + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.INFO) + +MIN_ROWS = 10 +MIN_COLS = 2 +MIN_ROW_PER_CLASS = 2 + + +def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): + # Read the data set into memory + raw_data = get_file_from_server(file_id) + df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) + return validate_data(df, prediction_type, target_field, categories, ordinals) + + +def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): + # Read the data set into memory + df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) + return validate_data(df, prediction_type, target_field, categories, ordinals) + + +def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"): + ''' + use OneHotEncoder or OrdinalEncoder to convert categorical features + See skl_utils + ''' + + # check that categorical and ordinal columns can be encoded + if categories or ordinals: + transformers = [] + if categories: + if encoding_strategy == "OneHotEncoder": + transformers.append( + ("categorical_encoder", OneHotEncoder(), categories)) + elif encoding_strategy == "OrdinalEncoder": + transformers.append( + ("categorical_encoder", 
OrdinalEncoder(), categories)) + if ordinals: + ordinal_features = sorted(list(ordinals.keys())) + ordinal_map = [ordinals[k] for k in ordinal_features] + transformers.append(("ordinalencoder", + OrdinalEncoder(categories=ordinal_map), + ordinal_features)) + + ct = ColumnTransformer( + transformers=transformers, + remainder='passthrough', + sparse_threshold=0 + ) + return ct.fit_transform(df) + else: + return df + + +def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): + ''' + Check that a datafile is valid + + + @return tuple + boolean - validation result + string - message + ''' + + if prediction_type not in ["classification", "regression"]: + logger.warn(f"Invalid prediction type: '{prediction_type}'") + return False, f"Invalid prediction type: '{prediction_type}'" + + num_df = df + + # dimension validation + if df.shape[0] < MIN_ROWS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( + df.shape, MIN_ROWS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) + + if df.shape[1] < MIN_COLS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( + df.shape, MIN_COLS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) + + # target column validation + if (target_column != None): + if not (target_column in df.columns): + logger.warn("Target column '" + target_column + "' not in data") + return False, "Target column '" + target_column + "' not in data" + if categories and target_column in categories: + logger.warn("Target column '" + target_column + + "' cannot be a categorical feature") + return False, "Target column '" + target_column + "' cannot be a categorical feature" + if ordinals and target_column in ordinals: + logger.warn("Target column '" + target_column + + "' cannot be an ordinal feature") + return False, "Target column '" + target_column + "' cannot be an ordinal feature" + + # check that cat columns can be encoded + if categories or ordinals: + try: + encode_data(df, target_column, categories, + ordinals, "OneHotEncoder") + encode_data(df, target_column, categories, + ordinals, "OrdinalEncoder") + except Exception as e: + logger.warn("encode_data() failed, " + str(e)) + return False, "encode_data() failed, " + str(e) + + if categories: + num_df = num_df.drop(columns=categories) + if ordinals: + num_df = num_df.drop(columns=list(ordinals.keys())) + + # check only check if target is specified + if target_column: + + # classification + if (prediction_type == "classification"): + # target column of classification problem does not need to be numeric + num_df = num_df.drop(columns=target_column, axis=1) + + # Check rows per class + counts = df.groupby(target_column).count() + fails_validation = counts[counts[counts.columns[1]] + < MIN_ROW_PER_CLASS] + if (not fails_validation.empty): + msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( + list(fails_validation.index.values)) + logger.warn(msg) + return False, msg + + # check that non-cat feature columns contain only numeric data + if (len(num_df.columns)) > 0: + try: + check_array(num_df, dtype=np.float64, + order="C", force_all_finite=True) + + except Exception as e: + logger.warn("sklearn.check_array() validation " + str(e)) + return False, "sklearn.check_array() validation 
" + str(e) + + # check t + + return True, None + + +def get_file_from_server(file_id): + ''' + Retrieve a file from the main Aliro server + ''' + apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT'] + path = apiPath + "/api/v1/files/" + file_id + + logger.debug("retrieving file:" + file_id) + logger.debug("api path: " + path) + + res = None + try: + res = requests.request('GET', path, timeout=15) + except: + logger.error("Unexpected error in get_file_from_server for path 'GET: " + + str(path) + "': " + str(sys.exc_info()[0])) + raise + + if res.status_code != requests.codes.ok: + msg = "Request GET status_code not ok, path: '" + \ + str(path) + "'' status code: '" + str(res.status_code) + \ + "'' response text: " + str(res.text) + logger.error(msg) + raise RuntimeError(msg) + + logger.info("File retrieved, file_id: '" + file_id + + "', path: '" + path + "', status_code: " + str(res.status_code)) + return res.text + + +def main(): + meta_features_all = [] + parser = argparse.ArgumentParser( + description="Validate a dataset", add_help=False) + parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.') + parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class', + help='Name of target column', required=False) + parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath', + help='Name of target column') + parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None, + help='JSON list of categorical features') + parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None, + help='JSON dict of ordianl features and possible values') + parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification", + help="Classification or regression problem") + + args = parser.parse_args() + + # set up the file logger + logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs") + if not os.path.exists(logpath): + os.makedirs(logpath) + + formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + fhandler = logging.FileHandler( + os.path.join(logpath, 'validateDataset.log')) + fhandler.setFormatter(formatter) + logger.addHandler(fhandler) + + success = None + errorMessage = None + meta_json = None + + categories = None + ordinals = None + + try: + if args.JSON_CATEGORIES: + categories = simplejson.loads(args.JSON_CATEGORIES) + if args.JSON_ORDINALS: + ordinals = simplejson.loads(args.JSON_ORDINALS) + prediction_type = args.PREDICTION_TYPE + # print("categories: ") + # print(categories) + + if (args.IDENTIFIER_TYPE == 'filepath'): + success, errorMessage = validate_data_from_filepath( + args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) + else: + success, errorMessage = validate_data_from_server( + args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) + meta_json = simplejson.dumps( + {"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False) + except Exception as e: + logger.error(traceback.format_exc()) + meta_json = simplejson.dumps( + {"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False) + + print(meta_json) + sys.stdout.flush() + + +if __name__ == '__main__': + main() diff --git a/lab/pyutils/validateDataset.py 
b/lab/pyutils/validateDataset.py index af089624b..bb523ae43 100644 --- a/lab/pyutils/validateDataset.py +++ b/lab/pyutils/validateDataset.py @@ -54,16 +54,63 @@ MIN_ROW_PER_CLASS = 2 +def check_dataframe(df, target_column): + ''' + check_dataframe function checks whether each column contains only numeric data or not. + missing values are not allowed in df. + strings are not allowed in df. + inf or -inf are not allowed in df. + ''' + + error_message = "Found error in data:" + + # find columns contain missing value(NaN) in df + nan_cols = df.columns[df.isnull().any()].tolist() + if len(nan_cols) > 0: + error_message += "* 'MISSING VALUE' in " + \ + str(nan_cols) + "" + + df_non_target = df.drop(columns=target_column, axis=1) + inf_cols_list = [] + + # find which features contain infinity or -infinity in df_non_target + + # find columns whose data type is object + # object dtype for storing strings in pandas + # if a column contains both string and numeric data, its dtype is object. + str_cols = df.columns[df.dtypes == object].tolist() + + non_str_cols = df_non_target.columns.difference(str_cols) + + for col in non_str_cols: + + if np.isinf(df[col]).any(): + inf_cols_list.append(col) + + if len(inf_cols_list) > 0: + error_message += "* '+INFINITY or -INFINITY' in " + \ + str(inf_cols_list) + " " + + # str_trigger = False + if len(str_cols) > 0: + error_message += "* 'STRING' in " + \ + str(str_cols)+" " + + return error_message + + def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): # Read the data set into memory raw_data = get_file_from_server(file_id) df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) + # return validate_data(df, prediction_type, target_field, categories, ordinals) + return validate_data_updated(df, prediction_type, target_field, categories, ordinals) def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): # Read the data set into memory df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) + # print("dfprint", df) return validate_data(df, prediction_type, target_field, categories, ordinals) @@ -190,6 +237,87 @@ def validate_data(df, prediction_type="classification", target_column=None, cate return True, None +def validate_data_updated(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): + ''' + Check that a df is valid + This function checks for the following: + - prediction_type is valid + - number of rows and columns is valid + - target column is valid + - missing values in df. + - strings in df. + - inf or -inf in df. 
+ + + + @return tuple + boolean - validation result + string - message + ''' + + # check prediction type is valid + if prediction_type not in ["classification", "regression"]: + logger.warn(f"Invalid prediction type: '{prediction_type}'") + return False, f"Invalid prediction type: '{prediction_type}'" + + # check the number of rows and columns is valid + if df.shape[0] < MIN_ROWS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( + df.shape, MIN_ROWS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) + + # check the number of columns is valid + if df.shape[1] < MIN_COLS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( + df.shape, MIN_COLS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) + + # target column validation + if (target_column != None): + if not (target_column in df.columns): + logger.warn("Target column '" + target_column + "' not in data") + return False, "Target column '" + target_column + "' not in data" + if categories and target_column in categories: + logger.warn("Target column '" + target_column + + "' cannot be a categorical feature") + return False, "Target column '" + target_column + "' cannot be a categorical feature" + if ordinals and target_column in ordinals: + logger.warn("Target column '" + target_column + + "' cannot be an ordinal feature") + return False, "Target column '" + target_column + "' cannot be an ordinal feature" + + # check only check if target is specified + if target_column: + + # classification + if (prediction_type == "classification"): + # target column of classification problem does not need to be numeric + df_non_target = df.drop(columns=target_column, axis=1) + + # Check rows per class + counts = df.groupby(target_column).count() + fails_validation = counts[counts[counts.columns[1]] + < MIN_ROW_PER_CLASS] + if (not fails_validation.empty): + msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( + list(fails_validation.index.values)) + logger.warn(msg) + return False, msg + + # In the below code,the check_dataframe() checks whether features and target column contain only processed data. 
+ # check whether each column contains only processed data or not + # missing values are not allowed in df + # strings are not allowed in df + # inf or -inf are not allowed in df + if (len(df.columns)) > 0: + error_message = check_dataframe(df, target_column) + if error_message != "Found error in data:": + logger.warn(str(error_message)) + return False, str(error_message) + + return True, None + + def get_file_from_server(file_id): ''' Retrieve a file from the main Aliro server @@ -262,8 +390,6 @@ def main(): if args.JSON_ORDINALS: ordinals = simplejson.loads(args.JSON_ORDINALS) prediction_type = args.PREDICTION_TYPE - # print("categories: ") - # print(categories) if (args.IDENTIFIER_TYPE == 'filepath'): success, errorMessage = validate_data_from_filepath( diff --git a/lab/webapp/src/components/FileUpload/index.js b/lab/webapp/src/components/FileUpload/index.js index e07d895e8..650179d5e 100644 --- a/lab/webapp/src/components/FileUpload/index.js +++ b/lab/webapp/src/components/FileUpload/index.js @@ -2448,6 +2448,7 @@ handleCatFeaturesUserTextCancel() { {this.state.errorModalContent} + ) } From b7aa1ddb190ba00f5080a878ea145f6b1cd1fac9 Mon Sep 17 00:00:00 2001 From: hyunjuna Date: Wed, 1 Feb 2023 10:34:30 -0800 Subject: [PATCH 2/2] remove copy --- lab/pyutils/validateDataset copy.py | 286 ---------------------------- 1 file changed, 286 deletions(-) delete mode 100644 lab/pyutils/validateDataset copy.py diff --git a/lab/pyutils/validateDataset copy.py b/lab/pyutils/validateDataset copy.py deleted file mode 100644 index af089624b..000000000 --- a/lab/pyutils/validateDataset copy.py +++ /dev/null @@ -1,286 +0,0 @@ -"""~This file is part of the Aliro library~ - -Copyright (C) 2023 Epistasis Lab, -Center for Artificial Intelligence Research and Education (CAIRE), -Department of Computational Biomedicine (CBM), -Cedars-Sinai Medical Center. - -Aliro is maintained by: - - Hyunjun Choi (hyunjun.choi@cshs.org) - - Miguel Hernandez (miguel.e.hernandez@cshs.org) - - Nick Matsumoto (nicholas.matsumoto@cshs.org) - - Jay Moran (jay.moran@cshs.org) - - and many other generous open source contributors - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . 
- -(Autogenerated header, do not modify) - -""" -import argparse -import sys -import simplejson -from sklearn.utils import check_X_y, check_array -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.compose import ColumnTransformer -import os -import os.path -import pandas as pd -import numpy as np -import logging -import requests -import time -import traceback -from io import StringIO - - -logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) -logger.setLevel(logging.INFO) - -MIN_ROWS = 10 -MIN_COLS = 2 -MIN_ROW_PER_CLASS = 2 - - -def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): - # Read the data set into memory - raw_data = get_file_from_server(file_id) - df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) - - -def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): - # Read the data set into memory - df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) - - -def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"): - ''' - use OneHotEncoder or OrdinalEncoder to convert categorical features - See skl_utils - ''' - - # check that categorical and ordinal columns can be encoded - if categories or ordinals: - transformers = [] - if categories: - if encoding_strategy == "OneHotEncoder": - transformers.append( - ("categorical_encoder", OneHotEncoder(), categories)) - elif encoding_strategy == "OrdinalEncoder": - transformers.append( - ("categorical_encoder", OrdinalEncoder(), categories)) - if ordinals: - ordinal_features = sorted(list(ordinals.keys())) - ordinal_map = [ordinals[k] for k in ordinal_features] - transformers.append(("ordinalencoder", - OrdinalEncoder(categories=ordinal_map), - ordinal_features)) - - ct = ColumnTransformer( - transformers=transformers, - remainder='passthrough', - sparse_threshold=0 - ) - return ct.fit_transform(df) - else: - return df - - -def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): - ''' - Check that a datafile is valid - - - @return tuple - boolean - validation result - string - message - ''' - - if prediction_type not in ["classification", "regression"]: - logger.warn(f"Invalid prediction type: '{prediction_type}'") - return False, f"Invalid prediction type: '{prediction_type}'" - - num_df = df - - # dimension validation - if df.shape[0] < MIN_ROWS: - logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( - df.shape, MIN_ROWS)) - return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) - - if df.shape[1] < MIN_COLS: - logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( - df.shape, MIN_COLS)) - return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) - - # target column validation - if (target_column != None): - if not (target_column in df.columns): - logger.warn("Target column '" + target_column + "' not in data") - return False, "Target column '" + target_column + "' not in data" - if categories and target_column in categories: - logger.warn("Target column '" + 
target_column + - "' cannot be a categorical feature") - return False, "Target column '" + target_column + "' cannot be a categorical feature" - if ordinals and target_column in ordinals: - logger.warn("Target column '" + target_column + - "' cannot be an ordinal feature") - return False, "Target column '" + target_column + "' cannot be an ordinal feature" - - # check that cat columns can be encoded - if categories or ordinals: - try: - encode_data(df, target_column, categories, - ordinals, "OneHotEncoder") - encode_data(df, target_column, categories, - ordinals, "OrdinalEncoder") - except Exception as e: - logger.warn("encode_data() failed, " + str(e)) - return False, "encode_data() failed, " + str(e) - - if categories: - num_df = num_df.drop(columns=categories) - if ordinals: - num_df = num_df.drop(columns=list(ordinals.keys())) - - # check only check if target is specified - if target_column: - - # classification - if (prediction_type == "classification"): - # target column of classification problem does not need to be numeric - num_df = num_df.drop(columns=target_column, axis=1) - - # Check rows per class - counts = df.groupby(target_column).count() - fails_validation = counts[counts[counts.columns[1]] - < MIN_ROW_PER_CLASS] - if (not fails_validation.empty): - msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( - list(fails_validation.index.values)) - logger.warn(msg) - return False, msg - - # check that non-cat feature columns contain only numeric data - if (len(num_df.columns)) > 0: - try: - check_array(num_df, dtype=np.float64, - order="C", force_all_finite=True) - - except Exception as e: - logger.warn("sklearn.check_array() validation " + str(e)) - return False, "sklearn.check_array() validation " + str(e) - - # check t - - return True, None - - -def get_file_from_server(file_id): - ''' - Retrieve a file from the main Aliro server - ''' - apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT'] - path = apiPath + "/api/v1/files/" + file_id - - logger.debug("retrieving file:" + file_id) - logger.debug("api path: " + path) - - res = None - try: - res = requests.request('GET', path, timeout=15) - except: - logger.error("Unexpected error in get_file_from_server for path 'GET: " + - str(path) + "': " + str(sys.exc_info()[0])) - raise - - if res.status_code != requests.codes.ok: - msg = "Request GET status_code not ok, path: '" + \ - str(path) + "'' status code: '" + str(res.status_code) + \ - "'' response text: " + str(res.text) - logger.error(msg) - raise RuntimeError(msg) - - logger.info("File retrieved, file_id: '" + file_id + - "', path: '" + path + "', status_code: " + str(res.status_code)) - return res.text - - -def main(): - meta_features_all = [] - parser = argparse.ArgumentParser( - description="Validate a dataset", add_help=False) - parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.') - parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class', - help='Name of target column', required=False) - parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath', - help='Name of target column') - parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None, - help='JSON list of categorical features') - parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None, - 
help='JSON dict of ordianl features and possible values') - parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification", - help="Classification or regression problem") - - args = parser.parse_args() - - # set up the file logger - logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs") - if not os.path.exists(logpath): - os.makedirs(logpath) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - fhandler = logging.FileHandler( - os.path.join(logpath, 'validateDataset.log')) - fhandler.setFormatter(formatter) - logger.addHandler(fhandler) - - success = None - errorMessage = None - meta_json = None - - categories = None - ordinals = None - - try: - if args.JSON_CATEGORIES: - categories = simplejson.loads(args.JSON_CATEGORIES) - if args.JSON_ORDINALS: - ordinals = simplejson.loads(args.JSON_ORDINALS) - prediction_type = args.PREDICTION_TYPE - # print("categories: ") - # print(categories) - - if (args.IDENTIFIER_TYPE == 'filepath'): - success, errorMessage = validate_data_from_filepath( - args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) - else: - success, errorMessage = validate_data_from_server( - args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) - meta_json = simplejson.dumps( - {"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False) - except Exception as e: - logger.error(traceback.format_exc()) - meta_json = simplejson.dumps( - {"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False) - - print(meta_json) - sys.stdout.flush() - - -if __name__ == '__main__': - main()
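
Reviewer note (illustration only, not part of either patch): the sketch below shows the kind of message the new check_dataframe() / validate_data_updated() path is intended to return for a problematic upload. It assumes only pandas and numpy, which validateDataset.py already imports; the sample DataFrame, its column names, and the helper name demo_check are hypothetical, and the helper mirrors the logic of check_dataframe() rather than importing the module itself.

    import numpy as np
    import pandas as pd


    def demo_check(df, target_column):
        # Mirrors check_dataframe(): collect columns with missing values,
        # +/-infinity, or string (object-dtype) data into one message.
        message = "Found error in data:"

        # columns containing NaN anywhere in the frame (target included)
        nan_cols = df.columns[df.isnull().any()].tolist()
        if nan_cols:
            message += " * 'MISSING VALUE' in " + str(nan_cols)

        # object-dtype columns hold strings (or mixed string/numeric data)
        str_cols = df.columns[df.dtypes == object].tolist()

        # the infinity check only applies to numeric, non-target columns
        non_str_cols = df.drop(columns=target_column).columns.difference(str_cols)
        inf_cols = [c for c in non_str_cols if np.isinf(df[c]).any()]
        if inf_cols:
            message += " * '+INFINITY or -INFINITY' in " + str(inf_cols)

        if str_cols:
            message += " * 'STRING' in " + str(str_cols)
        return message


    df = pd.DataFrame({
        "age":   [34, np.nan, 51, 29],      # missing value
        "score": [1.0, np.inf, 0.5, 2.0],   # infinity
        "city":  ["LA", "NY", "SF", "LA"],  # string feature
        "class": [0, 1, 1, 0],
    })
    print(demo_check(df, target_column="class"))
    # Found error in data: * 'MISSING VALUE' in ['age'] * '+INFINITY or -INFINITY' in ['score'] * 'STRING' in ['city']

As the diffs above show, only validate_data_from_server() is switched to validate_data_updated() in this series; validate_data_from_filepath() still calls the original validate_data(), so local-file validation keeps the sklearn check_array() wording while server uploads get the aggregated, column-by-column message sketched here. For reference, the script is invoked as, e.g., "python validateDataset.py my_upload.csv -target class -prediction_type classification" (file name hypothetical; PROJECT_ROOT must be set for the file logger, and fileid mode additionally needs LAB_HOST/LAB_PORT) and prints a JSON object of the form {"success": ..., "errorMessage": ...} to stdout.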