diff --git a/lab/pyutils/validateDataset.py b/lab/pyutils/validateDataset.py
index af089624b..bb523ae43 100644
--- a/lab/pyutils/validateDataset.py
+++ b/lab/pyutils/validateDataset.py
@@ -54,16 +54,63 @@ MIN_ROW_PER_CLASS = 2
 
 
 
+def check_dataframe(df, target_column):
+    '''
+    Check whether each column of df contains only clean numeric data.
+    Missing values (NaN) are not allowed in df.
+    Strings are not allowed in df.
+    inf or -inf are not allowed in df.
+    '''
+
+    error_message = "Found error in data:"
+
+    # find columns that contain missing values (NaN) in df
+    nan_cols = df.columns[df.isnull().any()].tolist()
+    if len(nan_cols) > 0:
+        error_message += "* 'MISSING VALUE' in " + \
+            str(nan_cols) + " "
+
+    df_non_target = df.drop(columns=target_column, axis=1)
+    inf_cols_list = []
+
+    # find which features contain infinity or -infinity in df_non_target
+
+    # find columns whose dtype is object
+    # pandas stores strings with the object dtype;
+    # if a column mixes string and numeric data, its dtype is also object.
+    str_cols = df.columns[df.dtypes == object].tolist()
+
+    non_str_cols = df_non_target.columns.difference(str_cols)
+
+    for col in non_str_cols:
+
+        if np.isinf(df[col]).any():
+            inf_cols_list.append(col)
+
+    if len(inf_cols_list) > 0:
+        error_message += "* '+INFINITY or -INFINITY' in " + \
+            str(inf_cols_list) + " "
+
+    # str_trigger = False
+    if len(str_cols) > 0:
+        error_message += "* 'STRING' in " + \
+            str(str_cols) + " "
+
+    return error_message
+
+
 def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
     # Read the data set into memory
     raw_data = get_file_from_server(file_id)
     df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs)
-    return validate_data(df, prediction_type, target_field, categories, ordinals)
+    # return validate_data(df, prediction_type, target_field, categories, ordinals)
+    return validate_data_updated(df, prediction_type, target_field, categories, ordinals)
 
 
 def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs):
     # Read the data set into memory
     df = pd.read_csv(file_id, sep=None, engine='python', **kwargs)
+    # print("dfprint", df)
     return validate_data(df, prediction_type, target_field, categories, ordinals)
 
 
@@ -190,6 +237,87 @@ def validate_data(df, prediction_type="classification", target_column=None, cate
     return True, None
 
 
+def validate_data_updated(df, prediction_type="classification", target_column=None, categories=None, ordinals=None):
+    '''
+    Check that a df is valid.
+    This function checks the following:
+    - prediction_type is valid
+    - the number of rows and columns is valid
+    - the target column is valid
+    - no missing values in df
+    - no strings in df
+    - no inf or -inf in df
+
+
+
+    @return tuple
+        boolean - validation result
+        string - message
+    '''
+
+    # check the prediction type is valid
+    if prediction_type not in ["classification", "regression"]:
+        logger.warn(f"Invalid prediction type: '{prediction_type}'")
+        return False, f"Invalid prediction type: '{prediction_type}'"
+
+    # check the number of rows is valid
+    if df.shape[0] < MIN_ROWS:
+        logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format(
+            df.shape, MIN_ROWS))
+        return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS)
+
+    # check the number of columns is valid
+    if df.shape[1] < MIN_COLS:
+        logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format(
+            df.shape, MIN_COLS))
+        return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS)
+
+    # target column validation
+    if (target_column != None):
+        if not (target_column in df.columns):
+            logger.warn("Target column '" + target_column + "' not in data")
+            return False, "Target column '" + target_column + "' not in data"
+        if categories and target_column in categories:
+            logger.warn("Target column '" + target_column +
+                        "' cannot be a categorical feature")
+            return False, "Target column '" + target_column + "' cannot be a categorical feature"
+        if ordinals and target_column in ordinals:
+            logger.warn("Target column '" + target_column +
+                        "' cannot be an ordinal feature")
+            return False, "Target column '" + target_column + "' cannot be an ordinal feature"
+
+    # the class-size check only runs if a target is specified
+    if target_column:
+
+        # classification
+        if (prediction_type == "classification"):
+            # the target column of a classification problem does not need to be numeric
+            df_non_target = df.drop(columns=target_column, axis=1)
+
+            # check rows per class
+            counts = df.groupby(target_column).count()
+            fails_validation = counts[counts[counts.columns[1]]
+                                      < MIN_ROW_PER_CLASS]
+            if (not fails_validation.empty):
+                msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format(
                    list(fails_validation.index.values))
+                logger.warn(msg)
+                return False, msg
+
+    # check_dataframe() checks whether the features and the target column contain only clean data:
+    # each column must contain only processed, numeric data
+    # missing values are not allowed in df
+    # strings are not allowed in df
+    # inf or -inf are not allowed in df
+    if (len(df.columns)) > 0:
+        error_message = check_dataframe(df, target_column)
+        if error_message != "Found error in data:":
+            logger.warn(str(error_message))
+            return False, str(error_message)
+
+    return True, None
+
+
 def get_file_from_server(file_id):
     '''
     Retrieve a file from the main Aliro server
@@ -262,8 +390,6 @@ def main():
     if args.JSON_ORDINALS:
         ordinals = simplejson.loads(args.JSON_ORDINALS)
     prediction_type = args.PREDICTION_TYPE
-    # print("categories: ")
-    # print(categories)
 
     if (args.IDENTIFIER_TYPE == 'filepath'):
         success, errorMessage = validate_data_from_filepath(
diff --git a/lab/webapp/src/components/FileUpload/index.js b/lab/webapp/src/components/FileUpload/index.js
index e07d895e8..650179d5e 100644
--- a/lab/webapp/src/components/FileUpload/index.js
+++ b/lab/webapp/src/components/FileUpload/index.js
@@ -2448,6 +2448,7 @@ handleCatFeaturesUserTextCancel() {
           {this.state.errorModalContent}
+
     )
   }
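
For reviewers, here is a minimal sketch of how the new helpers might be exercised outside the server flow. It assumes numpy and pandas are installed and that the module is importable as lab.pyutils.validateDataset; the import path and the toy column names are illustrative assumptions, not part of this diff.

import numpy as np
import pandas as pd

# Assumed import path; adjust to match how the module is loaded in this repo.
from lab.pyutils.validateDataset import check_dataframe, validate_data_updated

# A small frame that violates all three rules check_dataframe looks for:
# a missing value, a +inf, and a string-valued (object dtype) feature.
df = pd.DataFrame({
    "feat_inf": [1.0, 2.0, np.inf, 4.0],   # contains +inf
    "feat_nan": [0.5, None, 1.5, 2.5],     # None is stored as NaN
    "feat_str": ["a", "b", "c", "d"],      # object dtype
    "target": ["yes", "yes", "no", "no"],
})

message = check_dataframe(df, target_column="target")
# message begins with "Found error in data:" and then lists the columns
# that contain missing values, +/-infinity, and strings (object dtype)
print(message)

# validate_data_updated wraps the same check behind the prediction-type,
# shape, and target-column validation; a toy frame this small may be
# rejected by the MIN_ROWS / MIN_COLS checks before check_dataframe runs.
success, error = validate_data_updated(df, "classification", target_column="target")
print(success, error)

Because check_dataframe reports problems by appending to a message string rather than returning a flag, callers such as validate_data_updated detect failure by comparing the result against the bare "Found error in data:" prefix.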