From 9edbbce239943d0829c72b3888490386f94a7e1c Mon Sep 17 00:00:00 2001 From: hyunjuna Date: Wed, 1 Feb 2023 17:59:43 -0800 Subject: [PATCH] Make errors on uploads more user-friendly #570 Update error message for each case in validateDataset.py Update labApi.test.ts Update react to show error message for each case --- lab/pyutils/tests/test_validateDataset.py | 205 ++++++++++-------- lab/pyutils/validateDataset.py | 43 ++-- lab/webapp/src/components/FileUpload/index.js | 15 +- tests/integration/jest/labApi.test.ts | 76 ++++++- 4 files changed, 228 insertions(+), 111 deletions(-) diff --git a/lab/pyutils/tests/test_validateDataset.py b/lab/pyutils/tests/test_validateDataset.py index e93037d65..4c672693c 100644 --- a/lab/pyutils/tests/test_validateDataset.py +++ b/lab/pyutils/tests/test_validateDataset.py @@ -48,10 +48,11 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) + @nottest def load_bad_test_data(): return [ - ("appendicitis_bad_rows", + ("appendicitis_bad_rows", "data/datasets/test/test_bad/appendicitis_bad_rows.csv", "classification", "target_class", @@ -59,99 +60,99 @@ def load_bad_test_data(): None, "Dataset has dimensions (5, 8), classification datasets must have at least 10 rows."), # because the data has only one column, the file is not autoparsed properly - ("appendicitis_bad_cols", + ("appendicitis_bad_cols", "data/datasets/test/test_bad/appendicitis_bad_cols.csv", "classification", "target_class", None, None, "Target column 'target_class' not in data"), - #"Dataset has dimensions (5, 2), classification datasets must have at least 2 columns."), - ("appendicitis_bad_cols_per_class", + # "Dataset has dimensions (5, 2), classification datasets must have at least 2 columns."), + ("appendicitis_bad_cols_per_class", "data/datasets/test/test_bad/appendicitis_bad_cols_per_class.csv", "classification", "target_class", None, None, "Classification datasets must have at least 2 rows per class, class(es) '[3]' have only 1 row."), - ("appendicitis_bad_dim", + ("appendicitis_bad_dim", "data/datasets/test/test_bad/appendicitis_bad_dim.csv", "classification", "class", None, None, "Classification datasets must have at least 2 rows per class, class(es) '[9999999.0]' have only 1 row."), - ("appendicitis_bad_target_col", + ("appendicitis_bad_target_col", "data/datasets/test/test_bad/appendicitis_bad_target_col.csv", "classification", "class", None, None, "Target column 'class' not in data"), - ("appendicitis_null", + ("appendicitis_null", "data/datasets/test/test_bad/appendicitis_null.csv", "classification", "class", None, None, - "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), - ("appendicitis_cat", + "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), + ("appendicitis_cat", "data/datasets/test/test_bad/appendicitis_cat.csv", "classification", "target_class", None, None, "sklearn.check_array() validation could not convert string to float: 'b'"), - ("appendicitis_cat_2", + ("appendicitis_cat_2", "data/datasets/test/test_bad/appendicitis_cat_2.csv", "classification", "target_class", None, None, "sklearn.check_array() validation could not convert string to float: 'a'"), - ("appendicitis_cat_missing_col", + ("appendicitis_cat_missing_col", "data/datasets/test/integration/appendicitis_cat_ord.csv", "classification", "target_class", ["cat"], None, "sklearn.check_array() validation could not convert string to float: 'first'"), - ("appendicitis_ord_missing_col", + ("appendicitis_ord_missing_col", "data/datasets/test/integration/appendicitis_cat_ord.csv", "classification", "target_class", None, - {"ord" : ["first", "second", "third"]}, + {"ord": ["first", "second", "third"]}, "sklearn.check_array() validation could not convert string to float: 'b'"), - ("appendicitis_cat_ord_missing_val", + ("appendicitis_cat_ord_missing_val", "data/datasets/test/integration/appendicitis_cat_ord.csv", "classification", "target_class", ["cat"], - {"ord" : ["first", "second"]}, + {"ord": ["first", "second"]}, "encode_data() failed, Found unknown categories ['third'] in column 0 during fit"), - ("appendicitis_ord_target", + ("appendicitis_ord_target", "data/datasets/test/test_bad/appendicitis_ord_target.csv", "classification", "target_class", None, - {"target_class" : ["true", "false"]}, + {"target_class": ["true", "false"]}, "Target column 'target_class' cannot be an ordinal feature"), - ("appendicitis_cat_target", + ("appendicitis_cat_target", "data/datasets/test/test_bad/appendicitis_ord_target.csv", "classification", "target_class", ["target_class"], None, "Target column 'target_class' cannot be a categorical feature"), - ("reg_vineyard_null_target", + ("reg_vineyard_null_target", "data/datasets/test/test_bad/regression/vineyard_null_target.csv", "regression", "target", None, None, "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), - ("reg_vineyard_str_target", + ("reg_vineyard_str_target", "data/datasets/test/test_bad/regression/vineyard_str_target.csv", "regression", "target", @@ -160,194 +161,222 @@ def load_bad_test_data(): "sklearn.check_array() validation could not convert string to float: 'bar'"), ] + @nottest def load_bad_test_data_parameter_exception(): return [ - ("appendicitis_bad_prediction_type", - "data/datasets/test/test_bad/appendicitis_ord_target.csv", - "badPredictionType", - "target_class", - ["target_class"], - None, - "Invalid prediction type: 'badPredictionType'") + ("appendicitis_bad_prediction_type", + "data/datasets/test/test_bad/appendicitis_ord_target.csv", + "badPredictionType", + "target_class", + ["target_class"], + None, + "Invalid prediction type: 'badPredictionType'") ] + @nottest def load_bad_test_data_no_target(): return [ - ("appendicitis_bad_dim", + ("appendicitis_bad_dim", "data/datasets/test/test_bad/appendicitis_bad_dim.csv", "classification", "class", "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), - ("appendicitis_null", + ("appendicitis_null", "data/datasets/test/test_bad/appendicitis_null.csv", "classification", "class", - "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), - ("appendicitis_cat", + "sklearn.check_array() validation Input contains NaN, infinity or a value too large for dtype('float64')."), + ("appendicitis_cat", "data/datasets/test/test_bad/appendicitis_cat.csv", "target_class", "sklearn.check_array() validation could not convert string to float: 'b'"), - ("appendicitis_cat_2", + ("appendicitis_cat_2", "data/datasets/test/test_bad/appendicitis_cat_2.csv", "target_class", "sklearn.check_array() validation could not convert string to float: 'a'"), ] + @nottest def load_good_test_data(): return [ - ("allbp", + ("allbp", "data/datasets/test/test_flat/allbp.csv", "classification", "class", None, None), - ("appendicitis_alt_target_col", + ("appendicitis_alt_target_col", "data/datasets/test/test_flat/appendicitis.csv", "classification", "target_class", [], {}), - ("appendicitis_cat", + ("appendicitis_cat", "data/datasets/test/integration/appendicitis_cat.csv", "classification", "target_class", ["cat"], None), - ("appendicitis_ord", + ("appendicitis_ord", "data/datasets/test/integration/appendicitis_ord.csv", "classification", "target_class", None, - {"ord" : ["first", "second", "third"]} - ), - ("appendicitis_cat_ord", + {"ord": ["first", "second", "third"]} + ), + ("appendicitis_cat_ord", "data/datasets/test/integration/appendicitis_cat_ord.csv", "classification", "target_class", ["cat"], - {"ord" : ["first", "second", "third"]} - ), - ("appendicitis_str_target", + {"ord": ["first", "second", "third"]} + ), + ("appendicitis_str_target", "data/datasets/test/integration/appendicitis_string_target.csv", "classification", "target_class", None, None - ), - ("reg_vineyard", + ), + ("reg_vineyard", "data/datasets/test/test_regression/192_vineyard.csv", "regression", "target", None, None - ), - ("reg_auto_price", + ), + ("reg_auto_price", "data/datasets/test/test_regression/195_auto_price.tsv", "regression", "target", None, None - ), - ] + ), + ] class TestResultUtils(unittest.TestCase): @parameterized.expand(load_bad_test_data) def test_validate_data_file_bad(self, name, file_path, prediction_type, target_column, categories, ordinals, expectedMessage): - result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, target_column, categories, ordinals) - assert(message) + result, message = validateDataset.validate_data_from_filepath( + file_path, prediction_type, target_column, categories, ordinals) + assert (message) self.assertEqual(message, expectedMessage) - assert not(result) + assert not (result) + class TestResultUtils(unittest.TestCase): @parameterized.expand(load_bad_test_data_parameter_exception) def test_validate_data_file_bad_parameters_bad(self, name, file_path, prediction_type, target_column, categories, ordinals, expectedMessage): - result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, target_column, categories, ordinals) - assert(message) + result, message = validateDataset.validate_data_from_filepath( + file_path, prediction_type, target_column, categories, ordinals) + assert (message) self.assertEqual(message, expectedMessage) - assert not(result) + assert not (result) - # NaN or string errors are only checked if the target column is specified. -## @parameterized.expand(load_bad_test_data_no_target) -## def test_validate_data_file_bad_no_target(self, name, file_path, prediction_type, target_column, expectedMessage): -## result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, None) -## assert(message) -## self.assertEqual(message, expectedMessage) -## assert not(result) + # NaN or string errors are only checked if the target column is specified. +# @parameterized.expand(load_bad_test_data_no_target) +# def test_validate_data_file_bad_no_target(self, name, file_path, prediction_type, target_column, expectedMessage): +# result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, None) +# assert(message) +# self.assertEqual(message, expectedMessage) +# assert not(result) @parameterized.expand(load_good_test_data) def test_validate_data_file_good(self, name, file_path, prediction_type, target_column, categories, ordinals): - result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, target_column, categories, ordinals) - logger.debug("name: " + name + " file_path: " + file_path + " target:" + target_column + " res: " + str(result) + " msg: " + str(message)) + result, message = validateDataset.validate_data_from_filepath( + file_path, prediction_type, target_column, categories, ordinals) + logger.debug("name: " + name + " file_path: " + file_path + " target:" + + target_column + " res: " + str(result) + " msg: " + str(message)) self.assertTrue(result) - + @parameterized.expand(load_good_test_data) def test_validate_data_file_good_no_target(self, name, file_path, prediction_type, target_column, categories, ordinals): - result, message = validateDataset.validate_data_from_filepath(file_path, prediction_type, None, categories, ordinals) - logger.debug("name: " + name + " file_path: " + file_path + " target:" + target_column + " res: " + str(result) + " msg: " + str(message)) + result, message = validateDataset.validate_data_from_filepath( + file_path, prediction_type, None, categories, ordinals) + logger.debug("name: " + name + " file_path: " + file_path + " target:" + + target_column + " res: " + str(result) + " msg: " + str(message)) self.assertTrue(result) @parameterized.expand(load_good_test_data) def test_validate_data_main_file_good(self, name, file_path, prediction_type, target_column, categories, ordinals): result = io.StringIO() - testargs = ["program.py", file_path, '-target', target_column, '-identifier_type', 'filepath'] - - if (categories) : testargs.extend(['-categorical_features', simplejson.dumps(categories)]) - if (ordinals) : testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) - if (prediction_type) : testargs.extend(['-prediction_type', prediction_type]) + testargs = ["program.py", file_path, '-target', + target_column, '-identifier_type', 'filepath'] + + if (categories): + testargs.extend( + ['-categorical_features', simplejson.dumps(categories)]) + if (ordinals): + testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) + if (prediction_type): + testargs.extend(['-prediction_type', prediction_type]) with patch.object(sys, 'argv', testargs): sys.stdout = result validateDataset.main() sys.stdout = sys.__stdout__ - logger.debug("testargs: " + str(testargs) + " res: " + result.getvalue()) + logger.debug("testargs: " + str(testargs) + + " res: " + result.getvalue()) self.assertTrue(result.getvalue()) objResult = simplejson.loads(result.getvalue()) self.assertEqual(objResult, {"success": True, "errorMessage": None}) - @parameterized.expand(load_bad_test_data) def test_validate_data_main_file_bad(self, name, file_path, prediction_type, target_column, categories, ordinals, expectedMessage): result = io.StringIO() - testargs = ["program.py", file_path, '-target', target_column, '-identifier_type', 'filepath'] + testargs = ["program.py", file_path, '-target', + target_column, '-identifier_type', 'filepath'] - if (categories) : testargs.extend(['-categorical_features', simplejson.dumps(categories)]) - if (ordinals) : testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) - if (prediction_type) : testargs.extend(['-prediction_type', prediction_type]) + if (categories): + testargs.extend( + ['-categorical_features', simplejson.dumps(categories)]) + if (ordinals): + testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) + if (prediction_type): + testargs.extend(['-prediction_type', prediction_type]) with patch.object(sys, 'argv', testargs): sys.stdout = result validateDataset.main() sys.stdout = sys.__stdout__ - logger.debug("testargs: " + str(testargs) + " res: " + result.getvalue()) + logger.debug("testargs: " + str(testargs) + + " res: " + result.getvalue()) self.assertTrue(result.getvalue()) objResult = simplejson.loads(result.getvalue()) - self.assertEqual(objResult, {"success": False, "errorMessage": expectedMessage}) - + self.assertEqual( + objResult, {"success": False, "errorMessage": expectedMessage}) @parameterized.expand(load_good_test_data) def test_validate_data_main_api_connect_error(self, name, file_path, prediction_type, target_column, categories, ordinals): result = io.StringIO() - testargs = ["program.py", file_path, '-target', target_column, '-identifier_type', 'fileid'] + testargs = ["program.py", file_path, '-target', + target_column, '-identifier_type', 'fileid'] - if (categories) : testargs.extend(['-categorical_features', simplejson.dumps(categories)]) - if (ordinals) : testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) - if (prediction_type) : testargs.extend(['-prediction_type', prediction_type]) + if (categories): + testargs.extend( + ['-categorical_features', simplejson.dumps(categories)]) + if (ordinals): + testargs.extend(['-ordinal_features', simplejson.dumps(ordinals)]) + if (prediction_type): + testargs.extend(['-prediction_type', prediction_type]) with patch.object(sys, 'argv', testargs): sys.stdout = result validateDataset.main() sys.stdout = sys.__stdout__ - logger.debug("testargs: " + str(testargs) + " res: " + result.getvalue()) + logger.debug("testargs: " + str(testargs) + + " res: " + result.getvalue()) self.assertTrue(result.getvalue()) objResult = simplejson.loads(result.getvalue()) self.assertIsInstance(objResult, dict) self.assertEqual(list(objResult), ["success", "errorMessage"]) self.assertEqual(objResult['success'], False) - #self.assertRegex(objResult['errorMessage'], "^Exception: ConnectionError\(MaxRetryError") - #self.assertRegex(objResult['errorMessage'], "^Exception: ConnectTimeout\(MaxRetryError") - self.assertRegex(objResult['errorMessage'], "^Exception: Connect.*\(MaxRetryError") + # self.assertRegex(objResult['errorMessage'], "^Exception: ConnectionError\(MaxRetryError") + # self.assertRegex(objResult['errorMessage'], "^Exception: ConnectTimeout\(MaxRetryError") + self.assertRegex(objResult['errorMessage'], + "^Exception: Connect.*\(MaxRetryError") diff --git a/lab/pyutils/validateDataset.py b/lab/pyutils/validateDataset.py index bb523ae43..12ef19a66 100644 --- a/lab/pyutils/validateDataset.py +++ b/lab/pyutils/validateDataset.py @@ -62,7 +62,8 @@ def check_dataframe(df, target_column): inf or -inf are not allowed in df. ''' - error_message = "Found error in data:" + # error_message = "Found error in data:" + error_message = "" # find columns contain missing value(NaN) in df nan_cols = df.columns[df.isnull().any()].tolist() @@ -96,15 +97,18 @@ def check_dataframe(df, target_column): error_message += "* 'STRING' in " + \ str(str_cols)+" " - return error_message + if error_message != "": + raise ValueError(error_message) + + return True def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): # Read the data set into memory raw_data = get_file_from_server(file_id) df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) - # return validate_data(df, prediction_type, target_field, categories, ordinals) - return validate_data_updated(df, prediction_type, target_field, categories, ordinals) + return validate_data(df, prediction_type, target_field, categories, ordinals) + # return validate_data_updated(df, prediction_type, target_field, categories, ordinals) def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): @@ -146,8 +150,10 @@ def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneH else: return df +# original validate_data function -def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): + +def validate_data_origin(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): ''' Check that a datafile is valid @@ -232,12 +238,12 @@ def validate_data(df, prediction_type="classification", target_column=None, cate logger.warn("sklearn.check_array() validation " + str(e)) return False, "sklearn.check_array() validation " + str(e) - # check t - return True, None +# updated validate_data function + -def validate_data_updated(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): +def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): ''' Check that a df is valid This function checks for the following: @@ -304,16 +310,17 @@ def validate_data_updated(df, prediction_type="classification", target_column=No logger.warn(msg) return False, msg - # In the below code,the check_dataframe() checks whether features and target column contain only processed data. - # check whether each column contains only processed data or not - # missing values are not allowed in df - # strings are not allowed in df - # inf or -inf are not allowed in df - if (len(df.columns)) > 0: - error_message = check_dataframe(df, target_column) - if error_message != "Found error in data:": - logger.warn(str(error_message)) - return False, str(error_message) + # In the below code,the check_dataframe() checks whether features and target column contain only processed data. + # check whether each column contains only processed data or not + # missing values are not allowed in df + # strings are not allowed in df + # inf or -inf are not allowed in df + if (len(df.columns)) > 0: + try: + check_dataframe(df, target_column) + except ValueError as e: + logger.warn("check_dataframe() validation " + str(e)) + return False, "check_dataframe() validation " + str(e) return True, None diff --git a/lab/webapp/src/components/FileUpload/index.js b/lab/webapp/src/components/FileUpload/index.js index 650179d5e..b4eeb8e48 100644 --- a/lab/webapp/src/components/FileUpload/index.js +++ b/lab/webapp/src/components/FileUpload/index.js @@ -2445,7 +2445,20 @@ handleCatFeaturesUserTextCancel() { {this.state.errorModalHeader} - {this.state.errorModalContent} + {/* {this.state.errorModalContent} */} + + {errorModalContent.map((item, index) => { + + if (index==0) + { + return

{item}

; + } + else{ + return

{index}: {item}

; + } + + })}
+ diff --git a/tests/integration/jest/labApi.test.ts b/tests/integration/jest/labApi.test.ts index 121f68a9d..7253db131 100644 --- a/tests/integration/jest/labApi.test.ts +++ b/tests/integration/jest/labApi.test.ts @@ -15,14 +15,14 @@ describe('lab', () => { describe.each` testname | filename | prediction_type | target | categorical | ordinal ${'numeric'} | ${'appendicitis_2.csv'} | ${'classification'} | ${'target_class'} | ${[]} | ${{}} - ${'categorical'} | ${'appendicitis_cat.csv'} | ${'classification'} | ${'target_class'} |${["cat"]} | ${{}} - ${'categorical_ordinal'} | ${'appendicitis_cat_ord.csv'} | ${'classification'} | ${'target_class'} |${["cat"]} | ${ {"ord" : ["first", "second", "third"]}} ${'regression'} | ${'192_vineyard.csv'} | ${'regression'} | ${'target'} |${[]} | ${{}} `("putDatasetGood", ({testname, filename, prediction_type, target, categorical, ordinal}) => { it(`${testname}`, async () => { jest.setTimeout(15000) let filepath = `${util.DATASET_PATH}/${filename}` + console.log('test!!!') + console.log(`${testname} ${filename} ${prediction_type}, ${target}, ${categorical} ${ordinal}`) let form = new FormData(); @@ -94,6 +94,7 @@ describe('lab', () => { } }); + // appendicitis_cat.csv it('string data in file', async () => { expect.assertions(3); @@ -121,10 +122,77 @@ describe('lab', () => { var json = await e.response.json() expect(json.error).toBeTruthy() expect(e.response.status).toEqual(400) - expect(json.error).toEqual("Unable to upload file. Error: Datafile validation failed, sklearn.check_array() validation could not convert string to float: 'b'") + expect(json.error).toEqual("Unable to upload file. Error: Datafile validation failed, check_dataframe() validation * 'STRING' in ['cat'] ") + } + }); + + + // aappendicitis_cat_ord.csv + it('string data in file', async () => { + expect.assertions(3); + + let filepath = `${util.DATASET_PATH}/appendicitis_cat_ord.csv` + + let form = new FormData(); + + let metadata = JSON.stringify({ + 'name': 'appendicitis_cat_ord_datasetbad.csv', + 'username': 'testuser', + 'timestamp': Date.now(), + 'dependent_col' : 'target_class', + 'categorical_features' : [] + }) + + form.append('_metadata', metadata) + form.append('_files', fs.createReadStream(filepath)); + + let result + + try { + result = await labApi.putDataset(form); + } + catch (e) { + var json = await e.response.json() + expect(json.error).toBeTruthy() + expect(e.response.status).toEqual(400) + expect(json.error).toEqual("Unable to upload file. Error: Datafile validation failed, check_dataframe() validation * 'STRING' in ['cat', 'ord'] ") } }); + + it('string data in file', async () => { + expect.assertions(3); + + let filepath = `${util.DATASET_PATH}/appendicitis_cat_ord.csv` + + let form = new FormData(); + + let metadata = JSON.stringify({ + 'name': 'appendicitis_cat_ord_datasetbad.csv', + 'username': 'testuser', + 'timestamp': Date.now(), + 'dependent_col' : 'target_class', + 'categorical_features' : [] + }) + + form.append('_metadata', metadata) + form.append('_files', fs.createReadStream(filepath)); + + let result + + try { + result = await labApi.putDataset(form); + } + catch (e) { + var json = await e.response.json() + expect(json.error).toBeTruthy() + expect(e.response.status).toEqual(400) + expect(json.error).toEqual("Unable to upload file. Error: Datafile validation failed, check_dataframe() validation * 'STRING' in ['cat', 'ord'] ") + } + }); + + + it('invalid metadata key', async () => { expect.assertions(3); @@ -242,7 +310,7 @@ describe('lab', () => { var json = await e.response.json() expect(json.error).toBeTruthy() expect(e.response.status).toEqual(400) - expect(json.error).toEqual("Unable to upload file. Error: metafeatures validation failed, dataset with metafeature signature '6526f294170ebc6066c53ad1f2b01b5b4c61708bd455fa08b1e60895a74a4a83' has already been registered, count: 1.") + expect(json.error).toEqual("Unable to upload file. Error: metafeatures validation failed, dataset with metafeature signature '13f1e79965d62e80bf70f4ea5b19c137be49df149ad223add6f9853b9d50e088' has already been registered, count: 1.") } });