From fc68c6cea692f3cb71c4d77615b52c77c26c5113 Mon Sep 17 00:00:00 2001 From: hyunjuna Date: Wed, 1 Feb 2023 10:30:00 -0800 Subject: [PATCH 1/2] Update validateDataset.py to make errors on uploads more user-friendly --- lab/pyutils/validateDataset copy.py | 286 ++++++++++++++++++ lab/pyutils/validateDataset.py | 132 +++++++- lab/webapp/src/components/FileUpload/index.js | 1 + 3 files changed, 416 insertions(+), 3 deletions(-) create mode 100644 lab/pyutils/validateDataset copy.py diff --git a/lab/pyutils/validateDataset copy.py b/lab/pyutils/validateDataset copy.py new file mode 100644 index 000000000..af089624b --- /dev/null +++ b/lab/pyutils/validateDataset copy.py @@ -0,0 +1,286 @@ +"""~This file is part of the Aliro library~ + +Copyright (C) 2023 Epistasis Lab, +Center for Artificial Intelligence Research and Education (CAIRE), +Department of Computational Biomedicine (CBM), +Cedars-Sinai Medical Center. + +Aliro is maintained by: + - Hyunjun Choi (hyunjun.choi@cshs.org) + - Miguel Hernandez (miguel.e.hernandez@cshs.org) + - Nick Matsumoto (nicholas.matsumoto@cshs.org) + - Jay Moran (jay.moran@cshs.org) + - and many other generous open source contributors + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see . + +(Autogenerated header, do not modify) + +""" +import argparse +import sys +import simplejson +from sklearn.utils import check_X_y, check_array +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.compose import ColumnTransformer +import os +import os.path +import pandas as pd +import numpy as np +import logging +import requests +import time +import traceback +from io import StringIO + + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) +logger.setLevel(logging.INFO) + +MIN_ROWS = 10 +MIN_COLS = 2 +MIN_ROW_PER_CLASS = 2 + + +def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): + # Read the data set into memory + raw_data = get_file_from_server(file_id) + df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) + return validate_data(df, prediction_type, target_field, categories, ordinals) + + +def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): + # Read the data set into memory + df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) + return validate_data(df, prediction_type, target_field, categories, ordinals) + + +def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"): + ''' + use OneHotEncoder or OrdinalEncoder to convert categorical features + See skl_utils + ''' + + # check that categorical and ordinal columns can be encoded + if categories or ordinals: + transformers = [] + if categories: + if encoding_strategy == "OneHotEncoder": + transformers.append( + ("categorical_encoder", OneHotEncoder(), categories)) + elif encoding_strategy == "OrdinalEncoder": + transformers.append( + ("categorical_encoder", 
OrdinalEncoder(), categories)) + if ordinals: + ordinal_features = sorted(list(ordinals.keys())) + ordinal_map = [ordinals[k] for k in ordinal_features] + transformers.append(("ordinalencoder", + OrdinalEncoder(categories=ordinal_map), + ordinal_features)) + + ct = ColumnTransformer( + transformers=transformers, + remainder='passthrough', + sparse_threshold=0 + ) + return ct.fit_transform(df) + else: + return df + + +def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): + ''' + Check that a datafile is valid + + + @return tuple + boolean - validation result + string - message + ''' + + if prediction_type not in ["classification", "regression"]: + logger.warn(f"Invalid prediction type: '{prediction_type}'") + return False, f"Invalid prediction type: '{prediction_type}'" + + num_df = df + + # dimension validation + if df.shape[0] < MIN_ROWS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( + df.shape, MIN_ROWS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) + + if df.shape[1] < MIN_COLS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( + df.shape, MIN_COLS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) + + # target column validation + if (target_column != None): + if not (target_column in df.columns): + logger.warn("Target column '" + target_column + "' not in data") + return False, "Target column '" + target_column + "' not in data" + if categories and target_column in categories: + logger.warn("Target column '" + target_column + + "' cannot be a categorical feature") + return False, "Target column '" + target_column + "' cannot be a categorical feature" + if ordinals and target_column in ordinals: + logger.warn("Target column '" + target_column + + "' cannot be an ordinal feature") + return False, "Target column '" + target_column + "' cannot be an ordinal feature" + + # check that cat columns can be encoded + if categories or ordinals: + try: + encode_data(df, target_column, categories, + ordinals, "OneHotEncoder") + encode_data(df, target_column, categories, + ordinals, "OrdinalEncoder") + except Exception as e: + logger.warn("encode_data() failed, " + str(e)) + return False, "encode_data() failed, " + str(e) + + if categories: + num_df = num_df.drop(columns=categories) + if ordinals: + num_df = num_df.drop(columns=list(ordinals.keys())) + + # check only check if target is specified + if target_column: + + # classification + if (prediction_type == "classification"): + # target column of classification problem does not need to be numeric + num_df = num_df.drop(columns=target_column, axis=1) + + # Check rows per class + counts = df.groupby(target_column).count() + fails_validation = counts[counts[counts.columns[1]] + < MIN_ROW_PER_CLASS] + if (not fails_validation.empty): + msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( + list(fails_validation.index.values)) + logger.warn(msg) + return False, msg + + # check that non-cat feature columns contain only numeric data + if (len(num_df.columns)) > 0: + try: + check_array(num_df, dtype=np.float64, + order="C", force_all_finite=True) + + except Exception as e: + logger.warn("sklearn.check_array() validation " + str(e)) + return False, "sklearn.check_array() validation 
" + str(e) + + # check t + + return True, None + + +def get_file_from_server(file_id): + ''' + Retrieve a file from the main Aliro server + ''' + apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT'] + path = apiPath + "/api/v1/files/" + file_id + + logger.debug("retrieving file:" + file_id) + logger.debug("api path: " + path) + + res = None + try: + res = requests.request('GET', path, timeout=15) + except: + logger.error("Unexpected error in get_file_from_server for path 'GET: " + + str(path) + "': " + str(sys.exc_info()[0])) + raise + + if res.status_code != requests.codes.ok: + msg = "Request GET status_code not ok, path: '" + \ + str(path) + "'' status code: '" + str(res.status_code) + \ + "'' response text: " + str(res.text) + logger.error(msg) + raise RuntimeError(msg) + + logger.info("File retrieved, file_id: '" + file_id + + "', path: '" + path + "', status_code: " + str(res.status_code)) + return res.text + + +def main(): + meta_features_all = [] + parser = argparse.ArgumentParser( + description="Validate a dataset", add_help=False) + parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.') + parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class', + help='Name of target column', required=False) + parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath', + help='Name of target column') + parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None, + help='JSON list of categorical features') + parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None, + help='JSON dict of ordianl features and possible values') + parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification", + help="Classification or regression problem") + + args = parser.parse_args() + + # set up the file logger + logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs") + if not os.path.exists(logpath): + os.makedirs(logpath) + + formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') + fhandler = logging.FileHandler( + os.path.join(logpath, 'validateDataset.log')) + fhandler.setFormatter(formatter) + logger.addHandler(fhandler) + + success = None + errorMessage = None + meta_json = None + + categories = None + ordinals = None + + try: + if args.JSON_CATEGORIES: + categories = simplejson.loads(args.JSON_CATEGORIES) + if args.JSON_ORDINALS: + ordinals = simplejson.loads(args.JSON_ORDINALS) + prediction_type = args.PREDICTION_TYPE + # print("categories: ") + # print(categories) + + if (args.IDENTIFIER_TYPE == 'filepath'): + success, errorMessage = validate_data_from_filepath( + args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) + else: + success, errorMessage = validate_data_from_server( + args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) + meta_json = simplejson.dumps( + {"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False) + except Exception as e: + logger.error(traceback.format_exc()) + meta_json = simplejson.dumps( + {"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False) + + print(meta_json) + sys.stdout.flush() + + +if __name__ == '__main__': + main() diff --git a/lab/pyutils/validateDataset.py 
b/lab/pyutils/validateDataset.py index af089624b..bb523ae43 100644 --- a/lab/pyutils/validateDataset.py +++ b/lab/pyutils/validateDataset.py @@ -54,16 +54,63 @@ MIN_ROW_PER_CLASS = 2 +def check_dataframe(df, target_column): + ''' + check_dataframe function checks whether each column contains only numeric data or not. + missing values are not allowed in df. + strings are not allowed in df. + inf or -inf are not allowed in df. + ''' + + error_message = "Found error in data:" + + # find columns contain missing value(NaN) in df + nan_cols = df.columns[df.isnull().any()].tolist() + if len(nan_cols) > 0: + error_message += "* 'MISSING VALUE' in " + \ + str(nan_cols) + "" + + df_non_target = df.drop(columns=target_column, axis=1) + inf_cols_list = [] + + # find which features contain infinity or -infinity in df_non_target + + # find columns whose data type is object + # object dtype for storing strings in pandas + # if a column contains both string and numeric data, its dtype is object. + str_cols = df.columns[df.dtypes == object].tolist() + + non_str_cols = df_non_target.columns.difference(str_cols) + + for col in non_str_cols: + + if np.isinf(df[col]).any(): + inf_cols_list.append(col) + + if len(inf_cols_list) > 0: + error_message += "* '+INFINITY or -INFINITY' in " + \ + str(inf_cols_list) + " " + + # str_trigger = False + if len(str_cols) > 0: + error_message += "* 'STRING' in " + \ + str(str_cols)+" " + + return error_message + + def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): # Read the data set into memory raw_data = get_file_from_server(file_id) df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) + # return validate_data(df, prediction_type, target_field, categories, ordinals) + return validate_data_updated(df, prediction_type, target_field, categories, ordinals) def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): # Read the data set into memory df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) + # print("dfprint", df) return validate_data(df, prediction_type, target_field, categories, ordinals) @@ -190,6 +237,87 @@ def validate_data(df, prediction_type="classification", target_column=None, cate return True, None +def validate_data_updated(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): + ''' + Check that a df is valid + This function checks for the following: + - prediction_type is valid + - number of rows and columns is valid + - target column is valid + - missing values in df. + - strings in df. + - inf or -inf in df. 
+ + + + @return tuple + boolean - validation result + string - message + ''' + + # check prediction type is valid + if prediction_type not in ["classification", "regression"]: + logger.warn(f"Invalid prediction type: '{prediction_type}'") + return False, f"Invalid prediction type: '{prediction_type}'" + + # check the number of rows and columns is valid + if df.shape[0] < MIN_ROWS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( + df.shape, MIN_ROWS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) + + # check the number of columns is valid + if df.shape[1] < MIN_COLS: + logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( + df.shape, MIN_COLS)) + return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) + + # target column validation + if (target_column != None): + if not (target_column in df.columns): + logger.warn("Target column '" + target_column + "' not in data") + return False, "Target column '" + target_column + "' not in data" + if categories and target_column in categories: + logger.warn("Target column '" + target_column + + "' cannot be a categorical feature") + return False, "Target column '" + target_column + "' cannot be a categorical feature" + if ordinals and target_column in ordinals: + logger.warn("Target column '" + target_column + + "' cannot be an ordinal feature") + return False, "Target column '" + target_column + "' cannot be an ordinal feature" + + # check only check if target is specified + if target_column: + + # classification + if (prediction_type == "classification"): + # target column of classification problem does not need to be numeric + df_non_target = df.drop(columns=target_column, axis=1) + + # Check rows per class + counts = df.groupby(target_column).count() + fails_validation = counts[counts[counts.columns[1]] + < MIN_ROW_PER_CLASS] + if (not fails_validation.empty): + msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( + list(fails_validation.index.values)) + logger.warn(msg) + return False, msg + + # In the below code,the check_dataframe() checks whether features and target column contain only processed data. 
+ # check whether each column contains only processed data or not + # missing values are not allowed in df + # strings are not allowed in df + # inf or -inf are not allowed in df + if (len(df.columns)) > 0: + error_message = check_dataframe(df, target_column) + if error_message != "Found error in data:": + logger.warn(str(error_message)) + return False, str(error_message) + + return True, None + + def get_file_from_server(file_id): ''' Retrieve a file from the main Aliro server @@ -262,8 +390,6 @@ def main(): if args.JSON_ORDINALS: ordinals = simplejson.loads(args.JSON_ORDINALS) prediction_type = args.PREDICTION_TYPE - # print("categories: ") - # print(categories) if (args.IDENTIFIER_TYPE == 'filepath'): success, errorMessage = validate_data_from_filepath( diff --git a/lab/webapp/src/components/FileUpload/index.js b/lab/webapp/src/components/FileUpload/index.js index e07d895e8..650179d5e 100644 --- a/lab/webapp/src/components/FileUpload/index.js +++ b/lab/webapp/src/components/FileUpload/index.js @@ -2448,6 +2448,7 @@ handleCatFeaturesUserTextCancel() { {this.state.errorModalContent} + ) } From b7aa1ddb190ba00f5080a878ea145f6b1cd1fac9 Mon Sep 17 00:00:00 2001 From: hyunjuna Date: Wed, 1 Feb 2023 10:34:30 -0800 Subject: [PATCH 2/2] remove copy --- lab/pyutils/validateDataset copy.py | 286 ---------------------------- 1 file changed, 286 deletions(-) delete mode 100644 lab/pyutils/validateDataset copy.py diff --git a/lab/pyutils/validateDataset copy.py b/lab/pyutils/validateDataset copy.py deleted file mode 100644 index af089624b..000000000 --- a/lab/pyutils/validateDataset copy.py +++ /dev/null @@ -1,286 +0,0 @@ -"""~This file is part of the Aliro library~ - -Copyright (C) 2023 Epistasis Lab, -Center for Artificial Intelligence Research and Education (CAIRE), -Department of Computational Biomedicine (CBM), -Cedars-Sinai Medical Center. - -Aliro is maintained by: - - Hyunjun Choi (hyunjun.choi@cshs.org) - - Miguel Hernandez (miguel.e.hernandez@cshs.org) - - Nick Matsumoto (nicholas.matsumoto@cshs.org) - - Jay Moran (jay.moran@cshs.org) - - and many other generous open source contributors - -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program. If not, see . 
- -(Autogenerated header, do not modify) - -""" -import argparse -import sys -import simplejson -from sklearn.utils import check_X_y, check_array -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.compose import ColumnTransformer -import os -import os.path -import pandas as pd -import numpy as np -import logging -import requests -import time -import traceback -from io import StringIO - - -logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) -logger.setLevel(logging.INFO) - -MIN_ROWS = 10 -MIN_COLS = 2 -MIN_ROW_PER_CLASS = 2 - - -def validate_data_from_server(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): - # Read the data set into memory - raw_data = get_file_from_server(file_id) - df = pd.read_csv(StringIO(raw_data), sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) - - -def validate_data_from_filepath(file_id, prediction_type, target_field, categories=None, ordinals=None, **kwargs): - # Read the data set into memory - df = pd.read_csv(file_id, sep=None, engine='python', **kwargs) - return validate_data(df, prediction_type, target_field, categories, ordinals) - - -def encode_data(df, target_column, categories, ordinals, encoding_strategy="OneHotEncoder"): - ''' - use OneHotEncoder or OrdinalEncoder to convert categorical features - See skl_utils - ''' - - # check that categorical and ordinal columns can be encoded - if categories or ordinals: - transformers = [] - if categories: - if encoding_strategy == "OneHotEncoder": - transformers.append( - ("categorical_encoder", OneHotEncoder(), categories)) - elif encoding_strategy == "OrdinalEncoder": - transformers.append( - ("categorical_encoder", OrdinalEncoder(), categories)) - if ordinals: - ordinal_features = sorted(list(ordinals.keys())) - ordinal_map = [ordinals[k] for k in ordinal_features] - transformers.append(("ordinalencoder", - OrdinalEncoder(categories=ordinal_map), - ordinal_features)) - - ct = ColumnTransformer( - transformers=transformers, - remainder='passthrough', - sparse_threshold=0 - ) - return ct.fit_transform(df) - else: - return df - - -def validate_data(df, prediction_type="classification", target_column=None, categories=None, ordinals=None): - ''' - Check that a datafile is valid - - - @return tuple - boolean - validation result - string - message - ''' - - if prediction_type not in ["classification", "regression"]: - logger.warn(f"Invalid prediction type: '{prediction_type}'") - return False, f"Invalid prediction type: '{prediction_type}'" - - num_df = df - - # dimension validation - if df.shape[0] < MIN_ROWS: - logger.warn("Dataset has dimensions {}, classification datasets must have at least {} rows.".format( - df.shape, MIN_ROWS)) - return False, "Dataset has dimensions {}, classification datasets must have at least {} rows.".format(df.shape, MIN_ROWS) - - if df.shape[1] < MIN_COLS: - logger.warn("Dataset has dimensions {}, classification datasets must have at least {} columns.".format( - df.shape, MIN_COLS)) - return False, "Dataset has dimensions {}, classification datasets must have at least {} columns.".format(df.shape, MIN_COLS) - - # target column validation - if (target_column != None): - if not (target_column in df.columns): - logger.warn("Target column '" + target_column + "' not in data") - return False, "Target column '" + target_column + "' not in data" - if categories and target_column in categories: - logger.warn("Target column '" + 
target_column + - "' cannot be a categorical feature") - return False, "Target column '" + target_column + "' cannot be a categorical feature" - if ordinals and target_column in ordinals: - logger.warn("Target column '" + target_column + - "' cannot be an ordinal feature") - return False, "Target column '" + target_column + "' cannot be an ordinal feature" - - # check that cat columns can be encoded - if categories or ordinals: - try: - encode_data(df, target_column, categories, - ordinals, "OneHotEncoder") - encode_data(df, target_column, categories, - ordinals, "OrdinalEncoder") - except Exception as e: - logger.warn("encode_data() failed, " + str(e)) - return False, "encode_data() failed, " + str(e) - - if categories: - num_df = num_df.drop(columns=categories) - if ordinals: - num_df = num_df.drop(columns=list(ordinals.keys())) - - # check only check if target is specified - if target_column: - - # classification - if (prediction_type == "classification"): - # target column of classification problem does not need to be numeric - num_df = num_df.drop(columns=target_column, axis=1) - - # Check rows per class - counts = df.groupby(target_column).count() - fails_validation = counts[counts[counts.columns[1]] - < MIN_ROW_PER_CLASS] - if (not fails_validation.empty): - msg = "Classification datasets must have at least 2 rows per class, class(es) '{}' have only 1 row.".format( - list(fails_validation.index.values)) - logger.warn(msg) - return False, msg - - # check that non-cat feature columns contain only numeric data - if (len(num_df.columns)) > 0: - try: - check_array(num_df, dtype=np.float64, - order="C", force_all_finite=True) - - except Exception as e: - logger.warn("sklearn.check_array() validation " + str(e)) - return False, "sklearn.check_array() validation " + str(e) - - # check t - - return True, None - - -def get_file_from_server(file_id): - ''' - Retrieve a file from the main Aliro server - ''' - apiPath = 'http://' + os.environ['LAB_HOST'] + ':' + os.environ['LAB_PORT'] - path = apiPath + "/api/v1/files/" + file_id - - logger.debug("retrieving file:" + file_id) - logger.debug("api path: " + path) - - res = None - try: - res = requests.request('GET', path, timeout=15) - except: - logger.error("Unexpected error in get_file_from_server for path 'GET: " + - str(path) + "': " + str(sys.exc_info()[0])) - raise - - if res.status_code != requests.codes.ok: - msg = "Request GET status_code not ok, path: '" + \ - str(path) + "'' status code: '" + str(res.status_code) + \ - "'' response text: " + str(res.text) - logger.error(msg) - raise RuntimeError(msg) - - logger.info("File retrieved, file_id: '" + file_id + - "', path: '" + path + "', status_code: " + str(res.status_code)) - return res.text - - -def main(): - meta_features_all = [] - parser = argparse.ArgumentParser( - description="Validate a dataset", add_help=False) - parser.add_argument('INPUT_FILE', type=str, help='Filepath or fileId.') - parser.add_argument('-target', action='store', dest='TARGET', type=str, default='class', - help='Name of target column', required=False) - parser.add_argument('-identifier_type', action='store', dest='IDENTIFIER_TYPE', type=str, choices=['filepath', 'fileid'], default='filepath', - help='Name of target column') - parser.add_argument('-categorical_features', action='store', dest='JSON_CATEGORIES', type=str, required=False, default=None, - help='JSON list of categorical features') - parser.add_argument('-ordinal_features', action='store', dest='JSON_ORDINALS', type=str, required=False, default=None, - 
help='JSON dict of ordianl features and possible values') - parser.add_argument('-prediction_type', action='store', dest='PREDICTION_TYPE', type=str, choices=['classification', 'regression'], default="classification", - help="Classification or regression problem") - - args = parser.parse_args() - - # set up the file logger - logpath = os.path.join(os.environ['PROJECT_ROOT'], "target/logs") - if not os.path.exists(logpath): - os.makedirs(logpath) - - formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - fhandler = logging.FileHandler( - os.path.join(logpath, 'validateDataset.log')) - fhandler.setFormatter(formatter) - logger.addHandler(fhandler) - - success = None - errorMessage = None - meta_json = None - - categories = None - ordinals = None - - try: - if args.JSON_CATEGORIES: - categories = simplejson.loads(args.JSON_CATEGORIES) - if args.JSON_ORDINALS: - ordinals = simplejson.loads(args.JSON_ORDINALS) - prediction_type = args.PREDICTION_TYPE - # print("categories: ") - # print(categories) - - if (args.IDENTIFIER_TYPE == 'filepath'): - success, errorMessage = validate_data_from_filepath( - args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) - else: - success, errorMessage = validate_data_from_server( - args.INPUT_FILE, prediction_type, args.TARGET, categories, ordinals) - meta_json = simplejson.dumps( - {"success": success, "errorMessage": errorMessage}, ignore_nan=True) # , ensure_ascii=False) - except Exception as e: - logger.error(traceback.format_exc()) - meta_json = simplejson.dumps( - {"success": False, "errorMessage": "Exception: " + repr(e)}, ignore_nan=True) # , ensure_ascii=False) - - print(meta_json) - sys.stdout.flush() - - -if __name__ == '__main__': - main()
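
Reviewer note (illustration only, not part of either patch): the sketch below shows the kind of message the new check_dataframe() / validate_data_updated() path is intended to return for a problematic upload. It assumes only pandas and numpy, which validateDataset.py already imports; the sample DataFrame, its column names, and the helper name demo_check are hypothetical, and the helper mirrors the logic of check_dataframe() rather than importing the module itself.

    import numpy as np
    import pandas as pd


    def demo_check(df, target_column):
        # Mirrors check_dataframe(): collect columns with missing values,
        # +/-infinity, or string (object-dtype) data into one message.
        message = "Found error in data:"

        # columns containing NaN anywhere in the frame (target included)
        nan_cols = df.columns[df.isnull().any()].tolist()
        if nan_cols:
            message += " * 'MISSING VALUE' in " + str(nan_cols)

        # object-dtype columns hold strings (or mixed string/numeric data)
        str_cols = df.columns[df.dtypes == object].tolist()

        # the infinity check only applies to numeric, non-target columns
        non_str_cols = df.drop(columns=target_column).columns.difference(str_cols)
        inf_cols = [c for c in non_str_cols if np.isinf(df[c]).any()]
        if inf_cols:
            message += " * '+INFINITY or -INFINITY' in " + str(inf_cols)

        if str_cols:
            message += " * 'STRING' in " + str(str_cols)
        return message


    df = pd.DataFrame({
        "age":   [34, np.nan, 51, 29],      # missing value
        "score": [1.0, np.inf, 0.5, 2.0],   # infinity
        "city":  ["LA", "NY", "SF", "LA"],  # string feature
        "class": [0, 1, 1, 0],
    })
    print(demo_check(df, target_column="class"))
    # Found error in data: * 'MISSING VALUE' in ['age'] * '+INFINITY or -INFINITY' in ['score'] * 'STRING' in ['city']

As the diffs above show, only validate_data_from_server() is switched to validate_data_updated() in this series; validate_data_from_filepath() still calls the original validate_data(), so local-file validation keeps the sklearn check_array() wording while server uploads get the aggregated, column-by-column message sketched here. For reference, the script is invoked as, e.g., "python validateDataset.py my_upload.csv -target class -prediction_type classification" (file name hypothetical; PROJECT_ROOT must be set for the file logger, and fileid mode additionally needs LAB_HOST/LAB_PORT) and prints a JSON object of the form {"success": ..., "errorMessage": ...} to stdout.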