diff --git a/.travis.yml b/.travis.yml index 675186469..beaa3b53e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,11 @@ env: - TEST_DIR=/tmp/test_dir/ - MODULE=openml matrix: - - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.20.0" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" RUN_FLAKE8="true" SKIP_TESTS="true" - - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.0" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.5" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.21.2" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" RUN_FLAKE8="true" SKIP_TESTS="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.21.2" COVERAGE="true" DOCPUSH="true" + - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.20.2" # Checks for older scikit-learn versions (which also don't nicely work with # Python3.7) - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.19.2" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b13051d67..5a77dfd58 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -81,6 +81,10 @@ following rules before you submit a pull request: Drafts often benefit from the inclusion of a [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments) in the PR description. + +- Add [unit tests](https://github.com/openml/openml-python/tree/develop/tests) and [examples](https://github.com/openml/openml-python/tree/develop/examples) for any new functionality being introduced. + - If a unit test uploads anything to the test server, please ensure that the uploaded entities are marked for deletion afterwards, so that the test server does not fill up with test artifacts. For example, `TestBase._mark_entity_for_removal('data', dataset.dataset_id)`, `TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))`. + - Please ensure that each example runs against the test server by starting with a call to `openml.config.start_using_configuration_for_example()`. - All tests pass when running `pytest`. On Unix-like systems, check with (from the toplevel source folder): diff --git a/LICENSE b/LICENSE index 146b8cc36..e08aa862b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2014-2018, Matthias Feurer, Jan van Rijn, Andreas Müller, +Copyright (c) 2014-2019, Matthias Feurer, Jan van Rijn, Andreas Müller, Joaquin Vanschoren and others. All rights reserved.
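To make the CONTRIBUTING.md guidance above concrete, here is a minimal sketch of what a compliant unit test might look like. It assumes `openml.testing.TestBase` is the package's test base class and that `_mark_entity_for_removal` behaves as described in the guideline; the model, import path, and test name are purely illustrative.

```python
from sklearn.tree import DecisionTreeClassifier

from openml.extensions.sklearn import SklearnExtension  # assumed import path
from openml.testing import TestBase  # assumed location of the project's test base class


class TestFlowUpload(TestBase):
    """Hypothetical test demonstrating the clean-up convention described above."""

    def test_publish_flow_marks_entity_for_removal(self):
        # Serialize a simple scikit-learn model into an OpenML flow and publish it.
        # TestBase is assumed to point the client at the test server.
        flow = SklearnExtension().model_to_flow(DecisionTreeClassifier())
        flow.publish()
        # Record the uploaded flow so the CI clean-up step can delete it later,
        # keeping the test server from filling up with test artifacts.
        TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name))
        self.assertIsNotNone(flow.flow_id)
```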
diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md index 4cedd1478..571ae0d1c 100644 --- a/PULL_REQUEST_TEMPLATE.md +++ b/PULL_REQUEST_TEMPLATE.md @@ -9,6 +9,8 @@ Please make sure that: * for any new function or class added, please add it to doc/api.rst * the list of classes and functions should be alphabetical * for any new functionality, consider adding a relevant example +* add unit tests for new functionalities + * collect files uploaded to test server using _mark_entity_for_removal() --> #### Reference Issue diff --git a/ci_scripts/test.sh b/ci_scripts/test.sh index 80b35f04f..1c82591e0 100644 --- a/ci_scripts/test.sh +++ b/ci_scripts/test.sh @@ -1,5 +1,11 @@ set -e +# check status and branch before running the unit tests +before="`git status --porcelain -b`" +before="$before" +# storing current working directory +curr_dir=`pwd` + run_tests() { # Get into a temp directory to run test from the installed scikit learn and # check if we do not leave artifacts @@ -22,7 +28,7 @@ run_tests() { PYTEST_ARGS='' fi - pytest -n 4 --duration=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir + pytest -n 4 --durations=20 --timeout=600 --timeout-method=thread -sv --ignore='test_OpenMLDemo.py' $PYTEST_ARGS $test_dir } if [[ "$RUN_FLAKE8" == "true" ]]; then @@ -32,3 +38,15 @@ fi if [[ "$SKIP_TESTS" != "true" ]]; then run_tests fi + +# changing directory to stored working directory +cd $curr_dir +# check status and branch after running the unit tests +# compares with $before to check for remaining files +after="`git status --porcelain -b`" +if [[ "$before" != "$after" ]]; then + echo 'git status from before: '$before + echo 'git status from after: '$after + echo "All generated files have not been deleted!" + exit 1 +fi diff --git a/doc/api.rst b/doc/api.rst index 93a6d18b6..7979c7bfc 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -72,6 +72,7 @@ Modules get_dataset get_datasets list_datasets + list_qualities status_update :mod:`openml.evaluations`: Evaluation Functions @@ -83,6 +84,7 @@ Modules :template: function.rst list_evaluations + list_evaluation_measures :mod:`openml.flows`: Flow Functions ----------------------------------- diff --git a/doc/conf.py b/doc/conf.py index 9b49078fb..03a2ec0db 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -15,6 +15,7 @@ import os import sys import sphinx_bootstrap_theme +import time import openml # If extensions (or modules to document with autodoc) are in another directory, @@ -65,7 +66,7 @@ # General information about the project. project = u'OpenML' copyright = ( - u'2014-2019, the OpenML-Python team.' + u'2014-{}, the OpenML-Python team.'.format(time.strftime("%Y,%m,%d,%H,%M,%S").split(',')[0]) ) # The version info for the project you're documenting, acts as replacement for diff --git a/doc/index.rst b/doc/index.rst index 8752dbe9b..96e534705 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -21,16 +21,12 @@ Example .. code:: python import openml - from sklearn import preprocessing, tree, pipeline - - # Set the OpenML API Key which is required to upload your runs. - # You can get your own API by signing up to OpenML.org. - openml.config.apikey = 'ABC' + from sklearn import impute, tree, pipeline # Define a scikit-learn classifier or pipeline clf = pipeline.Pipeline( steps=[ - ('imputer', preprocessing.Imputer()), + ('imputer', impute.SimpleImputer()), ('estimator', tree.DecisionTreeClassifier()) ] ) @@ -39,10 +35,13 @@ Example task = openml.tasks.get_task(31) # Run the scikit-learn model on the task. 
run = openml.runs.run_model_on_task(clf, task) - # Publish the experiment on OpenML (optional, requires an API key). + # Publish the experiment on OpenML (optional, requires an API key. + # You can get your own API key by signing up to OpenML.org) run.publish() print('View the run online: %s/run/%d' % (openml.config.server, run.run_id)) +You can find more examples in our `examples gallery `_. + ---------------------------- How to get OpenML for python ---------------------------- diff --git a/doc/progress.rst b/doc/progress.rst index 5629eb0cb..33db154ef 100644 --- a/doc/progress.rst +++ b/doc/progress.rst @@ -6,6 +6,25 @@ Changelog ========= +0.10.0 +~~~~~~ +* ADD #737: Add list_evaluations_setups to return hyperparameters along with the list of evaluations. +* FIX #261: Test server is cleared of all files uploaded during unit testing. +* FIX #447: All files created by unit tests are deleted after the completion of all unit tests. +* FIX #608: Fixing dataset_id referenced before assignment error in get_run function. +* FIX #589: Fixing a bug that did not successfully upload the columns to ignore when creating and publishing a dataset. +* DOC #639: More descriptive documentation for the function to convert array format. +* DOC #719: Add documentation on uploading tasks. +* ADD #687: Adds a function to retrieve the list of evaluation measures available. +* ADD #695: A function to retrieve all the data quality measures available. +* ADD #412: Add a function to trim flow names for scikit-learn flows. +* ADD #715: `list_evaluations` now has an option to sort evaluations by score (value). +* ADD #722: Automatic reinstantiation of flow in `run_model_on_task`. Clearer errors if that's not possible. +* ADD #412: The scikit-learn extension populates the short name field for flows. +* MAINT #726: Update examples to remove deprecation warnings from scikit-learn. +* MAINT #752: Update OpenML-Python to be compatible with sklearn 0.21. + + 0.9.0 ~~~~~ * ADD #560: OpenML-Python can now handle regression tasks as well. @@ -21,6 +40,7 @@ Changelog * ADD #659: Lazy loading of task splits. * ADD #516: `run_flow_on_task` flow uploading is now optional. * ADD #680: Adds `openml.config.start_using_configuration_for_example` (and resp. stop) to easily connect to the test server. +* ADD #75, #653: Adds a pretty print for objects of the top-level classes. * FIX #642: `check_datasets_active` now correctly also returns active status of deactivated datasets. * FIX #304, #636: Allow serialization of numpy datatypes and list of lists of more types (e.g. bools, ints) for flows. * FIX #651: Fixed a bug that would prevent openml-python from finding the user's config file.
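The changelog above introduces several new entry points (`list_qualities`, `list_evaluation_measures`, `list_evaluations_setups`, and the `sort_order` option of `list_evaluations`). The snippet below is a rough sketch of how they fit together; the task id and result sizes are arbitrary, and the calls mirror the signatures added elsewhere in this diff rather than a documented, stable API.

```python
import openml

# Connect to the test server so the sketch does not touch production data.
openml.config.start_using_configuration_for_example()

# Newly exposed listings of server-side metadata.
print(openml.datasets.list_qualities()[:5])
print(openml.evaluations.list_evaluation_measures()[:5])

# Evaluations can now be sorted server-side via `sort_order`.
evals = openml.evaluations.list_evaluations(
    function='predictive_accuracy',
    task=[31],                    # arbitrary task id for illustration
    size=10,
    sort_order='desc',
    output_format='dataframe',
)
print(evals.head())

# Evaluations joined with the hyperparameter settings of their setups.
evals_setups = openml.evaluations.list_evaluations_setups(
    function='predictive_accuracy',
    task=[31],
    size=10,
)
print(evals_setups.head())

openml.config.stop_using_configuration_for_example()
```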
diff --git a/examples/fetch_evaluations_tutorial.py b/examples/fetch_evaluations_tutorial.py index 97872e9f7..10511c540 100644 --- a/examples/fetch_evaluations_tutorial.py +++ b/examples/fetch_evaluations_tutorial.py @@ -20,7 +20,6 @@ ############################################################################ import openml -from pprint import pprint ############################################################################ # Listing evaluations @@ -37,7 +36,7 @@ output_format='dataframe') # Querying the returned results for precision above 0.98 -pprint(evals[evals.value > 0.98]) +print(evals[evals.value > 0.98]) ############################################################################# # Viewing a sample task @@ -47,7 +46,7 @@ # We will start by displaying a simple *supervised classification* task: task_id = 167140 # https://www.openml.org/t/167140 task = openml.tasks.get_task(task_id) -pprint(vars(task)) +print(task) ############################################################################# # Obtaining all the evaluations for the task @@ -60,11 +59,11 @@ evals = openml.evaluations.list_evaluations(function=metric, task=[task_id], output_format='dataframe') # Displaying the first 10 rows -pprint(evals.head(n=10)) +print(evals.head(n=10)) # Sorting the evaluations in decreasing order of the metric chosen evals = evals.sort_values(by='value', ascending=False) print("\nDisplaying head of sorted dataframe: ") -pprint(evals.head()) +print(evals.head()) ############################################################################# # Obtaining CDF of metric for chosen task @@ -147,4 +146,4 @@ def plot_flow_compare(evaluations, top_n=10, metric='predictive_accuracy'): flow_ids = evals.flow_id.unique()[:top_n] flow_names = evals.flow_name.unique()[:top_n] for i in range(top_n): - pprint((flow_ids[i], flow_names[i])) + print((flow_ids[i], flow_names[i])) diff --git a/examples/flows_and_runs_tutorial.py b/examples/flows_and_runs_tutorial.py index d196c30ee..d65abdf28 100644 --- a/examples/flows_and_runs_tutorial.py +++ b/examples/flows_and_runs_tutorial.py @@ -6,8 +6,7 @@ """ import openml -from pprint import pprint -from sklearn import ensemble, neighbors, preprocessing, pipeline, tree +from sklearn import compose, ensemble, impute, neighbors, preprocessing, pipeline, tree ############################################################################ # Train machine learning models @@ -39,8 +38,9 @@ target=dataset.default_target_attribute ) print("Categorical features: {}".format(categorical_indicator)) -enc = preprocessing.OneHotEncoder(categorical_features=categorical_indicator) -X = enc.fit_transform(X) +transformer = compose.ColumnTransformer( + [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)]) +X = transformer.fit_transform(X) clf.fit(X, y) ############################################################################ @@ -57,7 +57,7 @@ # Run the flow run = openml.runs.run_model_on_task(clf, task) -# pprint(vars(run), depth=2) +print(run) ############################################################################ # Share the run on the OpenML server @@ -74,18 +74,38 @@ # We can now also inspect the flow object which was automatically created: flow = openml.flows.get_flow(run.flow_id) -pprint(vars(flow), depth=1) +print(flow) ############################################################################ # It also works with pipelines # ############################ # # When you need to handle 'dirty' data, build pipelines to model then automatically. 
-task = openml.tasks.get_task(115) +task = openml.tasks.get_task(1) +features = task.get_dataset().features +nominal_feature_indices = [ + i for i in range(len(features)) + if features[i].name != task.target_name and features[i].data_type == 'nominal' +] pipe = pipeline.Pipeline(steps=[ - ('Imputer', preprocessing.Imputer(strategy='median')), - ('OneHotEncoder', preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), - ('Classifier', ensemble.RandomForestClassifier()) + ( + 'Preprocessing', + compose.ColumnTransformer([ + ('Nominal', pipeline.Pipeline( + [ + ('Imputer', impute.SimpleImputer(strategy='most_frequent')), + ( + 'Encoder', + preprocessing.OneHotEncoder( + sparse=False, handle_unknown='ignore', + ) + ), + ]), + nominal_feature_indices, + ), + ]), + ), + ('Classifier', ensemble.RandomForestClassifier(n_estimators=10)) ]) run = openml.runs.run_model_on_task(pipe, task, avoid_duplicate_runs=False) diff --git a/examples/introduction_tutorial.py b/examples/introduction_tutorial.py index 7dc3a8324..9cd88ceba 100644 --- a/examples/introduction_tutorial.py +++ b/examples/introduction_tutorial.py @@ -1,6 +1,6 @@ """ Introduction -=================== +============ An introduction to OpenML, followed up by a simple example. """ @@ -15,6 +15,8 @@ # * Works seamlessly with scikit-learn and other libraries # * Large scale benchmarking, compare to state of the art # + +############################################################################ # Installation # ^^^^^^^^^^^^ # Installation is done via ``pip``: @@ -26,6 +28,8 @@ # For further information, please check out the installation guide at # https://openml.github.io/openml-python/master/contributing.html#installation # + +############################################################################ # Authentication # ^^^^^^^^^^^^^^ # @@ -49,6 +53,7 @@ # .. warning:: This example uploads data. For that reason, this example # connects to the test server instead. This prevents the live server from # crowding with example datasets, tasks, studies, and so on. + ############################################################################ import openml from sklearn import neighbors diff --git a/examples/sklearn/openml_run_example.py b/examples/sklearn/openml_run_example.py index 84e11bd54..195a0aa77 100644 --- a/examples/sklearn/openml_run_example.py +++ b/examples/sklearn/openml_run_example.py @@ -5,7 +5,7 @@ An example of an automated machine learning experiment. """ import openml -from sklearn import tree, preprocessing, pipeline +from sklearn import impute, tree, pipeline ############################################################################ # .. warning:: This example uploads data. 
For that reason, this example @@ -21,7 +21,7 @@ # Define a scikit-learn pipeline clf = pipeline.Pipeline( steps=[ - ('imputer', preprocessing.Imputer()), + ('imputer', impute.SimpleImputer()), ('estimator', tree.DecisionTreeClassifier()) ] ) diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py index f1f07d027..c54ecdbd9 100644 --- a/examples/tasks_tutorial.py +++ b/examples/tasks_tutorial.py @@ -7,7 +7,6 @@ import openml import pandas as pd -from pprint import pprint ############################################################################ # @@ -40,11 +39,11 @@ tasks = pd.DataFrame.from_dict(tasks, orient='index') print(tasks.columns) print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) # The same can be obtained through lesser lines of code tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe') -pprint(tasks_df.head()) +print(tasks_df.head()) ############################################################################ # We can filter the list of tasks to only contain datasets with more than @@ -78,7 +77,7 @@ tasks = openml.tasks.list_tasks(tag='OpenML100') tasks = pd.DataFrame.from_dict(tasks, orient='index') print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) ############################################################################ # Furthermore, we can list tasks based on the dataset id: @@ -86,14 +85,14 @@ tasks = openml.tasks.list_tasks(data_id=1471) tasks = pd.DataFrame.from_dict(tasks, orient='index') print("First 5 of %s tasks:" % len(tasks)) -pprint(tasks.head()) +print(tasks.head()) ############################################################################ # In addition, a size limit and an offset can be applied both separately and simultaneously: tasks = openml.tasks.list_tasks(size=10, offset=50) tasks = pd.DataFrame.from_dict(tasks, orient='index') -pprint(tasks) +print(tasks) ############################################################################ # @@ -134,11 +133,87 @@ ############################################################################ # Properties of the task are stored as member variables: -pprint(vars(task)) +print(task) ############################################################################ # And: ids = [2, 1891, 31, 9983] tasks = openml.tasks.get_tasks(ids) -pprint(tasks[0]) +print(tasks[0]) + +############################################################################ +# Creating tasks +# ^^^^^^^^^^^^^^ +# +# You can also create new tasks. Take the following into account: +# +# * You can only create tasks on _active_ datasets +# * For now, only the following tasks are supported: classification, regression, +# clustering, and learning curve analysis. +# * For now, tasks can only be created on a single dataset. +# * The exact same task must not already exist. +# +# Creating a task requires the following input: +# +# * task_type_id: The task type ID, required (see below). Required. +# * dataset_id: The dataset ID. Required. +# * target_name: The name of the attribute you aim to predict. +# Optional. +# * estimation_procedure_id : The ID of the estimation procedure used to create train-test +# splits. Optional. +# * evaluation_measure: The name of the evaluation measure. Optional. +# * Any additional inputs for specific tasks +# +# It is best to leave the evaluation measure open if there is no strong prerequisite for a +# specific measure. 
OpenML will always compute all appropriate measures and you can filter +# or sort results on your favourite measure afterwards. Only add an evaluation measure if +# necessary (e.g. when other measure make no sense), since it will create a new task, which +# scatters results across tasks. + + +############################################################################ +# Example +# ####### +# +# Let's create a classification task on a dataset. In this example we will do this on the +# Iris dataset (ID=128 (on test server)). We'll use 10-fold cross-validation (ID=1), +# and _predictive accuracy_ as the predefined measure (this can also be left open). +# If a task with these parameters exist, we will get an appropriate exception. +# If such a task doesn't exist, a task will be created and the corresponding task_id +# will be returned. + + +# using test server for example uploads +openml.config.start_using_configuration_for_example() + +try: + tasktypes = openml.tasks.TaskTypeEnum + my_task = openml.tasks.create_task( + task_type_id=tasktypes.SUPERVISED_CLASSIFICATION, + dataset_id=128, + target_name="class", + evaluation_measure="predictive_accuracy", + estimation_procedure_id=1) + my_task.publish() +except openml.exceptions.OpenMLServerException as e: + # Error code for 'task already exists' + if e.code == 614: + # Lookup task + tasks = openml.tasks.list_tasks(data_id=128, output_format='dataframe').to_numpy() + tasks = tasks[tasks[:, 4] == "Supervised Classification"] + tasks = tasks[tasks[:, 6] == "10-fold Crossvalidation"] + tasks = tasks[tasks[:, 19] == "predictive_accuracy"] + task_id = tasks[0][0] + print("Task already exists. Task ID is", task_id) + +# reverting to prod server +openml.config.stop_using_configuration_for_example() + + +############################################################################ +# [Complete list of task types](https://www.openml.org/search?type=task_type) +# [Complete list of model estimation procedures]( +# https://www.openml.org/search?q=%2520measure_type%3Aestimation_procedure&type=measure) +# [Complete list of evaluation measures]( +# https://www.openml.org/search?q=measure_type%3Aevaluation_measure&type=measure) diff --git a/openml/__version__.py b/openml/__version__.py index bfb63854a..fd6968a5d 100644 --- a/openml/__version__.py +++ b/openml/__version__.py @@ -1,4 +1,4 @@ """Version information.""" # The following line *must* be the last in the module, exactly as formatted: -__version__ = "0.9.0" +__version__ = "0.10.0" diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index 78bc41237..8f52e16fc 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -6,6 +6,7 @@ get_datasets, list_datasets, status_update, + list_qualities ) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature @@ -20,4 +21,5 @@ 'OpenMLDataset', 'OpenMLDataFeature', 'status_update', + 'list_qualities' ] diff --git a/openml/datasets/data_feature.py b/openml/datasets/data_feature.py index b271e63dc..077be639e 100644 --- a/openml/datasets/data_feature.py +++ b/openml/datasets/data_feature.py @@ -1,18 +1,19 @@ class OpenMLDataFeature(object): - """Data Feature (a.k.a. Attribute) object. + """ + Data Feature (a.k.a. Attribute) object. 
- Parameters - ---------- - index : int - The index of this feature - name : str - Name of the feature - data_type : str - can be nominal, numeric, string, date (corresponds to arff) - nominal_values : list(str) - list of the possible values, in case of nominal attribute - number_missing_values : int - """ + Parameters + ---------- + index : int + The index of this feature + name : str + Name of the feature + data_type : str + can be nominal, numeric, string, date (corresponds to arff) + nominal_values : list(str) + list of the possible values, in case of nominal attribute + number_missing_values : int + """ LEGAL_DATA_TYPES = ['nominal', 'numeric', 'string', 'date'] def __init__(self, index, name, data_type, nominal_values, @@ -22,8 +23,16 @@ def __init__(self, index, name, data_type, nominal_values, if data_type not in self.LEGAL_DATA_TYPES: raise ValueError('data type should be in %s, found: %s' % (str(self.LEGAL_DATA_TYPES), data_type)) - if nominal_values is not None and type(nominal_values) != list: - raise ValueError('Nominal_values is of wrong datatype') + if data_type == 'nominal': + if nominal_values is None: + raise TypeError('Dataset features require attribute `nominal_values` for nominal ' + 'feature type.') + elif not isinstance(nominal_values, list): + raise TypeError('Argument `nominal_values` is of wrong datatype, should be list, ' + 'but is {}'.format(type(nominal_values))) + else: + if nominal_values is not None: + raise TypeError('Argument `nominal_values` must be None for non-nominal feature.') if type(number_missing_values) != int: raise ValueError('number_missing_values is of wrong datatype') @@ -33,7 +42,7 @@ def __init__(self, index, name, data_type, nominal_values, self.nominal_values = nominal_values self.number_missing_values = number_missing_values - def __str__(self): + def __repr__(self): return "[%d - %s (%s)]" % (self.index, self.name, self.data_type) def _repr_pretty_(self, pp, cycle): diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index b6833a513..630fac35e 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -132,9 +132,9 @@ def __init__(self, name, description, format=None, self.default_target_attribute = default_target_attribute self.row_id_attribute = row_id_attribute if isinstance(ignore_attribute, str): - self.ignore_attributes = [ignore_attribute] + self.ignore_attribute = [ignore_attribute] elif isinstance(ignore_attribute, list) or ignore_attribute is None: - self.ignore_attributes = ignore_attribute + self.ignore_attribute = ignore_attribute else: raise ValueError('Wrong data type for ignore_attribute. 
' 'Should be list.') @@ -153,7 +153,6 @@ def __init__(self, name, description, format=None, if features is not None: self.features = {} - # todo add nominal values (currently not in database) for idx, xmlfeature in enumerate(features['oml:feature']): nr_missing = xmlfeature.get('oml:number_of_missing_values', 0) feature = OpenMLDataFeature(int(xmlfeature['oml:index']), @@ -173,6 +172,36 @@ def __init__(self, name, description, format=None, else: self.data_pickle_file = None + def __repr__(self): + header = "OpenML Dataset" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Name": self.name, + "Version": self.version, + "Format": self.format, + "Licence": self.licence, + "Download URL": self.url, + "Data file": self.data_file, + "Pickle file": self.data_pickle_file, + "# of features": len(self.features)} + if self.upload_date is not None: + fields["Upload Date"] = self.upload_date.replace('T', ' ') + if self.dataset_id is not None: + fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id) + if self.qualities['NumberOfInstances'] is not None: + fields["# of instances"] = int(self.qualities['NumberOfInstances']) + + # determines the order in which the information will be printed + order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL", + "OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + def _data_arff_to_pickle(self, data_file): data_pickle_file = data_file.replace('.arff', '.pkl.py3') if os.path.exists(data_pickle_file): @@ -368,9 +397,25 @@ def decode_arff(fh): def _convert_array_format(data, array_format, attribute_names): """Convert a dataset to a given array format. - By default, the data are stored as a sparse matrix or a pandas - dataframe. One might be interested to get a pandas SparseDataFrame or a - NumPy array instead, respectively. + Converts to numpy array if data is non-sparse. + Converts to a sparse dataframe if data is sparse. + + Parameters + ---------- + array_format : str {'array', 'dataframe'} + Desired data type of the output + - If array_format='array' + If data is non-sparse + Converts to numpy-array + Enforces numeric encoding of categorical columns + Missing values are represented as NaN in the numpy-array + else returns data as is + - If array_format='dataframe' + If data is sparse + Works only on sparse data + Converts sparse data to sparse dataframe + else returns data as is + """ if array_format == "array" and not scipy.sparse.issparse(data): # We encode the categories such that they are integer to be able @@ -396,8 +441,11 @@ def _encode_if_category(column): 'PyOpenML cannot handle string when returning numpy' ' arrays. Use dataset_format="dataframe".' ) - if array_format == "dataframe" and scipy.sparse.issparse(data): + elif array_format == "dataframe" and scipy.sparse.issparse(data): return pd.SparseDataFrame(data, columns=attribute_names) + else: + data_type = "sparse-data" if scipy.sparse.issparse(data) else "non-sparse data" + warn("Cannot convert {} to '{}'. 
Returning input data.".format(data_type, array_format)) return data @staticmethod @@ -423,7 +471,7 @@ def get_data( self, target: Optional[Union[List[str], str]] = None, include_row_id: bool = False, - include_ignore_attributes: bool = False, + include_ignore_attribute: bool = False, dataset_format: str = "dataframe", ) -> Tuple[ Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix], @@ -440,7 +488,7 @@ def get_data( Splitting multiple columns is currently not supported. include_row_id : boolean (default=False) Whether to include row ids in the returned dataset. - include_ignore_attributes : boolean (default=False) + include_ignore_attribute : boolean (default=False) Whether to include columns that are marked as "ignore" on the server in the dataset. dataset_format : string (default='dataframe') @@ -479,11 +527,11 @@ def get_data( elif isinstance(self.row_id_attribute, Iterable): to_exclude.extend(self.row_id_attribute) - if not include_ignore_attributes and self.ignore_attributes is not None: - if isinstance(self.ignore_attributes, str): - to_exclude.append(self.ignore_attributes) - elif isinstance(self.ignore_attributes, Iterable): - to_exclude.extend(self.ignore_attributes) + if not include_ignore_attribute and self.ignore_attribute is not None: + if isinstance(self.ignore_attribute, str): + to_exclude.append(self.ignore_attribute) + elif isinstance(self.ignore_attribute, Iterable): + to_exclude.extend(self.ignore_attribute) if len(to_exclude) > 0: logger.info("Going to remove the following attributes:" @@ -566,7 +614,7 @@ def retrieve_class_labels(self, target_name: str = 'class') -> Union[None, List[ return None def get_features_by_type(self, data_type, exclude=None, - exclude_ignore_attributes=True, + exclude_ignore_attribute=True, exclude_row_id_attribute=True): """ Return indices of features of a given type, e.g. all nominal features. 
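The renamed `include_ignore_attribute` keyword and the clarified `dataset_format` behaviour documented above can be exercised with a short sketch like the following. The dataset id is arbitrary, and the four-element return tuple is assumed from the surrounding code rather than restated from the official docs.

```python
import openml

dataset = openml.datasets.get_dataset(61)  # arbitrary public dataset (Iris)

# 'dataframe' keeps pandas dtypes; 'array' enforces a numeric numpy encoding
# and now warns, instead of silently returning, when conversion is not possible.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
    dataset_format='dataframe',
    include_ignore_attribute=False,  # renamed from include_ignore_attributes
)
print(X.shape, y.shape, sum(categorical_indicator))
```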
@@ -579,7 +627,7 @@ def get_features_by_type(self, data_type, exclude=None, exclude : list(int) Indices to exclude (and adapt the return values as if these indices are not present) - exclude_ignore_attributes : bool + exclude_ignore_attribute : bool Whether to exclude the defined ignore attributes (and adapt the return values as if these indices are not present) exclude_row_id_attribute : bool @@ -593,9 +641,9 @@ def get_features_by_type(self, data_type, exclude=None, """ if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES: raise TypeError("Illegal feature type requested") - if self.ignore_attributes is not None: - if not isinstance(self.ignore_attributes, list): - raise TypeError("ignore_attributes should be a list") + if self.ignore_attribute is not None: + if not isinstance(self.ignore_attribute, list): + raise TypeError("ignore_attribute should be a list") if self.row_id_attribute is not None: if not isinstance(self.row_id_attribute, str): raise TypeError("row id attribute should be a str") @@ -607,8 +655,8 @@ def get_features_by_type(self, data_type, exclude=None, to_exclude = [] if exclude is not None: to_exclude.extend(exclude) - if exclude_ignore_attributes and self.ignore_attributes is not None: - to_exclude.extend(self.ignore_attributes) + if exclude_ignore_attribute and self.ignore_attribute is not None: + to_exclude.extend(self.ignore_attribute) if exclude_row_id_attribute and self.row_id_attribute is not None: to_exclude.append(self.row_id_attribute) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 30f58757c..1ed888ec1 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -165,6 +165,30 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str: return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) +def list_qualities() -> List[str]: + """ Return list of data qualities available. + + The function performs an API call to retrieve the entire list of + data qualities that are computed on the datasets uploaded. 
+ + Returns + ------- + list + """ + api_call = "data/qualities/list" + xml_string = openml._api_calls._perform_api_call(api_call, 'get') + qualities = xmltodict.parse(xml_string, force_list=('oml:quality')) + # Minimalistic check if the XML is useful + if 'oml:data_qualities_list' not in qualities: + raise ValueError('Error in return XML, does not contain ' + '"oml:data_qualities_list"') + if not isinstance(qualities['oml:data_qualities_list']['oml:quality'], list): + raise TypeError('Error in return XML, does not contain ' + '"oml:quality" as a list') + qualities = qualities['oml:data_qualities_list']['oml:quality'] + return qualities + + def list_datasets( offset: Optional[int] = None, size: Optional[int] = None, @@ -277,10 +301,10 @@ def __list_datasets(api_call, output_format='dict'): datasets = dict() for dataset_ in datasets_dict['oml:data']['oml:dataset']: - ignore_attributes = ['oml:file_id', 'oml:quality'] + ignore_attribute = ['oml:file_id', 'oml:quality'] dataset = {k.replace('oml:', ''): v for (k, v) in dataset_.items() - if k not in ignore_attributes} + if k not in ignore_attribute} dataset['did'] = int(dataset['did']) dataset['version'] = int(dataset['version']) diff --git a/openml/evaluations/__init__.py b/openml/evaluations/__init__.py index 650ba3502..43cec8738 100644 --- a/openml/evaluations/__init__.py +++ b/openml/evaluations/__init__.py @@ -1,4 +1,5 @@ from .evaluation import OpenMLEvaluation -from .functions import list_evaluations +from .functions import list_evaluations, list_evaluation_measures, list_evaluations_setups -__all__ = ['OpenMLEvaluation', 'list_evaluations'] +__all__ = ['OpenMLEvaluation', 'list_evaluations', 'list_evaluation_measures', + 'list_evaluations_setups'] diff --git a/openml/evaluations/evaluation.py b/openml/evaluations/evaluation.py index a22b6598f..48b407575 100644 --- a/openml/evaluations/evaluation.py +++ b/openml/evaluations/evaluation.py @@ -1,3 +1,5 @@ +import openml.config + class OpenMLEvaluation(object): """ @@ -47,3 +49,32 @@ def __init__(self, run_id, task_id, setup_id, flow_id, flow_name, self.value = value self.values = values self.array_data = array_data + + def __repr__(self): + header = "OpenML Evaluation" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Upload Date": self.upload_time, + "Run ID": self.run_id, + "OpenML Run URL": "{}r/{}".format(base_url, self.run_id), + "Task ID": self.task_id, + "OpenML Task URL": "{}t/{}".format(base_url, self.task_id), + "Flow ID": self.flow_id, + "OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Setup ID": self.setup_id, + "Data ID": self.data_id, + "Data Name": self.data_name, + "OpenML Data URL": "{}d/{}".format(base_url, self.data_id), + "Metric Used": self.function, + "Result": self.value} + + order = ["Uploader Date", "Run ID", "OpenML Run URL", "Task ID", "OpenML Task URL" + "Flow ID", "OpenML Flow URL", "Setup ID", "Data ID", "Data Name", + "OpenML Data URL", "Metric Used", "Result"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py index 322168aa4..55517f3d6 100644 --- a/openml/evaluations/functions.py +++ 
b/openml/evaluations/functions.py @@ -1,11 +1,14 @@ import json import xmltodict import pandas as pd +import numpy as np from typing import Union, List, Optional, Dict +import collections import openml.utils import openml._api_calls from ..evaluations import OpenMLEvaluation +import openml def list_evaluations( @@ -19,6 +22,7 @@ def list_evaluations( uploader: Optional[List] = None, tag: Optional[str] = None, per_fold: Optional[bool] = None, + sort_order: Optional[str] = None, output_format: str = 'object' ) -> Union[Dict, pd.DataFrame]: """ @@ -48,6 +52,9 @@ def list_evaluations( per_fold : bool, optional + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='object') The parameter decides the format of the output. - If 'object' the output is a dict of OpenMLEvaluation objects @@ -77,6 +84,7 @@ def list_evaluations( flow=flow, uploader=uploader, tag=tag, + sort_order=sort_order, per_fold=per_fold_str) @@ -87,6 +95,7 @@ def _list_evaluations( setup: Optional[List] = None, flow: Optional[List] = None, uploader: Optional[List] = None, + sort_order: Optional[str] = None, output_format: str = 'object', **kwargs ) -> Union[Dict, pd.DataFrame]: @@ -114,6 +123,9 @@ def _list_evaluations( kwargs: dict, optional Legal filter operators: tag, limit, offset. + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='dict') The parameter decides the format of the output. - If 'dict' the output is a dict of dict @@ -141,6 +153,8 @@ def _list_evaluations( api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow]) if uploader is not None: api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader]) + if sort_order is not None: + api_call += "/sort_order/%s" % sort_order return __list_evaluations(api_call, output_format=output_format) @@ -157,7 +171,7 @@ def __list_evaluations(api_call, output_format='object'): assert type(evals_dict['oml:evaluations']['oml:evaluation']) == list, \ type(evals_dict['oml:evaluations']) - evals = dict() + evals = collections.OrderedDict() for eval_ in evals_dict['oml:evaluations']['oml:evaluation']: run_id = int(eval_['oml:run_id']) value = None @@ -197,6 +211,119 @@ def __list_evaluations(api_call, output_format='object'): 'array_data': array_data} if output_format == 'dataframe': - evals = pd.DataFrame.from_dict(evals, orient='index') - + rows = [value for key, value in evals.items()] + evals = pd.DataFrame.from_records(rows, columns=rows[0].keys()) return evals + + +def list_evaluation_measures() -> List[str]: + """ Return list of evaluation measures available. + + The function performs an API call to retrieve the entire list of + evaluation measures that are available. 
+ + Returns + ------- + list + + """ + api_call = "evaluationmeasure/list" + xml_string = openml._api_calls._perform_api_call(api_call, 'get') + qualities = xmltodict.parse(xml_string, force_list=('oml:measures')) + # Minimalistic check if the XML is useful + if 'oml:evaluation_measures' not in qualities: + raise ValueError('Error in return XML, does not contain ' + '"oml:evaluation_measures"') + if not isinstance(qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure'], + list): + raise TypeError('Error in return XML, does not contain ' + '"oml:measure" as a list') + qualities = qualities['oml:evaluation_measures']['oml:measures'][0]['oml:measure'] + return qualities + + +def list_evaluations_setups( + function: str, + offset: Optional[int] = None, + size: Optional[int] = None, + id: Optional[List] = None, + task: Optional[List] = None, + setup: Optional[List] = None, + flow: Optional[List] = None, + uploader: Optional[List] = None, + tag: Optional[str] = None, + per_fold: Optional[bool] = None, + sort_order: Optional[str] = None, + output_format: str = 'dataframe' +) -> Union[Dict, pd.DataFrame]: + """ + List all run-evaluation pairs matching all of the given filters + and their hyperparameter settings. + + Parameters + ---------- + function : str + the evaluation function. e.g., predictive_accuracy + offset : int, optional + the number of runs to skip, starting from the first + size : int, optional + the maximum number of runs to show + id : list[int], optional + the list of evaluation ID's + task : list[int], optional + the list of task ID's + setup: list[int], optional + the list of setup ID's + flow : list[int], optional + the list of flow ID's + uploader : list[int], optional + the list of uploader ID's + tag : str, optional + filter evaluation based on given tag + per_fold : bool, optional + sort_order : str, optional + order of sorting evaluations, ascending ("asc") or descending ("desc") + output_format: str, optional (default='dataframe') + The parameter decides the format of the output. + - If 'dict' the output is a dict of dict + - If 'dataframe' the output is a pandas DataFrame + + + Returns + ------- + dict or dataframe with hyperparameter settings as a list of tuples. 
+ """ + # List evaluations + evals = list_evaluations(function=function, offset=offset, size=size, id=id, task=task, + setup=setup, flow=flow, uploader=uploader, tag=tag, + per_fold=per_fold, sort_order=sort_order, output_format='dataframe') + + # List setups + # Split setups in evals into chunks of N setups as list_setups does not support large size + df = pd.DataFrame() + if len(evals) != 0: + N = 100 + setup_chunks = np.split(evals['setup_id'].unique(), + ((len(evals['setup_id'].unique()) - 1) // N) + 1) + setups = pd.DataFrame() + for setup in setup_chunks: + result = pd.DataFrame(openml.setups.list_setups(setup=setup, output_format='dataframe')) + result.drop('flow_id', axis=1, inplace=True) + # concat resulting setup chunks into single datframe + setups = pd.concat([setups, result], ignore_index=True) + parameters = [] + # Convert parameters of setup into list of tuples of (hyperparameter, value) + for parameter_dict in setups['parameters']: + if parameter_dict is not None: + parameters.append([tuple([param['parameter_name'], param['value']]) + for param in parameter_dict.values()]) + else: + parameters.append([]) + setups['parameters'] = parameters + # Merge setups with evaluations + df = pd.merge(evals, setups, on='setup_id', how='left') + + if output_format == 'dataframe': + return df + else: + return df.to_dict(orient='index') diff --git a/openml/exceptions.py b/openml/exceptions.py index 2bd52ca49..492587adc 100644 --- a/openml/exceptions.py +++ b/openml/exceptions.py @@ -25,7 +25,7 @@ def __init__(self, message: str, code: str = None, additional: str = None, url: self.url = url super().__init__(message) - def __str__(self): + def __repr__(self): return '%s returned code %s: %s' % ( self.url, self.code, self.message, ) diff --git a/openml/extensions/sklearn/extension.py b/openml/extensions/sklearn/extension.py index ce8e4ebf9..d44b61ae7 100644 --- a/openml/extensions/sklearn/extension.py +++ b/openml/extensions/sklearn/extension.py @@ -87,6 +87,122 @@ def can_handle_model(cls, model: Any) -> bool: """ return isinstance(model, sklearn.base.BaseEstimator) + @classmethod + def trim_flow_name( + cls, + long_name: str, + extra_trim_length: int = 100, + _outer: bool = True + ) -> str: + """ Shorten generated sklearn flow name to at most `max_length` characters. + + Flows are assumed to have the following naming structure: + (model_selection)? (pipeline)? (steps)+ + and will be shortened to: + sklearn.(selection.)?(pipeline.)?(steps)+ + e.g. (white spaces and newlines added for readability) + sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + imputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC) + -> + sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC) + + Parameters + ---------- + long_name : str + The full flow name generated by the scikit-learn extension. + extra_trim_length: int (default=100) + If the trimmed name would exceed `extra_trim_length` characters, additional trimming + of the short name is performed. This reduces the produced short name length. + There is no guarantee the end result will not exceed `extra_trim_length`. 
+ _outer : bool (default=True) + For internal use only. Specifies if the function is called recursively. + + Returns + ------- + str + + """ + def remove_all_in_parentheses(string: str) -> str: + string, removals = re.subn(r"\([^()]*\)", "", string) + while removals > 0: + string, removals = re.subn(r"\([^()]*\)", "", string) + return string + + # Generally, we want to trim all hyperparameters, the exception to that is for model + # selection, as the `estimator` hyperparameter is very indicative of what is in the flow. + # So we first trim name of the `estimator` specified in mode selection. For reference, in + # the example below, we want to trim `sklearn.tree.tree.DecisionTreeClassifier`, and + # keep it in the final trimmed flow name: + # sklearn.pipeline.Pipeline(Imputer=sklearn.preprocessing.imputation.Imputer, + # VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + # Estimator=sklearn.model_selection._search.RandomizedSearchCV(estimator= + # sklearn.tree.tree.DecisionTreeClassifier)) + if 'sklearn.model_selection' in long_name: + start_index = long_name.index('sklearn.model_selection') + estimator_start = (start_index + + long_name[start_index:].index('estimator=') + + len('estimator=')) + + model_select_boilerplate = long_name[start_index:estimator_start] + # above is .g. "sklearn.model_selection._search.RandomizedSearchCV(estimator=" + model_selection_class = model_select_boilerplate.split('(')[0].split('.')[-1] + + # Now we want to also find and parse the `estimator`, for this we find the closing + # parenthesis to the model selection technique: + closing_parenthesis_expected = 1 + for i, char in enumerate(long_name[estimator_start:], start=estimator_start): + if char == '(': + closing_parenthesis_expected += 1 + if char == ')': + closing_parenthesis_expected -= 1 + if closing_parenthesis_expected == 0: + break + + model_select_pipeline = long_name[estimator_start:i] + trimmed_pipeline = cls.trim_flow_name(model_select_pipeline, _outer=False) + _, trimmed_pipeline = trimmed_pipeline.split('.', maxsplit=1) # trim module prefix + model_select_short = "sklearn.{}[{}]".format(model_selection_class, trimmed_pipeline) + name = long_name[:start_index] + model_select_short + long_name[i + 1:] + else: + name = long_name + + module_name = long_name.split('.')[0] + short_name = module_name + '.{}' + + if name.startswith('sklearn.pipeline'): + full_pipeline_class, pipeline = name[:-1].split('(', maxsplit=1) + pipeline_class = full_pipeline_class.split('.')[-1] + # We don't want nested pipelines in the short name, so we trim all complicated + # subcomponents, i.e. those with parentheses: + pipeline = remove_all_in_parentheses(pipeline) + + # then the pipeline steps are formatted e.g.: + # step1name=sklearn.submodule.ClassName,step2name... + components = [component.split('.')[-1] for component in pipeline.split(',')] + pipeline = "{}({})".format(pipeline_class, ','.join(components)) + if len(short_name.format(pipeline)) > extra_trim_length: + pipeline = "{}(...,{})".format(pipeline_class, components[-1]) + else: + # Just a simple component: e.g. 
sklearn.tree.DecisionTreeClassifier + pipeline = remove_all_in_parentheses(name).split('.')[-1] + + if not _outer: + # Anything from parenthesis in inner calls should not be culled, so we use brackets + pipeline = pipeline.replace('(', '[').replace(')', ']') + else: + # Square brackets may be introduced with nested model_selection + pipeline = pipeline.replace('[', '(').replace(']', ')') + + return short_name.format(pipeline) + ################################################################################################ # Methods for flow serialization and de-serialization @@ -402,6 +518,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: name = '%s(%s)' % (class_name, sub_components_names[1:]) else: name = class_name + short_name = SklearnExtension.trim_flow_name(name) # Get the external versions of all sub-components external_version = self._get_external_version_string(model, subcomponents) @@ -419,6 +536,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: sklearn_version_formatted = sklearn_version.replace('==', '_') flow = OpenMLFlow(name=name, class_name=class_name, + custom_name=short_name, description='Automatically created scikit-learn flow.', model=model, components=subcomponents, @@ -432,6 +550,7 @@ def _serialize_model(self, model: Any) -> OpenMLFlow: # annotate a class of sklearn.svm.SVC() with the # tag svm? ], + extension=self, language='English', # TODO fill in dependencies! dependencies=dependencies) @@ -455,9 +574,12 @@ def _get_external_version_string( model_package_name, model_package_version_number, ) openml_version = self._format_external_version('openml', openml.__version__) + sklearn_version = self._format_external_version('sklearn', sklearn.__version__) + external_versions = set() external_versions.add(external_version) external_versions.add(openml_version) + external_versions.add(sklearn_version) for visitee in sub_components.values(): for external_version in visitee.external_version.split(','): external_versions.add(external_version) diff --git a/openml/flows/flow.py b/openml/flows/flow.py index 829bc0745..0db69d16f 100644 --- a/openml/flows/flow.py +++ b/openml/flows/flow.py @@ -7,6 +7,8 @@ from ..extensions import get_extension_by_flow from ..utils import extract_xml_tags, _tag_entity +import openml.config + class OpenMLFlow(object): """OpenML Flow. Stores machine learning models. 
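Since `trim_flow_name` above is a classmethod with no server interaction, its effect is easy to illustrate. The import path is assumed from this diff's file layout, and the long flow name is a made-up example following the naming scheme the docstring describes.

```python
from openml.extensions.sklearn import SklearnExtension  # assumed import path

long_name = (
    "sklearn.pipeline.Pipeline("
    "imputer=sklearn.impute.SimpleImputer,"
    "estimator=sklearn.tree.tree.DecisionTreeClassifier)"
)
# Hyperparameter-style components are reduced to their bare class names.
print(SklearnExtension.trim_flow_name(long_name))
# -> sklearn.Pipeline(SimpleImputer,DecisionTreeClassifier)
```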
@@ -85,7 +87,7 @@ def __init__(self, name, description, model, components, parameters, dependencies, class_name=None, custom_name=None, binary_url=None, binary_format=None, binary_md5=None, uploader=None, upload_date=None, - flow_id=None, version=None): + flow_id=None, extension=None, version=None): self.name = name self.description = description self.model = model @@ -129,8 +131,47 @@ def __init__(self, name, description, model, components, parameters, self.language = language self.dependencies = dependencies self.flow_id = flow_id + if extension is None: + self._extension = get_extension_by_flow(self) + else: + self._extension = extension - self.extension = get_extension_by_flow(self) + @property + def extension(self): + if self._extension is not None: + return self._extension + else: + raise RuntimeError("No extension could be found for flow {}: {}" + .format(self.flow_id, self.name)) + + def __repr__(self): + header = "OpenML Flow" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Flow Name": self.name, + "Flow Description": self.description, + "Dependencies": self.dependencies} + if self.flow_id is not None: + if self.version is not None: + fields["Flow ID"] = "{} (version {})".format(self.flow_id, self.version) + else: + fields["Flow ID"] = self.flow_id + fields["Flow URL"] = "{}f/{}".format(base_url, self.flow_id) + if self.upload_date is not None: + fields["Upload Date"] = self.upload_date.replace('T', ' ') + if self.binary_url is not None: + fields["Binary URL"] = self.binary_url + + # determines the order in which the information will be printed + order = ["Flow ID", "Flow URL", "Flow Name", "Flow Description", "Binary URL", + "Upload Date", "Dependencies"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body def _to_xml(self) -> str: """Generate xml representation of self for upload to server. @@ -378,14 +419,15 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow': _copy_server_fields(flow, self) try: openml.flows.functions.assert_flows_equal( - self, flow, flow.upload_date, ignore_parameter_values=True + self, flow, flow.upload_date, + ignore_parameter_values=True, + ignore_custom_name_if_none=True ) except ValueError as e: message = e.args[0] - raise ValueError("Flow was not stored correctly on the server. " - "New flow ID is %d. Please check manually and " - "remove the flow if necessary! Error is:\n'%s'" % - (flow_id, message)) + raise ValueError("The flow on the server is inconsistent with the local flow. " + "The server flow ID is {}. Please check manually and remove " + "the flow if necessary! 
Error is:\n'{}'".format(flow_id, message)) return self def get_structure(self, key_item: str) -> Dict[str, List[str]]: diff --git a/openml/flows/functions.py b/openml/flows/functions.py index 5841dc699..d12bcfe91 100644 --- a/openml/flows/functions.py +++ b/openml/flows/functions.py @@ -92,7 +92,6 @@ def get_flow(flow_id: int, reinstantiate: bool = False) -> OpenMLFlow: if reinstantiate: flow.model = flow.extension.flow_to_model(flow) - return flow @@ -308,7 +307,8 @@ def _check_flow_for_server_id(flow: OpenMLFlow) -> None: def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values_on_older_children: str = None, - ignore_parameter_values: bool = False) -> None: + ignore_parameter_values: bool = False, + ignore_custom_name_if_none: bool = False) -> None: """Check equality of two flows. Two flows are equal if their all keys which are not set by the server @@ -326,6 +326,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, ignore_parameter_values : bool Whether to ignore parameter values when comparing flows. + + ignore_custom_name_if_none : bool + Whether to ignore the custom name field if either flow has `custom_name` equal to `None`. """ if not isinstance(flow1, OpenMLFlow): raise TypeError('Argument 1 must be of type OpenMLFlow, but is %s' % @@ -359,8 +362,9 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, 'argument2, but not in argument1.' % name) assert_flows_equal(attr1[name], attr2[name], ignore_parameter_values_on_older_children, - ignore_parameter_values) - elif key == 'extension': + ignore_parameter_values, + ignore_custom_name_if_none) + elif key == '_extension': continue else: if key == 'parameters': @@ -386,6 +390,13 @@ def assert_flows_equal(flow1: OpenMLFlow, flow2: OpenMLFlow, # Continue needs to be done here as the first if # statement triggers in both special cases continue + elif (key == 'custom_name' + and ignore_custom_name_if_none + and (attr1 is None or attr2 is None)): + # If specified, we allow `custom_name` inequality if one flow's name is None. + # Helps with backwards compatibility as `custom_name` is now auto-generated, but + # before it used to be `None`. + continue if attr1 != attr2: raise ValueError("Flow %s: values for attribute '%s' differ: " diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 87596deca..767a4a48a 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -78,22 +78,22 @@ def run_model_on_task( Flow generated from the model. """ - extension = get_extension_by_model(model, raise_if_no_extension=True) - if extension is None: - # This should never happen and is only here to please mypy will be gone soon once the - # whole function is removed - raise TypeError(extension) - # TODO: At some point in the future do not allow for arguments in old order (6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). # When removing this please also remove the method `is_estimator` from the extension # interface as it is only used here (MF, 3-2019) - if isinstance(model, OpenMLTask) and extension.is_estimator(model): + if isinstance(model, OpenMLTask): warnings.warn("The old argument order (task, model) is deprecated and " "will not be supported in the future. 
Please use the " "order (model, task).", DeprecationWarning) task, model = model, task + extension = get_extension_by_model(model, raise_if_no_extension=True) + if extension is None: + # This should never happen and is only here to please mypy will be gone soon once the + # whole function is removed + raise TypeError(extension) + flow = extension.model_to_flow(model) run = run_flow_on_task( @@ -159,9 +159,6 @@ def run_flow_on_task( if flow_tags is not None and not isinstance(flow_tags, list): raise ValueError("flow_tags should be a list") - if task.task_id is None: - raise ValueError("The task should be published at OpenML") - # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow): @@ -171,6 +168,11 @@ def run_flow_on_task( "order (model, Flow).", DeprecationWarning) task, flow = flow, task + if task.task_id is None: + raise ValueError("The task should be published at OpenML") + + if flow.model is None: + flow.model = flow.extension.flow_to_model(flow) flow.model = flow.extension.seed_model(flow.model, seed=seed) # We only need to sync with the server right now if we want to upload the flow, @@ -667,6 +669,13 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None): dataset_id = int(run['oml:input_data']['oml:dataset']['oml:did']) elif not from_server: dataset_id = None + else: + # fetching the task to obtain dataset_id + t = openml.tasks.get_task(task_id, download_data=False) + if not hasattr(t, 'dataset_id'): + raise ValueError("Unable to fetch dataset_id from the task({}) " + "linked to run({})".format(task_id, run_id)) + dataset_id = t.dataset_id files = OrderedDict() evaluations = OrderedDict() diff --git a/openml/runs/run.py b/openml/runs/run.py index 0e5e12b9b..6a4818f30 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -67,13 +67,41 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None, self.tags = tags self.predictions_url = predictions_url - def __str__(self): - flow_name = self.flow_name - if flow_name is not None and len(flow_name) > 26: - # long enough to show sklearn.pipeline.Pipeline - flow_name = flow_name[:26] + "..." 
- return "[run id: {}, task id: {}, flow id: {}, flow name: {}]".format( - self.run_id, self.task_id, self.flow_id, flow_name) + def __repr__(self): + header = "OpenML Run" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Uploader Name": self.uploader_name, + "Metric": self.task_evaluation_measure, + "Run ID": self.run_id, + "Task ID": self.task_id, + "Task Type": self.task_type, + "Task URL": "{}t/{}".format(base_url, self.task_id), + "Flow ID": self.flow_id, + "Flow Name": self.flow_name, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Setup ID": self.setup_id, + "Setup String": self.setup_string, + "Dataset ID": self.dataset_id, + "Dataset URL": "{}d/{}".format(base_url, self.dataset_id)} + if self.uploader is not None: + fields["Uploader Profile"] = "{}u/{}".format(base_url, self.uploader) + if self.run_id is not None: + fields["Run URL"] = "{}r/{}".format(base_url, self.run_id) + if self.evaluations is not None and self.task_evaluation_measure in self.evaluations: + fields["Result"] = self.evaluations[self.task_evaluation_measure] + + # determines the order in which the information will be printed + order = ["Uploader Name", "Uploader Profile", "Metric", "Result", "Run ID", "Run URL", + "Task ID", "Task Type", "Task URL", "Flow ID", "Flow Name", "Flow URL", + "Setup ID", "Setup String", "Dataset ID", "Dataset URL"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body def _repr_pretty_(self, pp, cycle): pp.text(str(self)) diff --git a/openml/runs/trace.py b/openml/runs/trace.py index 42e89c50b..1786120e8 100644 --- a/openml/runs/trace.py +++ b/openml/runs/trace.py @@ -380,7 +380,7 @@ def merge_traces(cls, traces: List['OpenMLRunTrace']) -> 'OpenMLRunTrace': return cls(None, merged_trace) - def __str__(self): + def __repr__(self): return '[Run id: %d, %d trace iterations]'.format( -1 if self.run_id is None else self.run_id, len(self.trace_iterations), @@ -471,7 +471,7 @@ def get_parameters(self): result[param[len(PREFIX):]] = value return result - def __str__(self): + def __repr__(self): """ tmp string representation, will be changed in the near future """ diff --git a/openml/setups/setup.py b/openml/setups/setup.py index 91e921b55..aee1aa0bf 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -1,15 +1,17 @@ +import openml.config + class OpenMLSetup(object): """Setup object (a.k.a. Configuration). 
- Parameters - ---------- - setup_id : int - The OpenML setup id - flow_id : int - The flow that it is build upon - parameters : dict - The setting of the parameters + Parameters + ---------- + setup_id : int + The OpenML setup id + flow_id : int + The flow that it is build upon + parameters : dict + The setting of the parameters """ def __init__(self, setup_id, flow_id, parameters): @@ -25,6 +27,25 @@ def __init__(self, setup_id, flow_id, parameters): self.flow_id = flow_id self.parameters = parameters + def __repr__(self): + header = "OpenML Setup" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Setup ID": self.setup_id, + "Flow ID": self.flow_id, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "# of Parameters": len(self.parameters)} + + # determines the order in which the information will be printed + order = ["Setup ID", "Flow ID", "Flow URL", "# of Parameters"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + class OpenMLParameter(object): """Parameter object (used in setup). @@ -60,3 +81,34 @@ def __init__(self, input_id, flow_id, flow_name, full_name, parameter_name, self.data_type = data_type self.default_value = default_value self.value = value + + def __repr__(self): + header = "OpenML Parameter" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"ID": self.id, + "Flow ID": self.flow_id, + # "Flow Name": self.flow_name, + "Flow Name": self.full_name, + "Flow URL": "{}f/{}".format(base_url, self.flow_id), + "Parameter Name": self.parameter_name} + # indented prints for parameter attributes + # indention = 2 spaces + 1 | + 2 underscores + indent = "{}|{}".format(" " * 2, "_" * 2) + parameter_data_type = "{}Data Type".format(indent) + fields[parameter_data_type] = self.data_type + parameter_default = "{}Default".format(indent) + fields[parameter_default] = self.default_value + parameter_value = "{}Value".format(indent) + fields[parameter_value] = self.value + + # determines the order in which the information will be printed + order = ["ID", "Flow ID", "Flow Name", "Flow URL", "Parameter Name", + parameter_data_type, parameter_default, parameter_value] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body diff --git a/openml/study/functions.py b/openml/study/functions.py index 0e2f9eb3f..ccd523016 100644 --- a/openml/study/functions.py +++ b/openml/study/functions.py @@ -182,8 +182,8 @@ def create_study( where the runs are the main entity (collection consists of runs and all entities (flows, tasks, etc) that are related to these runs) - Parameters: - ----------- + Parameters + ---------- alias : str (optional) a string ID, unique on server (url-friendly) benchmark_suite : int (optional) @@ -195,8 +195,8 @@ def create_study( run_ids : list a list of run ids associated with this study - Returns: - -------- + Returns + ------- OpenMLStudy A local OpenML study object (call 
publish method to upload to server) """ @@ -228,8 +228,8 @@ def create_benchmark_suite( Creates an OpenML benchmark suite (collection of entity types, where the tasks are the linked entity) - Parameters: - ----------- + Parameters + ---------- alias : str (optional) a string ID, unique on server (url-friendly) name : str @@ -239,8 +239,8 @@ def create_benchmark_suite( task_ids : list a list of task ids associated with this study - Returns: - -------- + Returns + ------- OpenMLStudy A local OpenML study object (call publish method to upload to server) """ diff --git a/openml/study/study.py b/openml/study/study.py index 46f1339eb..8657749da 100644 --- a/openml/study/study.py +++ b/openml/study/study.py @@ -89,6 +89,39 @@ def __init__( self.runs = runs pass + def __repr__(self): + # header is provided by the sub classes + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Name": self.name, + "Status": self.status, + "Main Entity Type": self.main_entity_type} + if self.id is not None: + fields["ID"] = self.id + fields["Study URL"] = "{}s/{}".format(base_url, self.id) + if self.creator is not None: + fields["Creator"] = "{}u/{}".format(base_url, self.creator) + if self.creation_date is not None: + fields["Upload Time"] = self.creation_date.replace('T', ' ') + if self.data is not None: + fields["# of Data"] = len(self.data) + if self.tasks is not None: + fields["# of Tasks"] = len(self.tasks) + if self.flows is not None: + fields["# of Flows"] = len(self.flows) + if self.runs is not None: + fields["# of Runs"] = len(self.runs) + + # determines the order in which the information will be printed + order = ["ID", "Name", "Status", "Main Entity Type", "Study URL", + "# of Data", "# of Tasks", "# of Flows", "# of Runs", + "Creator", "Upload Time"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return body + def publish(self) -> int: """ Publish the study on the OpenML server. @@ -235,6 +268,12 @@ def __init__( setups=setups, ) + def __repr__(self): + header = "OpenML Study" + header = '{}\n{}\n'.format(header, '=' * len(header)) + body = super(OpenMLStudy, self).__repr__() + return header + body + class OpenMLBenchmarkSuite(BaseStudy): """ @@ -306,3 +345,9 @@ def __init__( runs=None, setups=None, ) + + def __repr__(self): + header = "OpenML Benchmark Suite" + header = '{}\n{}\n'.format(header, '=' * len(header)) + body = super(OpenMLBenchmarkSuite, self).__repr__() + return header + body diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index 69850a096..4bb93b007 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -133,14 +133,14 @@ def list_tasks( ) -> Union[Dict, pd.DataFrame]: """ Return a number of tasks having the given tag and task_type_id + Parameters ---------- Filter task_type_id is separated from the other filters because it is used as task_type_id in the task description, but it is named type when used as a filter in list tasks call. task_type_id : int, optional - ID of the task type as detailed - `here `_. + ID of the task type as detailed `here `_. 
- Supervised classification: 1 - Supervised regression: 2 - Learning curve: 3 @@ -362,7 +362,7 @@ def get_task(task_id: int, download_data: bool = True) -> OpenMLTask: # List of class labels availaible in dataset description # Including class labels as part of task meta data handles # the case where data download was initially disabled - if isinstance(task, OpenMLClassificationTask): + if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)): task.class_labels = \ dataset.retrieve_class_labels(task.target_name) # Clustering tasks do not have class labels diff --git a/openml/tasks/task.py b/openml/tasks/task.py index 6e0154726..83af79373 100644 --- a/openml/tasks/task.py +++ b/openml/tasks/task.py @@ -55,6 +55,36 @@ def __init__( self.estimation_procedure_id = estimation_procedure_id self.split = None # type: Optional[OpenMLSplit] + def __repr__(self): + header = "OpenML Task" + header = '{}\n{}\n'.format(header, '=' * len(header)) + + base_url = "{}".format(openml.config.server[:-len('api/v1/xml')]) + fields = {"Task Type": self.task_type} + if self.task_id is not None: + fields["Task ID"] = self.task_id + fields["Task URL"] = "{}t/{}".format(base_url, self.task_id) + if self.evaluation_measure is not None: + fields["Evaluation Measure"] = self.evaluation_measure + if self.estimation_procedure is not None: + fields["Estimation Procedure"] = self.estimation_procedure['type'] + if self.target_name is not None: + fields["Target Feature"] = self.target_name + if hasattr(self, 'class_labels'): + fields["# of Classes"] = len(self.class_labels) + if hasattr(self, 'cost_matrix'): + fields["Cost Matrix"] = "Available" + + # determines the order in which the information will be printed + order = ["Task Type", "Task ID", "Task URL", "Estimation Procedure", "Evaluation Measure", + "Target Feature", "# of Classes", "Cost Matrix"] + fields = [(key, fields[key]) for key in order if key in fields] + + longest_field_name_length = max(len(name) for name, value in fields) + field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length) + body = '\n'.join(field_line_format.format(name, value) for name, value in fields) + return header + body + def get_dataset(self) -> datasets.OpenMLDataset: """Download dataset associated with task""" return datasets.get_dataset(self.dataset_id) diff --git a/openml/testing.py b/openml/testing.py index 1ce0862d0..370fb9102 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -17,6 +17,8 @@ import openml from openml.tasks import TaskTypeEnum +import logging + class TestBase(unittest.TestCase): """Base class for tests @@ -26,6 +28,15 @@ class TestBase(unittest.TestCase): Currently hard-codes a read-write key. Hopefully soon allows using a test server, not the production server. """ + publish_tracker = {'run': [], 'data': [], 'flow': [], 'task': [], + 'study': [], 'user': []} # type: dict + test_server = "https://test.openml.org/api/v1/xml" + # amueller's read/write key that he will throw away later + apikey = "610344db6388d9ba34f6db45a3cf71de" + + # creating logger for tracking files uploaded to test server + logger = logging.getLogger("unit_tests_published_entities") + logger.setLevel(logging.DEBUG) def setUp(self, n_levels: int = 1): """Setup variables and temporary directories. 
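The `__repr__` methods added above for runs, setups, parameters, studies, and tasks all share one field-table layout: a `fields` dict, an explicit `order` list, and dot-padded line formatting under an underlined header. The sketch below uses a hypothetical class (not part of this patch) to show how that layout renders; it only assumes an installed `openml` package for the server URL.

```python
import openml

# Minimal sketch of the field-table formatting shared by the new __repr__
# methods; the class and its fields are illustrative only.
class _ReprExample:
    def __init__(self, task_id, flow_id):
        self.task_id = task_id
        self.flow_id = flow_id

    def __repr__(self):
        header = "OpenML Example"
        header = '{}\n{}\n'.format(header, '=' * len(header))
        base_url = openml.config.server[:-len('api/v1/xml')]
        fields = {"Task ID": self.task_id,
                  "Task URL": "{}t/{}".format(base_url, self.task_id),
                  "Flow ID": self.flow_id}
        # explicit ordering of the printed fields, as in the patch
        order = ["Task ID", "Task URL", "Flow ID"]
        fields = [(key, fields[key]) for key in order if key in fields]
        longest = max(len(name) for name, _ in fields)
        line_format = "{{:.<{}}}: {{}}".format(longest)
        return header + '\n'.join(line_format.format(name, value)
                                   for name, value in fields)

print(_ReprExample(task_id=31, flow_id=100))
```

Printed, this yields dot-padded lines such as `Task ID.: 31` under an underlined header, matching the output style of the new `__repr__` methods.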
@@ -58,7 +69,9 @@ def setUp(self, n_levels: int = 1): self.static_cache_dir = os.path.join(static_cache_dir, 'files') if self.static_cache_dir is None: - raise ValueError('Cannot find test cache dir!') + raise ValueError( + 'Cannot find test cache dir, expected it to be {}!'.format(static_cache_dir) + ) self.cwd = os.getcwd() workdir = os.path.dirname(os.path.abspath(__file__)) @@ -70,12 +83,9 @@ def setUp(self, n_levels: int = 1): os.chdir(self.workdir) self.cached = True - # amueller's read/write key that he will throw away later - openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de" + openml.config.apikey = TestBase.apikey self.production_server = "https://openml.org/api/v1/xml" - self.test_server = "https://test.openml.org/api/v1/xml" - - openml.config.server = self.test_server + openml.config.server = TestBase.test_server openml.config.avoid_duplicate_runs = False openml.config.cache_directory = self.workdir @@ -86,7 +96,7 @@ def setUp(self, n_levels: int = 1): with open(openml.config.config_file, 'w') as fh: fh.write('apikey = %s' % openml.config.apikey) - # Increase the number of retries to avoid spurios server failures + # Increase the number of retries to avoid spurious server failures self.connection_n_retries = openml.config.connection_n_retries openml.config.connection_n_retries = 10 @@ -103,6 +113,40 @@ def tearDown(self): openml.config.server = self.production_server openml.config.connection_n_retries = self.connection_n_retries + @classmethod + def _mark_entity_for_removal(self, entity_type, entity_id): + """ Static record of entities uploaded to test server + + Dictionary of lists where the keys are 'entity_type'. + Each such dictionary is a list of integer IDs. + For entity_type='flow', each list element is a tuple + of the form (Flow ID, Flow Name). + """ + if entity_type not in TestBase.publish_tracker: + TestBase.publish_tracker[entity_type] = [entity_id] + else: + TestBase.publish_tracker[entity_type].append(entity_id) + + @classmethod + def _delete_entity_from_tracker(self, entity_type, entity): + """ Deletes entity records from the static file_tracker + + Given an entity type and corresponding ID, deletes all entries, including + duplicate entries of the ID for the entity type. + """ + if entity_type in TestBase.publish_tracker: + # removes duplicate entries + TestBase.publish_tracker[entity_type] = list(set(TestBase.publish_tracker[entity_type])) + if entity_type == 'flow': + delete_index = [i for i, (id_, _) in + enumerate(TestBase.publish_tracker[entity_type]) + if id_ == entity][0] + else: + delete_index = [i for i, id_ in + enumerate(TestBase.publish_tracker[entity_type]) + if id_ == entity][0] + TestBase.publish_tracker[entity_type].pop(delete_index) + def _get_sentinel(self, sentinel=None): if sentinel is None: # Create a unique prefix for the flow. 
Necessary because the flow @@ -197,4 +241,10 @@ def _check_fold_timing_evaluations( self.assertLessEqual(evaluation, max_val) -__all__ = ['TestBase'] +try: + from sklearn.impute import SimpleImputer +except ImportError: + from sklearn.preprocessing import Imputer as SimpleImputer + + +__all__ = ['TestBase', 'SimpleImputer'] diff --git a/openml/utils.py b/openml/utils.py index 54064aca5..f6cc81ff7 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -5,6 +5,7 @@ import warnings import pandas as pd from functools import wraps +import collections import openml._api_calls import openml.exceptions @@ -182,7 +183,7 @@ def _list_all(listing_call, output_format='dict', *args, **filters): active_filters = {key: value for key, value in filters.items() if value is not None} page = 0 - result = {} + result = collections.OrderedDict() if output_format == 'dataframe': result = pd.DataFrame() diff --git a/setup.cfg b/setup.cfg index fac02f0f9..726c8fa73 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,17 +1,6 @@ [metadata] description-file = README.md -[nosetests] -# nosetests skips test files with the executable bit by default -# which can silently hide failing tests. -exe = 1 -cover-html = 1 -cover-html-dir = coverage -cover-package = openml - -detailed-errors = 1 -with-doctest = 1 -doctest-tests = 1 -doctest-extension = rst -doctest-fixtures = _fixture -#doctest-options = +ELLIPSIS,+NORMALIZE_WHITESPACE +[tool:pytest] +filterwarnings = + ignore:the matrix subclass:PendingDeprecationWarning diff --git a/setup.py b/setup.py index ae676eaf8..3b271badd 100644 --- a/setup.py +++ b/setup.py @@ -6,13 +6,6 @@ with open("openml/__version__.py") as fh: version = fh.readlines()[-1].split()[-1].strip("\"'") -# Using Python setup.py install will try to build numpy which is prone to failure and -# very time consuming anyway. -if len(sys.argv) > 1 and sys.argv[1] == 'install': - print('Please install this package with pip: `pip install -e .` ' - 'Installation requires pip>=10.0.') - sys.exit(1) - if sys.version_info < (3, 5): raise ValueError( 'Unsupported Python version {}.{}.{} found. OpenML requires Python 3.5 or higher.' @@ -35,6 +28,7 @@ version=version, packages=setuptools.find_packages(), package_data={'': ['*.txt', '*.md']}, + python_requires=">=3.5", install_requires=[ 'liac-arff>=2.4.0', 'xmltodict', @@ -79,7 +73,6 @@ 'Operating System :: Unix', 'Operating System :: MacOS', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7']) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..9e08d09a8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,181 @@ +'''This file is recognized by pytest for defining specified behaviour + +'conftest.py' files are directory-scope files that are shared by all +sub-directories from where this file is placed. pytest recognises +'conftest.py' for any unit test executed from within this directory +tree. This file is used to define fixtures, hooks, plugins, and other +functionality that can be shared by the unit tests. + +This file has been created for the OpenML testing to primarily make use +of the pytest hooks 'pytest_sessionstart' and 'pytest_sessionfinish', +which are being used for managing the deletion of local and remote files +created by the unit tests, run across more than one process. 
+ +This design allows one to comment or remove the conftest.py file to +disable file deletions, without editing any of the test case files. + + +Possible Future: class TestBase from openml/testing.py can be included + under this file and there would not be any requirements to import + testing.py in each of the unit test modules. +''' + +import os +import logging +from typing import List + +import openml +from openml.testing import TestBase + +# creating logger for unit test file deletion status +logger = logging.getLogger("unit_tests") +logger.setLevel(logging.DEBUG) + +file_list = [] +directory = None + +# finding the root directory of conftest.py and going up to OpenML main directory +# exploiting the fact that conftest.py always resides in the root directory for tests +static_dir = os.path.dirname(os.path.abspath(__file__)) +logging.info("static directory: {}".format(static_dir)) +print("static directory: {}".format(static_dir)) +while True: + if 'openml' in os.listdir(static_dir): + break + static_dir = os.path.join(static_dir, '..') + + +def worker_id() -> str: + ''' Returns the name of the worker process owning this function call. + + :return: str + Possible outputs from the set of {'master', 'gw0', 'gw1', ..., 'gw(n-1)'} + where n is the number of workers being used by pytest-xdist + ''' + vars_ = list(os.environ.keys()) + if 'PYTEST_XDIST_WORKER' in vars_ or 'PYTEST_XDIST_WORKER_COUNT' in vars_: + return os.environ['PYTEST_XDIST_WORKER'] + else: + return 'master' + + +def read_file_list() -> List[str]: + '''Returns a list of paths to all files that currently exist in 'openml/tests/files/' + + :return: List[str] + ''' + directory = os.path.join(static_dir, 'tests/files/') + if worker_id() == 'master': + logger.info("Collecting file lists from: {}".format(directory)) + files = os.walk(directory) + file_list = [] + for root, _, filenames in files: + for filename in filenames: + file_list.append(os.path.join(root, filename)) + return file_list + + +def compare_delete_files(old_list, new_list) -> None: + '''Deletes files that are there in the new_list but not in the old_list + + :param old_list: List[str] + :param new_list: List[str] + :return: None + ''' + file_list = list(set(new_list) - set(old_list)) + for file in file_list: + os.remove(file) + logger.info("Deleted from local: {}".format(file)) + + +def delete_remote_files(tracker) -> None: + '''Function that deletes the entities passed as input, from the OpenML test server + + The TestBase class in openml/testing.py has an attribute called publish_tracker. + This function expects the dictionary of the same structure. + It is a dictionary of lists, where the keys are entity types, while the values are + lists of integer IDs, except for key 'flow' where the value is a tuple (ID, flow name). + + Iteratively, multiple POST requests are made to the OpenML test server using + openml.utils._delete_entity() to remove the entities uploaded by all the unit tests. 
+ + :param tracker: Dict + :return: None + ''' + openml.config.server = TestBase.test_server + openml.config.apikey = TestBase.apikey + + # reordering to delete sub flows at the end of flows + # sub-flows have shorter names, hence, sorting by descending order of flow name length + if 'flow' in tracker: + flow_deletion_order = [entity_id for entity_id, _ in + sorted(tracker['flow'], key=lambda x: len(x[1]), reverse=True)] + tracker['flow'] = flow_deletion_order + + # deleting all collected entities published to test server + # 'run's are deleted first to prevent dependency issue of entities on deletion + logger.info("Entity Types: {}".format(['run', 'data', 'flow', 'task', 'study'])) + for entity_type in ['run', 'data', 'flow', 'task', 'study']: + logger.info("Deleting {}s...".format(entity_type)) + for i, entity in enumerate(tracker[entity_type]): + try: + openml.utils._delete_entity(entity_type, entity) + logger.info("Deleted ({}, {})".format(entity_type, entity)) + except Exception as e: + logger.warn("Cannot delete ({},{}): {}".format(entity_type, entity, e)) + + +def pytest_sessionstart() -> None: + '''pytest hook that is executed before any unit test starts + + This function will be called by each of the worker processes, along with the master process + when they are spawned. This happens even before the collection of unit tests. + If number of workers, n=4, there will be a total of 5 (1 master + 4 workers) calls of this + function, before execution of any unit test begins. The master pytest process has the name + 'master' while the worker processes are named as 'gw{i}' where i = 0, 1, ..., n-1. + The order of process spawning is: 'master' -> random ordering of the 'gw{i}' workers. + + Since, master is always executed first, it is checked if the current process is 'master' and + store a list of strings of paths of all files in the directory (pre-unit test snapshot). + + :return: None + ''' + # file_list is global to maintain the directory snapshot during tear down + global file_list + worker = worker_id() + if worker == 'master': + file_list = read_file_list() + + +def pytest_sessionfinish() -> None: + '''pytest hook that is executed after all unit tests of a worker end + + This function will be called by each of the worker processes, along with the master process + when they are done with the unit tests allocated to them. + If number of workers, n=4, there will be a total of 5 (1 master + 4 workers) calls of this + function, once all allocated unit tests have finished executing. The master pytest process has the name + 'master' while the worker processes are named as 'gw{i}' where i = 0, 1, ..., n-1. + The order of invocation is: random ordering of the 'gw{i}' workers -> 'master'. 
+ + Since, master is always executed last, it is checked if the current process is 'master' and, + * Compares file list with pre-unit test snapshot and deletes all local files generated + * Iterates over the list of entities uploaded to test server and deletes them remotely + + :return: None + ''' + # allows access to the file_list read in the set up phase + global file_list + worker = worker_id() + logger.info("Finishing worker {}".format(worker)) + + # Test file deletion + logger.info("Deleting files uploaded to test server for worker {}".format(worker)) + delete_remote_files(TestBase.publish_tracker) + + if worker == 'master': + # Local file deletion + new_file_list = read_file_list() + compare_delete_files(file_list, new_file_list) + logger.info("Local files deleted") + + logging.info("{} is killed".format(worker)) diff --git a/tests/files/org/openml/test/datasets/-1/features.xml b/tests/files/org/openml/test/datasets/-1/features.xml index d46f635c1..01adbf5a8 100644 --- a/tests/files/org/openml/test/datasets/-1/features.xml +++ b/tests/files/org/openml/test/datasets/-1/features.xml @@ -180003,6 +180003,8 @@ 20000 class nominal + -1 + 1 false false false diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 5f4f9806d..cabad9565 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -141,7 +141,7 @@ def test_get_data_with_target_pandas(self): self.assertNotIn("class", attribute_names) def test_get_data_rowid_and_ignore_and_target(self): - self.dataset.ignore_attributes = ["condition"] + self.dataset.ignore_attribute = ["condition"] self.dataset.row_id_attribute = ["hardness"] X, y, categorical, names = self.dataset.get_data(target="class") self.assertEqual(X.shape, (898, 36)) @@ -151,15 +151,15 @@ def test_get_data_rowid_and_ignore_and_target(self): self.assertEqual(y.shape, (898, )) def test_get_data_with_ignore_attributes(self): - self.dataset.ignore_attributes = ["condition"] - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=True) + self.dataset.ignore_attribute = ["condition"] + rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) for (dtype, is_cat) in zip(rval.dtypes, categorical): expected_type = 'category' if is_cat else 'float64' self.assertEqual(dtype.name, expected_type) self.assertEqual(rval.shape, (898, 39)) self.assertEqual(len(categorical), 39) - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attributes=False) + rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) for (dtype, is_cat) in zip(rval.dtypes, categorical): expected_type = 'category' if is_cat else 'float64' self.assertEqual(dtype.name, expected_type) @@ -271,9 +271,9 @@ def test_get_sparse_dataset_with_rowid(self): self.assertEqual(len(categorical), 20000) def test_get_sparse_dataset_with_ignore_attributes(self): - self.sparse_dataset.ignore_attributes = ["V256"] + self.sparse_dataset.ignore_attribute = ["V256"] rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format='array', include_ignore_attributes=True + dataset_format='array', include_ignore_attribute=True ) self.assertTrue(sparse.issparse(rval)) self.assertEqual(rval.dtype, np.float32) @@ -281,7 +281,7 @@ def test_get_sparse_dataset_with_ignore_attributes(self): self.assertEqual(len(categorical), 20001) rval, _, categorical, _ = self.sparse_dataset.get_data( - dataset_format='array', include_ignore_attributes=False + dataset_format='array', 
include_ignore_attribute=False ) self.assertTrue(sparse.issparse(rval)) self.assertEqual(rval.dtype, np.float32) @@ -290,13 +290,13 @@ def test_get_sparse_dataset_with_ignore_attributes(self): def test_get_sparse_dataset_rowid_and_ignore_and_target(self): # TODO: re-add row_id and ignore attributes - self.sparse_dataset.ignore_attributes = ["V256"] + self.sparse_dataset.ignore_attribute = ["V256"] self.sparse_dataset.row_id_attribute = ["V512"] X, y, categorical, _ = self.sparse_dataset.get_data( dataset_format='array', target="class", include_row_id=False, - include_ignore_attributes=False, + include_ignore_attribute=False, ) self.assertTrue(sparse.issparse(X)) self.assertEqual(X.dtype, np.float32) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 0b2620485..5726d2442 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -4,6 +4,7 @@ from unittest import mock import arff +import time import pytest import numpy as np @@ -43,14 +44,14 @@ def tearDown(self): super(TestOpenMLDataset, self).tearDown() def _remove_pickle_files(self): - cache_dir = self.static_cache_dir + self.lock_path = os.path.join(openml.config.get_cache_directory(), 'locks') for did in ['-1', '2']: with lockutils.external_lock( name='datasets.functions.get_dataset:%s' % did, - lock_path=os.path.join(openml.config.get_cache_directory(), 'locks'), + lock_path=self.lock_path, ): - pickle_path = os.path.join(cache_dir, 'datasets', did, - 'dataset.pkl') + pickle_path = os.path.join(openml.config.get_cache_directory(), 'datasets', + did, 'dataset.pkl.py3') try: os.remove(pickle_path) except (OSError, FileNotFoundError): @@ -478,6 +479,9 @@ def test_publish_dataset(self): data_file=file_path, ) dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) self.assertIsInstance(dataset.dataset_id, int) def test__retrieve_class_labels(self): @@ -498,6 +502,9 @@ def test_upload_dataset_with_url(self): url="https://www.openml.org/data/download/61/dataset_61_iris.arff", ) dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) self.assertIsInstance(dataset.dataset_id, int) def test_data_status(self): @@ -507,6 +514,9 @@ def test_data_status(self): version=1, url="https://www.openml.org/data/download/61/dataset_61_iris.arff") dataset.publish() + TestBase._mark_entity_for_removal('data', dataset.dataset_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + dataset.dataset_id)) did = dataset.dataset_id # admin key for test server (only adminds can activate datasets. 
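The test hunks above and below all repeat the same bookkeeping after an upload: publish, then register the new ID with `TestBase._mark_entity_for_removal()` so the session-level cleanup in `conftest.py` can delete it from the test server. A hedged sketch of that pattern follows; the dataset itself is a toy example (names and values are illustrative, and the real tests additionally prefix names with a sentinel to keep them unique).

```python
import pandas as pd

import openml
from openml.testing import TestBase

# Sketch of the upload-then-track pattern used in test_dataset_functions.py.
# Assumes the test-server configuration provided by TestBase.
openml.config.server = TestBase.test_server
openml.config.apikey = TestBase.apikey

df = pd.DataFrame({'outlook': ['sunny', 'overcast', 'rainy'],
                   'temperature': [85.0, 83.0, 70.0],
                   'play': ['no', 'yes', 'yes']})
df['outlook'] = df['outlook'].astype('category')
df['play'] = df['play'].astype('category')

dataset = openml.datasets.functions.create_dataset(
    name='tracking_example',  # illustrative; real tests add a unique sentinel
    description='Toy dataset for the upload-tracking sketch',
    creator='OpenML tester', contributor=None, collection_date='01-01-2019',
    language='English', licence='MIT', default_target_attribute='play',
    row_id_attribute=None, ignore_attribute=None, citation='None',
    attributes='auto', data=df, version_label='example',
    original_data_url='http://openml.github.io/openml-python',
    paper_url='http://openml.github.io/openml-python',
)
upload_did = dataset.publish()
# record the new entity so conftest.py deletes it when the session finishes
TestBase._mark_entity_for_removal('data', upload_did)
TestBase.logger.info("collected from {}: {}".format(
    __file__.split('/')[-1], upload_did))
```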
@@ -620,6 +630,9 @@ def test_create_dataset_numpy(self): ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), @@ -682,6 +695,9 @@ def test_create_dataset_list(self): ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -725,6 +741,9 @@ def test_create_dataset_sparse(self): ) upload_did = xor_dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, @@ -762,6 +781,9 @@ def test_create_dataset_sparse(self): ) upload_did = xor_dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, @@ -885,6 +907,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -919,6 +944,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -955,6 +983,9 @@ def test_create_dataset_pandas(self): paper_url=paper_url ) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) downloaded_data = _get_online_dataset_arff(upload_did) self.assertEqual( downloaded_data, @@ -1012,9 +1043,10 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url ) - self.assertEqual(dataset.ignore_attributes, ['outlook']) + self.assertEqual(dataset.ignore_attribute, ['outlook']) # pass a list to ignore_attribute + ignore_attribute = ['outlook', 'windy'] dataset = openml.datasets.functions.create_dataset( name=name, description=description, @@ -1025,7 +1057,7 @@ def test_ignore_attributes_dataset(self): licence=licence, default_target_attribute=default_target_attribute, row_id_attribute=None, - ignore_attribute=['outlook', 'windy'], + ignore_attribute=ignore_attribute, citation=citation, attributes='auto', data=df, @@ -1033,7 +1065,7 @@ def test_ignore_attributes_dataset(self): original_data_url=original_data_url, paper_url=paper_url ) - self.assertEqual(dataset.ignore_attributes, ['outlook', 'windy']) + self.assertEqual(dataset.ignore_attribute, ignore_attribute) # raise an error if unknown type err_msg = 'Wrong data type for ignore_attribute. Should be list.' 
@@ -1057,6 +1089,83 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url ) + def test_publish_fetch_ignore_attribute(self): + """Test to upload and retrieve dataset and check ignore_attributes""" + data = [ + ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'], + ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'], + ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'], + ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'], + ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'] + ] + column_names = ['rnd_str', 'outlook', 'temperature', 'humidity', + 'windy', 'play'] + df = pd.DataFrame(data, columns=column_names) + # enforce the type of each column + df['outlook'] = df['outlook'].astype('category') + df['windy'] = df['windy'].astype('bool') + df['play'] = df['play'].astype('category') + # meta-information + name = '%s-pandas_testing_dataset' % self._get_sentinel() + description = 'Synthetic dataset created from a Pandas DataFrame' + creator = 'OpenML tester' + collection_date = '01-01-2018' + language = 'English' + licence = 'MIT' + default_target_attribute = 'play' + citation = 'None' + original_data_url = 'http://openml.github.io/openml-python' + paper_url = 'http://openml.github.io/openml-python' + + # pass a list to ignore_attribute + ignore_attribute = ['outlook', 'windy'] + dataset = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=None, + ignore_attribute=ignore_attribute, + citation=citation, + attributes='auto', + data=df, + version_label='test', + original_data_url=original_data_url, + paper_url=paper_url + ) + + # publish dataset + upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) + # test if publish was successful + self.assertIsInstance(upload_did, int) + + dataset = None + # fetching from server + # loop till timeout or fetch not successful + max_waiting_time_seconds = 400 + # time.time() works in seconds + start_time = time.time() + while time.time() - start_time < max_waiting_time_seconds: + try: + dataset = openml.datasets.get_dataset(upload_did) + break + except Exception as e: + # returned code 273: Dataset not processed yet + # returned code 362: No qualities found + print("Failed to fetch dataset:{} with '{}'.".format(upload_did, str(e))) + time.sleep(10) + continue + if dataset is None: + raise ValueError("TIMEOUT: Failed to fetch uploaded dataset - {}".format(upload_did)) + self.assertEqual(dataset.ignore_attribute, ignore_attribute) + def test_create_dataset_row_id_attribute_error(self): # meta-information name = '%s-pandas_testing_dataset' % self._get_sentinel() @@ -1146,6 +1255,9 @@ def test_create_dataset_row_id_attribute_inference(self): ) self.assertEqual(dataset.row_id_attribute, output_row_id) upload_did = dataset.publish() + TestBase._mark_entity_for_removal('data', upload_did) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + upload_did)) arff_dataset = arff.loads(_get_online_dataset_arff(upload_did)) arff_data = np.array(arff_dataset['data'], dtype=object) # if we set the name of the index then the index will be added to @@ -1190,3 +1302,8 @@ def test_create_dataset_attributes_auto_without_df(self): original_data_url=original_data_url, paper_url=paper_url ) + + def test_list_qualities(self): + qualities = 
openml.datasets.list_qualities() + self.assertEqual(isinstance(qualities, list), True) + self.assertEqual(all([isinstance(q, str) for q in qualities]), True) diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index 37e8f710d..b25b35391 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -6,6 +6,30 @@ class TestEvaluationFunctions(TestBase): _multiprocess_can_split_ = True + def _check_list_evaluation_setups(self, size, **kwargs): + evals_setups = openml.evaluations.list_evaluations_setups("predictive_accuracy", + **kwargs, size=size, + sort_order='desc', + output_format='dataframe') + evals = openml.evaluations.list_evaluations("predictive_accuracy", + **kwargs, size=size, + sort_order='desc', + output_format='dataframe') + + # Check if list is non-empty + self.assertGreater(len(evals_setups), 0) + # Check if output from sort is sorted in the right order + self.assertSequenceEqual(sorted(evals_setups['value'].tolist(), reverse=True), + evals_setups['value'].tolist()) + + # Check if output and order of list_evaluations is preserved + self.assertSequenceEqual(evals_setups['run_id'].tolist(), evals['run_id'].tolist()) + # Check if the hyper-parameter column is as accurate and flow_id + for index, row in evals_setups.iterrows(): + params = openml.runs.get_run(row['run_id']).parameter_settings + hyper_params = [tuple([param['oml:name'], param['oml:value']]) for param in params] + self.assertTrue(sorted(row['parameters']) == sorted(hyper_params)) + def test_evaluation_list_filter_task(self): openml.config.server = self.production_server @@ -116,3 +140,41 @@ def test_evaluation_list_per_fold(self): for run_id in evaluations.keys(): self.assertIsNotNone(evaluations[run_id].value) self.assertIsNone(evaluations[run_id].values) + + def test_evaluation_list_sort(self): + size = 10 + task_id = 115 + # Get all evaluations of the task + unsorted_eval = openml.evaluations.list_evaluations( + "predictive_accuracy", offset=0, task=[task_id]) + # Get top 10 evaluations of the same task + sorted_eval = openml.evaluations.list_evaluations( + "predictive_accuracy", size=size, offset=0, task=[task_id], sort_order="desc") + self.assertEqual(len(sorted_eval), size) + self.assertGreater(len(unsorted_eval), 0) + sorted_output = [evaluation.value for evaluation in sorted_eval.values()] + unsorted_output = [evaluation.value for evaluation in unsorted_eval.values()] + + # Check if output from sort is sorted in the right order + self.assertTrue(sorted(sorted_output, reverse=True) == sorted_output) + + # Compare manual sorting against sorted output + test_output = sorted(unsorted_output, reverse=True) + self.assertTrue(test_output[:size] == sorted_output) + + def test_list_evaluation_measures(self): + measures = openml.evaluations.list_evaluation_measures() + self.assertEqual(isinstance(measures, list), True) + self.assertEqual(all([isinstance(s, str) for s in measures]), True) + + def test_list_evaluations_setups_filter_flow(self): + openml.config.server = self.production_server + flow_id = [405] + size = 100 + self._check_list_evaluation_setups(size, flow=flow_id) + + def test_list_evaluations_setups_filter_task(self): + openml.config.server = self.production_server + task_id = [6] + size = 100 + self._check_list_evaluation_setups(size, task=task_id) diff --git a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py 
b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py index aef064ad5..8bc615516 100644 --- a/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py +++ b/tests/test_extensions/test_sklearn_extension/test_sklearn_extension.py @@ -1,5 +1,6 @@ import collections import json +import re import os import sys import unittest @@ -27,10 +28,6 @@ import sklearn.tree import sklearn.cluster -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer import openml from openml.extensions.sklearn import SklearnExtension @@ -38,7 +35,8 @@ from openml.flows import OpenMLFlow from openml.flows.functions import assert_flows_equal from openml.runs.trace import OpenMLRunTrace -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer + this_directory = os.path.dirname(os.path.abspath(__file__)) sys.path.append(this_directory) @@ -76,6 +74,7 @@ def test_serialize_model(self): max_leaf_nodes=2000) fixture_name = 'sklearn.tree.tree.DecisionTreeClassifier' + fixture_short_name = 'sklearn.DecisionTreeClassifier' fixture_description = 'Automatically created scikit-learn flow.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -117,6 +116,7 @@ def test_serialize_model(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -142,6 +142,7 @@ def test_serialize_model_clustering(self): model = sklearn.cluster.KMeans() fixture_name = 'sklearn.cluster.k_means_.KMeans' + fixture_short_name = 'sklearn.KMeans' fixture_description = 'Automatically created scikit-learn flow.' version_fixture = 'sklearn==%s\nnumpy>=1.6.1\nscipy>=0.9' \ % sklearn.__version__ @@ -179,6 +180,7 @@ def test_serialize_model_clustering(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters, fixture_parameters) self.assertEqual(serialization.dependencies, version_fixture) @@ -204,6 +206,7 @@ def test_serialize_model_with_subcomponent(self): fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \ '(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)' fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' + fixture_short_name = 'sklearn.AdaBoostClassifier' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_subcomponent_name = 'sklearn.tree.tree.DecisionTreeClassifier' fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier' @@ -218,6 +221,7 @@ def test_serialize_model_with_subcomponent(self): self.assertEqual(serialization.name, fixture_name) self.assertEqual(serialization.class_name, fixture_class_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"') self.assertIsInstance(serialization.parameters['base_estimator'], str) @@ -259,6 +263,7 @@ def test_serialize_pipeline(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'dummy=sklearn.dummy.DummyClassifier)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,DummyClassifier)' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture_name: [], @@ -270,17 +275,21 @@ def test_serialize_pipeline(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) # Comparing the pipeline # The parameters only have the name of base objects(not the whole flow) # as value - # memory parameter has been added in 0.19 + # memory parameter has been added in 0.19, verbose in 0.21 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) + # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -343,6 +352,7 @@ def test_serialize_pipeline_clustering(self): fixture_name = 'sklearn.pipeline.Pipeline(' \ 'scaler=sklearn.preprocessing.data.StandardScaler,' \ 'clusterer=sklearn.cluster.k_means_.KMeans)' + fixture_short_name = 'sklearn.Pipeline(StandardScaler,KMeans)' fixture_description = 'Automatically created scikit-learn flow.' fixture_structure = { fixture_name: [], @@ -354,6 +364,7 @@ def test_serialize_pipeline_clustering(self): structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture_name) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) @@ -363,8 +374,10 @@ def test_serialize_pipeline_clustering(self): # memory parameter has been added in 0.19 if LooseVersion(sklearn.__version__) < "0.19": self.assertEqual(len(serialization.parameters), 1) - else: + elif LooseVersion(sklearn.__version__) < "0.21": self.assertEqual(len(serialization.parameters), 2) + else: + self.assertEqual(len(serialization.parameters), 3) # Hard to compare two representations of a dict due to possibly # different sorting. Making a json makes it easier self.assertEqual( @@ -431,6 +444,7 @@ def test_serialize_column_transformer(self): fixture = 'sklearn.compose._column_transformer.ColumnTransformer(' \ 'numeric=sklearn.preprocessing.data.StandardScaler,' \ 'nominal=sklearn.preprocessing._encoders.OneHotEncoder)' + fixture_short_name = 'sklearn.ColumnTransformer' fixture_description = 'Automatically created scikit-learn flow.' 
fixture_structure = { fixture: [], @@ -441,6 +455,7 @@ def test_serialize_column_transformer(self): serialization = self.extension.model_to_flow(model) structure = serialization.get_structure('name') self.assertEqual(serialization.name, fixture) + self.assertEqual(serialization.custom_name, fixture_short_name) self.assertEqual(serialization.description, fixture_description) self.assertDictEqual(structure, fixture_structure) # del serialization.model @@ -611,7 +626,7 @@ def test_serialize_feature_union_switched_names(self): .format(module_name_encoder)) def test_serialize_complex_flow(self): - ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[0]) + ohe = sklearn.preprocessing.OneHotEncoder() scaler = sklearn.preprocessing.StandardScaler(with_mean=False) boosting = sklearn.ensemble.AdaBoostClassifier( base_estimator=sklearn.tree.DecisionTreeClassifier()) @@ -734,15 +749,16 @@ def test_serialize_simple_parameter_grid(self): # Examples from the scikit-learn documentation models = [sklearn.svm.SVC(), sklearn.ensemble.RandomForestClassifier()] grids = \ - [[{'C': [1, 10, 100, 1000], 'kernel': ['linear']}, - {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], - 'kernel': ['rbf']}], - {"max_depth": [3, None], - "max_features": [1, 3, 10], - "min_samples_split": [1, 3, 10], - "min_samples_leaf": [1, 3, 10], - "bootstrap": [True, False], - "criterion": ["gini", "entropy"]}] + [[OrderedDict([('C', [1, 10, 100, 1000]), ('kernel', ['linear'])]), + OrderedDict([('C', [1, 10, 100, 1000]), ('gamma', [0.001, 0.0001]), + ('kernel', ['rbf'])])], + OrderedDict([("bootstrap", [True, False]), + ("criterion", ["gini", "entropy"]), + ("max_depth", [3, None]), + ("max_features", [1, 3, 10]), + ("min_samples_leaf", [1, 3, 10]), + ("min_samples_split", [1, 3, 10]) + ])] for grid, model in zip(grids, models): serialized = self.extension.model_to_flow(grid) @@ -750,9 +766,9 @@ def test_serialize_simple_parameter_grid(self): self.assertEqual(deserialized, grid) self.assertIsNot(deserialized, grid) - + # providing error_score because nan != nan hpo = sklearn.model_selection.GridSearchCV( - param_grid=grid, estimator=model) + param_grid=grid, estimator=model, error_score=-1000) serialized = self.extension.model_to_flow(hpo) deserialized = self.extension.flow_to_model(serialized) @@ -825,7 +841,8 @@ def test_serialize_advanced_grid_fails(self): ) with self.assertRaisesRegex( TypeError, - ".*OpenMLFlow.*is not JSON serializable", + re.compile(r".*OpenML.*Flow.*is not JSON serializable", + flags=re.DOTALL) ): self.extension.model_to_flow(clf) @@ -929,7 +946,7 @@ def test_illegal_parameter_names(self): def test_illegal_parameter_names_pipeline(self): # illegal name: steps steps = [ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')), @@ -942,7 +959,7 @@ def test_illegal_parameter_names_featureunion(self): # illegal name: transformer_list transformer_list = [ ('transformer_list', - Imputer(strategy='median')), + SimpleImputer(strategy='median')), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')) @@ -1001,18 +1018,25 @@ def test_paralizable_check(self): self.extension._prevent_optimize_n_jobs(model) def test__get_fn_arguments_with_defaults(self): - if LooseVersion(sklearn.__version__) < "0.19": + sklearn_version = LooseVersion(sklearn.__version__) + if sklearn_version < "0.19": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 
15), (sklearn.tree.DecisionTreeClassifier.__init__, 12), (sklearn.pipeline.Pipeline.__init__, 0) ] - else: + elif sklearn_version < "0.21": fns = [ (sklearn.ensemble.RandomForestRegressor.__init__, 16), (sklearn.tree.DecisionTreeClassifier.__init__, 13), (sklearn.pipeline.Pipeline.__init__, 1) ] + else: + fns = [ + (sklearn.ensemble.RandomForestRegressor.__init__, 16), + (sklearn.tree.DecisionTreeClassifier.__init__, 13), + (sklearn.pipeline.Pipeline.__init__, 2) + ] for fn, num_params_with_defaults in fns: defaults, defaultless = ( @@ -1033,7 +1057,7 @@ def test_deserialize_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.tree.DecisionTreeClassifier())] pipe_orig = sklearn.pipeline.Pipeline(steps=steps) @@ -1057,7 +1081,7 @@ def test_deserialize_adaboost_with_defaults(self): # used the 'initialize_with_defaults' flag of the deserialization # method to return a flow that contains default hyperparameter # settings. - steps = [('Imputer', Imputer()), + steps = [('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ('Estimator', sklearn.ensemble.AdaBoostClassifier( sklearn.tree.DecisionTreeClassifier()))] @@ -1083,7 +1107,7 @@ def test_deserialize_complex_with_defaults(self): # method to return a flow that contains default hyperparameter # settings. steps = [ - ('Imputer', Imputer()), + ('Imputer', SimpleImputer()), ('OneHotEncoder', sklearn.preprocessing.OneHotEncoder()), ( 'Estimator', @@ -1126,6 +1150,8 @@ def test_openml_param_name_to_sklearn(self): task = openml.tasks.get_task(115) run = openml.runs.run_flow_on_task(flow, task) run = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) run = openml.runs.get_run(run.run_id) setup = openml.setups.get_setup(run.setup_id) @@ -1217,6 +1243,14 @@ def setUp(self): ################################################################################################ # Test methods for performing runs with this extension module + def test_run_model_on_task(self): + class MyPipe(sklearn.pipeline.Pipeline): + pass + task = openml.tasks.get_task(1) + pipe = MyPipe([('imp', SimpleImputer()), + ('dummy', sklearn.dummy.DummyClassifier())]) + openml.runs.run_model_on_task(pipe, task) + def test_seed_model(self): # randomized models that are initialized without seeds, can be seeded randomized_clfs = [ @@ -1285,7 +1319,7 @@ def test_run_model_on_fold_classification_1(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeClassifier()), ]) # TODO add some mocking here to actually test the innards of this function, too! 
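Several of the assertions above are gated on the installed scikit-learn release, because `Pipeline` gained a `memory` parameter in 0.19 and a `verbose` parameter in 0.21. The hypothetical helper below (not part of the patch) condenses that gating into one place, mirroring the `LooseVersion` comparisons used in these tests.

```python
from distutils.version import LooseVersion

import sklearn

# Hypothetical helper summarising the version gating applied in these tests:
# the expected number of top-level Pipeline constructor parameters depends on
# the installed scikit-learn release.
def expected_pipeline_param_count() -> int:
    sklearn_version = LooseVersion(sklearn.__version__)
    if sklearn_version < "0.19":
        return 1  # only `steps`
    elif sklearn_version < "0.21":
        return 2  # `steps`, `memory`
    else:
        return 3  # `steps`, `memory`, `verbose`

print(expected_pipeline_param_count())
```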
@@ -1411,11 +1445,11 @@ def predict_proba(*args, **kwargs): y_train = y[train_indices] X_test = X[test_indices] clf1 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.naive_bayes.GaussianNB()) ]) clf2 = sklearn.pipeline.Pipeline(steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', HardNaiveBayes()) ]) @@ -1468,7 +1502,7 @@ def test_run_model_on_fold_regression(self): y_test = y[test_indices] pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.tree.DecisionTreeRegressor()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1513,7 +1547,7 @@ def test_run_model_on_fold_clustering(self): X = task.get_X(dataset_format='array') pipeline = sklearn.pipeline.Pipeline(steps=[ - ('imp', sklearn.preprocessing.Imputer()), + ('imp', SimpleImputer()), ('clf', sklearn.cluster.KMeans()), ]) # TODO add some mocking here to actually test the innards of this function, too! @@ -1596,3 +1630,62 @@ def test__extract_trace_data(self): self.assertIn(param_in_trace, trace_iteration.parameters) param_value = json.loads(trace_iteration.parameters[param_in_trace]) self.assertTrue(param_value in param_grid[param]) + + def test_trim_flow_name(self): + import re + long = """sklearn.pipeline.Pipeline( + columntransformer=sklearn.compose._column_transformer.ColumnTransformer( + numeric=sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + standardscaler=sklearn.preprocessing.data.StandardScaler), + nominal=sklearn.pipeline.Pipeline( + simpleimputer=sklearn.impute.SimpleImputer, + onehotencoder=sklearn.preprocessing._encoders.OneHotEncoder)), + variancethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + svc=sklearn.svm.classes.SVC)""" + short = "sklearn.Pipeline(ColumnTransformer,VarianceThreshold,SVC)" + shorter = "sklearn.Pipeline(...,SVC)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + self.assertEqual(shorter, + SklearnExtension.trim_flow_name(long_stripped, extra_trim_length=50)) + + long = """sklearn.pipeline.Pipeline( + imputation=openmlstudy14.preprocessing.ConditionalImputer, + hotencoding=sklearn.preprocessing.data.OneHotEncoder, + variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, + classifier=sklearn.ensemble.forest.RandomForestClassifier)""" + short = "sklearn.Pipeline(ConditionalImputer,OneHotEncoder,VarianceThreshold,RandomForestClassifier)" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + VarianceThreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold, # noqa: E501 + Estimator=sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.tree.tree.DecisionTreeClassifier))""" + short = "sklearn.Pipeline(Imputer,VarianceThreshold,RandomizedSearchCV(DecisionTreeClassifier))" # noqa: E501 + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.model_selection._search.RandomizedSearchCV( + estimator=sklearn.pipeline.Pipeline( + SimpleImputer=sklearn.preprocessing.imputation.Imputer, + 
classifier=sklearn.ensemble.forest.RandomForestClassifier))""" + short = "sklearn.RandomizedSearchCV(Pipeline(Imputer,RandomForestClassifier))" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = """sklearn.pipeline.FeatureUnion( + pca=sklearn.decomposition.pca.PCA, + svd=sklearn.decomposition.truncated_svd.TruncatedSVD)""" + short = "sklearn.FeatureUnion(PCA,TruncatedSVD)" + long_stripped, _ = re.subn(r'\s', '', long) + self.assertEqual(short, SklearnExtension.trim_flow_name(long_stripped)) + + long = "sklearn.ensemble.forest.RandomForestClassifier" + short = "sklearn.RandomForestClassifier" + self.assertEqual(short, SklearnExtension.trim_flow_name(long)) + + self.assertEqual("weka.IsolationForest", + SklearnExtension.trim_flow_name("weka.IsolationForest")) diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index 7b8c66cab..25e2dacfb 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -19,18 +19,13 @@ import sklearn.naive_bayes import sklearn.tree -if LooseVersion(sklearn.__version__) < "0.20": - from sklearn.preprocessing import Imputer -else: - from sklearn.impute import SimpleImputer as Imputer - import xmltodict import openml from openml._api_calls import _perform_api_call import openml.exceptions import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml.utils @@ -41,6 +36,9 @@ def setUp(self): super().setUp() self.extension = openml.extensions.sklearn.SklearnExtension() + def tearDown(self): + super().tearDown() + def test_get_flow(self): # We need to use the production server here because 4024 is not the # test server @@ -177,6 +175,9 @@ def test_publish_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertIsInstance(flow.flow_id, int) @mock.patch('openml.flows.functions.flow_exists') @@ -187,6 +188,9 @@ def test_publish_existing_flow(self, flow_exists_mock): with self.assertRaises(openml.exceptions.PyOpenMLError) as context_manager: flow.publish(raise_error_if_exists=True) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertTrue('OpenMLFlow already exists' in context_manager.exception.message) @@ -197,6 +201,9 @@ def test_publish_flow_with_similar_components(self): flow = self.extension.model_to_flow(clf) flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) # For a flow where both components are published together, the upload # date should be equal self.assertEqual( @@ -213,6 +220,9 @@ def test_publish_flow_with_similar_components(self): flow1 = self.extension.model_to_flow(clf1) flow1, sentinel = self._add_sentinel_to_flow_name(flow1, None) flow1.publish() + TestBase._mark_entity_for_removal('flow', (flow1.flow_id, flow1.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow1.flow_id)) # In order to assign different upload times to the flows!
time.sleep(1) @@ -222,6 +232,9 @@ def test_publish_flow_with_similar_components(self): flow2 = self.extension.model_to_flow(clf2) flow2, _ = self._add_sentinel_to_flow_name(flow2, sentinel) flow2.publish() + TestBase._mark_entity_for_removal('flow', (flow2.flow_id, flow2.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow2.flow_id)) # If one component was published before the other, the components in # the flow should have different upload dates self.assertNotEqual(flow2.upload_date, @@ -234,6 +247,9 @@ def test_publish_flow_with_similar_components(self): # Child flow has different parameter. Check for storing the flow # correctly on the server should thus not check the child's parameters! flow3.publish() + TestBase._mark_entity_for_removal('flow', (flow3.flow_id, flow3.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow3.flow_id)) def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! @@ -246,6 +262,9 @@ def test_semi_legal_flow(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) @mock.patch('openml.flows.functions.get_flow') @mock.patch('openml.flows.functions.flow_exists') @@ -260,6 +279,8 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): get_flow_mock.return_value = flow flow.publish() + # Not collecting flow_id for deletion since this is a test for failed upload + self.assertEqual(api_call_mock.call_count, 1) self.assertEqual(get_flow_mock.call_count, 1) self.assertEqual(flow_exists_mock.call_count, 1) @@ -271,10 +292,13 @@ def test_publish_error(self, api_call_mock, flow_exists_mock, get_flow_mock): with self.assertRaises(ValueError) as context_manager: flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) fixture = ( - "Flow was not stored correctly on the server. " - "New flow ID is 1. Please check manually and remove " + "The flow on the server is inconsistent with the local flow. " + "The server flow ID is 1. Please check manually and remove " "the flow if necessary! 
Error is:\n" "'Flow sklearn.ensemble.forest.RandomForestClassifier: " "values for attribute 'name' differ: " @@ -289,8 +313,8 @@ def test_illegal_flow(self): # should throw error as it contains two imputers illegal = sklearn.pipeline.Pipeline( steps=[ - ('imputer1', Imputer()), - ('imputer2', Imputer()), + ('imputer1', SimpleImputer()), + ('imputer2', SimpleImputer()), ('classif', sklearn.tree.DecisionTreeClassifier()) ] ) @@ -321,7 +345,7 @@ def test_existing_flow_exists(self): if LooseVersion(sklearn.__version__) >= '0.20': ohe_params['categories'] = 'auto' steps = [ - ('imputation', Imputer(strategy='median')), + ('imputation', SimpleImputer(strategy='median')), ('hotencoding', sklearn.preprocessing.OneHotEncoder(**ohe_params)), ( 'variencethreshold', @@ -336,6 +360,9 @@ def test_existing_flow_exists(self): flow, _ = self._add_sentinel_to_flow_name(flow, None) # publish the flow flow = flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) # redownload the flow flow = openml.flows.get_flow(flow.flow_id) @@ -394,6 +421,9 @@ def test_sklearn_to_upload_to_flow(self): flow, sentinel = self._add_sentinel_to_flow_name(flow, None) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + flow.flow_id)) self.assertIsInstance(flow.flow_id, int) # Check whether we can load the flow again diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 087623d3d..95b4fa3f0 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -4,6 +4,7 @@ from distutils.version import LooseVersion import sklearn +from sklearn import ensemble import pandas as pd import openml @@ -14,6 +15,12 @@ class TestFlowFunctions(TestBase): _multiprocess_can_split_ = True + def setUp(self): + super(TestFlowFunctions, self).setUp() + + def tearDown(self): + super(TestFlowFunctions, self).tearDown() + def _check_flow(self, flow): self.assertEqual(type(flow), dict) self.assertEqual(len(flow), 6) @@ -242,7 +249,6 @@ def test_are_flows_equal_ignore_if_older(self): def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder ordinal_encoder = OrdinalEncoder(categories=[[0, 1], [0, 1]]) - extension = openml.extensions.sklearn.SklearnExtension() # Test serialization works @@ -251,8 +257,42 @@ def test_sklearn_to_flow_list_of_lists(self): # Test flow is accepted by server self._add_sentinel_to_flow_name(flow) flow.publish() - + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # Test deserialization works server_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) self.assertEqual(server_flow.parameters['categories'], '[[0, 1], [0, 1]]') self.assertEqual(server_flow.model.categories, flow.model.categories) + + def test_get_flow_reinstantiate_model(self): + model = ensemble.RandomForestClassifier(n_estimators=33) + extension = openml.extensions.get_extension_by_model(model) + flow = extension.model_to_flow(model) + flow.publish(raise_error_if_exists=False) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) + + downloaded_flow = 
openml.flows.get_flow(flow.flow_id, reinstantiate=True) + self.assertIsInstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + + def test_get_flow_reinstantiate_model_no_extension(self): + # Flow 10 is a WEKA flow + self.assertRaisesRegex(RuntimeError, + "No extension could be found for flow 10: weka.SMO", + openml.flows.get_flow, + flow_id=10, + reinstantiate=True) + + @unittest.skipIf(LooseVersion(sklearn.__version__) == "0.19.1", + reason="Target flow is from sklearn 0.19.1") + def test_get_flow_reinstantiate_model_wrong_version(self): + # Note that CI does not test against 0.19.1. + openml.config.server = self.production_server + _, sklearn_major, _ = LooseVersion(sklearn.__version__).version[:3] + flow = 8175 + expected = 'Trying to deserialize a model with dependency sklearn==0.19.1 not satisfied.' + self.assertRaisesRegex(ValueError, + expected, + openml.flows.get_flow, + flow_id=flow, + reinstantiate=True) diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index bba14b324..88fe8d6ef 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -7,12 +7,13 @@ from sklearn.tree import DecisionTreeClassifier from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline -from sklearn.preprocessing import Imputer -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer import openml import openml.extensions.sklearn +import pytest + class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take @@ -104,7 +105,7 @@ def _check_array(array, type_): def test_to_from_filesystem_vanilla(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) task = openml.tasks.get_task(119) @@ -129,11 +130,15 @@ def test_to_from_filesystem_vanilla(self): self.assertTrue(run_prime.flow is None) self._test_run_obj_equals(run, run_prime) run_prime.publish() + TestBase._mark_entity_for_removal('run', run_prime.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run_prime.run_id)) + @pytest.mark.flaky(reruns=3) def test_to_from_filesystem_search(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DecisionTreeClassifier(max_depth=1)), ]) model = GridSearchCV( @@ -162,11 +167,14 @@ def test_to_from_filesystem_search(self): run_prime = openml.runs.OpenMLRun.from_filesystem(cache_path) self._test_run_obj_equals(run, run_prime) run_prime.publish() + TestBase._mark_entity_for_removal('run', run_prime.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run_prime.run_id)) def test_to_from_filesystem_no_model(self): model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ -196,7 +204,7 @@ def test_publish_with_local_loaded_flow(self): extension = openml.extensions.sklearn.SklearnExtension() model = Pipeline([ - ('imputer', Imputer(strategy='mean')), + ('imputer', SimpleImputer(strategy='mean')), ('classifier', DummyClassifier()), ]) task = openml.tasks.get_task(119) @@ -226,6 +234,9 @@ def test_publish_with_local_loaded_flow(self): # obtain run from filesystem loaded_run = openml.runs.OpenMLRun.from_filesystem(cache_path) loaded_run.publish() + TestBase._mark_entity_for_removal('run', 
loaded_run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + loaded_run.run_id)) # make sure the flow is published as part of publishing the run. self.assertTrue(openml.flows.flow_exists(flow.name, flow.external_version)) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 0c8b861c4..dc35d1f01 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -17,7 +17,7 @@ import pandas as pd import openml.extensions.sklearn -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer from openml.runs.functions import ( _run_task_get_arffcontent, run_exists, @@ -28,7 +28,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.model_selection._search import BaseSearchCV from sklearn.tree import DecisionTreeClassifier -from sklearn.preprocessing.imputation import Imputer + from sklearn.dummy import DummyClassifier from sklearn.preprocessing import StandardScaler from sklearn.feature_selection import VarianceThreshold @@ -184,6 +184,8 @@ def _remove_random_state(flow): flow, _ = self._add_sentinel_to_flow_name(flow, sentinel) if not openml.flows.flow_exists(flow.name, flow.external_version): flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) task = openml.tasks.get_task(task_id) @@ -196,6 +198,8 @@ def _remove_random_state(flow): avoid_duplicate_runs=openml.config.avoid_duplicate_runs, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) self.assertEqual(run_, run) self.assertIsInstance(run.dataset_id, int) @@ -407,7 +411,7 @@ def determine_grid_size(param_grid): # suboptimal (slow), and not guaranteed to work if evaluation # engine is behind. # TODO: mock this? We have the arff already on the server - self._wait_for_processed_run(run.run_id, 200) + self._wait_for_processed_run(run.run_id, 400) try: model_prime = openml.runs.initialize_model_from_trace( run_id=run.run_id, @@ -546,7 +550,7 @@ def get_ct_cf(nominal_indices, numeric_indices): '62501', sentinel=sentinel) def test_run_and_upload_decision_tree_pipeline(self): - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -653,7 +657,7 @@ def test_learning_curve_task_2(self): num_folds = 10 num_samples = 8 - pipeline2 = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + pipeline2 = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold()), ('Estimator', RandomizedSearchCV( DecisionTreeClassifier(), @@ -687,6 +691,8 @@ def test_initialize_cv_from_run(self): seed=1, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -708,9 +714,9 @@ def _test_local_evaluations(self, run): np.testing.assert_array_almost_equal(accuracy_scores_provided, accuracy_scores) - # also check if we can obtain some other scores: # TODO: how to do AUC? 
+ # also check if we can obtain some other scores: tests = [(sklearn.metrics.cohen_kappa_score, {'weights': None}), - (sklearn.metrics.auc, {'reorder': True}), + (sklearn.metrics.roc_auc_score, {}), (sklearn.metrics.average_precision_score, {}), (sklearn.metrics.jaccard_similarity_score, {}), (sklearn.metrics.precision_score, {'average': 'macro'}), @@ -725,10 +731,10 @@ def _test_local_evaluations(self, run): self.assertGreaterEqual(alt_scores[idx], 0) self.assertLessEqual(alt_scores[idx], 1) - def test_local_run_metric_score_swapped_parameter_order_model(self): + def test_local_run_swapped_parameter_order_model(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -736,18 +742,17 @@ def test_local_run_metric_score_swapped_parameter_order_model(self): # invoke OpenML run run = openml.runs.run_model_on_task( - model=clf, - task=task, + task, clf, avoid_duplicate_runs=False, upload_flow=False, ) self._test_local_evaluations(run) - def test_local_run_metric_score_swapped_parameter_order_flow(self): + def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) flow = self.extension.model_to_flow(clf) @@ -756,8 +761,7 @@ def test_local_run_metric_score_swapped_parameter_order_flow(self): # invoke OpenML run run = openml.runs.run_flow_on_task( - flow=flow, - task=task, + task, flow, avoid_duplicate_runs=False, upload_flow=False, ) @@ -767,7 +771,7 @@ def test_local_run_metric_score_swapped_parameter_order_flow(self): def test_local_run_metric_score(self): # construct sci-kit learn classifier - clf = Pipeline(steps=[('imputer', Imputer(strategy='median')), + clf = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('estimator', RandomForestClassifier())]) # download task @@ -794,7 +798,7 @@ def test_online_run_metric_score(self): def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='median')), + ('Imputer', SimpleImputer(strategy='median')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', GaussianNB())]) task = openml.tasks.get_task(11) @@ -804,6 +808,8 @@ def test_initialize_model_from_run(self): avoid_duplicate_runs=False, ) run_ = run.publish() + TestBase._mark_entity_for_removal('run', run_.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run_.run_id)) run = openml.runs.get_run(run_.run_id) modelR = openml.runs.initialize_model_from_run(run_id=run.run_id) @@ -855,6 +861,8 @@ def test_get_run_trace(self): num_iterations * num_folds, ) run = run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) self._wait_for_processed_run(run.run_id, 200) run_id = run.run_id except openml.exceptions.OpenMLRunsExistError as e: @@ -874,12 +882,12 @@ def test__run_exists(self): rs = 1 clfs = [ sklearn.pipeline.Pipeline(steps=[ - ('Imputer', Imputer(strategy='mean')), + ('Imputer', SimpleImputer(strategy='mean')), ('VarianceThreshold', VarianceThreshold(threshold=0.05)), ('Estimator', DecisionTreeClassifier(max_depth=4)) ]), sklearn.pipeline.Pipeline(steps=[ - ('Imputer', 
Imputer(strategy='most_frequent')), + ('Imputer', SimpleImputer(strategy='most_frequent')), ('VarianceThreshold', VarianceThreshold(threshold=0.1)), ('Estimator', DecisionTreeClassifier(max_depth=4))] ) @@ -899,6 +907,8 @@ def test__run_exists(self): upload_flow=True ) run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(run.run_id)) except openml.exceptions.PyOpenMLError: # run already existed. Great. pass @@ -959,6 +969,8 @@ def test_run_with_illegal_flow_id_after_load(self): "but 'flow.flow_id' is not None.") with self.assertRaisesRegex(openml.exceptions.PyOpenMLError, expected_message_regex): loaded_run.publish() + TestBase._mark_entity_for_removal('run', loaded_run.run_id) + TestBase.logger.info("collected from test_run_functions: {}".format(loaded_run.run_id)) def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing @@ -968,6 +980,8 @@ def test_run_with_illegal_flow_id_1(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server + TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -993,6 +1007,8 @@ def test_run_with_illegal_flow_id_1_after_load(self): flow_orig = self.extension.model_to_flow(clf) try: flow_orig.publish() # ensures flow exist on server + TestBase._mark_entity_for_removal('flow', (flow_orig.flow_id, flow_orig.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow_orig.flow_id)) except openml.exceptions.OpenMLServerException: # flow already exists pass @@ -1235,7 +1251,7 @@ def test_run_on_dataset_with_missing_labels(self): flow.name = 'dummy' task = openml.tasks.get_task(2) - model = Pipeline(steps=[('Imputer', Imputer(strategy='median')), + model = Pipeline(steps=[('Imputer', SimpleImputer(strategy='median')), ('Estimator', DecisionTreeClassifier())]) data_content, _, _, _ = _run_task_get_arffcontent( @@ -1261,12 +1277,14 @@ def test_get_uncached_run(self): with self.assertRaises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - def test_run_model_on_task_downloaded_flow(self): + def test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) flow.publish(raise_error_if_exists=False) + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from test_run_functions: {}".format(flow.flow_id)) - downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) + downloaded_flow = openml.flows.get_flow(flow.flow_id) task = openml.tasks.get_task(119) # diabetes run = openml.runs.run_flow_on_task( flow=downloaded_flow, @@ -1276,3 +1294,5 @@ def test_run_model_on_task_downloaded_flow(self): ) run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a8f7de4d4..16e149544 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -40,6 +40,8 @@ def test_nonexisting_setup_exists(self): flow = self.extension.model_to_flow(dectree) flow.name 
= 'TEST%s%s' % (sentinel, flow.name) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # although the flow exists (created as of previous statement), # we can be sure there are no setups (yet) as it was just created @@ -52,6 +54,8 @@ def _existing_setup_exists(self, classif): flow = self.extension.model_to_flow(classif) flow.name = 'TEST%s%s' % (get_sentinel(), flow.name) flow.publish() + TestBase._mark_entity_for_removal('flow', (flow.flow_id, flow.name)) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], flow.flow_id)) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been run @@ -66,6 +70,8 @@ def _existing_setup_exists(self, classif): # spoof flow id, otherwise the sentinel is ignored run.flow_id = flow.flow_id run.publish() + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], run.run_id)) # download the run, as it contains the right setup id run = openml.runs.get_run(run.run_id) diff --git a/tests/test_study/test_study_examples.py b/tests/test_study/test_study_examples.py index abee2d72a..1d9c56d54 100644 --- a/tests/test_study/test_study_examples.py +++ b/tests/test_study/test_study_examples.py @@ -1,4 +1,4 @@ -from openml.testing import TestBase +from openml.testing import TestBase, SimpleImputer class TestStudyFunctions(TestBase): @@ -30,12 +30,13 @@ def test_Figure1a(self): import sklearn.pipeline import sklearn.preprocessing import sklearn.tree + benchmark_suite = openml.study.get_study( 'OpenML100', 'tasks' ) # obtain the benchmark suite clf = sklearn.pipeline.Pipeline( steps=[ - ('imputer', sklearn.preprocessing.Imputer()), + ('imputer', SimpleImputer()), ('estimator', sklearn.tree.DecisionTreeClassifier()) ] ) # build a sklearn classifier @@ -51,4 +52,7 @@ def test_Figure1a(self): ) # print accuracy score print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name, score.mean())) run.publish() # publish the experiment on OpenML (optional) + TestBase._mark_entity_for_removal('run', run.run_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + run.run_id)) print('URL for run: %s/run/%d' % (openml.config.server, run.run_id)) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index c87dd8e15..33ba0c452 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -77,6 +77,9 @@ def test_publish_benchmark_suite(self): task_ids=fixture_task_ids ) study_id = study.publish() + TestBase._mark_entity_for_removal('study', study_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], study_id)) + self.assertGreater(study_id, 0) # verify main meta data @@ -132,6 +135,8 @@ def test_publish_study(self): run_ids=list(run_list.keys()) ) study_id = study.publish() + # not tracking this upload for deletion, since _delete_entity is called at the end of + # the test, which asserts the return status of openml.study.delete_study() self.assertGreater(study_id, 0) study_downloaded = openml.study.get_study(study_id) self.assertEqual(study_downloaded.alias, fixt_alias) @@ -181,6 +186,8 @@ def test_study_attach_illegal(self): run_ids=list(run_list.keys()) ) study_id = study.publish() + TestBase._mark_entity_for_removal('study', study_id) + TestBase.logger.info("collected from {}: 
{}".format(__file__.split('/')[-1], study_id)) study_original = openml.study.get_study(study_id) with self.assertRaisesRegex(openml.exceptions.OpenMLServerException, diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index 21e03052f..168b798d1 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -1,5 +1,7 @@ import openml +from openml.testing import TestBase from .test_task import OpenMLTaskTest +from openml.exceptions import OpenMLServerException class OpenMLClusteringTaskTest(OpenMLTaskTest): @@ -28,19 +30,31 @@ def test_download_task(self): self.assertEqual(task.dataset_id, 36) def test_upload_task(self): - - # The base class uploads a clustering task with a target - # feature. A situation where a ground truth is available - # to benchmark the clustering algorithm. - super(OpenMLClusteringTaskTest, self).test_upload_task() - - dataset_id = self._get_compatible_rand_dataset() - # Upload a clustering task without a ground truth. - task = openml.tasks.create_task( - task_type_id=self.task_type_id, - dataset_id=dataset_id, - estimation_procedure_id=self.estimation_procedure - ) - - task_id = task.publish() - openml.utils._delete_entity('task', task_id) + compatible_datasets = self._get_compatible_rand_dataset() + for i in range(100): + try: + dataset_id = compatible_datasets[i % len(compatible_datasets)] + # Upload a clustering task without a ground truth. + task = openml.tasks.create_task( + task_type_id=self.task_type_id, + dataset_id=dataset_id, + estimation_procedure_id=self.estimation_procedure + ) + task_id = task.publish() + TestBase._mark_entity_for_removal('task', task_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + task_id)) + # success + break + except OpenMLServerException as e: + # Error code for 'task already exists' is 614 in practice, although the docs + # (https://www.openml.org/api_docs#!/task/post_task) say it should be 533 + if e.code == 614: + continue + else: + raise e + else: + raise ValueError( + 'Could not create a valid task for task type ID {}'.format(self.task_type_id) + ) diff --git a/tests/test_tasks/test_split.py b/tests/test_tasks/test_split.py index 46c6564a1..763bb15f7 100644 --- a/tests/test_tasks/test_split.py +++ b/tests/test_tasks/test_split.py @@ -19,8 +19,7 @@ def setUp(self): self.directory, "..", "files", "org", "openml", "test", "tasks", "1882", "datasplits.arff" ) - # TODO Needs to be adapted regarding the python version - self.pd_filename = self.arff_filename.replace(".arff", ".pkl") + self.pd_filename = self.arff_filename.replace(".arff", ".pkl.py3") def tearDown(self): try: diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index fe7fa5f0e..3066d9ce9 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -1,5 +1,6 @@ import unittest -from random import randint +from typing import List +from random import randint, shuffle from openml.exceptions import OpenMLServerException from openml.testing import TestBase @@ -11,9 +12,6 @@ create_task, get_task ) -from openml.utils import ( - _delete_entity, -) class OpenMLTaskTest(TestBase): @@ -47,9 +45,10 @@ def test_upload_task(self): # beforehand would not be an option because a concurrent unit test could potentially # create the same task and make this unit test fail (i.e. getting a dataset and creating # a task for it is not atomic). 
+ compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): try: - dataset_id = self._get_compatible_rand_dataset() + dataset_id = compatible_datasets[i % len(compatible_datasets)] # TODO consider implementing on the diff task types. task = create_task( task_type_id=self.task_type_id, @@ -59,6 +58,9 @@ def test_upload_task(self): ) task_id = task.publish() + TestBase._mark_entity_for_removal('task', task_id) + TestBase.logger.info("collected from {}: {}".format(__file__.split('/')[-1], + task_id)) # success break except OpenMLServerException as e: @@ -74,9 +76,7 @@ def test_upload_task(self): 'Could not create a valid task for task type ID {}'.format(self.task_type_id) ) - _delete_entity('task', task_id) - - def _get_compatible_rand_dataset(self) -> int: + def _get_compatible_rand_dataset(self) -> List: compatible_datasets = [] active_datasets = list_datasets(status='active') @@ -84,22 +84,30 @@ def _get_compatible_rand_dataset(self) -> int: # depending on the task type, find either datasets # with only symbolic features or datasets with only # numerical features. - if self.task_type_id != 2: + if self.task_type_id == 2: + # regression task + for dataset_id, dataset_info in active_datasets.items(): + if 'NumberOfSymbolicFeatures' in dataset_info: + if dataset_info['NumberOfSymbolicFeatures'] == 0: + compatible_datasets.append(dataset_id) + elif self.task_type_id == 5: + # clustering task + compatible_datasets = list(active_datasets.keys()) + else: for dataset_id, dataset_info in active_datasets.items(): # extra checks because of: # https://github.com/openml/OpenML/issues/959 if 'NumberOfNumericFeatures' in dataset_info: if dataset_info['NumberOfNumericFeatures'] == 0: compatible_datasets.append(dataset_id) - else: - for dataset_id, dataset_info in active_datasets.items(): - if 'NumberOfSymbolicFeatures' in dataset_info: - if dataset_info['NumberOfSymbolicFeatures'] == 0: - compatible_datasets.append(dataset_id) - random_dataset_pos = randint(0, len(compatible_datasets) - 1) + # in-place shuffling + shuffle(compatible_datasets) + return compatible_datasets - return compatible_datasets[random_dataset_pos] + # random_dataset_pos = randint(0, len(compatible_datasets) - 1) + # + # return compatible_datasets[random_dataset_pos] def _get_random_feature(self, dataset_id: int) -> str: diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index ef3a454d8..f773752d5 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -12,6 +12,12 @@ class TestTask(TestBase): _multiprocess_can_split_ = True + def setUp(self): + super(TestTask, self).setUp() + + def tearDown(self): + super(TestTask, self).tearDown() + def test__get_cached_tasks(self): openml.config.cache_directory = self.static_cache_dir tasks = openml.tasks.functions._get_cached_tasks() @@ -78,6 +84,8 @@ def test_list_tasks_empty(self): self.assertIsInstance(tasks, dict) + @unittest.skip("Server will currently incorrectly return only 99 tasks." 
+ " See https://github.com/openml/OpenML/issues/980") def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag='OpenML100') diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 55cbba64b..4a0789414 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -7,6 +7,12 @@ # Common methods between tasks class OpenMLTaskMethodsTest(TestBase): + def setUp(self): + super(OpenMLTaskMethodsTest, self).setUp() + + def tearDown(self): + super(OpenMLTaskMethodsTest, self).tearDown() + def test_tagging(self): task = openml.tasks.get_task(1) tag = "testing_tag_{}_{}".format(self.id(), time()) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 04f803f86..1f754c23a 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -43,12 +43,16 @@ def test_list_all_for_datasets(self): self._check_dataset(datasets[did]) def test_list_datasets_with_high_size_parameter(self): + # Testing on the production server, since concurrent deletion of uploaded datasets makes the test fail + openml.config.server = self.production_server + datasets_a = openml.datasets.list_datasets() datasets_b = openml.datasets.list_datasets(size=np.inf) - # note that in the meantime the number of datasets could have increased - # due to tests that run in parallel. - self.assertGreaterEqual(len(datasets_b), len(datasets_a)) + # Reverting to test server + openml.config.server = self.test_server + + self.assertEqual(len(datasets_a), len(datasets_b)) def test_list_all_for_tasks(self): required_size = 1068 # default test server reset value
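The TestBase._mark_entity_for_removal(...) and TestBase.logger.info(...) pairs added after nearly every publish() call in this patch register uploaded flows, runs, tasks, and studies so they can be deleted from the test server afterwards. The helper itself lives in openml/testing.py and is not shown in this diff; the sketch below only illustrates the assumed bookkeeping (the class name TrackedTestBaseSketch and the method _delete_tracked_entities are made up for illustration), and it ignores details such as deletion order (e.g. runs before their flows) that a real implementation would have to respect.

import logging

import openml
import openml.exceptions
import openml.utils


class TrackedTestBaseSketch:
    """Collects entities uploaded to the test server so they can be removed later."""

    logger = logging.getLogger("openml_test_cleanup")
    # maps an entity type ('run', 'flow', 'task', 'study') to the collected ids;
    # flows are tracked as (flow_id, flow_name) tuples, everything else by plain id
    publish_tracker = {}

    @classmethod
    def _mark_entity_for_removal(cls, entity_type, entity_id):
        # called right after a successful publish(), as in the tests above
        cls.publish_tracker.setdefault(entity_type, []).append(entity_id)

    @classmethod
    def _delete_tracked_entities(cls):
        for entity_type, entities in cls.publish_tracker.items():
            for entity in entities:
                entity_id = entity[0] if entity_type == 'flow' else entity
                try:
                    # _delete_entity is the same helper the old tests called directly
                    openml.utils._delete_entity(entity_type, entity_id)
                    cls.logger.info("deleted %s %s", entity_type, entity_id)
                except openml.exceptions.OpenMLServerException:
                    # the entity may already have been removed; log and continue
                    cls.logger.warning("could not delete %s %s", entity_type, entity_id)
            entities.clear()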