From b6a7d2ebddd69f0952adbf93161923952a1a608a Mon Sep 17 00:00:00 2001 From: Felix Biessmann Date: Thu, 6 Feb 2020 21:15:48 +0100 Subject: [PATCH] Fix broken build, see [issue 115](https://github.com/awslabs/datawig/issues/115) - fixed dependencies in requirements.txt - changed pandas usage to latest API version --- README.md | 8 ++++++-- datawig/imputer.py | 4 ++-- requirements/requirements.txt | 10 +++++----- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 1112ec9..7dc86de 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,8 @@ The DataWig API expects your data as a [pandas DataFrame](https://pandas.pydata. | SDCards | Best SDCard ever ... | 8GB | Blue | | Dress | This **yellow** dress | M | **?** | +### Quickstart Example + For most use cases, the `SimpleImputer` class is the best starting point. For convenience there is the function [SimpleImputer.complete](https://datawig.readthedocs.io/en/latest/source/API.html#datawig.simple_imputer.SimpleImputer.complete) that takes a DataFrame and fits an imputation model for each column with missing values, with all other columns as inputs: ```python @@ -60,8 +62,10 @@ You can also impute values in specific columns only (called `output_column` belo import datawig df = datawig.utils.generate_df_string( num_samples=200, - data_column_name='sentences', label_column_name='label') + data_column_name='sentences', + label_column_name='label') +df_train, df_test = datawig.utils.random_split(df) #Initialize a SimpleImputer model imputer = datawig.SimpleImputer( @@ -138,4 +142,4 @@ Run tests: ``` ./venv/bin/pip install -r requirements/requirements.dev.txt ./venv/bin/python -m pytest -``` \ No newline at end of file +``` diff --git a/datawig/imputer.py b/datawig/imputer.py index bef01fe..d843ab3 100644 --- a/datawig/imputer.py +++ b/datawig/imputer.py @@ -819,14 +819,14 @@ def predict(self, predictions = self.predict_above_precision(data_frame, precision_threshold).items() for label, imputations in predictions: imputation_col = label + imputation_suffix - if data_frame.columns.contains(imputation_col): + if imputation_col in data_frame.columns: raise ColumnOverwriteException( "DataFrame contains column {}; remove column and try again".format( imputation_col)) if label not in numerical_outputs: imputation_proba_col = label + score_suffix - if data_frame.columns.contains(imputation_proba_col): + if imputation_proba_col in data_frame.columns: raise ColumnOverwriteException( "DataFrame contains column {}; remove column and try again".format( imputation_proba_col)) diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 27e6352..52115a7 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,5 +1,5 @@ -numpy>=1.15.0 -scikit-learn[alldeps]>=0.20.0 -typing>=3.6.6 -pandas>=0.22.0 -mxnet>=1.3.0 \ No newline at end of file +numpy==1.18.0 +scikit-learn[alldeps]==0.22.1 +typing==3.6.6 +pandas==0.25.0 +mxnet==1.4.0