Skip to content

Commit

Permalink
* #136 & #86: column replacements for values
Browse files Browse the repository at this point in the history
  • Loading branch information
aschonfeld committed May 30, 2020
1 parent 76bfa6c commit 7ee3f77
Show file tree
Hide file tree
Showing 22 changed files with 2,068 additions and 6 deletions.
3 changes: 2 additions & 1 deletion docs/source/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ markdown
pandas
future
Flask
scipy
scipy
scikit-learn
3 changes: 2 additions & 1 deletion dtale/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,14 @@ def __init__(self, *args, **kwargs):
self.host = kwargs.pop('hostname', 'localhost')
self.port = kwargs.pop('port', str(random.randint(0, 65535))) or str(random.randint(0, 65535))
super(DtaleFlaskTesting, self).__init__(*args, **kwargs)
self.application.config['SERVER_NAME'] = '{host}:{port}'.format(host=self.host, port=self.port)
self.application.config['SESSION_COOKIE_DOMAIN'] = 'localhost.localdomain'

def get(self, *args, **kwargs):
    """
    Issue a GET request after pointing the application's SERVER_NAME at this client's host & port.

    :param args: Optional arguments to be passed to :meth:`flask:flask.FlaskClient.get`
    :param kwargs: Optional keyword arguments to be passed to :meth:`flask:flask.FlaskClient.get`
    :return: whatever the parent class' ``get`` returns
    """
    server_name = '{host}:{port}'.format(host=self.host, port=self.port)
    # SERVER_NAME is (re)applied on every request rather than once at construction time
    self.application.config['SERVER_NAME'] = server_name
    parent = super(DtaleFlaskTesting, self)
    return parent.get(*args, url_scheme='http', **kwargs)


Expand Down
239 changes: 239 additions & 0 deletions dtale/column_replacements.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
import re

import numpy as np
import pandas as pd
from six import string_types

import dtale.global_state as global_state
from dtale.utils import classify_type, find_dtype


class ColumnReplacement(object):
    """
    Facade which selects the builder matching ``replacement_type`` and runs it against the data stored in
    :mod:`dtale.global_state` under ``data_id``.

    :param data_id: identifier used to look up the data in global state
    :param col: name of the column to perform replacements upon
    :param replacement_type: one of 'spaces', 'strings', 'value' or 'imputer'
    :param cfg: dict of options which is handed, untouched, to the selected builder
    :param name: optional column name which is handed, untouched, to the selected builder
    :raises NotImplementedError: if ``replacement_type`` is not one of the supported types
    """

    def __init__(self, data_id, col, replacement_type, cfg, name=None):
        self.data_id = data_id
        # guard clause first so an unknown type fails before any builder is looked up
        if replacement_type not in ('spaces', 'strings', 'value', 'imputer'):
            raise NotImplementedError("'{}' replacement not implemented yet!".format(replacement_type))
        builder_cls = {
            'spaces': SpaceReplacement,
            'strings': StringReplacement,
            'value': ValueReplacement,
            'imputer': ImputerReplacement,  # iterative, knn, simple
        }[replacement_type]
        self.builder = builder_cls(col, cfg, name)

    def _data(self):
        """Fetch the data currently stored for this instance's ``data_id``."""
        return global_state.get_data(self.data_id)

    def build_replacements(self):
        """Return the builder's replaced column computed from the stored data."""
        return self.builder.build_column(self._data())

    def build_code(self):
        """Return the builder's code snippet (a string) describing the replacement."""
        return self.builder.build_code(self._data())


def get_inner_replacement_value(val):
    """
    Translate the string 'nan' (in any casing) into ``np.nan``; every other value is returned untouched.

    :param val: raw value taken from a replacement configuration
    :return: ``np.nan`` when ``val`` is the string 'nan' (case-insensitive), otherwise ``val``
    """
    if not isinstance(val, string_types):
        return val
    if val.lower() != 'nan':
        return val
    return np.nan


def get_replacement_value(cfg, prop):
    """
    Read ``prop`` from ``cfg`` and convert it into the value to be used in a replacement.

    :param cfg: dict of replacement options (``None`` is treated as an empty dict)
    :param prop: key to read from ``cfg``
    :return: ``np.nan`` when the property is missing, ``None``, an empty string or the string 'nan'
             (case-insensitive), otherwise the configured value itself
    """
    value = (cfg or {}).get(prop)
    # Only a missing/blank entry defaults to nan. A plain truthiness test (`value or 'nan'`) would also turn
    # legitimate values such as 0 or False into nan, making it impossible to replace (or replace with) zero.
    if value is None or value == '':
        value = 'nan'
    return get_inner_replacement_value(value)


def get_inner_replacement_value_as_str(val, series):
    """
    Render ``val`` the way it should appear inside a generated code snippet.

    :param val: raw value taken from a replacement configuration
    :param series: series whose dtype decides whether the value gets wrapped in quotes
    :return: the text 'np.nan' for the string 'nan' (case-insensitive), ``val`` wrapped in single quotes when the
             dtype of ``series`` classifies as 'S' (presumably strings, confirm in dtale.utils), otherwise ``val``
             unchanged
    """
    is_nan = isinstance(val, string_types) and val.lower() == 'nan'
    if is_nan:
        return 'np.nan'
    if classify_type(find_dtype(series)) != 'S':
        return val
    return "'{value}'".format(value=val)


def get_replacement_value_as_str(cfg, prop, series):
    """
    Read ``prop`` from ``cfg`` and render it for use inside a generated code snippet.

    :param cfg: dict of replacement options (``None`` is treated as an empty dict)
    :param prop: key to read from ``cfg``
    :param series: series the replacement applies to, forwarded to :func:`get_inner_replacement_value_as_str`
    :return: whatever :func:`get_inner_replacement_value_as_str` returns for the configured value, with a
             missing, ``None`` or empty-string property being treated as 'nan'
    """
    value = (cfg or {}).get(prop)
    # Only a missing/blank entry defaults to nan. A plain truthiness test (`value or 'nan'`) would also render
    # legitimate values such as 0 or False as np.nan in the generated code.
    if value is None or value == '':
        value = 'nan'
    return get_inner_replacement_value_as_str(value, series)


class SpaceReplacement(object):
    """
    Replaces values which consist solely of whitespace (regex ``^\\s+$``) with the value configured under
    ``cfg['value']``.

    :param col: name of the column to perform replacements upon
    :param cfg: dict of options, only 'value' is read
    :param name: optional name of the column the generated code assigns to (falls back to ``col``)
    """

    def __init__(self, col, cfg, name):
        self.col, self.cfg, self.name = col, cfg, name

    def build_column(self, data):
        """Return ``data[col]`` with the whitespace-only values replaced."""
        replace_with = get_replacement_value(self.cfg, 'value')
        target = data[self.col]
        return target.replace(r'^\s+$', replace_with, regex=True)

    def build_code(self, data):
        """Return a one-line code snippet (string) which performs the same replacement on ``df``."""
        template = "df.loc[:, '{name}'] = df['{col}'].replace(r'^\\s+$', {value}, regex=True)"
        return template.format(
            name=self.name or self.col,
            col=self.col,
            value=get_replacement_value_as_str(self.cfg, 'value', data[self.col])
        )


class StringReplacement(object):
    """
    Replaces values which, apart from leading/trailing spaces, consist of a specific string (or, when
    ``cfg['isChar']`` is set, of one or more of the characters in that string).

    :param col: name of the column to perform replacements upon
    :param cfg: dict of options; 'value', 'ignoreCase' & 'isChar' are required, 'replace' is optional
    :param name: optional name of the column the generated code assigns to (falls back to ``col``)
    """

    def __init__(self, col, cfg, name):
        self.col = col
        self.cfg = cfg
        self.name = name

    def parse_cfg(self):
        """
        :return: generator yielding ``cfg['value']``, ``cfg['ignoreCase']`` & ``cfg['isChar']`` in that order
        :raises KeyError: (on iteration) if any of those keys is missing from ``cfg``
        """
        return (self.cfg[p] for p in ['value', 'ignoreCase', 'isChar'])

    def build_column(self, data):
        """Return ``data[col]`` with the matching values replaced by the configured 'replace' value."""
        value, ignore_case, is_char = self.parse_cfg()
        flags = re.UNICODE
        if ignore_case:
            flags |= re.IGNORECASE
        # the search text is user input, so it is escaped before being embedded in the pattern
        value = re.escape(value)
        if is_char:
            value = '[{value}]+'.format(value=value)
        regex_pat = re.compile(r'^ *{value} *$'.format(value=value), flags=flags)
        replace_with = get_replacement_value(self.cfg, 'replace')
        return data[self.col].replace(regex_pat, replace_with, regex=True)

    def _build_pattern_code(self):
        """Return the line of generated code which compiles the same pattern ``build_column`` uses."""
        value, ignore_case, is_char = self.parse_cfg()
        # flags are spelled out by name; formatting the flag value itself yields an enum repr or a bare integer
        flags = 're.UNICODE | re.IGNORECASE' if ignore_case else 're.UNICODE'
        # this half is NOT formatted here: its '{value}' placeholder must survive into the generated code...
        regex_exp = "r'^ *[{value}]+ *$'" if is_char else "r'^ *{value} *$'"
        # ...while the search text is embedded as a quoted literal (repr) so the generated call is valid python
        regex_exp += '.format(value=re.escape({value!r}))'.format(value=value)
        return 'regex_pat = re.compile({regex_exp}, flags={flags})'.format(regex_exp=regex_exp, flags=flags)

    def build_code(self, data):
        """Return a code snippet (string) which performs the same replacement on ``df``."""
        pattern_code = self._build_pattern_code()
        replace_with = get_replacement_value_as_str(self.cfg, 'replace', data[self.col])
        return (
            "import re\n\n"
            "{pattern_code}\n"
            "df.loc[:, '{name}'] = df['{col}'].replace(regex_pat, {replace}, regex=True)"
        ).format(name=self.name or self.col, col=self.col, pattern_code=pattern_code, replace=replace_with)


class ValueReplacement(object):
    """
    Replaces specific values of a column with either a raw value, an aggregation of the column or the values
    of another column.

    ``cfg['value']`` is a list of dicts, each of which is read for:
    - 'value': the value to search for
    - 'type': 'agg', 'col' or anything else (raw replacement)
    - 'replace': name of the aggregation (type 'agg'), name of the other column (type 'col') or the raw value

    :param col: name of the column to perform replacements upon
    :param cfg: dict of options, only 'value' is read
    :param name: optional name of the column the generated code assigns to (falls back to ``col``)
    """

    def __init__(self, col, cfg, name):
        self.col = col
        self.cfg = cfg
        self.name = name

    def build_column(self, data):
        """
        :param data: dataframe containing ``col`` (and any columns referenced by 'col' replacements)
        :return: series with all configured replacements applied
        :raises Exception: if an aggregation evaluates to nan
        """
        s = data[self.col]
        replacements = {}
        col_replacements = []
        for replacement in self.cfg.get('value', []):
            value = get_replacement_value(replacement, 'value')
            replacement_type = replacement.get('type')
            if replacement_type == 'col':
                # stored as plain data (not a closure over the loop variables) so each entry keeps its own
                # value/column, and skipped past the dict bookkeeping below which does not apply to it
                col_replacements.append((value, replacement['replace']))
                continue
            if replacement_type == 'agg':
                agg = replacement['replace']  # min, max, mean, median
                replace = getattr(s, agg)()
                if pd.isnull(replace):
                    raise Exception(
                        'Running the aggregation, {agg}, on {col} resulted in nan, this would result in a '
                        'no-op.'.format(agg=agg, col=self.col)
                    )
            else:
                replace = get_replacement_value(replacement, 'replace')
            replacements[value] = replace
        final_s = s
        if replacements:
            final_s = final_s.replace(replacements)
        for value, other_col in col_replacements:
            # nan never compares equal to anything, so it needs an explicit null check
            matches = final_s.isnull() if pd.isnull(value) else final_s == value
            # Series.mask keeps the result a series (with its index) where np.where would return an ndarray
            final_s = final_s.mask(matches, data[other_col])
        return final_s

    def build_code(self, data):
        """
        :param data: dataframe containing ``col``, its dtype decides how values are rendered
        :return: code snippet (string) which performs the same replacements on ``df``
        """
        replacements = []
        series = data[self.col]
        col_replacements = []
        for replacement in self.cfg.get('value', []):
            value = get_replacement_value_as_str(replacement, 'value', series)
            # the type lives on each replacement entry, not on the top-level cfg
            replacement_type = replacement.get('type')
            if replacement_type == 'col':
                condition = 's.isnull()' if value == 'np.nan' else 's == {value}'.format(value=value)
                col_replacements.append("s = s.mask({condition}, df['{col2}'])".format(
                    col2=replacement['replace'], condition=condition
                ))
                continue
            if replacement_type == 'agg':
                # the aggregation name is stored under 'replace' ('value' holds the value being searched for)
                replace = "getattr(df['{col}'], '{agg}')()".format(agg=replacement['replace'], col=self.col)
            else:
                replace = get_replacement_value_as_str(replacement, 'replace', series)
            replacements.append('\t{value}: {replace}'.format(value=value, replace=replace))

        code = ["s = df['{col}']".format(col=self.col)]
        if replacements:
            replacements = ',\n'.join(replacements)
            replacements = '{\n' + replacements + '}'
            code.append("s = s.replace({replacements})".format(replacements=replacements))
        code += col_replacements
        code.append("df.loc[:, '{name}'] = s".format(name=self.name or self.col))
        return '\n'.join(code)


class ImputerReplacement(object):
    """
    Fills a column using one of scikit-learn's imputers, selected by ``cfg['type']``
    ('iterative', 'knn' or 'simple').

    :param col: name of the column to impute
    :param cfg: dict of options; 'type' is required, 'n_neighbors' (default 2) is read for the 'knn' type
    :param name: optional name of the column the generated code assigns to (falls back to ``col``)
    """

    def __init__(self, col, cfg, name):
        self.col = col
        self.cfg = cfg
        self.name = name

    def build_column(self, data):
        """
        :param data: dataframe containing ``col``
        :return: series (indexed like ``data``) holding the imputer's output for ``col``
        :raises Exception: if the installed scikit-learn does not provide the requested imputer
        :raises NotImplementedError: if ``cfg['type']`` is not a supported imputer
        """
        imputer_type = self.cfg['type']
        # imports are local so scikit-learn is only required once an imputer is actually requested
        if imputer_type == 'iterative':
            try:
                from sklearn.experimental import enable_iterative_imputer  # noqa
                from sklearn.impute import IterativeImputer
            except ImportError:
                raise Exception(
                    'You must have at least scikit-learn 0.21.0 installed in order to use the Iterative Imputer!'
                )
            imputer = IterativeImputer()
        elif imputer_type == 'knn':
            try:
                from sklearn.impute import KNNImputer
            except ImportError:
                raise Exception(
                    'You must have at least scikit-learn 0.22.0 installed in order to use the KNN Imputer!'
                )
            n_neighbors = self.cfg.get('n_neighbors') or 2
            imputer = KNNImputer(n_neighbors=n_neighbors)
        elif imputer_type == 'simple':
            try:
                from sklearn.impute import SimpleImputer
            except ImportError:
                raise Exception(
                    'You must have at least scikit-learn 0.20.0 installed in order to use the Simple Imputer!'
                )
            imputer = SimpleImputer()
        else:
            raise NotImplementedError("'{}' sklearn imputer not implemented yet!".format(imputer_type))
        output = imputer.fit_transform(data[[self.col]])
        return pd.DataFrame(output, columns=[self.col], index=data.index)[self.col]

    def build_code(self, _data):
        """
        :param _data: unused
        :return: code snippet (string) which performs the same imputation on ``df``
        :raises NotImplementedError: if ``cfg['type']`` is not a supported imputer
        """
        imputer_type = self.cfg['type']
        code = []
        if imputer_type == 'iterative':
            code.append((
                "from sklearn.experimental import enable_iterative_imputer\n"
                "from sklearn.impute import IterativeImputer\n\n"
                "output = IterativeImputer().fit_transform(df[['{col}']])"
            ).format(col=self.col))
        elif imputer_type == 'knn':
            n_neighbors = self.cfg.get('n_neighbors') or 2
            code.append((
                "from sklearn.impute import KNNImputer\n\n"
                "output = KNNImputer(n_neighbors={n_neighbors}).fit_transform(df[['{col}']])"
            ).format(col=self.col, n_neighbors=n_neighbors))
        elif imputer_type == 'simple':
            code.append((
                "from sklearn.impute import SimpleImputer\n\n"
                "output = SimpleImputer().fit_transform(df[['{col}']])"
            ).format(col=self.col))
        else:
            # mirror build_column instead of emitting a snippet which references an undefined `output`
            raise NotImplementedError("'{}' sklearn imputer not implemented yet!".format(imputer_type))
        code.append(
            "df.loc[:, '{name}'] = pd.DataFrame(output, columns=['{col}'], index=df.index)['{col}']".format(
                name=self.name or self.col, col=self.col
            )
        )
        return '\n'.join(code)
7 changes: 5 additions & 2 deletions dtale/static/css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -4551,7 +4551,8 @@ button.close {
}
}

div.build-modal > div.modal-lg {
div.build-modal > div.modal-lg,
div.replacement-modal > div.modal-lg {
min-width: 720px;
}

Expand All @@ -4567,7 +4568,8 @@ div.filter-modal > div.modal-lg {
.modal-lg {
max-width: 800px;
}
div.build-modal > div.modal-lg {
div.build-modal > div.modal-lg,
div.replacement-modal > div.modal-lg {
max-width: 720px;
}
div.reshape-modal > div.modal-lg {
Expand Down Expand Up @@ -10603,6 +10605,7 @@ div.container-fluid.code-export > div#popup-content > div.modal-footer {

@media (min-height: 330px) {
div.container-fluid.build > div#popup-content > div.modal-footer,
div.container-fluid.replacement > div#popup-content > div.modal-footer,
div.container-fluid.reshape > div#popup-content > div.modal-footer {
position: absolute;
bottom: 0;
Expand Down
67 changes: 67 additions & 0 deletions dtale/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from dtale.cli.clickutils import retrieve_meta_info_and_version
from dtale.column_builders import ColumnBuilder
from dtale.column_filters import ColumnFilter
from dtale.column_replacements import ColumnReplacement
from dtale.dash_application.charts import (build_raw_chart, chart_url_params,
chart_url_querystring, export_chart,
export_chart_data, url_encode_func)
Expand Down Expand Up @@ -951,6 +952,72 @@ def reshape_data(data_id):
return jsonify_error(e)


@dtale.route('/build-replacement/<data_id>')
def build_replacement(data_id):
    """
    :class:`flask:flask.Flask` route to handle the replacement of specific values within a column in a dataframe. Some
    of the operations that are available are:
     - spaces: replace values consisting of only spaces with a specific value
     - value: replace specific values with a specific value or aggregation
     - strings: replace values which contain a specific character or string (case-insensitive or not) with a
                specific value
     - imputer: replace nan values using sklearn imputers iterative, knn or simple

    :param data_id: integer string identifier for a D-Tale process's data
    :type data_id: str
    :param col: string from flask.request.args['col'] of the column to perform replacements upon
    :param type: string from flask.request.args['type'] of the type of replacement to perform
                 (spaces/value/strings/imputer)
    :param cfg: dict from flask.request.args['cfg'] of how to calculate the replacements
    :param name: optional string from flask.request.args['name']; when given the output is stored as a new column
                 of this name, otherwise the replacement is applied to ``col`` in place
    :return: JSON {success: True/False}
    """

    def build_data_ranges(data, col, dtype):
        # min/max are only computed for columns classified as 'F' which hold at least one non-null value
        data_ranges = {}
        if classify_type(dtype) == 'F' and not data[col].isnull().all():
            try:
                data_ranges[col] = data[[col]].agg(['min', 'max']).to_dict()[col]
            except ValueError:
                pass
        return data_ranges

    try:
        data = global_state.get_data(data_id)
        name = get_str_arg(request, 'name')
        if name is not None:
            name = str(name)
            if name in data.columns:
                raise Exception("A column named '{}' already exists!".format(name))
        col = get_str_arg(request, 'col')
        replacement_type = get_str_arg(request, 'type')
        cfg = json.loads(get_str_arg(request, 'cfg'))

        # `name` is forwarded so the code stored in the history assigns to the column that is actually written
        builder = ColumnReplacement(data_id, col, replacement_type, cfg, name)
        output = builder.build_replacements()
        dtype = find_dtype(output)
        curr_dtypes = global_state.get_dtypes(data_id)

        if name is not None:
            data.loc[:, name] = output
            dtype_f = dtype_formatter(data, {name: dtype}, build_data_ranges(data, name, dtype))
            curr_dtypes.append(dtype_f(len(curr_dtypes), name))
        else:
            data.loc[:, col] = output
            # `name` is None in this branch, the dtype information has to be keyed by the column being replaced
            dtype_f = dtype_formatter(data, {col: dtype}, build_data_ranges(data, col, dtype))
            col_index = next((i for i, d in enumerate(curr_dtypes) if d['name'] == col), None)
            curr_col_dtype = dtype_f(col_index, col)
            curr_dtypes = [curr_col_dtype if d['name'] == col else d for d in curr_dtypes]

        global_state.set_data(data_id, data)
        global_state.set_dtypes(data_id, curr_dtypes)
        curr_history = global_state.get_history(data_id) or []
        curr_history += [builder.build_code()]
        global_state.set_history(data_id, curr_history)
        return jsonify(success=True)
    except BaseException as e:
        return jsonify_error(e)


@dtale.route('/test-filter/<data_id>')
def test_filter(data_id):
"""
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def run_tests(self):
"itsdangerous",
"pandas",
"requests",
"scikit-learn >= 0.21.0",
"scipy",
"six"
],
Expand Down
Loading

0 comments on commit 7ee3f77

Please sign in to comment.