Commit

Merge pull request #54 from xmnlab/master
Merged from master
xmnlab authored Jun 4, 2018
2 parents 7644a3d + 8de95f8 commit 55a46fe
Showing 11 changed files with 42 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -4,7 +4,7 @@ version: 2
 base: &base
   machine:
     image: circleci/classic:latest
-    docker_layer_caching: true
+    docker_layer_caching: false
   working_directory: ~/ibis/ci


37 changes: 27 additions & 10 deletions ci/datamgr.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python

 import os
-import six
 import sys
-import click
 import tarfile
+
+import click
+import six

 import pandas as pd
 import sqlalchemy as sa
@@ -69,16 +70,32 @@ def read_tables(names, data_directory):
         yield (name, df)


+def convert_to_database_compatible_value(value):
+    """Pandas 0.23 broke DataFrame.to_sql, so we workaround it by rolling our
+    own extremely low-tech conversion routine
+    """
+    if pd.isnull(value):
+        return None
+    elif isinstance(value, pd.Timestamp):
+        return value.to_pydatetime()
+    else:
+        return value
+
+
+def insert(engine, tablename, df):
+    keys = df.columns
+    rows = [
+        dict(zip(keys, tuple(map(convert_to_database_compatible_value, row))))
+        for row in df.itertuples(index=False, name=None)
+    ]
+    t = sa.Table(tablename, sa.MetaData(bind=engine), autoload=True)
+    engine.execute(t.insert(), rows)
+
+
 def insert_tables(engine, names, data_directory):
     for table, df in read_tables(names, data_directory):
         with engine.begin() as connection:
-            df.to_sql(
-                table, connection, index=False, if_exists='append',
-                chunksize=1 if os.name == 'nt' else None
-                # Pandas 0.23 uses multi value inserts which is very slow for a
-                # chunksize of 1. For some reason this only shows up on
-                # Appveyor Windows CI
-            )
+            insert(connection, table, df)


 @click.group()
@@ -326,7 +343,7 @@ def clickhouse(schema, tables, data_directory, **params):
     cols = df.select_dtypes([object]).columns
     df[cols] = df[cols].fillna('')

-    df.to_sql(table, engine, index=False, if_exists='append')
+    insert(engine, table, df)


 if __name__ == '__main__':
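Note: the new insert() path sidesteps DataFrame.to_sql entirely; each row is converted to plain Python values and handed to a reflected SQLAlchemy table insert. A minimal sketch of the conversion step (the sample DataFrame is illustrative; only pandas is required):

import pandas as pd

def convert_to_database_compatible_value(value):
    # NaN/NaT become SQL NULL; pd.Timestamp becomes a stdlib datetime;
    # everything else passes through unchanged.
    if pd.isnull(value):
        return None
    elif isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    return value

df = pd.DataFrame({'x': [1, None],
                   'ts': [pd.Timestamp('2018-06-04'), pd.NaT]})
rows = [
    dict(zip(df.columns, map(convert_to_database_compatible_value, row)))
    for row in df.itertuples(index=False, name=None)
]
# rows == [{'x': 1.0, 'ts': datetime.datetime(2018, 6, 4, 0, 0)},
#          {'x': None, 'ts': None}]

The resulting list of dicts is what engine.execute(t.insert(), rows) receives, so the same code path works regardless of which pandas version the now-unpinned CI environments resolve.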
2 changes: 1 addition & 1 deletion ci/requirements-dev-2.7.yml
@@ -18,7 +18,7 @@ dependencies:
 - mock
 - multipledispatch
 - numpy=1.11.*
-- pandas<0.23
+- pandas
 - pathlib2
 - plumbum
 - psycopg2
2 changes: 1 addition & 1 deletion ci/requirements-dev-3.5.yml
@@ -14,7 +14,7 @@ dependencies:
 - lz4
 - multipledispatch
 - numpy=1.12.0
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ci/requirements-dev-3.6.yml
@@ -14,7 +14,7 @@ dependencies:
 - lz4
 - multipledispatch
 - numpy
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ci/requirements-docs-3.6.yml
@@ -18,7 +18,7 @@ dependencies:
 - nbsphinx
 - numpy
 - numpydoc
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ibis/bigquery/tests/test_client.py
@@ -290,7 +290,7 @@ def test_scalar_param_timestamp(alltypes, df, timestamp_value):
     result = expr.execute(
         params={param: timestamp_value}
     ).sort_values('timestamp_col').reset_index(drop=True)
-    value = pd.Timestamp(timestamp_value, tz='UTC')
+    value = pd.Timestamp(timestamp_value)
     expected = df.loc[
         df.timestamp_col <= value, ['timestamp_col']
     ].sort_values('timestamp_col').reset_index(drop=True)
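Note: dropping tz='UTC' keeps value comparable with the tz-naive timestamp_col used to build expected; pandas refuses to compare tz-aware and tz-naive timestamps, which is presumably what the unpinned pandas surfaced here. A quick illustration (the values are made up):

import pandas as pd

naive = pd.Timestamp('2018-06-04')
aware = pd.Timestamp('2018-06-04', tz='UTC')

naive <= pd.Timestamp('2018-06-05')  # fine: both sides are tz-naive
try:
    naive <= aware
except TypeError as e:
    print(e)  # cannot compare tz-naive and tz-aware timestamps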
3 changes: 1 addition & 2 deletions ibis/clickhouse/tests/test_types.py
@@ -1,5 +1,4 @@
 import pytest
-import pandas as pd


 pytest.importorskip('clickhouse_driver')
@@ -14,4 +13,4 @@ def test_column_types(alltypes):
     assert df.bigint_col.dtype.name == 'int64'
     assert df.float_col.dtype.name == 'float32'
     assert df.double_col.dtype.name == 'float64'
-    assert pd.core.common.is_datetime64_dtype(df.timestamp_col.dtype)
+    assert df.timestamp_col.dtype.name == 'datetime64[ns]'
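Note: pd.core.common.is_datetime64_dtype is a private helper that newer pandas no longer exposes at that path, hence the switch to comparing dtype names; this also lets the pandas import go away. If a predicate is preferred, the public pandas.api.types module offers an equivalent (a sketch, assuming a reasonably recent pandas):

import pandas as pd
from pandas.api.types import is_datetime64_dtype

s = pd.Series(pd.to_datetime(['2018-06-04']))
assert s.dtype.name == 'datetime64[ns]'  # the style the tests use now
assert is_datetime64_dtype(s.dtype)      # public-API alternative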
4 changes: 2 additions & 2 deletions ibis/impala/tests/test_connection_pool.py
@@ -29,7 +29,7 @@ def test_connection_pool_size(env, hdfs_client):
         host=env.impala_host,
         database=env.test_data_db,
     )
-    assert client.con.connection_pool_size == 1
+    assert len(client.con.connection_pool) == 1


@pytest.mark.impala
@@ -41,4 +41,4 @@ def test_connection_pool_size_after_close(env, hdfs_client):
         database=env.test_data_db,
     )
     client.close()
-    assert client.con.connection_pool_size == 0
+    assert len(client.con.connection_pool) == 0
2 changes: 1 addition & 1 deletion ibis/impala/tests/test_exprs.py
@@ -1214,7 +1214,7 @@ def test_column_types(self):
         assert df.bigint_col.dtype.name == 'int64'
         assert df.float_col.dtype.name == 'float32'
         assert df.double_col.dtype.name == 'float64'
-        assert pd.core.common.is_datetime64_dtype(df.timestamp_col.dtype)
+        assert df.timestamp_col.dtype.name == 'datetime64[ns]'

     def test_timestamp_builtins(self):
         i32 = L(50000)
10 changes: 5 additions & 5 deletions ibis/pandas/udf.py
@@ -168,21 +168,21 @@ def udf_signature(input_type, klass):
     >>> input_type = [dt.int64, dt.double]
     >>> sig = udf_signature(input_type, pd.Series)
     >>> pprint(sig)  # doctest: +ELLIPSIS
-    ((<class 'pandas.core.series.Series'>,
+    ((<class '...Series'>,
       <... 'int'>,
       <... 'numpy.integer'>,
       <... 'NoneType'>),
-     (<class 'pandas.core.series.Series'>,
+     (<class '...Series'>,
       <... 'float'>,
       <... 'numpy.floating'>,
       <... 'NoneType'>))
     >>> input_type = [dt.Int64(nullable=False), dt.Double(nullable=False)]
     >>> sig = udf_signature(input_type, SeriesGroupBy)
-    >>> pprint(sig)
-    ((<class 'pandas.core.groupby.SeriesGroupBy'>,
+    >>> pprint(sig)  # doctest: +ELLIPSIS
+    ((<class '...SeriesGroupBy'>,
       <... 'int'>,
       <... 'numpy.integer'>),
-     (<class 'pandas.core.groupby.SeriesGroupBy'>,
+     (<class '...SeriesGroupBy'>,
       <... 'float'>,
       <... 'numpy.floating'>))
     """
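Note: the ellipsis markers make the doctest indifferent to the module path under which pandas exposes Series and SeriesGroupBy, which moved between pandas releases. A self-contained illustration of the mechanism (the wrapper function here is hypothetical):

import doctest

def show_series_type():
    """
    >>> import pandas as pd
    >>> pd.Series  # doctest: +ELLIPSIS
    <class '...Series'>
    """

doctest.run_docstring_examples(show_series_type, {})
# passes on any pandas version: '...' matches 'pandas.core.series.'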
