Commit

Merge pull request #54 from xmnlab/master
Merged from master
xmnlab authored Jun 4, 2018
2 parents 7644a3d + 8de95f8 commit 55a46fe
Showing 11 changed files with 42 additions and 26 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -4,7 +4,7 @@ version: 2
 base: &base
   machine:
     image: circleci/classic:latest
-    docker_layer_caching: true
+    docker_layer_caching: false
   working_directory: ~/ibis/ci


37 changes: 27 additions & 10 deletions ci/datamgr.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python

 import os
-import six
 import sys
-import click
 import tarfile
+
+import click
+import six

 import pandas as pd
 import sqlalchemy as sa
@@ -69,16 +70,32 @@ def read_tables(names, data_directory):
         yield (name, df)


+def convert_to_database_compatible_value(value):
+    """Pandas 0.23 broke DataFrame.to_sql, so we workaround it by rolling our
+    own extremely low-tech conversion routine
+    """
+    if pd.isnull(value):
+        return None
+    elif isinstance(value, pd.Timestamp):
+        return value.to_pydatetime()
+    else:
+        return value
+
+
+def insert(engine, tablename, df):
+    keys = df.columns
+    rows = [
+        dict(zip(keys, tuple(map(convert_to_database_compatible_value, row))))
+        for row in df.itertuples(index=False, name=None)
+    ]
+    t = sa.Table(tablename, sa.MetaData(bind=engine), autoload=True)
+    engine.execute(t.insert(), rows)
+
+
 def insert_tables(engine, names, data_directory):
     for table, df in read_tables(names, data_directory):
         with engine.begin() as connection:
-            df.to_sql(
-                table, connection, index=False, if_exists='append',
-                chunksize=1 if os.name == 'nt' else None
-                # Pandas 0.23 uses multi value inserts which is very slow for a
-                # chunksize of 1. For some reason this only shows up on
-                # Appveyor Windows CI
-            )
+            insert(connection, table, df)


 @click.group()
@@ -326,7 +343,7 @@ def clickhouse(schema, tables, data_directory, **params):
     cols = df.select_dtypes([object]).columns
     df[cols] = df[cols].fillna('')

-    df.to_sql(table, engine, index=False, if_exists='append')
+    insert(engine, table, df)


 if __name__ == '__main__':
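Note: the new insert() path sidesteps DataFrame.to_sql entirely; each row is converted to plain Python values and handed to a reflected SQLAlchemy table insert. A minimal sketch of the conversion step (the sample DataFrame is illustrative; only pandas is required):

import pandas as pd

def convert_to_database_compatible_value(value):
    # NaN/NaT become SQL NULL; pd.Timestamp becomes a stdlib datetime;
    # everything else passes through unchanged.
    if pd.isnull(value):
        return None
    elif isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    return value

df = pd.DataFrame({'x': [1, None],
                   'ts': [pd.Timestamp('2018-06-04'), pd.NaT]})
rows = [
    dict(zip(df.columns, map(convert_to_database_compatible_value, row)))
    for row in df.itertuples(index=False, name=None)
]
# rows == [{'x': 1.0, 'ts': datetime.datetime(2018, 6, 4, 0, 0)},
#          {'x': None, 'ts': None}]

The resulting list of dicts is what engine.execute(t.insert(), rows) receives, so the same code path works regardless of which pandas version the now-unpinned CI environments resolve.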
2 changes: 1 addition & 1 deletion ci/requirements-dev-2.7.yml
@@ -18,7 +18,7 @@ dependencies:
 - mock
 - multipledispatch
 - numpy=1.11.*
-- pandas<0.23
+- pandas
 - pathlib2
 - plumbum
 - psycopg2
2 changes: 1 addition & 1 deletion ci/requirements-dev-3.5.yml
@@ -14,7 +14,7 @@ dependencies:
 - lz4
 - multipledispatch
 - numpy=1.12.0
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ci/requirements-dev-3.6.yml
@@ -14,7 +14,7 @@ dependencies:
 - lz4
 - multipledispatch
 - numpy
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ci/requirements-docs-3.6.yml
@@ -18,7 +18,7 @@ dependencies:
 - nbsphinx
 - numpy
 - numpydoc
-- pandas<0.23
+- pandas
 - plumbum
 - psycopg2
 - pyarrow>=0.6.0
2 changes: 1 addition & 1 deletion ibis/bigquery/tests/test_client.py
@@ -290,7 +290,7 @@ def test_scalar_param_timestamp(alltypes, df, timestamp_value):
     result = expr.execute(
         params={param: timestamp_value}
     ).sort_values('timestamp_col').reset_index(drop=True)
-    value = pd.Timestamp(timestamp_value, tz='UTC')
+    value = pd.Timestamp(timestamp_value)
     expected = df.loc[
         df.timestamp_col <= value, ['timestamp_col']
     ].sort_values('timestamp_col').reset_index(drop=True)
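Note: dropping tz='UTC' keeps value comparable with the tz-naive timestamp_col used to build expected; pandas refuses to compare tz-aware and tz-naive timestamps, which is presumably what the unpinned pandas surfaced here. A quick illustration (the values are made up):

import pandas as pd

naive = pd.Timestamp('2018-06-04')
aware = pd.Timestamp('2018-06-04', tz='UTC')

naive <= pd.Timestamp('2018-06-05')  # fine: both sides are tz-naive
try:
    naive <= aware
except TypeError as e:
    print(e)  # cannot compare tz-naive and tz-aware timestamps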
3 changes: 1 addition & 2 deletions ibis/clickhouse/tests/test_types.py
@@ -1,5 +1,4 @@
 import pytest
-import pandas as pd


 pytest.importorskip('clickhouse_driver')
@@ -14,4 +13,4 @@ def test_column_types(alltypes):
     assert df.bigint_col.dtype.name == 'int64'
     assert df.float_col.dtype.name == 'float32'
     assert df.double_col.dtype.name == 'float64'
-    assert pd.core.common.is_datetime64_dtype(df.timestamp_col.dtype)
+    assert df.timestamp_col.dtype.name == 'datetime64[ns]'
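Note: pd.core.common.is_datetime64_dtype is a private helper that newer pandas no longer exposes at that path, hence the switch to comparing dtype names; this also lets the pandas import go away. If a predicate is preferred, the public pandas.api.types module offers an equivalent (a sketch, assuming a reasonably recent pandas):

import pandas as pd
from pandas.api.types import is_datetime64_dtype

s = pd.Series(pd.to_datetime(['2018-06-04']))
assert s.dtype.name == 'datetime64[ns]'  # the style the tests use now
assert is_datetime64_dtype(s.dtype)      # public-API alternative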
4 changes: 2 additions & 2 deletions ibis/impala/tests/test_connection_pool.py
@@ -29,7 +29,7 @@ def test_connection_pool_size(env, hdfs_client):
         host=env.impala_host,
         database=env.test_data_db,
     )
-    assert client.con.connection_pool_size == 1
+    assert len(client.con.connection_pool) == 1


@pytest.mark.impala
@@ -41,4 +41,4 @@ def test_connection_pool_size_after_close(env, hdfs_client):
         database=env.test_data_db,
     )
     client.close()
-    assert client.con.connection_pool_size == 0
+    assert len(client.con.connection_pool) == 0
2 changes: 1 addition & 1 deletion ibis/impala/tests/test_exprs.py
@@ -1214,7 +1214,7 @@ def test_column_types(self):
         assert df.bigint_col.dtype.name == 'int64'
         assert df.float_col.dtype.name == 'float32'
         assert df.double_col.dtype.name == 'float64'
-        assert pd.core.common.is_datetime64_dtype(df.timestamp_col.dtype)
+        assert df.timestamp_col.dtype.name == 'datetime64[ns]'

     def test_timestamp_builtins(self):
         i32 = L(50000)
10 changes: 5 additions & 5 deletions ibis/pandas/udf.py
@@ -168,21 +168,21 @@ def udf_signature(input_type, klass):
     >>> input_type = [dt.int64, dt.double]
     >>> sig = udf_signature(input_type, pd.Series)
     >>> pprint(sig)  # doctest: +ELLIPSIS
-    ((<class 'pandas.core.series.Series'>,
+    ((<class '...Series'>,
       <... 'int'>,
       <... 'numpy.integer'>,
       <... 'NoneType'>),
-     (<class 'pandas.core.series.Series'>,
+     (<class '...Series'>,
       <... 'float'>,
       <... 'numpy.floating'>,
       <... 'NoneType'>))
     >>> input_type = [dt.Int64(nullable=False), dt.Double(nullable=False)]
     >>> sig = udf_signature(input_type, SeriesGroupBy)
-    >>> pprint(sig)
-    ((<class 'pandas.core.groupby.SeriesGroupBy'>,
+    >>> pprint(sig)  # doctest: +ELLIPSIS
+    ((<class '...SeriesGroupBy'>,
       <... 'int'>,
       <... 'numpy.integer'>),
-     (<class 'pandas.core.groupby.SeriesGroupBy'>,
+     (<class '...SeriesGroupBy'>,
       <... 'float'>,
       <... 'numpy.floating'>))
     """
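Note: the ellipsis markers make the doctest indifferent to the module path under which pandas exposes Series and SeriesGroupBy, which moved between pandas releases. A self-contained illustration of the mechanism (the wrapper function here is hypothetical):

import doctest

def show_series_type():
    """
    >>> import pandas as pd
    >>> pd.Series  # doctest: +ELLIPSIS
    <class '...Series'>
    """

doctest.run_docstring_examples(show_series_type, {})
# passes on any pandas version: '...' matches 'pandas.core.series.'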
