diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 4ce2ce5b69cb4..081e84c57c0ac 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -118,6 +118,7 @@ Other enhancements - ``Series`` gained an ``is_unique`` attribute (:issue:`11946`) - ``DataFrame.quantile`` and ``Series.quantile`` now accept ``interpolation`` keyword (:issue:`10174`). - ``DataFrame.select_dtypes`` now allows the ``np.float16`` typecode (:issue:`11990`) +- ``DataFrame.to_sql`` now allows a single value as the SQL type for all columns (:issue:`11886`). .. _whatsnew_0180.enhancements.rounding: @@ -303,6 +304,9 @@ Other API Changes - ``.memory_usage`` now includes values in the index, as does memory_usage in ``.info`` (:issue:`11597`) +- ``DataFrame.to_latex()`` now supports non-ascii encodings (e.g. utf-8) in Python 2 with the parameter ``encoding`` (:issue:`7061`) + + Changes to eval ^^^^^^^^^^^^^^^ @@ -463,6 +467,7 @@ Bug Fixes - Bug in ``pd.read_clipboard`` and ``pd.to_clipboard`` functions not supporting Unicode; upgrade included ``pyperclip`` to v1.5.15 (:issue:`9263`) - Bug in ``DataFrame.query`` containing an assignment (:issue:`8664`) +- Bug in ``from_msgpack`` where ``__contains__()`` fails for columns of the unpacked ``DataFrame``, if the ``DataFrame`` has object columns. (:issue:`11880`) - Bug in timezone info lost when broadcasting scalar datetime to ``DataFrame`` (:issue:`11682`) diff --git a/pandas/core/format.py b/pandas/core/format.py index 86d39c139fb51..a50edd9462431 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -619,105 +619,20 @@ def _join_multiline(self, *strcols): st = ed return '\n\n'.join(str_lst) - def to_latex(self, column_format=None, longtable=False): + def to_latex(self, column_format=None, longtable=False, encoding=None): """ Render a DataFrame to a LaTeX tabular/longtable environment output. 
""" - self.escape = self.kwds.get('escape', True) - def get_col_type(dtype): - if issubclass(dtype.type, np.number): - return 'r' - else: - return 'l' - - frame = self.frame - - if len(frame.columns) == 0 or len(frame.index) == 0: - info_line = (u('Empty %s\nColumns: %s\nIndex: %s') - % (type(self.frame).__name__, - frame.columns, frame.index)) - strcols = [[info_line]] - else: - strcols = self._to_str_columns() - - if self.index and isinstance(self.frame.index, MultiIndex): - clevels = self.frame.columns.nlevels - strcols.pop(0) - name = any(self.frame.index.names) - for i, lev in enumerate(self.frame.index.levels): - lev2 = lev.format() - blank = ' ' * len(lev2[0]) - lev3 = [blank] * clevels - if name: - lev3.append(lev.name) - for level_idx, group in itertools.groupby( - self.frame.index.labels[i]): - count = len(list(group)) - lev3.extend([lev2[level_idx]] + [blank] * (count - 1)) - strcols.insert(i, lev3) - - if column_format is None: - dtypes = self.frame.dtypes._values - column_format = ''.join(map(get_col_type, dtypes)) - if self.index: - index_format = 'l' * self.frame.index.nlevels - column_format = index_format + column_format - elif not isinstance(column_format, - compat.string_types): # pragma: no cover - raise AssertionError('column_format must be str or unicode, not %s' - % type(column_format)) - - def write(buf, frame, column_format, strcols, longtable=False): - if not longtable: - buf.write('\\begin{tabular}{%s}\n' % column_format) - buf.write('\\toprule\n') - else: - buf.write('\\begin{longtable}{%s}\n' % column_format) - buf.write('\\toprule\n') - - nlevels = frame.columns.nlevels - if any(frame.index.names): - nlevels += 1 - for i, row in enumerate(zip(*strcols)): - if i == nlevels and self.header: - buf.write('\\midrule\n') # End of header - if longtable: - buf.write('\\endhead\n') - buf.write('\\midrule\n') - buf.write('\\multicolumn{3}{r}{{Continued on next ' - 'page}} \\\\\n') - buf.write('\midrule\n') - buf.write('\endfoot\n\n') - 
buf.write('\\bottomrule\n') - buf.write('\\endlastfoot\n') - if self.escape: - crow = [(x.replace('\\', '\\textbackslash') # escape backslashes first - .replace('_', '\\_') - .replace('%', '\\%') - .replace('$', '\\$') - .replace('#', '\\#') - .replace('{', '\\{') - .replace('}', '\\}') - .replace('~', '\\textasciitilde') - .replace('^', '\\textasciicircum') - .replace('&', '\\&') if x else '{}') for x in row] - else: - crow = [x if x else '{}' for x in row] - buf.write(' & '.join(crow)) - buf.write(' \\\\\n') - - if not longtable: - buf.write('\\bottomrule\n') - buf.write('\\end{tabular}\n') - else: - buf.write('\\end{longtable}\n') + latex_renderer = LatexFormatter(self, column_format=column_format, + longtable=longtable) if hasattr(self.buf, 'write'): - write(self.buf, frame, column_format, strcols, longtable) + latex_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): - with open(self.buf, 'w') as f: - write(f, frame, column_format, strcols, longtable) + import codecs + with codecs.open(self.buf, 'w', encoding=encoding) as f: + latex_renderer.write_result(f) else: raise TypeError('buf is not a file name and it has no write ' 'method') @@ -851,6 +766,124 @@ def _get_column_name_list(self): return names +class LatexFormatter(TableFormatter): + """ Used to render a DataFrame to a LaTeX tabular/longtable environment + output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + column_format : str, default None + The columns format as specified in `LaTeX table format + <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3 columns + longtable : boolean, default False + Use a longtable environment instead of tabular. + + See also + -------- + HTMLFormatter + """ + + def __init__(self, formatter, column_format=None, longtable=False): + self.fmt = formatter + self.frame = self.fmt.frame + self.column_format = column_format + self.longtable = longtable + + def write_result(self, buf): + """ + Render a DataFrame to a LaTeX tabular/longtable environment output. 
+ """ + + # string representation of the columns + if len(self.frame.columns) == 0 or len(self.frame.index) == 0: + info_line = (u('Empty %s\nColumns: %s\nIndex: %s') + % (type(self.frame).__name__, + self.frame.columns, self.frame.index)) + strcols = [[info_line]] + else: + strcols = self.fmt._to_str_columns() + + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return 'r' + else: + return 'l' + + if self.fmt.index and isinstance(self.frame.index, MultiIndex): + clevels = self.frame.columns.nlevels + strcols.pop(0) + name = any(self.frame.index.names) + for i, lev in enumerate(self.frame.index.levels): + lev2 = lev.format() + blank = ' ' * len(lev2[0]) + lev3 = [blank] * clevels + if name: + lev3.append(lev.name) + for level_idx, group in itertools.groupby( + self.frame.index.labels[i]): + count = len(list(group)) + lev3.extend([lev2[level_idx]] + [blank] * (count - 1)) + strcols.insert(i, lev3) + + column_format = self.column_format + if column_format is None: + dtypes = self.frame.dtypes._values + column_format = ''.join(map(get_col_type, dtypes)) + if self.fmt.index: + index_format = 'l' * self.frame.index.nlevels + column_format = index_format + column_format + elif not isinstance(column_format, + compat.string_types): # pragma: no cover + raise AssertionError('column_format must be str or unicode, not %s' + % type(column_format)) + + if not self.longtable: + buf.write('\\begin{tabular}{%s}\n' % column_format) + buf.write('\\toprule\n') + else: + buf.write('\\begin{longtable}{%s}\n' % column_format) + buf.write('\\toprule\n') + + nlevels = self.frame.columns.nlevels + if any(self.frame.index.names): + nlevels += 1 + for i, row in enumerate(zip(*strcols)): + if i == nlevels and self.fmt.header: + buf.write('\\midrule\n') # End of header + if self.longtable: + buf.write('\\endhead\n') + buf.write('\\midrule\n') + buf.write('\\multicolumn{3}{r}{{Continued on next ' + 'page}} \\\\\n') + buf.write('\\midrule\n') + buf.write('\\endfoot\n\n') + 
buf.write('\\bottomrule\n') + buf.write('\\endlastfoot\n') + if self.fmt.kwds.get('escape', True): + # escape backslashes first + crow = [(x.replace('\\', '\\textbackslash') + .replace('_', '\\_') + .replace('%', '\\%') + .replace('$', '\\$') + .replace('#', '\\#') + .replace('{', '\\{') + .replace('}', '\\}') + .replace('~', '\\textasciitilde') + .replace('^', '\\textasciicircum') + .replace('&', '\\&') if x else '{}') for x in row] + else: + crow = [x if x else '{}' for x in row] + buf.write(' & '.join(crow)) + buf.write(' \\\\\n') + + if not self.longtable: + buf.write('\\bottomrule\n') + buf.write('\\end{tabular}\n') + else: + buf.write('\\end{longtable}\n') + + class HTMLFormatter(TableFormatter): indent_delta = 2 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7220b25daf318..b27c4268796dd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1547,7 +1547,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, bold_rows=True, column_format=None, - longtable=None, escape=None): + longtable=None, escape=None, encoding=None): """ Render a DataFrame to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. @@ -1567,7 +1567,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, default: True When set to False prevents from escaping latex special characters in column names. 
- + encoding : str, default None + Default encoding is ascii in Python 2 and utf-8 in Python 3 """ if colSpace is not None: # pragma: no cover @@ -1589,7 +1590,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, sparsify=sparsify, index_names=index_names, escape=escape) - formatter.to_latex(column_format=column_format, longtable=longtable) + formatter.to_latex(column_format=column_format, longtable=longtable, + encoding=encoding) if buf is None: return formatter.buf.getvalue() diff --git a/pandas/core/window.py b/pandas/core/window.py index 1e5816e898baa..ce8fda9e932bc 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -965,6 +965,7 @@ def corr(self, other=None, pairwise=None, **kwargs): Use a standard estimation bias correction """ + class EWM(_Rolling): r""" Provides exponential weighted functions diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index 58e9d64921e0d..a5fcbd3f2d0f1 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -342,7 +342,7 @@ cdef class Int64HashTable(HashTable): self.table.vals[k] = values[i] @cython.boundscheck(False) - def map_locations(self, int64_t[:] values): + def map_locations(self, ndarray[int64_t, ndim=1] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -570,7 +570,7 @@ cdef class Float64HashTable(HashTable): return np.asarray(labels) @cython.boundscheck(False) - def map_locations(self, float64_t[:] values): + def map_locations(self, ndarray[float64_t, ndim=1] values): cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 95a6d02b1ccb6..8cf7e0eb15b48 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -19,6 +19,7 @@ from pandas.core.common import isnull from pandas.core.base import PandasObject from pandas.core.dtypes import DatetimeTZDtype +from pandas.core.generic import is_dictlike from pandas.tseries.tools import to_datetime from pandas.util.decorators import Appender @@ -548,9 +549,11 @@ def 
to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single SQL type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type, or a string for sqlite3 fallback connection. + be a SQLAlchemy type, or a string for sqlite3 fallback connection. + If all columns are of the same type, one single value can be + used. """ if if_exists not in ('fail', 'replace', 'append'): @@ -563,7 +566,7 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', elif not isinstance(frame, DataFrame): raise NotImplementedError("'frame' argument should be either a " "Series or a DataFrame") - + pandas_sql.to_sql(frame, name, if_exists=if_exists, index=index, index_label=index_label, schema=schema, chunksize=chunksize, dtype=dtype) @@ -1222,11 +1225,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single SQL type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. + """ + if dtype and not is_dictlike(dtype): + dtype = { col_name : dtype for col_name in frame } if dtype is not None: from sqlalchemy.types import to_instance, TypeEngine for col, my_type in dtype.items(): @@ -1618,11 +1625,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. 
If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single SQL type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a string. + be a string. If all columns are of the same type, one single + value can be used. """ + if dtype and not is_dictlike(dtype): + dtype = { col_name : dtype for col_name in frame } + if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index d6a9feb1bd8f4..61b24c858b60d 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -9,8 +9,8 @@ from pandas import compat from pandas.compat import u from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, - date_range, period_range, Index, SparseSeries, SparseDataFrame, - SparsePanel) + date_range, period_range, Index) +from pandas.io.packers import to_msgpack, read_msgpack import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, assert_index_equal, assert_series_equal, @@ -23,7 +23,19 @@ nan = np.nan -from pandas.io.packers import to_msgpack, read_msgpack +try: + import blosc # NOQA +except ImportError: + _BLOSC_INSTALLED = False +else: + _BLOSC_INSTALLED = True + +try: + import zlib # NOQA +except ImportError: + _ZLIB_INSTALLED = False +else: + _ZLIB_INSTALLED = True _multiprocess_can_split_ = False @@ -483,6 +495,14 @@ class TestCompression(TestPackers): """ def setUp(self): + try: + from sqlalchemy import create_engine + self._create_sql_engine = create_engine + except ImportError: + self._SQLALCHEMY_INSTALLED = False + else: + self._SQLALCHEMY_INSTALLED = True + super(TestCompression, self).setUp() data = { 'A': np.arange(1000, dtype=np.float64), @@ -508,14 +528,56 @@ def test_compression_zlib(self): assert_frame_equal(self.frame[k], i_rec[k]) def test_compression_blosc(self): - try: - 
import blosc - except ImportError: + if not _BLOSC_INSTALLED: raise nose.SkipTest('no blosc') i_rec = self.encode_decode(self.frame, compress='blosc') for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) + def test_readonly_axis_blosc(self): + # GH11880 + if not _BLOSC_INSTALLED: + raise nose.SkipTest('no blosc') + df1 = DataFrame({'A': list('abcd')}) + df2 = DataFrame(df1, index=[1., 2., 3., 4.]) + self.assertTrue(1 in self.encode_decode(df1['A'], compress='blosc')) + self.assertTrue(1. in self.encode_decode(df2['A'], compress='blosc')) + + def test_readonly_axis_zlib(self): + # GH11880 + df1 = DataFrame({'A': list('abcd')}) + df2 = DataFrame(df1, index=[1., 2., 3., 4.]) + self.assertTrue(1 in self.encode_decode(df1['A'], compress='zlib')) + self.assertTrue(1. in self.encode_decode(df2['A'], compress='zlib')) + + def test_readonly_axis_blosc_to_sql(self): + # GH11880 + if not _BLOSC_INSTALLED: + raise nose.SkipTest('no blosc') + if not self._SQLALCHEMY_INSTALLED: + raise nose.SkipTest('no sqlalchemy') + expected = DataFrame({'A': list('abcd')}) + df = self.encode_decode(expected, compress='blosc') + eng = self._create_sql_engine("sqlite:///:memory:") + df.to_sql('test', eng, if_exists='append') + result = pandas.read_sql_table('test', eng, index_col='index') + result.index.names = [None] + assert_frame_equal(expected, result) + + def test_readonly_axis_zlib_to_sql(self): + # GH11880 + if not _ZLIB_INSTALLED: + raise nose.SkipTest('no zlib') + if not self._SQLALCHEMY_INSTALLED: + raise nose.SkipTest('no sqlalchemy') + expected = DataFrame({'A': list('abcd')}) + df = self.encode_decode(expected, compress='zlib') + eng = self._create_sql_engine("sqlite:///:memory:") + df.to_sql('test', eng, if_exists='append') + result = pandas.read_sql_table('test', eng, index_col='index') + result.index.names = [None] + assert_frame_equal(expected, result) + class TestEncoding(TestPackers): def setUp(self): diff --git a/pandas/io/tests/test_sql.py 
b/pandas/io/tests/test_sql.py index bfd1ac3f08ee8..909713d50a1ab 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -1509,6 +1509,21 @@ def test_dtype(self): self.assertTrue(isinstance(sqltype, sqlalchemy.String)) self.assertEqual(sqltype.length, 10) + def test_to_sql_single_dtype(self): + self.drop_table('single_dtype_test') + cols = ['A','B'] + data = [('a','b'), + ('c','d')] + df = DataFrame(data,columns=cols) + df.to_sql('single_dtype_test',self.conn,dtype=sqlalchemy.TEXT) + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + sqltypea = meta.tables['single_dtype_test'].columns['A'].type + sqltypeb = meta.tables['single_dtype_test'].columns['B'].type + self.assertTrue(isinstance(sqltypea, sqlalchemy.TEXT)) + self.assertTrue(isinstance(sqltypeb, sqlalchemy.TEXT)) + self.drop_table('single_dtype_test') + def test_notnull_dtype(self): cols = {'Bool': Series([True,None]), 'Date': Series([datetime(2012, 5, 1), None]), @@ -1967,6 +1982,19 @@ def test_dtype(self): self.assertRaises(ValueError, df.to_sql, 'error', self.conn, dtype={'B': bool}) + def test_to_sql_single_dtype(self): + if self.flavor == 'mysql': + raise nose.SkipTest('Not applicable to MySQL legacy') + self.drop_table('single_dtype_test') + cols = ['A','B'] + data = [('a','b'), + ('c','d')] + df = DataFrame(data,columns=cols) + df.to_sql('single_dtype_test',self.conn,dtype='STRING') + self.assertEqual(self._get_sqlite_column_type('single_dtype_test','A'),'STRING') + self.assertEqual(self._get_sqlite_column_type('single_dtype_test','B'),'STRING') + self.drop_table('single_dtype_test') + def test_notnull_dtype(self): if self.flavor == 'mysql': raise nose.SkipTest('Not applicable to MySQL legacy') diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 4d17610d87bea..a73b459459321 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -15,6 +15,8 @@ from numpy.random import randn import numpy as np +import codecs + 
div_style = '' try: import IPython @@ -2554,6 +2556,24 @@ def test_to_latex_filename(self): with open(path, 'r') as f: self.assertEqual(self.frame.to_latex(), f.read()) + # test with utf-8 and encoding option (GH 7061) + df = DataFrame([[u'au\xdfgangen']]) + with tm.ensure_clean('test.tex') as path: + df.to_latex(path, encoding='utf-8') + with codecs.open(path, 'r', encoding='utf-8') as f: + self.assertEqual(df.to_latex(), f.read()) + + # test with utf-8 without encoding option + if compat.PY3: # python3 default encoding is utf-8 + with tm.ensure_clean('test.tex') as path: + df.to_latex(path) + with codecs.open(path, 'r') as f: + self.assertEqual(df.to_latex(), f.read()) + else: + # python2 default encoding is ascii, so an error should be raised + with tm.ensure_clean('test.tex') as path: + self.assertRaises(UnicodeEncodeError, df.to_latex, path) + def test_to_latex(self): # it works! self.frame.to_latex()