From 8ece8cfb1dfb1870ef9a36f1a9ed8675a907b3b4 Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Sat, 24 Jan 2015 16:09:30 -0500 Subject: [PATCH 001/239] Changed uin8 to uint8 in response to issue #9266 --- pandas/io/parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 637612d5fb09d..99fb24ebf91dd 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -991,7 +991,7 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, try: values = lib.map_infer(values, conv_f) except ValueError: - mask = lib.ismember(values, na_values).view(np.uin8) + mask = lib.ismember(values, na_values).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) coerce_type = False From d550e64a2ebe54d2f74a625997a3c1b6543968fa Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Sat, 24 Jan 2015 18:33:44 -0500 Subject: [PATCH 002/239] Added test for GH 9266 --- pandas/io/tests/test_parsers.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 35530a7f5e07f..014ebf299f07d 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2984,6 +2984,28 @@ def test_variable_width_unicode(self): tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')), header=None, encoding='utf8')) + def test_convert_to_nd_arrays(self): + #GH 9266 + with open('test.txt','w') as f: + f.write( + """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 + 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" + ) + try: + pd.read_fwf('test.txt', colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], + names=['time','pri','pgn','dst','src','data'], + converters={'pgn':lambda x: int(x,16), + 'src':lambda x: int(x,16), + 'dst':lambda x: int(x,16), + 'data':lambda x: len(x.split(' '))}, + index_col='time') + except AttributeError: + self.assertIn('Error with read_fwf function.') + + + + + class TestCParserHighMemory(ParserTests, tm.TestCase): From e7d90fe7b8cab73c09e451593e9da672761cfd77 Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Mon, 26 Jan 2015 22:02:24 -0500 Subject: [PATCH 003/239] Trying again with new testing function --- pandas/io/tests/test_parsers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 014ebf299f07d..b52b9a472e8e4 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2988,8 +2988,8 @@ def test_convert_to_nd_arrays(self): #GH 9266 with open('test.txt','w') as f: f.write( - """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 - 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" + """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 \n""" + + """1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" ) try: pd.read_fwf('test.txt', colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], From 5ea14a510d3cd1dd636ad186ba2b42eb9ae52b55 Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Sun, 5 Apr 2015 20:23:04 -0400 Subject: [PATCH 004/239] Modified the test_to_nd_array test to make it clearer what was being tested. 
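The root cause in #9266 is a one-character attribute typo: NumPy has no ``np.uin8``, so the
``except ValueError`` fallback in ``_convert_to_ndarrays`` died with an ``AttributeError``
before it could build the NA mask. A minimal sketch of the failure, assuming only NumPy
(this snippet is illustrative, not part of the patch):

    import numpy as np

    mask = np.array([True, False, True])
    mask.view(np.uint8)    # works: array([1, 0, 1], dtype=uint8)
    # mask.view(np.uin8)   # AttributeError: module has no attribute 'uin8'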
--- pandas/io/tests/test_parsers.py | 35 +++++++++++++++------------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index b52b9a472e8e4..c2a7f814e17c3 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2986,25 +2986,22 @@ def test_variable_width_unicode(self): def test_convert_to_nd_arrays(self): #GH 9266 - with open('test.txt','w') as f: - f.write( - """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 \n""" - + """1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" - ) - try: - pd.read_fwf('test.txt', colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], - names=['time','pri','pgn','dst','src','data'], - converters={'pgn':lambda x: int(x,16), - 'src':lambda x: int(x,16), - 'dst':lambda x: int(x,16), - 'data':lambda x: len(x.split(' '))}, - index_col='time') - except AttributeError: - self.assertIn('Error with read_fwf function.') - - - - + with tm.ensure_clean('test.txt') as path: + with open(path,'w') as f: + f.write( + """1421302964.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 \n""" + + """1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" + ) + + result = pd.read_fwf('test.txt', colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], + names=['time','pri','pgn','dst','src','data'], + converters={'pgn':lambda x: int(x,16), + 'src':lambda x: int(x,16), + 'dst':lambda x: int(x,16), + 'data':lambda x: len(x.split(' '))}, + index_col='time') + self.assertEqual(result['dst'].dtype,np.uint8) + class TestCParserHighMemory(ParserTests, tm.TestCase): From 650e39813a6fda7818f9d8e488b562de4bf6ba00 Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Sun, 5 Apr 2015 20:45:03 -0400 Subject: [PATCH 005/239] Added documentation for bug fix on issue #9266 --- doc/source/whatsnew/v0.16.1.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 05c762b91b925..160778ed9f8ca 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -80,3 +80,5 @@ Bug Fixes - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) + +- Bug in ``ParserBase.convert_to_nd_arrays`` when called by ``DataFrame.read_fwf`` (:issue:`9266``) From 5779cbdfc19355249149d9460ab7820333bfb4d6 Mon Sep 17 00:00:00 2001 From: Sam Ruth Date: Sun, 24 May 2015 11:06:48 -0400 Subject: [PATCH 006/239] Rebased --- pandas/io/tests/test_parsers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index c2a7f814e17c3..1bf8f7fef7b6b 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -17,6 +17,7 @@ from pandas.compat import( StringIO, BytesIO, PY3, range, long, lrange, lmap, u ) + from pandas.io.common import URLError import pandas.io.parsers as parsers from pandas.io.parsers import (read_csv, read_table, read_fwf, From d31428afba1a86c0a2f808dc3076ffc59ca02a1b Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Wed, 1 Apr 2015 20:44:20 -0400 Subject: [PATCH 007/239] Fix zlib and blosc imports --- pandas/io/packers.py | 34 +++++----------------------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index b3e2e16af54c2..04321296d4646 100644 --- 
a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -65,26 +65,7 @@ # until we can pass this into our conversion functions, # this is pretty hacky compressor = None -_IMPORTS = False -_BLOSC = False -def _importers(): - # import things we need - # but make this done on a first use basis - - global _IMPORTS - if _IMPORTS: - return - - _IMPORTS = True - - global _BLOSC - import zlib - try: - import blosc - _BLOSC = True - except: - pass def to_msgpack(path_or_buf, *args, **kwargs): """ @@ -103,7 +84,6 @@ def to_msgpack(path_or_buf, *args, **kwargs): compress : type of compressor (zlib or blosc), default to None (no compression) """ - _importers() global compressor compressor = kwargs.pop('compress', None) append = kwargs.pop('append', None) @@ -146,7 +126,6 @@ def read_msgpack(path_or_buf, iterator=False, **kwargs): obj : type of object stored in file """ - _importers() path_or_buf, _ = get_filepath_or_buffer(path_or_buf) if iterator: return Iterator(path_or_buf) @@ -232,9 +211,10 @@ def convert(values): # convert to a bytes array v = v.tostring() + import zlib return zlib.compress(v) - elif compressor == 'blosc' and _BLOSC: + elif compressor == 'blosc': # return string arrays like they are if dtype == np.object_: @@ -242,6 +222,7 @@ def convert(values): # convert to a bytes array v = v.tostring() + import blosc return blosc.compress(v, typesize=dtype.itemsize) # ndarray (on original dtype) @@ -254,18 +235,13 @@ def unconvert(values, dtype, compress=None): return np.array(values, dtype=object) if compress == 'zlib': - + import zlib values = zlib.decompress(values) return np.frombuffer(values, dtype=dtype) elif compress == 'blosc': - - if not _BLOSC: - raise Exception("cannot uncompress w/o blosc") - - # decompress + import blosc values = blosc.decompress(values) - return np.frombuffer(values, dtype=dtype) # from a string From b49d6fd60d16f7c049c78871a48a1c579320a7de Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Wed, 1 Apr 2015 22:57:25 -0400 Subject: [PATCH 008/239] Add missing keys and tests --- pandas/io/packers.py | 16 +++++++++++----- pandas/io/tests/test_packers.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 04321296d4646..75ca44fd1ef3e 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -234,6 +234,8 @@ def unconvert(values, dtype, compress=None): if dtype == np.object_: return np.array(values, dtype=object) + values = values.encode('latin1') + if compress == 'zlib': import zlib values = zlib.decompress(values) @@ -245,7 +247,7 @@ def unconvert(values, dtype, compress=None): return np.frombuffer(values, dtype=dtype) # from a string - return np.fromstring(values.encode('latin1'), dtype=dtype) + return np.fromstring(values, dtype=dtype) def encode(obj): @@ -261,7 +263,8 @@ def encode(obj): 'name': getattr(obj, 'name', None), 'freq': getattr(obj, 'freqstr', None), 'dtype': obj.dtype.num, - 'data': convert(obj.asi8)} + 'data': convert(obj.asi8), + 'compress': compressor} elif isinstance(obj, DatetimeIndex): tz = getattr(obj, 'tz', None) @@ -275,19 +278,22 @@ def encode(obj): 'dtype': obj.dtype.num, 'data': convert(obj.asi8), 'freq': getattr(obj, 'freqstr', None), - 'tz': tz} + 'tz': tz, + 'compress': compressor} elif isinstance(obj, MultiIndex): return {'typ': 'multi_index', 'klass': obj.__class__.__name__, 'names': getattr(obj, 'names', None), 'dtype': obj.dtype.num, - 'data': convert(obj.values)} + 'data': convert(obj.values), + 'compress': compressor} else: 
return {'typ': 'index', 'klass': obj.__class__.__name__, 'name': getattr(obj, 'name', None), 'dtype': obj.dtype.num, - 'data': convert(obj.values)} + 'data': convert(obj.values), + 'compress': compressor} elif isinstance(obj, Series): if isinstance(obj, SparseSeries): raise NotImplementedError( diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 9633f567ab098..992de2b67bf49 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -446,6 +446,37 @@ def test_sparse_panel(self): check_panel_type=True) +class TestCompression(TestPackers): + + def setUp(self): + super(TestCompression, self).setUp() + data = { + 'A': np.arange(1000, dtype=float), + 'B': range(1000), + 'C': list(100 * 'abcdefghij'), + } + self.frame = { + 'float': DataFrame(dict([(k, data[k]) for k in ['A', 'A']])), + 'int': DataFrame(dict([(k, data[k]) for k in ['B', 'B']])), + 'mixed': DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C']])), + } + + def test_plain(self): + i_rec = self.encode_decode(self.frame, compress='zlib') + for k in self.frame.keys(): + assert_frame_equal(self.frame[k], i_rec[k]) + + def test_compression_zlib(self): + i_rec = self.encode_decode(self.frame, compress='zlib') + for k in self.frame.keys(): + assert_frame_equal(self.frame[k], i_rec[k]) + + def test_compression_blosc(self): + i_rec = self.encode_decode(self.frame, compress='blosc') + for k in self.frame.keys(): + assert_frame_equal(self.frame[k], i_rec[k]) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 7e8d6dd05ef508cc5a089a36b50871ac1524492e Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Wed, 1 Apr 2015 23:03:54 -0400 Subject: [PATCH 009/239] Make test_plain actually plain --- pandas/io/tests/test_packers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 992de2b67bf49..74ef8bf20f70c 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -462,7 +462,7 @@ def setUp(self): } def test_plain(self): - i_rec = self.encode_decode(self.frame, compress='zlib') + i_rec = self.encode_decode(self.frame) for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) From 6ce2447e765d87eb340fbc83d98af05f06d3be0a Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Wed, 1 Apr 2015 23:49:26 -0400 Subject: [PATCH 010/239] Add blosc to .travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6c4d6897a69de..1b1c797fa7e08 100644 --- a/.travis.yml +++ b/.travis.yml @@ -121,6 +121,7 @@ before_install: - echo $VIRTUAL_ENV - export PATH="$HOME/miniconda/bin:$PATH" - sudo apt-get install ccache + - pip install -U blosc - df -h - date - pwd From 6aab111269e71fc28a3bd08fd5a42df058786dfd Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 00:28:55 -0400 Subject: [PATCH 011/239] Add blosc to requirements --- .travis.yml | 1 - ci/requirements-2.6.txt | 1 + ci/requirements-2.7.txt | 1 + ci/requirements-2.7_32.txt | 1 + ci/requirements-2.7_64.txt | 1 + ci/requirements-2.7_LOCALE.txt | 1 + ci/requirements-2.7_SLOW.txt | 1 + ci/requirements-3.2.txt | 1 + ci/requirements-3.3.txt | 1 + ci/requirements-3.4.txt | 1 + ci/requirements-3.4_32.txt | 1 + ci/requirements-3.4_64.txt | 1 + ci/requirements-3.4_SLOW.txt | 1 + 13 files changed, 12 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1b1c797fa7e08..6c4d6897a69de 100644 
--- a/.travis.yml +++ b/.travis.yml @@ -121,7 +121,6 @@ before_install: - echo $VIRTUAL_ENV - export PATH="$HOME/miniconda/bin:$PATH" - sudo apt-get install ccache - - pip install -U blosc - df -h - date - pwd diff --git a/ci/requirements-2.6.txt b/ci/requirements-2.6.txt index 9b338cee26801..a26f260ab5841 100644 --- a/ci/requirements-2.6.txt +++ b/ci/requirements-2.6.txt @@ -14,3 +14,4 @@ numexpr=1.4.2 pymysql=0.6.0 sqlalchemy=0.7.8 xlsxwriter=0.4.6 +blosc diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt index 0d515f300f5a7..5b1991c8810ce 100644 --- a/ci/requirements-2.7.txt +++ b/ci/requirements-2.7.txt @@ -23,3 +23,4 @@ beautiful-soup=4.2.1 httplib2=0.8 python-gflags=2.0 google-api-python-client=1.2 +blosc diff --git a/ci/requirements-2.7_32.txt b/ci/requirements-2.7_32.txt index 01b305bb6f21a..9e6dec6048d35 100644 --- a/ci/requirements-2.7_32.txt +++ b/ci/requirements-2.7_32.txt @@ -9,3 +9,4 @@ matplotlib openpyxl xlrd scipy +blosc diff --git a/ci/requirements-2.7_64.txt b/ci/requirements-2.7_64.txt index 01b305bb6f21a..9e6dec6048d35 100644 --- a/ci/requirements-2.7_64.txt +++ b/ci/requirements-2.7_64.txt @@ -9,3 +9,4 @@ matplotlib openpyxl xlrd scipy +blosc diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt index 6c70bfd77ff3f..f5e46f3ddade8 100644 --- a/ci/requirements-2.7_LOCALE.txt +++ b/ci/requirements-2.7_LOCALE.txt @@ -16,3 +16,4 @@ scipy=0.11.0 beautiful-soup=4.2.1 statsmodels=0.4.3 bigquery=2.0.17 +blosc diff --git a/ci/requirements-2.7_SLOW.txt b/ci/requirements-2.7_SLOW.txt index a1ecbceda40dd..2b84fd7b095b2 100644 --- a/ci/requirements-2.7_SLOW.txt +++ b/ci/requirements-2.7_SLOW.txt @@ -23,3 +23,4 @@ beautiful-soup httplib2 python-gflags google-api-python-client +blosc diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt index 9ba8fd7ca9393..bd7bd3c7e7219 100644 --- a/ci/requirements-3.2.txt +++ b/ci/requirements-3.2.txt @@ -13,3 +13,4 @@ html5lib scipy==0.12.0 beautifulsoup4==4.2.1 statsmodels==0.5.0 +blosc diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt index c9beec81236fb..0a5cba36b79d3 100644 --- a/ci/requirements-3.3.txt +++ b/ci/requirements-3.3.txt @@ -15,3 +15,4 @@ lxml=3.2.1 scipy beautiful-soup=4.2.1 statsmodels +blosc diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt index 8a55c0458688e..6ba917154e31d 100644 --- a/ci/requirements-3.4.txt +++ b/ci/requirements-3.4.txt @@ -17,3 +17,4 @@ sqlalchemy bottleneck pymysql==0.6.3 psycopg2 +blosc diff --git a/ci/requirements-3.4_32.txt b/ci/requirements-3.4_32.txt index e9dfe9f0ee19e..bac033f7275c8 100644 --- a/ci/requirements-3.4_32.txt +++ b/ci/requirements-3.4_32.txt @@ -8,3 +8,4 @@ scipy numexpr pytables matplotlib +blosc diff --git a/ci/requirements-3.4_64.txt b/ci/requirements-3.4_64.txt index e9dfe9f0ee19e..bac033f7275c8 100644 --- a/ci/requirements-3.4_64.txt +++ b/ci/requirements-3.4_64.txt @@ -8,3 +8,4 @@ scipy numexpr pytables matplotlib +blosc diff --git a/ci/requirements-3.4_SLOW.txt b/ci/requirements-3.4_SLOW.txt index 930cf126b7da0..c6d904cb4c314 100644 --- a/ci/requirements-3.4_SLOW.txt +++ b/ci/requirements-3.4_SLOW.txt @@ -17,3 +17,4 @@ sqlalchemy bottleneck pymysql psycopg2 +blosc From 9214a00d4ccc7e05dd06250b36b04697b7201cfd Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 08:25:35 -0400 Subject: [PATCH 012/239] Diagnose why blosc is not importing --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 6c4d6897a69de..ca225b2513ab7 100644 --- a/.travis.yml +++ 
b/.travis.yml @@ -126,6 +126,7 @@ before_install: - pwd - uname -a - python -V + - python -c "import blosc; blosc.print_versions()" - ci/before_install.sh # Xvfb stuff for clipboard functionality; see the travis-ci documentation - export DISPLAY=:99.0 From 660a675160d8e6743505c713d9d81b67e4fbcf4d Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 08:40:25 -0400 Subject: [PATCH 013/239] Get travis to cooperate --- .travis.yml | 1 - ci/install_pydata.sh | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ca225b2513ab7..6c4d6897a69de 100644 --- a/.travis.yml +++ b/.travis.yml @@ -126,7 +126,6 @@ before_install: - pwd - uname -a - python -V - - python -c "import blosc; blosc.print_versions()" - ci/before_install.sh # Xvfb stuff for clipboard functionality; see the travis-ci documentation - export DISPLAY=:99.0 diff --git a/ci/install_pydata.sh b/ci/install_pydata.sh index 33a6d3854da22..a52446043b3b8 100755 --- a/ci/install_pydata.sh +++ b/ci/install_pydata.sh @@ -92,6 +92,8 @@ rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.txt +python -c 'import blosc; blosc.print_versions()' + # Need to enable for locale testing. The location of the locale file(s) is # distro specific. For example, on Arch Linux all of the locales are in a # commented file--/etc/locale.gen--that must be commented in to be used From f811a1aa7ebc74e97a4c84ffb4257d86b666e030 Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 08:54:13 -0400 Subject: [PATCH 014/239] Make travis listen --- ci/install_pydata.sh | 2 -- ci/script.sh | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/install_pydata.sh b/ci/install_pydata.sh index a52446043b3b8..33a6d3854da22 100755 --- a/ci/install_pydata.sh +++ b/ci/install_pydata.sh @@ -92,8 +92,6 @@ rm -f $VIRTUAL_ENV/lib/python$TRAVIS_PYTHON_VERSION/no-global-site-packages.txt time pip install $PIP_ARGS -r ci/requirements-${wheel_box}.txt -python -c 'import blosc; blosc.print_versions()' - # Need to enable for locale testing. The location of the locale file(s) is # distro specific. 
For example, on Arch Linux all of the locales are in a # commented file--/etc/locale.gen--that must be commented in to be used diff --git a/ci/script.sh b/ci/script.sh index b1ba7ba79c816..6d747ee8a447b 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,6 +16,8 @@ fi "$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & # doc build log will be shown after tests +pip install -U blosc +python -c 'import blosc; blosc.print_versions()' echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml From adc24445c5da1daa6a7791fd75ba9c0fc482887d Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 10:23:17 -0400 Subject: [PATCH 015/239] Respond to comments from jreback --- ci/requirements-2.6.txt | 1 - ci/requirements-2.7.txt | 1 - ci/requirements-2.7_32.txt | 1 - ci/requirements-2.7_64.txt | 1 - ci/requirements-2.7_LOCALE.txt | 1 - ci/requirements-2.7_SLOW.txt | 1 - ci/requirements-3.2.txt | 1 - ci/requirements-3.3.txt | 1 - ci/requirements-3.4.txt | 1 - ci/requirements-3.4_32.txt | 1 - ci/requirements-3.4_64.txt | 1 - ci/requirements-3.4_SLOW.txt | 1 - ci/script.sh | 2 +- pandas/io/tests/test_packers.py | 14 +++++++++----- 14 files changed, 10 insertions(+), 18 deletions(-) diff --git a/ci/requirements-2.6.txt b/ci/requirements-2.6.txt index a26f260ab5841..9b338cee26801 100644 --- a/ci/requirements-2.6.txt +++ b/ci/requirements-2.6.txt @@ -14,4 +14,3 @@ numexpr=1.4.2 pymysql=0.6.0 sqlalchemy=0.7.8 xlsxwriter=0.4.6 -blosc diff --git a/ci/requirements-2.7.txt b/ci/requirements-2.7.txt index 5b1991c8810ce..0d515f300f5a7 100644 --- a/ci/requirements-2.7.txt +++ b/ci/requirements-2.7.txt @@ -23,4 +23,3 @@ beautiful-soup=4.2.1 httplib2=0.8 python-gflags=2.0 google-api-python-client=1.2 -blosc diff --git a/ci/requirements-2.7_32.txt b/ci/requirements-2.7_32.txt index 9e6dec6048d35..01b305bb6f21a 100644 --- a/ci/requirements-2.7_32.txt +++ b/ci/requirements-2.7_32.txt @@ -9,4 +9,3 @@ matplotlib openpyxl xlrd scipy -blosc diff --git a/ci/requirements-2.7_64.txt b/ci/requirements-2.7_64.txt index 9e6dec6048d35..01b305bb6f21a 100644 --- a/ci/requirements-2.7_64.txt +++ b/ci/requirements-2.7_64.txt @@ -9,4 +9,3 @@ matplotlib openpyxl xlrd scipy -blosc diff --git a/ci/requirements-2.7_LOCALE.txt b/ci/requirements-2.7_LOCALE.txt index f5e46f3ddade8..6c70bfd77ff3f 100644 --- a/ci/requirements-2.7_LOCALE.txt +++ b/ci/requirements-2.7_LOCALE.txt @@ -16,4 +16,3 @@ scipy=0.11.0 beautiful-soup=4.2.1 statsmodels=0.4.3 bigquery=2.0.17 -blosc diff --git a/ci/requirements-2.7_SLOW.txt b/ci/requirements-2.7_SLOW.txt index 2b84fd7b095b2..a1ecbceda40dd 100644 --- a/ci/requirements-2.7_SLOW.txt +++ b/ci/requirements-2.7_SLOW.txt @@ -23,4 +23,3 @@ beautiful-soup httplib2 python-gflags google-api-python-client -blosc diff --git a/ci/requirements-3.2.txt b/ci/requirements-3.2.txt index bd7bd3c7e7219..9ba8fd7ca9393 100644 --- a/ci/requirements-3.2.txt +++ b/ci/requirements-3.2.txt @@ -13,4 +13,3 @@ html5lib scipy==0.12.0 beautifulsoup4==4.2.1 statsmodels==0.5.0 -blosc diff --git a/ci/requirements-3.3.txt b/ci/requirements-3.3.txt index 0a5cba36b79d3..c9beec81236fb 100644 --- a/ci/requirements-3.3.txt +++ b/ci/requirements-3.3.txt @@ -15,4 +15,3 @@ lxml=3.2.1 scipy beautiful-soup=4.2.1 statsmodels -blosc diff --git a/ci/requirements-3.4.txt b/ci/requirements-3.4.txt index 6ba917154e31d..8a55c0458688e 100644 --- a/ci/requirements-3.4.txt +++ b/ci/requirements-3.4.txt @@ -17,4 +17,3 @@ sqlalchemy 
bottleneck pymysql==0.6.3 psycopg2 -blosc diff --git a/ci/requirements-3.4_32.txt b/ci/requirements-3.4_32.txt index bac033f7275c8..e9dfe9f0ee19e 100644 --- a/ci/requirements-3.4_32.txt +++ b/ci/requirements-3.4_32.txt @@ -8,4 +8,3 @@ scipy numexpr pytables matplotlib -blosc diff --git a/ci/requirements-3.4_64.txt b/ci/requirements-3.4_64.txt index bac033f7275c8..e9dfe9f0ee19e 100644 --- a/ci/requirements-3.4_64.txt +++ b/ci/requirements-3.4_64.txt @@ -8,4 +8,3 @@ scipy numexpr pytables matplotlib -blosc diff --git a/ci/requirements-3.4_SLOW.txt b/ci/requirements-3.4_SLOW.txt index c6d904cb4c314..930cf126b7da0 100644 --- a/ci/requirements-3.4_SLOW.txt +++ b/ci/requirements-3.4_SLOW.txt @@ -17,4 +17,3 @@ sqlalchemy bottleneck pymysql psycopg2 -blosc diff --git a/ci/script.sh b/ci/script.sh index 6d747ee8a447b..e1f71e70ded69 100755 --- a/ci/script.sh +++ b/ci/script.sh @@ -16,7 +16,7 @@ fi "$TRAVIS_BUILD_DIR"/ci/build_docs.sh 2>&1 > /tmp/doc.log & # doc build log will be shown after tests -pip install -U blosc +pip install -U blosc # See https://github.com/pydata/pandas/pull/9783 python -c 'import blosc; blosc.print_versions()' echo nosetests --exe -A "$NOSE_ARGS" pandas --with-xunit --xunit-file=/tmp/nosetests.xml diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 74ef8bf20f70c..d85e75f5d2818 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -447,18 +447,22 @@ def test_sparse_panel(self): class TestCompression(TestPackers): + """See https://github.com/pydata/pandas/pull/9783 + """ def setUp(self): super(TestCompression, self).setUp() data = { - 'A': np.arange(1000, dtype=float), - 'B': range(1000), + 'A': np.arange(1000, dtype=np.float64), + 'B': np.arange(1000, dtype=np.int32), 'C': list(100 * 'abcdefghij'), + 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), + 'E': [datetime.timedelta(days=x) for x in range(1000)], } self.frame = { - 'float': DataFrame(dict([(k, data[k]) for k in ['A', 'A']])), - 'int': DataFrame(dict([(k, data[k]) for k in ['B', 'B']])), - 'mixed': DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C']])), + 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), + 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), + 'mixed': DataFrame(data), } def test_plain(self): From bf64ed9ceaa20b61cc0d741c3f106fd7858bca76 Mon Sep 17 00:00:00 2001 From: Roy Hyunjin Han Date: Thu, 2 Apr 2015 10:35:29 -0400 Subject: [PATCH 016/239] Restore spacing From b0b1c265a3729339de77b96772c07134dcdce1ef Mon Sep 17 00:00:00 2001 From: dsm054 Date: Sat, 28 Mar 2015 18:35:51 -0400 Subject: [PATCH 017/239] BUG: DataFrame.equals should not care about block order (GH #9330) --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/core/internals.py | 14 +++++++++++++- pandas/io/tests/test_pytables.py | 18 ++++++++++++++++-- pandas/tests/test_frame.py | 14 ++++++++++++++ pandas/tests/test_internals.py | 25 ++++++++++++++++++++++--- 5 files changed, 66 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 160778ed9f8ca..1f4b1284e5509 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -71,6 +71,7 @@ Bug Fixes - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) +- Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`) - Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`) - Bug where ``TimdeltaIndex`` were 
not properly serialized in fixed ``HDFStore`` (:issue:`9635`) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 7a16fb2b6b0d7..9b2d366bfb2be 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3310,8 +3310,20 @@ def equals(self, other): return False self._consolidate_inplace() other._consolidate_inplace() + if len(self.blocks) != len(other.blocks): + return False + + # canonicalize block order, using a tuple combining the type + # name and then mgr_locs because there might be unconsolidated + # blocks (say, Categorical) which can only be distinguished by + # the iteration order + def canonicalize(block): + return (block.dtype.name, block.mgr_locs.as_array.tolist()) + + self_blocks = sorted(self.blocks, key=canonicalize) + other_blocks = sorted(other.blocks, key=canonicalize) return all(block.equals(oblock) for block, oblock in - zip(self.blocks, other.blocks)) + zip(self_blocks, other_blocks)) class SingleBlockManager(BlockManager): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index acdc991c92efe..03e7a8eae549d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4584,19 +4584,33 @@ def test_duplicate_column_name(self): with ensure_clean_path(self.path) as path: self.assertRaises(ValueError, df.to_hdf, path, 'df', format='fixed') + df.to_hdf(path, 'df', format='table') + other = read_hdf(path, 'df') + + tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) + + def test_round_trip_equals(self): + # GH 9330 + df = DataFrame({"B": [1,2], "A": ["x","y"]}) + + with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table') other = read_hdf(path, 'df') tm.assert_frame_equal(df, other) + self.assertTrue(df.equals(other)) + self.assertTrue(other.equals(df)) def test_preserve_timedeltaindex_type(self): - # GH9635 + # GH9635 # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. 
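# A minimal sketch of the GH 9330 contract that the block canonicalization
# above guarantees (mirrors the test_frame test added in this patch; assumes
# the patch is applied): the same values held in a different internal block
# order still compare equal, in both directions.
import pandas as pd
df0 = pd.DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]   # same data, blocks rearranged
assert df0.equals(df1) and df1.equals(df0)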
df = DataFrame(np.random.normal(size=(10,5))) df.index = timedelta_range(start='0s',periods=10,freq='1s',name='example') with ensure_clean_store(self.path) as store: - + store['df'] = df assert_frame_equal(store['df'], df) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index cdda087b27613..3e4c16f63035f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -5944,6 +5944,20 @@ def test_boolean_comparison(self): self.assertRaises(ValueError, lambda : df == (2,2)) self.assertRaises(ValueError, lambda : df == [2,2]) + def test_equals_different_blocks(self): + # GH 9330 + df0 = pd.DataFrame({"A": ["x","y"], "B": [1,2], + "C": ["w","z"]}) + df1 = df0.reset_index()[["A","B","C"]] + # this assert verifies that the above operations have + # induced a block rearrangement + self.assertTrue(df0._data.blocks[0].dtype != + df1._data.blocks[0].dtype) + # do the real tests + self.assert_frame_equal(df0, df1) + self.assertTrue(df0.equals(df1)) + self.assertTrue(df1.equals(df0)) + def test_to_csv_from_csv(self): pname = '__tmp_to_csv_from_csv__' diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 45f089f5e0a53..36585abd1b98f 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -68,15 +68,15 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): elif typestr in ('object', 'string', 'O'): values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset], shape) - elif typestr in ('bool'): + elif typestr in ('b','bool',): values = np.ones(shape, dtype=np.bool_) elif typestr in ('datetime', 'dt', 'M8[ns]'): values = (mat * 1e9).astype('M8[ns]') elif typestr in ('timedelta', 'td', 'm8[ns]'): values = (mat * 1).astype('m8[ns]') - elif typestr in ('category'): + elif typestr in ('category',): values = Categorical([1,1,2,2,3,3,3,3,4,4]) - elif typestr in ('category2'): + elif typestr in ('category2',): values = Categorical(['a','a','a','a','b','b','c','c','c','d']) elif typestr in ('sparse', 'sparse_na'): # FIXME: doesn't support num_rows != 10 @@ -751,6 +751,25 @@ def test_equals(self): bm2 = BlockManager(bm1.blocks[::-1], bm1.axes) self.assertTrue(bm1.equals(bm2)) + def test_equals_block_order_different_dtypes(self): + # GH 9330 + + mgr_strings = [ + "a:i8;b:f8", # basic case + "a:i8;b:f8;c:c8;d:b", # many types + "a:i8;e:dt;f:td;g:string", # more types + "a:i8;b:category;c:category2;d:category2", # categories + "c:sparse;d:sparse_na;b:f8", # sparse + ] + + for mgr_string in mgr_strings: + bm = create_mgr(mgr_string) + block_perms = itertools.permutations(bm.blocks) + for bm_perm in block_perms: + bm_this = BlockManager(bm_perm, bm.axes) + self.assertTrue(bm.equals(bm_this)) + self.assertTrue(bm_this.equals(bm)) + def test_single_mgr_ctor(self): mgr = create_single_mgr('f8', num_rows=5) self.assertEqual(mgr.as_matrix().tolist(), [0., 1., 2., 3., 4.]) From 7375ec430af7bc40e237e30428949babdce27690 Mon Sep 17 00:00:00 2001 From: Tomaz Berisa Date: Fri, 3 Apr 2015 16:12:44 -0400 Subject: [PATCH 018/239] BUG: Fix for #9764 Values from range [1e-7, 5e-7] (for display.precision=7) not displaying 0 anymore --- pandas/core/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/format.py b/pandas/core/format.py index b21ca9050ffd0..7b8a3161b5e05 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1996,7 +1996,7 @@ def _format_strings(self): # this is pretty arbitrary for now has_large_values = (abs_vals > 1e8).any() - has_small_values = ((abs_vals < 10 ** 
(-self.digits)) & + has_small_values = ((abs_vals < 10 ** (-self.digits+1)) & (abs_vals > 0)).any() if too_long and has_large_values: From 2296520f93d84ef1eb13c29532fe331291973998 Mon Sep 17 00:00:00 2001 From: Tomaz Berisa Date: Fri, 3 Apr 2015 17:29:56 -0400 Subject: [PATCH 019/239] TST: Test for #9764 fix --- pandas/tests/test_format.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index ce32c8af99a73..145fa78b387cd 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2986,6 +2986,20 @@ def test_format(self): self.assertEqual(result[0], " 12") self.assertEqual(result[1], " 0") + def test_output_significant_digits(self): + # relevant to issue #9764 + d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) + + expected_output={ + (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,8):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07\n6 5.000100e-07\n7 6.000000e-07', + (8,16):' col1\n8 9.999000e-07\n9 1.000000e-06\n10 1.000100e-06\n11 2.000000e-06\n12 4.999000e-06\n13 5.000000e-06\n14 5.000100e-06\n15 6.000000e-06', + (9,16):' col1\n9 0.000001\n10 0.000001\n11 0.000002\n12 0.000005\n13 0.000005\n14 0.000005\n15 0.000006' + } + + for k, v in expected_output.items(): + self.assertEqual(d[k[0]:k[1]].__str__(), v) class TestRepr_timedelta64(tm.TestCase): From 00907a97b7ca03d0e16e09f40327e296de7a1f95 Mon Sep 17 00:00:00 2001 From: Tomaz Berisa Date: Fri, 3 Apr 2015 17:48:38 -0400 Subject: [PATCH 020/239] TST: Test saves and restore context (#9764) --- pandas/tests/test_format.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index 145fa78b387cd..e3f458690d5f7 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2987,7 +2987,13 @@ def test_format(self): self.assertEqual(result[1], " 0") def test_output_significant_digits(self): - # relevant to issue #9764 + # Issue #9764 + + # In case default display precision changes: + saved_option=pd.get_option('display.precision') + pd.set_option('display.precision', 7) + + # DataFrame from issue #9764 d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) expected_output={ @@ -3001,6 +3007,9 @@ def test_output_significant_digits(self): for k, v in expected_output.items(): self.assertEqual(d[k[0]:k[1]].__str__(), v) + # Restore precision + pd.set_option('display.precision', saved_option) + class TestRepr_timedelta64(tm.TestCase): def test_none(self): From 59ea8cc81c663a786bc87b31a5b4c831b7b68f55 Mon Sep 17 00:00:00 2001 From: Tomaz Berisa Date: Sat, 4 Apr 2015 17:39:59 -0400 Subject: [PATCH 021/239] CLN: Test code cleanup (#9764) --- pandas/tests/test_format.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py index e3f458690d5f7..1dcdbf12a6b59 100644 --- a/pandas/tests/test_format.py +++ b/pandas/tests/test_format.py @@ -2990,25 +2990,21 @@ def test_output_significant_digits(self): # Issue #9764 # In case default display precision changes: 
- saved_option=pd.get_option('display.precision') - pd.set_option('display.precision', 7) + with pd.option_context('display.precision', 7): + # DataFrame example from issue #9764 + d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) + + expected_output={ + (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', + (1,8):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07\n6 5.000100e-07\n7 6.000000e-07', + (8,16):' col1\n8 9.999000e-07\n9 1.000000e-06\n10 1.000100e-06\n11 2.000000e-06\n12 4.999000e-06\n13 5.000000e-06\n14 5.000100e-06\n15 6.000000e-06', + (9,16):' col1\n9 0.000001\n10 0.000001\n11 0.000002\n12 0.000005\n13 0.000005\n14 0.000005\n15 0.000006' + } + + for (start, stop), v in expected_output.items(): + self.assertEqual(str(d[start:stop]), v) - # DataFrame from issue #9764 - d=pd.DataFrame({'col1':[9.999e-8, 1e-7, 1.0001e-7, 2e-7, 4.999e-7, 5e-7, 5.0001e-7, 6e-7, 9.999e-7, 1e-6, 1.0001e-6, 2e-6, 4.999e-6, 5e-6, 5.0001e-6, 6e-6]}) - - expected_output={ - (0,6):' col1\n0 9.999000e-08\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', - (1,6):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07', - (1,8):' col1\n1 1.000000e-07\n2 1.000100e-07\n3 2.000000e-07\n4 4.999000e-07\n5 5.000000e-07\n6 5.000100e-07\n7 6.000000e-07', - (8,16):' col1\n8 9.999000e-07\n9 1.000000e-06\n10 1.000100e-06\n11 2.000000e-06\n12 4.999000e-06\n13 5.000000e-06\n14 5.000100e-06\n15 6.000000e-06', - (9,16):' col1\n9 0.000001\n10 0.000001\n11 0.000002\n12 0.000005\n13 0.000005\n14 0.000005\n15 0.000006' - } - - for k, v in expected_output.items(): - self.assertEqual(d[k[0]:k[1]].__str__(), v) - - # Restore precision - pd.set_option('display.precision', saved_option) class TestRepr_timedelta64(tm.TestCase): From d5be1059fdc3df9438dde35c6fd22528337ade3e Mon Sep 17 00:00:00 2001 From: Tomaz Berisa Date: Sat, 4 Apr 2015 17:54:32 -0400 Subject: [PATCH 022/239] DOC: Update whatsnew for 0.16.1 (#9764) --- doc/source/whatsnew/v0.16.1.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 1f4b1284e5509..a20e901dbddd1 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -81,5 +81,9 @@ Bug Fixes - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) +<<<<<<< HEAD - Bug in ``ParserBase.convert_to_nd_arrays`` when called by ``DataFrame.read_fwf`` (:issue:`9266``) +======= +- Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`) +>>>>>>> ce988b4... 
DOC: Update whatsnew for 0.16.1 (#9764) From 22f62c7941bad30f253628510f2ea69265a52c05 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 4 Apr 2015 13:12:11 -0400 Subject: [PATCH 023/239] DOC: add dev environment creation details to contributing.rst --- CONTRIBUTING.md | 79 ++++++++++++++++++++++++++++++-- ci/requirements_all.txt | 21 +++++++++ ci/requirements_dev.txt | 5 ++ doc/source/contributing.rst | 91 +++++++++++++++++++++++++++++++++++-- doc/source/install.rst | 46 ++----------------- 5 files changed, 192 insertions(+), 50 deletions(-) create mode 100644 ci/requirements_all.txt create mode 100644 ci/requirements_dev.txt diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f7041dbabdad5..d3eeb820a12eb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -137,6 +137,69 @@ clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command. +### Creating a Development Environment + +An easy way to create a *pandas* development environment is as follows. + +- Install either Install Anaconda \ or + Install miniconda \ +- Make sure that you have + cloned the repository \ +- `cd` to the pandas source directory + +Tell `conda` to create a new environment, named `pandas_dev`, or any +name you would like for this environment by running: + + conda create -n pandas_dev --file ci/requirements_dev.txt + +For a python 3 environment + + conda create -n pandas_dev python=3 --file ci/requirements_dev.txt + +If you are on `windows`, then you will need to install the compiler +linkages: + + conda install -n pandas_dev libpython + +This will create the new environment, and not touch any of your existing +environments, nor any existing python installation. It will install all +of the basic dependencies of *pandas*, as well as the development and +testing tools. If you would like to install other dependencies, you can +install them as follows: + + conda install -n pandas_dev -c pandas pytables scipy + +To install *all* pandas dependencies you can do the following: + + conda install -n pandas_dev -c pandas --file ci/requirements_all.txt + +To work in this environment, `activate` it as follows: + + activate pandas_dev + +At which point, the prompt will change to indicate you are in the new +development environment. + +> **note** +> +> The above syntax is for `windows` environments. To work on +> `macosx/linux`, use: +> +> source activate pandas_dev + +To view your environments: + + conda info -e + +To return to you home root environment: + + deactivate + +See the full `conda` docs [here](http://conda.pydata.org/docs). + +At this point you can easily do an *in-place* install, as detailed in +the next section. + ### Making changes Before making your code changes, it is often necessary to build the code @@ -231,13 +294,19 @@ docstrings that follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of `numpydoc` is included in the *pandas* source code. +It is easiest to +create a development environment \, then +install: + + conda install -n pandas_dev sphinx ipython + Furthermore, it is recommended to have all [optional dependencies](http://pandas.pydata.org/pandas-docs/dev/install.html#optional-dependencies) -installed. This is not needed, but be aware that you will see some error -messages. Because all the code in the documentation is executed during -the doc build, the examples using this optional dependencies will -generate errors. 
Run `pd.show_versions()` to get an overview of the -installed version of all dependencies. +installed. This is not strictly necessary, but be aware that you will +see some error messages. Because all the code in the documentation is +executed during the doc build, the examples using this optional +dependencies will generate errors. Run `pd.show_versions()` to get an +overview of the installed version of all dependencies. > **warning** > diff --git a/ci/requirements_all.txt b/ci/requirements_all.txt new file mode 100644 index 0000000000000..c70efed96a8dd --- /dev/null +++ b/ci/requirements_all.txt @@ -0,0 +1,21 @@ +nose +sphinx +ipython +dateutil +pytz +openpyxl +xlsxwriter +xlrd +html5lib +patsy +beautiful-soup +numpy +cython +scipy +numexpr +pytables +matplotlib +lxml +sqlalchemy +bottleneck +pymysql diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt new file mode 100644 index 0000000000000..b273ca043c4a2 --- /dev/null +++ b/ci/requirements_dev.txt @@ -0,0 +1,5 @@ +dateutil +pytz +numpy +cython +nose diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index b3b2d272e66c6..cc4473e8d355a 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -96,6 +96,8 @@ Getting Started with Git setting up your SSH key, and configuring git. All these steps need to be completed before working seamlessly with your local repository and GitHub. +.. _contributing.forking: + Forking ------- @@ -132,6 +134,84 @@ changes in this branch specific to one bug or feature so it is clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command. +.. _contributing.dev_env: + +Creating a Development Environment +---------------------------------- + +An easy way to create a *pandas* development environment is as follows. + +- Install either :ref:`Install Anaconda ` or :ref:`Install miniconda ` +- Make sure that you have :ref:`cloned the repository ` +- ``cd`` to the pandas source directory + +Tell ``conda`` to create a new environment, named ``pandas_dev``, or any name you would like for this environment by running: + +:: + + conda create -n pandas_dev --file ci/requirements_dev.txt + + +For a python 3 environment + +:: + + conda create -n pandas_dev python=3 --file ci/requirements_dev.txt + + +If you are on ``windows``, then you will need to install the compiler linkages: + +:: + + conda install -n pandas_dev libpython + +This will create the new environment, and not touch any of your existing environments, nor any existing python installation. It will install all of the basic dependencies of *pandas*, as well as the development and testing tools. If you would like to install other dependencies, you can install them as follows: + +:: + + conda install -n pandas_dev -c pandas pytables scipy + +To install *all* pandas dependencies you can do the following: + +:: + + conda install -n pandas_dev -c pandas --file ci/requirements_all.txt + +To work in this environment, ``activate`` it as follows: + +:: + + activate pandas_dev + +At which point, the prompt will change to indicate you are in the new development environment. + +.. note:: + + The above syntax is for ``windows`` environments. To work on ``macosx/linux``, use: + + :: + + source activate pandas_dev + +To view your environments: + +:: + + conda info -e + +To return to you home root environment: + +:: + + deactivate + +See the full ``conda`` docs `here +`_. 
+ +At this point you can easily do an *in-place* install, as detailed in the next section. + +.. _contributing.getting_source: + Making changes -------------- @@ -237,9 +317,15 @@ follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of ``numpydoc`` is included in the *pandas* source code. +It is easiest to :ref:`create a development environment `, then install: + +:: + + conda install -n pandas_dev sphinx ipython + Furthermore, it is recommended to have all `optional dependencies `_ -installed. This is not needed, but be aware that you will see some error +installed. This is not strictly necessary, but be aware that you will see some error messages. Because all the code in the documentation is executed during the doc build, the examples using this optional dependencies will generate errors. Run ``pd.show_versions()`` to get an overview of the installed version of all @@ -572,6 +658,3 @@ branch has not actually been merged. The branch will still exist on GitHub, so to delete it there do :: git push origin --delete shiny-new-feature - - - diff --git a/doc/source/install.rst b/doc/source/install.rst index dd9021d0439dc..07c88841e5dcb 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -35,6 +35,8 @@ pandas at all. Simply create an account, and have access to pandas from within your brower via an `IPython Notebook `__ in a few minutes. +.. _install.anaconda + Installing pandas with Anaconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -66,6 +68,8 @@ admin rights to install it, it will install in the user's home directory, and this also makes it trivial to delete Anaconda at a later date (just delete that folder). +.. _install.miniconda + Installing pandas with Miniconda ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -173,47 +177,8 @@ Installing using your Linux distribution's package manager. Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -.. note:: - - Installing from the git repository requires a recent installation of `Cython - `__ as the cythonized C sources are no longer checked - into source control. Released source distributions will contain the built C - files. I recommend installing the latest Cython via ``easy_install -U - Cython`` - -The source code is hosted at http://github.com/pydata/pandas, it can be checked -out using git and compiled / installed like so: - -:: - - git clone git://github.com/pydata/pandas.git - cd pandas - python setup.py install - -Make sure you have Cython installed when installing from the repository, -rather then a tarball or pypi. -On Windows, I suggest installing the MinGW compiler suite following the -directions linked to above. Once configured property, run the following on the -command line: - -:: - - python setup.py build --compiler=mingw32 - python setup.py install - -Note that you will not be able to import pandas if you open an interpreter in -the source directory unless you build the C extensions in place: - -:: - - python setup.py build_ext --inplace - -The most recent version of MinGW (any installer dated after 2011-08-03) -has removed the '-mno-cygwin' option but Distutils has not yet been updated to -reflect that. Thus, you may run into an error like "unrecognized command line -option '-mno-cygwin'". Until the bug is fixed in Distutils, you may need to -install a slightly older version of MinGW (2011-08-02 installer). +See the :ref:`contributing documentation ` for complete instructions on building from the git source tree. 
Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment.

 Running the test suite
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -354,4 +319,3 @@ Optional Dependencies
    work. Hence, it is highly recommended that you install these. A packaged
    distribution like `Enthought Canopy `__ may be worth considering.
-

From 96adf2c99ebeecc8de65aa5c0b3b0b88c0c8f12f Mon Sep 17 00:00:00 2001
From: David BROCHART
Date: Tue, 17 Mar 2015 16:24:21 +0100
Subject: [PATCH 024/239] Fixed bug #9671 where 'DataFrame.plot()' raised an
 error when both 'color' and 'style' keywords were passed and there was no
 color symbol in the style strings (this should be allowed)

---
 doc/source/whatsnew/v0.16.1.txt |  5 +++++
 pandas/tests/test_graphics.py   | 16 ++++++++++++++
 pandas/tools/plotting.py        | 15 ++++++++++-----
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index a20e901dbddd1..e5d2d78ebc2ad 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -86,4 +86,9 @@ Bug Fixes
 - Bug in ``ParserBase.convert_to_nd_arrays`` when called by ``DataFrame.read_fwf`` (:issue:`9266``)
 =======
 - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`)
+<<<<<<< HEAD
 >>>>>>> ce988b4... DOC: Update whatsnew for 0.16.1 (#9764)
+=======
+
+- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
+>>>>>>> f00d6bb... Fixed bug #9671 where 'DataFrame.plot()' raised an error when both 'color' and 'style' keywords were passed and there was no color symbol in the style strings (this should be allowed)

diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py
index 3ce4e150326a2..36c19cd39f76c 100644
--- a/pandas/tests/test_graphics.py
+++ b/pandas/tests/test_graphics.py
@@ -1154,6 +1154,22 @@ def test_plot(self):
         self.assertEqual(len(axes), 1)
         self.assertIs(ax.get_axes(), axes[0])

+    def test_color_and_style_arguments(self):
+        df = DataFrame({'x': [1, 2], 'y': [3, 4]})
+        # passing both 'color' and 'style' arguments should be allowed
+        # if there is no color symbol in the style strings:
+        ax = df.plot(color = ['red', 'black'], style = ['-', '--'])
+        # check that the linestyles are correctly set:
+        linestyle = [line.get_linestyle() for line in ax.lines]
+        self.assertEqual(linestyle, ['-', '--'])
+        # check that the colors are correctly set:
+        color = [line.get_color() for line in ax.lines]
+        self.assertEqual(color, ['red', 'black'])
+        # passing both 'color' and 'style' arguments should not be allowed
+        # if there is a color symbol in the style strings:
+        with tm.assertRaises(ValueError):
+            df.plot(color = ['red', 'black'], style = ['k-', 'r--'])
+
     def test_nonnumeric_exclude(self):
         df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]})
         ax = df.plot()

diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py
index 0be030d7c2c8e..358c5b0dd5940 100644
--- a/pandas/tools/plotting.py
+++ b/pandas/tools/plotting.py
@@ -867,12 +867,17 @@ def _validate_color_args(self):
                           "simultaneously. 
Using 'color'") if 'color' in self.kwds and self.style is not None: + if com.is_list_like(self.style): + styles = self.style + else: + styles = [self.style] # need only a single match - if re.match('^[a-z]+?', self.style) is not None: - raise ValueError("Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. Please" - " use one or the other or pass 'style' " - "without a color symbol") + for s in styles: + if re.match('^[a-z]+?', s) is not None: + raise ValueError("Cannot pass 'style' string with a color " + "symbol and 'color' keyword argument. Please" + " use one or the other or pass 'style' " + "without a color symbol") def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: From 307fe736d58d5aae0e622738660868ca2df3e3da Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 6 Apr 2015 08:24:13 -0400 Subject: [PATCH 025/239] DOC: correction to contributing.rst --- CONTRIBUTING.md | 4 ++-- doc/source/contributing.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d3eeb820a12eb..284ac2fc5b169 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,8 +12,8 @@ navigate to the [GitHub "issues" tab](https://github.com/pydata/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pydata/pandas/issues?labels=Docs&sort=updated&state=open) -and [Good as first -PR](https://github.com/pydata/pandas/issues?labels=Good+as+first+PR&sort=updated&state=open) +and [Difficulty +Novice](https://github.com/pydata/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. Or maybe through using *pandas* you have an idea of you own or are diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index cc4473e8d355a..2112d5b127e64 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -17,8 +17,8 @@ If you are simply looking to start working with the *pandas* codebase, navigate `GitHub "issues" tab `_ and start looking through interesting issues. There are a number of issues listed under `Docs `_ -and `Good as first PR -`_ +and `Difficulty Novice +`_ where you could start out. Or maybe through using *pandas* you have an idea of you own or are looking for something From 34337582c7bddd05e19e980a05aab297c7c171a0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 1 Apr 2015 13:38:29 +0200 Subject: [PATCH 026/239] DOC: fix some various doc warnings --- doc/source/r_interface.rst | 1 + pandas/core/strings.py | 18 ++++++++++-------- pandas/tseries/tools.py | 5 ++++- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index 826d9e980538e..2207c823f43b1 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -56,6 +56,7 @@ appropriate pandas object (most likely a DataFrame): .. ipython:: python + :okwarning: import pandas.rpy.common as com infert = com.load_data('infert') diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 97f6752fb5851..4ef341c481a60 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -769,12 +769,14 @@ def str_rstrip(arr, to_strip=None): def str_wrap(arr, width, **kwargs): - """ - Wrap long strings to be formatted in paragraphs + r""" + Wrap long strings to be formatted in paragraphs. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. 
Parameters ---------- - Same keyword parameters and defaults as :class:`textwrap.TextWrapper` width : int Maximum line-width expand_tabs : bool, optional @@ -806,11 +808,11 @@ def str_wrap(arr, width, **kwargs): settings. To achieve behavior matching R's stringr library str_wrap function, use the arguments: - expand_tabs = False - replace_whitespace = True - drop_whitespace = True - break_long_words = False - break_on_hyphens = False + - expand_tabs = False + - replace_whitespace = True + - drop_whitespace = True + - break_long_words = False + - break_on_hyphens = False Examples -------- diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index 8430e0209fd78..ef37e003ab67f 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -210,10 +210,13 @@ def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, Returns ------- - ret : datetime if parsing succeeded. Return type depends on input: + ret : datetime if parsing succeeded. + Return type depends on input: + - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp + In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). From 1678e64f73b4abdfb71601eca0ad30676fcc8680 Mon Sep 17 00:00:00 2001 From: Kerby Shedden Date: Fri, 3 Apr 2015 08:43:03 -0400 Subject: [PATCH 027/239] Closes #9795 (Stata writer changes input frame) Add note to release notes --- doc/source/whatsnew/v0.16.1.txt | 2 +- pandas/io/stata.py | 4 +++- pandas/io/tests/test_stata.py | 9 +++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index e5d2d78ebc2ad..de87b42dc5441 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -66,7 +66,7 @@ Bug Fixes - Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) - +- Fixed bug in ``StataWriter`` resulting in changes to input ``DataFrame`` upon save (:issue:`9795`). - Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7dd32fd00a4d2..3972bad7b2d83 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1885,6 +1885,8 @@ def _prepare_pandas(self, data): #NOTE: we might need a different API / class for pandas objects so # we can set different semantics - handle this with a PR to pandas.io + data = data.copy() + if self._write_index: data = data.reset_index() @@ -2013,7 +2015,7 @@ def _write_variable_labels(self, labels=None): self._write(_pad_bytes("", 81)) def _prepare_data(self): - data = self.data.copy() + data = self.data typlist = self.typlist convert_dates = self._convert_dates # 1. 
Convert dates
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
index 8b44be61d5f66..0aaf018b21584 100644
--- a/pandas/io/tests/test_stata.py
+++ b/pandas/io/tests/test_stata.py
@@ -290,6 +290,15 @@ def test_stata_doc_examples(self):
             df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
             df.to_stata(path)

+    def test_write_preserves_original(self):
+        # 9795
+        np.random.seed(423)
+        df = pd.DataFrame(np.random.randn(5,4), columns=list('abcd'))
+        df.ix[2, 'a':'c'] = np.nan
+        df_copy = df.copy()
+        df.to_stata('test.dta', write_index=False)
+        tm.assert_frame_equal(df, df_copy)
+
     def test_encoding(self):

         # GH 4626, proper encoding handling

From 954a68ef3e3f474718d5494f4bc0c0cf94a5dd85 Mon Sep 17 00:00:00 2001
From: Evan Wright
Date: Tue, 7 Apr 2015 08:33:23 -0400
Subject: [PATCH 028/239] DOC: Clean up documentation for convert_objects

---
 pandas/core/generic.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 012a73fac1ef4..b695ef6550f42 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2261,19 +2261,23 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,

         Parameters
         ----------
-        convert_dates : if True, attempt to soft convert dates, if 'coerce',
-            force conversion (and non-convertibles get NaT)
-        convert_numeric : if True attempt to coerce to numbers (including
-            strings), non-convertibles get NaN
-        convert_timedeltas : if True, attempt to soft convert timedeltas, if 'coerce',
-            force conversion (and non-convertibles get NaT)
-        copy : Boolean, if True, return copy even if no copy is necessary
-            (e.g. no conversion was done), default is True.
-            It is meant for internal use, not to be confused with `inplace` kw.
+        convert_dates : boolean, default True
+            If True, convert to date where possible. If 'coerce', force
+            conversion, with unconvertible values becoming NaT.
+        convert_numeric : boolean, default False
+            If True, attempt to coerce to numbers (including strings), with
+            unconvertible values becoming NaN.
+        convert_timedeltas : boolean, default True
+            If True, convert to timedelta where possible. If 'coerce', force
+            conversion, with unconvertible values becoming NaT.
+        copy : boolean, default True
+            If True, return a copy even if no copy is necessary (e.g. no
+            conversion was done). Note: This is meant for internal use, and
+            should not be confused with inplace.

         Returns
         -------
-        converted : asm as input object
+        converted : same as input object
         """
         return self._constructor(
             self._data.convert(convert_dates=convert_dates,

From d4880936e9bc6f4d075d0a72d4489fc400d3a2d8 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Sun, 5 Apr 2015 19:17:04 -0500
Subject: [PATCH 029/239] API: Sort keys for DataFrame.assign

Previously the order was arbitrary. For predictability, we'll sort before
inserting.
---
 doc/source/dsintro.rst          |  6 ++++--
 doc/source/whatsnew/v0.16.1.txt |  4 ++++
 pandas/core/frame.py            | 11 ++++++-----
 pandas/tests/test_frame.py      | 19 ++++++++++++++-----
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index e1c14029f1cf9..adcf2fca9b4c5 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -461,7 +461,7 @@ Inspired by `dplyr's
 `__
 ``mutate`` verb, DataFrame has an :meth:`~pandas.DataFrame.assign`
 method that allows you to easily create new columns that are potentially
-derived from existing columns.
+derived from existing columns.
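For example, a minimal sketch of the sorted-key behaviour described here (the
column names are illustrative):

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'A': [1, 2, 3]})
   # keyword arguments arrive in arbitrary order, but the new columns are
   # inserted alphabetically by key, so the result is A, B, C
   df.assign(C=df.A * 2, B=lambda x: x.A + 1)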
.. ipython:: python

@@ -511,7 +511,9 @@ DataFrame is returned, with the new values inserted.

 .. warning::

    Since the function signature of ``assign`` is ``**kwargs``, a dictionary,
-   the order of the new columns in the resulting DataFrame cannot be guaranteed.
+   the order of the new columns in the resulting DataFrame cannot be guaranteed
+   to match the order you pass in. To make things predictable, items are inserted
+   alphabetically (by key) at the end of the DataFrame.

 All expressions are computed first, and then assigned. So you can't refer to
 another column being assigned in the same call to ``assign``. For example:

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index de87b42dc5441..38f7cea01066d 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -45,6 +45,10 @@ API changes

 - Add support for separating years and quarters using dashes, for example
   2014-Q1.  (:issue:`9688`)

+- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously
+  the order was arbitrary. (:issue:`9777`)
+
+
 .. _whatsnew_0161.performance:

 Performance Improvements

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f700d4316842c..8b683ad89558a 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2244,10 +2244,11 @@ def assign(self, **kwargs):
         Notes
         -----
         Since ``kwargs`` is a dictionary, the order of your
-        arguments may not be preserved, and so the order of the
-        new columns is not well defined.  Assigning multiple
-        columns within the same ``assign`` is possible, but you cannot
-        reference other columns created within the same ``assign`` call.
+        arguments may not be preserved. To make things predictable,
+        the columns are inserted in alphabetical order, at the end of
+        your DataFrame. Assigning multiple columns within the same
+        ``assign`` is possible, but you cannot reference other columns
+        created within the same ``assign`` call.

         Examples
         --------
@@ -2296,7 +2297,7 @@ def assign(self, **kwargs):
             results[k] = v

         # ...
and then assign - for k, v in results.items(): + for k, v in sorted(results.items()): data[k] = v return data diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3e4c16f63035f..e4abe15dee493 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -14073,12 +14073,21 @@ def test_assign(self): assert_frame_equal(result, expected) def test_assign_multiple(self): - df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) + df = DataFrame([[1, 4], [2, 5], [3, 6]], columns=['A', 'B']) result = df.assign(C=[7, 8, 9], D=df.A, E=lambda x: x.B) - expected = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9], - 'D': [1, 2, 3], 'E': [4, 5, 6]}) - # column order isn't preserved - assert_frame_equal(result.reindex_like(expected), expected) + expected = DataFrame([[1, 4, 7, 1, 4], [2, 5, 8, 2, 5], + [3, 6, 9, 3, 6]], columns=list('ABCDE')) + assert_frame_equal(result, expected) + + def test_assign_alphabetical(self): + # GH 9818 + df = DataFrame([[1, 2], [3, 4]], columns=['A', 'B']) + result = df.assign(D=df.A + df.B, C=df.A - df.B) + expected = DataFrame([[1, 2, -1, 3], [3, 4, -1, 7]], + columns=list('ABCD')) + assert_frame_equal(result, expected) + result = df.assign(C=df.A - df.B, D=df.A + df.B) + assert_frame_equal(result, expected) def test_assign_bad(self): df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) From 604410b1f60c4c53b7d73a93d89c33b84c5f582c Mon Sep 17 00:00:00 2001 From: Scott Sanderson Date: Mon, 6 Apr 2015 20:32:41 -0400 Subject: [PATCH 030/239] BUG: Preserve tz of start_date in Holiday.dates Previously, the timezone of only the start_date was lost, causing the subsequent call to `DatetimeIndex` to fail if a both start_date and end_date were tz-aware. --- pandas/tseries/holiday.py | 5 +- pandas/tseries/tests/test_holiday.py | 181 ++++++++++++++++----------- 2 files changed, 111 insertions(+), 75 deletions(-) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 3b3542b760d6f..c31e25115c6a4 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -203,7 +203,10 @@ def dates(self, start_date, end_date, return_name=False): end_date = Timestamp(end_date) year_offset = DateOffset(years=1) - base_date = Timestamp(datetime(start_date.year, self.month, self.day)) + base_date = Timestamp( + datetime(start_date.year, self.month, self.day), + tz=start_date.tz, + ) dates = DatetimeIndex(start=base_date, end=end_date, freq=year_offset) holiday_dates = self._apply_rule(dates) if self.days_of_week is not None: diff --git a/pandas/tseries/tests/test_holiday.py b/pandas/tseries/tests/test_holiday.py index c2300481eca43..0880e84f1fcde 100644 --- a/pandas/tseries/tests/test_holiday.py +++ b/pandas/tseries/tests/test_holiday.py @@ -9,6 +9,7 @@ HolidayCalendarFactory, next_workday, previous_workday, before_nearest_workday, EasterMonday, GoodFriday, after_nearest_workday, weekend_to_monday) +from pytz import utc import nose class TestCalendar(tm.TestCase): @@ -55,87 +56,119 @@ def setUp(self): self.start_date = datetime(2011, 1, 1) self.end_date = datetime(2020, 12, 31) + def check_results(self, holiday, start, end, expected): + self.assertEqual(list(holiday.dates(start, end)), expected) + # Verify that timezone info is preserved. 
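        # A sketch of what this guards against (dates illustrative): with the
        # fix, USMemorialDay.dates(utc.localize(Timestamp('2011-01-01')),
        # utc.localize(Timestamp('2011-12-31'))) returns UTC-localized
        # Timestamps instead of failing inside the DatetimeIndex construction.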
+ self.assertEqual( + list( + holiday.dates( + utc.localize(Timestamp(start)), + utc.localize(Timestamp(end)), + ) + ), + [utc.localize(dt) for dt in expected], + ) + def test_usmemorialday(self): - holidays = USMemorialDay.dates(self.start_date, - self.end_date) - holidayList = [ - datetime(2011, 5, 30), - datetime(2012, 5, 28), - datetime(2013, 5, 27), - datetime(2014, 5, 26), - datetime(2015, 5, 25), - datetime(2016, 5, 30), - datetime(2017, 5, 29), - datetime(2018, 5, 28), - datetime(2019, 5, 27), - datetime(2020, 5, 25), - ] - self.assertEqual(list(holidays), holidayList) + self.check_results( + holiday=USMemorialDay, + start=self.start_date, + end=self.end_date, + expected=[ + datetime(2011, 5, 30), + datetime(2012, 5, 28), + datetime(2013, 5, 27), + datetime(2014, 5, 26), + datetime(2015, 5, 25), + datetime(2016, 5, 30), + datetime(2017, 5, 29), + datetime(2018, 5, 28), + datetime(2019, 5, 27), + datetime(2020, 5, 25), + ], + ) def test_non_observed_holiday(self): - july_3rd = Holiday('July 4th Eve', month=7, day=3) - result = july_3rd.dates("2001-01-01", "2003-03-03") - expected = [Timestamp('2001-07-03 00:00:00'), - Timestamp('2002-07-03 00:00:00')] - self.assertEqual(list(result), expected) - july_3rd = Holiday('July 4th Eve', month=7, day=3, - days_of_week=(0, 1, 2, 3)) - result = july_3rd.dates("2001-01-01", "2008-03-03") - expected = [Timestamp('2001-07-03 00:00:00'), - Timestamp('2002-07-03 00:00:00'), - Timestamp('2003-07-03 00:00:00'), - Timestamp('2006-07-03 00:00:00'), - Timestamp('2007-07-03 00:00:00')] - self.assertEqual(list(result), expected) + + self.check_results( + Holiday('July 4th Eve', month=7, day=3), + start="2001-01-01", + end="2003-03-03", + expected=[ + Timestamp('2001-07-03 00:00:00'), + Timestamp('2002-07-03 00:00:00') + ] + ) + + self.check_results( + Holiday('July 4th Eve', month=7, day=3, days_of_week=(0, 1, 2, 3)), + start="2001-01-01", + end="2008-03-03", + expected=[ + Timestamp('2001-07-03 00:00:00'), + Timestamp('2002-07-03 00:00:00'), + Timestamp('2003-07-03 00:00:00'), + Timestamp('2006-07-03 00:00:00'), + Timestamp('2007-07-03 00:00:00'), + ] + ) def test_easter(self): - holidays = EasterMonday.dates(self.start_date, - self.end_date) - holidayList = [Timestamp('2011-04-25 00:00:00'), - Timestamp('2012-04-09 00:00:00'), - Timestamp('2013-04-01 00:00:00'), - Timestamp('2014-04-21 00:00:00'), - Timestamp('2015-04-06 00:00:00'), - Timestamp('2016-03-28 00:00:00'), - Timestamp('2017-04-17 00:00:00'), - Timestamp('2018-04-02 00:00:00'), - Timestamp('2019-04-22 00:00:00'), - Timestamp('2020-04-13 00:00:00')] - - - self.assertEqual(list(holidays), holidayList) - holidays = GoodFriday.dates(self.start_date, - self.end_date) - holidayList = [Timestamp('2011-04-22 00:00:00'), - Timestamp('2012-04-06 00:00:00'), - Timestamp('2013-03-29 00:00:00'), - Timestamp('2014-04-18 00:00:00'), - Timestamp('2015-04-03 00:00:00'), - Timestamp('2016-03-25 00:00:00'), - Timestamp('2017-04-14 00:00:00'), - Timestamp('2018-03-30 00:00:00'), - Timestamp('2019-04-19 00:00:00'), - Timestamp('2020-04-10 00:00:00')] - self.assertEqual(list(holidays), holidayList) - + + self.check_results( + EasterMonday, + start=self.start_date, + end=self.end_date, + expected=[ + Timestamp('2011-04-25 00:00:00'), + Timestamp('2012-04-09 00:00:00'), + Timestamp('2013-04-01 00:00:00'), + Timestamp('2014-04-21 00:00:00'), + Timestamp('2015-04-06 00:00:00'), + Timestamp('2016-03-28 00:00:00'), + Timestamp('2017-04-17 00:00:00'), + Timestamp('2018-04-02 00:00:00'), + Timestamp('2019-04-22 
00:00:00'),
+                Timestamp('2020-04-13 00:00:00'),
+            ],
+        )
+        self.check_results(
+            GoodFriday,
+            start=self.start_date,
+            end=self.end_date,
+            expected=[
+                Timestamp('2011-04-22 00:00:00'),
+                Timestamp('2012-04-06 00:00:00'),
+                Timestamp('2013-03-29 00:00:00'),
+                Timestamp('2014-04-18 00:00:00'),
+                Timestamp('2015-04-03 00:00:00'),
+                Timestamp('2016-03-25 00:00:00'),
+                Timestamp('2017-04-14 00:00:00'),
+                Timestamp('2018-03-30 00:00:00'),
+                Timestamp('2019-04-19 00:00:00'),
+                Timestamp('2020-04-10 00:00:00'),
+            ],
+        )

     def test_usthanksgivingday(self):
-        holidays = USThanksgivingDay.dates(self.start_date,
-                                           self.end_date)
-        holidayList = [
-            datetime(2011, 11, 24),
-            datetime(2012, 11, 22),
-            datetime(2013, 11, 28),
-            datetime(2014, 11, 27),
-            datetime(2015, 11, 26),
-            datetime(2016, 11, 24),
-            datetime(2017, 11, 23),
-            datetime(2018, 11, 22),
-            datetime(2019, 11, 28),
-            datetime(2020, 11, 26),
-        ]
-
-        self.assertEqual(list(holidays), holidayList)
+
+        self.check_results(
+            USThanksgivingDay,
+            start=self.start_date,
+            end=self.end_date,
+            expected=[
+                datetime(2011, 11, 24),
+                datetime(2012, 11, 22),
+                datetime(2013, 11, 28),
+                datetime(2014, 11, 27),
+                datetime(2015, 11, 26),
+                datetime(2016, 11, 24),
+                datetime(2017, 11, 23),
+                datetime(2018, 11, 22),
+                datetime(2019, 11, 28),
+                datetime(2020, 11, 26),
+            ],
+        )

     def test_argument_types(self):
         holidays = USThanksgivingDay.dates(self.start_date,

From 8014c246ecfe8dae0592e9c295e4c735fceb5372 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Sun, 16 Mar 2014 10:29:39 +0900
Subject: [PATCH 031/239] ENH: drop function now has errors keyword for
 non-existing column handling

---
 doc/source/whatsnew/v0.15.2.txt |  2 +-
 doc/source/whatsnew/v0.16.1.txt |  7 ++++
 pandas/core/generic.py          |  8 ++--
 pandas/core/index.py            | 31 ++++++++++------
 pandas/tests/test_frame.py      | 33 +++++++++++++++++
 pandas/tests/test_index.py      | 66 +++++++++++++++++++++++++++++----
 pandas/tests/test_panel.py      |  9 +++++
 pandas/tests/test_series.py     |  8 ++++
 8 files changed, 142 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
index 02de919e3f83e..6a14a4024ba5a 100644
--- a/doc/source/whatsnew/v0.15.2.txt
+++ b/doc/source/whatsnew/v0.15.2.txt
@@ -49,7 +49,7 @@ API changes

    In [3]: cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])

    In [4]: cat
-   Out[4]:
+   Out[4]:
    [a, b, a]
    Categories (3, object): [a < b < c]

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index 38f7cea01066d..53ca4c0c306e1 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -23,6 +23,13 @@ Enhancements

+- ``drop`` function can now accept an ``errors`` keyword to suppress the ``ValueError`` raised when any of the labels does not exist in the target data. (:issue:`6736`)
+
+  .. ipython:: python
+
+    df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
+    df.drop(['A', 'X'], axis=1, errors='ignore')
+
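A minimal sketch of the two modes side by side (labels are illustrative):

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C'])
   # df.drop(['A', 'X'], axis=1)  # default errors='raise': ValueError, 'X' missing
   df.drop(['A', 'X'], axis=1, errors='ignore')  # drops 'A', silently skips 'X'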
 .. _whatsnew_0161.api:

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index b695ef6550f42..bc65f1f62fa1a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1557,7 +1557,7 @@ def reindex_like(self, other, method=None, copy=True, limit=None):

         return self.reindex(**d)

-    def drop(self, labels, axis=0, level=None, inplace=False):
+    def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'):
         """
         Return new object with labels in requested axis removed

@@ -1569,6 +1569,8 @@ def drop(self, labels, axis=0, level=None, inplace=False):
             For MultiIndex
         inplace : bool, default False
             If True, do operation inplace and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress the error and only existing labels are dropped.

         Returns
         -------
@@ -1582,9 +1584,9 @@ def drop(self, labels, axis=0, level=None, inplace=False):
         if level is not None:
             if not isinstance(axis, MultiIndex):
                 raise AssertionError('axis must be a MultiIndex')
-            new_axis = axis.drop(labels, level=level)
+            new_axis = axis.drop(labels, level=level, errors=errors)
         else:
-            new_axis = axis.drop(labels)
+            new_axis = axis.drop(labels, errors=errors)

         dropped = self.reindex(**{axis_name: new_axis})
         try:
             dropped.axes[axis_].set_names(axis.names, inplace=True)

diff --git a/pandas/core/index.py b/pandas/core/index.py
index e335d00551bab..fd11cd7f598c3 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -2325,13 +2325,15 @@ def insert(self, loc, item):
             (_self[:loc], item_idx, _self[loc:]))
         return Index(idx, name=self.name)

-    def drop(self, labels):
+    def drop(self, labels, errors='raise'):
         """
         Make new Index with passed list of labels deleted

         Parameters
         ----------
         labels : array-like
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress the error and only existing labels are dropped.
Returns ------- @@ -2341,7 +2343,9 @@ def drop(self, labels): indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - raise ValueError('labels %s not contained in axis' % labels[mask]) + if errors != 'ignore': + raise ValueError('labels %s not contained in axis' % labels[mask]) + indexer = indexer[~mask] return self.delete(indexer) @Appender(_shared_docs['drop_duplicates'] % _index_doc_kwargs) @@ -3847,7 +3851,7 @@ def repeat(self, n): sortorder=self.sortorder, verify_integrity=False) - def drop(self, labels, level=None): + def drop(self, labels, level=None, errors='raise'): """ Make new MultiIndex with passed list of labels deleted @@ -3870,19 +3874,24 @@ def drop(self, labels, level=None): indexer = self.get_indexer(labels) mask = indexer == -1 if mask.any(): - raise ValueError('labels %s not contained in axis' - % labels[mask]) - return self.delete(indexer) + if errors != 'ignore': + raise ValueError('labels %s not contained in axis' + % labels[mask]) + indexer = indexer[~mask] except Exception: pass inds = [] for label in labels: - loc = self.get_loc(label) - if isinstance(loc, int): - inds.append(loc) - else: - inds.extend(lrange(loc.start, loc.stop)) + try: + loc = self.get_loc(label) + if isinstance(loc, int): + inds.append(loc) + else: + inds.extend(lrange(loc.start, loc.stop)) + except KeyError: + if errors != 'ignore': + raise return self.delete(inds) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index e4abe15dee493..b8bdd2d4e3b40 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7423,6 +7423,26 @@ def test_drop_names(self): self.assertEqual(obj.columns.name, 'second') self.assertEqual(list(df.columns), ['d', 'e', 'f']) + self.assertRaises(ValueError, df.drop, ['g']) + self.assertRaises(ValueError, df.drop, ['g'], 1) + + # errors = 'ignore' + dropped = df.drop(['g'], errors='ignore') + expected = Index(['a', 'b', 'c']) + self.assert_index_equal(dropped.index, expected) + + dropped = df.drop(['b', 'g'], errors='ignore') + expected = Index(['a', 'c']) + self.assert_index_equal(dropped.index, expected) + + dropped = df.drop(['g'], axis=1, errors='ignore') + expected = Index(['d', 'e', 'f']) + self.assert_index_equal(dropped.columns, expected) + + dropped = df.drop(['d', 'g'], axis=1, errors='ignore') + expected = Index(['e', 'f']) + self.assert_index_equal(dropped.columns, expected) + def test_dropEmptyRows(self): N = len(self.frame.index) mat = randn(N) @@ -7801,6 +7821,19 @@ def test_drop(self): assert_frame_equal(simple.drop([0, 1, 3], axis=0), simple.ix[[2], :]) assert_frame_equal(simple.drop([0, 3], axis='index'), simple.ix[[1, 2], :]) + self.assertRaises(ValueError, simple.drop, 5) + self.assertRaises(ValueError, simple.drop, 'C', 1) + self.assertRaises(ValueError, simple.drop, [1, 5]) + self.assertRaises(ValueError, simple.drop, ['A', 'C'], 1) + + # errors = 'ignore' + assert_frame_equal(simple.drop(5, errors='ignore'), simple) + assert_frame_equal(simple.drop([0, 5], errors='ignore'), + simple.ix[[1, 2, 3], :]) + assert_frame_equal(simple.drop('C', axis=1, errors='ignore'), simple) + assert_frame_equal(simple.drop(['A', 'C'], axis=1, errors='ignore'), + simple[['B']]) + #non-unique - wheee! 
nu_df = DataFrame(lzip(range(3), range(-3, 1), list('abc')), columns=['a', 'a', 'b']) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 39db387045f12..61cb337880c00 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1036,20 +1036,43 @@ def check_slice(in_slice, expected): def test_drop(self): n = len(self.strIndex) - dropped = self.strIndex.drop(self.strIndex[lrange(5, 10)]) + drop = self.strIndex[lrange(5, 10)] + dropped = self.strIndex.drop(drop) expected = self.strIndex[lrange(5) + lrange(10, n)] self.assertTrue(dropped.equals(expected)) self.assertRaises(ValueError, self.strIndex.drop, ['foo', 'bar']) + self.assertRaises(ValueError, self.strIndex.drop, ['1', 'bar']) + + # errors='ignore' + mixed = drop.tolist() + ['foo'] + dropped = self.strIndex.drop(mixed, errors='ignore') + expected = self.strIndex[lrange(5) + lrange(10, n)] + self.assert_index_equal(dropped, expected) + + dropped = self.strIndex.drop(['foo', 'bar'], errors='ignore') + expected = self.strIndex[lrange(n)] + self.assert_index_equal(dropped, expected) dropped = self.strIndex.drop(self.strIndex[0]) expected = self.strIndex[1:] - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) ser = Index([1, 2, 3]) dropped = ser.drop(1) expected = Index([2, 3]) - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + # errors='ignore' + self.assertRaises(ValueError, ser.drop, [3, 4]) + + dropped = ser.drop(4, errors='ignore') + expected = Index([1, 2, 3]) + self.assert_index_equal(dropped, expected) + + dropped = ser.drop([3, 4, 5], errors='ignore') + expected = Index([1, 2]) + self.assert_index_equal(dropped, expected) def test_tuple_union_bug(self): import pandas @@ -3529,21 +3552,50 @@ def test_drop(self): dropped2 = self.index.drop(index) expected = self.index[[0, 2, 3, 5]] - self.assertTrue(dropped.equals(expected)) - self.assertTrue(dropped2.equals(expected)) + self.assert_index_equal(dropped, expected) + self.assert_index_equal(dropped2, expected) dropped = self.index.drop(['bar']) expected = self.index[[0, 1, 3, 4, 5]] - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop('foo') + expected = self.index[[2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) index = MultiIndex.from_tuples([('bar', 'two')]) self.assertRaises(KeyError, self.index.drop, [('bar', 'two')]) self.assertRaises(KeyError, self.index.drop, index) + self.assertRaises(KeyError, self.index.drop, ['foo', 'two']) + + # partially correct argument + mixed_index = MultiIndex.from_tuples([('qux', 'one'), ('bar', 'two')]) + self.assertRaises(KeyError, self.index.drop, mixed_index) + + # error='ignore' + dropped = self.index.drop(index, errors='ignore') + expected = self.index[[0, 1, 2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop(mixed_index, errors='ignore') + expected = self.index[[0, 1, 2, 3, 5]] + self.assert_index_equal(dropped, expected) + + dropped = self.index.drop(['foo', 'two'], errors='ignore') + expected = self.index[[2, 3, 4, 5]] + self.assert_index_equal(dropped, expected) # mixed partial / full drop dropped = self.index.drop(['foo', ('qux', 'one')]) expected = self.index[[2, 3, 5]] - self.assertTrue(dropped.equals(expected)) + self.assert_index_equal(dropped, expected) + + # mixed partial / full drop / error='ignore' + mixed_index = ['foo', ('qux', 'one'), 'two'] + self.assertRaises(KeyError, self.index.drop, mixed_index) 
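        # Sketch of the semantics pinned down here: with labels that are only
        # partially present, the default errors='raise' fails on the first
        # missing label, while errors='ignore' drops whatever it can find and
        # silently skips the rest.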
+ dropped = self.index.drop(mixed_index, errors='ignore') + expected = self.index[[2, 3, 5]] + self.assert_index_equal(dropped, expected) def test_droplevel_with_names(self): index = self.index[self.index.get_loc('foo')] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index cab668b3118fd..0fd03cb5804a8 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1984,6 +1984,15 @@ def check_drop(drop_val, axis_number, aliases, expected): expected = Panel({"One": df}) check_drop('Two', 0, ['items'], expected) + self.assertRaises(ValueError, panel.drop, 'Three') + + # errors = 'ignore' + dropped = panel.drop('Three', errors='ignore') + assert_panel_equal(dropped, panel) + dropped = panel.drop(['Two', 'Three'], errors='ignore') + expected = Panel({"One": df}) + assert_panel_equal(dropped, expected) + # Major exp_df = DataFrame({"A": [2], "B": [4]}, index=[1]) expected = Panel({"One": exp_df, "Two": exp_df}) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index c021bb1bf2fd6..f044fe540ea24 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -1954,6 +1954,14 @@ def test_drop(self): self.assertRaises(ValueError, s.drop, 'bc') self.assertRaises(ValueError, s.drop, ('a',)) + # errors='ignore' + s = Series(range(3),index=list('abc')) + result = s.drop('bc', errors='ignore') + assert_series_equal(result, s) + result = s.drop(['a', 'd'], errors='ignore') + expected = s.ix[1:] + assert_series_equal(result, expected) + # bad axis self.assertRaises(ValueError, s.drop, 'one', axis='columns') From 4ef0e34351bbe987f2797c982cd5867981b07dd6 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Wed, 8 Apr 2015 08:15:25 -0400 Subject: [PATCH 032/239] BUG: skiprows doesn't handle blank lines properly when engine='c' (GH #9832) --- doc/source/whatsnew/v0.16.1.txt | 4 ++++ pandas/io/tests/test_parsers.py | 22 ++++++++++++++++++++++ pandas/src/parser/tokenizer.c | 18 ++++++------------ 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 53ca4c0c306e1..ec8a43bc1b0a2 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -102,4 +102,8 @@ Bug Fixes ======= - Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`) +<<<<<<< HEAD >>>>>>> f00d6bb... Fixed bug #9671 where 'DataFrame.plot()' raised an error when both 'color' and 'style' keywords were passed and there was no color symbol in the style strings (this should be allowed) +======= +- Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`) +>>>>>>> e67893f... BUG: skiprows doesn't handle blank lines properly when engine='c' (GH #9832) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 1bf8f7fef7b6b..e65f3ffd7c5a5 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -840,6 +840,28 @@ def test_deep_skiprows(self): condensed_data = self.read_csv(StringIO(condensed_text)) tm.assert_frame_equal(data, condensed_data) + def test_skiprows_blank(self): + # GH 9832 + text = """#foo,a,b,c +#foo,a,b,c + +#foo,a,b,c +#foo,a,b,c + +1/1/2000,1.,2.,3. 
+1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = self.read_csv(StringIO(text), skiprows=6, header=None, + index_col=0, parse_dates=True) + + expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), + columns=[1, 2, 3], + index=[datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)]) + expected.index.name = 0 + tm.assert_frame_equal(data, expected) + def test_detect_string_na(self): data = """A,B foo,bar diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 975142ebacc2a..1bc4096658b29 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -757,11 +757,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } @@ -1093,11 +1091,9 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == self->lineterminator) { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } @@ -1391,11 +1387,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record if (skip_this_line(self, self->file_lines)) { + self->state = SKIP_LINE; if (c == '\n') { - END_LINE() - } - else { - self->state = SKIP_LINE; + END_LINE(); } break; } else if (c == '\n') { From 94fec1ec213a8e88401766022ffefb0ba60d0bb6 Mon Sep 17 00:00:00 2001 From: lucas Date: Wed, 8 Apr 2015 21:47:13 +0800 Subject: [PATCH 033/239] BUG: Issue 9798 fixed BUG: #9798 `index_col` shouldn't accept the value `True` move the check to ``TextFileReader`` ``clean_options`` --- doc/source/whatsnew/v0.16.1.txt | 5 +++++ pandas/io/parsers.py | 2 ++ pandas/io/tests/test_parsers.py | 5 +++++ 3 files changed, 12 insertions(+) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index ec8a43bc1b0a2..c802e63e962f1 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -106,4 +106,9 @@ Bug Fixes >>>>>>> f00d6bb... Fixed bug #9671 where 'DataFrame.plot()' raised an error when both 'color' and 'style' keywords were passed and there was no color symbol in the style strings (this should be allowed) ======= - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`) +<<<<<<< HEAD >>>>>>> e67893f... BUG: skiprows doesn't handle blank lines properly when engine='c' (GH #9832) +======= + +- Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) +>>>>>>> 53f2ea4... 
BUG: Issue 9798 fixed diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 99fb24ebf91dd..45a85bb63f12c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -652,6 +652,8 @@ def _clean_options(self, options, engine): # really delete this one keep_default_na = result.pop('keep_default_na') + if index_col is True: + raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): if not isinstance(index_col, (list, tuple, np.ndarray)): index_col = [index_col] diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index e65f3ffd7c5a5..33579d2d64b29 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -521,6 +521,11 @@ def test_usecols_index_col_False(self): df = self.read_csv(StringIO(s_malformed), usecols=cols, index_col=False) tm.assert_frame_equal(expected, df) + def test_index_col_is_True(self): + # Issue 9798 + self.assertRaises(ValueError, self.read_csv, StringIO(self.ts_data), + index_col=True) + def test_converter_index_col_bug(self): # 1835 data = "A;B\n1;2\n3;4" From b92c341d877ae4a10d43b4ff108d83396a7ed741 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 9 Apr 2015 22:39:39 +0900 Subject: [PATCH 034/239] DOC: Fix release note for v0.16 --- doc/source/whatsnew/v0.16.0.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index aa35434802799..f9bef3d9c7f4a 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -474,10 +474,11 @@ Other API Changes - ``Series.values_counts`` and ``Series.describe`` for categorical data will now put ``NaN`` entries at the end. (:issue:`9443`) - ``Series.describe`` for categorical data will now give counts and frequencies of 0, not ``NaN``, for unused categories (:issue:`9443`) -- Due to a bug fix, looking up a partial string label with ``DatetimeIndex.asof`` now includes values that match the string, even if they are after the start of the partial string label (:issue:`9258`). Old behavior: +- Due to a bug fix, looking up a partial string label with ``DatetimeIndex.asof`` now includes values that match the string, even if they are after the start of the partial string label (:issue:`9258`). - .. ipython:: python - :verbatim: + Old behavior: + + .. 
code-block:: python In [4]: pd.to_datetime(['2000-01-31', '2000-02-28']).asof('2000-02') Out[4]: Timestamp('2000-01-31 00:00:00') From 669cb13766dab250deed2ba4260f755bc90070d0 Mon Sep 17 00:00:00 2001 From: David Stephens Date: Tue, 7 Apr 2015 21:43:29 -0700 Subject: [PATCH 035/239] TST: Fix tests in TestGoogle --- pandas/io/tests/test_data.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 70a25a45c0ad4..9b27d612cdeee 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -33,7 +33,7 @@ def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in compat.iteritems(obj))) n_all_nan_cols = all_nan_cols.sum() - valid_warnings = pd.Series([wng for wng in wngs if isinstance(wng, cls)]) + valid_warnings = pd.Series([wng for wng in wngs if wng.category == cls]) assert_equal(len(valid_warnings), n_all_nan_cols) failed_symbols = all_nan_cols[all_nan_cols].index msgs = valid_warnings.map(lambda x: x.message) @@ -79,7 +79,7 @@ def test_get_goog_volume(self): for locale in self.locales: with tm.set_locale(locale): df = web.get_data_google('GOOG').sort_index() - self.assertEqual(df.Volume.ix['OCT-08-2010'], 2863473) + self.assertEqual(df.Volume.ix['JAN-02-2015'], 1446662) @network def test_get_multi1(self): @@ -87,10 +87,10 @@ def test_get_multi1(self): sl = ['AAPL', 'AMZN', 'GOOG'] with tm.set_locale(locale): pan = web.get_data_google(sl, '2012') - ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG] + ts = pan.Close.GOOG.index[pan.Close.AAPL < pan.Close.GOOG] if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and hasattr(pan.Close, 'AAPL')): - self.assertEqual(ts[0].dayofyear, 96) + self.assertEqual(ts[0].dayofyear, 3) else: self.assertRaises(AttributeError, lambda: pan.Close) @@ -135,7 +135,7 @@ def test_dtypes(self): def test_unicode_date(self): #GH8967 data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertEquals(data.index.name, 'Date') + self.assertEqual(data.index.name, 'Date') class TestYahoo(tm.TestCase): From d6a4ee60553fe7f9652ffe8be6f08c701cb81dfa Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Tue, 31 Mar 2015 07:09:43 -0400 Subject: [PATCH 036/239] ENH: Allow conversion of datetime64 and timedelta64 to string in astype (GH 9757) --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/core/common.py | 12 ++++++------ pandas/tests/test_frame.py | 27 +++++++++++++++++++++++++++ pandas/tests/test_series.py | 13 +++++++++++++ 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index c802e63e962f1..5fb6eece53b51 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -30,6 +30,7 @@ Enhancements df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) df.drop(['A', 'X'], axis=1, errors='ignore') +- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) .. 
_whatsnew_0161.api: diff --git a/pandas/core/common.py b/pandas/core/common.py index ec805aba34d48..0fb35c2fb02fc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2637,7 +2637,12 @@ def _astype_nansafe(arr, dtype, copy=True): if not isinstance(dtype, np.dtype): dtype = _coerce_to_dtype(dtype) - if is_datetime64_dtype(arr): + if issubclass(dtype.type, compat.text_type): + # in Py3 that's str, in Py2 that's unicode + return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + elif issubclass(dtype.type, compat.string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + elif is_datetime64_dtype(arr): if dtype == object: return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: @@ -2675,11 +2680,6 @@ def _astype_nansafe(arr, dtype, copy=True): elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): # work around NumPy brokenness, #1987 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - elif issubclass(dtype.type, compat.text_type): - # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) - elif issubclass(dtype.type, compat.string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) if copy: return arr.astype(dtype) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b8bdd2d4e3b40..94d3ed72ee427 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4192,6 +4192,33 @@ def test_astype_cast_nan_int(self): df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]}) self.assertRaises(ValueError, df.astype, np.int64) + def test_astype_str(self): + # GH9757 + dts = Series(date_range('2010-01-04', periods=5)) + tds = Series([Timedelta(x, unit='d') for x in range(5)]) + ns = Series(range(5)) + fs = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + + df = DataFrame({ + 'dts' : dts.values, + 'tds' : tds.values, + 'ns' : ns.values, + 'fs' : fs.values, + }) + + # Test str and unicode on python 2.x and just str on python 3.x + for tt in set([str, compat.text_type]): + result = df.astype(tt) + + expected = DataFrame({ + 'dts' : list(map(tt, dts.values)), + 'tds' : list(map(tt, tds.values)), + 'ns' : list(map(tt, ns.values)), + 'fs' : list(map(tt, fs.values)), + }) + + assert_frame_equal(result, expected) + def test_array_interface(self): result = np.sqrt(self.frame) tm.assert_isinstance(result, type(self.frame)) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index f044fe540ea24..1c64b0d60d23f 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5511,6 +5511,19 @@ def test_astype_str(self): expec = s.map(compat.text_type) assert_series_equal(res, expec) + # GH9757 + # Test str and unicode on python 2.x and just str on python 3.x + for tt in set([str, compat.text_type]): + ts = Series([Timestamp('2010-01-04 00:00:00')]) + s = ts.astype(tt) + expected = Series([tt(ts.values[0])]) + assert_series_equal(s, expected) + + td = Series([Timedelta(1, unit='d')]) + s = td.astype(tt) + expected = Series([tt(td.values[0])]) + assert_series_equal(s, expected) + def test_astype_unicode(self): # GH7758 From 64a9ef357e28c68407c3e3fc06ff1d06b932d3c9 Mon Sep 17 00:00:00 2001 From: Evan Wright Date: Thu, 9 Apr 2015 23:53:39 -0400 Subject: [PATCH 037/239] Add tests with time zones, and clean up DataFrame test --- pandas/tests/test_frame.py | 25 +++++++++++-------------- pandas/tests/test_series.py | 5 +++++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/tests/test_frame.py 
b/pandas/tests/test_frame.py index 94d3ed72ee427..6ea76710b4de7 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -4194,27 +4194,24 @@ def test_astype_cast_nan_int(self): def test_astype_str(self): # GH9757 - dts = Series(date_range('2010-01-04', periods=5)) - tds = Series([Timedelta(x, unit='d') for x in range(5)]) - ns = Series(range(5)) - fs = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + a = Series(date_range('2010-01-04', periods=5)) + b = Series(date_range('3/6/2012 00:00', periods=5, tz='US/Eastern')) + c = Series([Timedelta(x, unit='d') for x in range(5)]) + d = Series(range(5)) + e = Series([0.0, 0.2, 0.4, 0.6, 0.8]) - df = DataFrame({ - 'dts' : dts.values, - 'tds' : tds.values, - 'ns' : ns.values, - 'fs' : fs.values, - }) + df = DataFrame({'a' : a, 'b' : b, 'c' : c, 'd' : d, 'e' : e}) # Test str and unicode on python 2.x and just str on python 3.x for tt in set([str, compat.text_type]): result = df.astype(tt) expected = DataFrame({ - 'dts' : list(map(tt, dts.values)), - 'tds' : list(map(tt, tds.values)), - 'ns' : list(map(tt, ns.values)), - 'fs' : list(map(tt, fs.values)), + 'a' : list(map(tt, a.values)), + 'b' : list(map(tt, b.values)), + 'c' : list(map(tt, c.values)), + 'd' : list(map(tt, d.values)), + 'e' : list(map(tt, e.values)), }) assert_frame_equal(result, expected) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1c64b0d60d23f..fec98a37b5017 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5519,6 +5519,11 @@ def test_astype_str(self): expected = Series([tt(ts.values[0])]) assert_series_equal(s, expected) + ts = Series([Timestamp('2010-01-04 00:00:00', tz='US/Eastern')]) + s = ts.astype(tt) + expected = Series([tt(ts.values[0])]) + assert_series_equal(s, expected) + td = Series([Timedelta(1, unit='d')]) s = td.astype(tt) expected = Series([tt(td.values[0])]) From b8061901faf49d400145817e744bfc7b391a5163 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Mon, 16 Mar 2015 20:03:45 -0700 Subject: [PATCH 038/239] ENH: add StringMethods (.str accessor) to Index, fixes #9068 --- doc/source/text.rst | 11 +++++++++-- doc/source/whatsnew/v0.16.1.txt | 19 +++++++++++++++++- pandas/core/base.py | 19 ++++++++++++++++++ pandas/core/series.py | 16 ---------------- pandas/core/strings.py | 28 +++++++++++++++++++-------- pandas/tests/test_index.py | 34 +++++++++++++++++++++++++++++++++ pandas/tests/test_series.py | 13 +++++++++++++ 7 files changed, 113 insertions(+), 27 deletions(-) diff --git a/doc/source/text.rst b/doc/source/text.rst index a98153e277fae..ee91ea3c166b6 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -17,10 +17,10 @@ Working with Text Data .. _text.string_methods: -Series is equipped with a set of string processing methods +Series and Index are equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. These are -accessed via the Series's ``str`` attribute and generally have names matching +accessed via the ``str`` attribute and generally have names matching the equivalent (scalar) built-in string methods: .. ipython:: python @@ -30,6 +30,13 @@ the equivalent (scalar) built-in string methods: s.str.upper() s.str.len() +.. 
ipython:: python + + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + idx.str.lstrip() + idx.str.rstrip() + Splitting and Replacing Strings ------------------------------- diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 5fb6eece53b51..382c78c496009 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -18,11 +18,28 @@ Enhancements ~~~~~~~~~~~~ - Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`) +- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) -- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) + The `.str` accessor is now available for both `Series` and `Index`. + + .. ipython:: python + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + One special case for the `.str` accessor on `Index` is that if a string method returns `bool`, the `.str` accessor + will return a `np.array` instead of a boolean `Index` (:issue:`8875`). This enables the following expression + to work naturally: + .. ipython:: python + + idx = Index(['a1', 'a2', 'b1', 'b2']) + s = Series(range(4), index=idx) + s + idx.str.startswith('a') + s[s.index.str.startswith('a')] + +- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) - ``drop`` function can now accept ``errors`` keyword to suppress ValueError raised when any of label does not exist in the target data. (:issue:`6736`) .. ipython:: python diff --git a/pandas/core/base.py b/pandas/core/base.py index dde2e74132c4b..a3d3c3791e20c 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -10,6 +10,7 @@ import pandas.tslib as tslib import pandas.lib as lib from pandas.util.decorators import Appender, cache_readonly +from pandas.core.strings import StringMethods _shared_docs = dict() @@ -497,6 +498,24 @@ def searchsorted(self, key, side='left'): #### needs tests/doc-string return self.values.searchsorted(key, side=side) + # string methods + def _make_str_accessor(self): + from pandas.core.series import Series + from pandas.core.index import Index + if isinstance(self, Series) and not com.is_object_dtype(self.dtype): + # this really should exclude all series with any non-string values, + # but that isn't practical for performance reasons until we have a + # str dtype (GH 9343) + raise AttributeError("Can only use .str accessor with string " + "values, which use np.object_ dtype in " + "pandas") + elif isinstance(self, Index) and self.inferred_type != 'string': + raise AttributeError("Can only use .str accessor with string " + "values (i.e. 
inferred_type is 'string')") + return StringMethods(self) + + str = AccessorProperty(StringMethods, _make_str_accessor) + _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed diff --git a/pandas/core/series.py b/pandas/core/series.py index 68f3a6032402f..b71c269468d62 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,7 +28,6 @@ from pandas.core import generic, base from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical, CategoricalAccessor -from pandas.core.strings import StringMethods from pandas.tseries.common import (maybe_to_datetimelike, CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex @@ -2494,21 +2493,6 @@ def to_period(self, freq=None, copy=True): return self._constructor(new_values, index=new_index).__finalize__(self) - #------------------------------------------------------------------------------ - # string methods - - def _make_str_accessor(self): - if not com.is_object_dtype(self.dtype): - # this really should exclude all series with any non-string values, - # but that isn't practical for performance reasons until we have a - # str dtype (GH 9343) - raise AttributeError("Can only use .str accessor with string " - "values, which use np.object_ dtype in " - "pandas") - return StringMethods(self) - - str = base.AccessorProperty(StringMethods, _make_str_accessor) - #------------------------------------------------------------------------------ # Datetimelike delegation methods diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4ef341c481a60..6d20907373014 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,7 +1,7 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import isnull, _values_from_object +from pandas.core.common import isnull, _values_from_object, is_bool_dtype import pandas.compat as compat from pandas.util.decorators import Appender import re @@ -632,9 +632,10 @@ def str_split(arr, pat=None, n=None, return_type='series'): pat : string, default None String or regular expression to split on. If None, splits on whitespace n : int, default None (all) - return_type : {'series', 'frame'}, default 'series + return_type : {'series', 'index', 'frame'}, default 'series' If frame, returns a DataFrame (elements are strings) - If series, returns an Series (elements are lists of strings). + If series or index, returns the same type as the original object + (elements are lists of strings). Notes ----- @@ -646,9 +647,13 @@ def str_split(arr, pat=None, n=None, return_type='series'): """ from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index - if return_type not in ('series', 'frame'): - raise ValueError("return_type must be {'series', 'frame'}") + if return_type not in ('series', 'index', 'frame'): + raise ValueError("return_type must be {'series', 'index', 'frame'}") + if return_type == 'frame' and isinstance(arr, Index): + raise ValueError("return_type='frame' is not supported for string " + "methods on Index") if pat is None: if n is None or n == 0: n = -1 @@ -928,9 +933,9 @@ def do_copy(target): class StringMethods(object): """ - Vectorized string functions for Series. NAs stay NA unless handled - otherwise by a particular method. Patterned after Python's string methods, - with some inspiration from R's stringr package. + Vectorized string functions for Series and Index. NAs stay NA unless + handled otherwise by a particular method. 
Patterned after Python's string + methods, with some inspiration from R's stringr package. Examples -------- @@ -959,11 +964,18 @@ def __iter__(self): def _wrap_result(self, result): from pandas.core.series import Series from pandas.core.frame import DataFrame + from pandas.core.index import Index if not hasattr(result, 'ndim'): return result elif result.ndim == 1: name = getattr(result, 'name', None) + if isinstance(self.series, Index): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + return Index(result, name=name or self.series.name) return Series(result, index=self.series.index, name=name or self.series.name) else: diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 61cb337880c00..bb75b12754dca 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1197,6 +1197,40 @@ def test_join_self(self): for kind in kinds: joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + expected = Index([getattr(str, method)(x) for x in idx.values]) + tm.assert_index_equal(getattr(Index.str, method)(idx.str), expected) + + # create a few instances that are not able to use .str accessor + indices = [Index(range(5)), + tm.makeDateIndex(10), + MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]), + PeriodIndex(start='2000', end='2010', freq='A')] + for idx in indices: + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + idx.str.repeat(2) + + idx = Index(['a b c', 'd e', 'f']) + expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']]) + tm.assert_index_equal(idx.str.split(), expected) + tm.assert_index_equal(idx.str.split(return_type='series'), expected) + # return_type 'index' is an alias for 'series' + tm.assert_index_equal(idx.str.split(return_type='index'), expected) + with self.assertRaisesRegexp(ValueError, 'not supported'): + idx.str.split(return_type='frame') + + # test boolean case, should return np.array instead of boolean Index + idx = Index(['a1', 'a2', 'b1', 'b2']) + expected = np.array([True, True, False, False]) + self.assert_array_equal(idx.str.startswith('a'), expected) + self.assertIsInstance(idx.str.startswith('a'), np.ndarray) + s = Series(range(4), index=idx) + expected = Series(range(2), index=['a1', 'a2']) + tm.assert_series_equal(s[s.index.str.startswith('a')], expected) def test_indexing_doesnt_change_class(self): idx = Index([1, 2, 3, 'a', 'b', 'c']) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index fec98a37b5017..70a6e2541692a 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -4933,6 +4933,19 @@ def test_to_csv_path_is_none(self): csv_str = s.to_csv(path=None) self.assertIsInstance(csv_str, str) + def test_str_attribute(self): + # GH9068 + methods = ['strip', 'rstrip', 'lstrip'] + s = Series([' jack', 'jill ', ' jesse ', 'frank']) + for method in methods: + expected = Series([getattr(str, method)(x) for x in s.values]) + assert_series_equal(getattr(Series.str, method)(s.str), expected) + + # str accessor only valid with string values + s = Series(range(5)) + with self.assertRaisesRegexp(AttributeError, 'only use .str accessor'): + s.str.repeat(2) + def test_clip(self): val = self.ts.median() From 44861eb4367ad3a24d61a5628b0879b7c5d55e72 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 11 
Apr 2015 17:39:46 +0900 Subject: [PATCH 039/239] BUG: plot(kind=hist) results in TypeError for non-numeric data --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/tests/test_graphics.py | 12 ++++++++++++ pandas/tools/plotting.py | 3 ++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 382c78c496009..2fa02d70b6d4b 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -107,6 +107,7 @@ Bug Fixes - Bug in plotting continuously using ``secondary_y`` may not show legend properly. (:issue:`9610`, :issue:`9779`) +- Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`) - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 36c19cd39f76c..7d489ce66c288 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -678,6 +678,18 @@ def test_hist_df_kwargs(self): ax = df.plot(kind='hist', bins=5) self.assertEqual(len(ax.patches), 10) + @slow + def test_hist_df_with_nonnumerics(self): + # GH 9853 + with tm.RNGContext(1): + df = DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) + df['E'] = ['x', 'y'] * 5 + ax = df.plot(kind='hist', bins=5) + self.assertEqual(len(ax.patches), 20) + + ax = df.plot(kind='hist') # bins=10 + self.assertEqual(len(ax.patches), 40) + @slow def test_hist_legacy(self): _check_plot_works(self.ts.hist) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 358c5b0dd5940..1accc48b0d3c4 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1948,7 +1948,8 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): def _args_adjust(self): if com.is_integer(self.bins): # create common bin edge - values = np.ravel(self.data.values) + values = self.data.convert_objects()._get_numeric_data() + values = np.ravel(values) values = values[~com.isnull(values)] hist, self.bins = np.histogram(values, bins=self.bins, From 67ac08a66da96ab9778e505e1d546a40e56bc79f Mon Sep 17 00:00:00 2001 From: Artemy Kolchinsky Date: Wed, 15 Oct 2014 14:39:51 -0400 Subject: [PATCH 040/239] Fix to allow sparse dataframes to have nan column labels Support for nan columns Fix Trigger Travis CI jreback fixes Release note update --- doc/source/whatsnew/v0.16.1.txt | 6 ++++++ pandas/sparse/frame.py | 18 +++++++++--------- pandas/sparse/tests/test_sparse.py | 6 ++++++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 2fa02d70b6d4b..d22542d01fb0f 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -130,4 +130,10 @@ Bug Fixes ======= - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) +<<<<<<< HEAD >>>>>>> 53f2ea4... BUG: Issue 9798 fixed +======= + +- Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) + +>>>>>>> 7879205... 
Fix to allow sparse dataframes to have nan column labels diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 30b06c8a93142..bc022fcb6542b 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -100,7 +100,7 @@ def __init__(self, data=None, index=None, columns=None, mgr = self._init_mgr( data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy) elif data is None: - data = {} + data = DataFrame() if index is None: index = Index([]) @@ -115,7 +115,7 @@ def __init__(self, data=None, index=None, columns=None, index=index, kind=self._default_kind, fill_value=self._default_fill_value) - mgr = dict_to_manager(data, columns, index) + mgr = df_to_manager(data, columns, index) if dtype is not None: mgr = mgr.astype(dtype) @@ -155,7 +155,7 @@ def _init_dict(self, data, index, columns, dtype=None): kind=self._default_kind, fill_value=self._default_fill_value, copy=True) - sdict = {} + sdict = DataFrame() for k, v in compat.iteritems(data): if isinstance(v, Series): # Force alignment, no copy necessary @@ -181,7 +181,7 @@ def _init_dict(self, data, index, columns, dtype=None): if c not in sdict: sdict[c] = sp_maker(nan_vec) - return dict_to_manager(sdict, columns, index) + return df_to_manager(sdict, columns, index) def _init_matrix(self, data, index, columns, dtype=None): data = _prep_ndarray(data, copy=False) @@ -228,12 +228,12 @@ def _unpickle_sparse_frame_compat(self, state): else: index = idx - series_dict = {} + series_dict = DataFrame() for col, (sp_index, sp_values) in compat.iteritems(series): series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, fill_value=fv) - self._data = dict_to_manager(series_dict, columns, index) + self._data = df_to_manager(series_dict, columns, index) self._default_fill_value = fv self._default_kind = kind @@ -737,13 +737,13 @@ def applymap(self, func): """ return self.apply(lambda x: lmap(func, x)) -def dict_to_manager(sdict, columns, index): - """ create and return the block manager from a dict of series, columns, index """ +def df_to_manager(sdf, columns, index): + """ create and return the block manager from a dataframe of series, columns, index """ # from BlockManager perspective axes = [_ensure_index(columns), _ensure_index(index)] - return create_block_manager_from_arrays([sdict[c] for c in columns], columns, axes) + return create_block_manager_from_arrays([sdf[c] for c in columns], columns, axes) def stack_sparse_frame(frame): diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py index f187e7f883e11..454cbcd5320e9 100644 --- a/pandas/sparse/tests/test_sparse.py +++ b/pandas/sparse/tests/test_sparse.py @@ -1663,6 +1663,12 @@ def test_as_blocks(self): self.assertEqual(list(df_blocks.keys()), ['float64']) assert_frame_equal(df_blocks['float64'], df) + def test_nan_columnname(self): + # GH 8822 + nan_colname = DataFrame(Series(1.0,index=[0]),columns=[nan]) + nan_colname_sparse = nan_colname.to_sparse() + self.assertTrue(np.isnan(nan_colname_sparse.columns[0])) + def _dense_series_compare(s, f): result = f(s) From 8d495e597492e9f00c09d169596e7c6e0b9384b6 Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 10 Apr 2015 16:28:50 +0200 Subject: [PATCH 041/239] Fix: unequal comparisons of categorical and scalar Before, unequal comparisons were not checking the order of the categories. This was due to a conversion to an ndarray, which turned the comparison to one between ndarray and scalar, which of course has no categories to take into account. 
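For illustration, a minimal sketch of the corrected behaviour (mirroring the
new tests below):

.. code-block:: python

    import pandas as pd

    s = pd.Series(pd.Categorical(list("abc"), categories=list("cba"), ordered=True))
    # with categories ordered c < b < a, only "a" compares greater than "b",
    # so the result follows the category order, not lexicographic order
    s > "b"  # [True, False, False]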
Also add test cases and remove the one which actually tested the wrong behaviour.
---
 doc/source/whatsnew/v0.16.1.txt  |  4 ++++
 pandas/core/ops.py               | 28 +++++++++++++++----------
 pandas/tests/test_categorical.py | 35 +++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 14 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index d22542d01fb0f..f67d4a88881c9 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -136,4 +136,8 @@ Bug Fixes

 - Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`)

<<<<<<< HEAD
>>>>>>> 7879205... Fix to allow sparse dataframes to have nan column labels
=======
+- Bug in unequal comparisons between a ``Series`` of dtype ``"category"`` and a scalar (e.g. ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``), which wouldn't use the order of the categories but the lexicographical order (:issue:`9848`)
>>>>>>> f0ac930... Fix: unequal comparisons of categorical and scalar

diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index 954d2c8a77326..2af9cd43faaef 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -594,20 +594,26 @@ def wrapper(self, other):

         mask = isnull(self)

-        values = self.get_values()
-        other = _index.convert_scalar(values,_values_from_object(other))
+        if com.is_categorical_dtype(self):
+            # cats are a special case as get_values() would return an ndarray, which would then
+            # not take categories ordering into account
+            # we can go directly to op, as the na_op would just test again and dispatch to it.
+            res = op(self.values, other)
+        else:
+            values = self.get_values()
+            other = _index.convert_scalar(values,_values_from_object(other))

-        if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
-            values = values.view('i8')
+            if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
+                values = values.view('i8')

-        # scalars
-        res = na_op(values, other)
-        if np.isscalar(res):
-            raise TypeError('Could not compare %s type with Series'
-                            % type(other))
+            # scalars
+            res = na_op(values, other)
+            if np.isscalar(res):
+                raise TypeError('Could not compare %s type with Series'
+                                % type(other))

-        # always return a full value series here
-        res = _values_from_object(res)
+            # always return a full value series here
+            res = _values_from_object(res)

         res = pd.Series(res, index=self.index, name=self.name,
                         dtype='bool')

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 7f4b3fcb94dfa..4c5678bf6633f 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -114,6 +114,9 @@ def f():
             Categorical([1,2], [1,2,np.nan, np.nan])
         self.assertRaises(ValueError, f)

+        # The default should be unordered
+        c1 = Categorical(["a", "b", "c", "a"])
+        self.assertFalse(c1.ordered)

         # Categorical as input
         c1 = Categorical(["a", "b", "c", "a"])
@@ -367,6 +370,13 @@ def f():
         self.assertRaises(TypeError, lambda: a < cat)
         self.assertRaises(TypeError, lambda: a < cat_rev)

+        # Make sure that unequal comparisons take the categories order into account
+        cat_rev = pd.Categorical(list("abc"), categories=list("cba"), ordered=True)
+        exp = np.array([True, False, False])
+        res = cat_rev > "b"
+        self.assert_numpy_array_equal(res, exp)
+
+
     def test_na_flags_int_categories(self):
         # #1457

@@ -2390,6 +2400,18 @@ def test_comparisons(self):
         exp = Series([False, False, True])
         tm.assert_series_equal(res, exp)

+        scalar = base[1]
+        res = cat > scalar
+        exp = Series([False, False, True])
+ exp2 = cat.values > scalar + tm.assert_series_equal(res, exp) + tm.assert_numpy_array_equal(res.values, exp2) + res_rev = cat_rev > scalar + exp_rev = Series([True, False, False]) + exp_rev2 = cat_rev.values > scalar + tm.assert_series_equal(res_rev, exp_rev) + tm.assert_numpy_array_equal(res_rev.values, exp_rev2) + # Only categories with same categories can be compared def f(): cat > cat_rev @@ -2408,9 +2430,16 @@ def f(): self.assertRaises(TypeError, lambda: a < cat) self.assertRaises(TypeError, lambda: a < cat_rev) - # Categoricals can be compared to scalar values - res = cat_rev > base[0] - tm.assert_series_equal(res, exp) + # unequal comparison should raise for unordered cats + cat = Series(Categorical(list("abc"))) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + cat = Series(Categorical(list("abc"), ordered=False)) + def f(): + cat > "b" + self.assertRaises(TypeError, f) + # And test NaN handling... cat = Series(Categorical(["a","b","c", np.nan])) From abfdeda67b09c41abdc01dd957669bd7973d9fa0 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Sat, 11 Apr 2015 11:47:02 -0700 Subject: [PATCH 042/239] DOC: add more examples to StringMethods on Index --- doc/source/text.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/source/text.rst b/doc/source/text.rst index ee91ea3c166b6..f417f56f51fbc 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -37,6 +37,32 @@ the equivalent (scalar) built-in string methods: idx.str.lstrip() idx.str.rstrip() +The string methods on Index are especially useful for cleaning up or +transforming DataFrame columns. For instance, you may have columns with +leading or trailing whitespace: + +.. ipython:: python + + df = DataFrame(randn(3, 2), columns=[' Column A ', ' Column B '], + index=range(3)) + df + +Since ``df.columns`` is an Index object, we can use the ``.str`` accessor + +.. ipython:: python + + df.columns.str.strip() + df.columns.str.lower() + +These string methods can then be used to clean up the columns as needed. +Here we are removing leading and trailing whitespaces, lowercasing all names, +and replacing any remaining whitespaces with underscores: + +.. ipython:: python + + df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') + df + Splitting and Replacing Strings ------------------------------- From a835b5ad77bea0d2839296f13b986a12346e8acb Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Thu, 9 Apr 2015 18:59:02 -0400 Subject: [PATCH 043/239] memory access bug in read_csv causing segfault --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/io/tests/test_cparser.py | 22 +++++++++ pandas/parser.pyx | 79 +++++++++++---------------------- pandas/src/parser/tokenizer.c | 30 ++++++------- pandas/src/parser/tokenizer.h | 9 ++-- 5 files changed, 69 insertions(+), 72 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index f67d4a88881c9..0062eb421890f 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -91,6 +91,7 @@ Bug Fixes - Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated. - Bug in json serialization when frame has length zero.(:issue:`9805`) +- Bug in `read_csv` where missing trailing delimiters would cause segfault. 
(:issue:`5664`) - Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index ad6f071d738ff..93d55c654de90 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -336,6 +336,28 @@ def test_empty_field_eof(self): 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) + # GH5664 + a = DataFrame([['b'], [nan]], columns=['a'], index=['a', 'c']) + b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], + columns=list('abcd'), + index=[1, 1]) + c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], + [8, 9, 10, 11], [13, 14, nan, nan]], + columns=list('abcd'), + index=[0, 5, 7, 12]) + + for _ in range(100): + df = read_csv(StringIO('a,b\nc\n'), skiprows=0, + names=['a'], engine='c') + assert_frame_equal(df, a) + + df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2), + names=list("abcd"), engine='c') + assert_frame_equal(df, b) + + df = read_csv(StringIO('0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14'), + names=list('abcd'), engine='c') + assert_frame_equal(df, c) def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d13781d6fa132..73a03fc5cef7c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -175,7 +175,7 @@ cdef extern from "parser/tokenizer.h": int col void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) - char* COLITER_NEXT(coliter_t it) + void COLITER_NEXT(coliter_t, const char *) parser_t* parser_new() @@ -212,7 +212,7 @@ cdef extern from "parser/tokenizer.h": inline int to_longlong(char *item, long long *p_value) # inline int to_longlong_thousands(char *item, long long *p_value, # char tsep) - int to_boolean(char *item, uint8_t *val) + int to_boolean(const char *item, uint8_t *val) cdef extern from "parser/io.h": @@ -1279,7 +1279,7 @@ cdef _string_box_factorize(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1296,7 +1296,7 @@ cdef _string_box_factorize(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1333,7 +1333,7 @@ cdef _string_box_utf8(parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1350,7 +1350,7 @@ cdef _string_box_utf8(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1388,7 +1388,7 @@ cdef _string_box_decode(parser_t *parser, int col, Py_ssize_t i, size size_t lines coliter_t it - char *word + const char *word = NULL ndarray[object] result int ret = 0 @@ -1407,7 +1407,7 @@ cdef _string_box_decode(parser_t *parser, int col, coliter_setup(&it, parser, col, line_start) for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) if na_filter: k = kh_get_str(na_hashset, word) @@ -1444,7 +1444,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, int error Py_ssize_t i, j coliter_t it - char *word + const char *word = NULL char *data ndarray result @@ -1454,7 +1454,7 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, coliter_setup(&it, parser, col, line_start) for i in range(line_end - line_start): - word = COLITER_NEXT(it) + 
COLITER_NEXT(it, word) strncpy(data, word, width) data += width @@ -1469,7 +1469,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL char *p_end double *data double NA = na_values[np.float64] @@ -1485,7 +1485,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1509,7 +1509,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: @@ -1530,7 +1530,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL int64_t *data ndarray result @@ -1544,7 +1544,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table if k != na_hashset.n_buckets: @@ -1561,7 +1561,7 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return None, None else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: @@ -1578,7 +1578,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1592,7 +1592,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1608,7 +1608,7 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) error = to_boolean(word, data) if error != 0: @@ -1625,7 +1625,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, int error, na_count = 0 size_t i, lines coliter_t it - char *word + const char *word = NULL uint8_t *data ndarray result @@ -1639,7 +1639,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, if na_filter: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(na_hashset, word) # in the hash table @@ -1667,7 +1667,7 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, data += 1 else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) k = kh_get_str(true_hashset, word) if k != true_hashset.n_buckets: @@ -1688,33 +1688,6 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, return result.view(np.bool_), na_count -cdef _get_na_mask(parser_t *parser, int col, int line_start, int line_end, - kh_str_t *na_hashset): - cdef: - int error - Py_ssize_t i - size_t lines - coliter_t it - char *word - ndarray[uint8_t, cast=True] result - khiter_t k - - lines = line_end - line_start - result = np.empty(lines, dtype=np.bool_) - - coliter_setup(&it, parser, col, line_start) - for i in range(lines): - 
word = COLITER_NEXT(it) - - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: - result[i] = 1 - else: - result[i] = 0 - - return result - cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1897,7 +1870,7 @@ cdef _apply_converter(object f, parser_t *parser, int col, Py_ssize_t i size_t lines coliter_t it - char *word + const char *word = NULL char *errors = "strict" ndarray[object] result object val @@ -1909,17 +1882,17 @@ cdef _apply_converter(object f, parser_t *parser, int col, if not PY3 and c_encoding == NULL: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyBytes_FromString(word) result[i] = f(val) elif ((PY3 and c_encoding == NULL) or c_encoding == b'utf-8'): for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_FromString(word) result[i] = f(val) else: for i in range(lines): - word = COLITER_NEXT(it) + COLITER_NEXT(it, word) val = PyUnicode_Decode(word, strlen(word), c_encoding, errors) result[i] = f(val) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1bc4096658b29..1850aab50b55a 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -38,7 +38,7 @@ See LICENSE for the license * RESTORE_FINAL (2): * Put the file position at the next byte after the * data read from the file_buffer. -* +* #define RESTORE_NOT 0 #define RESTORE_INITIAL 1 #define RESTORE_FINAL 2 @@ -304,7 +304,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->stream_len, &self->stream_cap, nbytes * 2, sizeof(char), &status); - TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { @@ -334,7 +334,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words_len, &self->words_cap, nbytes, sizeof(char*), &status); - TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -371,7 +371,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->lines + 1, &self->lines_cap, nbytes, sizeof(int), &status); - TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; @@ -398,7 +398,7 @@ static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { - TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", + TRACE(("push_char: ERROR!!! 
self->stream_len(%d) >= self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) self->error_msg = (char*) malloc(64); sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); @@ -463,7 +463,6 @@ static void append_warning(parser_t *self, const char *msg) { static int end_line(parser_t *self) { int fields; - khiter_t k; /* for hash set detection */ int ex_fields = self->expected_fields; char *msg; @@ -483,7 +482,7 @@ static int end_line(parser_t *self) { TRACE(("end_line: Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; - + // skip the tokens from this bad line self->line_start[self->lines] += fields; @@ -605,12 +604,11 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) { static int parser_buffer_bytes(parser_t *self, size_t nbytes) { int status; size_t bytes_read; - void *src = self->source; status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); - TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", nbytes, bytes_read, status)); self->datalen = bytes_read; @@ -704,7 +702,7 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { - return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != ((kh_int64_t*)self->skipset)->n_buckets ); } else { @@ -784,7 +782,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) else self->state = EAT_CRNL; break; - } + } else if (c == self->commentchar) { self->state = EAT_LINE_COMMENT; break; @@ -1750,7 +1748,7 @@ int parser_trim_buffers(parser_t *self) { /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); @@ -1871,7 +1869,7 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", self->datalen - self->datapos, self->datalen, self->datapos)); /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ @@ -2033,7 +2031,7 @@ int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep) return status; }*/ -int to_boolean(char *item, uint8_t *val) { +int to_boolean(const char *item, uint8_t *val) { char *tmp; int i, status = 0; @@ -2357,7 +2355,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, num_digits++; num_decimals++; } - + if (num_digits >= max_digits) // consume extra decimal digits while (isdigit(*p)) ++p; @@ -2653,4 +2651,4 @@ uint64_t str_to_uint64(const char *p_item, uint64_t uint_max, int *error) *error = 0; return number; } -*/ \ No newline at end of file +*/ diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 694a73ec78153..d3777e858b6ca 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -228,9 +228,12 @@ coliter_t *coliter_new(parser_t *self, int i); /* #define COLITER_NEXT(iter) 
iter->words[iter->line_start[iter->line++] + iter->col] */ // #define COLITER_NEXT(iter) iter.words[iter.line_start[iter.line++] + iter.col] -#define COLITER_NEXT(iter) iter.words[*iter.line_start++ + iter.col] +#define COLITER_NEXT(iter, word) do { \ + const int i = *iter.line_start++ + iter.col; \ + word = i < *iter.line_start ? iter.words[i]: ""; \ + } while(0) -parser_t* parser_new(); +parser_t* parser_new(void); int parser_init(parser_t *self); @@ -270,6 +273,6 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, in //int P_INLINE to_complex(char *item, double *p_real, double *p_imag, char sci, char decimal); int P_INLINE to_longlong(char *item, long long *p_value); //int P_INLINE to_longlong_thousands(char *item, long long *p_value, char tsep); -int to_boolean(char *item, uint8_t *val); +int to_boolean(const char *item, uint8_t *val); #endif // _PARSER_COMMON_H_ From c2f61e0bb7100656559cd660d732d139e340ce66 Mon Sep 17 00:00:00 2001 From: David Stephens Date: Fri, 27 Mar 2015 19:18:10 -0700 Subject: [PATCH 044/239] BUG: raw_locales unreachable in util.testing.get_locales --- pandas/tests/test_util.py | 4 ++++ pandas/util/testing.py | 16 +++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index 2e22b33dc769a..bb8bd3df96b71 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -79,6 +79,10 @@ def test_warning(self): with tm.assert_produces_warning(FutureWarning): self.assertNotAlmostEquals(1, 2) + def test_locale(self): + #GH9744 + locales = pandas.util.testing.get_locales() + self.assertTrue(len(locales) >= 1) def test_rands(): r = tm.rands(10) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 3d9a0e7b43634..b4baedada46e1 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -331,19 +331,21 @@ def get_locales(prefix=None, normalize=True, # raw_locales is "\n" seperated list of locales # it may contain non-decodable parts, so split # extract what we can and then rejoin. - raw_locales = [] + raw_locales = raw_locales.split(b'\n') + out_locales = [] for x in raw_locales: - try: - raw_locales.append(str(x, encoding=pd.options.display.encoding)) - except: - pass + if compat.PY3: + out_locales.append(str(x, encoding=pd.options.display.encoding)) + else: + out_locales.append(str(x)) + except TypeError: pass if prefix is None: - return _valid_locales(raw_locales, normalize) + return _valid_locales(out_locales, normalize) - found = re.compile('%s.*' % prefix).findall('\n'.join(raw_locales)) + found = re.compile('%s.*' % prefix).findall('\n'.join(out_locales)) return _valid_locales(found, normalize) From ad3549d7fdb9cc2d5c39e258b5a0f89db277d2fe Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 11 Apr 2015 18:23:10 +0900 Subject: [PATCH 045/239] BUG/CLN: Repeated time-series plot may raise TypeError --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/tools/plotting.py | 107 ++++++++++---------------- pandas/tseries/tests/test_plotting.py | 32 ++++++++ 3 files changed, 75 insertions(+), 65 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 0062eb421890f..0374ade03290a 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -109,6 +109,7 @@ Bug Fixes - Bug in plotting continuously using ``secondary_y`` may not show legend properly. 
(:issue:`9610`, :issue:`9779`) - Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`) +- Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`) - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 1accc48b0d3c4..6a284e547433a 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -885,28 +885,16 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - from pandas.core.frame import DataFrame - if isinstance(data, (Series, np.ndarray, Index)): - label = self.label if self.label is not None else data.name + if self.sort_columns: + columns = com._try_sort(data.columns) + else: + columns = data.columns + + for col in columns: if keep_index is True: - yield label, data + yield col, data[col] else: - yield label, np.asarray(data) - elif isinstance(data, DataFrame): - if self.sort_columns: - columns = com._try_sort(data.columns) - else: - columns = data.columns - - for col in columns: - # # is this right? - # empty = df[col].count() == 0 - # values = df[col].values if not empty else np.zeros(len(df)) - - if keep_index is True: - yield col, data[col] - else: - yield col, data[col].values + yield col, data[col].values @property def nseries(self): @@ -1006,7 +994,15 @@ def result(self): return self.axes[0] def _compute_plot_data(self): - numeric_data = self.data.convert_objects()._get_numeric_data() + data = self.data + + if isinstance(data, Series): + label = self.kwds.pop('label', None) + if label is None and data.name is None: + label = 'None' + data = data.to_frame(name=label) + + numeric_data = data.convert_objects()._get_numeric_data() try: is_empty = numeric_data.empty @@ -1027,12 +1023,7 @@ def _add_table(self): if self.table is False: return elif self.table is True: - from pandas.core.frame import DataFrame - if isinstance(self.data, Series): - data = DataFrame(self.data, columns=[self.data.name]) - elif isinstance(self.data, DataFrame): - data = self.data - data = data.transpose() + data = self.data.transpose() else: data = self.table ax = self._get_ax(0) @@ -1099,18 +1090,15 @@ def _apply_axis_properties(self, axis, rot=None, fontsize=None): @property def legend_title(self): - if hasattr(self.data, 'columns'): - if not isinstance(self.data.columns, MultiIndex): - name = self.data.columns.name - if name is not None: - name = com.pprint_thing(name) - return name - else: - stringified = map(com.pprint_thing, - self.data.columns.names) - return ','.join(stringified) + if not isinstance(self.data.columns, MultiIndex): + name = self.data.columns.name + if name is not None: + name = com.pprint_thing(name) + return name else: - return None + stringified = map(com.pprint_thing, + self.data.columns.names) + return ','.join(stringified) def _add_legend_handle(self, handle, label, index=None): if not label is None: @@ -1256,12 +1244,10 @@ def _get_ax(self, i): return ax def on_right(self, i): - from pandas.core.frame import DataFrame if isinstance(self.secondary_y, bool): return self.secondary_y - if (isinstance(self.data, DataFrame) and - isinstance(self.secondary_y, (tuple, list, np.ndarray, Index))): + if isinstance(self.secondary_y, (tuple, list, np.ndarray, Index)): return 
self.data.columns[i] in self.secondary_y def _get_style(self, i, col_name): @@ -1553,16 +1539,14 @@ def __init__(self, data, **kwargs): self.x_compat = bool(self.kwds.pop('x_compat')) def _index_freq(self): - from pandas.core.frame import DataFrame - if isinstance(self.data, (Series, DataFrame)): - freq = getattr(self.data.index, 'freq', None) - if freq is None: - freq = getattr(self.data.index, 'inferred_freq', None) - if freq == 'B': - weekdays = np.unique(self.data.index.dayofweek) - if (5 in weekdays) or (6 in weekdays): - freq = None - return freq + freq = getattr(self.data.index, 'freq', None) + if freq is None: + freq = getattr(self.data.index, 'inferred_freq', None) + if freq == 'B': + weekdays = np.unique(self.data.index.dayofweek) + if (5 in weekdays) or (6 in weekdays): + freq = None + return freq def _is_dynamic_freq(self, freq): if isinstance(freq, DateOffset): @@ -1574,9 +1558,7 @@ def _is_dynamic_freq(self, freq): def _no_base(self, freq): # hack this for 0.10.1, creating more technical debt...sigh - from pandas.core.frame import DataFrame - if (isinstance(self.data, (Series, DataFrame)) - and isinstance(self.data.index, DatetimeIndex)): + if isinstance(self.data.index, DatetimeIndex): base = frequencies.get_freq(freq) x = self.data.index if (base <= frequencies.FreqGroup.FR_DAY): @@ -1686,17 +1668,13 @@ def _update_prior(self, y): def _maybe_convert_index(self, data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames - from pandas.core.frame import DataFrame - if (isinstance(data.index, DatetimeIndex) and - isinstance(data, DataFrame)): + if isinstance(data.index, DatetimeIndex): freq = getattr(data.index, 'freq', None) if freq is None: freq = getattr(data.index, 'inferred_freq', None) if isinstance(freq, DateOffset): freq = freq.rule_code - freq = frequencies.get_base_alias(freq) - freq = frequencies.get_period_alias(freq) if freq is None: ax = self._get_ax(0) @@ -1705,9 +1683,10 @@ def _maybe_convert_index(self, data): if freq is None: raise ValueError('Could not get frequency alias for plotting') - data = DataFrame(data.values, - index=data.index.to_period(freq=freq), - columns=data.columns) + freq = frequencies.get_base_alias(freq) + freq = frequencies.get_period_alias(freq) + + data.index = data.index.to_period(freq=freq) return data def _post_plot_logic(self): @@ -2522,9 +2501,7 @@ def plot_series(data, kind='line', ax=None, # Series unique if ax is None and len(plt.get_fignums()) > 0: ax = _gca() ax = getattr(ax, 'left_ax', ax) - # is there harm in this? 
- if label is None: - label = data.name + return _plot(data, kind=kind, ax=ax, figsize=figsize, use_index=use_index, title=title, grid=grid, legend=legend, diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index c4e642ffe43b0..bdc0aa02f2715 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -636,6 +636,38 @@ def test_mixed_freq_irregular_first(self): x2 = lines[1].get_xdata() assert_array_equal(x2, s1.index.asobject.values) + def test_mixed_freq_regular_first_df(self): + # GH 9852 + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries().to_frame() + s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + ax = s1.plot() + ax2 = s2.plot(style='g', ax=ax) + lines = ax2.get_lines() + idx1 = PeriodIndex(lines[0].get_xdata()) + idx2 = PeriodIndex(lines[1].get_xdata()) + self.assertTrue(idx1.equals(s1.index.to_period('B'))) + self.assertTrue(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assertEqual(left, pidx[0].ordinal) + self.assertEqual(right, pidx[-1].ordinal) + + @slow + def test_mixed_freq_irregular_first_df(self): + # GH 9852 + import matplotlib.pyplot as plt + s1 = tm.makeTimeSeries().to_frame() + s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] + ax = s2.plot(style='g') + ax = s1.plot(ax=ax) + self.assertFalse(hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, s1.index.asobject.values) + def test_mixed_freq_hf_first(self): idxh = date_range('1/1/1999', periods=365, freq='D') idxl = date_range('1/1/1999', periods=12, freq='M') From 52875a1019ab3c668970dbc5142f507fbeb4c44b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 12 Apr 2015 10:01:31 -0400 Subject: [PATCH 046/239] DOC: add note about hosted dev docs --- doc/source/contributing.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 2112d5b127e64..7785f5fe3283d 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -381,6 +381,13 @@ browser to see the full documentation you just built:: And you'll have the satisfaction of seeing your new and improved documentation! +.. _contributing.dev_docs: + +Built Master Branch Documentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When pull-requests are merged into the pandas *master* branch, the main parts of the documentation are +also built by Travis-CI. These docs are then hosted `here `_. Contributing to the code base ============================= From 9351ee6df9f14aeae802625c6d7e864649e26896 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Sun, 12 Apr 2015 11:53:38 -0700 Subject: [PATCH 047/239] DOC/CLN: fixed several typos in categorical.rst --- doc/source/categorical.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index d03e0fb117c5c..11e7fb0fd4117 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -23,11 +23,11 @@ Categorical Data .. versionadded:: 0.15 .. note:: - While there was in `pandas.Categorical` in earlier versions, the ability to use + While there was `pandas.Categorical` in earlier versions, the ability to use categorical data in `Series` and `DataFrame` is new. 
-This is a introduction to pandas categorical data type, including a short comparison +This is an introduction to pandas categorical data type, including a short comparison with R's ``factor``. `Categoricals` are a pandas data type, which correspond to categorical variables in @@ -276,7 +276,7 @@ Sorting and Order .. warning:: - The default for construction has change in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True`` + The default for construction has changed in v0.16.0 to ``ordered=False``, from the prior implicit ``ordered=True`` If categorical data is ordered (``s.cat.ordered == True``), then the order of the categories has a meaning and certain operations are possible. If the categorical is unordered, ``.min()/.max()`` will raise a `TypeError`. @@ -347,15 +347,15 @@ Multi Column Sorting ~~~~~~~~~~~~~~~~~~~~ A categorical dtyped column will partcipate in a multi-column sort in a similar manner to other columns. -The ordering of the categorical is determined by the ``categories`` of that columns. +The ordering of the categorical is determined by the ``categories`` of that column. .. ipython:: python - dfs = DataFrame({'A' : Categorical(list('bbeebbaa'),categories=['e','a','b'],ordered=True), + dfs = DataFrame({'A' : Categorical(list('bbeebbaa'), categories=['e','a','b'], ordered=True), 'B' : [1,2,1,2,2,1,2,1] }) - dfs.sort(['A','B']) + dfs.sort(['A', 'B']) -Reordering the ``categories``, changes a future sort. +Reordering the ``categories`` changes a future sort. .. ipython:: python @@ -380,7 +380,7 @@ categories or a categorical with any list-like object, will raise a TypeError. Any "non-equality" comparisons of categorical data with a `Series`, `np.array`, `list` or categorical data with different categories or ordering will raise an `TypeError` because custom - categories ordering could be interpreted in two ways: one with taking in account the + categories ordering could be interpreted in two ways: one with taking into account the ordering and one without. .. ipython:: python @@ -471,7 +471,7 @@ Data munging ------------ The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix`` ``.at``, and ``.iat``, -work as normal, the only difference is the return type (for getting) and +work as normal. The only difference is the return type (for getting) and that only values already in `categories` can be assigned. Getting @@ -707,8 +707,8 @@ an ``object`` dtype is a constant times the length of the data. .. note:: - If the number of categories approaches the length of the data, the ``Categorical`` will use nearly (or more) memory than an - equivalent ``object`` dtype representation. + If the number of categories approaches the length of the data, the ``Categorical`` will use nearly the same or + more memory than an equivalent ``object`` dtype representation. .. 
ipython:: python

From 3582b1c3765c7eb931a74df51e34dbbf8a60b06e Mon Sep 17 00:00:00 2001
From: Mortada Mehyar
Date: Sun, 12 Apr 2015 19:41:10 -0700
Subject: [PATCH 048/239] DOC/CLN: fixed boolean indexing example, cleaned up typos

---
 doc/source/indexing.rst | 19 ++++++++++---------
 doc/source/options.rst  |  6 +++---
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index fc074802353ee..2eabc35fd831d 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -30,9 +30,9 @@ The axis labeling information in pandas objects serves many purposes:
 In this section, we will focus on the final point: namely, how to slice, dice,
 and generally get and set subsets of pandas objects. The primary focus will be
 on Series and DataFrame as they have received more development attention in
-this area. Expect more work to be invested higher-dimensional data structures
-(including ``Panel``) in the future, especially in label-based advanced
-indexing.
+this area. Expect more work to be invested in higher-dimensional data
+structures (including ``Panel``) in the future, especially in label-based
+advanced indexing.

 .. note::

@@ -54,7 +54,7 @@ indexing.

 .. warning::

-   In 0.15.0 ``Index`` has internally been refactored to no longer sub-class ``ndarray``
+   In 0.15.0 ``Index`` has internally been refactored to no longer subclass ``ndarray``
    but instead subclass ``PandasObject``, similarly to the rest of the pandas
    objects. This should be a transparent change with only very limited API implications
    (See the :ref:`Internal Refactoring <whatsnew_0150.refactoring>`)

@@ -225,9 +225,9 @@ new column.

    sa.a = 5
    sa

-   dfa.A = list(range(len(dfa.index))) # ok if A already exists
+   dfa.A = list(range(len(dfa.index)))  # ok if A already exists
    dfa
-   dfa['A'] = list(range(len(dfa.index))) # use this form to create a new column
+   dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column
    dfa

 .. warning::

@@ -314,7 +314,7 @@ Selection By Label

    dfl.loc['20130102':'20130104']

 pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol.
-**at least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**.
+**At least 1** of the labels for which you ask, must be in the index or a ``KeyError`` will be raised! When slicing, the start bound is *included*, **AND** the stop bound is *included*. Integers are valid labels, but they refer to the label **and not the position**.

 The ``.loc`` attribute is the primary access method. The following are valid inputs:

@@ -578,9 +578,10 @@ Using a boolean vector to index a Series works exactly as in a numpy ndarray:

 .. ipython:: python

+   s = Series(range(-3, 4))
+   s
    s[s > 0]
-   s[(s < 0) & (s > -0.5)]
-   s[(s < -1) | (s > 1 )]
+   s[(s < -1) | (s > 0.5)]
    s[~(s < 0)]

 You may select rows from a DataFrame using a boolean vector the same length as

diff --git a/doc/source/options.rst b/doc/source/options.rst
index 7e36f369bc7e7..4b69015353612 100644
--- a/doc/source/options.rst
+++ b/doc/source/options.rst
@@ -18,7 +18,7 @@ Overview

 pandas has an options system that lets you customize some aspects of its behaviour,
 display-related options being those the user is most likely to adjust.

-Options have a full "dotted-style", case-insensitive name (e.g.
``display.max_rows``), +Options have a full "dotted-style", case-insensitive name (e.g. ``display.max_rows``). You can get/set options directly as attributes of the top-level ``options`` attribute: .. ipython:: python @@ -29,7 +29,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr pd.options.display.max_rows There is also an API composed of 5 relevant functions, available directly from the ``pandas`` -namespace, and they are: +namespace: - :func:`~pandas.get_option` / :func:`~pandas.set_option` - get/set the value of a single option. - :func:`~pandas.reset_option` - reset one or more options to their default value. @@ -412,7 +412,7 @@ mode.use_inf_as_null False True means treat None, NaN, -INF, Number Formatting ------------------ -pandas also allow you to set how numbers are displayed in the console. +pandas also allows you to set how numbers are displayed in the console. This option is not set through the ``set_options`` API. Use the ``set_eng_float_format`` function From db8b1a0d38601c4c55a93433af1caa1aad8b1294 Mon Sep 17 00:00:00 2001 From: Artemy Kolchinsky Date: Wed, 15 Oct 2014 14:39:51 -0400 Subject: [PATCH 049/239] ENH: Allow get_dummies to return sparse dataframe ENH: Allow get_dummies to return sparse dataframe ENH: Allow get_dummies to return sparse dataframe Fix Fix Fixes Bug in order of columns Slight speed improvement get_dummies update Release notes update Remove convert dummies test --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/core/reshape.py | 61 ++++++++++++++++++++++++--------- pandas/tests/test_reshape.py | 58 +++++++++++++++++-------------- 3 files changed, 79 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 0374ade03290a..ceea29c92c9c0 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -48,6 +48,7 @@ Enhancements df.drop(['A', 'X'], axis=1, errors='ignore') - Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) +- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return DataFrame is sparse. (:issue:`8823`) .. _whatsnew_0161.api: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 291a73778197a..af98e533cb5b7 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -9,6 +9,10 @@ from pandas.core.series import Series from pandas.core.frame import DataFrame +from pandas.core.sparse import SparseDataFrame, SparseSeries +from pandas.sparse.array import SparseArray +from pandas._sparse import IntIndex + from pandas.core.categorical import Categorical from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, isnull) @@ -932,7 +936,7 @@ def melt_stub(df, stub, i, j): return newdf.set_index([i, j]) def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, - columns=None): + columns=None, sparse=False): """ Convert categorical variable into dummy/indicator variables @@ -953,6 +957,8 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. + sparse : bool, default False + Whether the returned DataFrame should be sparse or not. 
Returns ------- @@ -1039,16 +1045,17 @@ def check_len(item, name): with_dummies = [result] for (col, pre, sep) in zip(columns_to_encode, prefix, prefix_sep): - dummy = _get_dummies_1d(data[col], prefix=pre, - prefix_sep=sep, dummy_na=dummy_na) + dummy = _get_dummies_1d(data[col], prefix=pre, prefix_sep=sep, + dummy_na=dummy_na, sparse=sparse) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: - result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na) + result = _get_dummies_1d(data, prefix, prefix_sep, dummy_na, + sparse=sparse) return result -def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): +def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False): # Series avoids inconsistent NaN handling cat = Categorical.from_array(Series(data), ordered=True) levels = cat.categories @@ -1059,19 +1066,17 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): index = data.index else: index = np.arange(len(data)) - return DataFrame(index=index) - - number_of_cols = len(levels) - if dummy_na: - number_of_cols += 1 - - dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) + if not sparse: + return DataFrame(index=index) + else: + return SparseDataFrame(index=index) + codes = cat.codes.copy() if dummy_na: + codes[codes == -1] = len(cat.categories) levels = np.append(cat.categories, np.nan) - else: - # reset NaN GH4446 - dummy_mat[cat.codes == -1] = 0 + + number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) @@ -1084,7 +1089,31 @@ def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False): else: index = None - return DataFrame(dummy_mat, index=index, columns=dummy_cols) + if sparse: + sparse_series = {} + N = len(data) + sp_indices = [ [] for _ in range(len(dummy_cols)) ] + for ndx, code in enumerate(codes): + if code == -1: + # Blank entries if not dummy_na and code == -1, #GH4446 + continue + sp_indices[code].append(ndx) + + for col, ixs in zip(dummy_cols, sp_indices): + sarr = SparseArray(np.ones(len(ixs)), sparse_index=IntIndex(N, ixs), + fill_value=0) + sparse_series[col] = SparseSeries(data=sarr, index=index) + + return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + + else: + dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + + if not dummy_na: + # reset NaN GH4446 + dummy_mat[codes == -1] = 0 + + return DataFrame(dummy_mat, index=index, columns=dummy_cols) def make_axis_dummies(frame, axis='minor', transform=None): diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 66f5110830c72..346c9e2598985 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -151,6 +151,8 @@ def test_multiindex(self): class TestGetDummies(tm.TestCase): + sparse = False + def setUp(self): self.df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) @@ -163,20 +165,20 @@ def test_basic(self): expected = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}, 'c': {0: 0.0, 1: 0.0, 2: 1.0}}) - assert_frame_equal(get_dummies(s_list), expected) - assert_frame_equal(get_dummies(s_series), expected) + assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) + assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) expected.index = list('ABC') - assert_frame_equal(get_dummies(s_series_index), expected) + assert_frame_equal(get_dummies(s_series_index, sparse=self.sparse), expected) def test_just_na(self): just_na_list = [np.nan] just_na_series = 
Series(just_na_list) just_na_series_index = Series(just_na_list, index = ['A']) - res_list = get_dummies(just_na_list) - res_series = get_dummies(just_na_series) - res_series_index = get_dummies(just_na_series_index) + res_list = get_dummies(just_na_list, sparse=self.sparse) + res_series = get_dummies(just_na_series, sparse=self.sparse) + res_series_index = get_dummies(just_na_series_index, sparse=self.sparse) self.assertEqual(res_list.empty, True) self.assertEqual(res_series.empty, True) @@ -188,12 +190,13 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] - res = get_dummies(s) + res = get_dummies(s, sparse=self.sparse) exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) - res_na = get_dummies(s, dummy_na=True) + # Sparse dataframes do not allow nan labelled columns, see #GH8822 + res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, 'a': {0: 1.0, 1: 0.0, 2: 0.0}, 'b': {0: 0.0, 1: 1.0, 2: 0.0}}).reindex_axis(['a', 'b', nan], 1) @@ -201,7 +204,7 @@ def test_include_na(self): exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) - res_just_na = get_dummies([nan], dummy_na=True) + res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) exp_just_na = DataFrame(Series(1.0,index=[0]),columns=[nan]) assert_array_equal(res_just_na.values, exp_just_na.values) @@ -210,21 +213,21 @@ def test_unicode(self): # See GH 6885 - get_dummies chokes on unicode values e = 'e' eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] - res = get_dummies(s, prefix='letter') + res = get_dummies(s, prefix='letter', sparse=self.sparse) exp = DataFrame({'letter_e': {0: 1.0, 1: 0.0, 2: 0.0}, u('letter_%s') % eacute: {0: 0.0, 1: 1.0, 2: 1.0}}) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df - result = get_dummies(df) + result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1], 'A_b': [0., 1, 0], 'B_b': [1., 1, 0], 'B_c': [0., 0, 1]}) @@ -235,7 +238,7 @@ def test_dataframe_dummies_prefix_list(self): prefixes = ['from_A', 'from_B'] df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) - result = get_dummies(df, prefix=prefixes) + result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], 'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0], 'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1]}) @@ -243,10 +246,10 @@ def test_dataframe_dummies_prefix_list(self): 'from_B_c']] assert_frame_equal(result, expected) - def test_datafrmae_dummies_prefix_str(self): + def test_dataframe_dummies_prefix_str(self): # not that you should do this... 
df = self.df
-        result = get_dummies(df, prefix='bad')
+        result = get_dummies(df, prefix='bad', sparse=self.sparse)
         expected = DataFrame([[1, 1., 0., 1., 0.],
                               [2, 0., 1., 1., 0.],
                               [3, 1., 0., 0., 1.]],
@@ -256,40 +259,40 @@ def test_dataframe_dummies_subset(self):
     def test_dataframe_dummies_subset(self):
         df = self.df
         result = get_dummies(df, prefix=['from_A'],
-                             columns=['A'])
+                             columns=['A'], sparse=self.sparse)
         expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
                               'B': ['b', 'b', 'c'], 'C': [1, 2, 3]})
         assert_frame_equal(result, expected)

     def test_dataframe_dummies_prefix_sep(self):
         df = self.df
-        result = get_dummies(df, prefix_sep='..')
+        result = get_dummies(df, prefix_sep='..', sparse=self.sparse)
         expected = DataFrame({'C': [1, 2, 3], 'A..a': [1., 0, 1],
                               'A..b': [0., 1, 0], 'B..b': [1., 1, 0],
                               'B..c': [0., 0, 1]})
         expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
         assert_frame_equal(result, expected)

-        result = get_dummies(df, prefix_sep=['..', '__'])
+        result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse)
         expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
         assert_frame_equal(result, expected)

-        result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'})
+        result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'}, sparse=self.sparse)
         assert_frame_equal(result, expected)

     def test_dataframe_dummies_prefix_bad_length(self):
         with tm.assertRaises(ValueError):
-            get_dummies(self.df, prefix=['too few'])
+            get_dummies(self.df, prefix=['too few'], sparse=self.sparse)

     def test_dataframe_dummies_prefix_sep_bad_length(self):
         with tm.assertRaises(ValueError):
-            get_dummies(self.df, prefix_sep=['bad'])
+            get_dummies(self.df, prefix_sep=['bad'], sparse=self.sparse)

     def test_dataframe_dummies_prefix_dict(self):
         prefixes = {'A': 'from_A', 'B': 'from_B'}
         df = DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'b', 'c'],
                         'C': [1, 2, 3]})
-        result = get_dummies(df, prefix=prefixes)
+        result = get_dummies(df, prefix=prefixes, sparse=self.sparse)
         expected = DataFrame({'from_A_a': [1., 0, 1], 'from_A_b': [0., 1, 0],
                               'from_B_b': [1., 1, 0], 'from_B_c': [0., 0, 1],
                               'C': [1, 2, 3]})
@@ -298,7 +301,7 @@ def test_dataframe_dummies_with_na(self):
     def test_dataframe_dummies_with_na(self):
         df = self.df
         df.loc[3, :] = [np.nan, np.nan, np.nan]
-        result = get_dummies(df, dummy_na=True)
+        result = get_dummies(df, dummy_na=True, sparse=self.sparse)
         expected = DataFrame({'C': [1, 2, 3, np.nan], 'A_a': [1., 0, 1, 0],
                               'A_b': [0., 1, 0, 0], 'A_nan': [0., 0, 0, 1],
                               'B_b': [1., 1, 0, 0], 'B_c': [0., 0, 1, 0],
                               'B_nan': [0., 0, 0, 1]})
         expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c',
                              'B_nan']]
         assert_frame_equal(result, expected)

-        result = get_dummies(df, dummy_na=False)
+        result = get_dummies(df, dummy_na=False, sparse=self.sparse)
         expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
         assert_frame_equal(result, expected)

     def test_dataframe_dummies_with_categorical(self):
         df = self.df
         df['cat'] = pd.Categorical(['x', 'y', 'y'])
-        result = get_dummies(df)
+        result = get_dummies(df, sparse=self.sparse)
         expected = DataFrame({'C': [1, 2, 3], 'A_a': [1., 0, 1],
                               'A_b': [0., 1, 0], 'B_b': [1., 1, 0],
                               'B_c': [0., 0, 1], 'cat_x': [1., 0, 0],
                               'cat_y': [0., 1, 1]})
         expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c',
                              'cat_x', 'cat_y']]
         assert_frame_equal(result, expected)

+
+class TestGetDummiesSparse(TestGetDummies):
+    sparse = True
+
+
 class TestLreshape(tm.TestCase):

     def test_pairs(self):
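For reference, a minimal sketch of the new keyword in use (run against this
branch; the input values are illustrative only):

    import pandas as pd

    s = pd.Series(list('abca'))

    pd.get_dummies(s)               # dense DataFrame of 0/1 indicator columns
    pd.get_dummies(s, sparse=True)  # SparseDataFrame whose columns are
                                    # SparseSeries backed by IntIndex block
                                    # indices, storing only the 1 entries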
From 5ed4801b02887bfd66adb28634b34af250133966 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 13 Apr 2015 09:05:36 -0400
Subject: [PATCH 050/239] wip

---
 doc/source/whatsnew/v0.16.1.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index ceea29c92c9c0..3a23fb5cef729 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -48,7 +48,7 @@ Enhancements

       df.drop(['A', 'X'], axis=1, errors='ignore')

 - Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`)
-- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return DataFrame is sparse. (:issue:`8823`)
+- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`)

 .. _whatsnew_0161.api:

From 7ecc49a6c6942f5e8fa1f7a39ae6a2fa0c8c3f41 Mon Sep 17 00:00:00 2001
From: Jan Schulz
Date: Sun, 12 Apr 2015 23:37:47 +0200
Subject: [PATCH 051/239] BUG: Fix for comparisons of categorical and a scalar not in categories, xref GH9836

Up to now, a comparison of categorical data and a scalar which is not in the categories would return `False` for all elements when it should raise a `TypeError`, which it now does. Also fix that `!=` comparisons would return `False` for all elements when the more logical choice would be `True`.
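A minimal sketch of both behaviours (expected results follow the new test
cases below; the scalar "d" is deliberately not among the categories):

    import pandas as pd

    cat = pd.Series(pd.Categorical(list("abc"), ordered=True))

    # unequal comparisons against a scalar outside the categories now
    # raise instead of silently returning all-False
    try:
        cat > "d"
    except TypeError:
        pass

    cat == "d"  # [False, False, False]
    cat != "d"  # [True, True, True]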
---
 doc/source/whatsnew/v0.16.1.txt  |  5 +++++
 pandas/core/categorical.py       |  9 ++++++++-
 pandas/tests/test_categorical.py | 27 +++++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index 3a23fb5cef729..bd1aef2dea04e 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -143,4 +143,9 @@ Bug Fixes

>>>>>>> 7879205... Fix to allow sparse dataframes to have nan column labels
=======
 - Bug in unequal comparisons between a ``Series`` of dtype ``"category"`` and a scalar (e.g. ``Series(Categorical(list("abc"), categories=list("cba"), ordered=True)) > "b"``), which wouldn't use the order of the categories but the lexicographical order (:issue:`9848`)
<<<<<<< HEAD
>>>>>>> f0ac930... Fix: unequal comparisons of categorical and scalar
=======
+
+- Bug in unequal comparisons between categorical data and a scalar which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``). This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`)
>>>>>>> 35b20d8... BUG: Fix for comparisons of categorical and a scalar not in categories, xref GH9836

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 991678a8e7d79..b79f2c9b4f6df 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -61,7 +61,14 @@ def f(self, other):
             i = self.categories.get_loc(other)
             return getattr(self._codes, op)(i)
         else:
-            return np.repeat(False, len(self))
+            if op == '__eq__':
+                return np.repeat(False, len(self))
+            elif op == '__ne__':
+                return np.repeat(True, len(self))
+            else:
+                msg = "Cannot compare a Categorical for op {op} with a scalar, " \
+                      "which is not a category."
+                raise TypeError(msg.format(op=op))
     else:

         # allow categorical vs object dtype array comparisons for equality

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 4c5678bf6633f..af48774492b11 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -1087,6 +1087,20 @@ def test_reflected_comparison_with_scalars(self):
         self.assert_numpy_array_equal(cat > cat[0], [False, True, True])
         self.assert_numpy_array_equal(cat[0] < cat, [False, True, True])

+    def test_comparison_with_unknown_scalars(self):
+        # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057 and following
+        # comparisons with scalars not in categories should raise for unequal comps, but not for
+        # equal/not equal
+        cat = pd.Categorical([1, 2, 3], ordered=True)
+
+        self.assertRaises(TypeError, lambda: cat < 4)
+        self.assertRaises(TypeError, lambda: cat > 4)
+        self.assertRaises(TypeError, lambda: 4 < cat)
+        self.assertRaises(TypeError, lambda: 4 > cat)
+
+        self.assert_numpy_array_equal(cat == 4, [False, False, False])
+        self.assert_numpy_array_equal(cat != 4, [True, True, True])
+

 class TestCategoricalAsBlock(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -2440,6 +2454,19 @@ def f():
             cat > "b"
         self.assertRaises(TypeError, f)

+        # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057 and following
+        # comparisons with scalars not in categories should raise for unequal comps, but not for
+        # equal/not equal
+        cat = Series(Categorical(list("abc"), ordered=True))
+
+        self.assertRaises(TypeError, lambda: cat < "d")
+        self.assertRaises(TypeError, lambda: cat > "d")
+        self.assertRaises(TypeError, lambda: "d" < cat)
+        self.assertRaises(TypeError, lambda: "d" > cat)
+
+        self.assert_series_equal(cat == "d", Series([False, False, False]))
+        self.assert_series_equal(cat != "d", Series([True, True, True]))
+
         # And test NaN handling...
         cat = Series(Categorical(["a","b","c", np.nan]))

From f893ac5e7668db59e8c19cc69c523da4daf3e7fb Mon Sep 17 00:00:00 2001
From: Chris Gilmer
Date: Mon, 13 Apr 2015 10:24:55 -0400
Subject: [PATCH 052/239] GH9570 allow timedelta string conversion without leading zero

---
 doc/source/whatsnew/v0.16.1.txt         | 2 ++
 pandas/tseries/tests/test_timedeltas.py | 7 +++++++
 pandas/tseries/timedeltas.py            | 7 +++----
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
index bd1aef2dea04e..3643e8d939694 100644
--- a/doc/source/whatsnew/v0.16.1.txt
+++ b/doc/source/whatsnew/v0.16.1.txt
@@ -50,6 +50,8 @@ Enhancements

 - Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`)
 - ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`)

+- Allow timedelta string conversion when the leading zero is missing from the time definition, i.e. `0:00:00` vs `00:00:00`. (:issue:`9570`)
+
 ..
_whatsnew_0161.api:

 API changes

diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index b74a3a59d3bca..bc51e01ca9bdf 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -64,6 +64,13 @@ def test_construction(self):
         self.assertEqual(Timedelta(123072001000000).value, 123072001000000)
         self.assertTrue('1 days 10:11:12.001' in str(Timedelta(123072001000000)))

+        # string conversion with/without leading zero
+        # GH 9570
+        self.assertEqual(Timedelta('0:00:00'), timedelta(hours=0))
+        self.assertEqual(Timedelta('00:00:00'), timedelta(hours=0))
+        self.assertEqual(Timedelta('-1:00:00'), -timedelta(hours=1))
+        self.assertEqual(Timedelta('-01:00:00'), -timedelta(hours=1))
+
         # more strings
         # GH 8190
         self.assertEqual(Timedelta('1 h'), timedelta(hours=1))

diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py
index 91e75da1b551c..5b353058f0093 100644
--- a/pandas/tseries/timedeltas.py
+++ b/pandas/tseries/timedeltas.py
@@ -119,7 +119,7 @@ def _validate_timedelta_unit(arg):

 _short_search = re.compile(
     "^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE)
 _full_search = re.compile(
     "^\s*(?P<neg>-?)\s*(?P<days>\d*\.?\d*)?\s*(days|d|day)?,?\s*\+?(?P