From 3a9c09357464e4aa1519db956b23de7e8896d59f Mon Sep 17 00:00:00 2001 From: Sangwoong Yoon Date: Sat, 18 Aug 2018 15:14:48 +0900 Subject: [PATCH 1/2] BUG: fix read_csv to parse timezone correctly - use box=True for to_datetime(), and adjust downstream processing to the change. - resolve #22256 --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/io/parsers.py | 15 ++++++++------- pandas/tests/io/parser/parse_dates.py | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d7feb6e547b22..d09c9a4cd6a3d 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -676,6 +676,7 @@ I/O - :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) - Plotting diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4b3fa08e5e4af..08fb0172adcff 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1620,7 +1620,6 @@ def _infer_types(self, values, na_values, try_num_bool=True): converted : ndarray na_count : int """ - na_count = 0 if issubclass(values.dtype.type, (np.number, np.bool_)): mask = algorithms.isin(values, list(na_values)) @@ -1633,20 +1632,22 @@ def _infer_types(self, values, na_values, try_num_bool=True): if try_num_bool: try: - result = lib.maybe_convert_numeric(values, na_values, False) + result = lib.maybe_convert_numeric(np.asarray(values), + na_values, False) na_count = isna(result).sum() except Exception: result = values if values.dtype == np.object_: - na_count = parsers.sanitize_objects(result, na_values, - False) + na_count = parsers.sanitize_objects(np.asarray(result), + na_values, False) else: result = values if values.dtype == np.object_: - na_count = parsers.sanitize_objects(values, na_values, False) + na_count = parsers.sanitize_objects(np.asarray(values), + na_values, False) if result.dtype == np.object_ and try_num_bool: - result = libops.maybe_convert_bool(values, + result = libops.maybe_convert_bool(np.asarray(values), true_values=self.true_values, false_values=self.false_values) @@ -3033,7 +3034,7 @@ def converter(*date_cols): return tools.to_datetime( ensure_object(strs), utc=None, - box=False, + box=True, dayfirst=dayfirst, errors='ignore', infer_datetime_format=infer_datetime_format diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 903439d2d2292..7bfccde975bf0 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -13,6 +13,7 @@ from pandas._libs.tslibs import parsing from pandas._libs.tslib import Timestamp +import pytz import pandas as pd import pandas.io.parsers as parsers import pandas.core.tools.datetimes as tools @@ -674,3 +675,18 @@ def test_parse_date_float(self, data, expected, parse_dates): # (i.e. float precision should remain unchanged). result = self.read_csv(StringIO(data), parse_dates=parse_dates) tm.assert_frame_equal(result, expected) + + def test_parse_timezone(self): + data = """dt,val + 2018-01-04 09:01:00+09:00,23350 + 2018-01-04 09:02:00+09:00,23400 + 2018-01-04 09:03:00+09:00,23400 + 2018-01-04 09:04:00+09:00,23400 + 2018-01-04 09:05:00+09:00,23400""" + parsed = self.read_csv(StringIO(data), parse_dates=['dt']) + dti = pd.DatetimeIndex(start='2018-01-04 09:01:00', + end='2018-01-04 09:05:00', freq='1min', + tz=pytz.FixedOffset(540)) + expected_data = {'dt': dti, 'val': [23350, 23400, 23400, 23400, 23400]} + expected = DataFrame(expected_data) + tm.assert_frame_equal(parsed, expected) From 339f8365409e66f87b3482b313840ab0aa58cb4b Mon Sep 17 00:00:00 2001 From: Sangwoong Yoon Date: Mon, 20 Aug 2018 12:58:40 +0900 Subject: [PATCH 2/2] CLN: Remove try-except in parse_dates test --- pandas/tests/io/parser/parse_dates.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 7bfccde975bf0..ae3c806ac1c8e 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -16,7 +16,6 @@ import pytz import pandas as pd import pandas.io.parsers as parsers -import pandas.core.tools.datetimes as tools import pandas.util.testing as tm import pandas.io.date_converters as conv @@ -357,21 +356,13 @@ def test_parse_dates_custom_euroformat(self): def test_parse_tz_aware(self): # See gh-1693 - import pytz data = StringIO("Date,x\n2012-06-13T01:39:00Z,0.5") # it works result = self.read_csv(data, index_col=0, parse_dates=True) stamp = result.index[0] assert stamp.minute == 39 - try: - assert result.index.tz is pytz.utc - except AssertionError: - arr = result.index.to_pydatetime() - result = tools.to_datetime(arr, utc=True)[0] - assert stamp.minute == result.minute - assert stamp.hour == result.hour - assert stamp.day == result.day + assert result.index.tz is pytz.utc def test_multiple_date_cols_index(self): data = """ @@ -677,6 +668,7 @@ def test_parse_date_float(self, data, expected, parse_dates): tm.assert_frame_equal(result, expected) def test_parse_timezone(self): + # gh-22256 data = """dt,val 2018-01-04 09:01:00+09:00,23350 2018-01-04 09:02:00+09:00,23400