ENH: convert multiple text file columns to a single date column #1227

Closed · wants to merge 4 commits
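This patch extends read_csv/read_table so that several text columns can be combined and parsed into a single date column. A minimal sketch of the resulting API, lifted from the tests in this diff (the 'nominal'/'actual' names and the column indices are just the tests' illustrative choices): parse_dates may now be a list of column lists or a dict mapping a new column name to its source columns, with an optional date_parser callable that receives the raw column values.

from cStringIO import StringIO
from pandas import read_table

data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
"""

# dict form: build new 'nominal' and 'actual' columns from columns 1-2 and 1-3
df = read_table(StringIO(data), sep=',', header=None,
                parse_dates={'nominal': [1, 2], 'actual': [1, 3]})

# list-of-lists form: same combination, with auto-generated names like 'X.2_X.3'
df = read_table(StringIO(data), sep=',', header=None,
                parse_dates=[[1, 2], [1, 3]])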
123 changes: 104 additions & 19 deletions pandas/io/parsers.py
@@ -24,6 +24,9 @@ def next(x):
 
 from pandas.util.decorators import Appender
 
+class DateConversionError(Exception):
+    pass
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -155,7 +158,8 @@ def _read(cls, filepath_or_buffer, kwds):
         f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
 
         if kwds.get('date_parser', None) is not None:
-            kwds['parse_dates'] = True
+            if isinstance(kwds['parse_dates'], bool):
+                kwds['parse_dates'] = True
 
         # Extract some of the arguments (pass chunksize on).
         kwds.pop('filepath_or_buffer')
@@ -362,8 +366,8 @@ class TextParser(object):
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, thousands=None,
                  comment=None, parse_dates=False,
-                 date_parser=None, dayfirst=False, chunksize=None,
-                 skiprows=None, skip_footer=0, converters=None,
+                 date_parser=None, dayfirst=False,
+                 chunksize=None, skiprows=None, skip_footer=0, converters=None,
                  verbose=False, encoding=None):
         """
         Workhorse function for processing nested list into DataFrame
@@ -672,7 +676,6 @@ def get_chunk(self, rows=None):
 
         zipped_content = list(lib.to_object_array(content).T)
 
-        # no index column specified, so infer that's what is wanted
         if self.index_col is not None:
             if np.isscalar(self.index_col):
                 index = zipped_content.pop(self.index_col)
@@ -686,19 +689,17 @@
                     zipped_content.pop(i)
 
             if np.isscalar(self.index_col):
-                if self._should_parse_dates(0):
-                    index = lib.try_parse_dates(index, parser=self.date_parser,
-                                                dayfirst=self.dayfirst)
+                if self._should_parse_dates(self.index_col):
+                    index = self._conv_date(index)
                 index, na_count = _convert_types(index, self.na_values)
                 index = Index(index, name=self.index_name)
                 if self.verbose and na_count:
                     print 'Found %d NA values in the index' % na_count
             else:
                 arrays = []
                 for i, arr in enumerate(index):
-                    if self._should_parse_dates(i):
-                        arr = lib.try_parse_dates(arr, parser=self.date_parser,
-                                                  dayfirst=self.dayfirst)
+                    if self._should_parse_dates(self.index_col[i]):
+                        arr = self._conv_date(arr)
                     arr, _ = _convert_types(arr, self.na_values)
                     arrays.append(arr)
                 index = MultiIndex.from_arrays(arrays, names=self.index_name)
@@ -736,18 +737,13 @@ def get_chunk(self, rows=None):
                     col = self.columns[col]
                 data[col] = lib.map_infer(data[col], f)
 
-        if not isinstance(self.parse_dates, bool):
-            for x in self.parse_dates:
-                if isinstance(x, int) and x not in data:
-                    x = self.orig_columns[x]
-                if x in self.index_col or x in self.index_name:
-                    continue
-                data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
-                                              dayfirst=self.dayfirst)
+        columns = self.columns
+        if self.parse_dates is not None:
+            data, columns = self._process_date_conversion(data)
 
         data = _convert_to_ndarrays(data, self.na_values, self.verbose)
 
-        return DataFrame(data=data, columns=self.columns, index=index)
+        return DataFrame(data=data, columns=columns, index=index)
 
     def _find_line_number(self, exp_len, chunk_len, chunk_i):
         if exp_len is None:
@@ -778,6 +774,68 @@ def _should_parse_dates(self, i):
             name = self.index_name[i]
         return i in to_parse or name in to_parse
 
+    def _conv_date(self, *date_cols):
+        if self.date_parser is None:
+            return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                       dayfirst=self.dayfirst)
+        else:
+            try:
+                return self.date_parser(*date_cols)
+            except:
+                return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                           parser=self.date_parser,
+                                           dayfirst=self.dayfirst)
+
+    def _process_date_conversion(self, data_dict):
+        new_cols = []
+        new_data = {}
+        columns = self.columns
+
+        if self.parse_dates is None or isinstance(self.parse_dates, bool):
+            return data_dict, columns
+
+        if isinstance(self.parse_dates, list):
+            # list of column lists
+            for colspec in self.parse_dates:
+                if np.isscalar(colspec):
+                    if isinstance(colspec, int) and colspec not in data_dict:
+                        colspec = self.orig_columns[colspec]
+                    if self._isindex(colspec):
+                        continue
+                    data_dict[colspec] = self._conv_date(data_dict[colspec])
+                else:
+                    new_name, col = _try_convert_dates(self._conv_date, colspec,
+                                                       data_dict, self.orig_columns)
+                    if new_name in data_dict:
+                        raise ValueError('New date column already in dict %s' %
+                                         new_name)
+                    new_data[new_name] = col
+                    new_cols.append(new_name)
+
+        elif isinstance(self.parse_dates, dict):
+            # dict of new name to column list
+            for new_name, colspec in self.parse_dates.iteritems():
+                if new_name in data_dict:
+                    raise ValueError('Date column %s already in dict' %
+                                     new_name)
+
+                _, col = _try_convert_dates(self._conv_date, colspec, data_dict,
+                                            self.orig_columns)
+                new_data[new_name] = col
+                new_cols.append(new_name)
+
+        data_dict.update(new_data)
+        new_cols.extend(columns)
+        return data_dict, new_cols
+
+    def _isindex(self, colspec):
+        return (colspec == self.index_col or
+                (isinstance(self.index_col, list) and
+                 colspec in self.index_col) or
+                (colspec == self.index_name or
+                 (isinstance(self.index_name, list) and
+                  colspec in self.index_name)))
+
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
@@ -860,6 +918,33 @@ def _convert_types(values, na_values):
 
     return result, na_count
 
+def _get_col_names(colspec, columns):
+    colset = set(columns)
+    colnames = []
+    for c in colspec:
+        if c in colset:
+            colnames.append(str(c))
+        elif isinstance(c, int):
+            colnames.append(str(columns[c]))
+    return colnames
+
+def _try_convert_dates(parser, colspec, data_dict, columns):
+    colspec = _get_col_names(colspec, columns)
+    new_name = '_'.join(colspec)
+
+    to_parse = [data_dict[c] for c in colspec if c in data_dict]
+    try:
+        new_col = parser(*to_parse)
+    except DateConversionError:
+        new_col = _concat_date_cols(to_parse)
+    return new_name, new_col
+
+def _concat_date_cols(date_cols):
+    if len(date_cols) == 1:
+        return date_cols[0]
+    concat = lambda x: ' '.join(x)
+    return np.array(np.apply_along_axis(concat, 0, np.vstack(date_cols)),
+                    dtype=object)
+
 class FixedWidthReader(object):
     """
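For reference, a minimal sketch of what the new _concat_date_cols helper does when more than one column is passed: it stacks the object arrays and joins each row's values with a space, producing the combined strings that are then handed to the date parser. The sample values below are illustrative, echoing the test data further down:

import numpy as np

dates = np.array(['19990127', '19990127'], dtype=object)
times = np.array(['19:00:00', '20:00:00'], dtype=object)
concat = lambda x: ' '.join(x)
# join column-wise across the stacked rows, one result per input row
combined = np.array(np.apply_along_axis(concat, 0, np.vstack([dates, times])),
                    dtype=object)
# combined -> array(['19990127 19:00:00', '19990127 20:00:00'], dtype=object)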
53 changes: 53 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -12,6 +12,7 @@
 import numpy as np
 
 from pandas import DataFrame, Index, isnull
+import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                ExcelFile, TextParser)
 from pandas.util.testing import assert_almost_equal, assert_frame_equal, network
@@ -90,6 +91,58 @@ def test_comment_fwf(self):
                            comment='#')
         assert_almost_equal(df.values, expected)
 
+    def test_multiple_date_col(self):
+        # Can use multiple date parsers
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        def func(*date_cols):
+            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))
+
+        df = read_table(StringIO(data), sep=',', header=None,
+                        date_parser=func,
+                        parse_dates={'nominal' : [1, 2],
+                                     'actual' : [1, 3]})
+        self.assert_('nominal' in df)
+        self.assert_('actual' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'nominal'] == d)
+
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[[1, 2], [1, 3]])
+        self.assert_('X.2_X.3' in df)
+        self.assert_('X.2_X.4' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'X.2_X.3'] == d)
+
+        data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[1], index_col=1)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.index[0] == d)
+
     def test_malformed(self):
         # all
         data = """ignore
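The auto-generated names asserted above ('X.2_X.3', 'X.2_X.4') come from _try_convert_dates, which resolves each entry of the column spec to a string name and joins the names with underscores. A minimal standalone sketch of that resolution, re-using _get_col_names as defined in this patch (the 'X.n' names are the defaults pandas assigns when header=None):

def _get_col_names(colspec, columns):
    # resolve a mix of column labels and integer positions to string names
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(str(c))
        elif isinstance(c, int):
            colnames.append(str(columns[c]))
    return colnames

columns = ['X.1', 'X.2', 'X.3', 'X.4']
print '_'.join(_get_col_names([1, 2], columns))  # prints: X.2_X.3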
39 changes: 39 additions & 0 deletions vb_suite/parser.py
@@ -50,3 +50,42 @@
                 setup,
                 cleanup="os.remove('test.csv')",
                 start_date=datetime(2012, 5, 7))
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = ("read_table(StringIO(data), sep=',', header=None, "
+       "parse_dates=[[1,2], [1,3]])")
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate)
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])"
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate)