ENH: convert multiple text file columns to a single date column #1227

Closed · wants to merge 4 commits
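This patch extends read_csv/read_table so that several text columns can be combined and parsed into a single date column. A minimal sketch of the resulting API, lifted from the tests in this diff (the 'nominal'/'actual' names and the column indices are just the tests' illustrative choices): parse_dates may now be a list of column lists or a dict mapping a new column name to its source columns, with an optional date_parser callable that receives the raw column values.

from cStringIO import StringIO
from pandas import read_table

data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100
KORD,19990127, 20:00:00, 19:56:00, 0.0100
"""

# dict form: build new 'nominal' and 'actual' columns from columns 1-2 and 1-3
df = read_table(StringIO(data), sep=',', header=None,
                parse_dates={'nominal': [1, 2], 'actual': [1, 3]})

# list-of-lists form: same combination, with auto-generated names like 'X.2_X.3'
df = read_table(StringIO(data), sep=',', header=None,
                parse_dates=[[1, 2], [1, 3]])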
123 changes: 104 additions & 19 deletions pandas/io/parsers.py
@@ -24,6 +24,9 @@ def next(x):
 
 from pandas.util.decorators import Appender
 
+class DateConversionError(Exception):
+    pass
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -155,7 +158,8 @@ def _read(cls, filepath_or_buffer, kwds):
         f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
 
         if kwds.get('date_parser', None) is not None:
-            kwds['parse_dates'] = True
+            if isinstance(kwds['parse_dates'], bool):
+                kwds['parse_dates'] = True
 
         # Extract some of the arguments (pass chunksize on).
         kwds.pop('filepath_or_buffer')
@@ -362,8 +366,8 @@ class TextParser(object):
     def __init__(self, f, delimiter=None, names=None, header=0,
                  index_col=None, na_values=None, thousands=None,
                  comment=None, parse_dates=False,
-                 date_parser=None, dayfirst=False, chunksize=None,
-                 skiprows=None, skip_footer=0, converters=None,
+                 date_parser=None, dayfirst=False,
+                 chunksize=None, skiprows=None, skip_footer=0, converters=None,
                  verbose=False, encoding=None):
         """
         Workhorse function for processing nested list into DataFrame
@@ -672,7 +676,6 @@ def get_chunk(self, rows=None):
 
         zipped_content = list(lib.to_object_array(content).T)
 
-        # no index column specified, so infer that's what is wanted
         if self.index_col is not None:
             if np.isscalar(self.index_col):
                 index = zipped_content.pop(self.index_col)
@@ -686,19 +689,17 @@
                     zipped_content.pop(i)
 
             if np.isscalar(self.index_col):
-                if self._should_parse_dates(0):
-                    index = lib.try_parse_dates(index, parser=self.date_parser,
-                                                dayfirst=self.dayfirst)
+                if self._should_parse_dates(self.index_col):
+                    index = self._conv_date(index)
                 index, na_count = _convert_types(index, self.na_values)
                 index = Index(index, name=self.index_name)
                 if self.verbose and na_count:
                     print 'Found %d NA values in the index' % na_count
             else:
                 arrays = []
                 for i, arr in enumerate(index):
-                    if self._should_parse_dates(i):
-                        arr = lib.try_parse_dates(arr, parser=self.date_parser,
-                                                  dayfirst=self.dayfirst)
+                    if self._should_parse_dates(self.index_col[i]):
+                        arr = self._conv_date(arr)
                     arr, _ = _convert_types(arr, self.na_values)
                     arrays.append(arr)
                 index = MultiIndex.from_arrays(arrays, names=self.index_name)
@@ -736,18 +737,13 @@ def get_chunk(self, rows=None):
                     col = self.columns[col]
                 data[col] = lib.map_infer(data[col], f)
 
-        if not isinstance(self.parse_dates, bool):
-            for x in self.parse_dates:
-                if isinstance(x, int) and x not in data:
-                    x = self.orig_columns[x]
-                if x in self.index_col or x in self.index_name:
-                    continue
-                data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
-                                              dayfirst=self.dayfirst)
+        columns = self.columns
+        if self.parse_dates is not None:
+            data, columns = self._process_date_conversion(data)
 
         data = _convert_to_ndarrays(data, self.na_values, self.verbose)
 
-        return DataFrame(data=data, columns=self.columns, index=index)
+        return DataFrame(data=data, columns=columns, index=index)
 
     def _find_line_number(self, exp_len, chunk_len, chunk_i):
         if exp_len is None:
@@ -778,6 +774,68 @@ def _should_parse_dates(self, i):
             name = self.index_name[i]
         return i in to_parse or name in to_parse
 
+    def _conv_date(self, *date_cols):
+        if self.date_parser is None:
+            return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                       dayfirst=self.dayfirst)
+        else:
+            try:
+                return self.date_parser(*date_cols)
+            except:
+                return lib.try_parse_dates(_concat_date_cols(date_cols),
+                                           parser=self.date_parser,
+                                           dayfirst=self.dayfirst)
+
+    def _process_date_conversion(self, data_dict):
+        new_cols = []
+        new_data = {}
+        columns = self.columns
+
+        if self.parse_dates is None or isinstance(self.parse_dates, bool):
+            return data_dict, columns
+
+        if isinstance(self.parse_dates, list):
+            # list of column lists
+            for colspec in self.parse_dates:
+                if np.isscalar(colspec):
+                    if isinstance(colspec, int) and colspec not in data_dict:
+                        colspec = self.orig_columns[colspec]
+                    if self._isindex(colspec):
+                        continue
+                    data_dict[colspec] = self._conv_date(data_dict[colspec])
+                else:
+                    new_name, col = _try_convert_dates(self._conv_date, colspec,
+                                                       data_dict, self.orig_columns)
+                    if new_name in data_dict:
+                        raise ValueError('New date column already in dict %s' %
+                                         new_name)
+                    new_data[new_name] = col
+                    new_cols.append(new_name)
+
+        elif isinstance(self.parse_dates, dict):
+            # dict of new name to column list
+            for new_name, colspec in self.parse_dates.iteritems():
+                if new_name in data_dict:
+                    raise ValueError('Date column %s already in dict' %
+                                     new_name)
+
+                _, col = _try_convert_dates(self._conv_date, colspec, data_dict,
+                                            self.orig_columns)
+                new_data[new_name] = col
+                new_cols.append(new_name)
+
+        data_dict.update(new_data)
+        new_cols.extend(columns)
+        return data_dict, new_cols
+
+    def _isindex(self, colspec):
+        return (colspec == self.index_col or
+                (isinstance(self.index_col, list) and
+                 colspec in self.index_col) or
+                (colspec == self.index_name or
+                 (isinstance(self.index_name, list) and
+                  colspec in self.index_name)))
+
     def _get_lines(self, rows=None):
         source = self.data
         lines = self.buf
@@ -860,6 +918,33 @@ def _convert_types(values, na_values):
 
     return result, na_count
 
+def _get_col_names(colspec, columns):
+    colset = set(columns)
+    colnames = []
+    for c in colspec:
+        if c in colset:
+            colnames.append(str(c))
+        elif isinstance(c, int):
+            colnames.append(str(columns[c]))
+    return colnames
+
+def _try_convert_dates(parser, colspec, data_dict, columns):
+    colspec = _get_col_names(colspec, columns)
+    new_name = '_'.join(colspec)
+
+    to_parse = [data_dict[c] for c in colspec if c in data_dict]
+    try:
+        new_col = parser(*to_parse)
+    except DateConversionError:
+        new_col = _concat_date_cols(to_parse)
+    return new_name, new_col
+
+def _concat_date_cols(date_cols):
+    if len(date_cols) == 1:
+        return date_cols[0]
+    concat = lambda x: ' '.join(x)
+    return np.array(np.apply_along_axis(concat, 0, np.vstack(date_cols)),
+                    dtype=object)
+
 class FixedWidthReader(object):
     """
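For reference, a minimal sketch of what the new _concat_date_cols helper does when more than one column is passed: it stacks the object arrays and joins each row's values with a space, producing the combined strings that are then handed to the date parser. The sample values below are illustrative, echoing the test data further down:

import numpy as np

dates = np.array(['19990127', '19990127'], dtype=object)
times = np.array(['19:00:00', '20:00:00'], dtype=object)
concat = lambda x: ' '.join(x)
# join column-wise across the stacked rows, one result per input row
combined = np.array(np.apply_along_axis(concat, 0, np.vstack([dates, times])),
                    dtype=object)
# combined -> array(['19990127 19:00:00', '19990127 20:00:00'], dtype=object)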
53 changes: 53 additions & 0 deletions pandas/io/tests/test_parsers.py
@@ -12,6 +12,7 @@
 import numpy as np
 
 from pandas import DataFrame, Index, isnull
+import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                ExcelFile, TextParser)
 from pandas.util.testing import assert_almost_equal, assert_frame_equal, network
@@ -90,6 +91,58 @@ def test_comment_fwf(self):
                            comment='#')
         assert_almost_equal(df.values, expected)
 
+    def test_multiple_date_col(self):
+        # Can use multiple date parsers
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        def func(*date_cols):
+            return lib.try_parse_dates(parsers._concat_date_cols(date_cols))
+
+        df = read_table(StringIO(data), sep=',', header=None,
+                        date_parser=func,
+                        parse_dates={'nominal' : [1, 2],
+                                     'actual' : [1, 3]})
+        self.assert_('nominal' in df)
+        self.assert_('actual' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'nominal'] == d)
+
+        data = """\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
+"""
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[[1, 2], [1, 3]])
+        self.assert_('X.2_X.3' in df)
+        self.assert_('X.2_X.4' in df)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.ix[0, 'X.2_X.3'] == d)
+
+        data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+        df = read_table(StringIO(data), sep=',', header=None,
+                        parse_dates=[1], index_col=1)
+        from datetime import datetime
+        d = datetime(1999, 1, 27, 19, 0)
+        self.assert_(df.index[0] == d)
+
     def test_malformed(self):
         # all
         data = """ignore
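The auto-generated names asserted above ('X.2_X.3', 'X.2_X.4') come from _try_convert_dates, which resolves each entry of the column spec to a string name and joins the names with underscores. A minimal standalone sketch of that resolution, re-using _get_col_names as defined in this patch (the 'X.n' names are the defaults pandas assigns when header=None):

def _get_col_names(colspec, columns):
    # resolve a mix of column labels and integer positions to string names
    colset = set(columns)
    colnames = []
    for c in colspec:
        if c in colset:
            colnames.append(str(c))
        elif isinstance(c, int):
            colnames.append(str(columns[c]))
    return colnames

columns = ['X.1', 'X.2', 'X.3', 'X.4']
print '_'.join(_get_col_names([1, 2], columns))  # prints: X.2_X.3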
39 changes: 39 additions & 0 deletions vb_suite/parser.py
@@ -50,3 +50,42 @@
                 setup,
                 cleanup="os.remove('test.csv')",
                 start_date=datetime(2012, 5, 7))
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = ("read_table(StringIO(data), sep=',', header=None, "
+       "parse_dates=[[1,2], [1,3]])")
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate)
+
+setup = common_setup + """
+from pandas import read_table
+from cStringIO import StringIO
+import os
+N = 10000
+K = 8
+data = '''\
+KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
+KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
+KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000
+KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000
+KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000
+'''
+data = data * 2000
+"""
+cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])"
+sdate = datetime(2012, 5, 7)
+read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate)