diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f49c976d00a3..74b64818c1ae7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -16,6 +16,7 @@ import collections import warnings import types +from itertools import islice, chain from numpy import nan as NA import numpy as np @@ -159,6 +160,8 @@ class DataFrame(NDFrame): Data type to force, otherwise infer copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input + count : int or None, when data is a generator, number of values to + read. If None reads the whole generator Examples -------- @@ -185,7 +188,7 @@ def _constructor(self): _constructor_sliced = Series def __init__(self, data=None, index=None, columns=None, dtype=None, - copy=False): + copy=False, count=None): if data is None: data = {} if dtype is not None: @@ -232,7 +235,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, copy=copy) elif isinstance(data, (list, types.GeneratorType)): if isinstance(data, types.GeneratorType): - data = list(data) + data = list(islice(data, count)) if len(data) > 0: if index is None and isinstance(data[0], Series): index = _get_names_from_index(data) @@ -705,7 +708,7 @@ def to_gbq(self, destination_table, schema=None, col_order=None, @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, - coerce_float=False, nrows=None): + coerce_float=False, count=None, nrows=None): """ Convert structured or record ndarray to DataFrame @@ -726,24 +729,29 @@ def from_records(cls, data, index=None, exclude=None, columns=None, coerce_float : boolean, default False Attempt to convert values to non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets + count : int or None, number of records to read from a generator. 
+ If None reads the whole generator Returns ------- df : DataFrame """ + #Deprecate undocumented nrows + if nrows is not None: + warnings.warn("nrows is deprecated, use count", + FutureWarning) + count = nrows + # Make a copy of the input columns so we can modify it if columns is not None: columns = _ensure_index(columns) if com.is_iterator(data): - if nrows == 0: + if count == 0: return cls() try: - if compat.PY3: - first_row = next(data) - else: - first_row = next(data) + first_row = next(data) except StopIteration: return cls(index=index, columns=columns) @@ -751,19 +759,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None, if hasattr(first_row, 'dtype') and first_row.dtype.names: dtype = first_row.dtype - values = [first_row] - - # if unknown length iterable (generator) - if nrows is None: - # consume whole generator - values += list(data) - else: - i = 1 - for row in data: - values.append(row) - i += 1 - if i >= nrows: - break + # put the generator in a list + values = list(islice(chain([first_row], data), count)) if dtype is not None: data = np.array(values, dtype=dtype) diff --git a/pandas/core/series.py b/pandas/core/series.py index c310358ab58f9..87c8183643795 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2,6 +2,7 @@ Data structure for 1-dimensional cross-sectional and time series data """ from __future__ import division +from itertools import islice # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 @@ -118,11 +119,13 @@ class Series(generic.NDFrame): dtype : numpy.dtype or None If None, dtype will be inferred copy : boolean, default False, copy input data + count : int or None, number of values to read from a generator. 
+ If None reads the whole generator """ _metadata = ['name'] def __init__(self, data=None, index=None, dtype=None, name=None, - copy=False, fastpath=False): + copy=False, count=None, fastpath=False): # we are called internally, so short-circuit if fastpath: @@ -192,7 +195,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None, name = data.name data = np.asarray(data) elif isinstance(data, types.GeneratorType): - data = list(data) + data = list(islice(data, count)) elif isinstance(data, (set, frozenset)): raise TypeError("{0!r} type is unordered" "".format(data.__class__.__name__)) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index bded2fad36763..b6f18664d3e5b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2791,6 +2791,15 @@ def test_constructor_generator(self): expected = DataFrame({ 0 : range(10), 1 : 'a' }) assert_frame_equal(result, expected, check_dtype=False) + def test_constructor_generator_count_limit(self): + generator_length = 10 + expected_length = 5 + + #only works when data is a generator, not a collection + gen = ([ i, 'a'] for i in range(generator_length)) + result = DataFrame(gen, count=expected_length) + self.assertEqual(len(result), expected_length) + def test_constructor_list_of_dicts(self): data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]), OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]), @@ -3820,6 +3829,14 @@ def list_generator(length): result = DataFrame.from_records(generator, columns=columns_names) assert_frame_equal(result, expected) + def test_from_records_generator_count_limit(self): + def generator(length): + for i in range(length): + yield (i, i/2) + expected_length = 5 + df = DataFrame.from_records(generator(10), count=expected_length) + self.assertEqual(len(df), expected_length) + def test_from_records_columns_not_modified(self): tuples = [(1, 2, 3), (1, 2, 3), diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 
5a4cbf1a6e16e..fbc06f96b4249 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -403,6 +403,14 @@ def test_constructor_generator(self): exp.index = lrange(10, 20) assert_series_equal(result, exp) + def test_constructor_generator_count_limit(self): + generator_length = 10 + expected_length = 5 + gen = (i for i in range(generator_length)) + + result = Series(gen, count=expected_length) + self.assertEqual(len(result), expected_length) + def test_constructor_categorical(self): cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c']) res = Series(cat)