chrisdev · wkschwartz · Mar 27, 2018 · Mar 27, 2018 · Mar 27, 2018 · Mar 28, 2018
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -23,5 +23,6 @@ Contributions
 - `Yousuf Jawwad <https://github.com/ysfjwd>`_
 - `@henhuy <https://github.com/henhuy>`_
 - `Hélio Meira Lins <https://github.com/meiralins>`_
+- `William Schwartz <https://github.com/wkschwartz>`_
 - `@utpyngo <https://github.com/utpyngo>`_
 
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -3,6 +3,10 @@ CHANGES
 0.5.1 (2018-01-)
 -----------------
 - Address Unicode decode error when installing with pip3 on docker (Thanks @utapyngo)
+- Fix `#63 <https://github.com/chrisdev/django-pandas/issues/63>`_: Use memory
+  efficient iteration in ``read_frame`` (by @wkschwartz)
+- Add ``compress`` argument to ``read_frame`` to infer NumPy data types for the
+  returned data frame's columns from the Django field types (by @wkschwartz)
 
 0.5.0 (2018-01-20)
 ------------------

diff --git a/README.rst b/README.rst
@@ -25,6 +25,7 @@ Contributors
 * `@henhuy <https://github.com/henhuy>`_
 * `Hélio Meira Lins <https://github.com/meiralins>`_
 * `@utpyngo <https://github.com/utpyngo>`_
+* `William Schwartz <https://github.com/wkschwartz>`_
 
 What's New
 ===========
@@ -119,6 +120,23 @@ read_frame
                 human readable versions of any foreign key or choice fields 
                 else use the actual values set in the model.
 
+    - compress: a false value, ``True``, or a mapping, default False
+                If a true value, infer `NumPy data types
+                <https://docs.scipy.org/doc/numpy/user/basics.types.html>`_ for
+                Pandas dataframe columns from the corresponding Django field
+                types. For example, Django's built-in ``SmallIntegerField`` is
+                cast to NumPy's ``int16``. If ``compress`` is a mapping (e.g., a
+                ``dict``), it should be a mapping with Django field subclasses
+                as keys and `NumPy dtypes
+                <https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html>`_
+                as values. This mapping overrides the defaults for the field
+                classes appearing in the mapping. However, the inference is
+                based on the field subclass lowest on a chain of subclasses,
+                that is, in order of inheritance. To override
+                ``SmallIntegerField`` it is therefore not sufficient to override
+                ``IntegerField``. Be careful with ``compress={}``: because
+                ``{}`` is a false value in Python, it would cause
+                ``read_frame`` not to compress columns.
 
 Examples
 ^^^^^^^^^

diff --git a/django_pandas/io.py b/django_pandas/io.py
@@ -1,6 +1,17 @@
+try:
+    from collections.abc import Mapping
+except ImportError: # pragma: no cover
+    Mapping = dict
+
 import pandas as pd
 from .utils import update_with_verbose, get_related_model
 import django
+from django.db.models import fields, ForeignKey
+import numpy as np
+try:
+    from django.contrib.gis.db.models import fields as geo_fields
+except (ImportError, django.core.exceptions.ImproperlyConfigured): # pragma: no cover
+    geo_fields = None
 
 
 def to_fields(qs, fieldnames):
@@ -32,14 +43,140 @@ def is_values_queryset(qs):
         return qs._iterable_class == django.db.models.query.ValuesIterable
 
 
+_FIELDS_TO_DTYPES = {
+    fields.AutoField:                  np.dtype(np.int32),
+    fields.BigAutoField:               np.dtype(np.int64),
+    fields.BigIntegerField:            np.dtype(np.int64),
+    fields.BinaryField:                object, # Pandas has no bytes type
+    fields.BooleanField:               np.dtype(np.bool_),
+    fields.CharField:                  object, # Pandas has no str type
+    fields.DateField:                  np.dtype('datetime64[D]'),
+    fields.DateTimeField:              np.dtype('datetime64[us]'),
+    fields.DecimalField:               object,
+    fields.DurationField:              np.dtype('timedelta64[us]'),
+    fields.EmailField:                 object,
+    fields.FilePathField:              object,
+    fields.FloatField:                 np.dtype(np.float64),
+    fields.GenericIPAddressField:      object,
+    fields.IntegerField:               np.dtype(np.int32),
+    fields.PositiveIntegerField:       np.dtype(np.uint32),
+    fields.PositiveSmallIntegerField:  np.dtype(np.uint16),
+    fields.SlugField:                  object,
+    fields.SmallIntegerField:          np.dtype(np.int16),
+    fields.TextField:                  object,
+    fields.TimeField:                  object,
+    fields.URLField:                   object,
+    fields.UUIDField:                  object,
+
+    # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing
+    # Explicitly setting NullBooleanField here can be removed when support for
+    # Django versions <= 2.0 is dropped. See
+    # https://github.com/django/django/pull/8467
+    fields.NullBooleanField:           object,
+}
+
+if geo_fields is not None:
+    _FIELDS_TO_DTYPES.update({
+        # Geometry fields
+        geo_fields.GeometryField:          object,
+        geo_fields.RasterField:            object,
+    })
+
+def _get_dtypes(fields_to_dtypes, fields, fieldnames):
+    """Infer NumPy dtypes from field types among those named in fieldnames.
+
+    Returns a list of (fieldname, NumPy dtype) pairs. Read about NumPy dtypes
+    here [#]_ and here [#]_. The returned list can be passed to ``numpy.array``
+    in ``read_frame``.
+
+    Parameters
+    ----------
+
+    fields_to_dtypes : mapping
+        A (potentially empty) mapping of Django field classes to NumPy dtypes.
+        This mapping overrides the defaults from ``_FIELDS_TO_DTYPES``. The
+        back-up default dtype is ``object`` for unfamiliar field classes.
+
+    fields : list of Django field class instances
+        They must correspond in order to the columns of the dataframe that
+        ``read_frame`` is building.
+
+    fieldnames : iterable of names of the fields as they will appear in the data
+        frame
+
+    .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
+    .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
+    """
+    dtypes = []
+    f2d = _FIELDS_TO_DTYPES.copy()
+    f2d.update(fields_to_dtypes)
+    for k, v in f2d.items():
+        if not issubclass(k, django.db.models.fields.Field):
+            raise TypeError('Expected a type of field, not {!r}'.format(k))
+        if not isinstance(v, np.dtype):
+            f2d[k] = np.dtype(v)
+    for field, name in zip(fields, fieldnames):
+        # Get field.null before switching to target field since foreign key can
+        # be nullable even while the target isn't, and vice versa.
+        nullable = field.null
+        if isinstance(field, ForeignKey):
+            field = field.target_field
+        nullable = nullable or field.null
+
+        # Find the lowest subclass among the keys of f2d
+        t, dtype = object, np.generic
+        for k, v in f2d.items():
+            if isinstance(field, k) and issubclass(k, t):
+                t, dtype = k, v
+
+        # Handle nulls for integer and boolean types
+        if nullable and issubclass(dtype.type, (np.bool_, bool)):
+            # Pandas handles nullable booleans as objects. See
+            # https://pandas.pydata.org/pandas-docs/stable/missing_data.html#missing-data-casting-rules-and-indexing
+            # Not needed until Django 2.1. See
+            # https://github.com/django/django/pull/8467
+            dtype = np.object_
+        elif nullable and issubclass(dtype.type, (np.integer, int)):
+            # dtype.itemsize is denominated in bytes. Check it against the
+            # number of mantissa bits since the max exact integer is
+            # 2**(mantissa bits):
+            #   >>> 2**sys.float_info.mant_dig - 1 == int(float(2**sys.float_info.mant_dig - 1))
+            #   True
+            #   >>> 2**sys.float_info.mant_dig     == int(float(2**sys.float_info.mant_dig))
+            #   True
+            #   >>> 2**sys.float_info.mant_dig + 1 == int(float(2**sys.float_info.mant_dig + 1))
+            #   False
+            # Thus the integer needs to fit into ((mantissa bits) - 1) bits
+            # https://docs.scipy.org/doc/numpy-dev/user/basics.types.html
+            def fits(itype, ftype):
+                return np.iinfo(itype).bits <= (np.finfo(ftype).nmant - 1)
+            if fits(dtype, np.float16):
+                dtype = np.float16
+            elif fits(dtype, np.float32):
+                dtype = np.float32
+            elif fits(dtype, np.float64):
+                dtype = np.float64
+            elif fits(dtype, np.longdouble):
+                dtype = np.longdouble
+            else:
+                dtype = np.object_
+
+        dtypes.append((name, dtype))
+    return dtypes
+
+
 def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
-               verbose=True):
+               verbose=True, compress=False):
     """
     Returns a dataframe from a QuerySet
 
     Optionally specify the field names/columns to utilize and
     a field as the index
 
+    This function uses the QuerySet's ``iterator`` method, so it does not
+    populate the QuerySet's cache. This is more memory efficient in the typical
+    case where you do not use the QuerySet after ``read_frame``.
+
     Parameters
     ----------
 
@@ -58,15 +195,49 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
     coerce_float : boolean, default False
         Attempt to convert values to non-string, non-numeric data (like
         decimal.Decimal) to floating point, useful for SQL result sets
+        Does not work with ``compress``.
 
     verbose:  boolean If  this is ``True`` then populate the DataFrame with the
                 human readable versions of any foreign key fields else use
                 the primary keys values.
                 The human readable version of the foreign key field is
                 defined in the ``__unicode__`` or ``__str__``
                 methods of the related class definition
-   """
 
+    compress: a false value, ``True``, or a mapping, default False
+        If a true value, infer NumPy data types [#]_ for Pandas dataframe
+        columns from the corresponding Django field types. For example, Django's
+        built-in ``SmallIntegerField`` is cast to NumPy's ``int16``. If
+        ``compress`` is a mapping (e.g., a ``dict``), it should be a mapping
+        with Django field subclasses as keys and NumPy dtypes [#]_ as values.
+        This mapping overrides the defaults for the field classes appearing in
+        the mapping. However, the inference is based on the field subclass
+        lowest on a chain of subclasses, that is, in order of inheritance.
+        To override ``SmallIntegerField`` it is therefore not sufficient to
+        override ``IntegerField``. Be careful with ``compress={}``: because
+        ``{}`` is a false value in Python, it would cause ``read_frame``
+        not to compress columns.
+
+        Does not work with ``coerce_float``.
+
+    Known Issues
+    ------------
+
+    When using ``compress=True`` with a nullable foreign key field the double-
+    underscore import name may not work but the single-underscore import name
+    should. For example, suppose model ``A`` has a nullable foreign key field
+    ``b`` pointing at model ``B``, and that both models' primary key fields are
+    called ``id``. Suppose further that ``A``'s table has some entries with
+    null values of ``b`` and some with non-null values.
+    ``read_frame(A.objects.all(), ['b', 'b_id'])`` and
+    ``read_frame(A.objects.filter(b__isnull=False), ['b__id'])`` will work as
+    expected, but ``read_frame(A.objects.all(), ['b__id'])`` will not.
+
+    .. [#] https://docs.scipy.org/doc/numpy/user/basics.types.html
+    .. [#] https://docs.scipy.org/doc/numpy/reference/arrays.dtypes.html
+   """
+    if coerce_float and compress:
+        raise ValueError('Cannot use coerce_float and compress at the same time')
     if fieldnames:
         fieldnames = pd.unique(fieldnames)
         if index_col is not None and index_col not in fieldnames:
@@ -108,11 +279,24 @@ def read_frame(qs, fieldnames=(), index_col=None, coerce_float=False,
         fields = qs.model._meta.fields
         fieldnames = [f.name for f in fields]
 
-    if is_values_queryset(qs):
-        recs = list(qs)
-    else:
-        recs = list(qs.values_list(*fieldnames))
+    if not issubclass(qs._iterable_class, django.db.models.query.ValuesListIterable):
+        qs = qs.values_list(*fieldnames)
+    recs = qs.iterator()
 
+    if compress:
+        if not isinstance(compress, (bool, Mapping)):
+            raise TypeError('Ambiguous compress argument: {!r}'.format(compress))
+        if not isinstance(compress, Mapping):
+            compress = {}
+        dtype = _get_dtypes(compress, fields, fieldnames)
+        # As long as there are no object dtypes, we can avoid the intermediate
+        # list, but np.fromiter chokes on dtype('O').
+        if np.dtype('O') in [dt[1] for dt in dtype]: # small list, set not needed
+            recs = np.array(list(recs), dtype=dtype)
+        else:
+            # Skip the count argument because qs.count() may take more time than
+            # just reallocating memory as NumPy consumes the iterator.
+            recs = np.fromiter(recs, dtype=dtype)
     df = pd.DataFrame.from_records(recs, columns=fieldnames,
                                    coerce_float=coerce_float)
 

diff --git a/django_pandas/tests/models.py b/django_pandas/tests/models.py
@@ -1,3 +1,7 @@
+import datetime as dt
+from decimal import Decimal
+from uuid import UUID
+
 from django.db import models
 from django.utils.encoding import python_2_unicode_compatible
 from django_pandas.managers import DataFrameManager, PassThroughManager
@@ -9,7 +13,7 @@ class MyModel(models.Model):
     col1 = models.IntegerField()
     col2 = models.FloatField(null=True)
     col3 = models.FloatField(null=True)
-    col4 = models.IntegerField()
+    col4 = models.SmallIntegerField()
 
     def __str__(self):
         return "{} {} {} {}".format(
@@ -32,6 +36,53 @@ class MyModelChoice(models.Model):
     objects = DataFrameManager()
 
 
+class ByteField(models.SmallIntegerField):
+    pass
+
+class CompressableModel(models.Model):
+    # Can only have one auto field per model and id is added automatically
+    # id        = models.AutoField(primary_key=True)
+    # bigauto   = models.BigAutoField()
+
+    bigint      = models.BigIntegerField(default=2**63 - 1)
+    binary      = models.BinaryField(default=b'test bytes')
+    boolean     = models.BooleanField(default=True)
+    char        = models.CharField(max_length=10, default='test chars')
+    date        = models.DateField(default=dt.date(2018, 3, 27))
+    datetime    = models.DateTimeField(default=dt.datetime(2018, 3, 27, 13, 55, 56))
+    decimal     = models.DecimalField(decimal_places=1, max_digits=3, default=Decimal(1.5))
+    duration    = models.DurationField(default=dt.timedelta(minutes=1, seconds=1))
+    email       = models.EmailField(default="an+email@address.com")
+    filepath    = models.FilePathField(default="/usr/local/bin/python")
+    floating    = models.FloatField(default=1.2)
+    ip          = models.GenericIPAddressField(default="::ffff:192.0.2.1")
+    integer     = models.IntegerField(default=2**31 - 1)
+    nullboolean = models.NullBooleanField(default=None)
+    uint        = models.PositiveIntegerField(default=2**31 - 1)
+    ushort      = models.PositiveSmallIntegerField(default=2**15 - 1)
+    slug        = models.SlugField(default="test_slug")
+    short       = models.SmallIntegerField(default=-(2**15 - 1))
+    text        = models.TextField(default="test text")
+    time        = models.TimeField(default=dt.time(13, 55, 56))
+    url         = models.URLField(default="https://github.com/chrisdev/django-pandas")
+    uuid        = models.UUIDField(default=UUID(int=1234556789))
+
+    # Custom field
+    byte        = ByteField(default=127)
+
+
+class CompressableModelWithNulls(models.Model):
+    bigint      = models.BigIntegerField(null=True, default=None)
+    floating    = models.FloatField(null=True, default=None)
+    integer     = models.IntegerField(null=True, default=None)
+    nullboolean = models.NullBooleanField(null=True, default=None)
+    uint        = models.PositiveIntegerField(null=True, default=None)
+    ushort      = models.PositiveSmallIntegerField(null=True, default=None)
+    short       = models.SmallIntegerField(null=True, default=None)
+    # Custom field
+    byte        = ByteField(null=True, default=None)
+
+
 @python_2_unicode_compatible
 class DataFrame(models.Model):