Skip to content

Commit 6858d0f

Browse files
bashtage gfyoung
authored and committed
BUG: Allow value labels to be read with iterator (#16926)
Allow value labels to be read before the iterator has been used. Fix issue where categorical data was incorrectly reformatted when write_index was False. Closes #16923.
1 parent a587d56 commit 6858d0f

File tree

3 files changed

+36
-19
lines changed

3 files changed

+36
-19
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ I/O
162162

163163
- Bug in :func:`read_csv` in which non integer values for the header argument generated an unhelpful / unrelated error message (:issue:`16338`)
164164

165+
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
165166

166167
Plotting
167168
^^^^^^^^

pandas/io/stata.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -997,6 +997,7 @@ def __init__(self, path_or_buf, convert_dates=True,
997997
self.path_or_buf = BytesIO(contents)
998998

999999
self._read_header()
1000+
self._setup_dtype()
10001001

10011002
def __enter__(self):
10021003
""" enter context manager """
@@ -1299,6 +1300,23 @@ def _read_old_header(self, first_char):
12991300
# necessary data to continue parsing
13001301
self.data_location = self.path_or_buf.tell()
13011302

1303+
def _setup_dtype(self):
1304+
"""Map between numpy and state dtypes"""
1305+
if self._dtype is not None:
1306+
return self._dtype
1307+
1308+
dtype = [] # Convert struct data types to numpy data type
1309+
for i, typ in enumerate(self.typlist):
1310+
if typ in self.NUMPY_TYPE_MAP:
1311+
dtype.append(('s' + str(i), self.byteorder +
1312+
self.NUMPY_TYPE_MAP[typ]))
1313+
else:
1314+
dtype.append(('s' + str(i), 'S' + str(typ)))
1315+
dtype = np.dtype(dtype)
1316+
self._dtype = dtype
1317+
1318+
return self._dtype
1319+
13021320
def _calcsize(self, fmt):
13031321
return (type(fmt) is int and fmt or
13041322
struct.calcsize(self.byteorder + fmt))
@@ -1472,22 +1490,10 @@ def read(self, nrows=None, convert_dates=None,
14721490
if nrows is None:
14731491
nrows = self.nobs
14741492

1475-
if (self.format_version >= 117) and (self._dtype is None):
1493+
if (self.format_version >= 117) and (not self._value_labels_read):
14761494
self._can_read_value_labels = True
14771495
self._read_strls()
14781496

1479-
# Setup the dtype.
1480-
if self._dtype is None:
1481-
dtype = [] # Convert struct data types to numpy data type
1482-
for i, typ in enumerate(self.typlist):
1483-
if typ in self.NUMPY_TYPE_MAP:
1484-
dtype.append(('s' + str(i), self.byteorder +
1485-
self.NUMPY_TYPE_MAP[typ]))
1486-
else:
1487-
dtype.append(('s' + str(i), 'S' + str(typ)))
1488-
dtype = np.dtype(dtype)
1489-
self._dtype = dtype
1490-
14911497
# Read data
14921498
dtype = self._dtype
14931499
max_read_len = (self.nobs - self._lines_read) * dtype.itemsize
@@ -1958,7 +1964,6 @@ def _prepare_categoricals(self, data):
19581964
return data
19591965

19601966
get_base_missing_value = StataMissingValue.get_base_missing_value
1961-
index = data.index
19621967
data_formatted = []
19631968
for col, col_is_cat in zip(data, is_cat):
19641969
if col_is_cat:
@@ -1981,8 +1986,7 @@ def _prepare_categoricals(self, data):
19811986

19821987
# Replace missing values with Stata missing value for type
19831988
values[values == -1] = get_base_missing_value(dtype)
1984-
data_formatted.append((col, values, index))
1985-
1989+
data_formatted.append((col, values))
19861990
else:
19871991
data_formatted.append((col, data[col]))
19881992
return DataFrame.from_items(data_formatted)

pandas/tests/io/test_stata.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,18 @@
99
from datetime import datetime
1010
from distutils.version import LooseVersion
1111

12-
import pytest
1312
import numpy as np
1413
import pandas as pd
1514
import pandas.util.testing as tm
15+
import pytest
1616
from pandas import compat
17+
from pandas._libs.tslib import NaT
1718
from pandas.compat import iterkeys
19+
from pandas.core.dtypes.common import is_categorical_dtype
1820
from pandas.core.frame import DataFrame, Series
1921
from pandas.io.parsers import read_csv
2022
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
2123
PossiblePrecisionLoss, StataMissingValue)
22-
from pandas._libs.tslib import NaT
23-
from pandas.core.dtypes.common import is_categorical_dtype
2424

2525

2626
class TestStata(object):
@@ -1297,3 +1297,15 @@ def test_pickle_path_localpath(self):
12971297
reader = lambda x: read_stata(x).set_index('index')
12981298
result = tm.round_trip_localpath(df.to_stata, reader)
12991299
tm.assert_frame_equal(df, result)
1300+
1301+
@pytest.mark.parametrize('write_index', [True, False])
1302+
def test_value_labels_iterator(self, write_index):
1303+
# GH 16923
1304+
d = {'A': ['B', 'E', 'C', 'A', 'E']}
1305+
df = pd.DataFrame(data=d)
1306+
df['A'] = df['A'].astype('category')
1307+
with tm.ensure_clean() as path:
1308+
df.to_stata(path, write_index=write_index)
1309+
dta_iter = pd.read_stata(path, iterator=True)
1310+
value_labels = dta_iter.value_labels()
1311+
assert value_labels == {'A': {0: 'A', 1: 'B', 2: 'C', 3: 'E'}}

0 commit comments

Comments
 (0)