@@ -997,6 +997,7 @@ def __init__(self, path_or_buf, convert_dates=True,
997
997
self .path_or_buf = BytesIO (contents )
998
998
999
999
self ._read_header ()
1000
+ self ._setup_dtype ()
1000
1001
1001
1002
def __enter__ (self ):
1002
1003
""" enter context manager """
@@ -1299,6 +1300,23 @@ def _read_old_header(self, first_char):
1299
1300
# necessary data to continue parsing
1300
1301
self .data_location = self .path_or_buf .tell ()
1301
1302
1303
def _setup_dtype(self):
    """Build, cache and return the numpy record dtype for the data rows.

    One field is created per entry of ``self.typlist``; fields are named
    ``s0``, ``s1``, ... in column order.  An entry that is a key of
    ``self.NUMPY_TYPE_MAP`` becomes the mapped numpy type code prefixed
    with ``self.byteorder``.  Any other entry becomes ``'S' + str(typ)``
    (presumably ``typ`` is then the byte width of a fixed-length string
    column -- confirm against the header parser).

    The result is stored on ``self._dtype``; when ``self._dtype`` is
    already set it is returned unchanged and nothing is rebuilt.

    Returns
    -------
    numpy.dtype
        The structured dtype stored on ``self._dtype``.
    """
    # Reuse a previously built dtype instead of rebuilding it.
    if self._dtype is not None:
        return self._dtype

    numpy_types = self.NUMPY_TYPE_MAP
    byteorder = self.byteorder
    fields = [
        (
            's' + str(i),
            (byteorder + numpy_types[typ]) if typ in numpy_types
            else ('S' + str(typ)),
        )
        for i, typ in enumerate(self.typlist)
    ]
    self._dtype = np.dtype(fields)
    return self._dtype
1319
+
1302
1320
def _calcsize(self, fmt):
    """Return the size in bytes described by ``fmt``.

    Parameters
    ----------
    fmt : int or str
        An integer is returned unchanged (it is taken to already be a
        byte count).  Anything else is concatenated onto
        ``self.byteorder`` and passed to ``struct.calcsize``.

    Returns
    -------
    int
        ``fmt`` itself for integers, otherwise the ``struct`` size of
        the byte-order-qualified format.
    """
    # An explicit branch instead of ``cond and fmt or ...``: the and/or
    # form fell through to struct.calcsize for fmt == 0 and then raised
    # TypeError when adding the byte-order string to an int.
    if isinstance(fmt, int):
        return fmt
    return struct.calcsize(self.byteorder + fmt)
@@ -1472,22 +1490,10 @@ def read(self, nrows=None, convert_dates=None,
1472
1490
if nrows is None :
1473
1491
nrows = self .nobs
1474
1492
1475
- if (self .format_version >= 117 ) and (self ._dtype is None ):
1493
+ if (self .format_version >= 117 ) and (not self ._value_labels_read ):
1476
1494
self ._can_read_value_labels = True
1477
1495
self ._read_strls ()
1478
1496
1479
- # Setup the dtype.
1480
- if self ._dtype is None :
1481
- dtype = [] # Convert struct data types to numpy data type
1482
- for i , typ in enumerate (self .typlist ):
1483
- if typ in self .NUMPY_TYPE_MAP :
1484
- dtype .append (('s' + str (i ), self .byteorder +
1485
- self .NUMPY_TYPE_MAP [typ ]))
1486
- else :
1487
- dtype .append (('s' + str (i ), 'S' + str (typ )))
1488
- dtype = np .dtype (dtype )
1489
- self ._dtype = dtype
1490
-
1491
1497
# Read data
1492
1498
dtype = self ._dtype
1493
1499
max_read_len = (self .nobs - self ._lines_read ) * dtype .itemsize
@@ -1958,7 +1964,6 @@ def _prepare_categoricals(self, data):
1958
1964
return data
1959
1965
1960
1966
get_base_missing_value = StataMissingValue .get_base_missing_value
1961
- index = data .index
1962
1967
data_formatted = []
1963
1968
for col , col_is_cat in zip (data , is_cat ):
1964
1969
if col_is_cat :
@@ -1981,8 +1986,7 @@ def _prepare_categoricals(self, data):
1981
1986
1982
1987
# Replace missing values with Stata missing value for type
1983
1988
values [values == - 1 ] = get_base_missing_value (dtype )
1984
- data_formatted .append ((col , values , index ))
1985
-
1989
+ data_formatted .append ((col , values ))
1986
1990
else :
1987
1991
data_formatted .append ((col , data [col ]))
1988
1992
return DataFrame .from_items (data_formatted )
0 commit comments