PERF: Avoid fragmentation of DataFrame in read_sas (pandas-dev#48603)

* PERF: Avoid fragmentation of DataFrame in read_sas
* Add whatsnew
* Add warning
noatamir · Nov 9, 2022 · bb48d11 · bb48d11
1 parent 7278f77
commit bb48d11
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 3 deletions.
diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst
@@ -219,7 +219,7 @@ MultiIndex
 
 I/O
 ^^^
--
+- Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`)
 -
 
 Period

diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
@@ -481,7 +481,7 @@ def read(self, nrows: int | None = None) -> pd.DataFrame:
         raw = self.filepath_or_buffer.read(read_len)
         data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
 
-        df = pd.DataFrame(index=range(read_lines))
+        df_data = {}
         for j, x in enumerate(self.columns):
             vec = data["s" + str(j)]
             ntype = self.fields[j]["ntype"]
@@ -496,7 +496,8 @@ def read(self, nrows: int | None = None) -> pd.DataFrame:
                 if self._encoding is not None:
                     v = [y.decode(self._encoding) for y in v]
 
-            df[x] = v
+            df_data.update({x: v})
+        df = pd.DataFrame(df_data)
 
         if self._index is None:
             df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
Original file line number | Diff line change
@@ -219,7 +219,7 @@ MultiIndex
     I/O
     ^^^
-    -
+    - Bug in :func:`read_sas` caused fragmentation of :class:`DataFrame` and raised :class:`.errors.PerformanceWarning` (:issue:`48595`)
     -
     Period
@@ Expand Down @@