pandas-dev · jorisvandenbossche · Feb 8, 2021 · Jan 30, 2021 · Jan 30, 2021 · Jan 31, 2021
diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst
@@ -36,6 +36,7 @@ Bug fixes
 
 - :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
 - Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the excel file is missing or has incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, prior versions may still fail (:issue:`38956`, :issue:`39001`)
+- Fixed bug in :func:`pandas.read_excel` sometimes producing a ``DataFrame`` with trailing rows of ``np.nan`` when the engine ``openpyxl`` is used (:issue:`39181`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
@@ -546,10 +546,16 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
             sheet.reset_dimensions()
 
         data: List[List[Scalar]] = []
+        last_row_with_data = -1
         for row_number, row in enumerate(sheet.rows):
             converted_row = [self._convert_cell(cell, convert_float) for cell in row]
+            if not all(cell == "" for cell in converted_row):
+                last_row_with_data = row_number
             data.append(converted_row)
 
+        # Trim trailing empty rows
+        data = data[: last_row_with_data + 1]
+
         if version >= "3.0.0" and is_readonly and len(data) > 0:
             # With dimension reset, openpyxl no longer pads rows
             max_width = max(len(data_row) for data_row in data)

diff --git a/pandas/tests/io/data/excel/empty_trailing_rows.xlsx b/pandas/tests/io/data/excel/empty_trailing_rows.xlsx
diff --git a/pandas/tests/io/data/excel/empty_with_blank_row.xlsx b/pandas/tests/io/data/excel/empty_with_blank_row.xlsx
diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py
@@ -189,3 +189,53 @@ def test_append_mode_file(ext):
         second = data.find(b"docProps/app.xml", first + 1)
         third = data.find(b"docProps/app.xml", second + 1)
         assert second != -1 and third == -1
+
+
+# When read_only is None, use read_excel instead of a workbook
+@pytest.mark.parametrize("read_only", [True, False, None])
+def test_read_with_empty_trailing_rows(datapath, ext, read_only, request):
+    # GH 39181
+    # Trailing empty rows in the sheet must not show up as trailing NaN rows
+    # in the result, whether reading from a path or a pre-loaded workbook.
+    version = LooseVersion(get_version(openpyxl))
+    if (read_only or read_only is None) and version < "3.0.0":
+        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
+        request.node.add_marker(pytest.mark.xfail(reason=msg))
+    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
+    if read_only is None:
+        result = pd.read_excel(path)
+    else:
+        wb = openpyxl.load_workbook(path, read_only=read_only)
+        try:
+            result = pd.read_excel(wb, engine="openpyxl")
+        finally:
+            # Close the workbook even when read_excel raises
+            wb.close()
+    # The leading all-NaN row is kept; only rows after the last data row go
+    expected = DataFrame(
+        {
+            "Title": [np.nan, "A", 1, 2, 3],
+            "Unnamed: 1": [np.nan, "B", 4, 5, 6],
+            "Unnamed: 2": [np.nan, "C", 7, 8, 9],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+# When read_only is None, use read_excel instead of a workbook
+@pytest.mark.parametrize("read_only", [True, False, None])
+def test_read_empty_with_blank_row(datapath, ext, read_only):
+    # GH 39547 - empty excel file with a row that has no data
+    path = datapath("io", "data", "excel", f"empty_with_blank_row{ext}")
+    if read_only is None:
+        result = pd.read_excel(path)
+    else:
+        wb = openpyxl.load_workbook(path, read_only=read_only)
+        try:
+            result = pd.read_excel(wb, engine="openpyxl")
+        finally:
+            # Close the workbook even when read_excel raises
+            wb.close()
+    # A sheet holding only a blank row should read as an empty frame
+    expected = DataFrame()
+    tm.assert_frame_equal(result, expected)