Skip to content

fix for 61123 read_excel-nrows-param-reads-extra-rows #61127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
@@ -696,6 +696,7 @@ def f(skiprows: Sequence, x: int) -> bool:
# the number of rows read from file
return None

# Read the requested sheet(s) and parse each one into a DataFrame
def parse(
self,
sheet_name: str | int | list[int] | list[str] | None = 0,
@@ -748,13 +749,15 @@ def parse(
if verbose:
print(f"Reading sheet {asheetname}")

# Get the sheet object based on name or index
if isinstance(asheetname, str):
sheet = self.get_sheet_by_name(asheetname)
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, file_rows_needed)

if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
@@ -764,6 +767,11 @@ def parse(
output[asheetname] = DataFrame()
continue

# Ensure we don't process more rows than requested with nrows
# This is a safeguard in case get_sheet_data returns more rows than requested
if nrows is not None and len(data) > nrows:
data = data[:nrows + (0 if header is None else header + 1)]

output = self._parse_sheet(
data=data,
output=output,
5 changes: 4 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
@@ -625,7 +625,10 @@ def get_sheet_data(
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]
if file_rows_needed is None:
# Only trim trailing empty rows when file_rows_needed is None
# to ensure we return exactly file_rows_needed rows when specified
data = data[: last_row_with_data + 1]

if len(data) > 0:
# extend rows to max width
5 changes: 5 additions & 0 deletions pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
@@ -124,4 +124,9 @@ def get_sheet_data(
data_row + (max_width - len(data_row)) * empty_cell
for data_row in data
]

# Ensure we return exactly file_rows_needed rows if specified
if file_rows_needed is not None and len(data) > file_rows_needed:
data = data[:file_rows_needed]

return data
1 change: 1 addition & 0 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -110,6 +110,7 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = time(
cell_contents.hour,
cell_contents.minute,
# xlrd implementation already correctly limits rows to file_rows_needed
cell_contents.second,
cell_contents.microsecond,
)
74 changes: 74 additions & 0 deletions pandas/tests/io/excel/run_nrows_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
Standalone script to test nrows parameter with adjacent tables in Excel files.
This script can be run directly with Python without using pytest.
Usage:
python pandas/tests/io/excel/run_nrows_test.py
"""
import os
import tempfile
import pandas as pd


def run_test():
    """
    Check that ``nrows`` stops at the end of the first of two stacked tables.

    Two workbooks are written into a temporary directory. Each holds the same
    pair of 3-row tables on ``Sheet1``: in one workbook an empty sheet row
    separates the tables, in the other the second table's header sits directly
    under the first table's last data row. Both workbooks are then read with
    ``nrows=3`` and the result must equal the first table alone.
    """
    with tempfile.TemporaryDirectory() as workdir:
        gapped_path = os.path.join(workdir, "with_blank.xlsx")
        touching_path = os.path.join(workdir, "no_blank.xlsx")

        first_table = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        second_table = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        print("Creating Excel files...")

        # The first table fills sheet rows 0-3 (header + 3 data rows), so
        # startrow=5 leaves row 4 empty while startrow=4 leaves no gap.
        for target, second_start in ((gapped_path, 5), (touching_path, 4)):
            with pd.ExcelWriter(target) as writer:
                first_table.to_excel(writer, sheet_name="Sheet1", index=False)
                second_table.to_excel(
                    writer, sheet_name="Sheet1", startrow=second_start, index=False
                )

        print("Reading Excel files with nrows=3...")

        df1 = pd.read_excel(gapped_path, nrows=3)
        df2 = pd.read_excel(touching_path, nrows=3)

        # Only the first table should come back.
        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        print("Verifying results...")
        pd.testing.assert_frame_equal(df1, expected)
        pd.testing.assert_frame_equal(df2, expected)

        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"

        # The last row must be real data, not the second table's header.
        assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
        assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"

        print("All tests passed!")


if __name__ == "__main__":
    run_test()
64 changes: 64 additions & 0 deletions pandas/tests/io/excel/test_adjacent_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from __future__ import annotations

import pytest
import pandas as pd
import pandas._testing as tm

from pandas.io.excel import ExcelWriter


class TestAdjacentTables:
    """Tests for reading Excel files with adjacent tables."""

    @pytest.mark.parametrize(
        "engine,read_ext",
        [
            ("openpyxl", ".xlsx"),
            ("xlsxwriter", ".xlsx"),
        ],
    )
    def test_excel_read_adjacent_tables_nrows(self, engine, read_ext, tmp_path):
        """
        Test that ``nrows`` stops at the first table when two tables are stacked.

        GH-61123

        Parameters
        ----------
        engine : str
            Engine used to *write* the workbook.
        read_ext : str
            File extension of the workbook that is written and read back.
        tmp_path : pathlib.Path
            pytest-provided temporary directory.
        """
        # Skip, rather than error, when an optional dependency is missing.
        pytest.importorskip(engine)
        # xlsxwriter is a write-only engine, so it cannot be handed to
        # read_excel; every workbook is read back with openpyxl instead.
        reader_engine = "openpyxl"
        pytest.importorskip(reader_engine)

        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        # File 1: the upper table fills sheet rows 0-3 (header + 3 data rows),
        # so starting the lower table at row 5 leaves row 4 blank.
        file1 = tmp_path / f"test1{read_ext}"
        with ExcelWriter(file1, engine=engine) as writer:
            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)

        # File 2: the lower table's header (row 4) directly follows the upper
        # table's last data row.
        file2 = tmp_path / f"test2{read_ext}"
        with ExcelWriter(file2, engine=engine) as writer:
            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
            df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)

        # header=0 consumes the first sheet row; nrows=3 then asks for exactly
        # the three data rows of the upper table.
        df1 = pd.read_excel(file1, header=0, nrows=3, engine=reader_engine)
        df2 = pd.read_excel(file2, header=0, nrows=3, engine=reader_engine)

        # Expected data - just the upper table
        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        tm.assert_frame_equal(df1, expected)
        tm.assert_frame_equal(df2, expected)

        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"

        # If the lower table's header had been read as data, the last row
        # would hold its column labels. Compare as strings to avoid
        # elementwise-comparison warnings between ints and strs.
        last_row_values = [str(x) for x in df2.iloc[-1].values]
        assert "A" not in last_row_values, "Second table header was incorrectly included"
        assert "B" not in last_row_values, "Second table header was incorrectly included"
58 changes: 58 additions & 0 deletions pandas/tests/io/excel/test_excel_adjacent_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
Tests for reading Excel files with adjacent tables.
"""
import pytest
import pandas as pd
import pandas._testing as tm


class TestExcelAdjacentTables:
    """Regression tests for ``read_excel(nrows=...)`` on sheets with stacked tables."""

    @pytest.mark.parametrize("engine", ["openpyxl"])
    def test_nrows_with_adjacent_tables(self, engine, tmp_path):
        """
        ``nrows`` must return only the first table of a sheet.

        GH-61123: two workbooks are written, one with an empty sheet row
        between two tables and one with the tables touching. Reading either
        with ``nrows=3`` has to yield the three data rows of the upper table
        and nothing from the table below it.
        """
        top = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        bottom = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        # ``top`` fills sheet rows 0-3 (header + 3 data rows). Starting
        # ``bottom`` at row 5 leaves row 4 blank; starting it at row 4 places
        # its header directly under the last data row of ``top``.
        bottom_start_by_file = {"test1.xlsx": 5, "test2.xlsx": 4}
        for filename, bottom_start in bottom_start_by_file.items():
            with pd.ExcelWriter(tmp_path / filename, engine=engine) as writer:
                top.to_excel(writer, sheet_name="Sheet1", index=False)
                bottom.to_excel(
                    writer, sheet_name="Sheet1", startrow=bottom_start, index=False
                )

        # header=0 uses the first sheet row as column labels; nrows=3 then
        # requests the three data rows of the upper table.
        df1, df2 = (
            pd.read_excel(tmp_path / filename, header=0, nrows=3, engine=engine)
            for filename in bottom_start_by_file
        )

        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        tm.assert_frame_equal(df1, expected)
        tm.assert_frame_equal(df2, expected)

        # A leaked header row from the lower table would change the shape or
        # show up as the final row of df2.
        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"

        assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
        assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"
54 changes: 54 additions & 0 deletions pandas/tests/io/excel/test_minimal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""
Minimal test for reading Excel files with adjacent tables.
"""
import pytest
import pandas as pd
import pandas._testing as tm


def test_nrows_with_adjacent_tables(tmp_path):
    """
    ``read_excel(nrows=3)`` returns just the upper of two stacked tables.

    GH-61123: the same two tables are written to two workbooks, once with a
    blank sheet row between them and once with no gap. In both cases limiting
    the read with ``nrows`` must not pull in anything from the lower table.
    """

    def write_stacked(path, lower_startrow):
        # The upper table always starts at the top of the sheet; only the
        # position of the lower table differs between the two workbooks.
        with pd.ExcelWriter(path) as writer:
            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
            df_lower.to_excel(
                writer, sheet_name="Sheet1", startrow=lower_startrow, index=False
            )

    df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

    # Upper table occupies sheet rows 0-3, so startrow=5 leaves row 4 blank...
    with_gap = tmp_path / "test1.xlsx"
    write_stacked(with_gap, 5)

    # ...and startrow=4 puts the lower header right below the upper data.
    without_gap = tmp_path / "test2.xlsx"
    write_stacked(without_gap, 4)

    # First sheet row becomes the header; the next three rows are the data.
    df1 = pd.read_excel(with_gap, header=0, nrows=3)
    df2 = pd.read_excel(without_gap, header=0, nrows=3)

    expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    tm.assert_frame_equal(df1, expected)
    tm.assert_frame_equal(df2, expected)

    # Shape and final-row checks guard against the lower table's header
    # having been read as an extra data row.
    assert df1.shape == (3, 2)
    assert df2.shape == (3, 2)

    assert df2.iloc[-1, 0] == 3
    assert df2.iloc[-1, 1] == 6
59 changes: 59 additions & 0 deletions pandas/tests/io/excel/test_nrows_adjacent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Test for GH-61123: nrows parameter with adjacent tables in Excel files.
"""
import os
import pytest
import pandas as pd
import pandas._testing as tm


def test_nrows_with_adjacent_tables(tmp_path):
    """
    Test that nrows parameter correctly handles adjacent tables.

    This test creates two Excel files:

    1. One with a blank row between two tables
    2. One with no blank row between two tables

    Then it verifies that reading with nrows=3 returns only the first table
    in both cases.

    The test is skipped when openpyxl cannot be imported.
    """
    # Skip on the real condition (openpyxl importable). The previous guard
    # checked for a pandas source file relative to the working directory,
    # which skipped the test whenever pytest ran outside the repository root
    # and did not skip it when openpyxl was actually missing.
    pytest.importorskip("openpyxl")

    # Create test files
    file1 = tmp_path / "with_blank.xlsx"
    file2 = tmp_path / "no_blank.xlsx"

    # Create test data
    df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

    # Upper table fills sheet rows 0-3 (header + 3 data rows), so starting the
    # lower table at row 5 leaves row 4 blank.
    with pd.ExcelWriter(file1) as writer:
        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)

    # No blank row: the lower table's header is written to row 4.
    with pd.ExcelWriter(file2) as writer:
        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)

    # Read with nrows=3 (should only get the first table)
    df1 = pd.read_excel(file1, nrows=3)
    df2 = pd.read_excel(file2, nrows=3)

    # Expected result - just the first table
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    # Verify results
    tm.assert_frame_equal(df1, expected)
    tm.assert_frame_equal(df2, expected)

    # Verify shapes
    assert df1.shape == (3, 2)
    assert df2.shape == (3, 2)

    # Verify last row doesn't contain headers from second table
    assert df2.iloc[-1, 0] == 3
    assert df2.iloc[-1, 1] == 6
106 changes: 106 additions & 0 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
@@ -1167,6 +1167,10 @@ def test_read_excel_multiindex_header_only(self, read_ext):
tm.assert_frame_equal(result, expected)

def test_excel_old_index_format(self, read_ext):
"""
Test reading Excel files with old index format (pre-1.7).
See gh-4679.
"""
# see gh-4679
filename = "test_index_name_pre17" + read_ext

@@ -1239,6 +1243,108 @@ def test_excel_old_index_format(self, read_ext):
actual = pd.read_excel(filename, sheet_name="multi_no_names", index_col=[0, 1])
tm.assert_frame_equal(actual, expected)

# GH-issue: read_excel nrows parameter reads extra rows when tables are adjacent
# Test that nrows is respected even when tables are adjacent (no blank row between them)

# First table has header + 1 data row (2 rows total)
# We want to read only these 2 rows, not the header of the next table
num_rows_to_pull = 2

# Create test files with tables with and without blank rows between them
# File 1: Two tables with a blank row between
file1 = tmp_path / "test1.xlsx"
df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})
with pd.ExcelWriter(file1) as writer:
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
# Add blank row by starting lower table at row 5 (0-based index + header)
df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)

# File 2: Two tables with no blank row
file2 = tmp_path / "test2.xlsx"
with pd.ExcelWriter(file2) as writer:
df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
def test_excel_read_tables_with_and_without_blank_row(
    self, engine_and_read_ext, tmp_path
):
    """
    GH-61123

    Test that nrows parameter correctly handles adjacent tables with and
    without blank rows.

    Two workbooks are written with the same pair of tables on one sheet, once
    separated by a blank row and once touching. Reading either with
    ``nrows=3`` must return only the upper table.

    The test is skipped when the reader engine cannot handle the extension,
    when pandas has no writer for the extension, or when the writer's package
    is not installed.
    """
    engine, read_ext = engine_and_read_ext

    # Reader engines that understand exactly one file extension.
    required_ext = {"xlrd": ".xls", "odf": ".ods", "pyxlsb": ".xlsb"}
    if engine in required_ext and read_ext != required_ext[engine]:
        pytest.skip(f"Engine {engine} not compatible with {read_ext}")

    # Map each extension to an engine that can genuinely write that format.
    # ".xls" and ".xlsb" are deliberately absent: xlsxwriter emits xlsx
    # content whatever the file is named, so it cannot produce a file that
    # pyxlsb can read.
    # NOTE(review): the "xlwt" writer previously used for ".xls" is believed
    # to be unavailable in current pandas - confirm for the supported version.
    writer_by_ext = {".xlsx": "openpyxl", ".xlsm": "openpyxl", ".ods": "odf"}
    writer_engine = writer_by_ext.get(read_ext)
    if writer_engine is None:
        pytest.skip(f"No writer engine available for {read_ext}")

    # Skip up front if the writer's package is missing, so that genuine
    # ImportError/ValueError failures in the code under test are not
    # swallowed and reported as skips.
    pytest.importorskip(writer_engine)

    df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

    # File 1: the upper table fills sheet rows 0-3 (header + 3 data rows), so
    # starting the lower table at row 5 leaves row 4 blank.
    file1 = tmp_path / f"test1{read_ext}"
    with pd.ExcelWriter(file1, engine=writer_engine) as writer:
        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=5, index=False)

    # File 2: no blank row, the lower table's header is written to row 4.
    file2 = tmp_path / f"test2{read_ext}"
    with pd.ExcelWriter(file2, engine=writer_engine) as writer:
        df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
        df_lower.to_excel(writer, sheet_name="Sheet1", startrow=4, index=False)

    # header=0 consumes the first sheet row; nrows=3 then asks for exactly the
    # three data rows of the upper table.
    df1 = pd.read_excel(file1, header=0, nrows=3, engine=engine)
    df2 = pd.read_excel(file2, header=0, nrows=3, engine=engine)

    # Expected data - just the upper table
    expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    tm.assert_frame_equal(df1, expected)
    tm.assert_frame_equal(df2, expected)

    # If the lower table's header had been read, the shape would differ or the
    # last row of df2 would hold column labels instead of data.
    assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
    assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"

    assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
    assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"

def test_read_excel_bool_header_arg(self, read_ext):
# GH 6114
msg = "Passing a bool to header is invalid"
59 changes: 59 additions & 0 deletions test_adjacent_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Simple script to test nrows parameter with adjacent tables in Excel files.
Run this directly with: python test_adjacent_tables.py
"""
import os
import tempfile
import pandas as pd

def main():
    """
    Exercise ``read_excel(nrows=3)`` against two stacked-table workbooks.

    Writes one workbook with a blank sheet row between two tables and one
    without, reads both back limited to three rows, and asserts that only the
    first table is returned. Progress is reported on stdout.
    """

    def write_workbook(path, lower_startrow):
        # Upper table at the top of the sheet, lower table at the given row.
        with pd.ExcelWriter(path) as writer:
            df_upper.to_excel(writer, sheet_name="Sheet1", index=False)
            df_lower.to_excel(
                writer, sheet_name="Sheet1", startrow=lower_startrow, index=False
            )

    with tempfile.TemporaryDirectory() as tmp_dir:
        blank_between = os.path.join(tmp_dir, "with_blank.xlsx")
        back_to_back = os.path.join(tmp_dir, "no_blank.xlsx")

        df_upper = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df_lower = pd.DataFrame({"A": [7, 8, 9], "B": [10, 11, 12]})

        print("Creating Excel files...")

        # The upper table uses sheet rows 0-3 (header + 3 data rows):
        # row 5 leaves row 4 blank, row 4 leaves no gap at all.
        write_workbook(blank_between, 5)
        write_workbook(back_to_back, 4)

        print("Reading Excel files with nrows=3...")

        df1 = pd.read_excel(blank_between, nrows=3)
        df2 = pd.read_excel(back_to_back, nrows=3)

        # Both reads should reproduce the upper table and nothing else.
        expected = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        print("Verifying results...")
        pd.testing.assert_frame_equal(df1, expected)
        pd.testing.assert_frame_equal(df2, expected)

        assert df1.shape == (3, 2), f"Expected (3, 2) but got {df1.shape}"
        assert df2.shape == (3, 2), f"Expected (3, 2) but got {df2.shape}"

        # The final row must be data from the upper table, not the header of
        # the lower one.
        assert df2.iloc[-1, 0] == 3, f"Expected 3 but got {df2.iloc[-1, 0]}"
        assert df2.iloc[-1, 1] == 6, f"Expected 6 but got {df2.iloc[-1, 1]}"

        print("All tests passed!")


if __name__ == "__main__":
    main()