-
-
Notifications
You must be signed in to change notification settings - Fork 18.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added 'displayed_only' option to 'read_html' #20047
Changes from all commits
234fe8b
b2f24bb
509c9e2
00f1b5f
61bac89
0c7b137
b1d0f91
2960025
2cc98d9
3093879
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -160,6 +160,14 @@ class _HtmlFrameParser(object): | |
attrs : dict | ||
List of HTML <table> element attributes to match. | ||
|
||
encoding : str | ||
Encoding to be used by parser | ||
|
||
displayed_only : bool | ||
Whether or not items with "display:none" should be ignored | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a versionadded tag? |
||
|
||
.. versionadded:: 0.23.0 | ||
|
||
Attributes | ||
---------- | ||
io : str or file-like | ||
|
@@ -172,6 +180,14 @@ class _HtmlFrameParser(object): | |
A dictionary of valid table attributes to use to search for table | ||
elements. | ||
|
||
encoding : str | ||
Encoding to be used by parser | ||
|
||
displayed_only : bool | ||
Whether or not items with "display:none" should be ignored | ||
|
||
.. versionadded:: 0.23.0 | ||
|
||
Notes | ||
----- | ||
To subclass this class effectively you must override the following methods: | ||
|
@@ -187,11 +203,12 @@ class _HtmlFrameParser(object): | |
functionality. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add this to the attributes list? |
||
""" | ||
|
||
def __init__(self, io, match, attrs, encoding): | ||
def __init__(self, io, match, attrs, encoding, displayed_only): | ||
self.io = io | ||
self.match = match | ||
self.attrs = attrs | ||
self.encoding = encoding | ||
self.displayed_only = displayed_only | ||
|
||
def parse_tables(self): | ||
tables = self._parse_tables(self._build_doc(), self.match, self.attrs) | ||
|
@@ -380,6 +397,27 @@ def _parse_raw_tbody(self, table): | |
res = self._parse_tr(table) | ||
return self._parse_raw_data(res) | ||
|
||
def _handle_hidden_tables(self, tbl_list, attr_name): | ||
"""Returns list of tables, potentially removing hidden elements | ||
|
||
Parameters | ||
---------- | ||
tbl_list : list of Tag or list of Element | ||
Type of list elements will vary depending upon parser used | ||
attr_name : str | ||
Name of the accessor for retrieving HTML attributes | ||
|
||
Returns | ||
------- | ||
list of Tag or list of Element | ||
Return type matches `tbl_list` | ||
""" | ||
if not self.displayed_only: | ||
return tbl_list | ||
|
||
return [x for x in tbl_list if "display:none" not in | ||
getattr(x, attr_name).get('style', '').replace(" ", "")] | ||
|
||
|
||
class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): | ||
"""HTML to DataFrame parser that uses BeautifulSoup under the hood. | ||
|
@@ -431,8 +469,14 @@ def _parse_tables(self, doc, match, attrs): | |
|
||
result = [] | ||
unique_tables = set() | ||
tables = self._handle_hidden_tables(tables, "attrs") | ||
|
||
for table in tables: | ||
if self.displayed_only: | ||
for elem in table.find_all( | ||
style=re.compile(r"display:\s*none")): | ||
elem.decompose() | ||
|
||
if (table not in unique_tables and | ||
table.find(text=match) is not None): | ||
result.append(table) | ||
|
@@ -528,6 +572,17 @@ def _parse_tables(self, doc, match, kwargs): | |
|
||
tables = doc.xpath(xpath_expr, namespaces=_re_namespace) | ||
|
||
tables = self._handle_hidden_tables(tables, "attrib") | ||
if self.displayed_only: | ||
for table in tables: | ||
# lxml utilizes XPATH 1.0 which does not have regex | ||
# support. As a result, we find all elements with a style | ||
# attribute and iterate them to check for display:none | ||
for elem in table.xpath('.//*[@style]'): | ||
if "display:none" in elem.attrib.get( | ||
"style", "").replace(" ", ""): | ||
elem.getparent().remove(elem) | ||
|
||
if not tables: | ||
raise ValueError("No tables found matching regex {patt!r}" | ||
.format(patt=pattern)) | ||
|
@@ -729,15 +784,15 @@ def _validate_flavor(flavor): | |
return flavor | ||
|
||
|
||
def _parse(flavor, io, match, attrs, encoding, **kwargs): | ||
def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): | ||
flavor = _validate_flavor(flavor) | ||
compiled_match = re.compile(match) # you can pass a compiled regex here | ||
|
||
# hack around python 3 deleting the exception variable | ||
retained = None | ||
for flav in flavor: | ||
parser = _parser_dispatch(flav) | ||
p = parser(io, compiled_match, attrs, encoding) | ||
p = parser(io, compiled_match, attrs, encoding, displayed_only) | ||
|
||
try: | ||
tables = p.parse_tables() | ||
|
@@ -773,7 +828,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
skiprows=None, attrs=None, parse_dates=False, | ||
tupleize_cols=None, thousands=',', encoding=None, | ||
decimal='.', converters=None, na_values=None, | ||
keep_default_na=True): | ||
keep_default_na=True, displayed_only=True): | ||
r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. | ||
|
||
Parameters | ||
|
@@ -877,6 +932,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
|
||
.. versionadded:: 0.19.0 | ||
|
||
displayed_only : bool, default True | ||
Whether elements with "display: none" should be ignored (i.e. not parsed) | ||
|
||
.. versionadded:: 0.23.0 | ||
|
||
Returns | ||
------- | ||
dfs : list of DataFrames | ||
|
@@ -924,4 +984,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, | |
parse_dates=parse_dates, tupleize_cols=tupleize_cols, | ||
thousands=thousands, attrs=attrs, encoding=encoding, | ||
decimal=decimal, converters=converters, na_values=na_values, | ||
keep_default_na=keep_default_na) | ||
keep_default_na=keep_default_na, | ||
displayed_only=displayed_only) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -674,6 +674,39 @@ def test_wikipedia_states_table(self): | |
result = self.read_html(data, 'Arizona', header=1)[0] | ||
assert result['sq mi'].dtype == np.dtype('float64') | ||
|
||
@pytest.mark.parametrize("displayed_only,exp0,exp1", [ | ||
(True, DataFrame(["foo"]), None), | ||
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) | ||
def test_displayed_only(self, displayed_only, exp0, exp1): | ||
# GH 20027 | ||
data = StringIO("""<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td> | ||
foo | ||
<span style="display:none;text-align:center">bar</span> | ||
<span style="display:none">baz</span> | ||
<span style="display: none">qux</span> | ||
</td> | ||
</tr> | ||
</table> | ||
<table style="display: none"> | ||
<tr> | ||
<td>foo</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html>""") | ||
|
||
dfs = self.read_html(data, displayed_only=displayed_only) | ||
tm.assert_frame_equal(dfs[0], exp0) | ||
|
||
if exp1 is not None: | ||
tm.assert_frame_equal(dfs[1], exp1) | ||
else: | ||
assert len(dfs) == 1 # Should not parse hidden table | ||
|
||
def test_decimal_rows(self): | ||
|
||
# GH 12907 | ||
|
@@ -896,6 +929,39 @@ def test_computer_sales_page(self): | |
data = os.path.join(DATA_PATH, 'computer_sales_page.html') | ||
self.read_html(data, header=[0, 1]) | ||
|
||
@pytest.mark.parametrize("displayed_only,exp0,exp1", [ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a blatant copy/paste of the method above. I was surprised by how few tests are shared between the parsers, so there is an opportunity to consolidate many of these tests into a base class, but I figured that was better done comprehensively than squeezed into this change. |
||
(True, DataFrame(["foo"]), None), | ||
(False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) | ||
def test_displayed_only(self, displayed_only, exp0, exp1): | ||
# GH 20027 | ||
data = StringIO("""<html> | ||
<body> | ||
<table> | ||
<tr> | ||
<td> | ||
foo | ||
<span style="display:none;text-align:center">bar</span> | ||
<span style="display:none">baz</span> | ||
<span style="display: none">qux</span> | ||
</td> | ||
</tr> | ||
</table> | ||
<table style="display: none"> | ||
<tr> | ||
<td>foo</td> | ||
</tr> | ||
</table> | ||
</body> | ||
</html>""") | ||
|
||
dfs = self.read_html(data, displayed_only=displayed_only) | ||
tm.assert_frame_equal(dfs[0], exp0) | ||
|
||
if exp1 is not None: | ||
tm.assert_frame_equal(dfs[1], exp1) | ||
else: | ||
assert len(dfs) == 1 # Should not parse hidden table | ||
|
||
|
||
def test_invalid_flavor(): | ||
url = 'google.com' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't add `encoding` as part of this change, but it appears to have been undocumented ever since it was added. I put in a brief description for now, on the assumption that the documentation sprint this weekend may cover it in more detail.
I think the docstrings throughout this module technically violate the docstring standard by putting a blank line between each parameter, but I figured that was better left to the sprint than folded into this change.