From 234fe8b7484905bce3238803a32be4db2b68cd5c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Mar 2018 12:32:01 -0800 Subject: [PATCH 1/7] Added failing test --- pandas/tests/io/test_html.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 151a0750b7f6e..442d43198cfb1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -674,6 +674,22 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') + @pytest.mark.parametrize("displayed_only,exp", [ + (True, DataFrame(["foo"])), (False, DataFrame(["foofoo"]))]) + def test_displayed_only(self, displayed_only, exp): + data = StringIO(""" + + + + + +
foofoo
+ + """) + result = self.read_html(data, displayed_only=displayed_only) + + tm.assert_frame_equal(result, exp) + def test_decimal_rows(self): # GH 12907 From b2f24bb95f154e698a2c61c65a39898adb8b4bf5 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Mar 2018 15:29:04 -0800 Subject: [PATCH 2/7] Added working keyword arg for displayed elems --- pandas/io/html.py | 43 +++++++++++++++++++++++++++---- pandas/tests/io/test_html.py | 50 ++++++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 10 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index be4854bc19cc6..fba76a1effd2a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -160,6 +160,12 @@ class _HtmlFrameParser(object): attrs : dict List of HTML element attributes to match. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + Attributes ---------- io : str or file-like @@ -187,11 +193,12 @@ class _HtmlFrameParser(object): functionality. """ - def __init__(self, io, match, attrs, encoding): + def __init__(self, io, match, attrs, encoding, displayed_only): self.io = io self.match = match self.attrs = attrs self.encoding = encoding + self.displayed_only = displayed_only def parse_tables(self): tables = self._parse_tables(self._build_doc(), self.match, self.attrs) @@ -432,7 +439,18 @@ def _parse_tables(self, doc, match, attrs): result = [] unique_tables = set() + if self.displayed_only: + # Remove any hidden tables + tables = [x for x in tables if not ( + "display:none" in x.attrs.get('style', '').replace(' ', ''))] + for table in tables: + if self.displayed_only: + import re + for elem in table.find_all( + style=re.compile(r"display:\s*none")): + elem.decompose() + if (table not in unique_tables and table.find(text=match) is not None): result.append(table) @@ -528,6 +546,15 @@ def _parse_tables(self, doc, match, kwargs): tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + if self.displayed_only: + # Remove any hidden tables + tables = [x for x in tables if not ( + "display:none" in x.attrib.get('style', '').replace(' ', ''))] + for table in tables: + for elem in table.xpath( + './/*[contains(@style, "display:none")]'): + elem.getparent().remove(elem) + if not tables: raise ValueError("No tables found matching regex {patt!r}" .format(patt=pattern)) @@ -729,7 +756,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, attrs, encoding, **kwargs): +def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -737,7 +764,7 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs): retained = None for flav in flavor: parser = _parser_dispatch(flav) - p = parser(io, compiled_match, attrs, encoding) + p = parser(io, compiled_match, attrs, encoding, displayed_only) try: tables = p.parse_tables() @@ -773,7 +800,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=None, thousands=',', encoding=None, decimal='.', converters=None, na_values=None, - keep_default_na=True): + keep_default_na=True, displayed_only=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -877,6 +904,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, .. versionadded:: 0.19.0 + display_only : bool, default True + Whether elements with "display: none" should be parsed + + .. versionadded:: 0.23.0 + Returns ------- dfs : list of DataFrames @@ -924,4 +956,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands, attrs=attrs, encoding=encoding, decimal=decimal, converters=converters, na_values=na_values, - keep_default_na=keep_default_na) + keep_default_na=keep_default_na, + displayed_only=displayed_only) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 442d43198cfb1..2c97361bf238c 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -674,9 +674,11 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] assert result['sq mi'].dtype == np.dtype('float64') - @pytest.mark.parametrize("displayed_only,exp", [ - (True, DataFrame(["foo"])), (False, DataFrame(["foofoo"]))]) - def test_displayed_only(self, displayed_only, exp): + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 data = StringIO("""
@@ -684,11 +686,21 @@ def test_displayed_only(self, displayed_only, exp):
foofoo
+ + + + +
foofoo
""") - result = self.read_html(data, displayed_only=displayed_only) - tm.assert_frame_equal(result, exp) + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table def test_decimal_rows(self): @@ -912,6 +924,34 @@ def test_computer_sales_page(self): data = os.path.join(DATA_PATH, 'computer_sales_page.html') self.read_html(data, header=[0, 1]) + @pytest.mark.parametrize("displayed_only,exp0,exp1", [ + (True, DataFrame(["foo"]), None), + (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) + def test_displayed_only(self, displayed_only, exp0, exp1): + # GH 20027 + data = StringIO(""" + + + + + +
foofoo
+ + + + +
foofoo
+ + """) + + dfs = self.read_html(data, displayed_only=displayed_only) + tm.assert_frame_equal(dfs[0], exp0) + + if exp1 is not None: + tm.assert_frame_equal(dfs[1], exp1) + else: + assert len(dfs) == 1 # Should not parse hidden table + def test_invalid_flavor(): url = 'google.com' From 509c9e22a33a255b5e1f309a8a272ceabf6394fb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 7 Mar 2018 15:33:39 -0800 Subject: [PATCH 3/7] Updated whatsnew --- doc/source/whatsnew/v0.23.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index feca90aae6237..51ed68b398a6c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -341,6 +341,7 @@ Other Enhancements - :func:`DataFrame.replace` now supports the ``method`` parameter, which can be used to specify the replacement method when ``to_replace`` is a scalar, list or tuple and ``value`` is ``None`` (:issue:`19632`) - :meth:`Timestamp.month_name`, :meth:`DatetimeIndex.month_name`, and :meth:`Series.dt.month_name` are now available (:issue:`12805`) - :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`) +- :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) .. _whatsnew_0230.api_breaking: From 0c7b137c8cecb053d5387f7200ebc56ece91d46a Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Mar 2018 08:50:39 -0800 Subject: [PATCH 4/7] Refactored to share hidden table removal method --- pandas/io/html.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index fba76a1effd2a..acffa5e35a0da 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -387,6 +387,27 @@ def _parse_raw_tbody(self, table): res = self._parse_tr(table) return self._parse_raw_data(res) + def _handle_hidden_tables(self, tbl_list, attr_name): + """Returns list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of Tag or list of Element + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of Tag or list of Element + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [x for x in tbl_list if not ("display:none" in getattr( + x, attr_name).get('style', '').replace(' ', ''))] + class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): """HTML to DataFrame parser that uses BeautifulSoup under the hood. @@ -438,15 +459,10 @@ def _parse_tables(self, doc, match, attrs): result = [] unique_tables = set() - - if self.displayed_only: - # Remove any hidden tables - tables = [x for x in tables if not ( - "display:none" in x.attrs.get('style', '').replace(' ', ''))] + tables = self._handle_hidden_tables(tables, "attrs") for table in tables: if self.displayed_only: - import re for elem in table.find_all( style=re.compile(r"display:\s*none")): elem.decompose() @@ -546,10 +562,8 @@ def _parse_tables(self, doc, match, kwargs): tables = doc.xpath(xpath_expr, namespaces=_re_namespace) + tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: - # Remove any hidden tables - tables = [x for x in tables if not ( - "display:none" in x.attrib.get('style', '').replace(' ', ''))] for table in tables: for elem in table.xpath( './/*[contains(@style, "display:none")]'): From b1d0f91e766faa5bd916579a7a8fa081b0c60bab Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Mar 2018 09:13:06 -0800 Subject: [PATCH 5/7] Added more complicated scenarios; squashed bugs --- pandas/io/html.py | 14 +++++++++----- pandas/tests/io/test_html.py | 22 ++++++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index acffa5e35a0da..2099c7cf4226a 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -405,8 +405,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name): if not self.displayed_only: return tbl_list - return [x for x in tbl_list if not ("display:none" in getattr( - x, attr_name).get('style', '').replace(' ', ''))] + return [x for x in tbl_list if not "display:none" in + getattr(x, attr_name).get('style', '').replace(" ", "")] class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): @@ -565,9 +565,13 @@ def _parse_tables(self, doc, match, kwargs): tables = self._handle_hidden_tables(tables, "attrib") if self.displayed_only: for table in tables: - for elem in table.xpath( - './/*[contains(@style, "display:none")]'): - elem.getparent().remove(elem) + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath('.//*[@style]'): + if "display:none" in elem.attrib.get( + "style", "").replace(" ", ""): + elem.getparent().remove(elem) if not tables: raise ValueError("No tables found matching regex {patt!r}" diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 2c97361bf238c..b18104e951504 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -676,19 +676,24 @@ def test_wikipedia_states_table(self): @pytest.mark.parametrize("displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), - (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 data = StringIO(""" - +
foofoo + foo + bar + baz + qux +
- +
foofoofoo
@@ -926,19 +931,24 @@ def test_computer_sales_page(self): @pytest.mark.parametrize("displayed_only,exp0,exp1", [ (True, DataFrame(["foo"]), None), - (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))]) + (False, DataFrame(["foo bar baz qux"]), DataFrame(["foo"]))]) def test_displayed_only(self, displayed_only, exp0, exp1): # GH 20027 data = StringIO(""" - +
foofoo + foo + bar + baz + qux +
- +
foofoofoo
From 2960025fd2acea86518dbe0aa604d49d5e78ea74 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 8 Mar 2018 09:14:23 -0800 Subject: [PATCH 6/7] PEP8 fix --- pandas/io/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index 2099c7cf4226a..bf3cab05f89bd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -405,7 +405,7 @@ def _handle_hidden_tables(self, tbl_list, attr_name): if not self.displayed_only: return tbl_list - return [x for x in tbl_list if not "display:none" in + return [x for x in tbl_list if "display:none" not in getattr(x, attr_name).get('style', '').replace(" ", "")] From 3093879826e9970eb87581889333b5e639637bc3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 9 Mar 2018 08:30:32 -0800 Subject: [PATCH 7/7] Docstring updates --- pandas/io/html.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/io/html.py b/pandas/io/html.py index bf3cab05f89bd..300a5a151f5d2 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -166,6 +166,8 @@ class _HtmlFrameParser(object): displayed_only : bool Whether or not items with "display:none" should be ignored + .. versionadded:: 0.23.0 + Attributes ---------- io : str or file-like @@ -178,6 +180,14 @@ class _HtmlFrameParser(object): A dictionary of valid table attributes to use to search for table elements. + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + .. versionadded:: 0.23.0 + Notes ----- To subclass this class effectively you must override the following methods: