From 234fe8b7484905bce3238803a32be4db2b68cd5c Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Wed, 7 Mar 2018 12:32:01 -0800
Subject: [PATCH 1/7] Added failing test

---
 pandas/tests/io/test_html.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 151a0750b7f6e..442d43198cfb1 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -674,6 +674,22 @@ def test_wikipedia_states_table(self):
         result = self.read_html(data, 'Arizona', header=1)[0]
         assert result['sq mi'].dtype == np.dtype('float64')
 
+    @pytest.mark.parametrize("displayed_only,exp", [
+        (True, DataFrame(["foo"])), (False, DataFrame(["foofoo"]))])
+    def test_displayed_only(self, displayed_only, exp):
+        data = StringIO("""<html>
+          <body>
+            <table>
+              <tr>
+                <td>foo<span style="display:none">foo</span></td>
+              </tr>
+            </table>
+          </body>
+        </html>""")
+        result = self.read_html(data, displayed_only=displayed_only)
+
+        tm.assert_frame_equal(result, exp)
+
     def test_decimal_rows(self):
 
         # GH 12907

From b2f24bb95f154e698a2c61c65a39898adb8b4bf5 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Wed, 7 Mar 2018 15:29:04 -0800
Subject: [PATCH 2/7] Added working keyword arg for displayed elems

---
 pandas/io/html.py            | 43 +++++++++++++++++++++++++++----
 pandas/tests/io/test_html.py | 50 ++++++++++++++++++++++++++++++++----
 2 files changed, 83 insertions(+), 10 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index be4854bc19cc6..fba76a1effd2a 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -160,6 +160,12 @@ class _HtmlFrameParser(object):
     attrs : dict
         List of HTML <table> element attributes to match.
 
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
     Attributes
     ----------
     io : str or file-like
@@ -187,11 +193,12 @@ class _HtmlFrameParser(object):
     functionality.
     """
 
-    def __init__(self, io, match, attrs, encoding):
+    def __init__(self, io, match, attrs, encoding, displayed_only):
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
+        self.displayed_only = displayed_only
 
     def parse_tables(self):
         tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
@@ -432,7 +439,18 @@ def _parse_tables(self, doc, match, attrs):
         result = []
         unique_tables = set()
 
+        if self.displayed_only:
+            # Remove any hidden tables
+            tables = [x for x in tables if not (
+                "display:none" in x.attrs.get('style', '').replace(' ', ''))]
+
         for table in tables:
+            if self.displayed_only:
+                import re
+                for elem in table.find_all(
+                        style=re.compile(r"display:\s*none")):
+                    elem.decompose()
+
             if (table not in unique_tables and
                     table.find(text=match) is not None):
                 result.append(table)
@@ -528,6 +546,15 @@ def _parse_tables(self, doc, match, kwargs):
 
         tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
 
+        if self.displayed_only:
+            # Remove any hidden tables
+            tables = [x for x in tables if not (
+                "display:none" in x.attrib.get('style', '').replace(' ', ''))]
+            for table in tables:
+                for elem in table.xpath(
+                        './/*[contains(@style, "display:none")]'):
+                    elem.getparent().remove(elem)
+
         if not tables:
             raise ValueError("No tables found matching regex {patt!r}"
                              .format(patt=pattern))
@@ -729,7 +756,7 @@ def _validate_flavor(flavor):
     return flavor
 
 
-def _parse(flavor, io, match, attrs, encoding, **kwargs):
+def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
     flavor = _validate_flavor(flavor)
     compiled_match = re.compile(match)  # you can pass a compiled regex here
 
@@ -737,7 +764,7 @@ def _parse(flavor, io, match, attrs, encoding, **kwargs):
     retained = None
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only)
 
         try:
             tables = p.parse_tables()
@@ -773,7 +800,7 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
               skiprows=None, attrs=None, parse_dates=False,
               tupleize_cols=None, thousands=',', encoding=None,
               decimal='.', converters=None, na_values=None,
-              keep_default_na=True):
+              keep_default_na=True, displayed_only=True):
     r"""Read HTML tables into a ``list`` of ``DataFrame`` objects.
 
     Parameters
@@ -877,6 +904,11 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
 
         .. versionadded:: 0.19.0
 
+    display_only : bool, default True
+        Whether elements with "display: none" should be parsed
+
+        .. versionadded:: 0.23.0
+
     Returns
     -------
     dfs : list of DataFrames
@@ -924,4 +956,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None,
                   parse_dates=parse_dates, tupleize_cols=tupleize_cols,
                   thousands=thousands, attrs=attrs, encoding=encoding,
                   decimal=decimal, converters=converters, na_values=na_values,
-                  keep_default_na=keep_default_na)
+                  keep_default_na=keep_default_na,
+                  displayed_only=displayed_only)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 442d43198cfb1..2c97361bf238c 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -674,9 +674,11 @@ def test_wikipedia_states_table(self):
         result = self.read_html(data, 'Arizona', header=1)[0]
         assert result['sq mi'].dtype == np.dtype('float64')
 
-    @pytest.mark.parametrize("displayed_only,exp", [
-        (True, DataFrame(["foo"])), (False, DataFrame(["foofoo"]))])
-    def test_displayed_only(self, displayed_only, exp):
+    @pytest.mark.parametrize("displayed_only,exp0,exp1", [
+        (True, DataFrame(["foo"]), None),
+        (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))])
+    def test_displayed_only(self, displayed_only, exp0, exp1):
+        # GH 20027
         data = StringIO("""<html>
           <body>
             <table>
@@ -684,11 +686,21 @@ def test_displayed_only(self, displayed_only, exp):
                 <td>foo<span style="display:none">foo</span></td>
               </tr>
             </table>
+            <table style="display: none">
+              <tr>
+                <td>foo<span>foo</span></td>
+              </tr>
+            </table>
           </body>
         </html>""")
-        result = self.read_html(data, displayed_only=displayed_only)
 
-        tm.assert_frame_equal(result, exp)
+        dfs = self.read_html(data, displayed_only=displayed_only)
+        tm.assert_frame_equal(dfs[0], exp0)
+
+        if exp1 is not None:
+            tm.assert_frame_equal(dfs[1], exp1)
+        else:
+            assert len(dfs) == 1  # Should not parse hidden table
 
     def test_decimal_rows(self):
 
@@ -912,6 +924,34 @@ def test_computer_sales_page(self):
         data = os.path.join(DATA_PATH, 'computer_sales_page.html')
         self.read_html(data, header=[0, 1])
 
+    @pytest.mark.parametrize("displayed_only,exp0,exp1", [
+        (True, DataFrame(["foo"]), None),
+        (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))])
+    def test_displayed_only(self, displayed_only, exp0, exp1):
+        # GH 20027
+        data = StringIO("""<html>
+          <body>
+            <table>
+              <tr>
+                <td>foo<span style="display:none">foo</span></td>
+              </tr>
+            </table>
+            <table style="display: none">
+              <tr>
+                <td>foo<span>foo</span></td>
+              </tr>
+            </table>
+          </body>
+        </html>""")
+
+        dfs = self.read_html(data, displayed_only=displayed_only)
+        tm.assert_frame_equal(dfs[0], exp0)
+
+        if exp1 is not None:
+            tm.assert_frame_equal(dfs[1], exp1)
+        else:
+            assert len(dfs) == 1  # Should not parse hidden table
+
 
 def test_invalid_flavor():
     url = 'google.com'

From 509c9e22a33a255b5e1f309a8a272ceabf6394fb Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Wed, 7 Mar 2018 15:33:39 -0800
Subject: [PATCH 3/7] Updated whatsnew

---
 doc/source/whatsnew/v0.23.0.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index feca90aae6237..51ed68b398a6c 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -341,6 +341,7 @@ Other Enhancements
 - :func:`DataFrame.replace` now supports the ``method`` parameter, which can be used to specify the replacement method when ``to_replace`` is a scalar, list or tuple and ``value`` is ``None`` (:issue:`19632`)
 - :meth:`Timestamp.month_name`, :meth:`DatetimeIndex.month_name`, and :meth:`Series.dt.month_name` are now available (:issue:`12805`)
 - :meth:`Timestamp.day_name` and :meth:`DatetimeIndex.day_name` are now available to return day names with a specified locale (:issue:`12806`)
+- :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
 
 .. _whatsnew_0230.api_breaking:
 

From 0c7b137c8cecb053d5387f7200ebc56ece91d46a Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 8 Mar 2018 08:50:39 -0800
Subject: [PATCH 4/7] Refactored to share hidden table removal method

---
 pandas/io/html.py | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index fba76a1effd2a..acffa5e35a0da 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -387,6 +387,27 @@ def _parse_raw_tbody(self, table):
             res = self._parse_tr(table)
         return self._parse_raw_data(res)
 
+    def _handle_hidden_tables(self, tbl_list, attr_name):
+        """Returns list of tables, potentially removing hidden elements
+
+        Parameters
+        ----------
+        tbl_list : list of Tag or list of Element
+            Type of list elements will vary depending upon parser used
+        attr_name : str
+            Name of the accessor for retrieving HTML attributes
+
+        Returns
+        -------
+        list of Tag or list of Element
+            Return type matches `tbl_list`
+        """
+        if not self.displayed_only:
+            return tbl_list
+
+        return [x for x in tbl_list if not ("display:none" in getattr(
+            x, attr_name).get('style', '').replace(' ', ''))]
+
 
 class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
     """HTML to DataFrame parser that uses BeautifulSoup under the hood.
@@ -438,15 +459,10 @@ def _parse_tables(self, doc, match, attrs):
 
         result = []
         unique_tables = set()
-
-        if self.displayed_only:
-            # Remove any hidden tables
-            tables = [x for x in tables if not (
-                "display:none" in x.attrs.get('style', '').replace(' ', ''))]
+        tables = self._handle_hidden_tables(tables, "attrs")
 
         for table in tables:
             if self.displayed_only:
-                import re
                 for elem in table.find_all(
                         style=re.compile(r"display:\s*none")):
                     elem.decompose()
@@ -546,10 +562,8 @@ def _parse_tables(self, doc, match, kwargs):
 
         tables = doc.xpath(xpath_expr, namespaces=_re_namespace)
 
+        tables = self._handle_hidden_tables(tables, "attrib")
         if self.displayed_only:
-            # Remove any hidden tables
-            tables = [x for x in tables if not (
-                "display:none" in x.attrib.get('style', '').replace(' ', ''))]
             for table in tables:
                 for elem in table.xpath(
                         './/*[contains(@style, "display:none")]'):

From b1d0f91e766faa5bd916579a7a8fa081b0c60bab Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 8 Mar 2018 09:13:06 -0800
Subject: [PATCH 5/7] Added more complicated scenarios; squashed bugs

---
 pandas/io/html.py            | 14 +++++++++-----
 pandas/tests/io/test_html.py | 22 ++++++++++++++++------
 2 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index acffa5e35a0da..2099c7cf4226a 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -405,8 +405,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name):
         if not self.displayed_only:
             return tbl_list
 
-        return [x for x in tbl_list if not ("display:none" in getattr(
-            x, attr_name).get('style', '').replace(' ', ''))]
+        return [x for x in tbl_list if not "display:none" in
+                getattr(x, attr_name).get('style', '').replace(" ", "")]
 
 
 class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
@@ -565,9 +565,13 @@ def _parse_tables(self, doc, match, kwargs):
         tables = self._handle_hidden_tables(tables, "attrib")
         if self.displayed_only:
             for table in tables:
-                for elem in table.xpath(
-                        './/*[contains(@style, "display:none")]'):
-                    elem.getparent().remove(elem)
+                # lxml utilizes XPATH 1.0 which does not have regex
+                # support. As a result, we find all elements with a style
+                # attribute and iterate them to check for display:none
+                for elem in table.xpath('.//*[@style]'):
+                    if "display:none" in elem.attrib.get(
+                            "style", "").replace(" ", ""):
+                        elem.getparent().remove(elem)
 
         if not tables:
             raise ValueError("No tables found matching regex {patt!r}"
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 2c97361bf238c..b18104e951504 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -676,19 +676,24 @@ def test_wikipedia_states_table(self):
 
     @pytest.mark.parametrize("displayed_only,exp0,exp1", [
         (True, DataFrame(["foo"]), None),
-        (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))])
+        (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"]))])
     def test_displayed_only(self, displayed_only, exp0, exp1):
         # GH 20027
         data = StringIO("""<html>
           <body>
             <table>
               <tr>
-                <td>foo<span style="display:none">foo</span></td>
+                <td>
+                  foo
+                  <span style="display:none;text-align:center">bar</span>
+                  <span style="display:none">baz</span>
+                  <span style="display: none">qux</span>
+                </td>
               </tr>
             </table>
             <table style="display: none">
               <tr>
-                <td>foo<span>foo</span></td>
+                <td>foo</td>
               </tr>
             </table>
           </body>
@@ -926,19 +931,24 @@ def test_computer_sales_page(self):
 
     @pytest.mark.parametrize("displayed_only,exp0,exp1", [
         (True, DataFrame(["foo"]), None),
-        (False, DataFrame(["foofoo"]), DataFrame(["foofoo"]))])
+        (False, DataFrame(["foo  bar  baz  qux"]), DataFrame(["foo"]))])
     def test_displayed_only(self, displayed_only, exp0, exp1):
         # GH 20027
         data = StringIO("""<html>
           <body>
             <table>
               <tr>
-                <td>foo<span style="display:none">foo</span></td>
+                <td>
+                  foo
+                  <span style="display:none;text-align:center">bar</span>
+                  <span style="display:none">baz</span>
+                  <span style="display: none">qux</span>
+                </td>
               </tr>
             </table>
             <table style="display: none">
               <tr>
-                <td>foo<span>foo</span></td>
+                <td>foo</td>
               </tr>
             </table>
           </body>

From 2960025fd2acea86518dbe0aa604d49d5e78ea74 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Thu, 8 Mar 2018 09:14:23 -0800
Subject: [PATCH 6/7] PEP8 fix

---
 pandas/io/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index 2099c7cf4226a..bf3cab05f89bd 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -405,7 +405,7 @@ def _handle_hidden_tables(self, tbl_list, attr_name):
         if not self.displayed_only:
             return tbl_list
 
-        return [x for x in tbl_list if not "display:none" in
+        return [x for x in tbl_list if "display:none" not in
                 getattr(x, attr_name).get('style', '').replace(" ", "")]
 
 

From 3093879826e9970eb87581889333b5e639637bc3 Mon Sep 17 00:00:00 2001
From: Will Ayd <william.ayd@icloud.com>
Date: Fri, 9 Mar 2018 08:30:32 -0800
Subject: [PATCH 7/7] Docstring updates

---
 pandas/io/html.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/io/html.py b/pandas/io/html.py
index bf3cab05f89bd..300a5a151f5d2 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -166,6 +166,8 @@ class _HtmlFrameParser(object):
     displayed_only : bool
         Whether or not items with "display:none" should be ignored
 
+        .. versionadded:: 0.23.0
+
     Attributes
     ----------
     io : str or file-like
@@ -178,6 +180,14 @@ class _HtmlFrameParser(object):
         A dictionary of valid table attributes to use to search for table
         elements.
 
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+        .. versionadded:: 0.23.0
+
     Notes
     -----
     To subclass this class effectively you must override the following methods: