From d6faeb4063e1edcff3372a76298c06237fa6b864 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 16:21:51 -0600 Subject: [PATCH 1/6] Conditionally remove html[xmlns], and convert html[xml:lang] to html[lang] As Weston mentioned, neither of these is valid HTML5 --- src/Dom/Document.php | 29 +++++++++++++++++++++++ tests/php/test-class-amp-dom-document.php | 20 ++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 5bad55dddb0..2e4c23d4d5a 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -340,6 +340,8 @@ public function loadHTML( $source, $options = 0 ) { libxml_use_internal_errors( $libxml_previous_state ); if ( $success ) { + $this->normalize_html_attributes(); + // Remove http-equiv charset again. $meta = $this->head->firstChild; if ( @@ -1021,6 +1023,33 @@ private function restore_doctype_node( $html ) { return preg_replace( self::HTML_RESTORE_DOCTYPE_PATTERN, '\1!\3\4>', $html, 1 ); } + /** + * Normalizes HTML attributes to be HTML5 compatible. + * + * Conditionally removes html[xmlns], and converts html[xml:lang] to html[lang]. + */ + private function normalize_html_attributes() { + $html = $this->getElementsByTagName( 'html' )->item( 0 ); + if ( ! $html->attributes ) { + return; + } + + $xmlns = $html->attributes->getNamedItem( 'xmlns' ); + $xmlns_value_to_strip = 'http://www.w3.org/1999/xhtml'; + if ( $xmlns && $xmlns_value_to_strip === $xmlns->nodeValue ) { + $html->removeAttributeNode( $xmlns ); + } + + $xml_lang = $html->attributes->getNamedItem( 'xml:lang' ); + if ( $xml_lang ) { + if ( ! $html->attributes->getNamedItem( 'lang' ) && $xml_lang->nodeValue ) { + // Move the html[xml:lang] value to html[lang]. + $html->setAttribute( 'lang', $xml_lang->nodeValue ); + } + $html->removeAttributeNode( $xml_lang ); + } + } + /** * Deduplicate a given tag. 
* diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index 199dce78f84..8356b9d6901 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -88,6 +88,26 @@ public function data_dom_document() { '' . $head . '

Text

', '' . $head . '

Text

', ], + 'html_with_xmlns_and_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_xmlns_value_that_should_remain' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_lang_and_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_empty_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], 'slashes_on_closing_tags' => [ 'utf-8', '

Text

', From 93d96f4a9e92e021597ecae524bf9d476f84f552 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 16:40:00 -0600 Subject: [PATCH 2/6] If the html[lang] is "", overwrite it with the xml:lang Before, if the xml:lang was non-empty and the lang was "", this would not overwrite it. --- src/Dom/Document.php | 3 ++- tests/php/test-class-amp-dom-document.php | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 2e4c23d4d5a..f6c4f0f6665 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -1042,7 +1042,8 @@ private function normalize_html_attributes() { $xml_lang = $html->attributes->getNamedItem( 'xml:lang' ); if ( $xml_lang ) { - if ( ! $html->attributes->getNamedItem( 'lang' ) && $xml_lang->nodeValue ) { + $lang_node = $html->attributes->getNamedItem( 'lang' ); + if ( ( ! $lang_node || ! $lang_node->nodeValue ) && $xml_lang->nodeValue ) { // Move the html[xml:lang] value to html[lang]. $html->setAttribute( 'lang', $xml_lang->nodeValue ); } diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php index 8356b9d6901..a067df34f31 100644 --- a/tests/php/test-class-amp-dom-document.php +++ b/tests/php/test-class-amp-dom-document.php @@ -108,6 +108,11 @@ public function data_dom_document() { '' . $head . '', '' . $head . '', ], + 'html_with_empty_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], 'slashes_on_closing_tags' => [ 'utf-8', '

Text

', From ff6bdbaab4feebfbe709ef1ea1dec8e963e3986a Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 16:54:30 -0600 Subject: [PATCH 3/6] Use $html->hasAttributes to possibly exit early. --- src/Dom/Document.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index f6c4f0f6665..58c41d0c9f9 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -1030,7 +1030,7 @@ private function restore_doctype_node( $html ) { */ private function normalize_html_attributes() { $html = $this->getElementsByTagName( 'html' )->item( 0 ); - if ( ! $html->attributes ) { + if ( ! $html->hasAttributes() ) { return; } From 7aaf8fd016967e130c9564bdeed9f0c9aee944f7 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 17:31:07 -0600 Subject: [PATCH 4/6] Commit Weston's suggestion to use documentElement Co-Authored-By: Weston Ruter --- src/Dom/Document.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 58c41d0c9f9..49e5e0b1f92 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -1029,7 +1029,7 @@ private function restore_doctype_node( $html ) { * Conditionally removes html[xmlns], and converts html[xml:lang] to html[lang]. */ private function normalize_html_attributes() { - $html = $this->getElementsByTagName( 'html' )->item( 0 ); + $html = $this->documentElement; if ( ! 
$html->hasAttributes() ) { return; } From aa495049a7be497dbc17c22823e0fbdea9c4f37b Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 17:40:06 -0600 Subject: [PATCH 5/6] Fix PHPCS error --- src/Dom/Document.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 49e5e0b1f92..93daed43c69 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -1030,7 +1030,7 @@ private function restore_doctype_node( $html ) { */ private function normalize_html_attributes() { $html = $this->documentElement; - if ( ! $html->hasAttributes() ) { + if ( ! $html->hasAttributes() ) { return; } From 096cf7b880794ac352e9b0235a1eeb46a802f144 Mon Sep 17 00:00:00 2001 From: Ryan Kienstra Date: Wed, 29 Jan 2020 17:52:58 -0600 Subject: [PATCH 6/6] Remove needless $xmlns_value_to_strip variable As Weston mentioned, this is only used in one place and shouldn't be needed. --- src/Dom/Document.php | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 93daed43c69..f798bf5cb2d 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -1034,9 +1034,8 @@ private function normalize_html_attributes() { return; } - $xmlns = $html->attributes->getNamedItem( 'xmlns' ); - $xmlns_value_to_strip = 'http://www.w3.org/1999/xhtml'; - if ( $xmlns && $xmlns_value_to_strip === $xmlns->nodeValue ) { + $xmlns = $html->attributes->getNamedItem( 'xmlns' ); + if ( $xmlns && 'http://www.w3.org/1999/xhtml' === $xmlns->nodeValue ) { $html->removeAttributeNode( $xmlns ); }