diff --git a/src/Dom/Document.php b/src/Dom/Document.php
index 5bad55dddb0..f798bf5cb2d 100644
--- a/src/Dom/Document.php
+++ b/src/Dom/Document.php
@@ -340,6 +340,8 @@ public function loadHTML( $source, $options = 0 ) {
 		libxml_use_internal_errors( $libxml_previous_state );
 
 		if ( $success ) {
+			$this->normalize_html_attributes();
+
 			// Remove http-equiv charset again.
 			$meta = $this->head->firstChild;
 			if (
@@ -1021,6 +1023,41 @@ private function restore_doctype_node( $html ) {
 		return preg_replace( self::HTML_RESTORE_DOCTYPE_PATTERN, '\1!\3\4>', $html, 1 );
 	}
 
+	/**
+	 * Normalizes the attributes of the root html element to be HTML5 compatible.
+	 *
+	 * Removes html[xmlns] when its value is the XHTML namespace, and replaces
+	 * html[xml:lang] with html[lang]. An existing non-empty html[lang] is kept;
+	 * html[xml:lang] is always removed.
+	 */
+	private function normalize_html_attributes() {
+		$html = $this->documentElement;
+
+		// Bail if there is no root element or it has nothing to normalize.
+		if ( ! $html instanceof \DOMElement || ! $html->hasAttributes() ) {
+			return;
+		}
+
+		// getNamedItem() is typed as a generic DOMNode, whereas removeAttributeNode() requires a DOMAttr.
+		$xmlns = $html->attributes->getNamedItem( 'xmlns' );
+		if ( $xmlns instanceof \DOMAttr && 'http://www.w3.org/1999/xhtml' === $xmlns->nodeValue ) {
+			$html->removeAttributeNode( $xmlns );
+		}
+
+		$xml_lang = $html->attributes->getNamedItem( 'xml:lang' );
+		if ( ! $xml_lang instanceof \DOMAttr ) {
+			return;
+		}
+
+		$lang_node = $html->attributes->getNamedItem( 'lang' );
+		if ( ( ! $lang_node || ! $lang_node->nodeValue ) && $xml_lang->nodeValue ) {
+			// Move the html[xml:lang] value to html[lang], since html[lang] is missing or empty.
+			$html->setAttribute( 'lang', $xml_lang->nodeValue );
+		}
+
+		$html->removeAttributeNode( $xml_lang );
+	}
+
 	/**
 	 * Deduplicate a given tag.
 	 *
diff --git a/tests/php/test-class-amp-dom-document.php b/tests/php/test-class-amp-dom-document.php
index 199dce78f84..a067df34f31 100644
--- a/tests/php/test-class-amp-dom-document.php
+++ b/tests/php/test-class-amp-dom-document.php
@@ -88,6 +88,31 @@ public function data_dom_document() {
 				'' . $head . '

Text

', '' . $head . '

Text

', ], + 'html_with_xmlns_and_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_xmlns_value_that_should_remain' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_lang_and_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_empty_xml_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], + 'html_with_empty_lang' => [ + 'utf-8', + '' . $head . '', + '' . $head . '', + ], 'slashes_on_closing_tags' => [ 'utf-8', '

Text

',