diff --git a/src/Dom/Document.php b/src/Dom/Document.php index b43653d41..3ded1d7e0 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -163,12 +163,22 @@ final class Document extends DOMDocument */ const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::'; + /** + * Charset compatibility tag for making DOMDocument behave. + * + * See: http://php.net/manual/en/domdocument.loadhtml.php#78243. + * + * @var string + */ + const HTTP_EQUIV_META_TAG = ''; + // Regex patterns and values used for adding and removing http-equiv charsets for compatibility. // The opening tag pattern contains a comment to make sure we don't match a tag within a comment. + const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; - const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0'; + const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG; + const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; + const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG . ''; const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '##i'; @@ -507,14 +517,7 @@ public function loadHTMLFragment($source, $options = []) $source = $this->adaptEncoding($source); } - // Force-add http-equiv charset to make DOMDocument behave as it should. - // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. - $source = preg_replace( - self::HTML_GET_HEAD_OPENING_TAG_PATTERN, - self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT, - $source, - 1 - ); + $source = $this->addHttpEquivCharset($source); $libxml_previous_state = libxml_use_internal_errors(true); @@ -2044,4 +2047,46 @@ public function enforceCssMaxByteCount($maxByteCount = AMP::MAX_CSS_BYTE_COUNT) { $this->cssMaxByteCountEnforced = $maxByteCount; } + + /** + * Add a http-equiv charset meta tag to the document's node. + * + * This is needed to make the DOMDocument behave as it should in terms of encoding. + * See: http://php.net/manual/en/domdocument.loadhtml.php#78243. + * + * @param string $html HTML string to add the http-equiv charset to. + * @return string Adapted string of HTML. + */ + private function addHttpEquivCharset($html) + { + $count = 0; + + // We try first to detect an existing node. + $html = preg_replace( + self::HTML_GET_HEAD_OPENING_TAG_PATTERN, + self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + + + // If no was found, we look for the tag instead. + if ($count < 1) { + $html = preg_replace( + self::HTML_GET_HTML_OPENING_TAG_PATTERN, + self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + } + + // Finally, we just prepend the head with the required http-equiv charset. + if ($count < 1) { + $html = '' . self::HTTP_EQUIV_META_TAG . '' . $html; + } + + return $html; + } } diff --git a/tests/Dom/DocumentTest.php b/tests/Dom/DocumentTest.php index 67c31606b..1eecc8e5c 100644 --- a/tests/Dom/DocumentTest.php +++ b/tests/Dom/DocumentTest.php @@ -1190,4 +1190,64 @@ public function testLibxmlOptionBC() $documentFragment->loadHTMLFragment('
', '524288'); $this->assertEquals($expectedOptions, $documentFragment->getOptions()); } + + /** + * Data for document fragment tests. + * + * @return array Data. + */ + public function dataDocumentFragment() + { + $target = '
'; + + foreach ([true, false] as $body) { + foreach ([true, false] as $head) { + foreach ([true, false] as $html) { + foreach ([true, false] as $doctype) { + $source = $body ? '' . $target . '' : $target; + $case = $body ? 'with_body' : 'without_body'; + + $source = $head ? '' . $source : $source; + $case = $head ? 'with_head_' . $case : 'without_head_' . $case; + + $source = $html ? '' . $source . '' : $source; + $case = $html ? 'with_html_' . $case : 'without_html_' . $case; + + $source = $doctype ? '' . $source : $source; + $case = $doctype ? 'with_doctype_' . $case : 'without_doctype_' . $case; + + $cases["fragment_encoding_{$case}"] = ['utf-8', $source, $target]; + } + } + } + } + + return $cases; + } + + /** + * Tests loading and saving a document fragment. + * + * @param string $charset Charset to use. + * @param string $source Source content. + * @param string $expected Expected target content. + * @param callable|null $fragmentCallback Optional. Callback to use for fetching the fragment node to compare. + * Defaults to retrieving the first child node of the body tag. + * + * @dataProvider dataDocumentFragment + * @covers \AmpProject\Dom\Document::loadHTML() + * @covers \AmpProject\Dom\Document::saveHTML() + */ + public function testDocumentFragment($charset, $source, $expected, $fragmentCallback = null) + { + if ($fragmentCallback === null) { + $fragmentCallback = static function (Document $document) { + return $document->body->firstChild; + }; + } + + $document = Document::fromHtmlFragment($source, $charset); + + $this->assertEqualMarkup($expected, $document->saveHTMLFragment($fragmentCallback($document))); + } } diff --git a/tests/Optimizer/Transformer/ServerSideRenderingTest.php b/tests/Optimizer/Transformer/ServerSideRenderingTest.php index 6b75b1247..cc9fd6d9c 100644 --- a/tests/Optimizer/Transformer/ServerSideRenderingTest.php +++ b/tests/Optimizer/Transformer/ServerSideRenderingTest.php @@ -6,6 +6,7 @@ use AmpProject\Optimizer\Error; use AmpProject\Optimizer\ErrorCollection; use AmpProject\Optimizer\Exception\InvalidHtmlAttribute; +use AmpProject\Tag; use AmpProject\Tests\ErrorComparison; use AmpProject\Tests\MarkupComparison; use AmpProject\Tests\TestCase; @@ -152,8 +153,8 @@ public function dataTransform() [ Error\CannotRemoveBoilerplate::fromRenderDelayingScript( Document::fromHtmlFragment( - TestMarkup::SCRIPT_AMPSTORY - )->head->firstChild + '' . TestMarkup::SCRIPT_AMPSTORY . '' + )->head->lastChild ), ], ], @@ -164,8 +165,8 @@ public function dataTransform() [ Error\CannotRemoveBoilerplate::fromRenderDelayingScript( Document::fromHtmlFragment( - TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES - )->head->firstChild + '' . TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES . '' + )->head->lastChild ), ], ],