From 364e7ddde32104d67de434df08a5268597a44b7b Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 16 Mar 2021 13:03:35 +0000 Subject: [PATCH 1/4] Add test to trigger bug --- tests/Dom/DocumentTest.php | 117 +++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/tests/Dom/DocumentTest.php b/tests/Dom/DocumentTest.php index 67c31606b..fd35e891e 100644 --- a/tests/Dom/DocumentTest.php +++ b/tests/Dom/DocumentTest.php @@ -1190,4 +1190,121 @@ public function testLibxmlOptionBC() $documentFragment->loadHTMLFragment('
', '524288'); $this->assertEquals($expectedOptions, $documentFragment->getOptions()); } + + /** + * Data for document fragment tests. + * + * @return array Data. + */ + public function dataDocumentFragment() + { + return [ + 'encoding_without_doctype_without_html_without_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_without_html_without_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_without_html_with_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_without_html_with_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_with_html_without_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_with_html_without_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_with_html_with_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_without_doctype_with_html_with_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_without_html_without_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_without_html_without_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_without_html_with_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_without_html_with_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_with_html_without_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_with_html_without_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_with_html_with_head_without_body' => [ + 'utf-8', + '
', + '
', + ], + 'encoding_with_doctype_with_html_with_head_with_body' => [ + 'utf-8', + '
', + '
', + ], + ]; + } + + /** + * Tests loading and saving a document fragment. + * + * @param string $charset Charset to use. + * @param string $source Source content. + * @param string $expected Expected target content. + * @param callable|null $fragmentCallback Optional. Callback to use for fetching the fragment node to compare. + * Defaults to retrieving the first child node of the body tag. + * + * @dataProvider dataDocumentFragment + * @covers \AmpProject\Dom\Document::loadHTML() + * @covers \AmpProject\Dom\Document::saveHTML() + */ + public function testDocumentFragment($charset, $source, $expected, $fragmentCallback = null) + { + if ($fragmentCallback === null) { + $fragmentCallback = static function (Document $document) { + return $document->body->firstChild; + }; + } + + $document = Document::fromHtmlFragment($source, $charset); + + $this->assertEqualMarkup($expected, $document->saveHTMLFragment($fragmentCallback($document))); + } } From 0f2f06d64e5b9764f71c454e2001ea69d1884580 Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 16 Mar 2021 13:45:13 +0000 Subject: [PATCH 2/4] Fix test in SSR --- src/Dom/Document.php | 90 ++++++++++++++++--- .../Transformer/ServerSideRenderingTest.php | 9 +- 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index b43653d41..0aa560dfb 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -163,12 +163,24 @@ final class Document extends DOMDocument */ const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::'; + /** + * Charset compatibility tag for making DOMDocument behave. + * + * See: http://php.net/manual/en/domdocument.loadhtml.php#78243. + * + * @var string + */ + const HTTP_EQUIV_META_TAG = ''; + // Regex patterns and values used for adding and removing http-equiv charsets for compatibility. // The opening tag pattern contains a comment to make sure we don't match a tag within a comment. + const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; - const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0'; + const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG; + const HTML_GET_BODY_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; + const HTML_GET_BODY_OPENING_TAG_REPLACEMENT = '' . self::HTTP_EQUIV_META_TAG . '$0'; + const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; + const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG . ''; const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '##i'; @@ -507,14 +519,7 @@ public function loadHTMLFragment($source, $options = []) $source = $this->adaptEncoding($source); } - // Force-add http-equiv charset to make DOMDocument behave as it should. - // See: http://php.net/manual/en/domdocument.loadhtml.php#78243. - $source = preg_replace( - self::HTML_GET_HEAD_OPENING_TAG_PATTERN, - self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT, - $source, - 1 - ); + $source = $this->addHttpEquivCharset($source); $libxml_previous_state = libxml_use_internal_errors(true); @@ -2044,4 +2049,67 @@ public function enforceCssMaxByteCount($maxByteCount = AMP::MAX_CSS_BYTE_COUNT) { $this->cssMaxByteCountEnforced = $maxByteCount; } + + /** + * Add a http-equiv charset meta tag to the document's node. + * + * This is needed to make the DOMDocument behave as it should in terms of encoding. + * See: http://php.net/manual/en/domdocument.loadhtml.php#78243. + * + * @param string $html HTML string to add the http-equiv charset to. + * @return string Adapted string of HTML. + */ + private function addHttpEquivCharset($html) + { + $count = 0; + + // We try first to detect an existing node. + $html = preg_replace( + self::HTML_GET_HEAD_OPENING_TAG_PATTERN, + self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + + // In case no node was found, we try to prepend it together with the http-equiv to the tag. + if ($count < 1) { + $html = preg_replace( + self::HTML_GET_BODY_OPENING_TAG_PATTERN, + self::HTML_GET_BODY_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + } + + // If no was found either, we look for the tag instead. + if ($count < 1) { + $html = preg_replace( + self::HTML_GET_HTML_OPENING_TAG_PATTERN, + self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + } + + // If no was found either, we look for the tag instead. + if ($count < 1) { + $html = preg_replace( + self::HTML_GET_HTML_OPENING_TAG_PATTERN, + self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT, + $html, + 1, + $count + ); + } + + // Finally, we just prepend the head with the required http-equiv charset. + if ($count < 1) { + $html = '' . self::HTTP_EQUIV_META_TAG . '' . $html; + } + + return $html; + } } diff --git a/tests/Optimizer/Transformer/ServerSideRenderingTest.php b/tests/Optimizer/Transformer/ServerSideRenderingTest.php index 6b75b1247..cc9fd6d9c 100644 --- a/tests/Optimizer/Transformer/ServerSideRenderingTest.php +++ b/tests/Optimizer/Transformer/ServerSideRenderingTest.php @@ -6,6 +6,7 @@ use AmpProject\Optimizer\Error; use AmpProject\Optimizer\ErrorCollection; use AmpProject\Optimizer\Exception\InvalidHtmlAttribute; +use AmpProject\Tag; use AmpProject\Tests\ErrorComparison; use AmpProject\Tests\MarkupComparison; use AmpProject\Tests\TestCase; @@ -152,8 +153,8 @@ public function dataTransform() [ Error\CannotRemoveBoilerplate::fromRenderDelayingScript( Document::fromHtmlFragment( - TestMarkup::SCRIPT_AMPSTORY - )->head->firstChild + '' . TestMarkup::SCRIPT_AMPSTORY . '' + )->head->lastChild ), ], ], @@ -164,8 +165,8 @@ public function dataTransform() [ Error\CannotRemoveBoilerplate::fromRenderDelayingScript( Document::fromHtmlFragment( - TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES - )->head->firstChild + '' . TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES . '' + )->head->lastChild ), ], ], From 68969adb6af1d3851d05003fe9260778e25d73de Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Tue, 16 Mar 2021 14:46:24 +0000 Subject: [PATCH 3/4] Compute fragment encoding test cases --- tests/Dom/DocumentTest.php | 107 +++++++++---------------------------- 1 file changed, 25 insertions(+), 82 deletions(-) diff --git a/tests/Dom/DocumentTest.php b/tests/Dom/DocumentTest.php index fd35e891e..1eecc8e5c 100644 --- a/tests/Dom/DocumentTest.php +++ b/tests/Dom/DocumentTest.php @@ -1198,88 +1198,31 @@ public function testLibxmlOptionBC() */ public function dataDocumentFragment() { - return [ - 'encoding_without_doctype_without_html_without_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_without_html_without_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_without_html_with_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_without_html_with_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_with_html_without_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_with_html_without_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_with_html_with_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_without_doctype_with_html_with_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_without_html_without_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_without_html_without_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_without_html_with_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_without_html_with_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_with_html_without_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_with_html_without_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_with_html_with_head_without_body' => [ - 'utf-8', - '
', - '
', - ], - 'encoding_with_doctype_with_html_with_head_with_body' => [ - 'utf-8', - '
', - '
', - ], - ]; + $target = '
'; + + foreach ([true, false] as $body) { + foreach ([true, false] as $head) { + foreach ([true, false] as $html) { + foreach ([true, false] as $doctype) { + $source = $body ? '' . $target . '' : $target; + $case = $body ? 'with_body' : 'without_body'; + + $source = $head ? '' . $source : $source; + $case = $head ? 'with_head_' . $case : 'without_head_' . $case; + + $source = $html ? '' . $source . '' : $source; + $case = $html ? 'with_html_' . $case : 'without_html_' . $case; + + $source = $doctype ? '' . $source : $source; + $case = $doctype ? 'with_doctype_' . $case : 'without_doctype_' . $case; + + $cases["fragment_encoding_{$case}"] = ['utf-8', $source, $target]; + } + } + } + } + + return $cases; } /** From 4758e217229fb333a029cdc238c07cf2d459466c Mon Sep 17 00:00:00 2001 From: Alain Schlesser Date: Wed, 17 Mar 2021 11:00:19 +0000 Subject: [PATCH 4/4] Remove unneeded compat code --- src/Dom/Document.php | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/src/Dom/Document.php b/src/Dom/Document.php index 0aa560dfb..3ded1d7e0 100644 --- a/src/Dom/Document.php +++ b/src/Dom/Document.php @@ -177,8 +177,6 @@ final class Document extends DOMDocument const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG; - const HTML_GET_BODY_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; - const HTML_GET_BODY_OPENING_TAG_REPLACEMENT = '' . self::HTTP_EQUIV_META_TAG . '$0'; const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?>\s*)*\s+[^>]*)?>/is'; const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG . ''; const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '# node was found, we try to prepend it together with the http-equiv to the tag. - if ($count < 1) { - $html = preg_replace( - self::HTML_GET_BODY_OPENING_TAG_PATTERN, - self::HTML_GET_BODY_OPENING_TAG_REPLACEMENT, - $html, - 1, - $count - ); - } - - // If no was found either, we look for the tag instead. - if ($count < 1) { - $html = preg_replace( - self::HTML_GET_HTML_OPENING_TAG_PATTERN, - self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT, - $html, - 1, - $count - ); - } - // If no was found either, we look for the tag instead. + // If no was found, we look for the tag instead. if ($count < 1) { $html = preg_replace( self::HTML_GET_HTML_OPENING_TAG_PATTERN,