Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix broken encoding when using document fragments #99

Merged
merged 4 commits into from
Mar 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 56 additions & 11 deletions src/Dom/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,22 @@ final class Document extends DOMDocument
*/
const PROPERTY_GETTER_ERROR_MESSAGE = 'Undefined property: AmpProject\\Dom\\Document::';

/**
* Charset compatibility tag for making DOMDocument behave.
*
* See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
*
* @var string
*/
const HTTP_EQUIV_META_TAG = '<meta http-equiv="content-type" content="text/html; charset=utf-8">';

// Regex patterns and replacement values used for adding and removing the http-equiv charset meta tag.
// The opening tag patterns start with a group that matches HTML comments, which is intended to keep
// them from matching a <head> or <html> tag that sits inside a comment.

const HTML_GET_HEAD_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<head(?>\s+[^>]*)?>/is';
const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0<meta http-equiv="content-type" '
. 'content="text/html; '
. 'charset=utf-8">';
const HTML_GET_HEAD_OPENING_TAG_REPLACEMENT = '$0' . self::HTTP_EQUIV_META_TAG;
const HTML_GET_HTML_OPENING_TAG_PATTERN = '/(?><!--.*?-->\s*)*<html(?>\s+[^>]*)?>/is';
const HTML_GET_HTML_OPENING_TAG_REPLACEMENT = '$0<head>' . self::HTTP_EQUIV_META_TAG . '</head>';
const HTML_GET_HTTP_EQUIV_TAG_PATTERN = '#<meta http-equiv=([\'"])content-type\1 '
. 'content=([\'"])text/html; '
. 'charset=utf-8\2>#i';
Expand Down Expand Up @@ -507,14 +517,7 @@ public function loadHTMLFragment($source, $options = [])
$source = $this->adaptEncoding($source);
}

// Force-add http-equiv charset to make DOMDocument behave as it should.
// See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
$source = preg_replace(
self::HTML_GET_HEAD_OPENING_TAG_PATTERN,
self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT,
$source,
1
);
$source = $this->addHttpEquivCharset($source);

$libxml_previous_state = libxml_use_internal_errors(true);

Expand Down Expand Up @@ -2044,4 +2047,46 @@ public function enforceCssMaxByteCount($maxByteCount = AMP::MAX_CSS_BYTE_COUNT)
{
$this->cssMaxByteCountEnforced = $maxByteCount;
}

/**
 * Add a http-equiv charset meta tag to the document's <head> node.
 *
 * This is needed to make the DOMDocument behave as it should in terms of encoding.
 * See: http://php.net/manual/en/domdocument.loadhtml.php#78243.
 *
 * The tag is injected at the first location that applies:
 *  1. right after an existing opening <head> tag;
 *  2. inside a new <head> element placed right after the opening <html> tag;
 *  3. inside a new <head> element prepended to the markup.
 *
 * @param string $html HTML string to add the http-equiv charset to.
 * @return string Adapted string of HTML.
 */
private function addHttpEquivCharset($html)
{
    // Ordered from most to least specific: an existing <head> first, then the <html> tag.
    $injections = [
        self::HTML_GET_HEAD_OPENING_TAG_PATTERN => self::HTML_GET_HEAD_OPENING_TAG_REPLACEMENT,
        self::HTML_GET_HTML_OPENING_TAG_PATTERN => self::HTML_GET_HTML_OPENING_TAG_REPLACEMENT,
    ];

    foreach ($injections as $pattern => $replacement) {
        $count  = 0;
        $result = preg_replace($pattern, $replacement, $html, 1, $count);

        // preg_replace() returns null on a PCRE error (e.g. the backtrack limit being hit on very
        // large markup). Only accept a result that is valid and actually injected the tag, so the
        // original markup is never overwritten with null.
        if ($result !== null && $count > 0) {
            return $result;
        }
    }

    // Neither a <head> nor an <html> tag could be used, so prepend a <head> holding the charset tag.
    return '<head>' . self::HTTP_EQUIV_META_TAG . '</head>' . $html;
}
}
60 changes: 60 additions & 0 deletions tests/Dom/DocumentTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -1190,4 +1190,64 @@ public function testLibxmlOptionBC()
$documentFragment->loadHTMLFragment('<div></div>', '524288');
$this->assertEquals($expectedOptions, $documentFragment->getOptions());
}

/**
 * Data for document fragment tests.
 *
 * Produces one dataset for every combination of the target markup being wrapped in a <body> tag,
 * preceded by an empty <head> element, wrapped in an <html> tag and prefixed with a doctype
 * (2 x 2 x 2 x 2 = 16 datasets). Each dataset has the shape [charset, source markup, expected markup],
 * where the expected markup is always the bare target element.
 *
 * @return array Data.
 */
public function dataDocumentFragment()
{
    $target = '<div style="Iñtërnâtiônàlizætiøn"></div>';

    // Initialise explicitly so the return value never relies on an undefined variable.
    $cases = [];

    foreach ([true, false] as $body) {
        foreach ([true, false] as $head) {
            foreach ([true, false] as $html) {
                foreach ([true, false] as $doctype) {
                    // Build the markup from the inside out; the case name is built the same way,
                    // so the outermost wrapper ends up first in the dataset name.
                    $source = $body ? '<body>' . $target . '</body>' : $target;
                    $case   = $body ? 'with_body' : 'without_body';

                    $source = $head ? '<head></head>' . $source : $source;
                    $case   = $head ? 'with_head_' . $case : 'without_head_' . $case;

                    $source = $html ? '<html>' . $source . '</html>' : $source;
                    $case   = $html ? 'with_html_' . $case : 'without_html_' . $case;

                    $source = $doctype ? '<!DOCTYPE html>' . $source : $source;
                    $case   = $doctype ? 'with_doctype_' . $case : 'without_doctype_' . $case;

                    $cases["fragment_encoding_{$case}"] = ['utf-8', $source, $target];
                }
            }
        }
    }

    return $cases;
}

/**
* Tests loading and saving a document fragment.
*
* @param string $charset Charset to use.
* @param string $source Source content.
* @param string $expected Expected target content.
* @param callable|null $fragmentCallback Optional. Callback to use for fetching the fragment node to compare.
* Defaults to retrieving the first child node of the body tag.
*
* @dataProvider dataDocumentFragment
* @covers \AmpProject\Dom\Document::loadHTML()
* @covers \AmpProject\Dom\Document::saveHTML()
*/
public function testDocumentFragment($charset, $source, $expected, $fragmentCallback = null)
{
    // Without an explicit callback, the node to compare is the first child of the <body> element.
    if (null === $fragmentCallback) {
        $fragmentCallback = static function (Document $dom) {
            $bodyElement = $dom->body;

            return $bodyElement->firstChild;
        };
    }

    // Load the source as a fragment, then serialise only the selected node for the comparison.
    $document     = Document::fromHtmlFragment($source, $charset);
    $fragmentNode = $fragmentCallback($document);
    $actualMarkup = $document->saveHTMLFragment($fragmentNode);

    $this->assertEqualMarkup($expected, $actualMarkup);
}
}
9 changes: 5 additions & 4 deletions tests/Optimizer/Transformer/ServerSideRenderingTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
use AmpProject\Optimizer\Error;
use AmpProject\Optimizer\ErrorCollection;
use AmpProject\Optimizer\Exception\InvalidHtmlAttribute;
use AmpProject\Tag;
use AmpProject\Tests\ErrorComparison;
use AmpProject\Tests\MarkupComparison;
use AmpProject\Tests\TestCase;
Expand Down Expand Up @@ -152,8 +153,8 @@ public function dataTransform()
[
Error\CannotRemoveBoilerplate::fromRenderDelayingScript(
Document::fromHtmlFragment(
TestMarkup::SCRIPT_AMPSTORY
)->head->firstChild
'<head>' . TestMarkup::SCRIPT_AMPSTORY . '</head>'
)->head->lastChild
),
],
],
Expand All @@ -164,8 +165,8 @@ public function dataTransform()
[
Error\CannotRemoveBoilerplate::fromRenderDelayingScript(
Document::fromHtmlFragment(
TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES
)->head->firstChild
'<head>' . TestMarkup::SCRIPT_AMPDYNAMIC_CSSCLASSES . '</head>'
)->head->lastChild
),
],
],
Expand Down