diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 52eca9d8b74..f7401cd9700 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -193,8 +193,6 @@ public static function register_hooks() { */ add_action( 'template_redirect', array( __CLASS__, 'start_output_buffering' ), 0 ); - add_filter( 'the_content', array( __CLASS__, 'filter_the_content' ), PHP_INT_MAX ); - // @todo Add character conversion. } @@ -442,32 +440,12 @@ public static function get_amp_custom_styles() { return $css; } - /** - * Filter the content to be valid AMP. - * - * @param string $content Content. - * @return string Amplified content. - */ - public static function filter_the_content( $content ) { - $args = array( - 'content_max_width' => ! empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat. - ); - - list( $sanitized_content, $scripts, $styles ) = AMP_Content_Sanitizer::sanitize( $content, self::$sanitizer_classes, $args ); - - self::$amp_scripts = array_merge( self::$amp_scripts, $scripts ); - self::$amp_styles = array_merge( self::$amp_styles, $styles ); - - return $sanitized_content; - } - /** * Determine required AMP scripts. * - * @param string $html Output HTML. * @return string Scripts to inject into the HEAD. */ - public static function get_amp_component_scripts( $html ) { + public static function get_amp_component_scripts() { $amp_scripts = self::$amp_scripts; foreach ( self::$embed_handlers as $embed_handler ) { @@ -512,15 +490,41 @@ public static function start_output_buffering() { * Finish output buffering. * * @todo Do this in shutdown instead of output buffering callback? + * @global int $content_width * @param string $output Buffered output. * @return string Finalized output. */ public static function finish_output_buffering( $output ) { + global $content_width; + + $dom = AMP_DOM_Utils::get_dom( $output ); + $args = array( + 'content_max_width' => ! 
empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat. + ); + + $assets = AMP_Content_Sanitizer::sanitize_document( $dom, self::$sanitizer_classes, $args ); + + self::$amp_scripts = array_merge( self::$amp_scripts, $assets['scripts'] ); + self::$amp_styles = array_merge( self::$amp_styles, $assets['styles'] ); + + /* + * @todo The sanitize method needs to be updated to sanitize the entire HTML element and not just the BODY. + * This will require updating mandatory_parent_blacklist in amphtml-update.py to include elements that appear in the HEAD. + * This will ensure that the scripts and styles that plugins output via wp_head() will be sanitized as well. However, + * since the old paired mode is sending content from the *body* we'll need to be able to filter out the elements + * from outside the body from being part of the whitelist sanitizer when it runs when theme support is not present, + * as otherwise elements from the HEAD could get added to the BODY. + */ + $output = preg_replace( + '#()(.+)()#si', + '$1' . AMP_DOM_Utils::get_content_from_dom( $dom ) . '$3', + $output + ); // Inject required scripts. $output = preg_replace( '#' . preg_quote( self::COMPONENT_SCRIPTS_PLACEHOLDER, '#' ) . '#', - self::get_amp_component_scripts( $output ), + self::get_amp_component_scripts(), $output, 1 ); @@ -533,7 +537,6 @@ public static function finish_output_buffering( $output ) { 1 ); - // @todo Add more validation checking and potentially the whitelist sanitizer. return $output; } } diff --git a/includes/sanitizers/class-amp-rule-spec.php b/includes/sanitizers/class-amp-rule-spec.php index 694b7dcf7df..6bec8aa167a 100644 --- a/includes/sanitizers/class-amp-rule-spec.php +++ b/includes/sanitizers/class-amp-rule-spec.php @@ -87,22 +87,18 @@ abstract class AMP_Rule_Spec { */ public static $additional_allowed_tags = array( - /** - * An experimental tag with no protoascii - */ + // An experimental tag with no protoascii.
'amp-share-tracking' => array( 'attr_spec_list' => array(), 'tag_spec' => array(), ), - /** - * Needed for some tags such as analytics - */ + // Needed for some tags such as analytics. 'script' => array( 'attr_spec_list' => array( 'type' => array( 'mandatory' => true, - 'value_casei' => 'text/javascript', + 'value_casei' => 'application/json', ), ), 'tag_spec' => array(), diff --git a/includes/templates/class-amp-content-sanitizer.php b/includes/templates/class-amp-content-sanitizer.php index dcbd62097ef..ddf533091cd 100644 --- a/includes/templates/class-amp-content-sanitizer.php +++ b/includes/templates/class-amp-content-sanitizer.php @@ -7,23 +7,50 @@ /** * Class AMP_Content_Sanitizer + * + * @since 0.4.1 */ class AMP_Content_Sanitizer { /** - * Sanitize. + * Sanitize _content_. + * + * @since 0.4.1 * - * @param string $content Content. + * @param string $content HTML content string or DOM document. * @param string[] $sanitizer_classes Sanitizer classes. * @param array $global_args Global args. - * - * @return array + * @return array Tuple containing sanitized HTML, scripts array, and styles array. */ public static function sanitize( $content, array $sanitizer_classes, $global_args = array() ) { + $dom = AMP_DOM_Utils::get_dom_from_content( $content ); + + $results = self::sanitize_document( $dom, $sanitizer_classes, $global_args ); + return array( + AMP_DOM_Utils::get_content_from_dom( $dom ), + $results['scripts'], + $results['styles'], + ); + } + + /** + * Sanitize document. + * + * @since 0.7 + * + * @param DOMDocument $dom HTML document. + * @param string[] $sanitizer_classes Sanitizer classes. + * @param array $global_args Global args passed into each sanitizer. + * @return array { + * Scripts and styles needed by sanitizers. + * + * @type array $scripts Scripts. + * @type array $styles Styles.
+ * } + */ + public static function sanitize_document( &$dom, $sanitizer_classes, $global_args ) { $scripts = array(); $styles = array(); - $dom = AMP_DOM_Utils::get_dom_from_content( $content ); - foreach ( $sanitizer_classes as $sanitizer_class => $args ) { if ( ! class_exists( $sanitizer_class ) ) { /* translators: %s is sanitizer class */ @@ -31,6 +58,11 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg continue; } + /** + * Sanitizer. + * + * @type AMP_Base_Sanitizer $sanitizer + */ $sanitizer = new $sanitizer_class( $dom, array_merge( $global_args, $args ) ); if ( ! is_subclass_of( $sanitizer, 'AMP_Base_Sanitizer' ) ) { @@ -45,9 +77,7 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg $styles = array_merge( $styles, $sanitizer->get_styles() ); } - $sanitized_content = AMP_DOM_Utils::get_content_from_dom( $dom ); - - return array( $sanitized_content, $scripts, $styles ); + return compact( 'scripts', 'styles' ); } } diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index d5f54bcbbf4..9492dd4cee5 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -13,17 +13,45 @@ class AMP_DOM_Utils { /** - * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter. + * HTML elements that are self-closing. * - * @see Reciprocal function get_content_from_dom() + * Not all are valid AMP, but we include them for completeness. * - * @since 0.2 + * @since 0.7 + * @link https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments + * @var array + */ + private static $self_closing_tags = array( + 'area', + 'base', + 'basefont', + 'bgsound', + 'br', + 'col', + 'embed', + 'frame', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', + ); + + /** + * Return a valid DOMDocument representing HTML document passed as a parameter. 
* - * @param string $content Valid HTML content to be represented by a DOMDocument. + * @since 0.7 + * + * @param string $document Valid HTML document to be represented by a DOMDocument. * * @return DOMDocument|false Returns DOMDocument, or false if conversion failed. */ - public static function get_dom_from_content( $content ) { + public static function get_dom( $document ) { $libxml_previous_state = libxml_use_internal_errors( true ); $dom = new DOMDocument(); @@ -34,13 +62,7 @@ public static function get_dom_from_content( $content ) { * We can later use this to extract our nodes. * Add charset so loadHTML does not have problems parsing it. */ - $result = $dom->loadHTML( - sprintf( - '%s', - get_bloginfo( 'charset' ), - $content - ) - ); + $result = $dom->loadHTML( $document ); libxml_clear_errors(); libxml_use_internal_errors( $libxml_previous_state ); @@ -52,6 +74,35 @@ public static function get_dom_from_content( $content ) { return $dom; } + /** + * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter. + * + * @see Reciprocal function get_content_from_dom() + * + * @since 0.2 + * + * @param string $content Valid HTML content to be represented by a DOMDocument. + * + * @return DOMDocument|false Returns DOMDocument, or false if conversion failed. + */ + public static function get_dom_from_content( $content ) { + /* + * Wrap in dummy tags, since XML needs one parent node. + * It also makes it easier to loop through nodes. + * We can later use this to extract our nodes. + * Add utf-8 charset so loadHTML does not have problems parsing it. + * See: http://php.net/manual/en/domdocument.loadhtml.php#78243 + */ + $document = sprintf( + '%s', + get_bloginfo( 'charset' ), + $content + ); + + return self::get_dom( $document ); + + } + /** * Return valid HTML content extracted from the DOMDocument passed as a parameter. 
* @@ -67,6 +118,8 @@ public static function get_content_from_dom( $dom ) { /** * We only want children of the body tag, since we have a subset of HTML. + * + * @todo We will want to get the full HTML eventually. */ $body = $dom->getElementsByTagName( 'body' )->item( 0 ); @@ -119,7 +172,7 @@ public static function get_content_from_dom_node( $dom, $node ) { * Cache this regex so we don't have to recreate it every call. */ if ( ! isset( $self_closing_tags_regex ) ) { - $self_closing_tags = implode( '|', self::get_self_closing_tags() ); + $self_closing_tags = implode( '|', self::$self_closing_tags ); $self_closing_tags_regex = "#>#i"; } @@ -262,48 +315,6 @@ public static function recursive_force_closing_tags( $dom, $node = null ) { * @return bool Returns true if a valid self-closing tag, false if not. */ private static function is_self_closing_tag( $tag ) { - return in_array( $tag, self::get_self_closing_tags(), true ); - } - - /** - * Returns array of self closing tags - * - * @since 0.6 - * - * @return string[] - */ - private static function get_self_closing_tags() { - /* - * As this function is called a lot the static var - * prevents having to re-create the array every time. - */ - static $self_closing_tags; - if ( ! isset( $self_closing_tags ) ) { - /* - * https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments - * Not all are valid AMP, but we include them for completeness. 
- */ - $self_closing_tags = array( - 'area', - 'base', - 'basefont', - 'bgsound', - 'br', - 'col', - 'embed', - 'frame', - 'hr', - 'img', - 'input', - 'keygen', - 'link', - 'meta', - 'param', - 'source', - 'track', - 'wbr', - ); - } - return $self_closing_tags; + return in_array( strtolower( $tag ), self::$self_closing_tags, true ); } } diff --git a/tests/test-class-amp-theme-support.php b/tests/test-class-amp-theme-support.php index 5a10663bfb7..0b15e7f1165 100644 --- a/tests/test-class-amp-theme-support.php +++ b/tests/test-class-amp-theme-support.php @@ -67,4 +67,53 @@ public function test_is_paired_available() { $this->assertTrue( is_search() ); $this->assertFalse( AMP_Theme_Support::is_paired_available() ); } + + /** + * Test finish_output_buffering. + * + * @covers AMP_Theme_Support::finish_output_buffering() + */ + public function test_finish_output_buffering() { + add_theme_support( 'amp' ); + AMP_Theme_Support::init(); + ob_start(); + ?> + + > + + + + + + + + + + + + assertContains( '', $sanitized_html ); + $this->assertContains( '', $sanitized_html ); + $this->assertContains( '