diff --git a/includes/class-amp-theme-support.php b/includes/class-amp-theme-support.php index 52eca9d8b74..f7401cd9700 100644 --- a/includes/class-amp-theme-support.php +++ b/includes/class-amp-theme-support.php @@ -193,8 +193,6 @@ public static function register_hooks() { */ add_action( 'template_redirect', array( __CLASS__, 'start_output_buffering' ), 0 ); - add_filter( 'the_content', array( __CLASS__, 'filter_the_content' ), PHP_INT_MAX ); - // @todo Add character conversion. } @@ -442,32 +440,12 @@ public static function get_amp_custom_styles() { return $css; } - /** - * Filter the content to be valid AMP. - * - * @param string $content Content. - * @return string Amplified content. - */ - public static function filter_the_content( $content ) { - $args = array( - 'content_max_width' => ! empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat. - ); - - list( $sanitized_content, $scripts, $styles ) = AMP_Content_Sanitizer::sanitize( $content, self::$sanitizer_classes, $args ); - - self::$amp_scripts = array_merge( self::$amp_scripts, $scripts ); - self::$amp_styles = array_merge( self::$amp_styles, $styles ); - - return $sanitized_content; - } - /** * Determine required AMP scripts. * - * @param string $html Output HTML. * @return string Scripts to inject into the HEAD. */ - public static function get_amp_component_scripts( $html ) { + public static function get_amp_component_scripts() { $amp_scripts = self::$amp_scripts; foreach ( self::$embed_handlers as $embed_handler ) { @@ -512,15 +490,41 @@ public static function start_output_buffering() { * Finish output buffering. * * @todo Do this in shutdown instead of output buffering callback? + * @global int $content_width * @param string $output Buffered output. * @return string Finalized output. */ public static function finish_output_buffering( $output ) { + global $content_width; + + $dom = AMP_DOM_Utils::get_dom( $output ); + $args = array( + 'content_max_width' => ! 
empty( $content_width ) ? $content_width : AMP_Post_Template::CONTENT_MAX_WIDTH, // Back-compat. + ); + + $assets = AMP_Content_Sanitizer::sanitize_document( $dom, self::$sanitizer_classes, $args ); + + self::$amp_scripts = array_merge( self::$amp_scripts, $assets['scripts'] ); + self::$amp_styles = array_merge( self::$amp_styles, $assets['styles'] ); + + /* + * @todo The sanitize method needs to be updated to sanitize the entire HTML element and not just the BODY. + * This will require updating mandatory_parent_blacklist in amphtml-update.py to include elements that appear in the HEAD. + * This will ensure that the scripts and styles that plugins output via wp_head() will be sanitized as well. However, + * since the old paired mode is sending content from the *body* we'll need to be able to filter out the elements + * from outside the body from being part of the whitelist sanitizer when it runs when theme support is not present, + * as otherwise elements from the HEAD could get added to the BODY. + */ + $output = preg_replace( + '#(
)#si', + '$1' . AMP_DOM_Utils::get_content_from_dom( $dom ) . '$3', + $output + ); // Inject required scripts. $output = preg_replace( '#' . preg_quote( self::COMPONENT_SCRIPTS_PLACEHOLDER, '#' ) . '#', - self::get_amp_component_scripts( $output ), + self::get_amp_component_scripts(), $output, 1 ); @@ -533,7 +537,6 @@ public static function finish_output_buffering( $output ) { 1 ); - // @todo Add more validation checking and potentially the whitelist sanitizer. return $output; } } diff --git a/includes/sanitizers/class-amp-rule-spec.php b/includes/sanitizers/class-amp-rule-spec.php index 694b7dcf7df..6bec8aa167a 100644 --- a/includes/sanitizers/class-amp-rule-spec.php +++ b/includes/sanitizers/class-amp-rule-spec.php @@ -87,22 +87,18 @@ abstract class AMP_Rule_Spec { */ public static $additional_allowed_tags = array( - /** - * An experimental tag with no protoascii - */ + // An experimental tag with no protoascii. 'amp-share-tracking' => array( 'attr_spec_list' => array(), 'tag_spec' => array(), ), - /** - * Needed for some tags such as analytics - */ + // Needed for some tags such as analytics. 'script' => array( 'attr_spec_list' => array( 'type' => array( 'mandatory' => true, - 'value_casei' => 'text/javascript', + 'value_casei' => 'application/json', ), ), 'tag_spec' => array(), diff --git a/includes/templates/class-amp-content-sanitizer.php b/includes/templates/class-amp-content-sanitizer.php index dcbd62097ef..ddf533091cd 100644 --- a/includes/templates/class-amp-content-sanitizer.php +++ b/includes/templates/class-amp-content-sanitizer.php @@ -7,23 +7,50 @@ /** * Class AMP_Content_Sanitizer + * + * @since 0.4.1 */ class AMP_Content_Sanitizer { /** - * Sanitize. + * Sanitize _content_. + * + * @since 0.4.1 * - * @param string $content Content. + * @param string $content HTML content string or DOM document. * @param string[] $sanitizer_classes Sanitizer classes. * @param array $global_args Global args. 
- * - * @return array + * @return array Tuple containing sanitized HTML, scripts array, and styles array. */ public static function sanitize( $content, array $sanitizer_classes, $global_args = array() ) { + $dom = AMP_DOM_Utils::get_dom_from_content( $content ); + + $results = self::sanitize_document( $dom, $sanitizer_classes, $global_args ); + return array( + AMP_DOM_Utils::get_content_from_dom( $dom ), + $results['scripts'], + $results['styles'], + ); + } + + /** + * Sanitize document. + * + * @since 0.7 + * + * @param DOMDocument $dom HTML document. + * @param string[] $sanitizer_classes Sanitizer classes. + * @param array $global_args Global args passed into each sanitizer. + * @return array { + * Scripts and styles needed by sanitizers. + * + * @type array $scripts Scripts. + * @type array $styles Styles. + * } + */ + public static function sanitize_document( &$dom, $sanitizer_classes, $global_args ) { $scripts = array(); $styles = array(); - $dom = AMP_DOM_Utils::get_dom_from_content( $content ); - foreach ( $sanitizer_classes as $sanitizer_class => $args ) { if ( ! class_exists( $sanitizer_class ) ) { /* translators: %s is sanitizer class */ @@ -31,6 +58,11 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg continue; } + /** + * Sanitizer. + * + * @type AMP_Base_Sanitizer $sanitizer + */ $sanitizer = new $sanitizer_class( $dom, array_merge( $global_args, $args ) ); if ( !
is_subclass_of( $sanitizer, 'AMP_Base_Sanitizer' ) ) { @@ -45,9 +77,7 @@ public static function sanitize( $content, array $sanitizer_classes, $global_arg $styles = array_merge( $styles, $sanitizer->get_styles() ); } - $sanitized_content = AMP_DOM_Utils::get_content_from_dom( $dom ); - - return array( $sanitized_content, $scripts, $styles ); + return compact( 'scripts', 'styles' ); } } diff --git a/includes/utils/class-amp-dom-utils.php b/includes/utils/class-amp-dom-utils.php index d5f54bcbbf4..9492dd4cee5 100644 --- a/includes/utils/class-amp-dom-utils.php +++ b/includes/utils/class-amp-dom-utils.php @@ -13,17 +13,45 @@ class AMP_DOM_Utils { /** - * Return a valid DOMDocument representing arbitrary HTML content passed as a parameter. + * HTML elements that are self-closing. * - * @see Reciprocal function get_content_from_dom() + * Not all are valid AMP, but we include them for completeness. * - * @since 0.2 + * @since 0.7 + * @link https://www.w3.org/TR/html5/syntax.html#serializing-html-fragments + * @var array + */ + private static $self_closing_tags = array( + 'area', + 'base', + 'basefont', + 'bgsound', + 'br', + 'col', + 'embed', + 'frame', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', + ); + + /** + * Return a valid DOMDocument representing HTML document passed as a parameter. * - * @param string $content Valid HTML content to be represented by a DOMDocument. + * @since 0.7 + * + * @param string $document Valid HTML document to be represented by a DOMDocument. * * @return DOMDocument|false Returns DOMDocument, or false if conversion failed. */ - public static function get_dom_from_content( $content ) { + public static function get_dom( $document ) { $libxml_previous_state = libxml_use_internal_errors( true ); $dom = new DOMDocument(); @@ -34,13 +62,7 @@ public static function get_dom_from_content( $content ) { * We can later use this to extract our nodes. 
* Add charset so loadHTML does not have problems parsing it. */ - $result = $dom->loadHTML( - sprintf( - '