From d5c1f80ad91c0d230032781f5c45538a31702853 Mon Sep 17 00:00:00 2001 From: Del Putnam Date: Mon, 16 Jan 2017 06:36:46 -0500 Subject: [PATCH] Changes to address issue #604. Replaced with which uses AMP_Allowed_Styles_Generated to validate CSS. Also replaced use of with a custom function which ignores delimiters in parentheses and quotation marks. --- bin/amp_wp_build_styles.py | 754 ++++++++++++++++++ .../class-amp-allowed-styles-generated.php | 234 ++++++ .../sanitizers/class-amp-style-sanitizer.php | 397 ++++++++- tests/test-amp-style-sanitizer.php | 204 +++-- 4 files changed, 1506 insertions(+), 83 deletions(-) create mode 100644 bin/amp_wp_build_styles.py create mode 100644 includes/sanitizers/class-amp-allowed-styles-generated.php diff --git a/bin/amp_wp_build_styles.py b/bin/amp_wp_build_styles.py new file mode 100644 index 00000000000..3c5e3857111 --- /dev/null +++ b/bin/amp_wp_build_styles.py @@ -0,0 +1,754 @@ +""" +This script is used to generate the 'class-amp-allowed-styles-generated.php' +file that is used by the class AMP_Style_Sanitizer. + +Follow the steps below to generate a new version of the allowed styles class: + +- Download a copy of the latest AMPHTML repository from GitHub: + + git clone git@github.com:ampproject/amphtml.git + +- Copy this file into the repo's validator subdirectory: + + cp amp_wp_build_styles.py amphtml/validator + +- Run the file from the validator subdirectory: + cd amphtml/validator;python amp_wp_build_styles.py + +- The class-amp-allowed-styles-generated.php will be generated at: + amphtml/validator/amp_wp/class-amp-allowed-styles-generated.php + +- Copy this file into the amp-wp plugin: + cp amp_wp/class-amp-allowed-styles-generated.php /path/to/wordpress/wp-content/plugins/amp-wp/includes/sanitizers/ + +Then have fun sanitizing your AMP posts! 
+""" + +import glob +import logging +import os +import platform +import re +import shutil +import subprocess +import sys +import tempfile +import collections + +def Die(msg): + print >> sys.stderr, msg + sys.exit(1) + + +def SetupOutDir(out_dir): + """Sets up a clean output directory. + + Args: + out_dir: directory name of the output directory. Must not have slashes, + dots, etc. + """ + #logging.info('entering ...') + assert re.match(r'^[a-zA-Z_\-0-9]+$', out_dir), 'bad out_dir: %s' % out_dir + + if os.path.exists(out_dir): + subprocess.check_call(['rm', '-rf', out_dir]) + os.mkdir(out_dir) + #logging.info('... done') + + +def GenValidatorPb2Py(out_dir): + """Calls the proto compiler to generate validator_pb2.py. + + Args: + out_dir: directory name of the output directory. Must not have slashes, + dots, etc. + """ + #logging.info('entering ...') + assert re.match(r'^[a-zA-Z_\-0-9]+$', out_dir), 'bad out_dir: %s' % out_dir + + subprocess.check_call(['protoc', 'validator.proto', + '--python_out=%s' % out_dir]) + open('%s/__init__.py' % out_dir, 'w').close() + #logging.info('... done') + + +def GenValidatorProtoascii(out_dir): + """Assembles the validator protoascii file from the main and extensions. + + Args: + out_dir: directory name of the output directory. Must not have slashes, + dots, etc. + """ + #logging.info('entering ...') + assert re.match(r'^[a-zA-Z_\-0-9]+$', out_dir), 'bad out_dir: %s' % out_dir + + protoascii_segments = [open('validator-main.protoascii').read()] + extensions = glob.glob('extensions/*/0.1/validator-*.protoascii') + # In the Github project, the extensions are located in a sibling directory + # to the validator rather than a child directory. 
+ if not extensions: + extensions = glob.glob('../extensions/*/0.1/validator-*.protoascii') + extensions.sort() + for extension in extensions: + protoascii_segments.append(open(extension).read()) + f = open('%s/validator.protoascii' % out_dir, 'w') + f.write(''.join(protoascii_segments)) + f.close() + + #logging.info('... done') + + +def GeneratePHP(out_dir): + """Calls validator_gen_md to generate validator-generated.md. + + Args: + out_dir: directory name of the output directory. Must not have slashes, + dots, etc. + """ + #logging.info('entering ...') + assert re.match(r'^[a-zA-Z_\-0-9]+$', out_dir), 'bad out_dir: %s' % out_dir + + allowed_tags, attr_lists, versions = ParseRules(out_dir) + + #Generate the output + out = [] + GenerateHeaderPHP(out) + GenerateSpecVersionPHP(out, versions) + GenerateAllowedStylesPHP(out, allowed_tags) + # GenerateLayoutAttributesPHP(out, attr_lists) + # GenerateGlobalAttributesPHP(out, attr_lists) + GenerateFooterPHP(out) + + # join out array into a single string and remove unneeded whitespace + output = re.sub("\\(\\s*\\)", "()", '\n'.join(out)) + + # replace 'True' with true and 'False' with false + output = re.sub("'True'", "true", output) + output = re.sub("'False'", "false", output) + + # Write the php file to disk. + f = open('%s/class-amp-allowed-styles-generated.php' % out_dir, 'w') + # f.write('\n'.join(out)) + f.write(output) + f.close() + #logging.info('... done') + + +def GenerateHeaderPHP(out): + #logging.info('entering ...') + + # Output the file's header + out.append(' array(' % tag.lower()) + for attributes in attributes_list: + out.append('\t\t\tarray(') + GenerateAttributesPHP(out, attributes) + out.append('\t\t\t),') + out.append('\t\t),') + #logging.info('... 
done with: %s' % tag.lower()) + + +def GenerateAttributesPHP(out, attributes, indent_level = 4): + #logging.info('entering ...') + + indent = '' + for i in range(0,indent_level): + indent += '\t' + + sorted_attributes = sorted(attributes.items()) + for (attribute, values) in collections.OrderedDict(sorted_attributes).iteritems(): + #logging.info('generating php for attribute: %s...' % attribute.lower()) + out.append('%s\'%s\' => array(' % (indent, attribute.lower())) + GeneratePropertiesPHP(out, values) + out.append('%s),' % indent) + #logging.info('...done with: %s' % attribute.lower()) + + out.append('') + #logging.info('... done') + + +def GeneratePropertiesPHP(out, properties, indent_level = 5): + #logging.info('entering ...') + indent = '' + for i in range(0,indent_level): + indent += '\t' + + sorted_properties = sorted(properties.items()) + for (prop, values) in collections.OrderedDict(sorted_properties).iteritems(): + if 'html_format' == prop: + sorted_values = sorted(values.items()) + for(value_type, value) in collections.OrderedDict(sorted_values).iteritems(): + out.append('%s\'%s\' => array(' % (indent, value_type.lower())) + GenerateValuesPHP(out, value, 5) + out.append('%s),' % indent) + #logging.info('generating php for property: %s...' 
% prop.lower()) + elif isinstance(values, (str, bool, int)): + if isinstance(values, str): + values = values.lower() + out.append('%s\'%s\' => \'%s\',' % (indent, prop.lower(), values)) + else: + out.append('%s\'%s\' => array(' % (indent, prop.lower())) + if isinstance(values, dict): + sorted_values = sorted(values.items()) + for(value_type, value) in collections.OrderedDict(sorted_values).iteritems(): + if isinstance(value, (str, bool, int)): + if isinstance(value, str): + value = value.lower() + out.append('%s\t\'%s\' => \'%s\',' % (indent, value_type, value)) + else: + out.append('%s\t\'%s\' => array(' % (indent, value_type.lower())) + GenerateValuesPHP(out, value, 7) + out.append('%s\t),' % indent) + if isinstance(values, list): + for value in values: + if isinstance(value, dict): + out.append('%sarray(' % indent) + sorted_items = sorted(value.items()) + for (k,v) in collections.OrderedDict(sorted_items).iteritems(): + out.append('%s\t\'%s\' => \'%s\',' % (indent, k, v)) + out.append('%s),' % indent) + + elif isinstance(value, (str,int,bool)): + out.append('%s\t\'%s\',' % (indent,value)) + + out.append('%s),' % indent) + + #logging.info('...done with: %s' % prop.lower()) + + #logging.info('...done') + + +def GenerateValuesPHP(out, values, indent_level = 6): + #logging.info('entering...') + + indent = '' + for i in range(0, indent_level): + indent += '\t' + + if isinstance(values, dict): + sorted_values = sorted(values.items()) + for (key, value) in collections.OrderedDict(sorted_values).iteritems(): + + #logging.info('generating php for value: %s...' 
% key.lower()) + + if isinstance(value, (str, bool)): + out.append('%s\'%s\' => \'%s\',' % (indent, key.lower(), value)) + + if isinstance(value, list): + out.append('%s\'%s\' => array(' % (indent, key.lower())) + sorted_value = sorted(value) + for v in sorted_value: + out.append('%s\t\'%s\',' % (indent, v)) + out.append('%s),' % indent) + + #logging.info('...done with: %s' % key.lower()) + + elif isinstance(values, list): + sorted_values = sorted(values) + for v in sorted_values: + #logging.info('generating php for value: %s' % v.lower()) + out.append('%s\t\'%s\',' % (indent, v.lower())) + #logging.info('...done with: %s' % v.lower()) + + #logging.info('...done') + + +def GenerateFooterPHP(out): + #logging.info('entering ...') + + # Output the footer. + out.append('\tpublic static function get_allowed_styles() {') + out.append('\t\treturn self::$allowed_styles;') + out.append('\t}') + out.append('') + + out.append('\tpublic static function get_custom_styles_for_amp_html() {') + out.append('\t\tforeach ( self::$allowed_styles[\'style\'] as $value) {') + out.append('\t\t\tif ( isset( $value[\'attr_spec_list\'][\'amp-custom\'] ) &&') + out.append('\t\t\t\tin_array( \'amp\', $value[\'tag_spec\'][\'html_format\'] ) ) {') + out.append('\t\t\t\treturn $value;') + out.append('\t\t\t}') + out.append('\t\t}') + out.append('\t}') + out.append('') + + # out.append('\tpublic static function get_allowed_attributes() {') + # out.append('\t\treturn self::$globally_allowed_attrs;') + # out.append('\t}') + # out.append('') + + # out.append('\tpublic static function get_layout_attributes() {') + # out.append('\t\treturn self::$layout_allowed_attrs;') + # out.append('\t}') + # out.append('') + + out.append('}') + out.append('') + + out.append('?>') + out.append('') + #logging.info('... 
done') + + +def ParseRules(out_dir): + #logging.info('entering ...') + + # These imports happen late, within this method because they don't necessarily + # exist when the module starts running, and the ones that probably do + # are checked by CheckPrereqs. + from google.protobuf import text_format + from amp_wp import validator_pb2 + import validator_gen_md + + allowed_tags = {} + attr_lists = {} + versions = {} + + specfile='%s/validator.protoascii' % out_dir + + validator_pb2=validator_pb2 + text_format=text_format + + # Merge specfile with message buffers. + rules = validator_pb2.ValidatorRules() + text_format.Merge(open(specfile).read(), rules) + + # Record the version of this specfile and the corresponding validator version. + if rules.HasField('spec_file_revision'): + versions['spec_file_revision'] = rules.spec_file_revision + + if rules.HasField('min_validator_revision_required'): + versions['min_validator_revision_required'] = rules.min_validator_revision_required + + # Build a dictionary of the named attribute lists that are used by multiple tags. + for (field_desc, field_val) in rules.ListFields(): + if 'attr_lists' == field_desc.name: + for attr_spec in field_val: + attr_lists[UnicodeEscape(attr_spec.name)] = GetAttrs(attr_spec.attrs) + + # Build a dictionary of allowed tags and an associated list of their allowed + # attributes, values and other criteria. 
+ + # Don't include tags that have a mandatory parent with one of these tag names + # since we're only concerned with using this tag list to validate the body + # of the DOM + mandatory_parent_blacklist = [ + '$ROOT', + '!DOCTYPE', + 'HTML', + 'HEAD', + ] + + for (field_desc, field_val) in rules.ListFields(): + if 'tags' == field_desc.name: + for tag_spec in field_val: + + # Ignore tags that are outside of the body + #if tag_spec.HasField('mandatory_parent') and tag_spec.mandatory_parent in mandatory_parent_blacklist and tag_spec.tag_name != 'BODY': + # continue + + if tag_spec.tag_name != 'STYLE': + continue + + # Ignore the special $REFERENCE_POINT tag + # if '$REFERENCE_POINT' == tag_spec.tag_name: + # continue + + # Ignore deprecated tags + if tag_spec.HasField('deprecation'): + continue + + # If we made it here, then start adding the tag_spec + if tag_spec.tag_name not in allowed_tags: + tag_list = [] + else: + tag_list = allowed_tags[UnicodeEscape(tag_spec.tag_name)] + # AddTag(allowed_tags, tag_spec, attr_lists) + tag_list.append(GetTagSpec(tag_spec, attr_lists)) + allowed_tags[UnicodeEscape(tag_spec.tag_name)] = tag_list + + #logging.info('... done') + return allowed_tags, attr_lists, versions + + +def GetTagSpec(tag_spec, attr_lists): + #logging.info('entering ...') + + tag_dict = GetTagRules(tag_spec) + attr_dict = GetAttrs(tag_spec.attrs) + cdata_dict = GetCdataRules(tag_spec.cdata) + # print( cdata_dict ) + # TODO: add CDATA section if validation of non-body elements is required. + + # Now add attributes from any attribute lists to this tag. + for (tag_field_desc, tag_field_val) in tag_spec.ListFields(): + if 'attr_lists' == tag_field_desc.name: + for attr_list in tag_field_val: + attr_dict.update(attr_lists[UnicodeEscape(attr_list)]) + + #logging.info('... 
done') + return {'tag_spec':tag_dict, 'attr_spec_list':attr_dict, 'cdata_spec_list':cdata_dict} + + +def GetTagRules(tag_spec): + #logging.info('entering ...') + + tag_rules = {} + + if tag_spec.also_requires_tag: + also_requires_tag_list = [] + for also_requires_tag in tag_spec.also_requires_tag: + also_requires_tag_list.append(UnicodeEscape(also_requires_tag)) + tag_rules['also_requires_tag'] = {'also_requires_tag': also_requires_tag_list} + + if tag_spec.disallowed_ancestor: + disallowed_ancestor_list = [] + for disallowed_ancestor in tag_spec.disallowed_ancestor: + disallowed_ancestor_list.append(UnicodeEscape(disallowed_ancestor)) + tag_rules['disallowed_ancestor'] = {'disallowed_ancestor': disallowed_ancestor_list} + + if tag_spec.html_format: + html_format_list = [] + for html_format in tag_spec.html_format: + if 1 == html_format: + html_format_list.append('amp') + elif 2 == html_format: + html_format_list.append('amp4ads') + tag_rules['html_format'] = {'html_format': html_format_list} + + if tag_spec.HasField('mandatory'): + tag_rules['mandatory'] = tag_spec.mandatory + + if tag_spec.HasField('mandatory_alternatives'): + tag_rules['mandatory_alternatives'] = UnicodeEscape(tag_spec.mandatory_alternatives) + + if tag_spec.HasField('mandatory_ancestor'): + tag_rules['mandatory_ancestor'] = UnicodeEscape(tag_spec.mandatory_ancestor) + + if tag_spec.HasField('mandatory_ancestor_suggested_alternative'): + tag_rules['mandatory_ancestor_suggested_alternative'] = UnicodeEscape(tag_spec.mandatory_ancestor_suggested_alternative) + + if tag_spec.HasField('mandatory_parent'): + tag_rules['mandatory_parent'] = UnicodeEscape(tag_spec.mandatory_parent) + + if tag_spec.HasField('spec_name'): + tag_rules['spec_name'] = UnicodeEscape(tag_spec.spec_name) + + if tag_spec.HasField('spec_url'): + tag_rules['spec_url'] = UnicodeEscape(tag_spec.spec_url) + + if tag_spec.HasField('unique'): + tag_rules['unique'] = tag_spec.unique + + if tag_spec.HasField('unique_warning'): + 
tag_rules['unique_warning'] = tag_spec.unique_warning + + + + #logging.info('... done') + return tag_rules + + +def GetAttrs(attrs): + #logging.info('entering ...') + + attr_dict = {} + for attr_spec in attrs: + + value_dict = GetValues(attr_spec) + + # Add attribute name and alternative_names + attr_dict[UnicodeEscape(attr_spec.name)] = value_dict + + #logging.info('... done') + return attr_dict + + +def GetValues(attr_spec): + #logging.info('entering ...') + + value_dict = {} + + # Add alternative names + if attr_spec.alternative_names: + alt_names_list = [] + for alternative_name in attr_spec.alternative_names: + alt_names_list.append(UnicodeEscape(alternative_name)) + value_dict['alternative_names'] = {'alternative_names': alt_names_list} + + # Add blacklisted value regex + if attr_spec.HasField('blacklisted_value_regex'): + value_dict['blacklisted_value_regex'] = UnicodeEscape(attr_spec.blacklisted_value_regex) + + # dispatch_key is a boolean + if attr_spec.HasField('dispatch_key'): + value_dict['dispatch_key'] = attr_spec.dispatch_key + + # mandatory is a boolean + if attr_spec.HasField('mandatory'): + value_dict['mandatory'] = attr_spec.mandatory + + # Add allowed value + if attr_spec.HasField('value'): + value_dict['value'] = UnicodeEscape(attr_spec.value) + + # value_casei + if attr_spec.HasField('value_casei'): + value_dict['value_casei'] = UnicodeEscape(attr_spec.value_casei) + + # value_regex + if attr_spec.HasField('value_regex'): + value_dict['value_regex'] = UnicodeEscape(attr_spec.value_regex) + + # value_regex_casei + if attr_spec.HasField('value_regex_casei'): + value_dict['value_regex_casei'] = UnicodeEscape(attr_spec.value_regex_casei) + + #value_properties is a dictionary of dictionaries + if attr_spec.HasField('value_properties'): + value_properties_dict = {} + for (value_properties_key, value_properties_val) in attr_spec.value_properties.ListFields(): + for value_property in value_properties_val: + property_dict = {} + # print 
'value_property.name: %s' % value_property.name + for (key,val) in value_property.ListFields(): + if val != value_property.name: + if isinstance(val, unicode): + val = UnicodeEscape(val) + property_dict[UnicodeEscape(key.name)] = val + value_properties_dict[UnicodeEscape(value_property.name)] = property_dict + value_dict['value_properties'] = value_properties_dict + + # value_url is a dictionary + if attr_spec.HasField('value_url'): + value_url_dict = {} + for (value_url_key, value_url_val) in attr_spec.value_url.ListFields(): + if isinstance(value_url_val, (list, collections.Sequence)): + value_url_val_val = [] + for val in value_url_val: + value_url_val_val.append(UnicodeEscape(val)) + else: + value_url_val_val = value_url_val + value_url_dict[value_url_key.name] = value_url_val_val + value_dict['value_url'] = value_url_dict + + #logging.info('... done') + return value_dict + +def GetCdataRules(cdata): + #logging.info('entering ...') + + cdata_rules = {} + + for (key,val) in cdata.ListFields(): + + if isinstance(val, (str,bool,int)): + cdata_rules[key.name] = val + + elif isinstance(val, unicode): + cdata_rules[key.name] = UnicodeEscape(val) + + elif 'css_spec' == key.name: + css_spec = {} + for (k,v) in val.ListFields(): + + if 'at_rule_spec' == k.name: + if k.name not in css_spec: + at_rule_spec = [] + else: + at_rule_spec = css_spec[UnicodeEscape(k.name)] + + for vv in v: + if vv.HasField('name'): + at_rule_spec.append(vv.name) + + css_spec[UnicodeEscape(k.name)] = at_rule_spec + + elif 'image_url_spec' == k.name or 'font_url_spec' == k.name: + if k.name not in css_spec: + url_spec = {} + else: + url_spec = css_spec[UnicodeEscape(k.name)] + + for (usk,usv) in v.ListFields(): + + if isinstance(usv,(str,bool,int)): + url_spec[UnicodeEscape(usk.name)] = usv + + elif isinstance(usv,(unicode)): + url_spec[UnicodeEscape(usk.name)] = UnicodeEscape(usv) + + + elif isinstance(usv, (list,collections.Sequence)): + if usk.name not in url_spec: + url_spec_list = [] + else: 
+ url_spec_list = url_spec[UnicodeEscape(usk.name)] + + for usvv in usv: + url_spec_list.append(UnicodeEscape(usvv)) + + url_spec[UnicodeEscape(usk.name)] = url_spec_list + + css_spec[UnicodeEscape(k.name)] = url_spec + + cdata_rules[key.name] = css_spec + + elif 'blacklisted_cdata_regex' == key.name: + blacklisted_cdata_regex_list = [] + for v in val: + blacklisted_cdata_regex_list.append(UnicodeEscape(v.regex)) + + cdata_rules[key.name] = blacklisted_cdata_regex_list + + + + + + # if 'validate_amp4ads' == k.name: + # css_spec[k.name] = v + # if 'at_rule_spec' == k.name: + # at_rule_spec = [] + # for vv in v: + # at_rule_spec_fields = {} + # for (arsk,arsv) in vv.ListFields(): + # at_rule_spec_fields[arsk.name] = arsv + # css_spec[k.name] = at_rule_spec_fields + # elif ('font_url_spec' == k.name) or ('image_url_spec' == k.name): + # url_spec = {} + # for (usk,usv) in v.ListFields(): + # if isinstance(usv, collections.Sequence): + # usv_list = [] + # for usvv in usv: + # usv_list.append(UnicodeEscape(usvv)) + # url_spec[usk.name] = usv_list + # else: + # url_spec[usk.name] = usv + # css_spec[k.name] = url_spec + # cdata_rules[key.name] = css_spec + # elif 'blacklisted_cdata_regex' == key.name: + # bcr_list = [] + # for v in val: + # bcr_list_fields = {} + # for (kk,vv) in v.ListFields(): + # bcr_list_fields[kk.name] = vv + # bcr_list.append(bcr_list_fields) + # cdata_rules[key.name] = bcr_list + # elif isinstance(val, bool): + # cdata_rules[key.name] = val + # elif isinstance(val, unicode): + # cdata_rules[key.name] = UnicodeEscape(val) + # elif isinstance(val, str): + # cdata_rules[key.name] = val + # elif isinstance(val, int): + # cdata_rules[key.name] = val + + return cdata_rules + +def GetAtRuleSpec(at_rule_spec): + + for v in at_rule_spec: + + print "%s %s" % (v.name, type(v.name)) + +def GetCssSpec(css_spec): + css_spec = {} + for (k,v) in css_spec: + if isinstance(v, (str,bool,int)): + css_spec[k.name] = v + elif isinstance(v, unicode): + css_spec[k.name] = 
UnicodeEscape(v) + + return css_spec + +def UnicodeEscape(string): + """Helper function which escapes unicode characters. + + Args: + string: A string which may contain unicode characters. + Returns: + An escaped string. + """ + return ('' + string).encode('unicode-escape') + + +def Main(): + """The main method, which executes all build steps and runs the tests.""" + logging.basicConfig( + format='[[%(filename)s %(funcName)s]] - %(message)s', level=logging.INFO) + + out_dir = 'amp_wp' + + SetupOutDir(out_dir) + GenValidatorProtoascii(out_dir) + GenValidatorPb2Py(out_dir) + GenValidatorProtoascii(out_dir) + GeneratePHP(out_dir) + +if __name__ == '__main__': + Main() diff --git a/includes/sanitizers/class-amp-allowed-styles-generated.php b/includes/sanitizers/class-amp-allowed-styles-generated.php new file mode 100644 index 00000000000..62218f4496b --- /dev/null +++ b/includes/sanitizers/class-amp-allowed-styles-generated.php @@ -0,0 +1,234 @@ + array( + array( + 'attr_spec_list' => array( + 'amp-custom' => array( + 'mandatory' => true, + 'value' => '', + ), + 'type' => array( + 'value_casei' => 'text/css', + ), + ), + 'cdata_spec_list' => array( + 'blacklisted_cdata_regex' => array( + '