-
Notifications
You must be signed in to change notification settings - Fork 383
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
added host validation for urls #983
Changes from 15 commits
a39e073
843760a
f78ac49
4f6dcc2
b8fed2e
5a6ae74
2067992
331d152
454b285
7a3e497
ea52b1e
aca760d
3ced093
790dec8
8e6698b
a04782a
3604c6e
0d9d50a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
+2 −2 | check-diff.sh | |
+16 −7 | phpunit-plugin-bootstrap.php | |
+1 −1 | travis.install.sh |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -699,6 +699,19 @@ private function validate_attr_spec_list_for_node( $node, $attr_spec_list ) { | |
} | ||
} | ||
|
||
/* | ||
* If given attribute's value is a URL with a host, the host must | ||
* be valid | ||
*/ | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) ) { | ||
$result = $this->check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ); | ||
if ( AMP_Rule_Spec::PASS === $result ) { | ||
$score++; | ||
} elseif ( AMP_Rule_Spec::FAIL === $result ) { | ||
return 0; | ||
} | ||
} | ||
|
||
/* | ||
* If the given attribute's value is *not* a relative path, and the rule's | ||
* value is `false`, then pass. | ||
|
@@ -877,6 +890,9 @@ private function delegated_sanitize_disallowed_attribute_values_in_node( $node, | |
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) && | ||
AMP_Rule_Spec::FAIL === $this->check_attr_spec_rule_disallowed_relative( $node, $attr_name, $attr_spec_rule ) ) { | ||
$should_remove_node = true; | ||
|
@@ -1122,6 +1138,47 @@ private function check_attr_spec_rule_value_regex_casei( $node, $attr_name, $att | |
return AMP_Rule_Spec::NOT_APPLICABLE; | ||
} | ||
|
||
/** | ||
* Check if attribute has a valid host value | ||
* | ||
* @since 0.7 | ||
* | ||
* @param DOMElement $node Node. | ||
* @param string $attr_name Attribute name. | ||
* @param array[]|string[] $attr_spec_rule Attribute spec rule. | ||
* | ||
* @return string: | ||
* - AMP_Rule_Spec::PASS - $attr_name has a value that matches the rule. | ||
* - AMP_Rule_Spec::FAIL - $attr_name has a value that does *not* match rule. | ||
* - AMP_Rule_Spec::NOT_APPLICABLE - $attr_name does not exist or there | ||
* is no rule for this attribute. | ||
*/ | ||
private function check_attr_spec_rule_valid_url( $node, $attr_name, $attr_spec_rule ) { | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ] ) ) { | ||
if ( $node->hasAttribute( $attr_name ) ) { | ||
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) ); | ||
foreach ( $urls_to_test as $url ) { | ||
$url = urldecode( $url ); | ||
// Check if the host contains invalid chars. | ||
$url_host = wp_parse_url( $url, PHP_URL_HOST ); | ||
if ( $url_host && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', $url_host ) ) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I should think that a whitelist should be used rather than a blacklist here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tend to think like you, but the amp validator is doing it this way, looking for blacklisted chars, so i thought it would be better to follow the same approach. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this is supposed to match everything matched by |
||
return AMP_Rule_Spec::FAIL; | ||
} | ||
|
||
// Check if the protocol contains invalid chars. | ||
$dots_pos = strpos( $url, ':' ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this not be using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case, you could omit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The point is that |
||
if ( false !== $dots_pos && preg_match( '/[!"#$%&\'()*+,\/:;<=>?@[\]^`{|}~\s]/i', substr( $url, 0, $dots_pos ) ) ) { | ||
return AMP_Rule_Spec::FAIL; | ||
} | ||
} | ||
|
||
return AMP_Rule_Spec::PASS; | ||
} | ||
} | ||
|
||
return AMP_Rule_Spec::NOT_APPLICABLE; | ||
} | ||
|
||
/** | ||
* Check if attribute has a protocol value rule determine if it matches. | ||
* | ||
|
@@ -1138,9 +1195,7 @@ private function check_attr_spec_rule_value_regex_casei( $node, $attr_name, $att | |
private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr_spec_rule ) { | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOWED_PROTOCOL ] ) ) { | ||
if ( $node->hasAttribute( $attr_name ) ) { | ||
$attr_value = $node->getAttribute( $attr_name ); | ||
$attr_value = preg_replace( '/\s*,\s*/', ',', $attr_value ); | ||
$urls_to_test = explode( ',', $attr_value ); | ||
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) ); | ||
foreach ( $urls_to_test as $url ) { | ||
/* | ||
* This seems to be an acceptable check since the AMP validator | ||
|
@@ -1157,9 +1212,7 @@ private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr | |
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] ) ) { | ||
foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { | ||
if ( $node->hasAttribute( $alternative_name ) ) { | ||
$attr_value = $node->getAttribute( $alternative_name ); | ||
$attr_value = preg_replace( '/\s*,\s*/', ',', $attr_value ); | ||
$urls_to_test = explode( ',', $attr_value ); | ||
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $alternative_name ) ); | ||
foreach ( $urls_to_test as $url ) { | ||
/* | ||
* This seems to be an acceptable check since the AMP validator | ||
|
@@ -1196,9 +1249,7 @@ private function check_attr_spec_rule_allowed_protocol( $node, $attr_name, $attr | |
private function check_attr_spec_rule_disallowed_relative( $node, $attr_name, $attr_spec_rule ) { | ||
if ( isset( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) && ! ( $attr_spec_rule[ AMP_Rule_Spec::VALUE_URL ][ AMP_Rule_Spec::ALLOW_RELATIVE ] ) ) { | ||
if ( $node->hasAttribute( $attr_name ) ) { | ||
$attr_value = $node->getAttribute( $attr_name ); | ||
$attr_value = preg_replace( '/\s*,\s*/', ',', $attr_value ); | ||
$urls_to_test = explode( ',', $attr_value ); | ||
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $attr_name ) ); | ||
foreach ( $urls_to_test as $url ) { | ||
$parsed_url = AMP_WP_Utils::parse_url( $url ); | ||
|
||
|
@@ -1217,9 +1268,7 @@ private function check_attr_spec_rule_disallowed_relative( $node, $attr_name, $a | |
} elseif ( isset( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] ) ) { | ||
foreach ( $attr_spec_rule[ AMP_Rule_Spec::ALTERNATIVE_NAMES ] as $alternative_name ) { | ||
if ( $node->hasAttribute( $alternative_name ) ) { | ||
$attr_value = $node->getAttribute( $alternative_name ); | ||
$attr_value = preg_replace( '/\s*,\s*/', ',', $attr_value ); | ||
$urls_to_test = explode( ',', $attr_value ); | ||
$urls_to_test = preg_split( '/\s*,\s*/', $node->getAttribute( $alternative_name ) ); | ||
foreach ( $urls_to_test as $url ) { | ||
$parsed_url = AMP_WP_Utils::parse_url( $url ); | ||
if ( empty( $parsed_url['scheme'] ) ) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -680,6 +680,19 @@ public function get_body_data() { | |
'<a class="foo" href="">value</a>', | ||
), | ||
|
||
'a_with_wrong_host' => array( | ||
'<a class="foo" href="http://foo bar">value</a>', | ||
'<a class="foo" href="">value</a>', | ||
), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably should add a test here for scheme-less URLs, like: |
||
'a_with_wrong_schemeless_host' => array( | ||
'<a class="foo" href="//bad domain with a space.com/foo">value</a>', | ||
'<a class="foo" href="">value</a>', | ||
), | ||
'a_with_mail_host' => array( | ||
'<a class="foo" href="mail to:foo@bar.com">value</a>', | ||
'<a class="foo" href="">value</a>', | ||
), | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add test cases for examples found in #977 (comment) |
||
// font is removed so we should check that other elements are checked as well. | ||
'font_with_other_bad_elements' => array( | ||
'<font size="1">Headline</font><span style="color: blue">Span</span>', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add a
@since 0.7
here