Skip to content

Commit

Permalink
Replace attribute invalid characters with _, vs stripping
Browse files Browse the repository at this point in the history
Fixes #2143
  • Loading branch information
jhy committed Jul 10, 2024
1 parent 68f6f9c commit 6cbe7e4
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 10 deletions.
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
* Updated the `button` tag configuration to include a space between multiple button elements in the `Element.text()`
method. [2105](https://github.com/jhy/jsoup/issues/2105)
* Added support for the `ns|*` all elements in namespace Selector. [1811](https://github.com/jhy/jsoup/issues/1811)
* When normalising attribute names during serialization, invalid characters are now replaced with `_`, vs being
stripped. This should make the process clearer, and generally prevent an invalid attribute name being coerced
unexpectedly. [2143](https://github.com/jhy/jsoup/issues/2143)

### Changes

Expand Down
15 changes: 10 additions & 5 deletions src/main/java/org/jsoup/nodes/Attribute.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,18 +174,23 @@ static void htmlNoValidate(String key, @Nullable String val, Appendable accum, D
}

private static final Pattern xmlKeyValid = Pattern.compile("[a-zA-Z_:][-a-zA-Z0-9_:.]*");
private static final Pattern xmlKeyReplace = Pattern.compile("[^-a-zA-Z0-9_:.]");
private static final Pattern xmlKeyReplace = Pattern.compile("[^-a-zA-Z0-9_:.]+");
private static final Pattern htmlKeyValid = Pattern.compile("[^\\x00-\\x1f\\x7f-\\x9f \"'/=]+");
private static final Pattern htmlKeyReplace = Pattern.compile("[\\x00-\\x1f\\x7f-\\x9f \"'/=]");
private static final Pattern htmlKeyReplace = Pattern.compile("[\\x00-\\x1f\\x7f-\\x9f \"'/=]+");

/**
* Get a valid attribute key for the given syntax. If the key is not valid, it will be coerced into a valid key.
* @param key the original attribute key
* @param syntax HTML or XML
* @return the original key if it's valid; a key with invalid characters replaced with "_" otherwise; or null if a valid key could not be created.
*/
@Nullable public static String getValidKey(String key, Syntax syntax) {
// we consider HTML attributes to always be valid. XML checks key validity
if (syntax == Syntax.xml && !xmlKeyValid.matcher(key).matches()) {
key = xmlKeyReplace.matcher(key).replaceAll("");
key = xmlKeyReplace.matcher(key).replaceAll("_");
return xmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
}
else if (syntax == Syntax.html && !htmlKeyValid.matcher(key).matches()) {
key = htmlKeyReplace.matcher(key).replaceAll("");
key = htmlKeyReplace.matcher(key).replaceAll("_");
return htmlKeyValid.matcher(key).matches() ? key : null; // null if could not be coerced
}
return key;
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ public void handlesInvalidAttributeNames() {

Document w3Doc = W3CDom.convert(jsoupDoc);
String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body name=\"\" style=\"color: red\"/></html>", xml);
assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body _=\"\" name_=\"\" style=\"color: red\"/></html>", xml);
}

@Test
Expand All @@ -175,7 +175,7 @@ public void xmlInputDocMaintainsHtmlAttributeNames() {

Document w3Doc = W3CDom.convert(jsoupDoc);
String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>";
String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p h_nh=\"2\">unicode attr names coerced</p></body></html>";
assertEquals(expected, TextUtil.stripNewlines(out));
}

Expand Down
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ private static Stream<Arguments> dupeAttributeData() {
// NOTE: per spec this should be the test case. but impacts too many ppl
// assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html());

assertEquals("<p a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
assertEquals("<p _a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));

doc = Jsoup.parse("<p .....>");
assertEquals("<p .....></p>", doc.body().html());
Expand Down Expand Up @@ -1522,7 +1522,7 @@ private boolean didAddElements(String input) {
assertEquals(Document.OutputSettings.Syntax.html, doc.outputSettings().syntax());

String out = doc.body().outerHtml();
assertEquals("<body style=\"color: red\" name>\n <div></div>\n</body>", out);
assertEquals("<body style=\"color: red\" _ name_>\n <div _></div>\n</body>", out);
}

@Test void templateInHead() {
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ public void handlesLTinScript() {
assertEquals(Syntax.xml, doc.outputSettings().syntax());

String out = doc.html();
assertEquals("<body style=\"color: red\" name=\"\"><div></div></body>", out);
assertEquals("<body style=\"color: red\" _=\"\" name_=\"\"><div _=\"\"></div></body>", out);
}

@Test void customTagsAreFlyweights() {
Expand Down

0 comments on commit 6cbe7e4

Please sign in to comment.