Skip to content

Commit

Permalink
remove DOCTYPE declarations (#1260)
Browse files Browse the repository at this point in the history
  • Loading branch information
Athou committed Jan 10, 2025
1 parent 74f7c48 commit 62d3ed1
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.commafeed.backend.feed.parser;

import java.util.Collection;
import java.util.regex.Pattern;

import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
Expand All @@ -11,6 +12,8 @@
@Singleton
class FeedCleaner {

/*
 * Matches a complete DOCTYPE declaration, case-insensitively and across line breaks:
 *   <!DOCTYPE            the keyword
 *   [^>\[]*+             name and external id, up to an internal subset or the end
 *   (?:\[[^\]]*+\])?+    optional internal subset "[ ... ]", which may itself contain '>'
 *   [^>]*+>              trailing whitespace and the closing '>'
 * Without the internal-subset group the match would stop at the first '>' inside the
 * subset (e.g. after an <!ENTITY ...> declaration) and leave a dangling "]>" behind.
 * Possessive quantifiers keep matching linear on malformed input.
 * Limitation: a ']' inside a quoted literal of the internal subset is not handled.
 */
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>\\[]*+(?:\\[[^\\]]*+\\])?+[^>]*+>",
        Pattern.CASE_INSENSITIVE);

public String trimInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
Expand Down Expand Up @@ -60,4 +63,8 @@ public String replaceHtmlEntitiesWithNumericEntities(String source) {
return sb.toString();
}

/**
 * Strips every DOCTYPE declaration from the given XML text.
 *
 * @param xml
 *            the XML text to clean; passed directly to the matcher
 * @return the input with each match of {@code DOCTYPE_PATTERN} replaced by the empty string
 */
public String removeDoctypeDeclarations(String xml) {
    final String withoutDoctype = DOCTYPE_PATTERN.matcher(xml).replaceAll("");
    return withoutDoctype;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
throw new FeedException("Input string is null for url " + feedUrl);
}
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);

InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed feed = new SyndFeedInput().build(source);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,22 @@ void testReplaceHtmlEntitiesWithNumericEntities() {
Assertions.assertEquals("<source>T&#180;l&#180;phone &#8242;</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}

@Test
void testRemoveDoctype() {
    // A single-line DOCTYPE in front of the root element must be removed; the markup after it stays untouched.
    String expected = "<html><head></head><body></body></html>";
    String cleaned = feedCleaner.removeDoctypeDeclarations("<!DOCTYPE html>" + expected);
    Assertions.assertEquals(expected, cleaned);
}

@Test
void testRemoveMultilineDoctype() {
// Input: a DOCTYPE declaration whose keyword, name and closing '>' sit on separate lines,
// followed by the HTML markup.
String source = """
<!DOCTYPE
html
>
<html><head></head><body></body></html>""";
// The declaration must be removed as a whole, not only up to the first line break.
// NOTE(review): as rendered here the input keeps the line break that follows '>', while the
// expected text block shows none — confirm the exact whitespace against the original file.
Assertions.assertEquals("""
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
}

}

0 comments on commit 62d3ed1

Please sign in to comment.