From 62d3ed16e6c46eb5bbc1395549a7bd0e68fa7534 Mon Sep 17 00:00:00 2001 From: Athou Date: Fri, 10 Jan 2025 16:09:21 +0100 Subject: [PATCH] remove DOCTYPE declarations (#1260) --- .../backend/feed/parser/FeedCleaner.java | 7 +++++++ .../backend/feed/parser/FeedParser.java | 1 + .../backend/feed/parser/FeedCleanerTest.java | 18 ++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java index 5389fd616..e656121b4 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java @@ -1,6 +1,7 @@ package com.commafeed.backend.feed.parser; import java.util.Collection; +import java.util.regex.Pattern; import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Trie; @@ -11,6 +12,8 @@ @Singleton class FeedCleaner { + private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE); + public String trimInvalidXmlCharacters(String xml) { if (StringUtils.isBlank(xml)) { return null; @@ -60,4 +63,8 @@ public String replaceHtmlEntitiesWithNumericEntities(String source) { return sb.toString(); } + public String removeDoctypeDeclarations(String xml) { + return DOCTYPE_PATTERN.matcher(xml).replaceAll(""); + } + } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java index d25f7c136..1877d250e 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java @@ -64,6 +64,7 @@ public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException { throw new FeedException("Input string is null for url " + feedUrl); } xmlString 
= feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString); + xmlString = feedCleaner.removeDoctypeDeclarations(xmlString); InputSource source = new InputSource(new StringReader(xmlString)); SyndFeed feed = new SyndFeedInput().build(source); diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java index 68a7584cf..c324b9584 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java @@ -13,4 +13,22 @@ void testReplaceHtmlEntitiesWithNumericEntities() { Assertions.assertEquals("T´l´phone ′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); } + @Test + void testRemoveDoctype() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void testRemoveMultilineDoctype() { + String source = """ + + """; + Assertions.assertEquals(""" + + """, feedCleaner.removeDoctypeDeclarations(source)); + } + } \ No newline at end of file