From 38344a3af64da5af7837b1a109f5283ed10af7ba Mon Sep 17 00:00:00 2001
From: Sveder <michael@vdoo.com>
Date: Tue, 20 Jul 2021 16:11:50 +0300
Subject: [PATCH] Add a "remove duplicate lines" filter.

---
 CHANGELOG.md            |  1 +
 docs/source/filters.rst |  1 +
 lib/urlwatch/filters.py | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 87bae0fb..7c06db2d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/
 
 - Migrated CI pipeline from Travis CI to Github Actions
 - `user_visible_url` can now be specified for all job types (#654, by kongomongo)
+- Added a `remove-duplicate-lines` filter (removes repeated lines, keeping the first occurrence of each)
 
 ## [2.23] -- 2021-04-10
 
diff --git a/docs/source/filters.rst b/docs/source/filters.rst
index 63184d32..9aa71960 100644
--- a/docs/source/filters.rst
+++ b/docs/source/filters.rst
@@ -69,6 +69,7 @@ At the moment, the following filters are built-in:
 - **sha1sum**: Calculate the SHA-1 checksum of the content
 - **shellpipe**: Filter using a shell command
 - **sort**: Sort input items
+- **remove-duplicate-lines**: Remove duplicate lines, keeping the first occurrence of each (case sensitive)
 - **strip**: Strip leading and trailing whitespace
 - **xpath**: Filter XML/HTML using XPath expressions
 - **jq**: Filter, transform and extract values from JSON
diff --git a/lib/urlwatch/filters.py b/lib/urlwatch/filters.py
index 782b5b3b..2c16335b 100644
--- a/lib/urlwatch/filters.py
+++ b/lib/urlwatch/filters.py
@@ -809,6 +809,31 @@ def filter(self, data, subfilter):
         return separator.join(sorted(data.split(separator), key=str.casefold, reverse=reverse))
 
 
+class RemoveDuplicateLinesFilter(FilterBase):
+    """Remove duplicate lines (case sensitive), keeping the first occurrence of each"""
+
+    __kind__ = 'remove-duplicate-lines'
+
+    __supported_subfilters__ = {
+        'separator': 'Item separator (default: newline)',
+    }
+
+    __default_subfilter__ = 'separator'
+
+    def filter(self, data, subfilter):
+        separator = subfilter.get('separator', '\n')  # newline unless configured
+        data_lines = data.split(separator)
+
+        def get_unique_lines(lines):
+            seen = set()  # items already yielded; membership is an exact match
+            for line in lines:
+                if line not in seen:
+                    yield line  # first occurrence: keep it, in input order
+                    seen.add(line)
+
+        return separator.join(get_unique_lines(data_lines))  # same separator as the split
+
+
 class ReverseFilter(FilterBase):
     """Reverse input items"""