From 38344a3af64da5af7837b1a109f5283ed10af7ba Mon Sep 17 00:00:00 2001
From: Sveder
Date: Tue, 20 Jul 2021 16:11:50 +0300
Subject: [PATCH] Add a "remove duplicate lines" filter.

---
 CHANGELOG.md            |  1 +
 docs/source/filters.rst |  1 +
 lib/urlwatch/filters.py | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 87bae0fb..7c06db2d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@ The format mostly follows [Keep a Changelog](http://keepachangelog.com/en/1.0.0/
 
 - Migrated CI pipeline from Travis CI to Github Actions
 - `user_visible_url` can now be specified for all job types (#654, by kongomongo)
+- Added a `remove-duplicate-lines` filter.
 
 ## [2.23] -- 2021-04-10
 
diff --git a/docs/source/filters.rst b/docs/source/filters.rst
index 63184d32..9aa71960 100644
--- a/docs/source/filters.rst
+++ b/docs/source/filters.rst
@@ -69,6 +69,7 @@ At the moment, the following filters are built-in:
 - **sha1sum**: Calculate the SHA-1 checksum of the content
 - **shellpipe**: Filter using a shell command
 - **sort**: Sort input items
+- **remove-duplicate-lines**: Remove duplicate lines (case sensitive)
 - **strip**: Strip leading and trailing whitespace
 - **xpath**: Filter XML/HTML using XPath expressions
 - **jq**: Filter, transform and extract values from JSON
diff --git a/lib/urlwatch/filters.py b/lib/urlwatch/filters.py
index 782b5b3b..2c16335b 100644
--- a/lib/urlwatch/filters.py
+++ b/lib/urlwatch/filters.py
@@ -809,6 +809,31 @@ def filter(self, data, subfilter):
         return separator.join(sorted(data.split(separator), key=str.casefold, reverse=reverse))
 
 
+class RemoveDuplicateLinesFilter(FilterBase):
+    """Remove duplicate lines"""
+
+    __kind__ = 'remove-duplicate-lines'
+
+    __supported_subfilters__ = {
+        'separator': 'Item separator (default: newline)',
+    }
+
+    __default_subfilter__ = 'separator'
+
+    def filter(self, data, subfilter):
+        # Keep the first occurrence of every item (exact, case-sensitive
+        # match) in its original position and drop all later repeats.
+        separator = subfilter.get('separator', '\n')
+        seen = set()
+        unique_items = []
+        for item in data.split(separator):
+            if item in seen:
+                continue
+            seen.add(item)
+            unique_items.append(item)
+        return separator.join(unique_items)
+
+
 class ReverseFilter(FilterBase):
     """Reverse input items"""
 