Skip to content

Commit

Permalink
feat: Add Reddit sanitizers (#195)
Browse files Browse the repository at this point in the history
  • Loading branch information
svenjacobs authored Jul 7, 2023
1 parent b34a41c commit 73f32fa
Show file tree
Hide file tree
Showing 9 changed files with 230 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ import com.svenjacobs.app.leon.core.domain.sanitizer.linksynergy.LinkSynergySani
import com.svenjacobs.app.leon.core.domain.sanitizer.netflix.NetflixSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.newegg.NewEggSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.pearl.PearlSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.reddit.RedditMailSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.reddit.RedditOutSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.sessionids.SessionIdsSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.shopee.ShopeeSanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.spiegel.SpiegelSanitizer
Expand Down Expand Up @@ -83,6 +85,8 @@ class ContainerInitializer : DistinctInitializer<Unit> {
NetflixSanitizer(),
NewEggSanitizer(),
PearlSanitizer(),
RedditMailSanitizer(),
RedditOutSanitizer(),
SessionIdsSanitizer(),
ShopeeSanitizer(),
SpiegelSanitizer(),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.svenjacobs.app.leon.core.common.url

import java.net.URLDecoder

/**
 * Decodes a percent-encoded string using the UTF-8 charset.
 *
 * Delegates to [URLDecoder.decode], which implements
 * `application/x-www-form-urlencoded` decoding: besides resolving `%XX`
 * escapes it also turns `+` into a space.
 *
 * @param encoded the encoded text, e.g. `https%3A%2F%2Fexample.com`
 * @return the decoded text
 */
fun decodeUrl(encoded: String): String {
    val charsetName = "UTF-8"
    return URLDecoder.decode(encoded, charsetName)
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@

package com.svenjacobs.app.leon.core.domain

import com.svenjacobs.app.leon.core.common.url.decodeUrl
import com.svenjacobs.app.leon.core.domain.inject.DomainContainer.SanitizerRepository
import com.svenjacobs.app.leon.core.domain.inject.DomainContainer.Sanitizers
import com.svenjacobs.app.leon.core.domain.sanitizer.Sanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.SanitizerRepository
import com.svenjacobs.app.leon.core.domain.sanitizer.SanitizersCollection
import java.net.URLDecoder
import kotlinx.collections.immutable.ImmutableList
import kotlinx.collections.immutable.toImmutableList
import kotlinx.coroutines.Dispatchers
Expand All @@ -43,7 +43,6 @@ class CleanerService(
val urls: ImmutableList<String>,
)

@Suppress("BlockingMethodInNonBlockingContext")
suspend fun clean(text: String?, decodeUrl: Boolean = false): Result {
if (text.isNullOrEmpty()) throw IllegalArgumentException()

Expand All @@ -65,10 +64,7 @@ class CleanerService(
.let { (cleaned, urls) ->
val decoded = if (decodeUrl) {
withContext(Dispatchers.Default) {
URLDecoder.decode(
cleaned,
"UTF-8",
)
decodeUrl(cleaned)
}
} else {
cleaned
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2022 Sven Jacobs
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
Expand All @@ -18,7 +18,7 @@

package com.svenjacobs.app.leon.core.domain.sanitizer

import java.net.URLDecoder
import com.svenjacobs.app.leon.core.common.url.decodeUrl

/**
* Base class for sanitizers that extract URLs from search engine result links.
Expand All @@ -32,6 +32,6 @@ abstract class SearchResultSanitizer(
override fun invoke(input: String): String {
val result = regex.find(input) ?: return input
val group = result.groups[1] ?: return input
return URLDecoder.decode(group.value, "UTF-8")
return decodeUrl(group.value)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.svenjacobs.app.leon.core.domain.sanitizer.reddit

import android.content.Context
import com.svenjacobs.app.leon.core.common.domain.matchesDomain
import com.svenjacobs.app.leon.core.common.regex.RegexFactory
import com.svenjacobs.app.leon.core.common.url.decodeUrl
import com.svenjacobs.app.leon.core.domain.R
import com.svenjacobs.app.leon.core.domain.sanitizer.Sanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.SanitizerId

/**
 * Sanitizer for tracking links of the form
 * `click.redditmail.com/<segment>/<encoded target URL>...`.
 *
 * The encoded remainder of the link is URL-decoded and then passed through
 * [RegexFactory.AllParameters] with an empty replacement.
 */
class RedditMailSanitizer : Sanitizer {

    override val id = SanitizerId("reddit_mail")

    override fun getMetadata(context: Context): Sanitizer.Metadata {
        val displayName = context.getString(R.string.sanitizer_reddit_mail)
        return Sanitizer.Metadata(name = displayName)
    }

    /**
     * Returns the cleaned target URL contained in [input], or [input] itself
     * when it does not look like a `click.redditmail.com` redirect.
     */
    override fun invoke(input: String): String {
        // No match means this is not a redirect link we understand: leave it untouched.
        val match = URL_REGEX.find(input) ?: return input

        // Group 1 is mandatory in the pattern, so it is always present on a match.
        val target = decodeUrl(match.groupValues[1])

        // NOTE(review): AllParameters presumably matches the query string (and
        // with it the tracking suffix that follows it) - confirm in RegexFactory.
        return RegexFactory.AllParameters.replace(target, "")
    }

    override fun matchesDomain(input: String) = input.matchesDomain("click.redditmail.com")

    private companion object {
        // Skips the first path segment after the host and captures the rest of the link.
        private val URL_REGEX = Regex("click\\.redditmail\\.com/[^/]+/(.+)")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.svenjacobs.app.leon.core.domain.sanitizer.reddit

import android.content.Context
import com.svenjacobs.app.leon.core.common.domain.matchesDomain
import com.svenjacobs.app.leon.core.common.regex.RegexFactory
import com.svenjacobs.app.leon.core.domain.R
import com.svenjacobs.app.leon.core.domain.sanitizer.Sanitizer
import com.svenjacobs.app.leon.core.domain.sanitizer.SanitizerId
import com.svenjacobs.app.leon.core.domain.sanitizer.SearchResultSanitizer

/**
 * Sanitizer for `out.reddit.com` redirect links.
 *
 * Extraction is inherited from [SearchResultSanitizer], configured with the
 * regex produced by `RegexFactory.ofParameter("url")`.
 * NOTE(review): presumably that regex captures the value of the `url` query
 * parameter - confirm in RegexFactory.
 */
class RedditOutSanitizer : SearchResultSanitizer(
    RegexFactory.ofParameter("url"),
) {

    override val id = SanitizerId("reddit_out")

    override fun getMetadata(context: Context): Sanitizer.Metadata {
        val displayName = context.getString(R.string.sanitizer_reddit_out)
        return Sanitizer.Metadata(name = displayName)
    }

    override fun matchesDomain(input: String) = input.matchesDomain("out.reddit.com")
}
18 changes: 10 additions & 8 deletions core-domain/src/main/res/values/strings.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,30 +23,32 @@
<string name="sanitizer_aol_search_name">AOL Search</string>
<string name="sanitizer_change_name" translatable="false">Change</string>
<string name="sanitizer_ebay_name" translatable="false">eBay</string>
<string name="sanitizer_elfinanciero_name" translatable="false">ElFinanciero</string>
<string name="sanitizer_empty_parameters_name">Empty Parameters</string>
<string name="sanitizer_facebook_name" translatable="false">Facebook</string>
<string name="sanitizer_flipkart_name" translatable="false">Flipkart</string>
<string name="sanitizer_ga_name" translatable="false">Google Analytics</string>
<string name="sanitizer_georiot_name" translatable="false">GeoRiot</string>
<string name="sanitizer_google_search_name">Google Search</string>
<string name="sanitizer_google_ads_name" translatable="false">Google Ads</string>
<string name="sanitizer_google_search_name">Google Search</string>
<string name="sanitizer_instagram_name" translatable="false">Instagram</string>
<string name="sanitizer_jdoqocy_name" translatable="false">Jdoqocy</string>
<string name="sanitizer_lazada" translatable="false">Lazada</string>
<string name="sanitizer_linksynergy_name" translatable="false">LinkSynergy</string>
<string name="sanitizer_netflix_name" translatable="false">Netflix</string>
<string name="sanitizer_newegg_name" translatable="false">NewEgg</string>
<string name="sanitizer_pearl" translatable="false">Pearl</string>
<string name="sanitizer_reddit_mail" translatable="false">Reddit (click.redditmail.com)</string>
<string name="sanitizer_reddit_out" translatable="false">Reddit (out.reddit.com)</string>
<string name="sanitizer_session_ids_name" translatable="false">Session IDs</string>
<string name="sanitizer_shopee" translatable="false">Shopee</string>
<string name="sanitizer_spiegel_name" translatable="false">Spiegel</string>
<string name="sanitizer_spotify_name" translatable="false">Spotify</string>
<string name="sanitizer_theguardian" translatable="false">The Guardian</string>
<string name="sanitizer_twitter_name" translatable="false">Twitter</string>
<string name="sanitizer_webtrekk_name" translatable="false">Webtrekk</string>
<string name="sanitizer_yahoo_search_name">Yahoo Search</string>
<string name="sanitizer_youtube_music_name" translatable="false">YouTube Music</string>
<string name="sanitizer_youtube_redirect_name">YouTube Redirect</string>
<string name="sanitizer_youtube_short_url_name" translatable="false">Youtu.be</string>
<string name="sanitizer_youtube_music_name" translatable="false">YouTube Music</string>
<string name="sanitizer_jdoqocy_name" translatable="false">Jdoqocy</string>
<string name="sanitizer_elfinanciero_name" translatable="false">ElFinanciero</string>
<string name="sanitizer_lazada" translatable="false">Lazada</string>
<string name="sanitizer_shopee" translatable="false">Shopee</string>
<string name="sanitizer_pearl" translatable="false">Pearl</string>
<string name="sanitizer_theguardian" translatable="false">The Guardian</string>
</resources>
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.svenjacobs.app.leon.core.domain.sanitizer.reddit

import io.kotest.core.spec.style.WordSpec
import io.kotest.matchers.shouldBe

/**
 * Specification for [RedditMailSanitizer]: URL extraction/decoding and domain matching.
 */
class RedditMailSanitizerTest : WordSpec(
    {
        val sanitizer = RedditMailSanitizer()

        "invoke" should {

            "extract and decode URL" {
                // Redirect link whose target URL is percent-encoded and followed by tracking data.
                val trackingLink =
                    "https://click.redditmail.com/CL0/https:%2F%2Fwww.reddit.com%2Fr%2FComp" +
                        "ressOrDie%2Fcomments%2F11u2vso%2Frcompressordie_lounge%2Fjl9fp68%2F%3F\$" +
                        "deep_link=true%26correlation_id=5329d6c9-34a4-4a44-9cea-76317f68123f%26r" +
                        "ef=email_comment_reply%26ref_campaign=email_comment_reply%26ref_source=e" +
                        "mail/3/010001884768a910-4b97a265-36d8-461f-9d79-fc2b535e5217-000000/sa_u" +
                        "FF6uMCdJu1cTLCaOI8Ng6wQBjfPtc5hMCnOrx4Q=301"

                // Only the decoded target without query parameters is expected to remain.
                val expected = "https://www.reddit.com/r/CompressOrDie/comments/11u2vso/rcompres" +
                    "sordie_lounge/jl9fp68/"

                sanitizer(trackingLink) shouldBe expected
            }
        }

        "matchesDomain" should {

            "match for click.redditmail.com" {
                val matches = sanitizer.matchesDomain("https://click.redditmail.com")

                matches shouldBe true
            }
        }
    },
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Léon - The URL Cleaner
* Copyright (C) 2023 Sven Jacobs
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package com.svenjacobs.app.leon.core.domain.sanitizer.reddit

import io.kotest.core.spec.style.WordSpec
import io.kotest.matchers.shouldBe

/**
 * Specification for [RedditOutSanitizer]: URL extraction and domain matching.
 */
class RedditOutSanitizerTest : WordSpec(
    {
        val sanitizer = RedditOutSanitizer()

        "invoke" should {

            "extract URL" {
                // Redirect link carrying the percent-encoded target in its `url` parameter.
                val redirectLink =
                    "https://out.reddit.com/t3_11zcpau?url=https%3A%2F%2Fcompress-or-die.co" +
                        "m%2FThe-nasty-red-JPG-compression-artifacts&token=AQAA-odsZCyQ04Ae10crjv" +
                        "g8DGlsTPckMpu3vvIjNwmWPgLdQMbC&app_name=web2x&web_redirect=true/"

                val expected = "https://compress-or-die.com/The-nasty-red-JPG-compression-artifacts"

                sanitizer(redirectLink) shouldBe expected
            }
        }

        "matchesDomain" should {

            "match for out.reddit.com" {
                val matches = sanitizer.matchesDomain("https://out.reddit.com")

                matches shouldBe true
            }
        }
    },
)

0 comments on commit 73f32fa

Please sign in to comment.