From a31da1c84ad1d941dfb2acbe90bcd0201b7120fc Mon Sep 17 00:00:00 2001 From: Mikhael Sokolov Date: Tue, 8 Jun 2021 19:07:15 +0300 Subject: [PATCH] remove base url property from `Skraper` interface (#168) * remove base url property from `Skraper` interface * provide `UnknownMedia` Co-authored-by: sokomishalov --- README.md | 3 +- .../kotlin/ru/sokomishalov/skraper/Skraper.kt | 12 ++--- .../ru/sokomishalov/skraper/Skrapers.kt | 15 +++--- .../skraper/internal/jsoup/JsoupExtensions.kt | 11 +++-- .../ru/sokomishalov/skraper/model/media.kt | 8 ++++ .../provider/facebook/FacebookSkraper.kt | 18 ++++++-- .../skraper/provider/flickr/FlickrSkraper.kt | 46 +++++++++++-------- .../skraper/provider/ifunny/IFunnySkraper.kt | 16 +++++-- .../provider/instagram/InstagramSkraper.kt | 16 +++++-- .../provider/ninegag/NinegagSkraper.kt | 14 ++++-- .../skraper/provider/pikabu/PikabuSkraper.kt | 16 +++++-- .../provider/pinterest/PinterestSkraper.kt | 10 ++-- .../skraper/provider/reddit/RedditSkraper.kt | 18 ++++++-- .../provider/telegram/TelegramSkraper.kt | 16 +++++-- .../skraper/provider/tiktok/TikTokSkraper.kt | 29 +++++++----- .../skraper/provider/tumblr/TumblrSkraper.kt | 12 ++--- .../skraper/provider/twitch/TwitchSkraper.kt | 28 +++++++---- .../provider/twitter/TwitterSkraper.kt | 19 ++++---- .../skraper/provider/vk/VkSkraper.kt | 18 +++++--- .../provider/youtube/YoutubeSkraper.kt | 20 ++++---- .../provider/youtube/YoutubeVideoResolver.kt | 2 +- .../skraper/provider/SkraperTck.kt | 2 +- .../bot/telegram/service/SkraperBot.kt | 22 +++++---- 23 files changed, 231 insertions(+), 140 deletions(-) diff --git a/README.md b/README.md index 6ba76c0a..5700c204 100644 --- a/README.md +++ b/README.md @@ -195,11 +195,10 @@ interface: ```kotlin interface Skraper { - val baseUrl: String val client: SkraperClient - fun supports(url: String): Boolean fun getPosts(path: String): Flow suspend fun getPageInfo(path: String): PageInfo? + fun supports(media: Media): Boolean suspend fun resolve(media: Media): Media } ``` diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt index 4bb2d148..7cb5dbf6 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt @@ -18,7 +18,6 @@ package ru.sokomishalov.skraper import kotlinx.coroutines.flow.Flow import ru.sokomishalov.skraper.client.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient -import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.model.Media import ru.sokomishalov.skraper.model.PageInfo import ru.sokomishalov.skraper.model.Post @@ -28,11 +27,6 @@ import ru.sokomishalov.skraper.model.Post */ interface Skraper { - /** - * @return provider base url - */ - val baseUrl: String - /** * @return http client */ @@ -51,10 +45,10 @@ interface Skraper { suspend fun getPageInfo(path: String): PageInfo? /** - * @param url potential provider relative url - * @return true if such skraper supports this url + * @param media media item + * @return true if such skraper supports this media and can resolve/download it */ - fun supports(url: String): Boolean = url.host.removePrefix("www.") in baseUrl.host + fun supports(media: Media): Boolean /** * @param media with provider relative url diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt index 822e6fc4..f75b51a8 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt @@ -24,10 +24,7 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegCliRunner import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegRunner import ru.sokomishalov.skraper.internal.net.path -import ru.sokomishalov.skraper.model.Audio -import ru.sokomishalov.skraper.model.Image -import ru.sokomishalov.skraper.model.Media -import ru.sokomishalov.skraper.model.Video +import ru.sokomishalov.skraper.model.* import ru.sokomishalov.skraper.provider.facebook.FacebookSkraper import ru.sokomishalov.skraper.provider.flickr.FlickrSkraper import ru.sokomishalov.skraper.provider.ifunny.IFunnySkraper @@ -55,11 +52,11 @@ object Skrapers { } /** - * @param url potential provider relative url + * @param media media item * @return skraper which supports this url or null if none of skrapers supports it */ - fun suitable(url: String): Skraper? { - return providers.find { it.supports(url) } + fun findSuitable(media: Media): Skraper? { + return providers.find { it.supports(media) } } /** @@ -78,7 +75,7 @@ object Skrapers { // otherwise else -> { - suitable(media.url) + findSuitable(media) ?.resolve(media) ?.run { when { @@ -87,6 +84,7 @@ object Skrapers { is Image -> media.copy(url = url) is Video -> media.copy(url = url) is Audio -> media.copy(url = url) + is UnknownMedia -> media.copy(url = url) } } } @@ -166,6 +164,7 @@ object Skrapers { is Image -> filename.substringAfterLast(".", "png") is Video -> filename.substringAfterLast(".", "mp4") is Audio -> filename.substringAfterLast(".", "mp3") + is UnknownMedia -> filename.substringAfterLast(".") } } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt index adbe6e78..95d8a115 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt @@ -21,10 +21,7 @@ import org.jsoup.nodes.Document import org.jsoup.nodes.Element import ru.sokomishalov.skraper.internal.map.firstNotNull import ru.sokomishalov.skraper.internal.number.div -import ru.sokomishalov.skraper.model.Audio -import ru.sokomishalov.skraper.model.Image -import ru.sokomishalov.skraper.model.Media -import ru.sokomishalov.skraper.model.Video +import ru.sokomishalov.skraper.model.* internal inline fun Element.getFirstElementByClass(name: String): Element? { return getElementsByClass(name).firstOrNull() @@ -126,6 +123,12 @@ internal fun Document.extractOpenGraphMedia(media: Media): Media { url = audioUrl ?: media.url ) } + is UnknownMedia -> { + val url = firstNotNull("og:url") + media.copy( + url = url ?: media.url + ) + } } } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/media.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/media.kt index c5f3f477..ecd63e88 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/media.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/media.kt @@ -58,3 +58,11 @@ data class Audio( override val url: String, val duration: Duration? = null ) : Media() + +/** + * Represents an unknown media. + * @property url some url + */ +data class UnknownMedia( + override val url: String +): Media() diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt index a7558753..3c1cb1f2 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt @@ -28,6 +28,7 @@ import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttributeValue import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.getDouble import ru.sokomishalov.skraper.internal.serialization.getLong @@ -42,16 +43,14 @@ import java.time.Instant * @author sokomishalov */ open class FacebookSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://facebook.com", - private val mobileBaseUrl: String = "https://m.facebook.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { val postsPath = path.substringBefore("/posts") + "/posts" var nextPath = postsPath while (true) { - val fetchResult = client.fetchString(HttpRequest(url = mobileBaseUrl.buildFullURL(path = nextPath))) + val fetchResult = client.fetchString(HttpRequest(url = MOBILE_BASE_URL.buildFullURL(path = nextPath))) val (document, nextPage) = fetchResult?.extractDocumentAndNextPage() ?: break nextPath = nextPage ?: break @@ -79,7 +78,7 @@ open class FacebookSkraper @JvmOverloads constructor( override suspend fun getPageInfo(path: String): PageInfo? { val aboutPath = path.substringBefore("/about") + "/about" - val page = client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = aboutPath))) + val page = client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = aboutPath))) return page?.run { val isCommunity = getFirstElementByAttributeValue("data-key", "tab_community") != null @@ -106,6 +105,10 @@ open class FacebookSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "facebook.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return client.fetchOpenGraphMedia(media) } @@ -255,4 +258,9 @@ open class FacebookSkraper @JvmOverloads constructor( return getFirstElementByClass("coverPhotoImg") ?.attr("src") } + + companion object { + const val BASE_URL: String = "https://facebook.com" + const val MOBILE_BASE_URL: String = "https://m.facebook.com" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt index 759def54..fb39c85d 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt @@ -31,6 +31,7 @@ import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.getBackgroundImageUrl import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass import ru.sokomishalov.skraper.internal.jsoup.getStyle +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.internal.string.unescapeHtml @@ -43,8 +44,7 @@ import java.time.Instant * @author sokomishalov */ open class FlickrSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://flickr.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -86,22 +86,24 @@ open class FlickrSkraper @JvmOverloads constructor( } jsonPosts.isNotEmpty() -> jsonPosts.forEach { (key, value) -> - emit(Post( - id = key, - text = value?.extractPostText(), - publishedAt = value?.extractPostPublishDate(), - statistics = PostStatistics( - likes = value?.extractPostLikes(), - comments = value?.extractPostCommentsCount(), - views = value?.extractPostViewsCount(), - ), - media = listOf( - Image( - url = value.extractPostAttachmentUrl(), - aspectRatio = value.extractPostAspectRatio() + emit( + Post( + id = key, + text = value?.extractPostText(), + publishedAt = value?.extractPostPublishDate(), + statistics = PostStatistics( + likes = value?.extractPostLikes(), + comments = value?.extractPostCommentsCount(), + views = value?.extractPostViewsCount(), + ), + media = listOf( + Image( + url = value.extractPostAttachmentUrl(), + aspectRatio = value.extractPostAspectRatio() + ) ) ) - )) + ) } } } @@ -126,6 +128,10 @@ open class FlickrSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "flickr.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Image -> client.fetchOpenGraphMedia(media) @@ -134,7 +140,7 @@ open class FlickrSkraper @JvmOverloads constructor( } private suspend fun getPage(path: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) } private fun Document?.parseModelJson(): JsonNode? { @@ -248,7 +254,7 @@ open class FlickrSkraper @JvmOverloads constructor( } private fun JsonNode.extractPageLogo(): Image? { - return getFirstByPath("photostream-models.0.owner.buddyicon","person-models.0.buddyicon") + return getFirstByPath("photostream-models.0.owner.buddyicon", "person-models.0.buddyicon") ?.getFirstByPath("large", "medium", "small", "default") ?.asText() ?.convertToImage() @@ -265,4 +271,8 @@ open class FlickrSkraper @JvmOverloads constructor( this?.asText().orEmpty() } } + + companion object { + const val BASE_URL: String = "https://flickr.com" + } } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt index 9e10d307..bbded046 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt @@ -27,6 +27,7 @@ import ru.sokomishalov.skraper.client.fetchOpenGraphMedia import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -34,8 +35,7 @@ import ru.sokomishalov.skraper.model.* * @author sokomishalov */ open class IFunnySkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://ifunny.co" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -66,7 +66,7 @@ open class IFunnySkraper @JvmOverloads constructor( media = listOf( when { isVideo -> Video( - url = "${baseUrl}${link}", + url = "${BASE_URL}${link}", aspectRatio = aspectRatio ) else -> Image( @@ -83,6 +83,10 @@ open class IFunnySkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "ifunny.co" in media.url.host + } + override suspend fun resolve(media: Media): Media { return client.fetchOpenGraphMedia(media) } @@ -120,6 +124,10 @@ open class IFunnySkraper @JvmOverloads constructor( private suspend fun getPage(path: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) + } + + companion object { + const val BASE_URL: String = "https://ifunny.co" } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt index d2d5a879..97d390e7 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt @@ -25,6 +25,7 @@ import ru.sokomishalov.skraper.client.fetchDocument import ru.sokomishalov.skraper.client.fetchOpenGraphMedia import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -35,8 +36,7 @@ import java.time.Instant * @author sokomishalov */ open class InstagramSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://instagram.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -87,6 +87,10 @@ open class InstagramSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "instagram.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return client.fetchOpenGraphMedia(media) } @@ -96,7 +100,7 @@ open class InstagramSkraper @JvmOverloads constructor( private fun JsonNode.extractPostMediaItems(): List { val isVideo = this["is_video"].asBoolean() val aspectRatio = this["dimensions"]?.run { getDouble("width") / getDouble("height") } - val shortcodeUrl = "${baseUrl}/p/${getString("shortcode")}" + val shortcodeUrl = "${BASE_URL}/p/${getString("shortcode")}" return listOf( when { @@ -119,7 +123,7 @@ open class InstagramSkraper @JvmOverloads constructor( } private suspend fun fetchJsonNodes(path: String): JsonNode? { - val document = client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path))) + val document = client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path))) return document ?.getElementsByTag("script") ?.map { it.html() } @@ -128,4 +132,8 @@ open class InstagramSkraper @JvmOverloads constructor( ?.substringBeforeLast(";") .readJsonNodes() } + + companion object { + const val BASE_URL: String = "https://instagram.com" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt index 645aa56d..c7301a58 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt @@ -26,6 +26,7 @@ import ru.sokomishalov.skraper.client.fetchDocument import ru.sokomishalov.skraper.client.fetchOpenGraphMedia import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.number.minus import ru.sokomishalov.skraper.internal.serialization.* @@ -39,8 +40,7 @@ import java.time.Instant * @author sokomishalov */ open class NinegagSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://9gag.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -84,6 +84,10 @@ open class NinegagSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "9gag.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Video -> { @@ -100,7 +104,7 @@ open class NinegagSkraper @JvmOverloads constructor( } private suspend fun getUserPage(path: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) } private fun Document?.extractJsonData(): JsonNode? { @@ -133,4 +137,8 @@ open class NinegagSkraper @JvmOverloads constructor( )) } } + + companion object { + const val BASE_URL: String = "https://9gag.com" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt index 5bf1114b..2b83d48e 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt @@ -28,6 +28,7 @@ import ru.sokomishalov.skraper.client.fetchOpenGraphMedia import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.* +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.model.* import java.nio.charset.Charset @@ -37,8 +38,7 @@ import java.time.format.DateTimeFormatter.ISO_DATE_TIME import kotlin.text.Charsets.UTF_8 open class PikabuSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://pikabu.ru" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -107,6 +107,10 @@ open class PikabuSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "pikabu.ru" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Video -> { @@ -125,7 +129,7 @@ open class PikabuSkraper @JvmOverloads constructor( private suspend fun getPage(path: String, page: Int = 1): Document? { return client.fetchDocument( - request = HttpRequest(url = baseUrl.buildFullURL(path = path, queryParams = mapOf("page" to page))), + request = HttpRequest(url = BASE_URL.buildFullURL(path = path, queryParams = mapOf("page" to page))), charset = Charset.forName("windows-1251") ) } @@ -133,7 +137,7 @@ open class PikabuSkraper @JvmOverloads constructor( private fun Element.extractPostId(): String { return getFirstElementByClass("story__title-link") ?.attr("href") - ?.substringAfter("${baseUrl}/story/") + ?.substringAfter("${BASE_URL}/story/") .orEmpty() } @@ -282,4 +286,8 @@ open class PikabuSkraper @JvmOverloads constructor( return filter { b -> "story-block_type_text" in b.classNames() } .joinToString("\n") { b -> b.wholeText() } } + + companion object { + const val BASE_URL: String = "https://pikabu.ru" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt index 3a6f90d1..f9aff981 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt @@ -38,8 +38,7 @@ import java.util.Locale.ROOT * @author sokomishalov */ open class PinterestSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://pinterest.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -85,8 +84,8 @@ open class PinterestSkraper @JvmOverloads constructor( } } - override fun supports(url: String): Boolean { - return "pinterest" in url.host + override fun supports(media: Media): Boolean { + return "pinterest" in media.url.host } override suspend fun resolve(media: Media): Media { @@ -97,7 +96,7 @@ open class PinterestSkraper @JvmOverloads constructor( } private suspend fun getUserJson(path: String): JsonNode? { - val webPage = client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + val webPage = client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) val infoJson = webPage?.getElementById("initial-state")?.html() return infoJson.readJsonNodes() } @@ -158,6 +157,7 @@ open class PinterestSkraper @JvmOverloads constructor( } companion object { + const val BASE_URL: String = "https://pinterest.com" private val DATE_FORMATTER = DateTimeFormatter.ofPattern("EEE, d MMM yyyy HH:mm:ss Z", ROOT) } } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt index 4d30c575..8eaf2514 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt @@ -29,6 +29,7 @@ import ru.sokomishalov.skraper.client.fetchOpenGraphMedia import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_BATCH import ru.sokomishalov.skraper.internal.iterable.emitBatch +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.net.path import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* @@ -36,8 +37,7 @@ import ru.sokomishalov.skraper.model.* import java.time.Instant open class RedditSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://reddit.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -46,7 +46,7 @@ open class RedditSkraper @JvmOverloads constructor( while (true) { val response = client.fetchJson( HttpRequest( - url = baseUrl.buildFullURL( + url = BASE_URL.buildFullURL( path = "${path.removeSuffix("/")}.json", queryParams = mapOf("limit" to DEFAULT_POSTS_BATCH, "after" to nextPage) ) @@ -78,7 +78,7 @@ open class RedditSkraper @JvmOverloads constructor( override suspend fun getPageInfo(path: String): PageInfo? { val response = client.fetchJson( - HttpRequest(url = baseUrl.buildFullURL(path = "${path.removeSuffix("/")}/about.json")) + HttpRequest(url = BASE_URL.buildFullURL(path = "${path.removeSuffix("/")}/about.json")) ) val isUser = path.removePrefix("/").startsWith("u") @@ -103,6 +103,10 @@ open class RedditSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "reddit.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Image -> client.fetchOpenGraphMedia(media) @@ -113,7 +117,7 @@ open class RedditSkraper @JvmOverloads constructor( ?.firstOrNull() ?: media } - is Audio -> media + else -> media } } @@ -148,4 +152,8 @@ open class RedditSkraper @JvmOverloads constructor( else -> previewMedia } } + + companion object { + const val BASE_URL: String = "https://reddit.com" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/telegram/TelegramSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/telegram/TelegramSkraper.kt index 688f26bb..9d958a6d 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/telegram/TelegramSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/telegram/TelegramSkraper.kt @@ -31,6 +31,7 @@ import ru.sokomishalov.skraper.client.fetchDocument import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.* +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.net.path import ru.sokomishalov.skraper.model.* import java.time.Duration @@ -41,8 +42,7 @@ import java.time.ZonedDateTime * @author sokomishalov */ open class TelegramSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://t.me" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -98,9 +98,13 @@ open class TelegramSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "t.me" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when { - supports(media.url) -> { + media.url.host.removePrefix("www.") in BASE_URL.host -> { val path = media.url.path.removePrefix("/s") val posts = getPosts(path) posts.firstOrNull { it.id == path }?.media?.firstOrNull() ?: media @@ -197,6 +201,10 @@ open class TelegramSkraper @JvmOverloads constructor( } private suspend fun fetchDocument(path: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path))) + return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path))) + } + + companion object { + const val BASE_URL: String = "https://t.me" } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tiktok/TikTokSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tiktok/TikTokSkraper.kt index 29b2447d..2ad59a42 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tiktok/TikTokSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tiktok/TikTokSkraper.kt @@ -27,6 +27,7 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.consts.CRAWLER_USER_AGENTS import ru.sokomishalov.skraper.internal.consts.USER_AGENT_HEADER import ru.sokomishalov.skraper.internal.iterable.emitBatch +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -35,8 +36,7 @@ import java.time.Instant class TikTokSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://tiktok.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -61,7 +61,7 @@ class TikTokSkraper @JvmOverloads constructor( val aspectRatio = getDouble("video.width") / getDouble("video.height") listOf( Video( - url = "${baseUrl}/@${getString("author.uniqueId")}/video/${getString("id")}", + url = "${BASE_URL}/@${getString("author.uniqueId")}/video/${getString("id")}", aspectRatio = aspectRatio, duration = getLong("video.duration")?.let { Duration.ofSeconds(it) }, thumbnail = Image( @@ -75,13 +75,6 @@ class TikTokSkraper @JvmOverloads constructor( } } - override suspend fun resolve(media: Media): Media { - return when (media) { - is Video -> client.fetchOpenGraphMedia(media) - else -> media - } - } - override suspend fun getPageInfo(path: String): PageInfo? { val userJson = getPagePropsJson(path = path)?.get("userInfo") @@ -100,11 +93,21 @@ class TikTokSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "tiktok.com" in media.url.host + } + + override suspend fun resolve(media: Media): Media { + return when (media) { + is Video -> client.fetchOpenGraphMedia(media) + else -> media + } + } private suspend fun getPagePropsJson(path: String): JsonNode? { val document = client.fetchDocument( HttpRequest( - url = "${baseUrl}${path}", + url = "${BASE_URL}${path}", headers = mapOf(USER_AGENT_HEADER to CRAWLER_USER_AGENTS.random()) ) ) @@ -116,4 +119,8 @@ class TikTokSkraper @JvmOverloads constructor( return json?.getByPath("props.pageProps") } + + companion object { + const val BASE_URL: String = "https://tiktok.com" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt index 4ce452c2..80299e96 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt @@ -42,8 +42,7 @@ import java.time.format.DateTimeFormatter import java.util.Locale.ENGLISH open class TumblrSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://tumblr.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -80,8 +79,8 @@ open class TumblrSkraper @JvmOverloads constructor( ) } - override fun supports(url: String): Boolean { - return "tumblr.com" in url.host + override fun supports(media: Media): Boolean { + return "tumblr.com" in media.url.host } override suspend fun resolve(media: Media): Media { @@ -89,7 +88,7 @@ open class TumblrSkraper @JvmOverloads constructor( } internal suspend fun getUserPage(username: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.replace("://", "://${username}."))) + return client.fetchDocument(HttpRequest(url = BASE_URL.replace("://", "://${username}."))) } private suspend fun getNonUserPage(path: String): Document? { @@ -102,7 +101,7 @@ open class TumblrSkraper @JvmOverloads constructor( val username = path.substringAfter("/blog/view/").substringBefore("/") return getUserPage(username = username) } - else -> client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + else -> client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) } } @@ -208,6 +207,7 @@ open class TumblrSkraper @JvmOverloads constructor( } companion object { + const val BASE_URL: String = "https://tumblr.com" private val DATE_FORMATTER = DateTimeFormatter.ofPattern("MMM d'th,' yyyy").withLocale(ENGLISH) private val DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("h:mm a, EEEE, MMMM d, yyyy").withLocale(ENGLISH) } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt index f6c84c26..f41003df 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt @@ -30,6 +30,7 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.consts.DEFAULT_HEADERS import ru.sokomishalov.skraper.internal.consts.DEFAULT_POSTS_BATCH import ru.sokomishalov.skraper.internal.iterable.emitBatch +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.internal.string.unescapeUrl import ru.sokomishalov.skraper.model.* @@ -42,11 +43,7 @@ import kotlin.text.Charsets.UTF_8 * @author sokomishalov */ open class TwitchSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://twitch.tv", - private val graphBaseUrl: String = "https://gql.twitch.tv/gql", - private val restBaseUrl: String = "https://api.twitch.tv/api", - private val usherBaseUrl: String = "https://usher.ttvnw.net/vod" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -174,6 +171,10 @@ open class TwitchSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "twitch.tv" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Video -> { @@ -204,7 +205,7 @@ open class TwitchSkraper @JvmOverloads constructor( val token = client.fetchJson( HttpRequest( - url = restBaseUrl.buildFullURL(path = "/vods/${videoId}/access_token"), + url = REST_BASE_URL.buildFullURL(path = "/vods/${videoId}/access_token"), method = GET, headers = DEFAULT_HEADERS + mapOf("Client-ID" to clientId) ) @@ -212,7 +213,7 @@ open class TwitchSkraper @JvmOverloads constructor( val videoMeta = client.fetchString( HttpRequest( - url = usherBaseUrl.buildFullURL( + url = USHER_BASE_URL.buildFullURL( path = "/${videoId}.m3u8", queryParams = mapOf( "nauth" to token?.getString("token"), @@ -240,7 +241,7 @@ open class TwitchSkraper @JvmOverloads constructor( } private suspend fun getPage(path: String): Document? { - return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path))) + return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path))) } private fun Document?.extractClientId(): String { @@ -288,7 +289,7 @@ open class TwitchSkraper @JvmOverloads constructor( views = getInt("viewCount"), ), media = listOf(Video( - url = baseUrl.buildFullURL(path = "/videos/${getString("id")}"), + url = BASE_URL.buildFullURL(path = "/videos/${getString("id")}"), duration = getLong("lengthSeconds")?.let { Duration.ofSeconds(it) } )) ) @@ -327,7 +328,7 @@ open class TwitchSkraper @JvmOverloads constructor( private suspend fun graphRequest(clientId: String, query: String): JsonNode? { return client.fetchJson( HttpRequest( - url = graphBaseUrl, + url = GRAPH_BASE_URL, method = POST, headers = DEFAULT_HEADERS + mapOf( "Client-ID" to clientId, @@ -420,4 +421,11 @@ open class TwitchSkraper @JvmOverloads constructor( } } """ + + companion object { + const val BASE_URL: String = "https://twitch.tv" + const val GRAPH_BASE_URL: String = "https://gql.twitch.tv/gql" + const val REST_BASE_URL: String = "https://api.twitch.tv/api" + const val USHER_BASE_URL: String = "https://usher.ttvnw.net/vod" + } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt index b52b8a13..c9b98b5a 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt @@ -41,9 +41,7 @@ import java.time.Instant * @author sokomishalov */ open class TwitterSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://twitter.com", - private val apiBaseUrl: String = "https://api.twitter.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -94,9 +92,8 @@ open class TwitterSkraper @JvmOverloads constructor( } } - override fun supports(url: String): Boolean { - return arrayOf("twitter.com", "t.co") - .any { url.host.removePrefix("www.") in it } + override fun supports(media: Media): Boolean { + return arrayOf("twitter.com", "t.co").any { it in media.url.host } } override suspend fun resolve(media: Media): Media { @@ -130,7 +127,7 @@ open class TwitterSkraper @JvmOverloads constructor( .substringBefore("?") client.fetchJson(HttpRequest( - url = apiBaseUrl.buildFullURL(path = "/1.1/videos/tweet/config/${tweetId}.json"), + url = API_BASE_URL.buildFullURL(path = "/1.1/videos/tweet/config/${tweetId}.json"), headers = mapOf( "Authorization" to token, "x-guest-token" to guestToken @@ -158,7 +155,7 @@ open class TwitterSkraper @JvmOverloads constructor( private suspend fun getUserPage(path: String): Document? { return client.fetchDocument(HttpRequest( - url = baseUrl.buildFullURL(path = path), + url = BASE_URL.buildFullURL(path = path), headers = DEFAULT_HEADERS )) } @@ -236,7 +233,7 @@ open class TwitterSkraper @JvmOverloads constructor( } videosElement != null -> listOf( Video( - url = "${baseUrl}/i/status/${extractTweetId()}", + url = "${BASE_URL}/i/status/${extractTweetId()}", aspectRatio = videosElement .getFirstElementByClass("PlayableMedia-player") ?.getStyle("padding-bottom") @@ -272,7 +269,7 @@ open class TwitterSkraper @JvmOverloads constructor( val guestTokenNode = token?.let { client.fetchJson(HttpRequest( - url = apiBaseUrl.buildFullURL(path = "/1.1/guest/activate.json"), + url = API_BASE_URL.buildFullURL(path = "/1.1/guest/activate.json"), method = POST, headers = mapOf("Authorization" to it) )) @@ -284,6 +281,8 @@ open class TwitterSkraper @JvmOverloads constructor( } companion object { + const val BASE_URL: String = "https://twitter.com" + const val API_BASE_URL: String = "https://api.twitter.com" private val DEFAULT_HEADERS: Map by lazy { mapOf(USER_AGENT_HEADER to CRAWLER_USER_AGENTS.random()) } } } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt index 3c7c23b1..b385b653 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt @@ -28,6 +28,7 @@ import ru.sokomishalov.skraper.client.fetchDocument import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.internal.iterable.emitBatch import ru.sokomishalov.skraper.internal.jsoup.* +import ru.sokomishalov.skraper.internal.net.host import ru.sokomishalov.skraper.model.* import java.time.Instant import java.time.LocalDate @@ -40,8 +41,7 @@ import java.util.Locale.ENGLISH * @author sokomishalov */ open class VkSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://vk.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -86,6 +86,10 @@ open class VkSkraper @JvmOverloads constructor( } } + override fun supports(media: Media): Boolean { + return "vk.com" in media.url.host + } + override suspend fun resolve(media: Media): Media { return when (media) { is Video -> { @@ -134,7 +138,7 @@ open class VkSkraper @JvmOverloads constructor( private suspend fun getUserPage(path: String): Document? { return client.fetchDocument( HttpRequest( - url = baseUrl.buildFullURL(path = path), + url = BASE_URL.buildFullURL(path = path), headers = mapOf("Accept-Language" to "en-US") ) ) @@ -160,7 +164,7 @@ open class VkSkraper @JvmOverloads constructor( startsWith("today at ") -> { removePrefix("today at ") .let { - LocalTime.parse(it.toUpperCase(), VK_SHORT_TIME_AGO_DATE_FORMATTER) + LocalTime.parse(it.uppercase(), VK_SHORT_TIME_AGO_DATE_FORMATTER) } .let { LocalDate @@ -172,7 +176,7 @@ open class VkSkraper @JvmOverloads constructor( startsWith("yesterday at ") -> { removePrefix("yesterday at ") .let { - LocalTime.parse(it.toUpperCase(), VK_SHORT_TIME_AGO_DATE_FORMATTER) + LocalTime.parse(it.uppercase(), VK_SHORT_TIME_AGO_DATE_FORMATTER) } .let { LocalDate @@ -236,7 +240,7 @@ open class VkSkraper @JvmOverloads constructor( ?.getElementsByTag("a") ?.map { val isVideo = it.attr("href").startsWith("/video") - val hrefLink = "${baseUrl}${it.attr("href")}" + val hrefLink = "${BASE_URL}${it.attr("href")}" when { isVideo -> Video( @@ -303,6 +307,8 @@ open class VkSkraper @JvmOverloads constructor( } companion object { + const val BASE_URL: String = "https://vk.com" + private val VK_SHORT_TIME_AGO_DATE_FORMATTER = DateTimeFormatterBuilder() .appendPattern("h:mm a") .parseLenient() diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt index 65039f9a..85a73e4e 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt @@ -45,8 +45,7 @@ import java.time.temporal.ChronoUnit.DAYS import java.time.temporal.TemporalAmount open class YoutubeSkraper @JvmOverloads constructor( - override val client: SkraperClient = DefaultBlockingSkraperClient, - override val baseUrl: String = "https://www.youtube.com" + override val client: SkraperClient = DefaultBlockingSkraperClient ) : Skraper { override fun getPosts(path: String): Flow = flow { @@ -90,16 +89,13 @@ open class YoutubeSkraper @JvmOverloads constructor( } } - override fun supports(url: String): Boolean { - return setOf("youtube.com", "youtu.be") - .any { url.host.removePrefix("www.") in it } + override fun supports(media: Media): Boolean { + return arrayOf("youtube.com", "youtu.be").any { it in media.url.host } } override suspend fun resolve(media: Media): Media { return when (media) { - is Video -> runCatching { - YoutubeVideoResolver(client = client, baseUrl = baseUrl).getVideo(media) - }.getOrNull() ?: media + is Video -> runCatching { YoutubeVideoResolver(client = client, baseUrl = BASE_URL).getVideo(media) }.getOrNull() ?: media else -> media } } @@ -107,7 +103,7 @@ open class YoutubeSkraper @JvmOverloads constructor( private suspend fun getUserPage(path: String): Document? { return client.fetchDocument( HttpRequest( - url = baseUrl.buildFullURL( + url = BASE_URL.buildFullURL( path = path, queryParams = mapOf("gl" to "EN", "hl" to "en") ), @@ -127,7 +123,7 @@ open class YoutubeSkraper @JvmOverloads constructor( private fun JsonNode.extractVideos(): List