Skip to content

Commit

Permalink
remove base url property from Skraper interface (#168)
Browse files Browse the repository at this point in the history
* remove base url property from `Skraper` interface

* provide `UnknownMedia`

Co-authored-by: sokomishalov <sokolovm@dev.vtb>
  • Loading branch information
sokomishalov and sokomishalov authored Jun 8, 2021
1 parent 36547ff commit a31da1c
Show file tree
Hide file tree
Showing 23 changed files with 231 additions and 140 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,11 +195,10 @@ interface:

```kotlin
interface Skraper {
val baseUrl: String
val client: SkraperClient
fun supports(url: String): Boolean
fun getPosts(path: String): Flow<Post>
suspend fun getPageInfo(path: String): PageInfo?
fun supports(media: Media): Boolean
suspend fun resolve(media: Media): Media
}
```
Expand Down
12 changes: 3 additions & 9 deletions skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package ru.sokomishalov.skraper
import kotlinx.coroutines.flow.Flow
import ru.sokomishalov.skraper.client.SkraperClient
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.model.Media
import ru.sokomishalov.skraper.model.PageInfo
import ru.sokomishalov.skraper.model.Post
Expand All @@ -28,11 +27,6 @@ import ru.sokomishalov.skraper.model.Post
*/
interface Skraper {

/**
* @return provider base url
*/
val baseUrl: String

/**
* @return http client
*/
Expand All @@ -51,10 +45,10 @@ interface Skraper {
suspend fun getPageInfo(path: String): PageInfo?

/**
* @param url potential provider relative url
* @return true if such skraper supports this url
* @param media media item
* @return true if this skraper supports the given media and can resolve/download it
*/
fun supports(url: String): Boolean = url.host.removePrefix("www.") in baseUrl.host
fun supports(media: Media): Boolean

/**
* @param media with provider relative url
Expand Down
15 changes: 7 additions & 8 deletions skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skrapers.kt
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,7 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegCliRunner
import ru.sokomishalov.skraper.internal.ffmpeg.FfmpegRunner
import ru.sokomishalov.skraper.internal.net.path
import ru.sokomishalov.skraper.model.Audio
import ru.sokomishalov.skraper.model.Image
import ru.sokomishalov.skraper.model.Media
import ru.sokomishalov.skraper.model.Video
import ru.sokomishalov.skraper.model.*
import ru.sokomishalov.skraper.provider.facebook.FacebookSkraper
import ru.sokomishalov.skraper.provider.flickr.FlickrSkraper
import ru.sokomishalov.skraper.provider.ifunny.IFunnySkraper
Expand Down Expand Up @@ -55,11 +52,11 @@ object Skrapers {
}

/**
* @param url potential provider relative url
* @param media media item
* @return skraper which supports this media or null if none of the skrapers supports it
*/
fun suitable(url: String): Skraper? {
return providers.find { it.supports(url) }
fun findSuitable(media: Media): Skraper? =
    // The first entry of `providers` that reports support wins; null when there is none.
    providers.firstOrNull { skraper -> skraper.supports(media) }

/**
Expand All @@ -78,7 +75,7 @@ object Skrapers {

// otherwise
else -> {
suitable(media.url)
findSuitable(media)
?.resolve(media)
?.run {
when {
Expand All @@ -87,6 +84,7 @@ object Skrapers {
is Image -> media.copy(url = url)
is Video -> media.copy(url = url)
is Audio -> media.copy(url = url)
is UnknownMedia -> media.copy(url = url)
}
}
}
Expand Down Expand Up @@ -166,6 +164,7 @@ object Skrapers {
is Image -> filename.substringAfterLast(".", "png")
is Video -> filename.substringAfterLast(".", "mp4")
is Audio -> filename.substringAfterLast(".", "mp3")
is UnknownMedia -> filename.substringAfterLast(".")
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@ import org.jsoup.nodes.Document
import org.jsoup.nodes.Element
import ru.sokomishalov.skraper.internal.map.firstNotNull
import ru.sokomishalov.skraper.internal.number.div
import ru.sokomishalov.skraper.model.Audio
import ru.sokomishalov.skraper.model.Image
import ru.sokomishalov.skraper.model.Media
import ru.sokomishalov.skraper.model.Video
import ru.sokomishalov.skraper.model.*

internal inline fun Element.getFirstElementByClass(name: String): Element? {
return getElementsByClass(name).firstOrNull()
Expand Down Expand Up @@ -126,6 +123,12 @@ internal fun Document.extractOpenGraphMedia(media: Media): Media {
url = audioUrl ?: media.url
)
}
is UnknownMedia -> {
val url = firstNotNull("og:url")
media.copy(
url = url ?: media.url
)
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,11 @@ data class Audio(
override val url: String,
val duration: Duration? = null
) : Media()

/**
 * Media item of an unspecified kind, described only by its [url].
 *
 * @property url url of the media item
 */
data class UnknownMedia(
    override val url: String,
) : Media()
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import ru.sokomishalov.skraper.internal.iterable.emitBatch
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttributeValue
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.internal.number.div
import ru.sokomishalov.skraper.internal.serialization.getDouble
import ru.sokomishalov.skraper.internal.serialization.getLong
Expand All @@ -42,16 +43,14 @@ import java.time.Instant
* @author sokomishalov
*/
open class FacebookSkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://facebook.com",
private val mobileBaseUrl: String = "https://m.facebook.com"
override val client: SkraperClient = DefaultBlockingSkraperClient
) : Skraper {

override fun getPosts(path: String): Flow<Post> = flow {
val postsPath = path.substringBefore("/posts") + "/posts"
var nextPath = postsPath
while (true) {
val fetchResult = client.fetchString(HttpRequest(url = mobileBaseUrl.buildFullURL(path = nextPath)))
val fetchResult = client.fetchString(HttpRequest(url = MOBILE_BASE_URL.buildFullURL(path = nextPath)))

val (document, nextPage) = fetchResult?.extractDocumentAndNextPage() ?: break
nextPath = nextPage ?: break
Expand Down Expand Up @@ -79,7 +78,7 @@ open class FacebookSkraper @JvmOverloads constructor(
override suspend fun getPageInfo(path: String): PageInfo? {
val aboutPath = path.substringBefore("/about") + "/about"

val page = client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = aboutPath)))
val page = client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = aboutPath)))

return page?.run {
val isCommunity = getFirstElementByAttributeValue("data-key", "tab_community") != null
Expand All @@ -106,6 +105,10 @@ open class FacebookSkraper @JvmOverloads constructor(
}
}

/**
 * Tells whether this skraper accepts [media].
 *
 * The check is a plain substring match: it succeeds whenever the host of the
 * media url contains `facebook.com`.
 */
override fun supports(media: Media): Boolean = media.url.host.contains("facebook.com")

/**
 * Resolves [media] by delegating to `client.fetchOpenGraphMedia` and returning its result.
 */
override suspend fun resolve(media: Media): Media = client.fetchOpenGraphMedia(media)
Expand Down Expand Up @@ -255,4 +258,9 @@ open class FacebookSkraper @JvmOverloads constructor(
return getFirstElementByClass("coverPhotoImg")
?.attr("src")
}

// Url roots shared by every FacebookSkraper instance.
companion object {
/** Facebook root url; `getPageInfo` builds the "about" page request from it. */
const val BASE_URL: String = "https://facebook.com"
/** Facebook mobile root url; `getPosts` builds the posts page requests from it. */
const val MOBILE_BASE_URL: String = "https://m.facebook.com"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import ru.sokomishalov.skraper.internal.iterable.emitBatch
import ru.sokomishalov.skraper.internal.jsoup.getBackgroundImageUrl
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass
import ru.sokomishalov.skraper.internal.jsoup.getStyle
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.internal.number.div
import ru.sokomishalov.skraper.internal.serialization.*
import ru.sokomishalov.skraper.internal.string.unescapeHtml
Expand All @@ -43,8 +44,7 @@ import java.time.Instant
* @author sokomishalov
*/
open class FlickrSkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://flickr.com"
override val client: SkraperClient = DefaultBlockingSkraperClient
) : Skraper {

override fun getPosts(path: String): Flow<Post> = flow {
Expand Down Expand Up @@ -86,22 +86,24 @@ open class FlickrSkraper @JvmOverloads constructor(
}

jsonPosts.isNotEmpty() -> jsonPosts.forEach { (key, value) ->
emit(Post(
id = key,
text = value?.extractPostText(),
publishedAt = value?.extractPostPublishDate(),
statistics = PostStatistics(
likes = value?.extractPostLikes(),
comments = value?.extractPostCommentsCount(),
views = value?.extractPostViewsCount(),
),
media = listOf(
Image(
url = value.extractPostAttachmentUrl(),
aspectRatio = value.extractPostAspectRatio()
emit(
Post(
id = key,
text = value?.extractPostText(),
publishedAt = value?.extractPostPublishDate(),
statistics = PostStatistics(
likes = value?.extractPostLikes(),
comments = value?.extractPostCommentsCount(),
views = value?.extractPostViewsCount(),
),
media = listOf(
Image(
url = value.extractPostAttachmentUrl(),
aspectRatio = value.extractPostAspectRatio()
)
)
)
))
)
}
}
}
Expand All @@ -126,6 +128,10 @@ open class FlickrSkraper @JvmOverloads constructor(
}
}

/**
 * Tells whether this skraper accepts [media].
 *
 * The check is a plain substring match: it succeeds whenever the host of the
 * media url contains `flickr.com`.
 */
override fun supports(media: Media): Boolean = media.url.host.contains("flickr.com")

override suspend fun resolve(media: Media): Media {
return when (media) {
is Image -> client.fetchOpenGraphMedia(media)
Expand All @@ -134,7 +140,7 @@ open class FlickrSkraper @JvmOverloads constructor(
}

private suspend fun getPage(path: String): Document? {
return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path)))
return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path)))
}

private fun Document?.parseModelJson(): JsonNode? {
Expand Down Expand Up @@ -248,7 +254,7 @@ open class FlickrSkraper @JvmOverloads constructor(
}

private fun JsonNode.extractPageLogo(): Image? {
return getFirstByPath("photostream-models.0.owner.buddyicon","person-models.0.buddyicon")
return getFirstByPath("photostream-models.0.owner.buddyicon", "person-models.0.buddyicon")
?.getFirstByPath("large", "medium", "small", "default")
?.asText()
?.convertToImage()
Expand All @@ -265,4 +271,8 @@ open class FlickrSkraper @JvmOverloads constructor(
this?.asText().orEmpty()
}
}

// Url root shared by every FlickrSkraper instance.
companion object {
/** Flickr root url; `getPage` builds page requests from it. */
const val BASE_URL: String = "https://flickr.com"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ import ru.sokomishalov.skraper.client.fetchOpenGraphMedia
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.iterable.emitBatch
import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.internal.serialization.*
import ru.sokomishalov.skraper.model.*

/**
* @author sokomishalov
*/
open class IFunnySkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://ifunny.co"
override val client: SkraperClient = DefaultBlockingSkraperClient
) : Skraper {

override fun getPosts(path: String): Flow<Post> = flow {
Expand Down Expand Up @@ -66,7 +66,7 @@ open class IFunnySkraper @JvmOverloads constructor(
media = listOf(
when {
isVideo -> Video(
url = "${baseUrl}${link}",
url = "${BASE_URL}${link}",
aspectRatio = aspectRatio
)
else -> Image(
Expand All @@ -83,6 +83,10 @@ open class IFunnySkraper @JvmOverloads constructor(
}
}

/**
 * Tells whether this skraper accepts [media].
 *
 * The check is a plain substring match: it succeeds whenever the host of the
 * media url contains `ifunny.co`.
 */
override fun supports(media: Media): Boolean = media.url.host.contains("ifunny.co")

/**
 * Resolves [media] by delegating to `client.fetchOpenGraphMedia` and returning its result.
 */
override suspend fun resolve(media: Media): Media = client.fetchOpenGraphMedia(media)
Expand Down Expand Up @@ -120,6 +124,10 @@ open class IFunnySkraper @JvmOverloads constructor(


private suspend fun getPage(path: String): Document? {
return client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path = path)))
return client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path = path)))
}

// Url root shared by every IFunnySkraper instance.
companion object {
/** iFunny root url; used as the prefix of video links in posts and by `getPage` to build page requests. */
const val BASE_URL: String = "https://ifunny.co"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import ru.sokomishalov.skraper.client.fetchDocument
import ru.sokomishalov.skraper.client.fetchOpenGraphMedia
import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient
import ru.sokomishalov.skraper.internal.iterable.emitBatch
import ru.sokomishalov.skraper.internal.net.host
import ru.sokomishalov.skraper.internal.number.div
import ru.sokomishalov.skraper.internal.serialization.*
import ru.sokomishalov.skraper.model.*
Expand All @@ -35,8 +36,7 @@ import java.time.Instant
* @author sokomishalov
*/
open class InstagramSkraper @JvmOverloads constructor(
override val client: SkraperClient = DefaultBlockingSkraperClient,
override val baseUrl: String = "https://instagram.com"
override val client: SkraperClient = DefaultBlockingSkraperClient
) : Skraper {

override fun getPosts(path: String): Flow<Post> = flow {
Expand Down Expand Up @@ -87,6 +87,10 @@ open class InstagramSkraper @JvmOverloads constructor(
}
}

/**
 * Tells whether this skraper accepts [media].
 *
 * The check is a plain substring match: it succeeds whenever the host of the
 * media url contains `instagram.com`.
 */
override fun supports(media: Media): Boolean = media.url.host.contains("instagram.com")

/**
 * Resolves [media] by delegating to `client.fetchOpenGraphMedia` and returning its result.
 */
override suspend fun resolve(media: Media): Media = client.fetchOpenGraphMedia(media)
Expand All @@ -96,7 +100,7 @@ open class InstagramSkraper @JvmOverloads constructor(
private fun JsonNode.extractPostMediaItems(): List<Media> {
val isVideo = this["is_video"].asBoolean()
val aspectRatio = this["dimensions"]?.run { getDouble("width") / getDouble("height") }
val shortcodeUrl = "${baseUrl}/p/${getString("shortcode")}"
val shortcodeUrl = "${BASE_URL}/p/${getString("shortcode")}"

return listOf(
when {
Expand All @@ -119,7 +123,7 @@ open class InstagramSkraper @JvmOverloads constructor(
}

private suspend fun fetchJsonNodes(path: String): JsonNode? {
val document = client.fetchDocument(HttpRequest(url = baseUrl.buildFullURL(path)))
val document = client.fetchDocument(HttpRequest(url = BASE_URL.buildFullURL(path)))
return document
?.getElementsByTag("script")
?.map { it.html() }
Expand All @@ -128,4 +132,8 @@ open class InstagramSkraper @JvmOverloads constructor(
?.substringBeforeLast(";")
.readJsonNodes()
}

// Url root shared by every InstagramSkraper instance.
companion object {
/** Instagram root url; used to build post shortcode urls and by `fetchJsonNodes` to build page requests. */
const val BASE_URL: String = "https://instagram.com"
}
}
Loading

0 comments on commit a31da1c

Please sign in to comment.