diff --git a/README.md b/README.md index 3b5d6cb8..70d5a61b 100644 --- a/README.md +++ b/README.md @@ -48,26 +48,54 @@ Usage: ``` ```text -usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT] +usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT] [-m] + [--parallel-downloads PARALLEL_DOWNLOADS] optional arguments: - -h, --help show this help message and exit + -h, --help show this help message and exit - -n LIMIT, posts limit (50 by default) - --limit LIMIT + -n LIMIT, --limit LIMIT posts limit (50 by default) - -t TYPE, output type, options: [log, csv, json, xml, yaml] - --type TYPE + -t TYPE, --type TYPE output type, options: [log, csv, json, xml, yaml] - -o OUTPUT, output path - --output OUTPUT + -o OUTPUT, --output OUTPUT output path + + -m, --media-only scrape media only + + --parallel-downloads PARALLEL_DOWNLOADS amount of parallel downloads for media items if + enabled flag --media-only (4 by default) + + +positional arguments: + PROVIDER skraper provider, options: [facebook, instagram, + twitter, youtube, twitch, reddit, ninegag, pinterest, + flickr, tumblr, ifunny, vk, pikabu] + + PATH path to user/community/channel/topic/trend +usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT] [-m] + [--parallel-downloads PARALLEL_DOWNLOADS] + +optional arguments: + -h, --help show this help message and exit + + -n LIMIT, --limit LIMIT posts limit (50 by default) + + -t TYPE, --type TYPE output type, options: [log, csv, json, xml, yaml] + + -o OUTPUT, --output OUTPUT output path + + -m, --media-only scrape media only + + --parallel-downloads PARALLEL_DOWNLOADS amount of parallel downloads for media items if + enabled flag --media-only (4 by default) positional arguments: - PROVIDER skraper provider, options: [facebook, instagram, twitter, youtube, twitch, reddit, - ninegag, pinterest, flickr, tumblr, ifunny, vk, pikabu] + PROVIDER skraper provider, options: [facebook, instagram, + twitter, youtube, twitch, reddit, ninegag, pinterest, + flickr, tumblr, 
ifunny, vk, pikabu] - PATH path to user/community/channel/topic/trend + PATH path to user/community/channel/topic/trend ``` Examples: diff --git a/TODO.md b/TODO.md index f74178da..eb330227 100644 --- a/TODO.md +++ b/TODO.md @@ -1,9 +1,9 @@ -### TODO list -- [ ] Thumbnails to the videos -- [ ] Add option to the cli tool to download media +### Roadmap +- [ ] Telegram bot - [ ] Client gui - [ ] Replace java.time.* from jdk 1.8 to lower jdk date-time api to be more android-friendly. - [ ] Implement [LinkedIn](https://linkedin.com) - branch (origin/feature/linkedin) - [ ] Implement [Snapchat stories](https://story.snapchat.com/) - branch (origin/feature/snapchat) +- [ ] Implement [Imgur](https://imgur.com/) - [ ] Implement [Tiktok](https://tiktok.com) - branch (origin/feature/tiktok) \ No newline at end of file diff --git a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt index b33253d3..216f437c 100644 --- a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt +++ b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt @@ -33,69 +33,114 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule import com.fasterxml.jackson.module.kotlin.registerKotlinModule import com.xenomachina.argparser.ArgParser import com.xenomachina.argparser.mainBody +import kotlinx.coroutines.asCoroutineDispatcher +import kotlinx.coroutines.async +import kotlinx.coroutines.awaitAll import kotlinx.coroutines.runBlocking -import ru.sokomishalov.skraper.cli.OutputType.* -import ru.sokomishalov.skraper.model.Post +import ru.sokomishalov.skraper.Skraper +import ru.sokomishalov.skraper.cli.model.Args +import ru.sokomishalov.skraper.cli.model.OutputType.* +import ru.sokomishalov.skraper.cli.model.Provider +import ru.sokomishalov.skraper.model.* +import ru.sokomishalov.skraper.provider.youtube.YoutubeSkraper import java.io.File +import java.io.File.separator +import java.net.URL +import java.nio.channels.Channels import 
java.time.LocalDateTime.now
 import java.time.format.DateTimeFormatter.ofPattern
+import java.util.*
+import java.util.concurrent.Executors
+import kotlin.system.exitProcess
 import kotlin.text.Charsets.UTF_8
 
-fun main(args: Array<String>) = mainBody(columns = 150) {
-    val parsedArgs = ArgParser(
-            args = args.ifEmpty { arrayOf("--help") }
-    ).parseInto(::Args)
+
+fun main(args: Array<String>) = mainBody(columns = 100) {
+    val parsedArgs = ArgParser(args = args.ifEmpty { arrayOf("--help") }).parseInto(::Args)
 
     println("${"Skraper".green()} ${"v.0.3.0".magenta()} started")
 
     val posts = runBlocking {
-        parsedArgs.provider.skraper.getPosts(
-                path = "/${parsedArgs.path.removeSuffix("/")}",
+        parsedArgs.skraper.getPosts(
+                path = "/${parsedArgs.path.removePrefix("/")}",
                 limit = parsedArgs.amount
         )
     }
 
+    when {
+        parsedArgs.onlyMedia -> posts.persistMedia(parsedArgs) // --media-only: download every media item of the posts
+        else -> posts.persistMeta(parsedArgs) // default: serialize the posts in the requested output type
+    }
+}
+
+private fun List<Post>.persistMedia(parsedArgs: Args) {
+    val provider = parsedArgs.skraper.javaClass.simpleName.toString().toLowerCase().replace("skraper", "")
+    val requestedPath = parsedArgs.path
+    val root = when {
+        parsedArgs.output.isFile -> parsedArgs.output.absoluteFile.parentFile.absolutePath // absoluteFile first: parentFile is null for a bare relative name such as "posts.json"
+        else -> parsedArgs.output.absolutePath
+    }
+    val targetDir = File("${root}/${provider}/${requestedPath}").apply { mkdirs() }
+
+    runBlocking(context = Executors.newFixedThreadPool(parsedArgs.parallelDownloads).asCoroutineDispatcher()) {
+        flatMap { post ->
+            post.media.map { media ->
+                async {
+                    parsedArgs.skraper.download(
+                            post = post,
+                            media = media,
+                            targetDir = targetDir
+                    )
+                }
+            }
+        }.awaitAll()
+    }
+
+    exitProcess(0) // the download pool above is never shut down, so exit explicitly - with status 0, since this is the success path
+}
+
+private fun List<Post>.persistMeta(parsedArgs: Args) {
+    val provider = parsedArgs.skraper.javaClass.simpleName.toString().replace("Skraper", "").toLowerCase()
+    val requestedPath = parsedArgs.path
+
     val content = when (parsedArgs.outputType) {
-        LOG -> posts
-                .joinToString("\n") { it.toString() }
+        LOG -> joinToString("\n") { it.toString() }
                 .also { println(it) }
         JSON ->
JsonMapper() .registerModule(JavaTimeModule()) .registerModule(Jdk8Module()) .writerWithDefaultPrettyPrinter() - .writeValueAsString(posts) + .writeValueAsString(this) XML -> XmlMapper() .registerModule(JavaTimeModule()) .registerModule(Jdk8Module()) .writerWithDefaultPrettyPrinter() - .writeValueAsString(posts) + .writeValueAsString(this) YAML -> YAMLMapper() .registerModule(JavaTimeModule()) .registerModule(Jdk8Module()) .writerWithDefaultPrettyPrinter() - .writeValueAsString(posts) + .writeValueAsString(this) CSV -> { CsvMapper() - .apply { - registerKotlinModule() - registerModule(JavaTimeModule()) - registerModule(Jdk8Module()) - registerModule(SimpleModule().apply { - addSerializer(Post::class.java, object : JsonSerializer() { - override fun serialize(item: Post, jgen: JsonGenerator, serializerProvider: SerializerProvider) { - jgen.writeStartObject() - jgen.writeStringField("ID", item.id) - jgen.writeStringField("Text", item.text) - jgen.writeStringField("Published at", item.publishedAt?.toString(10)) - jgen.writeStringField("Rating", item.rating?.toString(10).orEmpty()) - jgen.writeStringField("Comments count", item.commentsCount?.toString(10).orEmpty()) - jgen.writeStringField("Views count", item.viewsCount?.toString(10).orEmpty()) - jgen.writeStringField("Media", item.media.joinToString(" ") { it.url }) - jgen.writeEndObject() - } - }) + .registerKotlinModule() + .registerModule(JavaTimeModule()) + .registerModule(Jdk8Module()) + .registerModule(SimpleModule().apply { + addSerializer(Post::class.java, object : JsonSerializer() { + override fun serialize(item: Post, jgen: JsonGenerator, serializerProvider: SerializerProvider) { + jgen.writeStartObject() + jgen.writeStringField("ID", item.id) + jgen.writeStringField("Text", item.text) + jgen.writeStringField("Published at", item.publishedAt?.toString(10)) + jgen.writeStringField("Rating", item.rating?.toString(10).orEmpty()) + jgen.writeStringField("Comments count", 
item.commentsCount?.toString(10).orEmpty())
+                                jgen.writeStringField("Views count", item.viewsCount?.toString(10).orEmpty())
+                                jgen.writeStringField("Media", item.media.joinToString(" ") { it.url })
+                                jgen.writeEndObject()
+                            }
                         })
-                    }
+                    })
                     .writer(CsvSchema
                             .builder()
                             .addColumn("ID")
@@ -108,7 +153,7 @@ fun main(args: Array<String>) = mainBody(columns = 150) {
                             .build()
                             .withHeader()
                     )
-                    .writeValueAsString(posts)
+                    .writeValueAsString(this)
         }
     }
 
@@ -117,17 +162,98 @@ fun main(args: Array<String>) = mainBody(columns = 150) {
         parsedArgs.output.isFile -> parsedArgs.output
         else -> {
             val root = parsedArgs.output.absolutePath
-            val provider = parsedArgs.provider.toString().toLowerCase()
-            val requestedPath = parsedArgs.path
-            val now = now().format(ofPattern("ddMMyyyy'_'hhmmss"))
+            val now = now().format(ofPattern("ddMMyyyy'_'HHmmss")) // HH = 24-hour clock; "hh" is 12-hour and, without an am/pm marker, ambiguous
             val ext = parsedArgs.outputType.extension
 
-            File("${root}/${provider}/${requestedPath}_${now}${ext}")
+            File("${root}/${provider}/${requestedPath}_${now}.${ext}")
         }
     }
 
     fileToWrite
             .apply { parentFile.mkdirs() }
-            .also { println("Fetched ${posts.size.toString().green()} posts. Saved to: ${it.path.cyan()}") }
             .writeText(text = content, charset = UTF_8)
+
+    println(fileToWrite.path.cyan())
+}
+
+/**
+ * Downloads one [media] item of [post] into [targetDir].
+ *
+ * The direct url and the file name are chosen by the first matching rule: the last segment of the url path
+ * already has an extension, the host is one of [YoutubeSkraper.HOSTS], or else the url is resolved by this skraper.
+ * A failure while copying the stream is printed to stdout instead of being thrown.
+ */
+@Suppress("BlockingMethodInNonBlockingContext")
+private suspend fun Skraper.download(post: Post, media: Media, targetDir: File) {
+    val mediaUrl = URL(media.url)
+
+    val (directMediaUrl, filename) = when {
+        // has some possible extension
+        mediaUrl
+                .path
+                .substringAfterLast("/")
+                .substringAfterLast(".", "")
+                .isNotEmpty() -> mediaUrl to media.extractFileName()
+
+        // youtube video
+        mediaUrl.host in YoutubeSkraper.HOSTS -> {
+            val resolved = Provider.YOUTUBE.skraper.resolve(media)
+            val name = post.text ?: UUID.randomUUID().toString()
+            val filename = "${name.abbreviate()}.mp4"
+
+            URL(resolved.url) to filename
+        }
+
+        // otherwise
+        else -> {
+            resolve(media).run {
+                URL(url) to extractFileName()
+            }
+        }
+    }
+
+
+    val targetFile = File("${targetDir.absolutePath}$separator${filename}")
+    runCatching {
+        Channels.newChannel(directMediaUrl.openStream()).use { rbc ->
+            targetFile.outputStream().use { fos ->
+                fos.channel.transferFrom(rbc, 0, Long.MAX_VALUE)
+            }
+        }
+    }.onSuccess {
+        println(targetFile.absolutePath)
+    }.onFailure {
+        println("Cannot download $directMediaUrl")
+    }
+}
+
+
+/**
+ * Builds a local file name from the last segment of the media url path.
+ *
+ * @return "name.extension"; when the segment has no extension: png for [Image], mp4 for [Video], mp3 for [Audio]
+ */
+private fun Media.extractFileName(): String {
+    // only the last path segment: a dot in a parent directory must not be mistaken for the extension separator
+    val filename = URL(url).path.substringAfterLast("/")
+
+    val filenameWithoutExtension = filename.substringBeforeLast(".")
+
+    val extension = when (this) {
+        is Image -> filename.substringAfterLast(".", "png")
+        is Video -> filename.substringAfterLast(".", "mp4")
+        is Audio -> filename.substringAfterLast(".", "mp3")
+    }
+
+    return "${filenameWithoutExtension}.${extension}"
+}
+
+/**
+ * Shortens a string longer than [maxLength] (expected to be at least 3) to exactly [maxLength] characters, the last three being "...".
+ */
+private fun String.abbreviate(maxLength: Int = 100): String {
+    return when {
+        length > maxLength -> "${take((maxLength - 3).coerceAtLeast(0))}..."
+        else -> this
+    }
 }
\ No newline at end of file
diff --git a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Args.kt b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Args.kt
similarity index 69%
rename from cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Args.kt
rename to cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Args.kt
index 37e5ee2d..e01114c5 100644
--- a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Args.kt
+++ b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Args.kt
@@ -13,18 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package ru.sokomishalov.skraper.cli
+package ru.sokomishalov.skraper.cli.model
 
 import com.xenomachina.argparser.ArgParser
 import com.xenomachina.argparser.default
-import ru.sokomishalov.skraper.cli.OutputType.LOG
+import ru.sokomishalov.skraper.cli.model.OutputType.LOG
 import java.io.File
 
 class Args(parser: ArgParser) {
-    val provider by parser.positional(
+    val skraper by parser.positional(
             name = "PROVIDER",
             help = "skraper provider, options: ${Provider.values().contentToString().toLowerCase()}"
-    ) { Provider.valueOf(toUpperCase()) }
+    ) { Provider.valueOf(toUpperCase()).skraper }
 
     val path by parser.positional(
             name = "PATH",
@@ -34,15 +34,25 @@ class Args(parser: ArgParser) {
     val amount by parser.storing(
             "-n", "--limit",
             help = "posts limit (50 by default)"
-    ) { toInt() }.default(50)
+    ) { toInt() }.default { 50 }
 
     val outputType by parser.storing(
             "-t", "--type",
             help = "output type, options: ${OutputType.values().contentToString().toLowerCase()}"
-    ) { OutputType.valueOf(toUpperCase()) }.default(LOG)
+    ) { OutputType.valueOf(toUpperCase()) }.default { LOG }
 
     val output by parser.storing(
             "-o", "--output",
             help = "output path"
     ) { File(this) }.default { File("") }
+
+    val onlyMedia by parser.flagging(
+            "-m", "--media-only",
+            help = "scrape media only"
+    )
+
+    val parallelDownloads by parser.storing(
+            "--parallel-downloads",
+            help =
"amount of parallel downloads for media items if enabled flag --media-only (4 by default)" + ) { toInt() }.default { 4 } } \ No newline at end of file diff --git a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/OutputType.kt b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/OutputType.kt similarity index 84% rename from cli/src/main/kotlin/ru/sokomishalov/skraper/cli/OutputType.kt rename to cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/OutputType.kt index 162af7c2..30230fec 100644 --- a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/OutputType.kt +++ b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/OutputType.kt @@ -13,15 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package ru.sokomishalov.skraper.cli +package ru.sokomishalov.skraper.cli.model /** * @author sokomishalov */ enum class OutputType(val extension: String) { - LOG(".log"), - CSV(".csv"), - JSON(".json"), - XML(".xml"), - YAML(".yaml") + LOG("log"), + CSV("csv"), + JSON("json"), + XML("xml"), + YAML("yaml") } \ No newline at end of file diff --git a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Provider.kt b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Provider.kt similarity index 66% rename from cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Provider.kt rename to cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Provider.kt index 84bb175a..049c3c7f 100644 --- a/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Provider.kt +++ b/cli/src/main/kotlin/ru/sokomishalov/skraper/cli/model/Provider.kt @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package ru.sokomishalov.skraper.cli +package ru.sokomishalov.skraper.cli.model import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.client.ktor.KtorSkraperClient @@ -31,20 +31,20 @@ import ru.sokomishalov.skraper.provider.twitter.TwitterSkraper import ru.sokomishalov.skraper.provider.vk.VkSkraper import ru.sokomishalov.skraper.provider.youtube.YoutubeSkraper -private val DEFAULT_CLIENT = KtorSkraperClient() +internal val CLIENT = KtorSkraperClient() enum class Provider(val skraper: Skraper) { - FACEBOOK(FacebookSkraper(client = DEFAULT_CLIENT)), - INSTAGRAM(InstagramSkraper(client = DEFAULT_CLIENT)), - TWITTER(TwitterSkraper(client = DEFAULT_CLIENT)), - YOUTUBE(YoutubeSkraper(client = DEFAULT_CLIENT)), - TWITCH(TwitchSkraper(client = DEFAULT_CLIENT)), - REDDIT(RedditSkraper(client = DEFAULT_CLIENT)), - NINEGAG(NinegagSkraper(client = DEFAULT_CLIENT)), - PINTEREST(PinterestSkraper(client = DEFAULT_CLIENT)), - FLICKR(FlickrSkraper(client = DEFAULT_CLIENT)), - TUMBLR(TumblrSkraper(client = DEFAULT_CLIENT)), - IFUNNY(IFunnySkraper(client = DEFAULT_CLIENT)), - VK(VkSkraper(client = DEFAULT_CLIENT)), - PIKABU(PikabuSkraper(client = DEFAULT_CLIENT)) + FACEBOOK(FacebookSkraper(client = CLIENT)), + INSTAGRAM(InstagramSkraper(client = CLIENT)), + TWITTER(TwitterSkraper(client = CLIENT)), + YOUTUBE(YoutubeSkraper(client = CLIENT)), + TWITCH(TwitchSkraper(client = CLIENT)), + REDDIT(RedditSkraper(client = CLIENT)), + NINEGAG(NinegagSkraper(client = CLIENT)), + PINTEREST(PinterestSkraper(client = CLIENT)), + FLICKR(FlickrSkraper(client = CLIENT)), + TUMBLR(TumblrSkraper(client = CLIENT)), + IFUNNY(IFunnySkraper(client = CLIENT)), + VK(VkSkraper(client = CLIENT)), + PIKABU(PikabuSkraper(client = CLIENT)) } \ No newline at end of file diff --git a/skrapers/pom.xml b/skrapers/pom.xml index aa5c4e35..ee73a231 100644 --- a/skrapers/pom.xml +++ b/skrapers/pom.xml @@ -40,6 +40,13 @@ ${jackson.version} + + + com.github.sealedtx + java-youtube-downloader + 2.1.1 + + 
com.squareup.okhttp3 diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt index f1cb3c6d..c8550180 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/Skraper.kt @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +@file:Suppress("BlockingMethodInNonBlockingContext") + package ru.sokomishalov.skraper import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient @@ -30,7 +32,7 @@ interface Skraper { val baseUrl: URLString /** - * @return http client for fetching web pages, images and json from network + * @return http client for fetching web pages, media and json from network */ val client: SkraperClient get() = DefaultBlockingSkraperClient @@ -54,4 +56,10 @@ interface Skraper { * @return list of posts */ suspend fun getPosts(path: String, limit: Int = DEFAULT_POSTS_LIMIT): List + + /** + * @param media with provider relative url + * @return direct media url + */ + suspend fun resolve(media: Media): Media = media } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClient.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClient.kt index 17012e23..590afe09 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClient.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClient.kt @@ -17,6 +17,7 @@ package ru.sokomishalov.skraper import ru.sokomishalov.skraper.client.HttpMethodType import ru.sokomishalov.skraper.client.HttpMethodType.GET +import ru.sokomishalov.skraper.internal.consts.DEFAULT_USER_AGENT import ru.sokomishalov.skraper.model.URLString /** @@ -27,7 +28,7 @@ interface SkraperClient { suspend fun request( url: URLString, method: HttpMethodType = GET, - headers: Map = emptyMap(), + headers: Map = mapOf("User-Agent" to DEFAULT_USER_AGENT), body: ByteArray? 
= null ): ByteArray? diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClientExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClientExtensions.kt index ff4504cb..016b3088 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClientExtensions.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/SkraperClientExtensions.kt @@ -20,8 +20,11 @@ import org.jsoup.Jsoup import org.jsoup.nodes.Document import ru.sokomishalov.skraper.client.HttpMethodType import ru.sokomishalov.skraper.client.HttpMethodType.GET +import ru.sokomishalov.skraper.internal.jsoup.getMetaPropertyMap +import ru.sokomishalov.skraper.internal.map.firstPresent +import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.readJsonNodes -import ru.sokomishalov.skraper.model.URLString +import ru.sokomishalov.skraper.model.* import java.nio.charset.Charset import kotlin.text.Charsets.UTF_8 @@ -62,4 +65,59 @@ suspend fun SkraperClient.fetchDocument( return runCatching { request(url, method, headers, body)?.run { Jsoup.parse(toString(charset)) } }.getOrNull() +} + +/** + * @see open graph protocol + */ +suspend fun SkraperClient.fetchOpenGraphMedia(media: Media): Media { + val page = fetchDocument(url = media.url) + + return page?.run { + val metaMap = getMetaPropertyMap() + + with(metaMap) { + when (media) { + is Video -> { + val videoWidth = firstPresent("og:video:width")?.toIntOrNull() + val videoHeight = firstPresent("og:video:height")?.toIntOrNull() + val videoUrl = firstPresent("og:video", "og:video:url", "og:video:secure_url") + + val thumbWidth = firstPresent("og:image:width")?.toIntOrNull() + val thumbHeight = firstPresent("og:image:height")?.toIntOrNull() + val thumbUrl = firstPresent("og:image", "og:image:url", "og:image:secure_url") + + media.copy( + url = videoUrl ?: media.url, + aspectRatio = (videoWidth / videoHeight) ?: media.aspectRatio, + thumbnail = (thumbUrl ?: 
media.thumbnail?.url)?.let { url -> + Image( + url = url, + aspectRatio = (thumbWidth / thumbHeight) + ?: (videoWidth / videoHeight) + ?: media.thumbnail?.aspectRatio + ) + } + + ) + } + is Image -> { + val imageWidth = firstPresent("og:image:width")?.toIntOrNull() + val imageHeight = firstPresent("og:image:height")?.toIntOrNull() + val imageUrl = firstPresent("og:image", "og:image:url", "og:image:secure_url") + + media.copy( + url = imageUrl ?: media.url, + aspectRatio = (imageWidth / imageHeight) ?: media.aspectRatio + ) + } + is Audio -> { + val audioUrl = firstPresent("og:audio", "og:audio:url", "og:audio:secure_url") + media.copy( + url = audioUrl ?: media.url + ) + } + } + } + } ?: media } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/client/ktor/KtorSkraperClient.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/client/ktor/KtorSkraperClient.kt index a17a681f..a56d3b43 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/client/ktor/KtorSkraperClient.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/client/ktor/KtorSkraperClient.kt @@ -42,9 +42,7 @@ class KtorSkraperClient( this.method = HttpMethod.parse(method.name) headers .filterKeys { it !in UnsafeHeadersList } - .forEach { (k, v) -> - header(k, v) - } + .forEach { (k, v) -> header(k, v) } body?.let { this.body = ByteArrayContent( bytes = it, diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/consts/SkaperConstants.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/consts/SkaperConstants.kt index 6caa1f1c..77344c65 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/consts/SkaperConstants.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/consts/SkaperConstants.kt @@ -15,4 +15,5 @@ */ package ru.sokomishalov.skraper.internal.consts -internal const val DEFAULT_POSTS_LIMIT: Int = 50 \ No newline at end of file +const val DEFAULT_POSTS_LIMIT: Int = 50 +const val 
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36" \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt index 53d95c5e..c8806e77 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/jsoup/JsoupExtensions.kt @@ -17,6 +17,7 @@ package ru.sokomishalov.skraper.internal.jsoup +import org.jsoup.nodes.Document import org.jsoup.nodes.Element @PublishedApi @@ -44,6 +45,17 @@ internal inline fun Element.getFirstElementByAttributeValueContaining(name: Stri return getElementsByAttributeValueContaining(name, valuePart).firstOrNull() } +@PublishedApi +internal fun Document?.getMetaPropertyMap(): Map { + return this + ?.head() + ?.getElementsByTag("meta") + ?.filter { it.hasAttr("property") } + ?.map { it.attr("property") to it.attr("content") } + ?.toMap() + .orEmpty() +} + @PublishedApi internal fun Element.getStyleMap(): Map { return when { diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/map/MapExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/map/MapExtensions.kt new file mode 100644 index 00000000..93149feb --- /dev/null +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/map/MapExtensions.kt @@ -0,0 +1,26 @@ +/** + * Copyright (c) 2019-present Mikhael Sokolov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+@file:Suppress("NOTHING_TO_INLINE")
+
+package ru.sokomishalov.skraper.internal.map
+
+/**
+ * @author sokomishalov
+ */
+
+internal inline fun <K, V> Map<K, V>.firstPresent(vararg keys: K): V? {
+    return keys.mapNotNull { this[it] }.firstOrNull()
+}
\ No newline at end of file
diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/net/UrlExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/net/UrlExtensions.kt
index c23067bf..22965406 100644
--- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/net/UrlExtensions.kt
+++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/internal/net/UrlExtensions.kt
@@ -23,6 +23,8 @@ import java.io.DataOutputStream
 import java.net.HttpURLConnection
 import java.net.HttpURLConnection.*
 import java.net.URL
+import java.net.URLDecoder
+import kotlin.text.Charsets.UTF_8
 
 
 /**
@@ -58,6 +60,25 @@ internal suspend fun URL.request(
     }
 }
 
+/**
+ * Decoded query parameters of this url; empty when the url has no query part.
+ * A parameter without "=" maps to an empty string, a repeated key keeps its last value.
+ */
+val URL.queryParams: Map<String, String>
+    get() {
+        return query
+                .orEmpty()
+                .split("&")
+                .filter { it.isNotEmpty() }
+                .associate {
+                    // decode(String, String) is available on every JDK; the (String, Charset) overload needs JDK 10+
+                    val key = URLDecoder.decode(it.substringBefore("="), UTF_8.name())
+                    val value = URLDecoder.decode(it.substringAfter("=", ""), UTF_8.name())
+
+                    key to value
+                }
+    }
+
 private fun HttpURLConnection.applyData(
         method: HttpMethodType,
         headers: Map<String, String>,
diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/Media.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/Media.kt
index 93ae84e6..be650c41 100644
--- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/Media.kt
+++
b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/Media.kt @@ -39,11 +39,13 @@ data class Image( * Represents an image. * @property url video url * @property aspectRatio width to height ratio + * @property thumbnail thumb * @property duration video duration */ data class Video( override val url: URLString, val aspectRatio: Double? = null, + val thumbnail: Image? = null, val duration: Duration? = null ) : Media() diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/ModelExtensions.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/ModelExtensions.kt index 9ad82e41..c36aa88a 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/ModelExtensions.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/model/ModelExtensions.kt @@ -45,12 +45,7 @@ internal fun URLString.buildFullURL(path: String, queryParams: Map .filter { it.value != null } .map { "${it.key}=${it.value.toString().escapeUrl()}" } .fold(initial = "", operation = { acc, s -> "$acc&$s" }) - .let { - when { - it.isNotEmpty() -> "?${it}" - else -> it - } - } + .let { if (it.isNotEmpty()) "?${it}" else it } return baseUrlString + pathString + queryParamsString } \ No newline at end of file diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt index 1cc33adf..750578ed 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraper.kt @@ -22,13 +22,16 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.jsoup.* +import ru.sokomishalov.skraper.internal.net.queryParams import 
ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.getByPath import ru.sokomishalov.skraper.internal.serialization.getInt import ru.sokomishalov.skraper.internal.serialization.getString import ru.sokomishalov.skraper.internal.serialization.readJsonNodes import ru.sokomishalov.skraper.model.* +import java.net.URL /** @@ -91,6 +94,10 @@ class FacebookSkraper @JvmOverloads constructor( return client.fetchDocument(url = baseUrl.buildFullURL(path = path)) } + override suspend fun resolve(media: Media): Media { + return client.fetchOpenGraphMedia(media) + } + private fun JsonNode?.prepareMetaInfoMap(): Map { return this ?.get("pre_display_requires") @@ -224,8 +231,15 @@ class FacebookSkraper @JvmOverloads constructor( else -> getFirstElementByClass("uiScaledImageContainer") ?.getFirstElementByTag("img") ?.run { + val url = attr("src")?.let { + when { + "safe_image.php" in it -> URL(it).queryParams["url"] + else -> it + } + }.orEmpty() + listOf(Image( - url = attr("src"), + url = url, aspectRatio = attr("width").toDoubleOrNull() / attr("height").toDoubleOrNull() )) } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt index 8b48d248..f1344e9e 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraper.kt @@ -23,6 +23,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.jsoup.getBackgroundImageStyle import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass import ru.sokomishalov.skraper.internal.jsoup.getStyle @@ -110,6 +111,13 @@ 
class FlickrSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return when (media) { + is Image -> client.fetchOpenGraphMedia(media) + else -> media + } + } + private suspend fun getPage(path: String): Document? { return client.fetchDocument(url = baseUrl.buildFullURL(path = path)) } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt index 2df0b635..45652619 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraper.kt @@ -21,6 +21,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag import ru.sokomishalov.skraper.internal.serialization.getByPath import ru.sokomishalov.skraper.internal.serialization.getString @@ -75,6 +76,10 @@ class IFunnySkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return client.fetchOpenGraphMedia(media) + } + override suspend fun getPageInfo(path: String): PageInfo? 
{ val page = getPage(path = path) diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt index aab7771e..35799e10 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraper.kt @@ -20,6 +20,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchJson +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -31,7 +32,7 @@ import ru.sokomishalov.skraper.model.MediaSize.* */ class InstagramSkraper @JvmOverloads constructor( override val client: SkraperClient = DefaultBlockingSkraperClient, - private val apiQueryId: String = "17888483320059182", + private val gqlUserMediasQueryId: String = "17888483320059182", override val baseUrl: URLString = "https://instagram.com" ) : Skraper { @@ -58,6 +59,10 @@ class InstagramSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return client.fetchOpenGraphMedia(media) + } + private suspend fun getUserInfo(path: String): JsonNode? 
{ val json = client.fetchJson(url = baseUrl.buildFullURL( path = path, @@ -71,7 +76,7 @@ class InstagramSkraper @JvmOverloads constructor( val data = client.fetchJson(url = baseUrl.buildFullURL( path = "/graphql/query/", queryParams = mapOf( - "query_id" to apiQueryId, + "query_id" to gqlUserMediasQueryId, "id" to userId, "first" to limit ) @@ -83,48 +88,22 @@ class InstagramSkraper @JvmOverloads constructor( .orEmpty() return postsNodes.map { - Post( - id = it.extractPostId(), - text = it.extractPostCaption(), - publishedAt = it.extractPostPublishedAt(), - rating = it.extractPostLikesCount(), - viewsCount = it.extractPostViewsCount(), - commentsCount = it.extractPostCommentsCount(), - media = it.extractPostMediaItems() - ) + with(it) { + Post( + id = getString("id").orEmpty(), + text = getString("edge_media_to_caption.edges.0.node.text").orEmpty(), + publishedAt = getLong("taken_at_timestamp"), + rating = getInt("edge_media_preview_like.count"), + viewsCount = getInt("video_view_count"), + commentsCount = getInt("edge_media_to_comment.count"), + media = extractPostMediaItems() + ) + } } } - private fun JsonNode.extractPostId(): String { - return getString("id") - .orEmpty() - } - - private fun JsonNode.extractPostCaption(): String { - return getByPath("edge_media_to_caption.edges.0.node.text") - ?.asText() - .orEmpty() - } - - private fun JsonNode.extractPostPublishedAt(): Long? { - return getLong("taken_at_timestamp") - } - - private fun JsonNode.extractPostLikesCount(): Int? { - return getInt("edge_media_preview_like.count") - } - - private fun JsonNode.extractPostCommentsCount(): Int? { - return getInt("edge_media_to_comment.count") - } - - private fun JsonNode.extractPostViewsCount(): Int? 
{ - return getInt("video_view_count") - } - private fun JsonNode.extractPostMediaItems(): List { val isVideo = this["is_video"].asBoolean() - val aspectRatio = this["dimensions"]?.run { getDouble("width") / getDouble("height") } return listOf( diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt index 4a4fa7fd..ed83b8d8 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraper.kt @@ -21,6 +21,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.number.minus import ru.sokomishalov.skraper.internal.serialization.* @@ -44,14 +45,16 @@ class NinegagSkraper @JvmOverloads constructor( val posts = dataJson.getPosts().take(limit) return posts.map { p -> - Post( - id = p.getString("id").orEmpty(), - text = p.getString("title"), - publishedAt = p.getLong("creationTs"), - rating = p.getInt("upVoteCount") - p.getInt("downVoteCount"), - commentsCount = p.getInt("commentsCount"), - media = p.extractPostMediaItems() - ) + with(p) { + Post( + id = getString("id").orEmpty(), + text = getString("title"), + publishedAt = getLong("creationTs"), + rating = getInt("upVoteCount") - getInt("downVoteCount"), + commentsCount = getInt("commentsCount"), + media = extractPostMediaItems() + ) + } } } @@ -68,6 +71,10 @@ class NinegagSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return client.fetchOpenGraphMedia(media) + } + private suspend fun getUserPage(path: String): Document? 
{ return client.fetchDocument(url = baseUrl.buildFullURL(path = path)) } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt index 845470ce..46e436a7 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraper.kt @@ -45,24 +45,26 @@ class PikabuSkraper @JvmOverloads constructor( .orEmpty() return stories.map { - val storyBlocks = it.getElementsByClass("story-block") + with(it) { + val storyBlocks = getElementsByClass("story-block") - val title = it.extractPostTitle() - val text = storyBlocks.parseText() + val title = extractPostTitle() + val text = storyBlocks.parseText() - val caption = when { - text.isBlank() -> title - else -> "${title}\n\n${text}" - } + val caption = when { + text.isBlank() -> title + else -> "${title}\n\n${text}" + } - Post( - id = it.extractPostId(), - text = String(caption.toByteArray(UTF_8)), - publishedAt = it.extractPostPublishDate(), - rating = it.extractPostRating(), - commentsCount = it.extractPostCommentsCount(), - media = storyBlocks.extractPostMediaItems() - ) + Post( + id = extractPostId(), + text = String(caption.toByteArray(UTF_8)), + publishedAt = extractPostPublishDate(), + rating = extractPostRating(), + commentsCount = extractPostCommentsCount(), + media = storyBlocks.extractPostMediaItems() + ) + } } } @@ -149,8 +151,12 @@ class PikabuSkraper @JvmOverloads constructor( "story-block_type_video" in b.classNames() -> b .getFirstElementByAttributeValueContaining("data-type", "video") ?.run { + val ext = when { + attr("data-webm")?.toBoolean() ?: false -> ".webm" + else -> "" + } Video( - url = attr("data-source").orEmpty(), + url = "${attr("data-source")}$ext", aspectRatio = attr("data-ratio")?.toDoubleOrNull(), duration = attr("data-duration")?.toLongOrNull()?.let { 
Duration.ofSeconds(it) } ) diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt index ad992ab8..de87ff97 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraper.kt @@ -20,6 +20,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -72,6 +73,13 @@ class PinterestSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return when (media) { + is Image -> client.fetchOpenGraphMedia(media) + else -> media + } + } + private suspend fun getUserJson(path: String): JsonNode? 
{ val webPage = client.fetchDocument(baseUrl.buildFullURL(path = path)) val infoJson = webPage?.getElementById("initial-state")?.html() diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt index a1d5daaa..6a53bc34 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraper.kt @@ -22,6 +22,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchJson +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.number.div import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.model.* @@ -44,14 +45,16 @@ class RedditSkraper @JvmOverloads constructor( .mapNotNull { it["data"] } return posts.map { - Post( - id = it.getString("id").orEmpty(), - text = it.getString("title"), - publishedAt = it.getLong("created_utc"), - rating = it.getInt("score"), - commentsCount = it.getInt("num_comments"), - media = it.extractPostMediaItems() - ) + with(it) { + Post( + id = getString("id").orEmpty(), + text = getString("title"), + publishedAt = getLong("created_utc"), + rating = getInt("score"), + commentsCount = getInt("num_comments"), + media = extractPostMediaItems() + ) + } } } @@ -82,6 +85,13 @@ class RedditSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return when (media) { + is Image -> client.fetchOpenGraphMedia(media) + else -> media + } + } + private fun JsonNode.extractPostMediaItems(): List { val isVideo = this["media"].isEmpty.not() val url = getString("url").orEmpty() diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperExtensions.kt 
b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperExtensions.kt index 6f7e2633..07135ab2 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperExtensions.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperExtensions.kt @@ -25,7 +25,7 @@ import ru.sokomishalov.skraper.model.Post */ suspend fun RedditSkraper.getCommunityHotPosts(community: String, limit: Int = DEFAULT_POSTS_LIMIT): List { - return getPosts(path = "/r/${community.removePrefix("r/")}/${""}", limit = limit) + return getPosts(path = "/r/${community.removePrefix("r/")}/", limit = limit) } suspend fun RedditSkraper.getCommunityNewPosts(community: String, limit: Int = DEFAULT_POSTS_LIMIT): List { @@ -41,7 +41,7 @@ suspend fun RedditSkraper.getCommunityControversialPosts(community: String, limi } suspend fun RedditSkraper.getCommunityTopPosts(community: String, limit: Int = DEFAULT_POSTS_LIMIT): List { - return getPosts(path = "/r/${community.removePrefix("r/")}/${"top"}", limit = limit) + return getPosts(path = "/r/${community.removePrefix("r/")}/top", limit = limit) } suspend fun RedditSkraper.getUserInfo(username: String): PageInfo? 
{ diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt index 34652874..ba3fc1c6 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraper.kt @@ -23,6 +23,7 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttributeValue import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByTag @@ -51,6 +52,10 @@ class TumblrSkraper @JvmOverloads constructor( return page.extractPageInfo() } + override suspend fun resolve(media: Media): Media { + return client.fetchOpenGraphMedia(media) + } + internal suspend fun getUserPage(username: String): Document? 
{ return client.fetchDocument(url = baseUrl.replace("://", "://${username}.")) } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt index e4db01c3..6d21096d 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitch/TwitchSkraper.kt @@ -23,6 +23,7 @@ import ru.sokomishalov.skraper.client.HttpMethodType.POST import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument import ru.sokomishalov.skraper.fetchJson +import ru.sokomishalov.skraper.internal.consts.DEFAULT_USER_AGENT import ru.sokomishalov.skraper.internal.serialization.* import ru.sokomishalov.skraper.internal.string.unescapeUrl import ru.sokomishalov.skraper.model.* @@ -243,7 +244,8 @@ class TwitchSkraper @JvmOverloads constructor( method = POST, headers = mapOf( "Client-ID" to clientId, - "Accept-Language" to "en-US" + "Accept-Language" to "en-US", + "User-Agent" to DEFAULT_USER_AGENT ), body = "{ \"query\": \"${query.replace("\n", " ").replace("\"", "\\\"")}\" }".toByteArray(UTF_8) ) diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt index 16e3c9b6..147d907c 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/twitter/TwitterSkraper.kt @@ -51,14 +51,16 @@ class TwitterSkraper @JvmOverloads constructor( .orEmpty() return posts.map { - Post( - id = it.extractTweetId(), - text = it.extractTweetText(), - rating = it.extractTweetLikes(), - commentsCount = it.extractTweetReplies(), - publishedAt = it.extractTweetPublishDate(), - media = it.extractTweetMediaItems() - ) + with(it) { + Post( + 
id = extractTweetId(), + text = extractTweetText(), + rating = extractTweetLikes(), + commentsCount = extractTweetReplies(), + publishedAt = extractTweetPublishDate(), + media = extractTweetMediaItems() + ) + } } } @@ -86,9 +88,8 @@ class TwitterSkraper @JvmOverloads constructor( return client.fetchDocument(url = baseUrl.buildFullURL(path = path)) } - private fun Document?.extractJsonData(): JsonNode? { - return this - ?.getElementById("init-data") + private fun Document.extractJsonData(): JsonNode? { + return getElementById("init-data") ?.attr("value") ?.readJsonNodes() } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt index f2d49fbb..0f03797a 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraper.kt @@ -23,6 +23,8 @@ import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument +import ru.sokomishalov.skraper.fetchOpenGraphMedia +import ru.sokomishalov.skraper.internal.consts.DEFAULT_USER_AGENT import ru.sokomishalov.skraper.internal.jsoup.* import ru.sokomishalov.skraper.model.* import java.time.LocalDate @@ -78,10 +80,20 @@ class VkSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return when (media) { + is Video -> client.fetchOpenGraphMedia(media) + else -> media + } + } + private suspend fun getUserPage(path: String): Document? 
{ return client.fetchDocument( url = baseUrl.buildFullURL(path = path), - headers = mapOf("Accept-Language" to "en-US") + headers = mapOf( + "Accept-Language" to "en-US", + "User-Agent" to DEFAULT_USER_AGENT + ) ) } @@ -163,25 +175,31 @@ class VkSkraper @JvmOverloads constructor( private fun Element.extractPostMediaItems(): List { val thumbElement = getFirstElementByClass("thumbs_map_helper") + val aspectRatio = thumbElement + ?.getStyle("padding-top") + ?.removeSuffix("%") + ?.toDoubleOrNull() + ?.run { 100 / this } + return thumbElement - ?.getElementsByClass("thumb_map_img") + ?.getElementsByTag("a") ?.mapNotNull { - val isVideo = it.attr("data-video").isNotBlank() - val aspectRatio = thumbElement - .getStyle("padding-top") - ?.removeSuffix("%") - ?.toDoubleOrNull() - ?.run { 100 / this } - - when { - isVideo -> Video( - url = "${baseUrl}${it.attr("href")}", - aspectRatio = aspectRatio - ) - else -> Image( - url = it.getBackgroundImageStyle(), - aspectRatio = aspectRatio - ) + with(it) { + val isVideo = attr("href").startsWith("/video") + val hrefLink = "${baseUrl}${attr("href")}" + + when { + isVideo -> Video( + url = hrefLink, + aspectRatio = aspectRatio + ) + else -> Image( + url = getFirstElementByClass("thumb_map_img") + ?.getBackgroundImageStyle() + ?: hrefLink, + aspectRatio = aspectRatio + ) + } } } .orEmpty() @@ -247,6 +265,5 @@ class VkSkraper @JvmOverloads constructor( .appendPattern("d MMM yyyy") .parseLenient() .toFormatter(ENGLISH) - } } diff --git a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt index 58f63676..41f94cf4 100644 --- a/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt +++ b/skrapers/src/main/kotlin/ru/sokomishalov/skraper/provider/youtube/YoutubeSkraper.kt @@ -15,6 +15,9 @@ */ package ru.sokomishalov.skraper.provider.youtube +import com.github.kiulian.downloader.YoutubeDownloader 
+import kotlinx.coroutines.Dispatchers.IO +import kotlinx.coroutines.withContext import org.jsoup.nodes.Document import org.jsoup.nodes.Element import ru.sokomishalov.skraper.Skraper @@ -23,8 +26,10 @@ import ru.sokomishalov.skraper.client.jdk.DefaultBlockingSkraperClient import ru.sokomishalov.skraper.fetchDocument import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByAttributeValue import ru.sokomishalov.skraper.internal.jsoup.getFirstElementByClass +import ru.sokomishalov.skraper.internal.net.queryParams import ru.sokomishalov.skraper.model.* import ru.sokomishalov.skraper.model.MediaSize.* +import java.net.URL import java.time.Duration import java.time.Duration.ZERO import java.time.Period @@ -37,6 +42,11 @@ class YoutubeSkraper @JvmOverloads constructor( override val baseUrl: URLString = "https://www.youtube.com" ) : Skraper { + companion object { + @JvmStatic + val HOSTS = listOf("www.youtube.com", "youtube.com", "youtu.be") + } + override suspend fun getPosts(path: String, limit: Int): List { val page = getUserPage(path = path) @@ -46,15 +56,15 @@ class YoutubeSkraper @JvmOverloads constructor( .orEmpty() return videos.map { - val linkElement = it.getFirstElementByClass("yt-uix-tile-link") - - Post( - id = linkElement.extractPostId(), - text = linkElement.extractPostCaption(), - viewsCount = it.extractPostViewsCount(), - publishedAt = it.extractPostPublishDate(), - media = it.extractPostVideos() - ) + with(it) { + Post( + id = extractPostId(), + text = extractPostCaption(), + viewsCount = extractPostViewsCount(), + publishedAt = extractPostPublishDate(), + media = extractPostVideos() + ) + } } } @@ -73,6 +83,23 @@ class YoutubeSkraper @JvmOverloads constructor( } } + override suspend fun resolve(media: Media): Media { + return when (media) { + is Video -> withContext(IO) { + val mediaUrl = URL(media.url) + + val id = mediaUrl.queryParams["v"] ?: mediaUrl.path.substringAfterLast("/") + val video = YoutubeDownloader().getVideo(id) + val format = 
video.videoWithAudioFormats().firstOrNull() + + val url = format?.url() ?: media.url + + Video(url = url) + } + else -> media + } + } + private suspend fun getUserPage(path: String): Document? { return client.fetchDocument(url = baseUrl.buildFullURL( path = path, @@ -82,6 +109,7 @@ class YoutubeSkraper @JvmOverloads constructor( private fun Element?.extractPostId(): String { return this + ?.getFirstElementByClass("yt-uix-tile-link") ?.attr("href") ?.substringAfter("/watch?v=") .orEmpty() @@ -89,6 +117,7 @@ class YoutubeSkraper @JvmOverloads constructor( private fun Element?.extractPostCaption(): String { return this + ?.getFirstElementByClass("yt-uix-tile-link") ?.attr("title") .orEmpty() } @@ -111,23 +140,23 @@ class YoutubeSkraper @JvmOverloads constructor( ?.getElementsByTag("li") ?.getOrNull(0) ?.wholeText() - ?.let { + ?.run { val now = currentUnixTimestamp() - val amount = it.split(" ") + val amount = split(" ") .firstOrNull() ?.toIntOrNull() ?: 1 val temporalAmount: TemporalAmount = when { - it.contains("moment", ignoreCase = true) -> Duration.ofMillis(amount.toLong()) - it.contains("second", ignoreCase = true) -> Duration.ofSeconds(amount.toLong()) - it.contains("minute", ignoreCase = true) -> Duration.ofMinutes(amount.toLong()) - it.contains("hour", ignoreCase = true) -> Duration.ofHours(amount.toLong()) - it.contains("day", ignoreCase = true) -> Duration.ofDays(amount.toLong()) - it.contains("week", ignoreCase = true) -> Period.ofWeeks(amount) - it.contains("month", ignoreCase = true) -> Period.ofMonths(amount) - it.contains("year", ignoreCase = true) -> Period.ofYears(amount) + contains("moment", ignoreCase = true) -> Duration.ofMillis(amount.toLong()) + contains("second", ignoreCase = true) -> Duration.ofSeconds(amount.toLong()) + contains("minute", ignoreCase = true) -> Duration.ofMinutes(amount.toLong()) + contains("hour", ignoreCase = true) -> Duration.ofHours(amount.toLong()) + contains("day", ignoreCase = true) -> Duration.ofDays(amount.toLong()) + 
contains("week", ignoreCase = true) -> Period.ofWeeks(amount) + contains("month", ignoreCase = true) -> Period.ofMonths(amount) + contains("year", ignoreCase = true) -> Period.ofYears(amount) else -> ZERO } val millisAgo = when (temporalAmount) { @@ -187,12 +216,12 @@ class YoutubeSkraper @JvmOverloads constructor( return this .getFirstElementByClass("yt-subscription-button-subscriber-count-branded-horizontal") ?.wholeText() - ?.let { + ?.run { when { - it.endsWith("K") -> it.replace("K", "").replace(".", "").toIntOrNull()?.times(1_000) - it.endsWith("M", ignoreCase = true) -> it.replace("M", "").replace(".", "").toIntOrNull()?.times(1_000_000) - it.endsWith("B", ignoreCase = true) -> it.replace("B", "").replace(".", "").toIntOrNull()?.times(1_000_000_000) - else -> it.replace(".", "").toIntOrNull() + endsWith("K") -> replace("K", "").replace(".", "").toIntOrNull()?.times(1_000) + endsWith("M", ignoreCase = true) -> replace("M", "").replace(".", "").toIntOrNull()?.times(1_000_000) + endsWith("B", ignoreCase = true) -> replace("B", "").replace(".", "").toIntOrNull()?.times(1_000_000_000) + else -> replace(".", "").toIntOrNull() } } } diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/SkraperTck.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/SkraperTck.kt index 7c750c6c..61d98326 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/SkraperTck.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/SkraperTck.kt @@ -28,9 +28,11 @@ import org.slf4j.LoggerFactory import ru.sokomishalov.skraper.Skraper import ru.sokomishalov.skraper.SkraperClient import ru.sokomishalov.skraper.client.ktor.KtorSkraperClient +import ru.sokomishalov.skraper.model.Media import ru.sokomishalov.skraper.model.PageInfo import ru.sokomishalov.skraper.model.Post import ru.sokomishalov.skraper.model.ProviderInfo +import kotlin.test.assertNotEquals import kotlin.test.assertNotNull import kotlin.test.assertTrue @@ -104,7 +106,14 @@ 
abstract class SkraperTck { } } - private suspend fun logAction(action: suspend Skraper.() -> T): T { + protected fun assertMediaResolved(media: Media) = runBlocking { + val resolved = logAction { skraper.resolve(media) } + assertNotNull(resolved) + assertNotNull(resolved.url) + assertNotEquals(media.url, resolved.url) + } + + protected suspend fun logAction(action: suspend Skraper.() -> T): T { return skraper.action().also { log.info(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(it)) } diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraperTest.kt index 2baf3329..9162c33b 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/facebook/FacebookSkraperTest.kt @@ -16,6 +16,8 @@ package ru.sokomishalov.skraper.provider.facebook import org.junit.Test +import ru.sokomishalov.skraper.model.Image +import ru.sokomishalov.skraper.model.Video import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -46,4 +48,10 @@ class FacebookSkraperTest : SkraperTck() { fun `Check user page info`() { assertPageInfo { skraper.getUserInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Video("https://www.facebook.com/UKGuff/videos/216184746137024/")) + assertMediaResolved(Image("https://www.facebook.com/memes/photos/a.527860673898191/5283528444998033/")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraperTest.kt index 5d07f64b..1b08e095 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/flickr/FlickrSkraperTest.kt @@ -16,6 
+16,7 @@ package ru.sokomishalov.skraper.provider.flickr import org.junit.Test +import ru.sokomishalov.skraper.model.Image import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -41,4 +42,9 @@ class FlickrSkraperTest : SkraperTck() { fun `Check user info`() { assertPageInfo { skraper.getUserPageInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Image("https://www.flickr.com/photos/harrythehawk/49711484733/")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraperTest.kt index b2431777..45851751 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ifunny/IFunnySkraperTest.kt @@ -16,6 +16,8 @@ package ru.sokomishalov.skraper.provider.ifunny import org.junit.Test +import ru.sokomishalov.skraper.model.Image +import ru.sokomishalov.skraper.model.Video import ru.sokomishalov.skraper.provider.SkraperTck @@ -42,4 +44,10 @@ class IFunnySkraperTest : SkraperTck() { fun `Check user info`() { assertPageInfo { skraper.getUserInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Video("https://ifunny.co/video/ySMWOp7Y7?gallery=featured")) + assertMediaResolved(Image("https://ifunny.co/picture/VmxhClHY7?gallery=featured")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraperTest.kt index 13808f6a..319f3f3b 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/instagram/InstagramSkraperTest.kt @@ -16,6 +16,8 @@ package 
ru.sokomishalov.skraper.provider.instagram import org.junit.Test +import ru.sokomishalov.skraper.model.Image +import ru.sokomishalov.skraper.model.Video import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -35,4 +37,10 @@ class InstagramSkraperTest : SkraperTck() { fun `Check user info`() { assertPageInfo { skraper.getUserInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Video("https://www.instagram.com/p/B-flad2F5o7/")) + assertMediaResolved(Image("https://www.instagram.com/p/B-gwQJelNjs/")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraperTest.kt index 7926c02f..e31b1206 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/ninegag/NinegagSkraperTest.kt @@ -16,6 +16,7 @@ package ru.sokomishalov.skraper.provider.ninegag import org.junit.Test +import ru.sokomishalov.skraper.model.Image import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -61,4 +62,11 @@ class NinegagSkraperTest : SkraperTck() { fun `Check topic info`() { assertPageInfo { skraper.getTopicInfo(topic = topic) } } + + @Test + fun `Check media resolving`() { + // FIXME + // assertMediaResolved(Video("https://9gag.com/gag/a9RxgGZ")) + assertMediaResolved(Image("https://9gag.com/gag/aQ1LGEq")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraperTest.kt index 09e908a6..e1aeacfc 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pikabu/PikabuSkraperTest.kt @@ -79,5 +79,4 @@ class PikabuSkraperTest : 
SkraperTck() { fun `Check community info`() { assertPageInfo { skraper.getCommunityInfo(community = community) } } - } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraperTest.kt index c8e3fd45..d89c1102 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/pinterest/PinterestSkraperTest.kt @@ -16,6 +16,7 @@ package ru.sokomishalov.skraper.provider.pinterest import org.junit.Test +import ru.sokomishalov.skraper.model.Image import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -36,4 +37,9 @@ class PinterestSkraperTest : SkraperTck() { fun `Check user info`() { assertPageInfo { skraper.getUserInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Image("https://www.pinterest.ru/pin/89509111320495523/")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperTest.kt index 62086998..0ddbbee0 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/reddit/RedditSkraperTest.kt @@ -16,6 +16,7 @@ package ru.sokomishalov.skraper.provider.reddit import org.junit.Test +import ru.sokomishalov.skraper.model.Image import ru.sokomishalov.skraper.provider.SkraperTck /** @@ -62,4 +63,9 @@ class RedditSkraperTest : SkraperTck() { fun `Check community info`() { assertPageInfo { skraper.getCommunityInfo(community = community) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Image("https://www.reddit.com/r/memes/comments/fu78mt/assuming_birds_are_real/")) + } } \ No newline at end of 
file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraperTest.kt index a49a35b6..a3eec96c 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/tumblr/TumblrSkraperTest.kt @@ -16,6 +16,8 @@ package ru.sokomishalov.skraper.provider.tumblr import org.junit.Test +import ru.sokomishalov.skraper.model.Image +import ru.sokomishalov.skraper.model.Video import ru.sokomishalov.skraper.provider.SkraperTck class TumblrSkraperTest : SkraperTck() { @@ -32,4 +34,10 @@ class TumblrSkraperTest : SkraperTck() { fun `Check user info`() { assertPageInfo { skraper.getUserInfo(username = username) } } + + @Test + fun `Check media resolving`() { + assertMediaResolved(Image("https://memegeeks.tumblr.com/image/614390937539493888")) + assertMediaResolved(Video("https://sadiiomane10.tumblr.com/post/611939046726483968/james-milners-goal-line-clearance")) + } } \ No newline at end of file diff --git a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraperTest.kt b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraperTest.kt index 32287978..d0e9bad7 100644 --- a/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraperTest.kt +++ b/skrapers/src/test/kotlin/ru/sokomishalov/skraper/provider/vk/VkSkraperTest.kt @@ -26,7 +26,8 @@ class VkSkraperTest : SkraperTck() { override val skraper: VkSkraper = VkSkraper(client = client) override val path: String = "/durov" private val username: String = "durov" - private val community: String = "komment" +// private val community: String = "komment" + private val community: String = "vlentach" @Test fun `Check user posts`() {