Skip to content

Commit

Permalink
option --media-only for the cli
Browse files Browse the repository at this point in the history
  • Loading branch information
sokomishalov committed Apr 3, 2020
1 parent ce6b6b8 commit 2b7d7e6
Show file tree
Hide file tree
Showing 42 changed files with 655 additions and 223 deletions.
50 changes: 39 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,26 +48,54 @@ Usage:
```

```text
usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT]
usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT] [-m]
[--parallel-downloads PARALLEL_DOWNLOADS]
optional arguments:
-h, --help show this help message and exit
-h, --help show this help message and exit
-n LIMIT, posts limit (50 by default)
--limit LIMIT
-n LIMIT, --limit LIMIT posts limit (50 by default)
-t TYPE, output type, options: [log, csv, json, xml, yaml]
--type TYPE
-t TYPE, --type TYPE output type, options: [log, csv, json, xml, yaml]
-o OUTPUT, output path
--output OUTPUT
-o OUTPUT, --output OUTPUT output path
-m, --media-only scrape media only
--parallel-downloads PARALLEL_DOWNLOADS amount of parallel downloads for media items if
enabled flag --media-only (4 by default)
positional arguments:
PROVIDER skraper provider, options: [facebook, instagram,
twitter, youtube, twitch, reddit, ninegag, pinterest,
flickr, tumblr, ifunny, vk, pikabu]
PATH path to user/community/channel/topic/trend
usage: [-h] PROVIDER PATH [-n LIMIT] [-t TYPE] [-o OUTPUT] [-m]
[--parallel-downloads PARALLEL_DOWNLOADS]
optional arguments:
-h, --help show this help message and exit
-n LIMIT, --limit LIMIT posts limit (50 by default)
-t TYPE, --type TYPE output type, options: [log, csv, json, xml, yaml]
-o OUTPUT, --output OUTPUT output path
-m, --media-only scrape media only
--parallel-downloads PARALLEL_DOWNLOADS amount of parallel downloads for media items if
enabled flag --media-only (4 by default)
positional arguments:
PROVIDER skraper provider, options: [facebook, instagram, twitter, youtube, twitch, reddit,
ninegag, pinterest, flickr, tumblr, ifunny, vk, pikabu]
PROVIDER skraper provider, options: [facebook, instagram,
twitter, youtube, twitch, reddit, ninegag, pinterest,
flickr, tumblr, ifunny, vk, pikabu]
PATH path to user/community/channel/topic/trend
PATH path to user/community/channel/topic/trend
```

Examples:
Expand Down
6 changes: 3 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
### TODO list
- [ ] Thumbnails to the videos
- [ ] Add option to the cli tool to download media
### Roadmap
- [ ] Telegram bot
- [ ] Client gui
- [ ] Replace java.time.* (JDK 1.8) with a date-time API available on lower JDK versions
to be more Android-friendly.
- [ ] Implement [LinkedIn](https://linkedin.com) - branch (origin/feature/linkedin)
- [ ] Implement [Snapchat stories](https://story.snapchat.com/) - branch (origin/feature/snapchat)
- [ ] Implement [Imgur](https://imgur.com/)
- [ ] Implement [Tiktok](https://tiktok.com) - branch (origin/feature/tiktok)
186 changes: 149 additions & 37 deletions cli/src/main/kotlin/ru/sokomishalov/skraper/cli/Main.kt
Original file line number Diff line number Diff line change
Expand Up @@ -33,69 +33,114 @@ import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule
import com.fasterxml.jackson.module.kotlin.registerKotlinModule
import com.xenomachina.argparser.ArgParser
import com.xenomachina.argparser.mainBody
import kotlinx.coroutines.asCoroutineDispatcher
import kotlinx.coroutines.async
import kotlinx.coroutines.awaitAll
import kotlinx.coroutines.runBlocking
import ru.sokomishalov.skraper.cli.OutputType.*
import ru.sokomishalov.skraper.model.Post
import ru.sokomishalov.skraper.Skraper
import ru.sokomishalov.skraper.cli.model.Args
import ru.sokomishalov.skraper.cli.model.OutputType.*
import ru.sokomishalov.skraper.cli.model.Provider
import ru.sokomishalov.skraper.model.*
import ru.sokomishalov.skraper.provider.youtube.YoutubeSkraper
import java.io.File
import java.io.File.separator
import java.net.URL
import java.nio.channels.Channels
import java.time.LocalDateTime.now
import java.time.format.DateTimeFormatter.ofPattern
import java.util.*
import java.util.concurrent.Executors
import kotlin.system.exitProcess
import kotlin.text.Charsets.UTF_8

fun main(args: Array<String>) = mainBody(columns = 150) {
val parsedArgs = ArgParser(
args = args.ifEmpty { arrayOf("--help") }
).parseInto(::Args)

fun main(args: Array<String>) = mainBody(columns = 100) {
val parsedArgs = ArgParser(args = args.ifEmpty { arrayOf("--help") }).parseInto(::Args)

println("${"Skraper".green()} ${"v.0.3.0".magenta()} started")

val posts = runBlocking {
parsedArgs.provider.skraper.getPosts(
path = "/${parsedArgs.path.removeSuffix("/")}",
parsedArgs.skraper.getPosts(
path = "/${parsedArgs.path.removePrefix("/")}",
limit = parsedArgs.amount
)
}

when {
parsedArgs.onlyMedia -> posts.persistMedia(parsedArgs)
else -> posts.persistMeta(parsedArgs)
}
}

/**
 * Downloads every media item of the scraped posts into `<root>/<provider>/<path>`.
 *
 * `<root>` is the `--output` location itself, or its parent directory when `--output`
 * points at an existing regular file. Downloads are started concurrently on a fixed
 * thread pool sized by `--parallel-downloads`. Once all of them have completed the pool
 * is shut down and the process exits with status 0.
 *
 * @param parsedArgs parsed command line arguments (skraper, path, output, parallelism)
 */
private fun List<Post>.persistMedia(parsedArgs: Args) {
    val provider = parsedArgs.skraper.javaClass.simpleName.toString().toLowerCase().replace("skraper", "")
    val requestedPath = parsedArgs.path

    // Resolve to an absolute file first: a bare relative name (e.g. "out.json") has a null parentFile.
    val output = parsedArgs.output.absoluteFile
    val root = when {
        output.isFile -> output.parentFile.absolutePath
        else -> output.absolutePath
    }
    val targetDir = File("${root}/${provider}/${requestedPath}").apply { mkdirs() }

    // newFixedThreadPool throws on non-positive sizes, so always keep at least one worker.
    val workers = parsedArgs.parallelDownloads.coerceAtLeast(1)

    // `use` closes the dispatcher (and its executor) once all downloads have completed.
    Executors.newFixedThreadPool(workers).asCoroutineDispatcher().use { dispatcher ->
        runBlocking(context = dispatcher) {
            this@persistMedia
                .flatMap { post ->
                    post.media.map { media ->
                        async {
                            parsedArgs.skraper.download(
                                post = post,
                                media = media,
                                targetDir = targetDir
                            )
                        }
                    }
                }
                .awaitAll()
        }
    }

    // Terminate explicitly, but with a success status: reaching this point is not an error.
    exitProcess(0)
}

private fun List<Post>.persistMeta(parsedArgs: Args) {
val provider = parsedArgs.skraper.javaClass.simpleName.toString().replace("Skraper", "").toLowerCase()
val requestedPath = parsedArgs.path

val content = when (parsedArgs.outputType) {
LOG -> posts
.joinToString("\n") { it.toString() }
LOG -> joinToString("\n") { it.toString() }
.also { println(it) }
JSON -> JsonMapper()
.registerModule(JavaTimeModule())
.registerModule(Jdk8Module())
.writerWithDefaultPrettyPrinter()
.writeValueAsString(posts)
.writeValueAsString(this)
XML -> XmlMapper()
.registerModule(JavaTimeModule())
.registerModule(Jdk8Module())
.writerWithDefaultPrettyPrinter()
.writeValueAsString(posts)
.writeValueAsString(this)
YAML -> YAMLMapper()
.registerModule(JavaTimeModule())
.registerModule(Jdk8Module())
.writerWithDefaultPrettyPrinter()
.writeValueAsString(posts)
.writeValueAsString(this)
CSV -> {
CsvMapper()
.apply {
registerKotlinModule()
registerModule(JavaTimeModule())
registerModule(Jdk8Module())
registerModule(SimpleModule().apply {
addSerializer(Post::class.java, object : JsonSerializer<Post>() {
override fun serialize(item: Post, jgen: JsonGenerator, serializerProvider: SerializerProvider) {
jgen.writeStartObject()
jgen.writeStringField("ID", item.id)
jgen.writeStringField("Text", item.text)
jgen.writeStringField("Published at", item.publishedAt?.toString(10))
jgen.writeStringField("Rating", item.rating?.toString(10).orEmpty())
jgen.writeStringField("Comments count", item.commentsCount?.toString(10).orEmpty())
jgen.writeStringField("Views count", item.viewsCount?.toString(10).orEmpty())
jgen.writeStringField("Media", item.media.joinToString(" ") { it.url })
jgen.writeEndObject()
}
})
.registerKotlinModule()
.registerModule(JavaTimeModule())
.registerModule(Jdk8Module())
.registerModule(SimpleModule().apply {
addSerializer(Post::class.java, object : JsonSerializer<Post>() {
override fun serialize(item: Post, jgen: JsonGenerator, serializerProvider: SerializerProvider) {
jgen.writeStartObject()
jgen.writeStringField("ID", item.id)
jgen.writeStringField("Text", item.text)
jgen.writeStringField("Published at", item.publishedAt?.toString(10))
jgen.writeStringField("Rating", item.rating?.toString(10).orEmpty())
jgen.writeStringField("Comments count", item.commentsCount?.toString(10).orEmpty())
jgen.writeStringField("Views count", item.viewsCount?.toString(10).orEmpty())
jgen.writeStringField("Media", item.media.joinToString(" ") { it.url })
jgen.writeEndObject()
}
})
}
})
.writer(CsvSchema
.builder()
.addColumn("ID")
Expand All @@ -108,7 +153,7 @@ fun main(args: Array<String>) = mainBody(columns = 150) {
.build()
.withHeader()
)
.writeValueAsString(posts)
.writeValueAsString(this)

}
}
Expand All @@ -117,17 +162,84 @@ fun main(args: Array<String>) = mainBody(columns = 150) {
parsedArgs.output.isFile -> parsedArgs.output
else -> {
val root = parsedArgs.output.absolutePath
val provider = parsedArgs.provider.toString().toLowerCase()
val requestedPath = parsedArgs.path
val now = now().format(ofPattern("ddMMyyyy'_'hhmmss"))
val ext = parsedArgs.outputType.extension

File("${root}/${provider}/${requestedPath}_${now}${ext}")
File("${root}/${provider}/${requestedPath}_${now}.${ext}")
}
}

fileToWrite
.apply { parentFile.mkdirs() }
.also { println("Fetched ${posts.size.toString().green()} posts. Saved to: ${it.path.cyan()}") }
.writeText(text = content, charset = UTF_8)

println(fileToWrite.path.cyan())
}

/**
 * Downloads a single [media] item of [post] into [targetDir] and prints the outcome.
 *
 * The direct url and the local file name are chosen as follows:
 * - the last path segment of the media url has an extension: the url is used as is;
 * - the host is one of [YoutubeSkraper.HOSTS]: the media is resolved through the youtube
 *   skraper and the file is named after the post text (or a random UUID when the text is
 *   missing or blank), with an `.mp4` extension;
 * - otherwise the media is resolved through this skraper.
 *
 * A failed transfer is reported on stdout together with its cause and is not rethrown.
 *
 * @param post post the media belongs to; its text is used to name youtube videos
 * @param media media item to download
 * @param targetDir directory the file is written to
 */
@Suppress("BlockingMethodInNonBlockingContext")
private suspend fun Skraper.download(post: Post, media: Media, targetDir: File) {
    val mediaUrl = URL(media.url)

    val (directMediaUrl, filename) = when {
        // has some possible extension
        mediaUrl.path.substringAfterLast("/").substringAfterLast(".", "").isNotEmpty() ->
            mediaUrl to media.extractFileName()

        // youtube video
        mediaUrl.host in YoutubeSkraper.HOSTS -> {
            val resolved = Provider.YOUTUBE.skraper.resolve(media)
            // Post text is free-form: replace characters that are illegal in file names
            // (path separators, control characters, ...) and ignore blank text.
            val name = post.text
                ?.replace(Regex("""[\\/:*?"<>|\p{Cntrl}]"""), "_")
                ?.trim()
                ?.takeIf { it.isNotEmpty() }
                ?: UUID.randomUUID().toString()

            URL(resolved.url) to "${name.abbreviate()}.mp4"
        }

        // otherwise
        else -> resolve(media).run { URL(url) to extractFileName() }
    }

    val targetFile = File("${targetDir.absolutePath}$separator${filename}")
    runCatching {
        Channels.newChannel(directMediaUrl.openStream()).use { rbc ->
            targetFile.outputStream().use { fos ->
                fos.channel.transferFrom(rbc, 0, Long.MAX_VALUE)
            }
        }
    }.onSuccess {
        println(targetFile.absolutePath)
    }.onFailure { error ->
        // Keep going with the remaining downloads, but tell the user why this one failed.
        println("Cannot download $directMediaUrl: ${error.message ?: error.javaClass.simpleName}")
    }
}


/**
 * Builds a local file name of the form `<name>.<extension>` from the last path segment
 * of the media url.
 *
 * When that segment carries no extension, a default is chosen by media kind:
 * `png` for [Image], `mp4` for [Video] and `mp3` for [Audio].
 */
private fun Media.extractFileName(): String {
    // Only the last path segment is a file name; a dot in a parent directory
    // (e.g. "/v1.2/image") must not be mistaken for an extension separator.
    val lastSegment = URL(url).path.substringAfterLast("/")

    val filenameWithoutExtension = lastSegment.substringBeforeLast(".")

    val defaultExtension = when (this) {
        is Image -> "png"
        is Video -> "mp4"
        is Audio -> "mp3"
    }

    // A missing extension or a trailing dot ("file.") both fall back to the default.
    val extension = lastSegment.substringAfterLast(".", "").ifEmpty { defaultExtension }

    return "${filenameWithoutExtension}.${extension}"
}

/**
 * Shortens the string to at most [maxLength] characters.
 *
 * Strings that already fit are returned unchanged. Longer ones are cut and terminated
 * with `...`, so the result is exactly [maxLength] characters long. When [maxLength]
 * leaves no room for the ellipsis (3 or less), the string is simply truncated.
 *
 * @param maxLength maximum length of the result, ellipsis included
 * @return the original or the abbreviated string
 */
private fun String.abbreviate(maxLength: Int = 100): String {
    return when {
        length <= maxLength -> this
        maxLength <= 3 -> take(maxLength.coerceAtLeast(0))
        else -> "${take(maxLength - 3)}..."
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package ru.sokomishalov.skraper.cli
package ru.sokomishalov.skraper.cli.model

import com.xenomachina.argparser.ArgParser
import com.xenomachina.argparser.default
import ru.sokomishalov.skraper.cli.OutputType.LOG
import ru.sokomishalov.skraper.cli.model.OutputType.LOG
import java.io.File

class Args(parser: ArgParser) {
val provider by parser.positional(
val skraper by parser.positional(
name = "PROVIDER",
help = "skraper provider, options: ${Provider.values().contentToString().toLowerCase()}"
) { Provider.valueOf(toUpperCase()) }
) { Provider.valueOf(toUpperCase()).skraper }

val path by parser.positional(
name = "PATH",
Expand All @@ -34,15 +34,25 @@ class Args(parser: ArgParser) {
val amount by parser.storing(
"-n", "--limit",
help = "posts limit (50 by default)"
) { toInt() }.default(50)
) { toInt() }.default { 50 }

val outputType by parser.storing(
"-t", "--type",
help = "output type, options: ${OutputType.values().contentToString().toLowerCase()}"
) { OutputType.valueOf(toUpperCase()) }.default(LOG)
) { OutputType.valueOf(toUpperCase()) }.default { LOG }

val output by parser.storing(
"-o", "--output",
help = "output path"
) { File(this) }.default { File("") }

val onlyMedia by parser.flagging(
"-m", "--media-only",
help = "scrape media only"
)

val parallelDownloads by parser.storing(
"--parallel-downloads",
help = "amount of parallel downloads for media items if enabled flag --media-only (4 by default)"
) { toInt() }.default { 4 }
}
Loading

0 comments on commit 2b7d7e6

Please sign in to comment.