From 83fc3805554e12aea8665a767c5bc892e214f6e1 Mon Sep 17 00:00:00 2001 From: Johannes Meyer zum Alten Borgloh Date: Mon, 17 Sep 2018 22:54:17 +0200 Subject: [PATCH] #271 Improves tumblr inline video detection Improves the regex pattern for the detection of inlined tumblr videos within other posts content/bodys. --- .../Crawler/TumblrBlogCrawler.cs | 29 +++++++++---------- .../Crawler/TumblrHiddenCrawler.cs | 16 +++++----- .../Crawler/TumblrLikedByCrawler.cs | 8 ++--- .../Crawler/TumblrSearchCrawler.cs | 8 ++--- .../Crawler/TumblrTagSearchCrawler.cs | 8 ++--- 5 files changed, 33 insertions(+), 36 deletions(-) diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs index 2df5d50..457f921 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs @@ -769,7 +769,7 @@ private void AddInlinePhotoUrl(Post post) private void AddInlineVttTumblrVideoUrl(Post post) { - var regex = new Regex("\"(https?://vtt.tumblr.com/(tumblr_[A-Za-z0-9]*))"); + var regex = new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))"); foreach (Match match in regex.Matches(InlineSearch(post))) { string videoUrl = match.Groups[1].Value; @@ -784,29 +784,29 @@ private void AddInlineVttTumblrVideoUrl(Post post) AddToDownloadList(new VideoPost( videoUrl + "_480.mp4", post.id, post.unix_timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerXmlData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + //AddToJsonQueue(new TumblrCrawlerXmlData(Path.ChangeExtension(videoUrl.Split('/').Last(), "_480.json"), post)); } } } private void AddInlineVideoUrl(Post post) { - var regex = new Regex("\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); foreach (Match match in regex.Matches(InlineSearch(post))) { - string videoUrl = match.Groups[1].Value; + string videoUrl = match.Groups[2].Value; if (shellService.Settings.VideoSize == 1080) { - AddToDownloadList(new VideoPost(videoUrl.Replace("/480", "") + ".mp4", post.id, post.unix_timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerXmlData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + AddToDownloadList(new VideoPost("https://vtt.tumblr.com/" + videoUrl + ".mp4", post.id, post.unix_timestamp.ToString())); + //AddToJsonQueue(new TumblrCrawlerXmlData(videoUrl + ".json", post)); } else if (shellService.Settings.VideoSize == 480) { AddToDownloadList(new VideoPost( - "https://vtt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", post.id, post.unix_timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerXmlData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + //AddToJsonQueue(new TumblrCrawlerXmlData(videoUrl + "_480.json", post)); } } } @@ -837,25 +837,22 @@ private void AddPhotoSetUrl(Post post) private void AddVideoUrl(Post post) { - string videoUrl = Regex.Match(post.video_player, "(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + AddToJsonQueue(new TumblrCrawlerData(videoUrl + ".json", post)); } else if (shellService.Settings.VideoSize == 480) { - AddToDownloadList(new VideoPost( - "https://vt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", post.id, post.unix_timestamp.ToString())); - AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); - + AddToJsonQueue(new TumblrCrawlerData(videoUrl + "_480.json", post)); } } diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs index 97c000c..dbf00a5 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs @@ -593,7 +593,7 @@ private void AddInlineVttTumblrVideoUrl(Post post) { if (post.caption == null) return; - var regex = new Regex("\"(https?://vtt.tumblr.com/(tumblr_[A-Za-z0-9]*))"); + var regex = new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))"); foreach (Match match in regex.Matches(post.caption)) { string videoUrl = match.Groups[1].Value; @@ -605,7 +605,7 @@ private void AddInlineVttTumblrVideoUrl(Post post) else if (shellService.Settings.VideoSize == 480) { AddToDownloadList(new VideoPost(videoUrl + "_480.mp4", post.id, post.timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + //AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), "_480.json"), post)); } } } @@ -614,21 +614,21 @@ private void AddInlineVideoUrl(Post post) { if (post.caption == null) return; - var regex = new Regex("\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); foreach (Match match in regex.Matches(post.caption)) { - string videoUrl = match.Groups[1].Value; + string videoUrl = match.Groups[2].Value; if (shellService.Settings.VideoSize == 1080) { - AddToDownloadList(new VideoPost(videoUrl.Replace("/480", "") + ".mp4", post.id, post.timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + AddToDownloadList(new VideoPost("https://vtt.tumblr.com/" + videoUrl + ".mp4", post.id, post.timestamp.ToString())); + //AddToJsonQueue(new TumblrCrawlerData(videoUrl + ".json", post)); } else if (shellService.Settings.VideoSize == 480) { AddToDownloadList(new VideoPost( - "https://vtt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", post.id, post.timestamp.ToString())); - //AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); + //AddToJsonQueue(new TumblrCrawlerData(videoUrl + "_480.json", post)); } } } diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs index 429c773..9602860 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrLikedByCrawler.cs @@ -257,21 +257,21 @@ private void AddVideoUrlToDownloadList(string document) { if (blog.DownloadVideo) { - var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); foreach (Match match in regex.Matches(document)) { - string videoUrl = match.Groups[1].Value; + string videoUrl = match.Groups[2].Value; // TODO: add valid postID if (shellService.Settings.VideoSize == 1080) { // TODO: add valid postID - AddToDownloadList(new VideoPost(videoUrl.Replace("/480", "") + ".mp4", Guid.NewGuid().ToString("N"))); + AddToDownloadList(new VideoPost("https://vtt.tumblr.com/" + videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); } else if (shellService.Settings.VideoSize == 480) { // TODO: add valid postID AddToDownloadList(new VideoPost( - "https://vt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", Guid.NewGuid().ToString("N"))); } } diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs index 635f86d..5b25b1f 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs @@ -208,21 +208,21 @@ private void AddVideoUrlToDownloadList(string document) { if (blog.DownloadVideo) { - var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); foreach (Match match in regex.Matches(document)) { - string videoUrl = match.Groups[1].Value; + string videoUrl = match.Groups[2].Value; // TODO: postId if (shellService.Settings.VideoSize == 1080) { // TODO: postID - AddToDownloadList(new VideoPost(videoUrl.Replace("/480", "") + ".mp4", Guid.NewGuid().ToString("N"))); + AddToDownloadList(new VideoPost("https://vtt.tumblr.com/" + videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); } else if (shellService.Settings.VideoSize == 480) { // TODO: postID AddToDownloadList(new VideoPost( - "https://vt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", Guid.NewGuid().ToString("N"))); } } diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs index 67c3388..60601df 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs @@ -251,21 +251,21 @@ private void AddVideoUrlToDownloadList(string document) { if (blog.DownloadVideo) { - var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + var regex = new Regex("src=\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); foreach (Match match in regex.Matches(document)) { - string videoUrl = match.Groups[1].Value; + string videoUrl = match.Groups[2].Value; // TODO: postId if (shellService.Settings.VideoSize == 1080) { // TODO: postID - AddToDownloadList(new VideoPost(videoUrl.Replace("/480", "") + ".mp4", Guid.NewGuid().ToString("N"))); + AddToDownloadList(new VideoPost("https://vtt.tumblr.com/" + videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); } else if (shellService.Settings.VideoSize == 480) { // TODO: postID AddToDownloadList(new VideoPost( - "https://vt.tumblr.com/" + videoUrl.Replace("/480", "").Split('/').Last() + "_480.mp4", + "https://vtt.tumblr.com/" + videoUrl + "_480.mp4", Guid.NewGuid().ToString("N"))); } }