Skip to content
This repository has been archived by the owner on Mar 9, 2021. It is now read-only.

Commit

Permalink
Exclude 'tumblr_' urls from generic image/video matching
Browse files Browse the repository at this point in the history
- Excludes urls containing 'tumblr_' from the generic image/video pattern matching results.
  • Loading branch information
johanneszab committed Dec 7, 2018
1 parent 53f0f6b commit 47e774e
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,9 @@ protected void AddGenericPhotoUrl(string post)
{
foreach (string imageUrl in tumblrParser.SearchForGenericPhotoUrl(post))
{
if (tumblrParser.IsTumblrUrl(imageUrl))
continue;

if (CheckIfSkipGif(imageUrl))
continue;

Expand All @@ -263,15 +266,10 @@ protected void AddGenericVideoUrl(string post)
{
foreach (string videoUrl in tumblrParser.SearchForGenericVideoUrl(post))
{
string url = videoUrl;
if (url.Contains("tumblr") && shellService.Settings.VideoSize == 480)
{
int indexOfSuffix = url.LastIndexOf('.');
if (indexOfSuffix >= 0)
url = url.Insert(indexOfSuffix, "_480");
}

AddToDownloadList(new VideoPost(url, Guid.NewGuid().ToString("N")));
if (tumblrParser.IsTumblrUrl(videoUrl))
continue;

AddToDownloadList(new VideoPost(videoUrl, Guid.NewGuid().ToString("N")));
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,7 @@ public interface ITumblrParser
IEnumerable<string> SearchForGenericPhotoUrl(string searchableText);

IEnumerable<string> SearchForGenericVideoUrl(string searchableText);

bool IsTumblrUrl(string url);
}
}
6 changes: 6 additions & 0 deletions src/TumblThree/TumblThree.Applications/Parser/TumblrParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,11 @@ public IEnumerable<string> SearchForGenericVideoUrl(string searchableText)
yield return videoUrl;
}
}

/// <summary>
/// Pattern used by <see cref="IsTumblrUrl"/>. It is built once and shared,
/// so the expression is not re-parsed for every url that gets checked.
/// </summary>
private static readonly Regex tumblrUrlRegex = new Regex("tumblr_[\\w]*");

/// <summary>
/// Reports whether <paramref name="url"/> contains the text "tumblr_".
/// </summary>
/// <remarks>
/// The pattern is not anchored, so the marker may appear anywhere in the string.
/// The trailing word-character class accepts zero characters, so the bare
/// marker on its own is already a match.
/// </remarks>
/// <param name="url">The url (or any text) to inspect.</param>
/// <returns><c>true</c> if the marker is found; otherwise <c>false</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is <c>null</c>.</exception>
public bool IsTumblrUrl(string url)
{
    return tumblrUrlRegex.IsMatch(url);
}
}
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions src/TumblThree/TumblThree.Presentation/Properties/Resources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ since the throttling algorithm is too slow to adjust and might negatively impact
<value>Number of connections used for each scan</value>
</data>
<data name="ToolTipScanDescription" xml:space="preserve">
<value>Sets the number connections used for scanning.
<value>Sets the number of connections used for scanning.
Since the data is usually small, you should leave this high.
Note: This setting has no impact if the Limit Tumblr Api Connections setting is turned on.</value>
</data>
Expand Down Expand Up @@ -700,7 +700,7 @@ E.g. great big car, bears searches for great big cars and bears.</value>
<value>Number of concurrent connections to the tumblr video host</value>
</data>
<data name="ToolTipVideoConnectionsDescription" xml:space="preserve">
<value>The vt.tumblr.com host regularly closes connections if the number is too high.</value>
<value>The v*.tumblr.com hosts regularly close connections if the number is too high.</value>
</data>
<data name="ExternalSettings" xml:space="preserve">
<value>External</value>
Expand Down Expand Up @@ -927,13 +927,15 @@ This value determines the information refresh rate for each individual queued bl
</data>
<data name="ToolTipRegExPhotosDescription" xml:space="preserve">
<value>Uses regular expressions to search for images in everything TumblThree scans.
This will add plenty of duplicate image urls to the queue, but might gather images from websites that are currently not supported by a specifically written parser.</value>
This might add plenty of duplicate or random image urls to the queue, but could potentially
gather images from websites that are currently not supported by a specifically written parser.</value>
</data>
<data name="ToolTipRegExVideos" xml:space="preserve">
<value>Search for videos in the crawl data</value>
</data>
<data name="ToolTipRegExVideosDescription" xml:space="preserve">
<value>Uses regular expressions to search for videos in everything TumblThree scans.
This will add plenty of duplicate video urls to the queue, but might gather videos from websites that are currently not supported by a specifically written parser.</value>
This might add plenty of duplicate or random video urls to the queue, but could potentially
gather videos from websites that are currently not supported by a specifically written parser.</value>
</data>
</root>

0 comments on commit 47e774e

Please sign in to comment.