diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 53c8833509..d3d2a8a32e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1478,6 +1478,64 @@ Consider all listed sites to potentially be NSFW. + + Wikimedia Instances + + + Wikipedia + https://www.wikipedia.org/ + Articles, Categories + + + + Wiktionary + https://www.wiktionary.org/ + Articles, Categories + + + + Wikiquote + https://www.wikiquote.org/ + Articles, Categories + + + + Wikibooks + https://www.wikibooks.org/ + Articles, Categories + + + + Wikisource + https://www.wikisource.org/ + Articles, Categories + + + + Wikinews + https://www.wikinews.org/ + Articles, Categories + + + + Wikiversity + https://www.wikiversity.org/ + Articles, Categories + + + + Wikispecies + https://species.wikimedia.org/ + Articles, Categories + + + + Wikimedia Commons + https://commons.wikimedia.org/ + Articles, Categories + + + Moebooru and MyImouto diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8e7129618a..863089176a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -178,6 +178,7 @@ "weibo", "wikiart", "wikifeet", + "wikimedia", "xhamster", "xvideos", "zerochan", diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py new file mode 100644 index 0000000000..1a8965159b --- /dev/null +++ b/gallery_dl/extractor/wikimedia.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Ailothaen +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Wikimedia and Wikipedia""" + +from .common import BaseExtractor, Message +from .. 
import text
+
+
+class WikimediaExtractor(BaseExtractor):
+    """Base class for wikimedia extractors
+
+    Subclasses assign the MediaWiki API query parameters that select
+    the files to download to 'self.params' in their _init() method.
+    """
+    basecategory = "wikimedia"
+    directory_fmt = ("{category}", "{page}")
+    archive_fmt = "{sha1}"
+    request_interval = (1.0, 2.0)
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        # The title is the last capture group of the matched pattern.
+        # It is taken from a URL path and thus percent-encoded; decode
+        # it so it does not get encoded a second time when it is sent
+        # as API parameter.
+        self.title = text.unquote(match.group(match.lastindex))
+
+    def items(self):
+        """Yield a Directory and a Url message for each file"""
+        for info in self._pagination(self.params):
+            try:
+                image = info["imageinfo"][0]
+            except LookupError:
+                # page entry without (or with empty) image info
+                self.log.debug("Missing 'imageinfo' for %s", info)
+                continue
+
+            # turn lists of {"name": ..., "value": ...} objects into
+            # plain dicts; 'or ()' guards against null values
+            image["metadata"] = {
+                m["name"]: m["value"]
+                for m in image["metadata"] or ()}
+            image["commonmetadata"] = {
+                m["name"]: m["value"]
+                for m in image["commonmetadata"] or ()}
+
+            # "NAMESPACE:NAME.EXT" -> filename "NAME", extension "EXT"
+            filename = image["canonicaltitle"]
+            image["filename"], _, image["extension"] = \
+                filename.partition(":")[2].rpartition(".")
+            image["date"] = text.parse_datetime(
+                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+            image["page"] = self.title
+
+            yield Message.Directory, image
+            yield Message.Url, image["url"], image
+
+    def _pagination(self, params):
+        """Yield the page objects of all result batches of an API query
+
+        'params' is updated in place with the continuation values
+        returned by the API.
+
+        https://www.mediawiki.org/wiki/API:Query
+        https://www.mediawiki.org/wiki/API:Continue
+        https://opendata.stackexchange.com/questions/13381
+        """
+
+        url = self.root + "/w/api.php"
+        params["action"] = "query"
+        params["format"] = "json"
+
+        while True:
+            data = self.request(url, params=params).json()
+
+            # report API-level errors instead of ending silently
+            error = data.get("error")
+            if error:
+                self.log.error(
+                    "%s: %s", error.get("code"), error.get("info"))
+                return
+
+            try:
+                pages = data["query"]["pages"]
+            except KeyError:
+                # batch without any page results
+                pass
+            else:
+                yield from pages.values()
+
+            try:
+                continuation = data["continue"]
+            except KeyError:
+                # no "continue" object: this was the last batch
+                break
+            params.update(continuation)
+
+
+# One entry per supported site: instances reachable under several
+# subdomains have no fixed "root" (None), the single-host instances
+# specify theirs explicitly.
+BASE_PATTERN = WikimediaExtractor.update({
+    "wikipedia": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikipedia\.org",
+    },
+    "wiktionary": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wiktionary\.org",
+    },
+    "wikiquote": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiquote\.org",
+    },
+    "wikibooks": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikibooks\.org",
+    },
+    "wikisource": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikisource\.org",
+    },
+    "wikinews": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikinews\.org",
+    },
+    "wikiversity": {
+        "root": None,
+        "pattern": r"[a-z]{2,}\.wikiversity\.org",
+    },
+    "wikispecies": {
+        "root": "https://species.wikimedia.org",
+        "pattern": r"species\.wikimedia\.org",
+    },
+    "wikimediacommons": {
+        "root": "https://commons.wikimedia.org",
+        "pattern": r"commons\.wikimedia\.org",
+    },
+})
+
+
+class WikimediaArticleExtractor(WikimediaExtractor):
+    """Extractor for wikimedia articles"""
+    subcategory = "article"
+    # any /wiki/ page whose title does not start with "Category:"
+    pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)"
+    example = "https://en.wikipedia.org/wiki/TITLE"
+
+    def _init(self):
+        # query the image info of all files used on the page
+        self.params = {
+            "generator": "images",
+            "titles"   : self.title,
+            "prop"     : "imageinfo",
+            "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
+                      "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
+        }
+
+
+class WikimediaCategoryExtractor(WikimediaExtractor):
+    """Extractor for wikimedia categories"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)"
+    example = "https://commons.wikimedia.org/wiki/Category:NAME"
+
+    def _init(self):
+        # query the image info of all category members of type "file"
+        self.params = {
+            "generator": "categorymembers",
+            "gcmtitle" : self.title,
+            "gcmtype"  : "file",
+            "prop"     : "imageinfo",
+            "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
+                      "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
+        }
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index d3107b4757..34566465e7 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -139,6 +139,7 @@
     "webmshare"      : "webmshare",
     "webtoons"       : "Webtoon",
     "wikiart"        : "WikiArt.org",
+    "wikimediacommons": "Wikimedia Commons",
     "xbunkr"         : "xBunkr",
     "xhamster"       : "xHamster",
     "xvideos"        : "XVideos",
diff --git a/test/results/wikibooks.py b/test/results/wikibooks.py
new file mode 100644
index 0000000000..882741d5ba
--- /dev/null
+++ b/test/results/wikibooks.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free
Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikibooks.org/wiki/Title",
+    "#category": ("wikimedia", "wikibooks", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikibooks.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikibooks", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikimediacommons.py b/test/results/wikimediacommons.py
new file mode 100644
index 0000000000..6cc03e34a0
--- /dev/null
+++ b/test/results/wikimediacommons.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # "File:" page; listed with the article extractor class
+    "#url"     : "https://commons.wikimedia.org/wiki/File:Starr-050516-1367-Pimenta_dioica-flowers-Maunaloa-Molokai_(24762757525).jpg",
+    "#category": ("wikimedia", "wikimediacommons", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # "Category:" page
+    "#url"     : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
+    "#category": ("wikimedia", "wikimediacommons", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikinews.py b/test/results/wikinews.py
new file mode 100644
index 0000000000..8a2af25e23
--- /dev/null
+++ b/test/results/wikinews.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikinews.org/wiki/Title",
+    "#category": ("wikimedia", "wikinews", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikinews.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikinews", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py
new file mode 100644
index 0000000000..874998786b
--- /dev/null
+++ b/test/results/wikipedia.py
@@ -0,0 +1,53 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikipedia.org/wiki/Title",
+    "#category": ("wikimedia", "wikipedia", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # entry that also lists "#pattern", "#count" and per-file fields
+    "#url"     : "https://en.wikipedia.org/wiki/Athena",
+    "#category": ("wikimedia", "wikipedia", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+    "#pattern" : r"https://upload.wikimedia.org/wikipedia/.+",
+    "#count"   : range(50, 100),
+
+    "bitdepth"      : int,
+    "canonicaltitle": str,
+    "comment"       : str,
+    "commonmetadata": dict,
+    "date"          : "type:datetime",
+    "descriptionshorturl": str,
+    "descriptionurl": str,
+    "extension"     : str,
+    "extmetadata"   : dict,
+    "filename"      : str,
+    "height"        : int,
+    "metadata"      : dict,
+    "mime"          : r"re:image/\w+",
+    "page"          : "Athena",
+    "sha1"          : r"re:^[0-9a-f]{40}$",
+    "size"          : int,
+    "timestamp"     : str,
+    "url"           : str,
+    "user"          : str,
+    "userid"        : int,
+    "width"         : int,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikipedia.org/wiki/Category:Physics",
+    "#category": ("wikimedia", "wikipedia", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py
new file
mode 100644
index 0000000000..5e6fb3212a
--- /dev/null
+++ b/test/results/wikiquote.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikiquote.org/wiki/Title",
+    "#category": ("wikimedia", "wikiquote", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikiquote.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikiquote", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikisource.py b/test/results/wikisource.py
new file mode 100644
index 0000000000..afdee23e12
--- /dev/null
+++ b/test/results/wikisource.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikisource.org/wiki/Title",
+    "#category": ("wikimedia", "wikisource", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikisource.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikisource", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py
new file mode 100644
index 0000000000..d455fbacf9
--- /dev/null
+++ b/test/results/wikispecies.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # entry that also lists the expected file URL and a content hash
+    "#url"     : "https://species.wikimedia.org/wiki/Geranospiza",
+    "#category": ("wikimedia", "wikispecies", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+    "#urls"    : "https://upload.wikimedia.org/wikipedia/commons/0/01/Geranospiza_caerulescens.jpg",
+    "#sha1_content": "3a17c14b15489928e4154f826af1c42afb5a523e",
+},
+
+{  # "Category:" page
+    "#url"     : "https://species.wikimedia.org/wiki/Category:Names",
+    "#category": ("wikimedia", "wikispecies", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py
new file mode 100644
index 0000000000..58565f4998
--- /dev/null
+++ b/test/results/wikiversity.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wikiversity.org/wiki/Title",
+    "#category": ("wikimedia", "wikiversity", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wikiversity.org/wiki/Category:Title",
+    "#category": ("wikimedia", "wikiversity", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)
diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py
new file mode 100644
index 0000000000..c7a016f5fb
--- /dev/null
+++ b/test/results/wiktionary.py
@@ -0,0 +1,23 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import wikimedia
+
+
+__tests__ = (  # each entry: URL, category triple, extractor class
+{  # article URL on the "www" host
+    "#url"     : "https://www.wiktionary.org/wiki/Word",
+    "#category": ("wikimedia", "wiktionary", "article"),
+    "#class"   : wikimedia.WikimediaArticleExtractor,
+},
+
+{  # category URL on the "en" subdomain
+    "#url"     : "https://en.wiktionary.org/wiki/Category:Words",
+    "#category": ("wikimedia", "wiktionary", "category"),
+    "#class"   : wikimedia.WikimediaCategoryExtractor,
+},
+
+)