[nekohouse] add initial support (#5241, #6738)

mikf · Jan 20, 2025 · 05fa6dd · 05fa6dd
1 parent 6ce310d
commit 05fa6dd
Show file tree

Hide file tree

Showing 4 changed files with 199 additions and 0 deletions.
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
@@ -625,6 +625,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Comics, Episodes</td>
     <td></td>
 </tr>
+<tr>
+    <td>Nekohouse</td>
+    <td>https://nekohouse.su/</td>
+    <td>Posts, User Profiles</td>
+    <td></td>
+</tr>
 <tr>
     <td>Newgrounds</td>
     <td>https://www.newgrounds.com/</td>

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
@@ -115,6 +115,7 @@
     "myportfolio",
     "naver",
     "naverwebtoon",
+    "nekohouse",
     "newgrounds",
     "nhentai",
     "nijie",

diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nekohouse.su/"""
+
+from .common import Extractor, Message
+from .. import text
+
+# URL prefix shared by every pattern in this module; the scheme is optional
+BASE_PATTERN = r"(?:https?://)?nekohouse\.su"
+# adds two capture groups: the path segment before '/user/' and the one
+# after it (the extractors below unpack them as 'service' and 'user_id')
+USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+
+
+class NekohouseExtractor(Extractor):
+    """Base class for nekohouse extractors"""
+    # site identifier inherited by all subclasses
+    category = "nekohouse"
+    # base URL; subclasses prepend it to relative paths scraped from pages
+    root = "https://nekohouse.su"
+
+
+class NekohousePostExtractor(NekohouseExtractor):
+    """Extractor for the files of a single nekohouse post"""
+    subcategory = "post"
+    directory_fmt = ("{category}", "{service}", "{username} ({user_id})",
+                     "{post_id} {date} {title[b:230]}")
+    filename_fmt = "{num:>02} {id|filename}.{extension}"
+    archive_fmt = "{service}_{user_id}_{post_id}_{hash}"
+    pattern = USER_PATTERN + r"/post/([^/?#]+)"
+    example = "https://nekohouse.su/SERVICE/user/12345/post/12345"
+
+    def items(self):
+        """Yield one Directory message and a Url message per file"""
+        service, user_id, post_id = self.groups
+        page = self.request("{}/{}/user/{}/post/{}".format(
+            self.root, service, user_id, post_id)).text
+
+        files = self._extract_files(page)
+        post = self._extract_post(page)
+        post.update({
+            "service": service,
+            "user_id": user_id,
+            "post_id": post_id,
+            "count"  : len(files),
+        })
+
+        yield Message.Directory, post
+        for num, file in enumerate(files, 1):
+            # 'num' lives in the shared post dict and is copied into
+            # each file's metadata by the update() call below
+            post["num"] = num
+            file_url = file["url"]
+
+            # filename derived from the download URL doubles as 'hash'
+            text.nameext_from_url(file_url, file)
+            file["hash"] = file["filename"]
+            file.update(post)
+
+            # attachments carry a separate 'name'; it replaces the
+            # URL-derived filename/extension once 'hash' has been stored
+            if "name" in file:
+                text.nameext_from_url(file.pop("name"), file)
+            yield Message.Url, file_url, file
+
+    def _extract_post(self, html):
+        """Return username, title, date, and content of a post page"""
+        extr = text.extract_from(html)
+
+        # NOTE(review): 'extr' presumably advances through the page with
+        # every call, so the lookups are kept in their original order
+        username = extr('class="scrape__user-name', '</')
+        title = extr('class="scrape__title', '</')
+        timestamp = extr('datetime="', '"')
+        content = extr('class="scrape__content">', "</div>")
+
+        return {
+            "username": text.unescape(username.rpartition(">")[2].strip()),
+            "title"   : text.unescape(title.rpartition(">")[2]),
+            "date"   : text.parse_datetime(
+                timestamp[:19], "%Y-%m-%d %H:%M:%S"),
+            "content": text.unescape(content.strip()),
+        }
+
+    def _extract_files(self, html):
+        """Return a list of file and attachment dicts of a post page"""
+        root = self.root
+        files = []
+
+        # inline files: section between 'scrape__files' and '<footer'
+        section = text.extr(html, 'class="scrape__files"', "<footer")
+        extr = text.extract_from(section)
+        file_id = extr('<a href="/post/', '"')
+        while file_id:
+            path = extr('href="', '"')
+            files.append({
+                "id"  : file_id,
+                "url" : root + path,
+                "type": "file",
+            })
+            file_id = extr('<a href="/post/', '"')
+
+        # attachments: links inside the 'scrape__attachments' list
+        section = text.extr(html, 'class="scrape__attachments"', "</ul>")
+        extr = text.extract_from(section)
+        path = extr('href="', '"')
+        while path:
+            name = text.unescape(extr('download="', '"'))
+            files.append({
+                "id"  : "",
+                "url" : root + path,
+                "name": name,
+                "type": "attachment",
+            })
+            path = extr('href="', '"')
+
+        return files
+
+
+class NekohouseUserExtractor(NekohouseExtractor):
+    """Extractor for all posts listed on a nekohouse user profile"""
+    subcategory = "user"
+    pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
+    example = "https://nekohouse.su/SERVICE/user/12345"
+
+    def items(self):
+        """Yield a Queue message for every post linked on the profile
+
+        Listing pages are requested with an 'o' offset that grows in
+        steps of 50 until a page holds fewer than 50 <article> entries.
+        """
+        service, user_id, _ = self.groups
+        creator_url = "{}/{}/user/{}".format(self.root, service, user_id)
+        params = {"o": 0}
+
+        data = {"_extractor": NekohousePostExtractor}
+        while True:
+            # always request the profile URL itself; the per-post URL
+            # built below uses its own name so it cannot replace the
+            # listing URL for the following page requests
+            html = self.request(creator_url, params=params).text
+
+            cnt = 0
+            for post in text.extract_iter(html, "<article", "</article>"):
+                cnt += 1
+                post_url = self.root + text.extr(post, '<a href="', '"')
+                yield Message.Queue, post_url, data
+
+            # a short page means there is nothing left to fetch
+            if cnt < 50:
+                return
+            params["o"] += 50
diff --git a/test/results/nekohouse.py b/test/results/nekohouse.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import nekohouse
+
+
+# Expected results for the nekohouse extractors, one dict per test URL.
+__tests__ = (
+# single post with three inline image files
+{
+    "#url"     : "https://nekohouse.su/fantia/user/319092/post/3163233",
+    "#class"   : nekohouse.NekohousePostExtractor,
+    "#urls"    : (
+        "https://nekohouse.su/data/b2/ca/b2ca86189cda7408d75c36d850ca6394c089786d46c6dd0c90b4a2e17e07774f.jpg",
+        "https://nekohouse.su/data/2e/cf/2ecfd1a04affa35c147bb43d626d6149c2c3f9a9fb7df1659a40c8de1b3e09e5.jpg",
+        "https://nekohouse.su/data/9a/ed/9aed4b879023b761882c7c11ce74a3ee51a22487e2c77df0bfabed7c5a73cbe5.jpg",
+    ),

+    "content"  : "エリー・マナ・マリア編のもの\n\n会場行った人以外よくわからないと思うので、\nレポの体をなしてないですが…",
+    "count"    : 3,
+    "date"     : "dt:2024-12-12 09:34:36",
+    "extension": "jpg",
+    "filename" : r"re:^[0-9a-f]{64}$",
+    "hash"     : r"re:^[0-9a-f]{64}$",
+    "id"       : {"662005", "662006", "662007"},
+    # NOTE(review): "count" is 3, so "num" reaches 3 for the last file;
+    # confirm the test runner treats range() bounds as inclusive
+    "num"      : range(1, 3),
+    "post_id"  : "3163233",
+    "service"  : "fantia",
+    "title"    : "ルミナスバースデーイベ２",
+    "type"     : "file",
+    "url"      : str,
+    "user_id"  : "319092",
+    "username" : "島田フミカネ",
+},

+# post with six files; "#range" presumably restricts the check to the
+# 6th one, an attachment whose filename differs from its hash
+{
+    "#url"     : "https://nekohouse.su/fantia/user/19235/post/2621173",
+    "#comment" : "attachment / video",
+    "#class"   : nekohouse.NekohousePostExtractor,
+    "#range"   : "6",
+    "#urls"    : (
+        "https://nekohouse.su/data/f9/4c/f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823.mp4",
+    ),

+    "content"  : "",
+    "count"    : 6,
+    "date"     : "dt:2024-03-15 12:09:48",
+    "extension": "mp4",
+    "filename" : "レミリアゲームver0.01",
+    "hash"     : "f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823",
+    "id"       : "",
+    "num"      : 6,
+    "post_id"  : "2621173",
+    "service"  : "fantia",
+    "title"    : "ふたなりレミリア総受けエロゲーのお話",
+    "type"     : "attachment",
+    "url"      : "https://nekohouse.su/data/f9/4c/f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823.mp4",
+    "user_id"  : "19235",
+    "username" : "なまこ大爆発",
+},

+# user profile; every result is expected to be a post URL of that user
+# NOTE(review): the user extractor pages in steps of 50, so a "#count"
+# of exactly 50 would also pass if only the first page were read
+{
+    "#url"     : "https://nekohouse.su/fantia/user/19235",
+    "#class"   : nekohouse.NekohouseUserExtractor,
+    "#pattern" : r"https://nekohouse\.su/fantia/user/19235/post/\d+",
+    "#count"   : range(50, 100),
+},

+)