Skip to content

Commit

Permalink
[nekohouse] add initial support (#5241, #6738)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Jan 20, 2025
1 parent 6ce310d commit 05fa6dd
Show file tree
Hide file tree
Showing 4 changed files with 199 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/supportedsites.md
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,12 @@ Consider all listed sites to potentially be NSFW.
<td>Comics, Episodes</td>
<td></td>
</tr>
<tr>
<td>Nekohouse</td>
<td>https://nekohouse.su/</td>
<td>Posts, User Profiles</td>
<td></td>
</tr>
<tr>
<td>Newgrounds</td>
<td>https://www.newgrounds.com/</td>
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@
"myportfolio",
"naver",
"naverwebtoon",
"nekohouse",
"newgrounds",
"nhentai",
"nijie",
Expand Down
122 changes: 122 additions & 0 deletions gallery_dl/extractor/nekohouse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://nekohouse.su/"""

from .common import Extractor, Message
from .. import text

# Optional "http://" or "https://" scheme followed by the nekohouse.su host.
BASE_PATTERN = r"(?:https?://)?nekohouse\.su"
# Adds two capture groups: the path segment before "/user/" and the one
# after it (unpacked as `service` and `user_id` by the extractors in this file).
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"


class NekohouseExtractor(Extractor):
    """Common base for all nekohouse.su extractors"""
    # Site root; prepended to the relative links found in scraped pages.
    root = "https://nekohouse.su"
    # Category name shared by every extractor derived from this class.
    category = "nekohouse"


class NekohousePostExtractor(NekohouseExtractor):
    # Extractor for one nekohouse.su post: emits a single directory message
    # followed by one URL message per file/attachment found on the page.
    subcategory = "post"
    pattern = USER_PATTERN + r"/post/([^/?#]+)"
    example = "https://nekohouse.su/SERVICE/user/12345/post/12345"
    directory_fmt = (
        "{category}",
        "{service}",
        "{username} ({user_id})",
        "{post_id} {date} {title[b:230]}",
    )
    filename_fmt = "{num:>02} {id|filename}.{extension}"
    archive_fmt = "{service}_{user_id}_{post_id}_{hash}"

    def items(self):
        """Yield the post's directory message, then one message per file"""
        service, user_id, post_id = self.groups
        page = self.request("{}/{}/user/{}/post/{}".format(
            self.root, service, user_id, post_id)).text

        entries = self._extract_files(page)
        metadata = self._extract_post(page)
        metadata.update({
            "service": service,
            "user_id": user_id,
            "post_id": post_id,
            "count"  : len(entries),
        })

        yield Message.Directory, metadata
        for index, entry in enumerate(entries, 1):
            # "num" is written into the shared post metadata so that it is
            # copied into the file entry by the update() below.
            metadata["num"] = index
            file_url = entry["url"]

            # Derive filename/extension from the URL; the URL's filename
            # is also kept as "hash" before it may be overridden below.
            text.nameext_from_url(file_url, entry)
            entry["hash"] = entry["filename"]
            entry.update(metadata)

            # Attachments carry a "name": use it for filename/extension
            # instead of the one taken from the URL.
            display_name = entry.pop("name", None)
            if display_name is not None:
                text.nameext_from_url(display_name, entry)

            yield Message.Url, file_url, entry

    def _extract_post(self, html):
        """Return username, title, date and content parsed from 'html'"""
        extr = text.extract_from(html)

        # The four lookups are issued in the same order as the fields are
        # returned (presumably each one continues where the previous match
        # ended -- confirm against text.extract_from).
        raw_username = extr('class="scrape__user-name', '</')
        raw_title = extr('class="scrape__title', '</')
        raw_date = extr('datetime="', '"')
        raw_content = extr('class="scrape__content">', "</div>")

        # For username and title only the text after the last ">" is kept.
        username = raw_username.rpartition(">")[2].strip()
        title = raw_title.rpartition(">")[2]

        return {
            "username": text.unescape(username),
            "title"   : text.unescape(title),
            # Only the first 19 characters ("YYYY-MM-DD HH:MM:SS") are parsed.
            "date"    : text.parse_datetime(
                raw_date[:19], "%Y-%m-%d %H:%M:%S"),
            "content" : text.unescape(raw_content.strip()),
        }

    def _extract_files(self, html):
        """Return a list of file and attachment entries found in 'html'"""
        root = self.root
        found = []

        # Inline files: section between 'class="scrape__files"' and "<footer".
        files_html = text.extr(html, 'class="scrape__files"', "<footer")
        next_value = text.extract_from(files_html)
        file_id = next_value('<a href="/post/', '"')
        while file_id:
            found.append({
                "id"  : file_id,
                "url" : root + next_value('href="', '"'),
                "type": "file",
            })
            file_id = next_value('<a href="/post/', '"')

        # Attachments: section between 'class="scrape__attachments"'
        # and the next "</ul>".
        attachments_html = text.extr(
            html, 'class="scrape__attachments"', "</ul>")
        next_value = text.extract_from(attachments_html)
        path = next_value('href="', '"')
        while path:
            found.append({
                "id"  : "",
                "url" : root + path,
                "name": text.unescape(next_value('download="', '"')),
                "type": "attachment",
            })
            path = next_value('href="', '"')

        return found


class NekohouseUserExtractor(NekohouseExtractor):
    """Extractor for all posts of a nekohouse user"""
    subcategory = "user"
    pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
    example = "https://nekohouse.su/SERVICE/user/12345"

    def items(self):
        """Queue every post of the user, walking the listing page by page

        Yields one Message.Queue entry per post link found on each listing
        page; each entry is tagged with NekohousePostExtractor.
        """
        # The third group (query string) is matched but not used here.
        service, user_id, _ = self.groups
        creator_url = "{}/{}/user/{}".format(self.root, service, user_id)
        # "o" is the offset parameter sent with each listing request.
        params = {"o": 0}

        data = {"_extractor": NekohousePostExtractor}
        while True:
            html = self.request(creator_url, params=params).text

            cnt = 0
            for post in text.extract_iter(html, "<article", "</article>"):
                cnt += 1
                # Keep the post link in its own variable: reusing the
                # listing URL's name here made every request after the
                # first page go to the last post instead of the listing.
                post_url = self.root + text.extr(post, '<a href="', '"')
                yield Message.Queue, post_url, data

            # Fewer than 50 posts on a page means it was the last page.
            if cnt < 50:
                return
            params["o"] += 50
70 changes: 70 additions & 0 deletions test/results/nekohouse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import nekohouse


# Expected results for the nekohouse extractors. Keys starting with "#"
# configure the test case itself; the remaining keys are expected metadata.
__tests__ = (
# Post with three files, all of type "file".
{
"#url" : "https://nekohouse.su/fantia/user/319092/post/3163233",
"#class" : nekohouse.NekohousePostExtractor,
"#urls" : (
"https://nekohouse.su/data/b2/ca/b2ca86189cda7408d75c36d850ca6394c089786d46c6dd0c90b4a2e17e07774f.jpg",
"https://nekohouse.su/data/2e/cf/2ecfd1a04affa35c147bb43d626d6149c2c3f9a9fb7df1659a40c8de1b3e09e5.jpg",
"https://nekohouse.su/data/9a/ed/9aed4b879023b761882c7c11ce74a3ee51a22487e2c77df0bfabed7c5a73cbe5.jpg",
),

"content" : "エリー・マナ・マリア編のもの\n\n会場行った人以外よくわからないと思うので、\nレポの体をなしてないですが…",
"count" : 3,
"date" : "dt:2024-12-12 09:34:36",
"extension": "jpg",
"filename" : r"re:^[0-9a-f]{64}$",
"hash" : r"re:^[0-9a-f]{64}$",
"id" : {"662005", "662006", "662007"},
"num" : range(1, 3),
"post_id" : "3163233",
"service" : "fantia",
"title" : "ルミナスバースデーイベ2",
"type" : "file",
"url" : str,
"user_id" : "319092",
"username" : "島田フミカネ",
},

# Post with six files; only the sixth (an mp4 of type "attachment") is
# checked. Its filename differs from the hash taken from the URL.
{
"#url" : "https://nekohouse.su/fantia/user/19235/post/2621173",
"#comment" : "attachment / video",
"#class" : nekohouse.NekohousePostExtractor,
"#range" : "6",
"#urls" : (
"https://nekohouse.su/data/f9/4c/f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823.mp4",
),

"content" : "",
"count" : 6,
"date" : "dt:2024-03-15 12:09:48",
"extension": "mp4",
"filename" : "レミリアゲームver0.01",
"hash" : "f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823",
"id" : "",
"num" : 6,
"post_id" : "2621173",
"service" : "fantia",
"title" : "ふたなりレミリア総受けエロゲーのお話",
"type" : "attachment",
"url" : "https://nekohouse.su/data/f9/4c/f94ca55a329604bec63536828a36fd2b455aec03ffb3657e25c0b405d8484823.mp4",
"user_id" : "19235",
"username" : "なまこ大爆発",
},

# User profile: results are checked against the post URL pattern of that user.
{
"#url" : "https://nekohouse.su/fantia/user/19235",
"#class" : nekohouse.NekohouseUserExtractor,
"#pattern" : r"https://nekohouse\.su/fantia/user/19235/post/\d+",
"#count" : range(50, 100),
},

)

0 comments on commit 05fa6dd

Please sign in to comment.