Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update NoodleDude scraper & switch to python #2085

Merged
merged 4 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
194 changes: 194 additions & 0 deletions scrapers/NoodleDude/NoodleDude.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
import argparse
import datetime
import json
import os
import re
import sys
from configparser import ConfigParser, NoSectionError
from urllib.parse import urlparse

import requests
from py_common.deps import ensure_requirements
from py_common import log

# ensure_requirements comes from py_common.deps; presumably it installs the
# named package on demand when it is missing — TODO confirm against py_common.
ensure_requirements("cloudscraper")
import cloudscraper

ensure_requirements("lxml")
from lxml import html, etree

# Single shared HTTP session reused by every request in this module.
scraper = cloudscraper.create_scraper()

def base64_image(url) -> str:
    """Download *url* with the shared scraper session and return its bytes
    inlined as a base64 ``data:`` URI (Stash accepts images in this form)."""
    import base64

    encoded = base64.b64encode(scraper.get(url).content).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"

def xpath_string(tree, selector):
    """Evaluate *selector* on *tree* and return the first match, stripped.

    Returns "" when the selector matches nothing.
    """
    matches = tree.xpath(selector)
    return matches[0].strip() if matches else ""

def scrape(url: str, retries=0):
    """GET *url* through the cloudscraper session and return the parsed lxml tree.

    Args:
        url: Absolute URL to fetch.
        retries: Internal counter of timeout retries already attempted.

    Exits the process (the convention this script uses for fatal errors) on a
    non-timeout request failure, an HTTP status >= 400, or when the timeout
    retry budget is exhausted.
    """
    max_retries = 3  # cap consecutive timeout retries
    try:
        scraped = scraper.get(url, timeout=(3, 7))
    except requests.exceptions.Timeout as exc_time:
        log.debug(f"Timeout: {exc_time}")
        # Bug fix: `retries` was incremented but never checked, so a host
        # that always times out recursed without bound. Give up after a few
        # attempts instead.
        if retries >= max_retries:
            log.error(f"Timed out fetching {url} after {retries + 1} attempts")
            sys.exit(1)
        return scrape(url, retries + 1)
    except Exception as e:
        log.error(f"scrape error {e}")
        sys.exit(1)
    if scraped.status_code >= 400:
        log.error(f"HTTP Error: {scraped.status_code}")
        sys.exit(1)
    return html.fromstring(scraped.content)


def scene_title(tree):
    """Scene title: the og:title meta content, truncated at the first '|'."""
    og_title = xpath_string(tree, "//meta[@property='og:title']/@content")
    return og_title.split('|')[0].strip()

def scene_date(tree):
    """Release date from the video info block, converted to YYYY-MM-DD."""
    raw_date = tree.xpath("//div[contains(@class, 'video_info_wrapper')]//span[@id='release_date']/@title")[0]
    # Strip English ordinal suffixes ("31st" -> "31") so strptime can parse it.
    cleaned = re.sub(r'(\d)(st|nd|rd|th)', r'\1', raw_date)
    parsed = datetime.datetime.strptime(cleaned, "%B %d, %Y")
    return parsed.strftime("%Y-%m-%d")

def scene_details(tree):
    """Scene description text, with the track list appended when present.

    Song cards expose (title, author) span pairs; they are rendered as
    "Author - Title" lines under a "Songs:" header.
    """
    description = tree.xpath("//*[contains(@class, 'video_description')]")[0].text_content()

    spans = tree.xpath("//a[contains(@class, 'song_link')]//span/text()")
    track_list = "".join(
        "\n" + author + " - " + title
        for title, author in zip(spans[::2], spans[1::2])
    )

    if track_list != "":
        description = description + "\n\nSongs: " + track_list
    return description

def scene_tags(tree):
    """Return an empty tag list — the site no longer displays tags."""
    return []

def parse_performer_card(tree):
    """Build a performer fragment (name + profile URL) from one performer card.

    Image scraping is deliberately disabled: fetching thumbnails was too slow
    when a video listed 50+ performers.
    """
    # imgUrl = tree.xpath("img/@src")
    # if imgUrl and len(imgUrl) == 1 and imgUrl[0] != "/static/images/placeholder.svg":
    #     performer["images"] = [base64_image(imgUrl[0])]
    display_name = tree.xpath("div/span[1]/text()")[0]
    relative_url = tree.xpath("@href")[0]
    return {
        "name": display_name,
        "urls": ["https://www.noodledude.io" + relative_url],
    }

def get_performers(tree):
    """Fetch and parse the performer list for the current video.

    The page loads performers lazily through an htmx endpoint (`hx-get`);
    scrape that endpoint and parse each performer card found there.
    """
    details_url = tree.xpath("//*[@id='current-video']//button[contains(@class, 'btn-plus')]/@hx-get")
    if not details_url:
        return []

    details_tree = scrape("https://www.noodledude.io" + details_url[0])
    cards = details_tree.xpath("//a[contains(@class, 'performer_card')]")
    return [parse_performer_card(card) for card in cards]

def scene_image(tree):
    """Scene cover image: the video player's poster frame as a data URI."""
    poster_url = tree.xpath("//video[@id='player']/@poster")[0]
    return base64_image(poster_url)


def performer_name(tree):
    """Performer name taken from the profile page heading."""
    heading_text = xpath_string(tree, "h1/text()")
    return heading_text

def performer_birthdate(tree):
    """Birthdate from the info block's title attribute ("" when absent)."""
    birthdate = xpath_string(tree, "span[not(@class)]/span[@class='fc2']/@title")
    return birthdate

def performer_aliases(tree):
    """Comma-joined alias list from the info block, or "" when none exist."""
    aliases = tree.xpath("span[@class='fs-s']/text()")
    return ",".join(aliases) if aliases else ""

def performer_urls(tree):
    """External profile links from the performer page (possibly empty)."""
    links = tree.xpath("div[@class='performer-links']/a/@href")
    return links if links else []


def scene_from_tree(tree):
    """Assemble the scene fragment Stash expects from a parsed scene page."""
    fragment = {}
    fragment["title"] = scene_title(tree)
    fragment["date"] = scene_date(tree)
    fragment["details"] = scene_details(tree)
    fragment["tags"] = scene_tags(tree)
    fragment["image"] = scene_image(tree)
    fragment["studio"] = {"name": "NoodleDudePMV"}
    fragment["performers"] = get_performers(tree)
    return fragment

def performer_from_tree(tree):
    """Assemble the performer fragment from a parsed performer page.

    Most fields live inside the 'performer-main-info' container; images are
    collected from the page at large and inlined as data URIs.
    """
    info = tree.xpath("//div[contains(@class, 'performer-main-info')]")[0]
    return {
        "name": performer_name(tree),
        "urls": performer_urls(info),
        "birthdate": performer_birthdate(info),
        "aliases": performer_aliases(info),
        "images": [
            base64_image(src)
            for src in tree.xpath("//img[@class='performer-image']/@src")
        ],
    }


def main():
    """CLI entry point: scrape a performer or scene URL and print JSON.

    Arguments come from the command line; when stdin is a pipe (as when Stash
    invokes the scraper) a JSON fragment from stdin overrides the CLI values.
    """
    parser = argparse.ArgumentParser("NoodleDude Scraper", argument_default="")
    operations = parser.add_subparsers(
        dest="operation", help="Operation to perform", required=True
    )
    for op_name, noun in (("performer", "Performer"), ("scene", "Scene")):
        op_parser = operations.add_parser(op_name, help=f"Scrape a {op_name}")
        op_parser.add_argument("url", nargs="?", help=f"{noun} URL")

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    # Script is being piped into, probably by Stash
    if not sys.stdin.isatty():
        try:
            stdin_fragment = json.load(sys.stdin)
            args.__dict__.update(stdin_fragment)
            log.debug(f"With arguments from stdin: {args}")
        except json.decoder.JSONDecodeError:
            log.error("Received invalid JSON from stdin")
            sys.exit(1)

    url = args.url
    if not url:
        log.error("No URL provided")
        sys.exit(1)

    log.debug(f"{args.operation} scraping '{url}'")
    page_tree = scrape(url)
    result = {}
    if args.operation == "performer":
        result = performer_from_tree(page_tree)
        result["urls"].append(url)
    elif args.operation == "scene":
        result = scene_from_tree(page_tree)
        result["url"] = url

    print(json.dumps(result))

# Standard script entry guard: run main() only when executed directly.
if __name__ == "__main__":
    main()
20 changes: 20 additions & 0 deletions scrapers/NoodleDude/NoodleDude.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
name: "NoodleDude"
# requires: py_common

# Scenes: hand the matched URL to NoodleDude.py with the "scene" operation.
sceneByURL:
  - url:
      - noodledude.io
    action: script
    script:
      - python
      - NoodleDude.py
      - scene
# Performers: same script, invoked with the "performer" operation.
performerByURL:
  - url:
      - noodledude.io
    action: script
    script:
      - python
      - NoodleDude.py
      - performer
# Last Updated October 31, 2024
41 changes: 0 additions & 41 deletions scrapers/Noodledude.yml

This file was deleted.