From daeeaa60f9a89fa706b2199ef4a8933ed29f52cd Mon Sep 17 00:00:00 2001
From: WellDoneLiz <124777034+WellDoneLiz@users.noreply.github.com>
Date: Thu, 21 Nov 2024 04:41:48 -0800
Subject: [PATCH] refactor sceneByURL for new site design, remove broken sceneByFragment (#2112)

---
Review notes (this section is ignored by `git am`):
- Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; if context fails to match, try `git apply --ignore-whitespace`.
- Amended relative to the original commit, without changing any line counts:
  * the cover-image request now checks its own response status (previously
    `scraped.raise_for_status()` re-checked the page response);
  * a missing og:image meta tag is logged instead of raising IndexError;
  * empty ld+json <script> elements (text is None) are skipped;
  * keyword tags are stripped of whitespace and empty entries are dropped;
  * the dead `scene = ScrapedScene()` initialiser is replaced by a docstring.
  The post-image blob id in the .py `index` line still refers to the
  original commit.

 .../JacquieEtMichelTV/JacquieEtMichelTV.py    | 151 ++++++++----------
 .../JacquieEtMichelTV/JacquieEtMichelTV.yml   |  21 +--
 2 files changed, 75 insertions(+), 97 deletions(-)

diff --git a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
index c1fa53eee..f2fb2ff7e 100644
--- a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
+++ b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
@@ -1,98 +1,81 @@
 import base64
-import datetime
+from datetime import datetime as dt
 import json
-import string
 import sys
-from urllib.parse import urlparse
 
-# extra modules below need to be installed
-try:
-    import cloudscraper
-except ModuleNotFoundError:
-    print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
-    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
-    sys.exit()
+from py_common import log
+from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag
+from py_common.util import scraper_args
+from py_common.deps import ensure_requirements
 
-try:
-    from lxml import html
-except ModuleNotFoundError:
-    print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
-    print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
-    sys.exit()
+ensure_requirements("cloudscraper", "lxml")
 
-try:
-    import py_common.graphql as graphql
-    import py_common.log as log
-except ModuleNotFoundError:
-    print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
-    sys.exit()
+import cloudscraper
+from lxml import html
 
-lang = 'en'
-if len(sys.argv) > 1:
-    if sys.argv[1] == 'fr':
-        lang = 'fr'
+def scene_from_url(url: str) -> ScrapedScene:
+    """Scrape a scene from its page URL using the page's JSON-LD VideoObject data."""
 
-frag = json.loads(sys.stdin.read())
-if not frag['url']:
-    log.error('No URL entered.')
-    sys.exit(1)
+    scraper = cloudscraper.create_scraper()
+    try:
+        scraped = scraper.get(url)
+        scraped.raise_for_status()
+    except Exception as ex:
+        log.error(f"Error getting URL: {ex}")
+        sys.exit(1)
 
-url = frag["url"]
-scraper = cloudscraper.create_scraper()
-try:
-    cookies = {'lang': lang}
-    scraped = scraper.get(url, cookies=cookies)
-except:
-    log.error("scrape error")
-    sys.exit(1)
+    tree = html.fromstring(scraped.text)
 
-if scraped.status_code >= 400:
-    log.error(f'HTTP Error: {scraped.status_code}')
-    sys.exit(1)
+    video_data = None
+    video_data_elems = tree.xpath("//script[@type='application/ld+json']")
+    for d in video_data_elems:
+        if d.text and '"@type": "VideoObject"' in d.text:  # d.text is None for an empty <script>
+            video_data = json.loads(d.text)[0]
+            break
+    if not video_data:
+        log.error("No VideoObject data found.")
+        sys.exit(1)
 
-tree = html.fromstring(scraped.text)
+    scene = ScrapedScene({
+        "title": video_data["name"],
+        "details": video_data["description"],
+        "studio": ScrapedStudio(name=video_data["productionCompany"]),
+        "tags": [ScrapedTag(name=t.strip()) for t in video_data["keywords"].split(",") if t.strip()],
+        "performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]]
+    })
 
-title = None
-title_res = tree.xpath("//h1/text()")
-if title_res:
-    title = title_res[0]
-date = None
-dt = tree.xpath("//span[@class='video-detail__date']/text()")
-if dt:
-    f, *m, l = dt[0].split()
-    log.debug(f"found date: {l}")
-    if l:
-        if lang == 'fr':
-            date = datetime.datetime.strptime(l,
-                                              "%d/%m/%Y").strftime("%Y-%m-%d")
-        else:
-            # en
-            date = datetime.datetime.strptime(l,
-                                              "%m/%d/%Y").strftime("%Y-%m-%d")
-desc = tree.xpath("//meta[@property='og:description']/@content")
-details = ""
-if desc:
-    details = desc[0]
-tags = tree.xpath("//a[@class='video-detail__tag-list__link']/text()")
-imgurl_res = tree.xpath("//video[@id='video-player']/@poster")
-datauri = None
-if imgurl_res:
-    imgurl = imgurl_res[0]
-    img = scraper.get(imgurl).content
-    b64img = base64.b64encode(img)
-    datauri = "data:image/jpeg;base64,"
+    # If no performers look in zeder elem
+    if not scene["performers"]:
+        try:
+            zeder_elem = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0]
+            zeder_attrs = zeder_elem.attrib
+            for k, v in zeder_attrs.items():
+                if "data-zeder-actor-" in k:
+                    scene["performers"].append(ScrapedPerformer(name=v.replace("-", " ").title()))
+        except IndexError:
+            pass
 
-ret = {
-    'title': title,
-    'tags': [{
-        'name': x.strip()
-    } for x in tags],
-    'date': date,
-    'details': details,
-    'image': datauri + b64img.decode('utf-8'),
-    'studio': {
-        'name': 'Jacquie Et Michel TV'
-    },
-}
+    scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")
 
-print(json.dumps(ret))
+    try:
+        image_url = tree.xpath("//meta[@property='og:image']/@content")[0]  # IndexError here is logged below
+        image_response = scraper.get(image_url)
+        image_response.raise_for_status()  # check the image response, not the page response
+        scene["image"] = "data:image/jpeg;base64," + base64.b64encode(image_response.content).decode()
+    except Exception as ex:
+        log.error(f"Failed to get image: {ex}")
+
+    return scene
+
+
+if __name__ == "__main__":
+    op, args = scraper_args()
+    result = None
+    match op, args:
+        case "scene-by-url", {"url": url} if url:
+            result = scene_from_url(url)
+        case _:
+            log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
+            sys.exit(1)
+
+    print(json.dumps(result))
diff --git a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml
index b92cafe2b..4e0682ec0 100644
--- a/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml
+++ b/scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml
@@ -1,18 +1,13 @@
 name: JacquieEtMicaelTV
-# requires: py_common
-sceneByFragment:
-  action: script
-  script:
-    - python
-    - JacquieEtMichelTV.py
-    #- fr # uncomment if you want to use the french description
 
 sceneByURL:
-  - url:
-      - jacquieetmicheltv.net/en/videos
-    action: script
+  - action: script
+    url:
+      - jacquieetmicheltv.net/en/content
+      - jacquieetmicheltv.net/fr/content
     script:
-      - python
+      - python3
       - JacquieEtMichelTV.py
-      #- fr # uncomment if you want to use the french description
-# Last Updated January 29, 2022
+      - scene-by-url
+
+# Last Updated November 19, 2024