Skip to content

Commit

Permalink
refactor sceneByURL for new site design, remove broken sceneByFragment (
Browse files Browse the repository at this point in the history
  • Loading branch information
WellDoneLiz authored Nov 21, 2024
1 parent c8e9609 commit daeeaa6
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 97 deletions.
151 changes: 67 additions & 84 deletions scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,81 @@
import base64
import datetime
from datetime import datetime as dt
import json
import string
import sys
from urllib.parse import urlparse
# extra modules below need to be installed
try:
import cloudscraper
except ModuleNotFoundError:
print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
sys.exit()
from py_common import log
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag
from py_common.util import scraper_args
from py_common.deps import ensure_requirements

try:
from lxml import html
except ModuleNotFoundError:
print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
sys.exit()
ensure_requirements("cloudscraper", "lxml")

try:
import py_common.graphql as graphql
import py_common.log as log
except ModuleNotFoundError:
print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
sys.exit()
import cloudscraper
from lxml import html

lang = 'en'

if len(sys.argv) > 1:
if sys.argv[1] == 'fr':
lang = 'fr'
def scene_from_url(url: str) -> ScrapedScene:
    """Scrape a JacquieEtMichelTV scene page.

    Reads the JSON-LD VideoObject embedded in the page for title, details,
    studio, tags, performers and release date; falls back to the "zeder"
    data attributes when the VideoObject lists no performers. Exits the
    process on a failed page fetch or missing VideoObject data.
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
        scraped.raise_for_status()
    except Exception as ex:
        log.error(f"Error getting URL: {ex}")
        sys.exit(1)

    tree = html.fromstring(scraped.text)

    # The page embeds scene metadata as JSON-LD script tags; pick the one
    # carrying the VideoObject entry (it is wrapped in a one-element list).
    video_data = None
    for elem in tree.xpath("//script[@type='application/ld+json']"):
        if '"@type": "VideoObject"' in elem.text:
            video_data = json.loads(elem.text)[0]
            break
    if not video_data:
        log.error("No VideoObject data found.")
        sys.exit(1)

    scene = ScrapedScene({
        "title": video_data["name"],
        "details": video_data["description"],
        "studio": ScrapedStudio(name=video_data["productionCompany"]),
        "tags": [ScrapedTag(name=t) for t in video_data["keywords"].split(",")],
        "performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]],
    })

    # If no performers in the VideoObject, look in the zeder detail element,
    # whose data-zeder-actor-* attribute values hold slugified performer names.
    if not scene["performers"]:
        try:
            zeder_attrs = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0].attrib
            for k, v in zeder_attrs.items():
                if "data-zeder-actor-" in k:
                    scene["performers"].append(
                        ScrapedPerformer(name=v.replace("-", " ").title())
                    )
        except IndexError:
            # No zeder element on the page; leave performers empty.
            pass

    scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")

    # Cover image: guard the og:image lookup — some pages may lack the meta
    # tag, and xpath(...)[0] would raise an unhandled IndexError.
    image_url = next(iter(tree.xpath("//meta[@property='og:image']/@content")), None)
    if image_url:
        try:
            img_response = scraper.get(image_url)
            # BUG FIX: the original called raise_for_status() on the page
            # response (`scraped`) here, not on the image response, so a
            # failed image download was never detected.
            img_response.raise_for_status()
            scene["image"] = (
                "data:image/jpeg;base64,"
                + base64.b64encode(img_response.content).decode()
            )
        except Exception as ex:
            # Best-effort: a missing image should not abort the scrape.
            log.error(f"Failed to get image: {ex}")

    return scene


if __name__ == "__main__":
op, args = scraper_args()
result = None
match op, args:
case "scene-by-url", {"url": url} if url:
result = scene_from_url(url)
case _:
log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
sys.exit(1)

print(json.dumps(result))
21 changes: 8 additions & 13 deletions scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
name: JacquieEtMichelTV
# requires: py_common

sceneByFragment:
action: script
script:
- python
- JacquieEtMichelTV.py
#- fr # uncomment if you want to use the french description
sceneByURL:
- url:
- jacquieetmicheltv.net/en/videos
action: script
- action: script
url:
- jacquieetmicheltv.net/en/content
- jacquieetmicheltv.net/fr/content
script:
- python
- python3
- JacquieEtMichelTV.py
#- fr # uncomment if you want to use the french description
# Last Updated January 29, 2022
- scene-by-url

# Last Updated November 19, 2024

0 comments on commit daeeaa6

Please sign in to comment.