-
-
Notifications
You must be signed in to change notification settings - Fork 427
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor sceneByURL for new site design, remove broken sceneByFragment (
- Loading branch information
1 parent
c8e9609
commit daeeaa6
Showing
2 changed files
with
75 additions
and
97 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,98 +1,81 @@ | ||
import base64 | ||
import datetime | ||
from datetime import datetime as dt | ||
import json | ||
import string | ||
import sys | ||
from urllib.parse import urlparse | ||
# extra modules below need to be installed | ||
try: | ||
import cloudscraper | ||
except ModuleNotFoundError: | ||
print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr) | ||
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr) | ||
sys.exit() | ||
from py_common import log | ||
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag | ||
from py_common.util import scraper_args | ||
from py_common.deps import ensure_requirements | ||
|
||
try: | ||
from lxml import html | ||
except ModuleNotFoundError: | ||
print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr) | ||
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr) | ||
sys.exit() | ||
ensure_requirements("cloudscraper", "lxml") | ||
|
||
try: | ||
import py_common.graphql as graphql | ||
import py_common.log as log | ||
except ModuleNotFoundError: | ||
print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr) | ||
sys.exit() | ||
import cloudscraper | ||
from lxml import html | ||
|
||
lang = 'en' | ||
|
||
if len(sys.argv) > 1: | ||
if sys.argv[1] == 'fr': | ||
lang = 'fr' | ||
def scene_from_url(url: str) -> ScrapedScene:
    """Scrape a single scene from a jacquieetmicheltv.net video page.

    Pulls structured metadata from the page's JSON-LD VideoObject block,
    falls back to "zeder" data attributes for performers, and embeds the
    og:image poster as a base64 data URI.

    Args:
        url: Full URL of the video page (en or fr locale).

    Returns:
        A populated ScrapedScene.

    Exits:
        Calls sys.exit(1) when the page cannot be fetched or contains no
        VideoObject metadata (fatal for a scraper run).
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
        scraped.raise_for_status()
    except Exception as ex:
        log.error(f"Error getting URL: {ex}")
        sys.exit(1)

    tree = html.fromstring(scraped.text)

    # The site embeds scene metadata as JSON-LD; find the VideoObject entry.
    video_data = None
    for d in tree.xpath("//script[@type='application/ld+json']"):
        # d.text can be None for empty script tags — guard before substring test.
        if d.text and '"@type": "VideoObject"' in d.text:
            # The ld+json payload is a one-element list on this site — TODO confirm.
            video_data = json.loads(d.text)[0]
            break
    if not video_data:
        log.error("No VideoObject data found.")
        sys.exit(1)

    scene = ScrapedScene({
        "title": video_data["name"],
        "details": video_data["description"],
        "studio": ScrapedStudio(name=video_data["productionCompany"]),
        "tags": [ScrapedTag(name=t) for t in video_data["keywords"].split(",")],
        "performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]]
    })

    # If no performers look in zeder elem: actor slugs are stored as
    # data-zeder-actor-* attribute values, e.g. "jane-doe" -> "Jane Doe".
    if not scene["performers"]:
        try:
            zeder_elem = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0]
            for k, v in zeder_elem.attrib.items():
                if "data-zeder-actor-" in k:
                    scene["performers"].append(
                        ScrapedPerformer(name=v.replace("-", " ").title())
                    )
        except IndexError:
            # No zeder element on the page — leave performers empty.
            pass

    scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")

    # Poster image: og:image may be missing, and the download itself can fail;
    # either way the scene is still usable without an image.
    image_res = tree.xpath("//meta[@property='og:image']/@content")
    if image_res:
        try:
            img_response = scraper.get(image_res[0])
            # Check the *image* response, not the page response fetched above.
            img_response.raise_for_status()
            scene["image"] = (
                "data:image/jpeg;base64,"
                + base64.b64encode(img_response.content).decode()
            )
        except Exception as ex:
            log.error(f"Failed to get image: {ex}")

    return scene
|
||
|
||
if __name__ == "__main__":
    # CLI entry point: Stash invokes the script with an operation name and
    # JSON arguments on stdin; scraper_args decodes both.
    op, args = scraper_args()
    url = args.get("url") if isinstance(args, dict) else None
    if op == "scene-by-url" and url:
        result = scene_from_url(url)
    else:
        log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
        sys.exit(1)

    print(json.dumps(result))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,13 @@ | ||
name: JacquieEtMicaelTV
# NOTE(review): spelling "Micael" differs from the script filename
# "JacquieEtMichelTV.py" — presumably the site's branding; confirm before renaming.
# requires: py_common

sceneByURL:
  - action: script
    url:
      - jacquieetmicheltv.net/en/content
      - jacquieetmicheltv.net/fr/content
    script:
      - python3
      - JacquieEtMichelTV.py
      - scene-by-url

# Last Updated November 19, 2024