Skip to content

Commit

Permalink
refactor sceneByURL for new site design, remove broken sceneByFragment (
Browse files Browse the repository at this point in the history
  • Loading branch information
WellDoneLiz authored Nov 21, 2024
1 parent c8e9609 commit daeeaa6
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 97 deletions.
151 changes: 67 additions & 84 deletions scrapers/JacquieEtMichelTV/JacquieEtMichelTV.py
Original file line number Diff line number Diff line change
@@ -1,98 +1,81 @@
import base64
import datetime
from datetime import datetime as dt
import json
import string
import sys
from urllib.parse import urlparse
# extra modules below need to be installed
try:
import cloudscraper
except ModuleNotFoundError:
print("You need to install the cloudscraper module. (https://pypi.org/project/cloudscraper/)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install cloudscraper", file=sys.stderr)
sys.exit()
from py_common import log
from py_common.types import ScrapedPerformer, ScrapedScene, ScrapedStudio, ScrapedTag
from py_common.util import scraper_args
from py_common.deps import ensure_requirements

try:
from lxml import html
except ModuleNotFoundError:
print("You need to install the lxml module. (https://lxml.de/installation.html#installation)", file=sys.stderr)
print("If you have pip (normally installed with python), run this command in a terminal (cmd): pip install lxml", file=sys.stderr)
sys.exit()
ensure_requirements("cloudscraper", "lxml")

try:
import py_common.graphql as graphql
import py_common.log as log
except ModuleNotFoundError:
print("You need to download the folder 'py_common' from the community repo! (CommunityScrapers/tree/master/scrapers/py_common)", file=sys.stderr)
sys.exit()
import cloudscraper
from lxml import html

lang = 'en'

if len(sys.argv) > 1:
if sys.argv[1] == 'fr':
lang = 'fr'
def scene_from_url(url: str) -> ScrapedScene:
    """Scrape a JacquieEtMichelTV scene page.

    Reads the JSON-LD VideoObject embedded in the page for title, details,
    studio, tags, performers and release date; falls back to the "zeder"
    data attributes when the VideoObject lists no performers. Exits the
    process on a failed page fetch or missing VideoObject data.
    """
    scraper = cloudscraper.create_scraper()
    try:
        scraped = scraper.get(url)
        scraped.raise_for_status()
    except Exception as ex:
        log.error(f"Error getting URL: {ex}")
        sys.exit(1)

    tree = html.fromstring(scraped.text)

    # The page embeds scene metadata as JSON-LD script tags; pick the one
    # carrying the VideoObject entry (it is wrapped in a one-element list).
    video_data = None
    for elem in tree.xpath("//script[@type='application/ld+json']"):
        if '"@type": "VideoObject"' in elem.text:
            video_data = json.loads(elem.text)[0]
            break
    if not video_data:
        log.error("No VideoObject data found.")
        sys.exit(1)

    scene = ScrapedScene({
        "title": video_data["name"],
        "details": video_data["description"],
        "studio": ScrapedStudio(name=video_data["productionCompany"]),
        "tags": [ScrapedTag(name=t) for t in video_data["keywords"].split(",")],
        "performers": [ScrapedPerformer(name=a["name"]) for a in video_data["actor"]],
    })

    # If no performers in the VideoObject, look in the zeder detail element,
    # whose data-zeder-actor-* attribute values hold slugified performer names.
    if not scene["performers"]:
        try:
            zeder_attrs = tree.xpath("//div[contains(@class, '-zeder-detail-')]")[0].attrib
            for k, v in zeder_attrs.items():
                if "data-zeder-actor-" in k:
                    scene["performers"].append(
                        ScrapedPerformer(name=v.replace("-", " ").title())
                    )
        except IndexError:
            # No zeder element on the page; leave performers empty.
            pass

    scene["date"] = dt.fromisoformat(video_data["datePublished"]).strftime("%Y-%m-%d")

    # Cover image: guard the og:image lookup — some pages may lack the meta
    # tag, and xpath(...)[0] would raise an unhandled IndexError.
    image_url = next(iter(tree.xpath("//meta[@property='og:image']/@content")), None)
    if image_url:
        try:
            img_response = scraper.get(image_url)
            # BUG FIX: the original called raise_for_status() on the page
            # response (`scraped`) here, not on the image response, so a
            # failed image download was never detected.
            img_response.raise_for_status()
            scene["image"] = (
                "data:image/jpeg;base64,"
                + base64.b64encode(img_response.content).decode()
            )
        except Exception as ex:
            # Best-effort: a missing image should not abort the scrape.
            log.error(f"Failed to get image: {ex}")

    return scene


if __name__ == "__main__":
op, args = scraper_args()
result = None
match op, args:
case "scene-by-url", {"url": url} if url:
result = scene_from_url(url)
case _:
log.error(f"Not Implemented: Operation: {op}, arguments: {json.dumps(args)}")
sys.exit(1)

print(json.dumps(result))
21 changes: 8 additions & 13 deletions scrapers/JacquieEtMichelTV/JacquieEtMichelTV.yml
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
name: JacquieEtMichelTV
# requires: py_common

sceneByFragment:
action: script
script:
- python
- JacquieEtMichelTV.py
#- fr # uncomment if you want to use the french description
sceneByURL:
- url:
- jacquieetmicheltv.net/en/videos
action: script
- action: script
url:
- jacquieetmicheltv.net/en/content
- jacquieetmicheltv.net/fr/content
script:
- python
- python3
- JacquieEtMichelTV.py
#- fr # uncomment if you want to use the french description
# Last Updated January 29, 2022
- scene-by-url

# Last Updated November 19, 2024

0 comments on commit daeeaa6

Please sign in to comment.