-
-
Notifications
You must be signed in to change notification settings - Fork 427
/
XConfessions.yml
102 lines (100 loc) · 3.53 KB
/
XConfessions.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# XPath scraper definition for xconfessions.com.
# URL lookups for scenes and movies are routed to the two scrapers defined
# under xPathScrapers. Field mappings are anchored where they are first
# defined in sceneScraper and reused by alias in movieScraper.
name: XConfessions

sceneByURL:
  - action: scrapeXPath
    url:
      - xconfessions.com
    scraper: sceneScraper

movieByURL:
  - action: scrapeXPath
    url:
      - xconfessions.com
    scraper: movieScraper

xPathScrapers:
  sceneScraper:
    common: &com
      # Text of the inline <script> that mentions window.__NUXT. Most fields
      # below are cut out of this text with regular expressions.
      $script: //script[contains(.,'window.__NUXT')]/text()
    scene:
      Title: &title //h1
      Date: &date
        selector: $script
        postProcess:
          - replace:
              # Keep the run of non-whitespace characters that follows
              # `release_date`, one arbitrary character and a double quote.
              - regex: '.*release_date."([^\s]+).*'
                with: $1
          # Quoted so the layout is always loaded as a string, never as a
          # YAML date value.
          - parseDate: '2006-01-02'
      Details: &details
        selector: $script
        postProcess:
          - replace:
              # Capture the full and the short synopsis as named groups, then
              # emit the short one first with the full one on the next line.
              - regex: '.+synopsis_clean="(?P<full>[^"]+).+short_synopsis_clean="(?P<short>[^"]+).+'
                with: |
                  $short
                  $full
      Performers:
        Name:
          selector: //p[span[contains(.,'Performers:')]]//a
          # Some performers are joined with ' & ': concatenate all matches
          # with that separator, then split on it again.
          concat: ' & '
          split: ' & '
      Director: &director //p[span[contains(.,'Director:')]]//a
      Tags:
        Name:
          selector: //div/a[contains(@href,'/categories/')] | //button/@data-test
          postProcess:
            - replace:
                # Strip the 'profile-' marker from the matched values.
                - regex: 'profile-'
                  with: ''
      Image: &image
        selector: $script
        postProcess:
          - replace:
              # Most scenes have a cover_title_picture value in the script
              - regex: '.+?cover_title_picture="([^"]+).+'
                with: $1
              # Some without the cover_title_picture will have two consecutive
              # images with a UID. The first is the movie poster, the second is
              # the scene cover image
              - regex: '.+,"(https:[^"]+?-[^"]+?)","(https:[^"]+?-[^"]+)".*'
                with: $2
              # Some without the UID-style images instead have a sequence of
              # images after a time stamp. The first is the movie poster, the
              # second is the scene image
              - regex: '.+"\d\d\d\d-\d\d-\d\d\s+\d\d:\d\d:\d\d"[0-9a-zA-Z,"]*"(https:[^"]+?)","(https:[^"]+)".*'
                with: $2
              # Some don't have any of the above, but have a
              # confession_cover_image value that is pretty good, but doesn't
              # have the scene name on it.
              - regex: '.+?confession_cover_image:"([^"]+).+'
                with: $1
              # If nothing above matches, take the last image in the script.
              # Often this is the scene image, but it's not 100% reliable.
              - regex: '.+(https:[^"]+).*$'
                with: $1
              # Clean up UTF-8 coded slashes for real ones
              - regex: '\\u002F'
                with: '/'
      Studio: &studio
        Name:
          fixed: XConfessions
      # Movie entry attached to the scene. Its fields are the same set that
      # movieScraper reuses below through the aliases.
      Movies:
        Name: *title
        Director: *director
        Duration: &duration
          selector: $script
          postProcess:
            - replace:
                # Keep the quoted value that follows `length=` (preceded by
                # one arbitrary character) in the script text.
                - regex: '.+?.length="([^"]+).+'
                  with: $1
        Date: *date
        Synopsis: *details
        Studio: *studio
        URL: &url //link[@rel='canonical']/@href
        FrontImage: &frontImage //article//div[contains(@class,'134px')]//img[contains(@sizes,'200px') or contains(@src,'jpeg') or contains(@src,'jpg')]/@src
  movieScraper:
    # Every mapping here is an alias of an anchor defined in sceneScraper.
    common: *com
    movie:
      Name: *title
      Director: *director
      Duration: *duration
      Date: *date
      Synopsis: *details
      Studio: *studio
      URL: *url
      FrontImage: *frontImage
# Last Updated April 13, 2024