-
Notifications
You must be signed in to change notification settings - Fork 1
/
filmwebplus.xml
312 lines (306 loc) · 15.2 KB
/
filmwebplus.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
<?xml version="1.0" encoding="utf-8"?><scraper framework="1.1" date="2012-10-26" name="Filmweb+" content="movies" language="pl">
<!--
  NfoUrl: looks for a Filmweb movie URL in the text passed in buffer 1 and
  returns it wrapped in a <url> element.
  Only URLs starting with https://www.filmweb.pl/ are matched; the dots of the
  host name are escaped so that they match literal dots only.
  The markup in the "output" attribute is entity-escaped because a literal "<"
  is not allowed inside an XML attribute value.
-->
<NfoUrl dest="3">
  <RegExp input="$$1" output="&lt;url&gt;\1&lt;/url&gt;" dest="3">
    <expression noclean="1">(https://www\.filmweb\.pl/[^\r\n]+)</expression>
  </RegExp>
</NfoUrl>
<!--
  CreateSearchUrl: builds the Filmweb search URL from buffer 1 (used as the
  "q" query) and buffer 2 (used as both startYear and endYear).
-->
<CreateSearchUrl dest="3">
  <!--Movie title is pre-urlencoded by Kodi - no need to urlencode manually-->
  <!--
    The output is XML that is parsed a second time after this file has been
    loaded, so the query-string ampersands are escaped twice (&amp;amp;) and
    the <url> markup is escaped once.
  -->
  <RegExp input="" output="&lt;url&gt;https://www.filmweb.pl/search/film?q=$$1&amp;amp;startYear=$$2&amp;amp;endYear=$$2&lt;/url&gt;" dest="3">
    <!--An empty expression is used because nothing needs to be captured here-->
    <expression />
  </RegExp>
</CreateSearchUrl>
<!--
  GetSearchResults: turns the Filmweb search results page (buffer 1) into a
  <results> list with one <entity> (title, year, URL) per hit.
  All markup inside "output" attributes and <expression> text is
  entity-escaped so that this file stays well-formed XML.
-->
<GetSearchResults dest="3">
  <!--Enclose results (collected in buffer 5) in <results></results>-->
  <RegExp input="$$5" output="&lt;results sorted=&quot;yes&quot;&gt;\1&lt;/results&gt;" dest="3">
    <!--Fetch search results: \1 = relative URL, \2 = title, \3 = year; every match is appended to buffer 5-->
    <RegExp input="$$1" output="&lt;entity&gt;&lt;title&gt;\2&lt;/title&gt;&lt;year&gt;\3&lt;/year&gt;&lt;url&gt;https://www.filmweb.pl\1&lt;/url&gt;&lt;/entity&gt;" dest="5+">
      <expression repeat="yes">&lt;a class="filmPreview__link" href="([^"]+)"&gt;&lt;h3 class="filmPreview__title"&gt;([^&lt;]+)&lt;/h3&gt;&lt;/a&gt; &lt;span class="filmPreview__year"&gt;(\d+)&lt;/span&gt;</expression>
    </RegExp>
    <expression fixchars="1" noclean="1" />
  </RegExp>
</GetSearchResults>
<!--
  GetDetails: parses a Filmweb movie page (buffer 1) into a <details> document.
  Buffer usage inside this function:
    5 - accumulates the generated <details> children ("5+" appends)
    6 - scratch buffer for the value currently being extracted
    7 - urlencoded title that is re-used for the IMDB/TMDB searches
    8 - release year that is re-used for the IMDB/TMDB searches
  All markup inside "output" attributes and <expression> text is
  entity-escaped so that this file stays well-formed XML; ampersands that must
  still be escaped when the generated output is parsed are written as
  &amp;amp;.
-->
<GetDetails dest="3">
  <!--Enclose fetched details in <details></details>-->
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
    <!--Enclose title (fetched from buffer 6) in <title></title>-->
    <RegExp input="$$6" output="&lt;title&gt;\1&lt;/title&gt;" dest="5+">
      <!--Save title in buffer 6-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression>&lt;h1 itemprop="name" class="filmCoverSection__title [^"]+"&gt;&lt;span[^&gt;]+&gt;([^&lt;]+)</expression>
      </RegExp>
      <expression />
    </RegExp>
    <!--Enclose original title (fetched from buffer 6 - buffer 7 is urlencoded!) in <originaltitle></originaltitle>-->
    <RegExp input="$$6" output="&lt;originaltitle&gt;\1&lt;/originaltitle&gt;" dest="5+">
      <!--Save original title in buffer 6, if it exists; buffer 6 is deliberately NOT cleared here so that the Polish title is kept as a fallback-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression>&lt;h2 class="filmCoverSection__orginalTitle"&gt;([^&lt;]+)</expression>
      </RegExp>
      <!--Replace "Movie, The" with "The Movie" etc.-->
      <RegExp input="$$6" output="\2 \1" dest="6">
        <expression>(.*), (A|An|The)$</expression>
      </RegExp>
      <!--urlencode buffer 6 (original title if it exists, Polish title otherwise) to buffer 7 to re-use it in later calls-->
      <RegExp input="$$6" output="\1" dest="7">
        <expression encode="1" fixchars="1" />
      </RegExp>
      <!--Replace Roman "II" at the end of the title with Arabic "2" to work around some incorrect Filmweb titles (caveat: buffer 7 is urlencoded!)-->
      <RegExp input="$$7" output="\1%202" dest="7">
        <expression>(.*)%20II$</expression>
      </RegExp>
      <!--Replace Roman "III" at the end of the title with Arabic "3" to work around some incorrect Filmweb titles (caveat: buffer 7 is urlencoded!)-->
      <RegExp input="$$7" output="\1%203" dest="7">
        <expression>(.*)%20III$</expression>
      </RegExp>
      <expression />
    </RegExp>
    <!--Enclose year (fetched from buffer 6 for unification with title fetching) in <year></year>-->
    <RegExp input="$$6" output="&lt;year&gt;\1&lt;/year&gt;" dest="5+">
      <!--Save year in buffer 6-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression>&lt;span class="filmCoverSection__year"&gt;(\d+)</expression>
      </RegExp>
      <!--Copy buffer 6 (year) to buffer 8 to re-use it in later calls-->
      <RegExp input="$$6" output="\1" dest="8">
        <expression />
      </RegExp>
      <expression />
    </RegExp>
    <!--Fetch runtime -->
    <RegExp input="$$1" output="&lt;runtime&gt;\1&lt;/runtime&gt;" dest="5+">
      <expression>data-duration="(\d+)"</expression>
    </RegExp>
    <!--Fetch countries-->
    <RegExp input="$$6" output="&lt;country&gt;\1&lt;/country&gt;" dest="5+">
      <!--Extract the "produkcja" block to buffer 6; clear="yes" empties the buffer when the block is missing so that stale data from the previous step is never re-used-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression noclean="1" clear="yes">&lt;h3 class="filmInfo__header"&gt;produkcja&lt;/h3&gt;&lt;div class="filmInfo__info"&gt;(.*?)&lt;/div&gt;</expression>
      </RegExp>
      <!--Every link text inside the extracted block is one country-->
      <expression repeat="yes">&lt;a href="[^"]+"&gt;([^&lt;]+)</expression>
    </RegExp>
    <!--Fetch rating, replacing comma with decimal point-->
    <RegExp input="$$1" output="&lt;rating&gt;\1.\2&lt;/rating&gt;" dest="5+">
      <expression>&lt;span itemprop="ratingValue"&gt;\s*(\d+),(\d+)</expression>
    </RegExp>
    <!--Fetch number of votes-->
    <RegExp input="$$1" output="&lt;votes&gt;\1&lt;/votes&gt;" dest="5+">
      <expression>&lt;span itemprop="ratingCount"&gt;([^&lt;]+)</expression>
    </RegExp>
    <!--Fetch genres-->
    <RegExp input="$$6" output="&lt;genre&gt;\1&lt;/genre&gt;" dest="5+">
      <!--Extract the genre block to buffer 6; clear="yes" prevents the country links still held in buffer 6 from being reported as genres when the genre block is missing-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression noclean="1" clear="yes">&lt;div class="filmInfo__info" itemprop="genre"&gt;(.*?)&lt;/div&gt;</expression>
      </RegExp>
      <!--Every link text inside the extracted block is one genre-->
      <expression repeat="yes">&lt;a href="[^"]+"&gt;([^&lt;]+)</expression>
    </RegExp>
    <!--Fetch actors from cast, along with portraits and roles-->
    <RegExp input="$$6" output="&lt;actor&gt;&lt;thumb&gt;\1&lt;/thumb&gt;&lt;name&gt;\2&lt;/name&gt;&lt;role&gt;\4&lt;/role&gt;&lt;/actor&gt;" dest="5+">
      <!--Extract cast to buffer 6; clear="yes" empties the buffer when the cast section is missing-->
      <RegExp input="$$1" output="\1" dest="6">
        <expression noclean="1" clear="yes">&lt;section class="page__section filmCastSection (.*?)&lt;/section&gt;</expression>
      </RegExp>
      <!--\1 = portrait URL, \2 = actor name, \3 = which role markup variant matched (unused), \4 = role name-->
      <expression repeat="yes">data-photo="([^"]*).*?span data-person-source[^&gt;]+&gt;([^&lt;]+).*?&lt;(span data-role-source|div class="personRole__role"&gt;[^&lt;])[^&gt;]+&gt;\s*([^&lt;]*)</expression>
    </RegExp>
    <!--Fetch directors-->
    <RegExp input="$$1" output="&lt;director&gt;\1&lt;/director&gt;" dest="5+">
      <expression repeat="yes">title="([^"]+)" itemprop="director"</expression>
    </RegExp>
    <!--Fetch writers' names-->
    <RegExp input="$$1" output="&lt;credits&gt;\1&lt;/credits&gt;" dest="5+">
      <!--Extract writers-->
      <expression repeat="yes">title="([^"]+)" itemprop="creator"</expression>
    </RegExp>
    <!--Fetch just the first plot description-->
    <RegExp input="$$1" output="&lt;plot&gt;\2&lt;/plot&gt;" dest="5+">
      <expression>class="(descriptionSection__text|descriptionSection__moreText hide)"&gt;([^&lt;].*?)&lt;</expression>
    </RegExp>
    <!--Fetch the main poster first-->
    <RegExp input="$$1" output="&lt;thumb preview=&quot;\1.\2.jpg&quot;&gt;\1.$.jpg&lt;/thumb&gt;" dest="5+">
      <expression>&lt;img id="filmPoster"[^&gt;]+src="([^"]+)\.(\d)\.jpg"</expression>
    </RegExp>
    <!--Other posters will be grabbed by GetPosters as another HTTP request is required-->
    <RegExp input="$$1" output="&lt;url function=&quot;GetPosters&quot;&gt;https://www.filmweb.pl\1&lt;/url&gt;" dest="5+">
      <expression>&lt;a href="([^"]+/posters)"</expression>
    </RegExp>
    <!--Trailer will be grabbed by GetTrailer as another HTTP request is required-->
    <RegExp input="$$1" output="&lt;url function=&quot;GetTrailer&quot;&gt;https://www.filmweb.pl\1&lt;/url&gt;" dest="5+">
      <expression>a href="(/video/[Zz]wiastun/[^"]+)</expression>
    </RegExp>
    <!--Prepare IMDB search URL using previously prepared title (buffer 7) and year (buffer 8)-->
    <RegExp input="" output="&lt;chain function=&quot;Latinize&quot;&gt;GetIMDBSearch:https://www.imdb.com/search/title?release_date=$$8,$$8&amp;amp;title=$$7&amp;amp;title_type=feature,tv_movie,documentary,video&lt;/chain&gt;" dest="5+">
      <expression />
    </RegExp>
    <!--Prepare TMDB search URL using previously prepared title (buffer 7) and year (buffer 8)-->
    <RegExp input="" output="&lt;chain function=&quot;Latinize&quot;&gt;GetTMDBSearch:https://www.themoviedb.org/search/movie?query=$$7+y%3A$$8&lt;/chain&gt;" dest="5+">
      <expression />
    </RegExp>
    <expression fixchars="1" noclean="1" />
  </RegExp>
</GetDetails>
<!--
  GetPosters: parses the Filmweb posters page (buffer 1) and returns every
  poster as a <thumb> element inside <details>.
  Markup inside "output" attributes and <expression> text is entity-escaped so
  that this file stays well-formed XML.
-->
<GetPosters dest="3">
  <!--Enclose posters (collected in buffer 5) in <details></details>-->
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
    <!--Fetch posters: \1 = image URL without the size suffix, \2 = single-digit size suffix used for the preview-->
    <RegExp input="$$1" output="&lt;thumb preview=&quot;\1.\2.jpg&quot;&gt;\1.$.jpg&lt;/thumb&gt;" dest="5">
      <expression repeat="yes">&lt;img class="simplePoster__image"[^&gt;]+data-src="([^"]+)\.(\d)\.jpg"</expression>
    </RegExp>
    <expression noclean="1" />
  </RegExp>
</GetPosters>
<!--
  GetTrailer: parses a Filmweb trailer page (buffer 1) and returns the first
  video source as <details><trailer>...</trailer></details>.
  Markup inside "output" attributes is entity-escaped so that this file stays
  well-formed XML; the ampersand of the plugin URL is written as &amp;amp;
  because the generated <trailer> content is itself parsed as XML.
-->
<GetTrailer dest="3">
  <!--Enclose trailer in <details><trailer></trailer></details>-->
  <RegExp input="$$5" output="&lt;details&gt;&lt;trailer&gt;\1&lt;/trailer&gt;&lt;/details&gt;" dest="3">
    <!--Fetch only the first trailer-->
    <RegExp input="$$1" output="\1" dest="5">
      <expression>"src":"([^"]+)</expression>
    </RegExp>
    <!--Invoke YouTube plugin if the trailer is sourced from YouTube; buffer 5 is left untouched for any other source-->
    <RegExp input="$$5" output="plugin://plugin.video.youtube/?action=play_video&amp;amp;videoid=\1" dest="5">
      <expression>https://www.youtube.com/watch\?v=(.*)</expression>
    </RegExp>
    <expression noclean="1" />
  </RegExp>
</GetTrailer>
<!--
This is a recursive function which first replaces Polish
characters in its argument with their Latin counterparts
and then splits the converted argument into a function name and
a latinized string (format is "fname:lstring"). The value
returned by Latinize is a call to a URL function "fname" with
a latinized argument "lstring".
I am aware that this function's code may be confusing, but this
is the price we have to pay for Kodi scrapers having no built-in
replace function. Each iteration replaces
a single occurrence of every Polish letter - here is what an
example string looks like after each iteration:
0. ąąąąćććęęł
1. ąąąaććcęel
2. ąąaaćcceel
3. ąaaaccceel
4. aaaaccceel
Once there are no more Polish letters present in the search
URL, the function stops calling itself and instead outputs
a call to the supplied URL function.
Note: input for Latinize is pre-urlencoded.
-->
<!--
  Latinize: receives "fname:lstring" in buffer 1, replaces one occurrence of
  each handled urlencoded Polish letter with its Latin counterpart and then
  either calls itself again (when handled letters are still present) or emits
  a <url> call to the function "fname".
  Buffer 5 holds the string being processed, buffer 6 the generated result.
  Markup inside "output" attributes is entity-escaped so that this file stays
  well-formed XML.
  NOTE(review): "ó" (%c3%b3) and "Ó" (%c3%93) are not in the replacement list;
  confirm whether that omission is intentional.
-->
<Latinize dest="3">
  <!--Enclose result in <details></details>-->
  <RegExp input="$$6" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
    <!--Copy input to buffer 5 for processing-->
    <RegExp input="$$1" output="\1" dest="5">
      <expression />
    </RegExp>
    <!--Replace "ą" with "a"-->
    <RegExp input="$$5" output="\1a\2" dest="5">
      <expression>(.*)%c4%85(.*)</expression>
    </RegExp>
    <!--Replace "ć" with "c"-->
    <RegExp input="$$5" output="\1c\2" dest="5">
      <expression>(.*)%c4%87(.*)</expression>
    </RegExp>
    <!--Replace "ę" with "e"-->
    <RegExp input="$$5" output="\1e\2" dest="5">
      <expression>(.*)%c4%99(.*)</expression>
    </RegExp>
    <!--Replace "ł" with "l"-->
    <RegExp input="$$5" output="\1l\2" dest="5">
      <expression>(.*)%c5%82(.*)</expression>
    </RegExp>
    <!--Replace "ń" with "n"-->
    <RegExp input="$$5" output="\1n\2" dest="5">
      <expression>(.*)%c5%84(.*)</expression>
    </RegExp>
    <!--Replace "ś" with "s"-->
    <RegExp input="$$5" output="\1s\2" dest="5">
      <expression>(.*)%c5%9b(.*)</expression>
    </RegExp>
    <!--Replace "ż" with "z"-->
    <RegExp input="$$5" output="\1z\2" dest="5">
      <expression>(.*)%c5%bc(.*)</expression>
    </RegExp>
    <!--Replace "ź" with "z"-->
    <RegExp input="$$5" output="\1z\2" dest="5">
      <expression>(.*)%c5%ba(.*)</expression>
    </RegExp>
    <!--Replace "Ą" with "A"-->
    <RegExp input="$$5" output="\1A\2" dest="5">
      <expression>(.*)%c4%84(.*)</expression>
    </RegExp>
    <!--Replace "Ć" with "C"-->
    <RegExp input="$$5" output="\1C\2" dest="5">
      <expression>(.*)%c4%86(.*)</expression>
    </RegExp>
    <!--Replace "Ę" with "E"-->
    <RegExp input="$$5" output="\1E\2" dest="5">
      <expression>(.*)%c4%98(.*)</expression>
    </RegExp>
    <!--Replace "Ł" with "L"-->
    <RegExp input="$$5" output="\1L\2" dest="5">
      <expression>(.*)%c5%81(.*)</expression>
    </RegExp>
    <!--Replace "Ń" with "N"-->
    <RegExp input="$$5" output="\1N\2" dest="5">
      <expression>(.*)%c5%83(.*)</expression>
    </RegExp>
    <!--Replace "Ś" with "S"-->
    <RegExp input="$$5" output="\1S\2" dest="5">
      <expression>(.*)%c5%9a(.*)</expression>
    </RegExp>
    <!--Replace "Ż" with "Z"-->
    <RegExp input="$$5" output="\1Z\2" dest="5">
      <expression>(.*)%c5%bb(.*)</expression>
    </RegExp>
    <!--Replace "Ź" with "Z"-->
    <RegExp input="$$5" output="\1Z\2" dest="5">
      <expression>(.*)%c5%b9(.*)</expression>
    </RegExp>
    <!--Enclose processed result in <url></url>: the text before the first colon is the function name, the rest is its argument...-->
    <RegExp input="$$5" output="&lt;url function=&quot;\1&quot;&gt;\2&lt;/url&gt;" dest="6">
      <expression>(.*?):(.*)</expression>
    </RegExp>
    <!--...but if there are any unprocessed characters left in the search URL, call self again (this step must stay after the previous one because it overwrites buffer 6)-->
    <RegExp input="$$5" output="&lt;chain function=&quot;Latinize&quot;&gt;\1&lt;/chain&gt;" dest="6">
      <expression>(.*(%c4%85|%c4%87|%c4%99|%c5%82|%c5%84|%c5%9b|%c5%bc|%c5%ba|%c4%84|%c4%86|%c4%98|%c5%81|%c5%83|%c5%9a|%c5%bb|%c5%b9).*)</expression>
    </RegExp>
    <expression noclean="1" />
  </RegExp>
</Latinize>
<!--
  GetIMDBSearch: parses an IMDB search results page (buffer 1) and returns a
  <url> call to GetIMDBDetails for the first listed title.
  Markup inside the "output" attribute and <expression> text is entity-escaped
  so that this file stays well-formed XML.
-->
<GetIMDBSearch dest="3">
  <!--Just take the first movie ID from the search results page and hope for the best, then use that ID to get IMDB details-->
  <RegExp input="$$1" output="&lt;details&gt;&lt;url function=&quot;GetIMDBDetails&quot;&gt;https://www.imdb.com/title/\1/&lt;/url&gt;&lt;/details&gt;" dest="3">
    <expression noclean="1">&lt;div class="lister-list"&gt;.*?&lt;a href="/title/([^/]+)</expression>
  </RegExp>
</GetIMDBSearch>
<!--
  GetIMDBDetails: parses an IMDB title page (buffer 1) and returns the studios
  and the MPAA rating inside <details>.
  Markup inside "output" attributes and <expression> text is entity-escaped so
  that this file stays well-formed XML.
-->
<GetIMDBDetails dest="3">
  <!--Enclose fetched details (collected in buffer 5) in <details></details>-->
  <RegExp input="$$5" output="&lt;details&gt;\1&lt;/details&gt;" dest="3">
    <!--Fetch studios: the text of every link that points to a /company/ page-->
    <RegExp input="$$1" output="&lt;studio&gt;\1&lt;/studio&gt;" dest="5+">
      <expression repeat="yes">"/company/[^/&gt;]+&gt;([^&lt;]+)</expression>
    </RegExp>
    <!--Fetch MPAA rating: the text of the first link that points to the parental guide-->
    <RegExp input="$$1" output="&lt;mpaa&gt;\1&lt;/mpaa&gt;" dest="5+">
      <expression>&lt;a href="[^"]+/parentalguide/[^&gt;]+&gt;([0-9A-Z-]+)&lt;</expression>
    </RegExp>
    <expression noclean="1" />
  </RegExp>
</GetIMDBDetails>
<!--
  GetTMDBSearch: parses a TMDB search results page (buffer 1) and returns a
  <url> call to GetTMDBDetails for the backdrops page of the first hit.
  Markup inside the "output" attribute and <expression> text is entity-escaped
  so that this file stays well-formed XML.
-->
<GetTMDBSearch dest="3">
  <!--Just use the first movie link from the search results page and hope for the best-->
  <RegExp input="$$1" output="&lt;details&gt;&lt;url function=&quot;GetTMDBDetails&quot;&gt;https://www.themoviedb.org\1/images/backdrops&lt;/url&gt;&lt;/details&gt;" dest="3">
    <expression noclean="1">&lt;div class="results.*?href="([^"]+)</expression>
  </RegExp>
</GetTMDBSearch>
<!--
  GetTMDBDetails: parses a TMDB backdrops page (buffer 1) and returns every
  backdrop as a <thumb> inside <details><fanart>.
  Markup inside "output" attributes and <expression> text is entity-escaped so
  that this file stays well-formed XML.
-->
<GetTMDBDetails dest="3">
  <!--Enclose fanarts (collected in buffer 5) in <details><fanart></fanart></details>-->
  <RegExp input="$$5" output="&lt;details&gt;&lt;fanart&gt;\1&lt;/fanart&gt;&lt;/details&gt;" dest="3">
    <!--Fetch fanarts: \1 = path of the full-size image link, \2 = path of the preview image-->
    <RegExp input="$$1" output="&lt;thumb preview=&quot;https://www.themoviedb.org\2&quot;&gt;https://www.themoviedb.org\1&lt;/thumb&gt;" dest="5">
      <!--Extract the list of fanarts (called "backdrops" in TMDB)-->
      <expression repeat="yes">&lt;a class="image" href="([^"]+).*?class="backdrop" src="([^"]+)</expression>
    </RegExp>
    <expression noclean="1" />
  </RegExp>
</GetTMDBDetails>
</scraper>