-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser_imdb.py
294 lines (239 loc) · 10.1 KB
/
parser_imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
# -*- coding: utf-8 -*-
# iMDB 2 JSON-LD
# © 2015/07 Wladston Viana (wladston@wladston.net) under the BSD License.
#
# WHAT IS THIS SCRIPT
# This script converts a SQLite-based iMDB dataset to the JSON-LD format using
# the schema.org vocabulary. Expects an "imdb.sqlite" database file at the
# working directory. Also make sure you have all the python requirements (see
# requirements.txt).
#
# TODO:
# - Make sure all the information from Person is properly implemented.
# - Caching: never process person or company twice.
# - Create final bash script to:
# -- Download the needed files from imdb servers. (20)
# -- Generates the sql database (10)
# -- Converts the sql database to json-ld data. (10)
# -- Create requirements.txt
#
# - Map the keys certificates, mpaa to schema:contentRating.
# - Map the key certificates to schema:typicalAgeRange.
# - Map the key literature to schema:referencedIn.
# - Map the key ['business']['copyright holder'] to schema:copyrightHolder.
# - Fix representation of schema:birthDate.
# - Fix representation of schema:ratingValue, use decimal instead of float.
# - Fix representation of schema:duration.
# - Review the get_movie_duration() function.
#
# NOTE: These keys, even though available on the iMDB dataset, were not used so
# far because they do not fit in the schema.org vocabulary.
# 'color info', 'crazy credits', 'complete crew', 'complete cast', 'trivia',
# 'sound mix', 'tech info', 'quotes', 'votes distribution', 'countries',
# 'goofs', 'laserdisc', 'canonical title', 'long imdb title', 'long imdb
# canonical title', 'smart canonical title', 'smart long imdb canonical title',
# Python Libs.
import os
import re
import json
import sqlite3
from datetime import datetime, timedelta
# External Libs.
from dateutil.parser import parse as duparser
from pyld import jsonld
from imdb import IMDb
def setup_session():
    '''Open the local imdb.sqlite database and an IMDbPY SQL accessor.

    Returns a (sqlite3 cursor, IMDb accessor) pair, both bound to the
    "imdb.sqlite" file sitting next to this script.
    '''
    db_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "imdb.sqlite")
    connection = sqlite3.connect(db_path)
    accessor = IMDb('sql', uri='sqlite://' + db_path)
    return connection.cursor(), accessor
def get_imdb_url(element, element_type=None, resolve_using_imdb=False):
    '''Return the imdb.com URL for a movie or person.

    element -- iMDB movie/person data (mapping with canonical-name keys).
    element_type -- 'Title' or 'Name'; selects the URL path and data key.
    resolve_using_imdb -- when True, ask IMDbPY to resolve the URL instead
    of building it from the canonical name.
    '''
    if not resolve_using_imdb:
        # URL Format: http://www.imdb.com/Title?Bourne+Ultimatum,+The+(2007)
        #             http://www.imdb.com/Name?Musgrove,+Matt
        base = 'http://www.imdb.com/%s' % (element_type)
        key = 'long imdb canonical %s' % element_type.lower()
        url = base + '?' + element[key]
        return url.replace(' ', '+')
    else:
        # BUG FIX: this branch referenced the undefined names `IA` and `x`
        # and raised NameError when reached; resolve via an IMDbPY accessor
        # and the element actually passed in.
        return IMDb().get_imdbURL(element)
def get_and_parse_companies(imdb, movie):
need_type = ['distributors']
companies = {}
for role in ['production companies', 'distributors']:
for company in movie.get(role, []):
imdb.update(company, info=['main'])
cdata = {'name': company['name']}
if company.get('country'):
c = company['country']
if c[0] == '[' and c[-1] == ']':
c = c[1:-1]
cdata['address'] = {'addressCountry': c}
if role in need_type:
cdata['@type'] = 'Organization'
companies.setdefault(role, []).append(cdata)
return companies
def get_rating(movie):
    '''Build a schema.org AggregateRating payload for the movie.'''
    rating = {'bestRating': 10, 'worstRating': 1}
    rating['ratingCount'] = movie['votes']
    rating['ratingValue'] = movie['rating']
    return rating
def parse_imdb_team(team, role):
    '''Build schema.org Person dicts for the team members holding `role`.

    team -- mapping of person id -> person data dict (see get_team()).
    role -- role to filter by, e.g. 'cast', 'director', 'writer'.
    Returns a list of JSON-LD person dicts; '@type' is attached only for
    roles whose schema.org property does not already imply Person.
    '''
    people = []
    need_type = ['producer', 'composer', 'writer']
    for pdata in [p for p in team.values() if role in p['roles']]:
        person = {'@id': get_imdb_url(pdata, 'Name')}
        # BUG FIX: a stray trailing comma here used to wrap the name
        # in a one-element tuple instead of storing the plain string.
        person['name'] = pdata['name']
        if 'height' in pdata:
            person['height'] = {'@type': 'QuantitativeValue',
                                'unitText': pdata['height']}
        if 'birth date' in pdata:
            person['birthDate'] = pdata['birth date']
        if 'birth notes' in pdata:
            # Birth notes hold the birth place in the iMDB dumps used here;
            # NOTE(review): confirm against the dataset.
            person['birthPlace'] = {'@type': 'Place',
                                    'name': pdata['birth notes']}
        if role in need_type:
            person['@type'] = 'Person'
        people.append(person)
    return people
def get_team(imdb, movie):
'''Extracts all people participating in producing the movie.'''
special = ['producer', 'writer', 'cast', 'director', 'editor', 'composer']
not_special = ['sound crew', 'art department', 'visual effects', 'make up',
'art director', 'set decorator', 'costume designer', 'cinematographer',
'stunt performer', 'production designer', 'production manager',
'assistant director', 'crewmembers']
team = {}
for role in special + not_special:
for person in movie.get(role, []):
imdb.update(person, info=['biography', 'main'])
cname = person['long imdb canonical name']
person.data['long imdb canonical name'] = cname
team.setdefault(person.getID(), person.data)
role = 'contributor' if role in not_special else role
team[person.getID()].setdefault('roles', []).append(role)
for person in team.values():
if 'contributor' in person['roles'] and len(person['roles']) > 1:
person['roles'].remove('contributor')
return team
def get_movie_duration(m):
    '''Return the movie runtime as an "H:MM:SS" string, or None.

    iMDB runtime strings are messy: "90", "USA:95", "100::(extended)",
    "90 min", etc.  Only the first entry of m['runtimes'] is considered;
    its leading run of digits (after stripping any "::note" suffix and
    any "COUNTRY:" prefix) is taken as the duration in minutes.
    Returns None when there is no runtime or no digits can be found
    (the original implementation raised ValueError on garbage input).
    '''
    if 'runtimes' not in m:
        return None
    runtime = m['runtimes'][0]
    # Drop any "::note" suffix first, then keep the part after a
    # "COUNTRY:" prefix if one is present.
    runtime = runtime.split('::')[0]
    if ':' in runtime:
        runtime = runtime.split(':')[1]
    # Keep only the leading digits; the original stripped a fixed list of
    # trailing delimiters (. - , space quotes m \ /) one by one, which is
    # equivalent to taking the prefix of digits.
    digits = re.match(r'\d+', runtime)
    if not digits:
        return None
    return str(timedelta(minutes=int(digits.group())))
def get_movie_release(movie):
    '''Return the first release date (or just the year) as "YYYY-MM-DD".'''
    release_dates = movie.get('release dates')
    if release_dates:
        # Entries look like "Country:Date"; parse only the date part.
        return str(duparser(release_dates[0].split(':')[1]).date())
    year = movie.get('year')
    if year:
        return str(datetime.strptime(str(year), '%Y').date())
    return None
def get_headlines(movie):
    '''Return (headline, alternativeHeadline list) from the taglines.'''
    taglines = movie.get('taglines')
    if not taglines:
        return None, None
    if len(taglines) > 1:
        # Several taglines: the second is promoted to headline and the
        # rest (including it) kept as alternatives.
        return taglines[1], taglines[1:]
    return taglines[0], None
def get_alternate_name(movie):
    '''Return the list of alternate (aka) titles, country notes stripped.'''
    names = []
    for aka in movie.get('akas', []):
        # Each aka looks like "Title::country/notes".
        candidate = aka.split('::')[0]
        match = re.match(r'^(.*)\([0-9]{4}\)$', candidate)
        if match:
            candidate = match.group()
        names.append(candidate.strip())
    return names
def fetch_imdb_object(imdb, mid):
# Unused_infosets:
# 'alternate versions', 'connections', 'crazy credits', 'episodes','goofs',
# 'technical', 'literature', 'locations', 'quotes', 'trivia',
# 'vote details'
infosets = ['business', 'main', 'plot', 'keywords','release dates',
'soundtrack', 'taglines']
return imdb.get_movie(mid, info=infosets)
def parse_imdb_movie(movie):
    '''Normalize an iMDB movie object in place into schema.org keys.

    Adds/overwrites schema.org-named keys on `movie` and returns it;
    falsy values are stripped later by get_jsonld_from_imdb().
    '''
    movie['duration'] = get_movie_duration(movie)
    movie['aggregateRating'] = get_rating(movie)
    movie['headline'], movie['alternativeHeadline'] = get_headlines(movie)
    movie['datePublished'] = get_movie_release(movie)
    movie['genre'] = ', '.join(movie.get('genres', []))
    movie['inLanguage'] = ', '.join(movie.get('languages', []))
    movie['keywords'] = ','.join(movie.get('keywords', []))
    movie['alternateName'] = get_alternate_name(movie)
    # BUG FIX: `movie.get('plot', [])[0]` raised IndexError whenever the
    # plot list was missing or empty; fall back to None so the key is
    # dropped downstream instead of crashing.
    plots = movie.get('plot') or []
    movie['description'] = plots[0] if plots else None
    movie['name'] = movie['title']
    movie['url'] = get_imdb_url(movie, 'Title')
    return movie
def get_movies_to_process(cursor):
    '''Return the ids of all cinema titles with enough votes.

    cursor -- sqlite3 cursor on the imdb.sqlite database.
    kind_id = 1 selects cinema entries; info_type_id = 100 is the
    "number of votes" row of movie_info_idx.
    '''
    min_votes = 128000
    q = """ SELECT id FROM title WHERE kind_id = 1
        AND id IN (SELECT DISTINCT movie_id FROM movie_info_idx WHERE
        info_type_id = 100 AND CAST(info as int) > ?);"""
    # Bind the threshold as a parameter instead of %-formatting it into
    # the SQL text (idiomatic and injection-safe by construction).
    ids = cursor.execute(q, (min_votes,)).fetchall()
    print("// %d movies to process." % len(ids))
    return [row[0] for row in ids]
def get_jsonld_from_imdb(imdb, mid):
    '''Assemble the JSON-LD dict for iMDB movie id `mid`.

    Fetches the movie, its team and its companies, then maps everything
    onto schema.org properties.  Keys with falsy values are stripped
    from the result.
    '''
    imdb_movie = fetch_imdb_object(imdb, mid)
    imdb_team = get_team(imdb, imdb_movie)
    companies = get_and_parse_companies(imdb, imdb_movie)
    movie_data = parse_imdb_movie(imdb_movie)
    movie = {
        '@id': movie_data['url'],
        '@type': 'Movie',
        'actor': parse_imdb_team(imdb_team, 'cast'),
        'director': parse_imdb_team(imdb_team, 'director'),
        'duration': movie_data['duration'],
        'musicBy': parse_imdb_team(imdb_team, 'composer'),
        # BUG FIX: get_and_parse_companies() only creates keys for roles
        # it actually saw, so plain [...] lookups raised KeyError for
        # movies without production companies or distributors.
        'productionCompany': companies.get('production companies'),
        'aggregateRating': movie_data['aggregateRating'],
        'alternativeHeadline': movie_data['alternativeHeadline'],
        'author': parse_imdb_team(imdb_team, 'writer'),
        'datePublished': movie_data['datePublished'],
        'editor': parse_imdb_team(imdb_team, 'editor'),
        'genre': movie_data['genre'],
        'headline': movie_data['headline'],
        'inLanguage': movie_data['inLanguage'],
        'keywords': movie_data['keywords'],
        'producer': parse_imdb_team(imdb_team, 'producer'),
        'provider': companies.get('distributors'),
        'alternateName': movie_data['alternateName'],
        'description': movie_data['description'],
        'name': movie_data['name'],
        'url': movie_data['url'],
    }
    # Drop empty/None values so the JSON-LD stays compact.
    return {k: v for (k, v) in movie.items() if v}
def process_all_imdb(cursor, imdb):
    '''Convert every selected movie to compacted JSON-LD and print it.'''
    for count, movie_id in enumerate(get_movies_to_process(cursor)):
        print("// %d movies processed." % count)
        doc = get_jsonld_from_imdb(imdb, movie_id)
        doc = jsonld.compact(doc, 'http://schema.org/')
        print(json.dumps(doc, indent=4))
if __name__ == "__main__":
    # Script entry point: open the database and stream JSON-LD to stdout.
    db_cursor, imdb_access = setup_session()
    process_all_imdb(db_cursor, imdb_access)