-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscript_crawler.py
153 lines (113 loc) · 4.71 KB
/
script_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# -*- coding: utf-8 -*-
from selenium import webdriver
import requests
from bs4 import BeautifulSoup as bs
import codecs
from os import path
def get_movie_list(url, timeout=30):
    """Download a movie index page and return its movie entries.

    Args:
        url: Address of the index page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The ``<p>`` tags found inside the sixth ``<td>`` cell of the page,
        or an empty list when the page has fewer than six ``<td>`` cells.
    """
    # Without a timeout, requests.get() may block indefinitely on a stalled
    # connection and freeze the whole crawl.
    res = requests.get(url, timeout=timeout)
    cells = bs(res.text, 'html.parser').find_all('td')
    # The listing is read from the sixth cell; a page with a different layout
    # (e.g. an error page) yields "no movies" instead of an IndexError.
    if len(cells) <= 5:
        return []
    return cells[5].find_all('p')
def get_script_url(driver, movie_url):
    """Return the script link found on a movie's detail page.

    Args:
        driver: Selenium WebDriver used to load the page.
        movie_url: Site-relative path of the movie page; it is appended to
            ``http://www.imsdb.com``.

    Returns:
        The ``href`` of the last link inside the ``script-details`` table
        when it starts with ``/scripts/``; otherwise an empty string (also
        when the table or its links are missing).
    """
    driver.get('http://www.imsdb.com' + movie_url)
    soup = bs(driver.page_source, 'html.parser')

    details = soup.find('table', 'script-details')
    if details is None:
        # Page has no details table: treat it as "no script available".
        return ''

    links = details.find_all('a', href=True)
    if not links:
        return ''

    url = links[-1]['href']
    # Only links under /scripts/ point at an actual script page.
    return url if url.startswith('/scripts/') else ''
# TODO: add the year (presumably the release year, to disambiguate the score lookup)
def _first_listed_score(listing):
    """Return the first non-empty rating in a search-result listing, or None."""
    for item in listing.find_all('div', 'lister-item'):
        rating = item.find('div', 'ratings-imdb-rating')
        if rating is None:
            # This entry carries no rating block; try the next one.
            continue
        score = rating.get('data-value')
        if score:
            return score
    return None


def _score_from_find_page(driver, movie_title):
    """Return the first rating reachable from the IMDb find page, or None."""
    driver.get('http://www.imdb.com/find?q=' + movie_title + '&s=tt')
    table = bs(driver.page_source, 'html.parser').find('table', 'findList')
    if table is None:
        return None
    # The rows are collected before the driver navigates away from the page.
    for row in table.find_all('tr', 'findResult'):
        cell = row.find('td', 'result_text')
        link = cell.find('a', href=True) if cell is not None else None
        if link is None:
            continue
        driver.get('http://www.imdb.com' + link['href'])
        page = bs(driver.page_source, 'html.parser')
        try:
            score = page.find('div', 'ratingValue').find('span').text
        except AttributeError:
            # The rating element is absent on this title page.
            continue
        if score:
            return score
    return None


def get_movie_score(driver, movie_title):
    """Look up a movie's rating on IMDb.

    The title-search page is tried first. When that page has no
    ``lister-list`` container, the generic find page is used instead and
    each result's title page is visited until a rating is found.

    Args:
        driver: Selenium WebDriver used to load the pages.
        movie_title: Title text inserted verbatim into the query URLs.

    Returns:
        The rating as the string found in the page, or ``0`` (after printing
        a notice) when no rating could be found.
    """
    driver.get('https://www.imdb.com/search/title?title=%s&title_type=feature' % movie_title)
    soup = bs(driver.page_source, 'html.parser')

    listing = soup.find('div', 'lister-list')
    if listing is not None:
        score = _first_listed_score(listing)
    else:
        score = _score_from_find_page(driver, movie_title)

    if score:
        return score
    print(movie_title, 'You have to find score for this movie by yourself (refer- : https://www.imdb.com)')
    return 0
def get_movie_script(driver, script_url):
    """Download one script page and save its text under ``./raw_script/``.

    Args:
        driver: Selenium WebDriver used to load the pages.
        script_url: Site-relative script path; the title is taken from
            ``script_url[9:-5]``, i.e. the part between a nine-character
            prefix and a five-character suffix.

    Returns:
        ``1`` when a file was written, ``0`` when the page has no
        ``scrtext`` cell or the extracted text has fewer than 100 lines,
        and ``None`` when the URL's basename does not end in ``html``.
    """
    if not path.basename(script_url).endswith('html'):
        return None
    movie_title = script_url[9:-5]

    driver.get('http://www.imsdb.com' + script_url)
    # page_source is already decoded text, so no from_encoding is passed
    # (BeautifulSoup ignores it for str input and only emits a warning).
    script_soup = bs(driver.page_source, 'html.parser')
    script_td = script_soup.find('td', 'scrtext')
    if script_td is None:
        return 0

    # Turn <br> tags into real newlines so line structure survives get_text().
    for br in script_td.find_all('br'):
        br.replace_with('\n')

    paragraphs = script_td.find_all('p')
    if len(paragraphs) > 100:
        # Script is marked up as many <p> blocks: keep only their text.
        script_text = ''.join(p.get_text() for p in paragraphs)
    else:
        # Drop the first embedded table and div, when present, before
        # extracting the text of the whole cell.
        for tag_name in ('table', 'div'):
            extra = script_td.find(tag_name)
            if extra is not None:
                extra.decompose()
        script_text = script_td.get_text()

    # Anything shorter than 100 lines is not treated as a real script.
    if len(script_text.split('\n')) < 100:
        return 0
    script_text = script_text.replace('\r', '\n').replace('\xa0', ' ')

    # The score is looked up only once the script is known to be worth
    # saving; the driver may navigate away now that the text is extracted.
    score = get_movie_score(driver, movie_title)

    import os
    try:
        os.makedirs('./raw_script')
    except OSError:
        # Already existing (possibly created by another worker) is fine.
        if not path.isdir('./raw_script'):
            raise
    with codecs.open('./raw_script/%s_%s.txt' % (movie_title, score), 'w', 'utf-8') as f:
        f.write(script_text)
    return 1
def work(prefix):
    """Crawl every movie listed under one alphabetical index page.

    Starts a headless Chrome session, fetches the movie list for ``prefix``,
    tries to save each movie's script, prints a summary line and closes
    the browser.

    Args:
        prefix: Suffix appended to ``http://www.imsdb.com/alphabetical/``
            to select the index page.
    """
    print(prefix+' start')

    # selenium driver config
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # NOTE(review): positional driver path and `chrome_options=` look like an
    # older Selenium API - confirm against the installed version.
    driver = webdriver.Chrome('chromedriver', chrome_options=options)

    # try/finally guarantees the browser process is shut down even when a
    # page fails to load or parse part-way through the crawl.
    try:
        driver.implicitly_wait(0.1)

        movie_tag_list = get_movie_list('http://www.imsdb.com/alphabetical/'+prefix)
        num_success = 0
        num_fail = 0

        for movie_tag in movie_tag_list:
            link = movie_tag.find('a', href=True)
            if link is None:
                # Entry without a link: nothing to crawl, so it ends up in
                # the "nonexistent" remainder of the summary.
                continue
            movie_script_url = get_script_url(driver, link['href'])
            if movie_script_url == '':
                continue
            if get_movie_script(driver, movie_script_url):
                num_success += 1
            else:
                print('fail: ', movie_script_url)
                num_fail += 1

        print('%s, all: %d success: %d fail: %d nonexsistent: %d'
              % (prefix, len(movie_tag_list), num_success, num_fail, len(movie_tag_list)-num_success-num_fail))
    finally:
        driver.quit()