data_fetcher.py
from datetime import datetime
from bs4 import BeautifulSoup
import requests
from requests.exceptions import ConnectionError
import regex as re
import csv
from tqdm import tqdm
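
# Read the exported watch list (export/watched.csv), scrape each film's page
# for details, and append the results to export/data.csv.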
total_films = 0
data = []
with open('export/watched.csv') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for row in reader:
        data.append(row)
        total_films += 1
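
# Scrape one film's page and return its details as a list
# (uri slug, year, name, director, runtime, cast, poster, countries,
# languages, genres, watch dates); returns None if the request fails.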
def get_film_data(film):
    name = film[1]
    year = film[2]
    uri = film[3]
    try:
        page = requests.get(uri).text
    except ConnectionError:
        print('No internet connection.')
        return
    soup = BeautifulSoup(page, 'lxml')
    # director
    try:
        director = str(soup.find('meta', attrs={'name': 'twitter:data1'}).attrs['content'])
    except (AttributeError, KeyError):
        director = ''
    # cast: stop at the "Show All" link and skip Stan Lee entries
    cast = []
    s = soup.find(class_='cast-list').find_all('a')
    for m in s:
        if m.string.startswith("Show All"):
            break
        elif m.string.startswith('Stan Lee'):
            continue
        else:
            cast.append(m.string)
    # countries and languages from the details tab
    no_details = False
    countries = []
    langs = []
    try:
        span = soup.find('div', attrs={'id': 'tab-details'}).select("span")
    except AttributeError:
        no_details = True
    if not no_details:
        for s in span:
            if s.contents[0] == "Countries" or s.contents[0] == "Country":
                d1 = s.find_next('div')
                countries = [str(c.contents[0]) for c in d1.find_all('a')]
            if s.contents[0] == "Languages" or s.contents[0] == "Language":
                d1 = s.find_next('div')
                langs = [str(c.contents[0]) for c in d1.find_all('a')]
    # genres from the genres tab
    genres = []
    no_genre = False
    try:
        span = soup.find('div', attrs={'id': 'tab-genres'}).select("span")
    except AttributeError:
        no_genre = True
    if not no_genre:
        for s in span:
            if s.contents[0] == "Genres" or s.contents[0] == "Genre":
                d1 = s.find_next('div')
                genres = [str(c.contents[0]) for c in d1.find_all('a', href=True)]
    # runtime: first number in the footer text
    d = soup.find('p', class_='text-footer')
    time = re.findall(r"\d+", str(d))[0]
    # watch dates from the diary export
    rewatched = False
    with open('export/diary.csv', encoding='utf-8') as f:
        diary = csv.reader(f)
        next(diary)  # skip the header row
        watches = []
        for entry in diary:
            if entry[1] == name and entry[2] == str(year):
                watches.append(datetime.strptime(entry[7], '%Y-%m-%d'))
        if len(watches) > 1:
            rewatched = True
    # poster
    try:
        poster = str(soup.find('div', attrs={'id': 'js-poster-col'}).find('img').attrs['src'])
    except (AttributeError, KeyError):
        poster = ''
    details = [uri.split('/')[-1], year, name, director, time, cast, poster, countries, langs, genres, watches]
    return details
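
# Write a header row plus one row of scraped details per watched film to export/data.csv.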
with open('export/data.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['uri', 'Year', 'Name', 'Director', 'Runtime', 'Cast', 'Poster', 'Countries', 'Languages', 'Genres', 'Watches'])
    for i in tqdm(range(total_films), desc="Loading film data...", ascii=False, ncols=75):
        try:
            film_data = get_film_data(data[i])
            writer.writerow(film_data)
        except Exception:
            print(f'\n\nError occurred at {data[i][1]} ({data[i][2]})')
            break