# AlbumData.py -- scrapes release data from the TMBW Discography wiki page.
import requests
from bs4 import BeautifulSoup
def get_table_head_fields_as_list(table_obj):
    """Return the stripped text of every cell in the table's first row.

    Args:
        table_obj: A parsed table element exposing BeautifulSoup-style
            ``find`` / ``find_all`` methods.

    Returns:
        A list of strings, one per cell of the first ``<tr>``, in document
        order.  An empty list is returned when the table has no rows.
    """
    header_row = table_obj.find('tr')
    if header_row is None:
        return []
    # Look for cells, not rows: a <tr> holds <th>/<td> children, so searching
    # it for further 'tr' elements finds nothing on an ordinary table.
    header_cells = header_row.find_all(['th', 'td'])
    return [cell.getText().strip() for cell in header_cells]
def get_album_data(name):
    """Placeholder for scraping the page of a single album.

    Not implemented yet: ``name`` is ignored and every call returns -1.
    """
    not_implemented_marker = -1
    return not_implemented_marker
def get_table_body_as_lists(table_obj):
    """Convert every row of a table into a list of its ``<td>`` cell texts.

    Rows are collected from the whole table (not from a ``<tbody>`` child),
    so a row that contains no ``<td>`` cells shows up as an empty list.

    Args:
        table_obj: A parsed table element exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns:
        A list with one entry per ``<tr>``; each entry is the list of that
        row's cell texts with surrounding whitespace stripped.
    """
    return [
        [cell.getText().strip() for cell in tr.find_all('td')]
        for tr in table_obj.find_all('tr')
    ]
#def scrape_album(title):
# Download and parse the Discography page once, at import time; the resulting
# rows are kept in the module-level ``final_table_data`` list.
url = 'http://tmbw.net/wiki/Discography'
# A timeout keeps an unresponsive server from hanging the import forever.
data = requests.get(url, timeout=30)  # get page data
data.raise_for_status()  # stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(data.text, features="html.parser")  # parse page data
table = soup.find('table')  # get the first table on the page
if table is None:
    raise RuntimeError("No table found at " + url)
#print(table)
table_head = get_table_head_fields_as_list(table)
table_body = get_table_body_as_lists(table)
final_table_data = [table_head] + table_body  # join the head data and body data
# Drop the first two entries: the head list and the first body row
# (presumably the header row seen again without <td> cells -- confirm).
final_table_data = final_table_data[2:]
# Get all releases listed on the Discography page and print one line each.
for entry in final_table_data:
    # A table row without enough <td> cells (get_table_body_as_lists yields
    # [] for a row that has none) cannot supply year/name/type, so skip it
    # instead of failing with IndexError.
    if len(entry) < 3:
        continue
    year = entry[0]
    name = entry[1]
    typ = entry[2]
    # TODO: fetch each release's detail page (under http://tmbw.net/wiki/) and
    # parse its first table to fill in date, length, num_tracks, cover and
    # tracklist.
    print("{}, {} {}".format(name, year, typ))
def Album(title):
    '''Wrapper function the endpoint will call.

    Searches the module-level ``final_table_data`` rows for a release whose
    second field equals ``title`` exactly.

    Input: title
    Output: the matching row (a list of field strings); if several rows
        match, the last one wins; an empty string if nothing matches
    '''
    album_details = ""
    for album in final_table_data:
        # Rows with fewer than two fields have no title column; skip them
        # rather than raising IndexError.
        if len(album) > 1 and album[1] == title:
            album_details = album
    return album_details