pylyrics3.py
'''
Loosely based on http://github.com/tremby/py-lyrics - thanks!
'''
import urllib.parse
import requests
from collections import defaultdict
from bs4 import BeautifulSoup


class PyLyrics3(object):
    '''
    Lyric-scraper object.
    '''

    __lyric_wiki_url = 'http://lyrics.wikia.com'

    def __init__(self, proxies=None):
        '''
        Params:
            Optional:
            proxies: dict - proxies for the object's _session property to use
        '''
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': 'lyrics'})
        self._session.proxies = proxies
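
    # Example (sketch, not part of the original module): constructing a
    # scraper, optionally behind a proxy; the proxy address below is a
    # placeholder, not a real endpoint.
    #
    #   scraper = PyLyrics3()
    #   proxied = PyLyrics3(proxies={'http': 'http://127.0.0.1:8080'})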

    def get_artist_lyrics(self, artist, albums=False):
        '''
        Params:
            artist: str - name of the artist whose lyrics will be scraped
            Optional:
            albums: boolean - flag to structure the result dict by album.
                Default is False; setting it to True returns a dict nested
                by album
        Returns dict with structure {song: lyrics}: {str: str}
        '''
        artist = artist.lower()
        url = self._construct_lyricwiki_url(artist)
        try:
            soup = self.__get_soup(url)
        except ValueError:
            print("Sorry, we couldn't find a Wiki for '%s' on LyricWiki."
                  % artist)
            return
        if albums:
            return self.__parse_albums(self.__get_artist_album_links(soup),
                                        artist.lower())
        song_urls = self.__get_artist_song_links(soup)
        title_to_lyrics = {}
        for url in song_urls:
            try:
                song_artist, title = self.__parse_song_artist_and_title(url)
            except IndexError:
                # when a song is a cover, the link to the original artist gets
                # picked up by the link css selector
                continue
            if artist not in song_artist.lower():
                continue
            lyrics = self.get_lyrics_from_url(url)
            if lyrics:
                title_to_lyrics[title] = lyrics
        return title_to_lyrics
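
    # Rough sketch (not part of the original module) of the shapes returned;
    # the artist, song, and album names are placeholders.
    #
    #   PyLyrics3().get_artist_lyrics('some artist')
    #   # -> {'Song One': '...lyrics...', 'Song Two': '...lyrics...'}
    #
    #   PyLyrics3().get_artist_lyrics('some artist', albums=True)
    #   # -> {'Album Title': {'Song One': '...lyrics...', ...}}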

    def get_song_lyrics(self, artist, title):
        '''
        Params:
            artist: str - name of artist
            title: str - name of song
        Returns str of scraped lyrics
        '''
        return self.get_lyrics_from_url(
            self._construct_lyricwiki_url(artist, title))

    def get_lyrics_from_url(self, url):
        '''Get and return the lyrics for the given song.
        Returns False if there are no lyrics (it's instrumental).
        TODO:
            Raises an IOError if the lyrics couldn't be found.
            Raises an IndexError if there is no lyrics tag.
        '''
        try:
            soup = self.__get_soup(url)
        except ValueError:
            page_name = self._decode_lyricwiki(url.split('/')[-1])
            artist = page_name.split(':')[0]
            title = page_name.split(':')[1]
            print("Ran into an error getting lyrics for '%s' by %s!"
                  % (title, artist))
            return
        try:
            lyricbox = soup.select('.lyricbox')[0]
            # remove script tags
            [s.extract() for s in lyricbox.find_all('script')]
        except IndexError:
            return None
        # look for a sign that it's instrumental
        if len(soup.select('.lyricbox a[title=\'Instrumental\']')):
            return False
        # prepare output
        lyrics = []
        if lyricbox.text is not None:
            for string in lyricbox.stripped_strings:
                lyrics.append(string + ' \n ')
        return ''.join(lyrics).strip()
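
    # Sketch (not part of the original module) of handling the three possible
    # results; the URL below is a placeholder page name, not a verified entry.
    #
    #   result = scraper.get_lyrics_from_url(
    #       'http://lyrics.wikia.com/wiki/Some_Artist:Some_Song')
    #   if result is None:       # fetch failed or no .lyricbox on the page
    #       ...
    #   elif result is False:    # the page is marked as an instrumental
    #       ...
    #   else:                    # str of scraped lyrics
    #       ...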

    def __parse_albums(self, albums, artist_name):
        '''Given a collection of album <a> tags, fetch their lyrics
        Params:
            albums: collection - collection of <a> tags, each being a link to
                an album
        Returns:
            Dict with structure {album: {track: lyrics}}: {str: {str: str}}
        '''
        artist_dict = defaultdict(dict)
        for album in albums:
            album_title = album.text
            tracks = self.__parse_multi_disc(self.__get_parent_h2(album))
            urls = [self.__lyric_wiki_url + a.get('href') for a in tracks]
            for url in urls:
                try:
                    artist, track_title = self.__parse_song_artist_and_title(url)
                except IndexError:
                    continue
                if artist_name != artist.lower():
                    continue
                artist_dict[album_title][track_title] = self.get_lyrics_from_url(url)
        return artist_dict

    # LyricWiki helper methods
    @staticmethod
    def _decode_lyricwiki(str_):
        '''Decode a LyricWiki-encoded string'''
        str_ = str_.replace('Less_Than', '<')
        str_ = str_.replace('Greater_Than', '>')
        str_ = str_.replace('Number_', '#')
        str_ = str_.replace('Sharp_', '#')
        str_ = str_.replace('%27', "'")
        str_ = str_.replace('_', ' ')
        return urllib.parse.unquote(str_)

    @staticmethod
    def _encode_lyricwiki(str_):
        '''Return a string in LyricWiki encoding.
        Substitutions are performed as described at
        <http://lyrics.wikia.com/LyricWiki:Page_Names>.
        '''
        words = str_.split()
        newwords = []
        for word in words:
            newwords.append(word.title())
        str_ = '_'.join(newwords)
        str_ = str_.replace('<', 'Less_Than')
        str_ = str_.replace('>', 'Greater_Than')
        # TODO: Support Sharp_ as a valid substitution for '#'.
        str_ = str_.replace('#', 'Number_')
        str_ = str_.replace('[', '(')
        str_ = str_.replace(']', ')')
        str_ = str_.replace('{', '(')
        str_ = str_.replace('}', ')')
        return str_
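
    # Worked examples (not part of the original module) of the encoding
    # above, traced by hand from the substitutions:
    #
    #   _encode_lyricwiki('in the aeroplane over the sea')
    #   # -> 'In_The_Aeroplane_Over_The_Sea'
    #   _encode_lyricwiki('track #9')
    #   # -> 'Track_Number_9'
    #   _encode_lyricwiki('some song [live]')
    #   # -> 'Some_Song_(Live)'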

    @staticmethod
    def _construct_lyricwiki_url(artist, song_title=None, edit=False):
        '''Constructs a LyricWiki URL for an artist or song
        Params:
            artist: str - artist to link to
            Optional:
            song_title: str - specific song to link to
            edit: boolean - flag to get link to edit entry
        Returns str url'''
        base = PyLyrics3.__lyric_wiki_url + '/wiki/'
        page_name = PyLyrics3._encode_lyricwiki(artist)
        if song_title:
            page_name += ':%s' % PyLyrics3._encode_lyricwiki(song_title)
        if edit:
            return base + 'index.php?title=%s&action=edit' % page_name
        return base + page_name
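
    # Example URLs (not part of the original module); the artist and song
    # names are placeholders, so the pages themselves are not guaranteed
    # to exist:
    #
    #   _construct_lyricwiki_url('some artist', 'some song')
    #   # -> 'http://lyrics.wikia.com/wiki/Some_Artist:Some_Song'
    #   _construct_lyricwiki_url('some artist', 'some song', edit=True)
    #   # -> 'http://lyrics.wikia.com/wiki/index.php?title=Some_Artist:Some_Song&action=edit'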

    # Class helper methods
    @staticmethod
    def __parse_multi_disc(h2_tag):
        '''Given an album's h2 tag, find all <a> tags for tracks associated
        with the album
        Params:
            h2_tag: BeautifulSoup - h2 tag of album
        Returns list of <a> tags for tracks'''
        tracks = []
        soup = h2_tag.next_sibling
        while soup and soup.name != 'h2':
            if soup.name == 'ol':
                tracks += soup.select('li b a')
            soup = soup.next_sibling
        return tracks

    @staticmethod
    def __get_artist_song_links(artist_soup):
        '''Given the soup of an artist page, get the full URLs of all tracks'''
        songs = []
        for link_tag in artist_soup.select('li b a'):
            if link_tag.get('href'):
                link = PyLyrics3.__lyric_wiki_url + link_tag.get('href')
                songs.append(link)
        return songs

    @staticmethod
    def __get_artist_album_links(artist_soup):
        '''Given the soup of an artist page, get <a> tags of all albums'''
        return artist_soup.select('h2 .mw-headline a')

    @staticmethod
    def __get_parent_h2(album_a_tag):
        '''Given the soup of an album's <a> tag, get its h2 parent'''
        return album_a_tag.parent.parent

    @staticmethod
    def __parse_song_artist_and_title(url):
        '''Given a LyricWiki-encoded url, parse out the artist and song title'''
        # unpacking as 3 elements would throw a ValueError rather than an
        # IndexError, which the other methods catch
        splits = url.split(':')
        artist = splits[1]
        title = splits[2]
        artist = artist.split('/')[-1]
        return (PyLyrics3._decode_lyricwiki(artist),
                PyLyrics3._decode_lyricwiki(title))
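
    # Worked example (not part of the original module), traced by hand with a
    # placeholder page:
    #
    #   __parse_song_artist_and_title(
    #       'http://lyrics.wikia.com/wiki/Some_Artist:Some_Song')
    #   # splits on ':' -> ['http', '//lyrics.wikia.com/wiki/Some_Artist', 'Some_Song']
    #   # -> ('Some Artist', 'Some Song')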

    # IO Helper methods
    def __get_soup(self, url):
        '''Download and parse a URL as a BeautifulSoup object'''
        req = self._session.get(url)
        try:
            self.__check_response(req.status_code)
            # lxml is much speedier than the normal parser, but requires an
            # extra install; bs4 raises FeatureNotFound (a ValueError) when
            # it is missing, so fall back to the built-in parser
            try:
                return BeautifulSoup(req.text, 'lxml')
            except ValueError:
                return BeautifulSoup(req.text, 'html.parser')
        except AssertionError:
            print('Unable to download url ' + url)
            raise ValueError('Status', req.status_code)

    @staticmethod
    def __check_response(status_code):
        '''Raises an assertion error if the status code is not a success'''
        first_digit = status_code // 100
        assert first_digit in (2, 3)


# support for importing just the functions
__INSTANCE = PyLyrics3()
get_song_lyrics = __INSTANCE.get_song_lyrics
get_artist_lyrics = __INSTANCE.get_artist_lyrics
get_lyrics_from_url = __INSTANCE.get_lyrics_from_url
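

# A minimal usage sketch (not part of the original module). The artist and
# song names below are placeholders, and the calls assume the LyricWiki host
# configured above still serves these pages.
if __name__ == '__main__':
    scraper = PyLyrics3()
    # single song: a str of lyrics, False if instrumental, or None on failure
    print(scraper.get_song_lyrics('Some Artist', 'Some Song'))
    # whole discography keyed by song title
    print(scraper.get_artist_lyrics('Some Artist'))
    # nested by album: {album: {track: lyrics}}
    print(scraper.get_artist_lyrics('Some Artist', albums=True))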