-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMelonLyricScrapper.py
90 lines (68 loc) · 2.91 KB
/
MelonLyricScrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
import re
import datetime
from bs4 import BeautifulSoup
from Scrapper.Scrapper import Scrapper
class MelonLyricScrapper(Scrapper):
    """Scrape lyrics and basic metadata from a Melon song-detail page.

    The page for a song is fetched from
    ``https://www.melon.com/song/detail.htm?songid=<song_id>`` and parsed
    with BeautifulSoup using fixed CSS selectors.
    """

    # Seconds to wait for the HTTP response. Without a timeout
    # ``requests.get`` may block indefinitely, and the ``Timeout`` handling
    # in ``scrapping`` would rarely get a chance to run.
    REQUEST_TIMEOUT = 10

    # Compiled once at class creation instead of on every call.
    _TAG_PATTERN = re.compile(r'<.*?>')
    _NOISE_PATTERN = re.compile(r'(19금|\r|\n|\t)')
    _HANGUL_PATTERN = re.compile(r'[ㄱ-ㅎㅏ-ㅣ가-힣]')

    def __init__(self):
        super().__init__()
        self.url = 'https://www.melon.com/song/detail.htm?songid='

    def strip_html(self, data):
        """Return ``data`` with every ``<...>`` tag removed.

        The match is non-greedy and does not span line breaks, so only tags
        that open and close on the same line are stripped.
        """
        return self._TAG_PATTERN.sub('', data)

    def strip_19(self, data):
        """Return ``data`` without the '19금' marker, CR, LF and tab characters."""
        return self._NOISE_PATTERN.sub('', data)

    def find_hanguel(self, data):
        """Return True if ``data`` contains at least one Hangul character.

        Both compatibility jamo (ㄱ-ㅎ, ㅏ-ㅣ) and precomposed syllables
        (가-힣) count. The whole string is searched, including every line of
        multi-line text; an anchored ``.*``-based match would stop at the
        first newline and miss Hangul on later lines.
        """
        return self._HANGUL_PATTERN.search(data) is not None

    def set_url(self, *args):
        """Point ``self.url`` at the detail page of the given song.

        Args:
            *args: exactly one positional argument, the song id.

        Raises:
            ValueError: if the number of arguments is not exactly one.
        """
        if len(args) != 1:
            raise ValueError('set_url(song_id) -> args must have just one item')
        self.url = 'https://www.melon.com/song/detail.htm?songid={}'.format(args[0])

    def scrapping(self, *args):
        """Fetch one song page and extract its lyrics and metadata.

        Args:
            *args: exactly one positional argument, the song id.

        Returns:
            A dict with the keys ``artist``, ``title``, ``album``,
            ``release_date`` (POSIX timestamp; the parsed date is naive, so
            it is interpreted in the local timezone), ``genre`` and
            ``lyric``. ``None`` is returned when the lyrics contain no
            Hangul, when the request fails, when an expected page element is
            missing, or when the release date is not in ``YYYY.MM.DD`` form.

        Raises:
            ValueError: if the number of arguments is not exactly one.
        """
        if len(args) != 1:
            raise ValueError('scrapping(song_id) -> args must have just one item')
        song_id = args[0]
        self.set_url(song_id)

        try:
            response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Lyrics: turn <br/> into real line breaks before dropping the
            # remaining markup, so the line structure survives.
            lyric_nodes = soup.select('#d_video_summary')
            lyric_text = str(lyric_nodes[0]).replace('<br/>', '\n')
            lyric_text = self.strip_html(lyric_text).strip()
            if not self.find_hanguel(lyric_text):
                return None

            title_nodes = soup.select(
                '#downloadfrm > div > div > div.entry > div.info > div.song_name')
            # The first four characters of the node text are skipped --
            # presumably a label preceding the title; verify against the page.
            title_text = self.strip_19(title_nodes[0].text[4:].strip())

            artist_nodes = soup.select(
                '#downloadfrm > div > div > div.entry > div.info > div.artist > a > span:nth-of-type(1)')
            artist_text = self.strip_html(str(artist_nodes[0]))

            # The <dd> entries are read positionally: album, release date, genre.
            meta_nodes = soup.select(
                '#downloadfrm > div > div > div.entry > div.meta > dl > dd')
            release_date = datetime.datetime.strptime(
                meta_nodes[1].text, '%Y.%m.%d').timestamp()

            return {
                'artist': artist_text,
                'title': title_text,
                'album': meta_nodes[0].text,
                'release_date': release_date,
                'genre': meta_nodes[2].text,
                'lyric': lyric_text,
            }
        except (requests.exceptions.RequestException, IndexError, ValueError):
            # RequestException already covers Timeout and TooManyRedirects.
            # IndexError means a selector matched nothing; ValueError means
            # the release date could not be parsed. Every failure is reported
            # to the caller as None.
            return None