-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlyrics.py
122 lines (100 loc) · 3.91 KB
/
lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Bare bone py
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import requests
import re
def fetch_lyrics(sitemap_path='sitemap.xml', limit=7):
    """Collect lyrics for the song pages listed in a sitemap file.

    Every ``<loc>`` element (sitemaps.org 0.9 namespace) found in the
    sitemap is treated as a page URL, and ``extract_lyrics`` is called on
    the first ``limit`` of them.

    Args:
        sitemap_path: Path of the sitemap XML file to parse. Defaults to
            ``'sitemap.xml'``, the path this function used to hard-code.
        limit: Maximum number of URLs to fetch. Defaults to 7, the cap this
            function used to hard-code; pass ``None`` to fetch every URL.

    Returns:
        A list holding one ``extract_lyrics`` result (a dictionary of song
        name and lyrics) per fetched URL, in sitemap order.
    """
    loc_tag = "{http://www.sitemaps.org/schemas/sitemap/0.9}loc"
    root = ET.parse(sitemap_path).getroot()

    # An empty <loc/> element has text None; skip it so it neither consumes
    # one of the `limit` slots nor produces a None entry in the result.
    urls = []
    for loc in root.iter(loc_tag):
        url = (loc.text or "").strip()
        if url:
            urls.append(url)
    print("Total urls found {}".format(len(urls)))

    selected = urls if limit is None else urls[:limit]
    return [extract_lyrics(url) for url in selected]
def extract_lyrics(url='', timeout=10):
    """Download one song page and return its cleaned lyric lines.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    The text of every ``<span>`` inside the
    ``<div class="post-body post-content">`` container is stripped and
    passed through ``remove_metadata``, ``remove_music_clues`` and
    ``remove_footnote``; spans that end up empty are dropped.

    Args:
        url: Address of the song page. An empty value short-circuits.
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so that one unresponsive page cannot hang the whole run.

    Returns:
        ``None`` when ``url`` is empty; otherwise a dictionary with the keys
        ``"song-name"`` (from ``song_name(url)``) and ``"lyrics"`` (list of
        cleaned, non-empty strings). The list is empty when the page has no
        post-body container.
    """
    if not url:
        return None

    response = requests.get(url, timeout=timeout)
    page = BeautifulSoup(response.text, 'html.parser')
    post_body = page.find("div", class_="post-body post-content")
    # find() returns None when the container is absent; treat that as a page
    # without lyrics instead of failing with AttributeError.
    spans = post_body.find_all("span") if post_body is not None else []

    lyrics = []
    for span in spans:
        line = remove_metadata(span.text.strip())
        line = remove_music_clues(line)
        line = remove_footnote(line)
        if line:
            lyrics.append(line)

    return {
        "song-name": song_name(url),
        "lyrics": lyrics,
    }
def song_name(url):
    """Derive a short song name from the last path segment of ``url``.

    The segment after the final ``/`` is split on ``-`` and its first two
    pieces are joined back with ``-``.

    Args:
        url: Page URL (or any slash-separated string).

    Returns:
        The first two hyphen-separated pieces of the last segment, e.g.
        ``"phwi-phwi"`` for ``".../phwi-phwi-phwi-lyrics.html"``. When the
        segment contains no hyphen it is returned unchanged rather than
        raising ``IndexError``.
    """
    last_segment = str(url.split("/")[-1])
    # Slicing tolerates segments with fewer than two pieces.
    return "-".join(last_segment.split("-")[:2])
def remove_metadata(text):
    """Strip credit/metadata fragments from a piece of scraped text.

    Each pattern below is removed in turn with a case-insensitive
    ``re.sub``. Order matters: the ``"\n.*"`` entry deletes every line after
    the first, so the patterns listed before it see the whole text while the
    ones after it only ever see the first line.

    Args:
        text: Raw text of one scraped element.

    Returns:
        ``text`` with every match of every pattern replaced by ``""``.
    """
    patterns = (
        "Vocal.*",
        "Master.*",
        "Direction.*",
        "Starring.*",
        "DOP.*",
        "Arial.*",
        "Singer.*",
        "Starring.*",
        "Mijing.*",
        "Concept.*",
        ".*Lyrics.*",
        "Makeup.*",
        "Make-Up.*",
        "\nCo Cast.*",
        "\n.*Music.*",
        "\n.*Guiter.*",
        "\n.*Light.*",
        "\n.*",
        "Produced.*",
        "Assist.*",
        "Aeriel.*",
        "Lyricist.*",
        "Genres.*",
        "Romantic.*",
        "Producer.*",
        "Riya Brahma.*",
        "Audio.*",
        "Recording.*",
        "Mixing.*",
        "Video.*",
        "Director.*",
        "Editor.*",
        "Production.*",
        "Phwi phwi phwi Bodo Melody song is sung by Nikita Boro and written by Ibson Lal Baruah",
    )
    cleaned = text
    for pattern in patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=re.I)
    return cleaned
def remove_music_clues(text):
    """Strip performance hints (music cues, trailing notes) from a line.

    Three case-insensitive removals are applied in order:

    1. ``Music`` and everything after it on the same line.
    2. The first ``.`` on a line and everything after it on that line.
    3. Every occurrence of the substring ``times``.

    Args:
        text: One line of scraped text.

    Returns:
        ``text`` with the matches replaced by ``""``.
    """
    # Raw strings: "\." in a plain literal is an invalid escape sequence,
    # which newer Python versions warn about; the regex itself is unchanged.
    text = re.sub(r"Music.*", "", text, flags=re.I)
    text = re.sub(r"\..*", "", text, flags=re.I)
    text = re.sub(r"times", "", text, flags=re.I)
    return text
def remove_footnote(text):
    """Strip the site's closing footnote phrases from a piece of text.

    Each pattern is removed in turn with a case-insensitive ``re.sub``; all
    but the first also remove the rest of the line that follows the match.

    Args:
        text: One piece of scraped text.

    Returns:
        ``text`` with every match of every pattern replaced by ``""``.
    """
    footnote_patterns = (
        "Thanks for visiting Bodo Song Lyrics Site.",
        "Thanks.*",
        "Related.*",
        "Visiting.*",
        "You make.*",
    )
    result = text
    for pattern in footnote_patterns:
        result = re.sub(pattern, "", result, flags=re.I)
    return result