-
Notifications
You must be signed in to change notification settings - Fork 1
/
girlcode.py
70 lines (52 loc) · 2.15 KB
/
girlcode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from textblob import Word, TextBlob
from bs4 import BeautifulSoup
from collections import namedtuple
from lyrics import wikicase, getlyrics
import requests, operator
def strip(word):
    """Return *word* with punctuation and newline characters removed.

    Every character of ``word`` that appears in the fixed symbol set below
    is dropped; all remaining characters are kept in their original order.
    An empty ``word`` yields an empty string.
    """
    unwanted = ".?,\"'-�@()*:;[]{}<>$%^=+/\\|\n"
    # Build the result in one pass with join rather than growing a string
    # with ``+=`` for every character.
    return "".join(ch for ch in word if ch not in unwanted)
def stripFeaturedArtists(artist):
    """Return the part of *artist* that precedes the first "Featuring".

    If the text "Featuring" does not occur, ``artist`` is returned unchanged.
    Otherwise everything from the first "Featuring" onwards is discarded and
    trailing whitespace is trimmed from what remains, so
    ``"A Featuring B"`` becomes ``"A"``.
    """
    lead, marker, _guests = artist.partition("Featuring")
    if not marker:
        # No featured-artist credit present.
        return artist
    # rstrip() instead of slicing to ``index - 1``: the slice form turned into
    # ``[0:-1]`` when "Featuring" was at index 0 and chopped a real character
    # when the marker was not preceded by a space.
    return lead.rstrip()
def getSongs(row):
    """Return a ``(title, artist)`` tuple read from a chart row element.

    The title is the stripped text of the row's first ``h2`` element and the
    artist is the stripped text of its first ``h3`` element with any
    "Featuring ..." credit removed. Both values are passed through
    ``wikicase`` before being returned.
    """
    song_title = wikicase(row.find("h2").text.strip())
    lead_artist = stripFeaturedArtists(row.find("h3").text.strip())
    return song_title, wikicase(lead_artist)
def getRnBSongs(headerTag):
    """Return a ``(title, artist)`` tuple read from a chart header element.

    The title is the stripped text of the header's first ``h1`` element. The
    artist is the stripped text of the first element matching the CSS
    selector ``.chart_info a``, with any "Featuring ..." credit removed.
    Both values are passed through ``wikicase`` before being returned.
    """
    song_title = wikicase(headerTag.find("h1").text.strip())
    artist_link = headerTag.select(".chart_info a")[0]
    lead_artist = stripFeaturedArtists(artist_link.text.strip())
    return song_title, wikicase(lead_artist)
if __name__ == "__main__":
    # Chart pages to scrape, fetched and parsed in this order.
    chart_urls = [
        "http://www.billboard.com/charts/christian-songs",
        "http://www.billboard.com/charts/christian-songs?page=1",
        "http://www.billboard.com/charts/christian-songs?page=2",
    ]
    soups = [BeautifulSoup(requests.get(url).text) for url in chart_urls]

    # Collect one (title, artist) tuple per chart entry header on every page.
    songs = []
    for soup in soups:
        for row in soup.select(".song_review header"):
            title, artist = getRnBSongs(row)
            songs.append((title, artist))

    vocabulary = {}  # lemmatized, lower-cased word -> number of appearances

    # open() replaces the Python-2-only file() builtin, and the with-block
    # guarantees the handle is flushed and closed even if a lyrics lookup or
    # a write raises part-way through.
    # NOTE(review): the ``.encode('utf8') + '\n'`` concatenations below rely
    # on Python 2 byte strings; a Python 3 port would need text-mode writes
    # with an explicit encoding instead.
    with open('output_chr.txt', 'w') as unordered_list:
        for title, artist in songs:
            lyrics = getlyrics(artist, title)
            for word in TextBlob(lyrics).words:
                normalized = Word(word).lemmatize().lower()
                vocabulary[normalized] = vocabulary.get(normalized, 0) + 1
                # The raw word (not the normalized form) is logged, one per line.
                unordered_list.write(word.encode('utf8') + '\n')

    # Write "word,count" rows ordered by ascending count.
    sorted_v = sorted(vocabulary.items(), key=operator.itemgetter(1))
    with open('output_chr.csv', 'w') as f:
        for word, count in sorted_v:
            f.write(word.encode('utf8') + "," + str(count) + "\n")