other_analysis.py
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Make sure the NLTK stopword corpus is available before it is used below
nltk.download('stopwords', quiet=True)

def other_visualizations():
    # Read in the vital people URL file
    vital_people_df = pd.read_excel('Data\\vital_people_links.xlsx')
    # List to store the word count of each vital people article
    vital_people_word_counts = []
    # String to accumulate the text of all vital people articles
    full_vital_text = ""
    # Set of English stop words
    stop_words = set(stopwords.words('english'))
    # Iterate through all vital people links
    for i in vital_people_df['Links']:
        # Fetch the article page
        res = requests.get(i)
        # Parse the HTML content of the page
        soup = bs(res.text, "html.parser")
        # Keep only the paragraph text of the article
        text = ''
        for paragraph in soup.find_all('p'):
            text += paragraph.text
            full_vital_text += paragraph.text
        vital_people_word_counts.append(len(text.split()))
    # Remove stop words from full_vital_text (match whole words only)
    big_regex = re.compile(r'\b(?:' + '|'.join(map(re.escape, stop_words)) + r')\b')
    full_vital_text = big_regex.sub(" ", full_vital_text)
    # Count the frequency of each remaining word
    vital_text_dictionary = dict(Counter(full_vital_text.replace(',', '').replace('.', '').split()))
    # Convert the word-count list to a pandas dataframe
    vital_people_word_counts_df = pd.DataFrame(vital_people_word_counts, columns=['Counts'])
    # Store the dataframe as an Excel file
    vital_people_word_counts_df.to_excel(r'C:\Users\Ajay\PycharmProjects\Wikipedia_NLP_Project\Data\vital_people_word_counts.xlsx', index=False, header=True)
    # Read in the serial killers URL file
    serial_people_df = pd.read_excel('Data\\serial_killer_people_links.xlsx')
    # List to store the word count of each serial killer article
    serial_killer_word_counts = []
    # String to accumulate the text of all serial killer articles
    full_serial_killer_text = ""
    # Iterate through all serial killer links
    for i in serial_people_df['Links']:
        # Fetch the article page
        res = requests.get(i)
        # Parse the HTML content of the page
        soup = bs(res.text, "html.parser")
        # Keep only the paragraph text of the article
        text = ''
        for paragraph in soup.find_all('p'):
            text += paragraph.text
            full_serial_killer_text += paragraph.text
        serial_killer_word_counts.append(len(text.split()))
    # Remove stop words from full_serial_killer_text (match whole words only)
    big_regex = re.compile(r'\b(?:' + '|'.join(map(re.escape, stop_words)) + r')\b')
    full_serial_killer_text = big_regex.sub(" ", full_serial_killer_text)
    # Count the frequency of each remaining word
    serial_killer_text_dictionary = dict(Counter(full_serial_killer_text.replace(',', '').replace('.', '').split()))
    # Convert the word-count list to a pandas dataframe
    serial_killer_word_counts_df = pd.DataFrame(serial_killer_word_counts, columns=['Counts'])
    # Store the dataframe as an Excel file
    serial_killer_word_counts_df.to_excel(r'C:\Users\Ajay\PycharmProjects\Wikipedia_NLP_Project\Data\serial_people_word_counts.xlsx', index=False, header=True)
    # Create a box plot comparing article word counts of the two groups
    vital_people_word_counts = pd.read_excel('Data\\vital_people_word_counts.xlsx')
    serial_killer_word_counts = pd.read_excel('Data\\serial_people_word_counts.xlsx')
    data = [vital_people_word_counts['Counts'], serial_killer_word_counts['Counts']]
    labels = ['Vital People Word Count', 'Serial Killer Word Count']
    fig = plt.figure(figsize=(10, 7))
    # Draw the box plot
    bp = plt.boxplot(data, meanline=True, showmeans=True, showcaps=True, showbox=True,
                     showfliers=True, vert=True, patch_artist=True, labels=labels)
    # Colour the boxes to match the order of data/labels
    colors = ['green', 'red']
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
    # Show the plot
    plt.show()
    # Plot the vital people word cloud
    text_word_cloud = " ".join(vital_text_dictionary.keys())
    # Generate a word cloud image
    wordcloud = WordCloud().generate(text_word_cloud)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    # Regenerate with a lower max_font_size
    wordcloud = WordCloud(max_font_size=40).generate(text_word_cloud)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    # Store the vital people word frequency dictionary as an Excel file
    vital_people_frequency_df = pd.DataFrame(data=vital_text_dictionary, index=[0])
    vital_people_frequency_df.to_excel('vital_people_frequency_df.xlsx')
    # Store the serial killer word frequency dictionary as an Excel file
    serial_killer_frequency_df = pd.DataFrame(data=serial_killer_text_dictionary, index=[0])
    serial_killer_frequency_df.to_excel('serial_killer_frequency_df.xlsx')
    # Plot the serial killer word cloud
    text_word_cloud = " ".join(serial_killer_text_dictionary.keys())
    # Generate a word cloud image
    wordcloud = WordCloud().generate(text_word_cloud)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    # Regenerate with a lower max_font_size
    wordcloud = WordCloud(max_font_size=40).generate(text_word_cloud)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
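
# Minimal usage sketch (assumptions: the Data folder holds vital_people_links.xlsx and
# serial_killer_people_links.xlsx, each with a 'Links' column of Wikipedia article URLs,
# and the output paths used above are writable).
if __name__ == '__main__':
    other_visualizations()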