# THINGS TO DO BEFORE WRITING CSV:
# - perform sentiment analysis
# - determine if bull or bear
# - search for ticker
# - gather stock stats
# - create filtered list to output

import csv
import re
from datetime import datetime

import pandas as pd
import requests

# Request headers shared by every HTTP call in this script.
HEADERS = {'user-agent': 'Chrome/53.0.2785.143'}

class NewsArticle:
    """One article pulled from a PR Newswire RSS feed."""

    def __init__(self, title, pubdate, link, content):
        self.title = title
        self.pubdate = pubdate
        self.link = link
        self.content = content
        # Planned fields:
        # self.sentiment = buy | sell
        # self.ticker = ticker
        # self.stock_of_interest = True | False

    # def get_sentiment():
    # def find_tickers():
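
    # A minimal sketch of the two stubs above. The keyword lists and the
    # exchange-prefix regex are illustrative placeholders, not part of the
    # original script; real sentiment analysis would need a proper model
    # or lexicon.
    def get_sentiment(self):
        """Label the article 'buy' or 'sell' by counting signal words."""
        bullish = ('growth', 'record', 'beat', 'surge', 'profit')   # placeholder list
        bearish = ('loss', 'decline', 'lawsuit', 'recall', 'miss')  # placeholder list
        text = (self.content or '').lower()
        ups = sum(text.count(w) for w in bullish)
        downs = sum(text.count(w) for w in bearish)
        self.sentiment = 'buy' if ups >= downs else 'sell'
        return self.sentiment

    def find_tickers(self):
        """Pull symbols like '(NYSE: ABC)' out of the article body."""
        # Exchange prefixes are assumptions about how tickers appear in wire copy.
        pattern = r'\((?:NYSE|NASDAQ|Nasdaq|OTC\w*|TSX)\s*:\s*([A-Z.\-]{1,6})\)'
        self.ticker = re.findall(pattern, self.content or '')
        return self.ticker
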
def main():
    urls = [
        'http://www.prnewswire.com/rss/automotive-transportation/all-automotive-transportation-news.rss',
        'http://www.prnewswire.com/rss/business-technology/all-business-technology-news.rss',
        'http://www.prnewswire.com/rss/consumer-products-retail/all-consumer-products-retail-news.rss',
        'http://www.prnewswire.com/rss/consumer-technology/all-consumer-technology-news.rss',
        'http://www.prnewswire.com/rss/energy/all-energy-news.rss',
        'http://www.prnewswire.com/rss/environment/all-environment-news.rss',
        'http://www.prnewswire.com/rss/financial-services/all-financial-services-news.rss',
        'http://www.prnewswire.com/rss/general-business/all-general-business-news.rss',
        'http://www.prnewswire.com/rss/health/all-health-news.rss',
        'http://www.prnewswire.com/rss/heavy-industry-manufacturing/all-heavy-industry-manufacturing-news.rss',
        'http://www.prnewswire.com/rss/policy-public-interest/all-policy-public-interest-news.rss',
        'http://www.prnewswire.com/rss/telecommunications/all-telecommunications-news.rss',
    ]
    feed_headers = []
    log_filename = create_log_csv()
    try:
        for url in urls:
            titles, pubdates, links = get_news_headers(url)
            for i in range(len(links)):
                # Skip the feed's own channel/self links; keep article links.
                if '.rss' in links[i]:
                    continue
                feed_headers.append([titles[i], pubdates[i], links[i]])
                # Planned: fetch and clean the full article body.
                # linksource = requests.get(links[i], headers=HEADERS)
                # linktext = linksource.text
                # newscontent = re.findall('<!-- Content Body START -->((.|\n)*)<!-- Content Body END -->', linktext)
                # cleancontent = re.findall('<p itemprop="articleBody">(.*?)</p>', str(newscontent))
                # cleantext = clean_html(cleancontent)
                # t = 0
                # while t < 1:
                #     print(cleantext[5])
                #     t += 1
    except Exception as e:
        print(e)
    news_list = remove_dup_articles(feed_headers)
    write_log_to_csv(news_list, log_filename)

def get_news_headers(url):
    """Fetch one RSS feed and return its titles, publish dates, and links."""
    pr_news_response = requests.get(url, headers=HEADERS)
    pr_news_text = pr_news_response.text
    titles = re.findall('<title>(.*?)</title>', pr_news_text)
    links = re.findall('<link>(.*?)</link>', pr_news_text)
    pubdates = re.findall('<pubDate>(.*?)</pubDate>', pr_news_text)
    # descriptions = re.findall('<description>(.*?)</description>', pr_news_text)
    return titles, pubdates, links
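
# Regex parsing works on PR Newswire's flat feeds but breaks on CDATA or
# namespaced tags. A more robust sketch using the stdlib XML parser; the
# function name is illustrative and not part of the original script.
def get_news_headers_xml(url):
    """Parse the same feed with xml.etree.ElementTree instead of regex."""
    import xml.etree.ElementTree as ET
    root = ET.fromstring(requests.get(url, headers=HEADERS).content)
    titles, pubdates, links = [], [], []
    for item in root.iter('item'):  # one <item> element per article
        titles.append(item.findtext('title', default=''))
        pubdates.append(item.findtext('pubDate', default=''))
        links.append(item.findtext('link', default=''))
    return titles, pubdates, links
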
def remove_dup_articles(rows):
    """Drop rows that appear in more than one feed."""
    df = pd.DataFrame(rows)
    df = df.drop_duplicates()
    new_list = df.values.tolist()
    return new_list
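
# The pandas round trip above is one way to dedupe. A dependency-free
# alternative that preserves feed order; illustrative, not the original
# approach.
def remove_dup_articles_no_pandas(rows):
    """Drop duplicate rows without pandas, keeping first occurrences."""
    seen = set()
    unique = []
    for row in rows:
        key = tuple(row)  # lists are unhashable, so hash a tuple copy
        if key not in seen:
            seen.add(key)
            unique.append(row)
    return unique
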
# def get_article(link):

def clean_html(source):
    """Strip HTML tags from a string of markup."""
    cleaner = re.compile('<.*?>')
    cleantext = re.sub(cleaner, '', source)
    return cleantext
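
# A sketch of the get_article stub above, lifted from the commented-out
# block in main(). The PR Newswire markup markers come from that block;
# whether they still match the live site is an assumption.
def get_article(link):
    """Fetch one article page and return its tag-free body paragraphs."""
    page = requests.get(link, headers=HEADERS).text
    body = re.findall(
        '<!-- Content Body START -->((.|\n)*)<!-- Content Body END -->', page)
    paragraphs = re.findall('<p itemprop="articleBody">(.*?)</p>', str(body))
    return [clean_html(p) for p in paragraphs]
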
def create_log_csv():
    """Create a timestamped log file with a header row; return its name."""
    filename = 'news_log_%s.csv' % datetime.now().strftime('%Y-%m-%d_%H%M')
    with open(filename, 'w', newline='') as out:
        # The with block closes the file; no explicit close() needed.
        csvwriter = csv.writer(out, delimiter=',')
        csvwriter.writerow(['Title', 'Publish Date', 'Link'])
    return filename

def write_log_to_csv(news, filename):
    """Append deduped article rows to the log file created above."""
    # Reuse the filename from create_log_csv() so the header and the rows
    # land in the same file even if the minute rolls over between calls.
    with open(filename, 'a', newline='') as out:
        csvwriter = csv.writer(out, delimiter=',')
        for item in news:
            csvwriter.writerow(item)

if __name__ == '__main__':
    main()