# main.py
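# Scrape every RSS feed listed in blogs.txt and keep a versioned history of
# each post: posts are keyed by their feed GUID, and each distinct combination
# of title, content, and published date is stored under a SHA-256 hash, so an
# edited post shows up as a new version in logs/<host>.json.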
import feedparser
import json
import hashlib
import re
import datetime
import time
from email import utils
import pprint

def main():
    print("Hello {}!".format("World"))
    # Feeds used during development:
    # blogs.append("https://sec6441.tumblr.com/rss")
    # blogs.append("https://kxy-6441.tumblr.com/rss")
    # blogs.append("https://acsm6441.tumblr.com/rss")
    # blogs.append("https://blog.securityengineering.online/feed/")
    blogs = []
    with open("blogs.txt", "r") as listOfBlogs:
        for blog in listOfBlogs:
            blog = blog.strip()  # Drop the trailing newline so the URL parses cleanly.
            if blog:
                blogs.append(blog)
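    # blogs.txt is assumed to hold one feed URL per line, e.g.:
    #   https://sec6441.tumblr.com/rss
    #   https://blog.securityengineering.online/feed/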
    # Load the combined log of every blog scraped so far, if one exists.
    allBlogs = {}
    try:
        with open("logs/allBlogs.json", "r") as fp:
            allBlogs = json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Couldn't find existing file. New file will be created.")
    for blogURL in blogs:
        print(" ----", blogURL, "---- ")
        canonicalURL = getCanonicalURL(blogURL)
        filename = "logs/" + canonicalURL + ".json"
        # Load this blog's previous history, or start a fresh record.
        blogHistory = {"data": {}, "logs": []}
        try:
            with open(filename, "r") as fp:
                blogHistory = json.load(fp)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Couldn't find existing log for", canonicalURL)
        # Grab the blog's RSS feed and iterate through its entries.
        try:
            blogFeed = feedparser.parse(blogURL)
            for post in blogFeed["entries"]:
                currentTime = getCurrentTimeString()
                # Debug: dump the raw feed entry.
                pp = pprint.PrettyPrinter(indent=4)
                pp.pprint(post)
                # Build a JSON-serialisable record for this post.
                blogPost = {}
                blogPost["GUID"] = post["id"]
                blogPost["title"] = post["title_detail"]["value"]
                if "summary" in post:
                    blogPost["content"] = post["summary"]
                else:
                    blogPost["content"] = ""
                blogPost["published"] = post["published"]
                blogPost["scrapeDate"] = currentTime
                try:
                    blogPost["previousScrapeDate"] = blogHistory["logs"][-1]
                except IndexError:
                    print("No previous scrape date recorded.")
                    blogPost["previousScrapeDate"] = ""
                # print(blogPost)
                # Unique identifiers for this post: the feed GUID keys the post,
                # and a SHA-256 hash of its content identifies each version.
                guid = post["id"]
                uniqueIdentifier = blogPost["title"] + blogPost["content"] + blogPost["published"]
                versionIDHash = hashlib.sha256(uniqueIdentifier.encode("utf-8")).hexdigest()
                # Handling versioning of posts: grab previous versions of this
                # post, if any exist.
                blogPostVersions = blogHistory["data"].get(guid, {})
                print("Unique identifier is", uniqueIdentifier)
                print(versionIDHash)
                # Record this version only if its hash hasn't been seen before,
                # so re-scraping an unchanged post adds no duplicate.
                if versionIDHash not in blogPostVersions:
                    blogPostVersions[versionIDHash] = blogPost  # New version of this post.
                blogHistory["data"][guid] = blogPostVersions  # All versions, keyed by GUID.
        except Exception as err:
            print("Couldn't fetch feed, or an error occurred. Skipping:", canonicalURL, err)
        blogHistory["logs"].append(getCurrentTimeString())
        # Write all of the blog's history out to its own file.
        with open(filename, "w") as fp:
            json.dump(blogHistory, fp, sort_keys=False, indent=4)
        # Add this blog to the collection of all blogs and their data.
        allBlogs[canonicalURL] = blogHistory
    # Write all blogs, with their posts and history, to a single file.
    with open("logs/allBlogs.json", "w") as fp:
        json.dump(allBlogs, fp, sort_keys=True, indent=4)

def getCurrentTimeString():
    """Return the current local time as an RFC 2822 date string."""
    nowdt = datetime.datetime.now()
    nowtuple = nowdt.timetuple()
    nowtimestamp = time.mktime(nowtuple)
    return utils.formatdate(nowtimestamp)
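# For example, getCurrentTimeString() returns a string such as
# "Sat, 01 Jan 2022 12:00:00 -0000" (format per email.utils.formatdate).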

def getCanonicalURL(URL):
    """Extract the host part of a feed URL, used to name its log file."""
    result = re.search(r"(?<=//).*?(?=/)", URL)
    # print("Canonical name:", result.group())
    return result.group()
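# For example, getCanonicalURL("https://sec6441.tumblr.com/rss")
# returns "sec6441.tumblr.com".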

def readRSS():
    """Quick manual test: parse a single feed and print each entry's title."""
    try:
        d = feedparser.parse("https://sec6441.tumblr.com/rss")
        for entry in d["entries"]:
            print(entry["title"])
            # print(entry)
        # print(d)
    except Exception as err:
        print("whoops:", err)

if __name__ == "__main__":
    main()
    # readRSS()
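
# A per-blog log file in logs/ ends up shaped roughly like this
# (illustrative values, not real data):
# {
#     "data": {
#         "<post GUID>": {
#             "<sha256 of title + content + published>": {
#                 "GUID": "...",
#                 "title": "...",
#                 "content": "...",
#                 "published": "...",
#                 "scrapeDate": "...",
#                 "previousScrapeDate": "..."
#             }
#         }
#     },
#     "logs": ["<RFC 2822 timestamp of each scrape>"]
# }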