-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
83 lines (72 loc) · 2.3 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests
import json
import re
import feedparser
import io
from lxml import html
from urllib.parse import urljoin
def dumpJson(siteName, siteDomain, rssLink):
    """Write one site's feed record to ``sites/<siteDomain>.json``.

    The record is a JSON object holding the site name, a fixed faculty
    label and the RSS link. The ``sites`` directory is created on demand
    so the first run does not fail with ``FileNotFoundError``.

    Args:
        siteName: Human-readable name stored under the ``university`` key.
        siteDomain: Domain used as the output file's base name.
        rssLink: Feed URL stored under the ``rss`` key.
    """
    import os  # function-scope import keeps this block self-contained

    data = {
        "university": siteName,
        # The key spelling is part of the on-disk format; do not "fix" it
        # here without migrating whatever reads these files.
        "faculity": "Computer Engineering",
        "rss": rssLink,
    }
    os.makedirs('sites', exist_ok=True)
    with open('sites/' + siteDomain + '.json', 'w', encoding='utf-8') as f:
        json.dump(data, f)
# Seconds to wait on any single HTTP request before giving up on it.
_REQUEST_TIMEOUT = 10.0


def _candidate_feed_urls(tree, base_url):
    """Return absolute URLs found in *tree* that might point at a feed.

    Candidates come from ``<link rel="alternate">`` elements whose ``type``
    mentions "rss" or "xml", followed by ``<a>`` elements whose ``href``
    mentions "xml", "rss" or "feed". Duplicates are dropped and document
    order is kept, so the result is the same on every run.
    """
    candidates = []
    for link_el in tree.xpath("//link[@rel='alternate']"):
        mime = link_el.get('type') or ''
        href = link_el.get('href')
        if href and ("rss" in mime or "xml" in mime):
            candidates.append(urljoin(base_url, href))
    for anchor in tree.xpath("//a"):
        href = anchor.get('href')
        if href and ("xml" in href or "rss" in href or "feed" in href):
            candidates.append(urljoin(base_url, href))
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would make the order (and so the caller's result[0]) vary per run.
    return list(dict.fromkeys(candidates))


def _has_feed_entries(link):
    """Return True if *link* downloads and parses as a feed with entries."""
    # Download with requests (which supports a timeout) and hand the bytes
    # to feedparser, see
    # https://stackoverflow.com/questions/9772691/feedparser-with-timeout
    try:
        resp = requests.get(link, timeout=_REQUEST_TIMEOUT)
    except Exception as e:
        # Best-effort crawl over arbitrary hrefs: any failure on one
        # candidate only disqualifies that candidate.
        print("Timeout when reading RSS %s" % link, ' e:', e)
        return False
    parsed = feedparser.parse(io.BytesIO(resp.content))
    return len(parsed.entries) > 0


def feedfinder(url, domain):
    """Find working RSS/Atom feed URLs advertised on the page at *url*.

    Based on https://gist.github.com/pleycpl/46953ff26e7da165c9f20dfbe1cd8256

    Args:
        url: Address of the page to scan; relative hrefs are resolved
            against it.
        domain: Unused; kept so existing callers keep working.

    Returns:
        A list of feed URLs that were fetched and parsed with at least one
        entry (possibly empty), or ``0`` when the page itself could not be
        downloaded or parsed. Both failure values are falsy.
    """
    from lxml import etree  # only needed for the parse-error type

    print(url)
    try:
        raw = requests.get(url, timeout=_REQUEST_TIMEOUT).content
    except Exception as e:
        print('Website doesn\'t exists: ', e)
        return 0
    if not raw:
        print('Lxml doesn\'t work')
        return 0
    try:
        tree = html.fromstring(raw)
    except etree.LxmlError as e:
        # e.g. a whitespace-only body raises ParserError("Document is empty")
        print('Lxml doesn\'t work', e)
        return 0
    return [
        link
        for link in _candidate_feed_urls(tree, url)
        if _has_feed_entries(link)
    ]
def main():
    """Scan every site listed in ``tr-sites.json`` and record found feeds.

    Each entry is read for its ``web_page``, ``domain`` and ``name`` keys.
    For every site where ``feedfinder`` returns a non-empty result, the
    first feed URL is written out through ``dumpJson``.
    """
    # JSON files are UTF-8; being explicit avoids mis-decoding non-ASCII
    # site names on platforms whose default encoding is something else.
    with open("tr-sites.json", encoding="utf-8") as SitesJsonData:
        sites = json.load(SitesJsonData)
    for site in sites:
        result = feedfinder(site['web_page'], site['domain'])
        print(result)
        if result:
            print('Rss Exists: ', result[0])
            dumpJson(site['name'], site['domain'], result[0])
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()