forked from apg/tech-blogs-you-should-read
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiscover.py
92 lines (76 loc) · 2.28 KB
/
discover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
"""Build an OPML file from a list of web pages.

Usage: discover.py input out.opml

``input`` is read line by line as tab-separated ``name`` / ``url`` pairs.
Each page is fetched, its ``<link rel="alternate">`` element is used to
locate a feed, the feed is parsed with feedparser to obtain its title, and
the collected feeds are written to ``out.opml`` encoded as UTF-8.
Progress and error messages go to stderr.

Written for Python 2 (print statement, ``urllib.urlopen``, ``urlparse``).
"""
__author__ = 'Andrew Gwozdziewycz'
__email__ = 'web@apgwoz.com'
__license__ = 'GPLv3'
import sys
import codecs
from urllib import urlopen
from urlparse import urljoin
from pyquery import PyQuery as P
import feedparser
# OPML 1.1 document envelope.  The single %s placeholder receives the joined
# <outline> elements when opml() formats this template.
# NOTE: the title and the dateCreated/dateModified values are fixed literals;
# they are not derived from the input or from the time of the run.
OPMLENV = """<opml version="1.1">
<head><title>Quora Feeds</title>
<dateCreated>2013-10-10 06:22:36.131417</dateCreated>
<dateModified>2013-10-10 06:22:36.131417</dateModified>
</head>
<body>
%s
</body>
</opml>
"""
# Template for one <outline> element.  It is filled by %-formatting with a
# mapping that provides the keys 'url' (page address), 'title' (used for both
# the text and title attributes) and 'alt' (feed address).
OUTLINE = """
<outline htmlUrl="%(url)s"
text="%(title)s"
title="%(title)s"
type="rss"
version="RSS"
xmlUrl="%(alt)s" />
"""
def openpage(url):
    """Fetch *url* and return its content wrapped in a PyQuery document.

    The response object is closed explicitly once the body has been read,
    so the connection is released even when reading fails.  Any exception
    raised while opening or reading the URL propagates to the caller.
    """
    response = urlopen(url)
    try:
        content = response.read()
    finally:
        # Do not rely on garbage collection to release the socket.
        response.close()
    return P(content)
def findalt(d):
    """Return the href of the page's feed link, or None if there is none.

    *d* is a parsed document offering ``find(selector)``; the matched nodes
    must expose an ``attrib`` mapping.

    ``rel="alternate"`` is also used for non-feed alternates (translations,
    mobile pages, ...), so a link whose ``type`` attribute names an RSS,
    Atom or RDF feed is preferred.  When no link is typed that way, the
    first alternate link that carries an href is returned.
    """
    fallback = None
    for node in d.find('link[rel="alternate"]'):
        href = node.attrib.get('href')
        if not href:
            # A link without a target cannot be followed.
            continue
        mime = (node.attrib.get('type') or '').lower()
        if 'rss' in mime or 'atom' in mime or 'rdf' in mime:
            return href
        if fallback is None:
            fallback = href
    return fallback
def readalt(alt):
    """Parse the feed at *alt* with feedparser and return the parse result."""
    return feedparser.parse(alt)
def readall(fname):
    """Collect feed information for every page listed in *fname*.

    Each non-blank line of *fname* must hold a name and a page URL separated
    by a single tab.  For every entry the page is fetched with ``openpage``,
    its feed link is looked up with ``findalt`` (relative links are resolved
    against the page URL) and the feed is parsed with ``readalt`` to obtain
    its title.

    Progress and failures are reported on stderr.  An entry that is
    malformed, cannot be fetched, has no feed link or whose feed cannot be
    read is skipped; it never aborts the run.

    Returns a list of dicts with the keys ``'alt'`` (feed URL), ``'url'``
    (page URL) and ``'title'`` (feed title), in input order.
    """
    alts = []
    with open(fname) as f:
        for line in f:
            entry = line.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue

            fields = entry.split('\t')
            if len(fields) != 2:
                # Report and skip, so one bad line does not abort the run
                # with an unpacking error.
                print >>sys.stderr, "Couldn't parse line %r -- expected name<TAB>url" % entry
                continue
            name, url = fields

            try:
                d = openpage(url)
            except Exception as x:
                print >>sys.stderr, "Couldn't open %s at %s -- %s" % (name, url, x)
                continue

            alt = findalt(d)
            if not alt:
                print >>sys.stderr, "Couldn't find an alt for '%s'" % url
                continue

            if not alt.startswith('http'):
                # Relative feed links are resolved against the page URL.
                alt = urljoin(url, alt)
            print >>sys.stderr, "ALT for '%s' at: %s" % (url, alt)

            try:
                title = readalt(alt)['feed']['title']
            except Exception as e:
                # Covers both fetch/parse failures and feeds without a title.
                print >>sys.stderr, "Couldn't read alt for '%s': %s" % (alt, e)
                continue

            alts.append({
                'alt': alt,
                'url': url,
                'title': title,
            })
    return alts
def opml(alts):
    """Render *alts* as an OPML document and return it as a string.

    *alts* is an iterable of mappings providing the string values that the
    ``OUTLINE`` template expects ('url', 'title' and 'alt').  Every value is
    XML-escaped before it is interpolated, because the values end up inside
    double-quoted attributes: a feed URL containing ``&`` or a title
    containing ``"`` or ``<`` would otherwise make the document invalid.
    """
    # Local import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # escape() handles &, < and >; the double quote must be added because
    # the template wraps every value in double quotes.
    quote = {'"': '&quot;'}

    outlines = []
    for entry in alts:
        safe = dict((key, escape(value, quote)) for key, value in entry.items())
        outlines.append(OUTLINE % safe)
    return OPMLENV % '\n'.join(outlines)
if __name__ == '__main__':
    # Command-line entry point: discover.py <input list> <output OPML file>.
    if len(sys.argv) != 3:
        print >>sys.stderr, "usage: %s input out.opml" % sys.argv[0]
        # Exit with a non-zero status so callers can detect the usage error.
        raise SystemExit(2)

    alts = readall(sys.argv[1])
    # The OPML document is written UTF-8 encoded.
    with codecs.open(sys.argv[2], 'w', 'utf8') as f:
        f.write(opml(alts))