forked from chihacknight/civic-json-worker
-
Notifications
You must be signed in to change notification settings - Fork 53
/
feeds.py
100 lines (80 loc) · 3.09 KB
/
feeds.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4
"""
Tools to extract feed links, test if they are valid and parse them
with feedparser, returning content or a proper error.
"""
import urllib2
from httplib import BadStatusLine
import feedparser
from BeautifulSoup import BeautifulSoup
from socket import error as SocketError
# Attribute combinations that mark a <link> tag as pointing to a feed.
# Each entry is a tuple of (attribute, value) pairs; extract_feed_links()
# turns an entry into a dict and uses it as the attribute filter when it
# searches the <head> of a page.
FEED_LINKS_ATTRIBUTES = tuple(
    # Feed-specific MIME types identify a feed link on their own.
    (('type', mime_type),)
    for mime_type in (
        'application/rss+xml',
        'application/atom+xml',
        'application/rss',
        'application/atom',
        'application/rdf+xml',
        'application/rdf',
        'text/rss+xml',
        'text/atom+xml',
        'text/rss',
        'text/atom',
        'text/rdf+xml',
        'text/rdf',
    )
) + (
    # Generic XML MIME types are listed together with rel="alternate".
    (('rel', 'alternate'), ('type', 'text/xml')),
    (('rel', 'alternate'), ('type', 'application/xml')),
)
def extract_feed_links(html, feed_links_attributes=FEED_LINKS_ATTRIBUTES):
    """
    Yield the href of each potential feed link found in an HTML page.

    ``html`` is parsed with BeautifulSoup and only the <head> element is
    searched.  Every entry of ``feed_links_attributes`` is a sequence of
    (attribute, value) pairs; entries are tried in order, each one being
    converted to a dict and passed to ``findAll`` as the attribute filter
    for <link> tags.  The non-empty ``href`` of every tag found is yielded
    as a unicode string, exactly as written in the page (it may be
    relative).  Nothing is yielded when the page has no <head>.

    >>> url = urllib2.urlopen('http://www.codinghorror.com/blog/')
    >>> links = extract_feed_links(url.read(1000000))
    >>> tuple(links)
    (u'http://feeds.feedburner.com/codinghorror/',)
    """
    document = BeautifulSoup(html)
    head_tag = document.find('head')
    if not head_tag:
        # No <head>: there is nowhere to look for <link> tags.
        return

    for attr_pairs in feed_links_attributes:
        wanted = dict(attr_pairs)
        for link_tag in head_tag.findAll('link', wanted):
            target = dict(link_tag.attrs).get('href', '')
            if not target:
                # A <link> without a usable href cannot be followed.
                continue
            yield unicode(target)
def get_first_working_feed_link(url):
    """
    Return the first URL that feedparser accepts as a feed for ``url``.

    At most 1,000,000 bytes are downloaded from ``url`` and handed to
    feedparser.  If the result is not flagged as "bozo", ``url`` is itself
    a feed and is returned.  If it doesn't parse, the downloaded content is
    treated as HTML: each link produced by extract_feed_links() is
    resolved against ``url``, parsed with feedparser, and the first one
    that is not flagged as "bozo" is returned.

    Returns a unicode URL, or None when the download fails with a socket
    error or a bad HTTP status line, or when no candidate link works.
    Any other exception raised while downloading is not caught here and
    propagates to the caller.

    >>> get_first_working_feed_link('http://www.codinghorror.com/blog/')
    u'http://feeds.feedburner.com/codinghorror/'
    >>> get_first_working_feed_link('http://feeds.feedburner.com/codinghorror/')
    u'http://feeds.feedburner.com/codinghorror/'
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read(1000000)
        finally:
            # Always release the connection, even when the read fails.
            response.close()
    except (SocketError, BadStatusLine):
        return None

    # If the url is a feed itself, return it.  A result without a "bozo"
    # key is treated as a failed parse (default of 1).
    if not feedparser.parse(html).get("bozo", 1):
        return unicode(url)

    # Otherwise try every potential feed link from the page, one by one.
    for link in extract_feed_links(html):
        # urljoin leaves absolute links untouched and resolves every
        # relative form against the page URL: "/feed", "feed.xml",
        # "../feed" and protocol-relative "//host/feed".  Gluing the link
        # onto "scheme://host" only worked for the first of those.
        link = urllib2.urlparse.urljoin(url, link)
        if not feedparser.parse(link).get("bozo", 1):
            return link
    return None
if __name__ == "__main__":
    # Executed as a script: run the docstring examples of this module
    # (they fetch live URLs).
    from doctest import testmod
    testmod()