tldextract.py
# -*- coding: utf-8 -*-
"""`tldextract` accurately separates the gTLD or ccTLD (generic or country code
top-level domain) from the registered domain and subdomains of a URL.

    >>> import tldextract
    >>> tldextract.extract('http://forums.news.cnn.com/')
    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
    >>> tldextract.extract('http://forums.bbc.co.uk/') # United Kingdom
    ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
    >>> tldextract.extract('http://www.worldbank.org.kg/') # Kyrgyzstan
    ExtractResult(subdomain='www', domain='worldbank', tld='org.kg')

`ExtractResult` is a namedtuple, so it's simple to access the parts you want.

    >>> ext = tldextract.extract('http://forums.bbc.co.uk')
    >>> ext.domain
    'bbc'
    >>> '.'.join(ext[:2]) # rejoin subdomain and domain
    'forums.bbc'
"""
from __future__ import with_statement

try:
    import cPickle as pickle
except ImportError:
    import pickle

import logging
from operator import itemgetter
import os
import re
import socket
import sys
import urllib2
import urlparse

import pkg_resources

LOG = logging.getLogger(__file__)

SCHEME_RE = re.compile(r'^([' + urlparse.scheme_chars + ']+:)?//')
IP_RE = re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$')
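
# A quick illustration of the two regexes above (hypothetical inputs, shown
# as a sketch rather than executed doctests): SCHEME_RE strips a leading
# scheme or scheme-relative prefix, and IP_RE matches dotted-quad IPv4 hosts.
#
#   SCHEME_RE.sub('', 'http://example.com/path')  # -> 'example.com/path'
#   SCHEME_RE.sub('', '//example.com/path')       # -> 'example.com/path'
#   bool(IP_RE.match('127.0.0.1'))                # -> True
#   bool(IP_RE.match('256.1.1.1'))                # -> False (octet out of range)
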
# The expanded form of collections.namedtuple('ExtractResult',
# ['subdomain', 'domain', 'tld']).
class ExtractResult(tuple):
    'ExtractResult(subdomain, domain, tld)'

    __slots__ = ()

    _fields = ('subdomain', 'domain', 'tld')

    def __new__(_cls, subdomain, domain, tld):
        'Create new instance of ExtractResult(subdomain, domain, tld)'
        return tuple.__new__(_cls, (subdomain, domain, tld))

    @classmethod
    def _make(cls, iterable, new=tuple.__new__, len=len):
        'Make a new ExtractResult object from a sequence or iterable'
        result = new(cls, iterable)
        if len(result) != 3:
            raise TypeError('Expected 3 arguments, got %d' % len(result))
        return result

    def __repr__(self):
        'Return a nicely formatted representation string'
        return 'ExtractResult(subdomain=%r, domain=%r, tld=%r)' % self

    def _asdict(self):
        'Return a new dict which maps field names to their values'
        return dict(zip(self._fields, self))

    def _replace(_self, **kwds):
        'Return a new ExtractResult object replacing specified fields with new values'
        result = _self._make(map(kwds.pop, ('subdomain', 'domain', 'tld'), _self))
        if kwds:
            raise ValueError('Got unexpected field names: %r' % kwds.keys())
        return result

    def __getnewargs__(self):
        'Return self as a plain tuple. Used by copy and pickle.'
        return tuple(self)

    subdomain = property(itemgetter(0), doc='Alias for field number 0')
    domain = property(itemgetter(1), doc='Alias for field number 1')
    tld = property(itemgetter(2), doc='Alias for field number 2')
def extract(url, fetch=True):
    """
    Takes a string URL and splits it into its subdomain, domain, and
    gTLD/ccTLD component. Ignores scheme, username, and path components.

    If fetch is True (the default) and no cached TLD set is found, this module
    will fetch TLD sources live over HTTP on first use. Set fetch to False to
    avoid making HTTP requests. Either way, if the TLD set can't be read, the
    module will fall back to the included TLD set snapshot.

    >>> extract('http://forums.news.cnn.com/')
    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
    >>> extract('http://forums.bbc.co.uk/')
    ExtractResult(subdomain='forums', domain='bbc', tld='co.uk')
    """
    netloc = SCHEME_RE.sub("", url).partition("/")[0]
    return _extract(netloc, fetch)
def urlsplit(url, fetch=True):
    """Same as `extract` but calls urlparse.urlsplit to further 'validate' the
    input URL. This function will therefore raise the same errors as
    urlparse.urlsplit and handle some inputs differently than extract, such as
    URLs missing a scheme.

    >>> urlsplit('http://forums.news.cnn.com/')
    ExtractResult(subdomain='forums.news', domain='cnn', tld='com')
    >>> urlsplit('forums.bbc.co.uk/') # urlsplit won't see a netloc
    ExtractResult(subdomain='', domain='', tld='')
    """
    netloc = urlparse.urlsplit(url).netloc
    return _extract(netloc, fetch)
def _extract(netloc, fetch=True):
    # Strip any userinfo and port from the netloc before matching.
    netloc = netloc.split("@")[-1].partition(':')[0]
    registered_domain, tld = _get_tld_extractor(fetch).extract(netloc)
    if not tld and netloc and netloc[0].isdigit():
        try:
            socket.inet_aton(netloc)
            return ExtractResult('', netloc, '')
        except AttributeError:
            # socket.inet_aton may be missing on some platforms; fall back
            # to a regex check.
            if IP_RE.match(netloc):
                return ExtractResult('', netloc, '')
        except socket.error:
            pass

    subdomain, _, domain = registered_domain.rpartition('.')
    return ExtractResult(subdomain, domain, tld)
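
# For example (a sketch, assuming the active TLD set contains no all-numeric
# suffixes, as is true of the Public Suffix List): userinfo and port are
# stripped, and a dotted-quad host comes back whole as the domain:
#
#   extract('http://user:pass@10.0.0.1:8080/path')
#   # -> ExtractResult(subdomain='', domain='10.0.0.1', tld='')
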
TLD_EXTRACTOR = None

def _get_tld_extractor(fetch=True):
    global TLD_EXTRACTOR
    if TLD_EXTRACTOR:
        return TLD_EXTRACTOR

    moddir = os.path.dirname(__file__)
    cached_file = os.path.join(moddir, '.tld_set')
    try:
        with open(cached_file) as f:
            TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(pickle.load(f))
            return TLD_EXTRACTOR
    except IOError:
        # No cached TLD set yet; compute one below.
        pass

    tlds = frozenset()
    if fetch:
        tld_sources = (_PublicSuffixListSource,)
        tlds = frozenset(tld for tld_source in tld_sources for tld in tld_source())

    if not tlds:
        # Fetching failed or was disabled; fall back to the bundled snapshot.
        with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
            TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(pickle.load(snapshot_file))
            return TLD_EXTRACTOR

    LOG.info("computed TLDs: %s", tlds)
    if LOG.isEnabledFor(logging.DEBUG):
        import difflib
        with pkg_resources.resource_stream(__name__, '.tld_set_snapshot') as snapshot_file:
            snapshot = sorted(pickle.load(snapshot_file))
        new = sorted(tlds)
        for line in difflib.unified_diff(snapshot, new, fromfile=".tld_set_snapshot", tofile=".tld_set"):
            print >> sys.stderr, line

    try:
        with open(cached_file, 'w') as f:
            pickle.dump(tlds, f)
    except IOError, e:
        LOG.warn("unable to cache TLDs in file %s: %s", cached_file, e)

    TLD_EXTRACTOR = _PublicSuffixListTLDExtractor(tlds)
    return TLD_EXTRACTOR
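
# The computed TLD set is memoized twice: in-process via the TLD_EXTRACTOR
# global, and on disk in the '.tld_set' file next to this module. A sketch of
# forcing a fresh fetch on the next extraction (assuming the cache file
# exists and the process may remove it):
#
#   os.remove(os.path.join(os.path.dirname(__file__), '.tld_set'))
#   TLD_EXTRACTOR = None  # at module level; inside a function, declare global
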
def _fetch_page(url):
    try:
        return unicode(urllib2.urlopen(url).read(), 'utf-8')
    except urllib2.URLError, e:
        LOG.error(e)
        return u''

def _PublicSuffixListSource():
    page = _fetch_page('http://mxr.mozilla.org/mozilla/source/netwerk/dns/src/effective_tld_names.dat?raw=1')

    tld_finder = re.compile(r'^(?P<tld>[.*!]*\w[\S]*)', re.UNICODE | re.MULTILINE)
    tlds = [m.group('tld') for m in tld_finder.finditer(page)]
    return tlds
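
# The tld_finder regex keeps the first whitespace-delimited token on each line
# that starts with word characters (optionally prefixed by '.', '*', or '!'),
# which skips the '//' comment lines in the source file. A sketch with a
# made-up snippet in that format:
#
#   sample = u"// comments are ignored\ncom\n*.tokyo.jp\n!metro.tokyo.jp\n"
#   [m.group('tld') for m in tld_finder.finditer(sample)]
#   # -> [u'com', u'*.tokyo.jp', u'!metro.tokyo.jp']
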
class _PublicSuffixListTLDExtractor(object):
    def __init__(self, tlds):
        self.tlds = tlds

    def extract(self, netloc):
        spl = netloc.split('.')
        for i in range(len(spl)):
            maybe_tld = '.'.join(spl[i:])

            # An exception rule (e.g. '!metro.tokyo.jp') carves a registrable
            # name out of a wildcard rule: one more label belongs to the
            # registered domain.
            exception_tld = '!' + maybe_tld
            if exception_tld in self.tlds:
                return '.'.join(spl[:i+1]), '.'.join(spl[i+1:])

            # A wildcard rule (e.g. '*.tokyo.jp') matches any label in that
            # position; otherwise, look for an exact match.
            wildcard_tld = '*.' + '.'.join(spl[i+1:])
            if wildcard_tld in self.tlds or maybe_tld in self.tlds:
                return '.'.join(spl[:i]), maybe_tld

        return netloc, ''
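
# A worked example of the rule matching above, using a hypothetical miniature
# rule set (the three rules mirror the Public Suffix List's format):
#
#   ext = _PublicSuffixListTLDExtractor(
#       frozenset(['jp', '*.tokyo.jp', '!metro.tokyo.jp']))
#   ext.extract('foo.bar.tokyo.jp')  # wildcard:  -> ('foo', 'bar.tokyo.jp')
#   ext.extract('metro.tokyo.jp')    # exception: -> ('metro', 'tokyo.jp')
#   ext.extract('example.jp')        # exact:     -> ('example', 'jp')
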
if __name__ == "__main__":
    url = sys.argv[1]
    print ' '.join(extract(url))