diff --git a/google/modules/standard_search.py b/google/modules/standard_search.py index be8d63e..4cf739c 100644 --- a/google/modules/standard_search.py +++ b/google/modules/standard_search.py @@ -8,7 +8,7 @@ from .utils import _get_search_url, get_html from bs4 import BeautifulSoup import urllib.parse -from urllib.parse import unquote +from urllib.parse import unquote, parse_qs, urlparse from unidecode import unidecode from re import match @@ -51,12 +51,13 @@ def _limit_str_size(self, str_element, size_limit): # PUBLIC -def search(query, pages=1, lang='en', ncr=False, void=True): +def search(query, pages=1, lang='en', area='com', ncr=False, void=True): """Returns a list of GoogleResult. Args: query: String to search in google. pages: Number of pages where results must be taken. + area : Area of google homepages. TODO: add support to get the google results. Returns: @@ -64,7 +65,7 @@ def search(query, pages=1, lang='en', ncr=False, void=True): results = [] for i in range(pages): - url = _get_search_url(query, i, lang=lang, ncr=ncr) + url = _get_search_url(query, i, lang=lang, area=area, ncr=ncr) html = get_html(url) if html: @@ -107,20 +108,61 @@ def _get_name(li): return None +def _filter_link(link): + '''Filter links found in the Google result pages HTML code. + Returns None if the link doesn't yield a valid result. + ''' + try: + # Valid results are absolute URLs not pointing to a Google domain + # like images.google.com or googleusercontent.com + o = urlparse(link, 'http') + # link type-1 + # >>> "https://www.gitbook.com/book/ljalphabeta/python-" + if o.netloc and 'google' not in o.netloc: + return link + # link type-2 + # >>> "http://www.google.com/url?url=http://python.jobbole.com/84108/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggUMAA&usg=AFQjCNHPws5Buru5Z71wooRLHT6mpvnZlA" + if o.netloc and o.path.startswith('/url'): + try: + link = parse_qs(o.query)['url'][0] + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + except KeyError: + pass + # Decode hidden URLs. + if link.startswith('/url?'): + try: + # link type-3 + # >>> "/url?q=http://python.jobbole.com/84108/&sa=U&ved=0ahUKEwjFw6Txg4_UAhVI5IMKHfqVAykQFggUMAA&usg=AFQjCNFOTLpmpfqctpIn0sAfaj5U5gAU9A" + link = parse_qs(o.query)['q'][0] + # Valid results are absolute URLs not pointing to a Google domain + # like images.google.com or googleusercontent.com + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + except KeyError: + # link type-4 + # >>> "/url?url=https://machine-learning-python.kspax.io/&rct=j&frm=1&q=&esrc=s&sa=U&ved=0ahUKEwj3quDH-Y7UAhWG6oMKHdQ-BQMQFggfMAI&usg=AFQjCNEfkUI0RP_RlwD3eI22rSfqbYM_nA" + link = parse_qs(o.query)['url'][0] + o = urlparse(link, 'http') + if o.netloc and 'google' not in o.netloc: + return link + + # Otherwise, or on error, return None. + except Exception: + pass + return None + + def _get_link(li): """Return external link from a search.""" try: a = li.find("a") link = a["href"] - except: + except Exception: return None - - if link.startswith("/url?"): - m = match('/url\?(url|q)=(.+?)&', link) - if m and len(m.groups()) == 2: - return unquote(m.group(2)) - - return None + return _filter_link(link) def _get_google_link(li): @@ -128,7 +170,7 @@ def _get_google_link(li): try: a = li.find("a") link = a["href"] - except: + except Exception: return None if link.startswith("/url?") or link.startswith("/search?"): diff --git a/google/modules/utils.py b/google/modules/utils.py index 1e523fa..03dbd0a 100644 --- a/google/modules/utils.py +++ b/google/modules/utils.py @@ -8,13 +8,19 @@ from past.utils import old_div import time from selenium import webdriver -import urllib.request, urllib.error, urllib.parse +import urllib.request +import urllib.error +import urllib.parse from functools import wraps # import requests from urllib.parse import urlencode from fake_useragent import UserAgent import sys +class AreaError(KeyError): + pass + + def measure_time(fn): def decorator(*args, **kwargs): @@ -34,7 +40,7 @@ def normalize_query(query): return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") -def _get_search_url(query, page=0, per_page=10, lang='en', ncr=False): +def _get_search_url(query, page=0, per_page=10, lang='en', area='com', ncr=False): # note: num per page might not be supported by google anymore (because of # google instant) @@ -48,6 +54,7 @@ def _get_search_url(query, page=0, per_page=10, lang='en', ncr=False): params['gws_rd'] = 'cr' # Google Web Server ReDirect: CountRy. params = urlencode(params) + url = u"https://www.google.com/search?" + params # Workaround to switch between http and https, since this way @@ -61,6 +68,339 @@ def _get_search_url(query, page=0, per_page=10, lang='en', ncr=False): # return u"http://www.google.com/search?hl=%s&q=%s&start=%i&num=%i" % # (lang, normalize_query(query), page * per_page, per_page) + if area == 'com': + url = u"http://www.google.com/search?" + elif area == 'is': + url = 'http://www.google.is/search?' + elif area == 'dk': + url = 'http://www.google.dk/search?' + elif area == 'no': + url = 'http://www.google.no/search?' + elif area == 'se': + url = 'http://www.google.se/search?' + elif area == 'fi': + url = 'http://www.google.fi/search?' + elif area == 'ee': + url = 'http://www.google.ee/search?' + elif area == 'lv': + url = 'http://www.google.lv/search?' + elif area == 'lt': + url = 'http://www.google.lt/search?' + elif area == 'ie': + url = 'http://www.google.ie/search?' + elif area == 'uk': + url = 'http://www.google.co.uk/search?' + elif area == 'gg': + url = 'http://www.google.gg/search?' + elif area == 'je': + url = 'http://www.google.je/search?' + elif area == 'im': + url = 'http://www.google.im/search?' + elif area == 'fr': + url = 'http://www.google.fr/search?' + elif area == 'nl': + url = 'http://www.google.nl/search?' + elif area == 'be': + url = 'http://www.google.be/search?' + elif area == 'lu': + url = 'http://www.google.lu/search?' + elif area == 'de': + url = 'http://www.google.de/search?' + elif area == 'at': + url = 'http://www.google.at/search?' + elif area == 'ch': + url = 'http://www.google.ch/search?' + elif area == 'li': + url = 'http://www.google.li/search?' + elif area == 'pt': + url = 'http://www.google.pt/search?' + elif area == 'es': + url = 'http://www.google.es/search?' + elif area == 'gi': + url = 'http://www.google.com.gi/search?' + elif area == 'ad': + url = 'http://www.google.ad/search?' + elif area == 'it': + url = 'http://www.google.it/search?' + elif area == 'mt': + url = 'http://www.google.com.mt/search?' + elif area == 'sm': + url = 'http://www.google.sm/search?' + elif area == 'gr': + url = 'http://www.google.gr/search?' + elif area == 'ru': + url = 'http://www.google.ru/search?' + elif area == 'by': + url = 'http://www.google.com.by/search?' + elif area == 'ua': + url = 'http://www.google.com.ua/search?' + elif area == 'pl': + url = 'http://www.google.pl/search?' + elif area == 'cz': + url = 'http://www.google.cz/search?' + elif area == 'sk': + url = 'http://www.google.sk/search?' + elif area == 'hu': + url = 'http://www.google.hu/search?' + elif area == 'si': + url = 'http://www.google.si/search?' + elif area == 'hr': + url = 'http://www.google.hr/search?' + elif area == 'ba': + url = 'http://www.google.ba/search?' + elif area == 'me': + url = 'http://www.google.me/search?' + elif area == 'rs': + url = 'http://www.google.rs/search?' + elif area == 'mk': + url = 'http://www.google.mk/search?' + elif area == 'bg': + url = 'http://www.google.bg/search?' + elif area == 'ro': + url = 'http://www.google.ro/search?' + elif area == 'md': + url = 'http://www.google.md/search?' + elif area == 'hk': + url = 'http://www.google.com.hk/search?' + elif area == 'mn': + url = 'http://www.google.mn/search?' + elif area == 'kr': + url = 'http://www.google.co.kr/search?' + elif area == 'jp': + url = 'http://www.google.co.jp/search?' + elif area == 'vn': + url = 'http://www.google.com.vn/search?' + elif area == 'la': + url = 'http://www.google.la/search?' + elif area == 'kh': + url = 'http://www.google.com.kh/search?' + elif area == 'th': + url = 'http://www.google.co.th/search?' + elif area == 'my': + url = 'http://www.google.com.my/search?' + elif area == 'sg': + url = 'http://www.google.com.sg/search?' + elif area == 'bn': + url = 'http://www.google.com.bn/search?' + elif area == 'ph': + url = 'http://www.google.com.ph/search?' + elif area == 'id': + url = 'http://www.google.co.id/search?' + elif area == 'tp': + url = 'http://www.google.tp/search?' + elif area == 'kz': + url = 'http://www.google.kz/search?' + elif area == 'kg': + url = 'http://www.google.kg/search?' + elif area == 'tj': + url = 'http://www.google.com.tj/search?' + elif area == 'uz': + url = 'http://www.google.co.uz/search?' + elif area == 'tm': + url = 'http://www.google.tm/search?' + elif area == 'af': + url = 'http://www.google.com.af/search?' + elif area == 'pk': + url = 'http://www.google.com.pk/search?' + elif area == 'np': + url = 'http://www.google.com.np/search?' + elif area == 'in': + url = 'http://www.google.co.in/search?' + elif area == 'bd': + url = 'http://www.google.com.bd/search?' + elif area == 'lk': + url = 'http://www.google.lk/search?' + elif area == 'mv': + url = 'http://www.google.mv/search?' + elif area == 'kw': + url = 'http://www.google.com.kw/search?' + elif area == 'sa': + url = 'http://www.google.com.sa/search?' + elif area == 'bh': + url = 'http://www.google.com.bh/search?' + elif area == 'ae': + url = 'http://www.google.ae/search?' + elif area == 'om': + url = 'http://www.google.com.om/search?' + elif area == 'jo': + url = 'http://www.google.jo/search?' + elif area == 'il': + url = 'http://www.google.co.il/search?' + elif area == 'lb': + url = 'http://www.google.com.lb/search?' + elif area == 'tr': + url = 'http://www.google.com.tr/search?' + elif area == 'az': + url = 'http://www.google.az/search?' + elif area == 'am': + url = 'http://www.google.am/search?' + elif area == 'ls': + url = 'http://www.google.co.ls/search?' + elif area == 'eg': + url = 'http://www.google.com.eg/search?' + elif area == 'ly': + url = 'http://www.google.com.ly/search?' + elif area == 'dz': + url = 'http://www.google.dz/search?' + elif area == 'ma': + url = 'http://www.google.co.ma/search?' + elif area == 'sn': + url = 'http://www.google.sn/search?' + elif area == 'gm': + url = 'http://www.google.gm/search?' + elif area == 'ml': + url = 'http://www.google.ml/search?' + elif area == 'bf': + url = 'http://www.google.bf/search?' + elif area == 'sl': + url = 'http://www.google.com.sl/search?' + elif area == 'ci': + url = 'http://www.google.ci/search?' + elif area == 'gh': + url = 'http://www.google.com.gh/search?' + elif area == 'tg': + url = 'http://www.google.tg/search?' + elif area == 'bj': + url = 'http://www.google.bj/search?' + elif area == 'ne': + url = 'http://www.google.ne/search?' + elif area == 'ng': + url = 'http://www.google.com.ng/search?' + elif area == 'sh': + url = 'http://www.google.sh/search?' + elif area == 'cm': + url = 'http://www.google.cm/search?' + elif area == 'td': + url = 'http://www.google.td/search?' + elif area == 'cf': + url = 'http://www.google.cf/search?' + elif area == 'ga': + url = 'http://www.google.ga/search?' + elif area == 'cg': + url = 'http://www.google.cg/search?' + elif area == 'cd': + url = 'http://www.google.cd/search?' + elif area == 'ao': + url = 'http://www.google.it.ao/search?' + elif area == 'et': + url = 'http://www.google.com.et/search?' + elif area == 'dj': + url = 'http://www.google.dj/search?' + elif area == 'ke': + url = 'http://www.google.co.ke/search?' + elif area == 'ug': + url = 'http://www.google.co.ug/search?' + elif area == 'tz': + url = 'http://www.google.co.tz/search?' + elif area == 'rw': + url = 'http://www.google.rw/search?' + elif area == 'bi': + url = 'http://www.google.bi/search?' + elif area == 'mw': + url = 'http://www.google.mw/search?' + elif area == 'mz': + url = 'http://www.google.co.mz/search?' + elif area == 'mg': + url = 'http://www.google.mg/search?' + elif area == 'sc': + url = 'http://www.google.sc/search?' + elif area == 'mu': + url = 'http://www.google.mu/search?' + elif area == 'zm': + url = 'http://www.google.co.zm/search?' + elif area == 'zw': + url = 'http://www.google.co.zw/search?' + elif area == 'bw': + url = 'http://www.google.co.bw/search?' + elif area == 'na': + url = 'http://www.google.com.na/search?' + elif area == 'za': + url = 'http://www.google.co.za/search?' + elif area == 'au': + url = 'http://www.google.com.au/search?' + elif area == 'nf': + url = 'http://www.google.com.nf/search?' + elif area == 'nz': + url = 'http://www.google.co.nz/search?' + elif area == 'sb': + url = 'http://www.google.com.sb/search?' + elif area == 'fj': + url = 'http://www.google.com.fj/search?' + elif area == 'fm': + url = 'http://www.google.fm/search?' + elif area == 'ki': + url = 'http://www.google.ki/search?' + elif area == 'nr': + url = 'http://www.google.nr/search?' + elif area == 'tk': + url = 'http://www.google.tk/search?' + elif area == 'ws': + url = 'http://www.google.ws/search?' + elif area == 'as': + url = 'http://www.google.as/search?' + elif area == 'to': + url = 'http://www.google.to/search?' + elif area == 'nu': + url = 'http://www.google.nu/search?' + elif area == 'ck': + url = 'http://www.google.co.ck/search?' + elif area == 'do': + url = 'http://www.google.com.do/search?' + elif area == 'tt': + url = 'http://www.google.tt/search?' + elif area == 'co': + url = 'http://www.google.com.co/search?' + elif area == 'ec': + url = 'http://www.google.com.ec/search?' + elif area == 've': + url = 'http://www.google.co.ve/search?' + elif area == 'gy': + url = 'http://www.google.gy/search?' + elif area == 'pe': + url = 'http://www.google.com.pe/search?' + elif area == 'bo': + url = 'http://www.google.com.bo/search?' + elif area == 'py': + url = 'http://www.google.com.py/search?' + elif area == 'br': + url = 'http://www.google.com.br/search?' + elif area == 'uy': + url = 'http://www.google.com.uy/search?' + elif area == 'ar': + url = 'http://www.google.com.ar/search?' + elif area == 'cl': + url = 'http://www.google.cl/search?' + elif area == 'gl': + url = 'http://www.google.gl/search?' + elif area == 'ca': + url = 'http://www.google.ca/search?' + elif area == 'mx': + url = 'http://www.google.com.mx/search?' + elif area == 'gt': + url = 'http://www.google.com.gt/search?' + elif area == 'bz': + url = 'http://www.google.com.bz/search?' + elif area == 'sv': + url = 'http://www.google.com.sv/search?' + elif area == 'hn': + url = 'http://www.google.hn/search?' + elif area == 'ni': + url = 'http://www.google.com.ni/search?' + elif area == 'cr': + url = 'http://www.google.co.cr/search?' + elif area == 'pa': + url = 'http://www.google.com.pa/search?' + elif area == 'bs': + url = 'http://www.google.bs/search?' + elif area == 'cu': + url = 'http://www.google.com.cu/search?' + elif area == 'jm': + url = 'http://www.google.com.jm/search?' + elif area == 'ht': + url = 'http://www.google.ht/search?' + else: + raise AreaError('invalid name, no area found') + url += params return url @@ -77,7 +417,7 @@ def get_html(url): print("Error accessing:", url) print(e) if e.code == 503 and 'CaptchaRedirect' in e.read(): - print("Google is requiring a Captcha. " \ + print("Google is requiring a Captcha. " "For more information see: 'https://support.google.com/websearch/answer/86640'") if e.code == 503: sys.exit("503 Error: service is currently unavailable. Program will exit.") @@ -168,10 +508,12 @@ def inner(*args, **kwargs): maxs = dt if dt > maxs else maxs sums += dt if verbose: - print('\t%r ran in %2.9f sec on run %s' % (func.__name__, dt, i)) + print('\t%r ran in %2.9f sec on run %s' % + (func.__name__, dt, i)) print('%r min run time was %2.9f sec' % (func.__name__, mins)) print('%r max run time was %2.9f sec' % (func.__name__, maxs)) - print('%r avg run time was %2.9f sec in %s runs' % (func.__name__, old_div(sums, loops), loops)) + print('%r avg run time was %2.9f sec in %s runs' % + (func.__name__, old_div(sums, loops), loops)) print('==== end ====') return result @@ -188,7 +530,7 @@ def wrap(*args, **kw): ts = time.time() result = f(*args, **kw) te = time.time() - print('func:%r args:[%r, %r] took: %2.4f sec' % \ - (f.__name__, args, kw, te - ts)) + print('func:%r args:[%r, %r] took: %2.4f sec' % + (f.__name__, args, kw, te - ts)) return result return wrap diff --git a/google/tests/test_utils.py b/google/tests/test_utils.py index f8bafda..bc51172 100644 --- a/google/tests/test_utils.py +++ b/google/tests/test_utils.py @@ -15,8 +15,8 @@ class UtilsTestCase(unittest.TestCase): """Tests for helper methods.""" @unittest.skip('Don\t know why but it not work. Skipping for now') def test_get_search_url(self): - url = _get_search_url("apple", 0, 10, "en") - exp_url = "http://www.google.com/search?q=apple&start=0&num=10&hl=en" + url = _get_search_url("apple", 0, 10, "en", "jp") + exp_url = "http://www.google.co.jp/search?q=apple&start=0&num=10&hl=en" self.assertEqual(url, exp_url)