Skip to content

Commit

Permalink
Merge pull request #275 from mmmaisel/master
Browse files Browse the repository at this point in the history
Fix wrong Mouser scraping results.
  • Loading branch information
hildogjr authored Jun 18, 2018
2 parents 56e1504 + 74561cb commit e2db72c
Show file tree
Hide file tree
Showing 7 changed files with 95 additions and 9 deletions.
1 change: 1 addition & 0 deletions kicost/distributors/digikey/digikey.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class dist_digikey(distributor.distributor):
def __init__(self, name, scrape_retries, throttle_delay):
    '''@brief Initialise the digikey distributor and start its browser session.
    @param name `str` Key of this distributor inside `distributor_dict`.
    @param scrape_retries Forwarded unchanged to the base class constructor.
    @param throttle_delay Forwarded unchanged to the base class constructor.'''
    # Site URL registered for this distributor.
    site_url = distributor_dict[name]['site']['url']
    super(dist_digikey, self).__init__(name, site_url, scrape_retries, throttle_delay)
    # Start a new session with the method's default arguments.
    # NOTE(review): presumably self.browser is created by the base
    # constructor -- confirm in distributor.distributor.
    self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down
14 changes: 9 additions & 5 deletions kicost/distributors/fake_browser.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,10 +150,11 @@ def __init__(self, domain, logger, scrape_retries, throttle_delay):

self.scrape_retries = scrape_retries
self.logger = logger
self.ret_url = None

self.start_new_session()
self.start_new_session(False)

def start_new_session(self):
def start_new_session(self, scrape_base_url=True):
self.userAgent = get_user_agent()

# Use "requests" instead of "urllib" because "urllib" does not allow
Expand All @@ -163,11 +164,12 @@ def start_new_session(self):

# Restore configuration cookies from previous session.
for c in self.config_cookies:
print("Restore cookie: %s", c)
self.logger.log(DEBUG_OBSESSIVE, "Restore cookie: %s", c)
self.session.cookies.set(c[1], c[2], domain=c[0])

self.scrape_URL(self.domain, retry=False)
self.show_cookies()
if scrape_base_url:
self.scrape_URL(self.domain, retry=False)
self.show_cookies()

def show_cookies(self):
for x in self.session.cookies:
Expand Down Expand Up @@ -218,6 +220,8 @@ def scrape_URL(self, url, retry=True, postData=None):
" Starting new session for %s" % self.domain)
continue

# Store last accessed URL to allow check for regional redirect.
self.ret_url = resp.url
html = resp.text
break
except Exception as ex:
Expand Down
1 change: 1 addition & 0 deletions kicost/distributors/farnell/farnell.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ class dist_farnell(distributor.distributor):
def __init__(self, name, scrape_retries, throttle_delay):
    '''@brief Initialise the farnell distributor and start its browser session.
    @param name `str` Key of this distributor inside `distributor_dict`.
    @param scrape_retries Forwarded unchanged to the base class constructor.
    @param throttle_delay Forwarded unchanged to the base class constructor.'''
    # Site URL registered for this distributor.
    site_url = distributor_dict[name]['site']['url']
    super(dist_farnell, self).__init__(name, site_url, scrape_retries, throttle_delay)
    # Start a new session with the method's default arguments.
    # NOTE(review): presumably self.browser is created by the base
    # constructor -- confirm in distributor.distributor.
    self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down
85 changes: 81 additions & 4 deletions kicost/distributors/mouser/mouser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@

from urllib.parse import quote_plus as urlquote

import pycountry

class dist_mouser(distributor.distributor):
# Constructor of the Mouser distributor: forwards the distributor name, the
# site URL registered in `distributor_dict` and the scraping settings to the
# base class, then prepares the browser session.
def __init__(self, name, scrape_retries, throttle_delay):
super(dist_mouser, self).__init__(name, distributor_dict[name]['site']['url'],
scrape_retries, throttle_delay)
# Registers a 'preferences' cookie for '.mouser.com'; the 'pc_www2=USDe' part
# uses the same 'pc_<domain>=<currency>' form that
# dist_define_locale_currency() writes.
# NOTE(review): this text is a rendered diff without +/- markers; this
# add_cookie line looks like the pre-change line that the commit removes in
# favour of the start_new_session() call below -- confirm against the real file.
self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe')
# Start a new browser session with the method's default arguments.
self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down Expand Up @@ -156,6 +158,80 @@ def dist_get_qty_avail(self, html_tree):
self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!')
return None

def dist_define_locale_currency(self, locale_iso=None, currency_iso=None):
    '''@brief Configure the distributor for the country and currency intended.
    Set the currency preference cookie for the regional Mouser domain the
    browser was last redirected to, store that regional URL together with
    the currency and locale in `distributor_dict`, and restart the browser
    session so that the preference is applied.
    The currency is predominant over the locale/country and the default
    settings depend on your location. When neither argument is given
    nothing is changed. If anything fails, the previous configuration is
    kept and the failure is only logged.
    @param locale_iso `str` Country in ISO3166 alpha 2 standard.
    @param currency_iso `str` Currency in ISO4217 alpha 3 standard.'''

    # Configuring Mouser locale and currency is difficult because
    # Mouser automatically selects locale, currency and the Mouser
    # (sub)domain used based on your IP address! All attempts to override
    # this behaviour within a session seem to be ignored so far
    # (you will be 302 redirected to your regional domain).

    # TODO:
    # Switching locale via the following URL ignores currency settings!
    # Switching to regions far away from your location is rejected!
    # url = 'https://www.mouser.com/localsites.aspx'

    # The following approach works for selecting currency:
    # - Access "www.mouser.com" (done in constructor) and store local redirect URL.
    # - Manually set currency preference for your regional URL.
    # - Completely restart fake_browser session to apply currency settings.
    # Switching locale does not seem to be possible yet.
    try:
        if currency_iso and not locale_iso:
            # NOTE(review): this maps currency -> country through identical
            # ISO numeric codes, which only exist for some currencies
            # (e.g. USD/US = 840, while EUR = 978 has no country) -- confirm
            # that falling into the "kept configuration" path is intended
            # for the others.
            money = pycountry.currencies.get(alpha_3=currency_iso.upper())
            locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2
        if locale_iso:
            # Resolve everything that can fail BEFORE touching the cookie
            # jar or distributor_dict, so that a failure really keeps the
            # previous configuration instead of a half-applied one.
            country = pycountry.countries.get(alpha_2=locale_iso.upper())
            site_currency = pycountry.currencies.get(numeric=country.numeric).alpha_3

            if currency_iso:
                currency_iso = currency_iso.upper()
            else:
                # Only a locale was requested: use the currency of that
                # country instead of failing on `None.upper()`.
                currency_iso = site_currency

            # TODO: Mouser uses either "USDe" or "USDu" to select USD as
            # currency, depending on your location.
            if currency_iso == "USD":
                currency_iso = "USDe"

            # Some Mouser regions are subdomains of mouser.com, other
            # regions use their own top level domains, e.g. mouser.eu.
            # Extract the region specific part and suffix it to
            # the preferences cookie.
            # NOTE(review): the pattern is greedy and expects ret_url to be
            # the bare regional base URL (as stored after the constructor's
            # request); a ret_url carrying a path would be split wrongly.
            local_domains = re.search(r"https://(.+)\.mouser\.(.+)/", self.browser.ret_url)
            if local_domains is None:
                raise ValueError('Unexpected Mouser URL: %s' % self.browser.ret_url)
            subdomain = local_domains.group(1)
            top_level = local_domains.group(2)
            if subdomain.startswith("www"):
                domain = top_level
            else:
                domain = subdomain

            # Store currency preference (pc_%localdomain)
            # for your regional domain.
            self.browser.add_cookie('.mouser.%s' % top_level,
                    'preferences', 'pc_%s=%s' % (domain, currency_iso))

            # Store new localized url in distributor_dict.
            distributor_dict[self.name]['site']['url'] = self.browser.ret_url.rstrip('/')
            distributor_dict[self.name]['site']['currency'] = site_currency
            distributor_dict[self.name]['site']['locale'] = locale_iso

            # Restarting the complete session is required to apply
            # new locale and currency settings.
            self.browser.domain = distributor_dict[self.name]['site']['url']
            self.browser.start_new_session()

    except Exception as ex:
        # Deliberately best-effort: any lookup, parsing or network problem
        # leaves the distributor on its current configuration.
        self.logger.log(DEBUG_OBSESSIVE, "Exception was %s" % type(ex).__name__)
        self.logger.log(DEBUG_OVERVIEW, 'Kept the last configuration {}, {} on {}.'.format(
                pycountry.currencies.get(alpha_3=distributor_dict[self.name]['site']['currency']).name,
                pycountry.countries.get(alpha_2=distributor_dict[self.name]['site']['locale']).name,
                distributor_dict[self.name]['site']['url']
            )) # Keep the current configuration.

def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2):
'''@brief Find the Mouser HTML page for a part number and return the URL and parse tree.
@param pn Part number `str()`.
Expand All @@ -167,13 +243,14 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2

# Use the part number to lookup the part using the site search function, unless a starting url was given.
if url is None:
url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote(pn, safe='')
url = distributor_dict[self.name]['site']['url'] + \
'/Search/Refine.aspx?Keyword=' + urlquote(pn, safe='')
if extra_search_terms:
url = url + urlquote(' ' + extra_search_terms, safe='')
elif url[0] == '/':
url = 'https://www.mouser.com' + url
url = distributor_dict[self.name]['site']['url'] + url
elif url.startswith('..'):
url = 'https://www.mouser.com/Search/' + url
url = distributor_dict[self.name]['site']['url'] + '/Search/' + url

# Open the URL, read the HTML from it, and parse it into a tree structure.
try:
Expand Down
1 change: 1 addition & 0 deletions kicost/distributors/newark/newark.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class dist_newark(distributor.distributor):
def __init__(self, name, scrape_retries, throttle_delay):
    '''@brief Initialise the newark distributor and start its browser session.
    @param name `str` Key of this distributor inside `distributor_dict`.
    @param scrape_retries Forwarded unchanged to the base class constructor.
    @param throttle_delay Forwarded unchanged to the base class constructor.'''
    # Site URL registered for this distributor.
    site_url = distributor_dict[name]['site']['url']
    super(dist_newark, self).__init__(name, site_url, scrape_retries, throttle_delay)
    # Start a new session with the method's default arguments.
    # NOTE(review): presumably self.browser is created by the base
    # constructor -- confirm in distributor.distributor.
    self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down
1 change: 1 addition & 0 deletions kicost/distributors/rs/rs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class dist_rs(distributor.distributor):
def __init__(self, name, scrape_retries, throttle_delay):
    '''@brief Initialise the rs distributor and start its browser session.
    @param name `str` Key of this distributor inside `distributor_dict`.
    @param scrape_retries Forwarded unchanged to the base class constructor.
    @param throttle_delay Forwarded unchanged to the base class constructor.'''
    # Site URL registered for this distributor.
    site_url = distributor_dict[name]['site']['url']
    super(dist_rs, self).__init__(name, site_url, scrape_retries, throttle_delay)
    # Start a new session with the method's default arguments.
    # NOTE(review): presumably self.browser is created by the base
    # constructor -- confirm in distributor.distributor.
    self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down
1 change: 1 addition & 0 deletions kicost/distributors/tme/tme.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class dist_tme(distributor.distributor):
def __init__(self, name, scrape_retries, throttle_delay):
    '''@brief Initialise the tme distributor and start its browser session.
    @param name `str` Key of this distributor inside `distributor_dict`.
    @param scrape_retries Forwarded unchanged to the base class constructor.
    @param throttle_delay Forwarded unchanged to the base class constructor.'''
    # Site URL registered for this distributor.
    site_url = distributor_dict[name]['site']['url']
    super(dist_tme, self).__init__(name, site_url, scrape_retries, throttle_delay)
    # Start a new session with the method's default arguments.
    # NOTE(review): presumably self.browser is created by the base
    # constructor -- confirm in distributor.distributor.
    self.browser.start_new_session()

@staticmethod
def dist_init_distributor_dict():
Expand Down

0 comments on commit e2db72c

Please sign in to comment.