Improved Excel compatibility

Fixed some issues to get the same behaviour in LibreOffice and Microsoft Excel (the CEILING function and "======" in comments).
hildogjr · Sep 20, 2017 · 3343eff · 3343eff
1 parent b87edd1
commit 3343eff
Show file tree

Hide file tree

Showing 20 changed files with 2,344 additions and 244 deletions.
diff --git a/kicost/distributors/__init__.py b/kicost/distributors/__init__.py
@@ -0,0 +1,75 @@
+# -*- coding: utf-8 -*-
+
+__author__ = 'XESS Corporation'
+__email__ = 'info@xess.com'
+
+from random import randint
+
+# Global constants for distributor site scraping.
+import http.client # For web scraping exceptions.
+try:
+    # Python 3 module layout.
+    from urllib.parse import urlencode, quote as urlquote, urlsplit, urlunsplit
+    import urllib.request
+    from urllib.request import urlopen, Request
+    from urllib.error import URLError
+except ImportError:
+    # Python 2 module layout: quote() and urlencode() live in urllib (urlparse
+    # has no quote()), and URLError is provided by urllib2.
+    from urlparse import urlsplit, urlunsplit
+    from urllib import urlencode, quote as urlquote
+    from urllib2 import urlopen, Request, URLError
+HTML_RESPONSE_RETRIES = 2 # Num of retries for getting part data web page.
+# Exceptions treated as a failed page fetch. URLError is bound by both import
+# branches above, so this tuple can be built on either Python version.
+WEB_SCRAPE_EXCEPTIONS = (URLError, http.client.HTTPException)
+
+
+def get_user_agent():
+    '''Return one browser User-Agent string picked at random from a fixed pool.'''
+    # Every entry is a Chrome User-Agent; the pool varies the Chrome build and
+    # the platform (Windows, Linux, Chrome OS, Mac OS X).
+    # More strings can be found at http://www.useragentstring.com/pages/useragentstring.php
+    browser_pool = (
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
+        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
+        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
+    )
+    # randint() includes both end points, hence the "- 1".
+    pick = randint(0, len(browser_pool) - 1)
+    return browser_pool[pick]
+
+def FakeBrowser(url):
+    '''Build a Request for url that carries browser-like headers.
+
+    The request declares an 'en-US' Accept-Language and a User-agent string
+    chosen at random by get_user_agent().
+    '''
+    browser_request = Request(url)
+    headers = (
+        ('Accept-Language', 'en-US'),
+        ('User-agent', get_user_agent()),
+    )
+    for header_name, header_value in headers:
+        browser_request.add_header(header_name, header_value)
+    return browser_request
+
+
+# Package-wide dictionary of distributor information. It starts out empty;
+# it is created before the imports below so the imported sub-packages can see it.
+distributors = {}
+
+import os
+
+# The distributor module directories live in the same directory as this file.
+directory = os.path.dirname(__file__)
+
+# Walk that directory and import each distributor module found there.
+for module in os.listdir(directory):
+    abs_module = os.path.join(directory, module)
+
+    # Skip dunder entries such as __pycache__ and anything that is not a directory.
+    if module.startswith('__') or not os.path.isdir(abs_module):
+        continue
+
+    # Relative import (level=1) of the module from within this package.
+    __import__(module, globals(), locals(), [], level=1)
diff --git a/kicost/distributors/digikey/__init__.py b/kicost/distributors/digikey/__init__.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+
+__author__ = 'XESS Corporation'
+__email__ = 'info@xess.com'
+
+from .digikey import *
+
+# Register this distributor's settings in the shared distributor dictionary.
+from .. import distributors
+distributors['digikey'] = {
+    'module': 'digikey', # The directory name containing this file.
+    'scrape': 'web',     # Allowable values: 'web' or 'local'.
+    'label': 'Digi-Key', # Distributor label used in spreadsheet columns.
+    'order_cols': ['purch', 'part_num', 'refs'],  # Sort-order for online orders.
+    'order_delimiter': ',',  # Delimiter for online orders.
+    # Formatting for distributor header in worksheet.
+    'wrk_hdr_format': {
+        'font_size': 14,
+        'font_color': 'white',
+        'bold': True,
+        'align': 'center',
+        'valign': 'vcenter',
+        'bg_color': '#CC0000'  # Digi-Key red.
+    }
+}
+
diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py
@@ -0,0 +1,286 @@
+# MIT license
+#
+# Copyright (C) 2015 by XESS Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+# Inserted by Pasteurize tool.
+from __future__ import print_function
+from __future__ import unicode_literals
+from __future__ import division
+from __future__ import absolute_import
+from builtins import zip
+from builtins import range
+from builtins import int
+from builtins import str
+from future import standard_library
+standard_library.install_aliases()
+
+import future
+
+import re
+import difflib
+from bs4 import BeautifulSoup
+import http.client # For web scraping exceptions.
+from .. import urlquote, urlsplit, urlunsplit, urlopen, Request
+from .. import HTML_RESPONSE_RETRIES
+from .. import WEB_SCRAPE_EXCEPTIONS
+from .. import FakeBrowser
+from ...kicost import PartHtmlError
+from ...kicost import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE
+
+
+def get_price_tiers(html_tree):
+    '''Get the pricing tiers from the parsed tree of the Digikey product page.
+
+    Args:
+        html_tree: Parsed product page offering the find()/find_all() lookups
+            used below.
+
+    Returns:
+        A dict mapping each quantity (int) to its price (float). The dict is
+        empty when the page has no 'product-dollars' table.
+    '''
+    price_tiers = {}
+    try:
+        rows = html_tree.find('table', id='product-dollars').find_all('tr')
+    except AttributeError:
+        # find() returned None, i.e. no pricing info is present in the tree.
+        logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!')
+        return price_tiers  # Return empty price tiers.
+
+    for tr in rows:
+        try:
+            td = tr.find_all('td')
+            # Keep only the digits of the quantity, and only the digits and
+            # decimal point of the price (drops '$', ',' and whitespace).
+            qty = int(re.sub(r'[^0-9]', '', td[0].text))
+            price_tiers[qty] = float(re.sub(r'[^0-9.]', '', td[1].text))
+        except (TypeError, AttributeError, ValueError,
+                IndexError):  # Happens when there's no <td> in table row.
+            continue
+    return price_tiers
+
+
+def part_is_reeled(html_tree):
+    '''Returns True if this Digi-Key part is reeled or Digi-reeled.
+
+    A page counts as reeled when its smallest price-tier quantity is at least
+    100, or when it contains a 'product-details-reel-pricing' table.
+    '''
+    tiers = get_price_tiers(html_tree)
+    if tiers and min(tiers) >= 100:
+        return True
+    reel_table = html_tree.find('table', id='product-details-reel-pricing')
+    return reel_table is not None
+
+
+def get_part_num(html_tree):
+    '''Get the part number from the Digikey product page.
+
+    Returns:
+        The text of the 'reportPartNumber' cell with all whitespace removed,
+        or '' when the page has no such cell.
+    '''
+    try:
+        part_num_cell = html_tree.find('td', id='reportPartNumber')
+        # Raw string so that \s reaches the regex engine as written.
+        return re.sub(r'\s', '', part_num_cell.text)
+    except AttributeError:
+        # find() returned None, so there was no .text to read.
+        logger.log(DEBUG_OBSESSIVE, 'No Digikey part number found!')
+        return ''
+
+
+def get_qty_avail(html_tree):
+    '''Get the available quantity of the part from the Digikey product page.
+
+    Returns:
+        None when the page has no quantity element at all, the parsed
+        quantity (int) when one can be read from the element's text or from
+        a text input inside it, and 0 when the element exists but nothing
+        decipherable is found in it.
+    '''
+    try:
+        qty_tree = html_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
+        qty_str = qty_tree.text
+    except AttributeError:
+        # No quantity found (not even 0) so this is probably a non-stocked part.
+        # Return None so the part won't show in the spreadsheet for this dist.
+        return None
+    try:
+        # Take the run of digits/commas at the very start of the text; the
+        # pattern can match an empty string, which then fails int() below.
+        qty_str = re.search('([0-9,]*)', qty_str, re.IGNORECASE).group(1)
+        return int(re.sub('[^0-9]', '', qty_str))
+    except (AttributeError, ValueError):
+        # Didn't find the usual quantity text field. This might be one of those
+        # input fields for requesting a quantity, so get the value from the
+        # input field.
+        try:
+            logger.log(DEBUG_OBSESSIVE, 'No Digikey part quantity found!')
+            return int(qty_tree.find('input', type='text').get('value'))
+        except (AttributeError, ValueError, TypeError):
+            # Well, there's a quantityAvailable section in the website, but
+            # it doesn't contain anything decipherable. Let's just assume it's 0.
+            # (TypeError covers an input without a value attribute: get()
+            # then yields None and int(None) raises TypeError.)
+            return 0
+
+
+def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None):
+    '''Find the Digikey HTML page for a part number and return the parse tree and URL.
+
+    Args:
+        dist: Distributor name; only used in log messages and passed on to
+            recursive calls.
+        pn: Part number to look up.
+        extra_search_terms: Text appended to pn (after a space) in the site
+            search query that is built when no url is given.
+        url: Page to fetch instead of running a site search. A value that
+            starts with '/' is prefixed with 'http://www.digikey.com'.
+        descend: Link-following budget. Alternate-packaging pages are only
+            fetched while it is > 0, and following a link out of a product
+            table recurses with descend - 1.
+        local_part_html: Not referenced anywhere in this function.
+
+    Returns:
+        A (tree, url) tuple: the BeautifulSoup tree of the product page and
+        the URL it was read from.
+
+    Raises:
+        PartHtmlError: If the page cannot be read or parsed, if pn does not
+            appear in the HTML, or if no product page can be reached.
+    '''
+
+    def merge_price_tiers(main_tree, alt_tree):
+        '''Merge the price tiers from the alternate-packaging tree into the main tree.'''
+        try:
+            # Every pricing row of the alternate page is inserted right after
+            # the first row of the main page's pricing table.
+            insertion_point = main_tree.find('table', id='product-dollars').find('tr')
+            for tr in alt_tree.find('table', id='product-dollars').find_all('tr'):
+                insertion_point.insert_after(tr)
+        except AttributeError:
+            # One of the find() calls returned None (table or row missing).
+            logger.log(DEBUG_OBSESSIVE, 'Problem merging price tiers for Digikey part {} with alternate packaging!'.format(pn))
+
+    def merge_qty_avail(main_tree, alt_tree):
+        '''Merge the quantities from the alternate-packaging tree into the main tree.'''
+        try:
+            main_qty = get_qty_avail(main_tree)
+            alt_qty = get_qty_avail(alt_tree)
+            # Use whichever quantity is known; when both are, keep the larger.
+            if main_qty is None:
+                merged_qty = alt_qty
+            elif alt_qty is None:
+                merged_qty = main_qty
+            else:
+                merged_qty = max(main_qty, alt_qty)
+            if merged_qty is not None:
+                # Write the merged value back into the main page's quantity element.
+                insertion_point = main_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
+                insertion_point.string = '{}'.format(merged_qty)
+        except AttributeError:
+            logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn))
+
+    # Use the part number to lookup the part using the site search function, unless a starting url was given.
+    if url is None:
+        url = 'http://www.digikey.com/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote(
+            pn + ' ' + extra_search_terms,
+            safe='')
+        #url = 'http://www.digikey.com/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go'
+    elif url[0] == '/':
+        # Site-relative link: make it absolute.
+        url = 'http://www.digikey.com' + url
+
+    # Open the URL, read the HTML from it, and parse it into a tree structure.
+    req = FakeBrowser(url)
+    for _ in range(HTML_RESPONSE_RETRIES):
+        try:
+            response = urlopen(req)
+            html = response.read()
+            break
+        except WEB_SCRAPE_EXCEPTIONS:
+            logger.log(DEBUG_DETAILED,'Exception while web-scraping {} from {}'.format(pn, dist))
+
+    # for/else: this branch only runs when no attempt reached the break above.
+    else: # Couldn't get a good read from the website.
+        logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist))
+        raise PartHtmlError
+
+    # Abort if the part number isn't in the HTML somewhere.
+    # (Only use the numbers and letters to compare PN to HTML.)
+    # NOTE(review): '[\W_]' is a non-raw literal containing the escape \W;
+    # a raw string would be the conventional spelling — confirm before changing.
+    if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))):
+        logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist))
+        raise PartHtmlError
+
+    # Use the following code if Javascript challenge pages are used to block scrapers.
+    # try:
+    # ghst = Ghost()
+    # sess = ghst.start(plugins_enabled=False, download_images=False, show_scrollbars=False, javascript_enabled=False)
+    # html, resources = sess.open(url)
+    # print('type of HTML is {}'.format(type(html.content)))
+    # html = html.content
+    # except Exception as e:
+    # print('Exception reading with Ghost: {}'.format(e))
+
+    try:
+        tree = BeautifulSoup(html, 'lxml')
+    except Exception:
+        logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist))
+        raise PartHtmlError
+
+    # If the tree contains the tag for a product page, then return it.
+    if tree.find('div', class_='product-top-section') is not None:
+
+        # Digikey separates cut-tape and reel packaging, so we need to examine more pages
+        # to get all the pricing info. But don't descend any further if limit has been reached.
+        if descend > 0:
+            try:
+                # Find all the URLs to alternate-packaging pages for this part.
+                ap_urls = [
+                    ap.find('li', class_='lnkAltPack').find_all('a')[-1].get('href')
+                    for ap in tree.find(
+                        'div', class_='bota',
+                        id='additionalPackaging').find_all(
+                            'ul', class_='more-expander-item')
+                ]
+                logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, dist))
+                ap_trees_and_urls = []  # Initialize as empty in case no alternate packagings are found.
+                try:
+                    # descend=0 stops each alternate page from fetching further pages.
+                    # A failure on any one page leaves the whole list empty.
+                    ap_trees_and_urls = [get_part_html_tree(dist, pn, 
+                                     extra_search_terms, ap_url, descend=0)
+                                     for ap_url in ap_urls]
+                except Exception:
+                    logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, dist))
+
+                # Put the main tree on the list as well and then look through
+                # the entire list for one that's non-reeled. Use this as the
+                # main page for the part.
+                ap_trees_and_urls.append((tree, url))
+                if part_is_reeled(tree):
+                    for ap_tree, ap_url in ap_trees_and_urls:
+                        if not part_is_reeled(ap_tree):
+                            # Found a non-reeled part, so use it as the main page.
+                            tree = ap_tree
+                            url = ap_url
+                            break  # Done looking.
+
+                # Now go through the other pages, merging their pricing and quantity
+                # info into the main page.
+                for ap_tree, ap_url in ap_trees_and_urls:
+                    if ap_tree is tree:
+                        continue  # Skip examining the main tree. It already contains its info.
+                    try:
+                        # Merge the pricing info from that into the main parse tree to make
+                        # a single, unified set of price tiers...
+                        merge_price_tiers(tree, ap_tree)
+                        # and merge available quantity, using the maximum found.
+                        merge_qty_avail(tree, ap_tree)
+                    except AttributeError:
+                        logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, dist))
+                        continue
+            except AttributeError as e:
+                # Raised when the page has no additional-packaging section
+                # (a find() above returned None); the main page is still returned.
+                logger.log(DEBUG_OBSESSIVE,'Problem parsing URLs from product page for {} from {}'.format(pn, dist))
+
+        return tree, url  # Return the parse tree and the URL where it came from.
+
+    # If the tree is for a list of products, then examine the links to try to find the part number.
+    if tree.find('table', id='productTable') is not None:
+        logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist))
+        if descend <= 0:
+            logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist))
+            raise PartHtmlError
+        else:
+            # Look for the table of products.
+            products = tree.find(
+                'table',
+                id='productTable').find('tbody').find_all('tr')
+
+            # Extract the product links for the part numbers from the table.
+            # Extract links for both manufacturer and catalog numbers.
+            # NOTE(review): a row without one of these cells makes find()
+            # return None and the .a access raise AttributeError, and a cell
+            # without an anchor puts None in the list (l.text then fails
+            # below) — presumably every row has both links; verify.
+            product_links = [p.find('td',
+                                    class_='tr-mfgPartNumber').a
+                             for p in products]
+            product_links.extend([p.find('td',
+                                    class_='tr-dkPartNumber').a
+                             for p in products])
+
+            # Extract all the part numbers from the text portion of the links.
+            part_numbers = [l.text for l in product_links]
+
+            # Look for the part number in the list that most closely matches the requested part number.
+            # NOTE(review): the 0.0 cutoff accepts any candidate, however
+            # dissimilar; the [0] would raise IndexError if part_numbers were
+            # empty — confirm whether an empty product table can occur.
+            match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0]
+
+            # Now look for the link that goes with the closest matching part number.
+            for l in product_links:
+                if l.text == match:
+                    # Get the tree for the linked-to page and return that.
+                    logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, dist))
+                    return get_part_html_tree(dist, pn, extra_search_terms,
+                                                      url=l['href'],
+                                                      descend=descend - 1)
+
+    # If the HTML contains a list of part categories, then give up.
+    if tree.find('form', id='keywordSearchForm') is not None:
+        logger.log(DEBUG_OBSESSIVE,'Found high-level part categories for {} from {}'.format(pn, dist))
+        raise PartHtmlError
+
+    # I don't know what happened here, so give up.
+    logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist))
+    raise PartHtmlError