Files functions #124

Merged · 16 commits · Jan 15, 2018
153 changes: 151 additions & 2 deletions kicost/distributors/web_routines.py
@@ -26,8 +26,157 @@
__company__ = 'University of Campinas - Brazil'

# Libraries.
import sys
import copy # For copying the local distributor template.
from bs4 import BeautifulSoup # HTML/XML parser.
from yattag import Doc, indent # For generating the HTML page for local parts.
import multiprocessing # To deal with the parallel scrape.
import logging
from urllib.parse import urlsplit, urlunsplit # For normalizing part links.
from ..kicost import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE # Debug configurations.
from ..kicost import SEPRTR
from ..kicost import PartHtmlError # Raised/caught when a part page lookup fails.
#from . import distributors

#__all__ = ['']
__all__ = ['scrape_part', 'get_part_html_tree', 'create_local_part_html']

# Organizing the functions...
# Extra information to be retrieved for each part from the distributors.
EXTRA_INFO = ['value', 'tolerance', 'footprint', 'power', 'current', 'voltage', 'frequency', 'temp_coeff', 'manf',
'datasheet', 'image' # Links.
]

def create_local_part_html(parts):
'''Create HTML page containing info for local (non-webscraped) parts.'''

global distributors

logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...')

doc, tag, text = Doc().tagtext()
with tag('html'):
with tag('body'):
for p in parts:
# Find the manufacturer's part number if it exists.
pn = p.fields.get('manf#') # Returns None if no manf# field.

# Find the various distributors for this part by
# looking for leading fields terminated by SEPRTR.
for key in p.fields:
try:
dist = key[:key.index(SEPRTR)]
except ValueError:
continue

# If the distributor is not in the list of web-scrapable distributors,
# then it's a local distributor. Copy the local distributor template
# and add it to the table of distributors.
if dist not in distributors:
distributors[dist] = copy.copy(distributors['local_template'])
distributors[dist]['label'] = dist # Set dist name for spreadsheet header.

# Now look for catalog number, price list and webpage link for this part.
for dist in distributors:
cat_num = p.fields.get(dist+':cat#')
pricing = p.fields.get(dist+':pricing')
link = p.fields.get(dist+':link')
if cat_num is None and pricing is None and link is None:
continue

def make_random_catalog_number(p):
hash_fields = {k: p.fields[k] for k in p.fields}
hash_fields['dist'] = dist
return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items())))))

cat_num = cat_num or pn or make_random_catalog_number(p)
p.fields[dist+':cat#'] = cat_num # Store generated cat#.
with tag('div', klass=dist+SEPRTR+cat_num):
with tag('div', klass='cat#'):
text(cat_num)
if pricing is not None:
with tag('div', klass='pricing'):
text(pricing)
if link is not None:
url_parts = list(urlsplit(link))
if url_parts[0] == '':
url_parts[0] = u'http'
link = urlunsplit(url_parts)
with tag('div', klass='link'):
text(link)

# Remove the local distributor template so it won't be processed later on.
# It has served its purpose.
del distributors['local_template']

html = doc.getvalue()
if logger.isEnabledFor(DEBUG_OBSESSIVE):
print(indent(html))
return html
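
# A minimal usage sketch of create_local_part_html() -- illustrative only, not
# part of the diff. The part object and the 'mydist' distributor name are
# hypothetical; SEPRTR is the separator used in 'dist:field' names.
#
# class LocalPart(object):
#     fields = {
#         'manf#': 'RC0805FR-0710KL',            # Manufacturer's part number.
#         'mydist:pricing': '1:0.10;100:0.05',   # Quantity:price breaks.
#         'mydist:link': 'www.example.com/r10k', # Scheme-less; 'http' is prepended.
#     }
#
# html = create_local_part_html([LocalPart()])
# # 'html' holds one <div> per part/distributor (class built as dist+SEPRTR+cat#),
# # wrapping 'cat#', 'pricing' and 'link' sub-divs that get_part_html_tree()
# # can read back later in place of a scraped web page.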


def get_part_html_tree(part, dist, get_html_tree_func, local_part_html, scrape_retries, logger):
'''Get the HTML tree for a part from the given distributor website or local HTML.'''

logger.log(DEBUG_OBSESSIVE, '%s %s', dist, str(part.refs))

for extra_search_terms in set([part.fields.get('manf', ''), '']):
try:
# Search for part information using one of the following:
# 1) the distributor's catalog number.
# 2) the manufacturer's part number.
for key in (dist+'#', dist+SEPRTR+'cat#', 'manf#'):
if key in part.fields:
if part.fields[key]:
# Found a valid (non-empty) manufacturer / distributor code.
return get_html_tree_func(dist, part.fields[key], extra_search_terms, local_part_html=local_part_html, scrape_retries=scrape_retries)
# No distributor or manufacturer number, so give up.
else:
logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s", dist, part.refs, dist)
return BeautifulSoup('<html></html>', 'lxml'), ''
#raise PartHtmlError
except PartHtmlError:
pass
except AttributeError:
break
logger.warning("Part %s not found at %s", part.refs, dist)
# If no HTML page was found, then return a tree for an empty page.
return BeautifulSoup('<html></html>', 'lxml'), ''
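
# Lookup-order sketch (illustrative only, not part of the diff): a
# distributor-specific catalog number ('digikey#') takes priority over the
# generic 'manf#'. The stub function and field values below are hypothetical.
#
# import logging
#
# def stub_tree(dist, pn, extra_search_terms, local_part_html=None, scrape_retries=1):
#     # Stand-in for a distributor module's scraper: echo what it was asked for.
#     return '<tree for %s>' % pn, 'http://example.com/' + pn
#
# class StubPart(object):
#     refs = ['R1']
#     fields = {'digikey#': '311-10KARCT-ND', 'manf#': 'RC0805FR-0710KL', 'manf': 'Yageo'}
#
# tree, url = get_part_html_tree(StubPart(), 'digikey', stub_tree,
#                                local_part_html='', scrape_retries=1,
#                                logger=logging.getLogger('kicost'))
# # 'digikey#' wins, so the stub is queried with '311-10KARCT-ND'.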


def scrape_part(args):
'''Scrape the data for a part from each distributor website or local HTML.'''

id, part, distributor_dict, local_part_html, scrape_retries, log_level = args # Unpack the arguments.

if multiprocessing.current_process().name == "MainProcess":
scrape_logger = logging.getLogger('kicost')
else:
scrape_logger = multiprocessing.get_logger()
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(log_level)
scrape_logger.addHandler(handler)
scrape_logger.setLevel(log_level)

# Create dictionaries for the various items of part data from each distributor.
url = {}
part_num = {}
price_tiers = {}
qty_avail = {}

# Scrape the part data from each distributor website or the local HTML.
print(__all__) #TODO ERROR
print(dir(locals())) #TODO ERROR
for d in distributor_dict:
try:
#dist_module = getattr(distributor_imports, d)
dist_module = getattr(locals(), d) # NOTE: locals() exposes no module attributes, so this lookup always falls through (see TODO above).
except AttributeError:
#dist_module = getattr(distributor_imports, distributor_dict[d]['module'])
dist_module = getattr(locals(), distributor_dict[d]['module'])

# Get the HTML tree for the part.
html_tree, url[d] = get_part_html_tree(part, d, dist_module.get_part_html_tree, local_part_html, scrape_retries, scrape_logger)

# Call the functions that extract the data from the HTML tree.
part_num[d] = dist_module.get_part_num(html_tree)
qty_avail[d] = dist_module.get_qty_avail(html_tree)
price_tiers[d] = dist_module.get_price_tiers(html_tree)

# Return the part data.
return id, url, part_num, price_tiers, qty_avail
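
# Dispatch sketch (illustrative only, not the exact code in kicost.py): the
# single tuple argument keeps scrape_part() compatible with multiprocessing's
# one-argument map functions. Pool size, retry count, and the attribute names
# the results are stored under are assumptions.
#
# import logging
# from multiprocessing import Pool
#
# args = [(i, part, distributor_dict, local_part_html, 4, logging.DEBUG)
#         for i, part in enumerate(parts)]
# with Pool(processes=4) as pool:
#     for id, url, part_num, price_tiers, qty_avail in pool.imap_unordered(scrape_part, args):
#         parts[id].url = url                  # Attach scraped data to the part.
#         parts[id].part_num = part_num
#         parts[id].price_tiers = price_tiers
#         parts[id].qty_avail = qty_avail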
161 changes: 9 additions & 152 deletions kicost/kicost.py
@@ -31,11 +31,10 @@
import future

import sys
import copy
import logging, pprint
from bs4 import BeautifulSoup # HTML/XML parser.
from yattag import Doc, indent # For generating HTML page for local parts.
import multiprocessing, tqdm
import pprint
import logging
import tqdm
import multiprocessing # To deal with the parallel scrape.
from multiprocessing import Pool # For running web scrapes in parallel.

try:
@@ -71,21 +70,15 @@ class PartHtmlError(Exception):
# Import information about various distributors.
from . import distributors as distributor_imports
distributors = distributor_imports.distributors
create_local_part_html = distributor_imports.create_local_part_html
scrape_part = distributor_imports.scrape_part

# Import import functions for various EDA tools.
from . import eda_tools as eda_tools_imports
eda_tools = eda_tools_imports.eda_tools
group_parts = eda_tools_imports.group_parts
from .eda_tools.eda_tools import SUB_SEPRTR

# Regular expression for detecting part reference ids consisting of a
# prefix of letters followed by a sequence of digits, such as 'LED10'
# or a sequence of digits followed by a subpart number like 'CONN1#3'.
# There can even be an interposer character so 'LED-10' is also OK.
#PART_REF_REGEX = re.compile('(?P<prefix>[a-z]+\W?)(?P<num>((?P<ref_num>\d+)({}(?P<subpart_num>\d+))?))'.format(SUB_SEPRTR), re.IGNORECASE)
#from .eda_tools.eda_tools import PART_REF_REGEX

from .spreadsheet import *
from .spreadsheet import * # Creation of the final XLSX spreadsheet.

def kicost(in_file, out_filename, user_fields, ignore_fields, variant, num_processes,
eda_tool_name, exclude_dist_list, include_dist_list, scrape_retries):
@@ -96,7 +89,7 @@ def kicost(in_file, out_filename, user_fields, ignore_fields, variant, num_proce
include_dist_list = list(distributors.keys())
rmv_dist = set(exclude_dist_list)
rmv_dist |= set(list(distributors.keys())) - set(include_dist_list)
rmv_dist -= set(['local_template']) # We need this later for creating non-web distributors.
rmv_dist -= set(['local_template']) # Needed later for creating non-web distributors.
for dist in rmv_dist:
distributors.pop(dist, None)
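
# Worked sketch of the filtering above (hypothetical names): remove everything
# explicitly excluded, plus everything not explicitly included, but never the
# 'local_template' entry.
#
# distributors = {'digikey': {}, 'mouser': {}, 'farnell': {}, 'local_template': {}}
# include_dist_list = ['digikey']   # User asked for Digi-Key only.
# exclude_dist_list = []
# # After the three set operations, rmv_dist == {'mouser', 'farnell'} and the
# # loop leaves distributors == {'digikey': {}, 'local_template': {}}.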

@@ -202,140 +195,4 @@ def update(x):
print(part.__dict__[f])
except KeyError:
pass
print()


def create_local_part_html(parts):
'''Create HTML page containing info for local (non-webscraped) parts.'''

global distributors

logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...')

doc, tag, text = Doc().tagtext()
with tag('html'):
with tag('body'):
for p in parts:
# Find the manufacturer's part number if it exists.
pn = p.fields.get('manf#') # Returns None if no manf# field.

# Find the various distributors for this part by
# looking for leading fields terminated by SEPRTR.
for key in p.fields:
try:
dist = key[:key.index(SEPRTR)]
except ValueError:
continue

# If the distributor is not in the list of web-scrapable distributors,
# then it's a local distributor. Copy the local distributor template
# and add it to the table of distributors.
if dist not in distributors:
distributors[dist] = copy.copy(distributors['local_template'])
distributors[dist]['label'] = dist # Set dist name for spreadsheet header.

# Now look for catalog number, price list and webpage link for this part.
for dist in distributors:
cat_num = p.fields.get(dist+':cat#')
pricing = p.fields.get(dist+':pricing')
link = p.fields.get(dist+':link')
if cat_num is None and pricing is None and link is None:
continue

def make_random_catalog_number(p):
hash_fields = {k: p.fields[k] for k in p.fields}
hash_fields['dist'] = dist
return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items())))))

cat_num = cat_num or pn or make_random_catalog_number(p)
p.fields[dist+':cat#'] = cat_num # Store generated cat#.
with tag('div', klass=dist+SEPRTR+cat_num):
with tag('div', klass='cat#'):
text(cat_num)
if pricing is not None:
with tag('div', klass='pricing'):
text(pricing)
if link is not None:
url_parts = list(urlsplit(link))
if url_parts[0] == '':
url_parts[0] = u'http'
link = urlunsplit(url_parts)
with tag('div', klass='link'):
text(link)

# Remove the local distributor template so it won't be processed later on.
# It has served its purpose.
del distributors['local_template']

html = doc.getvalue()
if logger.isEnabledFor(DEBUG_OBSESSIVE):
print(indent(html))
return html


def get_part_html_tree(part, dist, get_html_tree_func, local_part_html, scrape_retries, logger):
'''Get the HTML tree for a part from the given distributor website or local HTML.'''

logger.log(DEBUG_OBSESSIVE, '%s %s', dist, str(part.refs))

for extra_search_terms in set([part.fields.get('manf', ''), '']):
try:
# Search for part information using one of the following:
# 1) the distributor's catalog number.
# 2) the manufacturer's part number.
for key in (dist+'#', dist+SEPRTR+'cat#', 'manf#'):
if key in part.fields:
if part.fields[key]:
# Found a valid (non-empty) manufacturer / distributor code.
return get_html_tree_func(dist, part.fields[key], extra_search_terms, local_part_html=local_part_html, scrape_retries=scrape_retries)
# No distributor or manufacturer number, so give up.
else:
logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s", dist, part.refs, dist)
return BeautifulSoup('<html></html>', 'lxml'), ''
#raise PartHtmlError
except PartHtmlError:
pass
except AttributeError:
break
logger.warning("Part %s not found at %s", part.refs, dist)
# If no HTML page was found, then return a tree for an empty page.
return BeautifulSoup('<html></html>', 'lxml'), ''


def scrape_part(args):
'''Scrape the data for a part from each distributor website or local HTML.'''

id, part, distributor_dict, local_part_html, scrape_retries, log_level = args # Unpack the arguments.

if multiprocessing.current_process().name == "MainProcess":
scrape_logger = logging.getLogger('kicost')
else:
scrape_logger = multiprocessing.get_logger()
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(log_level)
scrape_logger.addHandler(handler)
scrape_logger.setLevel(log_level)

# Create dictionaries for the various items of part data from each distributor.
url = {}
part_num = {}
price_tiers = {}
qty_avail = {}

# Scrape the part data from each distributor website or the local HTML.
for d in distributor_dict:
try:
dist_module = getattr(distributor_imports, d)
except AttributeError:
dist_module = getattr(distributor_imports, distributor_dict[d]['module'])

# Get the HTML tree for the part.
html_tree, url[d] = get_part_html_tree(part, d, dist_module.get_part_html_tree, local_part_html, scrape_retries, scrape_logger)

# Call the functions that extract the data from the HTML tree.
part_num[d] = dist_module.get_part_num(html_tree)
qty_avail[d] = dist_module.get_qty_avail(html_tree)
price_tiers[d] = dist_module.get_price_tiers(html_tree)

# Return the part data.
return id, url, part_num, price_tiers, qty_avail
print()