From 629a80853dac3bc0e1e2733205e9017464e64100 Mon Sep 17 00:00:00 2001 From: studentbrad Date: Fri, 4 Oct 2019 02:20:18 -0400 Subject: [PATCH 1/2] changed --no_scrape option to gather all pickles --- jobfunnel/__init__.py | 2 +- jobfunnel/__main__.py | 4 +-- jobfunnel/config/parser.py | 2 +- jobfunnel/jobfunnel.py | 27 +++++++++++------- readme.md | 57 +++++++++++++++++++++----------------- 5 files changed, 52 insertions(+), 40 deletions(-) diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py index 7bb021e2..bc50bee6 100644 --- a/jobfunnel/__init__.py +++ b/jobfunnel/__init__.py @@ -1 +1 @@ -__version__ = '1.1.3' +__version__ = '1.1.4' diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py index 7d3c70a2..643a01ef 100755 --- a/jobfunnel/__main__.py +++ b/jobfunnel/__main__.py @@ -25,9 +25,9 @@ def main(): # parse the master list path to update filter list jp.update_filterjson() - # get jobs by either scraping jobs or loading today's dumped pickle + # get jobs by either scraping jobs or loading dumped pickles if config['no_scrape']: - jp.load_pickle(config) + jp.load_pickles(config) else: for p in config['providers']: provider = providers[p](config) diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py index cb1af603..0dcb8765 100644 --- a/jobfunnel/config/parser.py +++ b/jobfunnel/config/parser.py @@ -52,7 +52,7 @@ def parse_cli(): dest='no_scrape', action='store_true', default=False, - help='skip web-scraping and load a previously saved pickle') + help='skip web-scraping and load previously saved pickles') return parser.parse_args() diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py index 3ffdc44f..cd5a79ea 100644 --- a/jobfunnel/jobfunnel.py +++ b/jobfunnel/jobfunnel.py @@ -5,6 +5,7 @@ import json import logging import os +import re import csv import random from datetime import date @@ -88,13 +89,20 @@ def scrape(self): """ to be implemented by child classes""" raise NotImplementedError() - def load_pickle(self, args): - # try to load today's pickle from set var first: - pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl') - try: - self.scrape_data = pickle.load(open(pickle_filepath, 'rb')) - except FileNotFoundError as e: - logging.error(f'{pickle_filepath} not found! Have you scraped any jobs today?') + def load_pickles(self, args): + # try to load any pickle from the data path + pickle_found = False + pickle_path = os.path.join(args['data_path']) + for root, dirs, files in os.walk(pickle_path): + for file in files: + if re.findall(r'jobs_.*', file): + if not pickle_found: pickle_found = True + pickle_file = file + pickle_filepath = os.path.join(pickle_path, pickle_file) + logging.info(f'loading pickle file: {pickle_filepath}') + self.scrape_data.update(pickle.load(open(pickle_filepath, 'rb'))) + if not pickle_found: + logging.error(f'no pickles found in {pickle_path}! 
Have you scraped any jobs?') raise e def dump_pickle(self): @@ -132,8 +140,7 @@ def remove_jobs_in_filterlist(self, data: Dict[str, dict]): if jobid in data: data.pop(jobid) n_filtered += 1 - logging.info(f'removed {n_filtered} jobs present in filter-list' - ' from master-list') + logging.info(f'removed {n_filtered} jobs present in filter-list') else: self.logger.warning( f'no jobs filtered, missing {self.filterlist_path}') @@ -200,7 +207,7 @@ def update_masterlist(self): self.remove_jobs_in_filterlist(masterlist) self.remove_blacklisted_companies(masterlist) - # update masterslist to contain only new (unqiue) listings + # update masterlist to contain only new (unique) listings tfidf_filter(self.scrape_data, masterlist) masterlist.update(self.scrape_data) diff --git a/readme.md b/readme.md index 972ee0f3..adb7b907 100644 --- a/readme.md +++ b/readme.md @@ -38,40 +38,44 @@ funnel --help 1. Set your job search preferences in the `yaml` configuration file (or use `-kw`). 1. Run `funnel` to scrape all-available job listings. -1. Review jobs in the master list, set any undesired jobs `status` to `archive`, these jobs will be removed from the `.csv` next time you run `funnel`. -1. If you get an `interview`/`offer` or are `rejected`, update the job `status`. +1. Review jobs in the master-list, update the job `status` to other values such as `interview` or `offer`. +1. Set any undesired job `status` to `archive`, these jobs will be removed from the `.csv` next time you run `funnel`. 1. Check out [demo/readme.md][demo] if you want to try the demo. __*Note*__: `rejected` jobs will be filtered out and will disappear from the output `.csv`. ### Usage Notes -* Note that any custom states (i.e `applied`) are preserved in the spreadsheet. -* To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -* You can keep multiple search results across multiple `.csv` files: -``` -funnel -kw Python -o python_search -funnel -kw AI Machine Learning -o ML_search -``` -* Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`). -* JobFunnel can be easily automated to run nightly with [crontab][cron] -* You can review the job list in the command line: -``` -column -s, -t < master_list.csv | less -#2 -N -S -``` -* You can run several independent job searches with a directory structure like the following: +* **Custom Status**
+  Note that any custom states (e.g. `applied`) are preserved in the spreadsheet. -```bash -python_search/ -  |_ settings.yaml -ML_search/ -  |_ settings.yaml +* **Running Filters**<br />
+ To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -for dir in */ ; do - funnel -s $dir/settings.yaml -done -``` -where each `settings.yaml` file can point to it's own directory. +* **Recovering Lost Spreadsheets**
+ If ever your spreadsheet gets deleted you still have the pickle files.
+ Simply run `funnel --no_scrape` to generate a new master-list. + +* **Managing Multiple Searches**
+ You can keep multiple search results across multiple `.csv` files: + ``` + funnel -kw Python -o python_search + funnel -kw AI Machine Learning -o ML_search + ``` + +* **Filtering Undesired Companies**
+ Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`). + +* **Automating Searches**
+  JobFunnel can be easily automated to run nightly with [crontab][cron]; a sample crontab entry is sketched at the end of this document.<br />
+ For more information see the [crontab document][cron_doc]. + +* **Reviewing Jobs in Terminal**
+ You can review the job list in the command line: + ``` + column -s, -t < master_list.csv | less -#2 -N -S + ``` + @@ -79,3 +83,4 @@ where each `settings.yaml` file can point to it's own directory. [python]:https://www.python.org/ [demo]:demo/readme.md [cron]:https://en.wikipedia.org/wiki/Cron +[cron_doc]:docs/crontab/readme.md From 7f04e114c853d7f5709fa7bddb1334b4c7c47696 Mon Sep 17 00:00:00 2001 From: studentbrad Date: Fri, 4 Oct 2019 17:29:06 -0400 Subject: [PATCH 2/2] added recovery option --- jobfunnel/__main__.py | 4 +++- jobfunnel/config/parser.py | 11 ++++++++++- jobfunnel/jobfunnel.py | 17 ++++++++++++++--- readme.md | 6 +++--- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py index 643a01ef..3c28a2da 100755 --- a/jobfunnel/__main__.py +++ b/jobfunnel/__main__.py @@ -26,8 +26,10 @@ def main(): jp.update_filterjson() # get jobs by either scraping jobs or loading dumped pickles - if config['no_scrape']: + if config['recover']: jp.load_pickles(config) + elif config['no_scrape']: + jp.load_pickle(config) else: for p in config['providers']: provider = providers[p](config) diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py index 0dcb8765..9463f31c 100644 --- a/jobfunnel/config/parser.py +++ b/jobfunnel/config/parser.py @@ -52,7 +52,13 @@ def parse_cli(): dest='no_scrape', action='store_true', default=False, - help='skip web-scraping and load previously saved pickles') + help='skip web-scraping and load a previously saved daily scrape pickle') + + parser.add_argument('--recover', + dest='recover', + action='store_true', + default=False, + help='recover master-list by accessing all historic scrapes pickles') return parser.parse_args() @@ -120,6 +126,9 @@ def parse_config(): # parse the no_scrape option config['no_scrape'] = cli.no_scrape + # parse the recovery option + config['recover'] = cli.recover + # parse the log level config['log_level'] = log_levels[default_yaml['log_level']] if not given_yaml_path is None: diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py index cd5a79ea..e2cd3a0f 100644 --- a/jobfunnel/jobfunnel.py +++ b/jobfunnel/jobfunnel.py @@ -86,11 +86,22 @@ def init_logging(self): self.logger.info(f'jobfunnel initialized at {self.date_string}') def scrape(self): - """ to be implemented by child classes""" + """function to be implemented by child classes""" raise NotImplementedError() + def load_pickle(self, args): + """function to load today's daily scrape pickle""" + ## only to be used in no_scrape mode + pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl') + try: + self.scrape_data = pickle.load(open(pickle_filepath, 'rb')) + except FileNotFoundError as e: + logging.error(f'{pickle_filepath} not found! 
Have you scraped any jobs today?') + raise e + def load_pickles(self, args): - # try to load any pickle from the data path + """function to load all historic daily scrape pickles""" + ## only to be used in recovery mode pickle_found = False pickle_path = os.path.join(args['data_path']) for root, dirs, files in os.walk(pickle_path): @@ -106,7 +117,7 @@ def load_pickles(self, args): raise e def dump_pickle(self): - """ dump a pickle of the daily scrape dict""" + """function to dump a pickle of the daily scrape dict""" pickle_name = f'jobs_{self.date_string}.pkl' pickle.dump(self.scrape_data, open(os.path.join(self.pickles_dir, pickle_name), 'wb')) diff --git a/readme.md b/readme.md index adb7b907..86ffc766 100644 --- a/readme.md +++ b/readme.md @@ -52,9 +52,9 @@ __*Note*__: `rejected` jobs will be filtered out and will disappear from the out * **Running Filters**
To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -* **Recovering Lost Spreadsheets**
- If ever your spreadsheet gets deleted you still have the pickle files.
- Simply run `funnel --no_scrape` to generate a new master-list. +* **Recovering Lost Master-list**
+  If your master-list ever gets deleted, you still have the historic pickle files.<br />
+  Simply run `funnel --recover` to generate a new master-list from them (a sketch of this recovery pass appears at the end of this document). * **Managing Multiple Searches**<br />
You can keep multiple search results across multiple `.csv` files:
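
__*Note*__: the recovery pass added in these patches boils down to loading every dated scrape pickle (`jobs_<date>.pkl`, as written by `dump_pickle`) from the data path and merging them into one dictionary of job listings, which `update_masterlist` then filters and writes back out. Below is a minimal standalone sketch of that merge; the `data` folder and the `glob` pattern are illustrative only, and the real logic lives in `JobFunnel.load_pickles` above.

```python
import glob
import os
import pickle


def merge_scrape_pickles(data_path):
    """Merge every dated scrape pickle (jobs_<date>.pkl) found in
    data_path into a single dictionary of job listings."""
    merged = {}
    for filepath in sorted(glob.glob(os.path.join(data_path, 'jobs_*.pkl'))):
        with open(filepath, 'rb') as file:
            # later pickles overwrite earlier entries with the same job id
            merged.update(pickle.load(file))
    if not merged:
        raise FileNotFoundError(f'no scrape pickles found in {data_path}')
    return merged


if __name__ == '__main__':
    # 'data' is a placeholder; JobFunnel reads the real path from its config
    jobs = merge_scrape_pickles('data')
    print(f'recovered {len(jobs)} job listings')
```

The patch itself walks the directory with `os.walk` and a regular expression rather than `glob`, but the effect is the same: `--recover` rebuilds `scrape_data` from every available scrape, while `--no_scrape` only reloads today's `jobs_<date>.pkl`.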
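
__*Note*__: the **Automating Searches** bullet above points at crontab without showing an entry. A hypothetical schedule might look like the following; the time, the working directory, and the assumption that `funnel` is on cron's `PATH` are placeholders to adapt:

```
# run the job search every morning at 08:00
0 8 * * * cd /home/me/job_search && funnel -s settings.yaml
```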