
Commit

Merge pull request #34 from PaulMcInnis/studentbrad/bad_status
recover option to gather all historic pickles
Bradley Aaron Kohler authored Oct 5, 2019
2 parents f2cc231 + 7f04e11 commit 404bba7
Showing 5 changed files with 70 additions and 36 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
__version__ = '1.1.3'
__version__ = '1.1.4'
6 changes: 4 additions & 2 deletions jobfunnel/__main__.py
@@ -25,8 +25,10 @@ def main():
    # parse the master list path to update filter list
    jp.update_filterjson()

    # get jobs by either scraping jobs or loading today's dumped pickle
    if config['no_scrape']:
    # get jobs by either scraping jobs or loading dumped pickles
    if config['recover']:
        jp.load_pickles(config)
    elif config['no_scrape']:
        jp.load_pickle(config)
    else:
        for p in config['providers']:
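For orientation, the branch taken in `main()` is driven by two booleans in the parsed config; a minimal illustration of the mapping (the config dicts here are hypothetical and trimmed to the two relevant keys):

```python
# Hypothetical, trimmed config dicts and the branch each one selects in main():
config = {'recover': True,  'no_scrape': False}   # -> jp.load_pickles(config)
config = {'recover': False, 'no_scrape': True}    # -> jp.load_pickle(config)
config = {'recover': False, 'no_scrape': False}   # -> scrape each provider in config['providers']
```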
11 changes: 10 additions & 1 deletion jobfunnel/config/parser.py
@@ -52,7 +52,13 @@ def parse_cli():
                        dest='no_scrape',
                        action='store_true',
                        default=False,
                        help='skip web-scraping and load a previously saved pickle')
                        help='skip web-scraping and load a previously saved daily scrape pickle')

    parser.add_argument('--recover',
                        dest='recover',
                        action='store_true',
                        default=False,
                        help='recover master-list by accessing all historic scrape pickles')

    return parser.parse_args()

@@ -120,6 +126,9 @@ def parse_config():
    # parse the no_scrape option
    config['no_scrape'] = cli.no_scrape

    # parse the recovery option
    config['recover'] = cli.recover

    # parse the log level
    config['log_level'] = log_levels[default_yaml['log_level']]
    if given_yaml_path is not None:
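A quick way to sanity-check the new flag is to drive argparse directly. This standalone sketch mirrors the `--recover` argument added above, using a throwaway parser rather than importing the repo's `parse_cli`:

```python
import argparse

# throwaway parser that mirrors the new --recover flag
parser = argparse.ArgumentParser()
parser.add_argument('--recover',
                    dest='recover',
                    action='store_true',
                    default=False,
                    help='recover master-list by accessing all historic scrape pickles')

assert parser.parse_args([]).recover is False            # default, as set above
assert parser.parse_args(['--recover']).recover is True  # becomes config['recover']
```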
30 changes: 24 additions & 6 deletions jobfunnel/jobfunnel.py
@@ -5,6 +5,7 @@
import json
import logging
import os
import re
import csv
import random
from datetime import date
@@ -85,20 +86,38 @@ def init_logging(self):
        self.logger.info(f'jobfunnel initialized at {self.date_string}')

    def scrape(self):
        """ to be implemented by child classes"""
        """function to be implemented by child classes"""
        raise NotImplementedError()

    def load_pickle(self, args):
        # try to load today's pickle from set var first:
        """function to load today's daily scrape pickle"""
        ## only to be used in no_scrape mode
        pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl')
        try:
            self.scrape_data = pickle.load(open(pickle_filepath, 'rb'))
        except FileNotFoundError as e:
            logging.error(f'{pickle_filepath} not found! Have you scraped any jobs today?')
            raise e

    def load_pickles(self, args):
        """function to load all historic daily scrape pickles"""
        ## only to be used in recovery mode
        pickle_found = False
        pickle_path = args['data_path']
        for root, dirs, files in os.walk(pickle_path):
            for file in files:
                if re.findall(r'jobs_.*', file):
                    pickle_found = True
                    pickle_filepath = os.path.join(root, file)
                    logging.info(f'loading pickle file: {pickle_filepath}')
                    self.scrape_data.update(pickle.load(open(pickle_filepath, 'rb')))
        if not pickle_found:
            logging.error(f'no pickles found in {pickle_path}! Have you scraped any jobs?')
            raise FileNotFoundError(f'no pickles found in {pickle_path}')

    def dump_pickle(self):
        """ dump a pickle of the daily scrape dict"""
        """function to dump a pickle of the daily scrape dict"""
        pickle_name = f'jobs_{self.date_string}.pkl'
        pickle.dump(self.scrape_data,
                    open(os.path.join(self.pickles_dir, pickle_name), 'wb'))
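Taken together, the three pickle methods form a simple round-trip: `dump_pickle` writes each day's scrape to `jobs_<date>.pkl`, `load_pickle` reads back today's file, and the new `load_pickles` merges every historic file it can find. A self-contained sketch of that idea (directory name and job data are made up):

```python
import os
import pickle

data_path = 'demo_data'  # stand-in for args['data_path']
os.makedirs(data_path, exist_ok=True)

# pretend two daily scrapes were dumped on consecutive days
pickle.dump({'job1': {'title': 'Developer'}},
            open(os.path.join(data_path, 'jobs_2019-10-04.pkl'), 'wb'))
pickle.dump({'job2': {'title': 'Analyst'}},
            open(os.path.join(data_path, 'jobs_2019-10-05.pkl'), 'wb'))

# recovery: merge every historic pickle into one scrape_data dict keyed by job id
scrape_data = {}
for file in sorted(os.listdir(data_path)):
    if file.startswith('jobs_') and file.endswith('.pkl'):
        scrape_data.update(pickle.load(open(os.path.join(data_path, file), 'rb')))

print(scrape_data)  # contains both job1 and job2
```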
@@ -132,8 +151,7 @@ def remove_jobs_in_filterlist(self, data: Dict[str, dict]):
                if jobid in data:
                    data.pop(jobid)
                    n_filtered += 1
            logging.info(f'removed {n_filtered} jobs present in filter-list'
                         ' from master-list')
            logging.info(f'removed {n_filtered} jobs present in filter-list')
        else:
            self.logger.warning(
                f'no jobs filtered, missing {self.filterlist_path}')
@@ -200,7 +218,7 @@ def update_masterlist(self):
        self.remove_jobs_in_filterlist(masterlist)
        self.remove_blacklisted_companies(masterlist)

        # update masterslist to contain only new (unqiue) listings
        # update masterlist to contain only new (unique) listings
        tfidf_filter(self.scrape_data, masterlist)
        masterlist.update(self.scrape_data)

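For context on the hunk above, `update_masterlist` prunes the master-list and then folds in the new scrape. A simplified, hypothetical stand-in for that flow (plain dict-key de-duplication here instead of the repo's `tfidf_filter`, and the field names are assumptions):

```python
# Hypothetical, simplified version of the update_masterlist flow.
def merge_scrape_into_masterlist(scrape_data, masterlist, filterlist, blacklist):
    # drop filtered jobs (e.g. archived/rejected) from the master-list
    for jobid in list(masterlist):
        if jobid in filterlist:
            masterlist.pop(jobid)
    # drop jobs posted by blacklisted companies
    for jobid in list(masterlist):
        if masterlist[jobid].get('company') in blacklist:
            masterlist.pop(jobid)
    # keep only listings not already present (stand-in for tfidf_filter),
    # then merge the remainder into the master-list
    new_jobs = {k: v for k, v in scrape_data.items() if k not in masterlist}
    masterlist.update(new_jobs)
    return masterlist
```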
57 changes: 31 additions & 26 deletions readme.md
@@ -38,44 +38,49 @@ funnel --help

1. Set your job search preferences in the `yaml` configuration file (or use `-kw`).
1. Run `funnel` to scrape all-available job listings.
1. Review jobs in the master list, set any undesired job's `status` to `archive`; these jobs will be removed from the `.csv` next time you run `funnel`.
1. If you get an `interview`/`offer` or are `rejected`, update the job `status`.
1. Review jobs in the master-list and update the job `status` to other values such as `interview` or `offer`.
1. Set any undesired job's `status` to `archive`; these jobs will be removed from the `.csv` next time you run `funnel`.
1. Check out [demo/readme.md][demo] if you want to try the demo.

__*Note*__: `rejected` jobs will be filtered out and will disappear from the output `.csv`.
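
As an illustration, a typical sequence using the flags touched by this change might be (the `--no_scrape` spelling is inferred from the option's `dest`; check `funnel --help` for your installed version):
```
funnel                  # scrape and build/update the master-list
funnel --no_scrape      # reload today's pickle instead of scraping
funnel --recover        # rebuild a lost master-list from all historic pickles
```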

### Usage Notes

* Note that any custom states (e.g. `applied`) are preserved in the spreadsheet.
* To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file.
* You can keep multiple search results across multiple `.csv` files:
```
funnel -kw Python -o python_search
funnel -kw AI Machine Learning -o ML_search
```
* Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).
* JobFunnel can be easily automated to run nightly with [crontab][cron].
* You can review the job list in the command line:
```
column -s, -t < master_list.csv | less -#2 -N -S
```
* You can run several independent job searches with a directory structure like the following:
* **Custom Status** <br/>
Note that any custom states (e.g. `applied`) are preserved in the spreadsheet.

```bash
python_search/
|_ settings.yaml
ML_search/
|_ settings.yaml
* **Running Filters** <br />
To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file.

for dir in */ ; do
funnel -s $dir/settings.yaml
done
```
where each `settings.yaml` file can point to its own directory.
* **Recovering Lost Master-list** <br />
If your master-list ever gets deleted, you still have the historic pickle files. <br />
Simply run `funnel --recover` to generate a new master-list.

* **Managing Multiple Searches** <br />
You can keep multiple search results across multiple `.csv` files:
```
funnel -kw Python -o python_search
funnel -kw AI Machine Learning -o ML_search
```

* **Filtering Undesired Companies** <br />
Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).

* **Automating Searches** <br />
JobFunnel can be easily automated to run nightly with [crontab][cron]. <br />
For more information, see the [crontab document][cron_doc].
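An illustrative crontab entry (the schedule and paths are placeholders, and it assumes `funnel` is on the cron user's `PATH`):
```
# run JobFunnel every night at 11 PM using the settings in ~/job_search
0 23 * * * cd ~/job_search && funnel -s settings.yaml
```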

* **Reviewing Jobs in Terminal** <br />
You can review the job list in the command line:
```
column -s, -t < master_list.csv | less -#2 -N -S
```


<!-- links -->

[masterlist]:demo/assests/demo.png "masterlist.csv"
[python]:https://www.python.org/
[demo]:demo/readme.md
[cron]:https://en.wikipedia.org/wiki/Cron
[cron_doc]:docs/crontab/readme.md
