From 629a80853dac3bc0e1e2733205e9017464e64100 Mon Sep 17 00:00:00 2001 From: studentbrad Date: Fri, 4 Oct 2019 02:20:18 -0400 Subject: [PATCH 1/2] changed --no_scrape option to gather all pickles --- jobfunnel/__init__.py | 2 +- jobfunnel/__main__.py | 4 +-- jobfunnel/config/parser.py | 2 +- jobfunnel/jobfunnel.py | 27 +++++++++++------- readme.md | 57 +++++++++++++++++++++----------------- 5 files changed, 52 insertions(+), 40 deletions(-) diff --git a/jobfunnel/__init__.py b/jobfunnel/__init__.py index 7bb021e2..bc50bee6 100644 --- a/jobfunnel/__init__.py +++ b/jobfunnel/__init__.py @@ -1 +1 @@ -__version__ = '1.1.3' +__version__ = '1.1.4' diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py index 7d3c70a2..643a01ef 100755 --- a/jobfunnel/__main__.py +++ b/jobfunnel/__main__.py @@ -25,9 +25,9 @@ def main(): # parse the master list path to update filter list jp.update_filterjson() - # get jobs by either scraping jobs or loading today's dumped pickle + # get jobs by either scraping jobs or loading dumped pickles if config['no_scrape']: - jp.load_pickle(config) + jp.load_pickles(config) else: for p in config['providers']: provider = providers[p](config) diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py index cb1af603..0dcb8765 100644 --- a/jobfunnel/config/parser.py +++ b/jobfunnel/config/parser.py @@ -52,7 +52,7 @@ def parse_cli(): dest='no_scrape', action='store_true', default=False, - help='skip web-scraping and load a previously saved pickle') + help='skip web-scraping and load previously saved pickles') return parser.parse_args() diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py index 3ffdc44f..cd5a79ea 100644 --- a/jobfunnel/jobfunnel.py +++ b/jobfunnel/jobfunnel.py @@ -5,6 +5,7 @@ import json import logging import os +import re import csv import random from datetime import date @@ -88,13 +89,20 @@ def scrape(self): """ to be implemented by child classes""" raise NotImplementedError() - def load_pickle(self, args): - # try to load today's pickle from set var first: - pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl') - try: - self.scrape_data = pickle.load(open(pickle_filepath, 'rb')) - except FileNotFoundError as e: - logging.error(f'{pickle_filepath} not found! Have you scraped any jobs today?') + def load_pickles(self, args): + # try to load any pickle from the data path + pickle_found = False + pickle_path = os.path.join(args['data_path']) + for root, dirs, files in os.walk(pickle_path): + for file in files: + if re.findall(r'jobs_.*', file): + if not pickle_found: pickle_found = True + pickle_file = file + pickle_filepath = os.path.join(pickle_path, pickle_file) + logging.info(f'loading pickle file: {pickle_filepath}') + self.scrape_data.update(pickle.load(open(pickle_filepath, 'rb'))) + if not pickle_found: + logging.error(f'no pickles found in {pickle_path}! 
Have you scraped any jobs?') raise e def dump_pickle(self): @@ -132,8 +140,7 @@ def remove_jobs_in_filterlist(self, data: Dict[str, dict]): if jobid in data: data.pop(jobid) n_filtered += 1 - logging.info(f'removed {n_filtered} jobs present in filter-list' - ' from master-list') + logging.info(f'removed {n_filtered} jobs present in filter-list') else: self.logger.warning( f'no jobs filtered, missing {self.filterlist_path}') @@ -200,7 +207,7 @@ def update_masterlist(self): self.remove_jobs_in_filterlist(masterlist) self.remove_blacklisted_companies(masterlist) - # update masterslist to contain only new (unqiue) listings + # update masterlist to contain only new (unique) listings tfidf_filter(self.scrape_data, masterlist) masterlist.update(self.scrape_data) diff --git a/readme.md b/readme.md index 972ee0f3..adb7b907 100644 --- a/readme.md +++ b/readme.md @@ -38,40 +38,44 @@ funnel --help 1. Set your job search preferences in the `yaml` configuration file (or use `-kw`). 1. Run `funnel` to scrape all-available job listings. -1. Review jobs in the master list, set any undesired jobs `status` to `archive`, these jobs will be removed from the `.csv` next time you run `funnel`. -1. If you get an `interview`/`offer` or are `rejected`, update the job `status`. +1. Review jobs in the master-list, update the job `status` to other values such as `interview` or `offer`. +1. Set any undesired job `status` to `archive`, these jobs will be removed from the `.csv` next time you run `funnel`. 1. Check out [demo/readme.md][demo] if you want to try the demo. __*Note*__: `rejected` jobs will be filtered out and will disappear from the output `.csv`. ### Usage Notes -* Note that any custom states (i.e `applied`) are preserved in the spreadsheet. -* To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -* You can keep multiple search results across multiple `.csv` files: -``` -funnel -kw Python -o python_search -funnel -kw AI Machine Learning -o ML_search -``` -* Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`). -* JobFunnel can be easily automated to run nightly with [crontab][cron] -* You can review the job list in the command line: -``` -column -s, -t < master_list.csv | less -#2 -N -S -``` -* You can run several independent job searches with a directory structure like the following: +* **Custom Status**
+  Note that any custom states (e.g. `applied`) are preserved in the spreadsheet. -```bash -python_search/ -  |_ settings.yaml -ML_search/ -  |_ settings.yaml +* **Running Filters**<br />
+ To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -for dir in */ ; do - funnel -s $dir/settings.yaml -done -``` -where each `settings.yaml` file can point to it's own directory. +* **Recovering Lost Spreadsheets**
+ If ever your spreadsheet gets deleted you still have the pickle files.
+ Simply run `funnel --no_scrape` to generate a new master-list. + +* **Managing Multiple Searches**
+ You can keep multiple search results across multiple `.csv` files: + ``` + funnel -kw Python -o python_search + funnel -kw AI Machine Learning -o ML_search + ``` + +* **Filtering Undesired Companies**
+ Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`). + +* **Automating Searches**
+  JobFunnel can be easily automated to run nightly with [crontab][cron]; a sample crontab entry is sketched at the end of this document.<br />
+ For more information see the [crontab document][cron_doc]. + +* **Reviewing Jobs in Terminal**
+ You can review the job list in the command line: + ``` + column -s, -t < master_list.csv | less -#2 -N -S + ``` + @@ -79,3 +83,4 @@ where each `settings.yaml` file can point to it's own directory. [python]:https://www.python.org/ [demo]:demo/readme.md [cron]:https://en.wikipedia.org/wiki/Cron +[cron_doc]:docs/crontab/readme.md From 7f04e114c853d7f5709fa7bddb1334b4c7c47696 Mon Sep 17 00:00:00 2001 From: studentbrad Date: Fri, 4 Oct 2019 17:29:06 -0400 Subject: [PATCH 2/2] added recovery option --- jobfunnel/__main__.py | 4 +++- jobfunnel/config/parser.py | 11 ++++++++++- jobfunnel/jobfunnel.py | 17 ++++++++++++++--- readme.md | 6 +++--- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/jobfunnel/__main__.py b/jobfunnel/__main__.py index 643a01ef..3c28a2da 100755 --- a/jobfunnel/__main__.py +++ b/jobfunnel/__main__.py @@ -26,8 +26,10 @@ def main(): jp.update_filterjson() # get jobs by either scraping jobs or loading dumped pickles - if config['no_scrape']: + if config['recover']: jp.load_pickles(config) + elif config['no_scrape']: + jp.load_pickle(config) else: for p in config['providers']: provider = providers[p](config) diff --git a/jobfunnel/config/parser.py b/jobfunnel/config/parser.py index 0dcb8765..9463f31c 100644 --- a/jobfunnel/config/parser.py +++ b/jobfunnel/config/parser.py @@ -52,7 +52,13 @@ def parse_cli(): dest='no_scrape', action='store_true', default=False, - help='skip web-scraping and load previously saved pickles') + help='skip web-scraping and load a previously saved daily scrape pickle') + + parser.add_argument('--recover', + dest='recover', + action='store_true', + default=False, + help='recover master-list by accessing all historic scrapes pickles') return parser.parse_args() @@ -120,6 +126,9 @@ def parse_config(): # parse the no_scrape option config['no_scrape'] = cli.no_scrape + # parse the recovery option + config['recover'] = cli.recover + # parse the log level config['log_level'] = log_levels[default_yaml['log_level']] if not given_yaml_path is None: diff --git a/jobfunnel/jobfunnel.py b/jobfunnel/jobfunnel.py index cd5a79ea..e2cd3a0f 100644 --- a/jobfunnel/jobfunnel.py +++ b/jobfunnel/jobfunnel.py @@ -86,11 +86,22 @@ def init_logging(self): self.logger.info(f'jobfunnel initialized at {self.date_string}') def scrape(self): - """ to be implemented by child classes""" + """function to be implemented by child classes""" raise NotImplementedError() + def load_pickle(self, args): + """function to load today's daily scrape pickle""" + ## only to be used in no_scrape mode + pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl') + try: + self.scrape_data = pickle.load(open(pickle_filepath, 'rb')) + except FileNotFoundError as e: + logging.error(f'{pickle_filepath} not found! 
Have you scraped any jobs today?') + raise e + def load_pickles(self, args): - # try to load any pickle from the data path + """function to load all historic daily scrape pickles""" + ## only to be used in recovery mode pickle_found = False pickle_path = os.path.join(args['data_path']) for root, dirs, files in os.walk(pickle_path): @@ -106,7 +117,7 @@ def load_pickles(self, args): raise e def dump_pickle(self): - """ dump a pickle of the daily scrape dict""" + """function to dump a pickle of the daily scrape dict""" pickle_name = f'jobs_{self.date_string}.pkl' pickle.dump(self.scrape_data, open(os.path.join(self.pickles_dir, pickle_name), 'wb')) diff --git a/readme.md b/readme.md index adb7b907..86ffc766 100644 --- a/readme.md +++ b/readme.md @@ -52,9 +52,9 @@ __*Note*__: `rejected` jobs will be filtered out and will disappear from the out * **Running Filters**
To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file. -* **Recovering Lost Spreadsheets**
- If ever your spreadsheet gets deleted you still have the pickle files.
- Simply run `funnel --no_scrape` to generate a new master-list. +* **Recovering Lost Master-list**
+  If your master-list ever gets deleted, you still have the historic pickle files.<br />
+  Simply run `funnel --recover` to generate a new master-list from them (a sketch of this recovery pass appears at the end of this document). * **Managing Multiple Searches**<br />
You can keep multiple search results across multiple `.csv` files:
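
__*Note*__: the recovery pass added in these patches boils down to loading every dated scrape pickle (`jobs_<date>.pkl`, as written by `dump_pickle`) from the data path and merging them into one dictionary of job listings, which `update_masterlist` then filters and writes back out. Below is a minimal standalone sketch of that merge; the `data` folder and the `glob` pattern are illustrative only, and the real logic lives in `JobFunnel.load_pickles` above.

```python
import glob
import os
import pickle


def merge_scrape_pickles(data_path):
    """Merge every dated scrape pickle (jobs_<date>.pkl) found in
    data_path into a single dictionary of job listings."""
    merged = {}
    for filepath in sorted(glob.glob(os.path.join(data_path, 'jobs_*.pkl'))):
        with open(filepath, 'rb') as file:
            # later pickles overwrite earlier entries with the same job id
            merged.update(pickle.load(file))
    if not merged:
        raise FileNotFoundError(f'no scrape pickles found in {data_path}')
    return merged


if __name__ == '__main__':
    # 'data' is a placeholder; JobFunnel reads the real path from its config
    jobs = merge_scrape_pickles('data')
    print(f'recovered {len(jobs)} job listings')
```

The patch itself walks the directory with `os.walk` and a regular expression rather than `glob`, but the effect is the same: `--recover` rebuilds `scrape_data` from every available scrape, while `--no_scrape` only reloads today's `jobs_<date>.pkl`.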
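
__*Note*__: the **Automating Searches** bullet above points at crontab without showing an entry. A hypothetical schedule might look like the following; the time, the working directory, and the assumption that `funnel` is on cron's `PATH` are placeholders to adapt:

```
# run the job search every morning at 08:00
0 8 * * * cd /home/me/job_search && funnel -s settings.yaml
```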