
Commit

Merge pull request #34 from PaulMcInnis/studentbrad/bad_status
recover option to gather all historic pickles
Bradley Aaron Kohler authored Oct 5, 2019
2 parents f2cc231 + 7f04e11 commit 404bba7
Showing 5 changed files with 70 additions and 36 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
__version__ = '1.1.3'
__version__ = '1.1.4'
6 changes: 4 additions & 2 deletions jobfunnel/__main__.py
@@ -25,8 +25,10 @@ def main():
    # parse the master list path to update filter list
    jp.update_filterjson()

    # get jobs by either scraping jobs or loading today's dumped pickle
    if config['no_scrape']:
    # get jobs by either scraping jobs or loading dumped pickles
    if config['recover']:
        jp.load_pickles(config)
    elif config['no_scrape']:
        jp.load_pickle(config)
    else:
        for p in config['providers']:
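For orientation, the branch taken in `main()` is driven by two booleans in the parsed config; a minimal illustration of the mapping (the config dicts here are hypothetical and trimmed to the two relevant keys):

```python
# Hypothetical, trimmed config dicts and the branch each one selects in main():
config = {'recover': True,  'no_scrape': False}   # -> jp.load_pickles(config)
config = {'recover': False, 'no_scrape': True}    # -> jp.load_pickle(config)
config = {'recover': False, 'no_scrape': False}   # -> scrape each provider in config['providers']
```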
11 changes: 10 additions & 1 deletion jobfunnel/config/parser.py
@@ -52,7 +52,13 @@ def parse_cli():
                        dest='no_scrape',
                        action='store_true',
                        default=False,
                        help='skip web-scraping and load a previously saved pickle')
                        help='skip web-scraping and load a previously saved daily scrape pickle')

    parser.add_argument('--recover',
                        dest='recover',
                        action='store_true',
                        default=False,
                        help='recover master-list by accessing all historic scrape pickles')

    return parser.parse_args()

@@ -120,6 +126,9 @@ def parse_config():
    # parse the no_scrape option
    config['no_scrape'] = cli.no_scrape

    # parse the recovery option
    config['recover'] = cli.recover

    # parse the log level
    config['log_level'] = log_levels[default_yaml['log_level']]
    if given_yaml_path is not None:
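A quick way to sanity-check the new flag is to drive argparse directly. This standalone sketch mirrors the `--recover` argument added above, using a throwaway parser rather than importing the repo's `parse_cli`:

```python
import argparse

# throwaway parser that mirrors the new --recover flag
parser = argparse.ArgumentParser()
parser.add_argument('--recover',
                    dest='recover',
                    action='store_true',
                    default=False,
                    help='recover master-list by accessing all historic scrape pickles')

assert parser.parse_args([]).recover is False            # default, as set above
assert parser.parse_args(['--recover']).recover is True  # becomes config['recover']
```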
30 changes: 24 additions & 6 deletions jobfunnel/jobfunnel.py
@@ -5,6 +5,7 @@
import json
import logging
import os
import re
import csv
import random
from datetime import date
@@ -85,20 +86,38 @@ def init_logging(self):
        self.logger.info(f'jobfunnel initialized at {self.date_string}')

    def scrape(self):
        """ to be implemented by child classes"""
        """function to be implemented by child classes"""
        raise NotImplementedError()

    def load_pickle(self, args):
        # try to load today's pickle from set var first:
        """function to load today's daily scrape pickle"""
        ## only to be used in no_scrape mode
        pickle_filepath = os.path.join(args['data_path'], f'jobs_{self.date_string}.pkl')
        try:
            self.scrape_data = pickle.load(open(pickle_filepath, 'rb'))
        except FileNotFoundError as e:
            logging.error(f'{pickle_filepath} not found! Have you scraped any jobs today?')
            raise e

    def load_pickles(self, args):
        """function to load all historic daily scrape pickles"""
        ## only to be used in recovery mode
        pickle_found = False
        pickle_path = args['data_path']
        for root, dirs, files in os.walk(pickle_path):
            for file in files:
                if re.findall(r'jobs_.*', file):
                    pickle_found = True
                    pickle_filepath = os.path.join(root, file)
                    logging.info(f'loading pickle file: {pickle_filepath}')
                    self.scrape_data.update(pickle.load(open(pickle_filepath, 'rb')))
        if not pickle_found:
            logging.error(f'no pickles found in {pickle_path}! Have you scraped any jobs?')
            raise FileNotFoundError(f'no pickles found in {pickle_path}')

    def dump_pickle(self):
        """ dump a pickle of the daily scrape dict"""
        """function to dump a pickle of the daily scrape dict"""
        pickle_name = f'jobs_{self.date_string}.pkl'
        pickle.dump(self.scrape_data,
                    open(os.path.join(self.pickles_dir, pickle_name), 'wb'))
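Taken together, the three pickle methods form a simple round-trip: `dump_pickle` writes each day's scrape to `jobs_<date>.pkl`, `load_pickle` reads back today's file, and the new `load_pickles` merges every historic file it can find. A self-contained sketch of that idea (directory name and job data are made up):

```python
import os
import pickle

data_path = 'demo_data'  # stand-in for args['data_path']
os.makedirs(data_path, exist_ok=True)

# pretend two daily scrapes were dumped on consecutive days
pickle.dump({'job1': {'title': 'Developer'}},
            open(os.path.join(data_path, 'jobs_2019-10-04.pkl'), 'wb'))
pickle.dump({'job2': {'title': 'Analyst'}},
            open(os.path.join(data_path, 'jobs_2019-10-05.pkl'), 'wb'))

# recovery: merge every historic pickle into one scrape_data dict keyed by job id
scrape_data = {}
for file in sorted(os.listdir(data_path)):
    if file.startswith('jobs_') and file.endswith('.pkl'):
        scrape_data.update(pickle.load(open(os.path.join(data_path, file), 'rb')))

print(scrape_data)  # contains both job1 and job2
```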
@@ -132,8 +151,7 @@ def remove_jobs_in_filterlist(self, data: Dict[str, dict]):
                if jobid in data:
                    data.pop(jobid)
                    n_filtered += 1
            logging.info(f'removed {n_filtered} jobs present in filter-list'
                         ' from master-list')
            logging.info(f'removed {n_filtered} jobs present in filter-list')
        else:
            self.logger.warning(
                f'no jobs filtered, missing {self.filterlist_path}')
@@ -200,7 +218,7 @@ def update_masterlist(self):
        self.remove_jobs_in_filterlist(masterlist)
        self.remove_blacklisted_companies(masterlist)

        # update masterslist to contain only new (unqiue) listings
        # update masterlist to contain only new (unique) listings
        tfidf_filter(self.scrape_data, masterlist)
        masterlist.update(self.scrape_data)

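For context on the hunk above, `update_masterlist` prunes the master-list and then folds in the new scrape. A simplified, hypothetical stand-in for that flow (plain dict-key de-duplication here instead of the repo's `tfidf_filter`, and the field names are assumptions):

```python
# Hypothetical, simplified version of the update_masterlist flow.
def merge_scrape_into_masterlist(scrape_data, masterlist, filterlist, blacklist):
    # drop filtered jobs (e.g. archived/rejected) from the master-list
    for jobid in list(masterlist):
        if jobid in filterlist:
            masterlist.pop(jobid)
    # drop jobs posted by blacklisted companies
    for jobid in list(masterlist):
        if masterlist[jobid].get('company') in blacklist:
            masterlist.pop(jobid)
    # keep only listings not already present (stand-in for tfidf_filter),
    # then merge the remainder into the master-list
    new_jobs = {k: v for k, v in scrape_data.items() if k not in masterlist}
    masterlist.update(new_jobs)
    return masterlist
```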
57 changes: 31 additions & 26 deletions readme.md
@@ -38,44 +38,49 @@ funnel --help

1. Set your job search preferences in the `yaml` configuration file (or use `-kw`).
1. Run `funnel` to scrape all-available job listings.
1. Review jobs in the master list, set any undesired job's `status` to `archive`; these jobs will be removed from the `.csv` next time you run `funnel`.
1. If you get an `interview`/`offer` or are `rejected`, update the job `status`.
1. Review jobs in the master-list and update the job `status` to other values such as `interview` or `offer`.
1. Set any undesired job's `status` to `archive`; these jobs will be removed from the `.csv` next time you run `funnel`.
1. Check out [demo/readme.md][demo] if you want to try the demo.

__*Note*__: `rejected` jobs will be filtered out and will disappear from the output `.csv`.
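
As an illustration, a typical sequence using the flags touched by this change might be (the `--no_scrape` spelling is inferred from the option's `dest`; check `funnel --help` for your installed version):
```
funnel                  # scrape and build/update the master-list
funnel --no_scrape      # reload today's pickle instead of scraping
funnel --recover        # rebuild a lost master-list from all historic pickles
```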

### Usage Notes

* Note that any custom states (e.g. `applied`) are preserved in the spreadsheet.
* To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file.
* You can keep multiple search results across multiple `.csv` files:
```
funnel -kw Python -o python_search
funnel -kw AI Machine Learning -o ML_search
```
* Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).
* JobFunnel can be easily automated to run nightly with [crontab][cron].
* You can review the job list in the command line:
```
column -s, -t < master_list.csv | less -#2 -N -S
```
* You can run several independent job searches with a directory structure like the following:
* **Custom Status** <br/>
Note that any custom states (e.g. `applied`) are preserved in the spreadsheet.

```bash
python_search/
|_ settings.yaml
ML_search/
|_ settings.yaml
* **Running Filters** <br />
To update active filters and to see any `new` jobs going forwards, just run `funnel` again, and review the `.csv` file.

for dir in */ ; do
funnel -s $dir/settings.yaml
done
```
where each `settings.yaml` file can point to its own directory.
* **Recovering Lost Master-list** <br />
If your master-list ever gets deleted, you still have the historic pickle files. <br />
Simply run `funnel --recover` to generate a new master-list.

* **Managing Multiple Searches** <br />
You can keep multiple search results across multiple `.csv` files:
```
funnel -kw Python -o python_search
funnel -kw AI Machine Learning -o ML_search
```

* **Filtering Undesired Companies** <br />
Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).

* **Automating Searches** <br />
JobFunnel can be easily automated to run nightly with [crontab][cron]. <br />
For more information, see the [crontab document][cron_doc].
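An illustrative crontab entry (the schedule and paths are placeholders, and it assumes `funnel` is on the cron user's `PATH`):
```
# run JobFunnel every night at 11 PM using the settings in ~/job_search
0 23 * * * cd ~/job_search && funnel -s settings.yaml
```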

* **Reviewing Jobs in Terminal** <br />
You can review the job list in the command line:
```
column -s, -t < master_list.csv | less -#2 -N -S
```


<!-- links -->

[masterlist]:demo/assests/demo.png "masterlist.csv"
[python]:https://www.python.org/
[demo]:demo/readme.md
[cron]:https://en.wikipedia.org/wiki/Cron
[cron_doc]:docs/crontab/readme.md
