Merge pull request #64 from thebigG/date_filter
Date filter
thebigG authored Mar 2, 2020
2 parents e87b8fe + 8cbe28f commit cc0b880
Showing 10 changed files with 66 additions and 14 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
__version__ = '2.1.3'
__version__ = '2.1.4'
1 change: 1 addition & 0 deletions jobfunnel/__main__.py
@@ -25,6 +25,7 @@ def main():
print(e.strerror)
sys.exit()


# init class + logging
jf = JobFunnel(config)
jf.init_logging()
13 changes: 12 additions & 1 deletion jobfunnel/config/parser.py
@@ -64,7 +64,7 @@ def parse_cli():
dest='domain',
type=str,
required=False,
help='domain value for a region ')
help='domain value for a region ')

parser.add_argument('-r',
dest='random',
@@ -143,6 +143,13 @@ def parse_cli():
required=False,
default=None,
help='save duplicates popped by tf_idf filter to file')
parser.add_argument('--max_listing_days',
dest='max_listing_days',
type=int,
default=None,
required=False,
help='The maximum number of days old a job can be '
'(i.e. pass 30 to filter out jobs older than a month)')

return parser.parse_args()

@@ -178,6 +185,8 @@ def cli_to_yaml(cli):

if cli.proxy is not None:
    yaml['proxy'] = split_url(cli.proxy)
if cli.max_listing_days is not None:
    yaml['max_listing_days'] = cli.max_listing_days

return yaml

@@ -290,5 +299,7 @@ def parse_config():
# check if proxy has not been set yet (optional)
if 'proxy' not in config:
    config['proxy'] = None
if 'max_listing_days' not in config:
    config['max_listing_days'] = None

return config
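Taken together, these parser.py changes thread the new option from the command line into the config dict. A minimal standalone sketch of that flow (the parser below only mirrors the relevant fragment of `parse_cli()`; it is not the actual JobFunnel entry point):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max_listing_days', dest='max_listing_days',
                    type=int, default=None, required=False)

cli = parser.parse_args(['--max_listing_days', '30'])

# mirrors cli_to_yaml(): the key is only written when the flag was passed,
# so parse_config() can later default it to None when absent
yaml = {}
if cli.max_listing_days is not None:
    yaml['max_listing_days'] = cli.max_listing_days

print(yaml)  # {'max_listing_days': 30}
```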
6 changes: 4 additions & 2 deletions jobfunnel/config/valid_options.py
@@ -25,13 +25,15 @@
'converge': [bool]
},
'proxy': [
None,
None,
{
'protocol': str,
'ip_address': str,
'port': str
}
]
],
'max_listing_days': [int]

}

PROVIDERS = ['glassdoor', 'indeed', 'monster']
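For context, valid_options.py maps each setting to a list of allowed values or types ('proxy' above allows None or a dict schema). A hedged sketch of how a value could be checked against such a list — the real validate_config logic may differ:

```python
def option_is_valid(value, allowed):
    """Check a value against a valid_options-style list of types/values."""
    for entry in allowed:
        if entry is None and value is None:
            return True
        if isinstance(entry, type) and isinstance(value, entry):
            return True
    return False

assert option_is_valid(30, [int])        # a day count passes
assert not option_is_valid('30', [int])  # a string does not
assert option_is_valid(None, [None, int])
```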
4 changes: 4 additions & 0 deletions jobfunnel/config/validate.py
@@ -68,3 +68,7 @@ def validate_config(config):

# check validity of delay settings
validate_delay(config['delay_config'])

# check the validity of the max_listing_days setting
if config['max_listing_days'] is not None and config['max_listing_days'] < 0:
    raise ConfigError('max_listing_days')
7 changes: 4 additions & 3 deletions jobfunnel/glassdoor.py
@@ -282,14 +282,15 @@ def scrape(self):

# key by id
self.scrape_data[str(job['id'])] = job
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# Do not change the order of the next three statements: date_filter (called inside pre_filter) needs post dates standardized first

# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]

# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# checks if delay is set or not, then extracts blurbs from job links
if self.delay_config is not None:
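The reordering above exists because date_filter, called inside pre_filter, parses `job['date']` with `'%Y-%m-%d'`. A hedged sketch (using a hypothetical raw value) of what would happen if pre_filter ran before post_date_from_relative_post_age standardized the provider's relative dates:

```python
from datetime import datetime

# hypothetical raw scrape value, as a provider might return it
job = {'date': '3 days ago'}

try:
    datetime.strptime(job['date'], '%Y-%m-%d')
except ValueError as err:
    # date_filter would raise exactly this if run before the conversion
    print(err)  # time data '3 days ago' does not match format '%Y-%m-%d'
```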
7 changes: 6 additions & 1 deletion jobfunnel/jobfunnel.py
@@ -18,7 +18,7 @@
from requests import Session

from .tools.delay import delay_alg
from .tools.filters import tfidf_filter, id_filter
from .tools.filters import tfidf_filter, id_filter, date_filter
from .tools.tools import proxy_dict_to_url

# setting job status to these words removes them from masterlist + adds to
@@ -39,6 +39,8 @@ class JobFunnel(object):
filters """

def __init__(self, args):
    # the maximum number of days old a job can be
    self.max_listing_days = args['max_listing_days']
    # paths
    self.master_list_path = args['master_list_path']
    self.filterlist_path = args['filter_list_path']
@@ -230,6 +232,9 @@ def update_filterjson(self):
def pre_filter(self, data: Dict[str, dict], provider):
    """function called by child classes that applies multiple filters
    before getting job blurbs"""
    # call date_filter if it is turned on
    if self.max_listing_days is not None:
        date_filter(data, self.max_listing_days)
    # call id_filter for master and duplicate lists, if they exist
    if os.path.isfile(self.master_list_path):
        id_filter(data, self.read_csv(self.master_list_path),
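Condensed, pre_filter now applies the date filter before the id filter. A sketch with the id_filter call replaced by a rough stand-in (the real method calls id_filter with read_csv(master_list_path) and also checks the duplicate list):

```python
def pre_filter_sketch(data, max_listing_days, master_rows=None):
    # 1. date filter runs first, on the freshly standardized scrape dates
    if max_listing_days is not None:
        date_filter(data, max_listing_days)
    # 2. then ids already in the master list are popped (approximation of
    #    id_filter; the real function also distinguishes providers)
    if master_rows is not None:
        known = {job['id'] for job in master_rows}
        for job_id in [j for j in data if j in known]:
            del data[job_id]
```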
8 changes: 4 additions & 4 deletions jobfunnel/monster.py
@@ -211,14 +211,14 @@ def scrape(self):
# key by id
self.scrape_data[str(job['id'])] = job

# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# Do not change the order of the next three statements: date_filter (called inside pre_filter) needs post dates standardized first

# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]

# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

threads = ThreadPoolExecutor(max_workers=8)
# checks if delay is set or not, then extracts blurbs from job links
25 changes: 24 additions & 1 deletion jobfunnel/tools/filters.py
@@ -1,12 +1,35 @@
import nltk
import logging

from datetime import datetime, date, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Optional
from numpy import delete as np_delete, max as np_max, fill_diagonal


def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
    """Filter out jobs that are older than number_of_days.
    The assumed date format is yyyy-mm-dd.
    Args:
        cur_dict: today's job scrape dict
        number_of_days: how many days old a job can be
    """
    if number_of_days < 0 or cur_dict is None:
        return
    logging.info('date_filter running')
    cur_job_ids = [job['id'] for job in cur_dict.values()]
    # calculate the oldest date a job can be
    threshold_date = datetime.now() - timedelta(days=number_of_days)
    for job_id in cur_job_ids:
        # get the date from the job with this id
        job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
        # if this job is older than threshold_date, delete it from the current scrape
        if job_date < threshold_date:
            logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter "
                         f"because it is older than {number_of_days} days")
            del cur_dict[job_id]


def id_filter(cur_dict: Dict[str, dict], prev_dict: Dict[str, dict], provider):
""" Filter duplicates on job id per provider.
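A hedged usage sketch of the new filter on a toy scrape dict; the 'id', 'date', and 'link' fields mirror what the scrapers hold once post_date_from_relative_post_age has standardized dates to yyyy-mm-dd:

```python
from datetime import datetime, timedelta

fresh = (datetime.now() - timedelta(days=5)).strftime('%Y-%m-%d')
stale = (datetime.now() - timedelta(days=45)).strftime('%Y-%m-%d')

scrape = {
    'a1': {'id': 'a1', 'date': fresh, 'link': 'https://example.com/a1'},
    'b2': {'id': 'b2', 'date': stale, 'link': 'https://example.com/b2'},
}

date_filter(scrape, 30)
assert list(scrape) == ['a1']  # the 45-day-old posting was dropped in place
```

Note that the filter mutates the dict in place, which is why pre_filter can pass self.scrape_data directly.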
7 changes: 6 additions & 1 deletion readme.md
@@ -64,7 +64,12 @@ __*Note*__: `rejected` jobs will be filtered out and will disappear from the out
```

* **Filtering Undesired Companies** <br />
Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).

* **Filtering Old Jobs** <br />
Filter jobs that you think are too old:
`funnel -s JobFunnel/demo/settings.yaml --max_listing_days 30` will filter out job listings that are older than 30 days.


* **Automating Searches** <br />
JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
