Merge pull request #64 from thebigG/date_filter
Date filter
thebigG authored Mar 2, 2020
2 parents e87b8fe + 8cbe28f commit cc0b880
Showing 10 changed files with 66 additions and 14 deletions.
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
@@ -1 +1 @@
__version__ = '2.1.3'
__version__ = '2.1.4'
1 change: 1 addition & 0 deletions jobfunnel/__main__.py
@@ -25,6 +25,7 @@ def main():
print(e.strerror)
sys.exit()


# init class + logging
jf = JobFunnel(config)
jf.init_logging()
13 changes: 12 additions & 1 deletion jobfunnel/config/parser.py
@@ -64,7 +64,7 @@ def parse_cli():
dest='domain',
type=str,
required=False,
help='domain value for a region ')
help='domain value for a region ')

parser.add_argument('-r',
dest='random',
@@ -143,6 +143,13 @@ def parse_cli():
required=False,
default=None,
help='save duplicates popped by tf_idf filter to file')
parser.add_argument('--max_listing_days',
dest='max_listing_days',
type=int,
default=None,
required=False,
help='The maximum number of days old a job can be '
'(i.e. pass 30 to filter out jobs older than a month)')

return parser.parse_args()

@@ -178,6 +185,8 @@ def cli_to_yaml(cli):

if cli.proxy is not None:
    yaml['proxy'] = split_url(cli.proxy)
if cli.max_listing_days is not None:
    yaml['max_listing_days'] = cli.max_listing_days

return yaml

@@ -290,5 +299,7 @@ def parse_config():
# check if proxy has not been set yet (optional)
if 'proxy' not in config:
    config['proxy'] = None
if 'max_listing_days' not in config:
    config['max_listing_days'] = None

return config
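Taken together, these parser.py changes thread the new option from the command line into the config dict. A minimal standalone sketch of that flow (the parser below only mirrors the relevant fragment of `parse_cli()`; it is not the actual JobFunnel entry point):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max_listing_days', dest='max_listing_days',
                    type=int, default=None, required=False)

cli = parser.parse_args(['--max_listing_days', '30'])

# mirrors cli_to_yaml(): the key is only written when the flag was passed,
# so parse_config() can later default it to None when absent
yaml = {}
if cli.max_listing_days is not None:
    yaml['max_listing_days'] = cli.max_listing_days

print(yaml)  # {'max_listing_days': 30}
```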
6 changes: 4 additions & 2 deletions jobfunnel/config/valid_options.py
@@ -25,13 +25,15 @@
'converge': [bool]
},
'proxy': [
None,
None,
{
'protocol': str,
'ip_address': str,
'port': str
}
]
],
'max_listing_days': [int]

}

PROVIDERS = ['glassdoor', 'indeed', 'monster']
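For context, valid_options.py maps each setting to a list of allowed values or types ('proxy' above allows None or a dict schema). A hedged sketch of how a value could be checked against such a list — the real validate_config logic may differ:

```python
def option_is_valid(value, allowed):
    """Check a value against a valid_options-style list of types/values."""
    for entry in allowed:
        if entry is None and value is None:
            return True
        if isinstance(entry, type) and isinstance(value, entry):
            return True
    return False

assert option_is_valid(30, [int])        # a day count passes
assert not option_is_valid('30', [int])  # a string does not
assert option_is_valid(None, [None, int])
```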
4 changes: 4 additions & 0 deletions jobfunnel/config/validate.py
@@ -68,3 +68,7 @@ def validate_config(config):

# check validity of delay settings
validate_delay(config['delay_config'])

# check the validity of the max_listing_days setting
if config['max_listing_days'] is not None and config['max_listing_days'] < 0:
    raise ConfigError('max_listing_days')
7 changes: 4 additions & 3 deletions jobfunnel/glassdoor.py
@@ -282,14 +282,15 @@ def scrape(self):

# key by id
self.scrape_data[str(job['id'])] = job
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# Do not change the order of the next three statements: date_filter (called inside pre_filter) needs post dates standardized first

# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]

# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# checks if delay is set or not, then extracts blurbs from job links
if self.delay_config is not None:
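The reordering above exists because date_filter, called inside pre_filter, parses `job['date']` with `'%Y-%m-%d'`. A hedged sketch (using a hypothetical raw value) of what would happen if pre_filter ran before post_date_from_relative_post_age standardized the provider's relative dates:

```python
from datetime import datetime

# hypothetical raw scrape value, as a provider might return it
job = {'date': '3 days ago'}

try:
    datetime.strptime(job['date'], '%Y-%m-%d')
except ValueError as err:
    # date_filter would raise exactly this if run before the conversion
    print(err)  # time data '3 days ago' does not match format '%Y-%m-%d'
```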
7 changes: 6 additions & 1 deletion jobfunnel/jobfunnel.py
@@ -18,7 +18,7 @@
from requests import Session

from .tools.delay import delay_alg
from .tools.filters import tfidf_filter, id_filter
from .tools.filters import tfidf_filter, id_filter, date_filter
from .tools.tools import proxy_dict_to_url

# setting job status to these words removes them from masterlist + adds to
@@ -39,6 +39,8 @@ class JobFunnel(object):
filters """

def __init__(self, args):
    # the maximum number of days old a job can be
    self.max_listing_days = args['max_listing_days']
    # paths
    self.master_list_path = args['master_list_path']
    self.filterlist_path = args['filter_list_path']
@@ -230,6 +232,9 @@ def update_filterjson(self):
def pre_filter(self, data: Dict[str, dict], provider):
    """function called by child classes that applies multiple filters
    before getting job blurbs"""
    # call date_filter if it is turned on
    if self.max_listing_days is not None:
        date_filter(data, self.max_listing_days)
    # call id_filter for master and duplicate lists, if they exist
    if os.path.isfile(self.master_list_path):
        id_filter(data, self.read_csv(self.master_list_path),
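Condensed, pre_filter now applies the date filter before the id filter. A sketch with the id_filter call replaced by a rough stand-in (the real method calls id_filter with read_csv(master_list_path) and also checks the duplicate list):

```python
def pre_filter_sketch(data, max_listing_days, master_rows=None):
    # 1. date filter runs first, on the freshly standardized scrape dates
    if max_listing_days is not None:
        date_filter(data, max_listing_days)
    # 2. then ids already in the master list are popped (approximation of
    #    id_filter; the real function also distinguishes providers)
    if master_rows is not None:
        known = {job['id'] for job in master_rows}
        for job_id in [j for j in data if j in known]:
            del data[job_id]
```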
8 changes: 4 additions & 4 deletions jobfunnel/monster.py
@@ -211,14 +211,14 @@ def scrape(self):
# key by id
self.scrape_data[str(job['id'])] = job

# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

# Do not change the order of the next three statements: date_filter (called inside pre_filter) needs post dates standardized first

# stores references to jobs in list to be used in blurb retrieval
scrape_list = [i for i in self.scrape_data.values()]

# converts job date formats into a standard date format
post_date_from_relative_post_age(scrape_list)
# apply job pre-filter before scraping blurbs
super().pre_filter(self.scrape_data, self.provider)

threads = ThreadPoolExecutor(max_workers=8)
# checks if delay is set or not, then extracts blurbs from job links
25 changes: 24 additions & 1 deletion jobfunnel/tools/filters.py
@@ -1,12 +1,35 @@
import nltk
import logging

from datetime import datetime, date, timedelta
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Optional
from numpy import delete as np_delete, max as np_max, fill_diagonal


def date_filter(cur_dict: Dict[str, dict], number_of_days: int):
    """Filter out jobs that are older than number_of_days.
    The assumed date format is yyyy-mm-dd.
    Args:
        cur_dict: today's job scrape dict
        number_of_days: how many days old a job can be
    """
    if number_of_days < 0 or cur_dict is None:
        return
    logging.info('date_filter running')
    cur_job_ids = [job['id'] for job in cur_dict.values()]
    # calculate the oldest date a job can be
    threshold_date = datetime.now() - timedelta(days=number_of_days)
    for job_id in cur_job_ids:
        # get the date from the job with this id
        job_date = datetime.strptime(cur_dict[job_id]['date'], '%Y-%m-%d')
        # if this job is older than threshold_date, delete it from the current scrape
        if job_date < threshold_date:
            logging.info(f"{cur_dict[job_id]['link']} has been filtered out by date_filter "
                         f"because it is older than {number_of_days} days")
            del cur_dict[job_id]


def id_filter(cur_dict: Dict[str, dict], prev_dict: Dict[str, dict], provider):
""" Filter duplicates on job id per provider.
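A hedged usage sketch of the new filter on a toy scrape dict; the 'id', 'date', and 'link' fields mirror what the scrapers hold once post_date_from_relative_post_age has standardized dates to yyyy-mm-dd:

```python
from datetime import datetime, timedelta

fresh = (datetime.now() - timedelta(days=5)).strftime('%Y-%m-%d')
stale = (datetime.now() - timedelta(days=45)).strftime('%Y-%m-%d')

scrape = {
    'a1': {'id': 'a1', 'date': fresh, 'link': 'https://example.com/a1'},
    'b2': {'id': 'b2', 'date': stale, 'link': 'https://example.com/b2'},
}

date_filter(scrape, 30)
assert list(scrape) == ['a1']  # the 45-day-old posting was dropped in place
```

Note that the filter mutates the dict in place, which is why pre_filter can pass self.scrape_data directly.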
7 changes: 6 additions & 1 deletion readme.md
@@ -64,7 +64,12 @@ __*Note*__: `rejected` jobs will be filtered out and will disappear from the out
```

* **Filtering Undesired Companies** <br />
Filter undesired companies by providing your own `yaml` configuration and adding them to the black list (see `JobFunnel/jobfunnel/config/settings.yaml`).

* **Filtering Old Jobs** <br />
Filter jobs that you think are too old:
`funnel -s JobFunnel/demo/settings.yaml --max_listing_days 30` will filter out job listings that are older than 30 days.


* **Automating Searches** <br />
JobFunnel can be easily automated to run nightly with [crontab][cron] <br />
