-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapp.py
147 lines (119 loc) · 6.32 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import importlib
import argparse
from flask import Flask, request, jsonify, render_template, make_response, send_from_directory
from flask_limiter import Limiter
from flask_limiter.util import get_remote_address
from werkzeug.exceptions import RequestEntityTooLarge
from config import Config
from src.ranking import rank_job_descriptions
from src.utils import validate_and_clean_input, dump_ranked_jobs, process_job_dataframe
import logging
import asyncio
import gc
# Flask app setup: the template and static folders are taken from Config, and
# the Config object is then loaded as the Flask configuration.
app = Flask(__name__, template_folder=Config.TEMPLATE_FOLDER, static_folder=Config.STATIC_FOLDER)
app.config.from_object(Config)
# Set up rate limiter: get_remote_address is handed to the limiter together with
# the app, and a default limit of 50 requests per minute is configured.
limiter = Limiter(
    get_remote_address,
    app=app,
    default_limits=["50 per minute"]
)
# Logging setup: INFO level, records formatted as "time - level - message".
# `logger` is the module-level logger used by the functions in this file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def load_fetcher_modules(fetcher_dir):
    """Discover fetcher modules and collect their ``fetch_jobs`` callables.

    Every ``*.py`` file in ``fetcher_dir`` (except ``__init__.py``) is imported
    as ``src.fetchers.<file stem>``. Note that the import path is fixed to the
    ``src.fetchers`` package; ``fetcher_dir`` is only used to list file names.

    Args:
        fetcher_dir: Directory whose file names are scanned for fetchers.

    Returns:
        dict mapping each module name (file name without ``.py``) to that
        module's ``fetch_jobs`` attribute. Modules lacking ``fetch_jobs`` are
        left out and a warning is logged for each of them.
    """
    discovered = {}
    for entry in os.listdir(fetcher_dir):
        # Only plain Python sources are candidates; the package marker is not a fetcher.
        if not entry.endswith(".py") or entry == "__init__.py":
            continue

        stem = entry[:-3]  # file name without the ".py" suffix
        imported = importlib.import_module(f"src.fetchers.{stem}")

        # A usable fetcher must expose `fetch_jobs`; anything else is reported and skipped.
        if not hasattr(imported, 'fetch_jobs'):
            logger.warning(f"Module {stem} does not have a 'fetch_jobs' function")
            continue
        discovered[stem] = imported.fetch_jobs
    return discovered
# Resolve the configured fetcher name to its job fetching callable
def load_job_fetching_function(fetcher_name, fetcher_modules):
    """Return the job fetching callable registered under ``fetcher_name``.

    Args:
        fetcher_name: Key identifying the fetcher to use.
        fetcher_modules: Mapping of fetcher names to fetching callables.

    Returns:
        The value stored in ``fetcher_modules`` under ``fetcher_name``.

    Raises:
        ValueError: If ``fetcher_name`` is not a key of ``fetcher_modules``.
            The message lists the names that are available.
    """
    # Guard clause: fail loudly on an unknown name, listing the valid choices.
    if fetcher_name not in fetcher_modules:
        known_names = ', '.join(fetcher_modules.keys())
        raise ValueError(f"Unknown fetcher: {fetcher_name}. Available fetchers: {known_names}. Check the .env setting and your fetcher implementation in src/fetchers/")

    logger.info(f"Using {fetcher_name}-based job fetching.")
    return fetcher_modules[fetcher_name]
# Load fetcher modules and initialize job fetching function.
# Both calls run at import time. The relative path "src/fetchers" is listed
# relative to the process working directory, and a Config.FETCHER_NAME that
# matches no loaded fetcher raises ValueError here, before the app serves requests.
fetcher_modules = load_fetcher_modules("src/fetchers")
job_fetching_function = load_job_fetching_function(Config.FETCHER_NAME, fetcher_modules)
@app.after_request
def add_security_headers(response):
    """Set the Content-Security-Policy header on the outgoing response.

    Args:
        response: The response object about to be sent.

    Returns:
        The same response object, with its ``Content-Security-Policy``
        header overwritten by the policy assembled below.
    """
    # One fragment per directive; they are concatenated verbatim, so each
    # fragment carries its own trailing separator.
    csp_fragments = (
        # Fallback: same-origin resources only
        "default-src 'self'; ",
        # Scripts: same origin, jsdelivr, code.jquery.com, plus inline scripts
        "script-src 'self' https://cdn.jsdelivr.net https://code.jquery.com 'unsafe-inline'; ",
        # Styles: same origin, inline styles, jsdelivr
        "style-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net; ",
        # Images: same origin, data URIs, icons.duckduckgo.com
        # NOTE(review): `data:` is listed twice and this fragment has no trailing
        # space; both look unintentional but are kept so the header is unchanged.
        "img-src 'self' data: data: https://icons.duckduckgo.com;",
        # Fonts: same origin and jsdelivr
        "font-src 'self' https://cdn.jsdelivr.net; ",
        # Fetch/XHR targets: same origin and ipapi.co
        "connect-src 'self' https://ipapi.co; ",
        # No <object>/<embed> content
        "object-src 'none'; ",
        # Disallow framing of the site by any ancestor
        "frame-ancestors 'none'; ",
    )
    response.headers['Content-Security-Policy'] = "".join(csp_fragments)
    return response
@app.route('/', methods=['GET'])
def index():
    """Serve the home page.

    Returns:
        A response wrapping the rendered ``home.html`` template, with the
        ``ngrok-skip-browser-warning`` header set.
    """
    home_response = make_response(render_template('home.html'))
    # Presumably meant to bypass ngrok's browser warning page when tunnelling;
    # TODO confirm it has the intended effect as a response header.
    home_response.headers['ngrok-skip-browser-warning'] = 'any_value'
    return home_response
@app.route('/search-jobs', methods=['POST'])
@limiter.limit("10 per minute")
def search_jobs():
    """Handle a job search request using the fetcher selected at startup.

    Steps:
        1. Reject the submission if the hidden honeypot field is filled in.
        2. Validate and clean the submitted form and files.
        3. Fetch jobs with ``job_fetching_function`` (the dynamically selected
           fetcher, e.g. scraper or JSearch).
        4. Post-process and rank the jobs against the CV text and keywords.
        5. Dump the ranked jobs and return the top ``Config.RESULTS_WANTED``.

    Returns:
        - ``({'error': ...}, 400)`` when the honeypot field is filled.
        - Whatever ``validate_and_clean_input`` returned, when that is a tuple
          whose second item is 400.
        - ``({'jobs': [], 'message': ...}, 200)`` when no jobs were found.
        - ``{'jobs': [...]}`` with the ranked job records on success.
        - ``({'error': ...}, 500)`` when fetching or ranking raises.
    """
    # Check the honeypot before doing any other work, so bot submissions are
    # rejected without validating (and possibly parsing) their uploads.
    if request.form.get('jamesbond'):  # Bots will fill this, humans won't
        logger.warning("Bot detected!")
        return jsonify({'error': 'Something fishy is going on!'}), 400

    form_input = validate_and_clean_input(request.form, request.files)
    # A (body, 400) tuple from validation is passed straight back to the client.
    # NOTE(review): other tuple shapes fall through to the dict access below;
    # confirm validate_and_clean_input only ever returns a dict or (body, 400).
    if isinstance(form_input, tuple) and form_input[1] == 400:
        return form_input

    search_terms = form_input['search_terms']
    country = form_input['country']
    location = form_input['region']
    interval = form_input['interval']
    logger.info(
        "Incoming request. Terms: %s - Country: %s - location: %s - Posted since: %s",
        search_terms, country['name'], location, interval,
    )

    try:
        # Use the dynamically selected job fetching function
        all_jobs_df = asyncio.run(job_fetching_function(search_terms, country, location, interval))

        # An empty DataFrame means no jobs were found: answer 200 with an
        # empty list and a friendly message rather than an error.
        if all_jobs_df.empty:
            return jsonify({
                'jobs': [],
                'message': 'No jobs found 😔. Please try again with different terms or locations.'
            }), 200

        all_jobs_df = process_job_dataframe(all_jobs_df)
        ranked_jobs_df = rank_job_descriptions(
            all_jobs_df,
            form_input['cv_text'],
            form_input['preferred_keywords'],
            form_input['required_keywords'],
            form_input['exclude_keywords'],
        )
        dump_ranked_jobs(ranked_jobs_df, Config.DUMP_FILE_NAME)

        # Keep only the columns sent to the client and cap the result count.
        ranked_jobs = ranked_jobs_df[['display_title', 'display_company', 'date_posted', 'combined_score', 'tier', 'apply_options']].head(Config.RESULTS_WANTED).to_dict(orient='records')

        del all_jobs_df, ranked_jobs_df  # Free DataFrames explicitly after use
        gc.collect()  # Force garbage collection
        return jsonify({'jobs': ranked_jobs})
    except Exception as e:
        # Top-level boundary for the request: log the full traceback and
        # return a generic message so internals are not exposed to the client.
        logger.exception("Error fetching jobs: %s", e)
        return jsonify({'error': 'An unexpected error occurred while fetching jobs 🫤. Please try again later.'}), 500
# Error handler for uploads that exceed the request size limit
@app.errorhandler(RequestEntityTooLarge)
def handle_file_size_error(e):
    """Turn a ``RequestEntityTooLarge`` error into a JSON 413 response.

    Args:
        e: The raised exception (unused).

    Returns:
        A ``(json_response, 413)`` tuple with an ``error`` message.
    """
    too_large = jsonify(error='File size exceeds the limit.')
    return too_large, 413
@app.route('/healthz', methods=['GET'])
def health_check():
    """Report that the app is up.

    Returns:
        A ``(json_response, 200)`` tuple whose body is ``{"status": "healthy"}``.
    """
    status_body = jsonify(status="healthy")
    return status_body, 200
@app.route('/robots.txt')
def robots_txt():
    """Serve ``robots.txt`` from the application's static folder."""
    static_root = app.static_folder
    return send_from_directory(static_root, 'robots.txt')
# Start the app on port 5000 when this file is executed directly.
# NOTE(review): debug=True is hard-coded here; debug mode is meant for local
# development only, so confirm this entry point is never used in production.
if __name__ == '__main__':
    app.run(port=5000, debug=True)