-
Notifications
You must be signed in to change notification settings - Fork 0
/
inference.py
148 lines (107 loc) · 6.2 KB
/
inference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import os
import warnings

import joblib
import pandas as pd
class Inference:
    """Run the persisted stress, moral and completion-time models for one employee.

    The employee table is read from a CSV file once, at construction time, and
    the three pickled models are loaded from ``models_dir``.  Each
    ``predict_*`` method filters the table down to the rows whose
    ``'Employee Name'`` equals the given name, applies the matching
    preprocessing and returns whatever the model's ``predict`` returns.
    """

    # NOTE(review): the scaler is read relative to the current working
    # directory, not from ``models_dir`` like the models -- confirm intended.
    _SCALER_PATH = 'scaler.pkl'
    _START_COL = 'Project_Start_Date'
    _DEADLINE_COL = 'Project_Deadline'

    # Columns replaced by integer codes in both preprocessing paths.
    _CATEGORICAL_COLUMNS = (
        'Gender', 'Married', 'Role', 'Position', 'Moral', 'Project_Difficulty',
    )
    # Columns passed through the scaler in both preprocessing paths.
    _NUMERICAL_FEATURES = (
        'Age', 'Salary', 'Mean Monthly Hours', 'Absences',
        'Ongoing_Project_Count', 'Projects_Within_Deadline',
        'Projects_Completed',
    )
    _HIGHLY_CORRELATED = ('Years in the company',)

    def __init__(self, data_path, models_dir):
        """Load the employee CSV and the three trained models.

        Args:
            data_path: Path of the CSV file holding the employee records.
            models_dir: Directory containing ``stress_score_model.pkl``,
                ``moral_model.pkl`` and ``completion_time_model.pkl``.
        """
        self.data = pd.read_csv(data_path)
        print(f"Data shape: {self.data.shape}")
        # SECURITY: joblib.load unpickles arbitrary objects; only point
        # ``models_dir`` at artifacts from a trusted source.
        self.stress_score_model = joblib.load(os.path.join(models_dir, 'stress_score_model.pkl'))
        self.moral_model = joblib.load(os.path.join(models_dir, 'moral_model.pkl'))
        self.completion_time_model = joblib.load(os.path.join(models_dir, 'completion_time_model.pkl'))

    # ------------------------------------------------------------------
    # Private helpers shared by the preprocessing entry points
    # ------------------------------------------------------------------
    def _rows_for(self, name):
        """Return a copy of the rows whose 'Employee Name' equals ``name``.

        Raises:
            ValueError: If no row matches, so callers get a clear message
                instead of a failure deep inside scaling or prediction.
        """
        rows = self.data[self.data['Employee Name'] == name].copy()
        if rows.empty:
            raise ValueError(f"No rows found for employee {name!r}")
        return rows

    @staticmethod
    def _parse_dates(series):
        """Parse a day/month/year column, trying 2-digit years first.

        Values that match neither ``%d/%m/%y`` nor ``%d/%m/%Y`` become NaT.
        """
        try:
            return pd.to_datetime(series, format='%d/%m/%y')
        except (ValueError, TypeError):
            # Fall back to 4-digit years; unparseable entries are coerced.
            return pd.to_datetime(series, format='%d/%m/%Y', errors='coerce')

    @classmethod
    def _add_time_allotted(cls, df, absolute=False):
        """Add 'Time_Allotted' (deadline - start, in days) and drop the dates.

        Each date column is parsed independently, so one column using 2-digit
        years and the other 4-digit years are both handled.  ``df`` gains the
        new column in place; the returned frame has the two date columns
        removed.

        Args:
            df: Frame containing the project start and deadline columns.
            absolute: When true, store the absolute number of days.
        """
        start = cls._parse_dates(df[cls._START_COL])
        deadline = cls._parse_dates(df[cls._DEADLINE_COL])
        allotted = (deadline - start).dt.days
        if absolute:
            allotted = allotted.abs()
        df['Time_Allotted'] = allotted
        return df.drop(columns=[cls._START_COL, cls._DEADLINE_COL])

    @staticmethod
    def _encode_categoricals(df, columns):
        """Replace each listed column with its ``pd.factorize`` integer codes.

        NOTE(review): the codes are derived from the rows in ``df`` only (a
        single employee here), so they presumably differ from the encoding
        used at training time -- confirm against the training pipeline.
        """
        for column in columns:
            df[column] = pd.factorize(df[column])[0]
        return df

    @staticmethod
    def _salary_to_int(salary):
        """Convert a salary column such as ``'45,000'`` to integers.

        A column that pandas already parsed as numeric is cast directly,
        because the ``.str`` accessor is not available on numeric data.
        """
        if pd.api.types.is_numeric_dtype(salary):
            return salary.astype(int)
        return salary.str.replace(',', '', regex=False).astype(int)

    @classmethod
    def _scale_numeric(cls, df, columns):
        """Scale ``columns`` of ``df`` in place with the persisted scaler.

        The stored scaler is applied with ``transform`` so its fitted
        statistics are reused.  If the artifact does not advertise the same
        columns (checked through the scikit-learn style ``feature_names_in_``
        / ``n_features_in_`` attributes), the previous behaviour -- re-fitting
        on the rows at hand -- is kept and a ``RuntimeWarning`` is emitted,
        so existing deployments keep running while the mismatch is visible.
        """
        columns = list(columns)
        # SECURITY: unpickles an arbitrary object; the file must be trusted.
        scaler = joblib.load(cls._SCALER_PATH)

        fitted_names = getattr(scaler, 'feature_names_in_', None)
        if fitted_names is not None:
            compatible = list(fitted_names) == columns
        else:
            compatible = getattr(scaler, 'n_features_in_', None) == len(columns)

        if compatible:
            df[columns] = scaler.transform(df[columns])
        else:
            warnings.warn(
                "Persisted scaler does not match the requested columns; "
                "re-fitting it on the inference rows instead.",
                RuntimeWarning,
                stacklevel=2,
            )
            df[columns] = scaler.fit_transform(df[columns])
        return df

    # ------------------------------------------------------------------
    # Preprocessing entry points
    # ------------------------------------------------------------------
    def data_preprocessing(self, name):
        """Build the shared feature frame for the stress and moral models.

        Steps: select the employee's rows, factorize the categorical columns
        (including 'Stress & Burnout Score'), drop the identifier and
        description columns, derive 'Time_Allotted', convert 'Salary' to
        integers, scale the numerical columns (including 'Completion_Time')
        and drop 'Years in the company'.

        Args:
            name: Value matched against the 'Employee Name' column.

        Returns:
            The preprocessed DataFrame; it still contains both the 'Moral'
            and the 'Stress & Burnout Score' columns.

        Raises:
            ValueError: If no row matches ``name``.
        """
        df = self._rows_for(name)
        df = self._encode_categoricals(
            df, self._CATEGORICAL_COLUMNS + ('Stress & Burnout Score',)
        )
        df = df.drop(columns=['Employee Name', 'Joining_Year', 'ID', 'Project_Description'])
        df = self._add_time_allotted(df)
        df['Salary'] = self._salary_to_int(df['Salary'])
        df = self._scale_numeric(df, self._NUMERICAL_FEATURES + ('Completion_Time',))
        return df.drop(columns=list(self._HIGHLY_CORRELATED))

    def stress_score_preprocessing(self, name):
        """Return the shared features without the 'Stress & Burnout Score' column."""
        df = self.data_preprocessing(name)
        return df.drop(columns=['Stress & Burnout Score'])

    def moral_data_preprocessing(self, name):
        """Return the shared features without the 'Moral' column."""
        df = self.data_preprocessing(name)
        return df.drop(columns=['Moral'])

    def completion_time_preprocessor(self, name):
        """Build the feature frame for the completion-time model.

        Differs from ``data_preprocessing`` in that 'Time_Allotted' is stored
        as an absolute value, 'Stress & Burnout Score' is left un-encoded,
        'Completion_Time' is not scaled, and both 'Completion_Time' and
        'Project_Description' are removed from the result.

        Args:
            name: Value matched against the 'Employee Name' column.

        Returns:
            The preprocessed feature DataFrame.

        Raises:
            ValueError: If no row matches ``name``.
        """
        df = self._rows_for(name)
        df = df.drop(columns=['Employee Name', 'Joining_Year', 'ID'])
        df = self._add_time_allotted(df, absolute=True)
        df = self._encode_categoricals(df, self._CATEGORICAL_COLUMNS)
        df['Salary'] = self._salary_to_int(df['Salary'])
        df = self._scale_numeric(df, self._NUMERICAL_FEATURES)
        df = df.drop(columns=['Completion_Time', 'Project_Description'])
        return df.drop(columns=list(self._HIGHLY_CORRELATED))

    # ------------------------------------------------------------------
    # Prediction entry points
    # ------------------------------------------------------------------
    def predict_stress_score(self, name):
        """Return the stress-score model's predictions for ``name``."""
        X_stress = self.stress_score_preprocessing(name)
        return self.stress_score_model.predict(X_stress)

    def predict_moral(self, name):
        """Return the moral model's predictions for ``name``."""
        X_moral = self.moral_data_preprocessing(name)
        return self.moral_model.predict(X_moral)

    def predict_completion_time(self, name):
        """Return the completion-time model's predictions for ``name``."""
        X_completion = self.completion_time_preprocessor(name)
        return self.completion_time_model.predict(X_completion)