-
Notifications
You must be signed in to change notification settings - Fork 0
/
hrDataFeeder.py
executable file
·317 lines (290 loc) · 14.5 KB
/
hrDataFeeder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#!/usr/bin/env python
# This script queries the Faculty Data Repository (FDR) and the Service Directory (SD) for faculty HR data to feed Symplectic Elements.
# For FDR records without email addresses it can query the LDAP Service Directory using the python util package
# from https://intranet.lib.duke.edu/download/python/
# There are a few tricky spots in here due to the untrustworthiness of the FDR data, which the FDR people cannot or will not fix.
import cx_Oracle
import logging
import logging.handlers
from os.path import join
from os import getcwd, environ
from sys import exit
from djangoutil.xmlrpc import getServerProxy
from xml.sax.saxutils import escape
from ConfigParser import SafeConfigParser
import codecs
import io
from django.conf import settings
from djangoutil import config
# Hand the djangoutil config module to Django's settings.  This has to happen
# before the name `config` is rebound to the parser further down.
settings.configure(config)

# Python 2 only: reload sys so setdefaultencoding() is callable, then make
# UTF-8 the implicit str/unicode codec.
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# set NLS_LANG to UTF-8 to capture diacritics
environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'

# database configuration
usedb = 'test'  # choose database: dev, test, or prod
config = SafeConfigParser()
config.read(join(getcwd(), 'fdr.config'))  # read config file to gather parameters
dbhost, dbport, dbsid, dbuser, dbpassword = (
    config.get(usedb, 'dbhost'),
    config.get(usedb, 'dbport'),
    config.get(usedb, 'dbsid'),
    config.get(usedb, 'dbuser'),
    config.get(usedb, 'dbpassword'),
)

# LDAP is slow and hasn't returned significant number of emails.
# If False, use netid+@duke.edu instead.
useldapforemail = False

# Input and output files, all resolved against the current working directory.
sd_file = join(getcwd(), 'libsymel.dat')  # Nightly export of Service Directory data
xmlfile = join(getcwd(), 'people.xml')  # Output file for Symplectic Elements consumption
affiliationsfile = join(getcwd(), 'affiliations.txt')  # Unique affiliations for Elements Auto Groups
logfile = join(getcwd(), 'hrDataFeeder.log')

# instantiate and configure logger: rotating file, 6 files of 20 MB or less
logger = logging.getLogger('fdrlogger')
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = logging.handlers.RotatingFileHandler(
    logfile,
    maxBytes=20 * 1024 * 1024,
    backupCount=5,
)
handler.setFormatter(formatter)
logger.addHandler(handler)
# retrieve results from Faculty Data Repository (FDR)
def getResults(ora, sql):
    """Execute *sql* on the open connection *ora* and return all rows.

    Args:
        ora: An open DB-API style connection exposing ``cursor()``.
        sql: The SQL statement to run.

    Returns:
        Whatever the cursor's ``fetchall()`` returns (a list of row tuples).
    """
    ocur = ora.cursor()
    try:
        ocur.execute(sql)
        return ocur.fetchall()
    finally:
        # Release the cursor even when execute()/fetchall() raises.
        ocur.close()
def _escapeText(value):
    """Return *value* XML-escaped when it is a string, otherwise unchanged."""
    try:
        return escape(value)
    except (AttributeError, TypeError):
        # Non-string values (None, numbers) cannot be escaped; hand them back
        # so %s formatting renders them the way it always has.
        return value


# Take list of dictionaries and build XML elements. Return string.
def buildXml(list):
    """Serialize person dictionaries into ``<person>`` XML elements.

    Args:
        list: Iterable of person dictionaries.  Each must provide 'surname',
            'forename', 'email', 'duid', 'netid', 'primary', 'academic',
            'login' and 'authority'.  'middlename' and 'secondary' (a list of
            appointment strings) are optional.

    Returns:
        The concatenated XML text for all people ('' for an empty input).

    Raises:
        KeyError: If a required key is missing, or a person carries more than
            nine secondary appointments (only nine ordinal tags are defined).
    """
    sequence_dict = {1: 'Secondary', 2: 'Tertiary', 3: 'Quaternary', 4: 'Quinary', 5: 'Senary',
                     6: 'Septenary', 7: 'Octonary', 8: 'Nonary', 9: 'Denary'}
    parts = []
    for record in list:
        parts.append('\t\t<person>\n')
        # Free-text values are escaped so '&' or '<' cannot break the document.
        parts.append('\t\t\t<Lastname>%s</Lastname>\n' % (_escapeText(record['surname']),))
        parts.append('\t\t\t<Firstname>%s</Firstname>\n' % (_escapeText(record['forename']),))
        if 'middlename' in record:  # Many records do not contain a middle name.
            parts.append('\t\t\t<Middlename>%s</Middlename>\n' % (_escapeText(record['middlename']),))
        parts.append('\t\t\t<Email>%s</Email>\n' % (_escapeText(record['email']),))
        parts.append('\t\t\t<Proprietary_ID>%s</Proprietary_ID>\n' % (_escapeText(record['duid']),))
        parts.append('\t\t\t<Username>%s</Username>\n' % (_escapeText(record['netid']),))
        parts.append('\t\t\t<PrimaryGroupDescriptor>%s</PrimaryGroupDescriptor>\n' % (escape(record['primary']),))
        # this must change in response to addition of school
        for i, appointment in enumerate(record.get('secondary') or (), 1):
            tag = sequence_dict[i]
            parts.append('\t\t\t<%sGroupDescriptor>%s</%sGroupDescriptor>\n'
                         % (tag, escape(appointment.strip()), tag))
        parts.append('\t\t\t<IsAcademic>%s</IsAcademic>\n' % (_escapeText(record['academic']),))
        parts.append('\t\t\t<LoginAllowed>%s</LoginAllowed>\n' % (_escapeText(record['login']),))
        parts.append('\t\t\t<AuthenticatingAuthority>%s</AuthenticatingAuthority>\n'
                     % (_escapeText(record['authority']),))
        parts.append('\t\t</person>\n')
    return ''.join(parts)
# Build list of dictionaries of FDR people. Also return list of NetIDs.
def buildFdrDict(data, rpcserver, sd_dict_list):
    """Convert FDR rows into person dictionaries ready for buildXml().

    Args:
        data: Iterable of 14-field rows in the column order of the FDR query:
            DUID, NetID, salutation, professional surname / first / middle
            name, legal surname / first / middle name, email,
            PRIMARY_VIVO_ORG, PRIMARY_SCHOOL, '||'-delimited affiliations,
            primary affiliation.
        rpcserver: Unused here; kept so existing callers keep working.
        sd_dict_list: Service Directory person dictionaries (each with 'duid'
            and 'email'), consulted when an FDR row has no email.

    Returns:
        A ``(fdr_dict_list, netid_list)`` tuple: the person dictionaries and
        the NetIDs of every record kept.

    Rows without a NetID are logged and dropped.  A row with the wrong number
    of fields is treated as a changed database view: it is logged as critical
    and the process exits with status 1.
    """
    fdr_dict_list = []
    netid_list = []
    missing_fdr_email = 0
    missing_email_found_sd = 0

    # Index Service Directory emails by DUID once instead of rescanning the
    # list for every FDR record.  setdefault keeps the first entry per DUID,
    # which is what a first-match scan would have picked.
    sd_email_by_duid = {}
    for person in sd_dict_list:
        sd_email_by_duid.setdefault(person['duid'], person['email'])

    for record in data:
        try:
            # Confusing. FDR forced their names on us. Their PRIMARY_SCHOOL is
            # our primary group, all other groups are secondary for us.
            (duid, netid, salutation, surname, forename, middlename,
             lsurname, lforename, lmiddlename, email, primary, school,
             secondary, primary_affiliation) = record
        except ValueError:
            logger.critical('Database view has changed.')
            exit(1)  # Non-zero status: this is a failure, not a clean exit.

        if not netid:  # Elements needs a NetID as username; log and discard.
            logmessage = 'Record dropped - No NetID in FDR. %s %s, %s' % (forename, surname, duid)
            logger.critical(logmessage)
            print(logmessage)
            continue

        fdr_dict = {}
        # If professional name set, use that. Otherwise fall back to legal name.
        if surname:
            chosen_surname, chosen_forename, chosen_middlename = surname, forename, middlename
        else:
            chosen_surname, chosen_forename, chosen_middlename = lsurname, lforename, lmiddlename
        fdr_dict['surname'] = chosen_surname
        fdr_dict['forename'] = chosen_forename
        if chosen_middlename:  # Many records do not contain middle name.
            fdr_dict['middlename'] = chosen_middlename

        if not email:
            missing_fdr_email += 1
            email = sd_email_by_duid.get(duid)
            if email:
                missing_email_found_sd += 1
            else:
                # Not in the Service Directory either: use the same
                # <netid>@duke.edu default that buildSdDict applies.
                email = netid + '@duke.edu'
        # Remove angle brackets present in some email fields.  replace() works
        # for both str and unicode values, unlike translate(None, ...).
        email = email.replace('<', '').replace('>', '')

        fdr_dict['email'] = email
        fdr_dict['duid'] = duid
        fdr_dict['netid'] = netid
        fdr_dict['primary'] = school

        # Non-primary appointments: split the double-pipe delimited string,
        # drop blanks, duplicates and the school, then add PRIMARY_VIVO_ORG.
        secondary_deduped_list = []
        if secondary:
            for appt in secondary.strip().split('||'):
                appt = appt.strip()
                if appt and appt != school and appt not in secondary_deduped_list:
                    secondary_deduped_list.append(appt)
        if primary and primary != school and primary not in secondary_deduped_list:
            secondary_deduped_list.append(primary)
        fdr_dict['secondary'] = secondary_deduped_list

        fdr_dict['academic'] = 'Y'
        fdr_dict['login'] = 'Y'
        fdr_dict['authority'] = 'Shibboleth'

        netid_list.append(netid)
        fdr_dict_list.append(fdr_dict)

    if missing_fdr_email > 0:
        logger.info('%s FDR records without email addresses', missing_fdr_email)
    print('%s people missing FDR email found in SD' % (missing_email_found_sd,))
    return fdr_dict_list, netid_list
# Build list of dictionaries of Service Directory entries.
def buildSdDict(sd_file):
    """Parse the Service Directory export into person dictionaries.

    Each non-blank line must hold six pipe-delimited fields: DUID, NetID,
    surname, forename, email and status.  The status becomes the person's
    'primary' group; every person is marked non-academic, login-enabled and
    authenticated through Shibboleth.

    Args:
        sd_file: Path of the pipe-delimited export file.

    Returns:
        A list with one dictionary per line read.

    Raises:
        ValueError: If a non-blank line does not contain exactly six fields.
    """
    sd_dict_list = []
    sd_missing_email = 0
    with open(sd_file, 'r') as sd:
        for line_number, line in enumerate(sd, 1):
            if not line.strip():
                continue  # Tolerate blank lines, e.g. at the end of the file.
            fields = line.split('|')
            if len(fields) != 6:
                raise ValueError('%s line %s: expected 6 pipe-delimited fields, found %s'
                                 % (sd_file, line_number, len(fields)))
            duid, netid, surname, forename, email, status = fields
            if email:
                # Remove angle brackets present in some email fields.
                email = email.replace('<', '').replace('>', '')
            else:
                email = netid + '@duke.edu'
                sd_missing_email += 1
            sd_dict_list.append({
                'duid': duid,
                'netid': netid,
                'surname': surname,
                'forename': forename,
                'primary': status.strip(),  # Remove line break
                'academic': 'N',
                'login': 'Y',
                'authority': 'Shibboleth',
                'email': email,
            })
    logger.info('Found %s Service Directory records.', len(sd_dict_list))
    logger.info('%s Service Directory records without email addresses', sd_missing_email)
    return sd_dict_list
# Deduplicate the SD people to prevent creating multiple accounts as some will appear in FDR data.
def dedupeSdDictList(sd_dict_list, netid_list):
    """Return the Service Directory people whose NetID is not in *netid_list*.

    Args:
        sd_dict_list: Service Directory person dictionaries, each with a
            'netid' key.
        netid_list: NetIDs already covered by FDR records.

    Returns:
        A new list holding the records of *sd_dict_list* that are not
        duplicates of an FDR person, in their original order.
    """
    fdr_netids = set(netid_list)  # Constant-time membership tests.
    sd_dict_list_dedupe = [record for record in sd_dict_list
                           if record['netid'] not in fdr_netids]
    # Report the records that were removed, not the ones that were kept.
    duplicates = len(sd_dict_list) - len(sd_dict_list_dedupe)
    logger.info("Found %s Service record duplicates.", duplicates)
    return sd_dict_list_dedupe
# Collect the unique affiliations used to populate Elements Auto Groups
def getUniqueAffiliations(fdr_dict_list):
    """Return each distinct 'secondary' affiliation once, in first-seen order.

    Args:
        fdr_dict_list: Person dictionaries; those without a 'secondary' key
            are ignored.

    Returns:
        A list of affiliation strings without duplicates.
    """
    seen = set()
    unique_affiliations_list = []
    for person in fdr_dict_list:
        for affiliation in person.get('secondary', ()):
            if affiliation not in seen:
                seen.add(affiliation)
                unique_affiliations_list.append(affiliation)
    return unique_affiliations_list
def _sendFailureEmail():
    """Send the fixed failure notice through smtp.duke.edu."""
    # successful sending of email necessitated disabling McAfee email rule
    import smtplib
    from email.mime.text import MIMEText
    sender = 'jim.tuttle@duke.edu'
    recipient = 'elements@duke.edu'
    msg = MIMEText('The HR data serialization script has failed on lib-symeldata.')
    msg['Subject'] = 'HR data failed on Elements development'
    msg['From'] = sender
    # NOTE(review): the To header differs from the envelope recipient the
    # message is actually delivered to - confirm which address is intended.
    msg['To'] = 'jim.tuttle@duke.edu'
    s = smtplib.SMTP('smtp.duke.edu', 587)
    s.sendmail(sender, [recipient], msg.as_string())
    s.quit()


def main():
    """Query FDR, merge with the Service Directory and write the feed files.

    Writes the people XML to ``xmlfile`` and the sorted unique affiliations,
    one per line, to ``affiliationsfile``.  Any failure propagates to the
    caller.
    """
    logger.info("Starting update.")

    # Open the connection to the FDR database
    dbdsn = cx_Oracle.makedsn(dbhost, dbport, dbsid)
    try:
        ora = cx_Oracle.connect(dbuser, dbpassword, dbdsn)
    except Exception:
        logger.critical('Database connection error.')
        raise  # Let the top-level handler report the failure.

    sql = ('select DUID, NETID, SALUTATION, SURNAME, FIRSTNAME, MIDDLENAME, '
           'LEGAL_SURNAME, LEGAL_FIRSTNAME, LEGAL_MIDDLENAME, EMAIL, '
           'PRIMARY_VIVO_ORG, PRIMARY_SCHOOL, affiliations, PRIMARY_AFFILIATION '
           'from APT.V_PEOPLE_WITH_AFFILIATIONS')
    try:
        data = getResults(ora, sql)  # data is a list of tuples, 1 tuple per record.
    finally:
        ora.close()  # Close the connection even when the query fails.
    logger.info('Found %s FDR faculty.', len(data))

    if useldapforemail:
        rpcserver = getServerProxy()  # Open connection to Service Directory
    else:
        rpcserver = False

    # Build list of attributes about people from Service Directory dump file.
    sd_dict_list = buildSdDict(sd_file)
    fdr_dict_list, netid_list = buildFdrDict(data, rpcserver, sd_dict_list)
    # Build list of unique affiliations/appointments for Elements
    unique_affiliations_list = getUniqueAffiliations(fdr_dict_list)
    netid_list.sort()
    # Deduplicate Service Directory people so we don't name people twice
    sd_dict_list_dedupe = dedupeSdDictList(sd_dict_list, netid_list)

    xml_preamble = ('<?xml version="1.0" encoding="UTF-8" ?>\n<HR_Data>\n'
                    '\t<Feed_ID>FDR</Feed_ID>\n'
                    '\t<people>\n')
    xml_postamble = '\t</people>\n</HR_Data>'
    sd_xml = buildXml(sd_dict_list_dedupe)  # XML string for SD people
    fdr_xml = buildXml(fdr_dict_list)  # XML string for FDR people
    xml = xml_preamble + fdr_xml + sd_xml + xml_postamble

    with open(xmlfile, 'w') as f:  # Serialize the XML string
        f.write(xml)

    unique_affiliations_list.sort()
    with open(affiliationsfile, 'w') as af:  # Serialize the unique affiliations
        for affiliation in unique_affiliations_list:
            af.write(affiliation + '\n')

    logmessage = "Update complete."
    print(logmessage)
    logger.info(logmessage)


if __name__ == '__main__':
    try:
        main()
    except Exception as e:
        print(e)
        logger.exception('HR data feed failed.')  # Keep the traceback in the log file.
        _sendFailureEmail()
        exit(1)  # Signal the failure to whatever scheduled the run.