-
Notifications
You must be signed in to change notification settings - Fork 1
/
grantsGov_utilities.py
2871 lines (2526 loc) · 128 KB
/
grantsGov_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import xmltodict
import sys
import os
import pandas as pd
import glob
def detectLocalDataSource(location, sourceOrg, singleMulti='single'):
    """
    Detects the presence of local data from the source organization.

    Checks the location directory for the relevant content.  If singleMulti
    is set to 'single' it checks for a single, omnibus file; if set to
    'multi' it checks for multiple files in the designated directory.

    Parameters
    ----------
    location : str
        The directory which is to be checked for the desired data.
        If the directory does not exist, an error message is printed and
        False is returned.
    sourceOrg : str
        One of the following, indicating the source organization:
        - 'grantsGov'
        - 'NIH'
        - 'NSF'
    singleMulti : str, optional
        Indicates whether the data should be a single file or multiple files.
        The default is 'single'.
        - 'single' : checks for a single, omnibus file
        - 'multi' : checks for multiple files in the designated directory.

    Returns
    -------
    localData : str, list of str, or bool
        If the data is found, returns the path (single) or list of paths
        (multi) to the data.  If not found, returns False.
        NOTE: the data comes from the respective sources in different formats:
        - grants.gov : single omnibus XML file
        - NIH : ?
        - NSF : year-wise tar or zip files containing per-award XML files
    """
    import os
    import glob

    # BUGFIX: the original tested `if os.path.isdir(location)` and treated an
    # *existing* directory as missing; the check is now correctly negated.
    if not os.path.isdir(location):
        print('The location ' + location + ' does not exist. Please check the location and try again.')
        return False

    def _stem_matches(directory, stem):
        # Helper: files starting with `stem` ending in .xml or .json
        # (case-insensitive); xml matches are listed first.
        # os.path.join is used because the original concatenated the
        # directory and pattern without a separator.
        found = glob.glob(os.path.join(directory, stem + '*.[xX][mM][lL]'))
        found += glob.glob(os.path.join(directory, stem + '*.[jJ][sS][oO][nN]'))
        return found

    def _numbered_xml_files(directory):
        # Helper: xml files whose basename (sans extension) is all digits,
        # matching the per-grant file naming convention (one xml per award,
        # named by its ID number).
        # BUGFIX: os.sep is a string attribute, not a callable; the original
        # called os.sep() which raises TypeError.
        candidates = glob.glob(os.path.join(directory, '*.xml'))
        return [x for x in candidates
                if os.path.splitext(os.path.basename(x))[0].isdigit()]

    # TODO: This is where to add additional data sources
    # handle each case respectively
    if sourceOrg == 'grantsGov':
        if singleMulti == 'single':
            # check for the single omnibus file by its known stem
            matches = _stem_matches(location, 'GrantsDBExtract')
            return matches[0] if matches else False
        elif singleMulti == 'multi':
            # haven't implemented the split function yet, but we assume the
            # same convention as the NSF data: separate, numbered xml files
            numbered = _numbered_xml_files(location)
            return numbered if numbered else False
    elif sourceOrg == 'NIH':
        # TODO: figure out what the NIH data looks like
        raise NotImplementedError('The NIH data has not yet been implemented. Please check back later.')
    elif sourceOrg == 'NSF':
        if singleMulti == 'single':
            # TODO: confirm that this is the correct stem for the omnibus file
            matches = _stem_matches(location, 'NSF_Awards')
            return matches[0] if matches else False
        elif singleMulti == 'multi':
            numbered = _numbered_xml_files(location)
            return numbered if numbered else False
    else:
        # unrecognized source organization
        print('The source organization ' + sourceOrg + ' is not recognized. Please check the source organization and try again.')
        return False
def getDataFromRemoteSource(destination,sourceOrg):
    """
    Downloads the data from the remote source and saves it to the designated
    destination, unless a previously downloaded copy is already present there.

    Parameters
    ----------
    destination : str
        The path to the directory where the data will be saved.
    sourceOrg : str
        The organization from which the data will be downloaded, from one of
        the following:
        - 'grantsGov'
        - 'NIH'
        - 'NSF'

    Returns
    -------
    result : str, list of str, or bool
        The path(s) to the downloaded data, or False if the download failed.
    """
    import os
    import glob
    # check if the destination exists
    # BUGFIX: the original tested `if os.path.isdir(destination)` and created
    # the sourceOrg subdirectory when the destination *did* exist, skipping
    # the already-downloaded check; the branch is now correctly negated, per
    # the original comments' stated intent.  os.makedirs is used because the
    # parent may also be missing, and because os.sep is a string (the
    # original called os.sep(), a TypeError).
    if not os.path.isdir(destination):
        destination = os.path.join(destination, sourceOrg)
        os.makedirs(destination)
    else:
        # NOTE: this is where to add additional data sources
        # the destination exists; look for previously downloaded data first
        if sourceOrg=='grantsGov':
            # the grants.gov data is a single xml file with a known stem
            grantsGovFile = glob.glob(os.path.join(destination, 'GrantsDBExtract*.[xX][mM][lL]'))
            # if found, return it: no need to go and re-download
            if len(grantsGovFile) > 0:
                return grantsGovFile[0]
        # TODO: confirm that the NIH data is a single xml file
        elif sourceOrg=='NIH':
            nihFile = glob.glob(os.path.join(destination, 'NIH_Awards*.[xX][mM][lL]'))
            if len(nihFile) > 0:
                return nihFile[0]
        elif sourceOrg=='NSF':
            # the NSF data is a series of zip / tar files named by year
            nsfFiles = glob.glob(os.path.join(destination, '*.zip'))
            nsfFiles += glob.glob(os.path.join(destination, '*.tar'))
            if len(nsfFiles) > 0:
                return nsfFiles
        else:
            # unrecognized source organization
            print('The source organization ' + sourceOrg + ' is not recognized. Please check the source organization and try again.')
            return False
    # NOTE: this is where to add additional data sources
    # no cached data found; download it, depending on the source organization
    if sourceOrg=='grantsGov':
        # downloadGrantsGovGrantsData returns the path to the downloaded data
        try:
            grantsGovFile = downloadGrantsGovGrantsData(destination)
        except Exception:
            print('The download of the grants.gov data failed. Please check the source and try again.')
            return False
        # if the download succeeded, confirm the file actually exists on disk
        if os.path.isfile(grantsGovFile):
            return grantsGovFile
        else:
            return False
    elif sourceOrg=='NIH':
        # TODO confirm what the NIH data looks like
        raise NotImplementedError('The NIH data source is not yet implemented. Please check back later.')
    elif sourceOrg=='NSF':
        # create the list of files to be downloaded
        nsfDownloadURLS = genNSFdownloadURLs()
        # download the files with downloadNSFgrantsData
        nsfFiles = downloadNSFgrantsData(nsfDownloadURLS, destination)
        # check only the first file; there are too many to check all
        if os.path.isfile(nsfFiles[0]):
            return nsfFiles
        else:
            return False
# NOTE: here we have the functions for downloading the data from the remote sources
def downloadGrantsGovGrantsData(savePathDir=None):
    """
    Downloads the latest XML data structure from https://www.grants.gov/xml-extract.html.

    Downloads to the current working directory if no path value is entered
    for savePathDir, then unzips the archive in place.

    Parameters
    ----------
    savePathDir : str, optional
        A string path corresponding to the *directory* the user would like
        the grants.gov xml file downloaded to.  Default value is None,
        resulting in download to the current working directory.

    Returns
    -------
    XML_filePath : str
        The path to the resultant (e.g. downloaded and unzipped) data structure.

    See Also
    --------
    grantXML_to_dictionary : convert the XML data structure from
    https://www.grants.gov/xml-extract.html to a pandas dataframe.
    """
    import os
    import zipfile
    from datetime import datetime
    import requests
    # check and see what the save path has been set to
    if savePathDir is None:
        savePathDir = ''
    elif os.path.isdir(savePathDir):
        # the path exists; nothing to do
        pass
    else:
        # if the directory doesn't exist, *don't* make the directory,
        # instead raise an error
        raise Exception('Input save path\n' + savePathDir + '\ndoes not exist. Create first.')
    # file pathing taken care of, begin prep for download
    # the extract file name convention is GrantsDBExtract<YYYYMMDD>v2.zip
    grantsExtractURL = 'https://www.grants.gov/extract/'
    dateString = datetime.today().strftime('%Y%m%d')
    fileStem = 'GrantsDBExtract'
    fileEnd = 'v2.zip'
    fullFileName = fileStem + dateString + fileEnd
    # set queryURL
    queryURL = grantsExtractURL + fullFileName

    # use recommended requests method
    # https://stackoverflow.com/questions/9419162/download-returned-zip-file-from-url
    def download_url(url, save_path, chunk_size=128):
        # stream the download in chunks so the whole archive is never held
        # in memory; the with-block ensures the connection is released
        # (the original never closed the response)
        with requests.get(url, stream=True) as r:
            with open(save_path, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    fd.write(chunk)
        print(str(os.path.getsize(save_path)) + ' bytes file downloaded from\n' + url)
        print('Saved to ' + savePathDir)

    # establish save path
    zipSavePath = os.path.join(savePathDir, fullFileName)
    # download
    download_url(queryURL, zipSavePath, chunk_size=128)
    # unzip in place
    with zipfile.ZipFile(zipSavePath, 'r') as zip_ref:
        zip_ref.extractall(savePathDir)
    # BUGFIX: str.replace('zip','xml') would alter the *first* occurrence of
    # 'zip' anywhere in the path (e.g. a directory named 'zipped'); swap only
    # the file extension instead.
    xmlSavePath = os.path.splitext(zipSavePath)[0] + '.xml'
    print('XML file located at\n' + xmlSavePath)
    return xmlSavePath
def genNSFdownloadURLs():
    """
    Generates the download URLs for the NSF grant data files hosted at
    www.nsf.gov/awardsearch/.  Note that it will generate URLs for every year
    up to and including the current year, preceded by the 'Historical' archive.

    Inputs: None
    Outputs:
        downloadURLs: list
            A list of strings corresponding to the download URLs for the NSF
            grant data files.
    """
    import datetime
    baseURL = 'https://www.nsf.gov/awardsearch/download?DownloadFileName='
    urlSuffix = '&All=true'
    # as of 04/04/2022, the earliest year of data is 1959
    earliestYear = 1959
    thisYear = datetime.datetime.now().year
    # the historical archive comes first, followed by one URL per year
    downloadURLs = [baseURL + 'Historical' + urlSuffix]
    downloadURLs.extend(baseURL + str(year) + urlSuffix
                        for year in range(earliestYear, thisYear + 1))
    return downloadURLs
def downloadNSFgrantsData(downloadURLs,saveDirectory=None):
    """
    Downloads the NSF grant data files that are stored at
    www.nsf.gov/awardsearch/ and unpacks each zip / tar archive in place.

    Inputs:
        downloadURLs: list
            A list of strings corresponding to the download URLs for the NSF
            grant data files.
        saveDirectory: string, optional
            The directory in which the downloaded files should be saved.
            Defaults to <cwd>/NSF_grant_data, which is created if absent.
    Outputs:
        outPaths: list
            The paths of the downloaded archive files.
    """
    # BUGFIX: imports must precede any use of os; the original placed
    # `import os` *after* os.getcwd(), making `os` an unbound local and
    # raising UnboundLocalError whenever saveDirectory was None.
    import requests
    import os
    import zipfile
    import tarfile
    if saveDirectory is None:
        saveDirectory = os.getcwd() + os.sep + 'NSF_grant_data'
    # create the save directory if it doesn't exist
    if not os.path.exists(saveDirectory):
        os.makedirs(saveDirectory)
    outPaths=[]
    # iterate across the download URLs
    for iURLs in downloadURLs:
        # the file name is the DownloadFileName query value (a year or 'Historical')
        fileName = iURLs.split('=')[-2].split('&')[0] + '.zip'
        # create the save path
        savePath = os.path.join(saveDirectory, fileName)
        # download the file and write its content to disk; the with-block
        # closes the handle (the original left it to GC on error paths)
        response = requests.get(iURLs)
        with open(savePath, 'wb') as currFile:
            currFile.write(response.content)
        # unpack the archive in place, depending on its type
        if fileName.endswith('.zip'):
            with zipfile.ZipFile(savePath, 'r') as zip_ref:
                zip_ref.extractall(saveDirectory)
        elif fileName.endswith('.tar'):
            with tarfile.open(savePath, "r:") as tar_ref:
                tar_ref.extractall(saveDirectory)
        # append the save path to the holder
        outPaths.append(savePath)
        # print an indicator of how many files have been downloaded
        print('Downloaded '+str(len(outPaths))+' of '+str(len(downloadURLs))+' files')
    # BUGFIX: return the collected paths; the original returned None even
    # though the caller (getDataFromRemoteSource) indexes into the result.
    return outPaths
# now that we have handled downloading, we need to process and curate the data
def processDownloadedData(dataLocation,sourceOrg,singleMulti='multi'):
    """
    Processes downloaded data from any of the supported sources and saves the
    processed result in a "processed" subdirectory of the source data
    directory.  The singleMulti flag determines whether the processed data are
    stored as a single file or as multiple files.

    Inputs:
        dataLocation: string
            A string corresponding to the path to the downloaded data (a
            directory, or, for grants.gov, optionally the xml file itself).
        sourceOrg: string
            A string corresponding to the source organization of the data.
            Currently supported are 'NSF' and 'grantsGov'.
        singleMulti: string
            Whether the data should be stored as a single file or as multiple
            files.  Currently supported are 'single' and 'multi'.
    Outputs: None
    """
    import os
    import pandas as pd
    from glob import glob
    import xmltodict
    # establish a vector with the currently accepted sources
    # currently no support for NIH
    acceptedSources=['NSF','grantsGov']
    # check if the source is in the accepted sources
    if sourceOrg not in acceptedSources:
        raise ValueError('The source organization '+sourceOrg+' is not currently supported. Supported sources are '+str(acceptedSources))
    # otherwise proceed in a casewise fashion
    if sourceOrg=='NSF':
        # NOTE: the directorate remap file is loaded by applyFixesToRecord_NSF
        # itself; the redundant (unused) load here was removed.
        # the NSF data should have been unzipped at this point, so they are in
        # a single directory filled with per-award xml files
        xmlFiles=glob(dataLocation+os.sep+'*.xml')
        # create a "processed" subdirectory if it doesn't exist
        processedDir=dataLocation+os.sep+'processed'
        if not os.path.exists(processedDir):
            os.makedirs(processedDir)
        # iterate across the xml files
        for iFiles in xmlFiles:
            # attempt to repair the xml file if necessary
            attemptXMLrepair(iFiles)
            # load the xml file into a dictionary, which presumably works now
            with open(iFiles) as fd:
                doc = xmltodict.parse(fd.read())
            # applyFixesToRecord_NSF was originally designed to work on json
            # files, but works fine on a dictionary; it returns the fixed record
            fixedRecord=applyFixesToRecord_NSF(doc)
            # save it down to the processed directory as an xml file with the
            # same name; the with-block closes the file (the original's extra
            # fd.close() after the with-block was redundant)
            fileName=iFiles.split(os.sep)[-1]
            savePath=processedDir+os.sep+fileName
            with open(savePath, 'w') as fd:
                fd.write(xmltodict.unparse(fixedRecord, pretty=True))
        # NOTE: thus the NSF data have been processed and saved down
        print('NSF data have been processed and saved down to ' + processedDir)
    elif sourceOrg=='grantsGov':
        # determine if the dataLocation is a directory or a specific file
        if os.path.isdir(dataLocation):
            # find an xml file with "GrantsDBExtract" in the name
            # BUGFIX: guard the glob result before indexing; the original
            # indexed [0] unconditionally and raised IndexError when absent
            foundFiles=glob(dataLocation+os.sep+'*GrantsDBExtract*.xml')
            if not foundFiles:
                raise ValueError('The grants.gov data file '+dataLocation+' does not exist.')
            # load the xml file with pandas
            currentData=pd.read_xml(foundFiles[0])
        elif os.path.isfile(dataLocation):
            # dataLocation is the xml file itself; load it directly
            # NOTE(review): in this case the 'processed' output path below is
            # built from the *file* path — confirm intended layout
            currentData=pd.read_xml(dataLocation)
        else:
            # neither a directory nor a file
            raise ValueError('The grants.gov data file '+dataLocation+' does not exist.')
        # process the grants.gov data using repairFunding_GovGrantsDF
        # BUGFIX: repairFunding_GovGrantsDF takes only the dataframe; the
        # original passed an unsupported singleMulti keyword (TypeError)
        processedCurrentData=repairFunding_GovGrantsDF(currentData)
        # save the processed data in the desired format: either a single file,
        # or per entry as xml using "OpportunityID" as the xml file name
        if not os.path.exists(dataLocation+os.sep+'processed'):
            os.makedirs(dataLocation+os.sep+'processed')
        if singleMulti=='single':
            # NOTE: saved down as a csv; maybe not what we want in the long run
            processedCurrentData.to_csv(dataLocation+os.sep+'processedGrantsGovData.csv',index=False)
        elif singleMulti=='multi':
            # iterate across the rows, one output file per row
            for iRows in range(processedCurrentData.shape[0]):
                currentRow=processedCurrentData.iloc[iRows,:]
                # BUGFIX: OpportunityID may be numeric after retyping; coerce
                # to str before building the path (the original concatenated
                # it directly, a TypeError for non-string IDs)
                currentOpportunityID=str(currentRow['OpportunityID'])
                currentRow.to_xml(dataLocation+os.sep+'processed'+os.sep+currentOpportunityID+'.xml')
        print('grants.gov data have been processed and saved down to ' + dataLocation+os.sep+'processed')
    return
def attemptXMLrepair(xmlPath,errorLogPath=None):
    """
    Loads a putative xml file and, if it is not a valid xml file, attempts to
    repair it with BeautifulSoup, overwriting the file with the repaired
    content.  Unrepairable files are recorded in the error log.

    Inputs:
        xmlPath: string
            The path to the putative xml file.
        errorLogPath: string, optional
            The path to the error log file.  Defaults to errorLog.txt in the
            directory of the xml file.
    Outputs:
        None (saves down the repaired xml file if it is successful)
    """
    import xmltodict
    import os
    from bs4 import BeautifulSoup
    # get the directory that the xml file is in
    xmlDirectory=os.path.dirname(xmlPath)
    # if no errorLogPath is passed in, set it to the xmlDirectory
    if errorLogPath is None:
        errorLogPath=xmlDirectory+os.sep+'errorLog.txt'
    # create the error log if it doesn't already exist
    if not os.path.isfile(errorLogPath):
        open(errorLogPath,'w').close()
    # check if the input file exists; if not, report and bail out early
    if not os.path.isfile(xmlPath):
        print('Error: '+xmlPath+' is not a valid XML file.')
        return
    # read the file; the with-block closes the handle (the original leaked it)
    with open(xmlPath) as infile:
        currXml=infile.read()
    # determine if it is a valid XML file
    try:
        xmltodict.parse(currXml)
    except Exception:
        try:
            # throw a warning indicating that the file is not valid
            print('Warning: '+xmlPath+' is not a valid XML file.')
            print('Attempting to repair the file.')
            # use BeautifulSoup to re-serialize the malformed markup
            repaired = BeautifulSoup(currXml, 'xml')
            # confirm the repaired text now parses before overwriting the file
            xmltodict.parse(repaired.prettify())
            # save the repaired xml file; the with-block closes it
            with open(xmlPath, 'w') as outfile:
                outfile.write(repaired.prettify())
        except Exception:
            # still not a valid XML file: report, log, and skip
            print('Error: '+xmlPath+' is not a valid XML file.')
            print('Skipping this file.')
            # BUGFIX: append to the error log established above; the original
            # switched to a different file (xml2json_errorLog.txt), ignoring
            # the errorLogPath parameter it had just set up
            with open(errorLogPath, 'a') as outfile:
                outfile.write('Error: '+xmlPath+' is not a valid XML file.')
    return
def applyFixesToRecord_NSF(inputRecord,errorLogPath=None):
    """
    Applies the established fixes to an NSF award record.

    Fixes applied:
    - converts html entities in the AbstractNarration field to plain text
    - remaps directorate names to the canonical set from NSF_directorate_remap.csv,
      falling back to the Division field, then to 'None' for empty values

    Inputs:
        inputRecord: dictionary
            A dictionary containing the award data (as parsed from xml/json).
        errorLogPath: string, optional
            Path to the error log.  Defaults to inputData/NSF-fixes_errorLog.txt
            (the presumptive data directory in the root of the repository).
    Outputs:
        inputRecord: dictionary
            The same dictionary, with fixes applied in place.
    """
    from bs4 import BeautifulSoup
    import os
    # if no errorLogPath is passed in, set it to the presumptive data directory
    if errorLogPath is None:
        errorLogPath=os.path.join('inputData','NSF-fixes_errorLog.txt')
    # load the directorate remap file (NOTE: path is relative to the cwd;
    # relies on module-level pandas import `pd`)
    directorateRemap=pd.read_csv('../NSF_directorate_remap.csv')
    # find the remapped directorate names
    validDirectorateNames=directorateRemap['fixedName'].unique()
    # first, we need to convert the html entities to unicode
    try:
        if inputRecord['rootTag']['Award']['AbstractNarration'] is not None:
            soup=BeautifulSoup(inputRecord['rootTag']['Award']['AbstractNarration'],'html.parser')
            inputRecord['rootTag']['Award']['AbstractNarration']=soup.get_text().replace('<br/>','\n')
    except Exception:
        # try and create an informative error log message
        try:
            with open(errorLogPath, 'a') as outfile:
                outfile.write('Error locating AbstractNarration field for '+inputRecord['rootTag']['Award']['AwardID']+'\r')
        except Exception:
            pass
    # next implement the directorate remapping
    try:
        if inputRecord['rootTag']['Award']['Organization']['Directorate']['LongName'] not in validDirectorateNames:
            # get the current invalid directorate name
            currentInvalidName=inputRecord['rootTag']['Award']['Organization']['Directorate']['LongName']
            # find its index in the directorate remap file
            currentInvalidNameIndex=directorateRemap.loc[directorateRemap['foundName']==currentInvalidName].index[0]
            # remap the directorate name
            inputRecord['rootTag']['Award']['Organization']['Directorate']['LongName']=directorateRemap.loc[currentInvalidNameIndex,'fixedName']
    # if the directorate field is empty / missing, check the division field
    except Exception:
        try:
            if inputRecord['rootTag']['Award']['Organization']['Division']['LongName'] not in validDirectorateNames:
                # get the current invalid directorate name from the division
                currentInvalidName=inputRecord['rootTag']['Award']['Organization']['Division']['LongName']
                currentInvalidNameIndex=directorateRemap.loc[directorateRemap['foundName']==currentInvalidName].index[0]
                # write the remapped name into the *Directorate* field
                inputRecord['rootTag']['Award']['Organization']['Directorate']['LongName']=directorateRemap.loc[currentInvalidNameIndex,'fixedName']
        except Exception:
            try:
                # last resort: if the division name is empty, store 'None'
                if inputRecord['rootTag']['Award']['Organization']['Division']['LongName'] not in validDirectorateNames:
                    currentInvalidName=inputRecord['rootTag']['Award']['Organization']['Division']['LongName']
                    # BUGFIX: the original called isempty(), which is not
                    # defined in Python (a MATLAB idiom) and always raised,
                    # sending every record down the error-log path instead
                    if currentInvalidName is None or currentInvalidName=='':
                        currentInvalidName='None'
                        # store the placeholder in the Directorate field
                        inputRecord['rootTag']['Award']['Organization']['Directorate']['LongName']=currentInvalidName
                    else:
                        with open(errorLogPath, 'a') as outfile:
                            outfile.write('Directorate remapping failed for '+inputRecord['rootTag']['Award']['AwardID']+'\r')
                            outfile.write(currentInvalidName + ' not found in directorate remap file\r')
            except Exception:
                with open(errorLogPath, 'a') as outfile:
                    outfile.write('Directorate remapping failed for '+inputRecord['rootTag']['Award']['AwardID']+'\r')
                    try:
                        outfile.write(str(inputRecord['rootTag']['Award']['Organization']) + ' cause of error\r')
                    except Exception:
                        pass
    # TODO: implement future record fixes here
    return inputRecord
def reTypeGrantColumns(grantsDF):
    """
    Iterates through columns and retypes the columns in an intentional fashion.

    Columns whose sampled (string) values are all integer-like become int64,
    all float-like become float64, and otherwise they become str; nan values
    are replaced with 0, 0.0, or '' respectively.

    Parameters
    ----------
    grantsDF : pandas.DataFrame
        A dataframe containing grants data from grants.gov, likely derived
        from the function grantXML_to_dictionary.

    Returns
    -------
    grantsDF : pandas.DataFrame
        The same dataframe which was input, with the relevant column types
        (and nan values) changed in place.

    See Also
    --------
    grantXML_to_dictionary : convert the XML data structure from
    https://www.grants.gov/xml-extract.html to a pandas dataframe.
    """
    import numpy as np
    from warnings import warn
    # get the column names
    columnNames=grantsDF.columns
    # set the replacement types, need int64 because money gets big
    # ordering: [int, float, str]
    replacementTypes=[np.int64,np.float64,str]
    # set the values that nan will be replaced by, matching the types above
    replacementValues=[np.int64(0),np.float64(0),'']
    # iterate through the columns and infer the appropriate type
    for iColumns in columnNames:
        # the try keeps one bad column from aborting the whole pass
        try:
            # sample the first 50 entries to keep inference cheap
            currentSample=grantsDF[iColumns].iloc[0:50]
            # TEMPORARILY replace the na values with a string '0' so the
            # string-based checks below are well defined
            currentSample=currentSample.fillna(str('0'))
            # is every sampled value a string?  shouldn't fail unless objects
            strOrNull=np.all(currentSample.apply(lambda x: isinstance(x,str)))
            if strOrNull:
                # int check DOES NOT tolerate a period
                intORNull = np.all(currentSample.apply(lambda x: x.isnumeric()))
                # float check tolerates a SINGLE period
                floatORNull= np.all(currentSample.apply(lambda x: x.replace('.','',1).isnumeric()))
                # pick the replacement index: int first, then float, then str
                if intORNull:
                    typeIndex=0
                elif floatORNull:
                    typeIndex=1
                else:
                    typeIndex=2
                # BUGFIX: assign the filled column back; the original used
                # fillna(..., inplace=True) on grantsDF[iColumns], which acts
                # on a column view and is a silent no-op (with a warning) in
                # modern pandas
                grantsDF[iColumns]=grantsDF[iColumns].fillna(replacementValues[typeIndex])
                # then replace the type
                grantsDF=grantsDF.astype({iColumns : replacementTypes[typeIndex]}, copy=False)
        # in the event that it fails, throw a warning; probably because there
        # were objects in there somewhere
        except Exception:
            warn('Type conversion for column ' + iColumns + ' failed.')
    print(grantsDF.dtypes)
    return grantsDF
def repairFunding_GovGrantsDF(grantsDF):
    """
    Repairs the content of the grantsDF dataframe in accordance with established heuristics.

    NOTE: this function CHANGES the values / content of the grantsDF from the information contained on grants.gov, including
    but not limited to adding data columns, replacing null/empty values, and/or inferring missing values.

    Parameters
    ----------
    grantsDF : pandas.DataFrame
        A dataframe containing grants data from grants.gov.  Must contain the columns
        'AwardCeiling', 'AwardFloor', 'EstimatedTotalProgramFunding', and
        'ExpectedNumberOfAwards', already converted to numeric types.

    Returns
    -------
    grantsDF : pandas.DataFrame
        The same dataframe, modified in place, with funding values repaired per the
        heuristics described in the body comments.

    See Also
    --------
    grantXML_to_dictionary : convert the XML data structure from https://www.grants.gov/xml-extract.html to a pandas dataframe.
    """
    from warnings import warn
    import numpy as np
    import copy
    warn('NOTE: this function CHANGES the values / content of the grantsDF from the information contained on grants.gov, including but not limited to adding data columns, replacing null/empty values, and/or inferring missing values.')
    # fairly complicated heuristics here
    # arbitrary assumption about floor value of grant
    floorThresh=3000
    # here's a presumed sequencing for these columns
    # AwardCeiling  AwardFloor  EstimatedTotalProgramFunding  ExpectedNumberOfAwards
    # EstimatedTotalProgramFunding > AwardCeiling > (AwardFloor == 0 OR AwardFloor > ExpectedNumberOfAwards
    # So imagine an algorithm we apply to these four values for each row:
    # sort the four values; the largest value is assumed to be the program total, and
    # the zero-count determines which heuristic applies below.
    quantColumns=['AwardCeiling','AwardFloor','EstimatedTotalProgramFunding','ExpectedNumberOfAwards']
    correctedCount=0
    for iIndex,iRows in grantsDF.iterrows():
        # get the grant quantification values for this row
        grantQuantificationValues=[iRows[iColumns] for iColumns in quantColumns]
        # get the sorted version of the vector
        grantQuantificationValues_sorted=sorted(grantQuantificationValues)
        # create a vector for the repairs to occur in
        grantQuantificationValues_RE_sorted=copy.deepcopy(grantQuantificationValues)
        # by definition, your grant value total ought to be the largest number
        if not grantQuantificationValues_sorted[3] == grantQuantificationValues[2]:
            # if it's not, move the value in that spot to where the largest number is
            grantQuantificationValues_RE_sorted[list(grantQuantificationValues).index(grantQuantificationValues_sorted[3])]=grantQuantificationValues[2]
            # and switch the spots
            grantQuantificationValues_RE_sorted[2]=grantQuantificationValues_sorted[3]
        # if the largest value is above the threshold, we can do something
        if grantQuantificationValues_RE_sorted[2] > floorThresh:
            # now we need a count of how many zeros there are
            unique, counts = np.unique(grantQuantificationValues_RE_sorted, return_counts=True)
            # 0 may legitimately be absent, so default the count to 0 rather than
            # relying on a bare try/except around a dict lookup
            zeroCount=dict(zip(unique, counts)).get(0, 0)
            # now we get into our case based logic
            # if there are 3 zeros, and the grant value is above the threshold, we can assume that there's 1 grant,
            # and the ceiling is the total value
            if zeroCount == 3:
                grantQuantificationValues_RE_sorted=[grantQuantificationValues_RE_sorted[2],0,grantQuantificationValues_RE_sorted[2],1 ]
            # if you have two zeros, one can't be in the estimated number
            elif zeroCount == 2:
                # if you have a ceiling value of 0 and a floor value that isn't, I have to assume those are flipped
                if grantQuantificationValues_RE_sorted[0]==0 and not grantQuantificationValues_RE_sorted[1]==0:
                    grantQuantificationValues_RE_sorted[0]=grantQuantificationValues_RE_sorted[1]
                    grantQuantificationValues_RE_sorted[1]=0
                # if the grant count is zero, but the ceiling value isn't
                if grantQuantificationValues_RE_sorted[3]==0 and not grantQuantificationValues_RE_sorted[0]==0:
                    # then use that info to estimate the number of awards
                    grantQuantificationValues_RE_sorted[3]=np.floor(np.divide(grantQuantificationValues_RE_sorted[2],grantQuantificationValues_RE_sorted[0])).astype(int)
                # if you have a count but not a ceiling value, do the inverse computation of the first
                elif not grantQuantificationValues_RE_sorted[3]==0 and grantQuantificationValues_RE_sorted[0]==0:
                    grantQuantificationValues_RE_sorted[0]=np.floor(np.divide(grantQuantificationValues_RE_sorted[2],grantQuantificationValues_RE_sorted[3])).astype(int)
                else:
                    # remaining two-zero configurations are left untouched
                    pass
            elif zeroCount == 1:
                # if the zero isn't in the value floor column, we have a problem.
                if not grantQuantificationValues_RE_sorted[1]==0:
                    # if it's in the ceiling, I'm again going to assume you flipped them
                    if grantQuantificationValues_RE_sorted[0]==0 and not grantQuantificationValues_RE_sorted[1]==0:
                        grantQuantificationValues_RE_sorted[0]=grantQuantificationValues_RE_sorted[1]
                        grantQuantificationValues_RE_sorted[1]=0
                    # if the grant count is zero, but the ceiling value isn't
                    elif grantQuantificationValues_RE_sorted[3]==0 and not grantQuantificationValues_RE_sorted[0]==0:
                        # then use that info to estimate the number of awards
                        grantQuantificationValues_RE_sorted[3]=np.floor(np.divide(grantQuantificationValues_RE_sorted[2],grantQuantificationValues_RE_sorted[0])).astype(int)
                    else:
                        # cases seem mostly taken care of
                        pass
            else:
                # zero zeros: record looks internally consistent, nothing to repair
                pass
        if not np.all(np.equal(grantQuantificationValues,grantQuantificationValues_RE_sorted)):
            correctedCount=correctedCount+1
            # BUGFIX: the original used grantsDF[quantColumns].iloc[iIndex]=..., which
            # assigns into a temporary copy (column-list selection always copies), so
            # repairs were silently discarded.  .loc with the row label writes back.
            grantsDF.loc[iIndex, quantColumns]=grantQuantificationValues_RE_sorted
    print(str(correctedCount) + ' grant funding value records repaired')
    return grantsDF
def inferNames_GovGrantsDF(grantsDF):
    """
    Infers agency names for the grantsDF dataframe in accordance with established heuristics.

    NOTE: this function CHANGES the values / content of the grantsDF from the information contained on grants.gov, including
    but not limited to adding data columns, replacing null/empty values, and/or inferring missing values.

    Parameters
    ----------
    grantsDF : pandas.DataFrame
        A dataframe containing grants data from grants.gov.  Must contain the string
        columns 'AgencyName' and 'AgencyCode'.

    Returns
    -------
    grantsDF : pandas.DataFrame
        The same dataframe, modified in place, with an additional 'AgencySubCode'
        column and with empty names/codes filled per the heuristics below.

    See Also
    --------
    reTypeGrantColumns : Iterates through columns and retypes the columns in an intentional fashion.
    repairFunding_GovGrantsDF : Repairs the content of the grantsDF dataframe in accordance with established heuristics.
    """
    import copy
    import pandas as pd
    # silence chained-assignment warnings triggered elsewhere in the pipeline
    pd.options.mode.chained_assignment = None
    # get the column names
    allColumnNameslist=list(grantsDF.columns)
    # add a column for agency sub code, directly after 'AgencyCode'
    try:
        grantsDF.insert(allColumnNameslist.index('AgencyCode')+1,'AgencySubCode', '')
    except ValueError:
        # column already exists (e.g. function re-run on the same dataframe)
        pass
    # set a fill value for null name values
    fillValue='Other'
    correctedCount=0
    for iIndex,iRows in grantsDF.iterrows():
        currAgencyName=iRows['AgencyName']
        currAngencyCode=iRows['AgencyCode']
        currAngencySubCode=''
        inputInfo=[currAgencyName,currAngencyCode,currAngencySubCode]
        # try and split the subcode out now, e.g. 'DOD-ARMY' -> code 'DOD', sub 'ARMY'
        try:
            currAngencySubCode=currAngencyCode.split('-',1)[1]
            currAngencyCode=currAngencyCode.split('-',1)[0]
        except (AttributeError, IndexError):
            # AttributeError: code is NaN (float); IndexError: no dash present
            currAngencySubCode=''
        # go ahead and throw it in; .loc writes back reliably (chained
        # .iloc assignment does not under pandas copy-on-write)
        grantsDF.loc[iIndex,'AgencySubCode']=currAngencySubCode
        # if the agency code is empty we'll try and fix it
        if currAngencyCode == '':
            if not currAgencyName == '':
                # use the capital letters of the name to infer a code,
                # replacing commas with dashes; start with the full agency name
                currAngencyCode=copy.deepcopy(currAgencyName)
                # commas to dashes
                currAngencyCode=currAngencyCode.replace(',','-')
                # drop all non capital, non dash characters
                currAngencyCode=''.join([char for char in currAngencyCode if char.isupper() or char == '-'])
                try:
                    currAngencySubCode=currAngencyCode.split('-',1)[1]
                except IndexError:
                    currAngencySubCode=''
                currAngencyCode=currAngencyCode.split('-',1)[0]
            else:
                # if there's no agency name available, just set both to 'Other'
                currAngencyCode=fillValue
                currAngencySubCode=''
                currAgencyName=fillValue
        # if the name is null set it to the fill value as well
        if currAgencyName == '':
            currAgencyName=fillValue
        # final pass: if the (possibly rebuilt) code still carries a dash, split it
        try:
            currAngencySubCode=currAngencyCode.split('-',1)[1]
            currAngencyCode=currAngencyCode.split('-',1)[0]
        except (AttributeError, IndexError):
            # BUGFIX: the original reset currAngencySubCode to '' here, which
            # clobbered a subcode already extracted above; keep it instead
            pass
        outputInfo=[currAgencyName,currAngencyCode,currAngencySubCode]
        # if there is new information to add, update the record
        if not inputInfo==outputInfo:
            grantsDF.loc[iIndex,'AgencyName'] = outputInfo[0]
            grantsDF.loc[iIndex,'AgencyCode'] = outputInfo[1]
            grantsDF.loc[iIndex,'AgencySubCode'] = outputInfo[2]
            correctedCount =correctedCount + 1
        # dont bother updating if not relevant.
    print(str(correctedCount) + ' grant agency name or code value records altered')
    return grantsDF
def prepareGrantsDF(grantsDF, repair=True):
"""
Resets column types, infers agency names, and repairs grant values
NOTE: this function CHANGES the values / content of the grantsDF from the information contained on grants.gov, including
but not limited to adding data columns, replacing null/empty values, and/or inferring missing values.
Parameters
----------