-
Notifications
You must be signed in to change notification settings - Fork 2
/
fabfile.py
executable file
·3444 lines (2986 loc) · 143 KB
/
fabfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import csvkit
import fabric
from fabric.api import *
from fabric.operations import *
import collections
import pandas as pd
import datetime
from canonical.canonical import *
import time
import re
from twitter import *
import requests
fabric.state.output.status = False
def getConfig():
"""
Cache list of config values.
"""
with open("../config.txt", "rb") as w:
c = w.readlines()
c = [linebr.replace("\n", "") for linebr in c]
return c
configlist = getConfig()
THISPATH = configlist[5]
def validDate(datestring):
"""
Check for known bad dates, invalid dates, dates in the future
"""
try:
return GARBAGE_DATES[datestring]
except:
try:
# is it a valid date?
x = datetime.datetime.strptime(datestring, '%Y-%m-%d')
# is the date before today?
if x < datetime.datetime.now():
return datestring
else:
return "broke"
except:
return "broke"
def getFloat(i):
"""
Return a float or 0.0
"""
if not i or i == "":
return "0.0"
else:
return str(float(i))
def lookItUp(input, param, namefield):
"""
Check if a record exists in canonical donors lookup dict
"""
try:
return str(CANON[input][param])
except:
if param == "canonicalid":
return input
else:
return namefield
def canonFlag(input):
"""
Temporary workaround to display front-page canonical records
"""
try:
x = CANON[input]
return "I"
except:
return ""
def canonOffice(rawstring, param):
"""
Hit the office canonical dict to standardize office names
"""
try:
x = CANON_OFFICE[rawstring][param]
return x
except:
return ""
def getDate():
"""
Parse the "last updated" date from a file in the NADC data dump
"""
q = open(THISPATH + "nadc_data/last_updated.py", "wb")
with open(THISPATH + "nadc_data/DATE_UPDATED.TXT", "rb") as d:
last_updated = d.readline().split(": ")[1].split(" ")[0].split("-")
year = last_updated[0]
month = last_updated[1].lstrip("0")
day = last_updated[2].lstrip("0")
q.write("import datetime\n\nLAST_UPDATED = datetime.date(" + year + ", " + month + ", " + day + ")")
q.close()
def parseErrything():
"""
Kicks out ready-to-upload data files:
toupload/entity.txt
toupload/candidate.txt
toupload/donation.txt
toupload/loan.txt
toupload/expenditure.txt
toupload/misc.txt
Forms we care about:
A1: Most committees
A1CAND: Candidates
B1: Campaign statements for candidate or ballot question committees
B1AB: Main donations table
B1C: Loans to candidate or ballot question committees
B1D: Expenditures by candidate or ballot question committees
B2: Campaign statements for political party committees
B2A: Contributions to candidate or ballot question committees
B2B: Expenditures by party committees
B4: Campaign statements for independent committees
B4A: Donations to independent committees
B4B1: Expenditures by independent committees
B4B2: Federal and Out of State Disbursements
B4B3: Administrative/Operating Disbursements
B5: Late contributions
B6: Reports of an independent expenditure or donation made by people or entities that are not registered as committees
B6CONT: Contributions to committees by people who do not have an ID
B6EXPEND: Expenditures made on behalf of committees by people who do not have an ID
B7: Registration of corporations, unions and other associations
B72: Direct contributions by corporations, unions and other associations
B73: Indirect contributions by corporations, unions and other associations
B9: Out of state expenditures/donations
Assumptions:
A "direct expenditure" or "cash disbursement" to a candidate or registered committee is equivalent to a donation and will be treated as such.
"""
delim = "|"
id_master_list = []
rows_with_new_bad_dates = []
counter = 0
entities = open(THISPATH + "nadc_data/toupload/entity_raw.txt", "wb")
candidates = open(THISPATH + "nadc_data/toupload/candidate.txt", "wb")
donations = open(THISPATH + "nadc_data/toupload/donations_raw.txt", "wb")
loans = open(THISPATH + "nadc_data/toupload/loan.txt", "wb")
expenditures = open(THISPATH + "nadc_data/toupload/expenditure_raw.txt", "wb")
firehose = open(THISPATH + "nadc_data/toupload/firehose.txt", "wb")
misc = open(THISPATH + "nadc_data/toupload/misc.txt", "wb")
#write headers to files that get deduped by pandas or whatever
donations_headers = [
"db_id",
"cash",
"inkind",
"pledge",
"inkind_desc",
"donation_date",
"donor_id",
"recipient_id",
"donation_year",
"notes",
"stance",
"donor_name",
"source_table"
]
donations.write("|".join(donations_headers) + "\n")
entities_headers = [
"nadcid",
"name",
"address",
"city",
"state",
"zip",
"entity_type",
"notes",
"employer",
"occupation",
"place_of_business",
"dissolved_date",
"date_we_care_about"
]
entities.write("|".join(entities_headers) + "\n")
print "\nPARSING RAW FILES"
with open('forma1.txt', 'rb') as a1:
"""
FormA1: Top-level table for committees. Supposed to include all committees in FormB1, FormB4, FormB2 reports, but we are going to be paranoid and not assume this is the case.
Data is fed to Entity and Candidate tables. (We're treating ballot questions as candidates.)
COLUMNS
=======
0: Committee ID Number
1: Committee Name
2: Committee Address
3: Committee City
4: Committee State
5: Committee Zip
6: Committee Type
7: Date Received
8: Postmark Date
9: Nature Of Filing
10: Ballot Question
11: Oppose Ballot Question
12: Ballot Type
13: Date over Theshold
14: Acronym
15: Separate Seg Political Fund ID
16: Separate Segregated Political Fund Name
17: SSPF Address
18: SSPF City
19: SSPF State
20: SSPF Zip
21: SSPF Type
22: Date Dissolved
23: Date of Next Election
24: Election Type
25: Won General
26: Won Primary
"""
print " forma1 ..."
a1reader = csvkit.reader(a1, delimiter = delim)
a1reader.next()
for row in a1reader:
a1_entity_id = row[0] #NADC ID
if a1_entity_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(a1_entity_id)
#Add to Entity
a1_entity_name = ' '.join((row[1].upper().strip()).split()).replace('"',"")
a1_entity_notes = ""
#Committee name
a1_address = row[2] #Address
a1_city = row[3] #City
a1_state = row[4] #State
a1_zip = row[5] #ZIP
#a1_entity_type = row[6].strip().upper() #Committee type
a1_entity_type = canonFlag(a1_entity_id) # canonical flag
a1_entity_dissolved = row[21] #Date dissolved
a1_entity_date_of_thing_happening = row[7] #Date used to eval recency on dedupe
"""
DB fields
===========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding a1_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
a1_entity_list = [
a1_entity_id,
a1_entity_name,
a1_address.upper(),
a1_city.upper(),
a1_state.upper(),
a1_zip,
a1_entity_type,
a1_entity_notes,
"",
"",
"",
a1_entity_dissolved,
a1_entity_date_of_thing_happening,
]
entities.write("|".join(a1_entity_list) + "\n")
#is there a separate segregated political fund?
if row[15] and row[15].strip() != "":
a1_sspf_id = row[15] #NADC ID
if a1_sspf_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(a1_sspf_id)
#Add to Entity
a1_sspf_name = row[16] #Committee name
a1_sspf_address = row[17] #Address
a1_sspf_city = row[18] #City
a1_sspf_state = row[19] #State
a1_sspf_zip = row[20] #ZIP
#a1_sspf_type = row[21] #Committee type
a1_sspf_type = canonFlag(a1_sspf_id) # canonical flag
a1_sspf_entity_date_of_thing_happening = row[7] #Date used to eval recency on dedupe
a1_sspf_list = [
a1_sspf_id,
a1_sspf_name,
a1_sspf_address,
a1_sspf_city,
a1_sspf_state,
a1_sspf_zip,
a1_sspf_type,
"",
"",
"",
"",
a1_sspf_entity_date_of_thing_happening,
]
entities.write("|".join(a1_sspf_list) + "\n")
#is this a ballot question?
if row[6].upper().strip() == "B":
a1_nadc_id = row[0]
if a1_nadc_id not in GARBAGE_COMMITTEES:
a1_entity_name = ' '.join((row[1].upper().strip()).split()).replace('"',"") #Committee name
a1_ballot = ' '.join((row[10].upper().strip()).split()).replace('"',"")
a1_ballot_type = row[12].upper().strip() #(I=Initiative, R=Recall, F=Referendum, C=Constitutional Amendment)
a1_ballot_stance = row[11].strip() #(0=Support, 1=Oppose)
#Unpack lookup to replace known bad strings
for item in GARBAGE_STRINGS:
a1_entity_name = a1_entity_name.upper().strip().replace(*item)
ballot_types = {
"I": "Initiative",
"R": "Recall",
"F": "Referendum",
"C": "Constitutional Amendment",
"O": "Other"
}
def ballotType(str):
try:
return ballot_types[str]
except:
return "Uncategorized"
"""
DB fields
=========
cand_id, cand_name, committee_id, office_dist, office_govt, office_title, stance, donor_id, notes, db_id (""), govslug
"""
a1_ballot_cand_list = [
"BQCAND" + a1_nadc_id,
a1_entity_name,
a1_nadc_id,
"",
"Ballot question",
ballotType(a1_ballot_type),
a1_ballot_stance,
"",
a1_ballot,
"",
""
]
candidates.write("|".join(a1_ballot_cand_list) + "\n")
with open('forma1cand.txt', 'rb') as a1cand:
"""
FormA1CAND: Candidates connected to committees
Data is fed to Candidate table
COLUMNS
=======
0: Form A1 ID Number
1: Date Received
2: Candidate ID
3: Candidate Last Name
4: Candidate First Name
5: Candidate Middle Initial
6: Support/Oppose
7: Office Sought
8: Office Title
9: Office Description
"""
print " forma1cand ..."
a1candreader = csvkit.reader(a1cand, delimiter = delim)
a1candreader.next()
for row in a1candreader:
a1cand_id = row[2] #Candidate ID
a1cand_committee_id = row[0] #Candidate Committee ID
if a1cand_committee_id not in GARBAGE_COMMITTEES:
id_master_list.append(a1cand_committee_id)
#Add to Entity
a1cand_entity_name = ""
a1cand_address = ""
a1cand_city = ""
a1cand_state = ""
a1cand_zip = ""
#a1cand_entity_type = ""
a1cand_entity_type = canonFlag(a1cand_committee_id) # canonical flag
a1cand_entity_dissolved = ""
a1cand_entity_date_of_thing_happening = row[1] #Date used to eval recency on dedupe
"""
DB fields
===========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding a1cand_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
a1cand_entity_list = [
a1cand_committee_id,
a1cand_entity_name,
a1cand_address,
a1cand_city,
a1cand_state,
a1cand_zip,
a1cand_entity_type,
"",
"",
"",
"",
a1cand_entity_dissolved,
a1cand_entity_date_of_thing_happening,
]
entities.write("|".join(a1cand_entity_list) + "\n")
if a1cand_committee_id not in GARBAGE_COMMITTEES and a1cand_id not in GARBAGE_COMMITTEES:
#Append to Candidate
a1cand_cand_last = row[3] #Last name
a1cand_cand_first = row[4] #First name
a1cand_cand_mid = row[5] #Middle initial
a1cand_cand_full_name = " ".join([a1cand_cand_first, a1cand_cand_mid, a1cand_cand_last]) #Full name
a1cand_cand_full_name = " ".join((a1cand_cand_full_name.upper().strip()).split()).replace('"',"")
a1cand_stance = row[6] #Does committee support or oppose candidate? 0 = support, 1 = oppose
a1cand_office_sought = " ".join((row[7].upper().strip()).split()).replace('"',"") #Office sought
a1cand_office_title = " ".join((row[8].upper().strip()).split()).replace('"',"") #Office title
a1cand_office_desc = " ".join((row[9].upper().strip()).split()).replace('"',"") #Office description
a1cand_office_string = " ".join([a1cand_office_desc, a1cand_office_sought, a1cand_office_title])
a1cand_office = canonOffice(a1cand_office_string, "office")
a1cand_gov = canonOffice(a1cand_office_string, "gov")
a1cand_dist = canonOffice(a1cand_office_string, "district")
#Fixing a couple of weird edge cases
for item in STANDARD_CANDIDATES:
a1cand_cand_full_name = a1cand_cand_full_name.upper().replace(*item).strip()
a1cand_id = a1cand_id.upper().replace(*item).strip()
"""
DB fields
=========
cand_id, cand_name, committee_id, office_dist, office_govt, office_title, stance, donor_id, notes, db_id (""), govslug
"""
a1cand_list = [
a1cand_id,
a1cand_cand_full_name,
a1cand_committee_id,
a1cand_dist,
a1cand_gov,
a1cand_office,
a1cand_stance,
"",
"",
"",
"",
]
candidates.write("|".join(a1cand_list) + "\n")
with open('formb1.txt', 'rb') as b1:
"""
FormB1: Campaign statements for candidate or ballot question committees
Data is added to Entity
COLUMNS
=======
0: Committee Name
1: Committee Address
2: Committee Type
3: Committee City
4: Committee State
5: Committee Zip
6: Committee ID Number
7: Date Last Revised
8: Last Revised By
9: Date Received
10: Postmark Date
11: Microfilm Number
12: Election Date
13: Type of Filing
14: Nature of Filing
15: Additional Ballot Question
16: Report Start Date
17: Report End Date
18: Field 1
19: Field 2A
20: Field 2B
21: Field 2C
22: Field 3
23: Field 4A
24: Field 4B
25: Field 5
26: Field 6
27: Field 7A
28: Field 7B
29: Field 7C
30: Field 7D
31: Field 8A
32: Field 8B
33: Field 8C
34: Field 8D
35: Field 9
36: Field 10
37: Field 11
38: Field 12
39: Field 13
40: Field 14
41: Field 15
42: Field 16
43: Field 17
44: Field 18
45: Field 19
46: Field 20
47: Field 21
48: Field 22
49: Field 23
50: Field 23
51: Field 24
52: Field 25
53: Field 26
54: Field 27
55: Adjustment
56: Total Unitemized Bills
57: Total Unpaid Bills
58: Total All Bills
"""
print " formb1 ..."
b1reader = csvkit.reader(b1, delimiter = delim)
b1reader.next()
for row in b1reader:
b1_entity_id = row[6]
if b1_entity_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(b1_entity_id)
#Add to Entity
b1_entity_name = ' '.join((row[0].upper().strip()).split()).replace('"',"") #Committee name
b1_address = ' '.join((row[1].upper().strip()).split()).replace('"',"") #Address
b1_city = ' '.join((row[3].upper().strip()).split()).replace('"',"") #City
b1_state = ' '.join((row[4].upper().strip()).split()).replace('"',"") #State
b1_zip = row[5].strip() #ZIP
#b1_entity_type = row[2].strip().upper() #Committee type
b1_entity_type = canonFlag(b1_entity_id) # canonical flag
b1_entity_date_of_thing_happening = row[9] #Date used to eval recency on dedupe
"""
DB fields
========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding b1_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
b1_entity_list = [
b1_entity_id,
b1_entity_name,
b1_address,
b1_city,
b1_state,
b1_zip,
b1_entity_type,
"",
"",
"",
"",
"",
b1_entity_date_of_thing_happening,
]
entities.write("|".join(b1_entity_list) + "\n")
with open('formb1ab.txt', 'rb') as b1ab:
"""
FormB1AB: Main donations table
Data is added to Entity and Donation tables
COLUMNS
=======
0: Committee Name
1: Committee ID
2: Date Received
3: Type of Contributor
4: Contributor ID
5: Contribution Date
6: Cash Contribution
7: In-Kind Contribution
8: Unpaid Pledges
9: Contributor Last Name
10: Contributor First Name
11: Contributor Middle Initial
12: Contributor Organization Name
13: Contributor Address
14: Contributor City
15: Contributor State
16: Contributor Zipcode
"""
print " formb1ab ..."
b1abreader = csvkit.reader(b1ab, delimiter = delim)
b1abreader.next()
for row in b1abreader:
b1ab_committee_id = row[1]
b1ab_contributor_id = row[4]
if b1ab_committee_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(b1ab_committee_id)
#Add committee to Entity
b1ab_committee_name = ' '.join((row[0].upper().strip()).split()).replace('"',"") #Committee name
b1ab_committee_address = "" #Address
b1ab_committee_city = "" #City
b1ab_committee_state = "" #State
b1ab_committee_zip = "" #ZIP
#b1ab_committee_type = "" #Committee type
b1ab_committee_type = canonFlag(b1ab_committee_id) # canonical flag
b1ab_entity_date_of_thing_happening = row[2] #Date used to eval recency on dedupe
"""
DB fields
===========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding b1ab_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
b1ab_committee_list = [
b1ab_committee_id,
b1ab_committee_name,
b1ab_committee_address,
b1ab_committee_city,
b1ab_committee_state,
b1ab_committee_zip,
b1ab_committee_type,
"",
"",
"",
"",
"",
b1ab_entity_date_of_thing_happening,
]
entities.write("|".join(b1ab_committee_list) + "\n")
if b1ab_contributor_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(b1ab_contributor_id)
#Add contributor to Entity
b1ab_contributor_last = row[9] #Contributor last name
b1ab_contributor_first = row[10] #Contributor first name
b1ab_contributor_mid = row[11] #Contributor middle name
b1ab_contributor_org_name = row[12] #Contributor org name
b1ab_concat_name = " ".join([b1ab_contributor_first, b1ab_contributor_mid, b1ab_contributor_last, b1ab_contributor_org_name])
b1ab_contributor_name = ' '.join((b1ab_concat_name.upper().strip()).split()).replace('"',"") #Contributor name
b1ab_contributor_address = row[13].upper().strip() #Address
b1ab_contributor_city = row[14].upper().strip() #City
b1ab_contributor_state = row[15].upper().strip() #State
b1ab_contributor_zip = row[16] #ZIP
#b1ab_contributor_type = row[3].upper().strip() #Contributor type
b1ab_contributor_type = canonFlag(b1ab_contributor_id) # canonical flag
b1ab_entity_date_of_thing_happening = row[2] #Date used to eval recency on dedupe
"""
DB fields
=========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding b1ab_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
b1ab_contributor_list = [
b1ab_contributor_id,
b1ab_contributor_name,
b1ab_contributor_address,
b1ab_contributor_city,
b1ab_contributor_state,
b1ab_contributor_zip,
b1ab_contributor_type,
"",
"",
"",
"",
"",
b1ab_entity_date_of_thing_happening,
]
entities.write("|".join(b1ab_contributor_list) + "\n")
#Womp into donations
if b1ab_contributor_id not in GARBAGE_COMMITTEES and b1ab_committee_id not in GARBAGE_COMMITTEES:
#datetest
b1ab_donation_date = row[5]
b1ab_date_test = validDate(b1ab_donation_date)
if b1ab_date_test == "broke":
b1ab_dict = {}
b1ab_dict["donor_id"] = row[10]
b1ab_dict["recipient_id"] = row[1]
b1ab_dict["lookup_name"] = ' '.join((row[0].upper().strip()).split()).replace('"',"")
b1ab_dict["source_table"] = "b1ab"
b1ab_dict["destination_table"] = "donation"
b1ab_dict["donation_date"] = b1ab_donation_date
rows_with_new_bad_dates.append(b1ab_dict)
else:
b1ab_year = b1ab_date_test.split("-")[0]
if int(b1ab_year) >= 1999:
b1ab_cash = getFloat(str(row[6])) #cash
b1ab_inkind_amount = getFloat(str(row[7])) #inkind
b1ab_pledge_amount = getFloat(str(row[8])) #pledge
b1ab_inkind_desc = "" #in-kind description
"""
DB fields
=========
db_id, cash, inkind, pledge, inkind_desc, donation_date, donor_id, recipient_id, donation_year, notes, stance, donor_name, source_table
"""
b1ab_donation_list = [
str(counter),
b1ab_cash,
b1ab_inkind_amount,
b1ab_pledge_amount,
b1ab_inkind_desc,
b1ab_date_test,
b1ab_contributor_id,
b1ab_committee_id,
b1ab_year,
"",
"",
"",
"b1ab",
]
donations.write("|".join(b1ab_donation_list) + "\n")
firehose.write("|".join(b1ab_donation_list) + "\n")
counter += 1
with open('formb1c.txt', 'rb') as b1c:
"""
FormB1C: Loans to candidate or ballot question committees
Data is added to Entity and Loan tables
COLUMNS
=======
0: Committee Name
1: Committee ID
2: Date Received
3: Lender Name
4: Lender Address
5: Loan Date
6: Amount Received
7: Amount Repaid
8: Amount Forgiven
9: Paid by 3rd Party
10: Guarantor
"""
print " formb1c ..."
b1creader = csvkit.reader(b1c, delimiter = delim)
b1creader.next()
for row in b1creader:
b1c_committee_id = row[1]
if b1c_committee_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(b1c_committee_id)
#Add committee to Entity
b1c_committee_name = ' '.join((row[0].upper().strip()).split()).replace('"',"") #Committee name
b1c_committee_address = "" #Address
b1c_committee_city = "" #City
b1c_committee_state = "" #State
b1c_committee_zip = "" #ZIP
#b1c_committee_type = "" #Committee type
b1c_committee_type = canonFlag(b1c_committee_id) # canonical flag
b1c_entity_date_of_thing_happening = row[2] #Date used to eval recency on dedupe
"""
DB fields
=========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding b1c_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
b1c_committee_list = [
b1c_committee_id,
b1c_committee_name,
b1c_committee_address,
b1c_committee_city,
b1c_committee_state,
b1c_committee_zip,
b1c_committee_type,
"",
"",
"",
"",
"",
b1c_entity_date_of_thing_happening,
]
entities.write("|".join(b1c_committee_list) + "\n")
#Womp loans into loan table
b1c_lender_name = ' '.join((row[3].strip().upper()).split())
b1c_lender_addr = row[4].upper().strip()
b1c_loan_date = row[5]
b1c_loan_amount = row[6]
b1c_loan_repaid = row[7]
b1c_loan_forgiven = row[8]
b1c_paid_by_third_party = row[9]
b1c_guarantor = row[10]
b1c_loan_date_test = validDate(b1c_loan_date)
if b1c_loan_date_test == "broke":
b1c_dict = {}
b1c_dict["donor_id"] = ""
b1c_dict["recipient_id"] = row[1]
b1c_dict["lookup_name"] = ' '.join((row[0].upper().strip()).split()).replace('"',"")
b1c_dict["source_table"] = "b1c"
b1c_dict["destination_table"] = "loans"
b1c_dict["donation_date"] = b1c_loan_date
rows_with_new_bad_dates.append(b1c_dict)
else:
b1c_year = b1c_loan_date_test.split("-")[0]
if int(b1c_year) >= 1999:
"""
DB fields
========
db_id, lender_name, lender_addr, loan_date, loan_amount, loan_repaid, loan_forgiven, paid_by_third_party, guarantor, committee_id, notes, stance, lending_committee_id
"""
b1c_loan_list = [
"", #DB ID
b1c_lender_name, #lender name
b1c_lender_addr, #lender address
b1c_loan_date_test, #loan date
b1c_loan_amount, #loan amount
b1c_loan_repaid, #amount repaid
b1c_loan_forgiven, #amount forgiven
b1c_paid_by_third_party, #amount covered by 3rd party
b1c_guarantor, #guarantor
b1c_committee_id, #committee ID
"", #notes field
"", #stance field
"", #lending committee ID
]
loans.write("|".join(b1c_loan_list) + "\n")
with open('formb1d.txt', 'rb') as b1d:
"""
FormB1D: Expenditures by candidate or ballot question committees
Data is added to Entity and Expenditure tables
COLUMNS
=======
0: Committee Name
1: Committee ID
2: Date Received
3: Payee Name
4: Payee Address
5: Expenditure Purpose
6: Expenditure Date
7: Amount
8: In-Kind
"""
print " formb1d ..."
b1dreader = csvkit.reader(b1d, delimiter = delim)
b1dreader.next()
for row in b1dreader:
b1d_committee_id = row[1]
if b1d_committee_id not in GARBAGE_COMMITTEES:
#Append ID to master list
id_master_list.append(b1d_committee_id)
#Add committee to Entity
b1d_committee_name = ' '.join((row[0].upper().strip()).split()).replace('"',"") #Committee name
b1d_committee_address = "" #Address
b1d_committee_city = "" #City
b1d_committee_state = "" #State
b1d_committee_zip = "" #ZIP
#b1d_committee_type = "" #Committee type
b1d_committee_type = canonFlag(b1d_committee_id) # canonical flag
b1d_entity_date_of_thing_happening = row[2] #Date used to eval recency on dedupe
"""
DB fields
========
nadcid, name, address, city, state, zip, entity_type, notes, employer, occupation, place_of_business, dissolved_date
We're adding b1d_entity_date_of_thing_happening so that later we can eval for recency on dedupe.
"""
b1d_committee_list = [
b1d_committee_id,
b1d_committee_name,
b1d_committee_address,
b1d_committee_city,
b1d_committee_state,
b1d_committee_zip,
b1d_committee_type,
"",
"",
"",
"",
"",
b1d_entity_date_of_thing_happening,
]
entities.write("|".join(b1d_committee_list) + "\n")
# womp expenditures in there
b1d_exp_date = row[6]
b1d_exp_date_test = validDate(b1d_exp_date)
if b1d_exp_date_test == "broke":
b1d_dict = {}
b1d_dict["donor_id"] = ""
b1d_dict["recipient_id"] = row[1]
b1d_dict["lookup_name"] = ' '.join((row[0].upper().strip()).split()).replace('"',"")
b1d_dict["source_table"] = "b1d"
b1d_dict["destination_table"] = "expenditures"
b1d_dict["donation_date"] = b1d_exp_date
rows_with_new_bad_dates.append(b1d_dict)
else:
b1d_year = b1d_exp_date_test.split("-")[0]
if int(b1d_year) >= 1999:
b1d_payee = ' '.join((row[3].upper().strip()).split()).replace('"',"")
b1d_address = ' '.join((row[4].upper().strip()).split()).replace('"',"")
b1d_exp_purpose = ' '.join((row[5].strip()).split()).replace('"',"")
b1d_amount = getFloat(row[7])
b1d_inkind = getFloat(row[8])
"""
DB fields
=========
db_id (""), payee (name, free text), payee_addr, exp_date, exp_purpose, amount, in_kind, committee_id (doing the expending), stance (support/oppose), notes, payee_committee_id (the payee ID, if exists), committee_exp_name (name of the committee doing the expending), raw_target (free text ID of target ID, will get shunted to candidate or committee ID on save), target_candidate_id, target_committee_id
"""
b1d_exp_list = [
"",
b1d_payee,
b1d_address,
b1d_exp_date_test,
b1d_exp_purpose,
b1d_amount,
b1d_inkind,
b1d_committee_id,
"", #stance
"", #notes
"", #payee committee ID
b1d_committee_name, #name of committee doing the expending,
"", #raw target
"\N", #target candidate ID
"", #target committee ID
]
expenditures.write("|".join(b1d_exp_list) + "\n")
with open('formb2.txt', 'rb') as b2:
"""
FormB2: Campaign statements for political party committees
Data is added to Entity
COLUMNS
=======
0: Committee Name