-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheckDeaTargets.py
175 lines (162 loc) · 5.38 KB
/
checkDeaTargets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# Mark when student test scores met targets on DEA tests. See README
# on github for details.
import glob, os
import numpy as np
import pandas as pd
SUBJECT_CATEGORY = {
'math':[
'Algebra 1',
'Algebra 2',
'Geometry',
'Mathematics',
],
'language':[
'English 1',
'English 2',
'Reading Language Arts',
]
}
SUBJECT_LONG_TO_SHORT = {
'HS Algebra 1': 'Algebra 1',
'HS Algebra 2': 'Algebra 2',
'HS Geometry': 'Geometry',
'HS English 2':'English 2',
'HS English 1':'English 1',
'8 Reading Language Arts':'8th Grade Reading',
'7 Reading Language Arts':'7th Grade Reading',
'9 Mathematics':'9th Grade Math',
'7 Mathematics':'7th Grade Math',
'8 Mathematics':'8th Grade Math',
}
TARGET_SUBJECTS = {
'math':[
'Algebra 1',
'Algebra 2',
'Geometry',
'7th Grade Math',
'8th Grade Math',
'9th Grade Math',
],
'language':[
'English 1',
'English 2',
'7th Grade Reading',
'8th Grade Reading',
],
}
# columns to load from student data files
READ_COLS = [
'Student ID',
'Grade',
'Test Period:Opening Test Date',
'Scale Score',
'Achievement Level',
]
# columns which repeat per test period
SUFFIX_COLS = [
'Grade',
'Test',
'Scale Score',
'Level',
]
TESTS = ['A:20150824','B:20151102','C:20160201']
YEAR = '2016'
CSV_DIR = '/mnt/nasshare/DEA data 2016/'
def processCategory(category):
#
# Read csv files with student data. Each file has its subject in the name.
#
subjectDfs = []
for subject in SUBJECT_CATEGORY[category]:
fname = glob.glob(CSV_DIR+subject+' '+YEAR+'.csv')[0]
subjectDf = pd.read_csv(fname, usecols=READ_COLS, index_col=2)
subjectDf.columns = SUFFIX_COLS
subjectDf.set_index('Test', append=True, inplace=True)
subjectDf['Subject'] = subject
subjectDfs += [subjectDf]
# merge all subjects into one dataframe
allSubjectDf = pd.concat(subjectDfs).swaplevel(0, 1, axis=0)
#
# Denormalize for each test period so all periods occur in each row
#
testDfs = []
for test in TESTS:
suffix = ' '+test.split(':')[0]
testDf = allSubjectDf.loc[test]
testDf.columns = [x+suffix for x in testDf.columns]
testDfs += [testDf]
categoryDf = testDfs[0].join(testDfs[1:], how='outer')
# tbd
# categoryDf = joinedDf.swaplevel(0, 1, axis=0)
#
# load target score tables
#
# File Headers look like:
# Scale Score,Growth points,Target score
#
scoreToTarget = {}
for subject in TARGET_SUBJECTS[category]:
fname = CSV_DIR+'Target score tables '+subject+'.csv'
targetDf = pd.read_csv(fname, usecols=[1,3], header=2)
subScoreToTarget = {}
for row in range(len(targetDf)-1):
ts = targetDf['Target score'][row]
if pd.notnull(ts):
targetScore = ts
# fill in internal values in range for easy lookup, later
for baseScore in range(int(targetDf['Scale Score'][row]),
int(targetDf['Scale Score'][row+1])):
subScoreToTarget[baseScore] = int(targetScore)
scoreToTarget[subject] = subScoreToTarget
#
# add result columns with default values
#
dlen = len(categoryDf)
categoryDf.loc[:,'Base Score'] = np.zeros(dlen)*np.nan
categoryDf.loc[:,'Target Score'] = np.zeros(dlen)*np.nan
categoryDf.loc[:,'Met Target'] = np.zeros(dlen, dtype=bool)
# Look in A, B, or C test period for fields like grade and
# subject. All three instances should be the same but when they
# are not populated if student did not take test during a period.
# And rarely a student might move up a grade level between tests.
def lookupABC(row, field):
prefix = field.title()+' '
result = row[prefix+'A']
if pd.isnull(result):
result = row[prefix+'B']
if pd.isnull(result):
result = row[prefix+'C']
if pd.notnull(row[prefix+'B']):
if result != row[prefix+'B']:
return None
if pd.notnull(row[prefix+'C']):
if result != row[prefix+'C']:
return None
return str(result)
#
# assign result values
#
for id,row in categoryDf.iterrows():
# base score is lower of first two tests
baseScore = np.nanmin((row['Scale Score A'], row['Scale Score B']))
if pd.isnull(baseScore):
print id,'has no test data for first two tests in',category
continue
baseScore = int(baseScore)
grade = lookupABC(row, 'Grade')
if pd.isnull(grade):
print id,'took tests from multiple grade levels in',category
continue
subject = lookupABC(row, 'Subject')
subject = SUBJECT_LONG_TO_SHORT[grade+' '+subject]
# find target in lookup table
targetScore = scoreToTarget[subject][baseScore]
# store results for this student
categoryDf.loc[id,'Base Score'] = baseScore
categoryDf.loc[id,'Target Score'] = targetScore
if targetScore <= np.nanmax((row['Scale Score B'], row['Scale Score C'])):
categoryDf.loc[id,'Met Target'] = True
# save results
categoryDf.to_csv(category+'.csv')
for category in ['math','language']:
processCategory(category)