-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathrepro_diags.py
185 lines (158 loc) · 7.03 KB
/
repro_diags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import pandas as pd
import re
import os
# Root folder: the per-user input folders live below it and every
# <user>_diagram.txt is written directly into it.
addPath = 'C:\\Users\\joeld\\Downloads\\'
# Input folders, in order: <user>_data_sources.csv files, <user>_analysis_data.csv
# files and <user>_code_desc.csv files.
data_sources_path, analysis_data_path, code_desc_path = (
    addPath + 'files\\' + folder + '\\' for folder in ('Q2.2', 'Q2.4', 'Q2.6')
)
# Function to split semicolon separated lists into python list objects
def splitLists(cell):
    """Split a semicolon-separated cell into a list of strings.

    Whitespace around every semicolon and at both ends of the cell is
    discarded, so each item compares equal to the same file name written
    elsewhere without padding.

    Missing values (anything ``pd.isna`` reports as missing, e.g. NaN or
    None) are returned unchanged so the caller can still ``dropna`` /
    ``fillna`` them.
    """
    if pd.isna(cell):
        return cell
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.split(r'\s*;\s*', cell.strip())
# Get the list of users, i.e. the identifiers of the individuals we're making
# diagrams for. The same identifier must prefix that person's three files
# (<id>_data_sources.csv, <id>_analysis_data.csv and <id>_code_desc.csv).
# Only directory entries that really end in the data-sources suffix count as
# users, so stray files such as .DS_Store are skipped instead of failing later
# on a missing CSV. Sorting makes the processing order independent of the
# order in which the file system lists the folder.
_sources_suffix = '_data_sources.csv'
users = sorted(
    entry[:-len(_sources_suffix)]
    for entry in os.listdir(data_sources_path)
    if entry.endswith(_sources_suffix)
)
for user in users:
    # Each row of the code-description file names one code script together
    # with semicolon separated lists of the files it reads and writes.
    # NOTE(review): the previous revision carried a commented-out
    # encoding='cp1252' argument for this read; confirm whether it is needed.
    IO = pd.read_csv(code_desc_path+user+'_code_desc.csv',
                     usecols=['file_name','inputs','outputs'])
    # Drop rows that don't produce output
    IO = IO.dropna(axis=0, subset=['outputs'])
    # The rawSources and analyticSources dataframes are used to check which
    # files aren't used by any code scripts
    rawSources = pd.read_csv(data_sources_path+user+'_data_sources.csv')
    analyticSources = pd.read_csv(analysis_data_path+user+'_analysis_data.csv')

    # Make the inputs and outputs columns into python list objects
    IO['inputs'] = IO['inputs'].apply(splitLists)
    IO['outputs'] = IO['outputs'].apply(splitLists)
    IO = IO.fillna('')

    # Write the diagram; the with-block closes the file even if a step fails.
    with open(addPath+user+'_diagram.txt','w') as output:
        # In the odd case there are no code scripts
        if len(IO.index) == 0:
            inputs = []
            outputs = []
        # Otherwise...
        else:
            # Make a row for each (code script, input file, output file)
            IO = IO.explode('inputs')
            IO = IO.explode('outputs')
            # Map each output to the inputs of the scripts that write it.
            # Sorted so the diagram is identical from one run to the next.
            children = (IO.groupby('outputs')['inputs']
                        .apply(lambda x: sorted(set(x)))
                        .to_dict())
            # Map each output to the first code script listed as producing it
            output_makers = (IO.groupby('outputs')['file_name']
                             .apply(lambda x: list(x)[0])
                             .to_dict())
            # Get list of unique outputs
            outputs = IO['outputs'].unique().tolist()
            # Final outputs are the ones no script reads back in; they become
            # the roots of the trees.
            consumed = set(IO['inputs'])
            finalOutputs = [out for out in outputs if out not in consumed]

            def makeDiagram(finalOut, depth, prefix, last, ancestors=()):
                """Write finalOut and, recursively, everything it was built from.

                finalOut  -- the file being traced down the tree
                depth     -- how deep we are in the tree (0 for a root)
                prefix    -- text put before the name on nested lines; it ends
                             in a pipe that connects siblings on the same level
                last      -- True for the last sibling of a level, whose
                             descendants no longer need that connecting pipe
                ancestors -- files already on the path from the root to here;
                             a file that shows up again is written but not
                             expanded, so circular dependencies end the
                             recursion instead of exhausting the stack
                """
                # Nested lines get the prefix and a connector; roots are bare.
                if depth > 0:
                    output.write(prefix+'___'+finalOut+'\n')
                else:
                    output.write(finalOut+'\n')
                # Swap the trailing pipe for a space for the lines below.
                # This comes AFTER writing this level's own line, which still
                # needs the pipe.
                if last:
                    prefix = prefix[0:-1]+' '
                if finalOut in ancestors:
                    return
                # Write the code script that generates the output, if any
                maker = output_makers.get(finalOut)
                if maker is not None:
                    if depth > 0:
                        # Three spaces put the script marker under the first
                        # letter of the output name, four columns left of the
                        # children's pipe (prefix + 4 + 3 below), matching the
                        # layout used at the top level.
                        output.write(prefix+'   |___'+maker+'\n')
                    else:
                        output.write('|___'+maker+'\n')
                # Descend into the files the output was built from
                next_children = children.get(finalOut)
                if next_children:
                    prefix = prefix+' '*4
                    if depth > 0:
                        prefix = prefix+' '*3
                    prefix = prefix+'|'
                    lineage = ancestors+(finalOut,)
                    for child in next_children[:-1]:
                        makeDiagram(child, depth+1, prefix, False, lineage)
                    makeDiagram(next_children[-1], depth+1, prefix, True, lineage)

            # Make the diagram for all of those outputs
            # NOTE(review): the blank-line separator is assumed to follow
            # every tree rather than only the last one.
            for final in finalOutputs:
                makeDiagram(final, 0, '', False)
                output.write('\n'*3)
            # get list of inputs
            inputs = IO['inputs'].unique().tolist()

        # Find unused data sources
        # Turn the semicolon separated data_files column into one row per
        # (data_source, data_file) pair
        rawSources['data_files'] = rawSources['data_files'].apply(splitLists)
        rawSources = (rawSources.data_files.apply(pd.Series)
                      .merge(rawSources['data_source'],right_index=True,left_index=True)
                      .melt(id_vars=['data_source'],value_name='data_file')
                      .drop('variable',axis=1)
                      .dropna())
        rawSourcesList = rawSources['data_file'].unique().tolist()
        # Analytic files already listed as raw sources are reported only once.
        # Built as a new list: removing entries from a list while looping over
        # it skips the entry after each removal.
        rawSourceSet = set(rawSourcesList)
        analyticSourcesList = [
            source for source in analyticSources['analysis_data'].unique().tolist()
            if source not in rawSourceSet
        ]

        # A source counts as used when any script reads or writes it
        referenced = set(inputs) | set(outputs)

        # Write the unused raw sources. The heading text, including its
        # spelling, is kept as is so existing outputs stay comparable.
        notUsed = [source for source in rawSourcesList if source not in referenced]
        output.write('Unusued data sources:\n')
        for source in notUsed:
            output.write(str(source)+'\n')
        output.write('\n'*3)

        # Write the unused analytic files
        notUsed = [source for source in analyticSourcesList if source not in referenced]
        output.write('Unusued analytic data:\n')
        for source in notUsed:
            output.write(str(source)+'\n')