-
Notifications
You must be signed in to change notification settings - Fork 93
/
result_collection.jl
286 lines (257 loc) · 11.6 KB
/
result_collection.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
export collect_results, collect_results!
"""
    collect_results!([filename,] folder; kwargs...) -> df

!!! note "Requires `DataFrames`"
    The function `collect_results!` is only available if you do
    `using DataFrames` in your Julia session.

Search the `folder` (and possibly all subfolders) for new result-files and add
them to `df` which is a `DataFrame` containing all the information from
each result-file.

If a result-file is missing keys that are already columns in `df`,
they will be set as `missing`. If on the other hand new keys are encountered,
a new column will be added and filled with `missing` for all previous entries.

If no file exists in `filename`, then `df` will be saved there. If however
`filename` exists, the existing `df` will be first loaded and then reused.
The reused `df` has some results already collected: files already
included in `df` are skipped in subsequent calls to `collect_results!` while
new result-files are simply appended to the dataframe.

`filename` defaults to:
```julia
filename = joinpath(dirname(folder), "results_\$(basename(folder)).jld2")
```
(a trailing path separator in `folder` is stripped first, so that
`basename` is never empty).

See also [`collect_results`](@ref).

!!! warning "Don't use `:path` as a parameter name."
    `df` contains a column `:path` which is the path where each result-file
    is saved to. This is used to not reload and reprocess files already
    present in `df` when searching for new ones.

## Keyword Arguments
* `subfolders::Bool = false` : If `true` also scan all subfolders of `folder`
  for result-files.
* `valid_filetypes = [".bson", ".jld", ".jld2"]`: Only files that have these
  endings are interpreted as result-files. Other files are skipped.
* `rpath = nothing` : If not `nothing`, then it must be a path to a folder. The `path`
  column of the result-files is then `relpath(file, rpath)`, instead of the absolute
  path, which is used by default.
* `verbose = true` : Print (using `@info`) information about the process.
* `update = false` : Update data from modified files and remove entries for deleted
  files.
* `rinclude = [r""]` : Only include files whose name matches any of these Regex
  expressions. Default value includes all files.
* `rexclude = [r"^\\b\$"]` : Exclude any files whose name matches any of these Regex
  expressions. Default value does not exclude any files.
* `white_list` : List of keys to use from result file. By default
  uses all keys from all loaded result-files.
* `black_list = [:gitcommit, :gitpatch, :script]`: List of keys not to include
  from result-file.
* `special_list = []`: List of additional (derived) key-value pairs
  to put in `df` as explained below.
* `load_function = wload`: Load function. Defaults to `wload`. You may want to
  specify a custom load function for example if you store results as a struct
  and you want the fields of the struct to form the columns of the dataframe.
  The struct is saved to file as a one-element dictionary so the dataframe will
  only have a single column. To work around this you could convert it to a
  dictionary by specifying
  `load_function = (filename) -> struct2dict(wload(filename)["mykey"])`.
  This way `collect_results` will receive a `Dict` whose keys are the fields
  of the struct.

`special_list` is a `Vector` where each entry
is a derived quantity to be included in `df`. There are two types of entries.
The first option is of the form `key => func` where the `key` is a symbol
to be used as column name in the DataFrame. The function entry always
takes a single argument, which is the loaded result-file (a dictionary).
The second option is to provide just one function `func`. This function
also takes the single dictionary argument but returns one or more
`key => value` pairs. This second notation may be useful when one wants
to extract values for multiple columns in a single step.

As an example consider that each result-file
contains a field `:longvector` too large to be included in the `df`.
The quantity of interest is the mean and the variance of said field.
To have these values in your results first use `black_list = [:longvector]`
and then define

    special_list = [ :lv_mean => data -> mean(data[:longvector]),
                     :lv_var  => data -> var(data[:longvector]) ]

In case this operation fails the values will be treated as `missing`.
"""
function collect_results!(folder; kwargs...)
    # Default collection file lives next to `folder`. Strip a trailing '/'
    # BEFORE taking `basename`: `basename("a/b/")` is `""` in Julia, so the
    # other order would produce "results_.jld2".
    stripped = rstrip(folder, '/')
    default_filename = joinpath(dirname(stripped), "results_$(basename(stripped)).jld2")
    return collect_results!(default_filename, folder; kwargs...)
end
"""
    InvalidResultsCollection(msg)

Error thrown when an existing results collection cannot be used or updated,
e.g. because it lacks the recorded file modification times required for
`update = true`, or because an `mtime` entry for a known file is missing.
"""
struct InvalidResultsCollection <: Exception
    # Concrete field type (any AbstractString argument is converted on construction);
    # avoids an abstractly-typed field in a concrete struct.
    msg::String
end
Base.showerror(io::IO, e::InvalidResultsCollection) = print(io, e.msg)
function collect_results!(filename, folder;
        valid_filetypes = [".bson", ".jld", ".jld2"],
        subfolders = false,
        rpath = nothing,
        verbose = true,
        update = false,
        newfile = false, # keyword only for defining collect_results without !
        rinclude = [r""],
        rexclude = [r"^\b$"],
        load_function = wload,
        kwargs...)

    @assert all(eltype(r) <: Regex for r in (rinclude, rexclude)) "Elements of `rinclude` and `rexclude` must be Regex expressions."

    # Either start a fresh collection, or load the one stored in `filename`.
    if newfile || !isfile(filename)
        !newfile && verbose && @info "Starting a new result collection..."
        df = DataFrames.DataFrame()
        mtimes = Dict{String,Float64}()
    else
        verbose && @info "Loading existing result collection..."
        data = load_function(filename)
        df = data["df"]
        # Check if we have pre-recorded mtimes
        # (if not this could be because of an old results database).
        if "mtime" ∈ keys(data)
            mtimes = data["mtime"]
        else
            if update
                throw(InvalidResultsCollection("update of existing results collection requested, but no previously recorded modification time found. Likely the existing results collection was produced with an old version of DrWatson. Recomputing the collection solves this problem."))
            end
            # Old-style collection without mtimes: keep it that way (`nothing`).
            mtimes = nothing
        end
    end

    verbose && @info "Scanning folder $folder for result files."
    if subfolders
        allfiles = String[]
        for (root, dirs, files) in walkdir(folder)
            for file in files
                push!(allfiles, joinpath(root, file))
            end
        end
    else
        allfiles = joinpath.(Ref(folder), readdir(folder))
    end

    # Apply the include/exclude regex filters; skipped entirely when both
    # are at their (match-everything / match-nothing) defaults.
    if (rinclude == [r""] && rexclude == [r"^\b$"]) == false
        idx_filt = Int[]
        for i in eachindex(allfiles)
            file = allfiles[i]
            include_bool = any(match(rgx, file) !== nothing for rgx in rinclude)
            exclude_bool = any(match(rgx, file) !== nothing for rgx in rexclude)
            if include_bool == false || exclude_bool == true
                push!(idx_filt, i)
            end
        end
        deleteat!(allfiles, idx_filt)
    end

    n = 0 # new entries added
    u = 0 # entries updated
    existing_files = "path" in string.(names(df)) ? df[:, :path] : ()
    for file ∈ allfiles
        is_valid_file(file, valid_filetypes) || continue
        # maybe use relative path
        file = rpath === nothing ? file : relpath(file, rpath)
        mtime_file = mtime(file)
        replace_entry = false
        # already added?
        if file ∈ existing_files
            if !update
                continue
            end
            # Error if file is not in the mtimes database
            # (under `update = true`, `mtimes` is guaranteed to be a Dict here).
            if file ∉ keys(mtimes)
                throw(InvalidResultsCollection("existing results collection is corrupt: no `mtime` entry for file $(file) found."))
            end
            # Skip if mtime is the same as the one previously recorded
            if mtimes[file] == mtime_file
                continue
            end
            replace_entry = true
        end
        # Now update the mtime of the new or modified file.
        # `mtimes === nothing` for old-style collections; skip recording then,
        # otherwise `setindex!(nothing, ...)` would throw a MethodError.
        if !isnothing(mtimes)
            mtimes[file] = mtime_file
        end
        fpath = rpath === nothing ? file : joinpath(rpath, file)
        df_new = to_data_row(FileIO.query(fpath); load_function=load_function, kwargs...)
        # add filename
        df_new[!, :path] .= file
        if replace_entry
            # Delete the row with the old data
            delete!(df, findfirst((x)->(x.path == file), eachrow(df)))
            u += 1
        else
            n += 1
        end
        df = merge_dataframes!(df, df_new)
    end

    if update
        # Delete entries with nonexisting files.
        idx = findall((x)->(!isfile(x.path)), eachrow(df))
        deleteat!(df, idx)
        verbose && @info "Added $n entries. Updated $u entries. Deleted $(length(idx)) entries."
    else
        verbose && @info "Added $n entries."
    end

    if !newfile
        data = Dict{String,Any}("df" => df)
        # mtimes is only `nothing` if we are working with an older collection.
        # We want to keep it that way, so do not try to create mtimes entry.
        if !isnothing(mtimes)
            data["mtime"] = mtimes
        end
        wsave(filename, data)
    end
    return df
end
"""
    merge_dataframes!(df, df_new) -> merged_df

Vertically concatenate `df` and `df_new`, reconciling their columns first.
If both dataframes have exactly the same column names, the result is a plain
`vcat`. Otherwise, columns missing from `df_new` are added to it filled with
`missing`, columns missing from `df` are appended to it filled with
`missing`, and then the two are concatenated.
"""
function merge_dataframes!(df1, df2)
    # Fast path: identical column sets (order-insensitive) concatenate directly.
    if sort!(names(df1)) == sort!(names(df2))
        return vcat(df1, df2)
    end
    # Columns df2 lacks: broadcast a `missing` value down each new column.
    for col ∈ setdiff(names(df1), names(df2))
        df2[!, col] .= [missing]
    end
    # Columns df1 lacks: append a fully-missing column at the end.
    # The insertion index is recomputed every pass because df1 grows.
    for col ∈ setdiff(names(df2), names(df1))
        DataFrames.insertcols!(df1, length(names(df1)) + 1, col => fill(missing, size(df1, 1)))
    end
    return vcat(df1, df2)
end
# Return `true` when `file` ends with one of the endings in `valid_filetypes`.
function is_valid_file(file, valid_filetypes)
    return any(ending -> endswith(file, ending), valid_filetypes)
end
# Generic fallback: load the result file with `load_function` (wload per
# default, when nothing more specific is available) and convert it to a row.
function to_data_row(file::File; load_function=wload, kwargs...)
    fpath = filename(file)
    @debug "Opening $(filename(file)) with fallback wload."
    data = load_function(fpath)
    return to_data_row(data, fpath; kwargs...)
end
# Specialization for JLD2 files: jldopen permits much faster mmapped access.
function to_data_row(file::File{format"JLD2"}; load_function=(fn) -> JLD2.jldopen(fn, "r"), kwargs...)
    # (lambda arg renamed from `filename` so it no longer shadows FileIO.filename)
    fpath = filename(file)
    @debug "Opening $(filename(file)) with jldopen."
    return to_data_row(load_function(fpath), fpath; kwargs...)
end
# Convert one loaded result `data` (a dictionary-like object) into a
# single-row DataFrame. Keys in `white_list` minus `black_list` become
# columns; `special_list` adds derived columns (see `collect_results!` docs).
function to_data_row(data, file;
        white_list = collect(keys(data)),
        black_list = keytype(data).((:gitcommit, :gitpatch, :script)),
        special_list = [])

    # Use the non-mutating `setdiff`: `setdiff!` would destructively modify a
    # caller-supplied `white_list` vector.
    cnames = setdiff(white_list, black_list)
    entries = Pair{Symbol,Any}[]
    # Each value is wrapped in a one-element vector to form a one-row column.
    append!(entries, Symbol.(cnames) .=> (x -> [x]).(getindex.(Ref(data), cnames)))
    # Add special (derived) quantities here.
    for elem in special_list
        try
            if elem isa Pair
                # `key => func` form: one derived column.
                push!(entries, first(elem) => last(elem)(data))
            elseif elem isa Function
                res = elem(data)
                if res isa Pair
                    # Use push! if a single key value pair is returned
                    push!(entries, res)
                else
                    # Use append! if a vector of pairs is returned
                    append!(entries, res)
                end
            end
        catch e
            # Best effort by design: a failing special entry is reported,
            # not fatal — the value is simply absent (treated as missing).
            @warn "While applying $(string(elem)) to file "*
            "$(file), got error $e."
        end
    end
    return DataFrames.DataFrame(entries...)
end
"""
    collect_results(folder; kwargs...) -> df

Identical to [`collect_results!`](@ref), except that no existing dataframe is
loaded beforehand and the resulting one is not saved afterwards. All found
result files are therefore processed.
"""
function collect_results(folder; kwargs...)
    return collect_results!("", folder; newfile = true, kwargs...)
end