Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions 'groupby' and 'stats' to, respectively, split a DS by a column values and give DS statistics per column. #1671

Merged
merged 9 commits into from
Feb 22, 2025
1 change: 1 addition & 0 deletions ext/GMTExcelExt/GMTExcelExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ module GMTExcelExt
end
D = GMT.GMTdataset(mat, text_col)
D.colnames = [colnames[inds_r]; colnames[inds_s[1]]]
GMT.set_dsBB!(D)
return D
end

Expand Down
1 change: 1 addition & 0 deletions src/GMT.jl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ export
gunique, sortslicesperm,

Ginnerjoin, Gouterjoin, Gleftjoin, Grightjoin, Gcrossjoin, Gsemijoin, Gantijoin, spatialjoin,
groupby, stats,

lazinfo, lazread, lazwrite, lasread, laswrite,

Expand Down
1 change: 1 addition & 0 deletions src/gdal_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ function helper_read_XLSCSV(dataset::Gdal.AbstractDataset)::GDtype
D[n_layer] = GMTdataset(mat, text_col)
D[n_layer].colnames = [colnames[inds_r]; colnames[inds_s[1]]]
end
set_dsBB!(D)
return (length(D) == 1) ? D[1] : D
end

Expand Down
1 change: 1 addition & 0 deletions src/tables_gmt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ function Tables.getcolumn(D::GMTdataset, name::AbstractString)
Columnames = [join("Column" * "$(n)") for n = 1:size(D,2)]
((i = findfirst(name .== Columnames)) === nothing) && error("Column name - $(name) - not found in this dataset")
end
(i > size(D.data, 2)) && return D.text # It must be the text column
D.data[:,i]
end

Expand Down
75 changes: 73 additions & 2 deletions src/utils_types.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ does not need explicit coordinates to place the text.
mat2ds(mat::Nothing) = mat # Method to simplify life and let call mat2ds on a nothing
mat2ds(mat::GDtype) = mat # Method to simplify life and let call mat2ds on a already GMTdataset
mat2ds(text::Union{AbstractString, Vector{<:AbstractString}}) = text_record(text) # Now we can hide text_record
mat2ds(text::Vector{String}; hdr::String="") = text_record(fill(NaN,length(text),2), text, [hdr])
mat2ds(text::Vector{String}; hdr::String="", kw...) = text_record(fill(NaN,length(text),2), text, [hdr]) # The kw... is only to not error

function mat2ds(mat::AbstractMatrix; hdr=String[], geom=0, kwargs...)::GMTdataset
# Here we are expecting that Any-ness results from columns with DateTime. If not returm 'mat' as is
Expand Down Expand Up @@ -843,8 +843,10 @@ end
Compute the indices that split a vector of datasets into groups. The grouping is done either by a provided
attribute name (`groupby`) or by the Feature_ID attribute. This function is mostly used internally by `zonal_statistics`

### Args
- `D`: A vector of GMTdataset

### Kwargs
- `groupby`: If provided, it must be an attribute name, for example, `groupby="NAME"`. If not provided, we use
the `Feature_ID` attribute that is a unique identifier assigned during an OGR file reading (by the GMT6.5 C lib).
If the `Feature_ID` attribute does not exist, you must use a valid attribute name passed in `groupby`.
Expand Down Expand Up @@ -883,6 +885,75 @@ function splitds(D::Vector{<:GMTdataset}; groupby::String="")
return vv, feature_names
end

# ---------------------------------------------------------------------------------------------------
"""
Dv = groupby(D::GMTdataset, col)::Vector{GMTdataset}

Split a GMTdataset by the unique values of the column selected by `col`.

`col` can be a column name, either as a String or a Symbol, or a column number. In any case, it must point
to a column with integers (normaly a flint (Floating Point Integer)), or a string column. _i.e.,_ the last
column in a GMTdataset.
"""
function groupby(D::GMTdataset, cols::Union{String,Symbol,Int})::Vector{GMTdataset}
n_cols = size(D.data, 2)
colnames = !isempty(D.colnames) ? D.colnames : ["col.$i" for i=1:n_cols]
!isempty(D.text) && length(colnames) == n_cols && push!(colnames, "Text") # 'Text' is the text column generic name
colname::String = isa(cols, Real) ? colnames[cols] : (isa(cols, Symbol) ? string(cols) : cols)
((cn = findfirst(colname .== colnames)) === nothing) && error("Column '$colname' not found in dataset!")
if (cn > n_cols) # It must be the text column
feature_names = unique(D.text)
nf = length(feature_names)
vv = Vector{Vector{Int}}(undef, nf)
for k = 1:nf vv[k] = findall(D.text .== feature_names[k]) end
else
c = D.data[min(50, size(D.data, 1)), cn] # look at the first 50 elements of the picked column
!all(round.(c) .== c) && error("The selected column is not an integer (flint, in fact) column!")
ids = unique(view(D.data, :, cn))
vv = Vector{Vector{Int}}(undef, length(ids))
for k = 1:numel(ids) vv[k] = findall(view(D.data, :, cn) .== ids[k]) end
end
Dv = Vector{GMTdataset}(undef, numel(vv))
for k = 1:numel(vv) # Loop over groups
Dv[k] = mat2ds(D, (vv[k], :))
end
return Dv
end

# ---------------------------------------------------------------------------------------------------
"""
D = stats(D::GMTdataset, cols=0) -> GMTdataset

Return descriptive statistics for a dataset `GMTdataset` where each row represents the statistics for each column.

If `cols` is a column name, either as a String or a Symbol, or a column number, only the statistics
for that column are returned.
"""
function stats(D::GMTdataset, cols::Union{String,Symbol,Int}=0)::GMTdataset
if (cols == 0)
nc = size(D.data, 2)
q25 = Vector{Float64}(undef, nc); q50 = similar(q25); q75 = similar(q25); m = similar(q25); s = similar(q25)
mi, ma = D.bbox[1:2:end], D.bbox[2:2:end]
for k = 1:nc
q25[k] = quantile(view(D,:,k), 0.25)
q50[k] = quantile(view(D,:,k), 0.50)
q75[k] = quantile(view(D,:,k), 0.75)
m[k] = mean(view(D,:,k))
s[k] = std(view(D,:,k))
end
Ds = mat2ds(hcat(mi, m, ma, q25, q50, q75, s))
else
colname::String = isa(cols, Real) ? D.colnames[cols] : (isa(cols, Symbol) ? string(cols) : cols)
((cn = findfirst(colname .== D.colnames)) === nothing) && error("Column '$colname' not found in dataset!")
(cn > size(D.data, 2)) && error("Column '$colname' is the text column!. Must select a numeric column.")
_q25, _q50, _q75, _m, _s = quantile(view(D,:,cn), 0.25), quantile(view(D,:,cn), 0.50),
quantile(view(D,:,cn), 0.75), mean(view(D,:,cn)), std(view(D,:,cn))
Ds = mat2ds(hcat(D.bbox[2cn-1], _m, D.bbox[2cn], _q25, _q50, _q75, _s))
end
Ds.colnames = ["min", "mean", "max", "q25", "q50", "q75", "std"]
return Ds
end

# ---------------------------------------------------------------------------------------------------
function tabletypes2ds(arg, interp=0)
# Try guesswork to convert Tables types into GMTdatasets usable in plots.
Expand All @@ -901,7 +972,7 @@ end
"""
D = mesh2ds(mesh) -> Vector{GMTdataset}

Extract data from a GeometryBasics Mesh type and return it into a vector of GMTdataset.
Extract data from a GeometryBasics Mesh type and return it in a vector of GMTdataset.
"""
function mesh2ds(mesh)
(!startswith(string(typeof(mesh)), "Mesh{3,")) && error("Argument must be a GeometryBasics mesh")
Expand Down
5 changes: 5 additions & 0 deletions test/test_misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,11 @@
#GMT.line2multiseg([mat2ds(rand(3,2)), mat2ds(rand(4,2))], lt=[1,2], auto_color=true);
#GMT.resetGMT()

stats(D);
stats(D, 2);
@test groupby(mat2ds([1.0 44; 1 7; 2 9; 2 5]), 1)[1].data == [1.0 44; 1 7]
@test groupby(mat2ds([1.0 44; 1 7; 2 9; 2 5], ["A", "A", "B", "B"]), "Text")[1].data == [1.0 44; 1 7]

GMT.mat2grid(rand(Float32, 10,10), reg=1);
GMT.mat2grid(1, hdr=[0. 5 0 5 1 1])
GMT.num2str(rand(2,3));
Expand Down
Loading