Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom objective support #123

Draft
wants to merge 7 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/LightGBM.jl
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ function __init__()
end


include("types.jl")
include("wrapper.jl")
include("estimators.jl")
include("utils.jl")
Expand Down
13 changes: 6 additions & 7 deletions src/estimators.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
abstract type Estimator end
abstract type LGBMEstimator <: Estimator end

mutable struct LGBMRegression <: LGBMEstimator
booster::Booster
model::String
application::String
application::LGBMObjective
boosting::String

num_iterations::Int
Expand Down Expand Up @@ -214,7 +211,7 @@ function LGBMRegression(;
)

return LGBMRegression(
Booster(), "", objective, boosting, num_iterations, learning_rate, num_leaves,
Booster(), "", LGBMObjective(objective), boosting, num_iterations, learning_rate, num_leaves,
max_depth, tree_learner, num_threads, histogram_pool_size,
min_data_in_leaf, min_sum_hessian_in_leaf, max_delta_step, lambda_l1, lambda_l2,
min_gain_to_split, feature_fraction, feature_fraction_bynode, feature_fraction_seed,
Expand All @@ -233,7 +230,7 @@ end
mutable struct LGBMClassification <: LGBMEstimator
booster::Booster
model::String
application::String
application::LGBMObjective
boosting :: String

num_iterations::Int
Expand Down Expand Up @@ -455,7 +452,7 @@ function LGBMClassification(;
)

return LGBMClassification(
Booster(), "", objective, boosting, num_iterations, learning_rate,
Booster(), "", LGBMObjective(objective), boosting, num_iterations, learning_rate,
num_leaves, max_depth, tree_learner, num_threads, histogram_pool_size,
min_data_in_leaf, min_sum_hessian_in_leaf, max_delta_step, lambda_l1, lambda_l2,
min_gain_to_split, feature_fraction, feature_fraction_bynode, feature_fraction_seed,
Expand All @@ -470,3 +467,5 @@ function LGBMClassification(;
force_col_wise, force_row_wise,
)
end


18 changes: 14 additions & 4 deletions src/fit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,19 +131,21 @@ function train!(
end
end

objectivedata, metricdata = LGBMFitData(estimator.booster, estimator.application, estimator.metric)

start_iter = get_iter_number(estimator) + 1
end_iter = start_iter + num_iterations - 1

for (idx, iter) in enumerate(start_iter:end_iter)

is_finished = LGBM_BoosterUpdateOneIter(estimator.booster)
is_finished = boosting(estimator.booster, estimator.application, objectivedata)

log_debug(verbosity, Dates.CompoundPeriod(now() - start_time), " elapsed, finished iteration ", iter, "\n")

if is_finished == 0
is_finished = eval_metrics!(
results, estimator, tests_names, iter, verbosity,
bigger_is_better, best_scores, best_iterations, metrics,
results, estimator, metrics, tests_names, iter, verbosity,
bigger_is_better, best_scores, best_iterations,
)
end

Expand Down Expand Up @@ -181,16 +183,24 @@ function truncate_model!(estimator::LGBMEstimator, best_iteration::Integer)
end


# Run one boosting iteration and return the C API's "is finished" flag
# (0 means training can continue).
#
# For predefined objectives the C library computes gradients internally.
boosting(booster::Booster, ::LGBMObjective, ::LGBMFitData) = LGBM_BoosterUpdateOneIter(booster)
# For a custom objective: fetch the current raw training-set predictions,
# evaluate the user's function to obtain gradients and hessians, and push
# them back through the custom-update API.
# Dispatch is tightened to `::CustomObjective` (was `::LGBMObjective`): only
# `CustomObjective` carries a `custom_function` field, so an invalid
# objective/fit-data pairing now fails with a MethodError at dispatch time
# instead of a field-access error mid-iteration.
function boosting(booster::Booster, objective::CustomObjective, data::CustomFitData)
    preds = LGBM_BoosterGetPredict(booster, 0)  # data index 0 == training set
    grads, hessians = objective.custom_function(preds, data)
    return LGBM_BoosterUpdateOneIterCustom(booster, grads, hessians)
end


function eval_metrics!(
results::Dict,
estimator::LGBMEstimator,
metrics::Vector{String},
tests_names::Vector{String},
iter::Integer,
verbosity::Integer,
bigger_is_better::Dict{String,Float64},
best_scores::Dict{String,Dict{String,Real}},
best_iterations::Dict{String,Dict{String,Real}},
metrics::Vector{String},
)
now_scores = Dict{String,Vector{Float64}}()

Expand Down
101 changes: 101 additions & 0 deletions src/types.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# C-API wrapper types

# Raw pointers to LightGBM's opaque C Dataset/Booster objects.
# C_NULL marks a wrapper whose underlying C object is absent or freed.
const DatasetHandle = Ptr{Nothing}
const BoosterHandle = Ptr{Nothing}

"""
Wrapper around the LGBM C `Dataset` object. The underlying C resource is
released by a finalizer when the wrapper is garbage collected.
"""
mutable struct Dataset
    handle::DatasetHandle

    function Dataset(handle::DatasetHandle)
        dataset = new(handle)
        # Free the C-side object on collection, unless the handle is null
        # (never created, or already freed).
        finalizer(dataset) do d
            d.handle == C_NULL || LGBM_DatasetFree(d)
        end
        return dataset
    end
end


"""
Wrapper around the LGBM C `Booster` object. Holds references to the
`Dataset`s it was constructed with (keeping them alive alongside the
booster); the underlying C resource is released by a finalizer.
"""
mutable struct Booster
    handle::BoosterHandle
    datasets::Vector{Dataset}

    function Booster(handle::BoosterHandle, datasets::Vector{Dataset})
        booster = new(handle, datasets)
        # Free the C-side object on collection, unless the handle is null.
        finalizer(booster) do b
            b.handle == C_NULL || LGBM_BoosterFree(b)
        end
        return booster
    end
end
# Convenience constructors: `Booster()` gives a null booster, and
# `Booster(handle)` wraps an existing C handle; both start with no
# datasets attached. A single default-argument method covers both.
Booster(handle::BoosterHandle = C_NULL) = Booster(handle, Dataset[])


# Base types for estimators
# NOTE(review): estimators.jl appears to declare these same abstract types;
# identical redefinition is tolerated by Julia, but the duplicate
# declarations should live in only one file — confirm against estimators.jl.
abstract type Estimator end
abstract type LGBMEstimator <: Estimator end


# A type for wrapping an objective function, whether a string or user supplied function
# `Base.print` needs to be overloaded to provide `string` functionality
# `Base.isempty` needs to be overloaded for empty checks (for the string represented type)
# Plus, a constructor given the user-supplied objective function
"""
Wrapper for an estimator objective: either one of LightGBM's predefined
objective names, or a user-supplied custom objective function.
"""
abstract type LGBMObjective end

# `print` (and hence `string`) and `isempty` delegate to the string
# representation, so code that previously handled the objective as a
# plain `String` keeps working.
Base.print(io::IO, obj::LGBMObjective) = print(io, obj.objective)
Base.isempty(x::LGBMObjective) = isempty(x.objective)

"""
A built-in LightGBM objective, identified by its name (e.g. `"binary"`).
"""
struct PredefinedObjective <: LGBMObjective
    objective::String
end

"""
A user-supplied objective function. Parametrized on the function type
(rather than a `::Function` field) so calls through `custom_function`
can be statically specialized instead of dynamically dispatched.
"""
struct CustomObjective{F<:Function} <: LGBMObjective
    objective::String
    custom_function::F
end

# Constructors mapping the user-facing `objective` keyword to a wrapper.
LGBMObjective(x::String) = PredefinedObjective(x)
LGBMObjective(x::Function) = CustomObjective("custom", x)


# Per-fit auxiliary data, selected by objective type.
abstract type LGBMFitData end

# Predefined objectives need no extra data: gradient computation happens
# inside the C library.
struct EmptyFitData <: LGBMFitData end

"""
Datatype holding data which is useful during fitting iterations
with a custom objective: the training labels and weights, and the number
of models trained per iteration.
"""
struct CustomFitData <: LGBMFitData
    labels::Vector{Float32}
    weights::Vector{Float32}
    # Concrete `Int` (was abstract `Integer`): an abstractly-typed field
    # forces boxing and defeats specialization.
    num_models::Int
end

"""
    CustomFitData(b::Booster)

Build fit data from the booster's first (training) dataset.
Throws an `ErrorException` if the booster has no datasets attached.
"""
function CustomFitData(b::Booster)
    if isempty(b.datasets)
        throw(ErrorException("Booster does not have any training data associated"))
    end
    dataset = first(b.datasets)
    labels = LGBM_DatasetGetField(dataset, "label")
    weights = LGBM_DatasetGetField(dataset, "weight")
    nummodels = LGBM_BoosterNumModelPerIteration(b)

    return CustomFitData(labels, weights, nummodels)
end

# Last arg is meant to be the metric type later on, and the
# 2nd return value meant to be the data used for computing custom metrics
LGBMFitData(::Booster, ::PredefinedObjective, ::Any) = EmptyFitData(), nothing
LGBMFitData(b::Booster, ::CustomObjective, ::Any) = CustomFitData(b), nothing

44 changes: 0 additions & 44 deletions src/wrapper.jl
Original file line number Diff line number Diff line change
@@ -1,54 +1,10 @@
const DatasetHandle = Ptr{Nothing}
const BoosterHandle = Ptr{Nothing}

const C_API_DTYPE_FLOAT32 = 0
const C_API_DTYPE_FLOAT64 = 1
const C_API_DTYPE_INT32 = 2
const C_API_DTYPE_INT64 = 3
const C_API_MATRIX_TYPE_CSC = 1
const C_API_MATRIX_TYPE_CSR = 0

mutable struct Dataset
handle::DatasetHandle

function Dataset(handle::DatasetHandle)
ds = new(handle)
finalizer(Dataset_finalizer, ds)
return ds
end

function Dataset_finalizer(ds::Dataset)
if ds.handle != C_NULL
LGBM_DatasetFree(ds)
end
end
end

mutable struct Booster
handle::BoosterHandle
datasets::Vector{Dataset}

function Booster(handle::BoosterHandle, datasets::Vector{Dataset})
bst = new(handle, datasets)
finalizer(Booster_finalizer, bst)
return bst
end

function Booster_finalizer(bst::Booster)
if bst.handle != C_NULL
LGBM_BoosterFree(bst)
end
end
end

function Booster()
return Booster(C_NULL, Dataset[])
end

function Booster(handle::BoosterHandle)
return Booster(handle, Dataset[])
end


# deepcopy utils, but we can't reasonably do this for datasets
function Base.deepcopy_internal(x::Booster, stackdict::IdDict)
Expand Down
72 changes: 62 additions & 10 deletions test/basic/test_fit.jl
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
module TestFit


using Test
using Dates
using LightGBM
using SparseArrays


# test fixtures
train_matrix = rand(5000,70) # create random dataset
train_matrix = rand(5000, 70) # create random dataset
train_sparse = sparse(train_matrix)
train_labels = rand([0, 1], 5000)
train_dataset = LightGBM.LGBM_DatasetCreateFromMat(train_matrix, "")
LightGBM.LGBM_DatasetSetField(train_dataset, "label", train_labels)


test_matrix = rand(2000,70) # create random dataset
test_matrix = rand(2000, 70) # create random dataset
test_labels = rand([0, 1], 2000)
test_dataset = LightGBM.LGBM_DatasetCreateFromMat(test_matrix, "", train_dataset)
LightGBM.LGBM_DatasetSetField(test_dataset, "label", test_labels)

test2_matrix = rand(1500,70) # create second random dataset

test2_matrix = rand(1500, 70) # create second random dataset
test2_labels = rand([0, 1], 1500)
test2_dataset = LightGBM.LGBM_DatasetCreateFromMat(test2_matrix, "", train_dataset)
LightGBM.LGBM_DatasetSetField(test2_dataset, "label", test2_labels)


regression_matrix = randn(1000, 10)
regression_targets = randn(1000)
regression_dataset = LightGBM.LGBM_DatasetCreateFromMat(regression_matrix, "")
LightGBM.LGBM_DatasetSetField(regression_dataset, "label", regression_targets)

regression_test_matrix = randn(2000, 10)
regression_test_targets = randn(2000)
regression_test_dataset = LightGBM.LGBM_DatasetCreateFromMat(regression_test_matrix, "", regression_dataset)
LightGBM.LGBM_DatasetSetField(regression_test_dataset, "label", regression_test_targets)


@testset "test fit! with dataset -- binary" begin
# Arrange
estimator = LightGBM.LGBMClassification(objective = "binary", num_class = 1)
Expand Down Expand Up @@ -197,8 +212,8 @@ end
for iter in 1:10
LightGBM.LGBM_BoosterUpdateOneIter(estimator.booster)
output = LightGBM.eval_metrics!(
results_fixture, estimator, ["test_bla"], iter, -1,
bigger_is_better, best_scores, best_iterations, ["auc"]
results_fixture, estimator, estimator.metric, ["test_bla"], iter, -1,
bigger_is_better, best_scores, best_iterations,
)

@test output == false
Expand Down Expand Up @@ -237,8 +252,8 @@ Criteria: early_stopping should kick in on round 6
for iter in 1:10
LightGBM.LGBM_BoosterUpdateOneIter(estimator.booster)
output = LightGBM.eval_metrics!(
results_fixture, estimator, ["test_bla"], iter, -1,
bigger_is_better, best_scores, best_iterations, ["auc"]
results_fixture, estimator, estimator.metric, ["test_bla"], iter, -1,
bigger_is_better, best_scores, best_iterations,
)

# reset scores to round 1 being best
Expand Down Expand Up @@ -283,7 +298,6 @@ end
metric = ["auc"],
objective = "binary",
)
verbosity = "verbose=-1"

# Act
output = LightGBM.fit!(estimator, train_dataset, test_dataset; truncate_booster=true, verbosity=-1)
Expand All @@ -304,7 +318,6 @@ end
metric = ["auc"],
objective = "binary",
)
verbosity = "verbose=-1"

# Act
output = LightGBM.fit!(estimator, train_dataset, test_dataset; truncate_booster=false, verbosity=-1)
Expand All @@ -326,7 +339,6 @@ end
metric = ["auc"],
objective = "binary",
)
verbosity = "verbose=-1"

# Act
output = LightGBM.fit!(estimator, train_dataset, test_dataset; truncate_booster=true, verbosity=-1)
Expand All @@ -340,4 +352,44 @@ end
end


@testset "test fit with custom objective" begin

    # A custom objective equivalent to LightGBM's built-in "regression"
    # (L2) objective: gradient = residual, hessian = 1.
    custom_l2(preds, fit_data) = (preds .- fit_data.labels), ones(size(preds))

    # The tests are somewhat sensitive to these values. Too many iterations and the error starts diverging
    # too few, and the error on prediction values hasn't quite converged.
    # Ideally, the comparison tolerance could be pushed to close to double precision
    # But custom objectives optimise over single precision gradients and
    # it isn't fully clear why it is hard to get closer
    n_rounds = 1000
    rel_tol = 1e-8

    builtin_estimator = LightGBM.LGBMClassification(
        num_class = 1,
        num_iterations = n_rounds,
        metric = [],
        objective = "regression",
    )

    custom_estimator = LightGBM.LGBMClassification(
        num_class = 1,
        num_iterations = n_rounds,
        metric = [],
        objective = custom_l2,
    )

    LightGBM.fit!(builtin_estimator, regression_dataset; verbosity=-1)
    LightGBM.fit!(custom_estimator, regression_dataset; verbosity=-1)

    preds_builtin = LightGBM.predict(builtin_estimator, regression_test_matrix, verbosity=-1)
    preds_custom = LightGBM.predict(custom_estimator, regression_test_matrix, verbosity=-1)
    iters_builtin = LightGBM.LGBM_BoosterGetCurrentIteration(builtin_estimator.booster)
    iters_custom = LightGBM.LGBM_BoosterGetCurrentIteration(custom_estimator.booster)

    # Custom objective should reproduce the built-in objective's fit.
    @test isapprox(preds_builtin, preds_custom, rtol=rel_tol)
    @test iters_builtin == iters_custom

end

end # module