Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: updating lightgbm to 2.3.180 #850

Merged
merged 1 commit into from
May 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ libraryDependencies ++= Seq(
"com.jcraft" % "jsch" % "0.1.54",
"com.microsoft.cognitiveservices.speech" % "client-sdk" % "1.11.0",
"org.apache.httpcomponents" % "httpclient" % "4.5.6",
"com.microsoft.ml.lightgbm" % "lightgbmlib" % "2.3.150",
"com.microsoft.ml.lightgbm" % "lightgbmlib" % "2.3.180",
"com.github.vowpalwabbit" % "vw-jni" % "8.7.0.3",
"com.linkedin.isolation-forest" %% "isolation-forest_2.4.3" % "0.3.2",
"org.apache.spark" %% "spark-avro" % sparkVersion
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ protected class BoosterHandler(model: String) {
val scoredDataOutPtr: ThreadLocal[DoubleNativePtrHandler] = {
new ThreadLocal[DoubleNativePtrHandler] {
override def initialValue(): DoubleNativePtrHandler = {
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numClasses))
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numClasses.toLong))
}
}
}
Expand All @@ -62,7 +62,7 @@ protected class BoosterHandler(model: String) {
val leafIndexDataOutPtr: ThreadLocal[DoubleNativePtrHandler] = {
new ThreadLocal[DoubleNativePtrHandler] {
override def initialValue(): DoubleNativePtrHandler = {
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numTotalModel))
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numTotalModel.toLong))
}
}
}
Expand All @@ -80,7 +80,7 @@ protected class BoosterHandler(model: String) {
val shapDataOutPtr: ThreadLocal[DoubleNativePtrHandler] = {
new ThreadLocal[DoubleNativePtrHandler] {
override def initialValue(): DoubleNativePtrHandler = {
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numFeatures + 1))
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numFeatures.toLong + 1))
}
}
}
Expand All @@ -98,7 +98,7 @@ protected class BoosterHandler(model: String) {
val featureImportanceOutPtr: ThreadLocal[DoubleNativePtrHandler] = {
new ThreadLocal[DoubleNativePtrHandler] {
override def initialValue(): DoubleNativePtrHandler = {
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numFeatures))
new DoubleNativePtrHandler(lightgbmlib.new_doubleArray(numFeatures.toLong))
}
}
}
Expand Down Expand Up @@ -303,14 +303,14 @@ class LightGBMBooster(val model: String) extends Serializable {
lightgbmlib.LGBM_BoosterFeatureImportance(boosterHandler.boosterPtr, -1,
importanceTypeNum, boosterHandler.featureImportanceOutPtr.get().ptr),
"Booster FeatureImportance")
(0 until numFeatures).map(lightgbmlib.doubleArray_getitem(boosterHandler.featureImportanceOutPtr.get().ptr, _)).toArray
(0L until numFeatures.toLong).map(lightgbmlib.doubleArray_getitem(boosterHandler.featureImportanceOutPtr.get().ptr, _)).toArray
}

private def predScoreToArray(classification: Boolean, scoredDataOutPtr: SWIGTYPE_p_double,
kind: Int): Array[Double] = {
if (classification && numClasses == 1) {
// Binary classification scenario - LightGBM only returns the value for the positive class
val pred = lightgbmlib.doubleArray_getitem(scoredDataOutPtr, 0)
val pred = lightgbmlib.doubleArray_getitem(scoredDataOutPtr, 0L)
if (kind == boosterHandler.rawScoreConstant) {
// Return the raw score for binary classification
Array(-pred, pred)
Expand All @@ -320,17 +320,17 @@ class LightGBMBooster(val model: String) extends Serializable {
}
} else {
(0 until numClasses).map(classNum =>
lightgbmlib.doubleArray_getitem(scoredDataOutPtr, classNum)).toArray
lightgbmlib.doubleArray_getitem(scoredDataOutPtr, classNum.toLong)).toArray
}
}

private def predLeafToArray(leafIndexDataOutPtr: SWIGTYPE_p_double): Array[Double] = {
(0 until numTotalModel).map(modelNum =>
lightgbmlib.doubleArray_getitem(leafIndexDataOutPtr, modelNum)).toArray
lightgbmlib.doubleArray_getitem(leafIndexDataOutPtr, modelNum.toLong)).toArray
}

private def shapToArray(shapDataOutPtr: SWIGTYPE_p_double): Array[Double] = {
(0 until (numFeatures + 1)).map(featNum =>
lightgbmlib.doubleArray_getitem(shapDataOutPtr, featNum)).toArray
lightgbmlib.doubleArray_getitem(shapDataOutPtr, featNum.toLong)).toArray
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ import com.microsoft.ml.lightgbm._
* @param dataset The native representation of the dataset.
*/
class LightGBMDataset(val dataset: SWIGTYPE_p_void) extends AutoCloseable {
var featureNames: Option[SWIGTYPE_p_p_char] = None
var featureNamesOpt2: Option[Array[String]] = None

def validateDataset(): Unit = {
// Validate num rows
val numDataPtr = lightgbmlib.new_intp()
Expand All @@ -37,9 +34,9 @@ class LightGBMDataset(val dataset: SWIGTYPE_p_void) extends AutoCloseable {
// Generate the column and add to dataset
var colArray: Option[SWIGTYPE_p_float] = None
try {
colArray = Some(lightgbmlib.new_floatArray(numRows))
colArray = Some(lightgbmlib.new_floatArray(numRows.toLong))
field.zipWithIndex.foreach(ri =>
lightgbmlib.floatArray_setitem(colArray.get, ri._2, ri._1.toFloat))
lightgbmlib.floatArray_setitem(colArray.get, ri._2.toLong, ri._1.toFloat))
val colAsVoidPtr = lightgbmlib.float_to_voidp_ptr(colArray.get)
val data32bitType = lightgbmlibConstants.C_API_DTYPE_FLOAT32
LightGBMUtils.validate(
Expand All @@ -55,9 +52,9 @@ class LightGBMDataset(val dataset: SWIGTYPE_p_void) extends AutoCloseable {
// Generate the column and add to dataset
var colArray: Option[SWIGTYPE_p_double] = None
try {
colArray = Some(lightgbmlib.new_doubleArray(field.length))
colArray = Some(lightgbmlib.new_doubleArray(field.length.toLong))
field.zipWithIndex.foreach(ri =>
lightgbmlib.doubleArray_setitem(colArray.get, ri._2, ri._1))
lightgbmlib.doubleArray_setitem(colArray.get, ri._2.toLong, ri._1))
val colAsVoidPtr = lightgbmlib.double_to_voidp_ptr(colArray.get)
val data64bitType = lightgbmlibConstants.C_API_DTYPE_FLOAT64
LightGBMUtils.validate(
Expand All @@ -73,9 +70,9 @@ class LightGBMDataset(val dataset: SWIGTYPE_p_void) extends AutoCloseable {
// Generate the column and add to dataset
var colArray: Option[SWIGTYPE_p_int] = None
try {
colArray = Some(lightgbmlib.new_intArray(numRows))
colArray = Some(lightgbmlib.new_intArray(numRows.toLong))
field.zipWithIndex.foreach(ri =>
lightgbmlib.intArray_setitem(colArray.get, ri._2, ri._1))
lightgbmlib.intArray_setitem(colArray.get, ri._2.toLong, ri._1))
val colAsVoidPtr = lightgbmlib.int_to_voidp_ptr(colArray.get)
val data32bitType = lightgbmlibConstants.C_API_DTYPE_INT32
LightGBMUtils.validate(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ object LightGBMUtils {
}
}

/** Checks the handle returned by a native LightGBM call and fails when it is null.
  *
  * @param result    Handle returned by the native call; a null reference is treated as failure.
  * @param component Name of the call being validated, used as the prefix of the error message.
  * @throws Exception when `result` is null; the message carries the text returned by
  *                   `lightgbmlib.LGBM_GetLastError()`.
  */
def validateArray(result: SWIGTYPE_p_void, component: String): Unit = {
  // Option(...) maps a null reference to None, so isEmpty is the null test.
  val callFailed = Option(result).isEmpty
  if (callFailed) {
    val lastError = lightgbmlib.LGBM_GetLastError()
    throw new Exception(s"$component call failed in LightGBM with error: $lastError")
  }
}

/** Loads the native shared object binaries lib_lightgbm.so and lib_lightgbm_swig.so
*/
def initializeNativeLibrary(): Unit = {
Expand Down Expand Up @@ -195,10 +202,10 @@ object LightGBMUtils {
def generateData(numRows: Int, rowsAsDoubleArray: Array[Array[Double]]):
(SWIGTYPE_p_void, SWIGTYPE_p_double) = {
val numCols = rowsAsDoubleArray.head.length
val data = lightgbmlib.new_doubleArray(numCols * numRows)
val data = lightgbmlib.new_doubleArray(numCols.toLong * numRows.toLong)
rowsAsDoubleArray.zipWithIndex.foreach(ri =>
ri._1.zipWithIndex.foreach(value =>
lightgbmlib.doubleArray_setitem(data, value._2 + (ri._2 * numCols), value._1)))
lightgbmlib.doubleArray_setitem(data, (value._2 + (ri._2 * numCols)).toLong, value._1)))
(lightgbmlib.double_to_voidp_ptr(data), data)
}

Expand Down
23 changes: 11 additions & 12 deletions src/main/scala/com/microsoft/ml/spark/lightgbm/TrainUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -181,13 +181,12 @@ private object TrainUtils extends Serializable {

def getEvalNames(boosterPtr: Option[SWIGTYPE_p_void]): Array[String] = {
// Need to keep track of best scores for each metric, see callback.py in lightgbm for reference
val evalCountsPtr = lightgbmlib.new_intp()
val resultCounts = lightgbmlib.LGBM_BoosterGetEvalCounts(boosterPtr.get, evalCountsPtr)
LightGBMUtils.validate(resultCounts, "Booster Get Eval Counts")
val evalCounts = lightgbmlib.intp_value(evalCountsPtr)
// For debugging, can get metric names:
val evalNamesPtr = lightgbmlib.LGBM_BoosterGetEvalNamesSWIG(boosterPtr.get, evalCounts)
(0 until evalCounts).map(lightgbmlib.stringArray_getitem(evalNamesPtr, _)).toArray
// For debugging, can get metric names
val stringArrayHandle = lightgbmlib.LGBM_BoosterGetEvalNamesSWIG(boosterPtr.get)
LightGBMUtils.validateArray(stringArrayHandle, "Booster Get Eval Names")
val evalNames = lightgbmlib.StringArrayHandle_get_strings(stringArrayHandle)
lightgbmlib.StringArrayHandle_free(stringArrayHandle)
evalNames
}

def beforeTrainIteration(batchIndex: Int, partitionId: Int, curIters: Int, log: Logger,
Expand Down Expand Up @@ -257,14 +256,14 @@ private object TrainUtils extends Serializable {
}

val trainEvalResults: Option[Map[String, Double]] = if (trainParams.isProvideTrainingMetric && !isFinished) {
val trainResults = lightgbmlib.new_doubleArray(evalNames.length)
val trainResults = lightgbmlib.new_doubleArray(evalNames.length.toLong)
val dummyEvalCountsPtr = lightgbmlib.new_intp()
val resultEval = lightgbmlib.LGBM_BoosterGetEval(boosterPtr.get, 0, dummyEvalCountsPtr, trainResults)
lightgbmlib.delete_intp(dummyEvalCountsPtr)
LightGBMUtils.validate(resultEval, "Booster Get Train Eval")

val results: Array[(String, Double)] = evalNames.zipWithIndex.map { case (evalName, index) =>
val score = lightgbmlib.doubleArray_getitem(trainResults, index)
val score = lightgbmlib.doubleArray_getitem(trainResults, index.toLong)
log.info(s"Train $evalName=$score")
(evalName, score)
}
Expand All @@ -275,13 +274,13 @@ private object TrainUtils extends Serializable {
}

val validEvalResults: Option[Map[String, Double]] = if (hasValid && !isFinished) {
val evalResults = lightgbmlib.new_doubleArray(evalNames.length)
val evalResults = lightgbmlib.new_doubleArray(evalNames.length.toLong)
val dummyEvalCountsPtr = lightgbmlib.new_intp()
val resultEval = lightgbmlib.LGBM_BoosterGetEval(boosterPtr.get, 1, dummyEvalCountsPtr, evalResults)
lightgbmlib.delete_intp(dummyEvalCountsPtr)
LightGBMUtils.validate(resultEval, "Booster Get Valid Eval")
val results: Array[(String, Double)] = evalNames.zipWithIndex.map { case (evalName, index) =>
val score = lightgbmlib.doubleArray_getitem(evalResults, index)
val score = lightgbmlib.doubleArray_getitem(evalResults, index.toLong)
log.info(s"Valid $evalName=$score")
val cmp =
if (evalName.startsWith("auc") || evalName.startsWith("ndcg@") || evalName.startsWith("map@"))
Expand All @@ -292,7 +291,7 @@ private object TrainUtils extends Serializable {
bestScore(index) = score
bestIter(index) = iters
bestScores(index) = evalNames.indices
.map(j => lightgbmlib.doubleArray_getitem(evalResults, j)).toArray
.map(j => lightgbmlib.doubleArray_getitem(evalResults, j.toLong)).toArray
} else if (iters - bestIter(index) >= trainParams.earlyStoppingRound) {
isFinished = true
log.info("Early stopping, best iteration is " + bestIter(index))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
assert(modelStr.contains("[lambda_l2: 0.1]") || modelStr.contains("[lambda_l2: 0.5]"))
}

ignore("Verify LightGBM Classifier with batch training") {
test("Verify LightGBM Classifier with batch training") {
val batches = Array(0, 2, 10)
batches.foreach(nBatches => assertFitWithoutErrors(baseModel.setNumBatches(nBatches), pimaDF))
}
Expand Down Expand Up @@ -279,7 +279,7 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
assertBinaryImprovement(scoredDF1, scoredDF2)
}

ignore("Verify LightGBM Multiclass Classifier with vector initial score") {
test("Verify LightGBM Multiclass Classifier with vector initial score") {
val scoredDF1 = baseModel.fit(breastTissueDF).transform(breastTissueDF)
val df2 = scoredDF1.withColumn(initScoreCol, col(rawPredCol))
.drop(predCol, rawPredCol, probCol, leafPredCol, featuresShapCol)
Expand All @@ -299,9 +299,9 @@ class VerifyLightGBMClassifier extends Benchmarks with EstimatorFuzzing[LightGBM
// If the max delta step is specified, assert AUC differs (assert parameter works)
// Note: the final max output of leaves is learning_rate * max_delta_step, so param should reduce the effect
val Array(train, test) = taskDF.randomSplit(Array(0.8, 0.2), seed)
val baseModelWithLR = baseModel.setLearningRate(0.9).setNumIterations(100)
val baseModelWithLR = baseModel.setLearningRate(0.9).setNumIterations(200)
val scoredDF1 = baseModelWithLR.fit(train).transform(test)
val scoredDF2 = baseModelWithLR.setMaxDeltaStep(0.1).fit(train).transform(test)
val scoredDF2 = baseModelWithLR.setMaxDeltaStep(0.5).fit(train).transform(test)
assertBinaryImprovement(scoredDF1, scoredDF2)
}

Expand Down